aiverify-moonshot 0.4.5__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aiverify_moonshot-0.4.5.dist-info → aiverify_moonshot-0.4.6.dist-info}/METADATA +3 -2
- {aiverify_moonshot-0.4.5.dist-info → aiverify_moonshot-0.4.6.dist-info}/RECORD +24 -22
- moonshot/integrations/cli/benchmark/cookbook.py +226 -42
- moonshot/integrations/cli/benchmark/datasets.py +53 -8
- moonshot/integrations/cli/benchmark/metrics.py +48 -7
- moonshot/integrations/cli/benchmark/recipe.py +283 -42
- moonshot/integrations/cli/benchmark/result.py +73 -30
- moonshot/integrations/cli/benchmark/run.py +43 -11
- moonshot/integrations/cli/benchmark/runner.py +29 -20
- moonshot/integrations/cli/cli_errors.py +511 -0
- moonshot/integrations/cli/common/connectors.py +137 -6
- moonshot/integrations/cli/common/dataset.py +66 -13
- moonshot/integrations/cli/common/prompt_template.py +38 -2
- moonshot/integrations/cli/redteam/session.py +126 -43
- moonshot/integrations/web_api/app.py +1 -1
- moonshot/src/api/api_bookmark.py +6 -6
- moonshot/src/bookmark/bookmark.py +119 -60
- moonshot/src/bookmark/bookmark_arguments.py +10 -0
- moonshot/src/messages_constants.py +40 -0
- moonshot/src/runners/runner.py +1 -1
- {aiverify_moonshot-0.4.5.dist-info → aiverify_moonshot-0.4.6.dist-info}/WHEEL +0 -0
- {aiverify_moonshot-0.4.5.dist-info → aiverify_moonshot-0.4.6.dist-info}/licenses/AUTHORS.md +0 -0
- {aiverify_moonshot-0.4.5.dist-info → aiverify_moonshot-0.4.6.dist-info}/licenses/LICENSE.md +0 -0
- {aiverify_moonshot-0.4.5.dist-info → aiverify_moonshot-0.4.6.dist-info}/licenses/NOTICES.md +0 -0
moonshot/integrations/cli/benchmark/metrics.py

```diff
@@ -5,6 +5,13 @@ from rich.console import Console
 from rich.table import Table
 
 from moonshot.api import api_delete_metric, api_get_all_metric, api_get_all_metric_name
+from moonshot.integrations.cli.cli_errors import (
+    ERROR_BENCHMARK_DELETE_METRIC_METRIC_VALIDATION,
+    ERROR_BENCHMARK_LIST_METRICS_FIND_VALIDATION,
+    ERROR_BENCHMARK_LIST_METRICS_PAGINATION_VALIDATION,
+    ERROR_BENCHMARK_LIST_METRICS_PAGINATION_VALIDATION_1,
+    ERROR_BENCHMARK_VIEW_METRIC_METRIC_FILENAME_VALIDATION,
+)
 from moonshot.integrations.cli.utils.process_data import filter_data
 
 console = Console()
```
```diff
@@ -18,23 +25,44 @@ def list_metrics(args) -> list | None:
     List all available metrics.
 
     This function retrieves all available metrics by calling the api_get_all_metric function from the
-    moonshot.api module. It then
-    it prints
+    moonshot.api module. It then filters the metrics based on the provided keyword and pagination arguments.
+    If there are no metrics, it prints a message indicating that no metrics were found.
 
     Args:
-        args: A namespace object from argparse. It should have
-
-
+        args: A namespace object from argparse. It should have optional attributes:
+            find (str): Optional field to find metric(s) with a keyword.
+            pagination (str): Optional field to paginate metrics.
 
     Returns:
-        list | None: A list of
+        list | None: A list of metrics or None if there are no metrics.
     """
 
     try:
         print("Listing metrics may take a while...")
+        if args.find is not None:
+            if not isinstance(args.find, str) or not args.find:
+                raise TypeError(ERROR_BENCHMARK_LIST_METRICS_FIND_VALIDATION)
+
+        if args.pagination is not None:
+            if not isinstance(args.pagination, str) or not args.pagination:
+                raise TypeError(ERROR_BENCHMARK_LIST_METRICS_PAGINATION_VALIDATION)
+            try:
+                pagination = literal_eval(args.pagination)
+                if not (
+                    isinstance(pagination, tuple)
+                    and len(pagination) == 2
+                    and all(isinstance(i, int) for i in pagination)
+                ):
+                    raise ValueError(
+                        ERROR_BENCHMARK_LIST_METRICS_PAGINATION_VALIDATION_1
+                    )
+            except (ValueError, SyntaxError):
+                raise ValueError(ERROR_BENCHMARK_LIST_METRICS_PAGINATION_VALIDATION_1)
+        else:
+            pagination = ()
+
         metrics_list = api_get_all_metric()
         keyword = args.find.lower() if args.find else ""
-        pagination = literal_eval(args.pagination) if args.pagination else ()
 
         if metrics_list:
             filtered_metrics_list = filter_data(metrics_list, keyword, pagination)
```
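The new `find`/`pagination` guards above replace the old one-liner `literal_eval(args.pagination) if args.pagination else ()`, which crashed on malformed input instead of reporting it. A minimal standalone sketch of the same validation pattern (the helper name and error messages here are illustrative, not part of the package):

```python
from ast import literal_eval

def parse_pagination(raw: str | None) -> tuple:
    # Hypothetical helper restating the CLI's check: accept only a string
    # that literal_eval's to a 2-tuple of ints, e.g. "(1, 10)".
    if raw is None:
        return ()
    if not isinstance(raw, str) or not raw:
        raise TypeError("pagination must be a non-empty string")
    try:
        value = literal_eval(raw)
        if not (
            isinstance(value, tuple)
            and len(value) == 2
            and all(isinstance(i, int) for i in value)
        ):
            raise ValueError
    except (ValueError, SyntaxError):
        raise ValueError("pagination must evaluate to a tuple of two ints")
    return value

print(parse_pagination("(1, 10)"))  # (1, 10)
# parse_pagination("[1, 10]") and parse_pagination("(1,)") raise ValueError
```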
```diff
@@ -44,8 +72,10 @@ def list_metrics(args) -> list | None:
 
             console.print("[red]There are no metrics found.[/red]")
             return None
+
     except Exception as e:
         print(f"[list_metrics]: {str(e)}")
+        return None
 
 
 def view_metric(args) -> None:
```
```diff
@@ -65,6 +95,13 @@ def view_metric(args) -> None:
     """
     try:
         print("Viewing metrics may take a while...")
+        if (
+            not isinstance(args.metric_filename, str)
+            or not args.metric_filename
+            or args.metric_filename is None
+        ):
+            raise TypeError(ERROR_BENCHMARK_VIEW_METRIC_METRIC_FILENAME_VALIDATION)
+
         metrics_list = api_get_all_metric()
         metrics_name_list = api_get_all_metric_name()
 
```
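One readability note on these guards: the trailing `or args.metric_filename is None` clause (and its siblings throughout this release) is unreachable, because `isinstance(None, str)` is already `False`. A quick demonstration, with arbitrary example values:

```python
# None already fails the isinstance test, so the extra "is None" check never fires.
for value in (None, "", 42, "bertscore"):
    ok = isinstance(value, str) and bool(value)
    print(repr(value), "->", "valid" if ok else "rejected")
# None -> rejected, '' -> rejected, 42 -> rejected, 'bertscore' -> valid
```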
```diff
@@ -100,7 +137,11 @@ def delete_metric(args) -> None:
     if confirmation.lower() != "y":
         console.print("[bold yellow]Metric deletion cancelled.[/]")
         return
+
     try:
+        if args.metric is None or not isinstance(args.metric, str) or not args.metric:
+            raise ValueError(ERROR_BENCHMARK_DELETE_METRIC_METRIC_VALIDATION)
+
         api_delete_metric(args.metric)
         print("[delete_metric]: Metric deleted.")
     except Exception as e:
```
moonshot/integrations/cli/benchmark/recipe.py

```diff
@@ -17,6 +17,41 @@ from moonshot.api import (
     api_read_recipe,
     api_update_recipe,
 )
+from moonshot.integrations.cli.cli_errors import (
+    ERROR_BENCHMARK_ADD_RECIPE_CATEGORIES_LIST_STR_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_CATEGORIES_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_DATASETS_LIST_STR_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_DATASETS_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_DESC_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_GRADING_SCALE_DICT_STR_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_GRADING_SCALE_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_METRICS_LIST_STR_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_METRICS_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_NAME_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_PROMPT_TEMPLATES_LIST_STR_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_PROMPT_TEMPLATES_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_TAGS_LIST_STR_VALIDATION,
+    ERROR_BENCHMARK_ADD_RECIPE_TAGS_VALIDATION,
+    ERROR_BENCHMARK_DELETE_RECIPE_RECIPE_VALIDATION,
+    ERROR_BENCHMARK_LIST_RECIPES_FIND_VALIDATION,
+    ERROR_BENCHMARK_LIST_RECIPES_PAGINATION_VALIDATION,
+    ERROR_BENCHMARK_LIST_RECIPES_PAGINATION_VALIDATION_1,
+    ERROR_BENCHMARK_RUN_RECIPE_ENDPOINTS_VALIDATION,
+    ERROR_BENCHMARK_RUN_RECIPE_ENDPOINTS_VALIDATION_1,
+    ERROR_BENCHMARK_RUN_RECIPE_NAME_VALIDATION,
+    ERROR_BENCHMARK_RUN_RECIPE_NO_RESULT,
+    ERROR_BENCHMARK_RUN_RECIPE_NUM_OF_PROMPTS_VALIDATION,
+    ERROR_BENCHMARK_RUN_RECIPE_RANDOM_SEED_VALIDATION,
+    ERROR_BENCHMARK_RUN_RECIPE_RECIPES_VALIDATION,
+    ERROR_BENCHMARK_RUN_RECIPE_RECIPES_VALIDATION_1,
+    ERROR_BENCHMARK_RUN_RECIPE_RESULT_PROC_MOD_VALIDATION,
+    ERROR_BENCHMARK_RUN_RECIPE_RUNNER_PROC_MOD_VALIDATION,
+    ERROR_BENCHMARK_RUN_RECIPE_SYS_PROMPT_VALIDATION,
+    ERROR_BENCHMARK_UPDATE_RECIPE_RECIPE_VALIDATION,
+    ERROR_BENCHMARK_UPDATE_RECIPE_UPDATE_VALUES_VALIDATION,
+    ERROR_BENCHMARK_UPDATE_RECIPE_UPDATE_VALUES_VALIDATION_1,
+    ERROR_BENCHMARK_VIEW_RECIPE_RECIPE_VALIDATION,
+)
 from moonshot.integrations.cli.common.display_helper import display_view_list_format
 from moonshot.integrations.cli.utils.process_data import filter_data
 
```
```diff
@@ -37,23 +72,114 @@ def add_recipe(args) -> None:
 
     Args:
         args (argparse.Namespace): The arguments provided to the command line interface.
-        Expected keys are name, description, tags, categories,
+        Expected keys are name, description, tags, categories, datasets, prompt_templates, metrics, and grading_scale.
 
     Returns:
         None
 
     Raises:
-
+        TypeError: If any of the required arguments are not strings or are None.
+        ValueError: If the evaluated arguments are not of the expected types.
     """
     try:
-        tags = literal_eval(args.tags)
+        if not isinstance(args.name, str) or not args.name or args.name is None:
+            raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_NAME_VALIDATION)
+
+        if (
+            not isinstance(args.description, str)
+            or not args.description
+            or args.description is None
+        ):
+            raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_DESC_VALIDATION)
+
+        if not isinstance(args.tags, str) or not args.tags or args.tags is None:
+            raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_TAGS_VALIDATION)
+
+        if (
+            not isinstance(args.categories, str)
+            or not args.categories
+            or args.categories is None
+        ):
+            raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_CATEGORIES_VALIDATION)
+
+        if (
+            not isinstance(args.datasets, str)
+            or not args.datasets
+            or args.datasets is None
+        ):
+            raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_DATASETS_VALIDATION)
+
+        if (
+            not isinstance(args.prompt_templates, str)
+            or not args.prompt_templates
+            or args.prompt_templates is None
+        ):
+            raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_PROMPT_TEMPLATES_VALIDATION)
+
+        if (
+            not isinstance(args.metrics, str)
+            or not args.metrics
+            or args.metrics is None
+        ):
+            raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_METRICS_VALIDATION)
+
+        if (
+            not isinstance(args.grading_scale, str)
+            or not args.grading_scale
+            or args.grading_scale is None
+        ):
+            raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_GRADING_SCALE_VALIDATION)
+
+        tags = literal_eval(args.tags)
         categories = literal_eval(args.categories)
         datasets = literal_eval(args.datasets)
-        prompt_templates = (
-            literal_eval(args.prompt_templates) if args.prompt_templates else []
-        )
+        prompt_templates = literal_eval(args.prompt_templates)
         metrics = literal_eval(args.metrics)
-        grading_scale = literal_eval(args.grading_scale)
+        grading_scale = literal_eval(args.grading_scale)
+
+        if not (isinstance(tags, list) and all(isinstance(tag, str) for tag in tags)):
+            raise ValueError(ERROR_BENCHMARK_ADD_RECIPE_TAGS_LIST_STR_VALIDATION)
+
+        if not (
+            isinstance(categories, list)
+            and all(isinstance(category, str) for category in categories)
+        ):
+            raise ValueError(ERROR_BENCHMARK_ADD_RECIPE_CATEGORIES_LIST_STR_VALIDATION)
+
+        if not (
+            isinstance(datasets, list)
+            and all(isinstance(dataset, str) for dataset in datasets)
+        ):
+            raise ValueError(ERROR_BENCHMARK_ADD_RECIPE_DATASETS_LIST_STR_VALIDATION)
+
+        if not (
+            isinstance(prompt_templates, list)
+            and all(
+                isinstance(prompt_template, str) for prompt_template in prompt_templates
+            )
+        ):
+            raise ValueError(
+                ERROR_BENCHMARK_ADD_RECIPE_PROMPT_TEMPLATES_LIST_STR_VALIDATION
+            )
+
+        if not (
+            isinstance(metrics, list)
+            and all(isinstance(metric, str) for metric in metrics)
+        ):
+            raise ValueError(ERROR_BENCHMARK_ADD_RECIPE_METRICS_LIST_STR_VALIDATION)
+
+        if not (
+            isinstance(grading_scale, dict)
+            and all(
+                isinstance(gs, list)
+                and len(gs) == 2
+                and all(isinstance(value, int) for value in gs)
+                for gs in grading_scale.values()
+            )
+        ):
+            raise ValueError(
+                ERROR_BENCHMARK_ADD_RECIPE_GRADING_SCALE_DICT_STR_VALIDATION
+            )
 
         new_recipe_id = api_create_recipe(
             args.name,
```
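For reference, the shape the new `grading_scale` check enforces is a dict mapping each grade label to a two-integer `[low, high]` range. A hedged example of CLI argument strings that would pass the `literal_eval` checks above (the labels and values are made up for illustration):

```python
from ast import literal_eval

# Illustrative argument strings; each must literal_eval to the
# list/dict shape the validators above require.
args_tags = "['bias', 'toxicity']"
args_datasets = "['my-dataset']"
args_grading_scale = "{'A': [80, 100], 'B': [60, 79], 'C': [0, 59]}"

grading_scale = literal_eval(args_grading_scale)
assert isinstance(grading_scale, dict) and all(
    isinstance(gs, list) and len(gs) == 2 and all(isinstance(v, int) for v in gs)
    for gs in grading_scale.values()
)
print(grading_scale["A"])  # [80, 100]
```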
```diff
@@ -79,18 +205,42 @@ def list_recipes(args) -> list | None:
     It then displays the retrieved recipes using the _display_recipes function.
 
     Args:
-        args: A namespace object from argparse. It should have
-
-
+        args: A namespace object from argparse. It should have optional attributes:
+            find (str): Optional field to find recipe(s) with a keyword.
+            pagination (str): Optional field to paginate recipes.
 
     Returns:
-        list | None: A list of
-    """
+        list | None: A list of recipes or None if there is no result.
 
+    Raises:
+        TypeError: If the 'find' or 'pagination' arguments are not strings or are invalid.
+        ValueError: If the 'pagination' argument cannot be evaluated into a tuple of two integers.
+    """
     try:
+        if args.find is not None:
+            if not isinstance(args.find, str) or not args.find:
+                raise TypeError(ERROR_BENCHMARK_LIST_RECIPES_FIND_VALIDATION)
+
+        if args.pagination is not None:
+            if not isinstance(args.pagination, str) or not args.pagination:
+                raise TypeError(ERROR_BENCHMARK_LIST_RECIPES_PAGINATION_VALIDATION)
+            try:
+                pagination = literal_eval(args.pagination)
+                if not (
+                    isinstance(pagination, tuple)
+                    and len(pagination) == 2
+                    and all(isinstance(i, int) for i in pagination)
+                ):
+                    raise ValueError(
+                        ERROR_BENCHMARK_LIST_RECIPES_PAGINATION_VALIDATION_1
+                    )
+            except (ValueError, SyntaxError):
+                raise ValueError(ERROR_BENCHMARK_LIST_RECIPES_PAGINATION_VALIDATION_1)
+        else:
+            pagination = ()
+
         recipes_list = api_get_all_recipe()
         keyword = args.find.lower() if args.find else ""
-        pagination = literal_eval(args.pagination) if args.pagination else ()
 
         if recipes_list:
             filtered_recipes_list = filter_data(recipes_list, keyword, pagination)
```
```diff
@@ -103,6 +253,7 @@ def list_recipes(args) -> list | None:
 
     except Exception as e:
         print(f"[list_recipes]: {str(e)}")
+        return None
 
 
 def view_recipe(args) -> None:
```
```diff
@@ -111,7 +262,7 @@ def view_recipe(args) -> None:
 
     This function retrieves a specific recipe by calling the api_read_recipe function from the
     moonshot.api module using the recipe name provided in the args.
-    It then displays the retrieved recipe using the
+    It then displays the retrieved recipe using the _display_recipes function.
 
     Args:
         args: A namespace object from argparse. It should have the following attribute:
```
```diff
@@ -119,8 +270,14 @@ def view_recipe(args) -> None:
 
     Returns:
         None
+
+    Raises:
+        TypeError: If the 'recipe' argument is not a string or is None.
     """
     try:
+        if not isinstance(args.recipe, str) or not args.recipe or args.recipe is None:
+            raise TypeError(ERROR_BENCHMARK_VIEW_RECIPE_RECIPE_VALIDATION)
+
         recipe_info = api_read_recipe(args.recipe)
         _display_recipes([recipe_info])
     except Exception as e:
```
```diff
@@ -148,46 +305,103 @@ def run_recipe(args) -> None:
 
     Returns:
         None
+
+    Raises:
+        TypeError: If any of the required arguments are not of the expected types or are None.
+        ValueError: If the 'recipes' or 'endpoints' arguments cannot be evaluated into lists of strings.
+        RuntimeError: If no results are found after running the recipes.
     """
     try:
-        name = args.name
+        if not isinstance(args.name, str) or not args.name or args.name is None:
+            raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_NAME_VALIDATION)
+
+        if (
+            not isinstance(args.recipes, str)
+            or not args.recipes
+            or args.recipes is None
+        ):
+            raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_RECIPES_VALIDATION)
+
+        if (
+            not isinstance(args.endpoints, str)
+            or not args.endpoints
+            or args.endpoints is None
+        ):
+            raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_ENDPOINTS_VALIDATION)
+
+        if isinstance(args.num_of_prompts, bool) or not isinstance(
+            args.num_of_prompts, int
+        ):
+            raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_NUM_OF_PROMPTS_VALIDATION)
+
+        if isinstance(args.random_seed, bool) or not isinstance(args.random_seed, int):
+            raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_RANDOM_SEED_VALIDATION)
+
+        if (
+            not isinstance(args.system_prompt, str)
+            or not args.system_prompt
+            or args.system_prompt is None
+        ):
+            raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_SYS_PROMPT_VALIDATION)
+
+        if (
+            not isinstance(args.runner_proc_module, str)
+            or not args.runner_proc_module
+            or args.runner_proc_module is None
+        ):
+            raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_RUNNER_PROC_MOD_VALIDATION)
+
+        if (
+            not isinstance(args.result_proc_module, str)
+            or not args.result_proc_module
+            or args.result_proc_module is None
+        ):
+            raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_RESULT_PROC_MOD_VALIDATION)
+
         recipes = literal_eval(args.recipes)
+        if not (
+            isinstance(recipes, list) and all(isinstance(item, str) for item in recipes)
+        ):
+            raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_RECIPES_VALIDATION_1)
+
         endpoints = literal_eval(args.endpoints)
-        num_of_prompts = args.num_of_prompts
-        random_seed = args.random_seed
-        system_prompt = args.system_prompt
-        runner_proc_module = args.runner_proc_module
-        result_proc_module = args.result_proc_module
+        if not (
+            isinstance(endpoints, list)
+            and all(isinstance(item, str) for item in endpoints)
+        ):
+            raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_ENDPOINTS_VALIDATION_1)
 
         # Run the recipes with the defined endpoints
-        slugify_id = slugify(name, lowercase=True)
+        slugify_id = slugify(args.name, lowercase=True)
         if slugify_id in api_get_all_runner_name():
             rec_runner = api_load_runner(slugify_id)
         else:
-            rec_runner = api_create_runner(name, endpoints)
+            rec_runner = api_create_runner(args.name, endpoints)
 
-
-
-        rec_runner.run_recipes(
+        async def run():
+            await rec_runner.run_recipes(
                 recipes,
-            num_of_prompts,
-            random_seed,
-            system_prompt,
-            runner_proc_module,
-            result_proc_module,
+                args.num_of_prompts,
+                args.random_seed,
+                args.system_prompt,
+                args.runner_proc_module,
+                args.result_proc_module,
             )
-
-
+            await rec_runner.close()
+
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(run())
 
         # Display results
         runner_runs = api_get_all_run(rec_runner.id)
         result_info = runner_runs[-1].get("results")
         if result_info:
-            show_recipe_results(
+            _show_recipe_results(
                 recipes, endpoints, result_info, result_info["metadata"]["duration"]
             )
         else:
-            raise RuntimeError(
+            raise RuntimeError(ERROR_BENCHMARK_RUN_RECIPE_NO_RESULT)
+
     except Exception as e:
         print(f"[run_recipe]: {str(e)}")
 
```
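Two details of this hunk are worth calling out. First, the `isinstance(args.num_of_prompts, bool)` test that precedes the `int` test is not redundant: `bool` is a subclass of `int` in Python, so without it `True` would pass as a valid prompt count. Second, `run_recipes` is now awaited inside a local coroutine that also closes the runner, driven by `run_until_complete`. A self-contained sketch of that control flow, with a dummy runner standing in for the real one (all names and argument values below are illustrative):

```python
import asyncio

class DummyRunner:
    # Stand-in for the Moonshot runner; only the call shape is mirrored here.
    async def run_recipes(self, recipes, num_of_prompts, random_seed,
                          system_prompt, runner_proc_module, result_proc_module):
        await asyncio.sleep(0)  # pretend to benchmark

    async def close(self):
        await asyncio.sleep(0)  # pretend to release resources

rec_runner = DummyRunner()

async def run():
    # Same pattern as the CLI: run the recipes, then close the runner.
    await rec_runner.run_recipes(["example-recipe"], 5, 1, "", "rp-mod", "result-mod")
    await rec_runner.close()

# The CLI drives the coroutine with get_event_loop()/run_until_complete;
# asyncio.run(run()) is the modern equivalent when no loop is running yet.
asyncio.get_event_loop().run_until_complete(run())
```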
```diff
@@ -207,11 +421,31 @@ def update_recipe(args) -> None:
 
     Returns:
         None
+
+    Raises:
+        ValueError: If the 'recipe' or 'update_values' arguments are not strings or are None.
+        ValueError: If the 'update_values' argument cannot be evaluated into a list of tuples.
     """
     try:
+        if args.recipe is None or not isinstance(args.recipe, str) or not args.recipe:
+            raise ValueError(ERROR_BENCHMARK_UPDATE_RECIPE_RECIPE_VALIDATION)
+
+        if (
+            args.update_values is None
+            or not isinstance(args.update_values, str)
+            or not args.update_values
+        ):
+            raise ValueError(ERROR_BENCHMARK_UPDATE_RECIPE_UPDATE_VALUES_VALIDATION)
+
         recipe = args.recipe
-        update_values = dict(literal_eval(args.update_values))
+        if literal_eval(args.update_values) and all(
+            isinstance(i, tuple) for i in literal_eval(args.update_values)
+        ):
+            update_values = dict(literal_eval(args.update_values))
+        else:
+            raise ValueError(ERROR_BENCHMARK_UPDATE_RECIPE_UPDATE_VALUES_VALIDATION_1)
         api_update_recipe(recipe, **update_values)
+
         print("[update_recipe]: Recipe updated.")
     except Exception as e:
         print(f"[update_recipe]: {str(e)}")
```
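A hedged example of an `update_values` string the new check accepts: a list of `(field, new_value)` tuples, which the CLI turns into keyword arguments for `api_update_recipe` (the field names and values below are illustrative):

```python
from ast import literal_eval

raw = "[('name', 'My Recipe v2'), ('tags', ['bias', 'toxicity'])]"
parsed = literal_eval(raw)

# Mirrors the CLI check: a non-empty list in which every element is a tuple.
if parsed and all(isinstance(i, tuple) for i in parsed):
    update_values = dict(parsed)
    print(update_values)  # {'name': 'My Recipe v2', 'tags': ['bias', 'toxicity']}
else:
    raise ValueError("update_values must evaluate to a list of tuples")
```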
```diff
@@ -232,6 +466,9 @@ def delete_recipe(args) -> None:
 
     Returns:
         None
+
+    Raises:
+        ValueError: If the 'recipe' argument is not a string or is None.
     """
     # Confirm with the user before deleting a recipe
     confirmation = console.input(
```
```diff
@@ -240,7 +477,11 @@ def delete_recipe(args) -> None:
     if confirmation.lower() != "y":
        console.print("[bold yellow]Recipe deletion cancelled.[/]")
        return
+
    try:
+        if args.recipe is None or not isinstance(args.recipe, str) or not args.recipe:
+            raise ValueError(ERROR_BENCHMARK_DELETE_RECIPE_RECIPE_VALIDATION)
+
        api_delete_recipe(args.recipe)
        print("[delete_recipe]: Recipe deleted.")
    except Exception as e:
```
```diff
@@ -250,7 +491,7 @@ def delete_recipe(args) -> None:
 # ------------------------------------------------------------------------------
 # Helper functions: Display on cli
 # ------------------------------------------------------------------------------
-def display_view_grading_scale_format(title: str, grading_scale: dict) -> str:
+def _display_view_grading_scale_format(title: str, grading_scale: dict) -> str:
     """
     Format the grading scale for display.
 
```
```diff
@@ -275,7 +516,7 @@ def display_view_grading_scale_format(title: str, grading_scale: dict) -> str:
     return f"[blue]{title}[/blue]: nil"
 
 
-def display_view_statistics_format(title: str, stats: dict) -> str:
+def _display_view_statistics_format(title: str, stats: dict) -> str:
     """
     Format the statistics for display.
 
```
```diff
@@ -348,10 +589,10 @@ def _display_recipes(recipes_list: list) -> None:
             "Prompt Templates", prompt_templates
         )
         metrics_info = display_view_list_format("Metrics", metrics)
-        grading_scale_info = display_view_grading_scale_format(
+        grading_scale_info = _display_view_grading_scale_format(
             "Grading Scale", grading_scale
         )
-        stats_info = display_view_statistics_format("Statistics", stats)
+        stats_info = _display_view_statistics_format("Statistics", stats)
 
         recipe_info = (
             f"[red]id: {id}[/red]\n\n[blue]{name}[/blue]\n{description}\n\n"
```
```diff
@@ -364,7 +605,7 @@ def _display_recipes(recipes_list: list) -> None:
         console.print(table)
 
 
-def show_recipe_results(recipes, endpoints, recipe_results, duration):
+def _show_recipe_results(recipes, endpoints, recipe_results, duration):
     """
     Show the results of the recipe benchmarking.
 
```
```diff
@@ -384,7 +625,7 @@ def show_recipe_results(recipes, endpoints, recipe_results, duration):
     """
     if recipe_results:
         # Display recipe results
-        generate_recipe_table(recipes, endpoints, recipe_results)
+        _generate_recipe_table(recipes, endpoints, recipe_results)
     else:
         console.print("[red]There are no results.[/red]")
 
```
```diff
@@ -394,7 +635,7 @@ def show_recipe_results(recipes, endpoints, recipe_results, duration):
     console.print(run_stats)
 
 
-def generate_recipe_table(recipes: list, endpoints: list, results: dict) -> None:
+def _generate_recipe_table(recipes: list, endpoints: list, results: dict) -> None:
     """
     Generate and display a table of recipe results.
 
```