@cliwatch/cli-bench 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +145 -0
- package/dist/assertions.d.ts +14 -0
- package/dist/assertions.d.ts.map +1 -0
- package/dist/assertions.js +161 -0
- package/dist/assertions.js.map +1 -0
- package/dist/ci.d.ts +29 -0
- package/dist/ci.d.ts.map +1 -0
- package/dist/ci.js +75 -0
- package/dist/ci.js.map +1 -0
- package/dist/client/client/client.gen.d.ts +3 -0
- package/dist/client/client/client.gen.d.ts.map +1 -0
- package/dist/client/client/client.gen.js +235 -0
- package/dist/client/client/client.gen.js.map +1 -0
- package/dist/client/client/index.d.ts +9 -0
- package/dist/client/client/index.d.ts.map +1 -0
- package/dist/client/client/index.js +7 -0
- package/dist/client/client/index.js.map +1 -0
- package/dist/client/client/types.gen.d.ts +118 -0
- package/dist/client/client/types.gen.d.ts.map +1 -0
- package/dist/client/client/types.gen.js +3 -0
- package/dist/client/client/types.gen.js.map +1 -0
- package/dist/client/client/utils.gen.d.ts +34 -0
- package/dist/client/client/utils.gen.d.ts.map +1 -0
- package/dist/client/client/utils.gen.js +229 -0
- package/dist/client/client/utils.gen.js.map +1 -0
- package/dist/client/client.gen.d.ts +13 -0
- package/dist/client/client.gen.d.ts.map +1 -0
- package/dist/client/client.gen.js +4 -0
- package/dist/client/client.gen.js.map +1 -0
- package/dist/client/core/auth.gen.d.ts +19 -0
- package/dist/client/core/auth.gen.d.ts.map +1 -0
- package/dist/client/core/auth.gen.js +15 -0
- package/dist/client/core/auth.gen.js.map +1 -0
- package/dist/client/core/bodySerializer.gen.d.ts +26 -0
- package/dist/client/core/bodySerializer.gen.d.ts.map +1 -0
- package/dist/client/core/bodySerializer.gen.js +58 -0
- package/dist/client/core/bodySerializer.gen.js.map +1 -0
- package/dist/client/core/params.gen.d.ts +44 -0
- package/dist/client/core/params.gen.d.ts.map +1 -0
- package/dist/client/core/params.gen.js +101 -0
- package/dist/client/core/params.gen.js.map +1 -0
- package/dist/client/core/pathSerializer.gen.d.ts +34 -0
- package/dist/client/core/pathSerializer.gen.d.ts.map +1 -0
- package/dist/client/core/pathSerializer.gen.js +107 -0
- package/dist/client/core/pathSerializer.gen.js.map +1 -0
- package/dist/client/core/queryKeySerializer.gen.d.ts +19 -0
- package/dist/client/core/queryKeySerializer.gen.d.ts.map +1 -0
- package/dist/client/core/queryKeySerializer.gen.js +93 -0
- package/dist/client/core/queryKeySerializer.gen.js.map +1 -0
- package/dist/client/core/serverSentEvents.gen.d.ts +72 -0
- package/dist/client/core/serverSentEvents.gen.d.ts.map +1 -0
- package/dist/client/core/serverSentEvents.gen.js +134 -0
- package/dist/client/core/serverSentEvents.gen.js.map +1 -0
- package/dist/client/core/types.gen.d.ts +79 -0
- package/dist/client/core/types.gen.d.ts.map +1 -0
- package/dist/client/core/types.gen.js +3 -0
- package/dist/client/core/types.gen.js.map +1 -0
- package/dist/client/core/utils.gen.d.ts +20 -0
- package/dist/client/core/utils.gen.d.ts.map +1 -0
- package/dist/client/core/utils.gen.js +88 -0
- package/dist/client/core/utils.gen.js.map +1 -0
- package/dist/client/index.d.ts +3 -0
- package/dist/client/index.d.ts.map +1 -0
- package/dist/client/index.js +3 -0
- package/dist/client/index.js.map +1 -0
- package/dist/client/sdk.gen.d.ts +45 -0
- package/dist/client/sdk.gen.d.ts.map +1 -0
- package/dist/client/sdk.gen.js +47 -0
- package/dist/client/sdk.gen.js.map +1 -0
- package/dist/client/types.gen.d.ts +694 -0
- package/dist/client/types.gen.d.ts.map +1 -0
- package/dist/client/types.gen.js +3 -0
- package/dist/client/types.gen.js.map +1 -0
- package/dist/client/zod.gen.d.ts +492 -0
- package/dist/client/zod.gen.d.ts.map +1 -0
- package/dist/client/zod.gen.js +413 -0
- package/dist/client/zod.gen.js.map +1 -0
- package/dist/config.d.ts +22 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +94 -0
- package/dist/config.js.map +1 -0
- package/dist/exec.d.ts +18 -0
- package/dist/exec.d.ts.map +1 -0
- package/dist/exec.js +30 -0
- package/dist/exec.js.map +1 -0
- package/dist/help-loader.d.ts +13 -0
- package/dist/help-loader.d.ts.map +1 -0
- package/dist/help-loader.js +135 -0
- package/dist/help-loader.js.map +1 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +148 -0
- package/dist/index.js.map +1 -0
- package/dist/init.d.ts +5 -0
- package/dist/init.d.ts.map +1 -0
- package/dist/init.js +62 -0
- package/dist/init.js.map +1 -0
- package/dist/models.d.ts +158 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +8 -0
- package/dist/models.js.map +1 -0
- package/dist/project.d.ts +26 -0
- package/dist/project.d.ts.map +1 -0
- package/dist/project.js +101 -0
- package/dist/project.js.map +1 -0
- package/dist/prompt.d.ts +12 -0
- package/dist/prompt.d.ts.map +1 -0
- package/dist/prompt.js +88 -0
- package/dist/prompt.js.map +1 -0
- package/dist/providers.d.ts +26 -0
- package/dist/providers.d.ts.map +1 -0
- package/dist/providers.js +55 -0
- package/dist/providers.js.map +1 -0
- package/dist/runner.d.ts +34 -0
- package/dist/runner.d.ts.map +1 -0
- package/dist/runner.js +434 -0
- package/dist/runner.js.map +1 -0
- package/dist/schemas.d.ts +256 -0
- package/dist/schemas.d.ts.map +1 -0
- package/dist/schemas.js +59 -0
- package/dist/schemas.js.map +1 -0
- package/dist/suite-generator.d.ts +8 -0
- package/dist/suite-generator.d.ts.map +1 -0
- package/dist/suite-generator.js +100 -0
- package/dist/suite-generator.js.map +1 -0
- package/dist/thresholds.d.ts +10 -0
- package/dist/thresholds.d.ts.map +1 -0
- package/dist/thresholds.js +57 -0
- package/dist/thresholds.js.map +1 -0
- package/package.json +41 -0
- package/task_suites/curl.yaml +138 -0
- package/task_suites/docker.yaml +163 -0
- package/task_suites/gh.yaml +118 -0
- package/task_suites/jq.yaml +172 -0
- package/task_suites/kubectl.yaml +74 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Zod schemas for runtime validation of YAML config and task files.
|
|
3
|
+
*/
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
export declare const AssertionSchema: z.ZodUnion<readonly [z.ZodObject<{
|
|
6
|
+
output_contains: z.ZodString;
|
|
7
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
8
|
+
output_equals: z.ZodString;
|
|
9
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
10
|
+
error_contains: z.ZodString;
|
|
11
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
12
|
+
exit_code: z.ZodNumber;
|
|
13
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
14
|
+
file_exists: z.ZodString;
|
|
15
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
16
|
+
file_contains: z.ZodObject<{
|
|
17
|
+
path: z.ZodString;
|
|
18
|
+
text: z.ZodString;
|
|
19
|
+
}, z.core.$strip>;
|
|
20
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
21
|
+
ran: z.ZodString;
|
|
22
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
23
|
+
not_ran: z.ZodString;
|
|
24
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
25
|
+
run_count: z.ZodObject<{
|
|
26
|
+
pattern: z.ZodString;
|
|
27
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
28
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
29
|
+
}, z.core.$strip>;
|
|
30
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
31
|
+
verify: z.ZodObject<{
|
|
32
|
+
run: z.ZodString;
|
|
33
|
+
output_contains: z.ZodOptional<z.ZodString>;
|
|
34
|
+
output_equals: z.ZodOptional<z.ZodString>;
|
|
35
|
+
}, z.core.$strip>;
|
|
36
|
+
}, z.core.$strip>]>;
|
|
37
|
+
export declare const TaskSchema: z.ZodObject<{
|
|
38
|
+
id: z.ZodString;
|
|
39
|
+
intent: z.ZodString;
|
|
40
|
+
assert: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
41
|
+
output_contains: z.ZodString;
|
|
42
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
43
|
+
output_equals: z.ZodString;
|
|
44
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
45
|
+
error_contains: z.ZodString;
|
|
46
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
47
|
+
exit_code: z.ZodNumber;
|
|
48
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
49
|
+
file_exists: z.ZodString;
|
|
50
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
51
|
+
file_contains: z.ZodObject<{
|
|
52
|
+
path: z.ZodString;
|
|
53
|
+
text: z.ZodString;
|
|
54
|
+
}, z.core.$strip>;
|
|
55
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
56
|
+
ran: z.ZodString;
|
|
57
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
58
|
+
not_ran: z.ZodString;
|
|
59
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
60
|
+
run_count: z.ZodObject<{
|
|
61
|
+
pattern: z.ZodString;
|
|
62
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
63
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
64
|
+
}, z.core.$strip>;
|
|
65
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
66
|
+
verify: z.ZodObject<{
|
|
67
|
+
run: z.ZodString;
|
|
68
|
+
output_contains: z.ZodOptional<z.ZodString>;
|
|
69
|
+
output_equals: z.ZodOptional<z.ZodString>;
|
|
70
|
+
}, z.core.$strip>;
|
|
71
|
+
}, z.core.$strip>]>>;
|
|
72
|
+
setup: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
|
|
73
|
+
max_turns: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
74
|
+
difficulty: z.ZodOptional<z.ZodEnum<{
|
|
75
|
+
easy: "easy";
|
|
76
|
+
medium: "medium";
|
|
77
|
+
hard: "hard";
|
|
78
|
+
}>>;
|
|
79
|
+
category: z.ZodOptional<z.ZodString>;
|
|
80
|
+
repeat: z.ZodOptional<z.ZodNumber>;
|
|
81
|
+
}, z.core.$strip>;
|
|
82
|
+
export declare const TaskSuiteSchema: z.ZodObject<{
|
|
83
|
+
cli: z.ZodString;
|
|
84
|
+
version_command: z.ZodOptional<z.ZodString>;
|
|
85
|
+
tasks: z.ZodArray<z.ZodObject<{
|
|
86
|
+
id: z.ZodString;
|
|
87
|
+
intent: z.ZodString;
|
|
88
|
+
assert: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
89
|
+
output_contains: z.ZodString;
|
|
90
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
91
|
+
output_equals: z.ZodString;
|
|
92
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
93
|
+
error_contains: z.ZodString;
|
|
94
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
95
|
+
exit_code: z.ZodNumber;
|
|
96
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
97
|
+
file_exists: z.ZodString;
|
|
98
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
99
|
+
file_contains: z.ZodObject<{
|
|
100
|
+
path: z.ZodString;
|
|
101
|
+
text: z.ZodString;
|
|
102
|
+
}, z.core.$strip>;
|
|
103
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
104
|
+
ran: z.ZodString;
|
|
105
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
106
|
+
not_ran: z.ZodString;
|
|
107
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
108
|
+
run_count: z.ZodObject<{
|
|
109
|
+
pattern: z.ZodString;
|
|
110
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
111
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
112
|
+
}, z.core.$strip>;
|
|
113
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
114
|
+
verify: z.ZodObject<{
|
|
115
|
+
run: z.ZodString;
|
|
116
|
+
output_contains: z.ZodOptional<z.ZodString>;
|
|
117
|
+
output_equals: z.ZodOptional<z.ZodString>;
|
|
118
|
+
}, z.core.$strip>;
|
|
119
|
+
}, z.core.$strip>]>>;
|
|
120
|
+
setup: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
|
|
121
|
+
max_turns: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
122
|
+
difficulty: z.ZodOptional<z.ZodEnum<{
|
|
123
|
+
easy: "easy";
|
|
124
|
+
medium: "medium";
|
|
125
|
+
hard: "hard";
|
|
126
|
+
}>>;
|
|
127
|
+
category: z.ZodOptional<z.ZodString>;
|
|
128
|
+
repeat: z.ZodOptional<z.ZodNumber>;
|
|
129
|
+
}, z.core.$strip>>;
|
|
130
|
+
}, z.core.$strip>;
|
|
131
|
+
/** Schema for a task file referenced via file:// — plain array of tasks. */
|
|
132
|
+
export declare const TaskFileSchema: z.ZodArray<z.ZodObject<{
|
|
133
|
+
id: z.ZodString;
|
|
134
|
+
intent: z.ZodString;
|
|
135
|
+
assert: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
136
|
+
output_contains: z.ZodString;
|
|
137
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
138
|
+
output_equals: z.ZodString;
|
|
139
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
140
|
+
error_contains: z.ZodString;
|
|
141
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
142
|
+
exit_code: z.ZodNumber;
|
|
143
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
144
|
+
file_exists: z.ZodString;
|
|
145
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
146
|
+
file_contains: z.ZodObject<{
|
|
147
|
+
path: z.ZodString;
|
|
148
|
+
text: z.ZodString;
|
|
149
|
+
}, z.core.$strip>;
|
|
150
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
151
|
+
ran: z.ZodString;
|
|
152
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
153
|
+
not_ran: z.ZodString;
|
|
154
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
155
|
+
run_count: z.ZodObject<{
|
|
156
|
+
pattern: z.ZodString;
|
|
157
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
158
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
159
|
+
}, z.core.$strip>;
|
|
160
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
161
|
+
verify: z.ZodObject<{
|
|
162
|
+
run: z.ZodString;
|
|
163
|
+
output_contains: z.ZodOptional<z.ZodString>;
|
|
164
|
+
output_equals: z.ZodOptional<z.ZodString>;
|
|
165
|
+
}, z.core.$strip>;
|
|
166
|
+
}, z.core.$strip>]>>;
|
|
167
|
+
setup: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
|
|
168
|
+
max_turns: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
169
|
+
difficulty: z.ZodOptional<z.ZodEnum<{
|
|
170
|
+
easy: "easy";
|
|
171
|
+
medium: "medium";
|
|
172
|
+
hard: "hard";
|
|
173
|
+
}>>;
|
|
174
|
+
category: z.ZodOptional<z.ZodString>;
|
|
175
|
+
repeat: z.ZodOptional<z.ZodNumber>;
|
|
176
|
+
}, z.core.$strip>>;
|
|
177
|
+
export declare const ThresholdsSchema: z.ZodOptional<z.ZodObject<{
|
|
178
|
+
default: z.ZodOptional<z.ZodNumber>;
|
|
179
|
+
models: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
180
|
+
tolerance: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
181
|
+
behavior: z.ZodDefault<z.ZodOptional<z.ZodEnum<{
|
|
182
|
+
error: "error";
|
|
183
|
+
informational: "informational";
|
|
184
|
+
}>>>;
|
|
185
|
+
}, z.core.$strip>>;
|
|
186
|
+
/** Schema for the top-level cli-bench.yaml config file. */
|
|
187
|
+
export declare const ConfigFileSchema: z.ZodObject<{
|
|
188
|
+
cli: z.ZodString;
|
|
189
|
+
version_command: z.ZodOptional<z.ZodString>;
|
|
190
|
+
providers: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
191
|
+
help_modes: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
192
|
+
concurrency: z.ZodOptional<z.ZodNumber>;
|
|
193
|
+
workdir: z.ZodOptional<z.ZodString>;
|
|
194
|
+
upload: z.ZodOptional<z.ZodEnum<{
|
|
195
|
+
auto: "auto";
|
|
196
|
+
always: "always";
|
|
197
|
+
never: "never";
|
|
198
|
+
}>>;
|
|
199
|
+
backend_url: z.ZodOptional<z.ZodString>;
|
|
200
|
+
repeat: z.ZodOptional<z.ZodNumber>;
|
|
201
|
+
thresholds: z.ZodOptional<z.ZodObject<{
|
|
202
|
+
default: z.ZodOptional<z.ZodNumber>;
|
|
203
|
+
models: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
204
|
+
tolerance: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
205
|
+
behavior: z.ZodDefault<z.ZodOptional<z.ZodEnum<{
|
|
206
|
+
error: "error";
|
|
207
|
+
informational: "informational";
|
|
208
|
+
}>>>;
|
|
209
|
+
}, z.core.$strip>>;
|
|
210
|
+
tasks: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
211
|
+
id: z.ZodString;
|
|
212
|
+
intent: z.ZodString;
|
|
213
|
+
assert: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
214
|
+
output_contains: z.ZodString;
|
|
215
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
216
|
+
output_equals: z.ZodString;
|
|
217
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
218
|
+
error_contains: z.ZodString;
|
|
219
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
220
|
+
exit_code: z.ZodNumber;
|
|
221
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
222
|
+
file_exists: z.ZodString;
|
|
223
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
224
|
+
file_contains: z.ZodObject<{
|
|
225
|
+
path: z.ZodString;
|
|
226
|
+
text: z.ZodString;
|
|
227
|
+
}, z.core.$strip>;
|
|
228
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
229
|
+
ran: z.ZodString;
|
|
230
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
231
|
+
not_ran: z.ZodString;
|
|
232
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
233
|
+
run_count: z.ZodObject<{
|
|
234
|
+
pattern: z.ZodString;
|
|
235
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
236
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
237
|
+
}, z.core.$strip>;
|
|
238
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
239
|
+
verify: z.ZodObject<{
|
|
240
|
+
run: z.ZodString;
|
|
241
|
+
output_contains: z.ZodOptional<z.ZodString>;
|
|
242
|
+
output_equals: z.ZodOptional<z.ZodString>;
|
|
243
|
+
}, z.core.$strip>;
|
|
244
|
+
}, z.core.$strip>]>>;
|
|
245
|
+
setup: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
|
|
246
|
+
max_turns: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
247
|
+
difficulty: z.ZodOptional<z.ZodEnum<{
|
|
248
|
+
easy: "easy";
|
|
249
|
+
medium: "medium";
|
|
250
|
+
hard: "hard";
|
|
251
|
+
}>>;
|
|
252
|
+
category: z.ZodOptional<z.ZodString>;
|
|
253
|
+
repeat: z.ZodOptional<z.ZodNumber>;
|
|
254
|
+
}, z.core.$strip>, z.ZodString]>>;
|
|
255
|
+
}, z.core.$strip>;
|
|
256
|
+
//# sourceMappingURL=schemas.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"schemas.d.ts","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,eAAO,MAAM,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;mBAW1B,CAAC;AAEH,eAAO,MAAM,UAAU;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBASrB,CAAC;AAEH,eAAO,MAAM,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAI1B,CAAC;AAEH,4EAA4E;AAC5E,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kBAA6B,CAAC;AAEzD,eAAO,MAAM,gBAAgB;;;;;;;;kBAKhB,CAAC;AAEd,2DAA2D;AAC3D,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAmB3B,CAAC"}
|
package/dist/schemas.js
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Zod schemas for runtime validation of YAML config and task files.
|
|
3
|
+
*/
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
// Argument shapes for the assertion kinds whose value is an object rather than a scalar.
const fileContainsArgs = z.object({
    path: z.string(),
    text: z.string(),
});
const runCountArgs = z.object({
    pattern: z.string(),
    min: z.number().optional(),
    max: z.number().optional(),
});
const verifyArgs = z.object({
    run: z.string(),
    output_contains: z.string().optional(),
    output_equals: z.string().optional(),
});
/**
 * A single assertion entry: an object keyed by the assertion kind.
 *
 * The union holds one object schema per kind, in this order:
 * output_contains, output_equals, error_contains, exit_code, file_exists,
 * file_contains, ran, not_ran, run_count, verify.
 * Scalar kinds take a string (a number for `exit_code`); `file_contains`,
 * `run_count` and `verify` take the object shapes declared above.
 */
export const AssertionSchema = z.union([
    z.object({ output_contains: z.string() }),
    z.object({ output_equals: z.string() }),
    z.object({ error_contains: z.string() }),
    z.object({ exit_code: z.number() }),
    z.object({ file_exists: z.string() }),
    z.object({ file_contains: fileContainsArgs }),
    z.object({ ran: z.string() }),
    z.object({ not_ran: z.string() }),
    z.object({ run_count: runCountArgs }),
    z.object({ verify: verifyArgs }),
]);
|
|
17
|
+
// Field validators for a task, named so the object shape below reads as a table.
const taskAssertions = z.array(AssertionSchema).min(1);
const taskSetup = z.array(z.string()).optional().default([]);
const taskMaxTurns = z.number().int().min(1).max(20).optional().default(5);
const taskDifficulty = z.enum(['easy', 'medium', 'hard']).optional();
const taskRepeat = z.number().int().min(1).max(100).optional();
/**
 * One benchmark task.
 *
 * Required: `id`, `intent`, and at least one entry in `assert`.
 * Defaults: `setup` falls back to an empty list, `max_turns` to 5
 * (integer, 1–20). `difficulty`, `category` and `repeat` (integer, 1–100)
 * are optional with no default.
 */
export const TaskSchema = z.object({
    id: z.string(),
    intent: z.string(),
    assert: taskAssertions,
    setup: taskSetup,
    max_turns: taskMaxTurns,
    difficulty: taskDifficulty,
    category: z.string().optional(),
    repeat: taskRepeat,
});
|
|
27
|
+
// A suite must carry at least one task.
const suiteTasks = z.array(TaskSchema).min(1);
/**
 * A task suite document: the CLI name, an optional version command,
 * and a non-empty list of tasks.
 */
export const TaskSuiteSchema = z.object({
    cli: z.string(),
    version_command: z.string().optional(),
    tasks: suiteTasks,
});
|
|
32
|
+
// Element schema first, then the non-empty constraint.
const taskFileEntries = z.array(TaskSchema);
/**
 * Schema for a task file referenced via file:// — the document is a bare
 * array of tasks (no wrapping object) and must contain at least one task.
 */
export const TaskFileSchema = taskFileEntries.min(1);
|
|
34
|
+
// Builds a fresh number validator bounded to 0..100 for each threshold field.
const thresholdPercent = () => z.number().min(0).max(100);
/**
 * Optional `thresholds` section.
 *
 * - `default`: optional number in 0..100.
 * - `models`: optional map from a string key to a number in 0..100.
 * - `tolerance`: number in 0..100, defaults to 0.
 * - `behavior`: 'error' or 'informational', defaults to 'error'.
 *
 * The whole object may be omitted.
 */
export const ThresholdsSchema = z
    .object({
        default: thresholdPercent().optional(),
        models: z.record(z.string(), thresholdPercent()).optional(),
        tolerance: thresholdPercent().optional().default(0),
        behavior: z.enum(['error', 'informational']).optional().default('error'),
    })
    .optional();
|
|
40
|
+
// A task entry given as a string must be a file:// reference.
const taskFileReference = z.string().refine((ref) => ref.startsWith('file://'), {
    message: 'Task references must start with file://',
});
// Each entry of `tasks` is either an inline task or a file:// reference string.
const configTaskEntry = z.union([TaskSchema, taskFileReference]);
/**
 * Schema for the top-level cli-bench.yaml config file.
 *
 * Only `cli` and a non-empty `tasks` list are required; every other key is
 * optional. `concurrency` must be an integer >= 1 and `repeat` an integer
 * in 1..100. `thresholds` is validated by ThresholdsSchema.
 */
export const ConfigFileSchema = z.object({
    cli: z.string(),
    version_command: z.string().optional(),
    providers: z.array(z.string()).optional(),
    help_modes: z.array(z.string()).optional(),
    concurrency: z.number().int().min(1).optional(),
    workdir: z.string().optional(),
    upload: z.enum(['auto', 'always', 'never']).optional(),
    backend_url: z.string().optional(),
    repeat: z.number().int().min(1).max(100).optional(),
    thresholds: ThresholdsSchema,
    tasks: z.array(configTaskEntry).min(1),
});
|
|
59
|
+
//# sourceMappingURL=schemas.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"schemas.js","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACzC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACvC,CAAC,CAAC,MAAM,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACxC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACnC,CAAC,CAAC,MAAM,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;IAC7E,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IAC7B,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACjC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;IAClH,CAAC,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC;IACjD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAChE,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,QAAQ,EAAE;IACzD,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;CACpD,
CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CAClC,CAAC,CAAC;AAEH,4EAA4E;AAC5E,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;AAEzD,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IAC9C,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,EAAE;IACnE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAC3D,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,OAAO,CAAC;CACzE,CAAC,CAAC,QAAQ,EAAE,CAAC;AAEd,2DAA2D;AAC3D,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACzC,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC1C,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;IAC/C,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC9B,MAAM,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE;IACtD,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IACnD,UAAU,EAAE,gBAAgB;IAC5B,KAAK,EAAE,CAAC,CAAC,KAAK,CACZ,CAAC,CAAC,KAAK,CAAC;QACN,UAAU;QACV,CAAC,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE;YAChD,OAAO,EAAE,yCAAyC;SACnD,CAAC;KACH,CAAC,CACH,CAAC,GAAG,CAAC,CAAC,CAAC;CACT,CAAC,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generates task suite YAML files for a given CLI using an LLM.
|
|
3
|
+
*
|
|
4
|
+
* Takes CLI name + help text and produces a task suite with
|
|
5
|
+
* assert-based validation.
|
|
6
|
+
*/
|
|
7
|
+
export declare function generateSuite(cliName: string, helpCacheDir: string): Promise<string>;
|
|
8
|
+
//# sourceMappingURL=suite-generator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"suite-generator.d.ts","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAwCH,wBAAsB,aAAa,CACjC,OAAO,EAAE,MAAM,EACf,YAAY,EAAE,MAAM,GACnB,OAAO,CAAC,MAAM,CAAC,CAgEjB"}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generates task suite YAML files for a given CLI using an LLM.
|
|
3
|
+
*
|
|
4
|
+
* Takes CLI name + help text and produces a task suite with
|
|
5
|
+
* assert-based validation.
|
|
6
|
+
*/
|
|
7
|
+
import { readFile } from 'node:fs/promises';
|
|
8
|
+
import { join } from 'node:path';
|
|
9
|
+
import { generateText } from 'ai';
|
|
10
|
+
import { gateway } from 'ai';
|
|
11
|
+
import { loadHelpFromCache, loadHelpLive } from './help-loader.js';
|
|
12
|
+
/** Model identifier passed to the gateway when generating a suite. */
const GENERATOR_MODEL = 'anthropic/claude-sonnet-4-20250514';
/**
 * System prompt for suite generation, kept as one entry per output line so
 * individual rules are easy to diff. Empty entries are the blank lines that
 * separate the sections (task format, assertion types, rules).
 */
const SYSTEM_PROMPT = [
    'You are a CLI test suite generator. Given help text for a CLI tool and example task suites, generate a YAML task suite.',
    '',
    'Task format:',
    '- id: kebab-case identifier',
    '- intent: natural language description of what the agent should do',
    '- assert: array of assertions to validate the result',
    '- setup: (optional) commands to run before the task',
    '- max_turns: (optional) max tool calls, default 5',
    '- difficulty: easy | medium | hard',
    '- category: query | crud | output | config | workflow | auth',
    '',
    'Assertion types:',
    '- ran: regex pattern matched against commands the agent executed',
    '- not_ran: regex pattern that must NOT match any executed command',
    '- run_count: { pattern, min?, max? } — count regex matches',
    '- output_contains: string that must appear in last command stdout',
    '- output_equals: exact match on last command stdout',
    '- error_contains: string that must appear in last command stderr',
    '- exit_code: expected exit code of last command',
    '- file_exists: path that must exist after execution',
    '- file_contains: { path, text } — file must contain text',
    '- verify: { run, output_contains?, output_equals? } — run command to check state',
    '',
    'Rules:',
    '- Generate 10-15 tasks: 5 easy, 5 medium, 3-5 hard',
    '- Every task MUST be executable without authentication or network access',
    '- Focus on: local operations, file generation, formatting, config, help queries',
    '- Include setup commands to prepare the environment',
    '- Use realistic but safe values (no real credentials, no destructive operations)',
].join('\n');
|
|
42
|
+
/**
 * Remove one wrapping Markdown code fence from model output.
 *
 * Only acts when the text opens with a fence. The opening fence line may
 * carry any info string and may end in LF or CRLF.
 *
 * @param {string} text - Model output, already trimmed.
 * @returns {string} The text without the opening and closing fence lines.
 */
function stripMarkdownFence(text) {
    if (!text.startsWith('```')) {
        return text;
    }
    return text.replace(/^```[^\r\n]*\r?\n/, '').replace(/\r?\n```$/, '');
}
/**
 * Concatenate help texts into one prompt section, bounded in size.
 *
 * Each entry is cut to 1500 characters; entries stop being added once the
 * summary has grown past 15000 characters (the entry that crosses the limit
 * is kept).
 *
 * @param {string} cliName - CLI name used in each section header.
 * @param {Record<string, string>} helpTexts - Help text keyed by subcommand;
 *   an empty key is labelled "(root)".
 * @returns {string} The assembled summary.
 */
function buildHelpSummary(cliName, helpTexts) {
    const maxEntryChars = 1500;
    const maxSummaryChars = 15000;
    let summary = '';
    for (const [key, text] of Object.entries(helpTexts)) {
        const label = key || '(root)';
        summary += `\n--- ${cliName} ${label} --help ---\n${text.slice(0, maxEntryChars)}\n`;
        if (summary.length > maxSummaryChars) {
            break;
        }
    }
    return summary;
}
/**
 * Generate a task suite (YAML text) for a CLI by prompting an LLM with the
 * CLI's help text and the bundled docker.yaml suite as an example.
 *
 * @param {string} cliName - Name of the CLI to generate a suite for.
 * @param {string} helpCacheDir - Directory passed to loadHelpFromCache.
 * @returns {Promise<string>} The model output with any wrapping Markdown
 *   fence removed.
 * @throws {Error} "No help text available for <cli>" when neither the cache
 *   nor the live loader yields help texts; a live-loader failure is attached
 *   as `cause`.
 */
export async function generateSuite(cliName, helpCacheDir) {
    const noHelpMessage = `No help text available for ${cliName}`;
    // Load help text: cache first, live lookup as fallback.
    let helpCache = await loadHelpFromCache(helpCacheDir, cliName);
    if (!helpCache) {
        try {
            helpCache = await loadHelpLive(cliName);
        }
        catch (err) {
            // Keep the underlying failure reachable for debugging.
            throw new Error(noHelpMessage, { cause: err });
        }
    }
    if (!helpCache?.help_texts) {
        // Fail with the documented error instead of a TypeError further down.
        throw new Error(noHelpMessage);
    }
    // Locate the bundled example suites. fileURLToPath decodes percent-escapes
    // and handles Windows drive letters, which URL.pathname does not.
    const { fileURLToPath } = await import('node:url');
    const moduleDirUrl = new URL('.', import.meta.url);
    const packageRootUrl = /\/(?:src|dist)\/$/.test(moduleDirUrl.pathname)
        ? new URL('..', moduleDirUrl)
        : moduleDirUrl;
    const suiteDir = join(fileURLToPath(packageRootUrl), 'task_suites');
    // The few-shot example is best-effort: generation proceeds without it.
    let dockerExample = '';
    try {
        dockerExample = await readFile(join(suiteDir, 'docker.yaml'), 'utf-8');
    }
    catch { /* ignore */ }
    // Build help text summary (truncated to fit context)
    const helpSummary = buildHelpSummary(cliName, helpCache.help_texts);
    const prompt = `Generate a task suite YAML for the "${cliName}" CLI.

## Help Text

${helpSummary}

## Example Task Suite

### docker.yaml
\`\`\`yaml
${dockerExample}
\`\`\`

## Output

Generate ONLY the YAML content (no markdown fences). Start with \`cli: ${cliName}\`.`;
    const result = await generateText({
        model: gateway(GENERATOR_MODEL),
        system: SYSTEM_PROMPT,
        prompt,
        temperature: 0.3,
        maxOutputTokens: 4096,
    });
    // Clean up: remove any markdown fences the model might add
    return stripMarkdownFence(result.text.trim());
}
|
|
100
|
+
//# sourceMappingURL=suite-generator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"suite-generator.js","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAC7B,OAAO,EAAE,iBAAiB,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAEnE,MAAM,eAAe,GAAG,oCAAoC,CAAC;AAE7D,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;iFA4B2D,CAAC;AAElF,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAe,EACf,YAAoB;IAEpB,iBAAiB;IACjB,IAAI,SAAS,GAAG,MAAM,iBAAiB,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAC/D,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,IAAI,CAAC;YACH,SAAS,GAAG,MAAM,YAAY,CAAC,OAAO,CAAC,CAAC;QAC1C,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,IAAI,KAAK,CAAC,8BAA8B,OAAO,EAAE,CAAC,CAAC;QAC3D,CAAC;IACH,CAAC;IAED,kCAAkC;IAClC,MAAM,QAAQ,GAAG,IAAI,CACnB,IAAI,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,EACvF,aAAa,CACd,CAAC;IAEF,IAAI,aAAa,GAAG,EAAE,CAAC;IACvB,IAAI,CAAC;QACH,aAAa,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,EAAE,OAAO,CAAC,CAAC;IACzE,CAAC;IAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;IAExB,oDAAoD;IACpD,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC;IACzD,IAAI,WAAW,GAAG,EAAE,CAAC;IACrB,KAAK,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,WAAW,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,GAAG,IAAI,QAAQ,CAAC;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;QACtC,WAAW,IAAI,SAAS,OAAO,IAAI,KAAK,gBAAgB,SAAS,IAAI,CAAC;QACtE,IAAI,WAAW,CAAC,MAAM,GAAG,KAAK;YAAE,MAAM;IACxC,CAAC;IAED,MAAM,MAAM,GAAG,uCAAuC,OAAO;;;;EAI7D,WAAW;;;;;;EAMX,aAAa;;;;;yEAK0D,OAAO,KAAK,CAAC;IAEpF,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC;QAChC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC;QAC/B,MAAM,EAAE,aAAa;QACrB,MAAM;QACN,WAAW,EAAE,GAAG;QAChB,eAAe,EAAE,IAAI;KACtB,CAAC,CAAC;IAEH,2DAA2D;IAC3D,IAAI,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;IAC9B,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IAC7D
,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Threshold checking for CLI bench results.
|
|
3
|
+
*
|
|
4
|
+
* Compares actual model pass rates against configured thresholds
|
|
5
|
+
* with optional tolerance (wiggle room) and per-model overrides.
|
|
6
|
+
*/
|
|
7
|
+
import type { ModelResult, ThresholdsConfig, ThresholdCheckResult } from './models.js';
|
|
8
|
+
/**
 * Checks each model result against its configured minimum pass rate.
 *
 * The threshold for a model is looked up under `provider/modelId`, then under
 * the bare `modelId`, then falls back to the configured default; models with
 * no threshold are left out of the returned results.
 *
 * @param modelResults - Per-model results to evaluate.
 * @param thresholds - Threshold configuration (default, per-model overrides, tolerance, behavior).
 * @returns The per-model outcomes, an overall `allPassed` flag, and the behavior setting.
 */
export declare function checkThresholds(modelResults: ModelResult[], thresholds: ThresholdsConfig): ThresholdCheckResult;
|
|
9
|
+
/**
 * Logs a threshold report to the console: a header, one line per checked
 * model, and a summary line. Logs nothing when `check.results` is empty.
 *
 * @param check - The result object produced by `checkThresholds`.
 */
export declare function printThresholdResults(check: ThresholdCheckResult): void;
|
|
10
|
+
//# sourceMappingURL=thresholds.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"thresholds.d.ts","sourceRoot":"","sources":["../src/thresholds.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EACV,WAAW,EACX,gBAAgB,EAEhB,oBAAoB,EACrB,MAAM,aAAa,CAAC;AAErB,wBAAgB,eAAe,CAC7B,YAAY,EAAE,WAAW,EAAE,EAC3B,UAAU,EAAE,gBAAgB,GAC3B,oBAAoB,CAoCtB;AAED,wBAAgB,qBAAqB,CAAC,KAAK,EAAE,oBAAoB,GAAG,IAAI,CAoBvE"}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Threshold checking for CLI bench results.
|
|
3
|
+
*
|
|
4
|
+
* Compares actual model pass rates against configured thresholds
|
|
5
|
+
* with optional tolerance (wiggle room) and per-model overrides.
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Check every model's pass rate against its configured minimum.
 *
 * @param {Array<{provider: string, modelId: string, displayName: string, passRate: number}>} modelResults
 *   Per-model results; `passRate` is a fraction that is multiplied by 100 here.
 * @param {{default?: number, models?: Record<string, number>, tolerance?: number, behavior?: string}} thresholds
 *   Threshold configuration. `tolerance` is subtracted from the threshold
 *   (defaults to 0) and `behavior` defaults to `'error'`.
 * @returns {{allPassed: boolean, results: Array<object>, behavior: string}}
 *   One entry per model that has a threshold, plus the overall verdict.
 */
export function checkThresholds(modelResults, thresholds) {
    const tolerance = thresholds.tolerance ?? 0;
    const behavior = thresholds.behavior ?? 'error';
    const results = [];
    for (const mr of modelResults) {
        // Look up threshold: per-model ("provider/modelId", then "modelId") → default → skip
        const modelKey = `${mr.provider}/${mr.modelId}`;
        const threshold = thresholds.models?.[modelKey]
            ?? thresholds.models?.[mr.modelId]
            ?? thresholds.default;
        // `== null` also skips an explicit null default, which would otherwise
        // be coerced to a threshold of 0 and reported as "threshold null".
        if (threshold == null) {
            continue; // no threshold configured for this model
        }
        // Round to two decimals BEFORE comparing. Binary floating point makes
        // e.g. 0.57 * 100 evaluate to 56.99999999999999, which would fail a
        // threshold of 57 even though the reported rate is exactly 57.
        const passRate = Math.round(mr.passRate * 100 * 100) / 100;
        // Tolerance lowers the bar, but never below 0%.
        const effectiveMin = Math.max(0, threshold - tolerance);
        const passed = passRate >= effectiveMin;
        results.push({
            model: mr.displayName,
            passRate,
            threshold,
            tolerance,
            effectiveMin,
            passed,
        });
    }
    return {
        // Vacuously true when no model had a threshold configured.
        allPassed: results.every((r) => r.passed),
        results,
        behavior,
    };
}
|
|
38
|
+
/**
 * Write a threshold report to the console.
 *
 * Emits a header, one line per checked model, and a closing summary. Nothing
 * is printed when there are no results.
 *
 * @param {{results: Array<object>, behavior: string}} check
 *   Result object as produced by `checkThresholds`.
 * @returns {void}
 */
export function printThresholdResults(check) {
    const entries = check.results;
    if (entries.length === 0) {
        return;
    }
    console.log('\n=== Threshold Check ===');
    let belowCount = 0;
    entries.forEach((entry) => {
        let mark = '✗';
        let relation = '<';
        if (entry.passed) {
            mark = '✓';
            relation = '>=';
        }
        else {
            belowCount += 1;
        }
        // Tolerance is only mentioned when it actually lowered the bar.
        const toleranceNote = entry.tolerance > 0 ? `, tolerance ${entry.tolerance}%` : '';
        console.log(` ${mark} ${entry.model}: ${entry.passRate}% ${relation} ${entry.effectiveMin}% (threshold ${entry.threshold}%${toleranceNote})`);
    });
    if (belowCount === 0) {
        console.log('All models meet thresholds');
        return;
    }
    // In informational mode a miss is reported as a warning, not a failure.
    const severity = check.behavior === 'informational' ? 'WARNING' : 'FAILED';
    console.log(`${severity}: ${belowCount} model(s) below threshold`);
}
|
|
57
|
+
//# sourceMappingURL=thresholds.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"thresholds.js","sourceRoot":"","sources":["../src/thresholds.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AASH,MAAM,UAAU,eAAe,CAC7B,YAA2B,EAC3B,UAA4B;IAE5B,MAAM,SAAS,GAAG,UAAU,CAAC,SAAS,IAAI,CAAC,CAAC;IAC5C,MAAM,QAAQ,GAAG,UAAU,CAAC,QAAQ,IAAI,OAAO,CAAC;IAEhD,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,EAAE,IAAI,YAAY,EAAE,CAAC;QAC9B,gDAAgD;QAChD,MAAM,QAAQ,GAAG,GAAG,EAAE,CAAC,QAAQ,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC;QAChD,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC;eAC1C,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,CAAC,OAAO,CAAC;eAC/B,UAAU,CAAC,OAAO,CAAC;QAExB,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;YAC5B,SAAS,CAAC,yCAAyC;QACrD,CAAC;QAED,MAAM,QAAQ,GAAG,EAAE,CAAC,QAAQ,GAAG,GAAG,CAAC;QACnC,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,GAAG,SAAS,CAAC,CAAC;QACxD,MAAM,MAAM,GAAG,QAAQ,IAAI,YAAY,CAAC;QAExC,OAAO,CAAC,IAAI,CAAC;YACX,KAAK,EAAE,EAAE,CAAC,WAAW;YACrB,QAAQ,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,GAAG,GAAG,CAAC,GAAG,GAAG;YAC1C,SAAS;YACT,SAAS;YACT,YAAY;YACZ,MAAM;SACP,CAAC,CAAC;IACL,CAAC;IAED,OAAO;QACL,SAAS,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;QACzC,OAAO;QACP,QAAQ;KACT,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,KAA2B;IAC/D,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO;IAEvC,OAAO,CAAC,GAAG,CAAC,2BAA2B,CAAC,CAAC;IACzC,KAAK,MAAM,CAAC,IAAI,KAAK,CAAC,OAAO,EAAE,CAAC;QAC9B,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;QAClC,MAAM,GAAG,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC;QAClC,MAAM,MAAM,GAAG,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;QACpE,OAAO,CAAC,GAAG,CACT,KAAK,IAAI,IAAI,CAAC,CAAC,KAAK,KAAK,CAAC,CAAC,QAAQ,KAAK,GAAG,IAAI,CAAC,CAAC,YAAY,gBAAgB,CAAC,CAAC,SAAS,IAAI,MAAM,GAAG,CACtG,CAAC;IACJ,CAAC;IAED,MAAM,SAAS,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IAChE,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;QAClB,MAAM,KAAK,GAAG,KAAK,CAAC,QAAQ,KAAK,eAAe,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC;QACxE,OAAO,CAAC,GAAG,C
AAC,GAAG,KAAK,KAAK,SAAS,2BAA2B,CAAC,CAAC;IACjE,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,GAAG,CAAC,4BAA4B,CAAC,CAAC;IAC5C,CAAC;AACH,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@cliwatch/cli-bench",
|
|
3
|
+
"version": "0.4.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"bin": {
|
|
6
|
+
"cli-bench": "./dist/index.js"
|
|
7
|
+
},
|
|
8
|
+
"files": [
|
|
9
|
+
"dist",
|
|
10
|
+
"task_suites"
|
|
11
|
+
],
|
|
12
|
+
"exports": {
|
|
13
|
+
".": {
|
|
14
|
+
"types": "./dist/index.d.ts",
|
|
15
|
+
"import": "./dist/index.js"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"publishConfig": {
|
|
19
|
+
"access": "public"
|
|
20
|
+
},
|
|
21
|
+
"scripts": {
|
|
22
|
+
"generate-client": "npx @hey-api/openapi-ts",
|
|
23
|
+
"build": "tsc",
|
|
24
|
+
"start": "node dist/index.js",
|
|
25
|
+
"dev": "tsx src/index.ts",
|
|
26
|
+
"test": "vitest"
|
|
27
|
+
},
|
|
28
|
+
"dependencies": {
|
|
29
|
+
"@hey-api/client-fetch": "^0.13.1",
|
|
30
|
+
"ai": "^6.0.18",
|
|
31
|
+
"yaml": "^2.7.0",
|
|
32
|
+
"zod": "^4.1.12"
|
|
33
|
+
},
|
|
34
|
+
"devDependencies": {
|
|
35
|
+
"@hey-api/openapi-ts": "^0.91.1",
|
|
36
|
+
"@types/node": "^22.0.0",
|
|
37
|
+
"tsx": "^4.19.0",
|
|
38
|
+
"typescript": "^5.8.0",
|
|
39
|
+
"vitest": "^3.2.0"
|
|
40
|
+
}
|
|
41
|
+
}
|