@cliwatch/cli-bench 0.6.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/README.md +3 -0
- package/dist/assertions.d.ts +1 -1
- package/dist/assertions.d.ts.map +1 -1
- package/dist/assertions.js +6 -6
- package/dist/assertions.js.map +1 -1
- package/dist/client/index.d.ts +1 -1
- package/dist/client/index.d.ts.map +1 -1
- package/dist/client/types.gen.d.ts +124 -86
- package/dist/client/types.gen.d.ts.map +1 -1
- package/dist/client/zod.gen.d.ts +57 -36
- package/dist/client/zod.gen.d.ts.map +1 -1
- package/dist/client/zod.gen.js +84 -52
- package/dist/client/zod.gen.js.map +1 -1
- package/dist/config.d.ts +2 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +9 -0
- package/dist/config.js.map +1 -1
- package/dist/exec.d.ts +2 -0
- package/dist/exec.d.ts.map +1 -1
- package/dist/exec.js +6 -2
- package/dist/exec.js.map +1 -1
- package/dist/github-comment.d.ts +16 -0
- package/dist/github-comment.d.ts.map +1 -0
- package/dist/github-comment.js +90 -0
- package/dist/github-comment.js.map +1 -0
- package/dist/index.d.ts +2 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +29 -27
- package/dist/index.js.map +1 -1
- package/dist/models.d.ts +8 -0
- package/dist/models.d.ts.map +1 -1
- package/dist/project.d.ts +11 -2
- package/dist/project.d.ts.map +1 -1
- package/dist/project.js +22 -8
- package/dist/project.js.map +1 -1
- package/dist/providers.d.ts +9 -7
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +26 -8
- package/dist/providers.js.map +1 -1
- package/dist/redact.d.ts +20 -6
- package/dist/redact.d.ts.map +1 -1
- package/dist/redact.js +68 -13
- package/dist/redact.js.map +1 -1
- package/dist/runner.d.ts +31 -1
- package/dist/runner.d.ts.map +1 -1
- package/dist/runner.js +84 -78
- package/dist/runner.js.map +1 -1
- package/dist/schemas.d.ts +16 -0
- package/dist/schemas.d.ts.map +1 -1
- package/dist/schemas.js +7 -0
- package/dist/schemas.js.map +1 -1
- package/dist/suite-generator.d.ts.map +1 -1
- package/dist/suite-generator.js +63 -11
- package/dist/suite-generator.js.map +1 -1
- package/package.json +2 -2
- package/task_suites/curl.yaml +0 -138
- package/task_suites/docker.yaml +0 -163
- package/task_suites/gh.yaml +0 -118
- package/task_suites/jq.yaml +0 -172
- package/task_suites/kubectl.yaml +0 -74
package/dist/schemas.d.ts
CHANGED
|
@@ -70,6 +70,8 @@ export declare const TaskSchema: z.ZodObject<{
|
|
|
70
70
|
}, z.core.$strip>;
|
|
71
71
|
}, z.core.$strip>]>>;
|
|
72
72
|
setup: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
|
|
73
|
+
cleanup: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
74
|
+
env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
73
75
|
max_turns: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
74
76
|
difficulty: z.ZodOptional<z.ZodEnum<{
|
|
75
77
|
easy: "easy";
|
|
@@ -78,6 +80,7 @@ export declare const TaskSchema: z.ZodObject<{
|
|
|
78
80
|
}>>;
|
|
79
81
|
category: z.ZodOptional<z.ZodString>;
|
|
80
82
|
repeat: z.ZodOptional<z.ZodNumber>;
|
|
83
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
81
84
|
}, z.core.$strip>;
|
|
82
85
|
export declare const TaskSuiteSchema: z.ZodObject<{
|
|
83
86
|
cli: z.ZodString;
|
|
@@ -122,6 +125,8 @@ export declare const TaskSuiteSchema: z.ZodObject<{
|
|
|
122
125
|
}, z.core.$strip>;
|
|
123
126
|
}, z.core.$strip>]>>;
|
|
124
127
|
setup: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
|
|
128
|
+
cleanup: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
129
|
+
env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
125
130
|
max_turns: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
126
131
|
difficulty: z.ZodOptional<z.ZodEnum<{
|
|
127
132
|
easy: "easy";
|
|
@@ -130,6 +135,7 @@ export declare const TaskSuiteSchema: z.ZodObject<{
|
|
|
130
135
|
}>>;
|
|
131
136
|
category: z.ZodOptional<z.ZodString>;
|
|
132
137
|
repeat: z.ZodOptional<z.ZodNumber>;
|
|
138
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
133
139
|
}, z.core.$strip>>;
|
|
134
140
|
}, z.core.$strip>;
|
|
135
141
|
/** Schema for a task file referenced via file:// — plain array of tasks. */
|
|
@@ -169,6 +175,8 @@ export declare const TaskFileSchema: z.ZodArray<z.ZodObject<{
|
|
|
169
175
|
}, z.core.$strip>;
|
|
170
176
|
}, z.core.$strip>]>>;
|
|
171
177
|
setup: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
|
|
178
|
+
cleanup: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
179
|
+
env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
172
180
|
max_turns: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
173
181
|
difficulty: z.ZodOptional<z.ZodEnum<{
|
|
174
182
|
easy: "easy";
|
|
@@ -177,6 +185,7 @@ export declare const TaskFileSchema: z.ZodArray<z.ZodObject<{
|
|
|
177
185
|
}>>;
|
|
178
186
|
category: z.ZodOptional<z.ZodString>;
|
|
179
187
|
repeat: z.ZodOptional<z.ZodNumber>;
|
|
188
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
180
189
|
}, z.core.$strip>>;
|
|
181
190
|
export declare const ThresholdsSchema: z.ZodOptional<z.ZodObject<{
|
|
182
191
|
default: z.ZodOptional<z.ZodNumber>;
|
|
@@ -208,6 +217,7 @@ export declare const ConfigFileSchema: z.ZodObject<{
|
|
|
208
217
|
backend_url: z.ZodOptional<z.ZodString>;
|
|
209
218
|
repeat: z.ZodOptional<z.ZodNumber>;
|
|
210
219
|
redact_env: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
220
|
+
redact_patterns: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
211
221
|
thresholds: z.ZodOptional<z.ZodObject<{
|
|
212
222
|
default: z.ZodOptional<z.ZodNumber>;
|
|
213
223
|
models: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
@@ -217,6 +227,9 @@ export declare const ConfigFileSchema: z.ZodObject<{
|
|
|
217
227
|
informational: "informational";
|
|
218
228
|
}>>>;
|
|
219
229
|
}, z.core.$strip>>;
|
|
230
|
+
env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
231
|
+
setup: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
232
|
+
cleanup: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
220
233
|
tasks: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
221
234
|
id: z.ZodString;
|
|
222
235
|
intent: z.ZodString;
|
|
@@ -253,6 +266,8 @@ export declare const ConfigFileSchema: z.ZodObject<{
|
|
|
253
266
|
}, z.core.$strip>;
|
|
254
267
|
}, z.core.$strip>]>>;
|
|
255
268
|
setup: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
|
|
269
|
+
cleanup: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
270
|
+
env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
256
271
|
max_turns: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
257
272
|
difficulty: z.ZodOptional<z.ZodEnum<{
|
|
258
273
|
easy: "easy";
|
|
@@ -261,6 +276,7 @@ export declare const ConfigFileSchema: z.ZodObject<{
|
|
|
261
276
|
}>>;
|
|
262
277
|
category: z.ZodOptional<z.ZodString>;
|
|
263
278
|
repeat: z.ZodOptional<z.ZodNumber>;
|
|
279
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
264
280
|
}, z.core.$strip>, z.ZodString]>>;
|
|
265
281
|
}, z.core.$strip>;
|
|
266
282
|
//# sourceMappingURL=schemas.d.ts.map
|
package/dist/schemas.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"schemas.d.ts","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,eAAO,MAAM,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;mBAW1B,CAAC;AAEH,eAAO,MAAM,UAAU
|
|
1
|
+
{"version":3,"file":"schemas.d.ts","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,eAAO,MAAM,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;mBAW1B,CAAC;AAEH,eAAO,MAAM,UAAU;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAYrB,CAAC;AAEH,eAAO,MAAM,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAQ1B,CAAC;AAEH,4EAA4E;AAC5E,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kBAA6B,CAAC;AAEzD,eAAO,MAAM,gBAAgB;;;;;;;;kBAKhB,CAAC;AAEd,2DAA2D;AAC3D,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBA6B3B,CAAC"}
|
package/dist/schemas.js
CHANGED
|
@@ -19,10 +19,13 @@ export const TaskSchema = z.object({
|
|
|
19
19
|
intent: z.string(),
|
|
20
20
|
assert: z.array(AssertionSchema).min(1),
|
|
21
21
|
setup: z.array(z.string()).optional().default([]),
|
|
22
|
+
cleanup: z.array(z.string()).optional(),
|
|
23
|
+
env: z.record(z.string(), z.string()).optional(),
|
|
22
24
|
max_turns: z.number().int().min(1).max(20).optional().default(5),
|
|
23
25
|
difficulty: z.enum(['easy', 'medium', 'hard']).optional(),
|
|
24
26
|
category: z.string().optional(),
|
|
25
27
|
repeat: z.number().int().min(1).max(100).optional(),
|
|
28
|
+
tags: z.array(z.string()).optional(),
|
|
26
29
|
});
|
|
27
30
|
export const TaskSuiteSchema = z.object({
|
|
28
31
|
cli: z.string(),
|
|
@@ -58,7 +61,11 @@ export const ConfigFileSchema = z.object({
|
|
|
58
61
|
backend_url: z.string().optional(),
|
|
59
62
|
repeat: z.number().int().min(1).max(100).optional(),
|
|
60
63
|
redact_env: z.array(z.string()).optional(),
|
|
64
|
+
redact_patterns: z.array(z.string()).optional(),
|
|
61
65
|
thresholds: ThresholdsSchema,
|
|
66
|
+
env: z.record(z.string(), z.string()).optional(),
|
|
67
|
+
setup: z.array(z.string()).optional(),
|
|
68
|
+
cleanup: z.array(z.string()).optional(),
|
|
62
69
|
tasks: z.array(z.union([
|
|
63
70
|
TaskSchema,
|
|
64
71
|
z.string().refine((s) => s.startsWith('file://'), {
|
package/dist/schemas.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"schemas.js","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACzC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACvC,CAAC,CAAC,MAAM,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACxC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACnC,CAAC,CAAC,MAAM,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;IAC7E,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IAC7B,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACjC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;IAClH,CAAC,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC;IACjD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAChE,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,QAAQ,EAAE;IACzD,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;
|
|
1
|
+
{"version":3,"file":"schemas.js","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACzC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACvC,CAAC,CAAC,MAAM,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACxC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACnC,CAAC,CAAC,MAAM,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;IAC7E,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IAC7B,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACjC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;IAClH,CAAC,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC;IACjD,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,GAAG,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAChD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAChE,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,QAAQ,E
AAE;IACzD,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IACnD,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;CACrC,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CAClC,CAAC,CAAC;AAEH,4EAA4E;AAC5E,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;AAEzD,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IAC9C,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,EAAE;IACnE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAC3D,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,OAAO,CAAC;CACzE,CAAC,CAAC,QAAQ,EAAE,CAAC;AAEd,2DAA2D;AAC3D,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACzC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACpC,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,
QAAQ,EAAE;IAC/C,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC9B,MAAM,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE;IACtD,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IACnD,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC1C,eAAe,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC/C,UAAU,EAAE,gBAAgB;IAC5B,GAAG,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAChD,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACrC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CACZ,CAAC,CAAC,KAAK,CAAC;QACN,UAAU;QACV,CAAC,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE;YAChD,OAAO,EAAE,yCAAyC;SACnD,CAAC;KACH,CAAC,CACH,CAAC,GAAG,CAAC,CAAC,CAAC;CACT,CAAC,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"suite-generator.d.ts","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;
|
|
1
|
+
{"version":3,"file":"suite-generator.d.ts","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAmGH,wBAAsB,aAAa,CACjC,OAAO,EAAE,MAAM,EACf,YAAY,EAAE,MAAM,GACnB,OAAO,CAAC,MAAM,CAAC,CAwDjB"}
|
package/dist/suite-generator.js
CHANGED
|
@@ -4,8 +4,6 @@
|
|
|
4
4
|
* Takes CLI name + help text and produces a task suite with
|
|
5
5
|
* assert-based validation.
|
|
6
6
|
*/
|
|
7
|
-
import { readFile } from 'node:fs/promises';
|
|
8
|
-
import { join } from 'node:path';
|
|
9
7
|
import { generateText } from 'ai';
|
|
10
8
|
import { gateway } from 'ai';
|
|
11
9
|
import { loadHelpFromCache, loadTopLevelHelp } from './help-loader.js';
|
|
@@ -39,6 +37,66 @@ Rules:
|
|
|
39
37
|
- Focus on: local operations, file generation, formatting, config, help queries
|
|
40
38
|
- Include setup commands to prepare the environment
|
|
41
39
|
- Use realistic but safe values (no real credentials, no destructive operations)`;
|
|
40
|
+
const FEW_SHOT_EXAMPLE = `cli: docker
|
|
41
|
+
version_command: "docker --version"
|
|
42
|
+
|
|
43
|
+
providers:
|
|
44
|
+
- openai/gpt-5-nano
|
|
45
|
+
- google/gemini-2.5-flash-lite
|
|
46
|
+
|
|
47
|
+
context:
|
|
48
|
+
- zero-shot
|
|
49
|
+
|
|
50
|
+
concurrency: 3
|
|
51
|
+
|
|
52
|
+
system_prompt: |
|
|
53
|
+
You are working in a temporary empty directory.
|
|
54
|
+
Docker is installed and the daemon is running.
|
|
55
|
+
Complete each task using docker commands.
|
|
56
|
+
Use unique names/tags to avoid conflicts (prefix with 'bench-').
|
|
57
|
+
|
|
58
|
+
tasks:
|
|
59
|
+
- id: build-image
|
|
60
|
+
intent: "Create a Dockerfile for a simple Alpine-based image that prints 'hello from docker' when run. Build it with the tag 'bench-hello'."
|
|
61
|
+
difficulty: easy
|
|
62
|
+
category: build
|
|
63
|
+
max_turns: 5
|
|
64
|
+
assert:
|
|
65
|
+
- file_exists: "Dockerfile"
|
|
66
|
+
- ran: "docker build"
|
|
67
|
+
- verify:
|
|
68
|
+
run: "docker images bench-hello --format '{{.Repository}}'"
|
|
69
|
+
output_contains: "bench-hello"
|
|
70
|
+
|
|
71
|
+
- id: run-and-capture
|
|
72
|
+
intent: "Run the 'alpine' image with the command 'echo benchmark-test-output' and capture the output."
|
|
73
|
+
difficulty: easy
|
|
74
|
+
category: run
|
|
75
|
+
max_turns: 3
|
|
76
|
+
assert:
|
|
77
|
+
- ran: "docker run"
|
|
78
|
+
- output_contains: "benchmark-test-output"
|
|
79
|
+
|
|
80
|
+
- id: inspect-container
|
|
81
|
+
intent: "Run an alpine container named 'bench-inspect' in detached mode (sleep 300), then use docker inspect to show its IP address."
|
|
82
|
+
difficulty: medium
|
|
83
|
+
category: query
|
|
84
|
+
max_turns: 5
|
|
85
|
+
assert:
|
|
86
|
+
- ran: "docker run"
|
|
87
|
+
- ran: "docker inspect"
|
|
88
|
+
- exit_code: 0
|
|
89
|
+
|
|
90
|
+
- id: volume-mount
|
|
91
|
+
intent: "Create a file called 'data.txt' with content 'volume test'. Run an alpine container that mounts the current directory to /data and reads the file with 'cat /data/data.txt'."
|
|
92
|
+
difficulty: medium
|
|
93
|
+
category: volumes
|
|
94
|
+
max_turns: 5
|
|
95
|
+
setup:
|
|
96
|
+
- "echo 'volume test' > data.txt"
|
|
97
|
+
assert:
|
|
98
|
+
- ran: "docker run"
|
|
99
|
+
- output_contains: "volume test"`;
|
|
42
100
|
export async function generateSuite(cliName, helpCacheDir) {
|
|
43
101
|
// Load help text
|
|
44
102
|
let helpCache = await loadHelpFromCache(helpCacheDir, cliName);
|
|
@@ -51,13 +109,7 @@ export async function generateSuite(cliName, helpCacheDir) {
|
|
|
51
109
|
throw new Error(`No help text available for ${cliName}`);
|
|
52
110
|
}
|
|
53
111
|
}
|
|
54
|
-
|
|
55
|
-
const suiteDir = join(new URL('.', import.meta.url).pathname.replace(/\/src\/$/, '').replace(/\/dist\/$/, ''), 'task_suites');
|
|
56
|
-
let dockerExample = '';
|
|
57
|
-
try {
|
|
58
|
-
dockerExample = await readFile(join(suiteDir, 'docker.yaml'), 'utf-8');
|
|
59
|
-
}
|
|
60
|
-
catch { /* ignore */ }
|
|
112
|
+
const fewShotExample = FEW_SHOT_EXAMPLE;
|
|
61
113
|
// Build help text summary (truncate to fit context)
|
|
62
114
|
const helpEntries = Object.entries(helpCache.help_texts);
|
|
63
115
|
let helpSummary = '';
|
|
@@ -76,9 +128,9 @@ ${helpSummary}
|
|
|
76
128
|
|
|
77
129
|
## Example Task Suite
|
|
78
130
|
|
|
79
|
-
###
|
|
131
|
+
### example cli-bench.yaml
|
|
80
132
|
\`\`\`yaml
|
|
81
|
-
${dockerExample}
|
|
133
|
+
${fewShotExample}
|
|
82
134
|
\`\`\`
|
|
83
135
|
|
|
84
136
|
## Output
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"suite-generator.js","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,
|
|
1
|
+
{"version":3,"file":"suite-generator.js","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAC7B,OAAO,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AAEvE,MAAM,eAAe,GAAG,oCAAoC,CAAC;AAE7D,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;iFA4B2D,CAAC;AAElF,MAAM,gBAAgB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;uCA2Dc,CAAC;AAExC,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAe,EACf,YAAoB;IAEpB,iBAAiB;IACjB,IAAI,SAAS,GAAG,MAAM,iBAAiB,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAC/D,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,MAAM,gBAAgB,CAAC,OAAO,CAAC,CAAC;QACrD,IAAI,YAAY,EAAE,CAAC;YACjB,SAAS,GAAG,EAAE,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,EAAE,EAAE,EAAE,YAAY,EAAE,EAAE,CAAC;QACtE,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,KAAK,CAAC,8BAA8B,OAAO,EAAE,CAAC,CAAC;QAC3D,CAAC;IACH,CAAC;IAED,MAAM,cAAc,GAAG,gBAAgB,CAAC;IAExC,oDAAoD;IACpD,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC;IACzD,IAAI,WAAW,GAAG,EAAE,CAAC;IACrB,KAAK,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,WAAW,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,GAAG,IAAI,QAAQ,CAAC;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;QACtC,WAAW,IAAI,SAAS,OAAO,IAAI,KAAK,gBAAgB,SAAS,IAAI,CAAC;QACtE,IAAI,WAAW,CAAC,MAAM,GAAG,KAAK;YAAE,MAAM;IACxC,CAAC;IAED,MAAM,MAAM,GAAG,uCAAuC,OAAO;;;;EAI7D,WAAW;;;;;;EAMX,cAAc;;;;;yEAKyD,OAAO,KAAK,CAAC;IAEpF,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC;QAChC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC;QAC/B,MAAM,EAAE,aAAa;QACrB,MAAM;QACN,WAAW,EAAE,GAAG;QAChB,eAAe,EAAE,IAAI;KACtB,CAAC,CAAC;IAEH,2DAA2D;IAC3D,IAAI,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;IAC9B,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cliwatch/cli-bench",
|
|
3
|
-
"version": "0.6.2",
|
|
3
|
+
"version": "0.7.0",
|
|
4
4
|
"description": "LLM CLI agent testing framework — benchmark how well AI models use your CLI tool",
|
|
5
5
|
"keywords": ["cli", "benchmark", "llm", "testing", "ai-agent", "cliwatch", "evaluation"],
|
|
6
6
|
"license": "MIT",
|
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
},
|
|
15
15
|
"files": [
|
|
16
16
|
"dist",
|
|
17
|
-
"task_suites",
|
|
18
17
|
"LICENSE",
|
|
19
18
|
"CHANGELOG.md"
|
|
20
19
|
],
|
|
@@ -35,6 +34,7 @@
|
|
|
35
34
|
"test": "vitest"
|
|
36
35
|
},
|
|
37
36
|
"dependencies": {
|
|
37
|
+
"@ai-sdk/google": "^3.0.0",
|
|
38
38
|
"@hey-api/client-fetch": "^0.13.1",
|
|
39
39
|
"ai": "^6.0.18",
|
|
40
40
|
"yaml": "^2.7.0",
|
package/task_suites/curl.yaml
DELETED
|
@@ -1,138 +0,0 @@
|
|
|
1
|
-
cli: curl
|
|
2
|
-
version_command: "curl --version | head -1"
|
|
3
|
-
|
|
4
|
-
tasks:
|
|
5
|
-
# -- Easy --------------------------------------------------------------------
|
|
6
|
-
- id: simple-get
|
|
7
|
-
intent: "Fetch the contents of https://httpbin.org/get"
|
|
8
|
-
difficulty: easy
|
|
9
|
-
category: query
|
|
10
|
-
max_turns: 2
|
|
11
|
-
assert:
|
|
12
|
-
- ran: "curl.*httpbin.org/get"
|
|
13
|
-
- exit_code: 0
|
|
14
|
-
- output_contains: "origin"
|
|
15
|
-
|
|
16
|
-
- id: head-request
|
|
17
|
-
intent: "Send a HEAD request to https://httpbin.org/get and show the response headers"
|
|
18
|
-
difficulty: easy
|
|
19
|
-
category: query
|
|
20
|
-
max_turns: 3
|
|
21
|
-
assert:
|
|
22
|
-
- ran: "curl"
|
|
23
|
-
- ran: "-I|--head"
|
|
24
|
-
- output_contains: "HTTP"
|
|
25
|
-
|
|
26
|
-
- id: download-file
|
|
27
|
-
intent: "Download https://httpbin.org/robots.txt and save it as robots.txt"
|
|
28
|
-
difficulty: easy
|
|
29
|
-
category: crud
|
|
30
|
-
max_turns: 3
|
|
31
|
-
assert:
|
|
32
|
-
- ran: "curl"
|
|
33
|
-
- ran: "-o|--output"
|
|
34
|
-
- file_exists: "robots.txt"
|
|
35
|
-
|
|
36
|
-
- id: follow-redirects
|
|
37
|
-
intent: "Fetch https://httpbin.org/redirect/2 and follow all redirects"
|
|
38
|
-
difficulty: easy
|
|
39
|
-
category: query
|
|
40
|
-
max_turns: 3
|
|
41
|
-
assert:
|
|
42
|
-
- ran: "curl"
|
|
43
|
-
- ran: "-L|--location"
|
|
44
|
-
- exit_code: 0
|
|
45
|
-
|
|
46
|
-
# -- Medium ------------------------------------------------------------------
|
|
47
|
-
- id: post-json
|
|
48
|
-
intent: "Send a POST request to https://httpbin.org/post with JSON body {\"name\": \"bench\", \"version\": 1} and set the Content-Type header to application/json"
|
|
49
|
-
difficulty: medium
|
|
50
|
-
category: crud
|
|
51
|
-
max_turns: 5
|
|
52
|
-
assert:
|
|
53
|
-
- ran: "curl"
|
|
54
|
-
- ran: "-X POST|--request POST|-d|--data"
|
|
55
|
-
- ran: "Content-Type.*application/json"
|
|
56
|
-
- output_contains: "bench"
|
|
57
|
-
|
|
58
|
-
- id: custom-headers
|
|
59
|
-
intent: "Send a GET request to https://httpbin.org/headers with custom headers X-Request-ID: abc123 and Accept: application/xml"
|
|
60
|
-
difficulty: medium
|
|
61
|
-
category: query
|
|
62
|
-
max_turns: 5
|
|
63
|
-
assert:
|
|
64
|
-
- ran: "curl"
|
|
65
|
-
- ran: "-H|--header"
|
|
66
|
-
- ran: "X-Request-ID"
|
|
67
|
-
- output_contains: "abc123"
|
|
68
|
-
|
|
69
|
-
- id: basic-auth
|
|
70
|
-
intent: "Send a GET request to https://httpbin.org/basic-auth/user/passwd using basic authentication with username 'user' and password 'passwd'"
|
|
71
|
-
difficulty: medium
|
|
72
|
-
category: auth
|
|
73
|
-
max_turns: 5
|
|
74
|
-
assert:
|
|
75
|
-
- ran: "curl"
|
|
76
|
-
- ran: "-u|--user|user:passwd"
|
|
77
|
-
- output_contains: "authenticated"
|
|
78
|
-
|
|
79
|
-
- id: verbose-timing
|
|
80
|
-
intent: "Fetch https://httpbin.org/get and show the total time taken for the request using curl's write-out feature"
|
|
81
|
-
difficulty: medium
|
|
82
|
-
category: output
|
|
83
|
-
max_turns: 5
|
|
84
|
-
assert:
|
|
85
|
-
- ran: "curl"
|
|
86
|
-
- ran: "-w|--write-out|time_total"
|
|
87
|
-
|
|
88
|
-
# -- Hard --------------------------------------------------------------------
|
|
89
|
-
- id: put-with-file
|
|
90
|
-
intent: "Upload the file /tmp/bench-workspace/data.json to https://httpbin.org/put using a PUT request with Content-Type application/json"
|
|
91
|
-
difficulty: hard
|
|
92
|
-
category: crud
|
|
93
|
-
setup:
|
|
94
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
95
|
-
- "echo '{\"key\": \"value\"}' > /tmp/bench-workspace/data.json"
|
|
96
|
-
max_turns: 7
|
|
97
|
-
assert:
|
|
98
|
-
- ran: "curl"
|
|
99
|
-
- ran: "-X PUT|--request PUT|-T"
|
|
100
|
-
- ran: "data.json"
|
|
101
|
-
- output_contains: "key"
|
|
102
|
-
|
|
103
|
-
- id: retry-with-timeout
|
|
104
|
-
intent: "Fetch https://httpbin.org/delay/1 with a 5 second timeout, retry 3 times on failure, and save the response to response.json"
|
|
105
|
-
difficulty: hard
|
|
106
|
-
category: crud
|
|
107
|
-
max_turns: 7
|
|
108
|
-
assert:
|
|
109
|
-
- ran: "curl"
|
|
110
|
-
- ran: "--retry.*3"
|
|
111
|
-
- ran: "--max-time|--connect-timeout|-m"
|
|
112
|
-
- ran: "-o|--output"
|
|
113
|
-
|
|
114
|
-
- id: multipart-upload
|
|
115
|
-
intent: "Send a multipart/form-data POST to https://httpbin.org/post with a field 'username' set to 'admin' and a file field 'config' uploading /tmp/bench-workspace/app.conf"
|
|
116
|
-
difficulty: hard
|
|
117
|
-
category: crud
|
|
118
|
-
setup:
|
|
119
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
120
|
-
- "echo 'port=8080' > /tmp/bench-workspace/app.conf"
|
|
121
|
-
max_turns: 7
|
|
122
|
-
assert:
|
|
123
|
-
- ran: "curl"
|
|
124
|
-
- ran: "-F|--form"
|
|
125
|
-
- ran: "username.*admin"
|
|
126
|
-
- ran: "config.*@.*app.conf"
|
|
127
|
-
- output_contains: "admin"
|
|
128
|
-
|
|
129
|
-
- id: conditional-request
|
|
130
|
-
intent: "Fetch https://httpbin.org/cache and use conditional headers: set If-None-Match to '12345' and If-Modified-Since to 'Thu, 01 Jan 2025 00:00:00 GMT'. Show the response status code."
|
|
131
|
-
difficulty: hard
|
|
132
|
-
category: query
|
|
133
|
-
max_turns: 7
|
|
134
|
-
assert:
|
|
135
|
-
- ran: "curl"
|
|
136
|
-
- ran: "If-None-Match"
|
|
137
|
-
- ran: "If-Modified-Since"
|
|
138
|
-
- ran: "-w|--write-out|-I|--head|-v|--verbose"
|
package/task_suites/docker.yaml
DELETED
|
@@ -1,163 +0,0 @@
|
|
|
1
|
-
cli: docker
|
|
2
|
-
|
|
3
|
-
tasks:
|
|
4
|
-
# -- Easy --------------------------------------------------------------------
|
|
5
|
-
- id: list-containers
|
|
6
|
-
intent: "List all running containers"
|
|
7
|
-
difficulty: easy
|
|
8
|
-
category: query
|
|
9
|
-
max_turns: 3
|
|
10
|
-
assert:
|
|
11
|
-
- ran: "docker ps"
|
|
12
|
-
- exit_code: 0
|
|
13
|
-
|
|
14
|
-
- id: list-images
|
|
15
|
-
intent: "List all local Docker images"
|
|
16
|
-
difficulty: easy
|
|
17
|
-
category: query
|
|
18
|
-
max_turns: 3
|
|
19
|
-
assert:
|
|
20
|
-
- ran: "docker image"
|
|
21
|
-
- exit_code: 0
|
|
22
|
-
|
|
23
|
-
- id: pull-image
|
|
24
|
-
intent: "Pull the latest nginx image from Docker Hub"
|
|
25
|
-
difficulty: easy
|
|
26
|
-
category: crud
|
|
27
|
-
max_turns: 3
|
|
28
|
-
assert:
|
|
29
|
-
- ran: "docker pull.*nginx"
|
|
30
|
-
- verify:
|
|
31
|
-
run: "docker images nginx --format '{{.Repository}}'"
|
|
32
|
-
output_contains: "nginx"
|
|
33
|
-
|
|
34
|
-
- id: view-logs
|
|
35
|
-
intent: "Show the last 100 lines of logs from the container 'api' and follow new output"
|
|
36
|
-
difficulty: easy
|
|
37
|
-
category: query
|
|
38
|
-
setup:
|
|
39
|
-
- "docker run -d --name api alpine sh -c 'for i in $(seq 1 200); do echo line-$i; done; sleep 3600'"
|
|
40
|
-
max_turns: 3
|
|
41
|
-
assert:
|
|
42
|
-
- ran: "docker logs"
|
|
43
|
-
- ran: "--tail"
|
|
44
|
-
|
|
45
|
-
- id: stop-container
|
|
46
|
-
intent: "Stop the container named 'web-server'"
|
|
47
|
-
difficulty: easy
|
|
48
|
-
category: crud
|
|
49
|
-
setup:
|
|
50
|
-
- "docker run -d --name web-server alpine sleep 3600"
|
|
51
|
-
max_turns: 3
|
|
52
|
-
assert:
|
|
53
|
-
- ran: "docker stop.*web-server"
|
|
54
|
-
- verify:
|
|
55
|
-
run: "docker ps --filter name=web-server --format '{{.Names}}'"
|
|
56
|
-
output_equals: ""
|
|
57
|
-
|
|
58
|
-
# -- Medium ------------------------------------------------------------------
|
|
59
|
-
- id: run-detached
|
|
60
|
-
intent: "Run an nginx container in the background, mapping host port 8080 to container port 80, named 'web'"
|
|
61
|
-
difficulty: medium
|
|
62
|
-
category: crud
|
|
63
|
-
max_turns: 5
|
|
64
|
-
assert:
|
|
65
|
-
- ran: "docker run"
|
|
66
|
-
- ran: "-d"
|
|
67
|
-
- ran: "--name.*web"
|
|
68
|
-
- verify:
|
|
69
|
-
run: "docker ps --filter name=web --format '{{.Names}}'"
|
|
70
|
-
output_contains: "web"
|
|
71
|
-
|
|
72
|
-
- id: build-with-tag
|
|
73
|
-
intent: "Build a Docker image from ./Dockerfile and tag it as myapp:v2"
|
|
74
|
-
difficulty: medium
|
|
75
|
-
category: crud
|
|
76
|
-
setup:
|
|
77
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
78
|
-
- "printf 'FROM alpine:latest\nRUN echo hello' > /tmp/bench-workspace/Dockerfile"
|
|
79
|
-
max_turns: 5
|
|
80
|
-
assert:
|
|
81
|
-
- ran: "docker build"
|
|
82
|
-
- ran: "myapp:v2"
|
|
83
|
-
- verify:
|
|
84
|
-
run: "docker images myapp:v2 --format '{{.Repository}}:{{.Tag}}'"
|
|
85
|
-
output_contains: "myapp:v2"
|
|
86
|
-
|
|
87
|
-
- id: exec-into-container
|
|
88
|
-
intent: "Run the command 'cat /etc/os-release' inside the running container 'web'"
|
|
89
|
-
difficulty: medium
|
|
90
|
-
category: crud
|
|
91
|
-
setup:
|
|
92
|
-
- "docker run -d --name web alpine sleep 3600"
|
|
93
|
-
max_turns: 5
|
|
94
|
-
assert:
|
|
95
|
-
- ran: "docker exec.*web.*cat /etc/os-release"
|
|
96
|
-
- exit_code: 0
|
|
97
|
-
|
|
98
|
-
- id: inspect-json
|
|
99
|
-
intent: "Get the IP address of the container 'web' using docker inspect with a format template"
|
|
100
|
-
difficulty: medium
|
|
101
|
-
category: output
|
|
102
|
-
setup:
|
|
103
|
-
- "docker run -d --name web alpine sleep 3600"
|
|
104
|
-
max_turns: 5
|
|
105
|
-
assert:
|
|
106
|
-
- ran: "docker inspect"
|
|
107
|
-
- ran: "--format"
|
|
108
|
-
|
|
109
|
-
- id: prune-all
|
|
110
|
-
intent: "Remove all stopped containers, unused networks, dangling images, and build cache without prompting for confirmation"
|
|
111
|
-
difficulty: medium
|
|
112
|
-
category: crud
|
|
113
|
-
max_turns: 5
|
|
114
|
-
assert:
|
|
115
|
-
- ran: "docker system prune"
|
|
116
|
-
- ran: "--force|-f"
|
|
117
|
-
|
|
118
|
-
# -- Hard --------------------------------------------------------------------
|
|
119
|
-
- id: run-complex
|
|
120
|
-
intent: "Run a postgres:16 container named 'db' in the background with environment variables POSTGRES_USER=admin and POSTGRES_PASSWORD=secret, mount a volume 'pgdata' to /var/lib/postgresql/data, and connect it to the network 'backend'"
|
|
121
|
-
difficulty: hard
|
|
122
|
-
category: crud
|
|
123
|
-
setup:
|
|
124
|
-
- "docker network create backend || true"
|
|
125
|
-
max_turns: 7
|
|
126
|
-
assert:
|
|
127
|
-
- ran: "docker run"
|
|
128
|
-
- ran: "--name.*db"
|
|
129
|
-
- ran: "POSTGRES_USER=admin"
|
|
130
|
-
- ran: "POSTGRES_PASSWORD=secret"
|
|
131
|
-
- verify:
|
|
132
|
-
run: "docker ps --filter name=db --format '{{.Names}}'"
|
|
133
|
-
output_contains: "db"
|
|
134
|
-
|
|
135
|
-
- id: compose-up
|
|
136
|
-
intent: "Start all services defined in docker-compose.yml in detached mode and rebuild any changed images"
|
|
137
|
-
difficulty: hard
|
|
138
|
-
category: workflow
|
|
139
|
-
setup:
|
|
140
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
141
|
-
- "printf 'services:\n web:\n image: alpine\n command: sleep 3600\n' > /tmp/bench-workspace/docker-compose.yml"
|
|
142
|
-
max_turns: 7
|
|
143
|
-
assert:
|
|
144
|
-
- ran: "docker compose.*up"
|
|
145
|
-
- ran: "-d|--detach"
|
|
146
|
-
- ran: "--build"
|
|
147
|
-
|
|
148
|
-
- id: multi-stage-debug
|
|
149
|
-
intent: "Build only the 'builder' stage from /tmp/bench-workspace/Dockerfile, tag it as 'myapp:debug', and don't use cache"
|
|
150
|
-
difficulty: hard
|
|
151
|
-
category: crud
|
|
152
|
-
setup:
|
|
153
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
154
|
-
- "printf 'FROM alpine:latest AS builder\nRUN echo building\nFROM alpine:latest\nCOPY --from=builder / /\n' > /tmp/bench-workspace/Dockerfile"
|
|
155
|
-
max_turns: 7
|
|
156
|
-
assert:
|
|
157
|
-
- ran: "docker build"
|
|
158
|
-
- ran: "--target.*builder"
|
|
159
|
-
- ran: "myapp:debug"
|
|
160
|
-
- ran: "--no-cache"
|
|
161
|
-
- verify:
|
|
162
|
-
run: "docker images myapp:debug --format '{{.Repository}}:{{.Tag}}'"
|
|
163
|
-
output_contains: "myapp:debug"
|