@cliwatch/cli-bench 0.6.3 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/README.md +3 -0
- package/dist/assertions.d.ts +1 -1
- package/dist/assertions.d.ts.map +1 -1
- package/dist/assertions.js +6 -6
- package/dist/assertions.js.map +1 -1
- package/dist/ci.d.ts.map +1 -1
- package/dist/ci.js +14 -0
- package/dist/ci.js.map +1 -1
- package/dist/client/index.d.ts +1 -1
- package/dist/client/index.d.ts.map +1 -1
- package/dist/client/types.gen.d.ts +143 -93
- package/dist/client/types.gen.d.ts.map +1 -1
- package/dist/client/zod.gen.d.ts +75 -42
- package/dist/client/zod.gen.d.ts.map +1 -1
- package/dist/client/zod.gen.js +86 -54
- package/dist/client/zod.gen.js.map +1 -1
- package/dist/config.d.ts +2 -3
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +8 -15
- package/dist/config.js.map +1 -1
- package/dist/exec.d.ts +2 -0
- package/dist/exec.d.ts.map +1 -1
- package/dist/exec.js +6 -2
- package/dist/exec.js.map +1 -1
- package/dist/github-comment.d.ts +16 -0
- package/dist/github-comment.d.ts.map +1 -0
- package/dist/github-comment.js +90 -0
- package/dist/github-comment.js.map +1 -0
- package/dist/index.d.ts +2 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +31 -36
- package/dist/index.js.map +1 -1
- package/dist/init.js +1 -1
- package/dist/models.d.ts +9 -9
- package/dist/models.d.ts.map +1 -1
- package/dist/models.js +1 -1
- package/dist/models.js.map +1 -1
- package/dist/project.d.ts +11 -2
- package/dist/project.d.ts.map +1 -1
- package/dist/project.js +108 -9
- package/dist/project.js.map +1 -1
- package/dist/prompt.d.ts +2 -8
- package/dist/prompt.d.ts.map +1 -1
- package/dist/prompt.js +2 -35
- package/dist/prompt.js.map +1 -1
- package/dist/providers.d.ts +9 -7
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +26 -8
- package/dist/providers.js.map +1 -1
- package/dist/runner.d.ts +32 -4
- package/dist/runner.d.ts.map +1 -1
- package/dist/runner.js +177 -177
- package/dist/runner.js.map +1 -1
- package/dist/schemas.d.ts +20 -1
- package/dist/schemas.d.ts.map +1 -1
- package/dist/schemas.js +8 -1
- package/dist/schemas.js.map +1 -1
- package/dist/suite-generator.d.ts.map +1 -1
- package/dist/suite-generator.js +93 -10
- package/dist/suite-generator.js.map +1 -1
- package/package.json +2 -2
- package/dist/help-loader.d.ts +0 -17
- package/dist/help-loader.d.ts.map +0 -1
- package/dist/help-loader.js +0 -65
- package/dist/help-loader.js.map +0 -1
- package/task_suites/curl.yaml +0 -138
- package/task_suites/docker.yaml +0 -163
- package/task_suites/gh.yaml +0 -118
- package/task_suites/jq.yaml +0 -172
- package/task_suites/kubectl.yaml +0 -74
package/dist/schemas.js
CHANGED
|
@@ -19,10 +19,14 @@ export const TaskSchema = z.object({
|
|
|
19
19
|
intent: z.string(),
|
|
20
20
|
assert: z.array(AssertionSchema).min(1),
|
|
21
21
|
setup: z.array(z.string()).optional().default([]),
|
|
22
|
+
cleanup: z.array(z.string()).optional(),
|
|
23
|
+
env: z.record(z.string(), z.string()).optional(),
|
|
22
24
|
max_turns: z.number().int().min(1).max(20).optional().default(5),
|
|
23
25
|
difficulty: z.enum(['easy', 'medium', 'hard']).optional(),
|
|
24
26
|
category: z.string().optional(),
|
|
25
27
|
repeat: z.number().int().min(1).max(100).optional(),
|
|
28
|
+
tags: z.array(z.string()).optional(),
|
|
29
|
+
scaffold: z.union([z.string(), z.literal(false)]).optional(),
|
|
26
30
|
});
|
|
27
31
|
export const TaskSuiteSchema = z.object({
|
|
28
32
|
cli: z.string(),
|
|
@@ -50,7 +54,6 @@ export const ConfigFileSchema = z.object({
|
|
|
50
54
|
website_url: z.string().optional(),
|
|
51
55
|
github_url: z.string().optional(),
|
|
52
56
|
providers: z.array(z.string()).optional(),
|
|
53
|
-
context: z.array(z.string()).optional(),
|
|
54
57
|
system_prompt: z.string().optional(),
|
|
55
58
|
concurrency: z.number().int().min(1).optional(),
|
|
56
59
|
workdir: z.string().optional(),
|
|
@@ -60,6 +63,10 @@ export const ConfigFileSchema = z.object({
|
|
|
60
63
|
redact_env: z.array(z.string()).optional(),
|
|
61
64
|
redact_patterns: z.array(z.string()).optional(),
|
|
62
65
|
thresholds: ThresholdsSchema,
|
|
66
|
+
env: z.record(z.string(), z.string()).optional(),
|
|
67
|
+
setup: z.array(z.string()).optional(),
|
|
68
|
+
scaffold: z.string().optional(),
|
|
69
|
+
cleanup: z.array(z.string()).optional(),
|
|
63
70
|
tasks: z.array(z.union([
|
|
64
71
|
TaskSchema,
|
|
65
72
|
z.string().refine((s) => s.startsWith('file://'), {
|
package/dist/schemas.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"schemas.js","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACzC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACvC,CAAC,CAAC,MAAM,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACxC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACnC,CAAC,CAAC,MAAM,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;IAC7E,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IAC7B,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACjC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;IAClH,CAAC,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC;IACjD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAChE,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,QAAQ,EAAE;IACzD,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;
|
|
1
|
+
{"version":3,"file":"schemas.js","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACzC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACvC,CAAC,CAAC,MAAM,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACxC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACnC,CAAC,CAAC,MAAM,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;IAC7E,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IAC7B,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACjC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;IAClH,CAAC,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC;IACjD,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,GAAG,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAChD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAChE,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,QAAQ,E
AAE;IACzD,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IACnD,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACpC,QAAQ,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;CAC7D,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CAClC,CAAC,CAAC;AAEH,4EAA4E;AAC5E,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;AAEzD,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IAC9C,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,EAAE;IACnE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAC3D,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,OAAO,CAAC;CACzE,CAAC,CAAC,QAAQ,EAAE,CAAC;AAEd,2DAA2D;AAC3D,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACzC,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACpC,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,
CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;IAC/C,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC9B,MAAM,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE;IACtD,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IACnD,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC1C,eAAe,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC/C,UAAU,EAAE,gBAAgB;IAC5B,GAAG,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAChD,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACrC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CACZ,CAAC,CAAC,KAAK,CAAC;QACN,UAAU;QACV,CAAC,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE;YAChD,OAAO,EAAE,yCAAyC;SACnD,CAAC;KACH,CAAC,CACH,CAAC,GAAG,CAAC,CAAC,CAAC;CACT,CAAC,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"suite-generator.d.ts","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;
|
|
1
|
+
{"version":3,"file":"suite-generator.d.ts","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAuIH,wBAAsB,aAAa,CACjC,OAAO,EAAE,MAAM,EACf,YAAY,EAAE,MAAM,GACnB,OAAO,CAAC,MAAM,CAAC,CAwDjB"}
|
package/dist/suite-generator.js
CHANGED
|
@@ -5,10 +5,10 @@
|
|
|
5
5
|
* assert-based validation.
|
|
6
6
|
*/
|
|
7
7
|
import { readFile } from 'node:fs/promises';
|
|
8
|
+
import { execFile } from 'node:child_process';
|
|
8
9
|
import { join } from 'node:path';
|
|
9
10
|
import { generateText } from 'ai';
|
|
10
11
|
import { gateway } from 'ai';
|
|
11
|
-
import { loadHelpFromCache, loadTopLevelHelp } from './help-loader.js';
|
|
12
12
|
const GENERATOR_MODEL = 'anthropic/claude-sonnet-4-20250514';
|
|
13
13
|
const SYSTEM_PROMPT = `You are a CLI test suite generator. Given help text for a CLI tool and example task suites, generate a YAML task suite.
|
|
14
14
|
|
|
@@ -39,6 +39,95 @@ Rules:
|
|
|
39
39
|
- Focus on: local operations, file generation, formatting, config, help queries
|
|
40
40
|
- Include setup commands to prepare the environment
|
|
41
41
|
- Use realistic but safe values (no real credentials, no destructive operations)`;
|
|
42
|
+
const FEW_SHOT_EXAMPLE = `cli: docker
|
|
43
|
+
version_command: "docker --version"
|
|
44
|
+
|
|
45
|
+
providers:
|
|
46
|
+
- openai/gpt-5-nano
|
|
47
|
+
- google/gemini-2.5-flash-lite
|
|
48
|
+
|
|
49
|
+
concurrency: 3
|
|
50
|
+
|
|
51
|
+
system_prompt: |
|
|
52
|
+
You are working in a temporary empty directory.
|
|
53
|
+
Docker is installed and the daemon is running.
|
|
54
|
+
Complete each task using docker commands.
|
|
55
|
+
Use unique names/tags to avoid conflicts (prefix with 'bench-').
|
|
56
|
+
|
|
57
|
+
tasks:
|
|
58
|
+
- id: build-image
|
|
59
|
+
intent: "Create a Dockerfile for a simple Alpine-based image that prints 'hello from docker' when run. Build it with the tag 'bench-hello'."
|
|
60
|
+
difficulty: easy
|
|
61
|
+
category: build
|
|
62
|
+
max_turns: 5
|
|
63
|
+
assert:
|
|
64
|
+
- file_exists: "Dockerfile"
|
|
65
|
+
- ran: "docker build"
|
|
66
|
+
- verify:
|
|
67
|
+
run: "docker images bench-hello --format '{{.Repository}}'"
|
|
68
|
+
output_contains: "bench-hello"
|
|
69
|
+
|
|
70
|
+
- id: run-and-capture
|
|
71
|
+
intent: "Run the 'alpine' image with the command 'echo benchmark-test-output' and capture the output."
|
|
72
|
+
difficulty: easy
|
|
73
|
+
category: run
|
|
74
|
+
max_turns: 3
|
|
75
|
+
assert:
|
|
76
|
+
- ran: "docker run"
|
|
77
|
+
- output_contains: "benchmark-test-output"
|
|
78
|
+
|
|
79
|
+
- id: inspect-container
|
|
80
|
+
intent: "Run an alpine container named 'bench-inspect' in detached mode (sleep 300), then use docker inspect to show its IP address."
|
|
81
|
+
difficulty: medium
|
|
82
|
+
category: query
|
|
83
|
+
max_turns: 5
|
|
84
|
+
assert:
|
|
85
|
+
- ran: "docker run"
|
|
86
|
+
- ran: "docker inspect"
|
|
87
|
+
- exit_code: 0
|
|
88
|
+
|
|
89
|
+
- id: volume-mount
|
|
90
|
+
intent: "Create a file called 'data.txt' with content 'volume test'. Run an alpine container that mounts the current directory to /data and reads the file with 'cat /data/data.txt'."
|
|
91
|
+
difficulty: medium
|
|
92
|
+
category: volumes
|
|
93
|
+
max_turns: 5
|
|
94
|
+
setup:
|
|
95
|
+
- "echo 'volume test' > data.txt"
|
|
96
|
+
assert:
|
|
97
|
+
- ran: "docker run"
|
|
98
|
+
- output_contains: "volume test"`;
|
|
99
|
+
async function loadHelpFromCache(cacheDir, cliName) {
|
|
100
|
+
try {
|
|
101
|
+
const filePath = join(cacheDir, `${cliName}.json`);
|
|
102
|
+
const raw = await readFile(filePath, 'utf-8');
|
|
103
|
+
const data = JSON.parse(raw);
|
|
104
|
+
if (Array.isArray(data)) {
|
|
105
|
+
const entry = data.find((d) => d.cli_name === cliName);
|
|
106
|
+
if (entry?.help_texts) {
|
|
107
|
+
return { cli_name: entry.cli_name, help_texts: entry.help_texts };
|
|
108
|
+
}
|
|
109
|
+
return null;
|
|
110
|
+
}
|
|
111
|
+
if (data.help_texts)
|
|
112
|
+
return data;
|
|
113
|
+
return null;
|
|
114
|
+
}
|
|
115
|
+
catch {
|
|
116
|
+
return null;
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
function loadTopLevelHelp(cliName) {
|
|
120
|
+
return new Promise((resolve) => {
|
|
121
|
+
execFile(cliName, ['--help'], { timeout: 30_000 }, (err, stdout, stderr) => {
|
|
122
|
+
if (err) {
|
|
123
|
+
resolve(null);
|
|
124
|
+
return;
|
|
125
|
+
}
|
|
126
|
+
const output = (stdout || stderr).trim();
|
|
127
|
+
resolve(output || null);
|
|
128
|
+
});
|
|
129
|
+
});
|
|
130
|
+
}
|
|
42
131
|
export async function generateSuite(cliName, helpCacheDir) {
|
|
43
132
|
// Load help text
|
|
44
133
|
let helpCache = await loadHelpFromCache(helpCacheDir, cliName);
|
|
@@ -51,13 +140,7 @@ export async function generateSuite(cliName, helpCacheDir) {
|
|
|
51
140
|
throw new Error(`No help text available for ${cliName}`);
|
|
52
141
|
}
|
|
53
142
|
}
|
|
54
|
-
|
|
55
|
-
const suiteDir = join(new URL('.', import.meta.url).pathname.replace(/\/src\/$/, '').replace(/\/dist\/$/, ''), 'task_suites');
|
|
56
|
-
let dockerExample = '';
|
|
57
|
-
try {
|
|
58
|
-
dockerExample = await readFile(join(suiteDir, 'docker.yaml'), 'utf-8');
|
|
59
|
-
}
|
|
60
|
-
catch { /* ignore */ }
|
|
143
|
+
const fewShotExample = FEW_SHOT_EXAMPLE;
|
|
61
144
|
// Build help text summary (truncate to fit context)
|
|
62
145
|
const helpEntries = Object.entries(helpCache.help_texts);
|
|
63
146
|
let helpSummary = '';
|
|
@@ -76,9 +159,9 @@ ${helpSummary}
|
|
|
76
159
|
|
|
77
160
|
## Example Task Suite
|
|
78
161
|
|
|
79
|
-
###
|
|
162
|
+
### example cli-bench.yaml
|
|
80
163
|
\`\`\`yaml
|
|
81
|
-
${dockerExample}
|
|
164
|
+
${fewShotExample}
|
|
82
165
|
\`\`\`
|
|
83
166
|
|
|
84
167
|
## Output
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"suite-generator.js","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,
|
|
1
|
+
{"version":3,"file":"suite-generator.js","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAE7B,MAAM,eAAe,GAAG,oCAAoC,CAAC;AAE7D,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;iFA4B2D,CAAC;AAElF,MAAM,gBAAgB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;uCAwDc,CAAC;AAOxC,KAAK,UAAU,iBAAiB,CAC9B,QAAgB,EAChB,OAAe;IAEf,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,GAAG,OAAO,OAAO,CAAC,CAAC;QACnD,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC9C,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC7B,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;YACxB,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAwB,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC;YAC9E,IAAI,KAAK,EAAE,UAAU,EAAE,CAAC;gBACtB,OAAO,EAAE,QAAQ,EAAE,KAAK,CAAC,QAAQ,EAAE,UAAU,EAAE,KAAK,CAAC,UAAU,EAAE,CAAC;YACpE,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;QACD,IAAI,IAAI,CAAC,UAAU;YAAE,OAAO,IAAiB,CAAC;QAC9C,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAS,gBAAgB,CAAC,OAAe;IACvC,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC7B,QAAQ,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,EAAE;YACzE,IAAI,GAAG,EAAE,CAAC;gBAAC,OAAO,CAAC,IAAI,CAAC,CAAC;gBAAC,OAAO;YAAC,CAAC;YACnC,MAAM,MAAM,GAAG,CAAC,MAAM,IAAI,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;YACzC,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAe,EACf,YAAoB;IAEpB,iBAAiB;IACjB,IAAI,SAAS,GAAG,MAAM,iBAAiB,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAC/D,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,MAAM,gBAAgB,CAAC,OAAO,CAAC,CAAC;QACrD,IAAI,YAAY,EAAE,CAAC;YACjB,SAAS,GAAG,EAAE,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,EAAE,EAAE,EAAE,YAAY,EAAE,EAAE,CAAC;QACtE,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,KAA
K,CAAC,8BAA8B,OAAO,EAAE,CAAC,CAAC;QAC3D,CAAC;IACH,CAAC;IAED,MAAM,cAAc,GAAG,gBAAgB,CAAC;IAExC,oDAAoD;IACpD,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC;IACzD,IAAI,WAAW,GAAG,EAAE,CAAC;IACrB,KAAK,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,WAAW,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,GAAG,IAAI,QAAQ,CAAC;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;QACtC,WAAW,IAAI,SAAS,OAAO,IAAI,KAAK,gBAAgB,SAAS,IAAI,CAAC;QACtE,IAAI,WAAW,CAAC,MAAM,GAAG,KAAK;YAAE,MAAM;IACxC,CAAC;IAED,MAAM,MAAM,GAAG,uCAAuC,OAAO;;;;EAI7D,WAAW;;;;;;EAMX,cAAc;;;;;yEAKyD,OAAO,KAAK,CAAC;IAEpF,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC;QAChC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC;QAC/B,MAAM,EAAE,aAAa;QACrB,MAAM;QACN,WAAW,EAAE,GAAG;QAChB,eAAe,EAAE,IAAI;KACtB,CAAC,CAAC;IAEH,2DAA2D;IAC3D,IAAI,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;IAC9B,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cliwatch/cli-bench",
|
|
3
|
-
"version": "0.6.3",
|
|
3
|
+
"version": "0.7.1",
|
|
4
4
|
"description": "LLM CLI agent testing framework — benchmark how well AI models use your CLI tool",
|
|
5
5
|
"keywords": ["cli", "benchmark", "llm", "testing", "ai-agent", "cliwatch", "evaluation"],
|
|
6
6
|
"license": "MIT",
|
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
},
|
|
15
15
|
"files": [
|
|
16
16
|
"dist",
|
|
17
|
-
"task_suites",
|
|
18
17
|
"LICENSE",
|
|
19
18
|
"CHANGELOG.md"
|
|
20
19
|
],
|
|
@@ -35,6 +34,7 @@
|
|
|
35
34
|
"test": "vitest"
|
|
36
35
|
},
|
|
37
36
|
"dependencies": {
|
|
37
|
+
"@ai-sdk/google": "^3.0.0",
|
|
38
38
|
"@hey-api/client-fetch": "^0.13.1",
|
|
39
39
|
"ai": "^6.0.18",
|
|
40
40
|
"yaml": "^2.7.0",
|
package/dist/help-loader.d.ts
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Loads CLI help text from cached JSON files or live CLI execution.
|
|
3
|
-
*
|
|
4
|
-
* Cached mode reads from help_cache/<cli>.json, produced by
|
|
5
|
-
* `audit-worker --dry-run --output-help --output <file>`.
|
|
6
|
-
*
|
|
7
|
-
* Live mode shells out to the CLI's --help.
|
|
8
|
-
*/
|
|
9
|
-
import type { HelpCache } from './models.js';
|
|
10
|
-
export declare function loadHelpFromCache(cacheDir: string, cliName: string): Promise<HelpCache | null>;
|
|
11
|
-
export declare function listAvailableCaches(cacheDir: string): Promise<string[]>;
|
|
12
|
-
/**
|
|
13
|
-
* Run `cli --help` and return the output as a string.
|
|
14
|
-
* Single top-level invocation — no subcommand crawling.
|
|
15
|
-
*/
|
|
16
|
-
export declare function loadTopLevelHelp(cliName: string): Promise<string | null>;
|
|
17
|
-
//# sourceMappingURL=help-loader.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"help-loader.d.ts","sourceRoot":"","sources":["../src/help-loader.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAKH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAE7C,wBAAsB,iBAAiB,CACrC,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,SAAS,GAAG,IAAI,CAAC,CA6B3B;AAED,wBAAsB,mBAAmB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CAS7E;AAED;;;GAGG;AACH,wBAAsB,gBAAgB,CACpC,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAWxB"}
|
package/dist/help-loader.js
DELETED
|
@@ -1,65 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Loads CLI help text from cached JSON files or live CLI execution.
|
|
3
|
-
*
|
|
4
|
-
* Cached mode reads from help_cache/<cli>.json, produced by
|
|
5
|
-
* `audit-worker --dry-run --output-help --output <file>`.
|
|
6
|
-
*
|
|
7
|
-
* Live mode shells out to the CLI's --help.
|
|
8
|
-
*/
|
|
9
|
-
import { readFile, readdir } from 'node:fs/promises';
|
|
10
|
-
import { join } from 'node:path';
|
|
11
|
-
import { execFile } from 'node:child_process';
|
|
12
|
-
export async function loadHelpFromCache(cacheDir, cliName) {
|
|
13
|
-
try {
|
|
14
|
-
const filePath = join(cacheDir, `${cliName}.json`);
|
|
15
|
-
const raw = await readFile(filePath, 'utf-8');
|
|
16
|
-
const data = JSON.parse(raw);
|
|
17
|
-
// Support both formats: direct HelpCache or audit-worker output array
|
|
18
|
-
if (Array.isArray(data)) {
|
|
19
|
-
const entry = data.find((d) => d.cli_name === cliName);
|
|
20
|
-
if (entry?.help_texts) {
|
|
21
|
-
return {
|
|
22
|
-
cli_name: entry.cli_name,
|
|
23
|
-
help_texts: entry.help_texts,
|
|
24
|
-
version: entry.version,
|
|
25
|
-
};
|
|
26
|
-
}
|
|
27
|
-
return null;
|
|
28
|
-
}
|
|
29
|
-
if (data.help_texts) {
|
|
30
|
-
return data;
|
|
31
|
-
}
|
|
32
|
-
return null;
|
|
33
|
-
}
|
|
34
|
-
catch {
|
|
35
|
-
return null;
|
|
36
|
-
}
|
|
37
|
-
}
|
|
38
|
-
export async function listAvailableCaches(cacheDir) {
|
|
39
|
-
try {
|
|
40
|
-
const files = await readdir(cacheDir);
|
|
41
|
-
return files
|
|
42
|
-
.filter((f) => f.endsWith('.json'))
|
|
43
|
-
.map((f) => f.replace(/\.json$/, ''));
|
|
44
|
-
}
|
|
45
|
-
catch {
|
|
46
|
-
return [];
|
|
47
|
-
}
|
|
48
|
-
}
|
|
49
|
-
/**
|
|
50
|
-
* Run `cli --help` and return the output as a string.
|
|
51
|
-
* Single top-level invocation — no subcommand crawling.
|
|
52
|
-
*/
|
|
53
|
-
export async function loadTopLevelHelp(cliName) {
|
|
54
|
-
return new Promise((resolve) => {
|
|
55
|
-
execFile(cliName, ['--help'], { timeout: 30_000 }, (err, stdout, stderr) => {
|
|
56
|
-
if (err) {
|
|
57
|
-
resolve(null);
|
|
58
|
-
return;
|
|
59
|
-
}
|
|
60
|
-
const output = (stdout || stderr).trim();
|
|
61
|
-
resolve(output || null);
|
|
62
|
-
});
|
|
63
|
-
});
|
|
64
|
-
}
|
|
65
|
-
//# sourceMappingURL=help-loader.js.map
|
package/dist/help-loader.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"help-loader.js","sourceRoot":"","sources":["../src/help-loader.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,kBAAkB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAG9C,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,QAAgB,EAChB,OAAe;IAEf,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,GAAG,OAAO,OAAO,CAAC,CAAC;QACnD,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC9C,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAE7B,sEAAsE;QACtE,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;YACxB,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CACrB,CAAC,CAAwB,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,OAAO,CACrD,CAAC;YACF,IAAI,KAAK,EAAE,UAAU,EAAE,CAAC;gBACtB,OAAO;oBACL,QAAQ,EAAE,KAAK,CAAC,QAAQ;oBACxB,UAAU,EAAE,KAAK,CAAC,UAAU;oBAC5B,OAAO,EAAE,KAAK,CAAC,OAAO;iBACvB,CAAC;YACJ,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,OAAO,IAAiB,CAAC;QAC3B,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,QAAgB;IACxD,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,QAAQ,CAAC,CAAC;QACtC,OAAO,KAAK;aACT,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;aAClC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC,CAAC;IAC1C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,OAAe;IAEf,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC7B,QAAQ,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,EAAE;YACzE,IAAI,GAAG,EAAE,CAAC;gBACR,OAAO,CAAC,IAAI,CAAC,CAAC;gBACd,OAAO;YACT,CAAC;YACD,MAAM,MAAM,GAAG,CAAC,MAAM,IAAI,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;YACzC,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC"}
|
package/task_suites/curl.yaml
DELETED
|
@@ -1,138 +0,0 @@
|
|
|
1
|
-
cli: curl
|
|
2
|
-
version_command: "curl --version | head -1"
|
|
3
|
-
|
|
4
|
-
tasks:
|
|
5
|
-
# -- Easy --------------------------------------------------------------------
|
|
6
|
-
- id: simple-get
|
|
7
|
-
intent: "Fetch the contents of https://httpbin.org/get"
|
|
8
|
-
difficulty: easy
|
|
9
|
-
category: query
|
|
10
|
-
max_turns: 2
|
|
11
|
-
assert:
|
|
12
|
-
- ran: "curl.*httpbin.org/get"
|
|
13
|
-
- exit_code: 0
|
|
14
|
-
- output_contains: "origin"
|
|
15
|
-
|
|
16
|
-
- id: head-request
|
|
17
|
-
intent: "Send a HEAD request to https://httpbin.org/get and show the response headers"
|
|
18
|
-
difficulty: easy
|
|
19
|
-
category: query
|
|
20
|
-
max_turns: 3
|
|
21
|
-
assert:
|
|
22
|
-
- ran: "curl"
|
|
23
|
-
- ran: "-I|--head"
|
|
24
|
-
- output_contains: "HTTP"
|
|
25
|
-
|
|
26
|
-
- id: download-file
|
|
27
|
-
intent: "Download https://httpbin.org/robots.txt and save it as robots.txt"
|
|
28
|
-
difficulty: easy
|
|
29
|
-
category: crud
|
|
30
|
-
max_turns: 3
|
|
31
|
-
assert:
|
|
32
|
-
- ran: "curl"
|
|
33
|
-
- ran: "-o|--output"
|
|
34
|
-
- file_exists: "robots.txt"
|
|
35
|
-
|
|
36
|
-
- id: follow-redirects
|
|
37
|
-
intent: "Fetch https://httpbin.org/redirect/2 and follow all redirects"
|
|
38
|
-
difficulty: easy
|
|
39
|
-
category: query
|
|
40
|
-
max_turns: 3
|
|
41
|
-
assert:
|
|
42
|
-
- ran: "curl"
|
|
43
|
-
- ran: "-L|--location"
|
|
44
|
-
- exit_code: 0
|
|
45
|
-
|
|
46
|
-
# -- Medium ------------------------------------------------------------------
|
|
47
|
-
- id: post-json
|
|
48
|
-
intent: "Send a POST request to https://httpbin.org/post with JSON body {\"name\": \"bench\", \"version\": 1} and set the Content-Type header to application/json"
|
|
49
|
-
difficulty: medium
|
|
50
|
-
category: crud
|
|
51
|
-
max_turns: 5
|
|
52
|
-
assert:
|
|
53
|
-
- ran: "curl"
|
|
54
|
-
- ran: "-X POST|--request POST|-d|--data"
|
|
55
|
-
- ran: "Content-Type.*application/json"
|
|
56
|
-
- output_contains: "bench"
|
|
57
|
-
|
|
58
|
-
- id: custom-headers
|
|
59
|
-
intent: "Send a GET request to https://httpbin.org/headers with custom headers X-Request-ID: abc123 and Accept: application/xml"
|
|
60
|
-
difficulty: medium
|
|
61
|
-
category: query
|
|
62
|
-
max_turns: 5
|
|
63
|
-
assert:
|
|
64
|
-
- ran: "curl"
|
|
65
|
-
- ran: "-H|--header"
|
|
66
|
-
- ran: "X-Request-ID"
|
|
67
|
-
- output_contains: "abc123"
|
|
68
|
-
|
|
69
|
-
- id: basic-auth
|
|
70
|
-
intent: "Send a GET request to https://httpbin.org/basic-auth/user/passwd using basic authentication with username 'user' and password 'passwd'"
|
|
71
|
-
difficulty: medium
|
|
72
|
-
category: auth
|
|
73
|
-
max_turns: 5
|
|
74
|
-
assert:
|
|
75
|
-
- ran: "curl"
|
|
76
|
-
- ran: "-u|--user|user:passwd"
|
|
77
|
-
- output_contains: "authenticated"
|
|
78
|
-
|
|
79
|
-
- id: verbose-timing
|
|
80
|
-
intent: "Fetch https://httpbin.org/get and show the total time taken for the request using curl's write-out feature"
|
|
81
|
-
difficulty: medium
|
|
82
|
-
category: output
|
|
83
|
-
max_turns: 5
|
|
84
|
-
assert:
|
|
85
|
-
- ran: "curl"
|
|
86
|
-
- ran: "-w|--write-out|time_total"
|
|
87
|
-
|
|
88
|
-
# -- Hard --------------------------------------------------------------------
|
|
89
|
-
- id: put-with-file
|
|
90
|
-
intent: "Upload the file /tmp/bench-workspace/data.json to https://httpbin.org/put using a PUT request with Content-Type application/json"
|
|
91
|
-
difficulty: hard
|
|
92
|
-
category: crud
|
|
93
|
-
setup:
|
|
94
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
95
|
-
- "echo '{\"key\": \"value\"}' > /tmp/bench-workspace/data.json"
|
|
96
|
-
max_turns: 7
|
|
97
|
-
assert:
|
|
98
|
-
- ran: "curl"
|
|
99
|
-
- ran: "-X PUT|--request PUT|-T"
|
|
100
|
-
- ran: "data.json"
|
|
101
|
-
- output_contains: "key"
|
|
102
|
-
|
|
103
|
-
- id: retry-with-timeout
|
|
104
|
-
intent: "Fetch https://httpbin.org/delay/1 with a 5 second timeout, retry 3 times on failure, and save the response to response.json"
|
|
105
|
-
difficulty: hard
|
|
106
|
-
category: crud
|
|
107
|
-
max_turns: 7
|
|
108
|
-
assert:
|
|
109
|
-
- ran: "curl"
|
|
110
|
-
- ran: "--retry.*3"
|
|
111
|
-
- ran: "--max-time|--connect-timeout|-m"
|
|
112
|
-
- ran: "-o|--output"
|
|
113
|
-
|
|
114
|
-
- id: multipart-upload
|
|
115
|
-
intent: "Send a multipart/form-data POST to https://httpbin.org/post with a field 'username' set to 'admin' and a file field 'config' uploading /tmp/bench-workspace/app.conf"
|
|
116
|
-
difficulty: hard
|
|
117
|
-
category: crud
|
|
118
|
-
setup:
|
|
119
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
120
|
-
- "echo 'port=8080' > /tmp/bench-workspace/app.conf"
|
|
121
|
-
max_turns: 7
|
|
122
|
-
assert:
|
|
123
|
-
- ran: "curl"
|
|
124
|
-
- ran: "-F|--form"
|
|
125
|
-
- ran: "username.*admin"
|
|
126
|
-
- ran: "config.*@.*app.conf"
|
|
127
|
-
- output_contains: "admin"
|
|
128
|
-
|
|
129
|
-
- id: conditional-request
|
|
130
|
-
intent: "Fetch https://httpbin.org/cache and use conditional headers: set If-None-Match to '12345' and If-Modified-Since to 'Thu, 01 Jan 2025 00:00:00 GMT'. Show the response status code."
|
|
131
|
-
difficulty: hard
|
|
132
|
-
category: query
|
|
133
|
-
max_turns: 7
|
|
134
|
-
assert:
|
|
135
|
-
- ran: "curl"
|
|
136
|
-
- ran: "If-None-Match"
|
|
137
|
-
- ran: "If-Modified-Since"
|
|
138
|
-
- ran: "-w|--write-out|-I|--head|-v|--verbose"
|
package/task_suites/docker.yaml
DELETED
|
@@ -1,163 +0,0 @@
|
|
|
1
|
-
cli: docker
|
|
2
|
-
|
|
3
|
-
tasks:
|
|
4
|
-
# -- Easy --------------------------------------------------------------------
|
|
5
|
-
- id: list-containers
|
|
6
|
-
intent: "List all running containers"
|
|
7
|
-
difficulty: easy
|
|
8
|
-
category: query
|
|
9
|
-
max_turns: 3
|
|
10
|
-
assert:
|
|
11
|
-
- ran: "docker ps"
|
|
12
|
-
- exit_code: 0
|
|
13
|
-
|
|
14
|
-
- id: list-images
|
|
15
|
-
intent: "List all local Docker images"
|
|
16
|
-
difficulty: easy
|
|
17
|
-
category: query
|
|
18
|
-
max_turns: 3
|
|
19
|
-
assert:
|
|
20
|
-
- ran: "docker image"
|
|
21
|
-
- exit_code: 0
|
|
22
|
-
|
|
23
|
-
- id: pull-image
|
|
24
|
-
intent: "Pull the latest nginx image from Docker Hub"
|
|
25
|
-
difficulty: easy
|
|
26
|
-
category: crud
|
|
27
|
-
max_turns: 3
|
|
28
|
-
assert:
|
|
29
|
-
- ran: "docker pull.*nginx"
|
|
30
|
-
- verify:
|
|
31
|
-
run: "docker images nginx --format '{{.Repository}}'"
|
|
32
|
-
output_contains: "nginx"
|
|
33
|
-
|
|
34
|
-
- id: view-logs
|
|
35
|
-
intent: "Show the last 100 lines of logs from the container 'api' and follow new output"
|
|
36
|
-
difficulty: easy
|
|
37
|
-
category: query
|
|
38
|
-
setup:
|
|
39
|
-
- "docker run -d --name api alpine sh -c 'for i in $(seq 1 200); do echo line-$i; done; sleep 3600'"
|
|
40
|
-
max_turns: 3
|
|
41
|
-
assert:
|
|
42
|
-
- ran: "docker logs"
|
|
43
|
-
- ran: "--tail"
|
|
44
|
-
|
|
45
|
-
- id: stop-container
|
|
46
|
-
intent: "Stop the container named 'web-server'"
|
|
47
|
-
difficulty: easy
|
|
48
|
-
category: crud
|
|
49
|
-
setup:
|
|
50
|
-
- "docker run -d --name web-server alpine sleep 3600"
|
|
51
|
-
max_turns: 3
|
|
52
|
-
assert:
|
|
53
|
-
- ran: "docker stop.*web-server"
|
|
54
|
-
- verify:
|
|
55
|
-
run: "docker ps --filter name=web-server --format '{{.Names}}'"
|
|
56
|
-
output_equals: ""
|
|
57
|
-
|
|
58
|
-
# -- Medium ------------------------------------------------------------------
|
|
59
|
-
- id: run-detached
|
|
60
|
-
intent: "Run an nginx container in the background, mapping host port 8080 to container port 80, named 'web'"
|
|
61
|
-
difficulty: medium
|
|
62
|
-
category: crud
|
|
63
|
-
max_turns: 5
|
|
64
|
-
assert:
|
|
65
|
-
- ran: "docker run"
|
|
66
|
-
- ran: "-d"
|
|
67
|
-
- ran: "--name.*web"
|
|
68
|
-
- verify:
|
|
69
|
-
run: "docker ps --filter name=web --format '{{.Names}}'"
|
|
70
|
-
output_contains: "web"
|
|
71
|
-
|
|
72
|
-
- id: build-with-tag
|
|
73
|
-
intent: "Build a Docker image from ./Dockerfile and tag it as myapp:v2"
|
|
74
|
-
difficulty: medium
|
|
75
|
-
category: crud
|
|
76
|
-
setup:
|
|
77
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
78
|
-
- "printf 'FROM alpine:latest\nRUN echo hello' > /tmp/bench-workspace/Dockerfile"
|
|
79
|
-
max_turns: 5
|
|
80
|
-
assert:
|
|
81
|
-
- ran: "docker build"
|
|
82
|
-
- ran: "myapp:v2"
|
|
83
|
-
- verify:
|
|
84
|
-
run: "docker images myapp:v2 --format '{{.Repository}}:{{.Tag}}'"
|
|
85
|
-
output_contains: "myapp:v2"
|
|
86
|
-
|
|
87
|
-
- id: exec-into-container
|
|
88
|
-
intent: "Run the command 'cat /etc/os-release' inside the running container 'web'"
|
|
89
|
-
difficulty: medium
|
|
90
|
-
category: crud
|
|
91
|
-
setup:
|
|
92
|
-
- "docker run -d --name web alpine sleep 3600"
|
|
93
|
-
max_turns: 5
|
|
94
|
-
assert:
|
|
95
|
-
- ran: "docker exec.*web.*cat /etc/os-release"
|
|
96
|
-
- exit_code: 0
|
|
97
|
-
|
|
98
|
-
- id: inspect-json
|
|
99
|
-
intent: "Get the IP address of the container 'web' using docker inspect with a format template"
|
|
100
|
-
difficulty: medium
|
|
101
|
-
category: output
|
|
102
|
-
setup:
|
|
103
|
-
- "docker run -d --name web alpine sleep 3600"
|
|
104
|
-
max_turns: 5
|
|
105
|
-
assert:
|
|
106
|
-
- ran: "docker inspect"
|
|
107
|
-
- ran: "--format"
|
|
108
|
-
|
|
109
|
-
- id: prune-all
|
|
110
|
-
intent: "Remove all stopped containers, unused networks, dangling images, and build cache without prompting for confirmation"
|
|
111
|
-
difficulty: medium
|
|
112
|
-
category: crud
|
|
113
|
-
max_turns: 5
|
|
114
|
-
assert:
|
|
115
|
-
- ran: "docker system prune"
|
|
116
|
-
- ran: "--force|-f"
|
|
117
|
-
|
|
118
|
-
# -- Hard --------------------------------------------------------------------
|
|
119
|
-
- id: run-complex
|
|
120
|
-
intent: "Run a postgres:16 container named 'db' in the background with environment variables POSTGRES_USER=admin and POSTGRES_PASSWORD=secret, mount a volume 'pgdata' to /var/lib/postgresql/data, and connect it to the network 'backend'"
|
|
121
|
-
difficulty: hard
|
|
122
|
-
category: crud
|
|
123
|
-
setup:
|
|
124
|
-
- "docker network create backend || true"
|
|
125
|
-
max_turns: 7
|
|
126
|
-
assert:
|
|
127
|
-
- ran: "docker run"
|
|
128
|
-
- ran: "--name.*db"
|
|
129
|
-
- ran: "POSTGRES_USER=admin"
|
|
130
|
-
- ran: "POSTGRES_PASSWORD=secret"
|
|
131
|
-
- verify:
|
|
132
|
-
run: "docker ps --filter name=db --format '{{.Names}}'"
|
|
133
|
-
output_contains: "db"
|
|
134
|
-
|
|
135
|
-
- id: compose-up
|
|
136
|
-
intent: "Start all services defined in docker-compose.yml in detached mode and rebuild any changed images"
|
|
137
|
-
difficulty: hard
|
|
138
|
-
category: workflow
|
|
139
|
-
setup:
|
|
140
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
141
|
-
- "printf 'services:\n web:\n image: alpine\n command: sleep 3600\n' > /tmp/bench-workspace/docker-compose.yml"
|
|
142
|
-
max_turns: 7
|
|
143
|
-
assert:
|
|
144
|
-
- ran: "docker compose.*up"
|
|
145
|
-
- ran: "-d|--detach"
|
|
146
|
-
- ran: "--build"
|
|
147
|
-
|
|
148
|
-
- id: multi-stage-debug
|
|
149
|
-
intent: "Build only the 'builder' stage from /tmp/bench-workspace/Dockerfile, tag it as 'myapp:debug', and don't use cache"
|
|
150
|
-
difficulty: hard
|
|
151
|
-
category: crud
|
|
152
|
-
setup:
|
|
153
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
154
|
-
- "printf 'FROM alpine:latest AS builder\nRUN echo building\nFROM alpine:latest\nCOPY --from=builder / /\n' > /tmp/bench-workspace/Dockerfile"
|
|
155
|
-
max_turns: 7
|
|
156
|
-
assert:
|
|
157
|
-
- ran: "docker build"
|
|
158
|
-
- ran: "--target.*builder"
|
|
159
|
-
- ran: "myapp:debug"
|
|
160
|
-
- ran: "--no-cache"
|
|
161
|
-
- verify:
|
|
162
|
-
run: "docker images myapp:debug --format '{{.Repository}}:{{.Tag}}'"
|
|
163
|
-
output_contains: "myapp:debug"
|