@cliwatch/cli-bench 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/CHANGELOG.md +15 -0
  2. package/README.md +3 -0
  3. package/dist/assertions.d.ts +1 -1
  4. package/dist/assertions.d.ts.map +1 -1
  5. package/dist/assertions.js +6 -6
  6. package/dist/assertions.js.map +1 -1
  7. package/dist/client/index.d.ts +1 -1
  8. package/dist/client/index.d.ts.map +1 -1
  9. package/dist/client/types.gen.d.ts +124 -86
  10. package/dist/client/types.gen.d.ts.map +1 -1
  11. package/dist/client/zod.gen.d.ts +57 -36
  12. package/dist/client/zod.gen.d.ts.map +1 -1
  13. package/dist/client/zod.gen.js +84 -52
  14. package/dist/client/zod.gen.js.map +1 -1
  15. package/dist/config.d.ts +2 -0
  16. package/dist/config.d.ts.map +1 -1
  17. package/dist/config.js +9 -0
  18. package/dist/config.js.map +1 -1
  19. package/dist/exec.d.ts +2 -0
  20. package/dist/exec.d.ts.map +1 -1
  21. package/dist/exec.js +6 -2
  22. package/dist/exec.js.map +1 -1
  23. package/dist/github-comment.d.ts +16 -0
  24. package/dist/github-comment.d.ts.map +1 -0
  25. package/dist/github-comment.js +90 -0
  26. package/dist/github-comment.js.map +1 -0
  27. package/dist/index.d.ts +2 -3
  28. package/dist/index.d.ts.map +1 -1
  29. package/dist/index.js +29 -27
  30. package/dist/index.js.map +1 -1
  31. package/dist/models.d.ts +8 -0
  32. package/dist/models.d.ts.map +1 -1
  33. package/dist/project.d.ts +11 -2
  34. package/dist/project.d.ts.map +1 -1
  35. package/dist/project.js +22 -8
  36. package/dist/project.js.map +1 -1
  37. package/dist/providers.d.ts +9 -7
  38. package/dist/providers.d.ts.map +1 -1
  39. package/dist/providers.js +26 -8
  40. package/dist/providers.js.map +1 -1
  41. package/dist/redact.d.ts +20 -6
  42. package/dist/redact.d.ts.map +1 -1
  43. package/dist/redact.js +68 -13
  44. package/dist/redact.js.map +1 -1
  45. package/dist/runner.d.ts +31 -1
  46. package/dist/runner.d.ts.map +1 -1
  47. package/dist/runner.js +84 -78
  48. package/dist/runner.js.map +1 -1
  49. package/dist/schemas.d.ts +16 -0
  50. package/dist/schemas.d.ts.map +1 -1
  51. package/dist/schemas.js +7 -0
  52. package/dist/schemas.js.map +1 -1
  53. package/dist/suite-generator.d.ts.map +1 -1
  54. package/dist/suite-generator.js +63 -11
  55. package/dist/suite-generator.js.map +1 -1
  56. package/package.json +2 -2
  57. package/task_suites/curl.yaml +0 -138
  58. package/task_suites/docker.yaml +0 -163
  59. package/task_suites/gh.yaml +0 -118
  60. package/task_suites/jq.yaml +0 -172
  61. package/task_suites/kubectl.yaml +0 -74
package/dist/schemas.d.ts CHANGED
@@ -70,6 +70,8 @@ export declare const TaskSchema: z.ZodObject<{
70
70
  }, z.core.$strip>;
71
71
  }, z.core.$strip>]>>;
72
72
  setup: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
73
+ cleanup: z.ZodOptional<z.ZodArray<z.ZodString>>;
74
+ env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
73
75
  max_turns: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
74
76
  difficulty: z.ZodOptional<z.ZodEnum<{
75
77
  easy: "easy";
@@ -78,6 +80,7 @@ export declare const TaskSchema: z.ZodObject<{
78
80
  }>>;
79
81
  category: z.ZodOptional<z.ZodString>;
80
82
  repeat: z.ZodOptional<z.ZodNumber>;
83
+ tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
81
84
  }, z.core.$strip>;
82
85
  export declare const TaskSuiteSchema: z.ZodObject<{
83
86
  cli: z.ZodString;
@@ -122,6 +125,8 @@ export declare const TaskSuiteSchema: z.ZodObject<{
122
125
  }, z.core.$strip>;
123
126
  }, z.core.$strip>]>>;
124
127
  setup: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
128
+ cleanup: z.ZodOptional<z.ZodArray<z.ZodString>>;
129
+ env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
125
130
  max_turns: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
126
131
  difficulty: z.ZodOptional<z.ZodEnum<{
127
132
  easy: "easy";
@@ -130,6 +135,7 @@ export declare const TaskSuiteSchema: z.ZodObject<{
130
135
  }>>;
131
136
  category: z.ZodOptional<z.ZodString>;
132
137
  repeat: z.ZodOptional<z.ZodNumber>;
138
+ tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
133
139
  }, z.core.$strip>>;
134
140
  }, z.core.$strip>;
135
141
  /** Schema for a task file referenced via file:// — plain array of tasks. */
@@ -169,6 +175,8 @@ export declare const TaskFileSchema: z.ZodArray<z.ZodObject<{
169
175
  }, z.core.$strip>;
170
176
  }, z.core.$strip>]>>;
171
177
  setup: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
178
+ cleanup: z.ZodOptional<z.ZodArray<z.ZodString>>;
179
+ env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
172
180
  max_turns: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
173
181
  difficulty: z.ZodOptional<z.ZodEnum<{
174
182
  easy: "easy";
@@ -177,6 +185,7 @@ export declare const TaskFileSchema: z.ZodArray<z.ZodObject<{
177
185
  }>>;
178
186
  category: z.ZodOptional<z.ZodString>;
179
187
  repeat: z.ZodOptional<z.ZodNumber>;
188
+ tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
180
189
  }, z.core.$strip>>;
181
190
  export declare const ThresholdsSchema: z.ZodOptional<z.ZodObject<{
182
191
  default: z.ZodOptional<z.ZodNumber>;
@@ -208,6 +217,7 @@ export declare const ConfigFileSchema: z.ZodObject<{
208
217
  backend_url: z.ZodOptional<z.ZodString>;
209
218
  repeat: z.ZodOptional<z.ZodNumber>;
210
219
  redact_env: z.ZodOptional<z.ZodArray<z.ZodString>>;
220
+ redact_patterns: z.ZodOptional<z.ZodArray<z.ZodString>>;
211
221
  thresholds: z.ZodOptional<z.ZodObject<{
212
222
  default: z.ZodOptional<z.ZodNumber>;
213
223
  models: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
@@ -217,6 +227,9 @@ export declare const ConfigFileSchema: z.ZodObject<{
217
227
  informational: "informational";
218
228
  }>>>;
219
229
  }, z.core.$strip>>;
230
+ env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
231
+ setup: z.ZodOptional<z.ZodArray<z.ZodString>>;
232
+ cleanup: z.ZodOptional<z.ZodArray<z.ZodString>>;
220
233
  tasks: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
221
234
  id: z.ZodString;
222
235
  intent: z.ZodString;
@@ -253,6 +266,8 @@ export declare const ConfigFileSchema: z.ZodObject<{
253
266
  }, z.core.$strip>;
254
267
  }, z.core.$strip>]>>;
255
268
  setup: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
269
+ cleanup: z.ZodOptional<z.ZodArray<z.ZodString>>;
270
+ env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
256
271
  max_turns: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
257
272
  difficulty: z.ZodOptional<z.ZodEnum<{
258
273
  easy: "easy";
@@ -261,6 +276,7 @@ export declare const ConfigFileSchema: z.ZodObject<{
261
276
  }>>;
262
277
  category: z.ZodOptional<z.ZodString>;
263
278
  repeat: z.ZodOptional<z.ZodNumber>;
279
+ tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
264
280
  }, z.core.$strip>, z.ZodString]>>;
265
281
  }, z.core.$strip>;
266
282
  //# sourceMappingURL=schemas.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"schemas.d.ts","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,eAAO,MAAM,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;mBAW1B,CAAC;AAEH,eAAO,MAAM,UAAU;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBASrB,CAAC;AAEH,eAAO,MAAM,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAQ1B,CAAC;AAEH,4EAA4E;AAC5E,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kBAA6B,CAAC;AAEzD,eAAO,MAAM,gBAAgB;;;;;;;;kBAKhB,CAAC;AAEd,2DAA2D;AAC3D,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAyB3B,CAAC"}
1
+ {"version":3,"file":"schemas.d.ts","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,eAAO,MAAM,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;mBAW1B,CAAC;AAEH,eAAO,MAAM,UAAU;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAYrB,CAAC;AAEH,eAAO,MAAM,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAQ1B,CAAC;AAEH,4EAA4E;AAC5E,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kBAA6B,CAAC;AAEzD,eAAO,MAAM,gBAAgB;;;;;;;;kBAKhB,CAAC;AAEd,2DAA2D;AAC3D,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBA6B3B,CAAC"}
package/dist/schemas.js CHANGED
@@ -19,10 +19,13 @@ export const TaskSchema = z.object({
19
19
  intent: z.string(),
20
20
  assert: z.array(AssertionSchema).min(1),
21
21
  setup: z.array(z.string()).optional().default([]),
22
+ cleanup: z.array(z.string()).optional(),
23
+ env: z.record(z.string(), z.string()).optional(),
22
24
  max_turns: z.number().int().min(1).max(20).optional().default(5),
23
25
  difficulty: z.enum(['easy', 'medium', 'hard']).optional(),
24
26
  category: z.string().optional(),
25
27
  repeat: z.number().int().min(1).max(100).optional(),
28
+ tags: z.array(z.string()).optional(),
26
29
  });
27
30
  export const TaskSuiteSchema = z.object({
28
31
  cli: z.string(),
@@ -58,7 +61,11 @@ export const ConfigFileSchema = z.object({
58
61
  backend_url: z.string().optional(),
59
62
  repeat: z.number().int().min(1).max(100).optional(),
60
63
  redact_env: z.array(z.string()).optional(),
64
+ redact_patterns: z.array(z.string()).optional(),
61
65
  thresholds: ThresholdsSchema,
66
+ env: z.record(z.string(), z.string()).optional(),
67
+ setup: z.array(z.string()).optional(),
68
+ cleanup: z.array(z.string()).optional(),
62
69
  tasks: z.array(z.union([
63
70
  TaskSchema,
64
71
  z.string().refine((s) => s.startsWith('file://'), {
@@ -1 +1 @@
1
- {"version":3,"file":"schemas.js","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACzC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACvC,CAAC,CAAC,MAAM,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACxC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACnC,CAAC,CAAC,MAAM,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;IAC7E,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IAC7B,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACjC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;IAClH,CAAC,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC;IACjD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAChE,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,QAAQ,EAAE;IACzD,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;CACpD,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CAClC,CAAC,CAAC;AAEH,4EAA4E;AAC5E,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;AAEzD,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IAC9C,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,EAAE;IACnE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAC3D,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,OAAO,CAAC;CACzE,CAAC,CAAC,QAAQ,EAAE,CAAC;AAEd,2DAA2D;AAC3D,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACzC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACpC,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;IAC/C,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC9B,MAAM,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE;IACtD,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IACnD,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC1C,UAAU,EAAE,gBAAgB;IAC5B,KAAK,EAAE,CAAC,CAAC,KAAK,CACZ,CAAC,CAAC,KAAK,CAAC;QACN,UAAU;QACV,CAAC,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE;YAChD,OAAO,EAAE,yCAAyC;SACnD,CAAC;KACH,CAAC,CACH,CAAC,GAAG,CAAC,CAAC,CAAC;CACT,CAAC,CAAC"}
1
+ {"version":3,"file":"schemas.js","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACzC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACvC,CAAC,CAAC,MAAM,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACxC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACnC,CAAC,CAAC,MAAM,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;IAC7E,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IAC7B,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACjC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;IAClH,CAAC,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC;IACjD,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,GAAG,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAChD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAChE,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,QAAQ,EAAE;IACzD,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IACnD,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;CACrC,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CAClC,CAAC,CAAC;AAEH,4EAA4E;AAC5E,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;AAEzD,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IAC9C,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,EAAE;IACnE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAC3D,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,OAAO,CAAC;CACzE,CAAC,CAAC,QAAQ,EAAE,CAAC;AAEd,2DAA2D;AAC3D,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACzC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACpC,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;IAC/C,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC9B,MAAM,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE;IACtD,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IACnD,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC1C,eAAe,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC/C,UAAU,EAAE,gBAAgB;IAC5B,GAAG,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAChD,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACrC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CACZ,CAAC,CAAC,KAAK,CAAC;QACN,UAAU;QACV,CAAC,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE;YAChD,OAAO,EAAE,yCAAyC;SACnD,CAAC;KACH,CAAC,CACH,CAAC,GAAG,CAAC,CAAC,CAAC;CACT,CAAC,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"suite-generator.d.ts","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAwCH,wBAAsB,aAAa,CACjC,OAAO,EAAE,MAAM,EACf,YAAY,EAAE,MAAM,GACnB,OAAO,CAAC,MAAM,CAAC,CAiEjB"}
1
+ {"version":3,"file":"suite-generator.d.ts","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAmGH,wBAAsB,aAAa,CACjC,OAAO,EAAE,MAAM,EACf,YAAY,EAAE,MAAM,GACnB,OAAO,CAAC,MAAM,CAAC,CAwDjB"}
@@ -4,8 +4,6 @@
4
4
  * Takes CLI name + help text and produces a task suite with
5
5
  * assert-based validation.
6
6
  */
7
- import { readFile } from 'node:fs/promises';
8
- import { join } from 'node:path';
9
7
  import { generateText } from 'ai';
10
8
  import { gateway } from 'ai';
11
9
  import { loadHelpFromCache, loadTopLevelHelp } from './help-loader.js';
@@ -39,6 +37,66 @@ Rules:
39
37
  - Focus on: local operations, file generation, formatting, config, help queries
40
38
  - Include setup commands to prepare the environment
41
39
  - Use realistic but safe values (no real credentials, no destructive operations)`;
40
+ const FEW_SHOT_EXAMPLE = `cli: docker
41
+ version_command: "docker --version"
42
+
43
+ providers:
44
+ - openai/gpt-5-nano
45
+ - google/gemini-2.5-flash-lite
46
+
47
+ context:
48
+ - zero-shot
49
+
50
+ concurrency: 3
51
+
52
+ system_prompt: |
53
+ You are working in a temporary empty directory.
54
+ Docker is installed and the daemon is running.
55
+ Complete each task using docker commands.
56
+ Use unique names/tags to avoid conflicts (prefix with 'bench-').
57
+
58
+ tasks:
59
+ - id: build-image
60
+ intent: "Create a Dockerfile for a simple Alpine-based image that prints 'hello from docker' when run. Build it with the tag 'bench-hello'."
61
+ difficulty: easy
62
+ category: build
63
+ max_turns: 5
64
+ assert:
65
+ - file_exists: "Dockerfile"
66
+ - ran: "docker build"
67
+ - verify:
68
+ run: "docker images bench-hello --format '{{.Repository}}'"
69
+ output_contains: "bench-hello"
70
+
71
+ - id: run-and-capture
72
+ intent: "Run the 'alpine' image with the command 'echo benchmark-test-output' and capture the output."
73
+ difficulty: easy
74
+ category: run
75
+ max_turns: 3
76
+ assert:
77
+ - ran: "docker run"
78
+ - output_contains: "benchmark-test-output"
79
+
80
+ - id: inspect-container
81
+ intent: "Run an alpine container named 'bench-inspect' in detached mode (sleep 300), then use docker inspect to show its IP address."
82
+ difficulty: medium
83
+ category: query
84
+ max_turns: 5
85
+ assert:
86
+ - ran: "docker run"
87
+ - ran: "docker inspect"
88
+ - exit_code: 0
89
+
90
+ - id: volume-mount
91
+ intent: "Create a file called 'data.txt' with content 'volume test'. Run an alpine container that mounts the current directory to /data and reads the file with 'cat /data/data.txt'."
92
+ difficulty: medium
93
+ category: volumes
94
+ max_turns: 5
95
+ setup:
96
+ - "echo 'volume test' > data.txt"
97
+ assert:
98
+ - ran: "docker run"
99
+ - output_contains: "volume test"`;
42
100
  export async function generateSuite(cliName, helpCacheDir) {
43
101
  // Load help text
44
102
  let helpCache = await loadHelpFromCache(helpCacheDir, cliName);
@@ -51,13 +109,7 @@ export async function generateSuite(cliName, helpCacheDir) {
51
109
  throw new Error(`No help text available for ${cliName}`);
52
110
  }
53
111
  }
54
- // Load example suite for few-shot
55
- const suiteDir = join(new URL('.', import.meta.url).pathname.replace(/\/src\/$/, '').replace(/\/dist\/$/, ''), 'task_suites');
56
- let dockerExample = '';
57
- try {
58
- dockerExample = await readFile(join(suiteDir, 'docker.yaml'), 'utf-8');
59
- }
60
- catch { /* ignore */ }
112
+ const fewShotExample = FEW_SHOT_EXAMPLE;
61
113
  // Build help text summary (truncate to fit context)
62
114
  const helpEntries = Object.entries(helpCache.help_texts);
63
115
  let helpSummary = '';
@@ -76,9 +128,9 @@ ${helpSummary}
76
128
 
77
129
  ## Example Task Suite
78
130
 
79
- ### docker.yaml
131
+ ### example cli-bench.yaml
80
132
  \`\`\`yaml
81
- ${dockerExample}
133
+ ${fewShotExample}
82
134
  \`\`\`
83
135
 
84
136
  ## Output
@@ -1 +1 @@
1
- {"version":3,"file":"suite-generator.js","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAC7B,OAAO,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AAEvE,MAAM,eAAe,GAAG,oCAAoC,CAAC;AAE7D,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;iFA4B2D,CAAC;AAElF,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAe,EACf,YAAoB;IAEpB,iBAAiB;IACjB,IAAI,SAAS,GAAG,MAAM,iBAAiB,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAC/D,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,MAAM,gBAAgB,CAAC,OAAO,CAAC,CAAC;QACrD,IAAI,YAAY,EAAE,CAAC;YACjB,SAAS,GAAG,EAAE,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,EAAE,EAAE,EAAE,YAAY,EAAE,EAAE,CAAC;QACtE,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,KAAK,CAAC,8BAA8B,OAAO,EAAE,CAAC,CAAC;QAC3D,CAAC;IACH,CAAC;IAED,kCAAkC;IAClC,MAAM,QAAQ,GAAG,IAAI,CACnB,IAAI,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,EACvF,aAAa,CACd,CAAC;IAEF,IAAI,aAAa,GAAG,EAAE,CAAC;IACvB,IAAI,CAAC;QACH,aAAa,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,EAAE,OAAO,CAAC,CAAC;IACzE,CAAC;IAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;IAExB,oDAAoD;IACpD,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC;IACzD,IAAI,WAAW,GAAG,EAAE,CAAC;IACrB,KAAK,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,WAAW,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,GAAG,IAAI,QAAQ,CAAC;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;QACtC,WAAW,IAAI,SAAS,OAAO,IAAI,KAAK,gBAAgB,SAAS,IAAI,CAAC;QACtE,IAAI,WAAW,CAAC,MAAM,GAAG,KAAK;YAAE,MAAM;IACxC,CAAC;IAED,MAAM,MAAM,GAAG,uCAAuC,OAAO;;;;EAI7D,WAAW;;;;;;EAMX,aAAa;;;;;yEAK0D,OAAO,KAAK,CAAC;IAEpF,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC;QAChC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC;QAC/B,MAAM,EAAE,aAAa;QACrB,MAAM;QACN,WAAW,EAAE,GAAG;QAChB,eAAe,EAAE,IAAI;KACtB,CAAC,CAAC;IAEH,2DAA2D;IAC3D,IAAI,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;IAC9B,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
1
+ {"version":3,"file":"suite-generator.js","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAC7B,OAAO,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AAEvE,MAAM,eAAe,GAAG,oCAAoC,CAAC;AAE7D,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;iFA4B2D,CAAC;AAElF,MAAM,gBAAgB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;uCA2Dc,CAAC;AAExC,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAe,EACf,YAAoB;IAEpB,iBAAiB;IACjB,IAAI,SAAS,GAAG,MAAM,iBAAiB,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAC/D,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,MAAM,gBAAgB,CAAC,OAAO,CAAC,CAAC;QACrD,IAAI,YAAY,EAAE,CAAC;YACjB,SAAS,GAAG,EAAE,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,EAAE,EAAE,EAAE,YAAY,EAAE,EAAE,CAAC;QACtE,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,KAAK,CAAC,8BAA8B,OAAO,EAAE,CAAC,CAAC;QAC3D,CAAC;IACH,CAAC;IAED,MAAM,cAAc,GAAG,gBAAgB,CAAC;IAExC,oDAAoD;IACpD,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC;IACzD,IAAI,WAAW,GAAG,EAAE,CAAC;IACrB,KAAK,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,WAAW,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,GAAG,IAAI,QAAQ,CAAC;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;QACtC,WAAW,IAAI,SAAS,OAAO,IAAI,KAAK,gBAAgB,SAAS,IAAI,CAAC;QACtE,IAAI,WAAW,CAAC,MAAM,GAAG,KAAK;YAAE,MAAM;IACxC,CAAC;IAED,MAAM,MAAM,GAAG,uCAAuC,OAAO;;;;EAI7D,WAAW;;;;;;EAMX,cAAc;;;;;yEAKyD,OAAO,KAAK,CAAC;IAEpF,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC;QAChC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC;QAC/B,MAAM,EAAE,aAAa;QACrB,MAAM;QACN,WAAW,EAAE,GAAG;QAChB,eAAe,EAAE,IAAI;KACtB,CAAC,CAAC;IAEH,2DAA2D;IAC3D,IAAI,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;IAC9B,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cliwatch/cli-bench",
3
- "version": "0.6.2",
3
+ "version": "0.7.0",
4
4
  "description": "LLM CLI agent testing framework — benchmark how well AI models use your CLI tool",
5
5
  "keywords": ["cli", "benchmark", "llm", "testing", "ai-agent", "cliwatch", "evaluation"],
6
6
  "license": "MIT",
@@ -14,7 +14,6 @@
14
14
  },
15
15
  "files": [
16
16
  "dist",
17
- "task_suites",
18
17
  "LICENSE",
19
18
  "CHANGELOG.md"
20
19
  ],
@@ -35,6 +34,7 @@
35
34
  "test": "vitest"
36
35
  },
37
36
  "dependencies": {
37
+ "@ai-sdk/google": "^3.0.0",
38
38
  "@hey-api/client-fetch": "^0.13.1",
39
39
  "ai": "^6.0.18",
40
40
  "yaml": "^2.7.0",
@@ -1,138 +0,0 @@
1
- cli: curl
2
- version_command: "curl --version | head -1"
3
-
4
- tasks:
5
- # -- Easy --------------------------------------------------------------------
6
- - id: simple-get
7
- intent: "Fetch the contents of https://httpbin.org/get"
8
- difficulty: easy
9
- category: query
10
- max_turns: 2
11
- assert:
12
- - ran: "curl.*httpbin.org/get"
13
- - exit_code: 0
14
- - output_contains: "origin"
15
-
16
- - id: head-request
17
- intent: "Send a HEAD request to https://httpbin.org/get and show the response headers"
18
- difficulty: easy
19
- category: query
20
- max_turns: 3
21
- assert:
22
- - ran: "curl"
23
- - ran: "-I|--head"
24
- - output_contains: "HTTP"
25
-
26
- - id: download-file
27
- intent: "Download https://httpbin.org/robots.txt and save it as robots.txt"
28
- difficulty: easy
29
- category: crud
30
- max_turns: 3
31
- assert:
32
- - ran: "curl"
33
- - ran: "-o|--output"
34
- - file_exists: "robots.txt"
35
-
36
- - id: follow-redirects
37
- intent: "Fetch https://httpbin.org/redirect/2 and follow all redirects"
38
- difficulty: easy
39
- category: query
40
- max_turns: 3
41
- assert:
42
- - ran: "curl"
43
- - ran: "-L|--location"
44
- - exit_code: 0
45
-
46
- # -- Medium ------------------------------------------------------------------
47
- - id: post-json
48
- intent: "Send a POST request to https://httpbin.org/post with JSON body {\"name\": \"bench\", \"version\": 1} and set the Content-Type header to application/json"
49
- difficulty: medium
50
- category: crud
51
- max_turns: 5
52
- assert:
53
- - ran: "curl"
54
- - ran: "-X POST|--request POST|-d|--data"
55
- - ran: "Content-Type.*application/json"
56
- - output_contains: "bench"
57
-
58
- - id: custom-headers
59
- intent: "Send a GET request to https://httpbin.org/headers with custom headers X-Request-ID: abc123 and Accept: application/xml"
60
- difficulty: medium
61
- category: query
62
- max_turns: 5
63
- assert:
64
- - ran: "curl"
65
- - ran: "-H|--header"
66
- - ran: "X-Request-ID"
67
- - output_contains: "abc123"
68
-
69
- - id: basic-auth
70
- intent: "Send a GET request to https://httpbin.org/basic-auth/user/passwd using basic authentication with username 'user' and password 'passwd'"
71
- difficulty: medium
72
- category: auth
73
- max_turns: 5
74
- assert:
75
- - ran: "curl"
76
- - ran: "-u|--user|user:passwd"
77
- - output_contains: "authenticated"
78
-
79
- - id: verbose-timing
80
- intent: "Fetch https://httpbin.org/get and show the total time taken for the request using curl's write-out feature"
81
- difficulty: medium
82
- category: output
83
- max_turns: 5
84
- assert:
85
- - ran: "curl"
86
- - ran: "-w|--write-out|time_total"
87
-
88
- # -- Hard --------------------------------------------------------------------
89
- - id: put-with-file
90
- intent: "Upload the file /tmp/bench-workspace/data.json to https://httpbin.org/put using a PUT request with Content-Type application/json"
91
- difficulty: hard
92
- category: crud
93
- setup:
94
- - "mkdir -p /tmp/bench-workspace"
95
- - "echo '{\"key\": \"value\"}' > /tmp/bench-workspace/data.json"
96
- max_turns: 7
97
- assert:
98
- - ran: "curl"
99
- - ran: "-X PUT|--request PUT|-T"
100
- - ran: "data.json"
101
- - output_contains: "key"
102
-
103
- - id: retry-with-timeout
104
- intent: "Fetch https://httpbin.org/delay/1 with a 5 second timeout, retry 3 times on failure, and save the response to response.json"
105
- difficulty: hard
106
- category: crud
107
- max_turns: 7
108
- assert:
109
- - ran: "curl"
110
- - ran: "--retry.*3"
111
- - ran: "--max-time|--connect-timeout|-m"
112
- - ran: "-o|--output"
113
-
114
- - id: multipart-upload
115
- intent: "Send a multipart/form-data POST to https://httpbin.org/post with a field 'username' set to 'admin' and a file field 'config' uploading /tmp/bench-workspace/app.conf"
116
- difficulty: hard
117
- category: crud
118
- setup:
119
- - "mkdir -p /tmp/bench-workspace"
120
- - "echo 'port=8080' > /tmp/bench-workspace/app.conf"
121
- max_turns: 7
122
- assert:
123
- - ran: "curl"
124
- - ran: "-F|--form"
125
- - ran: "username.*admin"
126
- - ran: "config.*@.*app.conf"
127
- - output_contains: "admin"
128
-
129
- - id: conditional-request
130
- intent: "Fetch https://httpbin.org/cache and use conditional headers: set If-None-Match to '12345' and If-Modified-Since to 'Thu, 01 Jan 2025 00:00:00 GMT'. Show the response status code."
131
- difficulty: hard
132
- category: query
133
- max_turns: 7
134
- assert:
135
- - ran: "curl"
136
- - ran: "If-None-Match"
137
- - ran: "If-Modified-Since"
138
- - ran: "-w|--write-out|-I|--head|-v|--verbose"
@@ -1,163 +0,0 @@
1
- cli: docker
2
-
3
- tasks:
4
- # -- Easy --------------------------------------------------------------------
5
- - id: list-containers
6
- intent: "List all running containers"
7
- difficulty: easy
8
- category: query
9
- max_turns: 3
10
- assert:
11
- - ran: "docker ps"
12
- - exit_code: 0
13
-
14
- - id: list-images
15
- intent: "List all local Docker images"
16
- difficulty: easy
17
- category: query
18
- max_turns: 3
19
- assert:
20
- - ran: "docker image"
21
- - exit_code: 0
22
-
23
- - id: pull-image
24
- intent: "Pull the latest nginx image from Docker Hub"
25
- difficulty: easy
26
- category: crud
27
- max_turns: 3
28
- assert:
29
- - ran: "docker pull.*nginx"
30
- - verify:
31
- run: "docker images nginx --format '{{.Repository}}'"
32
- output_contains: "nginx"
33
-
34
- - id: view-logs
35
- intent: "Show the last 100 lines of logs from the container 'api' and follow new output"
36
- difficulty: easy
37
- category: query
38
- setup:
39
- - "docker run -d --name api alpine sh -c 'for i in $(seq 1 200); do echo line-$i; done; sleep 3600'"
40
- max_turns: 3
41
- assert:
42
- - ran: "docker logs"
43
- - ran: "--tail"
44
-
45
- - id: stop-container
46
- intent: "Stop the container named 'web-server'"
47
- difficulty: easy
48
- category: crud
49
- setup:
50
- - "docker run -d --name web-server alpine sleep 3600"
51
- max_turns: 3
52
- assert:
53
- - ran: "docker stop.*web-server"
54
- - verify:
55
- run: "docker ps --filter name=web-server --format '{{.Names}}'"
56
- output_equals: ""
57
-
58
- # -- Medium ------------------------------------------------------------------
59
- - id: run-detached
60
- intent: "Run an nginx container in the background, mapping host port 8080 to container port 80, named 'web'"
61
- difficulty: medium
62
- category: crud
63
- max_turns: 5
64
- assert:
65
- - ran: "docker run"
66
- - ran: "-d"
67
- - ran: "--name.*web"
68
- - verify:
69
- run: "docker ps --filter name=web --format '{{.Names}}'"
70
- output_contains: "web"
71
-
72
- - id: build-with-tag
73
- intent: "Build a Docker image from ./Dockerfile and tag it as myapp:v2"
74
- difficulty: medium
75
- category: crud
76
- setup:
77
- - "mkdir -p /tmp/bench-workspace"
78
- - "printf 'FROM alpine:latest\nRUN echo hello' > /tmp/bench-workspace/Dockerfile"
79
- max_turns: 5
80
- assert:
81
- - ran: "docker build"
82
- - ran: "myapp:v2"
83
- - verify:
84
- run: "docker images myapp:v2 --format '{{.Repository}}:{{.Tag}}'"
85
- output_contains: "myapp:v2"
86
-
87
- - id: exec-into-container
88
- intent: "Run the command 'cat /etc/os-release' inside the running container 'web'"
89
- difficulty: medium
90
- category: crud
91
- setup:
92
- - "docker run -d --name web alpine sleep 3600"
93
- max_turns: 5
94
- assert:
95
- - ran: "docker exec.*web.*cat /etc/os-release"
96
- - exit_code: 0
97
-
98
- - id: inspect-json
99
- intent: "Get the IP address of the container 'web' using docker inspect with a format template"
100
- difficulty: medium
101
- category: output
102
- setup:
103
- - "docker run -d --name web alpine sleep 3600"
104
- max_turns: 5
105
- assert:
106
- - ran: "docker inspect"
107
- - ran: "--format"
108
-
109
- - id: prune-all
110
- intent: "Remove all stopped containers, unused networks, dangling images, and build cache without prompting for confirmation"
111
- difficulty: medium
112
- category: crud
113
- max_turns: 5
114
- assert:
115
- - ran: "docker system prune"
116
- - ran: "--force|-f"
117
-
118
- # -- Hard --------------------------------------------------------------------
119
- - id: run-complex
120
- intent: "Run a postgres:16 container named 'db' in the background with environment variables POSTGRES_USER=admin and POSTGRES_PASSWORD=secret, mount a volume 'pgdata' to /var/lib/postgresql/data, and connect it to the network 'backend'"
121
- difficulty: hard
122
- category: crud
123
- setup:
124
- - "docker network create backend || true"
125
- max_turns: 7
126
- assert:
127
- - ran: "docker run"
128
- - ran: "--name.*db"
129
- - ran: "POSTGRES_USER=admin"
130
- - ran: "POSTGRES_PASSWORD=secret"
131
- - verify:
132
- run: "docker ps --filter name=db --format '{{.Names}}'"
133
- output_contains: "db"
134
-
135
- - id: compose-up
136
- intent: "Start all services defined in docker-compose.yml in detached mode and rebuild any changed images"
137
- difficulty: hard
138
- category: workflow
139
- setup:
140
- - "mkdir -p /tmp/bench-workspace"
141
- - "printf 'services:\n web:\n image: alpine\n command: sleep 3600\n' > /tmp/bench-workspace/docker-compose.yml"
142
- max_turns: 7
143
- assert:
144
- - ran: "docker compose.*up"
145
- - ran: "-d|--detach"
146
- - ran: "--build"
147
-
148
- - id: multi-stage-debug
149
- intent: "Build only the 'builder' stage from /tmp/bench-workspace/Dockerfile, tag it as 'myapp:debug', and don't use cache"
150
- difficulty: hard
151
- category: crud
152
- setup:
153
- - "mkdir -p /tmp/bench-workspace"
154
- - "printf 'FROM alpine:latest AS builder\nRUN echo building\nFROM alpine:latest\nCOPY --from=builder / /\n' > /tmp/bench-workspace/Dockerfile"
155
- max_turns: 7
156
- assert:
157
- - ran: "docker build"
158
- - ran: "--target.*builder"
159
- - ran: "myapp:debug"
160
- - ran: "--no-cache"
161
- - verify:
162
- run: "docker images myapp:debug --format '{{.Repository}}:{{.Tag}}'"
163
- output_contains: "myapp:debug"