@cliwatch/cli-bench 0.6.3 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/CHANGELOG.md +14 -0
  2. package/README.md +3 -0
  3. package/dist/assertions.d.ts +1 -1
  4. package/dist/assertions.d.ts.map +1 -1
  5. package/dist/assertions.js +6 -6
  6. package/dist/assertions.js.map +1 -1
  7. package/dist/ci.d.ts.map +1 -1
  8. package/dist/ci.js +14 -0
  9. package/dist/ci.js.map +1 -1
  10. package/dist/client/index.d.ts +1 -1
  11. package/dist/client/index.d.ts.map +1 -1
  12. package/dist/client/types.gen.d.ts +143 -93
  13. package/dist/client/types.gen.d.ts.map +1 -1
  14. package/dist/client/zod.gen.d.ts +75 -42
  15. package/dist/client/zod.gen.d.ts.map +1 -1
  16. package/dist/client/zod.gen.js +86 -54
  17. package/dist/client/zod.gen.js.map +1 -1
  18. package/dist/config.d.ts +2 -3
  19. package/dist/config.d.ts.map +1 -1
  20. package/dist/config.js +8 -15
  21. package/dist/config.js.map +1 -1
  22. package/dist/exec.d.ts +2 -0
  23. package/dist/exec.d.ts.map +1 -1
  24. package/dist/exec.js +6 -2
  25. package/dist/exec.js.map +1 -1
  26. package/dist/github-comment.d.ts +16 -0
  27. package/dist/github-comment.d.ts.map +1 -0
  28. package/dist/github-comment.js +90 -0
  29. package/dist/github-comment.js.map +1 -0
  30. package/dist/index.d.ts +2 -3
  31. package/dist/index.d.ts.map +1 -1
  32. package/dist/index.js +31 -36
  33. package/dist/index.js.map +1 -1
  34. package/dist/init.js +1 -1
  35. package/dist/models.d.ts +9 -9
  36. package/dist/models.d.ts.map +1 -1
  37. package/dist/models.js +1 -1
  38. package/dist/models.js.map +1 -1
  39. package/dist/project.d.ts +11 -2
  40. package/dist/project.d.ts.map +1 -1
  41. package/dist/project.js +108 -9
  42. package/dist/project.js.map +1 -1
  43. package/dist/prompt.d.ts +2 -8
  44. package/dist/prompt.d.ts.map +1 -1
  45. package/dist/prompt.js +2 -35
  46. package/dist/prompt.js.map +1 -1
  47. package/dist/providers.d.ts +9 -7
  48. package/dist/providers.d.ts.map +1 -1
  49. package/dist/providers.js +26 -8
  50. package/dist/providers.js.map +1 -1
  51. package/dist/runner.d.ts +32 -4
  52. package/dist/runner.d.ts.map +1 -1
  53. package/dist/runner.js +177 -177
  54. package/dist/runner.js.map +1 -1
  55. package/dist/schemas.d.ts +20 -1
  56. package/dist/schemas.d.ts.map +1 -1
  57. package/dist/schemas.js +8 -1
  58. package/dist/schemas.js.map +1 -1
  59. package/dist/suite-generator.d.ts.map +1 -1
  60. package/dist/suite-generator.js +93 -10
  61. package/dist/suite-generator.js.map +1 -1
  62. package/package.json +2 -2
  63. package/dist/help-loader.d.ts +0 -17
  64. package/dist/help-loader.d.ts.map +0 -1
  65. package/dist/help-loader.js +0 -65
  66. package/dist/help-loader.js.map +0 -1
  67. package/task_suites/curl.yaml +0 -138
  68. package/task_suites/docker.yaml +0 -163
  69. package/task_suites/gh.yaml +0 -118
  70. package/task_suites/jq.yaml +0 -172
  71. package/task_suites/kubectl.yaml +0 -74
package/dist/schemas.js CHANGED
@@ -19,10 +19,14 @@ export const TaskSchema = z.object({
19
19
  intent: z.string(),
20
20
  assert: z.array(AssertionSchema).min(1),
21
21
  setup: z.array(z.string()).optional().default([]),
22
+ cleanup: z.array(z.string()).optional(),
23
+ env: z.record(z.string(), z.string()).optional(),
22
24
  max_turns: z.number().int().min(1).max(20).optional().default(5),
23
25
  difficulty: z.enum(['easy', 'medium', 'hard']).optional(),
24
26
  category: z.string().optional(),
25
27
  repeat: z.number().int().min(1).max(100).optional(),
28
+ tags: z.array(z.string()).optional(),
29
+ scaffold: z.union([z.string(), z.literal(false)]).optional(),
26
30
  });
27
31
  export const TaskSuiteSchema = z.object({
28
32
  cli: z.string(),
@@ -50,7 +54,6 @@ export const ConfigFileSchema = z.object({
50
54
  website_url: z.string().optional(),
51
55
  github_url: z.string().optional(),
52
56
  providers: z.array(z.string()).optional(),
53
- context: z.array(z.string()).optional(),
54
57
  system_prompt: z.string().optional(),
55
58
  concurrency: z.number().int().min(1).optional(),
56
59
  workdir: z.string().optional(),
@@ -60,6 +63,10 @@ export const ConfigFileSchema = z.object({
60
63
  redact_env: z.array(z.string()).optional(),
61
64
  redact_patterns: z.array(z.string()).optional(),
62
65
  thresholds: ThresholdsSchema,
66
+ env: z.record(z.string(), z.string()).optional(),
67
+ setup: z.array(z.string()).optional(),
68
+ scaffold: z.string().optional(),
69
+ cleanup: z.array(z.string()).optional(),
63
70
  tasks: z.array(z.union([
64
71
  TaskSchema,
65
72
  z.string().refine((s) => s.startsWith('file://'), {
@@ -1 +1 @@
1
- {"version":3,"file":"schemas.js","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACzC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACvC,CAAC,CAAC,MAAM,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACxC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACnC,CAAC,CAAC,MAAM,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;IAC7E,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IAC7B,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACjC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;IAClH,CAAC,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC;IACjD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAChE,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,QAAQ,EAAE;IACzD,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;CACp
D,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CAClC,CAAC,CAAC;AAEH,4EAA4E;AAC5E,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;AAEzD,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IAC9C,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,EAAE;IACnE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAC3D,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,OAAO,CAAC;CACzE,CAAC,CAAC,QAAQ,EAAE,CAAC;AAEd,2DAA2D;AAC3D,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACzC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACpC,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;IAC/C,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC9B,MAAM,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE;IACtD,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EA
AE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IACnD,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC1C,eAAe,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC/C,UAAU,EAAE,gBAAgB;IAC5B,KAAK,EAAE,CAAC,CAAC,KAAK,CACZ,CAAC,CAAC,KAAK,CAAC;QACN,UAAU;QACV,CAAC,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE;YAChD,OAAO,EAAE,yCAAyC;SACnD,CAAC;KACH,CAAC,CACH,CAAC,GAAG,CAAC,CAAC,CAAC;CACT,CAAC,CAAC"}
1
+ {"version":3,"file":"schemas.js","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACzC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACvC,CAAC,CAAC,MAAM,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACxC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACnC,CAAC,CAAC,MAAM,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACrC,CAAC,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;IAC7E,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IAC7B,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACjC,CAAC,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;IAClH,CAAC,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC;IACjD,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,GAAG,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAChD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAChE,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,QAAQ
,EAAE;IACzD,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IACnD,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACpC,QAAQ,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;CAC7D,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CAClC,CAAC,CAAC;AAEH,4EAA4E;AAC5E,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;AAEzD,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IAC9C,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,EAAE;IACnE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAC3D,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,OAAO,CAAC;CACzE,CAAC,CAAC,QAAQ,EAAE,CAAC;AAEd,2DAA2D;AAC3D,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACtC,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACzC,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACpC,WAAW,EAAE,CAAC,CAAC,MAAM,EAA
E,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;IAC/C,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC9B,MAAM,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE;IACtD,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;IACnD,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC1C,eAAe,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAC/C,UAAU,EAAE,gBAAgB;IAC5B,GAAG,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IAChD,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACrC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,KAAK,EAAE,CAAC,CAAC,KAAK,CACZ,CAAC,CAAC,KAAK,CAAC;QACN,UAAU;QACV,CAAC,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE;YAChD,OAAO,EAAE,yCAAyC;SACnD,CAAC;KACH,CAAC,CACH,CAAC,GAAG,CAAC,CAAC,CAAC;CACT,CAAC,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"suite-generator.d.ts","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAwCH,wBAAsB,aAAa,CACjC,OAAO,EAAE,MAAM,EACf,YAAY,EAAE,MAAM,GACnB,OAAO,CAAC,MAAM,CAAC,CAiEjB"}
1
+ {"version":3,"file":"suite-generator.d.ts","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAuIH,wBAAsB,aAAa,CACjC,OAAO,EAAE,MAAM,EACf,YAAY,EAAE,MAAM,GACnB,OAAO,CAAC,MAAM,CAAC,CAwDjB"}
@@ -5,10 +5,10 @@
5
5
  * assert-based validation.
6
6
  */
7
7
  import { readFile } from 'node:fs/promises';
8
+ import { execFile } from 'node:child_process';
8
9
  import { join } from 'node:path';
9
10
  import { generateText } from 'ai';
10
11
  import { gateway } from 'ai';
11
- import { loadHelpFromCache, loadTopLevelHelp } from './help-loader.js';
12
12
  const GENERATOR_MODEL = 'anthropic/claude-sonnet-4-20250514';
13
13
  const SYSTEM_PROMPT = `You are a CLI test suite generator. Given help text for a CLI tool and example task suites, generate a YAML task suite.
14
14
 
@@ -39,6 +39,95 @@ Rules:
39
39
  - Focus on: local operations, file generation, formatting, config, help queries
40
40
  - Include setup commands to prepare the environment
41
41
  - Use realistic but safe values (no real credentials, no destructive operations)`;
42
+ const FEW_SHOT_EXAMPLE = `cli: docker
43
+ version_command: "docker --version"
44
+
45
+ providers:
46
+ - openai/gpt-5-nano
47
+ - google/gemini-2.5-flash-lite
48
+
49
+ concurrency: 3
50
+
51
+ system_prompt: |
52
+ You are working in a temporary empty directory.
53
+ Docker is installed and the daemon is running.
54
+ Complete each task using docker commands.
55
+ Use unique names/tags to avoid conflicts (prefix with 'bench-').
56
+
57
+ tasks:
58
+ - id: build-image
59
+ intent: "Create a Dockerfile for a simple Alpine-based image that prints 'hello from docker' when run. Build it with the tag 'bench-hello'."
60
+ difficulty: easy
61
+ category: build
62
+ max_turns: 5
63
+ assert:
64
+ - file_exists: "Dockerfile"
65
+ - ran: "docker build"
66
+ - verify:
67
+ run: "docker images bench-hello --format '{{.Repository}}'"
68
+ output_contains: "bench-hello"
69
+
70
+ - id: run-and-capture
71
+ intent: "Run the 'alpine' image with the command 'echo benchmark-test-output' and capture the output."
72
+ difficulty: easy
73
+ category: run
74
+ max_turns: 3
75
+ assert:
76
+ - ran: "docker run"
77
+ - output_contains: "benchmark-test-output"
78
+
79
+ - id: inspect-container
80
+ intent: "Run an alpine container named 'bench-inspect' in detached mode (sleep 300), then use docker inspect to show its IP address."
81
+ difficulty: medium
82
+ category: query
83
+ max_turns: 5
84
+ assert:
85
+ - ran: "docker run"
86
+ - ran: "docker inspect"
87
+ - exit_code: 0
88
+
89
+ - id: volume-mount
90
+ intent: "Create a file called 'data.txt' with content 'volume test'. Run an alpine container that mounts the current directory to /data and reads the file with 'cat /data/data.txt'."
91
+ difficulty: medium
92
+ category: volumes
93
+ max_turns: 5
94
+ setup:
95
+ - "echo 'volume test' > data.txt"
96
+ assert:
97
+ - ran: "docker run"
98
+ - output_contains: "volume test"`;
99
+ async function loadHelpFromCache(cacheDir, cliName) {
100
+ try {
101
+ const filePath = join(cacheDir, `${cliName}.json`);
102
+ const raw = await readFile(filePath, 'utf-8');
103
+ const data = JSON.parse(raw);
104
+ if (Array.isArray(data)) {
105
+ const entry = data.find((d) => d.cli_name === cliName);
106
+ if (entry?.help_texts) {
107
+ return { cli_name: entry.cli_name, help_texts: entry.help_texts };
108
+ }
109
+ return null;
110
+ }
111
+ if (data.help_texts)
112
+ return data;
113
+ return null;
114
+ }
115
+ catch {
116
+ return null;
117
+ }
118
+ }
119
+ function loadTopLevelHelp(cliName) {
120
+ return new Promise((resolve) => {
121
+ execFile(cliName, ['--help'], { timeout: 30_000 }, (err, stdout, stderr) => {
122
+ if (err) {
123
+ resolve(null);
124
+ return;
125
+ }
126
+ const output = (stdout || stderr).trim();
127
+ resolve(output || null);
128
+ });
129
+ });
130
+ }
42
131
  export async function generateSuite(cliName, helpCacheDir) {
43
132
  // Load help text
44
133
  let helpCache = await loadHelpFromCache(helpCacheDir, cliName);
@@ -51,13 +140,7 @@ export async function generateSuite(cliName, helpCacheDir) {
51
140
  throw new Error(`No help text available for ${cliName}`);
52
141
  }
53
142
  }
54
- // Load example suite for few-shot
55
- const suiteDir = join(new URL('.', import.meta.url).pathname.replace(/\/src\/$/, '').replace(/\/dist\/$/, ''), 'task_suites');
56
- let dockerExample = '';
57
- try {
58
- dockerExample = await readFile(join(suiteDir, 'docker.yaml'), 'utf-8');
59
- }
60
- catch { /* ignore */ }
143
+ const fewShotExample = FEW_SHOT_EXAMPLE;
61
144
  // Build help text summary (truncate to fit context)
62
145
  const helpEntries = Object.entries(helpCache.help_texts);
63
146
  let helpSummary = '';
@@ -76,9 +159,9 @@ ${helpSummary}
76
159
 
77
160
  ## Example Task Suite
78
161
 
79
- ### docker.yaml
162
+ ### example cli-bench.yaml
80
163
  \`\`\`yaml
81
- ${dockerExample}
164
+ ${fewShotExample}
82
165
  \`\`\`
83
166
 
84
167
  ## Output
@@ -1 +1 @@
1
- {"version":3,"file":"suite-generator.js","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAC7B,OAAO,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AAEvE,MAAM,eAAe,GAAG,oCAAoC,CAAC;AAE7D,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;iFA4B2D,CAAC;AAElF,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAe,EACf,YAAoB;IAEpB,iBAAiB;IACjB,IAAI,SAAS,GAAG,MAAM,iBAAiB,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAC/D,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,MAAM,gBAAgB,CAAC,OAAO,CAAC,CAAC;QACrD,IAAI,YAAY,EAAE,CAAC;YACjB,SAAS,GAAG,EAAE,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,EAAE,EAAE,EAAE,YAAY,EAAE,EAAE,CAAC;QACtE,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,KAAK,CAAC,8BAA8B,OAAO,EAAE,CAAC,CAAC;QAC3D,CAAC;IACH,CAAC;IAED,kCAAkC;IAClC,MAAM,QAAQ,GAAG,IAAI,CACnB,IAAI,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,EACvF,aAAa,CACd,CAAC;IAEF,IAAI,aAAa,GAAG,EAAE,CAAC;IACvB,IAAI,CAAC;QACH,aAAa,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,EAAE,OAAO,CAAC,CAAC;IACzE,CAAC;IAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;IAExB,oDAAoD;IACpD,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC;IACzD,IAAI,WAAW,GAAG,EAAE,CAAC;IACrB,KAAK,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,WAAW,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,GAAG,IAAI,QAAQ,CAAC;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;QACtC,WAAW,IAAI,SAAS,OAAO,IAAI,KAAK,gBAAgB,SAAS,IAAI,CAAC;QACtE,IAAI,WAAW,CAAC,MAAM,GAAG,KAAK;YAAE,MAAM;IACxC,CAAC;IAED,MAAM,MAAM,GAAG,uCAAuC,OAAO;;;;EAI7D,WAAW;;;;;;EAMX,aAAa;;;;;yEAK0D,OAAO,KAAK,CAAC;IAEpF,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC;QAChC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC;QAC/B,MAAM,EAAE,aAAa;QACrB,MAAM;QACN,WAAW,EAAE,GAAG;QAChB,eAAe,EAAE,IAAI;KACtB,CAAC,CAAC;IAEH,2DAA2D;IAC3D,IAAI,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;IAC9B,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAA
C;QAC3B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
1
+ {"version":3,"file":"suite-generator.js","sourceRoot":"","sources":["../src/suite-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAE7B,MAAM,eAAe,GAAG,oCAAoC,CAAC;AAE7D,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;iFA4B2D,CAAC;AAElF,MAAM,gBAAgB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;uCAwDc,CAAC;AAOxC,KAAK,UAAU,iBAAiB,CAC9B,QAAgB,EAChB,OAAe;IAEf,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,GAAG,OAAO,OAAO,CAAC,CAAC;QACnD,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC9C,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC7B,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;YACxB,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAwB,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC;YAC9E,IAAI,KAAK,EAAE,UAAU,EAAE,CAAC;gBACtB,OAAO,EAAE,QAAQ,EAAE,KAAK,CAAC,QAAQ,EAAE,UAAU,EAAE,KAAK,CAAC,UAAU,EAAE,CAAC;YACpE,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;QACD,IAAI,IAAI,CAAC,UAAU;YAAE,OAAO,IAAiB,CAAC;QAC9C,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAS,gBAAgB,CAAC,OAAe;IACvC,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC7B,QAAQ,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,EAAE;YACzE,IAAI,GAAG,EAAE,CAAC;gBAAC,OAAO,CAAC,IAAI,CAAC,CAAC;gBAAC,OAAO;YAAC,CAAC;YACnC,MAAM,MAAM,GAAG,CAAC,MAAM,IAAI,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;YACzC,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAe,EACf,YAAoB;IAEpB,iBAAiB;IACjB,IAAI,SAAS,GAAG,MAAM,iBAAiB,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAC/D,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,MAAM,gBAAgB,CAAC,OAAO,CAAC,CAAC;QACrD,IAAI,YAAY,EAAE,CAAC;YACjB,SAAS,GAAG,EAAE,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,EAAE,EAAE,EAAE,YAAY,EAAE,EAAE,CAAC;QACtE,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,K
AAK,CAAC,8BAA8B,OAAO,EAAE,CAAC,CAAC;QAC3D,CAAC;IACH,CAAC;IAED,MAAM,cAAc,GAAG,gBAAgB,CAAC;IAExC,oDAAoD;IACpD,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC;IACzD,IAAI,WAAW,GAAG,EAAE,CAAC;IACrB,KAAK,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,WAAW,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,GAAG,IAAI,QAAQ,CAAC;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;QACtC,WAAW,IAAI,SAAS,OAAO,IAAI,KAAK,gBAAgB,SAAS,IAAI,CAAC;QACtE,IAAI,WAAW,CAAC,MAAM,GAAG,KAAK;YAAE,MAAM;IACxC,CAAC;IAED,MAAM,MAAM,GAAG,uCAAuC,OAAO;;;;EAI7D,WAAW;;;;;;EAMX,cAAc;;;;;yEAKyD,OAAO,KAAK,CAAC;IAEpF,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC;QAChC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC;QAC/B,MAAM,EAAE,aAAa;QACrB,MAAM;QACN,WAAW,EAAE,GAAG;QAChB,eAAe,EAAE,IAAI;KACtB,CAAC,CAAC;IAEH,2DAA2D;IAC3D,IAAI,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;IAC9B,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cliwatch/cli-bench",
3
- "version": "0.6.3",
3
+ "version": "0.7.1",
4
4
  "description": "LLM CLI agent testing framework — benchmark how well AI models use your CLI tool",
5
5
  "keywords": ["cli", "benchmark", "llm", "testing", "ai-agent", "cliwatch", "evaluation"],
6
6
  "license": "MIT",
@@ -14,7 +14,6 @@
14
14
  },
15
15
  "files": [
16
16
  "dist",
17
- "task_suites",
18
17
  "LICENSE",
19
18
  "CHANGELOG.md"
20
19
  ],
@@ -35,6 +34,7 @@
35
34
  "test": "vitest"
36
35
  },
37
36
  "dependencies": {
37
+ "@ai-sdk/google": "^3.0.0",
38
38
  "@hey-api/client-fetch": "^0.13.1",
39
39
  "ai": "^6.0.18",
40
40
  "yaml": "^2.7.0",
@@ -1,17 +0,0 @@
1
- /**
2
- * Loads CLI help text from cached JSON files or live CLI execution.
3
- *
4
- * Cached mode reads from help_cache/<cli>.json, produced by
5
- * `audit-worker --dry-run --output-help --output <file>`.
6
- *
7
- * Live mode shells out to the CLI's --help.
8
- */
9
- import type { HelpCache } from './models.js';
10
- export declare function loadHelpFromCache(cacheDir: string, cliName: string): Promise<HelpCache | null>;
11
- export declare function listAvailableCaches(cacheDir: string): Promise<string[]>;
12
- /**
13
- * Run `cli --help` and return the output as a string.
14
- * Single top-level invocation — no subcommand crawling.
15
- */
16
- export declare function loadTopLevelHelp(cliName: string): Promise<string | null>;
17
- //# sourceMappingURL=help-loader.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"help-loader.d.ts","sourceRoot":"","sources":["../src/help-loader.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAKH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAE7C,wBAAsB,iBAAiB,CACrC,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,SAAS,GAAG,IAAI,CAAC,CA6B3B;AAED,wBAAsB,mBAAmB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CAS7E;AAED;;;GAGG;AACH,wBAAsB,gBAAgB,CACpC,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAWxB"}
@@ -1,65 +0,0 @@
1
- /**
2
- * Loads CLI help text from cached JSON files or live CLI execution.
3
- *
4
- * Cached mode reads from help_cache/<cli>.json, produced by
5
- * `audit-worker --dry-run --output-help --output <file>`.
6
- *
7
- * Live mode shells out to the CLI's --help.
8
- */
9
- import { readFile, readdir } from 'node:fs/promises';
10
- import { join } from 'node:path';
11
- import { execFile } from 'node:child_process';
12
- export async function loadHelpFromCache(cacheDir, cliName) {
13
- try {
14
- const filePath = join(cacheDir, `${cliName}.json`);
15
- const raw = await readFile(filePath, 'utf-8');
16
- const data = JSON.parse(raw);
17
- // Support both formats: direct HelpCache or audit-worker output array
18
- if (Array.isArray(data)) {
19
- const entry = data.find((d) => d.cli_name === cliName);
20
- if (entry?.help_texts) {
21
- return {
22
- cli_name: entry.cli_name,
23
- help_texts: entry.help_texts,
24
- version: entry.version,
25
- };
26
- }
27
- return null;
28
- }
29
- if (data.help_texts) {
30
- return data;
31
- }
32
- return null;
33
- }
34
- catch {
35
- return null;
36
- }
37
- }
38
- export async function listAvailableCaches(cacheDir) {
39
- try {
40
- const files = await readdir(cacheDir);
41
- return files
42
- .filter((f) => f.endsWith('.json'))
43
- .map((f) => f.replace(/\.json$/, ''));
44
- }
45
- catch {
46
- return [];
47
- }
48
- }
49
- /**
50
- * Run `cli --help` and return the output as a string.
51
- * Single top-level invocation — no subcommand crawling.
52
- */
53
- export async function loadTopLevelHelp(cliName) {
54
- return new Promise((resolve) => {
55
- execFile(cliName, ['--help'], { timeout: 30_000 }, (err, stdout, stderr) => {
56
- if (err) {
57
- resolve(null);
58
- return;
59
- }
60
- const output = (stdout || stderr).trim();
61
- resolve(output || null);
62
- });
63
- });
64
- }
65
- //# sourceMappingURL=help-loader.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"help-loader.js","sourceRoot":"","sources":["../src/help-loader.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,kBAAkB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAG9C,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,QAAgB,EAChB,OAAe;IAEf,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,GAAG,OAAO,OAAO,CAAC,CAAC;QACnD,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC9C,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAE7B,sEAAsE;QACtE,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;YACxB,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CACrB,CAAC,CAAwB,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,OAAO,CACrD,CAAC;YACF,IAAI,KAAK,EAAE,UAAU,EAAE,CAAC;gBACtB,OAAO;oBACL,QAAQ,EAAE,KAAK,CAAC,QAAQ;oBACxB,UAAU,EAAE,KAAK,CAAC,UAAU;oBAC5B,OAAO,EAAE,KAAK,CAAC,OAAO;iBACvB,CAAC;YACJ,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,OAAO,IAAiB,CAAC;QAC3B,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,QAAgB;IACxD,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,QAAQ,CAAC,CAAC;QACtC,OAAO,KAAK;aACT,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;aAClC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC,CAAC;IAC1C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,OAAe;IAEf,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC7B,QAAQ,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,EAAE;YACzE,IAAI,GAAG,EAAE,CAAC;gBACR,OAAO,CAAC,IAAI,CAAC,CAAC;gBACd,OAAO;YACT,CAAC;YACD,MAAM,MAAM,GAAG,CAAC,MAAM,IAAI,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;YACzC,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -1,138 +0,0 @@
1
- cli: curl
2
- version_command: "curl --version | head -1"
3
-
4
- tasks:
5
- # -- Easy --------------------------------------------------------------------
6
- - id: simple-get
7
- intent: "Fetch the contents of https://httpbin.org/get"
8
- difficulty: easy
9
- category: query
10
- max_turns: 2
11
- assert:
12
- - ran: "curl.*httpbin.org/get"
13
- - exit_code: 0
14
- - output_contains: "origin"
15
-
16
- - id: head-request
17
- intent: "Send a HEAD request to https://httpbin.org/get and show the response headers"
18
- difficulty: easy
19
- category: query
20
- max_turns: 3
21
- assert:
22
- - ran: "curl"
23
- - ran: "-I|--head"
24
- - output_contains: "HTTP"
25
-
26
- - id: download-file
27
- intent: "Download https://httpbin.org/robots.txt and save it as robots.txt"
28
- difficulty: easy
29
- category: crud
30
- max_turns: 3
31
- assert:
32
- - ran: "curl"
33
- - ran: "-o|--output"
34
- - file_exists: "robots.txt"
35
-
36
- - id: follow-redirects
37
- intent: "Fetch https://httpbin.org/redirect/2 and follow all redirects"
38
- difficulty: easy
39
- category: query
40
- max_turns: 3
41
- assert:
42
- - ran: "curl"
43
- - ran: "-L|--location"
44
- - exit_code: 0
45
-
46
- # -- Medium ------------------------------------------------------------------
47
- - id: post-json
48
- intent: "Send a POST request to https://httpbin.org/post with JSON body {\"name\": \"bench\", \"version\": 1} and set the Content-Type header to application/json"
49
- difficulty: medium
50
- category: crud
51
- max_turns: 5
52
- assert:
53
- - ran: "curl"
54
- - ran: "-X POST|--request POST|-d|--data"
55
- - ran: "Content-Type.*application/json"
56
- - output_contains: "bench"
57
-
58
- - id: custom-headers
59
- intent: "Send a GET request to https://httpbin.org/headers with custom headers X-Request-ID: abc123 and Accept: application/xml"
60
- difficulty: medium
61
- category: query
62
- max_turns: 5
63
- assert:
64
- - ran: "curl"
65
- - ran: "-H|--header"
66
- - ran: "X-Request-ID"
67
- - output_contains: "abc123"
68
-
69
- - id: basic-auth
70
- intent: "Send a GET request to https://httpbin.org/basic-auth/user/passwd using basic authentication with username 'user' and password 'passwd'"
71
- difficulty: medium
72
- category: auth
73
- max_turns: 5
74
- assert:
75
- - ran: "curl"
76
- - ran: "-u|--user|user:passwd"
77
- - output_contains: "authenticated"
78
-
79
- - id: verbose-timing
80
- intent: "Fetch https://httpbin.org/get and show the total time taken for the request using curl's write-out feature"
81
- difficulty: medium
82
- category: output
83
- max_turns: 5
84
- assert:
85
- - ran: "curl"
86
- - ran: "-w|--write-out|time_total"
87
-
88
- # -- Hard --------------------------------------------------------------------
89
- - id: put-with-file
90
- intent: "Upload the file /tmp/bench-workspace/data.json to https://httpbin.org/put using a PUT request with Content-Type application/json"
91
- difficulty: hard
92
- category: crud
93
- setup:
94
- - "mkdir -p /tmp/bench-workspace"
95
- - "echo '{\"key\": \"value\"}' > /tmp/bench-workspace/data.json"
96
- max_turns: 7
97
- assert:
98
- - ran: "curl"
99
- - ran: "-X PUT|--request PUT|-T"
100
- - ran: "data.json"
101
- - output_contains: "key"
102
-
103
- - id: retry-with-timeout
104
- intent: "Fetch https://httpbin.org/delay/1 with a 5 second timeout, retry 3 times on failure, and save the response to response.json"
105
- difficulty: hard
106
- category: crud
107
- max_turns: 7
108
- assert:
109
- - ran: "curl"
110
- - ran: "--retry.*3"
111
- - ran: "--max-time|--connect-timeout|-m"
112
- - ran: "-o|--output"
113
-
114
- - id: multipart-upload
115
- intent: "Send a multipart/form-data POST to https://httpbin.org/post with a field 'username' set to 'admin' and a file field 'config' uploading /tmp/bench-workspace/app.conf"
116
- difficulty: hard
117
- category: crud
118
- setup:
119
- - "mkdir -p /tmp/bench-workspace"
120
- - "echo 'port=8080' > /tmp/bench-workspace/app.conf"
121
- max_turns: 7
122
- assert:
123
- - ran: "curl"
124
- - ran: "-F|--form"
125
- - ran: "username.*admin"
126
- - ran: "config.*@.*app.conf"
127
- - output_contains: "admin"
128
-
129
- - id: conditional-request
130
- intent: "Fetch https://httpbin.org/cache and use conditional headers: set If-None-Match to '12345' and If-Modified-Since to 'Wed, 01 Jan 2025 00:00:00 GMT'. Show the response status code."
131
- difficulty: hard
132
- category: query
133
- max_turns: 7
134
- assert:
135
- - ran: "curl"
136
- - ran: "If-None-Match"
137
- - ran: "If-Modified-Since"
138
- - ran: "-w|--write-out|-I|--head|-v|--verbose"
@@ -1,163 +0,0 @@
1
- cli: docker
2
-
3
- tasks:
4
- # -- Easy --------------------------------------------------------------------
5
- - id: list-containers
6
- intent: "List all running containers"
7
- difficulty: easy
8
- category: query
9
- max_turns: 3
10
- assert:
11
- - ran: "docker ps"
12
- - exit_code: 0
13
-
14
- - id: list-images
15
- intent: "List all local Docker images"
16
- difficulty: easy
17
- category: query
18
- max_turns: 3
19
- assert:
20
- - ran: "docker image"
21
- - exit_code: 0
22
-
23
- - id: pull-image
24
- intent: "Pull the latest nginx image from Docker Hub"
25
- difficulty: easy
26
- category: crud
27
- max_turns: 3
28
- assert:
29
- - ran: "docker pull.*nginx"
30
- - verify:
31
- run: "docker images nginx --format '{{.Repository}}'"
32
- output_contains: "nginx"
33
-
34
- - id: view-logs
35
- intent: "Show the last 100 lines of logs from the container 'api' and follow new output"
36
- difficulty: easy
37
- category: query
38
- setup:
39
- - "docker run -d --name api alpine sh -c 'for i in $(seq 1 200); do echo line-$i; done; sleep 3600'"
40
- max_turns: 3
41
- assert:
42
- - ran: "docker logs"
43
- - ran: "--tail"
44
-
45
- - id: stop-container
46
- intent: "Stop the container named 'web-server'"
47
- difficulty: easy
48
- category: crud
49
- setup:
50
- - "docker run -d --name web-server alpine sleep 3600"
51
- max_turns: 3
52
- assert:
53
- - ran: "docker stop.*web-server"
54
- - verify:
55
- run: "docker ps --filter name=web-server --format '{{.Names}}'"
56
- output_equals: ""
57
-
58
- # -- Medium ------------------------------------------------------------------
59
- - id: run-detached
60
- intent: "Run an nginx container in the background, mapping host port 8080 to container port 80, named 'web'"
61
- difficulty: medium
62
- category: crud
63
- max_turns: 5
64
- assert:
65
- - ran: "docker run"
66
- - ran: "-d"
67
- - ran: "--name.*web"
68
- - verify:
69
- run: "docker ps --filter name=web --format '{{.Names}}'"
70
- output_contains: "web"
71
-
72
- - id: build-with-tag
73
- intent: "Build a Docker image from ./Dockerfile and tag it as myapp:v2"
74
- difficulty: medium
75
- category: crud
76
- setup:
77
- - "mkdir -p /tmp/bench-workspace"
78
- - "printf 'FROM alpine:latest\nRUN echo hello' > /tmp/bench-workspace/Dockerfile"
79
- max_turns: 5
80
- assert:
81
- - ran: "docker build"
82
- - ran: "myapp:v2"
83
- - verify:
84
- run: "docker images myapp:v2 --format '{{.Repository}}:{{.Tag}}'"
85
- output_contains: "myapp:v2"
86
-
87
- - id: exec-into-container
88
- intent: "Run the command 'cat /etc/os-release' inside the running container 'web'"
89
- difficulty: medium
90
- category: crud
91
- setup:
92
- - "docker run -d --name web alpine sleep 3600"
93
- max_turns: 5
94
- assert:
95
- - ran: "docker exec.*web.*cat /etc/os-release"
96
- - exit_code: 0
97
-
98
- - id: inspect-json
99
- intent: "Get the IP address of the container 'web' using docker inspect with a format template"
100
- difficulty: medium
101
- category: output
102
- setup:
103
- - "docker run -d --name web alpine sleep 3600"
104
- max_turns: 5
105
- assert:
106
- - ran: "docker inspect"
107
- - ran: "--format"
108
-
109
- - id: prune-all
110
- intent: "Remove all stopped containers, unused networks, dangling images, and build cache without prompting for confirmation"
111
- difficulty: medium
112
- category: crud
113
- max_turns: 5
114
- assert:
115
- - ran: "docker system prune"
116
- - ran: "--force|-f"
117
-
118
- # -- Hard --------------------------------------------------------------------
119
- - id: run-complex
120
- intent: "Run a postgres:16 container named 'db' in the background with environment variables POSTGRES_USER=admin and POSTGRES_PASSWORD=secret, mount a volume 'pgdata' to /var/lib/postgresql/data, and connect it to the network 'backend'"
121
- difficulty: hard
122
- category: crud
123
- setup:
124
- - "docker network create backend || true"
125
- max_turns: 7
126
- assert:
127
- - ran: "docker run"
128
- - ran: "--name.*db"
129
- - ran: "POSTGRES_USER=admin"
130
- - ran: "POSTGRES_PASSWORD=secret"
131
- - verify:
132
- run: "docker ps --filter name=db --format '{{.Names}}'"
133
- output_contains: "db"
134
-
135
- - id: compose-up
136
- intent: "Start all services defined in docker-compose.yml in detached mode and rebuild any changed images"
137
- difficulty: hard
138
- category: workflow
139
- setup:
140
- - "mkdir -p /tmp/bench-workspace"
141
- - "printf 'services:\n web:\n image: alpine\n command: sleep 3600\n' > /tmp/bench-workspace/docker-compose.yml"
142
- max_turns: 7
143
- assert:
144
- - ran: "docker compose.*up"
145
- - ran: "-d|--detach"
146
- - ran: "--build"
147
-
148
- - id: multi-stage-debug
149
- intent: "Build only the 'builder' stage from /tmp/bench-workspace/Dockerfile, tag it as 'myapp:debug', and don't use cache"
150
- difficulty: hard
151
- category: crud
152
- setup:
153
- - "mkdir -p /tmp/bench-workspace"
154
- - "printf 'FROM alpine:latest AS builder\nRUN echo building\nFROM alpine:latest\nCOPY --from=builder / /\n' > /tmp/bench-workspace/Dockerfile"
155
- max_turns: 7
156
- assert:
157
- - ran: "docker build"
158
- - ran: "--target.*builder"
159
- - ran: "myapp:debug"
160
- - ran: "--no-cache"
161
- - verify:
162
- run: "docker images myapp:debug --format '{{.Repository}}:{{.Tag}}'"
163
- output_contains: "myapp:debug"