@cliwatch/cli-bench 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +145 -0
  3. package/dist/assertions.d.ts +14 -0
  4. package/dist/assertions.d.ts.map +1 -0
  5. package/dist/assertions.js +161 -0
  6. package/dist/assertions.js.map +1 -0
  7. package/dist/ci.d.ts +29 -0
  8. package/dist/ci.d.ts.map +1 -0
  9. package/dist/ci.js +75 -0
  10. package/dist/ci.js.map +1 -0
  11. package/dist/client/client/client.gen.d.ts +3 -0
  12. package/dist/client/client/client.gen.d.ts.map +1 -0
  13. package/dist/client/client/client.gen.js +235 -0
  14. package/dist/client/client/client.gen.js.map +1 -0
  15. package/dist/client/client/index.d.ts +9 -0
  16. package/dist/client/client/index.d.ts.map +1 -0
  17. package/dist/client/client/index.js +7 -0
  18. package/dist/client/client/index.js.map +1 -0
  19. package/dist/client/client/types.gen.d.ts +118 -0
  20. package/dist/client/client/types.gen.d.ts.map +1 -0
  21. package/dist/client/client/types.gen.js +3 -0
  22. package/dist/client/client/types.gen.js.map +1 -0
  23. package/dist/client/client/utils.gen.d.ts +34 -0
  24. package/dist/client/client/utils.gen.d.ts.map +1 -0
  25. package/dist/client/client/utils.gen.js +229 -0
  26. package/dist/client/client/utils.gen.js.map +1 -0
  27. package/dist/client/client.gen.d.ts +13 -0
  28. package/dist/client/client.gen.d.ts.map +1 -0
  29. package/dist/client/client.gen.js +4 -0
  30. package/dist/client/client.gen.js.map +1 -0
  31. package/dist/client/core/auth.gen.d.ts +19 -0
  32. package/dist/client/core/auth.gen.d.ts.map +1 -0
  33. package/dist/client/core/auth.gen.js +15 -0
  34. package/dist/client/core/auth.gen.js.map +1 -0
  35. package/dist/client/core/bodySerializer.gen.d.ts +26 -0
  36. package/dist/client/core/bodySerializer.gen.d.ts.map +1 -0
  37. package/dist/client/core/bodySerializer.gen.js +58 -0
  38. package/dist/client/core/bodySerializer.gen.js.map +1 -0
  39. package/dist/client/core/params.gen.d.ts +44 -0
  40. package/dist/client/core/params.gen.d.ts.map +1 -0
  41. package/dist/client/core/params.gen.js +101 -0
  42. package/dist/client/core/params.gen.js.map +1 -0
  43. package/dist/client/core/pathSerializer.gen.d.ts +34 -0
  44. package/dist/client/core/pathSerializer.gen.d.ts.map +1 -0
  45. package/dist/client/core/pathSerializer.gen.js +107 -0
  46. package/dist/client/core/pathSerializer.gen.js.map +1 -0
  47. package/dist/client/core/queryKeySerializer.gen.d.ts +19 -0
  48. package/dist/client/core/queryKeySerializer.gen.d.ts.map +1 -0
  49. package/dist/client/core/queryKeySerializer.gen.js +93 -0
  50. package/dist/client/core/queryKeySerializer.gen.js.map +1 -0
  51. package/dist/client/core/serverSentEvents.gen.d.ts +72 -0
  52. package/dist/client/core/serverSentEvents.gen.d.ts.map +1 -0
  53. package/dist/client/core/serverSentEvents.gen.js +134 -0
  54. package/dist/client/core/serverSentEvents.gen.js.map +1 -0
  55. package/dist/client/core/types.gen.d.ts +79 -0
  56. package/dist/client/core/types.gen.d.ts.map +1 -0
  57. package/dist/client/core/types.gen.js +3 -0
  58. package/dist/client/core/types.gen.js.map +1 -0
  59. package/dist/client/core/utils.gen.d.ts +20 -0
  60. package/dist/client/core/utils.gen.d.ts.map +1 -0
  61. package/dist/client/core/utils.gen.js +88 -0
  62. package/dist/client/core/utils.gen.js.map +1 -0
  63. package/dist/client/index.d.ts +3 -0
  64. package/dist/client/index.d.ts.map +1 -0
  65. package/dist/client/index.js +3 -0
  66. package/dist/client/index.js.map +1 -0
  67. package/dist/client/sdk.gen.d.ts +45 -0
  68. package/dist/client/sdk.gen.d.ts.map +1 -0
  69. package/dist/client/sdk.gen.js +47 -0
  70. package/dist/client/sdk.gen.js.map +1 -0
  71. package/dist/client/types.gen.d.ts +694 -0
  72. package/dist/client/types.gen.d.ts.map +1 -0
  73. package/dist/client/types.gen.js +3 -0
  74. package/dist/client/types.gen.js.map +1 -0
  75. package/dist/client/zod.gen.d.ts +492 -0
  76. package/dist/client/zod.gen.d.ts.map +1 -0
  77. package/dist/client/zod.gen.js +413 -0
  78. package/dist/client/zod.gen.js.map +1 -0
  79. package/dist/config.d.ts +22 -0
  80. package/dist/config.d.ts.map +1 -0
  81. package/dist/config.js +94 -0
  82. package/dist/config.js.map +1 -0
  83. package/dist/exec.d.ts +18 -0
  84. package/dist/exec.d.ts.map +1 -0
  85. package/dist/exec.js +30 -0
  86. package/dist/exec.js.map +1 -0
  87. package/dist/help-loader.d.ts +13 -0
  88. package/dist/help-loader.d.ts.map +1 -0
  89. package/dist/help-loader.js +135 -0
  90. package/dist/help-loader.js.map +1 -0
  91. package/dist/index.d.ts +14 -0
  92. package/dist/index.d.ts.map +1 -0
  93. package/dist/index.js +148 -0
  94. package/dist/index.js.map +1 -0
  95. package/dist/init.d.ts +5 -0
  96. package/dist/init.d.ts.map +1 -0
  97. package/dist/init.js +62 -0
  98. package/dist/init.js.map +1 -0
  99. package/dist/models.d.ts +158 -0
  100. package/dist/models.d.ts.map +1 -0
  101. package/dist/models.js +8 -0
  102. package/dist/models.js.map +1 -0
  103. package/dist/project.d.ts +26 -0
  104. package/dist/project.d.ts.map +1 -0
  105. package/dist/project.js +101 -0
  106. package/dist/project.js.map +1 -0
  107. package/dist/prompt.d.ts +12 -0
  108. package/dist/prompt.d.ts.map +1 -0
  109. package/dist/prompt.js +88 -0
  110. package/dist/prompt.js.map +1 -0
  111. package/dist/providers.d.ts +26 -0
  112. package/dist/providers.d.ts.map +1 -0
  113. package/dist/providers.js +55 -0
  114. package/dist/providers.js.map +1 -0
  115. package/dist/runner.d.ts +34 -0
  116. package/dist/runner.d.ts.map +1 -0
  117. package/dist/runner.js +434 -0
  118. package/dist/runner.js.map +1 -0
  119. package/dist/schemas.d.ts +256 -0
  120. package/dist/schemas.d.ts.map +1 -0
  121. package/dist/schemas.js +59 -0
  122. package/dist/schemas.js.map +1 -0
  123. package/dist/suite-generator.d.ts +8 -0
  124. package/dist/suite-generator.d.ts.map +1 -0
  125. package/dist/suite-generator.js +100 -0
  126. package/dist/suite-generator.js.map +1 -0
  127. package/dist/thresholds.d.ts +10 -0
  128. package/dist/thresholds.d.ts.map +1 -0
  129. package/dist/thresholds.js +57 -0
  130. package/dist/thresholds.js.map +1 -0
  131. package/package.json +41 -0
  132. package/task_suites/curl.yaml +138 -0
  133. package/task_suites/docker.yaml +163 -0
  134. package/task_suites/gh.yaml +118 -0
  135. package/task_suites/jq.yaml +172 -0
  136. package/task_suites/kubectl.yaml +74 -0
@@ -0,0 +1,135 @@
1
+ /**
2
+ * Loads CLI help text from cached JSON files or live CLI execution.
3
+ *
4
+ * Cached mode reads from help_cache/<cli>.json, produced by
5
+ * `audit-worker --dry-run --output-help --output <file>`.
6
+ *
7
+ * Live mode shells out to the CLI's --help (requires CLI installed).
8
+ */
9
+ import { readFile, readdir } from 'node:fs/promises';
10
+ import { join } from 'node:path';
11
+ import { execFile } from 'node:child_process';
12
/**
 * Reads `<cacheDir>/<cliName>.json` and returns the help cache it holds.
 *
 * Two file layouts are understood:
 *  - an array of entries: the first entry whose `cli_name` equals `cliName`
 *    is used and reduced to `{ cli_name, help_texts, version }`;
 *  - a single object: returned unchanged when it has a truthy `help_texts`.
 *
 * @param cacheDir Directory containing the cached JSON files.
 * @param cliName  CLI name; also the base name of the JSON file.
 * @returns The help cache, or `null` when the file is missing, unreadable,
 *          not valid JSON, or does not carry help texts for `cliName`.
 */
export async function loadHelpFromCache(cacheDir, cliName) {
    try {
        const parsed = JSON.parse(await readFile(join(cacheDir, `${cliName}.json`), 'utf-8'));
        if (!Array.isArray(parsed)) {
            // Direct cache object.
            return parsed.help_texts ? parsed : null;
        }
        // Array layout: pick the entry for this CLI.
        const match = parsed.find((item) => item.cli_name === cliName);
        if (!match?.help_texts) {
            return null;
        }
        const { cli_name, help_texts, version } = match;
        return { cli_name, help_texts, version };
    }
    catch {
        // Best-effort loader: every failure is reported as "no cache".
        return null;
    }
}
38
/**
 * Lists the cache names found in `cacheDir`: every directory entry ending in
 * `.json`, with that extension removed.
 *
 * @param cacheDir Directory to scan.
 * @returns The names in directory-listing order, or an empty array when the
 *          directory cannot be read.
 */
export async function listAvailableCaches(cacheDir) {
    const SUFFIX = '.json';
    let entries;
    try {
        entries = await readdir(cacheDir);
    }
    catch {
        return [];
    }
    const names = [];
    for (const entry of entries) {
        if (entry.endsWith(SUFFIX)) {
            names.push(entry.slice(0, -SUFFIX.length));
        }
    }
    return names;
}
49
/**
 * Captures help text by running the CLI itself: `<cli> --help`, then
 * `<cli> <sub> --help` for each subcommand parsed out of the output,
 * recursing depth-first.
 *
 * Keys of the returned `help_texts` are the subcommand path joined by spaces;
 * the root command is stored under the empty string.
 *
 * @param cliName     Executable to invoke.
 * @param maxDepth    Deepest subcommand level to visit (root is depth 0).
 * @param maxCommands Upper bound on the number of captured help texts.
 * @returns `{ cli_name, help_texts }`; `help_texts` is empty when even the
 *          root help could not be captured.
 */
export async function loadHelpLive(cliName, maxDepth = 2, maxCommands = 50) {
    // A Map is used for bookkeeping instead of a plain object so that lookups
    // never hit Object.prototype: with `key in {}` a subcommand named e.g.
    // "constructor" or "toString" looked already captured and was skipped.
    const captured = new Map();
    const atCapacity = () => captured.size >= maxCommands;
    async function capture(commandParts, depth) {
        if (depth > maxDepth || atCapacity()) {
            return;
        }
        const pathKey = commandParts.slice(1).join(' ');
        if (captured.has(pathKey)) {
            return;
        }
        const output = await runHelp(commandParts);
        if (!output) {
            return;
        }
        captured.set(pathKey, output);
        // Only descend while there is both depth and capacity left.
        if (depth >= maxDepth || atCapacity()) {
            return;
        }
        for (const sub of parseSubcommands(output)) {
            if (atCapacity()) {
                break;
            }
            await capture([...commandParts, sub], depth + 1);
        }
    }
    await capture([cliName], 0);
    return {
        cli_name: cliName,
        help_texts: Object.fromEntries(captured),
    };
}
77
+ function runHelp(commandParts) {
78
+ return new Promise((resolve) => {
79
+ const [cmd, ...args] = commandParts;
80
+ execFile(cmd, [...args, '--help'], { timeout: 30_000 }, (err, stdout, stderr) => {
81
+ if (err) {
82
+ resolve(null);
83
+ return;
84
+ }
85
+ const output = (stdout || stderr).trim();
86
+ resolve(output || null);
87
+ });
88
+ });
89
+ }
90
/**
 * Minimal subcommand parser for `--help` output.
 *
 * The text is scanned line by line. Non-indented lines are treated as
 * potential section headers and switch a "inside a commands section" flag:
 *  - known non-command headers (flags, options, usage, examples, ...) turn it off;
 *  - a header containing the word "command"/"commands" turns it on;
 *  - any other `Title:` style header or ALL-CAPS line turns it off.
 * Indented lines inside a commands section contribute their first word as a
 * subcommand name (an optional trailing `*` or `:` is dropped). The name must
 * be followed by whitespace, so an entry without any text after it is ignored.
 *
 * The file describes this as mirroring the audit-worker capturer. One
 * deliberate difference: the commands-header pattern uses a lookahead for the
 * leading letter instead of consuming it, so headers that *start* with the
 * word itself ("Commands:", "COMMANDS") are recognised. Previously the first
 * letter was consumed before the word could match, and such headers closed
 * the section instead of opening it.
 *
 * @param helpText Raw help output.
 * @returns Unique subcommand names in order of first appearance.
 */
function parseSubcommands(helpText) {
    // `(?=[A-Z])` only asserts that the line starts with a letter (the /i flag
    // makes it case-insensitive) and leaves that letter available to `commands?`.
    const COMMAND_SECTION_RE = /^(?=[A-Z]).*\bcommands?\b.*:?\s*(\(.*\))?\s*$/i;
    const NON_COMMAND_SECTION_RE = /^(global\s+|cache\s+)?(flags|options|arguments?|usage|examples?|environment|help\s+topics|learn)\s*:?\s*$/i;
    const SECTION_HEADER_RE = /^[A-Z][\w\s/&-]*(\(.*\)\s*)?:\s*$/;
    // At least two leading whitespace characters, then the command word.
    const ENTRY_RE = /^\s{2,}([\w][\w-]*)\*?\s*:?\s/;
    // First words that show up in command sections but are prose, not commands.
    const SKIP_WORDS = new Set([
        'use', 'aliases', 'alias', 'see', 'for', 'more', 'additional',
        'help', 'usage', 'learn', 'read', 'find',
    ]);
    // A Set keeps insertion order and drops duplicates in one structure.
    const found = new Set();
    let inCommandsSection = false;
    for (const line of helpText.split('\n')) {
        const stripped = line.trim();
        if (!stripped) {
            continue;
        }
        // Only non-indented lines can be section headers.
        if (line === line.trimStart()) {
            if (NON_COMMAND_SECTION_RE.test(stripped)) {
                inCommandsSection = false;
            }
            else if (COMMAND_SECTION_RE.test(stripped)) {
                inCommandsSection = true;
            }
            else if (SECTION_HEADER_RE.test(stripped) ||
                (stripped === stripped.toUpperCase() && stripped.length > 3 && /^[A-Z]/.test(stripped))) {
                inCommandsSection = false;
            }
            continue;
        }
        if (!inCommandsSection) {
            continue;
        }
        const match = ENTRY_RE.exec(line);
        if (!match) {
            continue;
        }
        const cmd = match[1];
        if (!SKIP_WORDS.has(cmd.toLowerCase())) {
            found.add(cmd);
        }
    }
    return [...found];
}
135
+ //# sourceMappingURL=help-loader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"help-loader.js","sourceRoot":"","sources":["../src/help-loader.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,kBAAkB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAG9C,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,QAAgB,EAChB,OAAe;IAEf,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,GAAG,OAAO,OAAO,CAAC,CAAC;QACnD,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC9C,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAE7B,sEAAsE;QACtE,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;YACxB,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CACrB,CAAC,CAAwB,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,OAAO,CACrD,CAAC;YACF,IAAI,KAAK,EAAE,UAAU,EAAE,CAAC;gBACtB,OAAO;oBACL,QAAQ,EAAE,KAAK,CAAC,QAAQ;oBACxB,UAAU,EAAE,KAAK,CAAC,UAAU;oBAC5B,OAAO,EAAE,KAAK,CAAC,OAAO;iBACvB,CAAC;YACJ,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,OAAO,IAAiB,CAAC;QAC3B,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,QAAgB;IACxD,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,QAAQ,CAAC,CAAC;QACtC,OAAO,KAAK;aACT,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;aAClC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC,CAAC;IAC1C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,OAAe,EACf,WAAmB,CAAC,EACpB,cAAsB,EAAE;IAExB,MAAM,SAAS,GAA2B,EAAE,CAAC;IAE7C,KAAK,UAAU,OAAO,CACpB,YAAsB,EACtB,KAAa;QAEb,IAAI,KAAK,GAAG,QAAQ,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,MAAM,IAAI,WAAW,EAAE,CAAC;YACrE,OAAO;QACT,CAAC;QAED,MAAM,OAAO,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAChD,IAAI,OAAO,IAAI,SAAS;YAAE,OAAO;QAEjC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,YAAY,CAAC,CAAC;QAC3C,IAAI,CAAC,MAAM;YAAE,OAAO;QAEpB,SAAS,CAAC,OAAO,CAAC,GAAG,MAAM,CAAC;QAE5B,IAAI,KAAK,GAAG,QAAQ,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,MAAM,GAAG,WAAW,EAAE,CAAC;YACpE,MAAM,WAAW,GA
AG,gBAAgB,CAAC,MAAM,CAAC,CAAC;YAC7C,KAAK,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;gBAC9B,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,MAAM,IAAI,WAAW;oBAAE,MAAM;gBACxD,MAAM,OAAO,CAAC,CAAC,GAAG,YAAY,EAAE,GAAG,CAAC,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;YACnD,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,OAAO,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC;IAE5B,OAAO;QACL,QAAQ,EAAE,OAAO;QACjB,UAAU,EAAE,SAAS;KACtB,CAAC;AACJ,CAAC;AAED,SAAS,OAAO,CAAC,YAAsB;IACrC,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC7B,MAAM,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,GAAG,YAAY,CAAC;QACpC,QAAQ,CAAC,GAAI,EAAE,CAAC,GAAG,IAAI,EAAE,QAAQ,CAAC,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,EAAE;YAC/E,IAAI,GAAG,EAAE,CAAC;gBACR,OAAO,CAAC,IAAI,CAAC,CAAC;gBACd,OAAO;YACT,CAAC;YACD,MAAM,MAAM,GAAG,CAAC,MAAM,IAAI,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;YACzC,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,QAAgB;IACxC,MAAM,WAAW,GAAa,EAAE,CAAC;IACjC,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,IAAI,iBAAiB,GAAG,KAAK,CAAC;IAE9B,MAAM,kBAAkB,GAAG,4CAA4C,CAAC;IACxE,MAAM,sBAAsB,GAC1B,4GAA4G,CAAC;IAC/G,MAAM,iBAAiB,GAAG,mCAAmC,CAAC;IAE9D,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC;QACzB,KAAK,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,YAAY;QAC7D,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM;KACzC,CAAC,CAAC;IAEH,KAAK,MAAM,IAAI,IAAI,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QACxC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;QAC7B,IAAI,CAAC,QAAQ;YAAE,SAAS;QAExB,iDAAiD;QACjD,IAAI,IAAI,KAAK,IAAI,CAAC,SAAS,EAAE,EAAE,CAAC;YAC9B,IAAI,sBAAsB,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC1C,iBAAiB,GAAG,KAAK,CAAC;YAC5B,CAAC;iBAAM,IAAI,kBAAkB,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC7C,iBAAiB,GAAG,IAAI,CAAC;YAC3B,CAAC;iBAAM,IACL,iBAAiB,CAAC,IAAI,CAAC,QAAQ,CAAC;gBAChC,CAAC,QAAQ,KAAK,QAAQ,CAAC,WAAW,EAAE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,IAAI,QAAQ,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,EACvF,CAAC;gBACD,iBAAiB,GAAG,KAAK,CAAC;YAC5B,CAAC;YACD,SAAS;QACX,CAAC;QAED,IAAI,CAAC,iBAAiB;YAAE,SAAS;QAEjC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,+BAA+B,
CAAC,CAAC;QAC1D,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC;YACtB,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;gBACzD,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;gBACd,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAC;AACrB,CAAC"}
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * @cliwatch/cli-bench — LLM CLI Testing Engine
4
+ *
5
+ * Tests CLI agent-readiness by having LLMs execute tasks,
6
+ * then validating results with assertion-based checks.
7
+ *
8
+ * Entry modes:
9
+ * 1. Config file mode: cli-bench.yaml found → load config → run grid
10
+ * 2. Legacy mode: no config → discover task_suites/ → run grid
11
+ * 3. Init mode: scaffold cli-bench.yaml
12
+ */
13
+ export {};
14
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA;;;;;;;;;;GAUG"}
package/dist/index.js ADDED
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * @cliwatch/cli-bench — LLM CLI Testing Engine
4
+ *
5
+ * Tests CLI agent-readiness by having LLMs execute tasks,
6
+ * then validating results with assertion-based checks.
7
+ *
8
+ * Entry modes:
9
+ * 1. Config file mode: cli-bench.yaml found → load config → run grid
10
+ * 2. Legacy mode: no config → discover task_suites/ → run grid
11
+ * 3. Init mode: scaffold cli-bench.yaml
12
+ */
13
+ import { writeFile } from 'node:fs/promises';
14
+ import { parseArgs } from './config.js';
15
+ import { runGrid, uploadReport } from './runner.js';
16
+ import { resolveConfigFile, loadProject } from './project.js';
17
+ import { scaffoldProject } from './init.js';
18
+ import { validateGatewayKey, resolveProviders } from './providers.js';
19
+ import { checkThresholds, printThresholdResults } from './thresholds.js';
20
/**
 * CLI entry point.
 *
 * Flow:
 *  1. Init mode: scaffold `cli-bench.yaml` in the current directory and return.
 *  2. Config-file mode (a config file was resolved): merge CLI arguments with
 *     the file, run the grid, evaluate thresholds, then upload if enabled.
 *  3. Legacy mode (no config file): run the grid from CLI arguments alone and
 *     upload only when requested on the command line.
 * Afterwards the reports are optionally written to `config.output`, a summary
 * is printed, and the process exits with code 1 if an 'error'-behaviour
 * threshold failed. That exit is deliberately last so upload and summary
 * still happen.
 */
async function main() {
    const config = parseArgs(process.argv);
    // Posts each report in turn; a failed upload is logged and does not stop the rest.
    const uploadAll = async (items, backendUrl) => {
        for (const report of items) {
            try {
                await uploadReport(report, backendUrl, config.apiKey);
            }
            catch (e) {
                console.error(`Failed to upload report for ${report.cli}: ${e instanceof Error ? e.message : e}`);
            }
        }
    };
    // Init mode — scaffold and exit
    if (config.initMode) {
        try {
            const path = await scaffoldProject(process.cwd());
            console.log(`Created ${path}`);
            console.log('Edit cli-bench.yaml to configure your CLI tests, then run: npx @cliwatch/cli-bench');
        }
        catch (e) {
            console.error(e instanceof Error ? e.message : e);
            process.exit(1);
        }
        return;
    }
    console.log('@cliwatch/cli-bench v0.4.0');
    const configPath = await resolveConfigFile(config.configFile);
    let reports;
    let thresholdFailed = false;
    if (!configPath) {
        // Legacy task_suites/ discovery mode
        const filterLabel = config.filter.length > 0 ? config.filter.join(', ') : 'all';
        const modelsLabel = config.models.length > 0 ? config.models.join(', ') : 'all';
        console.log(`Filter: ${filterLabel}`);
        console.log(`Models: ${modelsLabel}`);
        console.log(`Help modes: ${config.helpModes.join(', ')}`);
        console.log(`Dry run: ${config.dryRun}`);
        if (!config.dryRun && config.models.length > 0) {
            validateGatewayKey();
        }
        reports = await runGrid({ config, globalRepeat: config.repeat });
        if (config.upload) {
            await uploadAll(reports, config.backendUrl);
        }
    }
    else {
        // Config file mode
        console.log(`Config: ${configPath}`);
        const { config: fileConfig, tasks } = await loadProject(configPath);
        const thresholdsConfig = fileConfig.thresholds;
        // Merge CLI args with file config
        let providers;
        if (config.models.length > 0) {
            providers = config.models;
        }
        else {
            providers = fileConfig.providers ?? ['anthropic/claude-sonnet-4-20250514'];
        }
        const knownHelpModes = ['injected', 'discoverable', 'none'];
        const helpModes = fileConfig.help_modes
            ? fileConfig.help_modes.filter((mode) => knownHelpModes.includes(mode))
            : config.helpModes;
        const concurrency = fileConfig.concurrency ?? config.concurrency;
        // Determine upload behavior
        const uploadMode = fileConfig.upload ?? 'auto';
        const shouldUpload = config.upload
            || uploadMode === 'always'
            || (uploadMode === 'auto' && !!config.apiKey);
        console.log(`CLI: ${fileConfig.cli}`);
        console.log(`Providers: ${providers.join(', ')}`);
        console.log(`Tasks: ${tasks.length}`);
        console.log(`Help modes: ${helpModes.join(', ')}`);
        console.log(`Dry run: ${config.dryRun}`);
        // Validate gateway key before running
        if (!config.dryRun) {
            validateGatewayKey();
        }
        const models = resolveProviders(providers);
        const globalRepeat = config.repeat ?? fileConfig.repeat;
        reports = await runGrid({
            config: { ...config, concurrency, helpModes },
            tasks,
            cliName: fileConfig.cli,
            models,
            versionCommand: fileConfig.version_command,
            workdir: fileConfig.workdir ?? config.workdir,
            globalRepeat,
        });
        // Check thresholds before upload so results are included in the payload
        if (thresholdsConfig && reports.length > 0 && !config.dryRun) {
            const allModelResults = reports.flatMap((r) => r.modelResults);
            const check = checkThresholds(allModelResults, thresholdsConfig);
            printThresholdResults(check);
            for (const report of reports) {
                report.thresholdResults = check;
            }
            // Defer exit(1) until after upload so results still get posted
            if (!check.allPassed && check.behavior === 'error') {
                thresholdFailed = true;
            }
        }
        // Upload if configured
        if (shouldUpload && !config.dryRun) {
            await uploadAll(reports, fileConfig.backend_url ?? config.backendUrl);
        }
    }
    if (config.output && reports.length > 0) {
        // A single report is written bare; several are written as an array.
        const payload = reports.length === 1 ? reports[0] : reports;
        await writeFile(config.output, JSON.stringify(payload, null, 2), 'utf-8');
        console.log(`\nResults written to ${config.output}`);
    }
    // Exit summary
    if (reports.length > 0 && !config.dryRun) {
        console.log('\n=== Final Summary ===');
        for (const report of reports) {
            for (const mr of report.modelResults) {
                console.log(`${report.cli} x ${mr.displayName} [${mr.helpMode}]: ${(mr.passRate * 100).toFixed(0)}% pass, avgTurns=${mr.avgTurnsToSuccess.toFixed(1)}`);
            }
        }
    }
    // Exit with failure after upload + summary so results are still posted
    if (thresholdFailed) {
        process.exit(1);
    }
}
// Any uncaught failure is printed and turned into a non-zero exit code.
main().catch((e) => {
    console.error(e);
    process.exit(1);
});
148
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC7C,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AACpD,OAAO,EAAE,iBAAiB,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAC9D,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,kBAAkB,EAAE,gBAAgB,EAAgB,MAAM,gBAAgB,CAAC;AAEpF,OAAO,EAAE,eAAe,EAAE,qBAAqB,EAAE,MAAM,iBAAiB,CAAC;AAEzE,KAAK,UAAU,IAAI;IACjB,MAAM,MAAM,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAEvC,gCAAgC;IAChC,IAAI,MAAM,CAAC,QAAQ,EAAE,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,eAAe,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC;YAClD,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC;YAC/B,OAAO,CAAC,GAAG,CAAC,oFAAoF,CAAC,CAAC;QACpG,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAClD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QACD,OAAO;IACT,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,4BAA4B,CAAC,CAAC;IAE1C,4BAA4B;IAC5B,MAAM,UAAU,GAAG,MAAM,iBAAiB,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;IAE9D,IAAI,OAAO,CAAC;IACZ,IAAI,gBAAoE,CAAC;IACzE,IAAI,eAAe,GAAG,KAAK,CAAC;IAE5B,IAAI,UAAU,EAAE,CAAC;QACf,mBAAmB;QACnB,OAAO,CAAC,GAAG,CAAC,WAAW,UAAU,EAAE,CAAC,CAAC;QACrC,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,KAAK,EAAE,GAAG,MAAM,WAAW,CAAC,UAAU,CAAC,CAAC;QACpE,gBAAgB,GAAG,UAAU,CAAC,UAAU,CAAC;QAEzC,kCAAkC;QAClC,MAAM,SAAS,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC;YACxC,CAAC,CAAC,MAAM,CAAC,MAAM;YACf,CAAC,CAAC,UAAU,CAAC,SAAS,IAAI,CAAC,oCAAoC,CAAC,CAAC;QACnE,MAAM,SAAS,GAAG,UAAU,CAAC,UAAU;YACrC,CAAC,CAAC,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAiB,EAAE,CAAC,CAAC,UAAU,EAAE,cAAc,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;YACtG,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC;QACrB,MAAM,WAAW,GAAG,UAAU,CAAC,WAAW,IAAI,MAAM,CAAC,WAAW,CAAC;QAEjE,4BAA4B;QAC5B,MAAM,UAAU,GAAG,UAAU,CAAC,MAAM,IAAI,MAAM,CAAC;QAC/C,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM;eAC7B,UAAU,KAAK,QAAQ;eACvB,CAAC,UAAU,KAAK,MAAM,IAAI,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QAEhD,OAAO,CAA
C,GAAG,CAAC,QAAQ,UAAU,CAAC,GAAG,EAAE,CAAC,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,cAAc,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAClD,OAAO,CAAC,GAAG,CAAC,UAAU,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,eAAe,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACnD,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;QAEzC,sCAAsC;QACtC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YACnB,kBAAkB,EAAE,CAAC;QACvB,CAAC;QAED,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;QAE3C,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM,IAAI,UAAU,CAAC,MAAM,CAAC;QAExD,OAAO,GAAG,MAAM,OAAO,CAAC;YACtB,MAAM,EAAE,EAAE,GAAG,MAAM,EAAE,WAAW,EAAE,SAAS,EAAE;YAC7C,KAAK;YACL,OAAO,EAAE,UAAU,CAAC,GAAG;YACvB,MAAM;YACN,cAAc,EAAE,UAAU,CAAC,eAAe;YAC1C,OAAO,EAAE,UAAU,CAAC,OAAO,IAAI,MAAM,CAAC,OAAO;YAC7C,YAAY;SACb,CAAC,CAAC;QAEH,wEAAwE;QACxE,IAAI,gBAAgB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YAC7D,MAAM,KAAK,GAAG,eAAe,CAC3B,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,CAAC,EACtC,gBAAgB,CACjB,CAAC;YACF,qBAAqB,CAAC,KAAK,CAAC,CAAC;YAC7B,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;gBAC7B,MAAM,CAAC,gBAAgB,GAAG,KAAK,CAAC;YAClC,CAAC;YACD,+DAA+D;YAC/D,IAAI,CAAC,KAAK,CAAC,SAAS,IAAI,KAAK,CAAC,QAAQ,KAAK,OAAO,EAAE,CAAC;gBACnD,eAAe,GAAG,IAAI,CAAC;YACzB,CAAC;QACH,CAAC;QAED,uBAAuB;QACvB,IAAI,YAAY,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YACnC,MAAM,UAAU,GAAG,UAAU,CAAC,WAAW,IAAI,MAAM,CAAC,UAAU,CAAC;YAC/D,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;gBAC7B,IAAI,CAAC;oBACH,MAAM,YAAY,CAAC,MAAM,EAAE,UAAU,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;gBACxD,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACX,OAAO,CAAC,KAAK,CACX,+BAA+B,MAAM,CAAC,GAAG,KAAK,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CACnF,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;SAAM,CAAC;QACN,qCAAqC;QACrC,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC;QACtF,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CA
AC,CAAC,KAAK,EAAE,CAAC,CAAC;QACtF,OAAO,CAAC,GAAG,CAAC,eAAe,MAAM,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;QAEzC,IAAI,CAAC,MAAM,CAAC,MAAM,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC/C,kBAAkB,EAAE,CAAC;QACvB,CAAC;QAED,OAAO,GAAG,MAAM,OAAO,CAAC,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;QAEjE,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;YAClB,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;gBAC7B,IAAI,CAAC;oBACH,MAAM,YAAY,CAAC,MAAM,EAAE,MAAM,CAAC,UAAU,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;gBAC/D,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACX,OAAO,CAAC,KAAK,CACX,+BAA+B,MAAM,CAAC,GAAG,KAAK,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CACnF,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,MAAM,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,IAAI,CAAC,SAAS,CAC3B,OAAO,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,EAC3C,IAAI,EACJ,CAAC,CACF,CAAC;QACF,MAAM,SAAS,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;QAChD,OAAO,CAAC,GAAG,CAAC,wBAAwB,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;IACvD,CAAC;IAED,eAAe;IACf,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;QACzC,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,CAAC;QACvC,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;YAC7B,KAAK,MAAM,EAAE,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;gBACrC,OAAO,CAAC,GAAG,CACT,GAAG,MAAM,CAAC,GAAG,MAAM,EAAE,CAAC,WAAW,KAAK,EAAE,CAAC,QAAQ,MAAM,CAAC,EAAE,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,oBAAoB,EAAE,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAC3I,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC;IAED,uEAAuE;IACvE,IAAI,eAAe,EAAE,CAAC;QACpB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC;AAED,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE;IACjB,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACjB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC"}
package/dist/init.d.ts ADDED
@@ -0,0 +1,5 @@
1
+ /**
2
+ * Scaffolding — writes a starter cli-bench.yaml.
3
+ */
4
+ export declare function scaffoldProject(cwd: string): Promise<string>;
5
+ //# sourceMappingURL=init.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"init.d.ts","sourceRoot":"","sources":["../src/init.ts"],"names":[],"mappings":"AAAA;;GAEG;AAiDH,wBAAsB,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAYlE"}
package/dist/init.js ADDED
@@ -0,0 +1,62 @@
1
+ /**
2
+ * Scaffolding — writes a starter cli-bench.yaml.
3
+ */
4
+ import { writeFile, access } from 'node:fs/promises';
5
+ import { join } from 'node:path';
6
+ const TEMPLATE = `# cli-bench.yaml — @cliwatch/cli-bench configuration
7
+ # Docs: https://github.com/anthropics/cliwatch
8
+
9
+ # Required: the CLI to test (must be in PATH)
10
+ cli: my-cli
11
+ # version_command: "my-cli --version"
12
+
13
+ # LLM providers to test against (requires corresponding API key env vars)
14
+ providers:
15
+ - anthropic/claude-sonnet-4-20250514
16
+ # - openai/gpt-4o
17
+ # - google/gemini-2.5-pro
18
+
19
+ # Optional settings
20
+ # help_modes: [injected] # injected | discoverable | none
21
+ # concurrency: 3 # max concurrent API calls
22
+ # workdir: ./workspace # working directory for commands (default: temp dir)
23
+ # upload: auto # auto | always | never (auto uploads if CLIWATCH_API_KEY is set)
24
+
25
+ tasks:
26
+ - id: show-help
27
+ intent: "Show the help information for the CLI"
28
+ assert:
29
+ - ran: "my-cli.*--help"
30
+ - exit_code: 0
31
+
32
+ - id: example-task
33
+ intent: "Describe what the agent should do in natural language"
34
+ # setup:
35
+ # - "mkdir -p /tmp/workspace"
36
+ assert:
37
+ - ran: "my-cli"
38
+ - exit_code: 0
39
+ # - output_contains: "expected output"
40
+ # - file_exists: "/tmp/workspace/output.txt"
41
+ # - verify:
42
+ # run: "my-cli status --json"
43
+ # output_contains: "ok"
44
+
45
+ # Split tasks across files for larger suites:
46
+ # - file://tasks/basics.yaml
47
+ # - file://tasks/advanced/*.yaml
48
+ `;
49
/**
 * Writes the starter `cli-bench.yaml` into `cwd`.
 *
 * The file is created exclusively (`wx`), so the existence check and the
 * write are a single operation: an existing file is never overwritten, even
 * if it appears between "check" and "write". This replaces a separate
 * `access()` probe whose result was recognised by matching on an error
 * message.
 *
 * @param cwd Directory in which to create the config file.
 * @returns Path of the created file.
 * @throws Error `cli-bench.yaml already exists in <cwd>` when the file is
 *         already present; any other write error is propagated unchanged.
 */
export async function scaffoldProject(cwd) {
    const filePath = join(cwd, 'cli-bench.yaml');
    try {
        await writeFile(filePath, TEMPLATE, { encoding: 'utf-8', flag: 'wx' });
    }
    catch (e) {
        if (e instanceof Error && 'code' in e && e.code === 'EEXIST') {
            throw new Error(`cli-bench.yaml already exists in ${cwd}`);
        }
        throw e;
    }
    return filePath;
}
62
+ //# sourceMappingURL=init.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"init.js","sourceRoot":"","sources":["../src/init.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,MAAM,QAAQ,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA0ChB,CAAC;AAEF,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,GAAW;IAC/C,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,gBAAgB,CAAC,CAAC;IAE7C,IAAI,CAAC;QACH,MAAM,MAAM,CAAC,QAAQ,CAAC,CAAC;QACvB,MAAM,IAAI,KAAK,CAAC,oCAAoC,GAAG,EAAE,CAAC,CAAC;IAC7D,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,IAAI,CAAC,YAAY,KAAK,IAAI,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAC;YAAE,MAAM,CAAC,CAAC;IAC1E,CAAC;IAED,MAAM,SAAS,CAAC,QAAQ,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;IAC7C,OAAO,QAAQ,CAAC;AAClB,CAAC"}
@@ -0,0 +1,158 @@
1
+ /**
2
+ * TypeScript types for @cliwatch/cli-bench.
3
+ *
4
+ * Assertion-based evaluation: tasks define assertions that are checked
5
+ * against the agent's execution trace.
6
+ */
7
+ export type HelpMode = 'injected' | 'discoverable' | 'none'; // How CLI help text reaches the agent. NOTE(review): presumably injected up front, discovered by the agent itself, or withheld — confirm against the runner.
8
+ export type Assertion = { // One check against the agent's execution trace; every variant is an object with a single key naming the check.
9
+ output_contains: string; // NOTE(review): presumably a substring expected in command output — confirm in assertions.js
10
+ } | {
11
+ output_equals: string; // NOTE(review): presumably the exact expected output — confirm whether whitespace is trimmed
12
+ } | {
13
+ error_contains: string; // NOTE(review): presumably matched against error output (stderr?) — confirm
14
+ } | {
15
+ exit_code: number; // exit code to expect (the starter template asserts 0)
16
+ } | {
17
+ file_exists: string; // a file path (starter template example: "/tmp/workspace/output.txt")
18
+ } | {
19
+ file_contains: {
20
+ path: string;
21
+ text: string;
22
+ };
23
+ } | {
24
+ ran: string; // pattern for a command the agent ran; looks like a regex (starter template uses "my-cli.*--help") — confirm
25
+ } | {
26
+ not_ran: string; // NOTE(review): presumably the inverse of `ran` — confirm
27
+ } | {
28
+ run_count: {
29
+ pattern: string;
30
+ min?: number;
31
+ max?: number;
32
+ };
33
+ } | {
34
+ verify: { // a follow-up command plus an optional check on its output (starter template: run "my-cli status --json", output_contains "ok")
35
+ run: string;
36
+ output_contains?: string;
37
+ output_equals?: string;
38
+ };
39
+ };
40
+ export interface AssertionResult { // Outcome of one assertion: the assertion itself plus whether it passed.
41
+ assertion: Assertion;
42
+ passed: boolean;
43
+ actual?: string; // NOTE(review): presumably the observed value, for reporting — confirm
44
+ expected?: string; // NOTE(review): presumably the expected value, for reporting — confirm
45
+ }
46
+ export interface Task { // One benchmark task: an intent for the agent plus the assertions used to evaluate it.
47
+ id: string; // task identifier; resolveTaskRefs deduplicates tasks by this ID
48
+ intent: string; // natural-language description of what the agent should do
49
+ assert: Assertion[];
50
+ setup?: string[]; // NOTE(review): presumably shell commands run before the task (starter template example: "mkdir -p /tmp/workspace") — confirm
51
+ max_turns?: number; // NOTE(review): presumably a cap on agent turns — confirm
52
+ difficulty?: 'easy' | 'medium' | 'hard';
53
+ category?: string;
54
+ repeat?: number; // NOTE(review): presumably how many times the task is run — confirm
55
+ }
56
+ export interface TaskSuite { // A CLI together with a fully-resolved list of tasks (no string references, unlike ConfigFile.tasks).
57
+ cli: string;
58
+ version_command?: string; // NOTE(review): presumably the command used to read the CLI's version — confirm
59
+ tasks: Task[];
60
+ }
61
+ export type UploadMode = 'auto' | 'always' | 'never'; // Result-upload policy; per the starter template, 'auto' uploads if CLIWATCH_API_KEY is set.
62
+ export interface ThresholdsConfig { // Threshold settings from the config file. NOTE(review): presumably minimum pass rates — confirm units (fraction vs percent).
63
+ default?: number;
64
+ models?: Record<string, number>; // NOTE(review): presumably per-model overrides keyed by model name/ID — confirm
65
+ tolerance?: number;
66
+ behavior?: 'error' | 'informational';
67
+ }
68
+ export interface ConfigFile { // Shape of a parsed cli-bench.yaml config file (the type loadConfigFile resolves to).
69
+ cli: string;
70
+ version_command?: string;
71
+ providers?: string[];
72
+ help_modes?: string[]; // NOTE(review): typed as plain strings here rather than HelpMode[] — presumably validated elsewhere; confirm
73
+ concurrency?: number; // max concurrent API calls (per the starter template)
74
+ workdir?: string; // working directory for commands; the starter template says the default is a temp dir
75
+ upload?: UploadMode;
76
+ backend_url?: string;
77
+ repeat?: number;
78
+ thresholds?: ThresholdsConfig;
79
+ tasks: (Task | string)[]; // inline tasks and/or file:// reference strings (globs allowed), e.g. "file://tasks/advanced/*.yaml"
80
+ }
81
+ export type Provider = string; // Plain string alias; the type places no restriction on provider names.
82
+ export interface ModelConfig { // Settings describing one model to evaluate; all fields are required.
83
+ provider: Provider;
84
+ modelId: string;
85
+ displayName: string;
86
+ temperature: number;
87
+ maxTokens: number;
88
+ }
89
+ export interface TaskEval { // Result record for one evaluation of one task: pass/fail, per-assertion results, and usage totals.
90
+ taskId: string;
91
+ passed: boolean;
92
+ failureReason?: string;
93
+ assertionResults: AssertionResult[];
94
+ turnsUsed: number;
95
+ totalInputTokens: number;
96
+ totalOutputTokens: number;
97
+ totalLatencyMs: number;
98
+ commandsRun: string[];
99
+ difficulty?: 'easy' | 'medium' | 'hard';
100
+ category?: string;
101
+ conversationTrace?: unknown[]; // element shape is not specified by this type
102
+ taskDefinition?: Record<string, unknown>;
103
+ repeatIndex?: number; // NOTE(review): presumably which repetition this is when a task is repeated — confirm whether 0- or 1-based
104
+ }
105
+ export interface ModelResult { // Results for one model under one help mode: the individual task results plus summary numbers.
106
+ provider: Provider;
107
+ modelId: string;
108
+ displayName: string;
109
+ helpMode: HelpMode;
110
+ taskResults: TaskEval[];
111
+ passRate: number; // NOTE(review): scale (0–1 vs 0–100) is not visible here — confirm
112
+ avgTurnsToSuccess: number;
113
+ avgTotalTokens: number;
114
+ avgLatencyMs: number;
115
+ }
116
+ export interface ThresholdResult { // Threshold comparison for a single model.
117
+ model: string;
118
+ passRate: number;
119
+ threshold: number;
120
+ tolerance: number;
121
+ effectiveMin: number; // NOTE(review): presumably threshold minus tolerance — confirm where it is computed
122
+ passed: boolean;
123
+ }
124
+ export interface ThresholdCheckResult { // Per-model threshold results plus an overall flag.
125
+ allPassed: boolean;
126
+ results: ThresholdResult[];
127
+ behavior: 'error' | 'informational'; // same value set as ThresholdsConfig.behavior, but required here
128
+ }
129
+ export interface GridReport { // Top-level report: the CLI, all model results, and optional git/CI metadata.
130
+ cli: string;
131
+ cliVersion?: string;
132
+ taskSuiteVersion: string;
133
+ gridVersion: string;
134
+ taskCount: number;
135
+ totalEvals?: number;
136
+ generatedAt: string; // NOTE(review): presumably a timestamp string (ISO 8601?) — confirm
137
+ modelResults: ModelResult[];
138
+ systemPrompt?: string;
139
+ gitSha?: string;
140
+ gitRef?: string;
141
+ gitBaseRef?: string;
142
+ commitMessage?: string;
143
+ gitAuthor?: string;
144
+ ciProvider?: string;
145
+ ciBuildUrl?: string;
146
+ ciEvent?: string;
147
+ prNumber?: number;
148
+ repository?: string;
149
+ tags?: string[];
150
+ taskSuiteHash?: string;
151
+ thresholdResults?: ThresholdCheckResult;
152
+ }
153
+ export interface HelpCache { // Help text stored for one CLI.
154
+ cli_name: string;
155
+ help_texts: Record<string, string>; // NOTE(review): key meaning is not visible here — presumably a command or subcommand; confirm
156
+ version?: string;
157
+ }
158
+ //# sourceMappingURL=models.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,MAAM,MAAM,QAAQ,GAAG,UAAU,GAAG,cAAc,GAAG,MAAM,CAAC;AAM5D,MAAM,MAAM,SAAS,GACjB;IAAE,eAAe,EAAE,MAAM,CAAA;CAAE,GAC3B;IAAE,aAAa,EAAE,MAAM,CAAA;CAAE,GACzB;IAAE,cAAc,EAAE,MAAM,CAAA;CAAE,GAC1B;IAAE,SAAS,EAAE,MAAM,CAAA;CAAE,GACrB;IAAE,WAAW,EAAE,MAAM,CAAA;CAAE,GACvB;IAAE,aAAa,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,GACjD;IAAE,GAAG,EAAE,MAAM,CAAA;CAAE,GACf;IAAE,OAAO,EAAE,MAAM,CAAA;CAAE,GACnB;IAAE,SAAS,EAAE;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,GAAG,CAAC,EAAE,MAAM,CAAC;QAAC,GAAG,CAAC,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,GAC9D;IAAE,MAAM,EAAE;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,eAAe,CAAC,EAAE,MAAM,CAAC;QAAC,aAAa,CAAC,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,CAAC;AAElF,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,SAAS,CAAC;IACrB,MAAM,EAAE,OAAO,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,IAAI;IACnB,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,SAAS,EAAE,CAAC;IACpB,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,SAAS;IACxB,GAAG,EAAE,MAAM,CAAC;IACZ,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,KAAK,EAAE,IAAI,EAAE,CAAC;CACf;AAMD,MAAM,MAAM,UAAU,GAAG,MAAM,GAAG,QAAQ,GAAG,OAAO,CAAC;AAErD,MAAM,WAAW,gBAAgB;IAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,OAAO,GAAG,eAAe,CAAC;CACtC;AAED,MAAM,WAAW,UAAU;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,UAAU,CAAC;IACpB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,gBAAgB,CAAC;IAC9B,KAAK,EAAE,CAAC,IAAI,GAAG,MAAM,CAAC,EAAE,CAAC;CAC1B;AAMD,MAAM,MAAM,QAAQ,GAAG,MAAM,CAAC;AAE9B,MAA
M,WAAW,WAAW;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;CACnB;AAMD,MAAM,WAAW,QAAQ;IACvB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,OAAO,CAAC;IAChB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,EAAE,eAAe,EAAE,CAAC;IACpC,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,UAAU,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,iBAAiB,CAAC,EAAE,OAAO,EAAE,CAAC;IAC9B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACzC,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAMD,MAAM,WAAW,WAAW;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,QAAQ,CAAC;IACnB,WAAW,EAAE,QAAQ,EAAE,CAAC;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,MAAM,EAAE,OAAO,CAAC;CACjB;AAED,MAAM,WAAW,oBAAoB;IACnC,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,eAAe,EAAE,CAAC;IAC3B,QAAQ,EAAE,OAAO,GAAG,eAAe,CAAC;CACrC;AAED,MAAM,WAAW,UAAU;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gBAAgB,EAAE,MAAM,CAAC;IACzB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,WAAW,EAAE,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,CAAC,EAAE,oBAAoB,CAAC;CACzC;AAMD,MAAM,WAAW,SAAS;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,
OAAO,CAAC,EAAE,MAAM,CAAC;CAClB"}
package/dist/models.js ADDED
@@ -0,0 +1,8 @@
1
+ /**
2
+ * TypeScript types for @cliwatch/cli-bench.
3
+ *
4
+ * Assertion-based evaluation: tasks define assertions that are checked
5
+ * against the agent's execution trace.
6
+ */
7
+ export {};
8
+ //# sourceMappingURL=models.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"models.js","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG"}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Config file loader — discovers and parses cli-bench.yaml,
3
+ * resolves file:// task references with glob support.
4
+ */
5
+ import type { Task, ConfigFile } from './models.js';
6
+ /**
7
+ * Find cli-bench.yaml in the given directory (or CWD).
8
+ */
9
+ export declare function resolveConfigFile(explicitPath?: string): Promise<string | null>; // Resolves to a config file path or null. NOTE(review): presumably null means no cli-bench.yaml was found — confirm in project.js.
10
+ /**
11
+ * Parse and validate a cli-bench.yaml config file.
12
+ */
13
+ export declare function loadConfigFile(path: string): Promise<ConfigFile>; // `path` is the YAML file to parse; task references in the result are still unresolved (see ConfigFile.tasks).
14
+ /**
15
+ * Resolve file:// references and inline tasks into a flat task array.
16
+ * Deduplicates by task ID (first occurrence wins).
17
+ */
18
+ export declare function resolveTaskRefs(tasks: (Task | string)[], baseDir: string): Promise<Task[]>; // String entries are file:// references (glob support per the module header). NOTE(review): baseDir is presumably what relative references resolve against — confirm.
19
+ /**
20
+ * Load config file and resolve all task references.
21
+ */
22
+ export declare function loadProject(configPath: string): Promise<{ // Resolves to the parsed config together with the flattened task list.
23
+ config: ConfigFile;
24
+ tasks: Task[];
25
+ }>;
26
+ //# sourceMappingURL=project.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"project.d.ts","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAOH,OAAO,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAIpD;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,YAAY,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAqBrF;AAED;;GAEG;AACH,wBAAsB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAItE;AAED;;;GAGG;AACH,wBAAsB,eAAe,CACnC,KAAK,EAAE,CAAC,IAAI,GAAG,MAAM,CAAC,EAAE,EACxB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,IAAI,EAAE,CAAC,CA6CjB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC;IAAE,MAAM,EAAE,UAAU,CAAC;IAAC,KAAK,EAAE,IAAI,EAAE,CAAA;CAAE,CAAC,CAKpG"}