@cliwatch/cli-bench 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +145 -0
  3. package/dist/assertions.d.ts +14 -0
  4. package/dist/assertions.d.ts.map +1 -0
  5. package/dist/assertions.js +161 -0
  6. package/dist/assertions.js.map +1 -0
  7. package/dist/ci.d.ts +29 -0
  8. package/dist/ci.d.ts.map +1 -0
  9. package/dist/ci.js +75 -0
  10. package/dist/ci.js.map +1 -0
  11. package/dist/client/client/client.gen.d.ts +3 -0
  12. package/dist/client/client/client.gen.d.ts.map +1 -0
  13. package/dist/client/client/client.gen.js +235 -0
  14. package/dist/client/client/client.gen.js.map +1 -0
  15. package/dist/client/client/index.d.ts +9 -0
  16. package/dist/client/client/index.d.ts.map +1 -0
  17. package/dist/client/client/index.js +7 -0
  18. package/dist/client/client/index.js.map +1 -0
  19. package/dist/client/client/types.gen.d.ts +118 -0
  20. package/dist/client/client/types.gen.d.ts.map +1 -0
  21. package/dist/client/client/types.gen.js +3 -0
  22. package/dist/client/client/types.gen.js.map +1 -0
  23. package/dist/client/client/utils.gen.d.ts +34 -0
  24. package/dist/client/client/utils.gen.d.ts.map +1 -0
  25. package/dist/client/client/utils.gen.js +229 -0
  26. package/dist/client/client/utils.gen.js.map +1 -0
  27. package/dist/client/client.gen.d.ts +13 -0
  28. package/dist/client/client.gen.d.ts.map +1 -0
  29. package/dist/client/client.gen.js +4 -0
  30. package/dist/client/client.gen.js.map +1 -0
  31. package/dist/client/core/auth.gen.d.ts +19 -0
  32. package/dist/client/core/auth.gen.d.ts.map +1 -0
  33. package/dist/client/core/auth.gen.js +15 -0
  34. package/dist/client/core/auth.gen.js.map +1 -0
  35. package/dist/client/core/bodySerializer.gen.d.ts +26 -0
  36. package/dist/client/core/bodySerializer.gen.d.ts.map +1 -0
  37. package/dist/client/core/bodySerializer.gen.js +58 -0
  38. package/dist/client/core/bodySerializer.gen.js.map +1 -0
  39. package/dist/client/core/params.gen.d.ts +44 -0
  40. package/dist/client/core/params.gen.d.ts.map +1 -0
  41. package/dist/client/core/params.gen.js +101 -0
  42. package/dist/client/core/params.gen.js.map +1 -0
  43. package/dist/client/core/pathSerializer.gen.d.ts +34 -0
  44. package/dist/client/core/pathSerializer.gen.d.ts.map +1 -0
  45. package/dist/client/core/pathSerializer.gen.js +107 -0
  46. package/dist/client/core/pathSerializer.gen.js.map +1 -0
  47. package/dist/client/core/queryKeySerializer.gen.d.ts +19 -0
  48. package/dist/client/core/queryKeySerializer.gen.d.ts.map +1 -0
  49. package/dist/client/core/queryKeySerializer.gen.js +93 -0
  50. package/dist/client/core/queryKeySerializer.gen.js.map +1 -0
  51. package/dist/client/core/serverSentEvents.gen.d.ts +72 -0
  52. package/dist/client/core/serverSentEvents.gen.d.ts.map +1 -0
  53. package/dist/client/core/serverSentEvents.gen.js +134 -0
  54. package/dist/client/core/serverSentEvents.gen.js.map +1 -0
  55. package/dist/client/core/types.gen.d.ts +79 -0
  56. package/dist/client/core/types.gen.d.ts.map +1 -0
  57. package/dist/client/core/types.gen.js +3 -0
  58. package/dist/client/core/types.gen.js.map +1 -0
  59. package/dist/client/core/utils.gen.d.ts +20 -0
  60. package/dist/client/core/utils.gen.d.ts.map +1 -0
  61. package/dist/client/core/utils.gen.js +88 -0
  62. package/dist/client/core/utils.gen.js.map +1 -0
  63. package/dist/client/index.d.ts +3 -0
  64. package/dist/client/index.d.ts.map +1 -0
  65. package/dist/client/index.js +3 -0
  66. package/dist/client/index.js.map +1 -0
  67. package/dist/client/sdk.gen.d.ts +45 -0
  68. package/dist/client/sdk.gen.d.ts.map +1 -0
  69. package/dist/client/sdk.gen.js +47 -0
  70. package/dist/client/sdk.gen.js.map +1 -0
  71. package/dist/client/types.gen.d.ts +694 -0
  72. package/dist/client/types.gen.d.ts.map +1 -0
  73. package/dist/client/types.gen.js +3 -0
  74. package/dist/client/types.gen.js.map +1 -0
  75. package/dist/client/zod.gen.d.ts +492 -0
  76. package/dist/client/zod.gen.d.ts.map +1 -0
  77. package/dist/client/zod.gen.js +413 -0
  78. package/dist/client/zod.gen.js.map +1 -0
  79. package/dist/config.d.ts +22 -0
  80. package/dist/config.d.ts.map +1 -0
  81. package/dist/config.js +94 -0
  82. package/dist/config.js.map +1 -0
  83. package/dist/exec.d.ts +18 -0
  84. package/dist/exec.d.ts.map +1 -0
  85. package/dist/exec.js +30 -0
  86. package/dist/exec.js.map +1 -0
  87. package/dist/help-loader.d.ts +13 -0
  88. package/dist/help-loader.d.ts.map +1 -0
  89. package/dist/help-loader.js +135 -0
  90. package/dist/help-loader.js.map +1 -0
  91. package/dist/index.d.ts +14 -0
  92. package/dist/index.d.ts.map +1 -0
  93. package/dist/index.js +148 -0
  94. package/dist/index.js.map +1 -0
  95. package/dist/init.d.ts +5 -0
  96. package/dist/init.d.ts.map +1 -0
  97. package/dist/init.js +62 -0
  98. package/dist/init.js.map +1 -0
  99. package/dist/models.d.ts +158 -0
  100. package/dist/models.d.ts.map +1 -0
  101. package/dist/models.js +8 -0
  102. package/dist/models.js.map +1 -0
  103. package/dist/project.d.ts +26 -0
  104. package/dist/project.d.ts.map +1 -0
  105. package/dist/project.js +101 -0
  106. package/dist/project.js.map +1 -0
  107. package/dist/prompt.d.ts +12 -0
  108. package/dist/prompt.d.ts.map +1 -0
  109. package/dist/prompt.js +88 -0
  110. package/dist/prompt.js.map +1 -0
  111. package/dist/providers.d.ts +26 -0
  112. package/dist/providers.d.ts.map +1 -0
  113. package/dist/providers.js +55 -0
  114. package/dist/providers.js.map +1 -0
  115. package/dist/runner.d.ts +34 -0
  116. package/dist/runner.d.ts.map +1 -0
  117. package/dist/runner.js +434 -0
  118. package/dist/runner.js.map +1 -0
  119. package/dist/schemas.d.ts +256 -0
  120. package/dist/schemas.d.ts.map +1 -0
  121. package/dist/schemas.js +59 -0
  122. package/dist/schemas.js.map +1 -0
  123. package/dist/suite-generator.d.ts +8 -0
  124. package/dist/suite-generator.d.ts.map +1 -0
  125. package/dist/suite-generator.js +100 -0
  126. package/dist/suite-generator.js.map +1 -0
  127. package/dist/thresholds.d.ts +10 -0
  128. package/dist/thresholds.d.ts.map +1 -0
  129. package/dist/thresholds.js +57 -0
  130. package/dist/thresholds.js.map +1 -0
  131. package/package.json +41 -0
  132. package/task_suites/curl.yaml +138 -0
  133. package/task_suites/docker.yaml +163 -0
  134. package/task_suites/gh.yaml +118 -0
  135. package/task_suites/jq.yaml +172 -0
  136. package/task_suites/kubectl.yaml +74 -0
@@ -0,0 +1,101 @@
1
+ /**
2
+ * Config file loader — discovers and parses cli-bench.yaml,
3
+ * resolves file:// task references with glob support.
4
+ */
5
+ import { readFile, access } from 'node:fs/promises';
6
+ import { join, dirname, resolve } from 'node:path';
7
+ import { glob } from 'node:fs/promises';
8
+ import { parse as parseYaml } from 'yaml';
9
+ import { ConfigFileSchema, TaskFileSchema, TaskSchema } from './schemas.js';
10
/** Config file names probed in the current working directory, in priority order. */
const CONFIG_FILENAMES = ['cli-bench.yaml', 'cli-bench.yml'];

/**
 * Resolve `target` to an absolute path if it can be accessed, otherwise yield null.
 */
async function resolveIfAccessible(target) {
  try {
    await access(target);
    return resolve(target);
  } catch {
    return null;
  }
}

/**
 * Locate the cli-bench config file.
 *
 * When `explicitPath` is given, only that path is checked: its absolute form is
 * returned if it is accessible, and null otherwise (no fallback discovery happens).
 * Without an explicit path, each name in CONFIG_FILENAMES is tried relative to the
 * current working directory and the first accessible one wins.
 *
 * @param explicitPath Optional path to a config file.
 * @returns Absolute path of the config file, or null when none is accessible.
 */
export async function resolveConfigFile(explicitPath) {
  if (explicitPath) {
    return resolveIfAccessible(explicitPath);
  }

  for (const fileName of CONFIG_FILENAMES) {
    const found = await resolveIfAccessible(resolve(fileName));
    if (found !== null) {
      return found;
    }
  }

  return null;
}
36
/**
 * Read a cli-bench config file, parse it as YAML and validate it.
 *
 * @param path Location of the config file to read (as UTF-8).
 * @returns The result of `ConfigFileSchema.parse` on the parsed YAML document.
 * Read errors, YAML syntax errors and schema validation errors all propagate.
 */
export async function loadConfigFile(path) {
  const yamlText = await readFile(path, 'utf-8');
  return ConfigFileSchema.parse(parseYaml(yamlText));
}
44
/**
 * Flatten a mixed list of task entries into validated task objects.
 *
 * String entries are file references: an optional leading `file://` is stripped and
 * the remainder is joined onto `baseDir`. References containing `*` are expanded with
 * glob (keeping only `.yaml` / `.yml` matches, in sorted order); any other reference
 * is read as a single file. Non-string entries are inline tasks validated with
 * `TaskSchema`. Entries are processed in order and a task whose id has already been
 * seen is dropped, so the first occurrence wins.
 *
 * @param tasks   Entries from the config: file reference strings or inline tasks.
 * @param baseDir Directory that file references are resolved against.
 * @returns The de-duplicated tasks in encounter order.
 */
export async function resolveTaskRefs(tasks, baseDir) {
  const collected = [];
  const knownIds = new Set();

  // Shared by both entry kinds: keep a task only the first time its id appears.
  const keepIfNew = (task) => {
    if (knownIds.has(task.id)) {
      return;
    }
    knownIds.add(task.id);
    collected.push(task);
  };

  for (const entry of tasks) {
    if (typeof entry !== 'string') {
      // Inline task: validate through the schema before accepting it.
      keepIfNew(TaskSchema.parse(entry));
      continue;
    }

    // File reference, optionally written as file://<path>.
    const ref = entry.replace(/^file:\/\//, '');
    const target = join(baseDir, ref);

    let files;
    if (ref.includes('*')) {
      files = [];
      for await (const hit of glob(target)) {
        if (hit.endsWith('.yaml') || hit.endsWith('.yml')) {
          files.push(hit);
        }
      }
      // Sort so the load order (and therefore dedupe outcome) is deterministic.
      files.sort();
    } else {
      files = [target];
    }

    for (const file of files) {
      const yamlText = await readFile(file, 'utf-8');
      const fileTasks = TaskFileSchema.parse(parseYaml(yamlText));
      for (const task of fileTasks) {
        keepIfNew(task);
      }
    }
  }

  return collected;
}
92
/**
 * Load a config file and expand its task list.
 *
 * File references in `config.tasks` are resolved relative to the directory that
 * contains the config file.
 *
 * @param configPath Path to the cli-bench config file.
 * @returns The validated config together with its flattened task list.
 */
export async function loadProject(configPath) {
  const config = await loadConfigFile(configPath);
  return {
    config,
    tasks: await resolveTaskRefs(config.tasks, dirname(configPath)),
  };
}
101
+ //# sourceMappingURL=project.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"project.js","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAC;AACxC,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,MAAM,MAAM,CAAC;AAC1C,OAAO,EAAE,gBAAgB,EAAE,cAAc,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAG5E,MAAM,gBAAgB,GAAG,CAAC,gBAAgB,EAAE,eAAe,CAAC,CAAC;AAE7D;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,YAAqB;IAC3D,IAAI,YAAY,EAAE,CAAC;QACjB,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC;YAC3B,OAAO,OAAO,CAAC,YAAY,CAAC,CAAC;QAC/B,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,gBAAgB,EAAE,CAAC;QACpC,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;QAChC,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;YACxB,OAAO,SAAS,CAAC;QACnB,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,IAAY;IAC/C,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IAC1C,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;IAC9B,OAAO,gBAAgB,CAAC,KAAK,CAAC,MAAM,CAAe,CAAC;AACtD,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,KAAwB,EACxB,OAAe;IAEf,MAAM,QAAQ,GAAW,EAAE,CAAC;IAC5B,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAElC,KAAK,MAAM,KAAK,IAAI,KAAK,EAAE,CAAC;QAC1B,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC9B,oBAAoB;YACpB,MAAM,GAAG,GAAG,KAAK,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC;YAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAEnC,gCAAgC;YAChC,MAAM,KAAK,GAAa,EAAE,CAAC;YAC3B,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBACtB,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;oBACxC,IAAI,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;wBACtD,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBACpB,CAAC;gBACH,CAAC;gBACD,KAAK,CAAC,IAAI,EAAE,CAAC;YACf,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACtB,CAAC;YAED,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE,CAAC;gBAC7B,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OA
AO,CAAC,CAAC;gBAC9C,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;gBAC9B,MAAM,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,MAAM,CAAW,CAAC;gBACzD,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;oBAC7B,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;wBAC1B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;wBACrB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACtB,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;aAAM,CAAC;YACN,wCAAwC;YACxC,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,CAAS,CAAC;YAC7C,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;gBAC1B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACrB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,UAAkB;IAClD,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,UAAU,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,MAAM,KAAK,GAAG,MAAM,eAAe,CAAC,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAC3D,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC;AAC3B,CAAC"}
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Builds prompts from help text + task definition.
3
+ *
4
+ * Three help modes:
5
+ * - injected: help text included in prompt (current behavior)
6
+ * - discoverable: agent must run --help to discover commands
7
+ * - none: agent relies on training knowledge only
8
+ */
9
+ import type { HelpMode, Task } from './models.js';
10
+ export declare function buildSystemMessage(helpMode: HelpMode): string;
11
+ export declare function buildUserMessage(cliName: string, helpTexts: Record<string, string> | null, task: Task, helpMode: HelpMode): string;
12
+ //# sourceMappingURL=prompt.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompt.d.ts","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,aAAa,CAAC;AA6BlD,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,CAS7D;AAED,wBAAgB,gBAAgB,CAC9B,OAAO,EAAE,MAAM,EACf,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,IAAI,EACxC,IAAI,EAAE,IAAI,EACV,QAAQ,EAAE,QAAQ,GACjB,MAAM,CAaR"}
package/dist/prompt.js ADDED
@@ -0,0 +1,88 @@
1
+ /**
2
+ * Builds prompts from help text + task definition.
3
+ *
4
+ * Three help modes:
5
+ * - injected: help text included in prompt (current behavior)
6
+ * - discoverable: agent must run --help to discover commands
7
+ * - none: agent relies on training knowledge only
8
+ */
9
/** Opening line shared by every system message variant. */
const SYSTEM_MESSAGE_INTRO =
  "You are a CLI expert. You have a run_command tool to execute shell commands. Use it to accomplish the user's task.";

/**
 * Assemble a system message. The variants differ only in their first rule (how the
 * agent learns about commands) and their fourth rule (what it must not do); the
 * remaining lines are common to all of them.
 */
function composeSystemMessage(discoveryRule, restrictionRule) {
  return [
    SYSTEM_MESSAGE_INTRO,
    '',
    'Rules:',
    `- ${discoveryRule}`,
    '- Execute commands using the run_command tool',
    '- If a command fails, read the error and retry with corrected flags',
    `- ${restrictionRule}`,
    '- When the task is complete, stop calling tools',
  ].join('\n');
}

/** System message for the "injected" help mode: help text is part of the prompt. */
const AGENT_SYSTEM_MESSAGE = composeSystemMessage(
  'Read the help text carefully to understand available subcommands and flags',
  "Do NOT invent flags that don't exist in the help text",
);

/** System message for the "discoverable" help mode: the agent runs --help itself. */
const AGENT_SYSTEM_MESSAGE_DISCOVERABLE = composeSystemMessage(
  'Use <cli> --help and <cli> <subcommand> --help to discover available commands and flags',
  'Do NOT invent flags — always check --help first',
);

/** System message for the "none" help mode: the agent relies on prior knowledge. */
const AGENT_SYSTEM_MESSAGE_NONE = composeSystemMessage(
  'Use your training knowledge of CLI tools to construct commands',
  'Do NOT run --help commands — rely on your knowledge',
);
33
/**
 * Return the system message that matches the given help mode.
 *
 * @param helpMode One of 'injected', 'discoverable' or 'none'.
 * @returns The corresponding system message; undefined for any other value.
 */
export function buildSystemMessage(helpMode) {
  if (helpMode === 'injected') {
    return AGENT_SYSTEM_MESSAGE;
  }
  if (helpMode === 'discoverable') {
    return AGENT_SYSTEM_MESSAGE_DISCOVERABLE;
  }
  if (helpMode === 'none') {
    return AGENT_SYSTEM_MESSAGE_NONE;
  }
  return undefined;
}
43
/**
 * Build the user message for one task.
 *
 * The message always names the CLI and states the task intent. A "Help text:" section
 * is inserted between the two only when `helpMode` is 'injected' and `helpTexts` is
 * provided; its content comes from `selectRelevantHelp`.
 *
 * @param cliName   Name of the CLI under test.
 * @param helpTexts Help output keyed by subcommand path, or null when unavailable.
 * @param task      Task whose `intent` is presented to the agent.
 * @param helpMode  Help mode for this run.
 * @returns The assembled prompt text.
 */
export function buildUserMessage(cliName, helpTexts, task, helpMode) {
  const lines = [`CLI: ${cliName}`, ''];

  if (helpMode === 'injected' && helpTexts) {
    lines.push('Help text:', selectRelevantHelp(cliName, helpTexts, task), '');
  }

  lines.push(`Task: ${task.intent}`);
  return lines.join('\n');
}
57
/**
 * Select the help sections most relevant to a task.
 *
 * The root help (key `''`) is always included in full, even if it alone exceeds the
 * budget. Subcommand sections are considered only while the root help uses less than
 * 70% of the ~4K-character budget. A section is added when one of the words in its
 * key appears in the task intent and its help text still fits. Only help-text lengths
 * count towards the budget; the `$ <cli> ... --help` header lines do not.
 *
 * @param cliName   CLI name used in the section headers.
 * @param helpTexts Help output keyed by subcommand path (`''` for the root help).
 * @param task      Task whose `intent` supplies the keywords.
 * @returns The selected sections separated by blank lines ('' if none were selected).
 */
function selectRelevantHelp(cliName, helpTexts, task) {
  const maxChars = 4000;
  const sections = [];
  let totalChars = 0;

  // Always include root help
  const rootHelp = helpTexts[''];
  if (rootHelp) {
    sections.push(`$ ${cliName} --help\n${rootHelp}`);
    totalChars += rootHelp.length;
  }

  // Add help sections that match task keywords
  if (totalChars < maxChars * 0.7) {
    // Keep every raw whitespace-separated token and also add its form with
    // surrounding punctuation removed, so "pr." or "(pods)," still match their key.
    const taskWords = new Set();
    for (const rawWord of task.intent.toLowerCase().split(/\s+/)) {
      taskWords.add(rawWord);
      const bareWord = rawWord.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, '');
      if (bareWord) {
        taskWords.add(bareWord);
      }
    }

    // Object.entries yields each key once, so no "already included" check is needed.
    // Searching earlier sections for "<cli> <key> --help" would wrongly skip a
    // subcommand whenever another help text merely mentions that command line.
    for (const [key, help] of Object.entries(helpTexts)) {
      if (key === '') {
        continue;
      }
      const matches = key.split(' ').some((kw) => taskWords.has(kw));
      if (matches && totalChars + help.length < maxChars) {
        sections.push(`$ ${cliName} ${key} --help\n${help}`);
        totalChars += help.length;
      }
    }
  }

  return sections.join('\n\n');
}
88
+ //# sourceMappingURL=prompt.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,MAAM,oBAAoB,GAAG;;;;;;;gDAOmB,CAAC;AAEjD,MAAM,iCAAiC,GAAG;;;;;;;gDAOM,CAAC;AAEjD,MAAM,yBAAyB,GAAG;;;;;;;gDAOc,CAAC;AAEjD,MAAM,UAAU,kBAAkB,CAAC,QAAkB;IACnD,QAAQ,QAAQ,EAAE,CAAC;QACjB,KAAK,UAAU;YACb,OAAO,oBAAoB,CAAC;QAC9B,KAAK,cAAc;YACjB,OAAO,iCAAiC,CAAC;QAC3C,KAAK,MAAM;YACT,OAAO,yBAAyB,CAAC;IACrC,CAAC;AACH,CAAC;AAED,MAAM,UAAU,gBAAgB,CAC9B,OAAe,EACf,SAAwC,EACxC,IAAU,EACV,QAAkB;IAElB,IAAI,QAAQ,KAAK,UAAU,IAAI,SAAS,EAAE,CAAC;QACzC,MAAM,YAAY,GAAG,kBAAkB,CAAC,OAAO,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;QAClE,OAAO,QAAQ,OAAO;;;EAGxB,YAAY;;QAEN,IAAI,CAAC,MAAM,EAAE,CAAC;IACpB,CAAC;IACD,OAAO,QAAQ,OAAO;;QAEhB,IAAI,CAAC,MAAM,EAAE,CAAC;AACtB,CAAC;AAED;;;GAGG;AACH,SAAS,kBAAkB,CACzB,OAAe,EACf,SAAiC,EACjC,IAAU;IAEV,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,QAAQ,GAAG,IAAI,CAAC;IACtB,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,2BAA2B;IAC3B,MAAM,QAAQ,GAAG,SAAS,CAAC,EAAE,CAAC,CAAC;IAC/B,IAAI,QAAQ,EAAE,CAAC;QACb,QAAQ,CAAC,IAAI,CAAC,KAAK,OAAO,YAAY,QAAQ,EAAE,CAAC,CAAC;QAClD,UAAU,IAAI,QAAQ,CAAC,MAAM,CAAC;IAChC,CAAC;IAED,6CAA6C;IAC7C,IAAI,UAAU,GAAG,QAAQ,GAAG,GAAG,EAAE,CAAC;QAChC,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACzD,KAAK,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;YACpD,IAAI,GAAG,KAAK,EAAE,IAAI,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,GAAG,OAAO,IAAI,GAAG,SAAS,CAAC,CAAC,EAAE,CAAC;gBAC/E,SAAS;YACX,CAAC;YACD,MAAM,QAAQ,GAAG,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAChC,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,SAAS,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC;YAC9D,IAAI,OAAO,IAAI,UAAU,GAAG,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;gBACnD,QAAQ,CAAC,IAAI,CAAC,KAAK,OAAO,IAAI,GAAG,YAAY,IAAI,EAAE,CAAC,CAAC;gBACrD,UAAU,IAAI,IAAI,CAAC,MAAM,CAAC;YAC5B,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC/B,CAAC"}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Model registry + AI Gateway configuration.
3
+ *
4
+ * Uses Vercel AI SDK v6 gateway() which is re-exported from 'ai'.
5
+ * Model IDs use the gateway format: "provider/model-id".
6
+ */
7
+ export interface ModelEntry {
8
+ id: string;
9
+ displayName: string;
10
+ provider: string;
11
+ }
12
+ export declare const MODELS: readonly ModelEntry[];
13
+ export declare function getModel(modelId: string): import("@ai-sdk/provider").LanguageModelV3;
14
+ export declare function getModelEntry(modelId: string): ModelEntry | undefined;
15
+ export declare function filterModels(modelIds: string[]): ModelEntry[];
16
+ /**
17
+ * Build ModelEntry objects from provider strings (e.g. "anthropic/claude-sonnet-4-20250514").
18
+ * For known models, returns the registry entry. For unknown models, creates a synthetic entry.
19
+ */
20
+ export declare function resolveProviders(providerIds: string[]): ModelEntry[];
21
+ /**
22
+ * Validate that AI_GATEWAY_API_KEY is set.
23
+ * All model calls go through the AI Gateway — no per-provider keys are needed.
24
+ */
25
+ export declare function validateGatewayKey(): void;
26
+ //# sourceMappingURL=providers.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"providers.d.ts","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,MAAM,WAAW,UAAU;IACzB,EAAE,EAAE,MAAM,CAAC;IACX,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,eAAO,MAAM,MAAM,EAAE,SAAS,UAAU,EAS9B,CAAC;AAEX,wBAAgB,QAAQ,CAAC,OAAO,EAAE,MAAM,8CAEvC;AAED,wBAAgB,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,UAAU,GAAG,SAAS,CAErE;AAED,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,UAAU,EAAE,CAG7D;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,WAAW,EAAE,MAAM,EAAE,GAAG,UAAU,EAAE,CAWpE;AAED;;;GAGG;AACH,wBAAgB,kBAAkB,IAAI,IAAI,CAMzC"}
@@ -0,0 +1,55 @@
1
+ /**
2
+ * Model registry + AI Gateway configuration.
3
+ *
4
+ * Uses Vercel AI SDK v6 gateway() which is re-exported from 'ai'.
5
+ * Model IDs use the gateway format: "provider/model-id".
6
+ */
7
+ import { gateway } from 'ai';
8
/**
 * Create a registry entry. The provider is the part of the gateway id that precedes
 * the first "/" (ids use the "provider/model-id" format).
 */
const defineModel = (id, displayName) => ({
  id,
  displayName,
  provider: id.slice(0, id.indexOf('/')),
});

/** Built-in model registry. */
export const MODELS = [
  defineModel('anthropic/claude-sonnet-4-20250514', 'Claude Sonnet 4'),
  defineModel('anthropic/claude-haiku-4-5-20251001', 'Claude Haiku 4.5'),
  defineModel('openai/gpt-4o', 'GPT-4o'),
  defineModel('openai/gpt-4o-mini', 'GPT-4o Mini'),
  defineModel('google/gemini-2.5-pro', 'Gemini 2.5 Pro'),
  defineModel('google/gemini-2.5-flash', 'Gemini 2.5 Flash'),
  defineModel('meta/llama-3.1-8b', 'Llama 3.1 8B'),
  defineModel('mistral/ministral-3b', 'Ministral 3B'),
];
18
/**
 * Obtain a language model handle for a gateway model id.
 *
 * @param modelId Model id in gateway format ("provider/model-id").
 * @returns Whatever `gateway()` returns for that id.
 */
export function getModel(modelId) {
  const languageModel = gateway(modelId);
  return languageModel;
}
21
/**
 * Look up a registry entry by its exact model id.
 *
 * @param modelId Model id to search for.
 * @returns The matching entry from MODELS, or undefined when the id is not registered.
 */
export function getModelEntry(modelId) {
  for (const entry of MODELS) {
    if (entry.id === modelId) {
      return entry;
    }
  }
  return undefined;
}
24
/**
 * Narrow the registry to the requested model ids.
 *
 * @param modelIds Ids to keep; an empty list selects every registered model.
 * @returns A new array of matching registry entries, in registry order. Ids that are
 * not in the registry are ignored.
 */
export function filterModels(modelIds) {
  if (modelIds.length === 0) {
    return MODELS.slice();
  }

  const selected = [];
  for (const entry of MODELS) {
    if (modelIds.includes(entry.id)) {
      selected.push(entry);
    }
  }
  return selected;
}
29
/**
 * Build model entries from gateway model ids (e.g. "anthropic/claude-sonnet-4-20250514").
 *
 * A known id yields a copy of its registry entry. An unknown id yields a synthetic
 * entry: the text before the first "/" becomes the provider and the remainder becomes
 * the display name. When the id has no model part (no "/", or nothing after it), the
 * whole id is used as the display name so the entry is never labelled with an empty
 * string.
 *
 * @param providerIds Model ids, in the order the entries should be returned.
 * @returns One entry per id, in input order.
 */
export function resolveProviders(providerIds) {
  return providerIds.map((id) => {
    const existing = MODELS.find((m) => m.id === id);
    if (existing) {
      // Return a copy so callers cannot mutate the shared registry entry.
      return { ...existing };
    }

    const [provider, ...modelParts] = id.split('/');
    return {
      id,
      displayName: modelParts.join('/') || id,
      provider,
    };
  });
}
46
/**
 * Ensure the AI Gateway key is configured.
 *
 * Every model call is routed through the AI Gateway, so the single
 * AI_GATEWAY_API_KEY environment variable replaces per-provider keys.
 *
 * @throws Error when AI_GATEWAY_API_KEY is unset or empty.
 */
export function validateGatewayKey() {
  const gatewayKey = process.env['AI_GATEWAY_API_KEY'];
  if (gatewayKey) {
    return;
  }
  throw new Error('Missing AI_GATEWAY_API_KEY. All model calls require the AI Gateway.\nSet AI_GATEWAY_API_KEY in your environment before running cli-bench.');
}
55
+ //# sourceMappingURL=providers.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"providers.js","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAQ7B,MAAM,CAAC,MAAM,MAAM,GAA0B;IAC3C,EAAE,EAAE,EAAE,oCAAoC,EAAE,WAAW,EAAE,iBAAiB,EAAE,QAAQ,EAAE,WAAW,EAAE;IACnG,EAAE,EAAE,EAAE,qCAAqC,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,WAAW,EAAE;IACrG,EAAE,EAAE,EAAE,eAAe,EAAE,WAAW,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClE,EAAE,EAAE,EAAE,oBAAoB,EAAE,WAAW,EAAE,aAAa,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAC5E,EAAE,EAAE,EAAE,uBAAuB,EAAE,WAAW,EAAE,gBAAgB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClF,EAAE,EAAE,EAAE,yBAAyB,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IACtF,EAAE,EAAE,EAAE,mBAAmB,EAAE,WAAW,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,EAAE;IAC1E,EAAE,EAAE,EAAE,sBAAsB,EAAE,WAAW,EAAE,cAAc,EAAE,QAAQ,EAAE,SAAS,EAAE;CACxE,CAAC;AAEX,MAAM,UAAU,QAAQ,CAAC,OAAe;IACtC,OAAO,OAAO,CAAC,OAAO,CAAC,CAAC;AAC1B,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,OAAe;IAC3C,OAAO,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,CAAC;AAC9C,CAAC;AAED,MAAM,UAAU,YAAY,CAAC,QAAkB;IAC7C,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,GAAG,MAAM,CAAC,CAAC;IAC9C,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AACvD,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,WAAqB;IACpD,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE;QAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QACjD,IAAI,QAAQ;YAAE,OAAO,EAAE,GAAG,QAAQ,EAAE,CAAC;QACrC,MAAM,CAAC,QAAQ,EAAE,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC1C,OAAO;YACL,EAAE;YACF,WAAW,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC;YAC3B,QAAQ,EAAE,QAAkC;SAC7C,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,kBAAkB;IAChC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC,EAAE,CAAC;QACvC,MAAM,IAAI,KAAK,CACb,2IAA2I,CAC5I,CAAC;IACJ,CAAC;AACH,CAAC"}
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Orchestrates the task x model x helpMode matrix with tool-calling agents.
3
+ *
4
+ * For each task:
5
+ * - Create temp workdir (or use configured one)
6
+ * - Run setup commands directly on the host
7
+ * - Give LLM the run_command tool
8
+ * - LLM calls tool -> execute on host -> return stdout/stderr/exit
9
+ * - After maxSteps or completion: run assertions
10
+ * - Aggregate into GridReport
11
+ */
12
+ import type { TaskSuite, Task, GridReport } from './models.js';
13
+ import type { Config } from './config.js';
14
+ import type { ModelEntry } from './providers.js';
15
+ export declare function loadTaskSuite(filePath: string): Promise<TaskSuite>;
16
+ export declare function discoverTaskSuites(suiteDir: string): Promise<Map<string, string>>;
17
+ export interface RunGridOptions {
18
+ config: Config;
19
+ /** Override tasks (from config file mode). */
20
+ tasks?: Task[];
21
+ /** Override CLI name (from config file mode). */
22
+ cliName?: string;
23
+ /** Override models (from config file mode). */
24
+ models?: ModelEntry[];
25
+ /** Override version command (from config file mode). */
26
+ versionCommand?: string;
27
+ /** Override working directory (from config file mode). */
28
+ workdir?: string;
29
+ /** Global repeat count — overrides per-task repeat. */
30
+ globalRepeat?: number;
31
+ }
32
+ export declare function runGrid(opts: RunGridOptions): Promise<GridReport[]>;
33
+ export declare function uploadReport(report: GridReport, backendUrl: string, apiKey: string): Promise<void>;
34
+ //# sourceMappingURL=runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AASH,OAAO,KAAK,EACV,SAAS,EACT,IAAI,EACJ,UAAU,EAKX,MAAM,aAAa,CAAC;AACrB,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAC1C,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AASjD,wBAAsB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,CAKxE;AAED,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAc9B;AAkND,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,8CAA8C;IAC9C,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC;IACf,iDAAiD;IACjD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,+CAA+C;IAC/C,MAAM,CAAC,EAAE,UAAU,EAAE,CAAC;IACtB,wDAAwD;IACxD,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,0DAA0D;IAC1D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uDAAuD;IACvD,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED,wBAAsB,OAAO,CAAC,IAAI,EAAE,cAAc,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC,CAyDzE;AAmND,wBAAsB,YAAY,CAChC,MAAM,EAAE,UAAU,EAClB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,GACb,OAAO,CAAC,IAAI,CAAC,CAsBf"}