@cliwatch/cli-bench 0.5.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +22 -0
- package/dist/client/types.gen.d.ts +28 -0
- package/dist/client/types.gen.d.ts.map +1 -1
- package/dist/client/zod.gen.d.ts +16 -0
- package/dist/client/zod.gen.d.ts.map +1 -1
- package/dist/client/zod.gen.js +25 -0
- package/dist/client/zod.gen.js.map +1 -1
- package/dist/config.d.ts +2 -2
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +8 -7
- package/dist/config.js.map +1 -1
- package/dist/help-loader.d.ts +6 -2
- package/dist/help-loader.d.ts.map +1 -1
- package/dist/help-loader.js +7 -77
- package/dist/help-loader.js.map +1 -1
- package/dist/index.js +16 -8
- package/dist/index.js.map +1 -1
- package/dist/init.js +1 -1
- package/dist/models.d.ts +17 -3
- package/dist/models.d.ts.map +1 -1
- package/dist/models.js +1 -1
- package/dist/models.js.map +1 -1
- package/dist/project.d.ts +1 -0
- package/dist/project.d.ts.map +1 -1
- package/dist/project.js +3 -2
- package/dist/project.js.map +1 -1
- package/dist/prompt.d.ts +9 -8
- package/dist/prompt.d.ts.map +1 -1
- package/dist/prompt.js +30 -74
- package/dist/prompt.js.map +1 -1
- package/dist/runner.d.ts +10 -1
- package/dist/runner.d.ts.map +1 -1
- package/dist/runner.js +65 -37
- package/dist/runner.js.map +1 -1
- package/dist/schemas.d.ts +9 -1
- package/dist/schemas.d.ts.map +1 -1
- package/dist/schemas.js +9 -1
- package/dist/schemas.js.map +1 -1
- package/dist/suite-generator.d.ts.map +1 -1
- package/dist/suite-generator.js +5 -4
- package/dist/suite-generator.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -11,11 +11,13 @@
|
|
|
11
11
|
* 3. Init mode: scaffold cli-bench.yaml
|
|
12
12
|
*/
|
|
13
13
|
import { writeFile } from 'node:fs/promises';
|
|
14
|
+
import { dirname } from 'node:path';
|
|
14
15
|
import { parseArgs } from './config.js';
|
|
15
16
|
import { runGrid, uploadReport } from './runner.js';
|
|
16
17
|
import { resolveConfigFile, loadProject } from './project.js';
|
|
17
18
|
import { scaffoldProject } from './init.js';
|
|
18
19
|
import { validateGatewayKey, resolveProviders } from './providers.js';
|
|
20
|
+
import { CONTEXT_MODES } from './models.js';
|
|
19
21
|
import { checkThresholds, printThresholdResults } from './thresholds.js';
|
|
20
22
|
async function main() {
|
|
21
23
|
const config = parseArgs(process.argv);
|
|
@@ -41,15 +43,15 @@ async function main() {
|
|
|
41
43
|
if (configPath) {
|
|
42
44
|
// Config file mode
|
|
43
45
|
console.log(`Config: ${configPath}`);
|
|
44
|
-
const { config: fileConfig, tasks } = await loadProject(configPath);
|
|
46
|
+
const { config: fileConfig, tasks, taskSuiteContent } = await loadProject(configPath);
|
|
45
47
|
thresholdsConfig = fileConfig.thresholds;
|
|
46
48
|
// Merge CLI args with file config
|
|
47
49
|
const providers = config.models.length > 0
|
|
48
50
|
? config.models
|
|
49
51
|
: fileConfig.providers ?? ['anthropic/claude-sonnet-4-20250514'];
|
|
50
|
-
const
|
|
51
|
-
? fileConfig.
|
|
52
|
-
: config.
|
|
52
|
+
const contextModes = fileConfig.context
|
|
53
|
+
? fileConfig.context.filter((s) => CONTEXT_MODES.includes(s))
|
|
54
|
+
: config.contextModes;
|
|
53
55
|
const concurrency = fileConfig.concurrency ?? config.concurrency;
|
|
54
56
|
// Determine upload behavior
|
|
55
57
|
const uploadMode = fileConfig.upload ?? 'auto';
|
|
@@ -59,7 +61,7 @@ async function main() {
|
|
|
59
61
|
console.log(`CLI: ${fileConfig.cli}`);
|
|
60
62
|
console.log(`Providers: ${providers.join(', ')}`);
|
|
61
63
|
console.log(`Tasks: ${tasks.length}`);
|
|
62
|
-
console.log(`
|
|
64
|
+
console.log(`Context: ${contextModes.join(', ')}`);
|
|
63
65
|
console.log(`Dry run: ${config.dryRun}`);
|
|
64
66
|
// Validate gateway key before running
|
|
65
67
|
if (!config.dryRun) {
|
|
@@ -68,7 +70,7 @@ async function main() {
|
|
|
68
70
|
const models = resolveProviders(providers);
|
|
69
71
|
const globalRepeat = config.repeat ?? fileConfig.repeat;
|
|
70
72
|
reports = await runGrid({
|
|
71
|
-
config: { ...config, concurrency,
|
|
73
|
+
config: { ...config, concurrency, contextModes },
|
|
72
74
|
tasks,
|
|
73
75
|
cliName: fileConfig.cli,
|
|
74
76
|
models,
|
|
@@ -76,6 +78,12 @@ async function main() {
|
|
|
76
78
|
workdir: fileConfig.workdir ?? config.workdir,
|
|
77
79
|
globalRepeat,
|
|
78
80
|
systemPrompt: fileConfig.system_prompt,
|
|
81
|
+
displayName: fileConfig.display_name,
|
|
82
|
+
category: fileConfig.category,
|
|
83
|
+
websiteUrl: fileConfig.website_url,
|
|
84
|
+
githubUrl: fileConfig.github_url,
|
|
85
|
+
taskSuiteContent,
|
|
86
|
+
configDir: dirname(configPath),
|
|
79
87
|
});
|
|
80
88
|
// Check thresholds before upload so results are included in the payload
|
|
81
89
|
if (thresholdsConfig && reports.length > 0 && !config.dryRun) {
|
|
@@ -106,7 +114,7 @@ async function main() {
|
|
|
106
114
|
// Legacy task_suites/ discovery mode
|
|
107
115
|
console.log(`Filter: ${config.filter.length > 0 ? config.filter.join(', ') : 'all'}`);
|
|
108
116
|
console.log(`Models: ${config.models.length > 0 ? config.models.join(', ') : 'all'}`);
|
|
109
|
-
console.log(`
|
|
117
|
+
console.log(`Context: ${config.contextModes.join(', ')}`);
|
|
110
118
|
console.log(`Dry run: ${config.dryRun}`);
|
|
111
119
|
if (!config.dryRun && config.models.length > 0) {
|
|
112
120
|
validateGatewayKey();
|
|
@@ -133,7 +141,7 @@ async function main() {
|
|
|
133
141
|
console.log('\n=== Final Summary ===');
|
|
134
142
|
for (const report of reports) {
|
|
135
143
|
for (const mr of report.modelResults) {
|
|
136
|
-
console.log(`${report.cli} x ${mr.displayName} [${mr.
|
|
144
|
+
console.log(`${report.cli} x ${mr.displayName} [${mr.contextMode}]: ${(mr.passRate * 100).toFixed(0)}% pass, avgTurns=${mr.avgTurnsToSuccess.toFixed(1)}`);
|
|
137
145
|
}
|
|
138
146
|
}
|
|
139
147
|
}
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC7C,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AACpD,OAAO,EAAE,iBAAiB,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAC9D,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,kBAAkB,EAAE,gBAAgB,EAAgB,MAAM,gBAAgB,CAAC;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC7C,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AACpD,OAAO,EAAE,iBAAiB,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAC9D,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,kBAAkB,EAAE,gBAAgB,EAAgB,MAAM,gBAAgB,CAAC;AACpF,OAAO,EAAoB,aAAa,EAAE,MAAM,aAAa,CAAC;AAC9D,OAAO,EAAE,eAAe,EAAE,qBAAqB,EAAE,MAAM,iBAAiB,CAAC;AAEzE,KAAK,UAAU,IAAI;IACjB,MAAM,MAAM,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAEvC,gCAAgC;IAChC,IAAI,MAAM,CAAC,QAAQ,EAAE,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,eAAe,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC;YAClD,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC;YAC/B,OAAO,CAAC,GAAG,CAAC,oFAAoF,CAAC,CAAC;QACpG,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAClD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QACD,OAAO;IACT,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,4BAA4B,CAAC,CAAC;IAE1C,4BAA4B;IAC5B,MAAM,UAAU,GAAG,MAAM,iBAAiB,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;IAE9D,IAAI,OAAO,CAAC;IACZ,IAAI,gBAAoE,CAAC;IACzE,IAAI,eAAe,GAAG,KAAK,CAAC;IAE5B,IAAI,UAAU,EAAE,CAAC;QACf,mBAAmB;QACnB,OAAO,CAAC,GAAG,CAAC,WAAW,UAAU,EAAE,CAAC,CAAC;QACrC,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,KAAK,EAAE,gBAAgB,EAAE,GAAG,MAAM,WAAW,CAAC,UAAU,CAAC,CAAC;QACtF,gBAAgB,GAAG,UAAU,CAAC,UAAU,CAAC;QAEzC,kCAAkC;QAClC,MAAM,SAAS,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC;YACxC,CAAC,CAAC,MAAM,CAAC,MAAM;YACf,CAAC,CAAC,UAAU,CAAC,SAAS,IAAI,CAAC,oCAAoC,CAAC,CAAC;QACnE,MAAM,YAAY,GAAG,UAAU,CAAC,OAAO;YACrC,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAoB,EAAE,CAAE,aAA0B,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;YAC7F,CAAC,CAAC,MAAM,CAAC,YAAY,CAAC;QACxB,MAAM,WAAW,GAAG,UAAU,CAAC,WAAW,IAAI,MAAM,CAAC,WAAW,CAAC;QAEjE,4BAA4B;QAC5B,MAAM,UAAU,GAAG,UAAU,CAAC,MAAM,IAAI,MAAM,CAAC;QAC/C,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM;eAC7B,UAAU,KAAK,QAAQ;eACvB,CAAC,UAAU,
KAAK,MAAM,IAAI,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QAEhD,OAAO,CAAC,GAAG,CAAC,QAAQ,UAAU,CAAC,GAAG,EAAE,CAAC,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,cAAc,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAClD,OAAO,CAAC,GAAG,CAAC,UAAU,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,YAAY,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACnD,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;QAEzC,sCAAsC;QACtC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YACnB,kBAAkB,EAAE,CAAC;QACvB,CAAC;QAED,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;QAE3C,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM,IAAI,UAAU,CAAC,MAAM,CAAC;QAExD,OAAO,GAAG,MAAM,OAAO,CAAC;YACtB,MAAM,EAAE,EAAE,GAAG,MAAM,EAAE,WAAW,EAAE,YAAY,EAAE;YAChD,KAAK;YACL,OAAO,EAAE,UAAU,CAAC,GAAG;YACvB,MAAM;YACN,cAAc,EAAE,UAAU,CAAC,eAAe;YAC1C,OAAO,EAAE,UAAU,CAAC,OAAO,IAAI,MAAM,CAAC,OAAO;YAC7C,YAAY;YACZ,YAAY,EAAE,UAAU,CAAC,aAAa;YACtC,WAAW,EAAE,UAAU,CAAC,YAAY;YACpC,QAAQ,EAAE,UAAU,CAAC,QAAQ;YAC7B,UAAU,EAAE,UAAU,CAAC,WAAW;YAClC,SAAS,EAAE,UAAU,CAAC,UAAU;YAChC,gBAAgB;YAChB,SAAS,EAAE,OAAO,CAAC,UAAU,CAAC;SAC/B,CAAC,CAAC;QAEH,wEAAwE;QACxE,IAAI,gBAAgB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YAC7D,MAAM,KAAK,GAAG,eAAe,CAC3B,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,CAAC,EACtC,gBAAgB,CACjB,CAAC;YACF,qBAAqB,CAAC,KAAK,CAAC,CAAC;YAC7B,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;gBAC7B,MAAM,CAAC,gBAAgB,GAAG,KAAK,CAAC;YAClC,CAAC;YACD,+DAA+D;YAC/D,IAAI,CAAC,KAAK,CAAC,SAAS,IAAI,KAAK,CAAC,QAAQ,KAAK,OAAO,EAAE,CAAC;gBACnD,eAAe,GAAG,IAAI,CAAC;YACzB,CAAC;QACH,CAAC;QAED,uBAAuB;QACvB,IAAI,YAAY,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YACnC,MAAM,UAAU,GAAG,UAAU,CAAC,WAAW,IAAI,MAAM,CAAC,UAAU,CAAC;YAC/D,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;gBAC7B,IAAI,CAAC;oBACH,MAAM,YAAY,CAAC,MAAM,EAAE,UAAU,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;gBACxD,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACX,OAAO,CAAC,KAAK,CACX,+BAA+B,MAAM,CAAC,GAAG,KAAK,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CACnF,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;SAAM,CAAC;QACN,qCAAqC;QACrC,OAAO,CAAC,
GAAG,CAAC,WAAW,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC;QACtF,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC;QACtF,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;QAEzC,IAAI,CAAC,MAAM,CAAC,MAAM,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC/C,kBAAkB,EAAE,CAAC;QACvB,CAAC;QAED,OAAO,GAAG,MAAM,OAAO,CAAC,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;QAEjE,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;YAClB,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;gBAC7B,IAAI,CAAC;oBACH,MAAM,YAAY,CAAC,MAAM,EAAE,MAAM,CAAC,UAAU,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;gBAC/D,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACX,OAAO,CAAC,KAAK,CACX,+BAA+B,MAAM,CAAC,GAAG,KAAK,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CACnF,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,MAAM,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,IAAI,CAAC,SAAS,CAC3B,OAAO,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,EAC3C,IAAI,EACJ,CAAC,CACF,CAAC;QACF,MAAM,SAAS,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;QAChD,OAAO,CAAC,GAAG,CAAC,wBAAwB,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;IACvD,CAAC;IAED,eAAe;IACf,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;QACzC,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,CAAC;QACvC,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;YAC7B,KAAK,MAAM,EAAE,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;gBACrC,OAAO,CAAC,GAAG,CACT,GAAG,MAAM,CAAC,GAAG,MAAM,EAAE,CAAC,WAAW,KAAK,EAAE,CAAC,WAAW,MAAM,CAAC,EAAE,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,oBAAoB,EAAE,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAC9I,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC;IAED,uEAAuE;IACvE,IAAI,eAAe,EAAE,CAAC;QACpB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC;AAED,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE;IACjB
,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACjB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC"}
|
package/dist/init.js
CHANGED
|
@@ -17,7 +17,7 @@ providers:
|
|
|
17
17
|
# - google/gemini-2.5-pro
|
|
18
18
|
|
|
19
19
|
# Optional settings
|
|
20
|
-
#
|
|
20
|
+
# context: [zero-shot] # zero-shot | help | docs
|
|
21
21
|
# concurrency: 3 # max concurrent API calls
|
|
22
22
|
# workdir: ./workspace # working directory for commands (default: temp dir)
|
|
23
23
|
# upload: auto # auto | always | never (auto uploads if CLIWATCH_API_KEY is set)
|
package/dist/models.d.ts
CHANGED
|
@@ -4,7 +4,8 @@
|
|
|
4
4
|
* Assertion-based evaluation: tasks define assertions that are checked
|
|
5
5
|
* against the agent's execution trace.
|
|
6
6
|
*/
|
|
7
|
-
export type
|
|
7
|
+
export type ContextMode = 'zero-shot' | 'help' | 'docs';
|
|
8
|
+
export declare const CONTEXT_MODES: ContextMode[];
|
|
8
9
|
export type Assertion = {
|
|
9
10
|
output_contains: string;
|
|
10
11
|
} | {
|
|
@@ -56,6 +57,10 @@ export interface Task {
|
|
|
56
57
|
export interface TaskSuite {
|
|
57
58
|
cli: string;
|
|
58
59
|
version_command?: string;
|
|
60
|
+
display_name?: string;
|
|
61
|
+
category?: string;
|
|
62
|
+
website_url?: string;
|
|
63
|
+
github_url?: string;
|
|
59
64
|
tasks: Task[];
|
|
60
65
|
}
|
|
61
66
|
export type UploadMode = 'auto' | 'always' | 'never';
|
|
@@ -68,8 +73,12 @@ export interface ThresholdsConfig {
|
|
|
68
73
|
export interface ConfigFile {
|
|
69
74
|
cli: string;
|
|
70
75
|
version_command?: string;
|
|
76
|
+
display_name?: string;
|
|
77
|
+
category?: string;
|
|
78
|
+
website_url?: string;
|
|
79
|
+
github_url?: string;
|
|
71
80
|
providers?: string[];
|
|
72
|
-
|
|
81
|
+
context?: string[];
|
|
73
82
|
system_prompt?: string;
|
|
74
83
|
concurrency?: number;
|
|
75
84
|
workdir?: string;
|
|
@@ -107,7 +116,7 @@ export interface ModelResult {
|
|
|
107
116
|
provider: Provider;
|
|
108
117
|
modelId: string;
|
|
109
118
|
displayName: string;
|
|
110
|
-
|
|
119
|
+
contextMode: ContextMode;
|
|
111
120
|
taskResults: TaskEval[];
|
|
112
121
|
passRate: number;
|
|
113
122
|
avgTurnsToSuccess: number;
|
|
@@ -137,6 +146,10 @@ export interface GridReport {
|
|
|
137
146
|
generatedAt: string;
|
|
138
147
|
modelResults: ModelResult[];
|
|
139
148
|
systemPrompt?: string;
|
|
149
|
+
displayName?: string;
|
|
150
|
+
category?: string;
|
|
151
|
+
websiteUrl?: string;
|
|
152
|
+
githubUrl?: string;
|
|
140
153
|
gitSha?: string;
|
|
141
154
|
gitRef?: string;
|
|
142
155
|
gitHeadRef?: string;
|
|
@@ -150,6 +163,7 @@ export interface GridReport {
|
|
|
150
163
|
repository?: string;
|
|
151
164
|
tags?: string[];
|
|
152
165
|
taskSuiteHash?: string;
|
|
166
|
+
taskSuiteContent?: string;
|
|
153
167
|
thresholdResults?: ThresholdCheckResult;
|
|
154
168
|
}
|
|
155
169
|
export interface HelpCache {
|
package/dist/models.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,MAAM,MAAM,
|
|
1
|
+
{"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,MAAM,MAAM,WAAW,GAAG,WAAW,GAAG,MAAM,GAAG,MAAM,CAAC;AACxD,eAAO,MAAM,aAAa,EAAE,WAAW,EAAkC,CAAC;AAM1E,MAAM,MAAM,SAAS,GACjB;IAAE,eAAe,EAAE,MAAM,CAAA;CAAE,GAC3B;IAAE,aAAa,EAAE,MAAM,CAAA;CAAE,GACzB;IAAE,cAAc,EAAE,MAAM,CAAA;CAAE,GAC1B;IAAE,SAAS,EAAE,MAAM,CAAA;CAAE,GACrB;IAAE,WAAW,EAAE,MAAM,CAAA;CAAE,GACvB;IAAE,aAAa,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,GACjD;IAAE,GAAG,EAAE,MAAM,CAAA;CAAE,GACf;IAAE,OAAO,EAAE,MAAM,CAAA;CAAE,GACnB;IAAE,SAAS,EAAE;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,GAAG,CAAC,EAAE,MAAM,CAAC;QAAC,GAAG,CAAC,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,GAC9D;IAAE,MAAM,EAAE;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,eAAe,CAAC,EAAE,MAAM,CAAC;QAAC,aAAa,CAAC,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,CAAC;AAElF,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,SAAS,CAAC;IACrB,MAAM,EAAE,OAAO,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,IAAI;IACnB,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,SAAS,EAAE,CAAC;IACpB,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,SAAS;IACxB,GAAG,EAAE,MAAM,CAAC;IACZ,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,IAAI,EAAE,CAAC;CACf;AAMD,MAAM,MAAM,UAAU,GAAG,MAAM,GAAG,QAAQ,GAAG,OAAO,CAAC;AAErD,MAAM,WAAW,gBAAgB;IAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,OAAO,GAAG,eAAe,CAAC;CACtC;AAED,MAAM,WAAW,UAAU;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;I
ACnB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,UAAU,CAAC;IACpB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,gBAAgB,CAAC;IAC9B,KAAK,EAAE,CAAC,IAAI,GAAG,MAAM,CAAC,EAAE,CAAC;CAC1B;AAMD,MAAM,MAAM,QAAQ,GAAG,MAAM,CAAC;AAE9B,MAAM,WAAW,WAAW;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;CACnB;AAMD,MAAM,WAAW,QAAQ;IACvB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,OAAO,CAAC;IAChB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,EAAE,eAAe,EAAE,CAAC;IACpC,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,UAAU,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,iBAAiB,CAAC,EAAE,OAAO,EAAE,CAAC;IAC9B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACzC,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAMD,MAAM,WAAW,WAAW;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,WAAW,CAAC;IACzB,WAAW,EAAE,QAAQ,EAAE,CAAC;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,MAAM,EAAE,OAAO,CAAC;CACjB;AAED,MAAM,WAAW,oBAAoB;IACnC,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,eAAe,EAAE,CAAC;IAC3B,QAAQ,EAAE,OAAO,GAAG,eAAe,CAAC;CACrC;AAED,MAAM,WAAW,UAAU;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gBAAgB,EAAE,MAAM,CAAC;IACzB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,WAAW,EAAE,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,
EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,gBAAgB,CAAC,EAAE,oBAAoB,CAAC;CACzC;AAMD,MAAM,WAAW,SAAS;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB"}
|
package/dist/models.js
CHANGED
package/dist/models.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"models.js","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG"}
|
|
1
|
+
{"version":3,"file":"models.js","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAOH,MAAM,CAAC,MAAM,aAAa,GAAkB,CAAC,WAAW,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC"}
|
package/dist/project.d.ts
CHANGED
|
@@ -22,5 +22,6 @@ export declare function resolveTaskRefs(tasks: (Task | string)[], baseDir: strin
|
|
|
22
22
|
export declare function loadProject(configPath: string): Promise<{
|
|
23
23
|
config: ConfigFile;
|
|
24
24
|
tasks: Task[];
|
|
25
|
+
taskSuiteContent: string;
|
|
25
26
|
}>;
|
|
26
27
|
//# sourceMappingURL=project.d.ts.map
|
package/dist/project.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"project.d.ts","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAOH,OAAO,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAIpD;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,YAAY,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAqBrF;AAED;;GAEG;AACH,wBAAsB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAItE;AAED;;;GAGG;AACH,wBAAsB,eAAe,CACnC,KAAK,EAAE,CAAC,IAAI,GAAG,MAAM,CAAC,EAAE,EACxB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,IAAI,EAAE,CAAC,CA6CjB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC;IAAE,MAAM,EAAE,UAAU,CAAC;IAAC,KAAK,EAAE,IAAI,EAAE,CAAA;CAAE,CAAC,
|
|
1
|
+
{"version":3,"file":"project.d.ts","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAOH,OAAO,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAIpD;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,YAAY,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAqBrF;AAED;;GAEG;AACH,wBAAsB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAItE;AAED;;;GAGG;AACH,wBAAsB,eAAe,CACnC,KAAK,EAAE,CAAC,IAAI,GAAG,MAAM,CAAC,EAAE,EACxB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,IAAI,EAAE,CAAC,CA6CjB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC;IAAE,MAAM,EAAE,UAAU,CAAC;IAAC,KAAK,EAAE,IAAI,EAAE,CAAC;IAAC,gBAAgB,EAAE,MAAM,CAAA;CAAE,CAAC,CAM9H"}
|
package/dist/project.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
import { readFile, access } from 'node:fs/promises';
|
|
6
6
|
import { join, dirname, resolve } from 'node:path';
|
|
7
7
|
import { glob } from 'node:fs/promises';
|
|
8
|
-
import { parse as parseYaml } from 'yaml';
|
|
8
|
+
import { parse as parseYaml, stringify as stringifyYaml } from 'yaml';
|
|
9
9
|
import { ConfigFileSchema, TaskFileSchema, TaskSchema } from './schemas.js';
|
|
10
10
|
const CONFIG_FILENAMES = ['cli-bench.yaml', 'cli-bench.yml'];
|
|
11
11
|
/**
|
|
@@ -96,6 +96,7 @@ export async function loadProject(configPath) {
|
|
|
96
96
|
const config = await loadConfigFile(configPath);
|
|
97
97
|
const baseDir = dirname(configPath);
|
|
98
98
|
const tasks = await resolveTaskRefs(config.tasks, baseDir);
|
|
99
|
-
|
|
99
|
+
const taskSuiteContent = stringifyYaml(tasks);
|
|
100
|
+
return { config, tasks, taskSuiteContent };
|
|
100
101
|
}
|
|
101
102
|
//# sourceMappingURL=project.js.map
|
package/dist/project.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"project.js","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAC;AACxC,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,MAAM,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"project.js","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAC;AACxC,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,SAAS,IAAI,aAAa,EAAE,MAAM,MAAM,CAAC;AACtE,OAAO,EAAE,gBAAgB,EAAE,cAAc,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAG5E,MAAM,gBAAgB,GAAG,CAAC,gBAAgB,EAAE,eAAe,CAAC,CAAC;AAE7D;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,YAAqB;IAC3D,IAAI,YAAY,EAAE,CAAC;QACjB,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC;YAC3B,OAAO,OAAO,CAAC,YAAY,CAAC,CAAC;QAC/B,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,gBAAgB,EAAE,CAAC;QACpC,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;QAChC,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;YACxB,OAAO,SAAS,CAAC;QACnB,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,IAAY;IAC/C,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IAC1C,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;IAC9B,OAAO,gBAAgB,CAAC,KAAK,CAAC,MAAM,CAAe,CAAC;AACtD,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,KAAwB,EACxB,OAAe;IAEf,MAAM,QAAQ,GAAW,EAAE,CAAC;IAC5B,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAElC,KAAK,MAAM,KAAK,IAAI,KAAK,EAAE,CAAC;QAC1B,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC9B,oBAAoB;YACpB,MAAM,GAAG,GAAG,KAAK,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC;YAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAEnC,gCAAgC;YAChC,MAAM,KAAK,GAAa,EAAE,CAAC;YAC3B,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBACtB,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;oBACxC,IAAI,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;wBACtD,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBACpB,CAAC;gBACH,CAAC;gBACD,KAAK,CAAC,IAAI,EAAE,CAAC;YACf,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACtB,CAAC;YAED,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE,CAAC;gBAC7B,MAAM,GAAG,GAAG,MAAM,QAAQ
,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;gBAC9C,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;gBAC9B,MAAM,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,MAAM,CAAW,CAAC;gBACzD,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;oBAC7B,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;wBAC1B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;wBACrB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACtB,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;aAAM,CAAC;YACN,wCAAwC;YACxC,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,CAAS,CAAC;YAC7C,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;gBAC1B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACrB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,UAAkB;IAClD,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,UAAU,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,MAAM,KAAK,GAAG,MAAM,eAAe,CAAC,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAC3D,MAAM,gBAAgB,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC;IAC9C,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,gBAAgB,EAAE,CAAC;AAC7C,CAAC"}
|
package/dist/prompt.d.ts
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Builds prompts
|
|
2
|
+
* Builds prompts for CLI benchmark agents.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
* -
|
|
7
|
-
* -
|
|
4
|
+
* One unified system message for all context modes.
|
|
5
|
+
* The user message varies by context mode:
|
|
6
|
+
* - zero-shot: CLI name + task intent only
|
|
7
|
+
* - help: CLI name + top-level --help output + task intent
|
|
8
|
+
* - docs: CLI name + documentation contents + task intent
|
|
8
9
|
*/
|
|
9
|
-
import type {
|
|
10
|
-
export declare function buildSystemMessage(
|
|
11
|
-
export declare function buildUserMessage(cliName: string,
|
|
10
|
+
import type { ContextMode, Task } from './models.js';
|
|
11
|
+
export declare function buildSystemMessage(customPrompt?: string): string;
|
|
12
|
+
export declare function buildUserMessage(cliName: string, task: Task, contextMode: ContextMode, contextPayload?: string | null): string;
|
|
12
13
|
//# sourceMappingURL=prompt.d.ts.map
|
package/dist/prompt.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"prompt.d.ts","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"prompt.d.ts","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,IAAI,EAAE,MAAM,aAAa,CAAC;AASrD,wBAAgB,kBAAkB,CAAC,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,CAKhE;AAED,wBAAgB,gBAAgB,CAC9B,OAAO,EAAE,MAAM,EACf,IAAI,EAAE,IAAI,EACV,WAAW,EAAE,WAAW,EACxB,cAAc,CAAC,EAAE,MAAM,GAAG,IAAI,GAC7B,MAAM,CA+BR"}
|
package/dist/prompt.js
CHANGED
|
@@ -1,96 +1,52 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Builds prompts
|
|
2
|
+
* Builds prompts for CLI benchmark agents.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
* -
|
|
7
|
-
* -
|
|
4
|
+
* One unified system message for all context modes.
|
|
5
|
+
* The user message varies by context mode:
|
|
6
|
+
* - zero-shot: CLI name + task intent only
|
|
7
|
+
* - help: CLI name + top-level --help output + task intent
|
|
8
|
+
* - docs: CLI name + documentation contents + task intent
|
|
8
9
|
*/
|
|
9
10
|
const AGENT_SYSTEM_MESSAGE = `You are a CLI expert. You have a run_command tool to execute shell commands. Use it to accomplish the user's task.
|
|
10
11
|
|
|
11
12
|
Rules:
|
|
12
|
-
- Read the help text carefully to understand available subcommands and flags
|
|
13
13
|
- Execute commands using the run_command tool
|
|
14
14
|
- If a command fails, read the error and retry with corrected flags
|
|
15
|
-
- Do NOT invent flags that don't exist in the help text
|
|
16
15
|
- When the task is complete, stop calling tools`;
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
Rules:
|
|
20
|
-
- Use <cli> --help and <cli> <subcommand> --help to discover available commands and flags
|
|
21
|
-
- Execute commands using the run_command tool
|
|
22
|
-
- If a command fails, read the error and retry with corrected flags
|
|
23
|
-
- Do NOT invent flags — always check --help first
|
|
24
|
-
- When the task is complete, stop calling tools`;
|
|
25
|
-
const AGENT_SYSTEM_MESSAGE_NONE = `You are a CLI expert. You have a run_command tool to execute shell commands. Use it to accomplish the user's task.
|
|
26
|
-
|
|
27
|
-
Rules:
|
|
28
|
-
- Use your training knowledge of CLI tools to construct commands
|
|
29
|
-
- Execute commands using the run_command tool
|
|
30
|
-
- If a command fails, read the error and retry with corrected flags
|
|
31
|
-
- Do NOT run --help commands — rely on your knowledge
|
|
32
|
-
- When the task is complete, stop calling tools`;
|
|
33
|
-
export function buildSystemMessage(helpMode, customPrompt) {
|
|
34
|
-
let base;
|
|
35
|
-
switch (helpMode) {
|
|
36
|
-
case 'injected':
|
|
37
|
-
base = AGENT_SYSTEM_MESSAGE;
|
|
38
|
-
break;
|
|
39
|
-
case 'discoverable':
|
|
40
|
-
base = AGENT_SYSTEM_MESSAGE_DISCOVERABLE;
|
|
41
|
-
break;
|
|
42
|
-
case 'none':
|
|
43
|
-
base = AGENT_SYSTEM_MESSAGE_NONE;
|
|
44
|
-
break;
|
|
45
|
-
}
|
|
16
|
+
export function buildSystemMessage(customPrompt) {
|
|
46
17
|
if (customPrompt) {
|
|
47
|
-
return `${
|
|
18
|
+
return `${AGENT_SYSTEM_MESSAGE}\n\n${customPrompt}`;
|
|
48
19
|
}
|
|
49
|
-
return
|
|
20
|
+
return AGENT_SYSTEM_MESSAGE;
|
|
50
21
|
}
|
|
51
|
-
export function buildUserMessage(cliName,
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
22
|
+
export function buildUserMessage(cliName, task, contextMode, contextPayload) {
|
|
23
|
+
switch (contextMode) {
|
|
24
|
+
case 'help':
|
|
25
|
+
if (contextPayload) {
|
|
26
|
+
return `CLI: ${cliName}
|
|
55
27
|
|
|
56
28
|
Help text:
|
|
57
|
-
${
|
|
29
|
+
$ ${cliName} --help
|
|
30
|
+
${contextPayload}
|
|
58
31
|
|
|
59
32
|
Task: ${task.intent}`;
|
|
60
|
-
|
|
61
|
-
|
|
33
|
+
}
|
|
34
|
+
// No help text available: fall back to the zero-shot message
|
|
35
|
+
return `CLI: ${cliName}\n\nTask: ${task.intent}`;
|
|
36
|
+
case 'docs':
|
|
37
|
+
if (contextPayload) {
|
|
38
|
+
return `CLI: ${cliName}
|
|
39
|
+
|
|
40
|
+
Documentation:
|
|
41
|
+
${contextPayload}
|
|
62
42
|
|
|
63
43
|
Task: ${task.intent}`;
|
|
64
|
-
}
|
|
65
|
-
/**
|
|
66
|
-
* Select the most relevant help text sections for a task.
|
|
67
|
-
* Trims to stay within reasonable context limits (~4K chars).
|
|
68
|
-
*/
|
|
69
|
-
function selectRelevantHelp(cliName, helpTexts, task) {
|
|
70
|
-
const sections = [];
|
|
71
|
-
const maxChars = 4000;
|
|
72
|
-
let totalChars = 0;
|
|
73
|
-
// Always include root help
|
|
74
|
-
const rootHelp = helpTexts[''];
|
|
75
|
-
if (rootHelp) {
|
|
76
|
-
sections.push(`$ ${cliName} --help\n${rootHelp}`);
|
|
77
|
-
totalChars += rootHelp.length;
|
|
78
|
-
}
|
|
79
|
-
// Add help sections that match task keywords
|
|
80
|
-
if (totalChars < maxChars * 0.7) {
|
|
81
|
-
const taskWords = task.intent.toLowerCase().split(/\s+/);
|
|
82
|
-
for (const [key, help] of Object.entries(helpTexts)) {
|
|
83
|
-
if (key === '' || sections.some((s) => s.includes(`${cliName} ${key} --help`))) {
|
|
84
|
-
continue;
|
|
85
|
-
}
|
|
86
|
-
const keyWords = key.split(' ');
|
|
87
|
-
const matches = keyWords.some((kw) => taskWords.includes(kw));
|
|
88
|
-
if (matches && totalChars + help.length < maxChars) {
|
|
89
|
-
sections.push(`$ ${cliName} ${key} --help\n${help}`);
|
|
90
|
-
totalChars += help.length;
|
|
91
44
|
}
|
|
92
|
-
|
|
45
|
+
// Fall through to zero-shot if no docs available
|
|
46
|
+
return `CLI: ${cliName}\n\nTask: ${task.intent}`;
|
|
47
|
+
case 'zero-shot':
|
|
48
|
+
default:
|
|
49
|
+
return `CLI: ${cliName}\n\nTask: ${task.intent}`;
|
|
93
50
|
}
|
|
94
|
-
return sections.join('\n\n');
|
|
95
51
|
}
|
|
96
52
|
//# sourceMappingURL=prompt.js.map
|
package/dist/prompt.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAIH,MAAM,oBAAoB,GAAG;;;;;gDAKmB,CAAC;AAEjD,MAAM,UAAU,kBAAkB,CAAC,YAAqB;IACtD,IAAI,YAAY,EAAE,CAAC;QACjB,OAAO,GAAG,oBAAoB,OAAO,YAAY,EAAE,CAAC;IACtD,CAAC;IACD,OAAO,oBAAoB,CAAC;AAC9B,CAAC;AAED,MAAM,UAAU,gBAAgB,CAC9B,OAAe,EACf,IAAU,EACV,WAAwB,EACxB,cAA8B;IAE9B,QAAQ,WAAW,EAAE,CAAC;QACpB,KAAK,MAAM;YACT,IAAI,cAAc,EAAE,CAAC;gBACnB,OAAO,QAAQ,OAAO;;;IAG1B,OAAO;EACT,cAAc;;QAER,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,CAAC;YACD,iDAAiD;YACjD,OAAO,QAAQ,OAAO,aAAa,IAAI,CAAC,MAAM,EAAE,CAAC;QAEnD,KAAK,MAAM;YACT,IAAI,cAAc,EAAE,CAAC;gBACnB,OAAO,QAAQ,OAAO;;;EAG5B,cAAc;;QAER,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,CAAC;YACD,iDAAiD;YACjD,OAAO,QAAQ,OAAO,aAAa,IAAI,CAAC,MAAM,EAAE,CAAC;QAEnD,KAAK,WAAW,CAAC;QACjB;YACE,OAAO,QAAQ,OAAO,aAAa,IAAI,CAAC,MAAM,EAAE,CAAC;IACrD,CAAC;AACH,CAAC"}
|
package/dist/runner.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Orchestrates the task x model x
|
|
2
|
+
* Orchestrates the task x model x contextMode matrix with tool-calling agents.
|
|
3
3
|
*
|
|
4
4
|
* For each task:
|
|
5
5
|
* - Create temp workdir (or use configured one)
|
|
@@ -30,6 +30,15 @@ export interface RunGridOptions {
|
|
|
30
30
|
globalRepeat?: number;
|
|
31
31
|
/** Custom system prompt appended to the default. */
|
|
32
32
|
systemPrompt?: string;
|
|
33
|
+
/** Profile metadata (passed through to upload). */
|
|
34
|
+
displayName?: string;
|
|
35
|
+
category?: string;
|
|
36
|
+
websiteUrl?: string;
|
|
37
|
+
githubUrl?: string;
|
|
38
|
+
/** Serialized YAML of the resolved task suite. */
|
|
39
|
+
taskSuiteContent?: string;
|
|
40
|
+
/** Directory containing the config file (for resolving docs.md). */
|
|
41
|
+
configDir?: string;
|
|
33
42
|
}
|
|
34
43
|
export declare function runGrid(opts: RunGridOptions): Promise<GridReport[]>;
|
|
35
44
|
export declare function uploadReport(report: GridReport, backendUrl: string, apiKey: string): Promise<void>;
|
package/dist/runner.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AASH,OAAO,KAAK,EACV,SAAS,EACT,IAAI,EACJ,UAAU,EAKX,MAAM,aAAa,CAAC;AACrB,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAC1C,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AASjD,wBAAsB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,CAKxE;AAED,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAc9B;AAkND,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,8CAA8C;IAC9C,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC;IACf,iDAAiD;IACjD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,+CAA+C;IAC/C,MAAM,CAAC,EAAE,UAAU,EAAE,CAAC;IACtB,wDAAwD;IACxD,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,0DAA0D;IAC1D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uDAAuD;IACvD,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,YAAY,CAAC,EAAE,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AASH,OAAO,KAAK,EACV,SAAS,EACT,IAAI,EACJ,UAAU,EAKX,MAAM,aAAa,CAAC;AACrB,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAC1C,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AASjD,wBAAsB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,CAKxE;AAED,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAc9B;AAkND,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,8CAA8C;IAC9C,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC;IACf,iDAAiD;IACjD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,+CAA+C;IAC/C,MAAM,CAAC,EAAE,UAAU,EAAE,CAAC;IACtB,wDAAwD;IACxD,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,0DAA0D;IAC1D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uDAAuD;IACvD,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,mDAAmD;IACnD,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,kDAAkD;IAClD,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,oEAAoE;IACpE,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,wBAAsB,OAAO,CAAC,IAAI,EAAE,cAAc,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC,CAoEzE;AAiPD,wBAAsB,YAAY,CAChC,MAAM,EAAE,UAAU,EAClB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,GACb,OAAO,CAAC,IAAI,CAAC,CAsBf"}
|