@cliwatch/cli-bench 0.5.4 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/dist/client/types.gen.d.ts +12 -0
- package/dist/client/types.gen.d.ts.map +1 -1
- package/dist/client/zod.gen.d.ts +8 -0
- package/dist/client/zod.gen.d.ts.map +1 -1
- package/dist/client/zod.gen.js +9 -0
- package/dist/client/zod.gen.js.map +1 -1
- package/dist/config.d.ts +2 -2
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +8 -7
- package/dist/config.js.map +1 -1
- package/dist/help-loader.d.ts +6 -2
- package/dist/help-loader.d.ts.map +1 -1
- package/dist/help-loader.js +7 -77
- package/dist/help-loader.js.map +1 -1
- package/dist/index.js +12 -8
- package/dist/index.js.map +1 -1
- package/dist/init.js +1 -1
- package/dist/models.d.ts +5 -3
- package/dist/models.d.ts.map +1 -1
- package/dist/models.js +1 -1
- package/dist/models.js.map +1 -1
- package/dist/project.d.ts +1 -0
- package/dist/project.d.ts.map +1 -1
- package/dist/project.js +3 -2
- package/dist/project.js.map +1 -1
- package/dist/prompt.d.ts +9 -8
- package/dist/prompt.d.ts.map +1 -1
- package/dist/prompt.js +30 -74
- package/dist/prompt.js.map +1 -1
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +1 -0
- package/dist/providers.js.map +1 -1
- package/dist/runner.d.ts +5 -1
- package/dist/runner.d.ts.map +1 -1
- package/dist/runner.js +79 -50
- package/dist/runner.js.map +1 -1
- package/dist/schemas.d.ts +1 -1
- package/dist/schemas.js +1 -1
- package/dist/schemas.js.map +1 -1
- package/dist/suite-generator.d.ts.map +1 -1
- package/dist/suite-generator.js +5 -4
- package/dist/suite-generator.js.map +1 -1
- package/package.json +1 -1
package/dist/models.d.ts
CHANGED
|
@@ -4,7 +4,8 @@
|
|
|
4
4
|
* Assertion-based evaluation: tasks define assertions that are checked
|
|
5
5
|
* against the agent's execution trace.
|
|
6
6
|
*/
|
|
7
|
-
export type
|
|
7
|
+
export type ContextMode = 'zero-shot' | 'help' | 'docs';
|
|
8
|
+
export declare const CONTEXT_MODES: ContextMode[];
|
|
8
9
|
export type Assertion = {
|
|
9
10
|
output_contains: string;
|
|
10
11
|
} | {
|
|
@@ -77,7 +78,7 @@ export interface ConfigFile {
|
|
|
77
78
|
website_url?: string;
|
|
78
79
|
github_url?: string;
|
|
79
80
|
providers?: string[];
|
|
80
|
-
|
|
81
|
+
context?: string[];
|
|
81
82
|
system_prompt?: string;
|
|
82
83
|
concurrency?: number;
|
|
83
84
|
workdir?: string;
|
|
@@ -115,7 +116,7 @@ export interface ModelResult {
|
|
|
115
116
|
provider: Provider;
|
|
116
117
|
modelId: string;
|
|
117
118
|
displayName: string;
|
|
118
|
-
|
|
119
|
+
contextMode: ContextMode;
|
|
119
120
|
taskResults: TaskEval[];
|
|
120
121
|
passRate: number;
|
|
121
122
|
avgTurnsToSuccess: number;
|
|
@@ -162,6 +163,7 @@ export interface GridReport {
|
|
|
162
163
|
repository?: string;
|
|
163
164
|
tags?: string[];
|
|
164
165
|
taskSuiteHash?: string;
|
|
166
|
+
taskSuiteContent?: string;
|
|
165
167
|
thresholdResults?: ThresholdCheckResult;
|
|
166
168
|
}
|
|
167
169
|
export interface HelpCache {
|
package/dist/models.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,MAAM,MAAM,
|
|
1
|
+
{"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,MAAM,MAAM,WAAW,GAAG,WAAW,GAAG,MAAM,GAAG,MAAM,CAAC;AACxD,eAAO,MAAM,aAAa,EAAE,WAAW,EAAkC,CAAC;AAM1E,MAAM,MAAM,SAAS,GACjB;IAAE,eAAe,EAAE,MAAM,CAAA;CAAE,GAC3B;IAAE,aAAa,EAAE,MAAM,CAAA;CAAE,GACzB;IAAE,cAAc,EAAE,MAAM,CAAA;CAAE,GAC1B;IAAE,SAAS,EAAE,MAAM,CAAA;CAAE,GACrB;IAAE,WAAW,EAAE,MAAM,CAAA;CAAE,GACvB;IAAE,aAAa,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,GACjD;IAAE,GAAG,EAAE,MAAM,CAAA;CAAE,GACf;IAAE,OAAO,EAAE,MAAM,CAAA;CAAE,GACnB;IAAE,SAAS,EAAE;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,GAAG,CAAC,EAAE,MAAM,CAAC;QAAC,GAAG,CAAC,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,GAC9D;IAAE,MAAM,EAAE;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,eAAe,CAAC,EAAE,MAAM,CAAC;QAAC,aAAa,CAAC,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,CAAC;AAElF,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,SAAS,CAAC;IACrB,MAAM,EAAE,OAAO,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,IAAI;IACnB,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,SAAS,EAAE,CAAC;IACpB,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,SAAS;IACxB,GAAG,EAAE,MAAM,CAAC;IACZ,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,IAAI,EAAE,CAAC;CACf;AAMD,MAAM,MAAM,UAAU,GAAG,MAAM,GAAG,QAAQ,GAAG,OAAO,CAAC;AAErD,MAAM,WAAW,gBAAgB;IAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,OAAO,GAAG,eAAe,CAAC;CACtC;AAED,MAAM,WAAW,UAAU;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,UAAU,CAAC;IACpB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,gBAAgB,CAAC;IAC9B,KAAK,EAAE,CAAC,IAAI,GAAG,MAAM,CAAC,EAAE,CAAC;CAC1B;AAMD,MAAM,MAAM,QAAQ,GAAG,MAAM,CAAC;AAE9B,MAAM,WAAW,WAAW;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;CACnB;AAMD,MAAM,WAAW,QAAQ;IACvB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,OAAO,CAAC;IAChB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,EAAE,eAAe,EAAE,CAAC;IACpC,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,UAAU,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,iBAAiB,CAAC,EAAE,OAAO,EAAE,CAAC;IAC9B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACzC,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAMD,MAAM,WAAW,WAAW;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,WAAW,CAAC;IACzB,WAAW,EAAE,QAAQ,EAAE,CAAC;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,MAAM,EAAE,OAAO,CAAC;CACjB;AAED,MAAM,WAAW,oBAAoB;IACnC,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,eAAe,EAAE,CAAC;IAC3B,QAAQ,EAAE,OAAO,GAAG,eAAe,CAAC;CACrC;AAED,MAAM,WAAW,UAAU;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gBAAgB,EAAE,MAAM,CAAC;IACzB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,WAAW,EAAE,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,gBAAgB,CAAC,EAAE,oBAAoB,CAAC;CACzC;AAMD,MAAM,WAAW,SAAS;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB"}
|
package/dist/models.js
CHANGED
package/dist/models.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"models.js","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG"}
|
|
1
|
+
{"version":3,"file":"models.js","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAOH,MAAM,CAAC,MAAM,aAAa,GAAkB,CAAC,WAAW,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC"}
|
package/dist/project.d.ts
CHANGED
|
@@ -22,5 +22,6 @@ export declare function resolveTaskRefs(tasks: (Task | string)[], baseDir: strin
|
|
|
22
22
|
export declare function loadProject(configPath: string): Promise<{
|
|
23
23
|
config: ConfigFile;
|
|
24
24
|
tasks: Task[];
|
|
25
|
+
taskSuiteContent: string;
|
|
25
26
|
}>;
|
|
26
27
|
//# sourceMappingURL=project.d.ts.map
|
package/dist/project.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"project.d.ts","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAOH,OAAO,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAIpD;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,YAAY,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAqBrF;AAED;;GAEG;AACH,wBAAsB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAItE;AAED;;;GAGG;AACH,wBAAsB,eAAe,CACnC,KAAK,EAAE,CAAC,IAAI,GAAG,MAAM,CAAC,EAAE,EACxB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,IAAI,EAAE,CAAC,CA6CjB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC;IAAE,MAAM,EAAE,UAAU,CAAC;IAAC,KAAK,EAAE,IAAI,EAAE,CAAA;CAAE,CAAC,
|
|
1
|
+
{"version":3,"file":"project.d.ts","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAOH,OAAO,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAIpD;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,YAAY,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAqBrF;AAED;;GAEG;AACH,wBAAsB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAItE;AAED;;;GAGG;AACH,wBAAsB,eAAe,CACnC,KAAK,EAAE,CAAC,IAAI,GAAG,MAAM,CAAC,EAAE,EACxB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,IAAI,EAAE,CAAC,CA6CjB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC;IAAE,MAAM,EAAE,UAAU,CAAC;IAAC,KAAK,EAAE,IAAI,EAAE,CAAC;IAAC,gBAAgB,EAAE,MAAM,CAAA;CAAE,CAAC,CAM9H"}
|
package/dist/project.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
import { readFile, access } from 'node:fs/promises';
|
|
6
6
|
import { join, dirname, resolve } from 'node:path';
|
|
7
7
|
import { glob } from 'node:fs/promises';
|
|
8
|
-
import { parse as parseYaml } from 'yaml';
|
|
8
|
+
import { parse as parseYaml, stringify as stringifyYaml } from 'yaml';
|
|
9
9
|
import { ConfigFileSchema, TaskFileSchema, TaskSchema } from './schemas.js';
|
|
10
10
|
const CONFIG_FILENAMES = ['cli-bench.yaml', 'cli-bench.yml'];
|
|
11
11
|
/**
|
|
@@ -96,6 +96,7 @@ export async function loadProject(configPath) {
|
|
|
96
96
|
const config = await loadConfigFile(configPath);
|
|
97
97
|
const baseDir = dirname(configPath);
|
|
98
98
|
const tasks = await resolveTaskRefs(config.tasks, baseDir);
|
|
99
|
-
|
|
99
|
+
const taskSuiteContent = stringifyYaml(tasks);
|
|
100
|
+
return { config, tasks, taskSuiteContent };
|
|
100
101
|
}
|
|
101
102
|
//# sourceMappingURL=project.js.map
|
package/dist/project.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"project.js","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAC;AACxC,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,MAAM,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"project.js","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAC;AACxC,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,SAAS,IAAI,aAAa,EAAE,MAAM,MAAM,CAAC;AACtE,OAAO,EAAE,gBAAgB,EAAE,cAAc,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAG5E,MAAM,gBAAgB,GAAG,CAAC,gBAAgB,EAAE,eAAe,CAAC,CAAC;AAE7D;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,YAAqB;IAC3D,IAAI,YAAY,EAAE,CAAC;QACjB,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC;YAC3B,OAAO,OAAO,CAAC,YAAY,CAAC,CAAC;QAC/B,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,gBAAgB,EAAE,CAAC;QACpC,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;QAChC,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;YACxB,OAAO,SAAS,CAAC;QACnB,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,IAAY;IAC/C,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IAC1C,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;IAC9B,OAAO,gBAAgB,CAAC,KAAK,CAAC,MAAM,CAAe,CAAC;AACtD,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,KAAwB,EACxB,OAAe;IAEf,MAAM,QAAQ,GAAW,EAAE,CAAC;IAC5B,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAElC,KAAK,MAAM,KAAK,IAAI,KAAK,EAAE,CAAC;QAC1B,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC9B,oBAAoB;YACpB,MAAM,GAAG,GAAG,KAAK,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC;YAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAEnC,gCAAgC;YAChC,MAAM,KAAK,GAAa,EAAE,CAAC;YAC3B,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBACtB,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;oBACxC,IAAI,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;wBACtD,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBACpB,CAAC;gBACH,CAAC;gBACD,KAAK,CAAC,IAAI,EAAE,CAAC;YACf,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACtB,CAAC;YAED,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE,CAAC;gBAC7B,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;gBAC9C,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;gBAC9B,MAAM,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,MAAM,CAAW,CAAC;gBACzD,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;oBAC7B,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;wBAC1B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;wBACrB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACtB,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;aAAM,CAAC;YACN,wCAAwC;YACxC,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,CAAS,CAAC;YAC7C,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;gBAC1B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACrB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,UAAkB;IAClD,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,UAAU,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,MAAM,KAAK,GAAG,MAAM,eAAe,CAAC,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAC3D,MAAM,gBAAgB,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC;IAC9C,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,gBAAgB,EAAE,CAAC;AAC7C,CAAC"}
|
package/dist/prompt.d.ts
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Builds prompts
|
|
2
|
+
* Builds prompts for CLI benchmark agents.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
* -
|
|
7
|
-
* -
|
|
4
|
+
* One unified system message for all context modes.
|
|
5
|
+
* The user message varies by context mode:
|
|
6
|
+
* - zero-shot: CLI name + task intent only
|
|
7
|
+
* - help: CLI name + top-level --help output + task intent
|
|
8
|
+
* - docs: CLI name + documentation contents + task intent
|
|
8
9
|
*/
|
|
9
|
-
import type {
|
|
10
|
-
export declare function buildSystemMessage(
|
|
11
|
-
export declare function buildUserMessage(cliName: string,
|
|
10
|
+
import type { ContextMode, Task } from './models.js';
|
|
11
|
+
export declare function buildSystemMessage(customPrompt?: string): string;
|
|
12
|
+
export declare function buildUserMessage(cliName: string, task: Task, contextMode: ContextMode, contextPayload?: string | null): string;
|
|
12
13
|
//# sourceMappingURL=prompt.d.ts.map
|
package/dist/prompt.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"prompt.d.ts","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"prompt.d.ts","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,IAAI,EAAE,MAAM,aAAa,CAAC;AASrD,wBAAgB,kBAAkB,CAAC,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,CAKhE;AAED,wBAAgB,gBAAgB,CAC9B,OAAO,EAAE,MAAM,EACf,IAAI,EAAE,IAAI,EACV,WAAW,EAAE,WAAW,EACxB,cAAc,CAAC,EAAE,MAAM,GAAG,IAAI,GAC7B,MAAM,CA+BR"}
|
package/dist/prompt.js
CHANGED
|
@@ -1,96 +1,52 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Builds prompts
|
|
2
|
+
* Builds prompts for CLI benchmark agents.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
* -
|
|
7
|
-
* -
|
|
4
|
+
* One unified system message for all context modes.
|
|
5
|
+
* The user message varies by context mode:
|
|
6
|
+
* - zero-shot: CLI name + task intent only
|
|
7
|
+
* - help: CLI name + top-level --help output + task intent
|
|
8
|
+
* - docs: CLI name + documentation contents + task intent
|
|
8
9
|
*/
|
|
9
10
|
const AGENT_SYSTEM_MESSAGE = `You are a CLI expert. You have a run_command tool to execute shell commands. Use it to accomplish the user's task.
|
|
10
11
|
|
|
11
12
|
Rules:
|
|
12
|
-
- Read the help text carefully to understand available subcommands and flags
|
|
13
13
|
- Execute commands using the run_command tool
|
|
14
14
|
- If a command fails, read the error and retry with corrected flags
|
|
15
|
-
- Do NOT invent flags that don't exist in the help text
|
|
16
15
|
- When the task is complete, stop calling tools`;
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
Rules:
|
|
20
|
-
- Use <cli> --help and <cli> <subcommand> --help to discover available commands and flags
|
|
21
|
-
- Execute commands using the run_command tool
|
|
22
|
-
- If a command fails, read the error and retry with corrected flags
|
|
23
|
-
- Do NOT invent flags — always check --help first
|
|
24
|
-
- When the task is complete, stop calling tools`;
|
|
25
|
-
const AGENT_SYSTEM_MESSAGE_NONE = `You are a CLI expert. You have a run_command tool to execute shell commands. Use it to accomplish the user's task.
|
|
26
|
-
|
|
27
|
-
Rules:
|
|
28
|
-
- Use your training knowledge of CLI tools to construct commands
|
|
29
|
-
- Execute commands using the run_command tool
|
|
30
|
-
- If a command fails, read the error and retry with corrected flags
|
|
31
|
-
- Do NOT run --help commands — rely on your knowledge
|
|
32
|
-
- When the task is complete, stop calling tools`;
|
|
33
|
-
export function buildSystemMessage(helpMode, customPrompt) {
|
|
34
|
-
let base;
|
|
35
|
-
switch (helpMode) {
|
|
36
|
-
case 'injected':
|
|
37
|
-
base = AGENT_SYSTEM_MESSAGE;
|
|
38
|
-
break;
|
|
39
|
-
case 'discoverable':
|
|
40
|
-
base = AGENT_SYSTEM_MESSAGE_DISCOVERABLE;
|
|
41
|
-
break;
|
|
42
|
-
case 'none':
|
|
43
|
-
base = AGENT_SYSTEM_MESSAGE_NONE;
|
|
44
|
-
break;
|
|
45
|
-
}
|
|
16
|
+
export function buildSystemMessage(customPrompt) {
|
|
46
17
|
if (customPrompt) {
|
|
47
|
-
return `${
|
|
18
|
+
return `${AGENT_SYSTEM_MESSAGE}\n\n${customPrompt}`;
|
|
48
19
|
}
|
|
49
|
-
return
|
|
20
|
+
return AGENT_SYSTEM_MESSAGE;
|
|
50
21
|
}
|
|
51
|
-
export function buildUserMessage(cliName,
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
22
|
+
export function buildUserMessage(cliName, task, contextMode, contextPayload) {
|
|
23
|
+
switch (contextMode) {
|
|
24
|
+
case 'help':
|
|
25
|
+
if (contextPayload) {
|
|
26
|
+
return `CLI: ${cliName}
|
|
55
27
|
|
|
56
28
|
Help text:
|
|
57
|
-
${
|
|
29
|
+
$ ${cliName} --help
|
|
30
|
+
${contextPayload}
|
|
58
31
|
|
|
59
32
|
Task: ${task.intent}`;
|
|
60
|
-
|
|
61
|
-
|
|
33
|
+
}
|
|
34
|
+
// Fall through to zero-shot if no help available
|
|
35
|
+
return `CLI: ${cliName}\n\nTask: ${task.intent}`;
|
|
36
|
+
case 'docs':
|
|
37
|
+
if (contextPayload) {
|
|
38
|
+
return `CLI: ${cliName}
|
|
39
|
+
|
|
40
|
+
Documentation:
|
|
41
|
+
${contextPayload}
|
|
62
42
|
|
|
63
43
|
Task: ${task.intent}`;
|
|
64
|
-
}
|
|
65
|
-
/**
|
|
66
|
-
* Select the most relevant help text sections for a task.
|
|
67
|
-
* Trims to stay within reasonable context limits (~4K chars).
|
|
68
|
-
*/
|
|
69
|
-
function selectRelevantHelp(cliName, helpTexts, task) {
|
|
70
|
-
const sections = [];
|
|
71
|
-
const maxChars = 4000;
|
|
72
|
-
let totalChars = 0;
|
|
73
|
-
// Always include root help
|
|
74
|
-
const rootHelp = helpTexts[''];
|
|
75
|
-
if (rootHelp) {
|
|
76
|
-
sections.push(`$ ${cliName} --help\n${rootHelp}`);
|
|
77
|
-
totalChars += rootHelp.length;
|
|
78
|
-
}
|
|
79
|
-
// Add help sections that match task keywords
|
|
80
|
-
if (totalChars < maxChars * 0.7) {
|
|
81
|
-
const taskWords = task.intent.toLowerCase().split(/\s+/);
|
|
82
|
-
for (const [key, help] of Object.entries(helpTexts)) {
|
|
83
|
-
if (key === '' || sections.some((s) => s.includes(`${cliName} ${key} --help`))) {
|
|
84
|
-
continue;
|
|
85
|
-
}
|
|
86
|
-
const keyWords = key.split(' ');
|
|
87
|
-
const matches = keyWords.some((kw) => taskWords.includes(kw));
|
|
88
|
-
if (matches && totalChars + help.length < maxChars) {
|
|
89
|
-
sections.push(`$ ${cliName} ${key} --help\n${help}`);
|
|
90
|
-
totalChars += help.length;
|
|
91
44
|
}
|
|
92
|
-
|
|
45
|
+
// Fall through to zero-shot if no docs available
|
|
46
|
+
return `CLI: ${cliName}\n\nTask: ${task.intent}`;
|
|
47
|
+
case 'zero-shot':
|
|
48
|
+
default:
|
|
49
|
+
return `CLI: ${cliName}\n\nTask: ${task.intent}`;
|
|
93
50
|
}
|
|
94
|
-
return sections.join('\n\n');
|
|
95
51
|
}
|
|
96
52
|
//# sourceMappingURL=prompt.js.map
|
package/dist/prompt.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAIH,MAAM,oBAAoB,GAAG;;;;;gDAKmB,CAAC;AAEjD,MAAM,UAAU,kBAAkB,CAAC,YAAqB;IACtD,IAAI,YAAY,EAAE,CAAC;QACjB,OAAO,GAAG,oBAAoB,OAAO,YAAY,EAAE,CAAC;IACtD,CAAC;IACD,OAAO,oBAAoB,CAAC;AAC9B,CAAC;AAED,MAAM,UAAU,gBAAgB,CAC9B,OAAe,EACf,IAAU,EACV,WAAwB,EACxB,cAA8B;IAE9B,QAAQ,WAAW,EAAE,CAAC;QACpB,KAAK,MAAM;YACT,IAAI,cAAc,EAAE,CAAC;gBACnB,OAAO,QAAQ,OAAO;;;IAG1B,OAAO;EACT,cAAc;;QAER,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,CAAC;YACD,iDAAiD;YACjD,OAAO,QAAQ,OAAO,aAAa,IAAI,CAAC,MAAM,EAAE,CAAC;QAEnD,KAAK,MAAM;YACT,IAAI,cAAc,EAAE,CAAC;gBACnB,OAAO,QAAQ,OAAO;;;EAG5B,cAAc;;QAER,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,CAAC;YACD,iDAAiD;YACjD,OAAO,QAAQ,OAAO,aAAa,IAAI,CAAC,MAAM,EAAE,CAAC;QAEnD,KAAK,WAAW,CAAC;QACjB;YACE,OAAO,QAAQ,OAAO,aAAa,IAAI,CAAC,MAAM,EAAE,CAAC;IACrD,CAAC;AACH,CAAC"}
|
package/dist/providers.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"providers.d.ts","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,MAAM,WAAW,UAAU;IACzB,EAAE,EAAE,MAAM,CAAC;IACX,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,eAAO,MAAM,MAAM,EAAE,SAAS,UAAU,
|
|
1
|
+
{"version":3,"file":"providers.d.ts","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,MAAM,WAAW,UAAU;IACzB,EAAE,EAAE,MAAM,CAAC;IACX,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,eAAO,MAAM,MAAM,EAAE,SAAS,UAAU,EAc9B,CAAC;AAEX,wBAAgB,QAAQ,CAAC,OAAO,EAAE,MAAM,8CAEvC;AAED,wBAAgB,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,UAAU,GAAG,SAAS,CAErE;AAED,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,UAAU,EAAE,CAG7D;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,WAAW,EAAE,MAAM,EAAE,GAAG,UAAU,EAAE,CAWpE;AAED;;;GAGG;AACH,wBAAgB,kBAAkB,IAAI,IAAI,CAMzC"}
|
package/dist/providers.js
CHANGED
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
import { gateway } from 'ai';
|
|
8
8
|
export const MODELS = [
|
|
9
9
|
{ id: 'anthropic/claude-opus-4.6', displayName: 'Claude Opus 4.6', provider: 'anthropic' },
|
|
10
|
+
{ id: 'anthropic/claude-sonnet-4.6', displayName: 'Claude Sonnet 4.6', provider: 'anthropic' },
|
|
10
11
|
{ id: 'anthropic/claude-sonnet-4-20250514', displayName: 'Claude Sonnet 4', provider: 'anthropic' },
|
|
11
12
|
{ id: 'anthropic/claude-haiku-4.5', displayName: 'Claude Haiku 4.5', provider: 'anthropic' },
|
|
12
13
|
{ id: 'anthropic/claude-haiku-4-5-20251001', displayName: 'Claude Haiku 4.5', provider: 'anthropic' },
|
package/dist/providers.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"providers.js","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAQ7B,MAAM,CAAC,MAAM,MAAM,GAA0B;IAC3C,EAAE,EAAE,EAAE,2BAA2B,EAAE,WAAW,EAAE,iBAAiB,EAAE,QAAQ,EAAE,WAAW,EAAE;IAC1F,EAAE,EAAE,EAAE,oCAAoC,EAAE,WAAW,EAAE,iBAAiB,EAAE,QAAQ,EAAE,WAAW,EAAE;IACnG,EAAE,EAAE,EAAE,4BAA4B,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,WAAW,EAAE;IAC5F,EAAE,EAAE,EAAE,qCAAqC,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,WAAW,EAAE;IACrG,EAAE,EAAE,EAAE,gBAAgB,EAAE,WAAW,EAAE,SAAS,EAAE,QAAQ,EAAE,QAAQ,EAAE;IACpE,EAAE,EAAE,EAAE,eAAe,EAAE,WAAW,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClE,EAAE,EAAE,EAAE,oBAAoB,EAAE,WAAW,EAAE,aAAa,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAC5E,EAAE,EAAE,EAAE,uBAAuB,EAAE,WAAW,EAAE,gBAAgB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClF,EAAE,EAAE,EAAE,uBAAuB,EAAE,WAAW,EAAE,gBAAgB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClF,EAAE,EAAE,EAAE,yBAAyB,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IACtF,EAAE,EAAE,EAAE,mBAAmB,EAAE,WAAW,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,EAAE;IAC1E,EAAE,EAAE,EAAE,sBAAsB,EAAE,WAAW,EAAE,cAAc,EAAE,QAAQ,EAAE,SAAS,EAAE;CACxE,CAAC;AAEX,MAAM,UAAU,QAAQ,CAAC,OAAe;IACtC,OAAO,OAAO,CAAC,OAAO,CAAC,CAAC;AAC1B,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,OAAe;IAC3C,OAAO,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,CAAC;AAC9C,CAAC;AAED,MAAM,UAAU,YAAY,CAAC,QAAkB;IAC7C,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,GAAG,MAAM,CAAC,CAAC;IAC9C,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AACvD,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,WAAqB;IACpD,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE;QAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QACjD,IAAI,QAAQ;YAAE,OAAO,EAAE,GAAG,QAAQ,EAAE,CAAC;QACrC,MAAM,CAAC,QAAQ,EAAE,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC1C,OAAO;YACL,EAAE;YACF,WAAW,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC;YAC3B,QAAQ,EAAE,QAAkC;SAC7C,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,kBAAkB;IAChC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC,EAAE,CAAC;QACvC,MAAM,IAAI,KAAK,CACb,2IAA2I,CAC5I,CAAC;IACJ,CAAC;AACH,CAAC"}
|
|
1
|
+
{"version":3,"file":"providers.js","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAQ7B,MAAM,CAAC,MAAM,MAAM,GAA0B;IAC3C,EAAE,EAAE,EAAE,2BAA2B,EAAE,WAAW,EAAE,iBAAiB,EAAE,QAAQ,EAAE,WAAW,EAAE;IAC1F,EAAE,EAAE,EAAE,6BAA6B,EAAE,WAAW,EAAE,mBAAmB,EAAE,QAAQ,EAAE,WAAW,EAAE;IAC9F,EAAE,EAAE,EAAE,oCAAoC,EAAE,WAAW,EAAE,iBAAiB,EAAE,QAAQ,EAAE,WAAW,EAAE;IACnG,EAAE,EAAE,EAAE,4BAA4B,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,WAAW,EAAE;IAC5F,EAAE,EAAE,EAAE,qCAAqC,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,WAAW,EAAE;IACrG,EAAE,EAAE,EAAE,gBAAgB,EAAE,WAAW,EAAE,SAAS,EAAE,QAAQ,EAAE,QAAQ,EAAE;IACpE,EAAE,EAAE,EAAE,eAAe,EAAE,WAAW,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClE,EAAE,EAAE,EAAE,oBAAoB,EAAE,WAAW,EAAE,aAAa,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAC5E,EAAE,EAAE,EAAE,uBAAuB,EAAE,WAAW,EAAE,gBAAgB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClF,EAAE,EAAE,EAAE,uBAAuB,EAAE,WAAW,EAAE,gBAAgB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClF,EAAE,EAAE,EAAE,yBAAyB,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IACtF,EAAE,EAAE,EAAE,mBAAmB,EAAE,WAAW,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,EAAE;IAC1E,EAAE,EAAE,EAAE,sBAAsB,EAAE,WAAW,EAAE,cAAc,EAAE,QAAQ,EAAE,SAAS,EAAE;CACxE,CAAC;AAEX,MAAM,UAAU,QAAQ,CAAC,OAAe;IACtC,OAAO,OAAO,CAAC,OAAO,CAAC,CAAC;AAC1B,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,OAAe;IAC3C,OAAO,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,CAAC;AAC9C,CAAC;AAED,MAAM,UAAU,YAAY,CAAC,QAAkB;IAC7C,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,GAAG,MAAM,CAAC,CAAC;IAC9C,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AACvD,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,WAAqB;IACpD,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE;QAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QACjD,IAAI,QAAQ;YAAE,OAAO,EAAE,GAAG,QAAQ,EAAE,CAAC;QACrC,MAAM,CAAC,QAAQ,EAAE,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC1C,OAAO;YACL,EAAE;YACF,WAAW,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC;YAC3B,QAAQ,EAAE,QAAkC;SAC7C,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,kBAAkB;IAChC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC,EAAE,CAAC;QACvC,MAAM,IAAI,KAAK,CACb,2IAA2I,CAC5I,CAAC;IACJ,CAAC;AACH,CAAC"}
|
package/dist/runner.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Orchestrates the task x model x
|
|
2
|
+
* Orchestrates the task x model x contextMode matrix with tool-calling agents.
|
|
3
3
|
*
|
|
4
4
|
* For each task:
|
|
5
5
|
* - Create temp workdir (or use configured one)
|
|
@@ -35,6 +35,10 @@ export interface RunGridOptions {
|
|
|
35
35
|
category?: string;
|
|
36
36
|
websiteUrl?: string;
|
|
37
37
|
githubUrl?: string;
|
|
38
|
+
/** Serialized YAML of the resolved task suite. */
|
|
39
|
+
taskSuiteContent?: string;
|
|
40
|
+
/** Directory containing the config file (for resolving docs.md). */
|
|
41
|
+
configDir?: string;
|
|
38
42
|
}
|
|
39
43
|
export declare function runGrid(opts: RunGridOptions): Promise<GridReport[]>;
|
|
40
44
|
export declare function uploadReport(report: GridReport, backendUrl: string, apiKey: string): Promise<void>;
|
package/dist/runner.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AASH,OAAO,KAAK,EACV,SAAS,EACT,IAAI,EACJ,UAAU,EAKX,MAAM,aAAa,CAAC;AACrB,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAC1C,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AASjD,wBAAsB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,CAKxE;AAED,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAc9B;
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AASH,OAAO,KAAK,EACV,SAAS,EACT,IAAI,EACJ,UAAU,EAKX,MAAM,aAAa,CAAC;AACrB,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAC1C,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AASjD,wBAAsB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,CAKxE;AAED,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAc9B;AAiOD,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,8CAA8C;IAC9C,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC;IACf,iDAAiD;IACjD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,+CAA+C;IAC/C,MAAM,CAAC,EAAE,UAAU,EAAE,CAAC;IACtB,wDAAwD;IACxD,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,0DAA0D;IAC1D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uDAAuD;IACvD,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,mDAAmD;IACnD,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,kDAAkD;IAClD,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,oEAAoE;IACpE,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,wBAAsB,OAAO,CAAC,IAAI,EAAE,cAAc,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC,CAoEzE;AAkPD,wBAAsB,YAAY,CAChC,MAAM,EAAE,UAAU,EAClB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,GACb,OAAO,CAAC,IAAI,CAAC,CAsBf"}
|
package/dist/runner.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Orchestrates the task x model x
|
|
2
|
+
* Orchestrates the task x model x contextMode matrix with tool-calling agents.
|
|
3
3
|
*
|
|
4
4
|
* For each task:
|
|
5
5
|
* - Create temp workdir (or use configured one)
|
|
@@ -18,7 +18,7 @@ import { z } from 'zod';
|
|
|
18
18
|
import { getModel, filterModels } from './providers.js';
|
|
19
19
|
import { buildSystemMessage, buildUserMessage } from './prompt.js';
|
|
20
20
|
import { runAssertions } from './assertions.js';
|
|
21
|
-
import {
|
|
21
|
+
import { loadTopLevelHelp } from './help-loader.js';
|
|
22
22
|
import { execCommand, runSetup } from './exec.js';
|
|
23
23
|
import { TaskSuiteSchema } from './schemas.js';
|
|
24
24
|
import { detectCI, computeTaskSuiteHash } from './ci.js';
|
|
@@ -143,10 +143,12 @@ async function runAgentTask(model, systemMessage, userMessage, task, workdir) {
|
|
|
143
143
|
})),
|
|
144
144
|
usage: step.usage,
|
|
145
145
|
}));
|
|
146
|
+
// Cap turnsUsed at max_turns to guard against SDK retry anomalies
|
|
147
|
+
const maxTurns = task.max_turns ?? 5;
|
|
146
148
|
return {
|
|
147
149
|
passed: allPassed,
|
|
148
150
|
assertionResults,
|
|
149
|
-
turnsUsed: commandsRun.length,
|
|
151
|
+
turnsUsed: Math.min(commandsRun.length, maxTurns),
|
|
150
152
|
commandsRun,
|
|
151
153
|
totalInputTokens,
|
|
152
154
|
totalOutputTokens,
|
|
@@ -162,8 +164,8 @@ async function runAgentTask(model, systemMessage, userMessage, task, workdir) {
|
|
|
162
164
|
return {
|
|
163
165
|
passed: false,
|
|
164
166
|
assertionResults: [],
|
|
165
|
-
turnsUsed:
|
|
166
|
-
commandsRun,
|
|
167
|
+
turnsUsed: 0,
|
|
168
|
+
commandsRun: [],
|
|
167
169
|
totalInputTokens: 0,
|
|
168
170
|
totalOutputTokens: 0,
|
|
169
171
|
totalLatencyMs: Date.now() - start,
|
|
@@ -175,26 +177,36 @@ async function runAgentTask(model, systemMessage, userMessage, task, workdir) {
|
|
|
175
177
|
};
|
|
176
178
|
}
|
|
177
179
|
}
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
180
|
+
/** An eval with 0 tokens is an infra error (gateway timeout, auth failure, etc.) — not a real task result. */
|
|
181
|
+
function isInfraError(t) {
|
|
182
|
+
return t.totalInputTokens + t.totalOutputTokens === 0 && !t.passed;
|
|
183
|
+
}
|
|
184
|
+
function aggregateModelResult(model, taskResults, contextMode) {
|
|
185
|
+
// Exclude infra errors from pass rate — they're not real task failures
|
|
186
|
+
const scorable = taskResults.filter((t) => !isInfraError(t));
|
|
187
|
+
const infraErrors = taskResults.length - scorable.length;
|
|
188
|
+
if (infraErrors > 0) {
|
|
189
|
+
console.warn(` ⚠ ${infraErrors} task(s) excluded from stats (infra error, 0 tokens)`);
|
|
190
|
+
}
|
|
191
|
+
const passRate = scorable.length > 0
|
|
192
|
+
? scorable.filter((t) => t.passed).length / scorable.length
|
|
181
193
|
: 0;
|
|
182
|
-
const avgLatencyMs =
|
|
183
|
-
? Math.round(
|
|
194
|
+
const avgLatencyMs = scorable.length > 0
|
|
195
|
+
? Math.round(scorable.reduce((a, t) => a + t.totalLatencyMs, 0) / scorable.length)
|
|
184
196
|
: 0;
|
|
185
|
-
const passedTasks =
|
|
197
|
+
const passedTasks = scorable.filter((t) => t.passed);
|
|
186
198
|
const avgTurnsToSuccess = passedTasks.length > 0
|
|
187
199
|
? passedTasks.reduce((a, t) => a + t.turnsUsed, 0) / passedTasks.length
|
|
188
200
|
: 0;
|
|
189
|
-
const avgTotalTokens =
|
|
190
|
-
?
|
|
191
|
-
|
|
201
|
+
const avgTotalTokens = scorable.length > 0
|
|
202
|
+
? scorable.reduce((a, t) => a + t.totalInputTokens + t.totalOutputTokens, 0) /
|
|
203
|
+
scorable.length
|
|
192
204
|
: 0;
|
|
193
205
|
return {
|
|
194
206
|
provider: model.provider,
|
|
195
207
|
modelId: model.id,
|
|
196
208
|
displayName: model.displayName,
|
|
197
|
-
|
|
209
|
+
contextMode,
|
|
198
210
|
taskResults,
|
|
199
211
|
passRate,
|
|
200
212
|
avgTurnsToSuccess,
|
|
@@ -212,7 +224,7 @@ export async function runGrid(opts) {
|
|
|
212
224
|
cliName: opts.cliName,
|
|
213
225
|
tasks: opts.tasks,
|
|
214
226
|
models: opts.models ?? filterModels(config.models),
|
|
215
|
-
|
|
227
|
+
contextModes: config.contextModes,
|
|
216
228
|
limiter,
|
|
217
229
|
config,
|
|
218
230
|
versionCommand: opts.versionCommand,
|
|
@@ -223,6 +235,8 @@ export async function runGrid(opts) {
|
|
|
223
235
|
category: opts.category,
|
|
224
236
|
websiteUrl: opts.websiteUrl,
|
|
225
237
|
githubUrl: opts.githubUrl,
|
|
238
|
+
taskSuiteContent: opts.taskSuiteContent,
|
|
239
|
+
configDir: opts.configDir,
|
|
226
240
|
});
|
|
227
241
|
if (report)
|
|
228
242
|
reports.push(report);
|
|
@@ -246,7 +260,7 @@ export async function runGrid(opts) {
|
|
|
246
260
|
cliName,
|
|
247
261
|
tasks: suite.tasks,
|
|
248
262
|
models,
|
|
249
|
-
|
|
263
|
+
contextModes: config.contextModes,
|
|
250
264
|
limiter,
|
|
251
265
|
config,
|
|
252
266
|
versionCommand: suite.version_command,
|
|
@@ -260,32 +274,43 @@ export async function runGrid(opts) {
|
|
|
260
274
|
}
|
|
261
275
|
return reports;
|
|
262
276
|
}
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
277
|
+
/**
|
|
278
|
+
* Load context payload for a given mode.
|
|
279
|
+
*/
|
|
280
|
+
async function loadContextPayload(contextMode, cliName, configDir) {
|
|
281
|
+
switch (contextMode) {
|
|
282
|
+
case 'help': {
|
|
283
|
+
const helpText = await loadTopLevelHelp(cliName);
|
|
284
|
+
if (!helpText) {
|
|
285
|
+
console.warn(` Warning: could not capture --help for ${cliName}, falling back to zero-shot`);
|
|
286
|
+
}
|
|
287
|
+
return helpText;
|
|
274
288
|
}
|
|
275
|
-
|
|
276
|
-
|
|
289
|
+
case 'docs': {
|
|
290
|
+
if (!configDir) {
|
|
291
|
+
console.warn(` Warning: no config directory for docs mode, falling back to zero-shot`);
|
|
292
|
+
return null;
|
|
293
|
+
}
|
|
277
294
|
try {
|
|
278
|
-
const
|
|
279
|
-
|
|
280
|
-
cliVersion = live.version;
|
|
295
|
+
const docsPath = join(configDir, 'docs.md');
|
|
296
|
+
return await readFile(docsPath, 'utf-8');
|
|
281
297
|
}
|
|
282
298
|
catch {
|
|
283
|
-
console.warn(`
|
|
299
|
+
console.warn(` Warning: docs.md not found in ${configDir}, falling back to zero-shot`);
|
|
300
|
+
return null;
|
|
284
301
|
}
|
|
285
302
|
}
|
|
303
|
+
case 'zero-shot':
|
|
304
|
+
default:
|
|
305
|
+
return null;
|
|
286
306
|
}
|
|
307
|
+
}
|
|
308
|
+
async function runCliGrid(opts) {
|
|
309
|
+
const { cliName, tasks, models, contextModes, limiter, config, versionCommand, systemPrompt } = opts;
|
|
310
|
+
console.log(`\n=== ${cliName} ===`);
|
|
311
|
+
let cliVersion;
|
|
287
312
|
// Detect CLI version
|
|
288
|
-
if (
|
|
313
|
+
if (versionCommand) {
|
|
289
314
|
cliVersion = await detectCliVersion(versionCommand);
|
|
290
315
|
if (cliVersion)
|
|
291
316
|
console.log(` CLI version: ${cliVersion}`);
|
|
@@ -294,11 +319,12 @@ async function runCliGrid(opts) {
|
|
|
294
319
|
if (config.dryRun) {
|
|
295
320
|
const firstTask = tasks[0];
|
|
296
321
|
if (firstTask) {
|
|
297
|
-
for (const
|
|
298
|
-
const
|
|
299
|
-
const
|
|
300
|
-
|
|
301
|
-
console.log(
|
|
322
|
+
for (const contextMode of contextModes) {
|
|
323
|
+
const contextPayload = await loadContextPayload(contextMode, cliName, opts.configDir);
|
|
324
|
+
const sysMsg = buildSystemMessage(systemPrompt);
|
|
325
|
+
const userMessage = buildUserMessage(cliName, firstTask, contextMode, contextPayload);
|
|
326
|
+
console.log(`\n--- Dry Run: ${cliName} / ${firstTask.id} (${contextMode}) ---`);
|
|
327
|
+
console.log(`System: ${sysMsg}\n`);
|
|
302
328
|
console.log(`User:\n${userMessage}\n`);
|
|
303
329
|
console.log(`--- End Dry Run ---\n`);
|
|
304
330
|
}
|
|
@@ -306,9 +332,10 @@ async function runCliGrid(opts) {
|
|
|
306
332
|
return null;
|
|
307
333
|
}
|
|
308
334
|
const modelResults = [];
|
|
309
|
-
for (const
|
|
310
|
-
console.log(`\n
|
|
311
|
-
const
|
|
335
|
+
for (const contextMode of contextModes) {
|
|
336
|
+
console.log(`\n Context: ${contextMode}`);
|
|
337
|
+
const contextPayload = await loadContextPayload(contextMode, cliName, opts.configDir);
|
|
338
|
+
const sysMsg = buildSystemMessage(systemPrompt);
|
|
312
339
|
for (const model of models) {
|
|
313
340
|
console.log(` Model: ${model.displayName}`);
|
|
314
341
|
const taskResults = [];
|
|
@@ -324,11 +351,11 @@ async function runCliGrid(opts) {
|
|
|
324
351
|
const workdir = opts.workdir ?? config.workdir ?? await mkdtemp(join(tmpdir(), 'cli-bench-'));
|
|
325
352
|
const shouldCleanup = !opts.workdir && !config.workdir;
|
|
326
353
|
try {
|
|
327
|
-
const userMessage = buildUserMessage(cliName,
|
|
354
|
+
const userMessage = buildUserMessage(cliName, task, contextMode, contextPayload);
|
|
328
355
|
if (task.setup && task.setup.length > 0) {
|
|
329
356
|
await runSetup(task.setup, { cwd: workdir });
|
|
330
357
|
}
|
|
331
|
-
const agentResult = await runAgentTask(model,
|
|
358
|
+
const agentResult = await runAgentTask(model, sysMsg, userMessage, task, workdir);
|
|
332
359
|
const taskEval = {
|
|
333
360
|
taskId: task.id,
|
|
334
361
|
passed: agentResult.passed,
|
|
@@ -353,9 +380,10 @@ async function runCliGrid(opts) {
|
|
|
353
380
|
},
|
|
354
381
|
repeatIndex: repeatTotal > 1 ? repeatIndex : undefined,
|
|
355
382
|
};
|
|
356
|
-
const
|
|
383
|
+
const totalTokens = agentResult.totalInputTokens + agentResult.totalOutputTokens;
|
|
384
|
+
const icon = agentResult.passed ? '✓' : totalTokens === 0 ? '⚠' : '✗';
|
|
357
385
|
const repeatLabel = repeatTotal > 1 ? ` [${repeatIndex + 1}/${repeatTotal}]` : '';
|
|
358
|
-
console.log(` ${icon} ${task.id}${repeatLabel} (turns=${agentResult.turnsUsed}, tokens=${
|
|
386
|
+
console.log(` ${icon} ${task.id}${repeatLabel} (turns=${agentResult.turnsUsed}, tokens=${totalTokens})`);
|
|
359
387
|
return taskEval;
|
|
360
388
|
}
|
|
361
389
|
finally {
|
|
@@ -368,7 +396,7 @@ async function runCliGrid(opts) {
|
|
|
368
396
|
for (const taskEval of results) {
|
|
369
397
|
taskResults.push(taskEval);
|
|
370
398
|
}
|
|
371
|
-
modelResults.push(aggregateModelResult(model, taskResults,
|
|
399
|
+
modelResults.push(aggregateModelResult(model, taskResults, contextMode));
|
|
372
400
|
}
|
|
373
401
|
}
|
|
374
402
|
const ciMetadata = detectCI();
|
|
@@ -391,13 +419,14 @@ async function runCliGrid(opts) {
|
|
|
391
419
|
totalEvals,
|
|
392
420
|
generatedAt: new Date().toISOString(),
|
|
393
421
|
gridVersion: '0.4.0',
|
|
394
|
-
systemPrompt: buildSystemMessage(
|
|
422
|
+
systemPrompt: buildSystemMessage(systemPrompt),
|
|
395
423
|
displayName: opts.displayName,
|
|
396
424
|
category: opts.category,
|
|
397
425
|
websiteUrl: opts.websiteUrl,
|
|
398
426
|
githubUrl: opts.githubUrl,
|
|
399
427
|
...ciMetadata,
|
|
400
428
|
taskSuiteHash,
|
|
429
|
+
taskSuiteContent: opts.taskSuiteContent,
|
|
401
430
|
};
|
|
402
431
|
printReportSummary(report);
|
|
403
432
|
return report;
|
|
@@ -406,7 +435,7 @@ function printReportSummary(report) {
|
|
|
406
435
|
console.log(`\n--- ${report.cli} Summary ---`);
|
|
407
436
|
console.log(`Tasks: ${report.taskCount}${report.totalEvals ? `, Evals: ${report.totalEvals}` : ''}`);
|
|
408
437
|
for (const mr of report.modelResults) {
|
|
409
|
-
console.log(` ${mr.displayName} [${mr.
|
|
438
|
+
console.log(` ${mr.displayName} [${mr.contextMode}]: pass=${(mr.passRate * 100).toFixed(0)}% avgTurns=${mr.avgTurnsToSuccess.toFixed(1)} avgTokens=${Math.round(mr.avgTotalTokens)}`);
|
|
410
439
|
// Per-task breakdown when repeats are used
|
|
411
440
|
if (report.totalEvals && report.totalEvals > report.taskCount) {
|
|
412
441
|
const byTask = new Map();
|