@cliwatch/cli-bench 0.5.4 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/CHANGELOG.md +12 -0
  2. package/dist/client/types.gen.d.ts +12 -0
  3. package/dist/client/types.gen.d.ts.map +1 -1
  4. package/dist/client/zod.gen.d.ts +8 -0
  5. package/dist/client/zod.gen.d.ts.map +1 -1
  6. package/dist/client/zod.gen.js +9 -0
  7. package/dist/client/zod.gen.js.map +1 -1
  8. package/dist/config.d.ts +2 -2
  9. package/dist/config.d.ts.map +1 -1
  10. package/dist/config.js +8 -7
  11. package/dist/config.js.map +1 -1
  12. package/dist/help-loader.d.ts +6 -2
  13. package/dist/help-loader.d.ts.map +1 -1
  14. package/dist/help-loader.js +7 -77
  15. package/dist/help-loader.js.map +1 -1
  16. package/dist/index.js +12 -8
  17. package/dist/index.js.map +1 -1
  18. package/dist/init.js +1 -1
  19. package/dist/models.d.ts +5 -3
  20. package/dist/models.d.ts.map +1 -1
  21. package/dist/models.js +1 -1
  22. package/dist/models.js.map +1 -1
  23. package/dist/project.d.ts +1 -0
  24. package/dist/project.d.ts.map +1 -1
  25. package/dist/project.js +3 -2
  26. package/dist/project.js.map +1 -1
  27. package/dist/prompt.d.ts +9 -8
  28. package/dist/prompt.d.ts.map +1 -1
  29. package/dist/prompt.js +30 -74
  30. package/dist/prompt.js.map +1 -1
  31. package/dist/providers.d.ts.map +1 -1
  32. package/dist/providers.js +1 -0
  33. package/dist/providers.js.map +1 -1
  34. package/dist/runner.d.ts +5 -1
  35. package/dist/runner.d.ts.map +1 -1
  36. package/dist/runner.js +79 -50
  37. package/dist/runner.js.map +1 -1
  38. package/dist/schemas.d.ts +1 -1
  39. package/dist/schemas.js +1 -1
  40. package/dist/schemas.js.map +1 -1
  41. package/dist/suite-generator.d.ts.map +1 -1
  42. package/dist/suite-generator.js +5 -4
  43. package/dist/suite-generator.js.map +1 -1
  44. package/package.json +1 -1
package/dist/models.d.ts CHANGED
@@ -4,7 +4,8 @@
4
4
  * Assertion-based evaluation: tasks define assertions that are checked
5
5
  * against the agent's execution trace.
6
6
  */
7
- export type HelpMode = 'injected' | 'discoverable' | 'none';
7
+ export type ContextMode = 'zero-shot' | 'help' | 'docs';
8
+ export declare const CONTEXT_MODES: ContextMode[];
8
9
  export type Assertion = {
9
10
  output_contains: string;
10
11
  } | {
@@ -77,7 +78,7 @@ export interface ConfigFile {
77
78
  website_url?: string;
78
79
  github_url?: string;
79
80
  providers?: string[];
80
- help_modes?: string[];
81
+ context?: string[];
81
82
  system_prompt?: string;
82
83
  concurrency?: number;
83
84
  workdir?: string;
@@ -115,7 +116,7 @@ export interface ModelResult {
115
116
  provider: Provider;
116
117
  modelId: string;
117
118
  displayName: string;
118
- helpMode: HelpMode;
119
+ contextMode: ContextMode;
119
120
  taskResults: TaskEval[];
120
121
  passRate: number;
121
122
  avgTurnsToSuccess: number;
@@ -162,6 +163,7 @@ export interface GridReport {
162
163
  repository?: string;
163
164
  tags?: string[];
164
165
  taskSuiteHash?: string;
166
+ taskSuiteContent?: string;
165
167
  thresholdResults?: ThresholdCheckResult;
166
168
  }
167
169
  export interface HelpCache {
@@ -1 +1 @@
1
- {"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,MAAM,MAAM,QAAQ,GAAG,UAAU,GAAG,cAAc,GAAG,MAAM,CAAC;AAM5D,MAAM,MAAM,SAAS,GACjB;IAAE,eAAe,EAAE,MAAM,CAAA;CAAE,GAC3B;IAAE,aAAa,EAAE,MAAM,CAAA;CAAE,GACzB;IAAE,cAAc,EAAE,MAAM,CAAA;CAAE,GAC1B;IAAE,SAAS,EAAE,MAAM,CAAA;CAAE,GACrB;IAAE,WAAW,EAAE,MAAM,CAAA;CAAE,GACvB;IAAE,aAAa,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,GACjD;IAAE,GAAG,EAAE,MAAM,CAAA;CAAE,GACf;IAAE,OAAO,EAAE,MAAM,CAAA;CAAE,GACnB;IAAE,SAAS,EAAE;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,GAAG,CAAC,EAAE,MAAM,CAAC;QAAC,GAAG,CAAC,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,GAC9D;IAAE,MAAM,EAAE;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,eAAe,CAAC,EAAE,MAAM,CAAC;QAAC,aAAa,CAAC,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,CAAC;AAElF,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,SAAS,CAAC;IACrB,MAAM,EAAE,OAAO,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,IAAI;IACnB,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,SAAS,EAAE,CAAC;IACpB,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,SAAS;IACxB,GAAG,EAAE,MAAM,CAAC;IACZ,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,IAAI,EAAE,CAAC;CACf;AAMD,MAAM,MAAM,UAAU,GAAG,MAAM,GAAG,QAAQ,GAAG,OAAO,CAAC;AAErD,MAAM,WAAW,gBAAgB;IAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,OAAO,GAAG,eAAe,CAAC;CACtC;AAED,MAAM,WAAW,UAAU;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,UAAU,CAAC;IACpB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,gBAAgB,CAAC;IAC9B,KAAK,EAAE,CAAC,IAAI,GAAG,MAAM,CAAC,EAAE,CAAC;CAC1B;AAMD,MAAM,MAAM,QAAQ,GAAG,MAAM,CAAC;AAE9B,MAAM,WAAW,WAAW;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;CACnB;AAMD,MAAM,WAAW,QAAQ;IACvB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,OAAO,CAAC;IAChB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,EAAE,eAAe,EAAE,CAAC;IACpC,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,UAAU,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,iBAAiB,CAAC,EAAE,OAAO,EAAE,CAAC;IAC9B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACzC,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAMD,MAAM,WAAW,WAAW;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,QAAQ,CAAC;IACnB,WAAW,EAAE,QAAQ,EAAE,CAAC;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,MAAM,EAAE,OAAO,CAAC;CACjB;AAED,MAAM,WAAW,oBAAoB;IACnC,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,eAAe,EAAE,CAAC;IAC3B,QAAQ,EAAE,OAAO,GAAG,eAAe,CAAC;CACrC;AAED,MAAM,WAAW,UAAU;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gBAAgB,EAAE,MAAM,CAAC;IACzB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,WAAW,EAAE,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,CAAC,EAAE,oBAAoB,CAAC;CACzC;AAMD,MAAM,WAAW,SAAS;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB"}
1
+ {"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,MAAM,MAAM,WAAW,GAAG,WAAW,GAAG,MAAM,GAAG,MAAM,CAAC;AACxD,eAAO,MAAM,aAAa,EAAE,WAAW,EAAkC,CAAC;AAM1E,MAAM,MAAM,SAAS,GACjB;IAAE,eAAe,EAAE,MAAM,CAAA;CAAE,GAC3B;IAAE,aAAa,EAAE,MAAM,CAAA;CAAE,GACzB;IAAE,cAAc,EAAE,MAAM,CAAA;CAAE,GAC1B;IAAE,SAAS,EAAE,MAAM,CAAA;CAAE,GACrB;IAAE,WAAW,EAAE,MAAM,CAAA;CAAE,GACvB;IAAE,aAAa,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,GACjD;IAAE,GAAG,EAAE,MAAM,CAAA;CAAE,GACf;IAAE,OAAO,EAAE,MAAM,CAAA;CAAE,GACnB;IAAE,SAAS,EAAE;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,GAAG,CAAC,EAAE,MAAM,CAAC;QAAC,GAAG,CAAC,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,GAC9D;IAAE,MAAM,EAAE;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,eAAe,CAAC,EAAE,MAAM,CAAC;QAAC,aAAa,CAAC,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,CAAC;AAElF,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,SAAS,CAAC;IACrB,MAAM,EAAE,OAAO,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,IAAI;IACnB,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,SAAS,EAAE,CAAC;IACpB,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,SAAS;IACxB,GAAG,EAAE,MAAM,CAAC;IACZ,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,IAAI,EAAE,CAAC;CACf;AAMD,MAAM,MAAM,UAAU,GAAG,MAAM,GAAG,QAAQ,GAAG,OAAO,CAAC;AAErD,MAAM,WAAW,gBAAgB;IAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,OAAO,GAAG,eAAe,CAAC;CACtC;AAED,MAAM,WAAW,UAAU;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,UAAU,CAAC;IACpB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,gBAAgB,CAAC;IAC9B,KAAK,EAAE,CAAC,IAAI,GAAG,MAAM,CAAC,EAAE,CAAC;CAC1B;AAMD,MAAM,MAAM,QAAQ,GAAG,MAAM,CAAC;AAE9B,MAAM,WAAW,WAAW;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;CACnB;AAMD,MAAM,WAAW,QAAQ;IACvB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,OAAO,CAAC;IAChB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,EAAE,eAAe,EAAE,CAAC;IACpC,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,UAAU,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,iBAAiB,CAAC,EAAE,OAAO,EAAE,CAAC;IAC9B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACzC,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAMD,MAAM,WAAW,WAAW;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,WAAW,CAAC;IACzB,WAAW,EAAE,QAAQ,EAAE,CAAC;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,MAAM,EAAE,OAAO,CAAC;CACjB;AAED,MAAM,WAAW,oBAAoB;IACnC,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,eAAe,EAAE,CAAC;IAC3B,QAAQ,EAAE,OAAO,GAAG,eAAe,CAAC;CACrC;AAED,MAAM,WAAW,UAAU;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gBAAgB,EAAE,MAAM,CAAC;IACzB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,WAAW,EAAE,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,gBAAgB,CAAC,EAAE,oBAAoB,CAAC;CACzC;AAMD,MAAM,WAAW,SAAS;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB"}
package/dist/models.js CHANGED
@@ -4,5 +4,5 @@
4
4
  * Assertion-based evaluation: tasks define assertions that are checked
5
5
  * against the agent's execution trace.
6
6
  */
7
- export {};
7
+ export const CONTEXT_MODES = ['zero-shot', 'help', 'docs'];
8
8
  //# sourceMappingURL=models.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"models.js","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG"}
1
+ {"version":3,"file":"models.js","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAOH,MAAM,CAAC,MAAM,aAAa,GAAkB,CAAC,WAAW,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC"}
package/dist/project.d.ts CHANGED
@@ -22,5 +22,6 @@ export declare function resolveTaskRefs(tasks: (Task | string)[], baseDir: strin
22
22
  export declare function loadProject(configPath: string): Promise<{
23
23
  config: ConfigFile;
24
24
  tasks: Task[];
25
+ taskSuiteContent: string;
25
26
  }>;
26
27
  //# sourceMappingURL=project.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"project.d.ts","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAOH,OAAO,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAIpD;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,YAAY,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAqBrF;AAED;;GAEG;AACH,wBAAsB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAItE;AAED;;;GAGG;AACH,wBAAsB,eAAe,CACnC,KAAK,EAAE,CAAC,IAAI,GAAG,MAAM,CAAC,EAAE,EACxB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,IAAI,EAAE,CAAC,CA6CjB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC;IAAE,MAAM,EAAE,UAAU,CAAC;IAAC,KAAK,EAAE,IAAI,EAAE,CAAA;CAAE,CAAC,CAKpG"}
1
+ {"version":3,"file":"project.d.ts","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAOH,OAAO,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAIpD;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,YAAY,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAqBrF;AAED;;GAEG;AACH,wBAAsB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAItE;AAED;;;GAGG;AACH,wBAAsB,eAAe,CACnC,KAAK,EAAE,CAAC,IAAI,GAAG,MAAM,CAAC,EAAE,EACxB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,IAAI,EAAE,CAAC,CA6CjB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC;IAAE,MAAM,EAAE,UAAU,CAAC;IAAC,KAAK,EAAE,IAAI,EAAE,CAAC;IAAC,gBAAgB,EAAE,MAAM,CAAA;CAAE,CAAC,CAM9H"}
package/dist/project.js CHANGED
@@ -5,7 +5,7 @@
5
5
  import { readFile, access } from 'node:fs/promises';
6
6
  import { join, dirname, resolve } from 'node:path';
7
7
  import { glob } from 'node:fs/promises';
8
- import { parse as parseYaml } from 'yaml';
8
+ import { parse as parseYaml, stringify as stringifyYaml } from 'yaml';
9
9
  import { ConfigFileSchema, TaskFileSchema, TaskSchema } from './schemas.js';
10
10
  const CONFIG_FILENAMES = ['cli-bench.yaml', 'cli-bench.yml'];
11
11
  /**
@@ -96,6 +96,7 @@ export async function loadProject(configPath) {
96
96
  const config = await loadConfigFile(configPath);
97
97
  const baseDir = dirname(configPath);
98
98
  const tasks = await resolveTaskRefs(config.tasks, baseDir);
99
- return { config, tasks };
99
+ const taskSuiteContent = stringifyYaml(tasks);
100
+ return { config, tasks, taskSuiteContent };
100
101
  }
101
102
  //# sourceMappingURL=project.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"project.js","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAC;AACxC,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,MAAM,MAAM,CAAC;AAC1C,OAAO,EAAE,gBAAgB,EAAE,cAAc,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAG5E,MAAM,gBAAgB,GAAG,CAAC,gBAAgB,EAAE,eAAe,CAAC,CAAC;AAE7D;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,YAAqB;IAC3D,IAAI,YAAY,EAAE,CAAC;QACjB,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC;YAC3B,OAAO,OAAO,CAAC,YAAY,CAAC,CAAC;QAC/B,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,gBAAgB,EAAE,CAAC;QACpC,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;QAChC,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;YACxB,OAAO,SAAS,CAAC;QACnB,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,IAAY;IAC/C,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IAC1C,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;IAC9B,OAAO,gBAAgB,CAAC,KAAK,CAAC,MAAM,CAAe,CAAC;AACtD,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,KAAwB,EACxB,OAAe;IAEf,MAAM,QAAQ,GAAW,EAAE,CAAC;IAC5B,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAElC,KAAK,MAAM,KAAK,IAAI,KAAK,EAAE,CAAC;QAC1B,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC9B,oBAAoB;YACpB,MAAM,GAAG,GAAG,KAAK,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC;YAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAEnC,gCAAgC;YAChC,MAAM,KAAK,GAAa,EAAE,CAAC;YAC3B,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBACtB,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;oBACxC,IAAI,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;wBACtD,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBACpB,CAAC;gBACH,CAAC;gBACD,KAAK,CAAC,IAAI,EAAE,CAAC;YACf,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACtB,CAAC;YAED,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE,CAAC;gBAC7B,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;gBAC9C,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;gBAC9B,MAAM,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,MAAM,CAAW,CAAC;gBACzD,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;oBAC7B,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;wBAC1B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;wBACrB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACtB,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;aAAM,CAAC;YACN,wCAAwC;YACxC,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,CAAS,CAAC;YAC7C,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;gBAC1B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACrB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,UAAkB;IAClD,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,UAAU,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,MAAM,KAAK,GAAG,MAAM,eAAe,CAAC,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAC3D,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC;AAC3B,CAAC"}
1
+ {"version":3,"file":"project.js","sourceRoot":"","sources":["../src/project.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAC;AACxC,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,SAAS,IAAI,aAAa,EAAE,MAAM,MAAM,CAAC;AACtE,OAAO,EAAE,gBAAgB,EAAE,cAAc,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAG5E,MAAM,gBAAgB,GAAG,CAAC,gBAAgB,EAAE,eAAe,CAAC,CAAC;AAE7D;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,YAAqB;IAC3D,IAAI,YAAY,EAAE,CAAC;QACjB,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC;YAC3B,OAAO,OAAO,CAAC,YAAY,CAAC,CAAC;QAC/B,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,gBAAgB,EAAE,CAAC;QACpC,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;QAChC,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;YACxB,OAAO,SAAS,CAAC;QACnB,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,IAAY;IAC/C,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IAC1C,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;IAC9B,OAAO,gBAAgB,CAAC,KAAK,CAAC,MAAM,CAAe,CAAC;AACtD,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,KAAwB,EACxB,OAAe;IAEf,MAAM,QAAQ,GAAW,EAAE,CAAC;IAC5B,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAElC,KAAK,MAAM,KAAK,IAAI,KAAK,EAAE,CAAC;QAC1B,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC9B,oBAAoB;YACpB,MAAM,GAAG,GAAG,KAAK,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC;YAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAEnC,gCAAgC;YAChC,MAAM,KAAK,GAAa,EAAE,CAAC;YAC3B,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBACtB,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;oBACxC,IAAI,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;wBACtD,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBACpB,CAAC;gBACH,CAAC;gBACD,KAAK,CAAC,IAAI,EAAE,CAAC;YACf,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACtB,CAAC;YAED,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE,CAAC;gBAC7B,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;gBAC9C,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;gBAC9B,MAAM,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,MAAM,CAAW,CAAC;gBACzD,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;oBAC7B,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;wBAC1B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;wBACrB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACtB,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;aAAM,CAAC;YACN,wCAAwC;YACxC,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,CAAS,CAAC;YAC7C,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;gBAC1B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACrB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,UAAkB;IAClD,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,UAAU,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,MAAM,KAAK,GAAG,MAAM,eAAe,CAAC,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAC3D,MAAM,gBAAgB,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC;IAC9C,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,gBAAgB,EAAE,CAAC;AAC7C,CAAC"}
package/dist/prompt.d.ts CHANGED
@@ -1,12 +1,13 @@
1
1
  /**
2
- * Builds prompts from help text + task definition.
2
+ * Builds prompts for CLI benchmark agents.
3
3
  *
4
- * Three help modes:
5
- * - injected: help text included in prompt (current behavior)
6
- * - discoverable: agent must run --help to discover commands
7
- * - none: agent relies on training knowledge only
4
+ * One unified system message for all context modes.
5
+ * The user message varies by context mode:
6
+ * - zero-shot: CLI name + task intent only
7
+ * - help: CLI name + top-level --help output + task intent
8
+ * - docs: CLI name + documentation contents + task intent
8
9
  */
9
- import type { HelpMode, Task } from './models.js';
10
- export declare function buildSystemMessage(helpMode: HelpMode, customPrompt?: string): string;
11
- export declare function buildUserMessage(cliName: string, helpTexts: Record<string, string> | null, task: Task, helpMode: HelpMode): string;
10
+ import type { ContextMode, Task } from './models.js';
11
+ export declare function buildSystemMessage(customPrompt?: string): string;
12
+ export declare function buildUserMessage(cliName: string, task: Task, contextMode: ContextMode, contextPayload?: string | null): string;
12
13
  //# sourceMappingURL=prompt.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"prompt.d.ts","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,aAAa,CAAC;AA6BlD,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,EAAE,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,CAkBpF;AAED,wBAAgB,gBAAgB,CAC9B,OAAO,EAAE,MAAM,EACf,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,IAAI,EACxC,IAAI,EAAE,IAAI,EACV,QAAQ,EAAE,QAAQ,GACjB,MAAM,CAaR"}
1
+ {"version":3,"file":"prompt.d.ts","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,IAAI,EAAE,MAAM,aAAa,CAAC;AASrD,wBAAgB,kBAAkB,CAAC,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,CAKhE;AAED,wBAAgB,gBAAgB,CAC9B,OAAO,EAAE,MAAM,EACf,IAAI,EAAE,IAAI,EACV,WAAW,EAAE,WAAW,EACxB,cAAc,CAAC,EAAE,MAAM,GAAG,IAAI,GAC7B,MAAM,CA+BR"}
package/dist/prompt.js CHANGED
@@ -1,96 +1,52 @@
1
1
  /**
2
- * Builds prompts from help text + task definition.
2
+ * Builds prompts for CLI benchmark agents.
3
3
  *
4
- * Three help modes:
5
- * - injected: help text included in prompt (current behavior)
6
- * - discoverable: agent must run --help to discover commands
7
- * - none: agent relies on training knowledge only
4
+ * One unified system message for all context modes.
5
+ * The user message varies by context mode:
6
+ * - zero-shot: CLI name + task intent only
7
+ * - help: CLI name + top-level --help output + task intent
8
+ * - docs: CLI name + documentation contents + task intent
8
9
  */
9
10
  const AGENT_SYSTEM_MESSAGE = `You are a CLI expert. You have a run_command tool to execute shell commands. Use it to accomplish the user's task.
10
11
 
11
12
  Rules:
12
- - Read the help text carefully to understand available subcommands and flags
13
13
  - Execute commands using the run_command tool
14
14
  - If a command fails, read the error and retry with corrected flags
15
- - Do NOT invent flags that don't exist in the help text
16
15
  - When the task is complete, stop calling tools`;
17
- const AGENT_SYSTEM_MESSAGE_DISCOVERABLE = `You are a CLI expert. You have a run_command tool to execute shell commands. Use it to accomplish the user's task.
18
-
19
- Rules:
20
- - Use <cli> --help and <cli> <subcommand> --help to discover available commands and flags
21
- - Execute commands using the run_command tool
22
- - If a command fails, read the error and retry with corrected flags
23
- - Do NOT invent flags — always check --help first
24
- - When the task is complete, stop calling tools`;
25
- const AGENT_SYSTEM_MESSAGE_NONE = `You are a CLI expert. You have a run_command tool to execute shell commands. Use it to accomplish the user's task.
26
-
27
- Rules:
28
- - Use your training knowledge of CLI tools to construct commands
29
- - Execute commands using the run_command tool
30
- - If a command fails, read the error and retry with corrected flags
31
- - Do NOT run --help commands — rely on your knowledge
32
- - When the task is complete, stop calling tools`;
33
- export function buildSystemMessage(helpMode, customPrompt) {
34
- let base;
35
- switch (helpMode) {
36
- case 'injected':
37
- base = AGENT_SYSTEM_MESSAGE;
38
- break;
39
- case 'discoverable':
40
- base = AGENT_SYSTEM_MESSAGE_DISCOVERABLE;
41
- break;
42
- case 'none':
43
- base = AGENT_SYSTEM_MESSAGE_NONE;
44
- break;
45
- }
16
+ export function buildSystemMessage(customPrompt) {
46
17
  if (customPrompt) {
47
- return `${base}\n\n${customPrompt}`;
18
+ return `${AGENT_SYSTEM_MESSAGE}\n\n${customPrompt}`;
48
19
  }
49
- return base;
20
+ return AGENT_SYSTEM_MESSAGE;
50
21
  }
51
- export function buildUserMessage(cliName, helpTexts, task, helpMode) {
52
- if (helpMode === 'injected' && helpTexts) {
53
- const relevantHelp = selectRelevantHelp(cliName, helpTexts, task);
54
- return `CLI: ${cliName}
22
+ export function buildUserMessage(cliName, task, contextMode, contextPayload) {
23
+ switch (contextMode) {
24
+ case 'help':
25
+ if (contextPayload) {
26
+ return `CLI: ${cliName}
55
27
 
56
28
  Help text:
57
- ${relevantHelp}
29
+ $ ${cliName} --help
30
+ ${contextPayload}
58
31
 
59
32
  Task: ${task.intent}`;
60
- }
61
- return `CLI: ${cliName}
33
+ }
34
+ // Fall through to zero-shot if no help available
35
+ return `CLI: ${cliName}\n\nTask: ${task.intent}`;
36
+ case 'docs':
37
+ if (contextPayload) {
38
+ return `CLI: ${cliName}
39
+
40
+ Documentation:
41
+ ${contextPayload}
62
42
 
63
43
  Task: ${task.intent}`;
64
- }
65
- /**
66
- * Select the most relevant help text sections for a task.
67
- * Trims to stay within reasonable context limits (~4K chars).
68
- */
69
- function selectRelevantHelp(cliName, helpTexts, task) {
70
- const sections = [];
71
- const maxChars = 4000;
72
- let totalChars = 0;
73
- // Always include root help
74
- const rootHelp = helpTexts[''];
75
- if (rootHelp) {
76
- sections.push(`$ ${cliName} --help\n${rootHelp}`);
77
- totalChars += rootHelp.length;
78
- }
79
- // Add help sections that match task keywords
80
- if (totalChars < maxChars * 0.7) {
81
- const taskWords = task.intent.toLowerCase().split(/\s+/);
82
- for (const [key, help] of Object.entries(helpTexts)) {
83
- if (key === '' || sections.some((s) => s.includes(`${cliName} ${key} --help`))) {
84
- continue;
85
- }
86
- const keyWords = key.split(' ');
87
- const matches = keyWords.some((kw) => taskWords.includes(kw));
88
- if (matches && totalChars + help.length < maxChars) {
89
- sections.push(`$ ${cliName} ${key} --help\n${help}`);
90
- totalChars += help.length;
91
44
  }
92
- }
45
+ // Fall through to zero-shot if no docs available
46
+ return `CLI: ${cliName}\n\nTask: ${task.intent}`;
47
+ case 'zero-shot':
48
+ default:
49
+ return `CLI: ${cliName}\n\nTask: ${task.intent}`;
93
50
  }
94
- return sections.join('\n\n');
95
51
  }
96
52
  //# sourceMappingURL=prompt.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,MAAM,oBAAoB,GAAG;;;;;;;gDAOmB,CAAC;AAEjD,MAAM,iCAAiC,GAAG;;;;;;;gDAOM,CAAC;AAEjD,MAAM,yBAAyB,GAAG;;;;;;;gDAOc,CAAC;AAEjD,MAAM,UAAU,kBAAkB,CAAC,QAAkB,EAAE,YAAqB;IAC1E,IAAI,IAAY,CAAC;IACjB,QAAQ,QAAQ,EAAE,CAAC;QACjB,KAAK,UAAU;YACb,IAAI,GAAG,oBAAoB,CAAC;YAC5B,MAAM;QACR,KAAK,cAAc;YACjB,IAAI,GAAG,iCAAiC,CAAC;YACzC,MAAM;QACR,KAAK,MAAM;YACT,IAAI,GAAG,yBAAyB,CAAC;YACjC,MAAM;IACV,CAAC;IAED,IAAI,YAAY,EAAE,CAAC;QACjB,OAAO,GAAG,IAAI,OAAO,YAAY,EAAE,CAAC;IACtC,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,UAAU,gBAAgB,CAC9B,OAAe,EACf,SAAwC,EACxC,IAAU,EACV,QAAkB;IAElB,IAAI,QAAQ,KAAK,UAAU,IAAI,SAAS,EAAE,CAAC;QACzC,MAAM,YAAY,GAAG,kBAAkB,CAAC,OAAO,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;QAClE,OAAO,QAAQ,OAAO;;;EAGxB,YAAY;;QAEN,IAAI,CAAC,MAAM,EAAE,CAAC;IACpB,CAAC;IACD,OAAO,QAAQ,OAAO;;QAEhB,IAAI,CAAC,MAAM,EAAE,CAAC;AACtB,CAAC;AAED;;;GAGG;AACH,SAAS,kBAAkB,CACzB,OAAe,EACf,SAAiC,EACjC,IAAU;IAEV,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,QAAQ,GAAG,IAAI,CAAC;IACtB,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,2BAA2B;IAC3B,MAAM,QAAQ,GAAG,SAAS,CAAC,EAAE,CAAC,CAAC;IAC/B,IAAI,QAAQ,EAAE,CAAC;QACb,QAAQ,CAAC,IAAI,CAAC,KAAK,OAAO,YAAY,QAAQ,EAAE,CAAC,CAAC;QAClD,UAAU,IAAI,QAAQ,CAAC,MAAM,CAAC;IAChC,CAAC;IAED,6CAA6C;IAC7C,IAAI,UAAU,GAAG,QAAQ,GAAG,GAAG,EAAE,CAAC;QAChC,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACzD,KAAK,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;YACpD,IAAI,GAAG,KAAK,EAAE,IAAI,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,GAAG,OAAO,IAAI,GAAG,SAAS,CAAC,CAAC,EAAE,CAAC;gBAC/E,SAAS;YACX,CAAC;YACD,MAAM,QAAQ,GAAG,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAChC,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,SAAS,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC;YAC9D,IAAI,OAAO,IAAI,UAAU,GAAG,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;gBACnD,QAAQ,CAAC,IAAI,CAAC,KAAK,OAAO,IAAI,GAAG,YAAY,IAAI,EAAE,CAAC,CAAC;gBACrD,UAAU,IAAI,IAAI,CAAC,MAAM,CAAC;YAC5B,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC/B,CAAC"}
1
+ {"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAIH,MAAM,oBAAoB,GAAG;;;;;gDAKmB,CAAC;AAEjD,MAAM,UAAU,kBAAkB,CAAC,YAAqB;IACtD,IAAI,YAAY,EAAE,CAAC;QACjB,OAAO,GAAG,oBAAoB,OAAO,YAAY,EAAE,CAAC;IACtD,CAAC;IACD,OAAO,oBAAoB,CAAC;AAC9B,CAAC;AAED,MAAM,UAAU,gBAAgB,CAC9B,OAAe,EACf,IAAU,EACV,WAAwB,EACxB,cAA8B;IAE9B,QAAQ,WAAW,EAAE,CAAC;QACpB,KAAK,MAAM;YACT,IAAI,cAAc,EAAE,CAAC;gBACnB,OAAO,QAAQ,OAAO;;;IAG1B,OAAO;EACT,cAAc;;QAER,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,CAAC;YACD,iDAAiD;YACjD,OAAO,QAAQ,OAAO,aAAa,IAAI,CAAC,MAAM,EAAE,CAAC;QAEnD,KAAK,MAAM;YACT,IAAI,cAAc,EAAE,CAAC;gBACnB,OAAO,QAAQ,OAAO;;;EAG5B,cAAc;;QAER,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,CAAC;YACD,iDAAiD;YACjD,OAAO,QAAQ,OAAO,aAAa,IAAI,CAAC,MAAM,EAAE,CAAC;QAEnD,KAAK,WAAW,CAAC;QACjB;YACE,OAAO,QAAQ,OAAO,aAAa,IAAI,CAAC,MAAM,EAAE,CAAC;IACrD,CAAC;AACH,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"providers.d.ts","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,MAAM,WAAW,UAAU;IACzB,EAAE,EAAE,MAAM,CAAC;IACX,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,eAAO,MAAM,MAAM,EAAE,SAAS,UAAU,EAa9B,CAAC;AAEX,wBAAgB,QAAQ,CAAC,OAAO,EAAE,MAAM,8CAEvC;AAED,wBAAgB,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,UAAU,GAAG,SAAS,CAErE;AAED,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,UAAU,EAAE,CAG7D;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,WAAW,EAAE,MAAM,EAAE,GAAG,UAAU,EAAE,CAWpE;AAED;;;GAGG;AACH,wBAAgB,kBAAkB,IAAI,IAAI,CAMzC"}
1
+ {"version":3,"file":"providers.d.ts","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,MAAM,WAAW,UAAU;IACzB,EAAE,EAAE,MAAM,CAAC;IACX,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,eAAO,MAAM,MAAM,EAAE,SAAS,UAAU,EAc9B,CAAC;AAEX,wBAAgB,QAAQ,CAAC,OAAO,EAAE,MAAM,8CAEvC;AAED,wBAAgB,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,UAAU,GAAG,SAAS,CAErE;AAED,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,UAAU,EAAE,CAG7D;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,WAAW,EAAE,MAAM,EAAE,GAAG,UAAU,EAAE,CAWpE;AAED;;;GAGG;AACH,wBAAgB,kBAAkB,IAAI,IAAI,CAMzC"}
package/dist/providers.js CHANGED
@@ -7,6 +7,7 @@
7
7
  import { gateway } from 'ai';
8
8
  export const MODELS = [
9
9
  { id: 'anthropic/claude-opus-4.6', displayName: 'Claude Opus 4.6', provider: 'anthropic' },
10
+ { id: 'anthropic/claude-sonnet-4.6', displayName: 'Claude Sonnet 4.6', provider: 'anthropic' },
10
11
  { id: 'anthropic/claude-sonnet-4-20250514', displayName: 'Claude Sonnet 4', provider: 'anthropic' },
11
12
  { id: 'anthropic/claude-haiku-4.5', displayName: 'Claude Haiku 4.5', provider: 'anthropic' },
12
13
  { id: 'anthropic/claude-haiku-4-5-20251001', displayName: 'Claude Haiku 4.5', provider: 'anthropic' },
@@ -1 +1 @@
1
- {"version":3,"file":"providers.js","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAQ7B,MAAM,CAAC,MAAM,MAAM,GAA0B;IAC3C,EAAE,EAAE,EAAE,2BAA2B,EAAE,WAAW,EAAE,iBAAiB,EAAE,QAAQ,EAAE,WAAW,EAAE;IAC1F,EAAE,EAAE,EAAE,oCAAoC,EAAE,WAAW,EAAE,iBAAiB,EAAE,QAAQ,EAAE,WAAW,EAAE;IACnG,EAAE,EAAE,EAAE,4BAA4B,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,WAAW,EAAE;IAC5F,EAAE,EAAE,EAAE,qCAAqC,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,WAAW,EAAE;IACrG,EAAE,EAAE,EAAE,gBAAgB,EAAE,WAAW,EAAE,SAAS,EAAE,QAAQ,EAAE,QAAQ,EAAE;IACpE,EAAE,EAAE,EAAE,eAAe,EAAE,WAAW,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClE,EAAE,EAAE,EAAE,oBAAoB,EAAE,WAAW,EAAE,aAAa,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAC5E,EAAE,EAAE,EAAE,uBAAuB,EAAE,WAAW,EAAE,gBAAgB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClF,EAAE,EAAE,EAAE,uBAAuB,EAAE,WAAW,EAAE,gBAAgB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClF,EAAE,EAAE,EAAE,yBAAyB,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IACtF,EAAE,EAAE,EAAE,mBAAmB,EAAE,WAAW,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,EAAE;IAC1E,EAAE,EAAE,EAAE,sBAAsB,EAAE,WAAW,EAAE,cAAc,EAAE,QAAQ,EAAE,SAAS,EAAE;CACxE,CAAC;AAEX,MAAM,UAAU,QAAQ,CAAC,OAAe;IACtC,OAAO,OAAO,CAAC,OAAO,CAAC,CAAC;AAC1B,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,OAAe;IAC3C,OAAO,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,CAAC;AAC9C,CAAC;AAED,MAAM,UAAU,YAAY,CAAC,QAAkB;IAC7C,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,GAAG,MAAM,CAAC,CAAC;IAC9C,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AACvD,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,WAAqB;IACpD,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE;QAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QACjD,IAAI,QAAQ;YAAE,OAAO,EAAE,GAAG,QAAQ,EAAE,CAAC;QACrC,MAAM,CAAC,QAAQ,EAAE,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC1C,OAAO;YACL,EAAE;YACF,WAAW,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC;YAC3B,QAAQ,EAAE,QAAkC;SAC7C,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,kBAAkB;IAChC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC,EAAE,CAAC;QACvC,MAAM,IAAI,KAAK,CACb,2IAA2I,CAC5I,CAAC;IACJ,CAAC;AACH,CAAC"}
1
+ {"version":3,"file":"providers.js","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAQ7B,MAAM,CAAC,MAAM,MAAM,GAA0B;IAC3C,EAAE,EAAE,EAAE,2BAA2B,EAAE,WAAW,EAAE,iBAAiB,EAAE,QAAQ,EAAE,WAAW,EAAE;IAC1F,EAAE,EAAE,EAAE,6BAA6B,EAAE,WAAW,EAAE,mBAAmB,EAAE,QAAQ,EAAE,WAAW,EAAE;IAC9F,EAAE,EAAE,EAAE,oCAAoC,EAAE,WAAW,EAAE,iBAAiB,EAAE,QAAQ,EAAE,WAAW,EAAE;IACnG,EAAE,EAAE,EAAE,4BAA4B,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,WAAW,EAAE;IAC5F,EAAE,EAAE,EAAE,qCAAqC,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,WAAW,EAAE;IACrG,EAAE,EAAE,EAAE,gBAAgB,EAAE,WAAW,EAAE,SAAS,EAAE,QAAQ,EAAE,QAAQ,EAAE;IACpE,EAAE,EAAE,EAAE,eAAe,EAAE,WAAW,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClE,EAAE,EAAE,EAAE,oBAAoB,EAAE,WAAW,EAAE,aAAa,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAC5E,EAAE,EAAE,EAAE,uBAAuB,EAAE,WAAW,EAAE,gBAAgB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClF,EAAE,EAAE,EAAE,uBAAuB,EAAE,WAAW,EAAE,gBAAgB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAClF,EAAE,EAAE,EAAE,yBAAyB,EAAE,WAAW,EAAE,kBAAkB,EAAE,QAAQ,EAAE,QAAQ,EAAE;IACtF,EAAE,EAAE,EAAE,mBAAmB,EAAE,WAAW,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,EAAE;IAC1E,EAAE,EAAE,EAAE,sBAAsB,EAAE,WAAW,EAAE,cAAc,EAAE,QAAQ,EAAE,SAAS,EAAE;CACxE,CAAC;AAEX,MAAM,UAAU,QAAQ,CAAC,OAAe;IACtC,OAAO,OAAO,CAAC,OAAO,CAAC,CAAC;AAC1B,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,OAAe;IAC3C,OAAO,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,CAAC;AAC9C,CAAC;AAED,MAAM,UAAU,YAAY,CAAC,QAAkB;IAC7C,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,GAAG,MAAM,CAAC,CAAC;IAC9C,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AACvD,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,WAAqB;IACpD,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE;QAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QACjD,IAAI,QAAQ;YAAE,OAAO,EAAE,GAAG,QAAQ,EAAE,CAAC;QACrC,MAAM,CAAC,QAAQ,EAAE,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC1C,OAAO;YACL,EAAE;YACF,WAAW,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC;YAC3B,QAAQ,EAAE,QAAkC;SAC7C,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,kBAAkB;IAChC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC,EAAE,CAAC;QACvC,MAAM,IAAI,KAAK,CACb,2IAA2I,CAC5I,CAAC;IACJ,CAAC;AACH,CAAC"}
package/dist/runner.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Orchestrates the task x model x helpMode matrix with tool-calling agents.
2
+ * Orchestrates the task x model x contextMode matrix with tool-calling agents.
3
3
  *
4
4
  * For each task:
5
5
  * - Create temp workdir (or use configured one)
@@ -35,6 +35,10 @@ export interface RunGridOptions {
35
35
  category?: string;
36
36
  websiteUrl?: string;
37
37
  githubUrl?: string;
38
+ /** Serialized YAML of the resolved task suite. */
39
+ taskSuiteContent?: string;
40
+ /** Directory containing the config file (for resolving docs.md). */
41
+ configDir?: string;
38
42
  }
39
43
  export declare function runGrid(opts: RunGridOptions): Promise<GridReport[]>;
40
44
  export declare function uploadReport(report: GridReport, backendUrl: string, apiKey: string): Promise<void>;
@@ -1 +1 @@
1
- {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AASH,OAAO,KAAK,EACV,SAAS,EACT,IAAI,EACJ,UAAU,EAKX,MAAM,aAAa,CAAC;AACrB,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAC1C,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AASjD,wBAAsB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,CAKxE;AAED,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAc9B;AAkND,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,8CAA8C;IAC9C,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC;IACf,iDAAiD;IACjD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,+CAA+C;IAC/C,MAAM,CAAC,EAAE,UAAU,EAAE,CAAC;IACtB,wDAAwD;IACxD,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,0DAA0D;IAC1D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uDAAuD;IACvD,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,mDAAmD;IACnD,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,wBAAsB,OAAO,CAAC,IAAI,EAAE,cAAc,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC,CAkEzE;AA4ND,wBAAsB,YAAY,CAChC,MAAM,EAAE,UAAU,EAClB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,GACb,OAAO,CAAC,IAAI,CAAC,CAsBf"}
1
+ {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AASH,OAAO,KAAK,EACV,SAAS,EACT,IAAI,EACJ,UAAU,EAKX,MAAM,aAAa,CAAC;AACrB,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAC1C,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AASjD,wBAAsB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,CAKxE;AAED,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAc9B;AAiOD,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,8CAA8C;IAC9C,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC;IACf,iDAAiD;IACjD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,+CAA+C;IAC/C,MAAM,CAAC,EAAE,UAAU,EAAE,CAAC;IACtB,wDAAwD;IACxD,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,0DAA0D;IAC1D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uDAAuD;IACvD,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,mDAAmD;IACnD,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,kDAAkD;IAClD,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,oEAAoE;IACpE,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,wBAAsB,OAAO,CAAC,IAAI,EAAE,cAAc,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC,CAoEzE;AAkPD,wBAAsB,YAAY,CAChC,MAAM,EAAE,UAAU,EAClB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,GACb,OAAO,CAAC,IAAI,CAAC,CAsBf"}
package/dist/runner.js CHANGED
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Orchestrates the task x model x helpMode matrix with tool-calling agents.
2
+ * Orchestrates the task x model x contextMode matrix with tool-calling agents.
3
3
  *
4
4
  * For each task:
5
5
  * - Create temp workdir (or use configured one)
@@ -18,7 +18,7 @@ import { z } from 'zod';
18
18
  import { getModel, filterModels } from './providers.js';
19
19
  import { buildSystemMessage, buildUserMessage } from './prompt.js';
20
20
  import { runAssertions } from './assertions.js';
21
- import { loadHelpFromCache, loadHelpLive } from './help-loader.js';
21
+ import { loadTopLevelHelp } from './help-loader.js';
22
22
  import { execCommand, runSetup } from './exec.js';
23
23
  import { TaskSuiteSchema } from './schemas.js';
24
24
  import { detectCI, computeTaskSuiteHash } from './ci.js';
@@ -143,10 +143,12 @@ async function runAgentTask(model, systemMessage, userMessage, task, workdir) {
143
143
  })),
144
144
  usage: step.usage,
145
145
  }));
146
+ // Cap turnsUsed at max_turns to guard against SDK retry anomalies
147
+ const maxTurns = task.max_turns ?? 5;
146
148
  return {
147
149
  passed: allPassed,
148
150
  assertionResults,
149
- turnsUsed: commandsRun.length,
151
+ turnsUsed: Math.min(commandsRun.length, maxTurns),
150
152
  commandsRun,
151
153
  totalInputTokens,
152
154
  totalOutputTokens,
@@ -162,8 +164,8 @@ async function runAgentTask(model, systemMessage, userMessage, task, workdir) {
162
164
  return {
163
165
  passed: false,
164
166
  assertionResults: [],
165
- turnsUsed: commandsRun.length,
166
- commandsRun,
167
+ turnsUsed: 0,
168
+ commandsRun: [],
167
169
  totalInputTokens: 0,
168
170
  totalOutputTokens: 0,
169
171
  totalLatencyMs: Date.now() - start,
@@ -175,26 +177,36 @@ async function runAgentTask(model, systemMessage, userMessage, task, workdir) {
175
177
  };
176
178
  }
177
179
  }
178
- function aggregateModelResult(model, taskResults, helpMode) {
179
- const passRate = taskResults.length > 0
180
- ? taskResults.filter((t) => t.passed).length / taskResults.length
180
+ /** An eval with 0 tokens is an infra error (gateway timeout, auth failure, etc.) — not a real task result. */
181
+ function isInfraError(t) {
182
+ return t.totalInputTokens + t.totalOutputTokens === 0 && !t.passed;
183
+ }
184
+ function aggregateModelResult(model, taskResults, contextMode) {
185
+ // Exclude infra errors from pass rate — they're not real task failures
186
+ const scorable = taskResults.filter((t) => !isInfraError(t));
187
+ const infraErrors = taskResults.length - scorable.length;
188
+ if (infraErrors > 0) {
189
+ console.warn(` ⚠ ${infraErrors} task(s) excluded from stats (infra error, 0 tokens)`);
190
+ }
191
+ const passRate = scorable.length > 0
192
+ ? scorable.filter((t) => t.passed).length / scorable.length
181
193
  : 0;
182
- const avgLatencyMs = taskResults.length > 0
183
- ? Math.round(taskResults.reduce((a, t) => a + t.totalLatencyMs, 0) / taskResults.length)
194
+ const avgLatencyMs = scorable.length > 0
195
+ ? Math.round(scorable.reduce((a, t) => a + t.totalLatencyMs, 0) / scorable.length)
184
196
  : 0;
185
- const passedTasks = taskResults.filter((t) => t.passed);
197
+ const passedTasks = scorable.filter((t) => t.passed);
186
198
  const avgTurnsToSuccess = passedTasks.length > 0
187
199
  ? passedTasks.reduce((a, t) => a + t.turnsUsed, 0) / passedTasks.length
188
200
  : 0;
189
- const avgTotalTokens = taskResults.length > 0
190
- ? taskResults.reduce((a, t) => a + t.totalInputTokens + t.totalOutputTokens, 0) /
191
- taskResults.length
201
+ const avgTotalTokens = scorable.length > 0
202
+ ? scorable.reduce((a, t) => a + t.totalInputTokens + t.totalOutputTokens, 0) /
203
+ scorable.length
192
204
  : 0;
193
205
  return {
194
206
  provider: model.provider,
195
207
  modelId: model.id,
196
208
  displayName: model.displayName,
197
- helpMode,
209
+ contextMode,
198
210
  taskResults,
199
211
  passRate,
200
212
  avgTurnsToSuccess,
@@ -212,7 +224,7 @@ export async function runGrid(opts) {
212
224
  cliName: opts.cliName,
213
225
  tasks: opts.tasks,
214
226
  models: opts.models ?? filterModels(config.models),
215
- helpModes: config.helpModes,
227
+ contextModes: config.contextModes,
216
228
  limiter,
217
229
  config,
218
230
  versionCommand: opts.versionCommand,
@@ -223,6 +235,8 @@ export async function runGrid(opts) {
223
235
  category: opts.category,
224
236
  websiteUrl: opts.websiteUrl,
225
237
  githubUrl: opts.githubUrl,
238
+ taskSuiteContent: opts.taskSuiteContent,
239
+ configDir: opts.configDir,
226
240
  });
227
241
  if (report)
228
242
  reports.push(report);
@@ -246,7 +260,7 @@ export async function runGrid(opts) {
246
260
  cliName,
247
261
  tasks: suite.tasks,
248
262
  models,
249
- helpModes: config.helpModes,
263
+ contextModes: config.contextModes,
250
264
  limiter,
251
265
  config,
252
266
  versionCommand: suite.version_command,
@@ -260,32 +274,43 @@ export async function runGrid(opts) {
260
274
  }
261
275
  return reports;
262
276
  }
263
- async function runCliGrid(opts) {
264
- const { cliName, tasks, models, helpModes, limiter, config, versionCommand, systemPrompt } = opts;
265
- console.log(`\n=== ${cliName} ===`);
266
- // Load help text (used for injected mode)
267
- let helpTexts = null;
268
- let cliVersion;
269
- if (helpModes.includes('injected')) {
270
- const helpCache = await loadHelpFromCache(config.helpCacheDir, cliName);
271
- if (helpCache) {
272
- helpTexts = helpCache.help_texts;
273
- cliVersion = helpCache.version;
277
+ /**
278
+ * Load context payload for a given mode.
279
+ */
280
+ async function loadContextPayload(contextMode, cliName, configDir) {
281
+ switch (contextMode) {
282
+ case 'help': {
283
+ const helpText = await loadTopLevelHelp(cliName);
284
+ if (!helpText) {
285
+ console.warn(` Warning: could not capture --help for ${cliName}, falling back to zero-shot`);
286
+ }
287
+ return helpText;
274
288
  }
275
- else {
276
- console.log(` No cached help text, trying live capture...`);
289
+ case 'docs': {
290
+ if (!configDir) {
291
+ console.warn(` Warning: no config directory for docs mode, falling back to zero-shot`);
292
+ return null;
293
+ }
277
294
  try {
278
- const live = await loadHelpLive(cliName);
279
- helpTexts = live.help_texts;
280
- cliVersion = live.version;
295
+ const docsPath = join(configDir, 'docs.md');
296
+ return await readFile(docsPath, 'utf-8');
281
297
  }
282
298
  catch {
283
- console.warn(` Failed to capture help text for ${cliName} (injected mode may fail)`);
299
+ console.warn(` Warning: docs.md not found in ${configDir}, falling back to zero-shot`);
300
+ return null;
284
301
  }
285
302
  }
303
+ case 'zero-shot':
304
+ default:
305
+ return null;
286
306
  }
307
+ }
308
+ async function runCliGrid(opts) {
309
+ const { cliName, tasks, models, contextModes, limiter, config, versionCommand, systemPrompt } = opts;
310
+ console.log(`\n=== ${cliName} ===`);
311
+ let cliVersion;
287
312
  // Detect CLI version
288
- if (!cliVersion && versionCommand) {
313
+ if (versionCommand) {
289
314
  cliVersion = await detectCliVersion(versionCommand);
290
315
  if (cliVersion)
291
316
  console.log(` CLI version: ${cliVersion}`);
@@ -294,11 +319,12 @@ async function runCliGrid(opts) {
294
319
  if (config.dryRun) {
295
320
  const firstTask = tasks[0];
296
321
  if (firstTask) {
297
- for (const helpMode of helpModes) {
298
- const systemMessage = buildSystemMessage(helpMode, systemPrompt);
299
- const userMessage = buildUserMessage(cliName, helpTexts, firstTask, helpMode);
300
- console.log(`\n--- Dry Run: ${cliName} / ${firstTask.id} (${helpMode}) ---`);
301
- console.log(`System: ${systemMessage}\n`);
322
+ for (const contextMode of contextModes) {
323
+ const contextPayload = await loadContextPayload(contextMode, cliName, opts.configDir);
324
+ const sysMsg = buildSystemMessage(systemPrompt);
325
+ const userMessage = buildUserMessage(cliName, firstTask, contextMode, contextPayload);
326
+ console.log(`\n--- Dry Run: ${cliName} / ${firstTask.id} (${contextMode}) ---`);
327
+ console.log(`System: ${sysMsg}\n`);
302
328
  console.log(`User:\n${userMessage}\n`);
303
329
  console.log(`--- End Dry Run ---\n`);
304
330
  }
@@ -306,9 +332,10 @@ async function runCliGrid(opts) {
306
332
  return null;
307
333
  }
308
334
  const modelResults = [];
309
- for (const helpMode of helpModes) {
310
- console.log(`\n Help mode: ${helpMode}`);
311
- const systemMessage = buildSystemMessage(helpMode, systemPrompt);
335
+ for (const contextMode of contextModes) {
336
+ console.log(`\n Context: ${contextMode}`);
337
+ const contextPayload = await loadContextPayload(contextMode, cliName, opts.configDir);
338
+ const sysMsg = buildSystemMessage(systemPrompt);
312
339
  for (const model of models) {
313
340
  console.log(` Model: ${model.displayName}`);
314
341
  const taskResults = [];
@@ -324,11 +351,11 @@ async function runCliGrid(opts) {
324
351
  const workdir = opts.workdir ?? config.workdir ?? await mkdtemp(join(tmpdir(), 'cli-bench-'));
325
352
  const shouldCleanup = !opts.workdir && !config.workdir;
326
353
  try {
327
- const userMessage = buildUserMessage(cliName, helpTexts, task, helpMode);
354
+ const userMessage = buildUserMessage(cliName, task, contextMode, contextPayload);
328
355
  if (task.setup && task.setup.length > 0) {
329
356
  await runSetup(task.setup, { cwd: workdir });
330
357
  }
331
- const agentResult = await runAgentTask(model, systemMessage, userMessage, task, workdir);
358
+ const agentResult = await runAgentTask(model, sysMsg, userMessage, task, workdir);
332
359
  const taskEval = {
333
360
  taskId: task.id,
334
361
  passed: agentResult.passed,
@@ -353,9 +380,10 @@ async function runCliGrid(opts) {
353
380
  },
354
381
  repeatIndex: repeatTotal > 1 ? repeatIndex : undefined,
355
382
  };
356
- const icon = agentResult.passed ? '✓' : '✗';
383
+ const totalTokens = agentResult.totalInputTokens + agentResult.totalOutputTokens;
384
+ const icon = agentResult.passed ? '✓' : totalTokens === 0 ? '⚠' : '✗';
357
385
  const repeatLabel = repeatTotal > 1 ? ` [${repeatIndex + 1}/${repeatTotal}]` : '';
358
- console.log(` ${icon} ${task.id}${repeatLabel} (turns=${agentResult.turnsUsed}, tokens=${agentResult.totalInputTokens + agentResult.totalOutputTokens})`);
386
+ console.log(` ${icon} ${task.id}${repeatLabel} (turns=${agentResult.turnsUsed}, tokens=${totalTokens})`);
359
387
  return taskEval;
360
388
  }
361
389
  finally {
@@ -368,7 +396,7 @@ async function runCliGrid(opts) {
368
396
  for (const taskEval of results) {
369
397
  taskResults.push(taskEval);
370
398
  }
371
- modelResults.push(aggregateModelResult(model, taskResults, helpMode));
399
+ modelResults.push(aggregateModelResult(model, taskResults, contextMode));
372
400
  }
373
401
  }
374
402
  const ciMetadata = detectCI();
@@ -391,13 +419,14 @@ async function runCliGrid(opts) {
391
419
  totalEvals,
392
420
  generatedAt: new Date().toISOString(),
393
421
  gridVersion: '0.4.0',
394
- systemPrompt: buildSystemMessage(helpModes[0], systemPrompt),
422
+ systemPrompt: buildSystemMessage(systemPrompt),
395
423
  displayName: opts.displayName,
396
424
  category: opts.category,
397
425
  websiteUrl: opts.websiteUrl,
398
426
  githubUrl: opts.githubUrl,
399
427
  ...ciMetadata,
400
428
  taskSuiteHash,
429
+ taskSuiteContent: opts.taskSuiteContent,
401
430
  };
402
431
  printReportSummary(report);
403
432
  return report;
@@ -406,7 +435,7 @@ function printReportSummary(report) {
406
435
  console.log(`\n--- ${report.cli} Summary ---`);
407
436
  console.log(`Tasks: ${report.taskCount}${report.totalEvals ? `, Evals: ${report.totalEvals}` : ''}`);
408
437
  for (const mr of report.modelResults) {
409
- console.log(` ${mr.displayName} [${mr.helpMode}]: pass=${(mr.passRate * 100).toFixed(0)}% avgTurns=${mr.avgTurnsToSuccess.toFixed(1)} avgTokens=${Math.round(mr.avgTotalTokens)}`);
438
+ console.log(` ${mr.displayName} [${mr.contextMode}]: pass=${(mr.passRate * 100).toFixed(0)}% avgTurns=${mr.avgTurnsToSuccess.toFixed(1)} avgTokens=${Math.round(mr.avgTotalTokens)}`);
410
439
  // Per-task breakdown when repeats are used
411
440
  if (report.totalEvals && report.totalEvals > report.taskCount) {
412
441
  const byTask = new Map();