@vercel/agent-eval 0.0.9 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +69 -13
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/lib/agents/index.d.ts.map +1 -1
- package/dist/lib/agents/index.js +0 -2
- package/dist/lib/agents/index.js.map +1 -1
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js +3 -1
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/o11y/index.d.ts +11 -0
- package/dist/lib/o11y/index.d.ts.map +1 -0
- package/dist/lib/o11y/index.js +11 -0
- package/dist/lib/o11y/index.js.map +1 -0
- package/dist/lib/o11y/parsers/claude-code.d.ts +18 -0
- package/dist/lib/o11y/parsers/claude-code.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/claude-code.js +343 -0
- package/dist/lib/o11y/parsers/claude-code.js.map +1 -0
- package/dist/lib/o11y/parsers/codex.d.ts +17 -0
- package/dist/lib/o11y/parsers/codex.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/codex.js +296 -0
- package/dist/lib/o11y/parsers/codex.js.map +1 -0
- package/dist/lib/o11y/parsers/index.d.ts +51 -0
- package/dist/lib/o11y/parsers/index.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/index.js +280 -0
- package/dist/lib/o11y/parsers/index.js.map +1 -0
- package/dist/lib/o11y/parsers/opencode.d.ts +17 -0
- package/dist/lib/o11y/parsers/opencode.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/opencode.js +313 -0
- package/dist/lib/o11y/parsers/opencode.js.map +1 -0
- package/dist/lib/o11y/types.d.ts +113 -0
- package/dist/lib/o11y/types.d.ts.map +1 -0
- package/dist/lib/o11y/types.js +6 -0
- package/dist/lib/o11y/types.js.map +1 -0
- package/dist/lib/results.d.ts +2 -1
- package/dist/lib/results.d.ts.map +1 -1
- package/dist/lib/results.js +23 -7
- package/dist/lib/results.js.map +1 -1
- package/dist/lib/runner.d.ts +6 -5
- package/dist/lib/runner.d.ts.map +1 -1
- package/dist/lib/runner.js +21 -11
- package/dist/lib/runner.js.map +1 -1
- package/dist/lib/types.d.ts +25 -3
- package/dist/lib/types.d.ts.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/package.json +5 -3
- package/README.md +0 -474
- package/dist/lib/agents/ai-sdk-agent.d.ts +0 -10
- package/dist/lib/agents/ai-sdk-agent.d.ts.map +0 -1
- package/dist/lib/agents/ai-sdk-agent.js +0 -427
- package/dist/lib/agents/ai-sdk-agent.js.map +0 -1
package/dist/lib/runner.js
CHANGED
|
@@ -62,7 +62,7 @@ export async function runExperiment(options) {
|
|
|
62
62
|
let timeoutId;
|
|
63
63
|
const agentResult = await Promise.race([
|
|
64
64
|
agent.run(fixture.path, {
|
|
65
|
-
prompt: fixture.prompt,
|
|
65
|
+
prompt: config.editPrompt ? config.editPrompt(fixture.prompt) : fixture.prompt,
|
|
66
66
|
model: config.model,
|
|
67
67
|
timeout: timeoutMs,
|
|
68
68
|
apiKey,
|
|
@@ -161,15 +161,25 @@ export async function runExperiment(options) {
|
|
|
161
161
|
*/
|
|
162
162
|
export async function runSingleEval(fixture, options) {
|
|
163
163
|
const agent = getAgent(options.agent ?? 'vercel-ai-gateway/claude-code');
|
|
164
|
-
const
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
164
|
+
const models = Array.isArray(options.model) ? options.model : [options.model];
|
|
165
|
+
const prompt = options.editPrompt ? options.editPrompt(fixture.prompt) : fixture.prompt;
|
|
166
|
+
const results = [];
|
|
167
|
+
for (const model of models) {
|
|
168
|
+
const agentResult = await agent.run(fixture.path, {
|
|
169
|
+
prompt,
|
|
170
|
+
model,
|
|
171
|
+
timeout: options.timeout * 1000,
|
|
172
|
+
apiKey: options.apiKey,
|
|
173
|
+
setup: options.setup,
|
|
174
|
+
scripts: options.scripts,
|
|
175
|
+
sandbox: options.sandbox,
|
|
176
|
+
});
|
|
177
|
+
results.push(agentResultToEvalRunData(agentResult));
|
|
178
|
+
}
|
|
179
|
+
// TODO: on the next major version, always return an array. Returning a single result when `model` is not an array exists only to avoid a breaking change.
|
|
180
|
+
if (!Array.isArray(options.model)) {
|
|
181
|
+
return results[0];
|
|
182
|
+
}
|
|
183
|
+
return results;
|
|
174
184
|
}
|
|
175
185
|
//# sourceMappingURL=runner.js.map
|
package/dist/lib/runner.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/lib/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/lib/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAUH,OAAO,EAAE,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AAC7C,OAAO,EACL,wBAAwB,EACxB,iBAAiB,EACjB,uBAAuB,EACvB,WAAW,EACX,kBAAkB,EAClB,eAAe,EACf,qBAAqB,GACtB,MAAM,cAAc,CAAC;AAwCtB;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAA6B;IAE7B,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,UAAU,EAAE,cAAc,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC;IAC9F,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC;IAE7B,8BAA8B;IAC9B,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAErC,MAAM,GAAG,GAAG,CAAC,GAAW,EAAE,EAAE;QAC1B,IAAI,UAAU,EAAE,CAAC;YACf,UAAU,CAAC,GAAG,CAAC,CAAC;QAClB,CAAC;aAAM,IAAI,OAAO,EAAE,CAAC;YACnB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACnB,CAAC;IACH,CAAC,CAAC;IAEF,mDAAmD;IACnD,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAA2B,CAAC;IAC5D,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,eAAe,EAAE,CAAC,CAAC;IAC5D,CAAC;IAED,oCAAoC;IACpC,MAAM,QAAQ,GAAkB,EAAE,CAAC;IACnC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,QAAQ,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,CAAC;QAC1C,CAAC;IACH,CAAC;IAED,GAAG,CAAC,YAAY,QAAQ,CAAC,MAAM,gCAAgC,QAAQ,CAAC,MAAM,YAAY,MAAM,CAAC,IAAI,QAAQ,CAAC,CAAC;IAE/G,uBAAuB;IACvB,MAAM,UAAU,GAAG,KAAK,EAAE,OAAoB,EAA0B,EAAE;QACxE,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC;QACtC,MAAM,UAAU,GAAG,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAE,CAAC;QAEvD,2CAA2C;QAC3C,IAAI,UAAU,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;YAC9B,OAAO;gBACL,WAAW,EAAE,OAAO,CAAC,IAAI;gBACzB,QAAQ;gBACR,OAAO,EAAE;oBACP,MAAM,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,EAAE;iBAC5D;gBACD,OAAO,EAAE,IAAI;aACd,CAAC;QACJ,CAAC;QAED,GAAG,CAAC,qBAAqB,CAAC,OAAO,CAAC,IAAI,EAAE,QAAQ,GAAG,CAAC,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;QAEpE,MAAM,SAAS,GAAG,MAAM,CAAC,OAAO,GAAG,IAAI,CAAC;QACxC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE7B,oDAAoD;QACpD,MAAM,iBAAiB,GAAG,IAAI,eAAe,EAAE,CAAC;QAEhD,yDAAyD;QACzD,IAAI,MAAM,
CAAC,SAAS,EAAE,CAAC;YACrB,UAAU,CAAC,MAAM,CAAC,gBAAgB,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,iBAAiB,CAAC,KAAK,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;QAC/F,CAAC;QAED,IAAI,SAAoD,CAAC;QAEzD,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC;YACrC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE;gBACtB,MAAM,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM;gBAC9E,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,OAAO,EAAE,SAAS;gBAClB,MAAM;gBACN,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,OAAO,EAAE,MAAM,CAAC,OAAO;gBACvB,MAAM,EAAE,iBAAiB,CAAC,MAAM;gBAChC,OAAO,EAAE,MAAM,CAAC,OAAO;aACxB,CAAC;YACF,IAAI,OAAO,CAAQ,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE;gBAC/B,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE;oBAC1B,iBAAiB,CAAC,KAAK,EAAE,CAAC,CAAC,mCAAmC;oBAC9D,MAAM,CAAC,IAAI,KAAK,CAAC,wBAAwB,MAAM,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC;gBAC/D,CAAC,EAAE,SAAS,CAAC,CAAC;YAChB,CAAC,CAAC;SACH,CAAC,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;YACjB,8CAA8C;YAC9C,IAAI,KAAK,YAAY,KAAK,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;gBAClE,OAAO;oBACL,OAAO,EAAE,KAAK;oBACd,MAAM,EAAE,EAAE;oBACV,KAAK,EAAE,KAAK,CAAC,OAAO;oBACpB,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;iBACjC,CAAC;YACJ,CAAC;YACD,MAAM,KAAK,CAAC;QACd,CAAC,CAAC,CAAC;QAEH,kDAAkD;QAClD,IAAI,SAAS;YAAE,YAAY,CAAC,SAAS,CAAC,CAAC;QAEvC,4BAA4B;QAC5B,IAAI,WAAW,CAAC,KAAK,KAAK,SAAS,IAAI,WAAW,CAAC,KAAK,KAAK,sBAAsB,EAAE,CAAC;YACpF,OAAO;gBACL,WAAW,EAAE,OAAO,CAAC,IAAI;gBACzB,QAAQ;gBACR,OAAO,EAAE;oBACP,MAAM,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,QAAQ,EAAE,WAAW,CAAC,QAAQ,GAAG,IAAI,EAAE;iBACtF;gBACD,OAAO,EAAE,IAAI;aACd,CAAC;QACJ,CAAC;QAED,MAAM,OAAO,GAAG,wBAAwB,CAAC,WAAW,CAAC,CAAC;QAEtD,GAAG,CAAC,eAAe,CAAC,OAAO,CAAC,IAAI,EAAE,QAAQ,GAAG,CAAC,EAAE,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC;QAE9E,4EAA4E;QAC5E,IAAI,MAAM,CAAC,SAAS,IAAI,OAAO,CAAC,MAAM,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;YAC3D,GAAG,CAAC,eAAe,OAAO,CAAC,IAAI,kBAAkB,QAAQ,GAAG,CAAC,+BAA+B,CAAC,CAAC;YAC9F,UAAU,CAAC,KAAK,EAAE,CAAC;QACrB,CAAC;QAED,OAAO;YACL,WAAW,EAAE,OAAO,CAAC,IAAI;YACzB,QAAQ;YACR,OAAO;SACR,CAAC;IACJ,CAAC,CA
AC;IAEF,gCAAgC;IAChC,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC;IAE5D,sDAAsD;IACtD,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAA2B,CAAC;IAC5D,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;YACpB,gBAAgB,CAAC,GAAG,CAAC,MAAM,CAAC,WAAW,CAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACzD,CAAC;IACH,CAAC;IAED,6CAA6C;IAC7C,MAAM,aAAa,GAAkB,EAAE,CAAC;IACxC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,MAAM,cAAc,GAAG,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAE,CAAC;QAE3D,wCAAwC;QACxC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;QAEvD,MAAM,WAAW,GAAkB,EAAE,CAAC;QACtC,KAAK,MAAM,MAAM,IAAI,cAAc,EAAE,CAAC;YACpC,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;YAEjC,iDAAiD;YACjD,IAAI,MAAM,CAAC,SAAS,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;gBAClE,MAAM;YACR,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAG,iBAAiB,CAAC,OAAO,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;QAC7D,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC9B,CAAC;IAED,MAAM,WAAW,GAAG,IAAI,IAAI,EAAE,CAAC;IAC/B,MAAM,iBAAiB,GAAG,uBAAuB,CAAC,MAAM,EAAE,aAAa,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;IAEjG,uBAAuB;IACvB,MAAM,SAAS,GAAG,WAAW,CAAC,iBAAiB,EAAE;QAC/C,UAAU;QACV,cAAc;KACf,CAAC,CAAC;IAEH,GAAG,CAAC,uBAAuB,SAAS,EAAE,CAAC,CAAC;IACxC,GAAG,CAAC,kBAAkB,CAAC,iBAAiB,CAAC,CAAC,CAAC;IAE3C,OAAO,iBAAiB,CAAC;AAC3B,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAoB,EACpB,OAUC;IAED,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,KAAK,IAAI,+BAA+B,CAAC,CAAC;IAEzE,MAAM,MAAM,GAAa,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IACxF,MAAM,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC;IAExF,MAAM,OAAO,GAAkB,EAAE,CAAC;IAElC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAE9B,MAAM,WAAW,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE;YACjD,MAAM;YACN,KAAK;YACL,OAAO,
EAAE,OAAO,CAAC,OAAO,GAAG,IAAI;YAC/B,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,OAAO,EAAE,OAAO,CAAC,OAAO;SACxB,CAAC,CAAC;QAEA,OAAO,CAAC,IAAI,CAAC,wBAAwB,CAAC,WAAW,CAAC,CAAC,CAAC;IACtD,CAAC;IAED,gHAAgH;IAChH,IAAG,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;QACpC,OAAO,OAAO,CAAC,CAAC,CAA2D,CAAC;IAC3E,CAAC;IAED,OAAO,OAAiE,CAAC;AAC3E,CAAC"}
|
package/dist/lib/types.d.ts
CHANGED
|
@@ -50,8 +50,10 @@ export type SandboxBackend = 'vercel' | 'docker';
|
|
|
50
50
|
export interface ExperimentConfig {
|
|
51
51
|
/** Which AI agent to use */
|
|
52
52
|
agent: AgentType;
|
|
53
|
-
/** Which AI model the agent should use.
|
|
54
|
-
|
|
53
|
+
/** Which AI model the agent should use. Can be a single model or array of models to test.
|
|
54
|
+
* If an array is provided, the experiment will run on each model.
|
|
55
|
+
* Default is agent-specific: 'opus' for claude-code, 'openai/gpt-5.2-codex' for codex */
|
|
56
|
+
model?: ModelTier | ModelTier[];
|
|
55
57
|
/** Which evals to run. Can be a string, array, or filter function. @default '*' (all evals) */
|
|
56
58
|
evals?: string | string[] | EvalFilter;
|
|
57
59
|
/** How many times to run each eval. @default 1 */
|
|
@@ -66,11 +68,28 @@ export interface ExperimentConfig {
|
|
|
66
68
|
setup?: SetupFunction;
|
|
67
69
|
/** Sandbox backend to use. @default 'auto' (Vercel if token present, else Docker) */
|
|
68
70
|
sandbox?: SandboxBackend | 'auto';
|
|
71
|
+
/** Optional function to modify the prompt before running the experiment. @default undefined */
|
|
72
|
+
editPrompt?: (prompt: string) => string;
|
|
69
73
|
}
|
|
70
74
|
/**
|
|
71
75
|
* Resolved experiment config with all defaults applied.
|
|
72
76
|
*/
|
|
73
77
|
export interface ResolvedExperimentConfig {
|
|
78
|
+
agent: AgentType;
|
|
79
|
+
model: ModelTier | ModelTier[];
|
|
80
|
+
evals: string | string[] | EvalFilter;
|
|
81
|
+
runs: number;
|
|
82
|
+
earlyExit: boolean;
|
|
83
|
+
scripts: string[];
|
|
84
|
+
timeout: number;
|
|
85
|
+
setup?: SetupFunction;
|
|
86
|
+
sandbox: SandboxBackend | 'auto';
|
|
87
|
+
editPrompt?: (prompt: string) => string;
|
|
88
|
+
}
|
|
89
|
+
/**
|
|
90
|
+
* Experiment config with all defaults applied and narrowed to a single model (`model: ModelTier`).
|
|
91
|
+
*/
|
|
92
|
+
export interface RunnableExperimentConfig {
|
|
74
93
|
agent: AgentType;
|
|
75
94
|
model: ModelTier;
|
|
76
95
|
evals: string | string[] | EvalFilter;
|
|
@@ -80,6 +99,7 @@ export interface ResolvedExperimentConfig {
|
|
|
80
99
|
timeout: number;
|
|
81
100
|
setup?: SetupFunction;
|
|
82
101
|
sandbox: SandboxBackend | 'auto';
|
|
102
|
+
editPrompt?: (prompt: string) => string;
|
|
83
103
|
}
|
|
84
104
|
/**
|
|
85
105
|
* Required files for a valid eval fixture.
|
|
@@ -115,8 +135,10 @@ export interface EvalRunResult {
|
|
|
115
135
|
error?: string;
|
|
116
136
|
/** Duration in seconds */
|
|
117
137
|
duration: number;
|
|
118
|
-
/** Path to transcript file (relative to run directory) */
|
|
138
|
+
/** Path to parsed transcript file (relative to run directory) */
|
|
119
139
|
transcriptPath?: string;
|
|
140
|
+
/** Path to raw transcript file (relative to run directory) */
|
|
141
|
+
transcriptRawPath?: string;
|
|
120
142
|
/** Paths to output files (relative to run directory) */
|
|
121
143
|
outputPaths?: {
|
|
122
144
|
/** Path to EVAL.ts test output */
|
package/dist/lib/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,MAAM,SAAS,GACjB,+BAA+B,GAC/B,aAAa,GACb,yBAAyB,GACzB,OAAO,GACP,4BAA4B,CAAC;AAEjC;;;GAGG;AACH,MAAM,MAAM,SAAS,GAAG,MAAM,CAAC;AAE/B;;GAEG;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC;AAEnD;;;GAGG;AACH,MAAM,WAAW,OAAO;IACtB,mCAAmC;IACnC,UAAU,CACR,OAAO,EAAE,MAAM,EACf,IAAI,CAAC,EAAE,MAAM,EAAE,EACf,OAAO,CAAC,EAAE;QAAE,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;KAAE,GACzC,OAAO,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACjE,mCAAmC;IACnC,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACxC,iCAAiC;IACjC,UAAU,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACzD,wCAAwC;IACxC,mBAAmB,IAAI,MAAM,CAAC;CAC/B;AAED;;;GAGG;AACH,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,EAAE,OAAO,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;AAEhE;;GAEG;AACH,MAAM,MAAM,cAAc,GAAG,QAAQ,GAAG,QAAQ,CAAC;AAEjD;;;GAGG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4BAA4B;IAC5B,KAAK,EAAE,SAAS,CAAC;IAEjB
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,MAAM,SAAS,GACjB,+BAA+B,GAC/B,aAAa,GACb,yBAAyB,GACzB,OAAO,GACP,4BAA4B,CAAC;AAEjC;;;GAGG;AACH,MAAM,MAAM,SAAS,GAAG,MAAM,CAAC;AAE/B;;GAEG;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC;AAEnD;;;GAGG;AACH,MAAM,WAAW,OAAO;IACtB,mCAAmC;IACnC,UAAU,CACR,OAAO,EAAE,MAAM,EACf,IAAI,CAAC,EAAE,MAAM,EAAE,EACf,OAAO,CAAC,EAAE;QAAE,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;KAAE,GACzC,OAAO,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACjE,mCAAmC;IACnC,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACxC,iCAAiC;IACjC,UAAU,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACzD,wCAAwC;IACxC,mBAAmB,IAAI,MAAM,CAAC;CAC/B;AAED;;;GAGG;AACH,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,EAAE,OAAO,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;AAEhE;;GAEG;AACH,MAAM,MAAM,cAAc,GAAG,QAAQ,GAAG,QAAQ,CAAC;AAEjD;;;GAGG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4BAA4B;IAC5B,KAAK,EAAE,SAAS,CAAC;IAEjB;;6FAEyF;IACzF,KAAK,CAAC,EAAE,SAAS,GAAG,SAAS,EAAE,CAAC;IAEhC,+FAA+F;IAC/F,KAAK,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IAEvC,kDAAkD;IAClD,IAAI,CAAC,EAAE,MAAM,CAAC;IAEd,qDAAqD;IACrD,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB,mEAAmE;IACnE,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IAEnB,8EAA8E;IAC9E,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,uEAAuE;IACvE,KAAK,CAAC,EAAE,aAAa,CAAC;IAEtB,qFAAqF;IACrF,OAAO,CAAC,EAAE,cAAc,GAAG,MAAM,CAAC;IAElC,+FAA+F;IAC/F,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,MAAM,CAAC;CACzC;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,SAAS,GAAG,SAAS,EAAE,CAAC;IAC/B,KAAK,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IACtC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,aAAa,CAAC;IACtB,OAAO,EAAE,cAAc,GAAG,MAAM,CAAC;IACjC,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,MAAM,CAAC;CACzC;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,KAAK,EAAE,SAAS,CAAC;IACjB
,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IACtC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,aAAa,CAAC;IACtB,OAAO,EAAE,cAAc,GAAG,MAAM,CAAC;IACjC,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,MAAM,CAAC;CACzC;AAED;;;GAGG;AACH,eAAO,MAAM,mBAAmB,mDAAoD,CAAC;AAErF;;;;GAIG;AACH,eAAO,MAAM,cAAc,uEAAwE,CAAC;AAEpG;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,qCAAqC;IACrC,IAAI,EAAE,MAAM,CAAC;IACb,uCAAuC;IACvC,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,gDAAgD;IAChD,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,0BAA0B;IAC1B,MAAM,EAAE,QAAQ,GAAG,QAAQ,CAAC;IAC5B,8BAA8B;IAC9B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,CAAC;IACjB,iEAAiE;IACjE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,8DAA8D;IAC9D,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,wDAAwD;IACxD,WAAW,CAAC,EAAE;QACZ,kCAAkC;QAClC,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,8DAA8D;QAC9D,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,0DAA0D;IAC1D,MAAM,EAAE,aAAa,CAAC;IACtB,yEAAyE;IACzE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,qDAAqD;IACrD,aAAa,CAAC,EAAE;QACd,0BAA0B;QAC1B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,qDAAqD;QACrD,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,2BAA2B;IAC3B,SAAS,EAAE,MAAM,CAAC;IAClB,4BAA4B;IAC5B,UAAU,EAAE,MAAM,CAAC;IACnB,gCAAgC;IAChC,QAAQ,EAAE,MAAM,CAAC;IACjB,oCAAoC;IACpC,YAAY,EAAE,MAAM,CAAC;IACrB,2EAA2E;IAC3E,IAAI,EAAE,WAAW,EAAE,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,wCAAwC;IACxC,SAAS,EAAE,MAAM,CAAC;IAClB,0CAA0C;IAC1C,WAAW,EAAE,MAAM,CAAC;IACpB,oCAAoC;IACpC,MAAM,EAAE,wBAAwB,CAAC;IACjC,4BAA4B;IAC5B,KAAK,EAAE,WAAW,EAAE,CAAC;CACtB"}
|
package/dist/lib/types.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AA2HH;;;GAGG;AACH,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,cAAc,CAAU,CAAC;AAErF;;;;GAIG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,UAAU,EAAE,cAAc,EAAE,MAAM,CAAU,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vercel/agent-eval",
|
|
3
|
-
"version": "0.0.9",
|
|
3
|
+
"version": "0.0.12",
|
|
4
4
|
"description": "Framework for testing AI coding agents in isolated sandboxes",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
7
|
-
"url": "https://github.com/vercel-labs/agent-eval.git"
|
|
7
|
+
"url": "git+https://github.com/vercel-labs/agent-eval.git"
|
|
8
8
|
},
|
|
9
9
|
"type": "module",
|
|
10
10
|
"main": "dist/index.js",
|
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
"test:integration:docker": "SANDBOX_BACKEND=docker INTEGRATION_TEST=1 vitest run src/integration.test.ts",
|
|
25
25
|
"test:integration:vercel": "SANDBOX_BACKEND=vercel INTEGRATION_TEST=1 vitest run src/integration.test.ts",
|
|
26
26
|
"lint": "eslint src/",
|
|
27
|
+
"release": "changeset publish",
|
|
27
28
|
"prepublishOnly": "npm run build"
|
|
28
29
|
},
|
|
29
30
|
"dependencies": {
|
|
@@ -61,6 +62,7 @@
|
|
|
61
62
|
],
|
|
62
63
|
"license": "MIT",
|
|
63
64
|
"publishConfig": {
|
|
64
|
-
"access": "public"
|
|
65
|
+
"access": "public",
|
|
66
|
+
"provenance": true
|
|
65
67
|
}
|
|
66
68
|
}
|
package/README.md
DELETED
|
@@ -1,474 +0,0 @@
|
|
|
1
|
-
# @vercel/agent-eval
|
|
2
|
-
|
|
3
|
-
Test AI coding agents on your framework. Measure what actually works.
|
|
4
|
-
|
|
5
|
-
## Why?
|
|
6
|
-
|
|
7
|
-
You're building a frontend framework and want AI agents to work well with it. But how do you know if:
|
|
8
|
-
- Your documentation helps agents write correct code?
|
|
9
|
-
- Adding an MCP server improves agent success rates?
|
|
10
|
-
- Sonnet performs as well as Opus for your use cases?
|
|
11
|
-
- Your latest API changes broke agent compatibility?
|
|
12
|
-
|
|
13
|
-
**This framework gives you answers.** Run controlled experiments, measure pass rates, compare techniques.
|
|
14
|
-
|
|
15
|
-
## Quick Start
|
|
16
|
-
|
|
17
|
-
```bash
|
|
18
|
-
# Create a new eval project
|
|
19
|
-
npx @vercel/agent-eval init my-framework-evals
|
|
20
|
-
cd my-framework-evals
|
|
21
|
-
|
|
22
|
-
# Install dependencies
|
|
23
|
-
npm install
|
|
24
|
-
|
|
25
|
-
# Add your API keys
|
|
26
|
-
cp .env.example .env
|
|
27
|
-
# Edit .env with your AI_GATEWAY_API_KEY and VERCEL_TOKEN
|
|
28
|
-
|
|
29
|
-
# Preview what will run (no API calls, no cost)
|
|
30
|
-
npx @vercel/agent-eval cc --dry
|
|
31
|
-
|
|
32
|
-
# Run the evals
|
|
33
|
-
npx @vercel/agent-eval cc
|
|
34
|
-
```
|
|
35
|
-
|
|
36
|
-
## A/B Testing AI Techniques
|
|
37
|
-
|
|
38
|
-
The real power is comparing different approaches. Create multiple experiment configs:
|
|
39
|
-
|
|
40
|
-
### Control: Baseline Agent
|
|
41
|
-
|
|
42
|
-
```typescript
|
|
43
|
-
// experiments/control.ts
|
|
44
|
-
import type { ExperimentConfig } from 'agent-eval';
|
|
45
|
-
|
|
46
|
-
const config: ExperimentConfig = {
|
|
47
|
-
agent: 'vercel-ai-gateway/claude-code',
|
|
48
|
-
model: 'opus',
|
|
49
|
-
runs: 10, // Multiple runs for statistical significance
|
|
50
|
-
earlyExit: false, // Run all attempts to measure reliability
|
|
51
|
-
};
|
|
52
|
-
|
|
53
|
-
export default config;
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
### Treatment: Agent with MCP Server
|
|
57
|
-
|
|
58
|
-
```typescript
|
|
59
|
-
// experiments/with-mcp.ts
|
|
60
|
-
import type { ExperimentConfig } from 'agent-eval';
|
|
61
|
-
|
|
62
|
-
const config: ExperimentConfig = {
|
|
63
|
-
agent: 'vercel-ai-gateway/claude-code',
|
|
64
|
-
model: 'opus',
|
|
65
|
-
runs: 10,
|
|
66
|
-
earlyExit: false,
|
|
67
|
-
|
|
68
|
-
setup: async (sandbox) => {
|
|
69
|
-
// Install your framework's MCP server
|
|
70
|
-
await sandbox.runCommand('npm', ['install', '-g', '@myframework/mcp-server']);
|
|
71
|
-
|
|
72
|
-
// Configure Claude to use it
|
|
73
|
-
await sandbox.writeFiles({
|
|
74
|
-
'.claude/settings.json': JSON.stringify({
|
|
75
|
-
mcpServers: {
|
|
76
|
-
myframework: { command: 'myframework-mcp' }
|
|
77
|
-
}
|
|
78
|
-
})
|
|
79
|
-
});
|
|
80
|
-
},
|
|
81
|
-
};
|
|
82
|
-
|
|
83
|
-
export default config;
|
|
84
|
-
```
|
|
85
|
-
|
|
86
|
-
### Run Both & Compare
|
|
87
|
-
|
|
88
|
-
```bash
|
|
89
|
-
# Preview first
|
|
90
|
-
npx @vercel/agent-eval control --dry
|
|
91
|
-
npx @vercel/agent-eval with-mcp --dry
|
|
92
|
-
|
|
93
|
-
# Run experiments
|
|
94
|
-
npx @vercel/agent-eval control
|
|
95
|
-
npx @vercel/agent-eval with-mcp
|
|
96
|
-
```
|
|
97
|
-
|
|
98
|
-
**Compare results:**
|
|
99
|
-
```
|
|
100
|
-
Control (baseline): 7/10 passed (70%)
|
|
101
|
-
With MCP: 9/10 passed (90%)
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
## Creating Evals for Your Framework
|
|
105
|
-
|
|
106
|
-
Each eval tests one specific task an agent should be able to do with your framework.
|
|
107
|
-
|
|
108
|
-
### Example: Testing Component Creation
|
|
109
|
-
|
|
110
|
-
```
|
|
111
|
-
evals/
|
|
112
|
-
create-button-component/
|
|
113
|
-
PROMPT.md # Task for the agent
|
|
114
|
-
EVAL.ts # Tests to verify success (or EVAL.tsx for JSX)
|
|
115
|
-
package.json # Your framework as a dependency
|
|
116
|
-
src/ # Starter code
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
### EVAL.ts vs EVAL.tsx
|
|
120
|
-
|
|
121
|
-
Use **EVAL.tsx** when your tests require JSX syntax (React Testing Library, component testing):
|
|
122
|
-
```typescript
|
|
123
|
-
// EVAL.tsx - use when testing React components
|
|
124
|
-
import { test, expect } from 'vitest';
|
|
125
|
-
import { render, screen } from '@testing-library/react';
|
|
126
|
-
import { Button } from './src/components/Button';
|
|
127
|
-
|
|
128
|
-
test('Button renders with label', () => {
|
|
129
|
-
render(<Button label="Click me" onClick={() => {}} />);
|
|
130
|
-
expect(screen.getByText('Click me')).toBeDefined();
|
|
131
|
-
});
|
|
132
|
-
```
|
|
133
|
-
|
|
134
|
-
Use **EVAL.ts** for tests that don't need JSX:
|
|
135
|
-
```typescript
|
|
136
|
-
// EVAL.ts - use for file checks, build tests, etc.
|
|
137
|
-
import { test, expect } from 'vitest';
|
|
138
|
-
import { existsSync } from 'fs';
|
|
139
|
-
|
|
140
|
-
test('Button component exists', () => {
|
|
141
|
-
expect(existsSync('src/components/Button.tsx')).toBe(true);
|
|
142
|
-
});
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
> **Note:** You only need one eval file per fixture. Choose `.tsx` if any test needs JSX, otherwise use `.ts`.
|
|
146
|
-
|
|
147
|
-
**PROMPT.md** - What you want the agent to do:
|
|
148
|
-
```markdown
|
|
149
|
-
Create a Button component using MyFramework.
|
|
150
|
-
|
|
151
|
-
Requirements:
|
|
152
|
-
- Export a Button component from src/components/Button.tsx
|
|
153
|
-
- Accept `label` and `onClick` props
|
|
154
|
-
- Use the framework's styling system for hover states
|
|
155
|
-
```
|
|
156
|
-
|
|
157
|
-
**EVAL.ts** (or **EVAL.tsx**) - How you verify it worked:
|
|
158
|
-
```typescript
|
|
159
|
-
import { test, expect } from 'vitest';
|
|
160
|
-
import { readFileSync, existsSync } from 'fs';
|
|
161
|
-
import { execSync } from 'child_process';
|
|
162
|
-
|
|
163
|
-
test('Button component exists', () => {
|
|
164
|
-
expect(existsSync('src/components/Button.tsx')).toBe(true);
|
|
165
|
-
});
|
|
166
|
-
|
|
167
|
-
test('has required props', () => {
|
|
168
|
-
const content = readFileSync('src/components/Button.tsx', 'utf-8');
|
|
169
|
-
expect(content).toContain('label');
|
|
170
|
-
expect(content).toContain('onClick');
|
|
171
|
-
});
|
|
172
|
-
|
|
173
|
-
test('project builds', () => {
|
|
174
|
-
execSync('npm run build', { stdio: 'pipe' });
|
|
175
|
-
});
|
|
176
|
-
```
|
|
177
|
-
|
|
178
|
-
**package.json** - Include your framework:
|
|
179
|
-
```json
|
|
180
|
-
{
|
|
181
|
-
"name": "create-button-component",
|
|
182
|
-
"type": "module",
|
|
183
|
-
"scripts": { "build": "tsc" },
|
|
184
|
-
"dependencies": {
|
|
185
|
-
"myframework": "^2.0.0"
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
```
|
|
189
|
-
|
|
190
|
-
## Experiment Ideas
|
|
191
|
-
|
|
192
|
-
| Experiment | Control | Treatment |
|
|
193
|
-
|------------|---------|-----------|
|
|
194
|
-
| MCP impact | No MCP | With MCP server |
|
|
195
|
-
| Model comparison | Haiku | Sonnet / Opus |
|
|
196
|
-
| Documentation | Minimal docs | Rich examples |
|
|
197
|
-
| System prompt | Default | Framework-specific |
|
|
198
|
-
| Tool availability | Read/write only | + custom tools |
|
|
199
|
-
|
|
200
|
-
## Configuration Reference
|
|
201
|
-
|
|
202
|
-
### Agent Selection
|
|
203
|
-
|
|
204
|
-
Choose your agent and authentication method:
|
|
205
|
-
|
|
206
|
-
```typescript
|
|
207
|
-
// Vercel AI Gateway (recommended - unified billing & observability)
|
|
208
|
-
agent: 'vercel-ai-gateway/claude-code' // Claude Code via AI Gateway
|
|
209
|
-
agent: 'vercel-ai-gateway/codex' // OpenAI Codex via AI Gateway
|
|
210
|
-
agent: 'vercel-ai-gateway/opencode' // OpenCode via AI Gateway
|
|
211
|
-
agent: 'vercel-ai-gateway/ai-sdk-harness' // Simple AI SDK harness (any model)
|
|
212
|
-
|
|
213
|
-
// Direct API (uses provider keys directly)
|
|
214
|
-
agent: 'claude-code' // requires ANTHROPIC_API_KEY
|
|
215
|
-
agent: 'codex' // requires OPENAI_API_KEY
|
|
216
|
-
```
|
|
217
|
-
|
|
218
|
-
See the Environment Variables section below for setup instructions.
|
|
219
|
-
|
|
220
|
-
### OpenCode Model Configuration
|
|
221
|
-
|
|
222
|
-
OpenCode uses Vercel AI Gateway exclusively. Models **must** be specified with the `vercel/{provider}/{model}` format:
|
|
223
|
-
|
|
224
|
-
```typescript
|
|
225
|
-
// Anthropic models
|
|
226
|
-
model: 'vercel/anthropic/claude-sonnet-4'
|
|
227
|
-
model: 'vercel/anthropic/claude-opus-4'
|
|
228
|
-
|
|
229
|
-
// Minimax models
|
|
230
|
-
model: 'vercel/minimax/minimax-m2.1'
|
|
231
|
-
model: 'vercel/minimax/minimax-m2.1-lightning'
|
|
232
|
-
|
|
233
|
-
// Moonshot AI (Kimi) models
|
|
234
|
-
model: 'vercel/moonshotai/kimi-k2'
|
|
235
|
-
model: 'vercel/moonshotai/kimi-k2-thinking'
|
|
236
|
-
|
|
237
|
-
// OpenAI models
|
|
238
|
-
model: 'vercel/openai/gpt-4o'
|
|
239
|
-
model: 'vercel/openai/o3'
|
|
240
|
-
```
|
|
241
|
-
|
|
242
|
-
> **Important:** The `vercel/` prefix is required. OpenCode's config sets up a `vercel` provider, so the model string must start with `vercel/` to route through Vercel AI Gateway correctly. Using just `anthropic/claude-sonnet-4` (without the `vercel/` prefix) will fail with a "provider not found" error.
|
|
243
|
-
|
|
244
|
-
Under the hood, the agent creates an `opencode.json` config file that configures the Vercel provider:
|
|
245
|
-
|
|
246
|
-
```json
|
|
247
|
-
{
|
|
248
|
-
"provider": {
|
|
249
|
-
"vercel": {
|
|
250
|
-
"options": {
|
|
251
|
-
"apiKey": "{env:AI_GATEWAY_API_KEY}"
|
|
252
|
-
}
|
|
253
|
-
}
|
|
254
|
-
},
|
|
255
|
-
"permission": {
|
|
256
|
-
"write": "allow",
|
|
257
|
-
"edit": "allow",
|
|
258
|
-
"bash": "allow"
|
|
259
|
-
}
|
|
260
|
-
}
|
|
261
|
-
```
|
|
262
|
-
|
|
263
|
-
And runs: `opencode run "<prompt>" --model {provider}/{model} --format json`
|
|
264
|
-
|
|
265
|
-
### AI SDK Harness Model Configuration
|
|
266
|
-
|
|
267
|
-
The AI SDK harness (`vercel-ai-gateway/ai-sdk-harness`) is a lightweight agent that works with **any model** available on Vercel AI Gateway. Unlike OpenCode, it uses the standard `{provider}/{model}` format without a `vercel/` prefix:
|
|
268
|
-
|
|
269
|
-
```typescript
|
|
270
|
-
// Anthropic models
|
|
271
|
-
model: 'anthropic/claude-sonnet-4'
|
|
272
|
-
model: 'anthropic/claude-opus-4'
|
|
273
|
-
|
|
274
|
-
// Moonshot AI (Kimi) models
|
|
275
|
-
model: 'moonshotai/kimi-k2.5'
|
|
276
|
-
model: 'moonshotai/kimi-k2-thinking'
|
|
277
|
-
|
|
278
|
-
// Minimax models
|
|
279
|
-
model: 'minimax/minimax-m2.1'
|
|
280
|
-
|
|
281
|
-
// OpenAI models
|
|
282
|
-
model: 'openai/gpt-4o'
|
|
283
|
-
```
|
|
284
|
-
|
|
285
|
-
The AI SDK harness includes these tools: `readFile`, `writeFile`, `editFile`, `listFiles`, `glob`, `grep`, and `bash`. It's ideal for evaluating models that may not be fully compatible with OpenCode.
|
|
286
|
-
|
|
287
|
-
### Full Configuration
|
|
288
|
-
|
|
289
|
-
```typescript
|
|
290
|
-
import type { ExperimentConfig } from 'agent-eval';
|
|
291
|
-
|
|
292
|
-
const config: ExperimentConfig = {
|
|
293
|
-
// Required: which agent and authentication to use
|
|
294
|
-
agent: 'vercel-ai-gateway/claude-code',
|
|
295
|
-
|
|
296
|
-
// Model to use (defaults vary by agent)
|
|
297
|
-
// - claude-code: 'opus'
|
|
298
|
-
// - codex: 'openai/gpt-5.2-codex'
|
|
299
|
-
// - opencode: 'vercel/anthropic/claude-sonnet-4' (note: vercel/ prefix required)
|
|
300
|
-
// - ai-sdk-harness: 'anthropic/claude-sonnet-4' (works with any AI Gateway model)
|
|
301
|
-
model: 'opus',
|
|
302
|
-
|
|
303
|
-
// How many times to run each eval
|
|
304
|
-
runs: 10,
|
|
305
|
-
|
|
306
|
-
// Stop after first success? (false for reliability measurement)
|
|
307
|
-
earlyExit: false,
|
|
308
|
-
|
|
309
|
-
// npm scripts that must pass after agent finishes
|
|
310
|
-
scripts: ['build', 'lint'],
|
|
311
|
-
|
|
312
|
-
// Timeout per run in seconds (default: 600)
|
|
313
|
-
timeout: 600,
|
|
314
|
-
|
|
315
|
-
// Filter which evals to run (pick one)
|
|
316
|
-
evals: '*', // all (default)
|
|
317
|
-
// evals: ['specific-eval'], // by name
|
|
318
|
-
// evals: (name) => name.startsWith('api-'), // by function
|
|
319
|
-
|
|
320
|
-
// Setup function for environment configuration
|
|
321
|
-
setup: async (sandbox) => {
|
|
322
|
-
await sandbox.writeFiles({ '.env': 'API_KEY=test' });
|
|
323
|
-
await sandbox.runCommand('npm', ['run', 'setup']);
|
|
324
|
-
},
|
|
325
|
-
};
|
|
326
|
-
|
|
327
|
-
export default config;
|
|
328
|
-
```
|
|
329
|
-
|
|
330
|
-
## CLI Commands
|
|
331
|
-
|
|
332
|
-
### `init <name>`
|
|
333
|
-
|
|
334
|
-
Create a new eval project:
|
|
335
|
-
```bash
|
|
336
|
-
npx @vercel/agent-eval init my-evals
|
|
337
|
-
```
|
|
338
|
-
|
|
339
|
-
### `<experiment>`
|
|
340
|
-
|
|
341
|
-
Run an experiment:
|
|
342
|
-
```bash
|
|
343
|
-
npx @vercel/agent-eval cc
|
|
344
|
-
```
|
|
345
|
-
|
|
346
|
-
**Dry run** - preview without executing (no API calls, no cost):
|
|
347
|
-
```bash
|
|
348
|
-
npx @vercel/agent-eval cc --dry
|
|
349
|
-
|
|
350
|
-
# Output:
|
|
351
|
-
# Found 5 valid fixture(s), will run 5:
|
|
352
|
-
# - create-button
|
|
353
|
-
# - add-routing
|
|
354
|
-
# - setup-state
|
|
355
|
-
# - ...
|
|
356
|
-
# Running 5 eval(s) x 10 run(s) = 50 total runs
|
|
357
|
-
# Agent: claude-code, Model: opus, Timeout: 600s
|
|
358
|
-
# [DRY RUN] Would execute evals here
|
|
359
|
-
```
|
|
360
|
-
|
|
361
|
-
## Results
|
|
362
|
-
|
|
363
|
-
Results are saved to `results/<experiment>/<timestamp>/`:
|
|
364
|
-
|
|
365
|
-
```
|
|
366
|
-
results/
|
|
367
|
-
with-mcp/
|
|
368
|
-
2026-01-27T10-30-00Z/
|
|
369
|
-
experiment.json # Config and summary
|
|
370
|
-
create-button/
|
|
371
|
-
summary.json # { totalRuns: 10, passedRuns: 9, passRate: "90%" }
|
|
372
|
-
run-1/
|
|
373
|
-
result.json # Individual run result
|
|
374
|
-
transcript.jsonl # Agent conversation
|
|
375
|
-
outputs/ # Test/script output
|
|
376
|
-
```
|
|
377
|
-
|
|
378
|
-
### Analyzing Results
|
|
379
|
-
|
|
380
|
-
```bash
|
|
381
|
-
# Quick comparison
|
|
382
|
-
cat results/control/*/experiment.json | jq '.evals[] | {name, passRate}'
|
|
383
|
-
cat results/with-mcp/*/experiment.json | jq '.evals[] | {name, passRate}'
|
|
384
|
-
```
|
|
385
|
-
|
|
386
|
-
| Pass Rate | Interpretation |
|
|
387
|
-
|-----------|----------------|
|
|
388
|
-
| 90-100% | Agent handles this reliably |
|
|
389
|
-
| 70-89% | Usually works, room for improvement |
|
|
390
|
-
| 50-69% | Unreliable, needs investigation |
|
|
391
|
-
| < 50% | Task too hard or prompt needs work |
|
|
392
|
-
|
|
393
|
-
## Environment Variables
|
|
394
|
-
|
|
395
|
-
Every run requires **two things**: an API key for the agent and a token for the Vercel sandbox. The exact variables depend on which authentication mode you use.
|
|
396
|
-
|
|
397
|
-
| Variable | Required when | Description |
|
|
398
|
-
|---|---|---|
|
|
399
|
-
| `AI_GATEWAY_API_KEY` | `agent: 'vercel-ai-gateway/...'` | Vercel AI Gateway key — works for all agents (claude-code, codex, opencode, ai-sdk-harness) |
|
|
400
|
-
| `ANTHROPIC_API_KEY` | `agent: 'claude-code'` | Direct Anthropic API key (`sk-ant-...`) |
|
|
401
|
-
| `OPENAI_API_KEY` | `agent: 'codex'` | Direct OpenAI API key (`sk-proj-...`) |
|
|
402
|
-
| `VERCEL_TOKEN` | Always (pick one) | Vercel personal access token — for local dev |
|
|
403
|
-
| `VERCEL_OIDC_TOKEN` | Always (pick one) | Vercel OIDC token — for CI/CD pipelines |
|
|
404
|
-
|
|
405
|
-
> **Note:** OpenCode only supports Vercel AI Gateway (`vercel-ai-gateway/opencode`). There is no direct API option for OpenCode.
|
|
406
|
-
|
|
407
|
-
> You always need **one agent key** + **one sandbox token**.
|
|
408
|
-
|
|
409
|
-
### Vercel AI Gateway (Recommended)
|
|
410
|
-
|
|
411
|
-
Use `vercel-ai-gateway/` prefixed agents. One key for all models.
|
|
412
|
-
|
|
413
|
-
```bash
|
|
414
|
-
# Agent access — get yours at https://vercel.com/dashboard -> AI Gateway
|
|
415
|
-
AI_GATEWAY_API_KEY=your-ai-gateway-api-key
|
|
416
|
-
|
|
417
|
-
# Sandbox access — create at https://vercel.com/account/tokens
|
|
418
|
-
VERCEL_TOKEN=your-vercel-token
|
|
419
|
-
# OR for CI/CD:
|
|
420
|
-
# VERCEL_OIDC_TOKEN=your-oidc-token
|
|
421
|
-
```
|
|
422
|
-
|
|
423
|
-
### Direct API Keys (Alternative)
|
|
424
|
-
|
|
425
|
-
Remove the `vercel-ai-gateway/` prefix and use provider keys directly:
|
|
426
|
-
|
|
427
|
-
```bash
|
|
428
|
-
# For agent: 'claude-code'
|
|
429
|
-
ANTHROPIC_API_KEY=sk-ant-...
|
|
430
|
-
|
|
431
|
-
# For agent: 'codex'
|
|
432
|
-
OPENAI_API_KEY=sk-proj-...
|
|
433
|
-
|
|
434
|
-
# Sandbox access is still required
|
|
435
|
-
VERCEL_TOKEN=your-vercel-token
|
|
436
|
-
```
|
|
437
|
-
|
|
438
|
-
### `.env` Setup
|
|
439
|
-
|
|
440
|
-
The `init` command generates a `.env.example` file. Copy it and fill in your keys:
|
|
441
|
-
|
|
442
|
-
```bash
|
|
443
|
-
cp .env.example .env
|
|
444
|
-
```
|
|
445
|
-
|
|
446
|
-
The framework loads `.env` automatically via [dotenv](https://github.com/motdotla/dotenv).
|
|
447
|
-
|
|
448
|
-
### Vercel Employees
|
|
449
|
-
|
|
450
|
-
**To get the environment variables, link to `vercel-labs/agent-eval` on Vercel:**
|
|
451
|
-
|
|
452
|
-
```bash
|
|
453
|
-
# Link to the vercel-labs/agent-eval project
|
|
454
|
-
vc link vercel-labs/agent-eval
|
|
455
|
-
|
|
456
|
-
# Pull environment variables
|
|
457
|
-
vc env pull
|
|
458
|
-
```
|
|
459
|
-
|
|
460
|
-
This writes a `.env.local` file with all the required environment variables (AI_GATEWAY_API_KEY, ANTHROPIC_API_KEY, OPENAI_API_KEY, VERCEL_OIDC_TOKEN) — no manual key setup needed. The framework automatically loads from both `.env` and `.env.local`.
|
|
461
|
-
|
|
462
|
-
## Tips
|
|
463
|
-
|
|
464
|
-
**Start with `--dry`**: Always preview before running to verify your config and avoid unexpected costs.
|
|
465
|
-
|
|
466
|
-
**Use multiple runs**: Single runs don't tell you reliability. Use `runs: 10` and `earlyExit: false` for meaningful data.
|
|
467
|
-
|
|
468
|
-
**Isolate variables**: Change one thing at a time between experiments. Don't compare "Opus with MCP" to "Haiku without MCP".
|
|
469
|
-
|
|
470
|
-
**Test incrementally**: Start with simple tasks, add complexity as you learn what works.
|
|
471
|
-
|
|
472
|
-
## License
|
|
473
|
-
|
|
474
|
-
MIT
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* AI SDK Agent - A simple coding agent using the Vercel AI SDK.
|
|
3
|
-
* Works with any model available on Vercel AI Gateway.
|
|
4
|
-
*/
|
|
5
|
-
import type { Agent } from './types.js';
|
|
6
|
-
/**
|
|
7
|
-
* Create AI SDK agent with Vercel AI Gateway authentication.
|
|
8
|
-
*/
|
|
9
|
-
export declare function createAiSdkAgent(): Agent;
|
|
10
|
-
//# sourceMappingURL=ai-sdk-agent.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"ai-sdk-agent.d.ts","sourceRoot":"","sources":["../../../src/lib/agents/ai-sdk-agent.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,KAAK,EAAmC,MAAM,YAAY,CAAC;AA8QzE;;GAEG;AACH,wBAAgB,gBAAgB,IAAI,KAAK,CAiMxC"}
|