@vercel/agent-eval 0.0.11 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +29 -0
- package/dist/cli.js.map +1 -1
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js +17 -3
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/runner.d.ts +1 -0
- package/dist/lib/runner.d.ts.map +1 -1
- package/dist/lib/runner.js +3 -2
- package/dist/lib/runner.js.map +1 -1
- package/dist/lib/types.d.ts +4 -0
- package/dist/lib/types.d.ts.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/package.json +3 -6
- package/README.md +0 -454
package/dist/cli.js
CHANGED
|
@@ -14,6 +14,7 @@ import { runExperiment } from './lib/runner.js';
|
|
|
14
14
|
import { initProject, getPostInitInstructions } from './lib/init.js';
|
|
15
15
|
import { getAgent } from './lib/agents/index.js';
|
|
16
16
|
import { getSandboxBackendInfo } from './lib/sandbox.js';
|
|
17
|
+
import { spawnSync } from 'child_process';
|
|
17
18
|
// Load environment variables (.env.local first, then .env as fallback)
|
|
18
19
|
dotenvConfig({ path: '.env.local' });
|
|
19
20
|
dotenvConfig();
|
|
@@ -184,6 +185,34 @@ program
|
|
|
184
185
|
process.exit(1);
|
|
185
186
|
}
|
|
186
187
|
});
|
|
188
|
+
/**
|
|
189
|
+
* playground command - Launch the web-based results viewer
|
|
190
|
+
* Spawns @vercel/agent-eval-playground (downloaded on-demand via npx if not installed)
|
|
191
|
+
*/
|
|
192
|
+
program
|
|
193
|
+
.command('playground')
|
|
194
|
+
.description('Launch the web-based playground for browsing experiment results')
|
|
195
|
+
.option('--port <port>', 'HTTP server port', '3000')
|
|
196
|
+
.option('--results-dir <dir>', 'Path to results directory', './results')
|
|
197
|
+
.option('--evals-dir <dir>', 'Path to evals directory', './evals')
|
|
198
|
+
.option('--watch', 'Enable live mode — watch results directory for changes')
|
|
199
|
+
.action(async (options) => {
|
|
200
|
+
const resultsDir = resolve(process.cwd(), options.resultsDir);
|
|
201
|
+
const evalsDir = resolve(process.cwd(), options.evalsDir);
|
|
202
|
+
console.log(chalk.blue('Starting Agent Eval Playground...'));
|
|
203
|
+
// Build args for the playground CLI
|
|
204
|
+
const playgroundArgs = [
|
|
205
|
+
'--results-dir', resultsDir,
|
|
206
|
+
'--evals-dir', evalsDir,
|
|
207
|
+
'--port', options.port,
|
|
208
|
+
];
|
|
209
|
+
if (options.watch) {
|
|
210
|
+
playgroundArgs.push('--watch');
|
|
211
|
+
}
|
|
212
|
+
// Run the playground through npx, which uses an installed copy if present and otherwise downloads it on demand
|
|
213
|
+
const result = spawnSync('npx', ['@vercel/agent-eval-playground', ...playgroundArgs], { stdio: 'inherit', cwd: process.cwd() });
|
|
214
|
+
process.exit(result.status ?? 1);
|
|
215
|
+
});
|
|
187
216
|
/**
|
|
188
217
|
* Default command - run experiment (no subcommand needed)
|
|
189
218
|
* Usage: agent-eval cc --dry
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;GAEG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,MAAM,IAAI,YAAY,EAAE,MAAM,QAAQ,CAAC;AAChD,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,MAAM,CAAC;AAClD,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAC9C,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AACpC,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AAC/D,OAAO,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAChD,OAAO,EAAE,WAAW,EAAE,uBAAuB,EAAE,MAAM,eAAe,CAAC;AACrE,OAAO,EAAE,QAAQ,EAAE,MAAM,uBAAuB,CAAC;AACjD,OAAO,EAAE,qBAAqB,EAAE,MAAM,kBAAkB,CAAC;
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;GAEG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,MAAM,IAAI,YAAY,EAAE,MAAM,QAAQ,CAAC;AAChD,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,MAAM,CAAC;AAClD,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAC9C,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AACpC,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AAC/D,OAAO,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAChD,OAAO,EAAE,WAAW,EAAE,uBAAuB,EAAE,MAAM,eAAe,CAAC;AACrE,OAAO,EAAE,QAAQ,EAAE,MAAM,uBAAuB,CAAC;AACjD,OAAO,EAAE,qBAAqB,EAAE,MAAM,kBAAkB,CAAC;AACzD,OAAO,EAAE,SAAS,EAAE,MAAM,eAAe,CAAC;AAE1C,uEAAuE;AACvE,YAAY,CAAC,EAAE,IAAI,EAAE,YAAY,EAAE,CAAC,CAAC;AACrC,YAAY,EAAE,CAAC;AAEf,iCAAiC;AACjC,MAAM,SAAS,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAC1D,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,OAAO,CAAC,SAAS,EAAE,iBAAiB,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;AAErF,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,YAAY,CAAC;KAClB,WAAW,CAAC,8DAA8D,CAAC;KAC3E,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;AAExB;;;;GAIG;AACH,SAAS,iBAAiB,CAAC,KAAa;IACtC,6DAA6D;IAC7D,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAClG,OAAO,KAAK,CAAC;IACf,CAAC;IACD,6DAA6D;IAC7D,OAAO,eAAe,KAAK,KAAK,CAAC;AACnC,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,oBAAoB,CAAC,WAAmB,EAAE,OAA0B;IACjF,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,iBAAiB,CAAC,WAAW,CAAC,CAAC;QAClD,MAAM,kBAAkB,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,UAAU,CAAC,CAAC;QAE9D,IAAI,CAAC,UAAU,CAAC,kBAAkB,CAAC,EAAE,CAAC;YACpC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,0BAA0B,kBAAkB,EAAE,CAAC,CAAC,CAAC;YACzE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,uBAAuB,UAAU,KAAK,CAAC,CAAC,CAAC;QAChE,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,kBAAkB,CAAC,CAAC;QAEpD,mDAAmD;QACnD,kEAAkE;QAClE,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC
,kBAAkB,CAAC,CAAC,CAAC;QACxD,MAAM,QAAQ,GAAG,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC1B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,8BAA8B,QAAQ,EAAE,CAAC,CAAC,CAAC;YACnE,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,yDAAyD,CAAC,CAAC,CAAC;YACrF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,wBAAwB,QAAQ,KAAK,CAAC,CAAC,CAAC;QAC/D,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;QAEvD,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,cAAc,MAAM,CAAC,MAAM,sBAAsB,CAAC,CAAC,CAAC;YAC7E,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBAC3B,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,KAAK,CAAC,WAAW,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YAC1E,CAAC;QACH,CAAC;QAED,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,8BAA8B,CAAC,CAAC,CAAC;YACzD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,6BAA6B;QAC7B,MAAM,cAAc,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QACnD,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,KAAK,EAAE,cAAc,CAAC,CAAC;QAEjE,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAC,CAAC;YACxD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,WAAW,QAAQ,CAAC,MAAM,+BAA+B,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACvG,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;YAC7B,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,CAAC;QAC1C,CAAC;QAEJ,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAExE,2BAA2B;QAC3B,MAAM,iBAAiB,GAAG,SAAS,CAAC,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC;QACzD,MAAM,SAAS,GAAG,iBAAiB,GAAG,MAAM,CAAC,MAAM,CAAC;QAEpD,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,aAAa,SAAS,CAAC,MAAM,cAAc,MAAM,CAAC,IAAI,aAAa,MAAM,CAAC,MAAM,eAAe,SAAS,aAAa,CAAC,CAAC,CAAC;YAC/I,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,UA
AU,MAAM,CAAC,KAAK,aAAa,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,cAAc,MAAM,CAAC,OAAO,kBAAkB,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;QAChJ,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,aAAa,SAAS,CAAC,MAAM,cAAc,MAAM,CAAC,IAAI,aAAa,SAAS,aAAa,CAAC,CAAC,CAAC;YACnH,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,MAAM,CAAC,KAAK,YAAY,MAAM,CAAC,CAAC,CAAC,cAAc,MAAM,CAAC,OAAO,kBAAkB,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;QACvI,CAAC;QAED,0CAA0C;QAC1C,MAAM,WAAW,GAAG,qBAAqB,CAAC,EAAE,OAAO,EAAE,MAAM,CAAC,OAAO,EAAE,CAAC,CAAC;QACvE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,YAAY,WAAW,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;QAE/D,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;YAChB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,sCAAsC,CAAC,CAAC,CAAC;YAClE,OAAO;QACT,CAAC;QAED,8CAA8C;QAC9C,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACrC,MAAM,YAAY,GAAG,KAAK,CAAC,eAAe,EAAE,CAAC;QAC7C,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;QACzC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,YAAY,mCAAmC,CAAC,CAAC,CAAC;YAC7E,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,iEAAiE,CAAC,CAAC,CAAC;YAC7F,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,kDAAkD;QAClD,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;QAE5E,uCAAuC;QACvC,MAAM,kBAAkB,GAAG,QAAQ,CAAC,UAAU,EAAE,KAAK,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QAC5E,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CAAC,CAAC;QAErD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC,CAAC;QAEpD,iCAAiC;QACjC,IAAI,SAAS,GAAG,IAAI,CAAC;QACrB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,0CAA0C;YAC1C,MAAM,WAAW,GAAG,EAAE,GAAG,MAAM,EAAE,KAAK,EAAE,CAAC;YAEzC,sEAAsE;YACtE,MAAM,cAAc,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC;gBACtC,CAAC,CAAC,GAAG,kBAAkB,IAAI,KAAK,EAAE;gBAClC,CAAC,CAAC,kBAAkB,CAAC;YAEvB,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACtB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,6BAA6B,KAAK,MAAM,CAAC,CAAC,CAAC;YACpE,CAAC;YAED,qBAAqB;YACrB,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC;gBAClC,MAAM,EAAE,WAAW;gBACnB,QAAQ,EAAE,gB
AAgB;gBAC1B,MAAM;gBACN,UAAU;gBACV,cAAc;gBACd,UAAU,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;aACtC,CAAC,CAAC;YAEH,kCAAkC;YAClC,MAAM,gBAAgB,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,SAAS,CAAC,CAAC;YAClF,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACtB,SAAS,GAAG,KAAK,CAAC;YACpB,CAAC;QACH,CAAC;QAED,6BAA6B;QAC7B,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAClC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACtD,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,2BAA2B,CAAC,CAAC,CAAC;QACxD,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC;AAED;;GAEG;AACH,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,QAAQ,CAAC,QAAQ,EAAE,+BAA+B,CAAC;KACnD,WAAW,CAAC,iDAAiD,CAAC;KAC9D,MAAM,CAAC,KAAK,EAAE,IAAY,EAAE,EAAE;IAC7B,IAAI,CAAC;QACH,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,8BAA8B,IAAI,EAAE,CAAC,CAAC,CAAC;QAE9D,MAAM,UAAU,GAAG,WAAW,CAAC;YAC7B,IAAI;YACJ,SAAS,EAAE,OAAO,CAAC,GAAG,EAAE;SACzB,CAAC,CAAC;QAEH,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC,CAAC;IACzD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACtD,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,2BAA2B,CAAC,CAAC,CAAC;QACxD,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL;;;GAGG;AACH,OAAO;KACJ,OAAO,CAAC,YAAY,CAAC;KACrB,WAAW,CAAC,iEAAiE,CAAC;KAC9E,MAAM,CAAC,eAAe,EAAE,kBAAkB,EAAE,MAAM,CAAC;KACnD,MAAM,CAAC,qBAAqB,EAAE,2BAA2B,EAAE,WAAW,CAAC;KACvE,MAAM,CAAC,mBAAmB,EAAE,yBAAyB,EAAE,SAAS,CAAC;KACjE,MAAM,CAAC,SAAS,EAAE,wDAAwD,CAAC;KAC3E,MAAM,CAAC,KAAK,EAAE,OAAgF,EAAE,EAAE;IACjG,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,OAAO,CAAC,UAAU,CAAC,CAAC;IAC9D,MAAM,QAAQ,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,OAAO,CAA
C,QAAQ,CAAC,CAAC;IAE1D,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,mCAAmC,CAAC,CAAC,CAAC;IAE7D,oCAAoC;IACpC,MAAM,cAAc,GAAG;QACrB,eAAe,EAAE,UAAU;QAC3B,aAAa,EAAE,QAAQ;QACvB,QAAQ,EAAE,OAAO,CAAC,IAAI;KACvB,CAAC;IACF,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QAClB,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACjC,CAAC;IAED,+DAA+D;IAC/D,MAAM,MAAM,GAAG,SAAS,CACtB,KAAK,EACL,CAAC,+BAA+B,EAAE,GAAG,cAAc,CAAC,EACpD,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE,EAAE,CACzC,CAAC;IAEF,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC;AACnC,CAAC,CAAC,CAAC;AAEL;;;GAGG;AACH,OAAO;KACJ,QAAQ,CAAC,UAAU,EAAE,sCAAsC,CAAC;KAC5D,MAAM,CAAC,OAAO,EAAE,0CAA0C,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,WAA+B,EAAE,OAA0B,EAAE,EAAE;IAC5E,IAAI,CAAC,WAAW,EAAE,CAAC;QACjB,OAAO,CAAC,IAAI,EAAE,CAAC;QACf,OAAO;IACT,CAAC;IACD,MAAM,oBAAoB,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;AACnD,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
package/dist/lib/config.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../../src/lib/config.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,KAAK,EACV,gBAAgB,EAChB,wBAAwB,EACxB,UAAU,EACX,MAAM,YAAY,CAAC;AAGpB;;GAEG;AACH,eAAO,MAAM,eAAe;;;;;aAKX,MAAM,EAAE;;;CAGxB,CAAC;
|
|
1
|
+
{"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../../src/lib/config.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,KAAK,EACV,gBAAgB,EAChB,wBAAwB,EACxB,UAAU,EACX,MAAM,YAAY,CAAC;AAGpB;;GAEG;AACH,eAAO,MAAM,eAAe;;;;;aAKX,MAAM,EAAE;;;CAGxB,CAAC;AA0BF;;;GAGG;AACH,wBAAgB,cAAc,CAAC,MAAM,EAAE,OAAO,GAAG,gBAAgB,CAWhE;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,MAAM,EAAE,gBAAgB,GAAG,wBAAwB,CAmBhF;AAED;;;GAGG;AACH,wBAAsB,UAAU,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,wBAAwB,CAAC,CA8BtF;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,EACtC,cAAc,EAAE,MAAM,EAAE,GACvB,MAAM,EAAE,CAyBV"}
|
package/dist/lib/config.js
CHANGED
|
@@ -36,6 +36,7 @@ const experimentConfigSchema = z.object({
|
|
|
36
36
|
timeout: z.number().positive().optional(),
|
|
37
37
|
setup: z.function().optional(),
|
|
38
38
|
sandbox: z.enum(['vercel', 'docker', 'auto']).optional(),
|
|
39
|
+
editPrompt: z.function().args(z.string()).returns(z.string()).optional(),
|
|
39
40
|
});
|
|
40
41
|
/**
|
|
41
42
|
* Validates an experiment configuration object.
|
|
@@ -69,6 +70,7 @@ export function resolveConfig(config) {
|
|
|
69
70
|
timeout: config.timeout ?? CONFIG_DEFAULTS.timeout,
|
|
70
71
|
setup: config.setup,
|
|
71
72
|
sandbox: config.sandbox ?? CONFIG_DEFAULTS.sandbox,
|
|
73
|
+
editPrompt: config.editPrompt,
|
|
72
74
|
};
|
|
73
75
|
}
|
|
74
76
|
/**
|
|
@@ -77,9 +79,21 @@ export function resolveConfig(config) {
|
|
|
77
79
|
*/
|
|
78
80
|
export async function loadConfig(configPath) {
|
|
79
81
|
try {
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
82
|
+
let rawConfig;
|
|
83
|
+
// Use jiti for TypeScript files
|
|
84
|
+
if (configPath.endsWith('.ts')) {
|
|
85
|
+
const { createJiti } = await import('jiti');
|
|
86
|
+
const jiti = createJiti(import.meta.url, {
|
|
87
|
+
interopDefault: true,
|
|
88
|
+
moduleCache: false,
|
|
89
|
+
});
|
|
90
|
+
rawConfig = await jiti.import(configPath);
|
|
91
|
+
}
|
|
92
|
+
else {
|
|
93
|
+
// Dynamic import for JavaScript files
|
|
94
|
+
const module = await import(configPath);
|
|
95
|
+
rawConfig = module.default;
|
|
96
|
+
}
|
|
83
97
|
if (!rawConfig) {
|
|
84
98
|
throw new Error(`Config file must have a default export`);
|
|
85
99
|
}
|
package/dist/lib/config.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"config.js","sourceRoot":"","sources":["../../src/lib/config.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAMxB,OAAO,EAAE,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AAE7C;;GAEG;AACH,MAAM,CAAC,MAAM,eAAe,GAAG;IAC7B,KAAK,EAAE,MAAe;IACtB,KAAK,EAAE,GAAY;IACnB,IAAI,EAAE,CAAC;IACP,SAAS,EAAE,IAAI;IACf,OAAO,EAAE,EAAc;IACvB,OAAO,EAAE,GAAG,EAAE,aAAa;IAC3B,OAAO,EAAE,MAAe;CACzB,CAAC;AAEF;;GAEG;AACH,MAAM,sBAAsB,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC;QACZ,+BAA+B;QAC/B,aAAa;QACb,yBAAyB;QACzB,OAAO;QACP,4BAA4B;KAC7B,CAAC;IACF,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;IAC5D,KAAK,EAAE,CAAC;SACL,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;SAC5F,QAAQ,EAAE;IACb,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,EAAE;IAC5C,SAAS,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,EAAE;IACjC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,EAAE;IACzC,KAAK,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,EAAE;IAC9B,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,QAAQ,EAAE;
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../../src/lib/config.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAMxB,OAAO,EAAE,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AAE7C;;GAEG;AACH,MAAM,CAAC,MAAM,eAAe,GAAG;IAC7B,KAAK,EAAE,MAAe;IACtB,KAAK,EAAE,GAAY;IACnB,IAAI,EAAE,CAAC;IACP,SAAS,EAAE,IAAI;IACf,OAAO,EAAE,EAAc;IACvB,OAAO,EAAE,GAAG,EAAE,aAAa;IAC3B,OAAO,EAAE,MAAe;CACzB,CAAC;AAEF;;GAEG;AACH,MAAM,sBAAsB,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC;QACZ,+BAA+B;QAC/B,aAAa;QACb,yBAAyB;QACzB,OAAO;QACP,4BAA4B;KAC7B,CAAC;IACF,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;IAC5D,KAAK,EAAE,CAAC;SACL,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;SAC5F,QAAQ,EAAE;IACb,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,EAAE;IAC5C,SAAS,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,EAAE;IACjC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;IACvC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,EAAE;IACzC,KAAK,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,EAAE;IAC9B,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,QAAQ,EAAE;IACxD,UAAU,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;CACzE,CAAC,CAAC;AAEH;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,MAAe;IAC5C,MAAM,MAAM,GAAG,sBAAsB,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;IAExD,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QACpB,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM;aAC/B,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC;aAC/D,IAAI,CAAC,IAAI,CAAC,CAAC;QACd,MAAM,IAAI,KAAK,CAAC,sCAAsC,MAAM,EAAE,CAAC,CAAC;IAClE,CAAC;IAED,OAAO,MAAM,CAAC,IAAwB,CAAC;AACzC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,aAAa,CAAC,MAAwB;IACpD,wBAAw
B;IACxB,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAErC,gDAAgD;IAChD,MAAM,YAAY,GAAG,MAAM,CAAC,KAAK,IAAI,KAAK,CAAC,eAAe,EAAE,CAAC;IAE7D,OAAO;QACL,KAAK,EAAE,MAAM,CAAC,KAAK;QACnB,KAAK,EAAE,YAAY;QACnB,KAAK,EAAE,MAAM,CAAC,KAAK,IAAI,GAAG;QAC1B,IAAI,EAAE,MAAM,CAAC,IAAI,IAAI,eAAe,CAAC,IAAI;QACzC,SAAS,EAAE,MAAM,CAAC,SAAS,IAAI,eAAe,CAAC,SAAS;QACxD,OAAO,EAAE,MAAM,CAAC,OAAO,IAAI,eAAe,CAAC,OAAO;QAClD,OAAO,EAAE,MAAM,CAAC,OAAO,IAAI,eAAe,CAAC,OAAO;QAClD,KAAK,EAAE,MAAM,CAAC,KAAK;QACnB,OAAO,EAAE,MAAM,CAAC,OAAO,IAAI,eAAe,CAAC,OAAO;QAClD,UAAU,EAAE,MAAM,CAAC,UAAU;KAC9B,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,UAAkB;IACjD,IAAI,CAAC;QACH,IAAI,SAAkB,CAAC;QAEvB,gCAAgC;QAChC,IAAI,UAAU,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;YAC/B,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,CAAC;YAC5C,MAAM,IAAI,GAAG,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,EAAE;gBACvC,cAAc,EAAE,IAAI;gBACpB,WAAW,EAAE,KAAK;aACnB,CAAC,CAAC;YACH,SAAS,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;QAC5C,CAAC;aAAM,CAAC;YACN,sCAAsC;YACtC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,UAAU,CAAC,CAAC;YACxC,SAAS,GAAG,MAAM,CAAC,OAAO,CAAC;QAC7B,CAAC;QAED,IAAI,CAAC,SAAS,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,wCAAwC,CAAC,CAAC;QAC5D,CAAC;QAED,MAAM,MAAM,GAAG,cAAc,CAAC,SAAS,CAAC,CAAC;QACzC,OAAO,aAAa,CAAC,MAAM,CAAC,CAAC;IAC/B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,MAAM,IAAI,KAAK,CAAC,8BAA8B,UAAU,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAChF,CAAC;QACD,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAC9B,MAAsC,EACtC,cAAwB;IAExB,mBAAmB;IACnB,IAAI,OAAO,MAAM,KAAK,QAAQ,EAAE,CAAC;QAC/B,IAAI,MAAM,KAAK,GAAG,EAAE,CAAC;YACnB,OAAO,cAAc,CAAC;QACxB,CAAC;QACD,IAAI,CAAC,cAAc,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YACrC,MAAM,IAAI,KAAK,CAAC,SAAS,MAAM,iCAAiC,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAC/F,CAAC;QACD,OAAO,CAAC,MAAM,CAAC,CAAC;IAClB,CAAC;IAED,sBAAsB;IACtB,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC1B,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,cAAc,CAAC,QAAQ,CAAC,IAAI,CAAC,CAA
C,CAAC;QACxE,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvB,MAAM,IAAI,KAAK,CACb,oBAAoB,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,sBAAsB,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CACxF,CAAC;QACJ,CAAC;QACD,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,kBAAkB;IAClB,OAAO,cAAc,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;AACvC,CAAC"}
|
package/dist/lib/runner.d.ts
CHANGED
|
@@ -39,6 +39,7 @@ export declare function runSingleEval<T extends ResolvedExperimentConfig['model'
|
|
|
39
39
|
setup?: ResolvedExperimentConfig['setup'];
|
|
40
40
|
scripts?: string[];
|
|
41
41
|
sandbox?: ResolvedExperimentConfig['sandbox'];
|
|
42
|
+
editPrompt?: (prompt: string) => string;
|
|
42
43
|
verbose?: boolean;
|
|
43
44
|
}): Promise<T extends Array<unknown> ? EvalRunData[] : EvalRunData>;
|
|
44
45
|
//# sourceMappingURL=runner.d.ts.map
|
package/dist/lib/runner.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/lib/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EACV,wBAAwB,EACxB,WAAW,EACX,WAAW,EAEX,iBAAiB,EACjB,wBAAwB,EACzB,MAAM,YAAY,CAAC;AAYpB;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,wCAAwC;IACxC,MAAM,EAAE,wBAAwB,CAAC;IACjC,sBAAsB;IACtB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,gCAAgC;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,sBAAsB;IACtB,cAAc,EAAE,MAAM,CAAC;IACvB,oCAAoC;IACpC,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;IACvC,qCAAqC;IACrC,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAoBD;;;GAGG;AACH,wBAAsB,aAAa,CACjC,OAAO,EAAE,oBAAoB,GAC5B,OAAO,CAAC,iBAAiB,CAAC,CA+K5B;AAED;;GAEG;AACH,wBAAsB,aAAa,CAAC,CAAC,SAAS,wBAAwB,CAAC,OAAO,CAAC,EAC7E,OAAO,EAAE,WAAW,EACpB,OAAO,EAAE;IACP,KAAK,CAAC,EAAE,wBAAwB,CAAC,OAAO,CAAC,CAAC;IAC1C,KAAK,EAAE,CAAC,CAAC;IACT,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,wBAAwB,CAAC,OAAO,CAAC,CAAC;IAC1C,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,OAAO,CAAC,EAAE,wBAAwB,CAAC,SAAS,CAAC,CAAC;IAC9C,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB,GACA,OAAO,CAAC,CAAC,SAAS,KAAK,CAAC,OAAO,CAAC,GAAG,WAAW,EAAE,GAAG,WAAW,CAAC,
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/lib/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EACV,wBAAwB,EACxB,WAAW,EACX,WAAW,EAEX,iBAAiB,EACjB,wBAAwB,EACzB,MAAM,YAAY,CAAC;AAYpB;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,wCAAwC;IACxC,MAAM,EAAE,wBAAwB,CAAC;IACjC,sBAAsB;IACtB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,gCAAgC;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,sBAAsB;IACtB,cAAc,EAAE,MAAM,CAAC;IACvB,oCAAoC;IACpC,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;IACvC,qCAAqC;IACrC,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAoBD;;;GAGG;AACH,wBAAsB,aAAa,CACjC,OAAO,EAAE,oBAAoB,GAC5B,OAAO,CAAC,iBAAiB,CAAC,CA+K5B;AAED;;GAEG;AACH,wBAAsB,aAAa,CAAC,CAAC,SAAS,wBAAwB,CAAC,OAAO,CAAC,EAC7E,OAAO,EAAE,WAAW,EACpB,OAAO,EAAE;IACP,KAAK,CAAC,EAAE,wBAAwB,CAAC,OAAO,CAAC,CAAC;IAC1C,KAAK,EAAE,CAAC,CAAC;IACT,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,wBAAwB,CAAC,OAAO,CAAC,CAAC;IAC1C,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,OAAO,CAAC,EAAE,wBAAwB,CAAC,SAAS,CAAC,CAAC;IAC9C,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,MAAM,CAAC;IACxC,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB,GACA,OAAO,CAAC,CAAC,SAAS,KAAK,CAAC,OAAO,CAAC,GAAG,WAAW,EAAE,GAAG,WAAW,CAAC,CA6BjE"}
|
package/dist/lib/runner.js
CHANGED
|
@@ -62,7 +62,7 @@ export async function runExperiment(options) {
|
|
|
62
62
|
let timeoutId;
|
|
63
63
|
const agentResult = await Promise.race([
|
|
64
64
|
agent.run(fixture.path, {
|
|
65
|
-
prompt: fixture.prompt,
|
|
65
|
+
prompt: config.editPrompt ? config.editPrompt(fixture.prompt) : fixture.prompt,
|
|
66
66
|
model: config.model,
|
|
67
67
|
timeout: timeoutMs,
|
|
68
68
|
apiKey,
|
|
@@ -162,10 +162,11 @@ export async function runExperiment(options) {
|
|
|
162
162
|
export async function runSingleEval(fixture, options) {
|
|
163
163
|
const agent = getAgent(options.agent ?? 'vercel-ai-gateway/claude-code');
|
|
164
164
|
const models = Array.isArray(options.model) ? options.model : [options.model];
|
|
165
|
+
const prompt = options.editPrompt ? options.editPrompt(fixture.prompt) : fixture.prompt;
|
|
165
166
|
const results = [];
|
|
166
167
|
for (const model of models) {
|
|
167
168
|
const agentResult = await agent.run(fixture.path, {
|
|
168
|
-
prompt
|
|
169
|
+
prompt,
|
|
169
170
|
model,
|
|
170
171
|
timeout: options.timeout * 1000,
|
|
171
172
|
apiKey: options.apiKey,
|
package/dist/lib/runner.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/lib/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAUH,OAAO,EAAE,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AAC7C,OAAO,EACL,wBAAwB,EACxB,iBAAiB,EACjB,uBAAuB,EACvB,WAAW,EACX,kBAAkB,EAClB,eAAe,EACf,qBAAqB,GACtB,MAAM,cAAc,CAAC;AAwCtB;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAA6B;IAE7B,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,UAAU,EAAE,cAAc,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC;IAC9F,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC;IAE7B,8BAA8B;IAC9B,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAErC,MAAM,GAAG,GAAG,CAAC,GAAW,EAAE,EAAE;QAC1B,IAAI,UAAU,EAAE,CAAC;YACf,UAAU,CAAC,GAAG,CAAC,CAAC;QAClB,CAAC;aAAM,IAAI,OAAO,EAAE,CAAC;YACnB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACnB,CAAC;IACH,CAAC,CAAC;IAEF,mDAAmD;IACnD,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAA2B,CAAC;IAC5D,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,eAAe,EAAE,CAAC,CAAC;IAC5D,CAAC;IAED,oCAAoC;IACpC,MAAM,QAAQ,GAAkB,EAAE,CAAC;IACnC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,QAAQ,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,CAAC;QAC1C,CAAC;IACH,CAAC;IAED,GAAG,CAAC,YAAY,QAAQ,CAAC,MAAM,gCAAgC,QAAQ,CAAC,MAAM,YAAY,MAAM,CAAC,IAAI,QAAQ,CAAC,CAAC;IAE/G,uBAAuB;IACvB,MAAM,UAAU,GAAG,KAAK,EAAE,OAAoB,EAA0B,EAAE;QACxE,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC;QACtC,MAAM,UAAU,GAAG,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAE,CAAC;QAEvD,2CAA2C;QAC3C,IAAI,UAAU,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;YAC9B,OAAO;gBACL,WAAW,EAAE,OAAO,CAAC,IAAI;gBACzB,QAAQ;gBACR,OAAO,EAAE;oBACP,MAAM,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,EAAE;iBAC5D;gBACD,OAAO,EAAE,IAAI;aACd,CAAC;QACJ,CAAC;QAED,GAAG,CAAC,qBAAqB,CAAC,OAAO,CAAC,IAAI,EAAE,QAAQ,GAAG,CAAC,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;QAEpE,MAAM,SAAS,GAAG,MAAM,CAAC,OAAO,GAAG,IAAI,CAAC;QACxC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE7B,oDAAoD;QACpD,MAAM,iBAAiB,GAAG,IAAI,eAAe,EAAE,CAAC;QAEhD,yDAAyD;QACzD,IAAI,MAAM,
CAAC,SAAS,EAAE,CAAC;YACrB,UAAU,CAAC,MAAM,CAAC,gBAAgB,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,iBAAiB,CAAC,KAAK,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;QAC/F,CAAC;QAED,IAAI,SAAoD,CAAC;QAEzD,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC;YACrC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE;gBACtB,MAAM,EAAE,OAAO,CAAC,MAAM;
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/lib/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAUH,OAAO,EAAE,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AAC7C,OAAO,EACL,wBAAwB,EACxB,iBAAiB,EACjB,uBAAuB,EACvB,WAAW,EACX,kBAAkB,EAClB,eAAe,EACf,qBAAqB,GACtB,MAAM,cAAc,CAAC;AAwCtB;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAA6B;IAE7B,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,UAAU,EAAE,cAAc,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC;IAC9F,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC;IAE7B,8BAA8B;IAC9B,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAErC,MAAM,GAAG,GAAG,CAAC,GAAW,EAAE,EAAE;QAC1B,IAAI,UAAU,EAAE,CAAC;YACf,UAAU,CAAC,GAAG,CAAC,CAAC;QAClB,CAAC;aAAM,IAAI,OAAO,EAAE,CAAC;YACnB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACnB,CAAC;IACH,CAAC,CAAC;IAEF,mDAAmD;IACnD,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAA2B,CAAC;IAC5D,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,eAAe,EAAE,CAAC,CAAC;IAC5D,CAAC;IAED,oCAAoC;IACpC,MAAM,QAAQ,GAAkB,EAAE,CAAC;IACnC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,QAAQ,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,CAAC;QAC1C,CAAC;IACH,CAAC;IAED,GAAG,CAAC,YAAY,QAAQ,CAAC,MAAM,gCAAgC,QAAQ,CAAC,MAAM,YAAY,MAAM,CAAC,IAAI,QAAQ,CAAC,CAAC;IAE/G,uBAAuB;IACvB,MAAM,UAAU,GAAG,KAAK,EAAE,OAAoB,EAA0B,EAAE;QACxE,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC;QACtC,MAAM,UAAU,GAAG,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAE,CAAC;QAEvD,2CAA2C;QAC3C,IAAI,UAAU,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;YAC9B,OAAO;gBACL,WAAW,EAAE,OAAO,CAAC,IAAI;gBACzB,QAAQ;gBACR,OAAO,EAAE;oBACP,MAAM,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,EAAE;iBAC5D;gBACD,OAAO,EAAE,IAAI;aACd,CAAC;QACJ,CAAC;QAED,GAAG,CAAC,qBAAqB,CAAC,OAAO,CAAC,IAAI,EAAE,QAAQ,GAAG,CAAC,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;QAEpE,MAAM,SAAS,GAAG,MAAM,CAAC,OAAO,GAAG,IAAI,CAAC;QACxC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE7B,oDAAoD;QACpD,MAAM,iBAAiB,GAAG,IAAI,eAAe,EAAE,CAAC;QAEhD,yDAAyD;QACzD,IAAI,MAAM,
CAAC,SAAS,EAAE,CAAC;YACrB,UAAU,CAAC,MAAM,CAAC,gBAAgB,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,iBAAiB,CAAC,KAAK,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;QAC/F,CAAC;QAED,IAAI,SAAoD,CAAC;QAEzD,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC;YACrC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE;gBACtB,MAAM,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM;gBAC9E,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,OAAO,EAAE,SAAS;gBAClB,MAAM;gBACN,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,OAAO,EAAE,MAAM,CAAC,OAAO;gBACvB,MAAM,EAAE,iBAAiB,CAAC,MAAM;gBAChC,OAAO,EAAE,MAAM,CAAC,OAAO;aACxB,CAAC;YACF,IAAI,OAAO,CAAQ,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE;gBAC/B,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE;oBAC1B,iBAAiB,CAAC,KAAK,EAAE,CAAC,CAAC,mCAAmC;oBAC9D,MAAM,CAAC,IAAI,KAAK,CAAC,wBAAwB,MAAM,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC;gBAC/D,CAAC,EAAE,SAAS,CAAC,CAAC;YAChB,CAAC,CAAC;SACH,CAAC,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;YACjB,8CAA8C;YAC9C,IAAI,KAAK,YAAY,KAAK,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;gBAClE,OAAO;oBACL,OAAO,EAAE,KAAK;oBACd,MAAM,EAAE,EAAE;oBACV,KAAK,EAAE,KAAK,CAAC,OAAO;oBACpB,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;iBACjC,CAAC;YACJ,CAAC;YACD,MAAM,KAAK,CAAC;QACd,CAAC,CAAC,CAAC;QAEH,kDAAkD;QAClD,IAAI,SAAS;YAAE,YAAY,CAAC,SAAS,CAAC,CAAC;QAEvC,4BAA4B;QAC5B,IAAI,WAAW,CAAC,KAAK,KAAK,SAAS,IAAI,WAAW,CAAC,KAAK,KAAK,sBAAsB,EAAE,CAAC;YACpF,OAAO;gBACL,WAAW,EAAE,OAAO,CAAC,IAAI;gBACzB,QAAQ;gBACR,OAAO,EAAE;oBACP,MAAM,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,QAAQ,EAAE,WAAW,CAAC,QAAQ,GAAG,IAAI,EAAE;iBACtF;gBACD,OAAO,EAAE,IAAI;aACd,CAAC;QACJ,CAAC;QAED,MAAM,OAAO,GAAG,wBAAwB,CAAC,WAAW,CAAC,CAAC;QAEtD,GAAG,CAAC,eAAe,CAAC,OAAO,CAAC,IAAI,EAAE,QAAQ,GAAG,CAAC,EAAE,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC;QAE9E,4EAA4E;QAC5E,IAAI,MAAM,CAAC,SAAS,IAAI,OAAO,CAAC,MAAM,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;YAC3D,GAAG,CAAC,eAAe,OAAO,CAAC,IAAI,kBAAkB,QAAQ,GAAG,CAAC,+BAA+B,CAAC,CAAC;YAC9F,UAAU,CAAC,KAAK,EAAE,CAAC;QACrB,CAAC;QAED,OAAO;YACL,WAAW,EAAE,OAAO,CAAC,IAAI;YACzB,QAAQ;YACR,OAAO;SACR,CAAC;IACJ,CAAC,CA
AC;IAEF,gCAAgC;IAChC,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC;IAE5D,sDAAsD;IACtD,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAA2B,CAAC;IAC5D,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;YACpB,gBAAgB,CAAC,GAAG,CAAC,MAAM,CAAC,WAAW,CAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACzD,CAAC;IACH,CAAC;IAED,6CAA6C;IAC7C,MAAM,aAAa,GAAkB,EAAE,CAAC;IACxC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,MAAM,cAAc,GAAG,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAE,CAAC;QAE3D,wCAAwC;QACxC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;QAEvD,MAAM,WAAW,GAAkB,EAAE,CAAC;QACtC,KAAK,MAAM,MAAM,IAAI,cAAc,EAAE,CAAC;YACpC,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;YAEjC,iDAAiD;YACjD,IAAI,MAAM,CAAC,SAAS,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;gBAClE,MAAM;YACR,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAG,iBAAiB,CAAC,OAAO,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;QAC7D,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC9B,CAAC;IAED,MAAM,WAAW,GAAG,IAAI,IAAI,EAAE,CAAC;IAC/B,MAAM,iBAAiB,GAAG,uBAAuB,CAAC,MAAM,EAAE,aAAa,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;IAEjG,uBAAuB;IACvB,MAAM,SAAS,GAAG,WAAW,CAAC,iBAAiB,EAAE;QAC/C,UAAU;QACV,cAAc;KACf,CAAC,CAAC;IAEH,GAAG,CAAC,uBAAuB,SAAS,EAAE,CAAC,CAAC;IACxC,GAAG,CAAC,kBAAkB,CAAC,iBAAiB,CAAC,CAAC,CAAC;IAE3C,OAAO,iBAAiB,CAAC;AAC3B,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAoB,EACpB,OAUC;IAED,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,KAAK,IAAI,+BAA+B,CAAC,CAAC;IAEzE,MAAM,MAAM,GAAa,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IACxF,MAAM,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC;IAExF,MAAM,OAAO,GAAkB,EAAE,CAAC;IAElC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAE9B,MAAM,WAAW,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE;YACjD,MAAM;YACN,KAAK;YACL,OAAO,
EAAE,OAAO,CAAC,OAAO,GAAG,IAAI;YAC/B,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,OAAO,EAAE,OAAO,CAAC,OAAO;SACxB,CAAC,CAAC;QAEA,OAAO,CAAC,IAAI,CAAC,wBAAwB,CAAC,WAAW,CAAC,CAAC,CAAC;IACtD,CAAC;IAED,gHAAgH;IAChH,IAAG,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;QACpC,OAAO,OAAO,CAAC,CAAC,CAA2D,CAAC;IAC3E,CAAC;IAED,OAAO,OAAiE,CAAC;AAC3E,CAAC"}
|
package/dist/lib/types.d.ts
CHANGED
|
@@ -68,6 +68,8 @@ export interface ExperimentConfig {
|
|
|
68
68
|
setup?: SetupFunction;
|
|
69
69
|
/** Sandbox backend to use. @default 'auto' (Vercel if token present, else Docker) */
|
|
70
70
|
sandbox?: SandboxBackend | 'auto';
|
|
71
|
+
/** Optional function to modify the prompt before running the experiment. @default undefined */
|
|
72
|
+
editPrompt?: (prompt: string) => string;
|
|
71
73
|
}
|
|
72
74
|
/**
|
|
73
75
|
* Resolved experiment config with all defaults applied.
|
|
@@ -82,6 +84,7 @@ export interface ResolvedExperimentConfig {
|
|
|
82
84
|
timeout: number;
|
|
83
85
|
setup?: SetupFunction;
|
|
84
86
|
sandbox: SandboxBackend | 'auto';
|
|
87
|
+
editPrompt?: (prompt: string) => string;
|
|
85
88
|
}
|
|
86
89
|
/**
|
|
87
90
|
* Resolved experiment config with all defaults applied.
|
|
@@ -96,6 +99,7 @@ export interface RunnableExperimentConfig {
|
|
|
96
99
|
timeout: number;
|
|
97
100
|
setup?: SetupFunction;
|
|
98
101
|
sandbox: SandboxBackend | 'auto';
|
|
102
|
+
editPrompt?: (prompt: string) => string;
|
|
99
103
|
}
|
|
100
104
|
/**
|
|
101
105
|
* Required files for a valid eval fixture.
|
package/dist/lib/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,MAAM,SAAS,GACjB,+BAA+B,GAC/B,aAAa,GACb,yBAAyB,GACzB,OAAO,GACP,4BAA4B,CAAC;AAEjC;;;GAGG;AACH,MAAM,MAAM,SAAS,GAAG,MAAM,CAAC;AAE/B;;GAEG;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC;AAEnD;;;GAGG;AACH,MAAM,WAAW,OAAO;IACtB,mCAAmC;IACnC,UAAU,CACR,OAAO,EAAE,MAAM,EACf,IAAI,CAAC,EAAE,MAAM,EAAE,EACf,OAAO,CAAC,EAAE;QAAE,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;KAAE,GACzC,OAAO,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACjE,mCAAmC;IACnC,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACxC,iCAAiC;IACjC,UAAU,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACzD,wCAAwC;IACxC,mBAAmB,IAAI,MAAM,CAAC;CAC/B;AAED;;;GAGG;AACH,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,EAAE,OAAO,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;AAEhE;;GAEG;AACH,MAAM,MAAM,cAAc,GAAG,QAAQ,GAAG,QAAQ,CAAC;AAEjD;;;GAGG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4BAA4B;IAC5B,KAAK,EAAE,SAAS,CAAC;IAEjB;;6FAEyF;IACzF,KAAK,CAAC,EAAE,SAAS,GAAG,SAAS,EAAE,CAAC;IAEhC,+FAA+F;IAC/F,KAAK,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IAEvC,kDAAkD;IAClD,IAAI,CAAC,EAAE,MAAM,CAAC;IAEd,qDAAqD;IACrD,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB,mEAAmE;IACnE,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IAEnB,8EAA8E;IAC9E,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,uEAAuE;IACvE,KAAK,CAAC,EAAE,aAAa,CAAC;IAEtB,qFAAqF;IACrF,OAAO,CAAC,EAAE,cAAc,GAAG,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,MAAM,SAAS,GACjB,+BAA+B,GAC/B,aAAa,GACb,yBAAyB,GACzB,OAAO,GACP,4BAA4B,CAAC;AAEjC;;;GAGG;AACH,MAAM,MAAM,SAAS,GAAG,MAAM,CAAC;AAE/B;;GAEG;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC;AAEnD;;;GAGG;AACH,MAAM,WAAW,OAAO;IACtB,mCAAmC;IACnC,UAAU,CACR,OAAO,EAAE,MAAM,EACf,IAAI,CAAC,EAAE,MAAM,EAAE,EACf,OAAO,CAAC,EAAE;QAAE,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;KAAE,GACzC,OAAO,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACjE,mCAAmC;IACnC,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACxC,iCAAiC;IACjC,UAAU,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACzD,wCAAwC;IACxC,mBAAmB,IAAI,MAAM,CAAC;CAC/B;AAED;;;GAGG;AACH,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,EAAE,OAAO,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;AAEhE;;GAEG;AACH,MAAM,MAAM,cAAc,GAAG,QAAQ,GAAG,QAAQ,CAAC;AAEjD;;;GAGG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4BAA4B;IAC5B,KAAK,EAAE,SAAS,CAAC;IAEjB;;6FAEyF;IACzF,KAAK,CAAC,EAAE,SAAS,GAAG,SAAS,EAAE,CAAC;IAEhC,+FAA+F;IAC/F,KAAK,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IAEvC,kDAAkD;IAClD,IAAI,CAAC,EAAE,MAAM,CAAC;IAEd,qDAAqD;IACrD,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB,mEAAmE;IACnE,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IAEnB,8EAA8E;IAC9E,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,uEAAuE;IACvE,KAAK,CAAC,EAAE,aAAa,CAAC;IAEtB,qFAAqF;IACrF,OAAO,CAAC,EAAE,cAAc,GAAG,MAAM,CAAC;IAElC,+FAA+F;IAC/F,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,MAAM,CAAC;CACzC;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,SAAS,GAAG,SAAS,EAAE,CAAC;IAC/B,KAAK,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IACtC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,aAAa,CAAC;IACtB,OAAO,EAAE,cAAc,GAAG,MAAM,CAAC;IACjC,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,MAAM,CAAC;CACzC;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,KAAK,EAAE,SAAS,CAAC;IACjB
,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IACtC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,aAAa,CAAC;IACtB,OAAO,EAAE,cAAc,GAAG,MAAM,CAAC;IACjC,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,MAAM,CAAC;CACzC;AAED;;;GAGG;AACH,eAAO,MAAM,mBAAmB,mDAAoD,CAAC;AAErF;;;;GAIG;AACH,eAAO,MAAM,cAAc,uEAAwE,CAAC;AAEpG;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,qCAAqC;IACrC,IAAI,EAAE,MAAM,CAAC;IACb,uCAAuC;IACvC,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,gDAAgD;IAChD,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,0BAA0B;IAC1B,MAAM,EAAE,QAAQ,GAAG,QAAQ,CAAC;IAC5B,8BAA8B;IAC9B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,CAAC;IACjB,iEAAiE;IACjE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,8DAA8D;IAC9D,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,wDAAwD;IACxD,WAAW,CAAC,EAAE;QACZ,kCAAkC;QAClC,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,8DAA8D;QAC9D,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,0DAA0D;IAC1D,MAAM,EAAE,aAAa,CAAC;IACtB,yEAAyE;IACzE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,qDAAqD;IACrD,aAAa,CAAC,EAAE;QACd,0BAA0B;QAC1B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,qDAAqD;QACrD,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,2BAA2B;IAC3B,SAAS,EAAE,MAAM,CAAC;IAClB,4BAA4B;IAC5B,UAAU,EAAE,MAAM,CAAC;IACnB,gCAAgC;IAChC,QAAQ,EAAE,MAAM,CAAC;IACjB,oCAAoC;IACpC,YAAY,EAAE,MAAM,CAAC;IACrB,2EAA2E;IAC3E,IAAI,EAAE,WAAW,EAAE,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,wCAAwC;IACxC,SAAS,EAAE,MAAM,CAAC;IAClB,0CAA0C;IAC1C,WAAW,EAAE,MAAM,CAAC;IACpB,oCAAoC;IACpC,MAAM,EAAE,wBAAwB,CAAC;IACjC,4BAA4B;IAC5B,KAAK,EAAE,WAAW,EAAE,CAAC;CACtB"}
|
package/dist/lib/types.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AA2HH;;;GAGG;AACH,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,cAAc,CAAU,CAAC;AAErF;;;;GAIG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,UAAU,EAAE,cAAc,EAAE,MAAM,CAAU,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vercel/agent-eval",
|
|
3
|
-
"version": "0.0.11",
|
|
3
|
+
"version": "0.0.13",
|
|
4
4
|
"description": "Framework for testing AI coding agents in isolated sandboxes",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -24,9 +24,7 @@
|
|
|
24
24
|
"test:integration:docker": "SANDBOX_BACKEND=docker INTEGRATION_TEST=1 vitest run src/integration.test.ts",
|
|
25
25
|
"test:integration:vercel": "SANDBOX_BACKEND=vercel INTEGRATION_TEST=1 vitest run src/integration.test.ts",
|
|
26
26
|
"lint": "eslint src/",
|
|
27
|
-
"
|
|
28
|
-
"version": "changeset version",
|
|
29
|
-
"release": "npm run build && changeset publish",
|
|
27
|
+
"release": "changeset publish",
|
|
30
28
|
"prepublishOnly": "npm run build"
|
|
31
29
|
},
|
|
32
30
|
"dependencies": {
|
|
@@ -38,12 +36,11 @@
|
|
|
38
36
|
"dockerode": "^4.0.9",
|
|
39
37
|
"dotenv": "^16.4.5",
|
|
40
38
|
"glob": "^11.0.0",
|
|
39
|
+
"jiti": "^2.6.1",
|
|
41
40
|
"tar-stream": "^3.1.7",
|
|
42
41
|
"zod": "^3.23.8"
|
|
43
42
|
},
|
|
44
43
|
"devDependencies": {
|
|
45
|
-
"@changesets/changelog-github": "^0.5.2",
|
|
46
|
-
"@changesets/cli": "^2.29.8",
|
|
47
44
|
"@types/dockerode": "^4.0.1",
|
|
48
45
|
"@types/node": "^22.0.0",
|
|
49
46
|
"@types/tar-stream": "^3.1.4",
|
package/README.md
DELETED
|
@@ -1,454 +0,0 @@
|
|
|
1
|
-
# @vercel/agent-eval
|
|
2
|
-
|
|
3
|
-
Test AI coding agents on your framework. Measure what actually works.
|
|
4
|
-
|
|
5
|
-
## Why?
|
|
6
|
-
|
|
7
|
-
You're building a frontend framework and want AI agents to work well with it. But how do you know if:
|
|
8
|
-
- Your documentation helps agents write correct code?
|
|
9
|
-
- Adding an MCP server improves agent success rates?
|
|
10
|
-
- Sonnet performs as well as Opus for your use cases?
|
|
11
|
-
- Your latest API changes broke agent compatibility?
|
|
12
|
-
|
|
13
|
-
**This framework gives you answers.** Run controlled experiments, measure pass rates, compare techniques.
|
|
14
|
-
|
|
15
|
-
## Quick Start
|
|
16
|
-
|
|
17
|
-
```bash
|
|
18
|
-
# Create a new eval project
|
|
19
|
-
npx @vercel/agent-eval init my-framework-evals
|
|
20
|
-
cd my-framework-evals
|
|
21
|
-
|
|
22
|
-
# Install dependencies
|
|
23
|
-
npm install
|
|
24
|
-
|
|
25
|
-
# Add your API keys
|
|
26
|
-
cp .env.example .env
|
|
27
|
-
# Edit .env with your AI_GATEWAY_API_KEY and VERCEL_TOKEN
|
|
28
|
-
|
|
29
|
-
# Preview what will run (no API calls, no cost)
|
|
30
|
-
npx @vercel/agent-eval cc --dry
|
|
31
|
-
|
|
32
|
-
# Run the evals
|
|
33
|
-
npx @vercel/agent-eval cc
|
|
34
|
-
```
|
|
35
|
-
|
|
36
|
-
## A/B Testing AI Techniques
|
|
37
|
-
|
|
38
|
-
The real power is comparing different approaches. Create multiple experiment configs:
|
|
39
|
-
|
|
40
|
-
### Control: Baseline Agent
|
|
41
|
-
|
|
42
|
-
```typescript
|
|
43
|
-
// experiments/control.ts
|
|
44
|
-
import type { ExperimentConfig } from 'agent-eval';
|
|
45
|
-
|
|
46
|
-
const config: ExperimentConfig = {
|
|
47
|
-
agent: 'vercel-ai-gateway/claude-code',
|
|
48
|
-
model: 'opus',
|
|
49
|
-
runs: 10, // Multiple runs for statistical significance
|
|
50
|
-
earlyExit: false, // Run all attempts to measure reliability
|
|
51
|
-
};
|
|
52
|
-
|
|
53
|
-
export default config;
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
### Treatment: Agent with MCP Server
|
|
57
|
-
|
|
58
|
-
```typescript
|
|
59
|
-
// experiments/with-mcp.ts
|
|
60
|
-
import type { ExperimentConfig } from 'agent-eval';
|
|
61
|
-
|
|
62
|
-
const config: ExperimentConfig = {
|
|
63
|
-
agent: 'vercel-ai-gateway/claude-code',
|
|
64
|
-
model: 'opus',
|
|
65
|
-
runs: 10,
|
|
66
|
-
earlyExit: false,
|
|
67
|
-
|
|
68
|
-
setup: async (sandbox) => {
|
|
69
|
-
// Install your framework's MCP server
|
|
70
|
-
await sandbox.runCommand('npm', ['install', '-g', '@myframework/mcp-server']);
|
|
71
|
-
|
|
72
|
-
// Configure Claude to use it
|
|
73
|
-
await sandbox.writeFiles({
|
|
74
|
-
'.claude/settings.json': JSON.stringify({
|
|
75
|
-
mcpServers: {
|
|
76
|
-
myframework: { command: 'myframework-mcp' }
|
|
77
|
-
}
|
|
78
|
-
})
|
|
79
|
-
});
|
|
80
|
-
},
|
|
81
|
-
};
|
|
82
|
-
|
|
83
|
-
export default config;
|
|
84
|
-
```
|
|
85
|
-
|
|
86
|
-
### Run Both & Compare
|
|
87
|
-
|
|
88
|
-
```bash
|
|
89
|
-
# Preview first
|
|
90
|
-
npx @vercel/agent-eval control --dry
|
|
91
|
-
npx @vercel/agent-eval with-mcp --dry
|
|
92
|
-
|
|
93
|
-
# Run experiments
|
|
94
|
-
npx @vercel/agent-eval control
|
|
95
|
-
npx @vercel/agent-eval with-mcp
|
|
96
|
-
```
|
|
97
|
-
|
|
98
|
-
**Compare results:**
|
|
99
|
-
```
|
|
100
|
-
Control (baseline): 7/10 passed (70%)
|
|
101
|
-
With MCP: 9/10 passed (90%)
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
## Creating Evals for Your Framework
|
|
105
|
-
|
|
106
|
-
Each eval tests one specific task an agent should be able to do with your framework.
|
|
107
|
-
|
|
108
|
-
### Example: Testing Component Creation
|
|
109
|
-
|
|
110
|
-
```
|
|
111
|
-
evals/
|
|
112
|
-
create-button-component/
|
|
113
|
-
PROMPT.md # Task for the agent
|
|
114
|
-
EVAL.ts # Tests to verify success (or EVAL.tsx for JSX)
|
|
115
|
-
package.json # Your framework as a dependency
|
|
116
|
-
src/ # Starter code
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
### EVAL.ts vs EVAL.tsx
|
|
120
|
-
|
|
121
|
-
Use **EVAL.tsx** when your tests require JSX syntax (React Testing Library, component testing):
|
|
122
|
-
```typescript
|
|
123
|
-
// EVAL.tsx - use when testing React components
|
|
124
|
-
import { test, expect } from 'vitest';
|
|
125
|
-
import { render, screen } from '@testing-library/react';
|
|
126
|
-
import { Button } from './src/components/Button';
|
|
127
|
-
|
|
128
|
-
test('Button renders with label', () => {
|
|
129
|
-
render(<Button label="Click me" onClick={() => {}} />);
|
|
130
|
-
expect(screen.getByText('Click me')).toBeDefined();
|
|
131
|
-
});
|
|
132
|
-
```
|
|
133
|
-
|
|
134
|
-
Use **EVAL.ts** for tests that don't need JSX:
|
|
135
|
-
```typescript
|
|
136
|
-
// EVAL.ts - use for file checks, build tests, etc.
|
|
137
|
-
import { test, expect } from 'vitest';
|
|
138
|
-
import { existsSync } from 'fs';
|
|
139
|
-
|
|
140
|
-
test('Button component exists', () => {
|
|
141
|
-
expect(existsSync('src/components/Button.tsx')).toBe(true);
|
|
142
|
-
});
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
> **Note:** You only need one eval file per fixture. Choose `.tsx` if any test needs JSX, otherwise use `.ts`.
|
|
146
|
-
|
|
147
|
-
**PROMPT.md** - What you want the agent to do:
|
|
148
|
-
```markdown
|
|
149
|
-
Create a Button component using MyFramework.
|
|
150
|
-
|
|
151
|
-
Requirements:
|
|
152
|
-
- Export a Button component from src/components/Button.tsx
|
|
153
|
-
- Accept `label` and `onClick` props
|
|
154
|
-
- Use the framework's styling system for hover states
|
|
155
|
-
```
|
|
156
|
-
|
|
157
|
-
**EVAL.ts** (or **EVAL.tsx**) - How you verify it worked:
|
|
158
|
-
```typescript
|
|
159
|
-
import { test, expect } from 'vitest';
|
|
160
|
-
import { readFileSync, existsSync } from 'fs';
|
|
161
|
-
import { execSync } from 'child_process';
|
|
162
|
-
|
|
163
|
-
test('Button component exists', () => {
|
|
164
|
-
expect(existsSync('src/components/Button.tsx')).toBe(true);
|
|
165
|
-
});
|
|
166
|
-
|
|
167
|
-
test('has required props', () => {
|
|
168
|
-
const content = readFileSync('src/components/Button.tsx', 'utf-8');
|
|
169
|
-
expect(content).toContain('label');
|
|
170
|
-
expect(content).toContain('onClick');
|
|
171
|
-
});
|
|
172
|
-
|
|
173
|
-
test('project builds', () => {
|
|
174
|
-
execSync('npm run build', { stdio: 'pipe' });
|
|
175
|
-
});
|
|
176
|
-
```
|
|
177
|
-
|
|
178
|
-
**package.json** - Include your framework:
|
|
179
|
-
```json
|
|
180
|
-
{
|
|
181
|
-
"name": "create-button-component",
|
|
182
|
-
"type": "module",
|
|
183
|
-
"scripts": { "build": "tsc" },
|
|
184
|
-
"dependencies": {
|
|
185
|
-
"myframework": "^2.0.0"
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
```
|
|
189
|
-
|
|
190
|
-
## Experiment Ideas
|
|
191
|
-
|
|
192
|
-
| Experiment | Control | Treatment |
|
|
193
|
-
|------------|---------|-----------|
|
|
194
|
-
| MCP impact | No MCP | With MCP server |
|
|
195
|
-
| Model comparison | Haiku | Sonnet / Opus |
|
|
196
|
-
| Documentation | Minimal docs | Rich examples |
|
|
197
|
-
| System prompt | Default | Framework-specific |
|
|
198
|
-
| Tool availability | Read/write only | + custom tools |
|
|
199
|
-
|
|
200
|
-
## Configuration Reference
|
|
201
|
-
|
|
202
|
-
### Agent Selection
|
|
203
|
-
|
|
204
|
-
Choose your agent and authentication method:
|
|
205
|
-
|
|
206
|
-
```typescript
|
|
207
|
-
// Vercel AI Gateway (recommended - unified billing & observability)
|
|
208
|
-
agent: 'vercel-ai-gateway/claude-code' // Claude Code via AI Gateway
|
|
209
|
-
agent: 'vercel-ai-gateway/codex' // OpenAI Codex via AI Gateway
|
|
210
|
-
agent: 'vercel-ai-gateway/opencode' // OpenCode via AI Gateway
|
|
211
|
-
|
|
212
|
-
// Direct API (uses provider keys directly)
|
|
213
|
-
agent: 'claude-code' // requires ANTHROPIC_API_KEY
|
|
214
|
-
agent: 'codex' // requires OPENAI_API_KEY
|
|
215
|
-
```
|
|
216
|
-
|
|
217
|
-
See the Environment Variables section below for setup instructions.
|
|
218
|
-
|
|
219
|
-
### OpenCode Model Configuration
|
|
220
|
-
|
|
221
|
-
OpenCode uses Vercel AI Gateway exclusively. Models **must** be specified with the `vercel/{provider}/{model}` format:
|
|
222
|
-
|
|
223
|
-
```typescript
|
|
224
|
-
// Anthropic models
|
|
225
|
-
model: 'vercel/anthropic/claude-sonnet-4'
|
|
226
|
-
model: 'vercel/anthropic/claude-opus-4'
|
|
227
|
-
|
|
228
|
-
// Minimax models
|
|
229
|
-
model: 'vercel/minimax/minimax-m2.1'
|
|
230
|
-
model: 'vercel/minimax/minimax-m2.1-lightning'
|
|
231
|
-
|
|
232
|
-
// Moonshot AI (Kimi) models
|
|
233
|
-
model: 'vercel/moonshotai/kimi-k2'
|
|
234
|
-
model: 'vercel/moonshotai/kimi-k2-thinking'
|
|
235
|
-
|
|
236
|
-
// OpenAI models
|
|
237
|
-
model: 'vercel/openai/gpt-4o'
|
|
238
|
-
model: 'vercel/openai/o3'
|
|
239
|
-
```
|
|
240
|
-
|
|
241
|
-
> **Important:** The `vercel/` prefix is required. OpenCode's config sets up a `vercel` provider, so the model string must start with `vercel/` to route through Vercel AI Gateway correctly. Using just `anthropic/claude-sonnet-4` (without the `vercel/` prefix) will fail with a "provider not found" error.
|
|
242
|
-
|
|
243
|
-
Under the hood, the agent creates an `opencode.json` config file that configures the Vercel provider:
|
|
244
|
-
|
|
245
|
-
```json
|
|
246
|
-
{
|
|
247
|
-
"provider": {
|
|
248
|
-
"vercel": {
|
|
249
|
-
"options": {
|
|
250
|
-
"apiKey": "{env:AI_GATEWAY_API_KEY}"
|
|
251
|
-
}
|
|
252
|
-
}
|
|
253
|
-
},
|
|
254
|
-
"permission": {
|
|
255
|
-
"write": "allow",
|
|
256
|
-
"edit": "allow",
|
|
257
|
-
"bash": "allow"
|
|
258
|
-
}
|
|
259
|
-
}
|
|
260
|
-
```
|
|
261
|
-
|
|
262
|
-
The agent then runs: `opencode run "<prompt>" --model {provider}/{model} --format json`
|
|
263
|
-
|
|
264
|
-
### Full Configuration
|
|
265
|
-
|
|
266
|
-
```typescript
|
|
267
|
-
import type { ExperimentConfig } from 'agent-eval';
|
|
268
|
-
|
|
269
|
-
const config: ExperimentConfig = {
|
|
270
|
-
// Required: which agent and authentication to use
|
|
271
|
-
agent: 'vercel-ai-gateway/claude-code',
|
|
272
|
-
|
|
273
|
-
// Model to use (defaults vary by agent)
|
|
274
|
-
// - claude-code: 'opus'
|
|
275
|
-
// - codex: 'openai/gpt-5.2-codex'
|
|
276
|
-
// - opencode: 'vercel/anthropic/claude-sonnet-4' (note: vercel/ prefix required)
|
|
277
|
-
model: 'opus',
|
|
278
|
-
|
|
279
|
-
// How many times to run each eval
|
|
280
|
-
runs: 10,
|
|
281
|
-
|
|
282
|
-
// Stop after first success? (false for reliability measurement)
|
|
283
|
-
earlyExit: false,
|
|
284
|
-
|
|
285
|
-
// npm scripts that must pass after agent finishes
|
|
286
|
-
scripts: ['build', 'lint'],
|
|
287
|
-
|
|
288
|
-
// Timeout per run in seconds (default: 600)
|
|
289
|
-
timeout: 600,
|
|
290
|
-
|
|
291
|
-
// Filter which evals to run (pick one)
|
|
292
|
-
evals: '*', // all (default)
|
|
293
|
-
// evals: ['specific-eval'], // by name
|
|
294
|
-
// evals: (name) => name.startsWith('api-'), // by function
|
|
295
|
-
|
|
296
|
-
// Setup function for environment configuration
|
|
297
|
-
setup: async (sandbox) => {
|
|
298
|
-
await sandbox.writeFiles({ '.env': 'API_KEY=test' });
|
|
299
|
-
await sandbox.runCommand('npm', ['run', 'setup']);
|
|
300
|
-
},
|
|
301
|
-
};
|
|
302
|
-
|
|
303
|
-
export default config;
|
|
304
|
-
```
|
|
305
|
-
|
|
306
|
-
## CLI Commands
|
|
307
|
-
|
|
308
|
-
### `init <name>`
|
|
309
|
-
|
|
310
|
-
Create a new eval project:
|
|
311
|
-
```bash
|
|
312
|
-
npx @vercel/agent-eval init my-evals
|
|
313
|
-
```
|
|
314
|
-
|
|
315
|
-
### `<experiment>`
|
|
316
|
-
|
|
317
|
-
Run an experiment:
|
|
318
|
-
```bash
|
|
319
|
-
npx @vercel/agent-eval cc
|
|
320
|
-
```
|
|
321
|
-
|
|
322
|
-
**Dry run** - preview without executing (no API calls, no cost):
|
|
323
|
-
```bash
|
|
324
|
-
npx @vercel/agent-eval cc --dry
|
|
325
|
-
|
|
326
|
-
# Output:
|
|
327
|
-
# Found 5 valid fixture(s), will run 5:
|
|
328
|
-
# - create-button
|
|
329
|
-
# - add-routing
|
|
330
|
-
# - setup-state
|
|
331
|
-
# - ...
|
|
332
|
-
# Running 5 eval(s) x 10 run(s) = 50 total runs
|
|
333
|
-
# Agent: claude-code, Model: opus, Timeout: 300s
|
|
334
|
-
# [DRY RUN] Would execute evals here
|
|
335
|
-
```
|
|
336
|
-
|
|
337
|
-
## Results
|
|
338
|
-
|
|
339
|
-
Results are saved to `results/<experiment>/<timestamp>/`:
|
|
340
|
-
|
|
341
|
-
```
|
|
342
|
-
results/
|
|
343
|
-
with-mcp/
|
|
344
|
-
2026-01-27T10-30-00Z/
|
|
345
|
-
experiment.json # Config and summary
|
|
346
|
-
create-button/
|
|
347
|
-
summary.json # { totalRuns: 10, passedRuns: 9, passRate: "90%" }
|
|
348
|
-
run-1/
|
|
349
|
-
result.json # Individual run result
|
|
350
|
-
transcript.jsonl # Agent conversation
|
|
351
|
-
outputs/ # Test/script output
|
|
352
|
-
```
|
|
353
|
-
|
|
354
|
-
### Analyzing Results
|
|
355
|
-
|
|
356
|
-
```bash
|
|
357
|
-
# Quick comparison
|
|
358
|
-
cat results/control/*/experiment.json | jq '.evals[] | {name, passRate}'
|
|
359
|
-
cat results/with-mcp/*/experiment.json | jq '.evals[] | {name, passRate}'
|
|
360
|
-
```
|
|
361
|
-
|
|
362
|
-
| Pass Rate | Interpretation |
|
|
363
|
-
|-----------|----------------|
|
|
364
|
-
| 90-100% | Agent handles this reliably |
|
|
365
|
-
| 70-89% | Usually works, room for improvement |
|
|
366
|
-
| 50-69% | Unreliable, needs investigation |
|
|
367
|
-
| < 50% | Task too hard or prompt needs work |
|
|
368
|
-
|
|
369
|
-
## Environment Variables
|
|
370
|
-
|
|
371
|
-
Every run requires **two things**: an API key for the agent and a token for the Vercel sandbox. The exact variables depend on which authentication mode you use.
|
|
372
|
-
|
|
373
|
-
| Variable | Required when | Description |
|
|
374
|
-
|---|---|---|
|
|
375
|
-
| `AI_GATEWAY_API_KEY` | `agent: 'vercel-ai-gateway/...'` | Vercel AI Gateway key — works for all agents (claude-code, codex, opencode) |
|
|
376
|
-
| `ANTHROPIC_API_KEY` | `agent: 'claude-code'` | Direct Anthropic API key (`sk-ant-...`) |
|
|
377
|
-
| `OPENAI_API_KEY` | `agent: 'codex'` | Direct OpenAI API key (`sk-proj-...`) |
|
|
378
|
-
| `VERCEL_TOKEN` | Always (pick one) | Vercel personal access token — for local dev |
|
|
379
|
-
| `VERCEL_OIDC_TOKEN` | Always (pick one) | Vercel OIDC token — for CI/CD pipelines |
|
|
380
|
-
|
|
381
|
-
> **Note:** OpenCode only supports Vercel AI Gateway (`vercel-ai-gateway/opencode`). There is no direct API option for OpenCode.
|
|
382
|
-
|
|
383
|
-
> You always need **one agent key** + **one sandbox token**.
|
|
384
|
-
|
|
385
|
-
### Vercel AI Gateway (Recommended)
|
|
386
|
-
|
|
387
|
-
Use `vercel-ai-gateway/` prefixed agents. One key for all models.
|
|
388
|
-
|
|
389
|
-
```bash
|
|
390
|
-
# Agent access — get yours at https://vercel.com/dashboard -> AI Gateway
|
|
391
|
-
AI_GATEWAY_API_KEY=your-ai-gateway-api-key
|
|
392
|
-
|
|
393
|
-
# Sandbox access — create at https://vercel.com/account/tokens
|
|
394
|
-
VERCEL_TOKEN=your-vercel-token
|
|
395
|
-
# OR for CI/CD:
|
|
396
|
-
# VERCEL_OIDC_TOKEN=your-oidc-token
|
|
397
|
-
```
|
|
398
|
-
|
|
399
|
-
### Direct API Keys (Alternative)
|
|
400
|
-
|
|
401
|
-
Remove the `vercel-ai-gateway/` prefix and use provider keys directly:
|
|
402
|
-
|
|
403
|
-
```bash
|
|
404
|
-
# For agent: 'claude-code'
|
|
405
|
-
ANTHROPIC_API_KEY=sk-ant-...
|
|
406
|
-
|
|
407
|
-
# For agent: 'codex'
|
|
408
|
-
OPENAI_API_KEY=sk-proj-...
|
|
409
|
-
|
|
410
|
-
# Sandbox access is still required
|
|
411
|
-
VERCEL_TOKEN=your-vercel-token
|
|
412
|
-
```
|
|
413
|
-
|
|
414
|
-
### `.env` Setup
|
|
415
|
-
|
|
416
|
-
The `init` command generates a `.env.example` file. Copy it and fill in your keys:
|
|
417
|
-
|
|
418
|
-
```bash
|
|
419
|
-
cp .env.example .env
|
|
420
|
-
```
|
|
421
|
-
|
|
422
|
-
The framework loads `.env` automatically via [dotenv](https://github.com/motdotla/dotenv).
|
|
423
|
-
|
|
424
|
-
### Vercel Employees
|
|
425
|
-
|
|
426
|
-
**To get the environment variables, link to `vercel-labs/agent-eval` on Vercel:**
|
|
427
|
-
|
|
428
|
-
```bash
|
|
429
|
-
# Link to the vercel-labs/agent-eval project
|
|
430
|
-
vc link vercel-labs/agent-eval
|
|
431
|
-
|
|
432
|
-
# Pull environment variables
|
|
433
|
-
vc env pull
|
|
434
|
-
```
|
|
435
|
-
|
|
436
|
-
This writes a `.env.local` file with all the required environment variables (AI_GATEWAY_API_KEY, ANTHROPIC_API_KEY, OPENAI_API_KEY, VERCEL_OIDC_TOKEN) — no manual key setup needed. The framework automatically loads from both `.env` and `.env.local`.
|
|
437
|
-
|
|
438
|
-
## Tips
|
|
439
|
-
|
|
440
|
-
**Start with `--dry`**: Always preview before running to verify your config and avoid unexpected costs.
|
|
441
|
-
|
|
442
|
-
**Use multiple runs**: Single runs don't tell you reliability. Use `runs: 10` and `earlyExit: false` for meaningful data.
|
|
443
|
-
|
|
444
|
-
**Isolate variables**: Change one thing at a time between experiments. Don't compare "Opus with MCP" to "Haiku without MCP".
|
|
445
|
-
|
|
446
|
-
**Test incrementally**: Start with simple tasks, add complexity as you learn what works.
|
|
447
|
-
|
|
448
|
-
## Contributing
|
|
449
|
-
|
|
450
|
-
See [CONTRIBUTING.md](CONTRIBUTING.md) for development workflow and release process.
|
|
451
|
-
|
|
452
|
-
## License
|
|
453
|
-
|
|
454
|
-
MIT
|