@vercel/agent-eval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. package/README.md +370 -0
  2. package/dist/cli.d.ts +6 -0
  3. package/dist/cli.d.ts.map +1 -0
  4. package/dist/cli.js +166 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/index.d.ts +21 -0
  7. package/dist/index.d.ts.map +1 -0
  8. package/dist/index.js +17 -0
  9. package/dist/index.js.map +1 -0
  10. package/dist/lib/agents/claude-code.d.ts +12 -0
  11. package/dist/lib/agents/claude-code.d.ts.map +1 -0
  12. package/dist/lib/agents/claude-code.js +203 -0
  13. package/dist/lib/agents/claude-code.js.map +1 -0
  14. package/dist/lib/agents/codex.d.ts +12 -0
  15. package/dist/lib/agents/codex.d.ts.map +1 -0
  16. package/dist/lib/agents/codex.js +247 -0
  17. package/dist/lib/agents/codex.js.map +1 -0
  18. package/dist/lib/agents/index.d.ts +7 -0
  19. package/dist/lib/agents/index.d.ts.map +1 -0
  20. package/dist/lib/agents/index.js +14 -0
  21. package/dist/lib/agents/index.js.map +1 -0
  22. package/dist/lib/agents/registry.d.ts +23 -0
  23. package/dist/lib/agents/registry.d.ts.map +1 -0
  24. package/dist/lib/agents/registry.js +35 -0
  25. package/dist/lib/agents/registry.js.map +1 -0
  26. package/dist/lib/agents/shared.d.ts +47 -0
  27. package/dist/lib/agents/shared.d.ts.map +1 -0
  28. package/dist/lib/agents/shared.js +99 -0
  29. package/dist/lib/agents/shared.js.map +1 -0
  30. package/dist/lib/agents/types.d.ts +69 -0
  31. package/dist/lib/agents/types.d.ts.map +1 -0
  32. package/dist/lib/agents/types.js +5 -0
  33. package/dist/lib/agents/types.js.map +1 -0
  34. package/dist/lib/config.d.ts +34 -0
  35. package/dist/lib/config.d.ts.map +1 -0
  36. package/dist/lib/config.js +117 -0
  37. package/dist/lib/config.js.map +1 -0
  38. package/dist/lib/fixture.d.ts +52 -0
  39. package/dist/lib/fixture.d.ts.map +1 -0
  40. package/dist/lib/fixture.js +175 -0
  41. package/dist/lib/fixture.js.map +1 -0
  42. package/dist/lib/init.d.ts +21 -0
  43. package/dist/lib/init.d.ts.map +1 -0
  44. package/dist/lib/init.js +250 -0
  45. package/dist/lib/init.js.map +1 -0
  46. package/dist/lib/results.d.ts +54 -0
  47. package/dist/lib/results.d.ts.map +1 -0
  48. package/dist/lib/results.js +186 -0
  49. package/dist/lib/results.js.map +1 -0
  50. package/dist/lib/runner.d.ts +43 -0
  51. package/dist/lib/runner.d.ts.map +1 -0
  52. package/dist/lib/runner.js +142 -0
  53. package/dist/lib/runner.js.map +1 -0
  54. package/dist/lib/sandbox.d.ts +117 -0
  55. package/dist/lib/sandbox.d.ts.map +1 -0
  56. package/dist/lib/sandbox.js +248 -0
  57. package/dist/lib/sandbox.js.map +1 -0
  58. package/dist/lib/types.d.ts +166 -0
  59. package/dist/lib/types.d.ts.map +1 -0
  60. package/dist/lib/types.js +14 -0
  61. package/dist/lib/types.js.map +1 -0
  62. package/dist/test-setup.d.ts +2 -0
  63. package/dist/test-setup.d.ts.map +1 -0
  64. package/dist/test-setup.js +6 -0
  65. package/dist/test-setup.js.map +1 -0
  66. package/package.json +58 -0
package/README.md ADDED
@@ -0,0 +1,370 @@
1
+ # @vercel/agent-eval
2
+
3
+ Test AI coding agents on your framework. Measure what actually works.
4
+
5
+ ## Why?
6
+
7
+ You're building a frontend framework and want AI agents to work well with it. But how do you know if:
8
+ - Your documentation helps agents write correct code?
9
+ - Adding an MCP server improves agent success rates?
10
+ - Sonnet performs as well as Opus for your use cases?
11
+ - Your latest API changes broke agent compatibility?
12
+
13
+ **This framework gives you answers.** Run controlled experiments, measure pass rates, compare techniques.
14
+
15
+ ## Quick Start
16
+
17
+ ```bash
18
+ # Create a new eval project
19
+ npx @vercel/agent-eval init my-framework-evals
20
+ cd my-framework-evals
21
+
22
+ # Install dependencies
23
+ npm install
24
+
25
+ # Add your API keys
26
+ cp .env.example .env
27
+ # Edit .env with your AI_GATEWAY_API_KEY and VERCEL_TOKEN
28
+
29
+ # Preview what will run (no API calls, no cost)
30
+ npx @vercel/agent-eval cc --dry
31
+
32
+ # Run the evals
33
+ npx @vercel/agent-eval cc
34
+ ```
35
+
36
+ ## A/B Testing AI Techniques
37
+
38
+ The real power is comparing different approaches. Create multiple experiment configs:
39
+
40
+ ### Control: Baseline Agent
41
+
42
+ ```typescript
43
+ // experiments/control.ts
44
+ import type { ExperimentConfig } from '@vercel/agent-eval';
45
+
46
+ const config: ExperimentConfig = {
47
+ agent: 'vercel-ai-gateway/claude-code',
48
+ model: 'opus',
49
+ runs: 10, // Multiple runs for statistical significance
50
+ earlyExit: false, // Run all attempts to measure reliability
51
+ };
52
+
53
+ export default config;
54
+ ```
55
+
56
+ ### Treatment: Agent with MCP Server
57
+
58
+ ```typescript
59
+ // experiments/with-mcp.ts
60
+ import type { ExperimentConfig } from '@vercel/agent-eval';
61
+
62
+ const config: ExperimentConfig = {
63
+ agent: 'vercel-ai-gateway/claude-code',
64
+ model: 'opus',
65
+ runs: 10,
66
+ earlyExit: false,
67
+
68
+ setup: async (sandbox) => {
69
+ // Install your framework's MCP server
70
+ await sandbox.runCommand('npm', ['install', '-g', '@myframework/mcp-server']);
71
+
72
+ // Configure Claude to use it
73
+ await sandbox.writeFiles({
74
+ '.claude/settings.json': JSON.stringify({
75
+ mcpServers: {
76
+ myframework: { command: 'myframework-mcp' }
77
+ }
78
+ })
79
+ });
80
+ },
81
+ };
82
+
83
+ export default config;
84
+ ```
85
+
86
+ ### Run Both & Compare
87
+
88
+ ```bash
89
+ # Preview first
90
+ npx @vercel/agent-eval control --dry
91
+ npx @vercel/agent-eval with-mcp --dry
92
+
93
+ # Run experiments
94
+ npx @vercel/agent-eval control
95
+ npx @vercel/agent-eval with-mcp
96
+ ```
97
+
98
+ **Compare results:**
99
+ ```
100
+ Control (baseline): 7/10 passed (70%)
101
+ With MCP: 9/10 passed (90%)
102
+ ```
103
+
104
+ ## Creating Evals for Your Framework
105
+
106
+ Each eval tests one specific task an agent should be able to do with your framework.
107
+
108
+ ### Example: Testing Component Creation
109
+
110
+ ```
111
+ evals/
112
+ create-button-component/
113
+ PROMPT.md # Task for the agent
114
+ EVAL.ts # Tests to verify success
115
+ package.json # Your framework as a dependency
116
+ src/ # Starter code
117
+ ```
118
+
119
+ **PROMPT.md** - What you want the agent to do:
120
+ ```markdown
121
+ Create a Button component using MyFramework.
122
+
123
+ Requirements:
124
+ - Export a Button component from src/components/Button.tsx
125
+ - Accept `label` and `onClick` props
126
+ - Use the framework's styling system for hover states
127
+ ```
128
+
129
+ **EVAL.ts** - How you verify it worked:
130
+ ```typescript
131
+ import { test, expect } from 'vitest';
132
+ import { readFileSync, existsSync } from 'fs';
133
+ import { execSync } from 'child_process';
134
+
135
+ test('Button component exists', () => {
136
+ expect(existsSync('src/components/Button.tsx')).toBe(true);
137
+ });
138
+
139
+ test('has required props', () => {
140
+ const content = readFileSync('src/components/Button.tsx', 'utf-8');
141
+ expect(content).toContain('label');
142
+ expect(content).toContain('onClick');
143
+ });
144
+
145
+ test('project builds', () => {
146
+ execSync('npm run build', { stdio: 'pipe' });
147
+ });
148
+ ```
149
+
150
+ **package.json** - Include your framework:
151
+ ```json
152
+ {
153
+ "name": "create-button-component",
154
+ "type": "module",
155
+ "scripts": { "build": "tsc" },
156
+ "dependencies": {
157
+ "myframework": "^2.0.0"
158
+ }
159
+ }
160
+ ```
161
+
162
+ ## Experiment Ideas
163
+
164
+ | Experiment | Control | Treatment |
165
+ |------------|---------|-----------|
166
+ | MCP impact | No MCP | With MCP server |
167
+ | Model comparison | Haiku | Sonnet / Opus |
168
+ | Documentation | Minimal docs | Rich examples |
169
+ | System prompt | Default | Framework-specific |
170
+ | Tool availability | Read/write only | + custom tools |
171
+
172
+ ## Configuration Reference
173
+
174
+ ### Agent Selection
175
+
176
+ Choose your agent and authentication method:
177
+
178
+ ```typescript
179
+ // Vercel AI Gateway (recommended - unified billing & observability)
180
+ agent: 'vercel-ai-gateway/claude-code' // or 'vercel-ai-gateway/codex'
181
+
182
+ // Direct API (uses provider keys directly)
183
+ agent: 'claude-code' // requires ANTHROPIC_API_KEY
184
+ agent: 'codex' // requires OPENAI_API_KEY
185
+ ```
186
+
187
+ See the Environment Variables section below for setup instructions.
188
+
189
+ ### Full Configuration
190
+
191
+ ```typescript
192
+ import type { ExperimentConfig } from '@vercel/agent-eval';
193
+
194
+ const config: ExperimentConfig = {
195
+ // Required: which agent and authentication to use
196
+ agent: 'vercel-ai-gateway/claude-code',
197
+
198
+ // Model to use (defaults: 'opus' for claude-code, 'openai/gpt-5.2-codex' for codex)
199
+ model: 'opus',
200
+
201
+ // How many times to run each eval
202
+ runs: 10,
203
+
204
+ // Stop after first success? (false for reliability measurement)
205
+ earlyExit: false,
206
+
207
+ // npm scripts that must pass after agent finishes
208
+ scripts: ['build', 'lint'],
209
+
210
+ // Timeout per run in seconds
211
+ timeout: 300,
212
+
213
+ // Filter which evals to run (pick one)
214
+ evals: '*', // all (default)
215
+ // evals: ['specific-eval'], // by name
216
+ // evals: (name) => name.startsWith('api-'), // by function
217
+
218
+ // Setup function for environment configuration
219
+ setup: async (sandbox) => {
220
+ await sandbox.writeFiles({ '.env': 'API_KEY=test' });
221
+ await sandbox.runCommand('npm', ['run', 'setup']);
222
+ },
223
+ };
224
+
225
+ export default config;
226
+ ```
227
+
228
+ ## CLI Commands
229
+
230
+ ### `init <name>`
231
+
232
+ Create a new eval project:
233
+ ```bash
234
+ npx @vercel/agent-eval init my-evals
235
+ ```
236
+
237
+ ### `<experiment>`
238
+
239
+ Run an experiment:
240
+ ```bash
241
+ npx @vercel/agent-eval cc
242
+ ```
243
+
244
+ **Dry run** - preview without executing (no API calls, no cost):
245
+ ```bash
246
+ npx @vercel/agent-eval cc --dry
247
+
248
+ # Output:
249
+ # Found 5 valid fixture(s), will run 5:
250
+ # - create-button
251
+ # - add-routing
252
+ # - setup-state
253
+ # - ...
254
+ # Running 5 eval(s) x 10 run(s) = 50 total runs
255
+ # Agent: claude-code, Model: opus, Timeout: 300s, Early Exit: false
256
+ # [DRY RUN] Would execute evals here
257
+ ```
258
+
259
+ ## Results
260
+
261
+ Results are saved to `results/<experiment>/<timestamp>/`:
262
+
263
+ ```
264
+ results/
265
+ with-mcp/
266
+ 2026-01-27T10-30-00Z/
267
+ experiment.json # Config and summary
268
+ create-button/
269
+ summary.json # { totalRuns: 10, passedRuns: 9, passRate: "90%" }
270
+ run-1/
271
+ result.json # Individual run result
272
+ transcript.jsonl # Agent conversation
273
+ outputs/ # Test/script output
274
+ ```
275
+
276
+ ### Analyzing Results
277
+
278
+ ```bash
279
+ # Quick comparison
280
+ cat results/control/*/experiment.json | jq '.evals[] | {name, passRate}'
281
+ cat results/with-mcp/*/experiment.json | jq '.evals[] | {name, passRate}'
282
+ ```
283
+
284
+ | Pass Rate | Interpretation |
285
+ |-----------|----------------|
286
+ | 90-100% | Agent handles this reliably |
287
+ | 70-89% | Usually works, room for improvement |
288
+ | 50-69% | Unreliable, needs investigation |
289
+ | < 50% | Task too hard or prompt needs work |
290
+
291
+ ## Environment Variables
292
+
293
+ Every run requires **two things**: an API key for the agent and a token for the Vercel sandbox. The exact variables depend on which authentication mode you use.
294
+
295
+ | Variable | Required when | Description |
296
+ |---|---|---|
297
+ | `AI_GATEWAY_API_KEY` | `agent: 'vercel-ai-gateway/...'` | Vercel AI Gateway key — works for all agents |
298
+ | `ANTHROPIC_API_KEY` | `agent: 'claude-code'` | Direct Anthropic API key (`sk-ant-...`) |
299
+ | `OPENAI_API_KEY` | `agent: 'codex'` | Direct OpenAI API key (`sk-proj-...`) |
300
+ | `VERCEL_TOKEN` | Always (pick one) | Vercel personal access token — for local dev |
301
+ | `VERCEL_OIDC_TOKEN` | Always (pick one) | Vercel OIDC token — for CI/CD pipelines |
302
+
303
+ > You always need **one agent key** + **one sandbox token**.
304
+
305
+ ### Vercel AI Gateway (Recommended)
306
+
307
+ Use `vercel-ai-gateway/` prefixed agents. One key for all models.
308
+
309
+ ```bash
310
+ # Agent access — get yours at https://vercel.com/dashboard -> AI Gateway
311
+ AI_GATEWAY_API_KEY=your-ai-gateway-api-key
312
+
313
+ # Sandbox access — create at https://vercel.com/account/tokens
314
+ VERCEL_TOKEN=your-vercel-token
315
+ # OR for CI/CD:
316
+ # VERCEL_OIDC_TOKEN=your-oidc-token
317
+ ```
318
+
319
+ ### Direct API Keys (Alternative)
320
+
321
+ Remove the `vercel-ai-gateway/` prefix and use provider keys directly:
322
+
323
+ ```bash
324
+ # For agent: 'claude-code'
325
+ ANTHROPIC_API_KEY=sk-ant-...
326
+
327
+ # For agent: 'codex'
328
+ OPENAI_API_KEY=sk-proj-...
329
+
330
+ # Sandbox access is still required
331
+ VERCEL_TOKEN=your-vercel-token
332
+ ```
333
+
334
+ ### `.env` Setup
335
+
336
+ The `init` command generates a `.env.example` file. Copy it and fill in your keys:
337
+
338
+ ```bash
339
+ cp .env.example .env
340
+ ```
341
+
342
+ The framework loads `.env` automatically via [dotenv](https://github.com/motdotla/dotenv).
343
+
344
+ ### Vercel Employees
345
+
346
+ **To get the environment variables, link to `vercel-labs/agent-eval` on Vercel:**
347
+
348
+ ```bash
349
+ # Link to the vercel-labs/agent-eval project
350
+ vc link vercel-labs/agent-eval
351
+
352
+ # Pull environment variables
353
+ vc env pull
354
+ ```
355
+
356
+ This writes a `.env.local` file with all the required environment variables (AI_GATEWAY_API_KEY, ANTHROPIC_API_KEY, OPENAI_API_KEY, VERCEL_OIDC_TOKEN) — no manual key setup needed. The framework automatically loads from both `.env` and `.env.local`.
357
+
358
+ ## Tips
359
+
360
+ **Start with `--dry`**: Always preview before running to verify your config and avoid unexpected costs.
361
+
362
+ **Use multiple runs**: Single runs don't tell you reliability. Use `runs: 10` and `earlyExit: false` for meaningful data.
363
+
364
+ **Isolate variables**: Change one thing at a time between experiments. Don't compare "Opus with MCP" to "Haiku without MCP".
365
+
366
+ **Test incrementally**: Start with simple tasks, add complexity as you learn what works.
367
+
368
+ ## License
369
+
370
+ MIT
package/dist/cli.d.ts ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * CLI entry point for the eval framework.
4
+ */
5
+ export {};
6
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;GAEG"}
package/dist/cli.js ADDED
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * CLI entry point for the eval framework.
4
+ */
5
+ import { Command } from 'commander';
6
+ import { config as dotenvConfig } from 'dotenv';
7
+ import { resolve, dirname, basename } from 'path';
8
+ import { existsSync } from 'fs';
9
+ import chalk from 'chalk';
10
+ import { loadConfig, resolveEvalNames } from './lib/config.js';
11
+ import { loadAllFixtures } from './lib/fixture.js';
12
+ import { runExperiment } from './lib/runner.js';
13
+ import { initProject, getPostInitInstructions } from './lib/init.js';
14
+ import { getAgent } from './lib/agents/index.js';
15
// Load environment variables.
// `.env.local` (e.g. written by `vc env pull`) is read first; dotenv does not
// override variables that are already set, so values from `.env.local` win
// over `.env`. A missing file is not an error for dotenv.
dotenvConfig({ path: '.env.local' });
dotenvConfig();
// Root CLI program; the commands are registered on it further down the file.
const program = new Command();
program
    .name('agent-eval')
    .description('Framework for testing AI coding agents in isolated sandboxes')
    .version('0.0.1');
22
/**
 * Expand an experiment shorthand into a config file path.
 *
 * - "cc"                -> "experiments/cc.ts"
 * - "experiments/cc.ts" -> "experiments/cc.ts" (returned untouched)
 *
 * @param input Experiment name or config path as typed on the command line.
 * @returns The input itself when it contains a path separator or ends in
 *          `.ts` / `.js`; otherwise `experiments/<input>.ts`.
 */
function resolveConfigPath(input) {
    const hasSeparator = ['/', '\\'].some((separator) => input.includes(separator));
    const hasKnownExtension = ['.ts', '.js'].some((extension) => input.endsWith(extension));
    // A bare name is shorthand for a file inside experiments/
    return hasSeparator || hasKnownExtension ? input : `experiments/${input}.ts`;
}
35
/**
 * Run experiment command handler.
 *
 * Resolves the config path, loads the experiment config, discovers eval
 * fixtures in the `evals/` directory that sits next to the config's parent
 * directory, prints the run plan and, unless `options.dry` is set, runs the
 * experiment.
 *
 * Exit behaviour:
 * - exits with code 0 when every eval passed all of its runs;
 * - exits with code 1 when any run failed, when validation fails (missing
 *   config, missing evals directory, no fixtures, no matching evals, missing
 *   API key) or when an error is thrown;
 * - a dry run returns without exiting and does not require an API key.
 *
 * @param configInput Experiment shorthand (e.g. "cc") or a config file path.
 * @param options Parsed CLI options; only `dry` is read here.
 */
async function runExperimentCommand(configInput, options) {
    try {
        const configPath = resolveConfigPath(configInput);
        const absoluteConfigPath = resolve(process.cwd(), configPath);
        if (!existsSync(absoluteConfigPath)) {
            console.error(chalk.red(`Config file not found: ${absoluteConfigPath}`));
            process.exit(1);
        }
        console.log(chalk.blue(`Loading config from ${configPath}...`));
        const config = await loadConfig(absoluteConfigPath);
        // Discover evals - infer from config file location
        // Config at project/experiments/foo.ts -> evals at project/evals/
        const projectDir = dirname(dirname(absoluteConfigPath));
        const evalsDir = resolve(projectDir, 'evals');
        if (!existsSync(evalsDir)) {
            console.error(chalk.red(`Evals directory not found: ${evalsDir}`));
            console.error(chalk.gray(`Expected evals/ to be sibling to experiments/ directory`));
            process.exit(1);
        }
        console.log(chalk.blue(`Discovering evals in ${evalsDir}...`));
        const { fixtures, errors } = loadAllFixtures(evalsDir);
        // Invalid fixtures are reported but do not abort the run
        if (errors.length > 0) {
            console.log(chalk.yellow(`\nWarning: ${errors.length} invalid fixture(s):`));
            for (const error of errors) {
                console.log(chalk.yellow(`  - ${error.fixtureName}: ${error.message}`));
            }
        }
        if (fixtures.length === 0) {
            console.error(chalk.red('No valid eval fixtures found'));
            process.exit(1);
        }
        // Resolve which evals to run
        const availableNames = fixtures.map((f) => f.name);
        const evalNames = resolveEvalNames(config.evals, availableNames);
        if (evalNames.length === 0) {
            console.error(chalk.red('No evals matched the filter'));
            process.exit(1);
        }
        console.log(chalk.green(`\nFound ${fixtures.length} valid fixture(s), will run ${evalNames.length}:`));
        for (const name of evalNames) {
            console.log(chalk.green(`  - ${name}`));
        }
        console.log(chalk.blue(`\nRunning ${evalNames.length} eval(s) x ${config.runs} run(s) = ${evalNames.length * config.runs} total runs`));
        console.log(chalk.blue(`Agent: ${config.agent}, Model: ${config.model}, Timeout: ${config.timeout}s, Early Exit: ${config.earlyExit}`));
        // The dry run stops here, before any credential check or execution
        if (options.dry) {
            console.log(chalk.yellow('\n[DRY RUN] Would execute evals here'));
            return;
        }
        // Get the agent to check for required API key
        const agent = getAgent(config.agent);
        const apiKeyEnvVar = agent.getApiKeyEnvVar();
        const apiKey = process.env[apiKeyEnvVar];
        if (!apiKey) {
            console.error(chalk.red(`${apiKeyEnvVar} environment variable is required`));
            // Only the gateway key comes from the Vercel dashboard; pointing
            // users there for a direct provider key would be misleading.
            const hint = apiKeyEnvVar === 'AI_GATEWAY_API_KEY'
                ? 'Get your API key at: https://vercel.com/dashboard -> AI Gateway'
                : `Set ${apiKeyEnvVar} in your environment or .env file`;
            console.error(chalk.gray(hint));
            process.exit(1);
        }
        // Filter fixtures to only the ones we want to run
        const selectedNames = new Set(evalNames);
        const selectedFixtures = fixtures.filter((f) => selectedNames.has(f.name));
        // Get experiment name from config file (strip a trailing .ts or .js)
        const experimentName = basename(configPath, '.ts').replace(/\.js$/, '');
        const resultsDir = resolve(process.cwd(), 'results');
        console.log(chalk.blue('\nStarting experiment...'));
        // Run the experiment
        const results = await runExperiment({
            config,
            fixtures: selectedFixtures,
            apiKey,
            resultsDir,
            experimentName,
            onProgress: (msg) => console.log(msg),
        });
        // Exit with appropriate code
        const allPassed = results.evals.every((e) => e.passedRuns === e.totalRuns);
        process.exit(allPassed ? 0 : 1);
    }
    catch (error) {
        if (error instanceof Error) {
            console.error(chalk.red(`Error: ${error.message}`));
        }
        else {
            console.error(chalk.red('An unknown error occurred'));
        }
        process.exit(1);
    }
}
124
/**
 * init command - Create a new eval project.
 *
 * Calls `initProject` with the given name and the current working directory
 * as the target directory, then prints the post-init instructions. Any
 * thrown error is reported in red and the process exits with code 1.
 */
program
    .command('init')
    .argument('<name>', 'Name of the project to create')
    .description('Create a new eval project with example fixtures')
    .action(async (name) => {
        try {
            console.log(chalk.blue(`Creating new eval project: ${name}`));
            const createdDir = initProject({ name, targetDir: process.cwd() });
            console.log(chalk.green('Project created successfully!'));
            console.log(getPostInitInstructions(createdDir, name));
        }
        catch (err) {
            // Non-Error throwables carry no usable message
            const message = err instanceof Error
                ? `Error: ${err.message}`
                : 'An unknown error occurred';
            console.error(chalk.red(message));
            process.exit(1);
        }
    });
151
/**
 * Default command - run an experiment without naming a subcommand.
 * Usage: agent-eval cc --dry
 *
 * When no config argument is given, the help text is shown instead.
 */
program
    .argument('[config]', 'Experiment name (e.g., "cc") or path')
    .option('--dry', 'Preview what would run without executing')
    .action(async (configInput, options) => {
        if (configInput) {
            await runExperimentCommand(configInput, options);
        }
        else {
            program.help();
        }
    });
// Parse process arguments and dispatch to the matching action
program.parse();
166
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;GAEG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,MAAM,IAAI,YAAY,EAAE,MAAM,QAAQ,CAAC;AAChD,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,MAAM,CAAC;AAClD,OAAO,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAChC,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AAC/D,OAAO,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAChD,OAAO,EAAE,WAAW,EAAE,uBAAuB,EAAE,MAAM,eAAe,CAAC;AACrE,OAAO,EAAE,QAAQ,EAAE,MAAM,uBAAuB,CAAC;AAEjD,6BAA6B;AAC7B,YAAY,EAAE,CAAC;AAEf,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,YAAY,CAAC;KAClB,WAAW,CAAC,8DAA8D,CAAC;KAC3E,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB;;;;GAIG;AACH,SAAS,iBAAiB,CAAC,KAAa;IACtC,6DAA6D;IAC7D,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAClG,OAAO,KAAK,CAAC;IACf,CAAC;IACD,6DAA6D;IAC7D,OAAO,eAAe,KAAK,KAAK,CAAC;AACnC,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,oBAAoB,CAAC,WAAmB,EAAE,OAA0B;IACjF,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,iBAAiB,CAAC,WAAW,CAAC,CAAC;QAClD,MAAM,kBAAkB,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,UAAU,CAAC,CAAC;QAE9D,IAAI,CAAC,UAAU,CAAC,kBAAkB,CAAC,EAAE,CAAC;YACpC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,0BAA0B,kBAAkB,EAAE,CAAC,CAAC,CAAC;YACzE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,uBAAuB,UAAU,KAAK,CAAC,CAAC,CAAC;QAChE,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,kBAAkB,CAAC,CAAC;QAEpD,mDAAmD;QACnD,kEAAkE;QAClE,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC,CAAC;QACxD,MAAM,QAAQ,GAAG,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC1B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,8BAA8B,QAAQ,EAAE,CAAC,CAAC,CAAC;YACnE,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,yDAAyD,CAAC,CAAC,CAAC;YACrF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,wBAAwB,QAAQ,KAAK,CAAC,CAAC,CAA
C;QAC/D,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;QAEvD,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,cAAc,MAAM,CAAC,MAAM,sBAAsB,CAAC,CAAC,CAAC;YAC7E,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBAC3B,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,KAAK,CAAC,WAAW,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YAC1E,CAAC;QACH,CAAC;QAED,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,8BAA8B,CAAC,CAAC,CAAC;YACzD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,6BAA6B;QAC7B,MAAM,cAAc,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QACnD,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,KAAK,EAAE,cAAc,CAAC,CAAC;QAEjE,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAC,CAAC;YACxD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,WAAW,QAAQ,CAAC,MAAM,+BAA+B,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACvG,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;YAC7B,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,CAAC;QAC1C,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,aAAa,SAAS,CAAC,MAAM,cAAc,MAAM,CAAC,IAAI,aAAa,SAAS,CAAC,MAAM,GAAG,MAAM,CAAC,IAAI,aAAa,CAAC,CAAC,CAAC;QACxI,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,MAAM,CAAC,KAAK,YAAY,MAAM,CAAC,KAAK,cAAc,MAAM,CAAC,OAAO,kBAAkB,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;QAExI,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;YAChB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,sCAAsC,CAAC,CAAC,CAAC;YAClE,OAAO;QACT,CAAC;QAED,8CAA8C;QAC9C,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACrC,MAAM,YAAY,GAAG,KAAK,CAAC,eAAe,EAAE,CAAC;QAC7C,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;QACzC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,YAAY,mCAAmC,CAAC,CAAC,CAAC;YAC7E,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,iEAAiE,CAAC,CAAC,CAAC;YAC7F,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,kDAAkD;QAClD,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,C
AAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;QAE5E,uCAAuC;QACvC,MAAM,cAAc,GAAG,QAAQ,CAAC,UAAU,EAAE,KAAK,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACxE,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CAAC,CAAC;QAErD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC,CAAC;QAEpD,qBAAqB;QACrB,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC;YAClC,MAAM;YACN,QAAQ,EAAE,gBAAgB;YAC1B,MAAM;YACN,UAAU;YACV,cAAc;YACd,UAAU,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;SACtC,CAAC,CAAC;QAEH,6BAA6B;QAC7B,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,SAAS,CAAC,CAAC;QAC3E,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAClC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACtD,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,2BAA2B,CAAC,CAAC,CAAC;QACxD,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC;AAED;;GAEG;AACH,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,QAAQ,CAAC,QAAQ,EAAE,+BAA+B,CAAC;KACnD,WAAW,CAAC,iDAAiD,CAAC;KAC9D,MAAM,CAAC,KAAK,EAAE,IAAY,EAAE,EAAE;IAC7B,IAAI,CAAC;QACH,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,8BAA8B,IAAI,EAAE,CAAC,CAAC,CAAC;QAE9D,MAAM,UAAU,GAAG,WAAW,CAAC;YAC7B,IAAI;YACJ,SAAS,EAAE,OAAO,CAAC,GAAG,EAAE;SACzB,CAAC,CAAC;QAEH,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC,CAAC;IACzD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACtD,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,2BAA2B,CAAC,CAAC,CAAC;QACxD,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL;;;GAGG;AACH,OAAO;KACJ,QAAQ,CAAC,UAAU,EAAE,sCAAsC,CAAC;KAC5D,MAAM,CAAC,OAAO,EAAE,0CAA0C,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,WAA+B,EAAE,OAA0B,EAAE,EAAE;IAC5E,IAAI,CAAC,WAAW,EA
AE,CAAC;QACjB,OAAO,CAAC,IAAI,EAAE,CAAC;QACf,OAAO;IACT,CAAC;IACD,MAAM,oBAAoB,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;AACnD,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
@@ -0,0 +1,21 @@
1
+ /**
2
+ * agent-eval
3
+ *
4
+ * Framework for testing AI coding agents in isolated sandboxes.
5
+ */
6
+ export type { AgentType, ModelTier, EvalFilter, Sandbox, SetupFunction, ExperimentConfig, ResolvedExperimentConfig, EvalFixture, EvalRunResult, EvalRunData, EvalSummary, ExperimentResults, } from './lib/types.js';
7
+ export { REQUIRED_EVAL_FILES, EXCLUDED_FILES } from './lib/types.js';
8
+ export { CONFIG_DEFAULTS, validateConfig, resolveConfig, loadConfig, resolveEvalNames, } from './lib/config.js';
9
+ export { FixtureValidationError, discoverFixtures, validateFixtureFiles, validatePackageJson, loadFixture, loadAllFixtures, getFixtureFiles, readFixtureFiles, } from './lib/fixture.js';
10
+ export type { SandboxOptions, CommandResult, SandboxFile } from './lib/sandbox.js';
11
+ export { SandboxManager, DEFAULT_SANDBOX_TIMEOUT, IGNORED_PATTERNS, TEST_FILE_PATTERNS, collectLocalFiles, splitTestFiles, verifyNoTestFiles, } from './lib/sandbox.js';
12
+ export type { AgentRunOptions, AgentRunResult } from './lib/agents/types.js';
13
+ export type { Agent, ScriptResult } from './lib/agents/types.js';
14
+ export { getAgent, listAgents, registerAgent } from './lib/agents/index.js';
15
+ export type { SaveResultsOptions } from './lib/results.js';
16
+ export { agentResultToEvalRunData, createEvalSummary, createExperimentResults, saveResults, formatResultsTable, formatRunResult, createProgressDisplay, } from './lib/results.js';
17
+ export type { RunExperimentOptions } from './lib/runner.js';
18
+ export { runExperiment, runSingleEval } from './lib/runner.js';
19
+ export type { InitOptions } from './lib/init.js';
20
+ export { initProject, getPostInitInstructions } from './lib/init.js';
21
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,YAAY,EACV,SAAS,EACT,SAAS,EACT,UAAU,EACV,OAAO,EACP,aAAa,EACb,gBAAgB,EAChB,wBAAwB,EACxB,WAAW,EACX,aAAa,EACb,WAAW,EACX,WAAW,EACX,iBAAiB,GAClB,MAAM,gBAAgB,CAAC;AAGxB,OAAO,EAAE,mBAAmB,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAGrE,OAAO,EACL,eAAe,EACf,cAAc,EACd,aAAa,EACb,UAAU,EACV,gBAAgB,GACjB,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EACL,sBAAsB,EACtB,gBAAgB,EAChB,oBAAoB,EACpB,mBAAmB,EACnB,WAAW,EACX,eAAe,EACf,eAAe,EACf,gBAAgB,GACjB,MAAM,kBAAkB,CAAC;AAG1B,YAAY,EAAE,cAAc,EAAE,aAAa,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AACnF,OAAO,EACL,cAAc,EACd,uBAAuB,EACvB,gBAAgB,EAChB,kBAAkB,EAClB,iBAAiB,EACjB,cAAc,EACd,iBAAiB,GAClB,MAAM,kBAAkB,CAAC;AAG1B,YAAY,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAG7E,YAAY,EAAE,KAAK,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAC;AACjE,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAG5E,YAAY,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAC3D,OAAO,EACL,wBAAwB,EACxB,iBAAiB,EACjB,uBAAuB,EACvB,WAAW,EACX,kBAAkB,EAClB,eAAe,EACf,qBAAqB,GACtB,MAAM,kBAAkB,CAAC;AAG1B,YAAY,EAAE,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAG/D,YAAY,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,uBAAuB,EAAE,MAAM,eAAe,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,17 @@
1
+ /**
2
+ * agent-eval
3
+ *
4
+ * Framework for testing AI coding agents in isolated sandboxes.
5
+ */
6
+ // Re-export constants
7
+ export { REQUIRED_EVAL_FILES, EXCLUDED_FILES } from './lib/types.js';
8
+ // Re-export config utilities
9
+ export { CONFIG_DEFAULTS, validateConfig, resolveConfig, loadConfig, resolveEvalNames, } from './lib/config.js';
10
+ // Re-export fixture utilities
11
+ export { FixtureValidationError, discoverFixtures, validateFixtureFiles, validatePackageJson, loadFixture, loadAllFixtures, getFixtureFiles, readFixtureFiles, } from './lib/fixture.js';
12
+ export { SandboxManager, DEFAULT_SANDBOX_TIMEOUT, IGNORED_PATTERNS, TEST_FILE_PATTERNS, collectLocalFiles, splitTestFiles, verifyNoTestFiles, } from './lib/sandbox.js';
13
+ export { getAgent, listAgents, registerAgent } from './lib/agents/index.js';
14
+ export { agentResultToEvalRunData, createEvalSummary, createExperimentResults, saveResults, formatResultsTable, formatRunResult, createProgressDisplay, } from './lib/results.js';
15
+ export { runExperiment, runSingleEval } from './lib/runner.js';
16
+ export { initProject, getPostInitInstructions } from './lib/init.js';
17
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAkBH,sBAAsB;AACtB,OAAO,EAAE,mBAAmB,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAErE,6BAA6B;AAC7B,OAAO,EACL,eAAe,EACf,cAAc,EACd,aAAa,EACb,UAAU,EACV,gBAAgB,GACjB,MAAM,iBAAiB,CAAC;AAEzB,8BAA8B;AAC9B,OAAO,EACL,sBAAsB,EACtB,gBAAgB,EAChB,oBAAoB,EACpB,mBAAmB,EACnB,WAAW,EACX,eAAe,EACf,eAAe,EACf,gBAAgB,GACjB,MAAM,kBAAkB,CAAC;AAI1B,OAAO,EACL,cAAc,EACd,uBAAuB,EACvB,gBAAgB,EAChB,kBAAkB,EAClB,iBAAiB,EACjB,cAAc,EACd,iBAAiB,GAClB,MAAM,kBAAkB,CAAC;AAO1B,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAI5E,OAAO,EACL,wBAAwB,EACxB,iBAAiB,EACjB,uBAAuB,EACvB,WAAW,EACX,kBAAkB,EAClB,eAAe,EACf,qBAAqB,GACtB,MAAM,kBAAkB,CAAC;AAI1B,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAI/D,OAAO,EAAE,WAAW,EAAE,uBAAuB,EAAE,MAAM,eAAe,CAAC"}
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Claude Code agent implementation.
3
+ * Authenticates through Vercel AI Gateway or a direct Anthropic API key, depending on `useVercelAiGateway`.
4
+ */
5
+ import type { Agent } from './types.js';
6
+ /**
7
+ * Create Claude Code agent with specified authentication method.
8
+ */
9
+ export declare function createClaudeCodeAgent({ useVercelAiGateway }: {
10
+ useVercelAiGateway: boolean;
11
+ }): Agent;
12
+ //# sourceMappingURL=claude-code.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"claude-code.d.ts","sourceRoot":"","sources":["../../../src/lib/agents/claude-code.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,KAAK,EAAmC,MAAM,YAAY,CAAC;AA8CzE;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,EAAE,kBAAkB,EAAE,EAAE;IAAE,kBAAkB,EAAE,OAAO,CAAA;CAAE,GAAG,KAAK,CAiMpG"}