@vercel/agent-eval 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +370 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +166 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +17 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/agents/claude-code.d.ts +12 -0
- package/dist/lib/agents/claude-code.d.ts.map +1 -0
- package/dist/lib/agents/claude-code.js +203 -0
- package/dist/lib/agents/claude-code.js.map +1 -0
- package/dist/lib/agents/codex.d.ts +12 -0
- package/dist/lib/agents/codex.d.ts.map +1 -0
- package/dist/lib/agents/codex.js +247 -0
- package/dist/lib/agents/codex.js.map +1 -0
- package/dist/lib/agents/index.d.ts +7 -0
- package/dist/lib/agents/index.d.ts.map +1 -0
- package/dist/lib/agents/index.js +14 -0
- package/dist/lib/agents/index.js.map +1 -0
- package/dist/lib/agents/registry.d.ts +23 -0
- package/dist/lib/agents/registry.d.ts.map +1 -0
- package/dist/lib/agents/registry.js +35 -0
- package/dist/lib/agents/registry.js.map +1 -0
- package/dist/lib/agents/shared.d.ts +47 -0
- package/dist/lib/agents/shared.d.ts.map +1 -0
- package/dist/lib/agents/shared.js +99 -0
- package/dist/lib/agents/shared.js.map +1 -0
- package/dist/lib/agents/types.d.ts +69 -0
- package/dist/lib/agents/types.d.ts.map +1 -0
- package/dist/lib/agents/types.js +5 -0
- package/dist/lib/agents/types.js.map +1 -0
- package/dist/lib/config.d.ts +34 -0
- package/dist/lib/config.d.ts.map +1 -0
- package/dist/lib/config.js +117 -0
- package/dist/lib/config.js.map +1 -0
- package/dist/lib/fixture.d.ts +52 -0
- package/dist/lib/fixture.d.ts.map +1 -0
- package/dist/lib/fixture.js +175 -0
- package/dist/lib/fixture.js.map +1 -0
- package/dist/lib/init.d.ts +21 -0
- package/dist/lib/init.d.ts.map +1 -0
- package/dist/lib/init.js +250 -0
- package/dist/lib/init.js.map +1 -0
- package/dist/lib/results.d.ts +54 -0
- package/dist/lib/results.d.ts.map +1 -0
- package/dist/lib/results.js +186 -0
- package/dist/lib/results.js.map +1 -0
- package/dist/lib/runner.d.ts +43 -0
- package/dist/lib/runner.d.ts.map +1 -0
- package/dist/lib/runner.js +142 -0
- package/dist/lib/runner.js.map +1 -0
- package/dist/lib/sandbox.d.ts +117 -0
- package/dist/lib/sandbox.d.ts.map +1 -0
- package/dist/lib/sandbox.js +248 -0
- package/dist/lib/sandbox.js.map +1 -0
- package/dist/lib/types.d.ts +166 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +14 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/test-setup.d.ts +2 -0
- package/dist/test-setup.d.ts.map +1 -0
- package/dist/test-setup.js +6 -0
- package/dist/test-setup.js.map +1 -0
- package/package.json +58 -0
package/README.md
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
# @vercel/agent-eval
|
|
2
|
+
|
|
3
|
+
Test AI coding agents on your framework. Measure what actually works.
|
|
4
|
+
|
|
5
|
+
## Why?
|
|
6
|
+
|
|
7
|
+
You're building a frontend framework and want AI agents to work well with it. But how do you know if:
|
|
8
|
+
- Your documentation helps agents write correct code?
|
|
9
|
+
- Adding an MCP server improves agent success rates?
|
|
10
|
+
- Sonnet performs as well as Opus for your use cases?
|
|
11
|
+
- Your latest API changes broke agent compatibility?
|
|
12
|
+
|
|
13
|
+
**This framework gives you answers.** Run controlled experiments, measure pass rates, compare techniques.
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# Create a new eval project
|
|
19
|
+
npx @vercel/agent-eval init my-framework-evals
|
|
20
|
+
cd my-framework-evals
|
|
21
|
+
|
|
22
|
+
# Install dependencies
|
|
23
|
+
npm install
|
|
24
|
+
|
|
25
|
+
# Add your API keys
|
|
26
|
+
cp .env.example .env
|
|
27
|
+
# Edit .env with your AI_GATEWAY_API_KEY and VERCEL_TOKEN
|
|
28
|
+
|
|
29
|
+
# Preview what will run (no API calls, no cost)
|
|
30
|
+
npx @vercel/agent-eval cc --dry
|
|
31
|
+
|
|
32
|
+
# Run the evals
|
|
33
|
+
npx @vercel/agent-eval cc
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## A/B Testing AI Techniques
|
|
37
|
+
|
|
38
|
+
The real power is comparing different approaches. Create multiple experiment configs:
|
|
39
|
+
|
|
40
|
+
### Control: Baseline Agent
|
|
41
|
+
|
|
42
|
+
```typescript
|
|
43
|
+
// experiments/control.ts
|
|
44
|
+
import type { ExperimentConfig } from '@vercel/agent-eval';
|
|
45
|
+
|
|
46
|
+
const config: ExperimentConfig = {
|
|
47
|
+
agent: 'vercel-ai-gateway/claude-code',
|
|
48
|
+
model: 'opus',
|
|
49
|
+
runs: 10, // Multiple runs for statistical significance
|
|
50
|
+
earlyExit: false, // Run all attempts to measure reliability
|
|
51
|
+
};
|
|
52
|
+
|
|
53
|
+
export default config;
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Treatment: Agent with MCP Server
|
|
57
|
+
|
|
58
|
+
```typescript
|
|
59
|
+
// experiments/with-mcp.ts
|
|
60
|
+
import type { ExperimentConfig } from '@vercel/agent-eval';
|
|
61
|
+
|
|
62
|
+
const config: ExperimentConfig = {
|
|
63
|
+
agent: 'vercel-ai-gateway/claude-code',
|
|
64
|
+
model: 'opus',
|
|
65
|
+
runs: 10,
|
|
66
|
+
earlyExit: false,
|
|
67
|
+
|
|
68
|
+
setup: async (sandbox) => {
|
|
69
|
+
// Install your framework's MCP server
|
|
70
|
+
await sandbox.runCommand('npm', ['install', '-g', '@myframework/mcp-server']);
|
|
71
|
+
|
|
72
|
+
// Configure Claude to use it
|
|
73
|
+
await sandbox.writeFiles({
|
|
74
|
+
'.claude/settings.json': JSON.stringify({
|
|
75
|
+
mcpServers: {
|
|
76
|
+
myframework: { command: 'myframework-mcp' }
|
|
77
|
+
}
|
|
78
|
+
})
|
|
79
|
+
});
|
|
80
|
+
},
|
|
81
|
+
};
|
|
82
|
+
|
|
83
|
+
export default config;
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Run Both & Compare
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# Preview first
|
|
90
|
+
npx @vercel/agent-eval control --dry
|
|
91
|
+
npx @vercel/agent-eval with-mcp --dry
|
|
92
|
+
|
|
93
|
+
# Run experiments
|
|
94
|
+
npx @vercel/agent-eval control
|
|
95
|
+
npx @vercel/agent-eval with-mcp
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**Compare results:**
|
|
99
|
+
```
|
|
100
|
+
Control (baseline): 7/10 passed (70%)
|
|
101
|
+
With MCP: 9/10 passed (90%)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Creating Evals for Your Framework
|
|
105
|
+
|
|
106
|
+
Each eval tests one specific task an agent should be able to do with your framework.
|
|
107
|
+
|
|
108
|
+
### Example: Testing Component Creation
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
evals/
|
|
112
|
+
create-button-component/
|
|
113
|
+
PROMPT.md # Task for the agent
|
|
114
|
+
EVAL.ts # Tests to verify success
|
|
115
|
+
package.json # Your framework as a dependency
|
|
116
|
+
src/ # Starter code
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
**PROMPT.md** - What you want the agent to do:
|
|
120
|
+
```markdown
|
|
121
|
+
Create a Button component using MyFramework.
|
|
122
|
+
|
|
123
|
+
Requirements:
|
|
124
|
+
- Export a Button component from src/components/Button.tsx
|
|
125
|
+
- Accept `label` and `onClick` props
|
|
126
|
+
- Use the framework's styling system for hover states
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
**EVAL.ts** - How you verify it worked:
|
|
130
|
+
```typescript
|
|
131
|
+
import { test, expect } from 'vitest';
|
|
132
|
+
import { readFileSync, existsSync } from 'fs';
|
|
133
|
+
import { execSync } from 'child_process';
|
|
134
|
+
|
|
135
|
+
test('Button component exists', () => {
|
|
136
|
+
expect(existsSync('src/components/Button.tsx')).toBe(true);
|
|
137
|
+
});
|
|
138
|
+
|
|
139
|
+
test('has required props', () => {
|
|
140
|
+
const content = readFileSync('src/components/Button.tsx', 'utf-8');
|
|
141
|
+
expect(content).toContain('label');
|
|
142
|
+
expect(content).toContain('onClick');
|
|
143
|
+
});
|
|
144
|
+
|
|
145
|
+
test('project builds', () => {
|
|
146
|
+
execSync('npm run build', { stdio: 'pipe' });
|
|
147
|
+
});
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**package.json** - Include your framework:
|
|
151
|
+
```json
|
|
152
|
+
{
|
|
153
|
+
"name": "create-button-component",
|
|
154
|
+
"type": "module",
|
|
155
|
+
"scripts": { "build": "tsc" },
|
|
156
|
+
"dependencies": {
|
|
157
|
+
"myframework": "^2.0.0"
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Experiment Ideas
|
|
163
|
+
|
|
164
|
+
| Experiment | Control | Treatment |
|
|
165
|
+
|------------|---------|-----------|
|
|
166
|
+
| MCP impact | No MCP | With MCP server |
|
|
167
|
+
| Model comparison | Haiku | Sonnet / Opus |
|
|
168
|
+
| Documentation | Minimal docs | Rich examples |
|
|
169
|
+
| System prompt | Default | Framework-specific |
|
|
170
|
+
| Tool availability | Read/write only | + custom tools |
|
|
171
|
+
|
|
172
|
+
## Configuration Reference
|
|
173
|
+
|
|
174
|
+
### Agent Selection
|
|
175
|
+
|
|
176
|
+
Choose your agent and authentication method:
|
|
177
|
+
|
|
178
|
+
```typescript
|
|
179
|
+
// Vercel AI Gateway (recommended - unified billing & observability)
|
|
180
|
+
agent: 'vercel-ai-gateway/claude-code' // or 'vercel-ai-gateway/codex'
|
|
181
|
+
|
|
182
|
+
// Direct API (uses provider keys directly)
|
|
183
|
+
agent: 'claude-code' // requires ANTHROPIC_API_KEY
|
|
184
|
+
agent: 'codex' // requires OPENAI_API_KEY
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
See the Environment Variables section below for setup instructions.
|
|
188
|
+
|
|
189
|
+
### Full Configuration
|
|
190
|
+
|
|
191
|
+
```typescript
|
|
192
|
+
import type { ExperimentConfig } from '@vercel/agent-eval';
|
|
193
|
+
|
|
194
|
+
const config: ExperimentConfig = {
|
|
195
|
+
// Required: which agent and authentication to use
|
|
196
|
+
agent: 'vercel-ai-gateway/claude-code',
|
|
197
|
+
|
|
198
|
+
// Model to use (defaults: 'opus' for claude-code, 'openai/gpt-5.2-codex' for codex)
|
|
199
|
+
model: 'opus',
|
|
200
|
+
|
|
201
|
+
// How many times to run each eval
|
|
202
|
+
runs: 10,
|
|
203
|
+
|
|
204
|
+
// Stop after first success? (false for reliability measurement)
|
|
205
|
+
earlyExit: false,
|
|
206
|
+
|
|
207
|
+
// npm scripts that must pass after agent finishes
|
|
208
|
+
scripts: ['build', 'lint'],
|
|
209
|
+
|
|
210
|
+
// Timeout per run in seconds
|
|
211
|
+
timeout: 300,
|
|
212
|
+
|
|
213
|
+
// Filter which evals to run (pick one)
|
|
214
|
+
evals: '*', // all (default)
|
|
215
|
+
// evals: ['specific-eval'], // by name
|
|
216
|
+
// evals: (name) => name.startsWith('api-'), // by function
|
|
217
|
+
|
|
218
|
+
// Setup function for environment configuration
|
|
219
|
+
setup: async (sandbox) => {
|
|
220
|
+
await sandbox.writeFiles({ '.env': 'API_KEY=test' });
|
|
221
|
+
await sandbox.runCommand('npm', ['run', 'setup']);
|
|
222
|
+
},
|
|
223
|
+
};
|
|
224
|
+
|
|
225
|
+
export default config;
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
## CLI Commands
|
|
229
|
+
|
|
230
|
+
### `init <name>`
|
|
231
|
+
|
|
232
|
+
Create a new eval project:
|
|
233
|
+
```bash
|
|
234
|
+
npx @vercel/agent-eval init my-evals
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
### `<experiment>`
|
|
238
|
+
|
|
239
|
+
Run an experiment:
|
|
240
|
+
```bash
|
|
241
|
+
npx @vercel/agent-eval cc
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
**Dry run** - preview without executing (no API calls, no cost):
|
|
245
|
+
```bash
|
|
246
|
+
npx @vercel/agent-eval cc --dry
|
|
247
|
+
|
|
248
|
+
# Output:
|
|
249
|
+
# Found 5 valid fixture(s), will run 5:
|
|
250
|
+
# - create-button
|
|
251
|
+
# - add-routing
|
|
252
|
+
# - setup-state
|
|
253
|
+
# - ...
|
|
254
|
+
# Running 5 eval(s) x 10 run(s) = 50 total runs
|
|
255
|
+
# Agent: claude-code, Model: opus, Timeout: 300s, Early Exit: false
|
|
256
|
+
# [DRY RUN] Would execute evals here
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
## Results
|
|
260
|
+
|
|
261
|
+
Results are saved to `results/<experiment>/<timestamp>/`:
|
|
262
|
+
|
|
263
|
+
```
|
|
264
|
+
results/
|
|
265
|
+
with-mcp/
|
|
266
|
+
2026-01-27T10-30-00Z/
|
|
267
|
+
experiment.json # Config and summary
|
|
268
|
+
create-button/
|
|
269
|
+
summary.json # { totalRuns: 10, passedRuns: 9, passRate: "90%" }
|
|
270
|
+
run-1/
|
|
271
|
+
result.json # Individual run result
|
|
272
|
+
transcript.jsonl # Agent conversation
|
|
273
|
+
outputs/ # Test/script output
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
### Analyzing Results
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
# Quick comparison
|
|
280
|
+
cat results/control/*/experiment.json | jq '.evals[] | {name, passRate}'
|
|
281
|
+
cat results/with-mcp/*/experiment.json | jq '.evals[] | {name, passRate}'
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
| Pass Rate | Interpretation |
|
|
285
|
+
|-----------|----------------|
|
|
286
|
+
| 90-100% | Agent handles this reliably |
|
|
287
|
+
| 70-89% | Usually works, room for improvement |
|
|
288
|
+
| 50-69% | Unreliable, needs investigation |
|
|
289
|
+
| < 50% | Task too hard or prompt needs work |
|
|
290
|
+
|
|
291
|
+
## Environment Variables
|
|
292
|
+
|
|
293
|
+
Every run requires **two things**: an API key for the agent and a token for the Vercel sandbox. The exact variables depend on which authentication mode you use.
|
|
294
|
+
|
|
295
|
+
| Variable | Required when | Description |
|
|
296
|
+
|---|---|---|
|
|
297
|
+
| `AI_GATEWAY_API_KEY` | `agent: 'vercel-ai-gateway/...'` | Vercel AI Gateway key — works for all agents |
|
|
298
|
+
| `ANTHROPIC_API_KEY` | `agent: 'claude-code'` | Direct Anthropic API key (`sk-ant-...`) |
|
|
299
|
+
| `OPENAI_API_KEY` | `agent: 'codex'` | Direct OpenAI API key (`sk-proj-...`) |
|
|
300
|
+
| `VERCEL_TOKEN` | Always (pick one) | Vercel personal access token — for local dev |
|
|
301
|
+
| `VERCEL_OIDC_TOKEN` | Always (pick one) | Vercel OIDC token — for CI/CD pipelines |
|
|
302
|
+
|
|
303
|
+
> You always need **one agent key** + **one sandbox token**.
|
|
304
|
+
|
|
305
|
+
### Vercel AI Gateway (Recommended)
|
|
306
|
+
|
|
307
|
+
Use `vercel-ai-gateway/` prefixed agents. One key for all models.
|
|
308
|
+
|
|
309
|
+
```bash
|
|
310
|
+
# Agent access — get yours at https://vercel.com/dashboard -> AI Gateway
|
|
311
|
+
AI_GATEWAY_API_KEY=your-ai-gateway-api-key
|
|
312
|
+
|
|
313
|
+
# Sandbox access — create at https://vercel.com/account/tokens
|
|
314
|
+
VERCEL_TOKEN=your-vercel-token
|
|
315
|
+
# OR for CI/CD:
|
|
316
|
+
# VERCEL_OIDC_TOKEN=your-oidc-token
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
### Direct API Keys (Alternative)
|
|
320
|
+
|
|
321
|
+
Remove the `vercel-ai-gateway/` prefix and use provider keys directly:
|
|
322
|
+
|
|
323
|
+
```bash
|
|
324
|
+
# For agent: 'claude-code'
|
|
325
|
+
ANTHROPIC_API_KEY=sk-ant-...
|
|
326
|
+
|
|
327
|
+
# For agent: 'codex'
|
|
328
|
+
OPENAI_API_KEY=sk-proj-...
|
|
329
|
+
|
|
330
|
+
# Sandbox access is still required
|
|
331
|
+
VERCEL_TOKEN=your-vercel-token
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
### `.env` Setup
|
|
335
|
+
|
|
336
|
+
The `init` command generates a `.env.example` file. Copy it and fill in your keys:
|
|
337
|
+
|
|
338
|
+
```bash
|
|
339
|
+
cp .env.example .env
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
The framework loads `.env` automatically via [dotenv](https://github.com/motdotla/dotenv).
|
|
343
|
+
|
|
344
|
+
### Vercel Employees
|
|
345
|
+
|
|
346
|
+
**To get the environment variables, link to `vercel-labs/agent-eval` on Vercel:**
|
|
347
|
+
|
|
348
|
+
```bash
|
|
349
|
+
# Link to the vercel-labs/agent-eval project
|
|
350
|
+
vc link vercel-labs/agent-eval
|
|
351
|
+
|
|
352
|
+
# Pull environment variables
|
|
353
|
+
vc env pull
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
This writes a `.env.local` file with all the required environment variables (AI_GATEWAY_API_KEY, ANTHROPIC_API_KEY, OPENAI_API_KEY, VERCEL_OIDC_TOKEN) — no manual key setup needed. The framework automatically loads from both `.env` and `.env.local`.
|
|
357
|
+
|
|
358
|
+
## Tips
|
|
359
|
+
|
|
360
|
+
**Start with `--dry`**: Always preview before running to verify your config and avoid unexpected costs.
|
|
361
|
+
|
|
362
|
+
**Use multiple runs**: Single runs don't tell you reliability. Use `runs: 10` and `earlyExit: false` for meaningful data.
|
|
363
|
+
|
|
364
|
+
**Isolate variables**: Change one thing at a time between experiments. Don't compare "Opus with MCP" to "Haiku without MCP".
|
|
365
|
+
|
|
366
|
+
**Test incrementally**: Start with simple tasks, add complexity as you learn what works.
|
|
367
|
+
|
|
368
|
+
## License
|
|
369
|
+
|
|
370
|
+
MIT
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;GAEG"}
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* CLI entry point for the eval framework.
|
|
4
|
+
*/
|
|
5
|
+
import { Command } from 'commander';
|
|
6
|
+
import { config as dotenvConfig } from 'dotenv';
|
|
7
|
+
import { resolve, dirname, basename } from 'path';
|
|
8
|
+
import { existsSync } from 'fs';
|
|
9
|
+
import chalk from 'chalk';
|
|
10
|
+
import { loadConfig, resolveEvalNames } from './lib/config.js';
|
|
11
|
+
import { loadAllFixtures } from './lib/fixture.js';
|
|
12
|
+
import { runExperiment } from './lib/runner.js';
|
|
13
|
+
import { initProject, getPostInitInstructions } from './lib/init.js';
|
|
14
|
+
import { getAgent } from './lib/agents/index.js';
|
|
15
|
+
// Load environment variables from the current working directory.
// `.env.local` (e.g. written by `vc env pull`) is loaded first: dotenv never
// overrides a variable that is already set, so `.env.local` wins over `.env`,
// and variables from the real process environment win over both.
const localEnvPath = resolve(process.cwd(), '.env.local');
if (existsSync(localEnvPath)) {
  dotenvConfig({ path: localEnvPath });
}
dotenvConfig();

// Root commander program; the `init` subcommand and the default
// "run experiment" action are attached to it further down in this file.
const program = new Command();
program
  .name('agent-eval')
  .description('Framework for testing AI coding agents in isolated sandboxes')
  .version('0.0.1');
|
|
22
|
+
/**
 * Expand an experiment shorthand into a config file path.
 *
 * - "cc"                -> "experiments/cc.ts"
 * - "experiments/cc.ts" -> "experiments/cc.ts" (returned unchanged)
 *
 * Anything that contains a path separator (`/` or `\`) or ends in `.ts` /
 * `.js` is considered an explicit path and is returned as given.
 *
 * @param input Experiment name or config path as typed on the command line.
 * @returns The config path to load (not yet resolved against the cwd).
 */
function resolveConfigPath(input) {
  const hasSeparator = input.includes('/') || input.includes('\\');
  const hasKnownExtension = input.endsWith('.ts') || input.endsWith('.js');
  const isExplicitPath = hasSeparator || hasKnownExtension;
  return isExplicitPath ? input : `experiments/${input}.ts`;
}
|
|
35
|
+
/**
 * Run experiment command handler.
 *
 * Loads the experiment config, discovers eval fixtures in the `evals/`
 * directory that sits next to the config file's parent directory, prints the
 * execution plan and, unless `options.dry` is set, runs the experiment.
 *
 * This function terminates the process: exit code 0 when every eval passed
 * all of its runs, 1 on any failed run, validation problem or thrown error.
 * A dry run is the only path that returns normally.
 *
 * @param configInput Experiment shorthand (e.g. "cc") or a path to a config file.
 * @param options Parsed CLI options; `dry` previews the plan without executing.
 */
async function runExperimentCommand(configInput, options) {
  try {
    const configPath = resolveConfigPath(configInput);
    const absoluteConfigPath = resolve(process.cwd(), configPath);
    if (!existsSync(absoluteConfigPath)) {
      console.error(chalk.red(`Config file not found: ${absoluteConfigPath}`));
      process.exit(1);
    }
    console.log(chalk.blue(`Loading config from ${configPath}...`));
    const config = await loadConfig(absoluteConfigPath);

    // Discover evals - infer from config file location
    // Config at project/experiments/foo.ts -> evals at project/evals/
    const projectDir = dirname(dirname(absoluteConfigPath));
    const evalsDir = resolve(projectDir, 'evals');
    if (!existsSync(evalsDir)) {
      console.error(chalk.red(`Evals directory not found: ${evalsDir}`));
      console.error(chalk.gray(`Expected evals/ to be sibling to experiments/ directory`));
      process.exit(1);
    }
    console.log(chalk.blue(`Discovering evals in ${evalsDir}...`));
    const { fixtures, errors } = loadAllFixtures(evalsDir);

    // Invalid fixtures are reported but do not abort the run on their own.
    if (errors.length > 0) {
      console.log(chalk.yellow(`\nWarning: ${errors.length} invalid fixture(s):`));
      for (const error of errors) {
        console.log(chalk.yellow(`  - ${error.fixtureName}: ${error.message}`));
      }
    }
    if (fixtures.length === 0) {
      console.error(chalk.red('No valid eval fixtures found'));
      process.exit(1);
    }

    // Resolve which evals to run
    const availableNames = fixtures.map((f) => f.name);
    const evalNames = resolveEvalNames(config.evals, availableNames);
    if (evalNames.length === 0) {
      console.error(chalk.red('No evals matched the filter'));
      process.exit(1);
    }
    console.log(chalk.green(`\nFound ${fixtures.length} valid fixture(s), will run ${evalNames.length}:`));
    for (const name of evalNames) {
      console.log(chalk.green(`  - ${name}`));
    }
    console.log(chalk.blue(`\nRunning ${evalNames.length} eval(s) x ${config.runs} run(s) = ${evalNames.length * config.runs} total runs`));
    console.log(chalk.blue(`Agent: ${config.agent}, Model: ${config.model}, Timeout: ${config.timeout}s, Early Exit: ${config.earlyExit}`));

    // A dry run stops here, before any credentials are required.
    if (options.dry) {
      console.log(chalk.yellow('\n[DRY RUN] Would execute evals here'));
      return;
    }

    // Get the agent to check for required API key
    const agent = getAgent(config.agent);
    const apiKeyEnvVar = agent.getApiKeyEnvVar();
    const apiKey = process.env[apiKeyEnvVar];
    if (!apiKey) {
      console.error(chalk.red(`${apiKeyEnvVar} environment variable is required`));
      // Only point at the AI Gateway dashboard when the gateway key is the one
      // that is missing; direct-provider keys (e.g. ANTHROPIC_API_KEY,
      // OPENAI_API_KEY) are not issued there.
      // NOTE(review): assumes gateway agents report 'AI_GATEWAY_API_KEY' — confirm
      // against the agent implementations.
      if (apiKeyEnvVar === 'AI_GATEWAY_API_KEY') {
        console.error(chalk.gray(`Get your API key at: https://vercel.com/dashboard -> AI Gateway`));
      }
      else {
        console.error(chalk.gray(`Set ${apiKeyEnvVar} in your environment or in a .env file`));
      }
      process.exit(1);
    }

    // Filter fixtures to only the ones we want to run
    const selectedFixtures = fixtures.filter((f) => evalNames.includes(f.name));

    // Get experiment name from config file ("experiments/cc.ts" -> "cc")
    const experimentName = basename(configPath, '.ts').replace(/\.js$/, '');
    const resultsDir = resolve(process.cwd(), 'results');
    console.log(chalk.blue('\nStarting experiment...'));

    // Run the experiment
    const results = await runExperiment({
      config,
      fixtures: selectedFixtures,
      apiKey,
      resultsDir,
      experimentName,
      onProgress: (msg) => console.log(msg),
    });

    // Exit with appropriate code: success only if every run of every eval passed
    const allPassed = results.evals.every((e) => e.passedRuns === e.totalRuns);
    process.exit(allPassed ? 0 : 1);
  }
  catch (error) {
    if (error instanceof Error) {
      console.error(chalk.red(`Error: ${error.message}`));
    }
    else {
      console.error(chalk.red('An unknown error occurred'));
    }
    process.exit(1);
  }
}
|
|
124
|
+
/**
 * `init <name>` subcommand: scaffold a new eval project in the current
 * working directory and print the follow-up instructions.
 * Any failure is reported on stderr and the process exits with code 1.
 */
program
  .command('init')
  .argument('<name>', 'Name of the project to create')
  .description('Create a new eval project with example fixtures')
  .action(async (name) => {
    try {
      console.log(chalk.blue(`Creating new eval project: ${name}`));
      const createdDir = initProject({ name, targetDir: process.cwd() });
      console.log(chalk.green('Project created successfully!'));
      console.log(getPostInitInstructions(createdDir, name));
    }
    catch (error) {
      // Thrown Error instances carry a message; anything else gets a generic one.
      const message = error instanceof Error
        ? `Error: ${error.message}`
        : 'An unknown error occurred';
      console.error(chalk.red(message));
      process.exit(1);
    }
  });
|
|
151
|
+
/**
 * Default command - run experiment (no subcommand needed)
 * Usage: agent-eval cc --dry
 *
 * With no experiment argument the help text is shown instead.
 */
program
  .argument('[config]', 'Experiment name (e.g., "cc") or path')
  .option('--dry', 'Preview what would run without executing')
  .action(async (configInput, options) => {
    if (!configInput) {
      program.help();
      return;
    }
    await runExperimentCommand(configInput, options);
  });

// The action handlers above are async, so the program must be parsed with
// parseAsync(): parse() does not wait for them, and a rejection would surface
// as an unhandled promise rejection instead of a clean CLI error.
program.parseAsync().catch((error) => {
  if (error instanceof Error) {
    console.error(chalk.red(`Error: ${error.message}`));
  }
  else {
    console.error(chalk.red('An unknown error occurred'));
  }
  process.exit(1);
});
|
|
166
|
+
//# sourceMappingURL=cli.js.map
|
package/dist/cli.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;GAEG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,MAAM,IAAI,YAAY,EAAE,MAAM,QAAQ,CAAC;AAChD,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,MAAM,CAAC;AAClD,OAAO,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAChC,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AAC/D,OAAO,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAChD,OAAO,EAAE,WAAW,EAAE,uBAAuB,EAAE,MAAM,eAAe,CAAC;AACrE,OAAO,EAAE,QAAQ,EAAE,MAAM,uBAAuB,CAAC;AAEjD,6BAA6B;AAC7B,YAAY,EAAE,CAAC;AAEf,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,YAAY,CAAC;KAClB,WAAW,CAAC,8DAA8D,CAAC;KAC3E,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB;;;;GAIG;AACH,SAAS,iBAAiB,CAAC,KAAa;IACtC,6DAA6D;IAC7D,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAClG,OAAO,KAAK,CAAC;IACf,CAAC;IACD,6DAA6D;IAC7D,OAAO,eAAe,KAAK,KAAK,CAAC;AACnC,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,oBAAoB,CAAC,WAAmB,EAAE,OAA0B;IACjF,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,iBAAiB,CAAC,WAAW,CAAC,CAAC;QAClD,MAAM,kBAAkB,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,UAAU,CAAC,CAAC;QAE9D,IAAI,CAAC,UAAU,CAAC,kBAAkB,CAAC,EAAE,CAAC;YACpC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,0BAA0B,kBAAkB,EAAE,CAAC,CAAC,CAAC;YACzE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,uBAAuB,UAAU,KAAK,CAAC,CAAC,CAAC;QAChE,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,kBAAkB,CAAC,CAAC;QAEpD,mDAAmD;QACnD,kEAAkE;QAClE,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC,CAAC;QACxD,MAAM,QAAQ,GAAG,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC1B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,8BAA8B,QAAQ,EAAE,CAAC,CAAC,CAAC;YACnE,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,yDAAyD,CAAC,CAAC,CAAC;YACrF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,wBAAwB,QAAQ,KAAK,CAAC,CAAC,CAAC;
QAC/D,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;QAEvD,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,cAAc,MAAM,CAAC,MAAM,sBAAsB,CAAC,CAAC,CAAC;YAC7E,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBAC3B,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,KAAK,CAAC,WAAW,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YAC1E,CAAC;QACH,CAAC;QAED,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,8BAA8B,CAAC,CAAC,CAAC;YACzD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,6BAA6B;QAC7B,MAAM,cAAc,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QACnD,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,KAAK,EAAE,cAAc,CAAC,CAAC;QAEjE,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAC,CAAC;YACxD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,WAAW,QAAQ,CAAC,MAAM,+BAA+B,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACvG,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;YAC7B,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,CAAC;QAC1C,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,aAAa,SAAS,CAAC,MAAM,cAAc,MAAM,CAAC,IAAI,aAAa,SAAS,CAAC,MAAM,GAAG,MAAM,CAAC,IAAI,aAAa,CAAC,CAAC,CAAC;QACxI,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,MAAM,CAAC,KAAK,YAAY,MAAM,CAAC,KAAK,cAAc,MAAM,CAAC,OAAO,kBAAkB,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;QAExI,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;YAChB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,sCAAsC,CAAC,CAAC,CAAC;YAClE,OAAO;QACT,CAAC;QAED,8CAA8C;QAC9C,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACrC,MAAM,YAAY,GAAG,KAAK,CAAC,eAAe,EAAE,CAAC;QAC7C,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;QACzC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,YAAY,mCAAmC,CAAC,CAAC,CAAC;YAC7E,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,iEAAiE,CAAC,CAAC,CAAC;YAC7F,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,kDAAkD;QAClD,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAA
C,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;QAE5E,uCAAuC;QACvC,MAAM,cAAc,GAAG,QAAQ,CAAC,UAAU,EAAE,KAAK,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACxE,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CAAC,CAAC;QAErD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC,CAAC;QAEpD,qBAAqB;QACrB,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC;YAClC,MAAM;YACN,QAAQ,EAAE,gBAAgB;YAC1B,MAAM;YACN,UAAU;YACV,cAAc;YACd,UAAU,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;SACtC,CAAC,CAAC;QAEH,6BAA6B;QAC7B,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,SAAS,CAAC,CAAC;QAC3E,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAClC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACtD,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,2BAA2B,CAAC,CAAC,CAAC;QACxD,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC;AAED;;GAEG;AACH,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,QAAQ,CAAC,QAAQ,EAAE,+BAA+B,CAAC;KACnD,WAAW,CAAC,iDAAiD,CAAC;KAC9D,MAAM,CAAC,KAAK,EAAE,IAAY,EAAE,EAAE;IAC7B,IAAI,CAAC;QACH,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,8BAA8B,IAAI,EAAE,CAAC,CAAC,CAAC;QAE9D,MAAM,UAAU,GAAG,WAAW,CAAC;YAC7B,IAAI;YACJ,SAAS,EAAE,OAAO,CAAC,GAAG,EAAE;SACzB,CAAC,CAAC;QAEH,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC,CAAC;IACzD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACtD,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,2BAA2B,CAAC,CAAC,CAAC;QACxD,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL;;;GAGG;AACH,OAAO;KACJ,QAAQ,CAAC,UAAU,EAAE,sCAAsC,CAAC;KAC5D,MAAM,CAAC,OAAO,EAAE,0CAA0C,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,WAA+B,EAAE,OAA0B,EAAE,EAAE;IAC5E,IAAI,CAAC,WAAW,EAAE
,CAAC;QACjB,OAAO,CAAC,IAAI,EAAE,CAAC;QACf,OAAO;IACT,CAAC;IACD,MAAM,oBAAoB,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;AACnD,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* agent-eval
|
|
3
|
+
*
|
|
4
|
+
* Framework for testing AI coding agents in isolated sandboxes.
|
|
5
|
+
*/
|
|
6
|
+
export type { AgentType, ModelTier, EvalFilter, Sandbox, SetupFunction, ExperimentConfig, ResolvedExperimentConfig, EvalFixture, EvalRunResult, EvalRunData, EvalSummary, ExperimentResults, } from './lib/types.js';
|
|
7
|
+
export { REQUIRED_EVAL_FILES, EXCLUDED_FILES } from './lib/types.js';
|
|
8
|
+
export { CONFIG_DEFAULTS, validateConfig, resolveConfig, loadConfig, resolveEvalNames, } from './lib/config.js';
|
|
9
|
+
export { FixtureValidationError, discoverFixtures, validateFixtureFiles, validatePackageJson, loadFixture, loadAllFixtures, getFixtureFiles, readFixtureFiles, } from './lib/fixture.js';
|
|
10
|
+
export type { SandboxOptions, CommandResult, SandboxFile } from './lib/sandbox.js';
|
|
11
|
+
export { SandboxManager, DEFAULT_SANDBOX_TIMEOUT, IGNORED_PATTERNS, TEST_FILE_PATTERNS, collectLocalFiles, splitTestFiles, verifyNoTestFiles, } from './lib/sandbox.js';
|
|
12
|
+
export type { AgentRunOptions, AgentRunResult } from './lib/agents/types.js';
|
|
13
|
+
export type { Agent, ScriptResult } from './lib/agents/types.js';
|
|
14
|
+
export { getAgent, listAgents, registerAgent } from './lib/agents/index.js';
|
|
15
|
+
export type { SaveResultsOptions } from './lib/results.js';
|
|
16
|
+
export { agentResultToEvalRunData, createEvalSummary, createExperimentResults, saveResults, formatResultsTable, formatRunResult, createProgressDisplay, } from './lib/results.js';
|
|
17
|
+
export type { RunExperimentOptions } from './lib/runner.js';
|
|
18
|
+
export { runExperiment, runSingleEval } from './lib/runner.js';
|
|
19
|
+
export type { InitOptions } from './lib/init.js';
|
|
20
|
+
export { initProject, getPostInitInstructions } from './lib/init.js';
|
|
21
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,YAAY,EACV,SAAS,EACT,SAAS,EACT,UAAU,EACV,OAAO,EACP,aAAa,EACb,gBAAgB,EAChB,wBAAwB,EACxB,WAAW,EACX,aAAa,EACb,WAAW,EACX,WAAW,EACX,iBAAiB,GAClB,MAAM,gBAAgB,CAAC;AAGxB,OAAO,EAAE,mBAAmB,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAGrE,OAAO,EACL,eAAe,EACf,cAAc,EACd,aAAa,EACb,UAAU,EACV,gBAAgB,GACjB,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EACL,sBAAsB,EACtB,gBAAgB,EAChB,oBAAoB,EACpB,mBAAmB,EACnB,WAAW,EACX,eAAe,EACf,eAAe,EACf,gBAAgB,GACjB,MAAM,kBAAkB,CAAC;AAG1B,YAAY,EAAE,cAAc,EAAE,aAAa,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AACnF,OAAO,EACL,cAAc,EACd,uBAAuB,EACvB,gBAAgB,EAChB,kBAAkB,EAClB,iBAAiB,EACjB,cAAc,EACd,iBAAiB,GAClB,MAAM,kBAAkB,CAAC;AAG1B,YAAY,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAG7E,YAAY,EAAE,KAAK,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAC;AACjE,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAG5E,YAAY,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAC3D,OAAO,EACL,wBAAwB,EACxB,iBAAiB,EACjB,uBAAuB,EACvB,WAAW,EACX,kBAAkB,EAClB,eAAe,EACf,qBAAqB,GACtB,MAAM,kBAAkB,CAAC;AAG1B,YAAY,EAAE,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAG/D,YAAY,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,uBAAuB,EAAE,MAAM,eAAe,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* agent-eval
|
|
3
|
+
*
|
|
4
|
+
* Framework for testing AI coding agents in isolated sandboxes.
|
|
5
|
+
*/
|
|
6
|
+
// Re-export constants
|
|
7
|
+
export { REQUIRED_EVAL_FILES, EXCLUDED_FILES } from './lib/types.js';
|
|
8
|
+
// Re-export config utilities
|
|
9
|
+
export { CONFIG_DEFAULTS, validateConfig, resolveConfig, loadConfig, resolveEvalNames, } from './lib/config.js';
|
|
10
|
+
// Re-export fixture utilities
|
|
11
|
+
export { FixtureValidationError, discoverFixtures, validateFixtureFiles, validatePackageJson, loadFixture, loadAllFixtures, getFixtureFiles, readFixtureFiles, } from './lib/fixture.js';
|
|
12
|
+
export { SandboxManager, DEFAULT_SANDBOX_TIMEOUT, IGNORED_PATTERNS, TEST_FILE_PATTERNS, collectLocalFiles, splitTestFiles, verifyNoTestFiles, } from './lib/sandbox.js';
|
|
13
|
+
export { getAgent, listAgents, registerAgent } from './lib/agents/index.js';
|
|
14
|
+
export { agentResultToEvalRunData, createEvalSummary, createExperimentResults, saveResults, formatResultsTable, formatRunResult, createProgressDisplay, } from './lib/results.js';
|
|
15
|
+
export { runExperiment, runSingleEval } from './lib/runner.js';
|
|
16
|
+
export { initProject, getPostInitInstructions } from './lib/init.js';
|
|
17
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAkBH,sBAAsB;AACtB,OAAO,EAAE,mBAAmB,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAErE,6BAA6B;AAC7B,OAAO,EACL,eAAe,EACf,cAAc,EACd,aAAa,EACb,UAAU,EACV,gBAAgB,GACjB,MAAM,iBAAiB,CAAC;AAEzB,8BAA8B;AAC9B,OAAO,EACL,sBAAsB,EACtB,gBAAgB,EAChB,oBAAoB,EACpB,mBAAmB,EACnB,WAAW,EACX,eAAe,EACf,eAAe,EACf,gBAAgB,GACjB,MAAM,kBAAkB,CAAC;AAI1B,OAAO,EACL,cAAc,EACd,uBAAuB,EACvB,gBAAgB,EAChB,kBAAkB,EAClB,iBAAiB,EACjB,cAAc,EACd,iBAAiB,GAClB,MAAM,kBAAkB,CAAC;AAO1B,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAI5E,OAAO,EACL,wBAAwB,EACxB,iBAAiB,EACjB,uBAAuB,EACvB,WAAW,EACX,kBAAkB,EAClB,eAAe,EACf,qBAAqB,GACtB,MAAM,kBAAkB,CAAC;AAI1B,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAI/D,OAAO,EAAE,WAAW,EAAE,uBAAuB,EAAE,MAAM,eAAe,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Claude Code agent implementation.
|
|
3
|
+
* Uses Vercel AI Gateway for model access.
|
|
4
|
+
*/
|
|
5
|
+
import type { Agent } from './types.js';
|
|
6
|
+
/**
|
|
7
|
+
* Create Claude Code agent with specified authentication method.
|
|
8
|
+
*/
|
|
9
|
+
export declare function createClaudeCodeAgent({ useVercelAiGateway }: {
|
|
10
|
+
useVercelAiGateway: boolean;
|
|
11
|
+
}): Agent;
|
|
12
|
+
//# sourceMappingURL=claude-code.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"claude-code.d.ts","sourceRoot":"","sources":["../../../src/lib/agents/claude-code.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,KAAK,EAAmC,MAAM,YAAY,CAAC;AA8CzE;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,EAAE,kBAAkB,EAAE,EAAE;IAAE,kBAAkB,EAAE,OAAO,CAAA;CAAE,GAAG,KAAK,CAiMpG"}
|