@vercel/agent-eval 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +370 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +166 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +17 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/agents/claude-code.d.ts +12 -0
- package/dist/lib/agents/claude-code.d.ts.map +1 -0
- package/dist/lib/agents/claude-code.js +203 -0
- package/dist/lib/agents/claude-code.js.map +1 -0
- package/dist/lib/agents/codex.d.ts +12 -0
- package/dist/lib/agents/codex.d.ts.map +1 -0
- package/dist/lib/agents/codex.js +247 -0
- package/dist/lib/agents/codex.js.map +1 -0
- package/dist/lib/agents/index.d.ts +7 -0
- package/dist/lib/agents/index.d.ts.map +1 -0
- package/dist/lib/agents/index.js +14 -0
- package/dist/lib/agents/index.js.map +1 -0
- package/dist/lib/agents/registry.d.ts +23 -0
- package/dist/lib/agents/registry.d.ts.map +1 -0
- package/dist/lib/agents/registry.js +35 -0
- package/dist/lib/agents/registry.js.map +1 -0
- package/dist/lib/agents/shared.d.ts +47 -0
- package/dist/lib/agents/shared.d.ts.map +1 -0
- package/dist/lib/agents/shared.js +99 -0
- package/dist/lib/agents/shared.js.map +1 -0
- package/dist/lib/agents/types.d.ts +69 -0
- package/dist/lib/agents/types.d.ts.map +1 -0
- package/dist/lib/agents/types.js +5 -0
- package/dist/lib/agents/types.js.map +1 -0
- package/dist/lib/config.d.ts +34 -0
- package/dist/lib/config.d.ts.map +1 -0
- package/dist/lib/config.js +117 -0
- package/dist/lib/config.js.map +1 -0
- package/dist/lib/fixture.d.ts +52 -0
- package/dist/lib/fixture.d.ts.map +1 -0
- package/dist/lib/fixture.js +175 -0
- package/dist/lib/fixture.js.map +1 -0
- package/dist/lib/init.d.ts +21 -0
- package/dist/lib/init.d.ts.map +1 -0
- package/dist/lib/init.js +250 -0
- package/dist/lib/init.js.map +1 -0
- package/dist/lib/results.d.ts +54 -0
- package/dist/lib/results.d.ts.map +1 -0
- package/dist/lib/results.js +186 -0
- package/dist/lib/results.js.map +1 -0
- package/dist/lib/runner.d.ts +43 -0
- package/dist/lib/runner.d.ts.map +1 -0
- package/dist/lib/runner.js +142 -0
- package/dist/lib/runner.js.map +1 -0
- package/dist/lib/sandbox.d.ts +117 -0
- package/dist/lib/sandbox.d.ts.map +1 -0
- package/dist/lib/sandbox.js +248 -0
- package/dist/lib/sandbox.js.map +1 -0
- package/dist/lib/types.d.ts +166 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +14 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/test-setup.d.ts +2 -0
- package/dist/test-setup.d.ts.map +1 -0
- package/dist/test-setup.js +6 -0
- package/dist/test-setup.js.map +1 -0
- package/package.json +58 -0
package/dist/lib/init.js
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Project initialization - create new eval projects.
|
|
3
|
+
*/
|
|
4
|
+
import { mkdirSync, writeFileSync, existsSync } from 'fs';
|
|
5
|
+
import { join, dirname } from 'path';
|
|
6
|
+
/**
 * Build the root package.json for a freshly scaffolded eval project.
 *
 * @param projectName - Value written to the manifest's `name` field.
 * @returns The manifest serialized as JSON with two-space indentation.
 */
function getPackageJson(projectName) {
    // Tooling the generated project needs to type-check and run its evals.
    const devDependencies = {
        'agent-eval': '^0.0.1',
        '@types/node': '^22.0.0',
        typescript: '^5.6.0',
        vitest: '^2.1.0',
    };
    const manifest = {
        name: projectName,
        version: '0.0.1',
        private: true,
        type: 'module',
        devDependencies,
    };
    return JSON.stringify(manifest, null, 2);
}
|
|
23
|
+
/**
 * Build the contents of the `.env.example` file.
 *
 * The file documents the AI Gateway API key and the two alternative Vercel
 * tokens (personal access token or OIDC token) a user is expected to supply.
 *
 * @returns The file contents, terminated by a newline.
 */
function getEnvExample() {
    const lines = [
        '# Required - Vercel AI Gateway API key (works for all agents)',
        '# Get yours at: https://vercel.com/dashboard -> AI Gateway',
        'AI_GATEWAY_API_KEY=your-ai-gateway-api-key',
        '',
        '# Required - Vercel token for sandbox access (choose ONE of the options below)',
        '# The @vercel/sandbox package automatically detects either token.',
        '',
        '# Option 1: Personal Access Token (for local development)',
        '# Create at: https://vercel.com/account/tokens',
        'VERCEL_TOKEN=your-vercel-token',
        '',
        '# Option 2: OIDC Token (for CI/CD pipelines like GitHub Actions)',
        "# Automatically provided by Vercel's CI integration",
        '# VERCEL_OIDC_TOKEN=your-oidc-token',
        '', // trailing newline
    ];
    return lines.join('\n');
}
|
|
43
|
+
/**
 * Build the contents of the project's `.gitignore`.
 *
 * @returns One ignore pattern per line, terminated by a newline.
 */
function getGitignore() {
    const patterns = [
        'node_modules/',
        'dist/',
        '.env',
        '.env.local',
        'results/',
        '*.log',
        '.DS_Store',
    ];
    return patterns.map((pattern) => `${pattern}\n`).join('');
}
|
|
56
|
+
/**
 * Build the default experiment configuration file for the Claude Code agent.
 *
 * @returns TypeScript source text for the experiment module, terminated by a newline.
 */
function getCCExperiment() {
    const sourceLines = [
        "import type { ExperimentConfig } from 'agent-eval';",
        '',
        'const config: ExperimentConfig = {',
        "agent: 'vercel-ai-gateway/claude-code',",
        'runs: 1,',
        'earlyExit: true,',
        "scripts: ['build'],",
        'timeout: 300,',
        '};',
        '',
        'export default config;',
        '', // trailing newline
    ];
    return sourceLines.join('\n');
}
|
|
73
|
+
/**
 * Build the experiment configuration file for the Codex agent.
 *
 * @returns TypeScript source text for the experiment module, terminated by a newline.
 */
function getCodexExperiment() {
    const sourceLines = [
        "import type { ExperimentConfig } from 'agent-eval';",
        '',
        'const config: ExperimentConfig = {',
        "agent: 'vercel-ai-gateway/codex',",
        'runs: 1,',
        'earlyExit: true,',
        "scripts: ['build'],",
        'timeout: 300,',
        '};',
        '',
        'export default config;',
        '', // trailing newline
    ];
    return sourceLines.join('\n');
}
|
|
90
|
+
/**
 * Build the PROMPT.md of the example `add-greeting` eval fixture.
 *
 * @returns Markdown text handed to the agent as its task, terminated by a newline.
 */
function getExamplePrompt() {
    const task = 'Add a greeting message below the heading that says "Welcome, user!"';
    const requirements = [
        '- Add a paragraph element below the h1',
        '- The text should be exactly "Welcome, user!"',
        '- Keep the existing heading unchanged',
    ];
    return [task, '', 'Requirements:', ...requirements, ''].join('\n');
}
|
|
102
|
+
/**
 * Build the EVAL.ts of the example `add-greeting` eval fixture.
 *
 * The generated vitest file checks that the greeting text is present in
 * `src/App.tsx` and that `npm run build` still succeeds.
 *
 * @returns TypeScript source text, terminated by a newline.
 */
function getExampleEval() {
    const header = [
        "import { readFileSync } from 'fs';",
        "import { execSync } from 'child_process';",
        "import { test, expect } from 'vitest';",
    ];
    const greetingTest = [
        "test('greeting message exists in source', () => {",
        "const content = readFileSync('src/App.tsx', 'utf-8');",
        "expect(content).toContain('Welcome, user!');",
        '});',
    ];
    const buildTest = [
        "test('app still builds', () => {",
        '// This throws if the build fails',
        "execSync('npm run build', { stdio: 'pipe' });",
        '});',
    ];
    // Sections are separated by one blank line; the final '' yields the trailing newline.
    return [...header, '', ...greetingTest, '', ...buildTest, ''].join('\n');
}
|
|
121
|
+
/**
 * Build the package.json of the example `add-greeting` eval fixture.
 *
 * @returns The manifest serialized as JSON with two-space indentation.
 */
function getExamplePackageJson() {
    const scripts = { build: 'tsc' };
    const dependencies = { react: '^18.0.0' };
    const devDependencies = {
        '@types/react': '^18.0.0',
        typescript: '^5.0.0',
        vitest: '^2.1.0',
    };
    const manifest = {
        name: 'add-greeting',
        type: 'module',
        scripts,
        dependencies,
        devDependencies,
    };
    return JSON.stringify(manifest, null, 2);
}
|
|
141
|
+
/**
 * Build the root tsconfig.json of the generated project.
 *
 * The configuration is type-check only (`noEmit`) and includes just the
 * `experiments` directory.
 *
 * @returns The configuration serialized as JSON with two-space indentation.
 */
function getRootTsconfig() {
    const compilerOptions = {
        target: 'ES2022',
        module: 'NodeNext',
        moduleResolution: 'NodeNext',
        strict: true,
        skipLibCheck: true,
        noEmit: true,
        lib: ['ES2022'],
    };
    const tsconfig = { compilerOptions, include: ['experiments'] };
    return JSON.stringify(tsconfig, null, 2);
}
|
|
158
|
+
/**
 * Build the tsconfig.json of the example `add-greeting` eval fixture.
 *
 * The configuration compiles the `src` directory with the React JSX runtime
 * and emits to `dist`.
 *
 * @returns The configuration serialized as JSON with two-space indentation.
 */
function getExampleTsconfig() {
    const compilerOptions = {
        target: 'ES2020',
        module: 'ESNext',
        moduleResolution: 'bundler',
        jsx: 'react-jsx',
        strict: true,
        outDir: 'dist',
        skipLibCheck: true,
    };
    const tsconfig = { compilerOptions, include: ['src'] };
    return JSON.stringify(tsconfig, null, 2);
}
|
|
175
|
+
/**
 * Build the App.tsx of the example `add-greeting` eval fixture.
 *
 * The component renders only a heading and a TODO marker; the example eval
 * asks the agent to add the greeting below it.
 *
 * @returns TSX source text, terminated by a newline.
 */
function getExampleApp() {
    const component = [
        'export function App() {',
        'return (',
        '<div>',
        '<h1>Hello World</h1>',
        '{/* TODO: Add greeting message here */}',
        '</div>',
        ');',
        '}',
    ];
    return [...component, '', 'export default App;', ''].join('\n');
}
|
|
191
|
+
/**
 * Collect every file that makes up a newly scaffolded project.
 *
 * @param projectName - Forwarded to the root package.json template.
 * @returns A list of `{ path, content }` records; each `path` is relative to
 *   the project root and uses forward slashes.
 */
function getTemplateFiles(projectName) {
    // [relative path, file contents] pairs, in the order they are written.
    const templates = [
        ['package.json', getPackageJson(projectName)],
        ['tsconfig.json', getRootTsconfig()],
        ['.env.example', getEnvExample()],
        ['.gitignore', getGitignore()],
        ['experiments/cc.ts', getCCExperiment()],
        ['experiments/codex.ts', getCodexExperiment()],
        ['evals/add-greeting/PROMPT.md', getExamplePrompt()],
        ['evals/add-greeting/EVAL.ts', getExampleEval()],
        ['evals/add-greeting/package.json', getExamplePackageJson()],
        ['evals/add-greeting/tsconfig.json', getExampleTsconfig()],
        ['evals/add-greeting/src/App.tsx', getExampleApp()],
    ];
    return templates.map(([path, content]) => ({ path, content }));
}
|
|
209
|
+
/**
 * Scaffold a new eval project on disk.
 *
 * @param options - `name` is the project (and directory) name; `targetDir` is
 *   the parent directory and falls back to the current working directory.
 * @returns The path of the created project directory.
 * @throws Error when the project directory already exists.
 */
export function initProject(options) {
    const parentDir = options.targetDir ?? process.cwd();
    const projectDir = join(parentDir, options.name);
    // Refuse to write into an existing directory.
    if (existsSync(projectDir)) {
        throw new Error(`Directory already exists: ${projectDir}`);
    }
    mkdirSync(projectDir, { recursive: true });
    getTemplateFiles(options.name).forEach(({ path, content }) => {
        const destination = join(projectDir, path);
        // Template paths may be nested, so make sure the parent directory exists first.
        mkdirSync(dirname(destination), { recursive: true });
        writeFileSync(destination, content);
    });
    return projectDir;
}
|
|
233
|
+
/**
 * Build the message shown to the user after a project has been created.
 *
 * @param projectDir - Path of the created project, echoed in the first line.
 * @param projectName - Directory name used in the suggested `cd` command.
 * @returns A multi-line message that starts and ends with a newline.
 */
export function getPostInitInstructions(projectDir, projectName) {
    const nextSteps = [
        `1. cd ${projectName}`,
        '2. npm install',
        '3. Copy .env.example to .env and add your API keys',
        '4. npx agent-eval',
    ];
    const message = [
        '', // leading newline
        `Project created at: ${projectDir}`,
        '',
        'Next steps:',
        ...nextSteps,
        '',
        'For more information, see the documentation at:',
        'https://github.com/vercel-labs/agent-eval',
        '', // trailing newline
    ];
    return message.join('\n');
}
|
|
250
|
+
//# sourceMappingURL=init.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"init.js","sourceRoot":"","sources":["../../src/lib/init.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAC1D,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AAoBrC;;GAEG;AACH,SAAS,cAAc,CAAC,WAAmB;IACzC,OAAO,IAAI,CAAC,SAAS,CACnB;QACE,IAAI,EAAE,WAAW;QACjB,OAAO,EAAE,OAAO;QAChB,OAAO,EAAE,IAAI;QACb,IAAI,EAAE,QAAQ;QACd,eAAe,EAAE;YACf,YAAY,EAAE,QAAQ;YACtB,aAAa,EAAE,SAAS;YACxB,UAAU,EAAE,QAAQ;YACpB,MAAM,EAAE,QAAQ;SACjB;KACF,EACD,IAAI,EACJ,CAAC,CACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,aAAa;IACpB,OAAO;;;;;;;;;;;;;;CAcR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,YAAY;IACnB,OAAO;;;;;;;CAOR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,eAAe;IACtB,OAAO;;;;;;;;;;;CAWR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB;IACzB,OAAO;;;;;;;;;;;CAWR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB;IACvB,OAAO;;;;;;CAMR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,cAAc;IACrB,OAAO;;;;;;;;;;;;;CAaR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,qBAAqB;IAC5B,OAAO,IAAI,CAAC,SAAS,CACnB;QACE,IAAI,EAAE,cAAc;QACpB,IAAI,EAAE,QAAQ;QACd,OAAO,EAAE;YACP,KAAK,EAAE,KAAK;SACb;QACD,YAAY,EAAE;YACZ,KAAK,EAAE,SAAS;SACjB;QACD,eAAe,EAAE;YACf,cAAc,EAAE,SAAS;YACzB,UAAU,EAAE,QAAQ;YACpB,MAAM,EAAE,QAAQ;SACjB;KACF,EACD,IAAI,EACJ,CAAC,CACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,eAAe;IACtB,OAAO,IAAI,CAAC,SAAS,CACnB;QACE,eAAe,EAAE;YACf,MAAM,EAAE,QAAQ;YAChB,MAAM,EAAE,UAAU;YAClB,gBAAgB,EAAE,UAAU;YAC5B,MAAM,EAAE,IAAI;YACZ,YAAY,EAAE,IAAI;YAClB,MAAM,EAAE,IAAI;YACZ,GAAG,EAAE,CAAC,QAAQ,CAAC;SAChB;QACD,OAAO,EAAE,CAAC,aAAa,CAAC;KACzB,EACD,IAAI,EACJ,CAAC,CACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB;IACzB,OAAO,IAAI,CAAC,SAAS,CACnB;QACE,eAAe,EAAE;YACf,MAAM,EAAE,QAAQ;YAChB,MAAM,EAAE,QAAQ;YAChB,gBAAgB,EAAE,SAAS;YAC3B,GAAG,EAAE,WAAW;YAChB,MAAM,EAAE,IAAI;YACZ,MAAM,EAAE,MAAM;YACd,YAAY,EAAE,IAAI;SACnB;QACD,OAAO,EAAE,CAAC,KAAK,CAAC;KACjB,EACD,IAAI,EACJ,CAAC,CACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,aAAa;IACpB,OAAO;;;;;;;;;;CAUR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,WAAmB;IAC3C,OAAO;QACL,EAAE,IAAI,EAAE,cAAc,EAAE
,OAAO,EAAE,cAAc,CAAC,WAAW,CAAC,EAAE;QAC9D,EAAE,IAAI,EAAE,eAAe,EAAE,OAAO,EAAE,eAAe,EAAE,EAAE;QACrD,EAAE,IAAI,EAAE,cAAc,EAAE,OAAO,EAAE,aAAa,EAAE,EAAE;QAClD,EAAE,IAAI,EAAE,YAAY,EAAE,OAAO,EAAE,YAAY,EAAE,EAAE;QAC/C,EAAE,IAAI,EAAE,mBAAmB,EAAE,OAAO,EAAE,eAAe,EAAE,EAAE;QACzD,EAAE,IAAI,EAAE,sBAAsB,EAAE,OAAO,EAAE,kBAAkB,EAAE,EAAE;QAC/D,EAAE,IAAI,EAAE,8BAA8B,EAAE,OAAO,EAAE,gBAAgB,EAAE,EAAE;QACrE,EAAE,IAAI,EAAE,4BAA4B,EAAE,OAAO,EAAE,cAAc,EAAE,EAAE;QACjE,EAAE,IAAI,EAAE,iCAAiC,EAAE,OAAO,EAAE,qBAAqB,EAAE,EAAE;QAC7E,EAAE,IAAI,EAAE,kCAAkC,EAAE,OAAO,EAAE,kBAAkB,EAAE,EAAE;QAC3E,EAAE,IAAI,EAAE,gCAAgC,EAAE,OAAO,EAAE,aAAa,EAAE,EAAE;KACrE,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,OAAoB;IAC9C,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;IACrD,MAAM,UAAU,GAAG,IAAI,CAAC,SAAS,EAAE,OAAO,CAAC,IAAI,CAAC,CAAC;IAEjD,oCAAoC;IACpC,IAAI,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,2BAA2B;IAC3B,SAAS,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE3C,2BAA2B;IAC3B,MAAM,KAAK,GAAG,gBAAgB,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAC7C,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7C,MAAM,OAAO,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;QAElC,4BAA4B;QAC5B,SAAS,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAExC,aAAa;QACb,aAAa,CAAC,QAAQ,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;IACxC,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,uBAAuB,CAAC,UAAkB,EAAE,WAAmB;IAC7E,OAAO;sBACa,UAAU;;;UAGtB,WAAW;;;;;;;CAOpB,CAAC;AACF,CAAC"}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Results storage and reporting for eval experiments.
|
|
3
|
+
*/
|
|
4
|
+
import type { EvalRunResult, EvalRunData, EvalSummary, ExperimentResults, ResolvedExperimentConfig } from './types.js';
|
|
5
|
+
import type { AgentRunResult } from './agents/types.js';
|
|
6
|
+
/**
 * Convert an agent run result into the eval run record (result, transcript
 * and captured output).
 *
 * @param agentResult - Outcome reported by the agent for one run.
 * @returns The run record stored and summarized by the results module.
 */
export declare function agentResultToEvalRunData(
    agentResult: AgentRunResult,
): EvalRunData;
|
|
10
|
+
/**
 * Aggregate the runs of one eval into a summary.
 *
 * @param name - Name of the eval the runs belong to.
 * @param runData - Run records to aggregate.
 * @returns The summary for this eval.
 */
export declare function createEvalSummary(
    name: string,
    runData: EvalRunData[],
): EvalSummary;
|
|
14
|
+
/**
 * Bundle eval summaries with the experiment configuration and timing.
 *
 * @param config - Resolved configuration the experiment ran with.
 * @param evals - One summary per eval.
 * @param startedAt - When the experiment started.
 * @param completedAt - When the experiment finished.
 * @returns The experiment results record.
 */
export declare function createExperimentResults(
    config: ResolvedExperimentConfig,
    evals: EvalSummary[],
    startedAt: Date,
    completedAt: Date,
): ExperimentResults;
|
|
18
|
+
/**
 * Where `saveResults` should write its output.
 */
export interface SaveResultsOptions {
    /** Base directory under which all results are stored. */
    resultsDir: string;
    /** Experiment name; used as the subdirectory below `resultsDir`. */
    experimentName: string;
}
|
|
27
|
+
/**
 * Write experiment results to disk and return the directory they were
 * written to.
 *
 * Directory layout (the timestamp directory is the experiment start time
 * with every ':' replaced by '-'):
 *
 *   results/
 *     experiment-name/
 *       2024-01-26T12-00-00.000Z/
 *         eval-1/
 *           run-1/
 *             result.json
 *             transcript.jsonl
 *             outputs/
 *           summary.json
 *
 * @param results - Experiment results to persist.
 * @param options - Base directory and experiment name.
 * @returns Path of the timestamped experiment directory.
 */
export declare function saveResults(
    results: ExperimentResults,
    options: SaveResultsOptions,
): string;
|
|
42
|
+
/**
 * Render experiment results as text for terminal display.
 *
 * @param results - Experiment results to render.
 * @returns The rendered multi-line report.
 */
export declare function formatResultsTable(
    results: ExperimentResults,
): string;
|
|
46
|
+
/**
 * Render the outcome of a single run as one line for terminal display
 * (used while an experiment is in progress).
 *
 * @param evalName - Name of the eval the run belongs to.
 * @param runNumber - Number of this run, shown as `[runNumber/totalRuns]`.
 * @param totalRuns - Total number of runs for the eval.
 * @param result - Outcome of the run.
 * @returns The rendered line.
 */
export declare function formatRunResult(
    evalName: string,
    runNumber: number,
    totalRuns: number,
    result: EvalRunResult,
): string;
|
|
50
|
+
/**
 * Render the "currently running" indicator for one run of an eval.
 *
 * @param evalName - Name of the eval being run.
 * @param runNumber - Number of the run that is starting.
 * @param totalRuns - Total number of runs for the eval.
 * @returns The rendered progress line.
 */
export declare function createProgressDisplay(
    evalName: string,
    runNumber: number,
    totalRuns: number,
): string;
|
|
54
|
+
//# sourceMappingURL=results.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"results.d.ts","sourceRoot":"","sources":["../../src/lib/results.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,KAAK,EACV,aAAa,EACb,WAAW,EACX,WAAW,EACX,iBAAiB,EACjB,wBAAwB,EACzB,MAAM,YAAY,CAAC;AACpB,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAExD;;GAEG;AACH,wBAAgB,wBAAwB,CAAC,WAAW,EAAE,cAAc,GAAG,WAAW,CA4BjF;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,GAAG,WAAW,CAanF;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,wBAAwB,EAChC,KAAK,EAAE,WAAW,EAAE,EACpB,SAAS,EAAE,IAAI,EACf,WAAW,EAAE,IAAI,GAChB,iBAAiB,CAOnB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,iCAAiC;IACjC,UAAU,EAAE,MAAM,CAAC;IACnB,8CAA8C;IAC9C,cAAc,EAAE,MAAM,CAAC;CACxB;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,WAAW,CACzB,OAAO,EAAE,iBAAiB,EAC1B,OAAO,EAAE,kBAAkB,GAC1B,MAAM,CA8ER;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,OAAO,EAAE,iBAAiB,GAAG,MAAM,CAsCrE;AAED;;GAEG;AACH,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,aAAa,GACpB,MAAM,CAYR;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,SAAS,EAAE,MAAM,GAChB,MAAM,CAER"}
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Results storage and reporting for eval experiments.
|
|
3
|
+
*/
|
|
4
|
+
import { mkdirSync, writeFileSync } from 'fs';
|
|
5
|
+
import { join } from 'path';
|
|
6
|
+
import chalk from 'chalk';
|
|
7
|
+
/**
 * Convert an AgentRunResult into EvalRunData (result + transcript + captured output).
 *
 * @param agentResult - Outcome reported by the agent. Reads `success`, `error`,
 *   `duration`, `transcript`, `testResult.output` and `scriptsResults[name].output`.
 * @returns A record with:
 *   - `result`: `status` ('passed' when `success` is truthy, otherwise 'failed'),
 *     `error`, and `duration` divided by 1000 (the input is treated as milliseconds);
 *   - `transcript`: passed through unchanged;
 *   - `outputContent`: `eval` holds the test output and `scripts` maps script
 *     names to their output. Empty outputs are skipped, `scripts` is only
 *     present when at least one script produced output, and the whole field is
 *     `undefined` when nothing was captured.
 */
export function agentResultToEvalRunData(agentResult) {
    // Output of the EVAL.ts test run, if any.
    const evalOutput = agentResult.testResult?.output;
    // Keep only scripts that actually printed something, so a run with silent
    // scripts does not end up with an empty `scripts` object.
    const scriptOutputs = Object.fromEntries(Object.entries(agentResult.scriptsResults ?? {})
        .filter(([, scriptResult]) => scriptResult.output)
        .map(([scriptName, scriptResult]) => [scriptName, scriptResult.output]));
    const hasScriptOutput = Object.keys(scriptOutputs).length > 0;
    // Script outputs are nested under 'scripts' so a script named 'eval'
    // cannot collide with the test output.
    const outputContent = {
        ...(evalOutput ? { eval: evalOutput } : {}),
        ...(hasScriptOutput ? { scripts: scriptOutputs } : {}),
    };
    return {
        result: {
            status: agentResult.success ? 'passed' : 'failed',
            error: agentResult.error,
            duration: agentResult.duration / 1000, // Convert to seconds
        },
        transcript: agentResult.transcript,
        outputContent: evalOutput || hasScriptOutput ? outputContent : undefined,
    };
}
|
|
36
|
+
/**
 * Aggregate the runs of one eval into a summary.
 *
 * @param name - Name of the eval.
 * @param runData - Run records; each record's `result` supplies `status` and `duration`.
 * @returns `name`, `totalRuns`, `passedRuns`, `passRate` (percentage, 0-100),
 *   `meanDuration` (average of the run durations) and `runs` (the `runData`
 *   array itself). Both rates are 0 when there are no runs.
 */
export function createEvalSummary(name, runData) {
    const results = runData.map((run) => run.result);
    const totalRuns = results.length;
    let passedRuns = 0;
    for (const result of results) {
        if (result.status === 'passed') {
            passedRuns += 1;
        }
    }
    let durationSum = 0;
    for (const result of results) {
        durationSum += result.duration;
    }
    // Guard the divisions so an eval without runs reports zeros instead of NaN.
    const hasRuns = totalRuns > 0;
    return {
        name,
        totalRuns,
        passedRuns,
        passRate: hasRuns ? (passedRuns / totalRuns) * 100 : 0,
        meanDuration: hasRuns ? durationSum / totalRuns : 0,
        runs: runData,
    };
}
|
|
52
|
+
/**
 * Bundle eval summaries with the experiment configuration and timing.
 *
 * @param config - Resolved experiment configuration; stored as-is.
 * @param evals - Eval summaries; stored as-is.
 * @param startedAt - Start time; stored as an ISO 8601 string.
 * @param completedAt - Completion time; stored as an ISO 8601 string.
 * @returns `{ startedAt, completedAt, config, evals }`.
 */
export function createExperimentResults(config, evals, startedAt, completedAt) {
    const startedIso = startedAt.toISOString();
    const completedIso = completedAt.toISOString();
    return { startedAt: startedIso, completedAt: completedIso, config, evals };
}
|
|
63
|
+
/**
 * Turn a script name into a string that is safe to use as a single file name.
 *
 * Path separators would make the write land in (or fail on) another directory,
 * and ':' is replaced for the same reason it is replaced in the timestamp
 * directory name.
 */
function toOutputFileName(scriptName) {
    return scriptName.replace(/[\\/:]/g, '-');
}
/**
 * Write the captured test/script output of one run into `outputsDir`.
 *
 * @returns The `outputPaths` record for result.json (paths relative to the run
 *   directory), or `undefined` when no file was written.
 */
function writeRunOutputs(outputsDir, outputContent) {
    // EVAL.ts test output
    let evalPath;
    if (outputContent.eval) {
        writeFileSync(join(outputsDir, 'eval.txt'), outputContent.eval);
        evalPath = './outputs/eval.txt';
    }
    // npm script outputs (nested under 'scripts' to avoid collision with 'eval')
    let scriptPaths;
    if (outputContent.scripts) {
        const written = [];
        for (const [name, content] of Object.entries(outputContent.scripts)) {
            if (!content) {
                continue;
            }
            const fileName = `${toOutputFileName(name)}.txt`;
            writeFileSync(join(outputsDir, fileName), content);
            // Keyed by the original script name; the value records where it was written.
            written.push([name, `./outputs/${fileName}`]);
        }
        scriptPaths = Object.fromEntries(written);
    }
    const hasScriptPaths = scriptPaths !== undefined && Object.keys(scriptPaths).length > 0;
    if (!evalPath && !hasScriptPaths) {
        return undefined;
    }
    return {
        ...(evalPath ? { eval: evalPath } : {}),
        ...(scriptPaths ? { scripts: scriptPaths } : {}),
    };
}
/**
 * Save experiment results to disk.
 *
 * Creates this directory structure (the timestamp directory is
 * `results.startedAt` with every ':' replaced by '-'):
 *
 *   results/
 *     experiment-name/
 *       2024-01-26T12-00-00.000Z/
 *         eval-1/
 *           run-1/
 *             result.json
 *             transcript.jsonl
 *             outputs/
 *           summary.json
 *
 * Script output is written to `outputs/<script>.txt`; path separators and ':'
 * in the script name are replaced by '-' in the file name, and result.json
 * records the path actually used under the original script name.
 *
 * @param results - Experiment results; reads `startedAt` and `evals`.
 * @param options - `resultsDir` (base directory) and `experimentName` (subdirectory).
 * @returns Path of the timestamped experiment directory.
 */
export function saveResults(results, options) {
    const timestamp = results.startedAt.replace(/:/g, '-');
    const experimentDir = join(options.resultsDir, options.experimentName, timestamp);
    mkdirSync(experimentDir, { recursive: true });
    for (const evalSummary of results.evals) {
        const evalDir = join(experimentDir, evalSummary.name);
        mkdirSync(evalDir, { recursive: true });
        // summary.json holds the aggregate numbers only, not the individual runs.
        const summaryForFile = {
            totalRuns: evalSummary.totalRuns,
            passedRuns: evalSummary.passedRuns,
            passRate: `${evalSummary.passRate.toFixed(0)}%`,
            meanDuration: evalSummary.meanDuration,
        };
        writeFileSync(join(evalDir, 'summary.json'), JSON.stringify(summaryForFile, null, 2));
        evalSummary.runs.forEach((runData, index) => {
            // Run directories are numbered from 1.
            const runDir = join(evalDir, `run-${index + 1}`);
            mkdirSync(runDir, { recursive: true });
            // Copy so the in-memory result is not mutated by the path fields added below.
            const resultWithPaths = { ...runData.result };
            if (runData.transcript) {
                writeFileSync(join(runDir, 'transcript.jsonl'), runData.transcript);
                resultWithPaths.transcriptPath = './transcript.jsonl';
            }
            // outputs/ is created even when there is nothing to put in it.
            const outputsDir = join(runDir, 'outputs');
            mkdirSync(outputsDir, { recursive: true });
            if (runData.outputContent) {
                const outputPaths = writeRunOutputs(outputsDir, runData.outputContent);
                if (outputPaths) {
                    resultWithPaths.outputPaths = outputPaths;
                }
            }
            writeFileSync(join(runDir, 'result.json'), JSON.stringify(resultWithPaths, null, 2));
        });
    }
    return experimentDir;
}
|
|
137
|
+
/**
 * Render experiment results as a multi-line report for the terminal.
 *
 * Each eval gets a pass/fail line and a mean-duration line; the report ends
 * with the overall pass count and the wall-clock time between
 * `results.startedAt` and `results.completedAt`.
 *
 * @param results - Experiment results; reads `evals`, `startedAt` and `completedAt`.
 * @returns The report, lines joined with '\n'.
 */
export function formatResultsTable(results) {
    const rule = chalk.gray('─'.repeat(60));
    let totalRuns = 0;
    let totalPassed = 0;
    const evalLines = [];
    for (const summary of results.evals) {
        totalRuns += summary.totalRuns;
        totalPassed += summary.passedRuns;
        // An eval counts as passed only when every one of its runs passed.
        const allPassed = summary.passedRuns === summary.totalRuns;
        const paint = allPassed ? chalk.green : chalk.red;
        const icon = allPassed ? '✓' : '✗';
        evalLines.push(paint(`${icon} ${summary.name}: ${summary.passedRuns}/${summary.totalRuns} passed (${summary.passRate.toFixed(0)}%)`), chalk.gray(` Mean duration: ${summary.meanDuration.toFixed(1)}s`), '');
    }
    const overallPassRate = totalRuns > 0 ? (totalPassed / totalRuns) * 100 : 0;
    // Green only for a perfect score, yellow from 50% upwards, red below.
    let paintOverall;
    if (overallPassRate === 100) {
        paintOverall = chalk.green;
    }
    else if (overallPassRate >= 50) {
        paintOverall = chalk.yellow;
    }
    else {
        paintOverall = chalk.red;
    }
    const elapsedMs = new Date(results.completedAt).getTime() - new Date(results.startedAt).getTime();
    const duration = elapsedMs / 1000;
    const report = [
        '',
        chalk.bold('Experiment Results'),
        rule,
        '',
        ...evalLines,
        rule,
        '',
        paintOverall(`Overall: ${totalPassed}/${totalRuns} passed (${overallPassRate.toFixed(0)}%)`),
        chalk.gray(`Total time: ${duration.toFixed(1)}s`),
        '',
    ];
    return report.join('\n');
}
|
|
167
|
+
/**
 * Render the outcome of a single run as one terminal line (used during progress).
 *
 * @param evalName - Name of the eval the run belongs to.
 * @param runNumber - Number of this run, shown as `[runNumber/totalRuns]`.
 * @param totalRuns - Total number of runs for the eval.
 * @param result - Run outcome; reads `status`, `duration` and `error`.
 * @returns Status icon, eval name and run counter, followed by the duration
 *   and, when `result.error` is set, its first 50 characters (with '...'
 *   appended if it was longer).
 */
export function formatRunResult(evalName, runNumber, totalRuns, result) {
    const passed = result.status === 'passed';
    const paint = passed ? chalk.green : chalk.red;
    const icon = passed ? '✓' : '✗';
    const segments = [
        paint(`${icon} ${evalName} [${runNumber}/${totalRuns}]`),
        chalk.gray(` (${result.duration.toFixed(1)}s)`),
    ];
    if (result.error) {
        const message = result.error;
        // Keep the progress line short: show at most 50 characters of the error.
        const ellipsis = message.length > 50 ? '...' : '';
        segments.push(chalk.red(` - ${message.slice(0, 50)}${ellipsis}`));
    }
    return segments.join('');
}
|
|
180
|
+
/**
 * Render the "currently running" indicator for one run of an eval.
 *
 * @param evalName - Name of the eval being run.
 * @param runNumber - Number of the run that is starting.
 * @param totalRuns - Total number of runs for the eval.
 * @returns The text `Running <evalName> [<runNumber>/<totalRuns>]...`, colored blue.
 */
export function createProgressDisplay(evalName, runNumber, totalRuns) {
    const label = `Running ${evalName} [${runNumber}/${totalRuns}]...`;
    return chalk.blue(label);
}
|
|
186
|
+
//# sourceMappingURL=results.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"results.js","sourceRoot":"","sources":["../../src/lib/results.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AAC9C,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAC5B,OAAO,KAAK,MAAM,OAAO,CAAC;AAU1B;;GAEG;AACH,MAAM,UAAU,wBAAwB,CAAC,WAA2B;IAClE,gDAAgD;IAChD,MAAM,aAAa,GAAiC,EAAE,CAAC;IAEvD,0BAA0B;IAC1B,IAAI,WAAW,CAAC,UAAU,EAAE,MAAM,EAAE,CAAC;QACnC,aAAa,CAAC,IAAI,GAAG,WAAW,CAAC,UAAU,CAAC,MAAM,CAAC;IACrD,CAAC;IAED,qEAAqE;IACrE,IAAI,WAAW,CAAC,cAAc,IAAI,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,cAAc,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrF,aAAa,CAAC,OAAO,GAAG,EAAE,CAAC;QAC3B,KAAK,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,cAAc,CAAC,EAAE,CAAC;YACxE,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;gBAClB,aAAa,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;YAC9C,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO;QACL,MAAM,EAAE;YACN,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ;YACjD,KAAK,EAAE,WAAW,CAAC,KAAK;YACxB,QAAQ,EAAE,WAAW,CAAC,QAAQ,GAAG,IAAI,EAAE,qBAAqB;SAC7D;QACD,UAAU,EAAE,WAAW,CAAC,UAAU;QAClC,aAAa,EAAE,MAAM,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,SAAS;KACjF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAAY,EAAE,OAAsB;IACpE,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAC1C,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,MAAM,CAAC;IACpE,MAAM,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;IAEnE,OAAO;QACL,IAAI;QACJ,SAAS,EAAE,IAAI,CAAC,MAAM;QACtB,UAAU;QACV,QAAQ,EAAE,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;QAChE,YAAY,EAAE,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;QAC/D,IAAI,EAAE,OAAO;KACd,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACrC,MAAgC,EAChC,KAAoB,EACpB,SAAe,EACf,WAAiB;IAEjB,OAAO;QACL,SAAS,EAAE,SAAS,CAAC,WAAW,EAA
E;QAClC,WAAW,EAAE,WAAW,CAAC,WAAW,EAAE;QACtC,MAAM;QACN,KAAK;KACN,CAAC;AACJ,CAAC;AAYD;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,WAAW,CACzB,OAA0B,EAC1B,OAA2B;IAE3B,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IACvD,MAAM,aAAa,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,cAAc,EAAE,SAAS,CAAC,CAAC;IAElF,8BAA8B;IAC9B,SAAS,CAAC,aAAa,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE9C,wBAAwB;IACxB,KAAK,MAAM,WAAW,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QACxC,MAAM,OAAO,GAAG,IAAI,CAAC,aAAa,EAAE,WAAW,CAAC,IAAI,CAAC,CAAC;QACtD,SAAS,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAExC,8CAA8C;QAC9C,MAAM,cAAc,GAAG;YACrB,SAAS,EAAE,WAAW,CAAC,SAAS;YAChC,UAAU,EAAE,WAAW,CAAC,UAAU;YAClC,QAAQ,EAAE,GAAG,WAAW,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;YAC/C,YAAY,EAAE,WAAW,CAAC,YAAY;SACvC,CAAC;QACF,aAAa,CACX,IAAI,CAAC,OAAO,EAAE,cAAc,CAAC,EAC7B,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC,CACxC,CAAC;QAEF,8BAA8B;QAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjD,MAAM,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACpC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC7C,SAAS,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAEvC,8BAA8B;YAC9B,MAAM,eAAe,GAAG,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;YAE9C,qCAAqC;YACrC,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;gBACvB,aAAa,CAAC,IAAI,CAAC,MAAM,EAAE,kBAAkB,CAAC,EAAE,OAAO,CAAC,UAAU,CAAC,CAAC;gBACpE,eAAe,CAAC,cAAc,GAAG,oBAAoB,CAAC;YACxD,CAAC;YAED,uCAAuC;YACvC,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;YAC3C,SAAS,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAE3C,IAAI,OAAO,CAAC,aAAa,EAAE,CAAC;gBAC1B,MAAM,WAAW,GAAiC,EAAE,CAAC;gBAErD,2BAA2B;gBAC3B,IAAI,OAAO,CAAC,aAAa,CAAC,IAAI,EAAE,CAAC;oBAC/B,aAAa,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;oBACxE,WAAW,CAAC,IAAI,GAAG,oBAAoB,CAAC;gBAC1C,CAAC;gBAED,sDAAsD;gBACtD,IAAI,OAAO,CAAC,aAAa,CAAC,OAAO,EAAE,CAAC;oBAClC,WAAW,CAAC,OAAO,GAAG,EAAE,CAAC;oBACzB,KAAK,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAA
O,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,CAAC;wBAC5E,IAAI,OAAO,EAAE,CAAC;4BACZ,MAAM,QAAQ,GAAG,GAAG,IAAI,MAAM,CAAC;4BAC/B,aAAa,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAC;4BACnD,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,aAAa,QAAQ,EAAE,CAAC;wBACtD,CAAC;oBACH,CAAC;gBACH,CAAC;gBAED,IAAI,WAAW,CAAC,IAAI,IAAI,CAAC,WAAW,CAAC,OAAO,IAAI,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,CAAC;oBAC7F,eAAe,CAAC,WAAW,GAAG,WAAW,CAAC;gBAC5C,CAAC;YACH,CAAC;YAED,8BAA8B;YAC9B,aAAa,CACX,IAAI,CAAC,MAAM,EAAE,aAAa,CAAC,EAC3B,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,IAAI,EAAE,CAAC,CAAC,CACzC,CAAC;QACJ,CAAC;IACH,CAAC;IAED,OAAO,aAAa,CAAC;AACvB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,OAA0B;IAC3D,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,MAAM,SAAS,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;IAEjC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,CAAC;IAC7C,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;IAClC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,0BAA0B;IAC1B,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;IACzE,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;IAC5E,MAAM,eAAe,GAAG,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,GAAG,SAAS,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAE5E,KAAK,MAAM,WAAW,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QACxC,MAAM,QAAQ,GAAG,WAAW,CAAC,UAAU,KAAK,WAAW,CAAC,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;QAC9E,MAAM,SAAS,GAAG,WAAW,CAAC,UAAU,KAAK,WAAW,CAAC,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;QAE7F,KAAK,CAAC,IAAI,CACR,SAAS,CACP,GAAG,QAAQ,IAAI,WAAW,CAAC,IAAI,KAAK,WAAW,CAAC,UAAU,IAAI,WAAW,CAAC,SAAS,YAAY,WAAW,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CACnI,CACF,CAAC;QACF,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,oBAAoB,WAAW,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QACnF,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,K
AAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;IAClC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,MAAM,YAAY,GAAG,eAAe,KAAK,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;IAC9G,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,YAAY,WAAW,IAAI,SAAS,YAAY,eAAe,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;IAEzG,MAAM,QAAQ,GAAG,CAAC,IAAI,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,OAAO,EAAE,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,OAAO,EAAE,CAAC,GAAG,IAAI,CAAC;IAC1G,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,eAAe,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC9D,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe,CAC7B,QAAgB,EAChB,SAAiB,EACjB,SAAiB,EACjB,MAAqB;IAErB,MAAM,IAAI,GAAG,MAAM,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IACpD,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;IAEnE,IAAI,IAAI,GAAG,KAAK,CAAC,GAAG,IAAI,IAAI,QAAQ,KAAK,SAAS,IAAI,SAAS,GAAG,CAAC,CAAC;IACpE,IAAI,IAAI,KAAK,CAAC,IAAI,CAAC,KAAK,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IAExD,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;QACjB,IAAI,IAAI,KAAK,CAAC,GAAG,CAAC,MAAM,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC/F,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACnC,QAAgB,EAChB,SAAiB,EACjB,SAAiB;IAEjB,OAAO,KAAK,CAAC,IAAI,CAAC,WAAW,QAAQ,KAAK,SAAS,IAAI,SAAS,MAAM,CAAC,CAAC;AAC1E,CAAC"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Experiment runner - orchestrates running evals against agent.
|
|
3
|
+
* All evals and attempts run concurrently for maximum throughput.
|
|
4
|
+
* With earlyExit, in-flight attempts are aborted when one passes.
|
|
5
|
+
*/
|
|
6
|
+
import type { ResolvedExperimentConfig, EvalFixture, EvalRunData, ExperimentResults } from './types.js';
|
|
7
|
+
/**
|
|
8
|
+
* Options for running an experiment.
|
|
9
|
+
*/
|
|
10
|
+
export interface RunExperimentOptions {
|
|
11
|
+
/** Resolved experiment configuration */
|
|
12
|
+
config: ResolvedExperimentConfig;
|
|
13
|
+
/** Fixtures to run */
|
|
14
|
+
fixtures: EvalFixture[];
|
|
15
|
+
/** API key for the agent */
|
|
16
|
+
apiKey: string;
|
|
17
|
+
/** Directory to save results */
|
|
18
|
+
resultsDir: string;
|
|
19
|
+
/** Experiment name */
|
|
20
|
+
experimentName: string;
|
|
21
|
+
/** Callback for progress updates */
|
|
22
|
+
onProgress?: (message: string) => void;
|
|
23
|
+
/** Whether to run in verbose mode */
|
|
24
|
+
verbose?: boolean;
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
27
|
+
* Run an experiment - execute all evals with configured runs concurrently.
|
|
28
|
+
* With earlyExit enabled, remaining attempts for a fixture are aborted once one passes.
|
|
29
|
+
*/
|
|
30
|
+
export declare function runExperiment(options: RunExperimentOptions): Promise<ExperimentResults>;
|
|
31
|
+
/**
|
|
32
|
+
* Run a single eval (for testing/debugging).
|
|
33
|
+
*/
|
|
34
|
+
export declare function runSingleEval(fixture: EvalFixture, options: {
|
|
35
|
+
agent?: ResolvedExperimentConfig['agent'];
|
|
36
|
+
model: ResolvedExperimentConfig['model'];
|
|
37
|
+
timeout: number;
|
|
38
|
+
apiKey: string;
|
|
39
|
+
setup?: ResolvedExperimentConfig['setup'];
|
|
40
|
+
scripts?: string[];
|
|
41
|
+
verbose?: boolean;
|
|
42
|
+
}): Promise<EvalRunData>;
|
|
43
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/lib/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EACV,wBAAwB,EACxB,WAAW,EACX,WAAW,EAEX,iBAAiB,EAClB,MAAM,YAAY,CAAC;AAYpB;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,wCAAwC;IACxC,MAAM,EAAE,wBAAwB,CAAC;IACjC,sBAAsB;IACtB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,gCAAgC;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,sBAAsB;IACtB,cAAc,EAAE,MAAM,CAAC;IACvB,oCAAoC;IACpC,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;IACvC,qCAAqC;IACrC,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAoBD;;;GAGG;AACH,wBAAsB,aAAa,CACjC,OAAO,EAAE,oBAAoB,GAC5B,OAAO,CAAC,iBAAiB,CAAC,CA2I5B;AAED;;GAEG;AACH,wBAAsB,aAAa,CACjC,OAAO,EAAE,WAAW,EACpB,OAAO,EAAE;IACP,KAAK,CAAC,EAAE,wBAAwB,CAAC,OAAO,CAAC,CAAC;IAC1C,KAAK,EAAE,wBAAwB,CAAC,OAAO,CAAC,CAAC;IACzC,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,wBAAwB,CAAC,OAAO,CAAC,CAAC;IAC1C,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB,GACA,OAAO,CAAC,WAAW,CAAC,CAatB"}
|