agent-eval-opencode 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +590 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +29 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/agents/claude-code.d.ts +12 -0
- package/dist/lib/agents/claude-code.d.ts.map +1 -0
- package/dist/lib/agents/claude-code.js +231 -0
- package/dist/lib/agents/claude-code.js.map +1 -0
- package/dist/lib/agents/codex.d.ts +12 -0
- package/dist/lib/agents/codex.d.ts.map +1 -0
- package/dist/lib/agents/codex.js +267 -0
- package/dist/lib/agents/codex.js.map +1 -0
- package/dist/lib/agents/cursor.d.ts +10 -0
- package/dist/lib/agents/cursor.d.ts.map +1 -0
- package/dist/lib/agents/cursor.js +204 -0
- package/dist/lib/agents/cursor.js.map +1 -0
- package/dist/lib/agents/gemini.d.ts +10 -0
- package/dist/lib/agents/gemini.d.ts.map +1 -0
- package/dist/lib/agents/gemini.js +207 -0
- package/dist/lib/agents/gemini.js.map +1 -0
- package/dist/lib/agents/index.d.ts +7 -0
- package/dist/lib/agents/index.d.ts.map +1 -0
- package/dist/lib/agents/index.js +20 -0
- package/dist/lib/agents/index.js.map +1 -0
- package/dist/lib/agents/opencode.d.ts +11 -0
- package/dist/lib/agents/opencode.d.ts.map +1 -0
- package/dist/lib/agents/opencode.js +245 -0
- package/dist/lib/agents/opencode.js.map +1 -0
- package/dist/lib/agents/registry.d.ts +23 -0
- package/dist/lib/agents/registry.d.ts.map +1 -0
- package/dist/lib/agents/registry.js +35 -0
- package/dist/lib/agents/registry.js.map +1 -0
- package/dist/lib/agents/shared.d.ts +83 -0
- package/dist/lib/agents/shared.d.ts.map +1 -0
- package/dist/lib/agents/shared.js +192 -0
- package/dist/lib/agents/shared.js.map +1 -0
- package/dist/lib/agents/types.d.ts +73 -0
- package/dist/lib/agents/types.d.ts.map +1 -0
- package/dist/lib/agents/types.js +5 -0
- package/dist/lib/agents/types.js.map +1 -0
- package/dist/lib/classifier.d.ts +89 -0
- package/dist/lib/classifier.d.ts.map +1 -0
- package/dist/lib/classifier.js +285 -0
- package/dist/lib/classifier.js.map +1 -0
- package/dist/lib/config.d.ts +37 -0
- package/dist/lib/config.d.ts.map +1 -0
- package/dist/lib/config.js +187 -0
- package/dist/lib/config.js.map +1 -0
- package/dist/lib/dashboard.d.ts +65 -0
- package/dist/lib/dashboard.d.ts.map +1 -0
- package/dist/lib/dashboard.js +237 -0
- package/dist/lib/dashboard.js.map +1 -0
- package/dist/lib/docker-sandbox.d.ts +92 -0
- package/dist/lib/docker-sandbox.d.ts.map +1 -0
- package/dist/lib/docker-sandbox.js +375 -0
- package/dist/lib/docker-sandbox.js.map +1 -0
- package/dist/lib/fingerprint.d.ts +15 -0
- package/dist/lib/fingerprint.d.ts.map +1 -0
- package/dist/lib/fingerprint.js +59 -0
- package/dist/lib/fingerprint.js.map +1 -0
- package/dist/lib/fixture.d.ts +55 -0
- package/dist/lib/fixture.d.ts.map +1 -0
- package/dist/lib/fixture.js +215 -0
- package/dist/lib/fixture.js.map +1 -0
- package/dist/lib/housekeeping.d.ts +26 -0
- package/dist/lib/housekeeping.d.ts.map +1 -0
- package/dist/lib/housekeeping.js +170 -0
- package/dist/lib/housekeeping.js.map +1 -0
- package/dist/lib/init.d.ts +21 -0
- package/dist/lib/init.d.ts.map +1 -0
- package/dist/lib/init.js +275 -0
- package/dist/lib/init.js.map +1 -0
- package/dist/lib/o11y/index.d.ts +13 -0
- package/dist/lib/o11y/index.d.ts.map +1 -0
- package/dist/lib/o11y/index.js +13 -0
- package/dist/lib/o11y/index.js.map +1 -0
- package/dist/lib/o11y/parsers/claude-code.d.ts +18 -0
- package/dist/lib/o11y/parsers/claude-code.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/claude-code.js +343 -0
- package/dist/lib/o11y/parsers/claude-code.js.map +1 -0
- package/dist/lib/o11y/parsers/codex.d.ts +17 -0
- package/dist/lib/o11y/parsers/codex.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/codex.js +364 -0
- package/dist/lib/o11y/parsers/codex.js.map +1 -0
- package/dist/lib/o11y/parsers/cursor.d.ts +21 -0
- package/dist/lib/o11y/parsers/cursor.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/cursor.js +226 -0
- package/dist/lib/o11y/parsers/cursor.js.map +1 -0
- package/dist/lib/o11y/parsers/gemini.d.ts +21 -0
- package/dist/lib/o11y/parsers/gemini.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/gemini.js +241 -0
- package/dist/lib/o11y/parsers/gemini.js.map +1 -0
- package/dist/lib/o11y/parsers/index.d.ts +55 -0
- package/dist/lib/o11y/parsers/index.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/index.js +284 -0
- package/dist/lib/o11y/parsers/index.js.map +1 -0
- package/dist/lib/o11y/parsers/opencode.d.ts +17 -0
- package/dist/lib/o11y/parsers/opencode.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/opencode.js +320 -0
- package/dist/lib/o11y/parsers/opencode.js.map +1 -0
- package/dist/lib/o11y/types.d.ts +113 -0
- package/dist/lib/o11y/types.d.ts.map +1 -0
- package/dist/lib/o11y/types.js +6 -0
- package/dist/lib/o11y/types.js.map +1 -0
- package/dist/lib/results.d.ts +91 -0
- package/dist/lib/results.d.ts.map +1 -0
- package/dist/lib/results.js +361 -0
- package/dist/lib/results.js.map +1 -0
- package/dist/lib/runner.d.ts +71 -0
- package/dist/lib/runner.d.ts.map +1 -0
- package/dist/lib/runner.js +267 -0
- package/dist/lib/runner.js.map +1 -0
- package/dist/lib/sandbox.d.ts +173 -0
- package/dist/lib/sandbox.d.ts.map +1 -0
- package/dist/lib/sandbox.js +337 -0
- package/dist/lib/sandbox.js.map +1 -0
- package/dist/lib/types.d.ts +258 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +15 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/test-setup.d.ts +2 -0
- package/dist/test-setup.d.ts.map +1 -0
- package/dist/test-setup.js +6 -0
- package/dist/test-setup.js.map +1 -0
- package/package.json +72 -0
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sandbox integration for isolated eval execution.
|
|
3
|
+
* Supports both Vercel Sandbox and Docker backends.
|
|
4
|
+
*/
|
|
5
|
+
import { Sandbox as VercelSandbox } from '@vercel/sandbox';
|
|
6
|
+
import { readFileSync } from 'fs';
|
|
7
|
+
import { join } from 'path';
|
|
8
|
+
import { DockerSandboxManager } from './docker-sandbox.js';
|
|
9
|
+
/**
 * Timeout used for sandbox operations when the caller does not provide one.
 * The value is 10 minutes, expressed in milliseconds.
 */
export const DEFAULT_SANDBOX_TIMEOUT = 10 * 60 * 1000;
|
|
13
|
+
/**
 * Names and patterns that are skipped when gathering files for a sandbox.
 *
 * These cover build output, installed dependencies and similar artifacts that
 * should never be uploaded. It is the general-purpose default consumed by
 * collectLocalFiles(); the eval-specific exclusions (PROMPT.md, EVAL.ts) live
 * in TEST_FILE_PATTERNS instead.
 */
export const IGNORED_PATTERNS = [
    // Version control, framework output and installed dependencies
    '.git',
    '.next',
    'node_modules',
    // OS and logging noise
    '.DS_Store',
    '*.log',
    // Build output directories
    'build',
    'dist',
    // Lockfiles
    'pnpm-lock.yaml',
    'package-lock.json',
];
|
|
30
|
+
/**
 * Files that are held back from the agent while it works on the task and are
 * only uploaded once the agent has finished, for validation.
 *
 * - EVAL.ts / EVAL.tsx: the validation tests; hidden so the agent can't "cheat".
 * - PROMPT.md: the task description; the agent gets it via CLI argument, not as a file.
 */
export const TEST_FILE_PATTERNS = [
    'EVAL.ts',
    'EVAL.tsx',
    'PROMPT.md',
];
|
|
37
|
+
/**
 * Convenience wrapper around a Vercel Sandbox instance.
 *
 * Exposes command execution, file access and lifecycle helpers with plain
 * `{ stdout, stderr, exitCode }` results.
 */
export class SandboxManager {
    sandbox;
    _workingDirectory = '/vercel/sandbox';

    /**
     * @param sandbox - The underlying sandbox instance to wrap.
     */
    constructor(sandbox) {
        this.sandbox = sandbox;
    }

    /**
     * Create a new Vercel sandbox and wrap it.
     *
     * `timeout` falls back to DEFAULT_SANDBOX_TIMEOUT and `runtime` to 'node24'.
     * Credentials are only forwarded when resolveVercelSandboxCredentials()
     * returns a complete set.
     */
    static async create(options = {}) {
        const credentials = resolveVercelSandboxCredentials(options) ?? {};
        const vercelSandbox = await VercelSandbox.create({
            runtime: options.runtime ?? 'node24',
            timeout: options.timeout ?? DEFAULT_SANDBOX_TIMEOUT,
            ...credentials,
        });
        return new SandboxManager(vercelSandbox);
    }

    /**
     * Identifier of the wrapped sandbox.
     */
    get sandboxId() {
        return this.sandbox.sandboxId;
    }

    /**
     * Execute a command inside the sandbox.
     *
     * @param command - Executable to run.
     * @param args - Arguments passed to the executable.
     * @param options - Optional settings; `options.env` is forwarded as the environment.
     * @returns The collected stdout, stderr and exit code.
     */
    async runCommand(command, args = [], options = {}) {
        const { env } = options;
        const finished = await this.sandbox.runCommand({ cmd: command, args, env });
        const stdout = await finished.stdout();
        const stderr = await finished.stderr();
        return { stdout, stderr, exitCode: finished.exitCode };
    }

    /**
     * Execute a command line through `bash -c`.
     *
     * @param command - Shell command line.
     * @param env - Optional environment forwarded to the command.
     * @returns The collected stdout, stderr and exit code.
     */
    async runShell(command, env) {
        return this.runCommand('bash', ['-c', command], { env });
    }

    /**
     * Return the content of a sandbox file, read with `cat`.
     *
     * @throws {Error} When `cat` exits with a non-zero code.
     */
    async readFile(path) {
        const { exitCode, stdout, stderr } = await this.runCommand('cat', [path]);
        if (exitCode === 0) {
            return stdout;
        }
        throw new Error(`Failed to read file ${path}: ${stderr}`);
    }

    /**
     * Report whether `test -f` succeeds for the given sandbox path.
     */
    async fileExists(path) {
        const { exitCode } = await this.runCommand('test', ['-f', path]);
        return exitCode === 0;
    }

    /**
     * Write a `{ path: content }` map of UTF-8 text files into the sandbox.
     */
    async writeFiles(files) {
        const payload = Object.entries(files).map(([path, content]) => ({
            path,
            content: Buffer.from(content, 'utf-8'),
        }));
        await this.sandbox.writeFiles(payload);
    }

    /**
     * Upload a list of `{ path, content }` entries into the sandbox.
     * String content is encoded as UTF-8; any other content is passed through untouched.
     */
    async uploadFiles(files) {
        const payload = [];
        for (const file of files) {
            const isText = typeof file.content === 'string';
            payload.push({
                path: file.path,
                content: isText ? Buffer.from(file.content, 'utf-8') : file.content,
            });
        }
        await this.sandbox.writeFiles(payload);
    }

    /**
     * Working directory inside the sandbox.
     */
    getWorkingDirectory() {
        return this._workingDirectory;
    }

    /**
     * Stop the wrapped sandbox.
     */
    async stop() {
        await this.sandbox.stop();
    }
}
|
|
149
|
+
/**
 * Resolve explicit Vercel credentials for sandbox creation.
 *
 * Each value is taken from `options` first and falls back to the matching
 * environment variable (VERCEL_TOKEN, VERCEL_TEAM_ID, VERCEL_PROJECT_ID).
 *
 * @param options - Options that may carry `token`, `teamId` and `projectId`.
 * @returns `{ token, teamId, projectId }` when all three are truthy, otherwise `null`.
 */
function resolveVercelSandboxCredentials(options) {
    const { VERCEL_TOKEN, VERCEL_TEAM_ID, VERCEL_PROJECT_ID } = process.env;
    const credentials = {
        token: options.token ?? VERCEL_TOKEN,
        teamId: options.teamId ?? VERCEL_TEAM_ID,
        projectId: options.projectId ?? VERCEL_PROJECT_ID,
    };
    // A partial set is treated the same as no credentials at all.
    const isComplete = Boolean(credentials.token && credentials.teamId && credentials.projectId);
    return isComplete ? credentials : null;
}
|
|
158
|
+
/**
 * Decide which sandbox backend should be used.
 *
 * Resolution order:
 * 1. `options.backend`, when it is set to anything other than 'auto'.
 * 2. Auto-detection: 'vercel' when VERCEL_TOKEN or VERCEL_OIDC_TOKEN is set,
 *    'docker' otherwise.
 *
 * @param options - Optional settings; only `backend` is read.
 * @returns The backend name.
 */
export function resolveBackend(options) {
    const requested = options?.backend;
    if (requested && requested !== 'auto') {
        return requested;
    }
    const { VERCEL_TOKEN, VERCEL_OIDC_TOKEN } = process.env;
    return VERCEL_TOKEN || VERCEL_OIDC_TOKEN ? 'vercel' : 'docker';
}
|
|
176
|
+
/**
 * Describe the sandbox backend that will be used, for display to users.
 *
 * @param options - Optional settings; only `backend` is read.
 * @returns An object with:
 *   - `backend`: the value returned by resolveBackend(options)
 *   - `reason`: 'explicit' when `options.backend` is set to something other
 *     than 'auto', otherwise 'auto-detected'
 *   - `description`: human-readable text combining the two
 */
export function getSandboxBackendInfo(options) {
    const backend = resolveBackend(options);
    const isExplicit = Boolean(options?.backend) && options.backend !== 'auto';
    if (isExplicit) {
        return { backend, reason: 'explicit', description: `${backend} (explicit)` };
    }
    let detail;
    if (backend === 'vercel') {
        // Auto-detection accepts either variable, so name the one that is
        // actually present instead of always claiming VERCEL_TOKEN.
        const tokenVariable = process.env.VERCEL_TOKEN ? 'VERCEL_TOKEN' : 'VERCEL_OIDC_TOKEN';
        detail = `${tokenVariable} found`;
    }
    else {
        detail = 'no VERCEL_TOKEN, using Docker';
    }
    return {
        backend,
        reason: 'auto-detected',
        description: `${backend} (auto-detected: ${detail})`,
    };
}
|
|
201
|
+
/**
 * Create a sandbox using the appropriate backend.
 *
 * The backend is chosen by resolveBackend(): an explicit `options.backend`
 * wins, otherwise Vercel Sandbox is used when a Vercel token is present in the
 * environment and Docker is the fallback.
 *
 * `timeout` and `runtime` are forwarded to both backends. For the Vercel
 * backend, `token`, `teamId` and `projectId` are forwarded as well, so that
 * credentials passed explicitly are honoured rather than silently replaced by
 * whatever the environment provides.
 *
 * @param options - Sandbox options (`backend`, `timeout`, `runtime`, and for
 *   Vercel optionally `token`, `teamId`, `projectId`).
 * @returns The created sandbox manager for the selected backend.
 *
 * @example
 * ```typescript
 * // Auto-detect backend
 * const sandbox = await createSandbox();
 *
 * // Explicit Docker
 * const sandbox = await createSandbox({ backend: 'docker' });
 *
 * // Explicit Vercel
 * const sandbox = await createSandbox({ backend: 'vercel' });
 * ```
 */
export async function createSandbox(options = {}) {
    const { timeout, runtime } = options;
    if (resolveBackend(options) === 'docker') {
        return DockerSandboxManager.create({ timeout, runtime });
    }
    return SandboxManager.create({
        timeout,
        runtime,
        // Undefined values fall back to the environment inside SandboxManager.create().
        token: options.token,
        teamId: options.teamId,
        projectId: options.projectId,
    });
}
|
|
232
|
+
/**
 * Collect files from a local directory for uploading to a sandbox.
 *
 * The directory is walked recursively. Patterns are either an exact name or a
 * `*.ext` suffix wildcard:
 * - an entry (file or directory) is skipped when its name matches an exclude
 *   pattern, or when its relative path equals an exclude pattern exactly;
 * - when `includePatterns` is given, only files whose name matches one of
 *   them are collected (directories are still descended into).
 *
 * Only regular files are read. Entries that vanish or cannot be resolved
 * (for example dangling symlinks) and special files such as FIFOs or sockets
 * are skipped instead of aborting or blocking the whole collection.
 *
 * @param dir - Root directory to walk.
 * @param options - Optional `excludePatterns` (defaults to IGNORED_PATTERNS)
 *   and `includePatterns` (defaults to "include everything").
 * @returns An array of `{ path, content }`, where `path` is relative to `dir`
 *   using '/' separators and `content` is the file's Buffer.
 */
export async function collectLocalFiles(dir, options = {}) {
    const { readdirSync, statSync } = await import('fs');
    const excludePatterns = options.excludePatterns ?? IGNORED_PATTERNS;
    const includePatterns = options.includePatterns;
    const files = [];

    // '*.ext' matches by suffix; every other pattern must equal the name exactly.
    const matchesName = (pattern, name) => (pattern.startsWith('*.')
        ? name.endsWith(pattern.slice(1))
        : name === pattern);

    const shouldExclude = (name, relativePath) => excludePatterns.some(
        (pattern) => matchesName(pattern, name) || relativePath === pattern,
    );

    const shouldInclude = (name) => !includePatterns
        || includePatterns.some((pattern) => matchesName(pattern, name));

    function walk(currentDir, relativePath = '') {
        for (const entry of readdirSync(currentDir)) {
            const entryRelativePath = relativePath ? `${relativePath}/${entry}` : entry;
            if (shouldExclude(entry, entryRelativePath)) {
                continue;
            }
            const fullPath = join(currentDir, entry);
            // statSync follows symlinks; a dangling link yields undefined here
            // instead of throwing ENOENT and aborting the walk.
            const stat = statSync(fullPath, { throwIfNoEntry: false });
            if (!stat) {
                continue;
            }
            if (stat.isDirectory()) {
                walk(fullPath, entryRelativePath);
            }
            else if (stat.isFile() && shouldInclude(entry)) {
                files.push({ path: entryRelativePath, content: readFileSync(fullPath) });
            }
        }
    }

    walk(dir);
    return files;
}
|
|
293
|
+
/**
 * Tell whether a file name matches one of TEST_FILE_PATTERNS.
 *
 * A pattern of the form `*.ext` matches any name ending in `.ext`; any other
 * pattern must equal the name exactly.
 *
 * @param filename - File name (without directory) to check.
 * @returns `true` when at least one pattern matches.
 */
function isTestFilePattern(filename) {
    return TEST_FILE_PATTERNS.some((pattern) => {
        if (pattern.startsWith('*.')) {
            return filename.endsWith(pattern.slice(1));
        }
        return filename === pattern;
    });
}
|
|
310
|
+
/**
 * Partition files into those the agent may see and those hidden until validation.
 *
 * The decision is made on the last '/'-separated segment of each file's path.
 *
 * @param files - Entries with a `path` property.
 * @returns `{ workspaceFiles, testFiles }`, each preserving the input order.
 */
export function splitTestFiles(files) {
    const buckets = { workspaceFiles: [], testFiles: [] };
    for (const file of files) {
        const segments = file.path.split('/');
        const basename = segments[segments.length - 1] ?? file.path;
        const target = isTestFilePattern(basename) ? buckets.testFiles : buckets.workspaceFiles;
        target.push(file);
    }
    return buckets;
}
|
|
327
|
+
/**
 * Verify that no validation test files exist in the sandbox.
 *
 * Runs `find` from the shell's current directory, pruning `./node_modules`,
 * and looks for both validation file names (EVAL.ts and EVAL.tsx) so that
 * neither variant can leak to the agent unnoticed.
 *
 * @param sandbox - Object exposing `runShell(command)` that resolves to `{ stdout }`.
 * @throws {Error} When `find` prints at least one matching path.
 */
export async function verifyNoTestFiles(sandbox) {
    // The escaped parentheses group the -name tests so that -print applies to both.
    const command = "find . -path './node_modules' -prune -o \\( -name 'EVAL.ts' -o -name 'EVAL.tsx' \\) -print";
    const { stdout } = await sandbox.runShell(command);
    const foundTests = stdout.trim();
    if (foundTests) {
        throw new Error(`Test files found in sandbox before agent run: ${foundTests}`);
    }
}
|
|
337
|
+
//# sourceMappingURL=sandbox.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sandbox.js","sourceRoot":"","sources":["../../src/lib/sandbox.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,OAAO,IAAI,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAE3D,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAC5B,OAAO,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AAE3D;;GAEG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAG,MAAM,CAAC;AAmB9C;;;;;GAKG;AACH,MAAM,CAAC,MAAM,gBAAgB,GAAG;IAC9B,MAAM;IACN,OAAO;IACP,cAAc;IACd,WAAW;IACX,OAAO;IACP,OAAO;IACP,MAAM;IACN,gBAAgB;IAChB,mBAAmB;CACpB,CAAC;AAEF;;;;;GAKG;AACH,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC,SAAS,EAAE,UAAU,EAAE,WAAW,CAAC,CAAC;AAqCvE;;GAEG;AACH,MAAM,OAAO,cAAc;IACjB,OAAO,CAAgB;IACvB,iBAAiB,GAAW,iBAAiB,CAAC;IAEtD,YAAY,OAAsB;QAChC,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,UAA0B,EAAE;QAC9C,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,uBAAuB,CAAC;QAC3D,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,QAAQ,CAAC;QAC5C,MAAM,WAAW,GAAG,+BAA+B,CAAC,OAAO,CAAC,CAAC;QAE7D,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC,MAAM,CAAC;YACzC,OAAO;YACP,OAAO;YACP,GAAG,CAAC,WAAW,IAAI,EAAE,CAAC;SACvB,CAAC,CAAC;QACH,OAAO,IAAI,cAAc,CAAC,OAAO,CAAC,CAAC;IACrC,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC;IAChC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,UAAU,CACd,OAAe,EACf,OAAiB,EAAE,EACnB,UAA4C,EAAE;QAE9C,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC;YAC3C,GAAG,EAAE,OAAO;YACZ,IAAI;YACJ,GAAG,EAAE,OAAO,CAAC,GAAG;SACjB,CAAC,CAAC;QAEH,OAAO;YACL,MAAM,EAAE,MAAM,MAAM,CAAC,MAAM,EAAE;YAC7B,MAAM,EAAE,MAAM,MAAM,CAAC,MAAM,EAAE;YAC7B,QAAQ,EAAE,MAAM,CAAC,QAAQ;SAC1B,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ,CAAC,OAAe,EAAE,GAA4B;QAC1D,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC;YAC3C,GAAG,EAAE,MAAM;YACX,IAAI,EAAE,CAAC,IAAI,EAAE,OAAO,CAAC;YACrB,GAAG;SACJ,CAAC,CAAC;QAEH,OAAO;YACL,MAAM,EAAE,MAAM,MAAM,CAAC,MAAM,EAAE;YAC7B,MAAM,EAAE,MAAM,MAAM,CAAC,MAAM,EAAE;YAC7B,QAAQ,EAAE,MAAM,CAAC,QAAQ;SAC1B,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ,CAAC,IAAY;QACzB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,CAAC
,CAAC;QACpD,IAAI,MAAM,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,KAAK,CAAC,uBAAuB,IAAI,KAAK,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;QACnE,CAAC;QACD,OAAO,MAAM,CAAC,MAAM,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,UAAU,CAAC,IAAY;QAC3B,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC;QAC3D,OAAO,MAAM,CAAC,QAAQ,KAAK,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,UAAU,CAAC,KAA6B;QAC5C,MAAM,YAAY,GAA6C,EAAE,CAAC;QAElE,KAAK,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;YACpD,YAAY,CAAC,IAAI,CAAC;gBAChB,IAAI;gBACJ,OAAO,EAAE,MAAM,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC;aACvC,CAAC,CAAC;QACL,CAAC;QAED,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,YAAY,CAAC,CAAC;IAC9C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,WAAW,CAAC,KAAoB;QACpC,MAAM,YAAY,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACrC,IAAI,EAAE,CAAC,CAAC,IAAI;YACZ,OAAO,EAAE,OAAO,CAAC,CAAC,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO;SACrF,CAAC,CAAC,CAAC;QAEJ,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,YAAY,CAAC,CAAC;IAC9C,CAAC;IAED;;OAEG;IACH,mBAAmB;QACjB,OAAO,IAAI,CAAC,iBAAiB,CAAC;IAChC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI;QACR,MAAM,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;IAC5B,CAAC;CACF;AAED,SAAS,+BAA+B,CAAC,OAAuB;IAK9D,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC;IACxD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;IAC5D,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;IAErE,IAAI,KAAK,IAAI,MAAM,IAAI,SAAS,EAAE,CAAC;QACjC,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC;IACtC,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,cAAc,CAAC,OAAwB;IACrD,8BAA8B;IAC9B,IAAI,OAAO,EAAE,OAAO,IAAI,OAAO,CAAC,OAAO,KAAK,MAAM,EAAE,CAAC;QACnD,OAAO,OAAO,CAAC,OAAO,CAAC;IACzB,CAAC;IAED,oDAAoD;IACpD,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;QAC9D,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,qBAAqB,CAAC,OAAwB;IAC5D,MAAM,OAAO,G
AAG,cAAc,CAAC,OAAO,CAAC,CAAC;IAExC,uBAAuB;IACvB,IAAI,MAAoC,CAAC;IACzC,IAAI,WAAmB,CAAC;IAExB,MAAM,iBAAiB,GAAG,OAAO,EAAE,OAAO,IAAI,OAAO,CAAC,OAAO,KAAK,MAAM,CAAC;IAEzE,IAAI,iBAAiB,EAAE,CAAC;QACtB,MAAM,GAAG,UAAU,CAAC;QACpB,WAAW,GAAG,GAAG,OAAO,aAAa,CAAC;IACxC,CAAC;SAAM,CAAC;QACN,MAAM,GAAG,eAAe,CAAC;QACzB,IAAI,OAAO,KAAK,QAAQ,EAAE,CAAC;YACzB,WAAW,GAAG,GAAG,OAAO,sCAAsC,CAAC;QACjE,CAAC;aAAM,CAAC;YACN,WAAW,GAAG,GAAG,OAAO,iDAAiD,CAAC;QAC5E,CAAC;IACH,CAAC;IAED,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;AAC1C,CAAC;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,UAA0B,EAAE;IAE5B,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC;IAExC,IAAI,OAAO,KAAK,QAAQ,EAAE,CAAC;QACzB,OAAO,oBAAoB,CAAC,MAAM,CAAC;YACjC,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,OAAO,EAAE,OAAO,CAAC,OAAO;SACzB,CAAC,CAAC;IACL,CAAC;IAED,OAAO,cAAc,CAAC,MAAM,CAAC;QAC3B,OAAO,EAAE,OAAO,CAAC,OAAO;QACxB,OAAO,EAAE,OAAO,CAAC,OAAO;KACzB,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,GAAW,EACX,UAGI,EAAE;IAEN,MAAM,EAAE,WAAW,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;IAErD,MAAM,eAAe,GAAG,OAAO,CAAC,eAAe,IAAI,gBAAgB,CAAC;IACpE,MAAM,eAAe,GAAG,OAAO,CAAC,eAAe,CAAC;IAChD,MAAM,KAAK,GAAkB,EAAE,CAAC;IAEhC,SAAS,aAAa,CAAC,IAAY,EAAE,YAAoB;QACvD,KAAK,MAAM,OAAO,IAAI,eAAe,EAAE,CAAC;YACtC,IAAI,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC7B,mBAAmB;gBACnB,MAAM,GAAG,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBAC7B,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;oBACvB,OAAO,IAAI,CAAC;gBACd,CAAC;YACH,CAAC;iBAAM,IAAI,IAAI,KAAK,OAAO,IAAI,YAAY,KAAK,OAAO,EAAE,CAAC;gBACxD,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAED,SAAS,aAAa,CAAC,IAAY;QACjC,IAAI,CAAC,eAAe,EAAE,CAAC;YACrB,OAAO,IAAI,CAAC;QACd,CAAC;QACD,KAAK,MAAM,OAAO,IAAI,eAAe,EAAE,CAAC;YACtC,IAAI,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC7B,MAAM,GAAG,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBAC7B,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;oBACvB,OAAO,IAAI,CAAC;gBACd,CAAC;YACH,CAAC;iBAAM,IAAI,IAAI,KAAK,OAAO,EAAE,CAAC;gBAC5B,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;
QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAED,SAAS,IAAI,CAAC,UAAkB,EAAE,eAAuB,EAAE;QACzD,MAAM,OAAO,GAAG,WAAW,CAAC,UAAU,CAAC,CAAC;QAExC,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;YAC5B,MAAM,iBAAiB,GAAG,YAAY,CAAC,CAAC,CAAC,GAAG,YAAY,IAAI,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;YAC5E,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,EAAE,KAAK,CAAC,CAAC;YAEzC,IAAI,aAAa,CAAC,KAAK,EAAE,iBAAiB,CAAC,EAAE,CAAC;gBAC5C,SAAS;YACX,CAAC;YAED,MAAM,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC;YAEhC,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;gBACvB,IAAI,CAAC,QAAQ,EAAE,iBAAiB,CAAC,CAAC;YACpC,CAAC;iBAAM,IAAI,aAAa,CAAC,KAAK,CAAC,EAAE,CAAC;gBAChC,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;gBACvC,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,iBAAiB,EAAE,OAAO,EAAE,CAAC,CAAC;YACnD,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,CAAC,GAAG,CAAC,CAAC;IACV,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,QAAgB;IACzC,KAAK,MAAM,OAAO,IAAI,kBAAkB,EAAE,CAAC;QACzC,IAAI,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;YAC7B,MAAM,GAAG,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YAC7B,IAAI,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC3B,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;aAAM,IAAI,QAAQ,KAAK,OAAO,EAAE,CAAC;YAChC,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,KAAoB;IAIjD,MAAM,cAAc,GAAkB,EAAE,CAAC;IACzC,MAAM,SAAS,GAAkB,EAAE,CAAC;IAEpC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,IAAI,IAAI,CAAC,IAAI,CAAC;QAErD,IAAI,iBAAiB,CAAC,IAAI,CAAC,EAAE,CAAC;YAC5B,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvB,CAAC;aAAM,CAAC;YACN,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC5B,CAAC;IACH,CAAC;IAED,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,CAAC;AACvC,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,OAA8C;IAE9C,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,QAAQ,CACnC,iEAAiE,CAClE,CAAC;IAEF,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;IACxC,IAAI,UAAU,EAAE,CAAC;QACf,MAAM,IAAI,KAAK,CAAC,iDAAiD,UAAU,EAAE,CAAC,CAAC;IACjF,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core types for the eval framework.
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Identifiers of the supported AI agents.
 */
export type AgentType =
    | 'vercel-ai-gateway/claude-code'
    | 'claude-code'
    | 'vercel-ai-gateway/codex'
    | 'codex'
    | 'opencode'
    | 'gemini'
    | 'cursor';
/**
 * Model identifier. Any string is accepted at the type level;
 * each agent validates its own models at runtime.
 */
export type ModelTier = string;
/**
 * Function used to filter evals; it receives the eval name.
 */
export type EvalFilter = (name: string) => boolean;
|
|
17
|
+
/**
 * Sandbox handle passed to setup functions.
 * Its methods interact with the isolated environment the eval runs in.
 */
export interface Sandbox {
    /**
     * Run a command in the sandbox.
     * Resolves with the command's stdout, stderr and exit code.
     */
    runCommand(
        command: string,
        args?: string[],
        options?: {
            env?: Record<string, string>;
        },
    ): Promise<{
        stdout: string;
        stderr: string;
        exitCode: number;
    }>;
    /** Read a file from the sandbox and resolve with its content. */
    readFile(path: string): Promise<string>;
    /** Write files to the sandbox, given as a path-to-content map. */
    writeFiles(files: Record<string, string>): Promise<void>;
    /** Return the sandbox working directory. */
    getWorkingDirectory(): string;
}
|
|
37
|
+
/**
 * Hook that runs before the agent starts.
 * It receives the sandbox so the environment can be pre-configured.
 */
export type SetupFunction = (sandbox: Sandbox) => Promise<void>;
/**
 * Available sandbox backends.
 */
export type SandboxBackend =
    | 'vercel'
    | 'docker';
|
|
46
|
+
/**
 * User-facing experiment configuration: what to test and how.
 */
export interface ExperimentConfig {
    /** AI agent to use. */
    agent: AgentType;
    /**
     * AI model the agent should use: a single model or an array of models.
     * With an array, the experiment runs on each model.
     * The default is agent-specific: 'opus' for claude-code,
     * 'openai/gpt-5.2-codex' for codex.
     */
    model?: ModelTier | ModelTier[];
    /**
     * Evals to run, given as a string, an array, or a filter function.
     * @default '*' (all evals)
     */
    evals?: string | string[] | EvalFilter;
    /**
     * Number of times each eval is run.
     * @default 1
     */
    runs?: number;
    /**
     * Whether to stop after the first successful run.
     * @default true
     */
    earlyExit?: boolean;
    /**
     * npm scripts that must pass after the agent finishes.
     * @default []
     */
    scripts?: string[];
    /**
     * Maximum time, in seconds, for the agent to complete.
     * @default 300 (5 minutes)
     */
    timeout?: number;
    /**
     * Setup function that runs before the agent starts.
     * @default undefined
     */
    setup?: SetupFunction;
    /**
     * Sandbox backend to use.
     * @default 'auto' (Vercel if token present, else Docker)
     */
    sandbox?: SandboxBackend | 'auto';
    /**
     * Optional function that modifies the prompt before the experiment runs.
     * @default undefined
     */
    editPrompt?: (prompt: string) => string;
    /**
     * Whether project files are copied into the result directory.
     * - 'none': no files are copied (default)
     * - 'changed': only files the agent changed or created
     * - 'all': original project files with the agent's changes overlaid on top
     * @default 'none'
     */
    copyFiles?: 'none' | 'changed' | 'all';
}
|
|
80
|
+
/**
 * Experiment config after all defaults have been applied.
 * `setup` and `editPrompt` remain optional; `model` may still be a single
 * model or an array of models.
 */
export interface ResolvedExperimentConfig {
    /** AI agent to use. */
    agent: AgentType;
    /** Model or models to run the experiment on. */
    model: ModelTier | ModelTier[];
    /** Evals to run: a string, an array, or a filter function. */
    evals: string | string[] | EvalFilter;
    /** Number of times each eval is run. */
    runs: number;
    /** Whether to stop after the first successful run. */
    earlyExit: boolean;
    /** npm scripts that must pass after the agent finishes. */
    scripts: string[];
    /** Maximum time, in seconds, for the agent to complete. */
    timeout: number;
    /** Setup function that runs before the agent starts. */
    setup?: SetupFunction;
    /** Sandbox backend to use, or 'auto'. */
    sandbox: SandboxBackend | 'auto';
    /** Optional function that modifies the prompt before the experiment runs. */
    editPrompt?: (prompt: string) => string;
    /** Which project files are copied into the result directory. */
    copyFiles: 'none' | 'changed' | 'all';
}
|
|
96
|
+
/**
 * Experiment config that is ready to run against one specific model.
 *
 * It has the same fields as ResolvedExperimentConfig, except that `model` is
 * a single ModelTier rather than a single model or an array of models.
 */
export interface RunnableExperimentConfig {
    /** AI agent to use. */
    agent: AgentType;
    /** The one model this run uses. */
    model: ModelTier;
    /** Evals to run: a string, an array, or a filter function. */
    evals: string | string[] | EvalFilter;
    /** Number of times each eval is run. */
    runs: number;
    /** Whether to stop after the first successful run. */
    earlyExit: boolean;
    /** npm scripts that must pass after the agent finishes. */
    scripts: string[];
    /** Maximum time, in seconds, for the agent to complete. */
    timeout: number;
    /** Setup function that runs before the agent starts. */
    setup?: SetupFunction;
    /** Sandbox backend to use, or 'auto'. */
    sandbox: SandboxBackend | 'auto';
    /** Optional function that modifies the prompt before the experiment runs. */
    editPrompt?: (prompt: string) => string;
    /** Which project files are copied into the result directory. */
    copyFiles: 'none' | 'changed' | 'all';
}
|
|
112
|
+
/**
 * Files required for a valid eval fixture.
 * Note: either EVAL.ts or EVAL.tsx is required (not both).
 */
export declare const REQUIRED_EVAL_FILES: readonly [
    "PROMPT.md",
    "EVAL.ts",
    "package.json"
];
/**
 * Entries excluded when listing fixture files (used by getFixtureFiles in fixture.ts).
 * This list serves local fixture introspection only, NOT sandbox uploads;
 * sandbox file filtering is governed by TEST_FILE_PATTERNS in sandbox.ts.
 */
export declare const EXCLUDED_FILES: readonly [
    "PROMPT.md",
    "EVAL.ts",
    "EVAL.tsx",
    "node_modules",
    ".git"
];
|
|
123
|
+
/**
 * A discovered eval fixture.
 */
export interface EvalFixture {
    /** Eval name, taken from the folder name. */
    name: string;
    /** Absolute path to the eval folder. */
    path: string;
    /** Contents of PROMPT.md. */
    prompt: string;
    /** Whether package.json declares "type": "module". */
    isModule: boolean;
}
|
|
136
|
+
/**
|
|
137
|
+
* Result of a single eval run.
|
|
138
|
+
*/
|
|
139
|
+
export interface EvalRunResult {
|
|
140
|
+
/** Pass or fail status */
|
|
141
|
+
status: 'passed' | 'failed';
|
|
142
|
+
/** Error message if failed */
|
|
143
|
+
error?: string;
|
|
144
|
+
/** Duration in seconds */
|
|
145
|
+
duration: number;
|
|
146
|
+
/** Model used for this run */
|
|
147
|
+
model?: string;
|
|
148
|
+
/** Path to parsed transcript file (relative to run directory) */
|
|
149
|
+
transcriptPath?: string;
|
|
150
|
+
/** Path to raw transcript file (relative to run directory) */
|
|
151
|
+
transcriptRawPath?: string;
|
|
152
|
+
/** Paths to output files (relative to run directory) */
|
|
153
|
+
outputPaths?: {
|
|
154
|
+
/** Path to EVAL.ts test output */
|
|
155
|
+
eval?: string;
|
|
156
|
+
/** Paths to npm script outputs (nested to avoid collision) */
|
|
157
|
+
scripts?: Record<string, string>;
|
|
158
|
+
};
|
|
159
|
+
}
|
|
160
|
+
/**
|
|
161
|
+
* Internal run data including transcript and outputs (content, not paths).
|
|
162
|
+
*/
|
|
163
|
+
export interface EvalRunData {
|
|
164
|
+
/** The eval result (will have paths added when saving) */
|
|
165
|
+
result: EvalRunResult;
|
|
166
|
+
/** Structured transcript from the agent (saved to transcript.jsonl) */
|
|
167
|
+
transcript?: string;
|
|
168
|
+
/** Script/test output content (saved to outputs/) */
|
|
169
|
+
outputContent?: {
|
|
170
|
+
/** EVAL.ts test output */
|
|
171
|
+
eval?: string;
|
|
172
|
+
/** npm script outputs (nested to avoid collision) */
|
|
173
|
+
scripts?: Record<string, string>;
|
|
174
|
+
};
|
|
175
|
+
/** Files generated/modified by the agent (path -> content). Used for copyFiles option. */
|
|
176
|
+
generatedFiles?: Record<string, string>;
|
|
177
|
+
/** Files deleted by the agent. Used for copyFiles option. */
|
|
178
|
+
deletedFiles?: string[];
|
|
179
|
+
}
|
|
180
|
+
/**
|
|
181
|
+
* Summary of multiple runs for a single eval.
|
|
182
|
+
*/
|
|
183
|
+
export interface EvalSummary {
|
|
184
|
+
/** Name of the eval */
|
|
185
|
+
name: string;
|
|
186
|
+
/** Total number of runs */
|
|
187
|
+
totalRuns: number;
|
|
188
|
+
/** Number of passed runs */
|
|
189
|
+
passedRuns: number;
|
|
190
|
+
/** Pass rate as a percentage */
|
|
191
|
+
passRate: number;
|
|
192
|
+
/** Mean duration across all runs */
|
|
193
|
+
meanDuration: number;
|
|
194
|
+
/** Individual run data (internal, not all fields saved to summary.json) */
|
|
195
|
+
runs: EvalRunData[];
|
|
196
|
+
}
|
|
197
|
+
/**
|
|
198
|
+
* Failure classification for a failed eval run.
|
|
199
|
+
*/
|
|
200
|
+
export type FailureType = 'model' | 'infra' | 'timeout';
|
|
201
|
+
/**
|
|
202
|
+
* Classification result for a failed eval.
|
|
203
|
+
*/
|
|
204
|
+
export interface Classification {
|
|
205
|
+
failureType: FailureType;
|
|
206
|
+
failureReason: string;
|
|
207
|
+
/** When true, the user has acknowledged this non-model failure as a final result via --ack-failures. */
|
|
208
|
+
acknowledged?: boolean;
|
|
209
|
+
}
|
|
210
|
+
/**
|
|
211
|
+
* Structured progress events emitted by the runner.
|
|
212
|
+
* The CLI decides how to render these (dashboard, console.log, etc.).
|
|
213
|
+
*/
|
|
214
|
+
export type ProgressEvent = {
|
|
215
|
+
type: 'experiment:start';
|
|
216
|
+
totalAttempts: number;
|
|
217
|
+
totalEvals: number;
|
|
218
|
+
totalRuns: number;
|
|
219
|
+
} | {
|
|
220
|
+
type: 'eval:start';
|
|
221
|
+
evalName: string;
|
|
222
|
+
runNumber: number;
|
|
223
|
+
totalRuns: number;
|
|
224
|
+
} | {
|
|
225
|
+
type: 'eval:complete';
|
|
226
|
+
evalName: string;
|
|
227
|
+
runNumber: number;
|
|
228
|
+
totalRuns: number;
|
|
229
|
+
result: EvalRunResult;
|
|
230
|
+
} | {
|
|
231
|
+
type: 'eval:abort';
|
|
232
|
+
evalName: string;
|
|
233
|
+
runNumber: number;
|
|
234
|
+
} | {
|
|
235
|
+
type: 'experiment:earlyExit';
|
|
236
|
+
evalName: string;
|
|
237
|
+
runNumber: number;
|
|
238
|
+
} | {
|
|
239
|
+
type: 'experiment:saved';
|
|
240
|
+
outputDir: string;
|
|
241
|
+
} | {
|
|
242
|
+
type: 'experiment:summary';
|
|
243
|
+
results: ExperimentResults;
|
|
244
|
+
};
|
|
245
|
+
/**
|
|
246
|
+
* Complete experiment results.
|
|
247
|
+
*/
|
|
248
|
+
export interface ExperimentResults {
|
|
249
|
+
/** Timestamp when experiment started */
|
|
250
|
+
startedAt: string;
|
|
251
|
+
/** Timestamp when experiment completed */
|
|
252
|
+
completedAt: string;
|
|
253
|
+
/** Experiment configuration used */
|
|
254
|
+
config: RunnableExperimentConfig;
|
|
255
|
+
/** Results for each eval */
|
|
256
|
+
evals: EvalSummary[];
|
|
257
|
+
}
|
|
258
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,MAAM,SAAS,GACjB,+BAA+B,GAC/B,aAAa,GACb,yBAAyB,GACzB,OAAO,GACP,UAAU,GACV,QAAQ,GACR,QAAQ,CAAC;AAEb;;;GAGG;AACH,MAAM,MAAM,SAAS,GAAG,MAAM,CAAC;AAE/B;;GAEG;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC;AAEnD;;;GAGG;AACH,MAAM,WAAW,OAAO;IACtB,mCAAmC;IACnC,UAAU,CACR,OAAO,EAAE,MAAM,EACf,IAAI,CAAC,EAAE,MAAM,EAAE,EACf,OAAO,CAAC,EAAE;QAAE,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;KAAE,GACzC,OAAO,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACjE,mCAAmC;IACnC,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACxC,iCAAiC;IACjC,UAAU,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACzD,wCAAwC;IACxC,mBAAmB,IAAI,MAAM,CAAC;CAC/B;AAED;;;GAGG;AACH,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,EAAE,OAAO,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;AAEhE;;GAEG;AACH,MAAM,MAAM,cAAc,GAAG,QAAQ,GAAG,QAAQ,CAAC;AAEjD;;;GAGG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4BAA4B;IAC5B,KAAK,EAAE,SAAS,CAAC;IAEjB;;6FAEyF;IACzF,KAAK,CAAC,EAAE,SAAS,GAAG,SAAS,EAAE,CAAC;IAEhC,+FAA+F;IAC/F,KAAK,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IAEvC,kDAAkD;IAClD,IAAI,CAAC,EAAE,MAAM,CAAC;IAEd,qDAAqD;IACrD,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB,mEAAmE;IACnE,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IAEnB,8EAA8E;IAC9E,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,uEAAuE;IACvE,KAAK,CAAC,EAAE,aAAa,CAAC;IAEtB,qFAAqF;IACrF,OAAO,CAAC,EAAE,cAAc,GAAG,MAAM,CAAC;IAElC,+FAA+F;IAC/F,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,MAAM,CAAC;IAExC;;;;yBAIqB;IACrB,SAAS,CAAC,EAAE,MAAM,GAAG,SAAS,GAAG,KAAK,CAAC;CACxC;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,SAAS,GAAG,SAAS,EAAE,CAAC;IAC/B,KAAK,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IACtC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,aAAa,CAAC;IACtB,OAAO,EAAE,cAAc,GAAG,MAAM,CAAC;IACjC,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,K
AAK,MAAM,CAAC;IACxC,SAAS,EAAE,MAAM,GAAG,SAAS,GAAG,KAAK,CAAC;CACvC;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IACtC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,aAAa,CAAC;IACtB,OAAO,EAAE,cAAc,GAAG,MAAM,CAAC;IACjC,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,MAAM,CAAC;IACxC,SAAS,EAAE,MAAM,GAAG,SAAS,GAAG,KAAK,CAAC;CACvC;AAED;;;GAGG;AACH,eAAO,MAAM,mBAAmB,mDAAoD,CAAC;AAErF;;;;GAIG;AACH,eAAO,MAAM,cAAc,uEAAwE,CAAC;AAEpG;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,qCAAqC;IACrC,IAAI,EAAE,MAAM,CAAC;IACb,uCAAuC;IACvC,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,gDAAgD;IAChD,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,0BAA0B;IAC1B,MAAM,EAAE,QAAQ,GAAG,QAAQ,CAAC;IAC5B,8BAA8B;IAC9B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,CAAC;IACjB,8BAA8B;IAC9B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,iEAAiE;IACjE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,8DAA8D;IAC9D,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,wDAAwD;IACxD,WAAW,CAAC,EAAE;QACZ,kCAAkC;QAClC,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,8DAA8D;QAC9D,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,0DAA0D;IAC1D,MAAM,EAAE,aAAa,CAAC;IACtB,yEAAyE;IACzE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,qDAAqD;IACrD,aAAa,CAAC,EAAE;QACd,0BAA0B;QAC1B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,qDAAqD;QACrD,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,CAAC;IACF,0FAA0F;IAC1F,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACxC,6DAA6D;IAC7D,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,2BAA2B;IAC3B,SAAS,EAAE,MAAM,CAAC;IAClB,4BAA4B;IAC5B,UAAU,EAAE,MAAM,CAAC;IACnB,gCAAgC;IAChC,QAAQ,EAAE,MAAM,CAAC;IACjB,oCAAoC;IACpC,YAAY,EAAE,MAAM,CAAC;IACrB,2EAA2E;IAC3E,IAAI,EAAE,WAAW,EAAE,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,MAAM,WAAW,GAAG,OAAO,GAAG,OAAO,GAAG,SAAS,CAAC;AAExD;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,WAAW
,EAAE,WAAW,CAAC;IACzB,aAAa,EAAE,MAAM,CAAC;IACtB,wGAAwG;IACxG,YAAY,CAAC,EAAE,OAAO,CAAC;CACxB;AAED;;;GAGG;AACH,MAAM,MAAM,aAAa,GACrB;IAAE,IAAI,EAAE,kBAAkB,CAAC;IAAC,aAAa,EAAE,MAAM,CAAC;IAAC,UAAU,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GAC1F;IAAE,IAAI,EAAE,YAAY,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GAC9E;IAAE,IAAI,EAAE,eAAe,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,aAAa,CAAA;CAAE,GACxG;IAAE,IAAI,EAAE,YAAY,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GAC3D;IAAE,IAAI,EAAE,sBAAsB,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GACrE;IAAE,IAAI,EAAE,kBAAkB,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GAC/C;IAAE,IAAI,EAAE,oBAAoB,CAAC;IAAC,OAAO,EAAE,iBAAiB,CAAA;CAAE,CAAC;AAE/D;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,wCAAwC;IACxC,SAAS,EAAE,MAAM,CAAC;IAClB,0CAA0C;IAC1C,WAAW,EAAE,MAAM,CAAC;IACpB,oCAAoC;IACpC,MAAM,EAAE,wBAAwB,CAAC;IACjC,4BAA4B;IAC5B,KAAK,EAAE,WAAW,EAAE,CAAC;CACtB"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core types for the eval framework.
|
|
3
|
+
*/
|
|
4
|
+
/**
|
|
5
|
+
* Required files for a valid eval fixture.
|
|
6
|
+
* Note: Exactly one of EVAL.ts or EVAL.tsx is required (not both); only EVAL.ts is listed below.
|
|
7
|
+
*/
|
|
8
|
+
// Frozen so the runtime array matches the `readonly` tuple declared in types.d.ts
// and cannot be mutated by importers.
export const REQUIRED_EVAL_FILES = Object.freeze(['PROMPT.md', 'EVAL.ts', 'package.json']);
|
|
9
|
+
/**
|
|
10
|
+
* Files excluded when listing fixture files (used by getFixtureFiles in fixture.ts).
|
|
11
|
+
* This is for local fixture introspection, NOT for sandbox uploads.
|
|
12
|
+
* For sandbox file filtering, see TEST_FILE_PATTERNS in sandbox.ts.
|
|
13
|
+
*/
|
|
14
|
+
// Frozen so the runtime array matches the `readonly` tuple declared in types.d.ts
// and cannot be mutated by importers.
export const EXCLUDED_FILES = Object.freeze(['PROMPT.md', 'EVAL.ts', 'EVAL.tsx', 'node_modules', '.git']);
|
|
15
|
+
//# sourceMappingURL=types.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAsIH;;;GAGG;AACH,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,cAAc,CAAU,CAAC;AAErF;;;;GAIG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,UAAU,EAAE,cAAc,EAAE,MAAM,CAAU,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"test-setup.d.ts","sourceRoot":"","sources":["../src/test-setup.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"test-setup.js","sourceRoot":"","sources":["../src/test-setup.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,OAAO,EAAE,MAAM,IAAI,YAAY,EAAE,MAAM,QAAQ,CAAC;AAEhD,YAAY,EAAE,CAAC"}
|