agent-eval-opencode 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +590 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +29 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/agents/claude-code.d.ts +12 -0
- package/dist/lib/agents/claude-code.d.ts.map +1 -0
- package/dist/lib/agents/claude-code.js +231 -0
- package/dist/lib/agents/claude-code.js.map +1 -0
- package/dist/lib/agents/codex.d.ts +12 -0
- package/dist/lib/agents/codex.d.ts.map +1 -0
- package/dist/lib/agents/codex.js +267 -0
- package/dist/lib/agents/codex.js.map +1 -0
- package/dist/lib/agents/cursor.d.ts +10 -0
- package/dist/lib/agents/cursor.d.ts.map +1 -0
- package/dist/lib/agents/cursor.js +204 -0
- package/dist/lib/agents/cursor.js.map +1 -0
- package/dist/lib/agents/gemini.d.ts +10 -0
- package/dist/lib/agents/gemini.d.ts.map +1 -0
- package/dist/lib/agents/gemini.js +207 -0
- package/dist/lib/agents/gemini.js.map +1 -0
- package/dist/lib/agents/index.d.ts +7 -0
- package/dist/lib/agents/index.d.ts.map +1 -0
- package/dist/lib/agents/index.js +20 -0
- package/dist/lib/agents/index.js.map +1 -0
- package/dist/lib/agents/opencode.d.ts +11 -0
- package/dist/lib/agents/opencode.d.ts.map +1 -0
- package/dist/lib/agents/opencode.js +245 -0
- package/dist/lib/agents/opencode.js.map +1 -0
- package/dist/lib/agents/registry.d.ts +23 -0
- package/dist/lib/agents/registry.d.ts.map +1 -0
- package/dist/lib/agents/registry.js +35 -0
- package/dist/lib/agents/registry.js.map +1 -0
- package/dist/lib/agents/shared.d.ts +83 -0
- package/dist/lib/agents/shared.d.ts.map +1 -0
- package/dist/lib/agents/shared.js +192 -0
- package/dist/lib/agents/shared.js.map +1 -0
- package/dist/lib/agents/types.d.ts +73 -0
- package/dist/lib/agents/types.d.ts.map +1 -0
- package/dist/lib/agents/types.js +5 -0
- package/dist/lib/agents/types.js.map +1 -0
- package/dist/lib/classifier.d.ts +89 -0
- package/dist/lib/classifier.d.ts.map +1 -0
- package/dist/lib/classifier.js +285 -0
- package/dist/lib/classifier.js.map +1 -0
- package/dist/lib/config.d.ts +37 -0
- package/dist/lib/config.d.ts.map +1 -0
- package/dist/lib/config.js +187 -0
- package/dist/lib/config.js.map +1 -0
- package/dist/lib/dashboard.d.ts +65 -0
- package/dist/lib/dashboard.d.ts.map +1 -0
- package/dist/lib/dashboard.js +237 -0
- package/dist/lib/dashboard.js.map +1 -0
- package/dist/lib/docker-sandbox.d.ts +92 -0
- package/dist/lib/docker-sandbox.d.ts.map +1 -0
- package/dist/lib/docker-sandbox.js +375 -0
- package/dist/lib/docker-sandbox.js.map +1 -0
- package/dist/lib/fingerprint.d.ts +15 -0
- package/dist/lib/fingerprint.d.ts.map +1 -0
- package/dist/lib/fingerprint.js +59 -0
- package/dist/lib/fingerprint.js.map +1 -0
- package/dist/lib/fixture.d.ts +55 -0
- package/dist/lib/fixture.d.ts.map +1 -0
- package/dist/lib/fixture.js +215 -0
- package/dist/lib/fixture.js.map +1 -0
- package/dist/lib/housekeeping.d.ts +26 -0
- package/dist/lib/housekeeping.d.ts.map +1 -0
- package/dist/lib/housekeeping.js +170 -0
- package/dist/lib/housekeeping.js.map +1 -0
- package/dist/lib/init.d.ts +21 -0
- package/dist/lib/init.d.ts.map +1 -0
- package/dist/lib/init.js +275 -0
- package/dist/lib/init.js.map +1 -0
- package/dist/lib/o11y/index.d.ts +13 -0
- package/dist/lib/o11y/index.d.ts.map +1 -0
- package/dist/lib/o11y/index.js +13 -0
- package/dist/lib/o11y/index.js.map +1 -0
- package/dist/lib/o11y/parsers/claude-code.d.ts +18 -0
- package/dist/lib/o11y/parsers/claude-code.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/claude-code.js +343 -0
- package/dist/lib/o11y/parsers/claude-code.js.map +1 -0
- package/dist/lib/o11y/parsers/codex.d.ts +17 -0
- package/dist/lib/o11y/parsers/codex.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/codex.js +364 -0
- package/dist/lib/o11y/parsers/codex.js.map +1 -0
- package/dist/lib/o11y/parsers/cursor.d.ts +21 -0
- package/dist/lib/o11y/parsers/cursor.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/cursor.js +226 -0
- package/dist/lib/o11y/parsers/cursor.js.map +1 -0
- package/dist/lib/o11y/parsers/gemini.d.ts +21 -0
- package/dist/lib/o11y/parsers/gemini.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/gemini.js +241 -0
- package/dist/lib/o11y/parsers/gemini.js.map +1 -0
- package/dist/lib/o11y/parsers/index.d.ts +55 -0
- package/dist/lib/o11y/parsers/index.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/index.js +284 -0
- package/dist/lib/o11y/parsers/index.js.map +1 -0
- package/dist/lib/o11y/parsers/opencode.d.ts +17 -0
- package/dist/lib/o11y/parsers/opencode.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/opencode.js +320 -0
- package/dist/lib/o11y/parsers/opencode.js.map +1 -0
- package/dist/lib/o11y/types.d.ts +113 -0
- package/dist/lib/o11y/types.d.ts.map +1 -0
- package/dist/lib/o11y/types.js +6 -0
- package/dist/lib/o11y/types.js.map +1 -0
- package/dist/lib/results.d.ts +91 -0
- package/dist/lib/results.d.ts.map +1 -0
- package/dist/lib/results.js +361 -0
- package/dist/lib/results.js.map +1 -0
- package/dist/lib/runner.d.ts +71 -0
- package/dist/lib/runner.d.ts.map +1 -0
- package/dist/lib/runner.js +267 -0
- package/dist/lib/runner.js.map +1 -0
- package/dist/lib/sandbox.d.ts +173 -0
- package/dist/lib/sandbox.d.ts.map +1 -0
- package/dist/lib/sandbox.js +337 -0
- package/dist/lib/sandbox.js.map +1 -0
- package/dist/lib/types.d.ts +258 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +15 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/test-setup.d.ts +2 -0
- package/dist/test-setup.d.ts.map +1 -0
- package/dist/test-setup.js +6 -0
- package/dist/test-setup.js.map +1 -0
- package/package.json +72 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenCode CLI agent implementation.
|
|
3
|
+
* Uses host-mounted credentials (GitHub Copilot, etc.) for model access.
|
|
4
|
+
*/
|
|
5
|
+
import { existsSync, readFileSync } from 'fs';
|
|
6
|
+
import { homedir } from 'os';
|
|
7
|
+
import { join } from 'path';
|
|
8
|
+
import { createSandbox, collectLocalFiles, splitTestFiles, verifyNoTestFiles, } from '../sandbox.js';
|
|
9
|
+
import { runValidation, captureGeneratedFiles, createVitestConfig, initGitAndCommit, injectTranscriptContext, } from './shared.js';
|
|
10
|
+
/**
 * Pull the JSON event lines out of raw OpenCode output.
 *
 * The CLI is invoked with `--format json`, so each event is expected on its
 * own line. Any line that (ignoring surrounding whitespace) starts with `{`
 * and ends with `}` is kept verbatim; everything else is discarded.
 *
 * @param {string | undefined} output - Combined stdout/stderr of the CLI run.
 * @returns {string | undefined} The kept lines joined with `\n`, or
 *   `undefined` when the output is empty/blank or contains no such line.
 */
function extractTranscriptFromOutput(output) {
    const isBlank = !output || !output.trim();
    if (isBlank) {
        return undefined;
    }
    const jsonLines = [];
    for (const rawLine of output.split('\n')) {
        const candidate = rawLine.trim();
        // The untrimmed line is kept so the transcript mirrors the CLI output.
        if (candidate.startsWith('{') && candidate.endsWith('}')) {
            jsonLines.push(rawLine);
        }
    }
    return jsonLines.length > 0 ? jsonLines.join('\n') : undefined;
}
|
|
27
|
+
/**
 * Generate the contents of the OpenCode config file written into the sandbox.
 *
 * Only permissions are configured (write, edit and bash are all allowed);
 * provider authentication comes from the credential files uploaded separately.
 *
 * The document is serialized with `JSON.stringify` rather than hand-written
 * text so the output is always well-formed JSON.
 *
 * @returns {string} Pretty-printed JSON text.
 */
function generateOpenCodeConfig() {
    const config = {
        $schema: 'https://opencode.ai/config.json',
        permission: {
            write: 'allow',
            edit: 'allow',
            bash: 'allow',
        },
    };
    return JSON.stringify(config, null, 2);
}
|
|
41
|
+
/**
 * Collect host credential files that should be copied into the sandbox.
 *
 * Two host directories under the user's home directory are inspected:
 * `.local/share/opencode` (`auth.json`) and `.config/github-copilot`
 * (`apps.json`, `hosts.json`). Collection is best-effort: a missing directory
 * is reported with a warning, a missing file is skipped silently, and a file
 * that exists but cannot be read (directory, permission error, ...) is
 * reported with a warning and skipped instead of aborting the whole run.
 *
 * @returns {Array<{ containerDir: string, files: Array<{ path: string, content: Buffer }> }>}
 *   One entry per host directory that yielded at least one readable file.
 */
function collectCredentialFiles() {
    const home = homedir();
    const uploads = [];
    const candidates = [
        {
            hostDir: join(home, '.local', 'share', 'opencode'),
            containerDir: '/home/node/.local/share/opencode',
            fileNames: ['auth.json'],
        },
        {
            hostDir: join(home, '.config', 'github-copilot'),
            containerDir: '/home/node/.config/github-copilot',
            fileNames: ['apps.json', 'hosts.json'],
        },
    ];
    for (const { hostDir, containerDir, fileNames } of candidates) {
        if (!existsSync(hostDir)) {
            console.warn(`OpenCode agent: credential path not found, skipping: ${hostDir}`);
            continue;
        }
        const files = [];
        for (const name of fileNames) {
            const fullPath = join(hostDir, name);
            try {
                // Read directly instead of existsSync + read: avoids the race
                // between the check and the read, and lets us classify errors.
                files.push({ path: name, content: readFileSync(fullPath) });
            }
            catch (error) {
                if (error?.code === 'ENOENT') {
                    // Each credential file is optional.
                    continue;
                }
                const reason = error instanceof Error ? error.message : String(error);
                console.warn(`OpenCode agent: could not read credential file, skipping: ${fullPath} (${reason})`);
            }
        }
        if (files.length > 0) {
            uploads.push({ containerDir, files });
        }
    }
    return uploads;
}
|
|
74
|
+
/**
 * Create the OpenCode agent.
 *
 * Model access relies on credential files read from the host and uploaded
 * into the sandbox, which is only done through the Docker sandbox API
 * (`uploadFilesToPath` / `ensureUserOwnership`). A `vercel` sandbox is
 * therefore rejected.
 *
 * @returns {object} Agent object exposing `name`, `displayName`,
 *   `getApiKeyEnvVar()`, `getDefaultModel()` and `run(fixturePath, options)`.
 */
export function createOpenCodeAgent() {
    return {
        name: 'opencode',
        displayName: 'OpenCode',
        /** Returns an empty string: this agent authenticates via uploaded credential files. */
        getApiKeyEnvVar() {
            return '';
        },
        /** Returns the model identifier used when none is configured. */
        getDefaultModel() {
            return 'github-copilot/claude-opus-4.6';
        },
        /**
         * Run OpenCode against a fixture inside a Docker sandbox and validate the result.
         *
         * Never rejects for run failures: every failure path resolves to a
         * result object with `success: false` and an `error` message.
         *
         * @param {string} fixturePath - Local directory whose files are uploaded.
         * @param {object} options - Reads `signal`, `sandbox`, `timeout`, `setup`,
         *   `prompt`, `model` and `scripts`.
         * @returns {Promise<object>} Result with `success`, `output`, `duration`
         *   and, depending on how far the run got, `transcript`, `error`,
         *   `sandboxId`, `testResult`, `scriptsResults`, `generatedFiles`,
         *   `deletedFiles`.
         */
        async run(fixturePath, options) {
            const startTime = Date.now();
            let sandbox = null;
            let agentOutput = '';
            let transcript;
            let aborted = false;
            let sandboxStopped = false;
            // On abort, stop the sandbox right away so any in-flight command
            // fails fast; `sandboxStopped` prevents a second stop in `finally`.
            const abortHandler = () => {
                aborted = true;
                if (sandbox && !sandboxStopped) {
                    sandboxStopped = true;
                    sandbox.stop().catch(() => { });
                }
            };
            if (options.signal) {
                if (options.signal.aborted) {
                    return {
                        success: false,
                        output: '',
                        error: 'Aborted before start',
                        duration: 0,
                    };
                }
                options.signal.addEventListener('abort', abortHandler);
            }
            try {
                // Reject an unsupported backend before reading fixture files or
                // host credentials: nothing below can work without Docker.
                const resolvedBackend = options.sandbox === 'vercel' ? 'vercel' : 'docker';
                if (resolvedBackend === 'vercel') {
                    throw new Error('OpenCode agent requires Docker sandbox for credential injection. ' +
                        'Set sandbox: "docker" in your experiment config.');
                }
                const allFiles = await collectLocalFiles(fixturePath);
                const { workspaceFiles, testFiles } = splitTestFiles(allFiles);
                if (aborted) {
                    return {
                        success: false,
                        output: '',
                        error: 'Aborted',
                        duration: Date.now() - startTime,
                    };
                }
                const credentialUploads = collectCredentialFiles();
                sandbox = await createSandbox({
                    timeout: options.timeout,
                    runtime: 'node24',
                    // Pass the resolved backend, not the raw option: the code
                    // below calls Docker-only methods, so an unset
                    // `options.sandbox` must not select another backend.
                    backend: resolvedBackend,
                });
                if (aborted) {
                    return {
                        success: false,
                        output: '',
                        error: 'Aborted',
                        duration: Date.now() - startTime,
                        sandboxId: sandbox.sandboxId,
                    };
                }
                const dockerSandbox = sandbox;
                for (const upload of credentialUploads) {
                    await dockerSandbox.uploadFilesToPath(upload.containerDir, upload.files);
                }
                await dockerSandbox.ensureUserOwnership('/home/node');
                // Test files are held back until the agent has finished.
                await sandbox.uploadFiles(workspaceFiles);
                await initGitAndCommit(sandbox);
                if (options.setup) {
                    await options.setup(sandbox);
                }
                // One retry for `npm install` before giving up.
                let installResult = await sandbox.runCommand('npm', ['install']);
                if (installResult.exitCode !== 0) {
                    installResult = await sandbox.runCommand('npm', ['install']);
                }
                if (installResult.exitCode !== 0) {
                    const output = (installResult.stdout + installResult.stderr).trim().split('\n').slice(-10).join('\n');
                    throw new Error(`npm install failed (exit code ${installResult.exitCode}):\n${output}`);
                }
                const cliInstall = await sandbox.runCommand('npm', [
                    'install',
                    '-g',
                    'opencode-ai',
                ]);
                if (cliInstall.exitCode !== 0) {
                    throw new Error(`OpenCode CLI install failed: ${cliInstall.stderr}`);
                }
                const configContent = generateOpenCodeConfig();
                await sandbox.writeFiles({
                    'opencode.json': configContent,
                });
                await verifyNoTestFiles(sandbox);
                const opencodeResult = await sandbox.runCommand('opencode', [
                    'run',
                    options.prompt,
                    '--model',
                    options.model,
                    '--format',
                    'json',
                ]);
                agentOutput = opencodeResult.stdout + opencodeResult.stderr;
                transcript = extractTranscriptFromOutput(agentOutput);
                if (opencodeResult.exitCode !== 0) {
                    // Surface the tail of the output as the error message.
                    const errorLines = agentOutput.trim().split('\n').slice(-5).join('\n');
                    return {
                        success: false,
                        output: agentOutput,
                        transcript,
                        error: errorLines || `OpenCode CLI exited with code ${opencodeResult.exitCode}`,
                        duration: Date.now() - startTime,
                        sandboxId: sandbox.sandboxId,
                    };
                }
                await sandbox.uploadFiles(testFiles);
                await createVitestConfig(sandbox);
                await injectTranscriptContext(sandbox, transcript, 'opencode', options.model);
                const validationResults = await runValidation(sandbox, options.scripts ?? []);
                const { generatedFiles, deletedFiles } = await captureGeneratedFiles(sandbox);
                return {
                    success: validationResults.allPassed,
                    output: agentOutput,
                    transcript,
                    duration: Date.now() - startTime,
                    testResult: validationResults.test,
                    scriptsResults: validationResults.scripts,
                    sandboxId: sandbox.sandboxId,
                    generatedFiles,
                    deletedFiles,
                };
            }
            catch (error) {
                // After an abort, errors are most likely a consequence of the
                // sandbox being stopped; report the abort instead.
                const message = aborted
                    ? 'Aborted'
                    : error instanceof Error ? error.message : String(error);
                return {
                    success: false,
                    output: agentOutput,
                    transcript,
                    error: message,
                    duration: Date.now() - startTime,
                    sandboxId: sandbox?.sandboxId,
                };
            }
            finally {
                if (options.signal) {
                    options.signal.removeEventListener('abort', abortHandler);
                }
                if (sandbox && !sandboxStopped) {
                    sandboxStopped = true;
                    try {
                        await sandbox.stop();
                    }
                    catch (stopError) {
                        // A throw here would replace the result being returned
                        // with a rejection; cleanup failure is only reported.
                        const reason = stopError instanceof Error ? stopError.message : String(stopError);
                        console.warn(`OpenCode agent: failed to stop sandbox: ${reason}`);
                    }
                }
            }
        },
    };
}
|
|
245
|
+
//# sourceMappingURL=opencode.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"opencode.js","sourceRoot":"","sources":["../../../src/lib/agents/opencode.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAC9C,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAC7B,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAG5B,OAAO,EACL,aAAa,EACb,iBAAiB,EACjB,cAAc,EACd,iBAAiB,GAElB,MAAM,eAAe,CAAC;AAGvB,OAAO,EACL,aAAa,EACb,qBAAqB,EACrB,kBAAkB,EAClB,gBAAgB,EAChB,uBAAuB,GACxB,MAAM,aAAa,CAAC;AAKrB;;;GAGG;AACH,SAAS,2BAA2B,CAAC,MAAc;IACjD,IAAI,CAAC,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC;QAC9B,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE;QAC7C,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;QAC5B,OAAO,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;;GAGG;AACH,SAAS,sBAAsB;IAC7B,OAAO;;;;;;;EAOP,CAAC;AACH,CAAC;AAOD,SAAS,sBAAsB;IAC7B,MAAM,IAAI,GAAG,OAAO,EAAE,CAAC;IACvB,MAAM,OAAO,GAAuB,EAAE,CAAC;IAEvC,MAAM,UAAU,GAAqE;QACnF;YACE,OAAO,EAAE,IAAI,CAAC,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,UAAU,CAAC;YAClD,YAAY,EAAE,kCAAkC;YAChD,SAAS,EAAE,CAAC,WAAW,CAAC;SACzB;QACD;YACE,OAAO,EAAE,IAAI,CAAC,IAAI,EAAE,SAAS,EAAE,gBAAgB,CAAC;YAChD,YAAY,EAAE,mCAAmC;YACjD,SAAS,EAAE,CAAC,WAAW,EAAE,YAAY,CAAC;SACvC;KACF,CAAC;IAEF,KAAK,MAAM,EAAE,OAAO,EAAE,YAAY,EAAE,SAAS,EAAE,IAAI,UAAU,EAAE,CAAC;QAC9D,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;YACzB,OAAO,CAAC,IAAI,CAAC,wDAAwD,OAAO,EAAE,CAAC,CAAC;YAChF,SAAS;QACX,CAAC;QAED,MAAM,KAAK,GAAkB,EAAE,CAAC;QAChC,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;YAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;YACrC,IAAI,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACzB,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,YAAY,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;YAC9D,CAAC;QACH,CAAC;QAED,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACrB,OAAO,CAAC,IAAI,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,CAAC,CAAC;QACxC,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC
;AACjB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,mBAAmB;IACjC,OAAO;QACL,IAAI,EAAE,UAAU;QAChB,WAAW,EAAE,UAAU;QAEvB,eAAe;YACb,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,eAAe;YACb,OAAO,gCAAgC,CAAC;QAC1C,CAAC;QAED,KAAK,CAAC,GAAG,CAAC,WAAmB,EAAE,OAAwB;YACrD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAC7B,IAAI,OAAO,GAAsB,IAAI,CAAC;YACtC,IAAI,WAAW,GAAG,EAAE,CAAC;YACrB,IAAI,UAA8B,CAAC;YACnC,IAAI,OAAO,GAAG,KAAK,CAAC;YACpB,IAAI,cAAc,GAAG,KAAK,CAAC;YAE3B,MAAM,YAAY,GAAG,GAAG,EAAE;gBACxB,OAAO,GAAG,IAAI,CAAC;gBACf,IAAI,OAAO,IAAI,CAAC,cAAc,EAAE,CAAC;oBAC/B,cAAc,GAAG,IAAI,CAAC;oBACtB,OAAO,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;gBACjC,CAAC;YACH,CAAC,CAAC;YAEF,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;gBACnB,IAAI,OAAO,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;oBAC3B,OAAO;wBACL,OAAO,EAAE,KAAK;wBACd,MAAM,EAAE,EAAE;wBACV,KAAK,EAAE,sBAAsB;wBAC7B,QAAQ,EAAE,CAAC;qBACZ,CAAC;gBACJ,CAAC;gBACD,OAAO,CAAC,MAAM,CAAC,gBAAgB,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;YACzD,CAAC;YAED,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,MAAM,iBAAiB,CAAC,WAAW,CAAC,CAAC;gBACtD,MAAM,EAAE,cAAc,EAAE,SAAS,EAAE,GAAG,cAAc,CAAC,QAAQ,CAAC,CAAC;gBAE/D,IAAI,OAAO,EAAE,CAAC;oBACZ,OAAO;wBACL,OAAO,EAAE,KAAK;wBACd,MAAM,EAAE,EAAE;wBACV,KAAK,EAAE,SAAS;wBAChB,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;qBACjC,CAAC;gBACJ,CAAC;gBAED,MAAM,iBAAiB,GAAG,sBAAsB,EAAE,CAAC;gBAEnD,MAAM,eAAe,GAAG,OAAO,CAAC,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC;gBAC3E,IAAI,eAAe,KAAK,QAAQ,EAAE,CAAC;oBACjC,MAAM,IAAI,KAAK,CACb,mEAAmE;wBACnE,kDAAkD,CACnD,CAAC;gBACJ,CAAC;gBAED,OAAO,GAAG,MAAM,aAAa,CAAC;oBAC5B,OAAO,EAAE,OAAO,CAAC,OAAO;oBACxB,OAAO,EAAE,QAAQ;oBACjB,OAAO,EAAE,OAAO,CAAC,OAAO;iBACzB,CAAC,CAAC;gBAEH,IAAI,OAAO,EAAE,CAAC;oBACZ,OAAO;wBACL,OAAO,EAAE,KAAK;wBACd,MAAM,EAAE,EAAE;wBACV,KAAK,EAAE,SAAS;wBAChB,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;wBAChC,SAAS,EAAE,OAAO,CAAC,SAAS;qBAC7B,CAAC;gBACJ,CAAC;gBAED,MAAM,aAAa,GAAG,OAA+B,CAAC;gBACtD,KAAK,MAAM,MAAM,IAAI,iBAAiB,EAAE,CAAC;oBACvC,MAAM,aAAa,CAAC,iBAAiB,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;gBAC3E,CAAC;gBACD,MAAM,aAAa,CAAC,mBAAmB,CAAC,
YAAY,CAAC,CAAC;gBAEtD,MAAM,OAAO,CAAC,WAAW,CAAC,cAAc,CAAC,CAAC;gBAEhD,MAAM,gBAAgB,CAAC,OAAO,CAAC,CAAC;gBAE1B,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;oBAClB,MAAM,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;gBAC/B,CAAC;gBAED,IAAI,aAAa,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC;gBACjE,IAAI,aAAa,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;oBACjC,aAAa,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC;gBAC/D,CAAC;gBACD,IAAI,aAAa,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;oBACjC,MAAM,MAAM,GAAG,CAAC,aAAa,CAAC,MAAM,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACtG,MAAM,IAAI,KAAK,CAAC,iCAAiC,aAAa,CAAC,QAAQ,OAAO,MAAM,EAAE,CAAC,CAAC;gBAC1F,CAAC;gBAED,MAAM,UAAU,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,KAAK,EAAE;oBACjD,SAAS;oBACT,IAAI;oBACJ,aAAa;iBACd,CAAC,CAAC;gBACH,IAAI,UAAU,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;oBAC9B,MAAM,IAAI,KAAK,CAAC,gCAAgC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC;gBACvE,CAAC;gBAED,MAAM,aAAa,GAAG,sBAAsB,EAAE,CAAC;gBAC/C,MAAM,OAAO,CAAC,UAAU,CAAC;oBACvB,eAAe,EAAE,aAAa;iBAC/B,CAAC,CAAC;gBAEH,MAAM,iBAAiB,CAAC,OAAO,CAAC,CAAC;gBAEjC,MAAM,cAAc,GAAG,MAAM,OAAO,CAAC,UAAU,CAC7C,UAAU,EACV;oBACE,KAAK;oBACL,OAAO,CAAC,MAAM;oBACd,SAAS;oBACT,OAAO,CAAC,KAAK;oBACb,UAAU;oBACV,MAAM;iBACP,CACF,CAAC;gBAEF,WAAW,GAAG,cAAc,CAAC,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC;gBAC5D,UAAU,GAAG,2BAA2B,CAAC,WAAW,CAAC,CAAC;gBAEtD,IAAI,cAAc,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;oBAClC,MAAM,UAAU,GAAG,WAAW,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACvE,OAAO;wBACL,OAAO,EAAE,KAAK;wBACd,MAAM,EAAE,WAAW;wBACnB,UAAU;wBACV,KAAK,EAAE,UAAU,IAAI,iCAAiC,cAAc,CAAC,QAAQ,EAAE;wBAC/E,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;wBAChC,SAAS,EAAE,OAAO,CAAC,SAAS;qBAC7B,CAAC;gBACJ,CAAC;gBAED,MAAM,OAAO,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC;gBAErC,MAAM,kBAAkB,CAAC,OAAO,CAAC,CAAC;gBAElC,MAAM,uBAAuB,CAAC,OAAO,EAAE,UAAU,EAAE,UAAU,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC;gBAE9E,MAAM,iBAAiB,GAAG,MAAM,aAAa,CAAC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC;gBAE9E,MA
AM,EAAE,cAAc,EAAE,YAAY,EAAE,GAAG,MAAM,qBAAqB,CAAC,OAAO,CAAC,CAAC;gBAE9E,OAAO;oBACL,OAAO,EAAE,iBAAiB,CAAC,SAAS;oBACpC,MAAM,EAAE,WAAW;oBACnB,UAAU;oBACV,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;oBAChC,UAAU,EAAE,iBAAiB,CAAC,IAAI;oBAClC,cAAc,EAAE,iBAAiB,CAAC,OAAO;oBACzC,SAAS,EAAE,OAAO,CAAC,SAAS;oBAC5B,cAAc;oBACd,YAAY;iBACb,CAAC;YACJ,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,IAAI,OAAO,EAAE,CAAC;oBACZ,OAAO;wBACL,OAAO,EAAE,KAAK;wBACd,MAAM,EAAE,WAAW;wBACnB,UAAU;wBACV,KAAK,EAAE,SAAS;wBAChB,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;wBAChC,SAAS,EAAE,OAAO,EAAE,SAAS;qBAC9B,CAAC;gBACJ,CAAC;gBACD,OAAO;oBACL,OAAO,EAAE,KAAK;oBACd,MAAM,EAAE,WAAW;oBACnB,UAAU;oBACV,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;oBAC7D,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;oBAChC,SAAS,EAAE,OAAO,EAAE,SAAS;iBAC9B,CAAC;YACJ,CAAC;oBAAS,CAAC;gBACT,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;oBACnB,OAAO,CAAC,MAAM,CAAC,mBAAmB,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;gBAC5D,CAAC;gBACD,IAAI,OAAO,IAAI,CAAC,cAAc,EAAE,CAAC;oBAC/B,cAAc,GAAG,IAAI,CAAC;oBACtB,MAAM,OAAO,CAAC,IAAI,EAAE,CAAC;gBACvB,CAAC;YACH,CAAC;QACH,CAAC;KACF,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent registry for managing available agents.
|
|
3
|
+
*/
|
|
4
|
+
import type { Agent } from './types.js';
|
|
5
|
+
import type { AgentType } from '../types.js';
|
|
6
|
+
/**
|
|
7
|
+
* Register an agent in the registry.
|
|
8
|
+
*/
|
|
9
|
+
export declare function registerAgent(agent: Agent): void;
|
|
10
|
+
/**
|
|
11
|
+
* Get an agent by name.
|
|
12
|
+
* @throws Error if agent is not found
|
|
13
|
+
*/
|
|
14
|
+
export declare function getAgent(name: AgentType): Agent;
|
|
15
|
+
/**
|
|
16
|
+
* List all registered agents.
|
|
17
|
+
*/
|
|
18
|
+
export declare function listAgents(): string[];
|
|
19
|
+
/**
|
|
20
|
+
* Check if an agent is registered.
|
|
21
|
+
*/
|
|
22
|
+
export declare function hasAgent(name: string): boolean;
|
|
23
|
+
//# sourceMappingURL=registry.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"registry.d.ts","sourceRoot":"","sources":["../../../src/lib/agents/registry.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACxC,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAI7C;;GAEG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,KAAK,GAAG,IAAI,CAEhD;AAED;;;GAGG;AACH,wBAAgB,QAAQ,CAAC,IAAI,EAAE,SAAS,GAAG,KAAK,CAO/C;AAED;;GAEG;AACH,wBAAgB,UAAU,IAAI,MAAM,EAAE,CAErC;AAED;;GAEG;AACH,wBAAgB,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAE9C"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent registry for managing available agents.
|
|
3
|
+
*/
|
|
4
|
+
/** Module-private store of registered agents, keyed by agent name. */
const agents = new Map();

/**
 * Add an agent to the registry under its `name`.
 * Registering a second agent with the same name replaces the first.
 */
export function registerAgent(agent) {
    const { name } = agent;
    agents.set(name, agent);
}

/**
 * Look up a registered agent.
 * @throws Error listing the registered names when `name` is unknown
 */
export function getAgent(name) {
    const match = agents.get(name);
    if (match) {
        return match;
    }
    const known = [...agents.keys()].join(', ');
    throw new Error(`Unknown agent: ${name}. Available agents: ${known}`);
}

/**
 * Names of all registered agents, in registration order.
 */
export function listAgents() {
    return [...agents.keys()];
}

/**
 * Whether an agent with the given name has been registered.
 */
export function hasAgent(name) {
    return agents.has(name);
}
|
|
35
|
+
//# sourceMappingURL=registry.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"registry.js","sourceRoot":"","sources":["../../../src/lib/agents/registry.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,MAAM,MAAM,GAAG,IAAI,GAAG,EAAiB,CAAC;AAExC;;GAEG;AACH,MAAM,UAAU,aAAa,CAAC,KAAY;IACxC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;AAChC,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,QAAQ,CAAC,IAAe;IACtC,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAC/B,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvD,MAAM,IAAI,KAAK,CAAC,kBAAkB,IAAI,uBAAuB,SAAS,EAAE,CAAC,CAAC;IAC5E,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU;IACxB,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC;AACnC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,QAAQ,CAAC,IAAY;IACnC,OAAO,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared utilities for agent implementations.
|
|
3
|
+
*/
|
|
4
|
+
import type { ScriptResult } from './types.js';
|
|
5
|
+
import type { SandboxManager } from '../sandbox.js';
|
|
6
|
+
import type { DockerSandboxManager } from '../docker-sandbox.js';
|
|
7
|
+
/** Union type for sandbox implementations */
|
|
8
|
+
type AnySandbox = SandboxManager | DockerSandboxManager;
|
|
9
|
+
/**
|
|
10
|
+
* Well-known directory where transcript context is written inside the sandbox.
|
|
11
|
+
* EVAL.ts tests can read `__agent_eval__/results.json` to assert on agent behavior
|
|
12
|
+
* (e.g. which shell commands were run, files modified, tool calls made).
|
|
13
|
+
*/
|
|
14
|
+
export declare const TRANSCRIPT_CONTEXT_DIR = "__agent_eval__";
|
|
15
|
+
/** Path to the results file inside the sandbox. */
|
|
16
|
+
export declare const TRANSCRIPT_CONTEXT_PATH = "__agent_eval__/results.json";
|
|
17
|
+
/**
|
|
18
|
+
* Combined validation results.
|
|
19
|
+
*/
|
|
20
|
+
export interface ValidationResults {
|
|
21
|
+
allPassed: boolean;
|
|
22
|
+
test?: ScriptResult;
|
|
23
|
+
scripts: Record<string, ScriptResult>;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Run validation scripts in the sandbox.
|
|
27
|
+
*/
|
|
28
|
+
export declare function runValidation(sandbox: AnySandbox, scripts: string[]): Promise<ValidationResults>;
|
|
29
|
+
export declare function initGitAndCommit(sandbox: AnySandbox): Promise<void>;
|
|
30
|
+
/**
|
|
31
|
+
* Capture source files generated by the agent.
|
|
32
|
+
* Returns both modified/added files (with content) and deleted file paths.
|
|
33
|
+
*/
|
|
34
|
+
export declare function captureGeneratedFiles(sandbox: AnySandbox): Promise<{
|
|
35
|
+
generatedFiles: Record<string, string>;
|
|
36
|
+
deletedFiles: string[];
|
|
37
|
+
}>;
|
|
38
|
+
/**
|
|
39
|
+
* Create vitest config for running EVAL.ts or EVAL.tsx.
|
|
40
|
+
*/
|
|
41
|
+
export declare function createVitestConfig(sandbox: AnySandbox): Promise<void>;
|
|
42
|
+
/**
|
|
43
|
+
* Inject transcript context into the sandbox so EVAL.ts tests can assert on agent behavior.
|
|
44
|
+
* Writes parsed transcript summary to `__agent_eval__/results.json`.
|
|
45
|
+
*
|
|
46
|
+
* This is best-effort: failures are silently ignored since it's supplementary data.
|
|
47
|
+
*/
|
|
48
|
+
export declare function injectTranscriptContext(sandbox: AnySandbox, rawTranscript: string | undefined, agentName: string, model?: string): Promise<void>;
|
|
49
|
+
/**
|
|
50
|
+
* AI Gateway configuration.
|
|
51
|
+
*/
|
|
52
|
+
export declare const AI_GATEWAY: {
|
|
53
|
+
readonly baseUrl: "https://ai-gateway.vercel.sh";
|
|
54
|
+
readonly openAiBaseUrl: "https://ai-gateway.vercel.sh/v1";
|
|
55
|
+
readonly apiKeyEnvVar: "AI_GATEWAY_API_KEY";
|
|
56
|
+
};
|
|
57
|
+
/**
|
|
58
|
+
* Direct API configuration for Anthropic.
|
|
59
|
+
*/
|
|
60
|
+
export declare const ANTHROPIC_DIRECT: {
|
|
61
|
+
readonly apiKeyEnvVar: "ANTHROPIC_API_KEY";
|
|
62
|
+
};
|
|
63
|
+
/**
|
|
64
|
+
* Direct API configuration for OpenAI.
|
|
65
|
+
*/
|
|
66
|
+
export declare const OPENAI_DIRECT: {
|
|
67
|
+
readonly baseUrl: "https://api.openai.com/v1";
|
|
68
|
+
readonly apiKeyEnvVar: "OPENAI_API_KEY";
|
|
69
|
+
};
|
|
70
|
+
/**
|
|
71
|
+
* Direct API configuration for Google Gemini.
|
|
72
|
+
*/
|
|
73
|
+
export declare const GEMINI_DIRECT: {
|
|
74
|
+
readonly apiKeyEnvVar: "GEMINI_API_KEY";
|
|
75
|
+
};
|
|
76
|
+
/**
|
|
77
|
+
* Direct API configuration for Cursor.
|
|
78
|
+
*/
|
|
79
|
+
export declare const CURSOR_DIRECT: {
|
|
80
|
+
readonly apiKeyEnvVar: "CURSOR_API_KEY";
|
|
81
|
+
};
|
|
82
|
+
export {};
|
|
83
|
+
//# sourceMappingURL=shared.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"shared.d.ts","sourceRoot":"","sources":["../../../src/lib/agents/shared.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC/C,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AACpD,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,sBAAsB,CAAC;AAGjE,6CAA6C;AAC7C,KAAK,UAAU,GAAG,cAAc,GAAG,oBAAoB,CAAC;AAExD;;;;GAIG;AACH,eAAO,MAAM,sBAAsB,mBAAmB,CAAC;AAEvD,mDAAmD;AACnD,eAAO,MAAM,uBAAuB,gCAA2C,CAAC;AAEhF;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,SAAS,EAAE,OAAO,CAAC;IACnB,IAAI,CAAC,EAAE,YAAY,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;CACvC;AAgCD;;GAEG;AACH,wBAAsB,aAAa,CACjC,OAAO,EAAE,UAAU,EACnB,OAAO,EAAE,MAAM,EAAE,GAChB,OAAO,CAAC,iBAAiB,CAAC,CAmC5B;AAED,wBAAsB,gBAAgB,CAAC,OAAO,EAAE,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC,CAUzE;AAED;;;GAGG;AACH,wBAAsB,qBAAqB,CACzC,OAAO,EAAE,UAAU,GAClB,OAAO,CAAC;IAAE,cAAc,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAAC,YAAY,EAAE,MAAM,EAAE,CAAA;CAAE,CAAC,CAmC7E;AAED;;GAEG;AACH,wBAAsB,kBAAkB,CAAC,OAAO,EAAE,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC,CAe3E;AAED;;;;;GAKG;AACH,wBAAsB,uBAAuB,CAC3C,OAAO,EAAE,UAAU,EACnB,aAAa,EAAE,MAAM,GAAG,SAAS,EACjC,SAAS,EAAE,MAAM,EACjB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,IAAI,CAAC,CAgBf;AAED;;GAEG;AACH,eAAO,MAAM,UAAU;;;;CAIb,CAAC;AAEX;;GAEG;AACH,eAAO,MAAM,gBAAgB;;CAEnB,CAAC;AAEX;;GAEG;AACH,eAAO,MAAM,aAAa;;;CAGhB,CAAC;AAEX;;GAEG;AACH,eAAO,MAAM,aAAa;;CAEhB,CAAC;AAEX;;GAEG;AACH,eAAO,MAAM,aAAa;;CAEhB,CAAC"}
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared utilities for agent implementations.
|
|
3
|
+
*/
|
|
4
|
+
import { parseTranscript } from '../o11y/index.js';
|
|
5
|
+
/**
|
|
6
|
+
* Well-known directory where transcript context is written inside the sandbox.
|
|
7
|
+
* EVAL.ts tests can read `__agent_eval__/results.json` to assert on agent behavior
|
|
8
|
+
* (e.g. which shell commands were run, files modified, tool calls made).
|
|
9
|
+
*/
|
|
10
|
+
export const TRANSCRIPT_CONTEXT_DIR = '__agent_eval__';
|
|
11
|
+
/** Path to the results file inside the sandbox. */
|
|
12
|
+
export const TRANSCRIPT_CONTEXT_PATH = `${TRANSCRIPT_CONTEXT_DIR}/results.json`;
|
|
13
|
+
/**
 * Work out which eval file the sandbox's working directory contains.
 *
 * Runs `ls -1` in the sandbox and looks for an exact, case-sensitive match
 * of `EVAL.tsx` or `EVAL.ts`; the `.tsx` variant wins when both are listed.
 * If the listing fails, throws, or contains neither name, `'EVAL.ts'` is
 * returned so the later test run fails on the missing file.
 */
async function detectEvalFile(sandbox) {
    // Priority order: first candidate present in the listing is returned.
    const candidates = ['EVAL.tsx', 'EVAL.ts'];
    try {
        const { exitCode, stdout } = await sandbox.runShell('ls -1');
        if (exitCode === 0) {
            const entries = new Set(stdout.split('\n').map((entry) => entry.trim()));
            const hit = candidates.find((candidate) => entries.has(candidate));
            if (hit !== undefined) {
                return hit;
            }
        }
    }
    catch {
        // Detection is best-effort; fall through to the default below.
    }
    return 'EVAL.ts';
}
|
|
40
|
+
/**
 * Run the eval test file and the configured npm scripts inside the sandbox.
 *
 * The detected eval file is always executed first via `npx vitest run <file>`. Each entry of
 * `scripts` is then executed, in order, via `npm run <script>`. A command counts as successful
 * when its exit code is 0; its recorded output is stdout followed by stderr.
 *
 * @param sandbox - object exposing `runShell` (used for eval-file detection) and
 *   `runCommand(command, args)` resolving to `{ exitCode, stdout, stderr }`.
 * @param scripts - names of npm scripts to run.
 * @returns an object with `allPassed` (true only if every command succeeded), `scripts`
 *   (per-script `{ success, output }` keyed by script name) and `test` (the vitest result).
 */
export async function runValidation(sandbox, scripts) {
    // Collapse a finished command into the { success, output } shape stored in the results.
    const summarize = (finished) => ({
        success: finished.exitCode === 0,
        output: finished.stdout + finished.stderr,
    });
    const results = {
        allPassed: true,
        scripts: {},
    };
    // The eval file is named explicitly so vitest runs exactly that file.
    const evalFile = await detectEvalFile(sandbox);
    results.test = summarize(await sandbox.runCommand('npx', ['vitest', 'run', evalFile]));
    results.allPassed = results.test.success;
    // Configured scripts run one after another; any failure clears allPassed.
    for (const script of scripts) {
        const outcome = summarize(await sandbox.runCommand('npm', ['run', script]));
        results.scripts[script] = outcome;
        results.allPassed = results.allPassed && outcome.success;
    }
    return results;
}
|
|
73
|
+
/**
 * Create a baseline git commit in the sandbox.
 *
 * Writes a `.gitignore` that excludes `node_modules/`, then initialises a repository,
 * sets the user email and name git needs for committing, and commits everything. The
 * resulting HEAD gives a clean reference to diff against when capturing generated files.
 *
 * @param sandbox - object exposing `writeFiles(files)` and `runShell(command)`.
 * @returns {Promise<void>}
 */
export async function initGitAndCommit(sandbox) {
    const ignoreRules = { ".gitignore": "node_modules/\n" };
    await sandbox.writeFiles(ignoreRules);
    // Chained with && so each step only runs when the previous one succeeded.
    const steps = [
        'git init',
        'git config user.email "agent-eval@localhost"',
        'git config user.name "agent-eval"',
        'git add .',
        'git commit -m "init"',
    ];
    await sandbox.runShell(steps.join(' && '));
}
|
|
81
|
+
/**
 * Capture the files the agent added, modified, renamed or deleted relative to HEAD.
 *
 * Stages everything and parses the output of `git diff HEAD --name-status`, where each
 * line is `<status>\t<path>`. Rename (`R<score>`) and copy (`C<score>`) entries instead
 * carry `<status>\t<old path>\t<new path>`; git emits rename entries whenever rename
 * detection is active. For those entries the new path is captured, and for a rename
 * the old path is additionally reported as deleted.
 *
 * Best-effort: if the git command or the parsing fails, whatever was collected so far
 * (possibly nothing) is returned; files that cannot be read are skipped.
 *
 * @param sandbox - object exposing `runShell(command)` resolving to `{ stdout }` and
 *   `readFile(path)` resolving to the file content.
 * @returns {Promise<{ generatedFiles: Object, deletedFiles: string[] }>} `generatedFiles`
 *   maps each added/modified/renamed-to path to its content; `deletedFiles` lists paths
 *   that no longer exist.
 */
export async function captureGeneratedFiles(sandbox) {
    const generatedFiles = {};
    const deletedFiles = [];
    try {
        // --name-status distinguishes added/modified entries from deleted ones.
        const diffResult = await sandbox.runShell("git add . && git diff HEAD --name-status");
        const lines = diffResult.stdout
            .trim()
            .split('\n')
            .filter(Boolean);
        for (const line of lines) {
            const [status, ...fields] = line.split('\t');
            let filePath = fields.join('\t');
            const isRename = status.startsWith('R');
            const isCopy = status.startsWith('C');
            if ((isRename || isCopy) && fields.length >= 2) {
                // Two-path entry: the first field is the source, the remainder is the destination.
                const [previousPath, ...destinationParts] = fields;
                filePath = destinationParts.join('\t');
                // A renamed file no longer exists under its old name; a copied one still does.
                if (isRename && previousPath) {
                    deletedFiles.push(previousPath);
                }
            }
            if (!filePath) {
                continue;
            }
            if (status === 'D') {
                deletedFiles.push(filePath);
                continue;
            }
            try {
                const content = await sandbox.readFile(filePath);
                generatedFiles[filePath] = content;
            }
            catch {
                // Skip unreadable files
            }
        }
    }
    catch {
        // If capture fails, return what was collected (empty when git itself failed)
    }
    return { generatedFiles, deletedFiles };
}
|
|
119
|
+
/**
 * Write a `vitest.config.ts` into the sandbox that targets the eval file.
 *
 * The eval file name (`EVAL.ts` or `EVAL.tsx`) is detected first and becomes the only
 * entry of the config's `include` list; `globals` is turned off.
 *
 * @param sandbox - object exposing `runShell` (used for eval-file detection) and
 *   `writeFiles(files)`.
 * @returns {Promise<void>}
 */
export async function createVitestConfig(sandbox) {
    const target = await detectEvalFile(sandbox);
    // Config text is kept verbatim; only the include entry varies with the detected file.
    const configSource = `
import { defineConfig } from 'vitest/config';
export default defineConfig({
test: {
include: ['${target}'],
globals: false,
},
});
`;
    await sandbox.writeFiles({ 'vitest.config.ts': configSource });
}
|
|
137
|
+
/**
 * Make transcript context available inside the sandbox so EVAL.ts tests can assert on
 * agent behavior.
 *
 * When a raw transcript is supplied it is parsed and its `summary` is written, under the
 * `o11y` key, to `__agent_eval__/results.json`. Without a transcript (or without a
 * summary) `o11y` is written as `null`.
 *
 * Best-effort: any failure while parsing or writing is swallowed, because this file is
 * supplementary data and must not fail the eval.
 *
 * @param sandbox - object exposing `writeFiles(files)`.
 * @param rawTranscript - raw transcript text, or a falsy value when none is available.
 * @param agentName - forwarded to `parseTranscript`.
 * @param model - forwarded to `parseTranscript`.
 * @returns {Promise<void>}
 */
export async function injectTranscriptContext(sandbox, rawTranscript, agentName, model) {
    try {
        let summary = null;
        if (rawTranscript) {
            const parsed = parseTranscript(rawTranscript, agentName, model);
            summary = parsed?.summary ?? null;
        }
        const payload = JSON.stringify({ o11y: summary }, null, 2);
        await sandbox.writeFiles({ [TRANSCRIPT_CONTEXT_PATH]: payload });
    }
    catch {
        // Best-effort: context injection must never fail the eval
    }
}
|
|
159
|
+
/**
 * AI Gateway configuration.
 *
 * - `baseUrl`: gateway base URL.
 * - `openAiBaseUrl`: the same host with the `/v1` path suffix.
 * - `apiKeyEnvVar`: name of the environment variable associated with the gateway API key.
 *
 * Frozen so this shared, module-level constant cannot be mutated at runtime by one
 * consumer and silently change the endpoints seen by every other consumer.
 */
export const AI_GATEWAY = Object.freeze({
    baseUrl: 'https://ai-gateway.vercel.sh',
    openAiBaseUrl: 'https://ai-gateway.vercel.sh/v1',
    apiKeyEnvVar: 'AI_GATEWAY_API_KEY',
});
|
|
167
|
+
/**
 * Direct API configuration for Anthropic.
 *
 * - `apiKeyEnvVar`: name of the environment variable associated with the Anthropic API key.
 *
 * Frozen so this shared, module-level constant cannot be mutated at runtime.
 */
export const ANTHROPIC_DIRECT = Object.freeze({
    apiKeyEnvVar: 'ANTHROPIC_API_KEY',
});
|
|
173
|
+
/**
 * Direct API configuration for OpenAI.
 *
 * - `baseUrl`: OpenAI API base URL, including the `/v1` path.
 * - `apiKeyEnvVar`: name of the environment variable associated with the OpenAI API key.
 *
 * Frozen so this shared, module-level constant cannot be mutated at runtime.
 */
export const OPENAI_DIRECT = Object.freeze({
    baseUrl: 'https://api.openai.com/v1',
    apiKeyEnvVar: 'OPENAI_API_KEY',
});
|
|
180
|
+
/**
 * Direct API configuration for Google Gemini.
 *
 * - `apiKeyEnvVar`: name of the environment variable associated with the Gemini API key.
 *
 * Frozen so this shared, module-level constant cannot be mutated at runtime.
 */
export const GEMINI_DIRECT = Object.freeze({
    apiKeyEnvVar: 'GEMINI_API_KEY',
});
|
|
186
|
+
/**
 * Direct API configuration for Cursor.
 *
 * - `apiKeyEnvVar`: name of the environment variable associated with the Cursor API key.
 *
 * Frozen so this shared, module-level constant cannot be mutated at runtime.
 */
export const CURSOR_DIRECT = Object.freeze({
    apiKeyEnvVar: 'CURSOR_API_KEY',
});
|
|
192
|
+
//# sourceMappingURL=shared.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"shared.js","sourceRoot":"","sources":["../../../src/lib/agents/shared.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AAKnD;;;;GAIG;AACH,MAAM,CAAC,MAAM,sBAAsB,GAAG,gBAAgB,CAAC;AAEvD,mDAAmD;AACnD,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,sBAAsB,eAAe,CAAC;AAWhF;;;;GAIG;AACH,KAAK,UAAU,cAAc,CAAC,OAAmB;IAC/C,IAAI,CAAC;QACH,iEAAiE;QACjE,MAAM,QAAQ,GAAG,MAAM,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;QACjD,IAAI,QAAQ,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;YAC5B,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YAE/D,sDAAsD;YACtD,IAAI,KAAK,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;gBAC/B,OAAO,UAAU,CAAC;YACpB,CAAC;YAED,oBAAoB;YACpB,IAAI,KAAK,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC9B,OAAO,SAAS,CAAC;YACnB,CAAC;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,gBAAgB;IAClB,CAAC;IAED,2DAA2D;IAC3D,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAmB,EACnB,OAAiB;IAEjB,MAAM,OAAO,GAAsB;QACjC,SAAS,EAAE,IAAI;QACf,OAAO,EAAE,EAAE;KACZ,CAAC;IAEF,sDAAsD;IACtD,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,OAAO,CAAC,CAAC;IAE/C,oEAAoE;IACpE,MAAM,UAAU,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC,QAAQ,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAC;IAChF,OAAO,CAAC,IAAI,GAAG;QACb,OAAO,EAAE,UAAU,CAAC,QAAQ,KAAK,CAAC;QAClC,MAAM,EAAE,UAAU,CAAC,MAAM,GAAG,UAAU,CAAC,MAAM;KAC9C,CAAC;IACF,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;QAC1B,OAAO,CAAC,SAAS,GAAG,KAAK,CAAC;IAC5B,CAAC;IAED,yBAAyB;IACzB,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,MAAM,YAAY,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,CAAC;QACtE,MAAM,MAAM,GAAiB;YAC3B,OAAO,EAAE,YAAY,CAAC,QAAQ,KAAK,CAAC;YACpC,MAAM,EAAE,YAAY,CAAC,MAAM,GAAG,YAAY,CAAC,MAAM;SAClD,CAAC;QAEF,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,MAAM,CAAC;QAEjC,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;YACpB,OAAO,CAAC,SAAS,GAAG,KAAK,CAAC;QAC5B,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,OAAmB;IACxD,MAAM,OAAO,CAAC,UAAU,CAAC;QACvB,YAAY,EAAE,iBAAiB;KAChC,CAAC,
CAAC;IAEH,4HAA4H;IAC5H,sBAAsB;IACtB,MAAM,OAAO,CAAC,QAAQ,CACpB,oIAAoI,CACrI,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,qBAAqB,CACzC,OAAmB;IAEnB,MAAM,cAAc,GAA2B,EAAE,CAAC;IAClD,MAAM,YAAY,GAAa,EAAE,CAAC;IAElC,IAAI,CAAC;QACH,+DAA+D;QAC/D,MAAM,UAAU,GAAG,MAAM,OAAO,CAAC,QAAQ,CAAC,0CAA0C,CAAC,CAAC;QAEtF,MAAM,KAAK,GAAG,UAAU,CAAC,MAAM;aAC5B,IAAI,EAAE;aACN,KAAK,CAAC,IAAI,CAAC;aACX,MAAM,CAAC,OAAO,CAAC,CAAC;QAEnB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,CAAC,MAAM,EAAE,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YAC3C,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAEjC,IAAI,CAAC,QAAQ;gBAAE,SAAS;YAExB,IAAI,MAAM,KAAK,GAAG,EAAE,CAAC;gBACnB,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YAC9B,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC;oBACH,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;oBACjD,cAAc,CAAC,QAAQ,CAAC,GAAG,OAAO,CAAC;gBACrC,CAAC;gBAAC,MAAM,CAAC;oBACP,wBAAwB;gBAC1B,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,yCAAyC;IAC3C,CAAC;IAED,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,OAAmB;IAC1D,gCAAgC;IAChC,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,OAAO,CAAC,CAAC;IAE/C,MAAM,OAAO,CAAC,UAAU,CAAC;QACvB,kBAAkB,EAAE;;;;iBAIP,QAAQ;;;;CAIxB;KACE,CAAC,CAAC;AACL,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,uBAAuB,CAC3C,OAAmB,EACnB,aAAiC,EACjC,SAAiB,EACjB,KAAc;IAEd,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,aAAa;YAC9B,CAAC,CAAC,eAAe,CAAC,aAAa,EAAE,SAAS,EAAE,KAAK,CAAC;YAClD,CAAC,CAAC,IAAI,CAAC;QAET,MAAM,OAAO,GAAG;YACd,IAAI,EAAE,UAAU,EAAE,OAAO,IAAI,IAAI;SAClC,CAAC;QAEF,MAAM,OAAO,CAAC,UAAU,CAAC;YACvB,CAAC,uBAAuB,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;SAC5D,CAAC,CAAC;IACL,CAAC;IAAC,MAAM,CAAC;QACP,8DAA8D;IAChE,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,MAAM,UAAU,GAAG;IACxB,OAAO,EAAE,8BAA8B;IACvC,aAAa,EAAE,iCAAiC;IAChD,YAAY,EAAE,oBAAoB;CAC1B,CAAC;AAEX;;GAEG;AACH,MAAM,CAAC,MAAM,gBAAgB,GAAG;IAC9B,YAAY,EAAE,mBAAmB;CACzB,CAAC;AAEX;;GAEG;AACH,MAAM,CAAC,MAAM,aAAa,GAAG;IAC3B,OAAO,EAAE,2BAA2B;IACpC,YAAY,EAAE,gBAAgB;CACtB,CAAC;AAEX;;GAEG;AA
CH,MAAM,CAAC,MAAM,aAAa,GAAG;IAC3B,YAAY,EAAE,gBAAgB;CACtB,CAAC;AAEX;;GAEG;AACH,MAAM,CAAC,MAAM,aAAa,GAAG;IAC3B,YAAY,EAAE,gBAAgB;CACtB,CAAC"}
|