@vercel/agent-eval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. package/README.md +370 -0
  2. package/dist/cli.d.ts +6 -0
  3. package/dist/cli.d.ts.map +1 -0
  4. package/dist/cli.js +166 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/index.d.ts +21 -0
  7. package/dist/index.d.ts.map +1 -0
  8. package/dist/index.js +17 -0
  9. package/dist/index.js.map +1 -0
  10. package/dist/lib/agents/claude-code.d.ts +12 -0
  11. package/dist/lib/agents/claude-code.d.ts.map +1 -0
  12. package/dist/lib/agents/claude-code.js +203 -0
  13. package/dist/lib/agents/claude-code.js.map +1 -0
  14. package/dist/lib/agents/codex.d.ts +12 -0
  15. package/dist/lib/agents/codex.d.ts.map +1 -0
  16. package/dist/lib/agents/codex.js +247 -0
  17. package/dist/lib/agents/codex.js.map +1 -0
  18. package/dist/lib/agents/index.d.ts +7 -0
  19. package/dist/lib/agents/index.d.ts.map +1 -0
  20. package/dist/lib/agents/index.js +14 -0
  21. package/dist/lib/agents/index.js.map +1 -0
  22. package/dist/lib/agents/registry.d.ts +23 -0
  23. package/dist/lib/agents/registry.d.ts.map +1 -0
  24. package/dist/lib/agents/registry.js +35 -0
  25. package/dist/lib/agents/registry.js.map +1 -0
  26. package/dist/lib/agents/shared.d.ts +47 -0
  27. package/dist/lib/agents/shared.d.ts.map +1 -0
  28. package/dist/lib/agents/shared.js +99 -0
  29. package/dist/lib/agents/shared.js.map +1 -0
  30. package/dist/lib/agents/types.d.ts +69 -0
  31. package/dist/lib/agents/types.d.ts.map +1 -0
  32. package/dist/lib/agents/types.js +5 -0
  33. package/dist/lib/agents/types.js.map +1 -0
  34. package/dist/lib/config.d.ts +34 -0
  35. package/dist/lib/config.d.ts.map +1 -0
  36. package/dist/lib/config.js +117 -0
  37. package/dist/lib/config.js.map +1 -0
  38. package/dist/lib/fixture.d.ts +52 -0
  39. package/dist/lib/fixture.d.ts.map +1 -0
  40. package/dist/lib/fixture.js +175 -0
  41. package/dist/lib/fixture.js.map +1 -0
  42. package/dist/lib/init.d.ts +21 -0
  43. package/dist/lib/init.d.ts.map +1 -0
  44. package/dist/lib/init.js +250 -0
  45. package/dist/lib/init.js.map +1 -0
  46. package/dist/lib/results.d.ts +54 -0
  47. package/dist/lib/results.d.ts.map +1 -0
  48. package/dist/lib/results.js +186 -0
  49. package/dist/lib/results.js.map +1 -0
  50. package/dist/lib/runner.d.ts +43 -0
  51. package/dist/lib/runner.d.ts.map +1 -0
  52. package/dist/lib/runner.js +142 -0
  53. package/dist/lib/runner.js.map +1 -0
  54. package/dist/lib/sandbox.d.ts +117 -0
  55. package/dist/lib/sandbox.d.ts.map +1 -0
  56. package/dist/lib/sandbox.js +248 -0
  57. package/dist/lib/sandbox.js.map +1 -0
  58. package/dist/lib/types.d.ts +166 -0
  59. package/dist/lib/types.d.ts.map +1 -0
  60. package/dist/lib/types.js +14 -0
  61. package/dist/lib/types.js.map +1 -0
  62. package/dist/test-setup.d.ts +2 -0
  63. package/dist/test-setup.d.ts.map +1 -0
  64. package/dist/test-setup.js +6 -0
  65. package/dist/test-setup.js.map +1 -0
  66. package/package.json +58 -0
@@ -0,0 +1,250 @@
1
+ /**
2
+ * Project initialization - create new eval projects.
3
+ */
4
+ import { mkdirSync, writeFileSync, existsSync } from 'fs';
5
+ import { join, dirname } from 'path';
/**
 * Build the root package.json for a freshly scaffolded eval project.
 *
 * @param projectName - Value written to the manifest's `name` field.
 * @returns The manifest serialized as JSON with two-space indentation.
 */
function getPackageJson(projectName) {
  const devDependencies = {
    'agent-eval': '^0.0.1',
    '@types/node': '^22.0.0',
    typescript: '^5.6.0',
    vitest: '^2.1.0',
  };
  const manifest = {
    name: projectName,
    version: '0.0.1',
    private: true,
    type: 'module',
    devDependencies,
  };
  return JSON.stringify(manifest, null, 2);
}
/**
 * Produce the contents of the scaffolded `.env.example` file.
 *
 * @returns The example environment file, one entry per line, newline-terminated.
 */
function getEnvExample() {
  const lines = [
    '# Required - Vercel AI Gateway API key (works for all agents)',
    '# Get yours at: https://vercel.com/dashboard -> AI Gateway',
    'AI_GATEWAY_API_KEY=your-ai-gateway-api-key',
    '',
    '# Required - Vercel token for sandbox access (choose ONE of the options below)',
    '# The @vercel/sandbox package automatically detects either token.',
    '',
    '# Option 1: Personal Access Token (for local development)',
    '# Create at: https://vercel.com/account/tokens',
    'VERCEL_TOKEN=your-vercel-token',
    '',
    '# Option 2: OIDC Token (for CI/CD pipelines like GitHub Actions)',
    "# Automatically provided by Vercel's CI integration",
    '# VERCEL_OIDC_TOKEN=your-oidc-token',
  ];
  // Every line, including the last, is terminated by a newline.
  return lines.map((line) => `${line}\n`).join('');
}
/**
 * Produce the contents of the scaffolded `.gitignore` file.
 *
 * @returns One ignore pattern per line, newline-terminated.
 */
function getGitignore() {
  const patterns = [
    'node_modules/',
    'dist/',
    '.env',
    '.env.local',
    'results/',
    '*.log',
    '.DS_Store',
  ];
  return patterns.map((pattern) => `${pattern}\n`).join('');
}
/**
 * Produce the source of `experiments/cc.ts`, the default experiment
 * configuration that targets the Claude Code agent.
 *
 * @returns TypeScript source text, newline-terminated.
 */
function getCCExperiment() {
  const sourceLines = [
    "import type { ExperimentConfig } from 'agent-eval';",
    '',
    'const config: ExperimentConfig = {',
    "  agent: 'vercel-ai-gateway/claude-code',",
    '  runs: 1,',
    '  earlyExit: true,',
    "  scripts: ['build'],",
    '  timeout: 300,',
    '};',
    '',
    'export default config;',
  ];
  return `${sourceLines.join('\n')}\n`;
}
/**
 * Produce the source of `experiments/codex.ts`, the experiment
 * configuration that targets the Codex agent.
 *
 * @returns TypeScript source text, newline-terminated.
 */
function getCodexExperiment() {
  const header = "import type { ExperimentConfig } from 'agent-eval';\n\n";
  const configFields = [
    "agent: 'vercel-ai-gateway/codex',",
    'runs: 1,',
    'earlyExit: true,',
    "scripts: ['build'],",
    'timeout: 300,',
  ];
  const body = configFields.map((field) => `  ${field}\n`).join('');
  const footer = '};\n\nexport default config;\n';
  return `${header}const config: ExperimentConfig = {\n${body}${footer}`;
}
/**
 * Produce the PROMPT.md of the example `add-greeting` eval fixture.
 *
 * @returns Markdown text describing the task, newline-terminated.
 */
function getExamplePrompt() {
  const task = 'Add a greeting message below the heading that says "Welcome, user!"';
  const requirements = [
    'Add a paragraph element below the h1',
    'The text should be exactly "Welcome, user!"',
    'Keep the existing heading unchanged',
  ];
  const bulletList = requirements.map((item) => `- ${item}\n`).join('');
  return `${task}\n\nRequirements:\n${bulletList}`;
}
/**
 * Produce the EVAL.ts of the example `add-greeting` eval fixture.
 *
 * The generated file holds two vitest tests: one checks that
 * `src/App.tsx` contains the greeting text, the other runs `npm run build`.
 *
 * @returns TypeScript source text, newline-terminated.
 */
function getExampleEval() {
  const importSection = [
    "import { readFileSync } from 'fs';",
    "import { execSync } from 'child_process';",
    "import { test, expect } from 'vitest';",
  ];
  const greetingTest = [
    "test('greeting message exists in source', () => {",
    "  const content = readFileSync('src/App.tsx', 'utf-8');",
    "  expect(content).toContain('Welcome, user!');",
    '});',
  ];
  const buildTest = [
    "test('app still builds', () => {",
    '  // This throws if the build fails',
    "  execSync('npm run build', { stdio: 'pipe' });",
    '});',
  ];
  // Sections are separated by a single blank line.
  const sections = [importSection, greetingTest, buildTest].map((section) => section.join('\n'));
  return `${sections.join('\n\n')}\n`;
}
/**
 * Build the package.json of the example `add-greeting` eval fixture.
 *
 * @returns The manifest serialized as JSON with two-space indentation.
 */
function getExamplePackageJson() {
  const scripts = { build: 'tsc' };
  const dependencies = { react: '^18.0.0' };
  const devDependencies = {
    '@types/react': '^18.0.0',
    typescript: '^5.0.0',
    vitest: '^2.1.0',
  };
  const manifest = {
    name: 'add-greeting',
    type: 'module',
    scripts,
    dependencies,
    devDependencies,
  };
  return JSON.stringify(manifest, null, 2);
}
/**
 * Build the tsconfig.json placed at the root of the scaffolded project.
 *
 * The configuration type-checks the `experiments` directory only and emits
 * no output (`noEmit`).
 *
 * @returns The configuration serialized as JSON with two-space indentation.
 */
function getRootTsconfig() {
  const compilerOptions = {
    target: 'ES2022',
    module: 'NodeNext',
    moduleResolution: 'NodeNext',
    strict: true,
    skipLibCheck: true,
    noEmit: true,
    lib: ['ES2022'],
  };
  const tsconfig = { compilerOptions, include: ['experiments'] };
  return JSON.stringify(tsconfig, null, 2);
}
/**
 * Build the tsconfig.json of the example `add-greeting` eval fixture.
 *
 * The configuration compiles the fixture's `src` directory with the
 * `react-jsx` transform and writes output to `dist`.
 *
 * @returns The configuration serialized as JSON with two-space indentation.
 */
function getExampleTsconfig() {
  const compilerOptions = {
    target: 'ES2020',
    module: 'ESNext',
    moduleResolution: 'bundler',
    jsx: 'react-jsx',
    strict: true,
    outDir: 'dist',
    skipLibCheck: true,
  };
  const tsconfig = { compilerOptions, include: ['src'] };
  return JSON.stringify(tsconfig, null, 2);
}
/**
 * Produce the `src/App.tsx` of the example `add-greeting` eval fixture:
 * a component with a heading and a TODO marker where the greeting belongs.
 *
 * @returns TSX source text, newline-terminated.
 */
function getExampleApp() {
  const componentLines = [
    'export function App() {',
    '  return (',
    '    <div>',
    '      <h1>Hello World</h1>',
    '      {/* TODO: Add greeting message here */}',
    '    </div>',
    '  );',
    '}',
    '',
    'export default App;',
  ];
  return componentLines.map((line) => `${line}\n`).join('');
}
/**
 * Collect every file that makes up a newly scaffolded project.
 *
 * @param projectName - Forwarded to the root package.json template.
 * @returns An ordered list of `{ path, content }` records; each `path` is
 *   relative to the project directory and uses `/` separators.
 */
function getTemplateFiles(projectName) {
  const fixtureDir = 'evals/add-greeting';
  const entries = [
    ['package.json', getPackageJson(projectName)],
    ['tsconfig.json', getRootTsconfig()],
    ['.env.example', getEnvExample()],
    ['.gitignore', getGitignore()],
    ['experiments/cc.ts', getCCExperiment()],
    ['experiments/codex.ts', getCodexExperiment()],
    [`${fixtureDir}/PROMPT.md`, getExamplePrompt()],
    [`${fixtureDir}/EVAL.ts`, getExampleEval()],
    [`${fixtureDir}/package.json`, getExamplePackageJson()],
    [`${fixtureDir}/tsconfig.json`, getExampleTsconfig()],
    [`${fixtureDir}/src/App.tsx`, getExampleApp()],
  ];
  return entries.map(([path, content]) => ({ path, content }));
}
/**
 * Scaffold a new eval project on disk.
 *
 * The project is created at `<targetDir>/<name>`, where `targetDir`
 * falls back to the current working directory when not supplied.
 *
 * @param options - `name` (project name and directory name) and an optional `targetDir`.
 * @returns The path of the created project directory.
 * @throws Error when the project directory already exists.
 */
export function initProject(options) {
  const baseDir = options.targetDir ?? process.cwd();
  const projectDir = join(baseDir, options.name);

  // Refuse to write into an existing directory.
  if (existsSync(projectDir)) {
    throw new Error(`Directory already exists: ${projectDir}`);
  }
  mkdirSync(projectDir, { recursive: true });

  getTemplateFiles(options.name).forEach(({ path, content }) => {
    const destination = join(projectDir, path);
    // Template paths may be nested, so make sure the parent exists first.
    mkdirSync(dirname(destination), { recursive: true });
    writeFileSync(destination, content);
  });

  return projectDir;
}
/**
 * Build the message shown to the user after a project has been created.
 *
 * @param projectDir - Path of the created project, echoed in the first line.
 * @param projectName - Directory name used in the suggested `cd` command.
 * @returns A multi-line message that begins and ends with a newline.
 */
export function getPostInitInstructions(projectDir, projectName) {
  const steps = [
    `cd ${projectName}`,
    'npm install',
    'Copy .env.example to .env and add your API keys',
    'npx agent-eval',
  ];
  const messageLines = [
    '',
    `Project created at: ${projectDir}`,
    '',
    'Next steps:',
    ...steps.map((step, index) => `  ${index + 1}. ${step}`),
    '',
    'For more information, see the documentation at:',
    '  https://github.com/vercel-labs/agent-eval',
    '',
  ];
  return messageLines.join('\n');
}
250
+ //# sourceMappingURL=init.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"init.js","sourceRoot":"","sources":["../../src/lib/init.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAC1D,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AAoBrC;;GAEG;AACH,SAAS,cAAc,CAAC,WAAmB;IACzC,OAAO,IAAI,CAAC,SAAS,CACnB;QACE,IAAI,EAAE,WAAW;QACjB,OAAO,EAAE,OAAO;QAChB,OAAO,EAAE,IAAI;QACb,IAAI,EAAE,QAAQ;QACd,eAAe,EAAE;YACf,YAAY,EAAE,QAAQ;YACtB,aAAa,EAAE,SAAS;YACxB,UAAU,EAAE,QAAQ;YACpB,MAAM,EAAE,QAAQ;SACjB;KACF,EACD,IAAI,EACJ,CAAC,CACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,aAAa;IACpB,OAAO;;;;;;;;;;;;;;CAcR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,YAAY;IACnB,OAAO;;;;;;;CAOR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,eAAe;IACtB,OAAO;;;;;;;;;;;CAWR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB;IACzB,OAAO;;;;;;;;;;;CAWR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB;IACvB,OAAO;;;;;;CAMR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,cAAc;IACrB,OAAO;;;;;;;;;;;;;CAaR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,qBAAqB;IAC5B,OAAO,IAAI,CAAC,SAAS,CACnB;QACE,IAAI,EAAE,cAAc;QACpB,IAAI,EAAE,QAAQ;QACd,OAAO,EAAE;YACP,KAAK,EAAE,KAAK;SACb;QACD,YAAY,EAAE;YACZ,KAAK,EAAE,SAAS;SACjB;QACD,eAAe,EAAE;YACf,cAAc,EAAE,SAAS;YACzB,UAAU,EAAE,QAAQ;YACpB,MAAM,EAAE,QAAQ;SACjB;KACF,EACD,IAAI,EACJ,CAAC,CACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,eAAe;IACtB,OAAO,IAAI,CAAC,SAAS,CACnB;QACE,eAAe,EAAE;YACf,MAAM,EAAE,QAAQ;YAChB,MAAM,EAAE,UAAU;YAClB,gBAAgB,EAAE,UAAU;YAC5B,MAAM,EAAE,IAAI;YACZ,YAAY,EAAE,IAAI;YAClB,MAAM,EAAE,IAAI;YACZ,GAAG,EAAE,CAAC,QAAQ,CAAC;SAChB;QACD,OAAO,EAAE,CAAC,aAAa,CAAC;KACzB,EACD,IAAI,EACJ,CAAC,CACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB;IACzB,OAAO,IAAI,CAAC,SAAS,CACnB;QACE,eAAe,EAAE;YACf,MAAM,EAAE,QAAQ;YAChB,MAAM,EAAE,QAAQ;YAChB,gBAAgB,EAAE,SAAS;YAC3B,GAAG,EAAE,WAAW;YAChB,MAAM,EAAE,IAAI;YACZ,MAAM,EAAE,MAAM;YACd,YAAY,EAAE,IAAI;SACnB;QACD,OAAO,EAAE,CAAC,KAAK,CAAC;KACjB,EACD,IAAI,EACJ,CAAC,CACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,aAAa;IACpB,OAAO;;;;;;;;;;CAUR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,WAAmB;IAC3C,OAAO;QACL,EAAE,IAAI,EAAE,cAAc,EA
AE,OAAO,EAAE,cAAc,CAAC,WAAW,CAAC,EAAE;QAC9D,EAAE,IAAI,EAAE,eAAe,EAAE,OAAO,EAAE,eAAe,EAAE,EAAE;QACrD,EAAE,IAAI,EAAE,cAAc,EAAE,OAAO,EAAE,aAAa,EAAE,EAAE;QAClD,EAAE,IAAI,EAAE,YAAY,EAAE,OAAO,EAAE,YAAY,EAAE,EAAE;QAC/C,EAAE,IAAI,EAAE,mBAAmB,EAAE,OAAO,EAAE,eAAe,EAAE,EAAE;QACzD,EAAE,IAAI,EAAE,sBAAsB,EAAE,OAAO,EAAE,kBAAkB,EAAE,EAAE;QAC/D,EAAE,IAAI,EAAE,8BAA8B,EAAE,OAAO,EAAE,gBAAgB,EAAE,EAAE;QACrE,EAAE,IAAI,EAAE,4BAA4B,EAAE,OAAO,EAAE,cAAc,EAAE,EAAE;QACjE,EAAE,IAAI,EAAE,iCAAiC,EAAE,OAAO,EAAE,qBAAqB,EAAE,EAAE;QAC7E,EAAE,IAAI,EAAE,kCAAkC,EAAE,OAAO,EAAE,kBAAkB,EAAE,EAAE;QAC3E,EAAE,IAAI,EAAE,gCAAgC,EAAE,OAAO,EAAE,aAAa,EAAE,EAAE;KACrE,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,OAAoB;IAC9C,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;IACrD,MAAM,UAAU,GAAG,IAAI,CAAC,SAAS,EAAE,OAAO,CAAC,IAAI,CAAC,CAAC;IAEjD,oCAAoC;IACpC,IAAI,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,2BAA2B;IAC3B,SAAS,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE3C,2BAA2B;IAC3B,MAAM,KAAK,GAAG,gBAAgB,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAC7C,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7C,MAAM,OAAO,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;QAElC,4BAA4B;QAC5B,SAAS,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAExC,aAAa;QACb,aAAa,CAAC,QAAQ,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;IACxC,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,uBAAuB,CAAC,UAAkB,EAAE,WAAmB;IAC7E,OAAO;sBACa,UAAU;;;UAGtB,WAAW;;;;;;;CAOpB,CAAC;AACF,CAAC"}
@@ -0,0 +1,54 @@
1
+ /**
2
+ * Results storage and reporting for eval experiments.
3
+ */
4
+ import type { EvalRunResult, EvalRunData, EvalSummary, ExperimentResults, ResolvedExperimentConfig } from './types.js';
5
+ import type { AgentRunResult } from './agents/types.js';
6
+ /**
7
+ * Convert AgentRunResult to EvalRunData (result + transcript).
+ *
+ * The run status is 'passed' when `agentResult.success` is truthy and 'failed'
+ * otherwise; `duration` is the agent's duration divided by 1000.
+ * `outputContent` is `undefined` when the agent result carries neither test
+ * output nor script results.
8
+ */
9
+ export declare function agentResultToEvalRunData(agentResult: AgentRunResult): EvalRunData;
10
+ /**
11
+ * Create a summary from multiple run data.
+ *
+ * `passRate` is a percentage (0-100) and `meanDuration` is the arithmetic mean
+ * of the runs' durations; both are 0 when `runData` is empty. The `runData`
+ * array itself is stored on the summary as `runs`.
12
+ */
13
+ export declare function createEvalSummary(name: string, runData: EvalRunData[]): EvalSummary;
14
+ /**
15
+ * Create experiment results from eval summaries.
+ *
+ * `startedAt` and `completedAt` are stored as ISO 8601 strings
+ * (`Date.prototype.toISOString()`); `config` and `evals` are passed through.
16
+ */
17
+ export declare function createExperimentResults(config: ResolvedExperimentConfig, evals: EvalSummary[], startedAt: Date, completedAt: Date): ExperimentResults;
18
+ /**
19
+ * Options for saving results.
+ *
+ * Results are written under `<resultsDir>/<experimentName>/<timestamp>/`.
20
+ */
21
+ export interface SaveResultsOptions {
22
+ /** Base directory for results; created if it does not exist */
23
+ resultsDir: string;
24
+ /** Experiment name (used as the subdirectory of `resultsDir`) */
25
+ experimentName: string;
26
+ }
27
+ /**
28
+ * Save experiment results to disk.
29
+ *
30
+ * Creates a directory structure per design:
31
+ *   results/
32
+ *     experiment-name/
33
+ *       2024-01-26T12-00-00Z/      (results.startedAt with ':' replaced by '-')
34
+ *         eval-1/
35
+ *           run-1/
36
+ *             result.json
37
+ *             transcript.jsonl     (only when the run has a transcript)
38
+ *             outputs/
39
+ *           summary.json
+ *
+ * NOTE(review): when `startedAt` comes from `Date.prototype.toISOString()` the
+ * directory name also contains milliseconds (e.g. `2024-01-26T12-00-00.000Z`).
+ *
+ * @returns Path of the timestamped experiment directory that was written.
40
+ */
41
+ export declare function saveResults(results: ExperimentResults, options: SaveResultsOptions): string;
42
+ /**
43
+ * Format results for terminal display.
+ *
+ * Returns a single newline-joined string: one entry per eval (pass count,
+ * pass rate, mean duration) followed by overall totals and the elapsed time
+ * between `startedAt` and `completedAt`. The text is styled with chalk.
44
+ */
45
+ export declare function formatResultsTable(results: ExperimentResults): string;
46
+ /**
47
+ * Format a single eval result for terminal display (used during progress).
+ *
+ * Produces one line: a pass/fail icon, the eval name, `[runNumber/totalRuns]`
+ * and the duration. When the result has an error, its first 50 characters are
+ * appended, followed by `...` if the message was longer.
48
+ */
49
+ export declare function formatRunResult(evalName: string, runNumber: number, totalRuns: number, result: EvalRunResult): string;
50
+ /**
51
+ * Create a progress indicator for running evals.
+ *
+ * Returns the chalk-styled text `Running <evalName> [<runNumber>/<totalRuns>]...`.
52
+ */
53
+ export declare function createProgressDisplay(evalName: string, runNumber: number, totalRuns: number): string;
54
+ //# sourceMappingURL=results.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"results.d.ts","sourceRoot":"","sources":["../../src/lib/results.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,KAAK,EACV,aAAa,EACb,WAAW,EACX,WAAW,EACX,iBAAiB,EACjB,wBAAwB,EACzB,MAAM,YAAY,CAAC;AACpB,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAExD;;GAEG;AACH,wBAAgB,wBAAwB,CAAC,WAAW,EAAE,cAAc,GAAG,WAAW,CA4BjF;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,GAAG,WAAW,CAanF;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,wBAAwB,EAChC,KAAK,EAAE,WAAW,EAAE,EACpB,SAAS,EAAE,IAAI,EACf,WAAW,EAAE,IAAI,GAChB,iBAAiB,CAOnB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,iCAAiC;IACjC,UAAU,EAAE,MAAM,CAAC;IACnB,8CAA8C;IAC9C,cAAc,EAAE,MAAM,CAAC;CACxB;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,WAAW,CACzB,OAAO,EAAE,iBAAiB,EAC1B,OAAO,EAAE,kBAAkB,GAC1B,MAAM,CA8ER;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,OAAO,EAAE,iBAAiB,GAAG,MAAM,CAsCrE;AAED;;GAEG;AACH,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,aAAa,GACpB,MAAM,CAYR;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,SAAS,EAAE,MAAM,GAChB,MAAM,CAER"}
@@ -0,0 +1,186 @@
1
+ /**
2
+ * Results storage and reporting for eval experiments.
3
+ */
4
+ import { mkdirSync, writeFileSync } from 'fs';
5
+ import { join } from 'path';
6
+ import chalk from 'chalk';
/**
 * Convert an AgentRunResult into EvalRunData (result + transcript + outputs).
 *
 * @param agentResult - Raw result reported by the agent run. `duration` is
 *   divided by 1000 here (milliseconds to seconds).
 * @returns The run status ('passed' when `agentResult.success` is truthy,
 *   'failed' otherwise), error and duration, plus the transcript and any
 *   collected output text. `outputContent` is `undefined` when neither the
 *   test run nor any script produced output.
 */
export function agentResultToEvalRunData(agentResult) {
  const outputContent = {};

  // Output of the EVAL.ts test run.
  if (agentResult.testResult?.output) {
    outputContent.eval = agentResult.testResult.output;
  }

  // Script outputs are nested under `scripts` so that a script called "eval"
  // cannot collide with the test output key above.
  const scriptOutputs = {};
  for (const [name, result] of Object.entries(agentResult.scriptsResults ?? {})) {
    if (result?.output) {
      scriptOutputs[name] = result.output;
    }
  }
  // Attach `scripts` only when at least one script produced output; an empty
  // object would otherwise make `outputContent` look populated.
  if (Object.keys(scriptOutputs).length > 0) {
    outputContent.scripts = scriptOutputs;
  }

  return {
    result: {
      status: agentResult.success ? 'passed' : 'failed',
      error: agentResult.error,
      duration: agentResult.duration / 1000, // Convert to seconds
    },
    transcript: agentResult.transcript,
    outputContent: Object.keys(outputContent).length > 0 ? outputContent : undefined,
  };
}
/**
 * Aggregate the runs of one eval into a summary.
 *
 * @param name - Name of the eval.
 * @param runData - Data of every run; stored unchanged on the summary as `runs`.
 * @returns Run counts, `passRate` as a percentage and `meanDuration`;
 *   both derived values are 0 when there are no runs.
 */
export function createEvalSummary(name, runData) {
  const totalRuns = runData.length;
  let passedRuns = 0;
  let durationSum = 0;
  for (const { result } of runData) {
    if (result.status === 'passed') {
      passedRuns += 1;
    }
    durationSum += result.duration;
  }

  const hasRuns = totalRuns > 0;
  return {
    name,
    totalRuns,
    passedRuns,
    passRate: hasRuns ? (passedRuns / totalRuns) * 100 : 0,
    meanDuration: hasRuns ? durationSum / totalRuns : 0,
    runs: runData,
  };
}
/**
 * Assemble the results record of a whole experiment.
 *
 * @param config - Resolved experiment configuration, stored as-is.
 * @param evals - Per-eval summaries, stored as-is.
 * @param startedAt - Start time; serialized with `toISOString()`.
 * @param completedAt - Completion time; serialized with `toISOString()`.
 * @returns The combined experiment results.
 */
export function createExperimentResults(config, evals, startedAt, completedAt) {
  const startedIso = startedAt.toISOString();
  const completedIso = completedAt.toISOString();
  return { startedAt: startedIso, completedAt: completedIso, config, evals };
}
/**
 * Derive a file name inside `outputs/` for a script's captured output.
 *
 * Characters that are path separators or invalid in file names on common
 * platforms are replaced with '-', and a numeric suffix is appended when the
 * name is already taken (for example a script called "eval", which would
 * otherwise overwrite the test output file `eval.txt`).
 */
function toScriptOutputFileName(scriptName, takenNames) {
  const base = scriptName.replace(/[\\/:*?"<>|]/g, '-') || 'script';
  let fileName = `${base}.txt`;
  for (let suffix = 2; takenNames.has(fileName); suffix++) {
    fileName = `${base}-${suffix}.txt`;
  }
  takenNames.add(fileName);
  return fileName;
}

/**
 * Save experiment results to disk.
 *
 * Creates a directory structure per design:
 *   results/
 *     experiment-name/
 *       2024-01-26T12-00-00Z/      (results.startedAt with ':' replaced by '-')
 *         eval-1/
 *           run-1/
 *             result.json
 *             transcript.jsonl     (only when the run has a transcript)
 *             outputs/
 *               eval.txt           (EVAL.ts test output, when present)
 *               <script>.txt       (one file per script that produced output)
 *           summary.json
 *
 * `result.json` records the relative paths of the transcript and output files
 * that were written for that run.
 *
 * @param results - Experiment results to persist.
 * @param options - `resultsDir` (base directory) and `experimentName` (subdirectory).
 * @returns Path of the timestamped experiment directory.
 */
export function saveResults(results, options) {
  // ':' is replaced so the timestamp can be used as a directory name.
  const timestamp = results.startedAt.replace(/:/g, '-');
  const experimentDir = join(options.resultsDir, options.experimentName, timestamp);
  mkdirSync(experimentDir, { recursive: true });

  for (const evalSummary of results.evals) {
    const evalDir = join(experimentDir, evalSummary.name);
    mkdirSync(evalDir, { recursive: true });

    // Per-eval summary (simplified format per design).
    const summaryForFile = {
      totalRuns: evalSummary.totalRuns,
      passedRuns: evalSummary.passedRuns,
      passRate: `${evalSummary.passRate.toFixed(0)}%`,
      meanDuration: evalSummary.meanDuration,
    };
    writeFileSync(join(evalDir, 'summary.json'), JSON.stringify(summaryForFile, null, 2));

    for (let i = 0; i < evalSummary.runs.length; i++) {
      const runData = evalSummary.runs[i];
      const runDir = join(evalDir, `run-${i + 1}`);
      mkdirSync(runDir, { recursive: true });

      // Copy so the caller's result object is not mutated by the path fields.
      const resultWithPaths = { ...runData.result };

      if (runData.transcript) {
        writeFileSync(join(runDir, 'transcript.jsonl'), runData.transcript);
        resultWithPaths.transcriptPath = './transcript.jsonl';
      }

      // outputs/ is always created, even when there is nothing to store in it.
      const outputsDir = join(runDir, 'outputs');
      mkdirSync(outputsDir, { recursive: true });

      if (runData.outputContent) {
        const outputPaths = {};
        // 'eval.txt' is reserved for the test output so no script can claim it.
        const takenNames = new Set(['eval.txt']);

        if (runData.outputContent.eval) {
          writeFileSync(join(outputsDir, 'eval.txt'), runData.outputContent.eval);
          outputPaths.eval = './outputs/eval.txt';
        }

        if (runData.outputContent.scripts) {
          outputPaths.scripts = {};
          for (const [name, content] of Object.entries(runData.outputContent.scripts)) {
            if (content) {
              const fileName = toScriptOutputFileName(name, takenNames);
              writeFileSync(join(outputsDir, fileName), content);
              // Keyed by the original script name; the value is the real file.
              outputPaths.scripts[name] = `./outputs/${fileName}`;
            }
          }
        }

        if (outputPaths.eval || (outputPaths.scripts && Object.keys(outputPaths.scripts).length > 0)) {
          resultWithPaths.outputPaths = outputPaths;
        }
      }

      writeFileSync(join(runDir, 'result.json'), JSON.stringify(resultWithPaths, null, 2));
    }
  }

  return experimentDir;
}
/**
 * Render the experiment results as text for the terminal.
 *
 * The output lists every eval (pass count, pass rate, mean duration),
 * followed by the overall pass rate and the elapsed time between
 * `startedAt` and `completedAt`.
 *
 * @param results - Experiment results to render.
 * @returns A newline-joined, chalk-styled string.
 */
export function formatResultsTable(results) {
  const rule = chalk.gray('─'.repeat(60));

  // Overall totals across all evals.
  let totalRuns = 0;
  let totalPassed = 0;
  for (const summary of results.evals) {
    totalRuns += summary.totalRuns;
    totalPassed += summary.passedRuns;
  }
  const overallPassRate = totalRuns > 0 ? (totalPassed / totalRuns) * 100 : 0;

  // Three lines per eval: headline, mean duration, blank spacer.
  const evalLines = results.evals.flatMap((summary) => {
    const allPassed = summary.passedRuns === summary.totalRuns;
    const paint = allPassed ? chalk.green : chalk.red;
    const icon = allPassed ? '✓' : '✗';
    return [
      paint(`${icon} ${summary.name}: ${summary.passedRuns}/${summary.totalRuns} passed (${summary.passRate.toFixed(0)}%)`),
      chalk.gray(`  Mean duration: ${summary.meanDuration.toFixed(1)}s`),
      '',
    ];
  });

  // Green only for a perfect score, yellow from 50% upwards, red below.
  let overallPaint = chalk.red;
  if (overallPassRate === 100) {
    overallPaint = chalk.green;
  } else if (overallPassRate >= 50) {
    overallPaint = chalk.yellow;
  }

  const elapsedSeconds =
    (new Date(results.completedAt).getTime() - new Date(results.startedAt).getTime()) / 1000;

  return [
    '',
    chalk.bold('Experiment Results'),
    rule,
    '',
    ...evalLines,
    rule,
    '',
    overallPaint(`Overall: ${totalPassed}/${totalRuns} passed (${overallPassRate.toFixed(0)}%)`),
    chalk.gray(`Total time: ${elapsedSeconds.toFixed(1)}s`),
    '',
  ].join('\n');
}
/**
 * Render the outcome of one run as a single terminal line (shown while an
 * experiment is in progress).
 *
 * @param evalName - Name of the eval.
 * @param runNumber - Number of this run.
 * @param totalRuns - Total number of runs for the eval.
 * @param result - Run result; `error`, when set, is shown truncated to 50
 *   characters with a trailing `...` if it was longer.
 * @returns The chalk-styled line.
 */
export function formatRunResult(evalName, runNumber, totalRuns, result) {
  const passed = result.status === 'passed';
  const paint = passed ? chalk.green : chalk.red;
  const icon = passed ? '✓' : '✗';

  const segments = [
    paint(`${icon} ${evalName} [${runNumber}/${totalRuns}]`),
    chalk.gray(` (${result.duration.toFixed(1)}s)`),
  ];

  if (result.error) {
    const ellipsis = result.error.length > 50 ? '...' : '';
    segments.push(chalk.red(` - ${result.error.slice(0, 50)}${ellipsis}`));
  }

  return segments.join('');
}
/**
 * Build the "currently running" indicator for an eval run.
 *
 * @param evalName - Name of the eval.
 * @param runNumber - Number of the run that is starting.
 * @param totalRuns - Total number of runs for the eval.
 * @returns The chalk-styled (blue) progress text.
 */
export function createProgressDisplay(evalName, runNumber, totalRuns) {
  const label = `Running ${evalName} [${runNumber}/${totalRuns}]...`;
  return chalk.blue(label);
}
186
+ //# sourceMappingURL=results.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"results.js","sourceRoot":"","sources":["../../src/lib/results.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AAC9C,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAC5B,OAAO,KAAK,MAAM,OAAO,CAAC;AAU1B;;GAEG;AACH,MAAM,UAAU,wBAAwB,CAAC,WAA2B;IAClE,gDAAgD;IAChD,MAAM,aAAa,GAAiC,EAAE,CAAC;IAEvD,0BAA0B;IAC1B,IAAI,WAAW,CAAC,UAAU,EAAE,MAAM,EAAE,CAAC;QACnC,aAAa,CAAC,IAAI,GAAG,WAAW,CAAC,UAAU,CAAC,MAAM,CAAC;IACrD,CAAC;IAED,qEAAqE;IACrE,IAAI,WAAW,CAAC,cAAc,IAAI,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,cAAc,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrF,aAAa,CAAC,OAAO,GAAG,EAAE,CAAC;QAC3B,KAAK,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,cAAc,CAAC,EAAE,CAAC;YACxE,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;gBAClB,aAAa,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;YAC9C,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO;QACL,MAAM,EAAE;YACN,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ;YACjD,KAAK,EAAE,WAAW,CAAC,KAAK;YACxB,QAAQ,EAAE,WAAW,CAAC,QAAQ,GAAG,IAAI,EAAE,qBAAqB;SAC7D;QACD,UAAU,EAAE,WAAW,CAAC,UAAU;QAClC,aAAa,EAAE,MAAM,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,SAAS;KACjF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAAY,EAAE,OAAsB;IACpE,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAC1C,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,MAAM,CAAC;IACpE,MAAM,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;IAEnE,OAAO;QACL,IAAI;QACJ,SAAS,EAAE,IAAI,CAAC,MAAM;QACtB,UAAU;QACV,QAAQ,EAAE,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;QAChE,YAAY,EAAE,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;QAC/D,IAAI,EAAE,OAAO;KACd,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACrC,MAAgC,EAChC,KAAoB,EACpB,SAAe,EACf,WAAiB;IAEjB,OAAO;QACL,SAAS,EAAE,SAAS,CAAC,WAAW,E
AAE;QAClC,WAAW,EAAE,WAAW,CAAC,WAAW,EAAE;QACtC,MAAM;QACN,KAAK;KACN,CAAC;AACJ,CAAC;AAYD;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,WAAW,CACzB,OAA0B,EAC1B,OAA2B;IAE3B,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IACvD,MAAM,aAAa,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,cAAc,EAAE,SAAS,CAAC,CAAC;IAElF,8BAA8B;IAC9B,SAAS,CAAC,aAAa,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE9C,wBAAwB;IACxB,KAAK,MAAM,WAAW,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QACxC,MAAM,OAAO,GAAG,IAAI,CAAC,aAAa,EAAE,WAAW,CAAC,IAAI,CAAC,CAAC;QACtD,SAAS,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAExC,8CAA8C;QAC9C,MAAM,cAAc,GAAG;YACrB,SAAS,EAAE,WAAW,CAAC,SAAS;YAChC,UAAU,EAAE,WAAW,CAAC,UAAU;YAClC,QAAQ,EAAE,GAAG,WAAW,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;YAC/C,YAAY,EAAE,WAAW,CAAC,YAAY;SACvC,CAAC;QACF,aAAa,CACX,IAAI,CAAC,OAAO,EAAE,cAAc,CAAC,EAC7B,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC,CACxC,CAAC;QAEF,8BAA8B;QAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjD,MAAM,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACpC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC7C,SAAS,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAEvC,8BAA8B;YAC9B,MAAM,eAAe,GAAG,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;YAE9C,qCAAqC;YACrC,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;gBACvB,aAAa,CAAC,IAAI,CAAC,MAAM,EAAE,kBAAkB,CAAC,EAAE,OAAO,CAAC,UAAU,CAAC,CAAC;gBACpE,eAAe,CAAC,cAAc,GAAG,oBAAoB,CAAC;YACxD,CAAC;YAED,uCAAuC;YACvC,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;YAC3C,SAAS,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAE3C,IAAI,OAAO,CAAC,aAAa,EAAE,CAAC;gBAC1B,MAAM,WAAW,GAAiC,EAAE,CAAC;gBAErD,2BAA2B;gBAC3B,IAAI,OAAO,CAAC,aAAa,CAAC,IAAI,EAAE,CAAC;oBAC/B,aAAa,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;oBACxE,WAAW,CAAC,IAAI,GAAG,oBAAoB,CAAC;gBAC1C,CAAC;gBAED,sDAAsD;gBACtD,IAAI,OAAO,CAAC,aAAa,CAAC,OAAO,EAAE,CAAC;oBAClC,WAAW,CAAC,OAAO,GAAG,EAAE,CAAC;oBACzB,KAAK,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,O
AAO,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,CAAC;wBAC5E,IAAI,OAAO,EAAE,CAAC;4BACZ,MAAM,QAAQ,GAAG,GAAG,IAAI,MAAM,CAAC;4BAC/B,aAAa,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAC;4BACnD,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,aAAa,QAAQ,EAAE,CAAC;wBACtD,CAAC;oBACH,CAAC;gBACH,CAAC;gBAED,IAAI,WAAW,CAAC,IAAI,IAAI,CAAC,WAAW,CAAC,OAAO,IAAI,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,CAAC;oBAC7F,eAAe,CAAC,WAAW,GAAG,WAAW,CAAC;gBAC5C,CAAC;YACH,CAAC;YAED,8BAA8B;YAC9B,aAAa,CACX,IAAI,CAAC,MAAM,EAAE,aAAa,CAAC,EAC3B,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,IAAI,EAAE,CAAC,CAAC,CACzC,CAAC;QACJ,CAAC;IACH,CAAC;IAED,OAAO,aAAa,CAAC;AACvB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,OAA0B;IAC3D,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,MAAM,SAAS,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;IAEjC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,CAAC;IAC7C,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;IAClC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,0BAA0B;IAC1B,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;IACzE,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;IAC5E,MAAM,eAAe,GAAG,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,GAAG,SAAS,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAE5E,KAAK,MAAM,WAAW,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QACxC,MAAM,QAAQ,GAAG,WAAW,CAAC,UAAU,KAAK,WAAW,CAAC,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;QAC9E,MAAM,SAAS,GAAG,WAAW,CAAC,UAAU,KAAK,WAAW,CAAC,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;QAE7F,KAAK,CAAC,IAAI,CACR,SAAS,CACP,GAAG,QAAQ,IAAI,WAAW,CAAC,IAAI,KAAK,WAAW,CAAC,UAAU,IAAI,WAAW,CAAC,SAAS,YAAY,WAAW,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CACnI,CACF,CAAC;QACF,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,oBAAoB,WAAW,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QACnF,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,KAAK,CAAC,IAAI,CAAC
,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;IAClC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,MAAM,YAAY,GAAG,eAAe,KAAK,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;IAC9G,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,YAAY,WAAW,IAAI,SAAS,YAAY,eAAe,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;IAEzG,MAAM,QAAQ,GAAG,CAAC,IAAI,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,OAAO,EAAE,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,OAAO,EAAE,CAAC,GAAG,IAAI,CAAC;IAC1G,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,eAAe,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC9D,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe,CAC7B,QAAgB,EAChB,SAAiB,EACjB,SAAiB,EACjB,MAAqB;IAErB,MAAM,IAAI,GAAG,MAAM,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IACpD,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;IAEnE,IAAI,IAAI,GAAG,KAAK,CAAC,GAAG,IAAI,IAAI,QAAQ,KAAK,SAAS,IAAI,SAAS,GAAG,CAAC,CAAC;IACpE,IAAI,IAAI,KAAK,CAAC,IAAI,CAAC,KAAK,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IAExD,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;QACjB,IAAI,IAAI,KAAK,CAAC,GAAG,CAAC,MAAM,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC/F,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACnC,QAAgB,EAChB,SAAiB,EACjB,SAAiB;IAEjB,OAAO,KAAK,CAAC,IAAI,CAAC,WAAW,QAAQ,KAAK,SAAS,IAAI,SAAS,MAAM,CAAC,CAAC;AAC1E,CAAC"}
@@ -0,0 +1,43 @@
1
+ /**
2
+ * Experiment runner - orchestrates running evals against agent.
3
+ * All evals and attempts run concurrently for maximum throughput.
4
+ * With earlyExit, in-flight attempts are aborted when one passes.
5
+ */
6
+ import type { ResolvedExperimentConfig, EvalFixture, EvalRunData, ExperimentResults } from './types.js';
7
+ /**
8
+ * Options for running an experiment.
+ *
+ * NOTE(review): `resultsDir` and `experimentName` mirror SaveResultsOptions and
+ * are presumably forwarded to saveResults - confirm in runner.js.
9
+ */
10
+ export interface RunExperimentOptions {
11
+ /** Resolved experiment configuration */
12
+ config: ResolvedExperimentConfig;
13
+ /** Eval fixtures to run */
14
+ fixtures: EvalFixture[];
15
+ /** API key for the agent */
16
+ apiKey: string;
17
+ /** Directory to save results */
18
+ resultsDir: string;
19
+ /** Experiment name */
20
+ experimentName: string;
21
+ /** Optional callback invoked with a progress message */
22
+ onProgress?: (message: string) => void;
23
+ /** Whether to run in verbose mode (optional) */
24
+ verbose?: boolean;
25
+ }
26
+ /**
27
+ * Run an experiment - execute all evals with configured runs concurrently.
28
+ * With earlyExit enabled, remaining attempts for a fixture are aborted once one passes.
+ *
+ * Resolves with the ExperimentResults of the whole experiment.
29
+ */
30
+ export declare function runExperiment(options: RunExperimentOptions): Promise<ExperimentResults>;
31
+ /**
32
+ * Run a single eval (for testing/debugging).
+ *
+ * Takes one fixture plus a subset of the experiment settings and resolves with
+ * the EvalRunData of that single run.
+ * NOTE(review): the unit of `timeout` is not visible in this declaration; the
+ * scaffolded experiment configs use `timeout: 300`, presumably seconds - confirm.
33
+ */
34
+ export declare function runSingleEval(fixture: EvalFixture, options: {
35
+ agent?: ResolvedExperimentConfig['agent'];
36
+ model: ResolvedExperimentConfig['model'];
37
+ timeout: number;
38
+ apiKey: string;
39
+ setup?: ResolvedExperimentConfig['setup'];
40
+ scripts?: string[];
41
+ verbose?: boolean;
42
+ }): Promise<EvalRunData>;
43
+ //# sourceMappingURL=runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/lib/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EACV,wBAAwB,EACxB,WAAW,EACX,WAAW,EAEX,iBAAiB,EAClB,MAAM,YAAY,CAAC;AAYpB;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,wCAAwC;IACxC,MAAM,EAAE,wBAAwB,CAAC;IACjC,sBAAsB;IACtB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,gCAAgC;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,sBAAsB;IACtB,cAAc,EAAE,MAAM,CAAC;IACvB,oCAAoC;IACpC,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;IACvC,qCAAqC;IACrC,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAoBD;;;GAGG;AACH,wBAAsB,aAAa,CACjC,OAAO,EAAE,oBAAoB,GAC5B,OAAO,CAAC,iBAAiB,CAAC,CA2I5B;AAED;;GAEG;AACH,wBAAsB,aAAa,CACjC,OAAO,EAAE,WAAW,EACpB,OAAO,EAAE;IACP,KAAK,CAAC,EAAE,wBAAwB,CAAC,OAAO,CAAC,CAAC;IAC1C,KAAK,EAAE,wBAAwB,CAAC,OAAO,CAAC,CAAC;IACzC,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,wBAAwB,CAAC,OAAO,CAAC,CAAC;IAC1C,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB,GACA,OAAO,CAAC,WAAW,CAAC,CAatB"}