agent-eval-opencode 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +590 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +29 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/agents/claude-code.d.ts +12 -0
- package/dist/lib/agents/claude-code.d.ts.map +1 -0
- package/dist/lib/agents/claude-code.js +231 -0
- package/dist/lib/agents/claude-code.js.map +1 -0
- package/dist/lib/agents/codex.d.ts +12 -0
- package/dist/lib/agents/codex.d.ts.map +1 -0
- package/dist/lib/agents/codex.js +267 -0
- package/dist/lib/agents/codex.js.map +1 -0
- package/dist/lib/agents/cursor.d.ts +10 -0
- package/dist/lib/agents/cursor.d.ts.map +1 -0
- package/dist/lib/agents/cursor.js +204 -0
- package/dist/lib/agents/cursor.js.map +1 -0
- package/dist/lib/agents/gemini.d.ts +10 -0
- package/dist/lib/agents/gemini.d.ts.map +1 -0
- package/dist/lib/agents/gemini.js +207 -0
- package/dist/lib/agents/gemini.js.map +1 -0
- package/dist/lib/agents/index.d.ts +7 -0
- package/dist/lib/agents/index.d.ts.map +1 -0
- package/dist/lib/agents/index.js +20 -0
- package/dist/lib/agents/index.js.map +1 -0
- package/dist/lib/agents/opencode.d.ts +11 -0
- package/dist/lib/agents/opencode.d.ts.map +1 -0
- package/dist/lib/agents/opencode.js +245 -0
- package/dist/lib/agents/opencode.js.map +1 -0
- package/dist/lib/agents/registry.d.ts +23 -0
- package/dist/lib/agents/registry.d.ts.map +1 -0
- package/dist/lib/agents/registry.js +35 -0
- package/dist/lib/agents/registry.js.map +1 -0
- package/dist/lib/agents/shared.d.ts +83 -0
- package/dist/lib/agents/shared.d.ts.map +1 -0
- package/dist/lib/agents/shared.js +192 -0
- package/dist/lib/agents/shared.js.map +1 -0
- package/dist/lib/agents/types.d.ts +73 -0
- package/dist/lib/agents/types.d.ts.map +1 -0
- package/dist/lib/agents/types.js +5 -0
- package/dist/lib/agents/types.js.map +1 -0
- package/dist/lib/classifier.d.ts +89 -0
- package/dist/lib/classifier.d.ts.map +1 -0
- package/dist/lib/classifier.js +285 -0
- package/dist/lib/classifier.js.map +1 -0
- package/dist/lib/config.d.ts +37 -0
- package/dist/lib/config.d.ts.map +1 -0
- package/dist/lib/config.js +187 -0
- package/dist/lib/config.js.map +1 -0
- package/dist/lib/dashboard.d.ts +65 -0
- package/dist/lib/dashboard.d.ts.map +1 -0
- package/dist/lib/dashboard.js +237 -0
- package/dist/lib/dashboard.js.map +1 -0
- package/dist/lib/docker-sandbox.d.ts +92 -0
- package/dist/lib/docker-sandbox.d.ts.map +1 -0
- package/dist/lib/docker-sandbox.js +375 -0
- package/dist/lib/docker-sandbox.js.map +1 -0
- package/dist/lib/fingerprint.d.ts +15 -0
- package/dist/lib/fingerprint.d.ts.map +1 -0
- package/dist/lib/fingerprint.js +59 -0
- package/dist/lib/fingerprint.js.map +1 -0
- package/dist/lib/fixture.d.ts +55 -0
- package/dist/lib/fixture.d.ts.map +1 -0
- package/dist/lib/fixture.js +215 -0
- package/dist/lib/fixture.js.map +1 -0
- package/dist/lib/housekeeping.d.ts +26 -0
- package/dist/lib/housekeeping.d.ts.map +1 -0
- package/dist/lib/housekeeping.js +170 -0
- package/dist/lib/housekeeping.js.map +1 -0
- package/dist/lib/init.d.ts +21 -0
- package/dist/lib/init.d.ts.map +1 -0
- package/dist/lib/init.js +275 -0
- package/dist/lib/init.js.map +1 -0
- package/dist/lib/o11y/index.d.ts +13 -0
- package/dist/lib/o11y/index.d.ts.map +1 -0
- package/dist/lib/o11y/index.js +13 -0
- package/dist/lib/o11y/index.js.map +1 -0
- package/dist/lib/o11y/parsers/claude-code.d.ts +18 -0
- package/dist/lib/o11y/parsers/claude-code.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/claude-code.js +343 -0
- package/dist/lib/o11y/parsers/claude-code.js.map +1 -0
- package/dist/lib/o11y/parsers/codex.d.ts +17 -0
- package/dist/lib/o11y/parsers/codex.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/codex.js +364 -0
- package/dist/lib/o11y/parsers/codex.js.map +1 -0
- package/dist/lib/o11y/parsers/cursor.d.ts +21 -0
- package/dist/lib/o11y/parsers/cursor.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/cursor.js +226 -0
- package/dist/lib/o11y/parsers/cursor.js.map +1 -0
- package/dist/lib/o11y/parsers/gemini.d.ts +21 -0
- package/dist/lib/o11y/parsers/gemini.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/gemini.js +241 -0
- package/dist/lib/o11y/parsers/gemini.js.map +1 -0
- package/dist/lib/o11y/parsers/index.d.ts +55 -0
- package/dist/lib/o11y/parsers/index.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/index.js +284 -0
- package/dist/lib/o11y/parsers/index.js.map +1 -0
- package/dist/lib/o11y/parsers/opencode.d.ts +17 -0
- package/dist/lib/o11y/parsers/opencode.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/opencode.js +320 -0
- package/dist/lib/o11y/parsers/opencode.js.map +1 -0
- package/dist/lib/o11y/types.d.ts +113 -0
- package/dist/lib/o11y/types.d.ts.map +1 -0
- package/dist/lib/o11y/types.js +6 -0
- package/dist/lib/o11y/types.js.map +1 -0
- package/dist/lib/results.d.ts +91 -0
- package/dist/lib/results.d.ts.map +1 -0
- package/dist/lib/results.js +361 -0
- package/dist/lib/results.js.map +1 -0
- package/dist/lib/runner.d.ts +71 -0
- package/dist/lib/runner.d.ts.map +1 -0
- package/dist/lib/runner.js +267 -0
- package/dist/lib/runner.js.map +1 -0
- package/dist/lib/sandbox.d.ts +173 -0
- package/dist/lib/sandbox.d.ts.map +1 -0
- package/dist/lib/sandbox.js +337 -0
- package/dist/lib/sandbox.js.map +1 -0
- package/dist/lib/types.d.ts +258 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +15 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/test-setup.d.ts +2 -0
- package/dist/test-setup.d.ts.map +1 -0
- package/dist/test-setup.js +6 -0
- package/dist/test-setup.js.map +1 -0
- package/package.json +72 -0
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Results storage and reporting for eval experiments.
|
|
3
|
+
*/
|
|
4
|
+
import { mkdirSync, writeFileSync, readdirSync, readFileSync, existsSync, statSync, unlinkSync, } from 'fs';
|
|
5
|
+
import { join, dirname } from 'path';
|
|
6
|
+
import chalk from 'chalk';
|
|
7
|
+
import { parseTranscript } from './o11y/index.js';
|
|
8
|
+
import { isNonModelFailure } from './classifier.js';
|
|
9
|
+
import { readFixtureFiles } from './fixture.js';
|
|
10
|
+
/**
 * Convert an AgentRunResult into EvalRunData (result + transcript + outputs).
 *
 * @param agentResult - Outcome of a single agent run. The fields read here are
 *   `success`, `error`, `duration`, `transcript`, `testResult`,
 *   `scriptsResults`, `generatedFiles` and `deletedFiles`.
 * @returns An object with the run `result` (status, error, and the duration
 *   divided by 1000 to express it in seconds), the raw `transcript`, the
 *   collected `outputContent` (or `undefined` when nothing was collected) and
 *   the generated/deleted file information passed through unchanged.
 */
export function agentResultToEvalRunData(agentResult) {
    // Output of the EVAL.ts test run, if there was any.
    const evalOutput = agentResult.testResult?.output;
    // Script outputs live under their own `scripts` key so that a script
    // called "eval" cannot collide with the EVAL.ts output.
    let scriptOutputs;
    const scriptsResults = agentResult.scriptsResults;
    if (scriptsResults && Object.keys(scriptsResults).length > 0) {
        scriptOutputs = {};
        for (const scriptName of Object.keys(scriptsResults)) {
            const scriptOutput = scriptsResults[scriptName].output;
            if (scriptOutput) {
                scriptOutputs[scriptName] = scriptOutput;
            }
        }
    }
    const outputContent = {
        ...(evalOutput ? { eval: evalOutput } : {}),
        ...(scriptOutputs ? { scripts: scriptOutputs } : {}),
    };
    const hasOutput = Object.keys(outputContent).length > 0;
    const status = agentResult.success ? 'passed' : 'failed';
    return {
        result: {
            status,
            error: agentResult.error,
            duration: agentResult.duration / 1000, // Convert to seconds
        },
        transcript: agentResult.transcript,
        outputContent: hasOutput ? outputContent : undefined,
        generatedFiles: agentResult.generatedFiles,
        deletedFiles: agentResult.deletedFiles,
    };
}
|
|
42
|
+
/**
 * Aggregate the data of several runs of one eval into a summary.
 *
 * @param name - Name of the eval.
 * @param runData - Run data entries; each entry's `result.status` and
 *   `result.duration` are read.
 * @returns Summary with the run count, the number of runs whose status is
 *   `'passed'`, the pass rate as a percentage (0-100), the mean duration, and
 *   the original `runData` under `runs`. Rate and mean are 0 when there are
 *   no runs.
 */
export function createEvalSummary(name, runData) {
    const totalRuns = runData.length;
    let passedRuns = 0;
    let totalDuration = 0;
    runData.forEach((run) => {
        const result = run.result;
        if (result.status === 'passed') {
            passedRuns += 1;
        }
        totalDuration = totalDuration + result.duration;
    });
    const hasRuns = totalRuns > 0;
    return {
        name,
        totalRuns,
        passedRuns,
        passRate: hasRuns ? (passedRuns / totalRuns) * 100 : 0,
        meanDuration: hasRuns ? totalDuration / totalRuns : 0,
        runs: runData,
    };
}
|
|
58
|
+
/**
 * Bundle eval summaries and the experiment config into an experiment result.
 *
 * @param config - Experiment configuration, stored as-is.
 * @param evals - Eval summaries, stored as-is.
 * @param startedAt - Start time; serialized with `toISOString()`.
 * @param completedAt - Completion time; serialized with `toISOString()`.
 * @returns Object with ISO `startedAt` / `completedAt` strings, `config` and `evals`.
 */
export function createExperimentResults(config, evals, startedAt, completedAt) {
    const startedIso = startedAt.toISOString();
    const completedIso = completedAt.toISOString();
    return { startedAt: startedIso, completedAt: completedIso, config, evals };
}
|
|
69
|
+
/**
 * Persist experiment results to disk and return the directory they were
 * written to.
 *
 * Layout produced (per design):
 *   results/
 *     experiment-name/
 *       2024-01-26T12-00-00Z/
 *         eval-1/
 *           run-1/
 *             result.json
 *             transcript.json       (parsed/structured - primary format)
 *             transcript-raw.jsonl  (raw agent output - for debugging)
 *             outputs/
 *             project/              (only when copyFiles is set and not 'none')
 *           summary.json
 *
 * @param results - Experiment results; `startedAt`, `config` (`model`,
 *   `agent`, `copyFiles`) and `evals` are read.
 * @param options - `resultsDir` and `experimentName` select the target
 *   directory; optional `fingerprints`, `validity`, `fixturePaths` (each keyed
 *   by eval name) and `smoke` influence what is written.
 * @returns Path of the timestamped experiment directory.
 */
export function saveResults(results, options) {
    // Colons are replaced with dashes before the start time is used as a directory name.
    const experimentDir = join(options.resultsDir, options.experimentName, results.startedAt.replace(/:/g, '-'));
    mkdirSync(experimentDir, { recursive: true });
    // Pretty-print a value as JSON into a file.
    const writeJson = (filePath, value) => {
        writeFileSync(filePath, JSON.stringify(value, null, 2));
    };
    // Write a file below baseDir, creating intermediate directories first.
    const writeUnder = (baseDir, relativePath, content) => {
        const target = join(baseDir, relativePath);
        mkdirSync(dirname(target), { recursive: true });
        writeFileSync(target, content);
    };
    for (const evalSummary of results.evals) {
        const evalName = evalSummary.name;
        const evalDir = join(experimentDir, evalName);
        mkdirSync(evalDir, { recursive: true });
        // summary.json (simplified format per design); optional flags are
        // only present when they apply.
        const fingerprint = options.fingerprints?.[evalName];
        const markedInvalid = options.validity?.[evalName] === false;
        const summaryForFile = {
            totalRuns: evalSummary.totalRuns,
            passedRuns: evalSummary.passedRuns,
            passRate: `${evalSummary.passRate.toFixed(0)}%`,
            meanDuration: evalSummary.meanDuration,
            ...(fingerprint ? { fingerprint } : {}),
            ...(markedInvalid ? { valid: false } : {}),
            ...(options.smoke ? { smoke: true } : {}),
        };
        writeJson(join(evalDir, 'summary.json'), summaryForFile);
        for (const [index, runData] of evalSummary.runs.entries()) {
            // Run directories are numbered from 1.
            const runDir = join(evalDir, `run-${index + 1}`);
            mkdirSync(runDir, { recursive: true });
            const model = results.config.model;
            // result.json content: the run result plus model, relative paths and o11y summary.
            const resultWithPaths = { ...runData.result, model };
            if (runData.transcript) {
                const transcript = parseTranscript(runData.transcript, results.config.agent, model);
                // Parsed transcript is the primary format; the raw one is kept for debugging.
                writeJson(join(runDir, 'transcript.json'), transcript);
                writeFileSync(join(runDir, 'transcript-raw.jsonl'), runData.transcript);
                Object.assign(resultWithPaths, {
                    transcriptPath: './transcript.json',
                    transcriptRawPath: './transcript-raw.jsonl',
                    // Summary is duplicated into result.json for quick access.
                    o11y: transcript.summary,
                });
            }
            // outputs/ is always created, even when there is nothing to put in it.
            const outputsDir = join(runDir, 'outputs');
            mkdirSync(outputsDir, { recursive: true });
            const outputContent = runData.outputContent;
            if (outputContent) {
                let evalOutputPath;
                if (outputContent.eval) {
                    writeFileSync(join(outputsDir, 'eval.txt'), outputContent.eval);
                    evalOutputPath = './outputs/eval.txt';
                }
                // Script outputs go under outputs/scripts/ to avoid collision with eval.txt.
                let scriptOutputPaths;
                if (outputContent.scripts) {
                    const scriptsDir = join(outputsDir, 'scripts');
                    mkdirSync(scriptsDir, { recursive: true });
                    scriptOutputPaths = {};
                    for (const [scriptName, content] of Object.entries(outputContent.scripts)) {
                        if (!content) {
                            continue;
                        }
                        const fileName = `${scriptName}.txt`;
                        writeFileSync(join(scriptsDir, fileName), content);
                        scriptOutputPaths[scriptName] = `./outputs/scripts/${fileName}`;
                    }
                }
                const wroteScripts = scriptOutputPaths !== undefined && Object.keys(scriptOutputPaths).length > 0;
                if (evalOutputPath || wroteScripts) {
                    resultWithPaths.outputPaths = {
                        ...(evalOutputPath ? { eval: evalOutputPath } : {}),
                        ...(scriptOutputPaths ? { scripts: scriptOutputPaths } : {}),
                    };
                }
            }
            // Copy project files according to the copyFiles setting; nothing
            // is copied unless the run generated or deleted at least one file.
            const copyFiles = results.config.copyFiles;
            const generatedFiles = runData.generatedFiles;
            const deletedFiles = runData.deletedFiles;
            const hasGeneratedFiles = generatedFiles && Object.keys(generatedFiles).length > 0;
            const hasDeletedFiles = deletedFiles && deletedFiles.length > 0;
            const copyEnabled = copyFiles && copyFiles !== 'none';
            if (copyEnabled && (hasGeneratedFiles || hasDeletedFiles)) {
                const projectDir = join(runDir, 'project');
                mkdirSync(projectDir, { recursive: true });
                if (copyFiles === 'all') {
                    // 'all' mode starts from the original fixture files ...
                    const fixturePath = options.fixturePaths?.[evalSummary.name];
                    if (fixturePath) {
                        for (const [filePath, content] of readFixtureFiles(fixturePath)) {
                            writeUnder(projectDir, filePath, content);
                        }
                    }
                    // ... and then removes the files the run deleted.
                    if (deletedFiles) {
                        for (const filePath of deletedFiles) {
                            const target = join(projectDir, filePath);
                            if (existsSync(target)) {
                                unlinkSync(target);
                            }
                        }
                    }
                }
                // Generated files are written last, so in 'all' mode they
                // overwrite fixture originals with the same path.
                if (generatedFiles) {
                    for (const [filePath, content] of Object.entries(generatedFiles)) {
                        writeUnder(projectDir, filePath, content);
                    }
                }
            }
            writeJson(join(runDir, 'result.json'), resultWithPaths);
        }
    }
    return experimentDir;
}
|
|
211
|
+
/**
 * Render experiment results as a multi-line string for the terminal.
 *
 * The output consists of a header, one block per eval (pass line plus mean
 * duration), and a footer with the overall pass count and the wall-clock time
 * between `results.startedAt` and `results.completedAt`.
 *
 * @param results - Experiment results; `evals`, `startedAt` and `completedAt` are read.
 * @returns The formatted text, lines joined with `\n`.
 */
export function formatResultsTable(results) {
    const separator = '─'.repeat(60);
    const lines = ['', chalk.bold('Experiment Results'), chalk.gray(separator), ''];
    // Totals across every eval.
    let totalRuns = 0;
    let totalPassed = 0;
    results.evals.forEach((e) => {
        totalRuns = totalRuns + e.totalRuns;
        totalPassed = totalPassed + e.passedRuns;
    });
    const overallPassRate = totalRuns > 0 ? (totalPassed / totalRuns) * 100 : 0;
    for (const evalSummary of results.evals) {
        // An eval is shown as passing only when every one of its runs passed.
        const allPassed = evalSummary.passedRuns === evalSummary.totalRuns;
        const paint = allPassed ? chalk.green : chalk.red;
        const passIcon = allPassed ? '✓' : '✗';
        lines.push(paint(`${passIcon} ${evalSummary.name}: ${evalSummary.passedRuns}/${evalSummary.totalRuns} passed (${evalSummary.passRate.toFixed(0)}%)`));
        lines.push(chalk.gray(`  Mean duration: ${evalSummary.meanDuration.toFixed(1)}s`));
        lines.push('');
    }
    lines.push(chalk.gray(separator));
    lines.push('');
    // Overall line: green at 100%, yellow from 50%, red below that.
    let overallColor;
    if (overallPassRate === 100) {
        overallColor = chalk.green;
    }
    else if (overallPassRate >= 50) {
        overallColor = chalk.yellow;
    }
    else {
        overallColor = chalk.red;
    }
    lines.push(overallColor(`Overall: ${totalPassed}/${totalRuns} passed (${overallPassRate.toFixed(0)}%)`));
    // Elapsed wall-clock time in seconds.
    const startedMs = new Date(results.startedAt).getTime();
    const completedMs = new Date(results.completedAt).getTime();
    const duration = (completedMs - startedMs) / 1000;
    lines.push(chalk.gray(`Total time: ${duration.toFixed(1)}s`));
    lines.push('');
    return lines.join('\n');
}
|
|
249
|
+
/**
 * Format the outcome of one run as a single terminal line (used while
 * progress is being reported).
 *
 * @param evalName - Name of the eval.
 * @param runNumber - Number of this run, shown as `[runNumber/totalRuns]`.
 * @param totalRuns - Total number of runs.
 * @param result - Run result; `status`, `duration` and `error` are read.
 * @param context - Optional `experimentName`, `agent` and `model` used to
 *   prefix and annotate the line.
 * @returns The colored line. Error text is cut to 200 characters, followed by
 *   `...` when it was longer.
 */
export function formatRunResult(evalName, runNumber, totalRuns, result, context) {
    const passed = result.status === 'passed';
    const paint = passed ? chalk.green : chalk.red;
    const icon = passed ? '✓' : '✗';
    let prefix = evalName;
    if (context?.experimentName) {
        prefix = `${context.experimentName}/${evalName}`;
    }
    const segments = [paint(`${icon} ${prefix} [${runNumber}/${totalRuns}]`)];
    if (context?.model || context?.agent) {
        // Agent first, then model; whichever is missing is left out.
        const meta = [context.agent, context.model].filter(Boolean).join(' · ');
        segments.push(chalk.gray(` (${meta})`));
    }
    segments.push(chalk.gray(` (${result.duration.toFixed(1)}s)`));
    if (result.error) {
        const ellipsis = result.error.length > 200 ? '...' : '';
        segments.push(chalk.red(` - ${result.error.slice(0, 200)}${ellipsis}`));
    }
    return segments.join('');
}
|
|
268
|
+
/**
 * Build the "Running ..." progress line for an eval run.
 *
 * @param evalName - Name of the eval.
 * @param runNumber - Number of this run, shown as `[runNumber/totalRuns]`.
 * @param totalRuns - Total number of runs.
 * @param context - Optional `experimentName`, `agent` and `model`.
 * @returns A blue line of the form `Running <prefix> [n/total][ [agent · model]]...`.
 */
export function createProgressDisplay(evalName, runNumber, totalRuns, context) {
    const experimentName = context?.experimentName;
    const prefix = experimentName ? `${experimentName}/${evalName}` : evalName;
    let message = `Running ${prefix} [${runNumber}/${totalRuns}]`;
    // Agent/model annotation is appended only when at least one of them is set.
    const meta = [context?.agent, context?.model].filter(Boolean).join(' · ');
    if (meta) {
        message += ` [${meta}]`;
    }
    return chalk.blue(`${message}...`);
}
|
|
279
|
+
/**
 * Scan existing results for an experiment to find reusable eval results.
 *
 * A stored result is reusable when all of the following hold:
 *   1. Its `summary.json` fingerprint equals the current fingerprint for that eval
 *   2. It is not marked `valid: false`
 *   3. It is not a smoke-test result (`smoke: true`)
 *   4. `isNonModelFailure` does not flag the eval directory (infra/timeout
 *      failures should be re-run)
 *   5. It has at least one run (`totalRuns > 0`)
 *   6. If no run passed, a `classification.json` exists next to the summary
 *      (unclassified failures were never fully processed)
 *
 * Timestamp directories are scanned newest-first (reverse lexicographic
 * order) and the first match per eval wins. Entries that cannot be inspected
 * (unreadable, removed while scanning, dangling symlinks, malformed
 * summaries) are skipped rather than aborting the scan.
 *
 * @param resultsDir - Root results directory.
 * @param experimentName - Experiment whose stored results are scanned.
 * @param fingerprints - Current fingerprints keyed by eval name; evals
 *   without an entry are never reused.
 * @returns Map from eval name to `{ evalName, fingerprint, passRate, timestamp }`.
 */
export function scanReusableResults(resultsDir, experimentName, fingerprints) {
    const reusable = new Map();
    const experimentDir = join(resultsDir, experimentName);
    if (!existsSync(experimentDir))
        return reusable;
    // Get all timestamps, sorted newest first
    let timestamps;
    try {
        timestamps = readdirSync(experimentDir)
            .filter((t) => !t.startsWith('.'))
            .sort()
            .reverse();
    }
    catch {
        return reusable;
    }
    for (const timestamp of timestamps) {
        const tsDir = join(experimentDir, timestamp);
        let evalDirs;
        try {
            // statSync throws for dangling symlinks and for entries removed
            // since readdirSync; treat those like any other unreadable entry.
            if (!statSync(tsDir).isDirectory())
                continue;
            evalDirs = readdirSync(tsDir).filter((d) => !d.startsWith('.'));
        }
        catch {
            continue;
        }
        for (const evalDir of evalDirs) {
            // Already found a (newer) reusable result for this eval
            if (reusable.has(evalDir))
                continue;
            // Only evals with a current fingerprint can be matched
            const expectedFingerprint = fingerprints[evalDir];
            if (!expectedFingerprint)
                continue;
            const evalPath = join(tsDir, evalDir);
            try {
                const summary = JSON.parse(readFileSync(join(evalPath, 'summary.json'), 'utf-8'));
                // Check fingerprint match
                if (summary.fingerprint !== expectedFingerprint)
                    continue;
                // Check validity (valid defaults to true if not explicitly set to false)
                if (summary.valid === false)
                    continue;
                // Skip smoke test results
                if (summary.smoke === true)
                    continue;
                // Skip non-model failures (infra/timeout) — they should be re-run
                if (isNonModelFailure(evalPath))
                    continue;
                // Check that it has completed runs (use --force to re-run failures)
                if (summary.totalRuns <= 0)
                    continue;
                // Unclassified failures (0% with no classification.json) are not reusable —
                // they were never properly processed (e.g. interrupted run) and need re-running.
                if (summary.passedRuns === 0 &&
                    !existsSync(join(evalPath, 'classification.json')))
                    continue;
                reusable.set(evalDir, {
                    evalName: evalDir,
                    fingerprint: summary.fingerprint,
                    passRate: summary.passRate,
                    timestamp,
                });
            }
            catch {
                // Skip missing, unreadable or malformed summaries
            }
        }
    }
    return reusable;
}
|
|
361
|
+
//# sourceMappingURL=results.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"results.js","sourceRoot":"","sources":["../../src/lib/results.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EACN,SAAS,EACT,aAAa,EACb,WAAW,EACX,YAAY,EACZ,UAAU,EACV,QAAQ,EACR,UAAU,GACV,MAAM,IAAI,CAAC;AACZ,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AACrC,OAAO,KAAK,MAAM,OAAO,CAAC;AAS1B,OAAO,EAAE,eAAe,EAAmB,MAAM,iBAAiB,CAAC;AACnE,OAAO,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAEhD;;GAEG;AACH,MAAM,UAAU,wBAAwB,CACvC,WAA2B;IAE3B,gDAAgD;IAChD,MAAM,aAAa,GAAiC,EAAE,CAAC;IAEvD,0BAA0B;IAC1B,IAAI,WAAW,CAAC,UAAU,EAAE,MAAM,EAAE,CAAC;QACpC,aAAa,CAAC,IAAI,GAAG,WAAW,CAAC,UAAU,CAAC,MAAM,CAAC;IACpD,CAAC;IAED,qEAAqE;IACrE,IACC,WAAW,CAAC,cAAc;QAC1B,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,cAAc,CAAC,CAAC,MAAM,GAAG,CAAC,EACjD,CAAC;QACF,aAAa,CAAC,OAAO,GAAG,EAAE,CAAC;QAC3B,KAAK,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAC1C,WAAW,CAAC,cAAc,CAC1B,EAAE,CAAC;YACH,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;gBACnB,aAAa,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;YAC7C,CAAC;QACF,CAAC;IACF,CAAC;IAED,OAAO;QACN,MAAM,EAAE;YACP,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ;YACjD,KAAK,EAAE,WAAW,CAAC,KAAK;YACxB,QAAQ,EAAE,WAAW,CAAC,QAAQ,GAAG,IAAI,EAAE,qBAAqB;SAC5D;QACD,UAAU,EAAE,WAAW,CAAC,UAAU;QAClC,aAAa,EACZ,MAAM,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,SAAS;QAClE,cAAc,EAAE,WAAW,CAAC,cAAc;QAC1C,YAAY,EAAE,WAAW,CAAC,YAAY;KACtC,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,iBAAiB,CAChC,IAAY,EACZ,OAAsB;IAEtB,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAC1C,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,MAAM,CAAC;IACpE,MAAM,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;IAEnE,OAAO;QACN,IAAI;QACJ,SAAS,EAAE,IAAI,CAAC,MAAM;QACtB,UAAU;QACV,QAAQ,EAAE,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;QAChE,YA
AY,EAAE,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;QAC/D,IAAI,EAAE,OAAO;KACb,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACtC,MAAgC,EAChC,KAAoB,EACpB,SAAe,EACf,WAAiB;IAEjB,OAAO;QACN,SAAS,EAAE,SAAS,CAAC,WAAW,EAAE;QAClC,WAAW,EAAE,WAAW,CAAC,WAAW,EAAE;QACtC,MAAM;QACN,KAAK;KACL,CAAC;AACH,CAAC;AAoBD;;;;;;;;;;;;;;GAcG;AACH,MAAM,UAAU,WAAW,CAC1B,OAA0B,EAC1B,OAA2B;IAE3B,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IACvD,MAAM,aAAa,GAAG,IAAI,CACzB,OAAO,CAAC,UAAU,EAClB,OAAO,CAAC,cAAc,EACtB,SAAS,CACT,CAAC;IAEF,8BAA8B;IAC9B,SAAS,CAAC,aAAa,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE9C,wBAAwB;IACxB,KAAK,MAAM,WAAW,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QACzC,MAAM,OAAO,GAAG,IAAI,CAAC,aAAa,EAAE,WAAW,CAAC,IAAI,CAAC,CAAC;QACtD,SAAS,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAExC,8CAA8C;QAC9C,MAAM,WAAW,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QAC7D,MAAM,KAAK,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACnD,MAAM,cAAc,GAA4B;YAC/C,SAAS,EAAE,WAAW,CAAC,SAAS;YAChC,UAAU,EAAE,WAAW,CAAC,UAAU;YAClC,QAAQ,EAAE,GAAG,WAAW,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;YAC/C,YAAY,EAAE,WAAW,CAAC,YAAY;SACtC,CAAC;QACF,IAAI,WAAW,EAAE,CAAC;YACjB,cAAc,CAAC,WAAW,GAAG,WAAW,CAAC;QAC1C,CAAC;QACD,IAAI,KAAK,KAAK,KAAK,EAAE,CAAC;YACrB,cAAc,CAAC,KAAK,GAAG,KAAK,CAAC;QAC9B,CAAC;QACD,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,cAAc,CAAC,KAAK,GAAG,IAAI,CAAC;QAC7B,CAAC;QACD,aAAa,CACZ,IAAI,CAAC,OAAO,EAAE,cAAc,CAAC,EAC7B,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC,CACvC,CAAC;QAEF,8BAA8B;QAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAClD,MAAM,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACpC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC7C,SAAS,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAEvC,+CAA+C;YAC/C,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC;YACnC,MAAM,eAAe,GAEjB;gBACH,GAAG,OAAO,CAAC,MAAM;gBACjB,KAAK;aACL,CAAC;YAEF,gCAAgC;YAChC,IAAI,OAAO,CAAC,UAAU
,EAAE,CAAC;gBACxB,2BAA2B;gBAC3B,MAAM,UAAU,GAAG,eAAe,CACjC,OAAO,CAAC,UAAU,EAClB,OAAO,CAAC,MAAM,CAAC,KAAK,EACpB,KAAK,CACL,CAAC;gBAEF,6DAA6D;gBAC7D,aAAa,CACZ,IAAI,CAAC,MAAM,EAAE,iBAAiB,CAAC,EAC/B,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,CACnC,CAAC;gBACF,eAAe,CAAC,cAAc,GAAG,mBAAmB,CAAC;gBAErD,2DAA2D;gBAC3D,aAAa,CACZ,IAAI,CAAC,MAAM,EAAE,sBAAsB,CAAC,EACpC,OAAO,CAAC,UAAU,CAClB,CAAC;gBACF,eAAe,CAAC,iBAAiB,GAAG,wBAAwB,CAAC;gBAE7D,kDAAkD;gBAClD,eAAe,CAAC,IAAI,GAAG,UAAU,CAAC,OAAO,CAAC;YAC3C,CAAC;YAED,uCAAuC;YACvC,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;YAC3C,SAAS,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAE3C,IAAI,OAAO,CAAC,aAAa,EAAE,CAAC;gBAC3B,MAAM,WAAW,GAAiC,EAAE,CAAC;gBAErD,2BAA2B;gBAC3B,IAAI,OAAO,CAAC,aAAa,CAAC,IAAI,EAAE,CAAC;oBAChC,aAAa,CACZ,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,EAC5B,OAAO,CAAC,aAAa,CAAC,IAAI,CAC1B,CAAC;oBACF,WAAW,CAAC,IAAI,GAAG,oBAAoB,CAAC;gBACzC,CAAC;gBAED,kFAAkF;gBAClF,IAAI,OAAO,CAAC,aAAa,CAAC,OAAO,EAAE,CAAC;oBACnC,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,EAAE,SAAS,CAAC,CAAC;oBAC/C,SAAS,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;oBAC3C,WAAW,CAAC,OAAO,GAAG,EAAE,CAAC;oBACzB,KAAK,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAC3C,OAAO,CAAC,aAAa,CAAC,OAAO,CAC7B,EAAE,CAAC;wBACH,IAAI,OAAO,EAAE,CAAC;4BACb,MAAM,QAAQ,GAAG,GAAG,IAAI,MAAM,CAAC;4BAC/B,aAAa,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAC;4BACnD,WAAW,CAAC,OAAO,CAClB,IAAI,CACJ,GAAG,qBAAqB,QAAQ,EAAE,CAAC;wBACrC,CAAC;oBACF,CAAC;gBACF,CAAC;gBAED,IACC,WAAW,CAAC,IAAI;oBAChB,CAAC,WAAW,CAAC,OAAO;wBACnB,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,EAC5C,CAAC;oBACF,eAAe,CAAC,WAAW,GAAG,WAAW,CAAC;gBAC3C,CAAC;YACF,CAAC;YAED,+CAA+C;YAC/C,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,SAAS,CAAC;YAC3C,MAAM,iBAAiB,GACtB,OAAO,CAAC,cAAc;gBACtB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YAChD,MAAM,eAAe,GACpB,OAAO,CAAC,YAAY,IAAI,OAAO,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC;YACzD,IACC,SAAS;gBACT,SAAS,KAAK,MAAM;gBACpB,CAAC,iBAAiB,IAAI,eAAe,CAAC,EACrC,CAAC;gBACF,MAAM,UA
AU,GAAG,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;gBAC3C,SAAS,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;gBAE3C,oDAAoD;gBACpD,IAAI,SAAS,KAAK,KAAK,EAAE,CAAC;oBACzB,MAAM,WAAW,GAChB,OAAO,CAAC,YAAY,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;oBAC1C,IAAI,WAAW,EAAE,CAAC;wBACjB,MAAM,YAAY,GAAG,gBAAgB,CAAC,WAAW,CAAC,CAAC;wBACnD,KAAK,MAAM,CAAC,QAAQ,EAAE,OAAO,CAAC,IAAI,YAAY,EAAE,CAAC;4BAChD,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;4BAC5C,SAAS,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;4BAClD,aAAa,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;wBAClC,CAAC;oBACF,CAAC;oBAED,+CAA+C;oBAC/C,IAAI,OAAO,CAAC,YAAY,EAAE,CAAC;wBAC1B,KAAK,MAAM,QAAQ,IAAI,OAAO,CAAC,YAAY,EAAE,CAAC;4BAC7C,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;4BAC5C,IAAI,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;gCAC1B,UAAU,CAAC,QAAQ,CAAC,CAAC;4BACtB,CAAC;wBACF,CAAC;oBACF,CAAC;gBACF,CAAC;gBAED,+EAA+E;gBAC/E,IAAI,OAAO,CAAC,cAAc,EAAE,CAAC;oBAC5B,KAAK,MAAM,CAAC,QAAQ,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAC/C,OAAO,CAAC,cAAc,CACtB,EAAE,CAAC;wBACH,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;wBAC5C,SAAS,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;wBAClD,aAAa,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;oBAClC,CAAC;gBACF,CAAC;YACF,CAAC;YAED,+CAA+C;YAC/C,aAAa,CACZ,IAAI,CAAC,MAAM,EAAE,aAAa,CAAC,EAC3B,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,IAAI,EAAE,CAAC,CAAC,CACxC,CAAC;QACH,CAAC;IACF,CAAC;IAED,OAAO,aAAa,CAAC;AACtB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,OAA0B;IAC5D,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,MAAM,SAAS,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;IAEjC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,CAAC;IAC7C,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;IAClC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,0BAA0B;IAC1B,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;IACzE,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;IAC
5E,MAAM,eAAe,GAAG,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,GAAG,SAAS,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAE5E,KAAK,MAAM,WAAW,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QACzC,MAAM,QAAQ,GACb,WAAW,CAAC,UAAU,KAAK,WAAW,CAAC,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;QAC9D,MAAM,SAAS,GACd,WAAW,CAAC,UAAU,KAAK,WAAW,CAAC,SAAS;YAC/C,CAAC,CAAC,KAAK,CAAC,KAAK;YACb,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;QAEd,KAAK,CAAC,IAAI,CACT,SAAS,CACR,GAAG,QAAQ,IAAI,WAAW,CAAC,IAAI,KAAK,WAAW,CAAC,UAAU,IACzD,WAAW,CAAC,SACb,YAAY,WAAW,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAC/C,CACD,CAAC;QACF,KAAK,CAAC,IAAI,CACT,KAAK,CAAC,IAAI,CACT,oBAAoB,WAAW,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAC1D,CACD,CAAC;QACF,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAChB,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;IAClC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,MAAM,YAAY,GACjB,eAAe,KAAK,GAAG;QACtB,CAAC,CAAC,KAAK,CAAC,KAAK;QACb,CAAC,CAAC,eAAe,IAAI,EAAE;YACvB,CAAC,CAAC,KAAK,CAAC,MAAM;YACd,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;IACd,KAAK,CAAC,IAAI,CACT,YAAY,CACX,YAAY,WAAW,IAAI,SAAS,YAAY,eAAe,CAAC,OAAO,CACtE,CAAC,CACD,IAAI,CACL,CACD,CAAC;IAEF,MAAM,QAAQ,GACb,CAAC,IAAI,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,OAAO,EAAE;QACvC,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,OAAO,EAAE,CAAC;QACvC,IAAI,CAAC;IACN,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,eAAe,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC9D,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACzB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe,CAC9B,QAAgB,EAChB,SAAiB,EACjB,SAAiB,EACjB,MAAqB,EACrB,OAAqE;IAErE,MAAM,IAAI,GAAG,MAAM,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IACpD,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;IACnE,MAAM,MAAM,GAAG,OAAO,EAAE,cAAc;QACrC,CAAC,CAAC,GAAG,OAAO,CAAC,cAAc,IAAI,QAAQ,EAAE;QACzC,CAAC,CAAC,QAAQ,CAAC;IAEZ,IAAI,IAAI,GAAG,KAAK,CAAC,GAAG,IAAI,IAAI,MAAM,KAAK,SAAS,IAAI,SAAS,GAAG,CAAC,CAAC;IAClE,IAAI,OAAO,EAAE,KAAK,IAAI,OAAO,EAAE,KAAK,EAAE,CAAC;QACtC,
IAAI,IAAI,KAAK,CAAC,IAAI,CACjB,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAClE,CAAC;IACH,CAAC;IACD,IAAI,IAAI,KAAK,CAAC,IAAI,CAAC,KAAK,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IAExD,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;QAClB,IAAI,IAAI,KAAK,CAAC,GAAG,CAChB,MAAM,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,GAC/B,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EACrC,EAAE,CACF,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACb,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACpC,QAAgB,EAChB,SAAiB,EACjB,SAAiB,EACjB,OAAqE;IAErE,MAAM,MAAM,GAAG,OAAO,EAAE,cAAc;QACrC,CAAC,CAAC,GAAG,OAAO,CAAC,cAAc,IAAI,QAAQ,EAAE;QACzC,CAAC,CAAC,QAAQ,CAAC;IACZ,MAAM,IAAI,GAAG,CAAC,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC1E,MAAM,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,KAAK,IAAI,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;IACxC,OAAO,KAAK,CAAC,IAAI,CAChB,WAAW,MAAM,KAAK,SAAS,IAAI,SAAS,IAAI,MAAM,KAAK,CAC3D,CAAC;AACH,CAAC;AAYD;;;;;;;;;GASG;AACH,MAAM,UAAU,mBAAmB,CAClC,UAAkB,EAClB,cAAsB,EACtB,YAAoC;IAEpC,MAAM,QAAQ,GAAG,IAAI,GAAG,EAA0B,CAAC;IACnD,MAAM,aAAa,GAAG,IAAI,CAAC,UAAU,EAAE,cAAc,CAAC,CAAC;IAEvD,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;QAAE,OAAO,QAAQ,CAAC;IAEhD,0CAA0C;IAC1C,IAAI,UAAoB,CAAC;IACzB,IAAI,CAAC;QACJ,UAAU,GAAG,WAAW,CAAC,aAAa,CAAC;aACrC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC;aACjC,IAAI,EAAE;aACN,OAAO,EAAE,CAAC;IACb,CAAC;IAAC,MAAM,CAAC;QACR,OAAO,QAAQ,CAAC;IACjB,CAAC;IAED,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACpC,MAAM,KAAK,GAAG,IAAI,CAAC,aAAa,EAAE,SAAS,CAAC,CAAC;QAC7C,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,WAAW,EAAE;YAAE,SAAS;QAE7C,IAAI,QAAkB,CAAC;QACvB,IAAI,CAAC;YACJ,QAAQ,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;QACjE,CAAC;QAAC,MAAM,CAAC;YACR,SAAS;QACV,CAAC;QAED,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAChC,gDAAgD;YAChD,IAAI,QAAQ,CAAC,GAAG,CAAC,OAAO,CAAC;gBAAE,SAAS;YAEpC,+CAA+C;YAC/C,MAAM,mBAAmB,GAAG
,YAAY,CAAC,OAAO,CAAC,CAAC;YAClD,IAAI,CAAC,mBAAmB;gBAAE,SAAS;YAEnC,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,EAAE,OAAO,EAAE,cAAc,CAAC,CAAC;YACzD,IAAI,CAAC;gBACJ,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC;gBAE/D,0BAA0B;gBAC1B,IAAI,OAAO,CAAC,WAAW,KAAK,mBAAmB;oBAAE,SAAS;gBAE1D,yEAAyE;gBACzE,IAAI,OAAO,CAAC,KAAK,KAAK,KAAK;oBAAE,SAAS;gBAEtC,0BAA0B;gBAC1B,IAAI,OAAO,CAAC,KAAK,KAAK,IAAI;oBAAE,SAAS;gBAErC,kEAAkE;gBAClE,IAAI,iBAAiB,CAAC,IAAI,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;oBAAE,SAAS;gBAEtD,oEAAoE;gBACpE,IAAI,OAAO,CAAC,SAAS,IAAI,CAAC;oBAAE,SAAS;gBAErC,4EAA4E;gBAC5E,iFAAiF;gBACjF,IACC,OAAO,CAAC,UAAU,KAAK,CAAC;oBACxB,CAAC,UAAU,CAAC,IAAI,CAAC,KAAK,EAAE,OAAO,EAAE,qBAAqB,CAAC,CAAC;oBAExD,SAAS;gBAEV,QAAQ,CAAC,GAAG,CAAC,OAAO,EAAE;oBACrB,QAAQ,EAAE,OAAO;oBACjB,WAAW,EAAE,OAAO,CAAC,WAAW;oBAChC,QAAQ,EAAE,OAAO,CAAC,QAAQ;oBAC1B,SAAS;iBACT,CAAC,CAAC;YACJ,CAAC;YAAC,MAAM,CAAC;gBACR,yBAAyB;YAC1B,CAAC;QACF,CAAC;IACF,CAAC;IAED,OAAO,QAAQ,CAAC;AACjB,CAAC"}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Experiment runner - orchestrates running evals against an agent.
|
|
3
|
+
* Start rate is controlled via an optional StartRateLimiter shared across experiments.
|
|
4
|
+
* 429 rate-limit errors are retried with exponential backoff.
|
|
5
|
+
* With earlyExit, in-flight attempts are aborted when one passes.
|
|
6
|
+
*/
|
|
7
|
+
import type { ResolvedExperimentConfig, EvalFixture, EvalRunData, ExperimentResults, RunnableExperimentConfig, ProgressEvent } from './types.js';
|
|
8
|
+
/**
|
|
9
|
+
* Rate-limits how many operations can START within a time window.
|
|
10
|
+
* Once started, operations run freely with no concurrency limit.
|
|
11
|
+
* Create one instance and share it across experiments to control global start rate.
|
|
12
|
+
*/
|
|
13
|
+
export declare class StartRateLimiter {
|
|
14
|
+
private readonly startsPerWindow;
|
|
15
|
+
private readonly windowMs;
|
|
16
|
+
private queue;
|
|
17
|
+
private started;
|
|
18
|
+
private timer;
|
|
19
|
+
constructor(startsPerWindow: number, windowMs: number);
|
|
20
|
+
/**
|
|
21
|
+
* Wait for permission to start, then return immediately.
|
|
22
|
+
* The operation runs freely after this resolves.
|
|
23
|
+
*/
|
|
24
|
+
waitToStart(): Promise<void>;
|
|
25
|
+
private ensureTimer;
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Options for running an experiment.
|
|
29
|
+
*/
|
|
30
|
+
export interface RunExperimentOptions {
|
|
31
|
+
/** Resolved experiment configuration */
|
|
32
|
+
config: RunnableExperimentConfig;
|
|
33
|
+
/** Fixtures to run */
|
|
34
|
+
fixtures: EvalFixture[];
|
|
35
|
+
/** API key for the agent */
|
|
36
|
+
apiKey: string;
|
|
37
|
+
/** Directory to save results */
|
|
38
|
+
resultsDir: string;
|
|
39
|
+
/** Experiment name */
|
|
40
|
+
experimentName: string;
|
|
41
|
+
/** Per-eval fingerprints (eval name -> hash) for result reuse */
|
|
42
|
+
fingerprints?: Record<string, string>;
|
|
43
|
+
/** Callback for progress updates */
|
|
44
|
+
onProgress?: (event: ProgressEvent) => void;
|
|
45
|
+
/** Whether to run in verbose mode */
|
|
46
|
+
verbose?: boolean;
|
|
47
|
+
/** Whether this is a smoke test run */
|
|
48
|
+
smoke?: boolean;
|
|
49
|
+
/** Shared rate limiter to control how many sandbox runs start per time window */
|
|
50
|
+
rateLimiter?: StartRateLimiter;
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* Run an experiment - execute all evals with configured runs concurrently.
|
|
54
|
+
* With earlyExit enabled, remaining attempts for a fixture are aborted once one passes.
|
|
55
|
+
*/
|
|
56
|
+
export declare function runExperiment(options: RunExperimentOptions): Promise<ExperimentResults>;
|
|
57
|
+
/**
|
|
58
|
+
* Run a single eval (for testing/debugging).
|
|
59
|
+
*/
|
|
60
|
+
export declare function runSingleEval<T extends ResolvedExperimentConfig['model']>(fixture: EvalFixture, options: {
|
|
61
|
+
agent?: ResolvedExperimentConfig['agent'];
|
|
62
|
+
model: T;
|
|
63
|
+
timeout: number;
|
|
64
|
+
apiKey: string;
|
|
65
|
+
setup?: ResolvedExperimentConfig['setup'];
|
|
66
|
+
scripts?: string[];
|
|
67
|
+
sandbox?: ResolvedExperimentConfig['sandbox'];
|
|
68
|
+
editPrompt?: (prompt: string) => string;
|
|
69
|
+
verbose?: boolean;
|
|
70
|
+
}): Promise<T extends Array<unknown> ? EvalRunData[] : EvalRunData>;
|
|
71
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/lib/runner.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EACV,wBAAwB,EACxB,WAAW,EACX,WAAW,EAEX,iBAAiB,EACjB,wBAAwB,EACxB,aAAa,EACd,MAAM,YAAY,CAAC;AASpB;;;;GAIG;AACH,qBAAa,gBAAgB;IAMzB,OAAO,CAAC,QAAQ,CAAC,eAAe;IAChC,OAAO,CAAC,QAAQ,CAAC,QAAQ;IAN3B,OAAO,CAAC,KAAK,CAAsB;IACnC,OAAO,CAAC,OAAO,CAAK;IACpB,OAAO,CAAC,KAAK,CAA+C;gBAGzC,eAAe,EAAE,MAAM,EACvB,QAAQ,EAAE,MAAM;IAGnC;;;OAGG;IACG,WAAW,IAAI,OAAO,CAAC,IAAI,CAAC;IAWlC,OAAO,CAAC,WAAW;CAgBpB;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,wCAAwC;IACxC,MAAM,EAAE,wBAAwB,CAAC;IACjC,sBAAsB;IACtB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,gCAAgC;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,sBAAsB;IACtB,cAAc,EAAE,MAAM,CAAC;IACvB,iEAAiE;IACjE,YAAY,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACtC,oCAAoC;IACpC,UAAU,CAAC,EAAE,CAAC,KAAK,EAAE,aAAa,KAAK,IAAI,CAAC;IAC5C,qCAAqC;IACrC,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,uCAAuC;IACvC,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,iFAAiF;IACjF,WAAW,CAAC,EAAE,gBAAgB,CAAC;CAChC;AAoBD;;;GAGG;AACH,wBAAsB,aAAa,CACjC,OAAO,EAAE,oBAAoB,GAC5B,OAAO,CAAC,iBAAiB,CAAC,CA2N5B;AAED;;GAEG;AACH,wBAAsB,aAAa,CAAC,CAAC,SAAS,wBAAwB,CAAC,OAAO,CAAC,EAC7E,OAAO,EAAE,WAAW,EACpB,OAAO,EAAE;IACP,KAAK,CAAC,EAAE,wBAAwB,CAAC,OAAO,CAAC,CAAC;IAC1C,KAAK,EAAE,CAAC,CAAC;IACT,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,wBAAwB,CAAC,OAAO,CAAC,CAAC;IAC1C,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,OAAO,CAAC,EAAE,wBAAwB,CAAC,SAAS,CAAC,CAAC;IAC9C,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,MAAM,CAAC;IACxC,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB,GACA,OAAO,CAAC,CAAC,SAAS,KAAK,CAAC,OAAO,CAAC,GAAG,WAAW,EAAE,GAAG,WAAW,CAAC,CA6BjE"}
|