@vercel/agent-eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +234 -14
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +6 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +7 -1
- package/dist/index.js.map +1 -1
- package/dist/lib/classifier.d.ts +80 -0
- package/dist/lib/classifier.d.ts.map +1 -0
- package/dist/lib/classifier.js +268 -0
- package/dist/lib/classifier.js.map +1 -0
- package/dist/lib/fingerprint.d.ts +15 -0
- package/dist/lib/fingerprint.d.ts.map +1 -0
- package/dist/lib/fingerprint.js +59 -0
- package/dist/lib/fingerprint.js.map +1 -0
- package/dist/lib/housekeeping.d.ts +25 -0
- package/dist/lib/housekeeping.d.ts.map +1 -0
- package/dist/lib/housekeeping.js +131 -0
- package/dist/lib/housekeeping.js.map +1 -0
- package/dist/lib/results.d.ts +29 -0
- package/dist/lib/results.d.ts.map +1 -1
- package/dist/lib/results.js +89 -4
- package/dist/lib/results.js.map +1 -1
- package/dist/lib/types.d.ts +11 -0
- package/dist/lib/types.d.ts.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Failure classification for eval results.
|
|
3
|
+
*
|
|
4
|
+
* Classifies failed eval runs as:
|
|
5
|
+
* - "model" — the model tried but wrote incorrect code
|
|
6
|
+
* - "infra" — infrastructure broke (API errors, rate limits, crashes)
|
|
7
|
+
* - "timeout" — the run hit its time limit
|
|
8
|
+
*
|
|
9
|
+
* Uses AI classification via the Vercel AI Gateway. Requires AI_GATEWAY_API_KEY.
|
|
10
|
+
*/
|
|
11
|
+
import { readFileSync, readdirSync, statSync, writeFileSync } from 'fs';
|
|
12
|
+
import { join, resolve, sep } from 'path';
|
|
13
|
+
import { tool } from 'ai';
|
|
14
|
+
import { z } from 'zod';
|
|
15
|
+
const CLASSIFIER_SYSTEM_PROMPT = `You are a failure classifier for an AI coding agent benchmark.
|
|
16
|
+
|
|
17
|
+
Your job: figure out WHY a failed eval run failed. Each eval tests whether an AI model can complete a coding task (e.g. migrate to App Router, add a Next.js feature). You have tools to explore the result files.
|
|
18
|
+
|
|
19
|
+
Classify into one of:
|
|
20
|
+
- "model" — the model tried but wrote incorrect code
|
|
21
|
+
- "infra" — infrastructure broke (API errors, rate limits, crashes) and the model never got to do real work
|
|
22
|
+
- "timeout" — the run hit its time limit
|
|
23
|
+
|
|
24
|
+
The eval result directory contains run-1/ through run-N/ subdirectories (one per attempt, N depends on config), plus a summary.json. Each run directory has:
|
|
25
|
+
- result.json — status, error, duration
|
|
26
|
+
- transcript.json or transcript-raw.jsonl (or older results may have transcript.jsonl) — the agent's conversation log
|
|
27
|
+
- outputs/eval.txt — EVAL.ts test output
|
|
28
|
+
- outputs/scripts/*.txt — npm script outputs (e.g. build.txt), if the experiment configured scripts
|
|
29
|
+
|
|
30
|
+
IMPORTANT: The eval harness always runs EVAL.ts tests after the agent finishes, plus any npm scripts configured in the experiment's \`scripts\` array (e.g. \`["build"]\`). These run even if the model produced nothing — tests just run against unmodified scaffold code (TODO placeholders). So test/script failures alone do NOT mean the model wrote code.
|
|
31
|
+
|
|
32
|
+
The transcript is the key evidence. It records every action the model took. If there is no transcript file, or the transcript only shows errors (no tool calls or text output from the model), the model never actually ran — that's "infra". Only classify as "model" if you see evidence in the transcript that the model actually generated code.`;
|
|
33
|
+
/**
 * Resolves `relativePath` against `root` and confirms the result stays inside `root`.
 *
 * The containment check is purely lexical (it does not resolve symlinks). It
 * compares against the normalized root followed by a path separator, so a
 * sibling directory that merely shares a name prefix with the root
 * (e.g. root "/r/eval" vs. "/r/eval-other") is rejected.
 *
 * @param {string} root - Directory the caller is allowed to access (absolute or relative).
 * @param {string} relativePath - Path supplied by the caller, interpreted relative to `root`.
 * @returns {string|null} The absolute resolved path, or `null` when it escapes `root`.
 */
function safePath(root, relativePath) {
    // Normalize the root first: it may be relative or carry a trailing separator,
    // and `resolve()` always yields an absolute path without one.
    const rootDir = resolve(root);
    const resolved = resolve(rootDir, relativePath);
    if (resolved === rootDir)
        return resolved;
    // A filesystem root ("/" or "C:\") already ends with the separator.
    const prefix = rootDir.endsWith(sep) ? rootDir : rootDir + sep;
    return resolved.startsWith(prefix) ? resolved : null;
}
|
|
42
|
+
/**
 * Creates sandboxed read-only tools for the AI classifier.
 *
 * Every tool takes a path relative to `evalResultDir` and refuses paths that
 * `safePath` rejects. Tools never throw for bad input; they return an
 * `{ error }` object instead so the model can recover and try again.
 *
 * - `list_files`: sorted directory listing with a `dir` / `file` type per entry
 *   (`unknown` when the entry cannot be stat'ed, e.g. a dangling symlink).
 * - `read_file`: line-based pagination (`offset` is 0-based, default `limit` 200).
 * - `grep`: case-insensitive regex search over a file or, recursively, a directory;
 *   each match carries the file, 1-based line number and the line truncated to
 *   500 characters.
 *
 * @param {string} evalResultDir - Root directory the tools are confined to.
 * @returns {{list_files: object, read_file: object, grep: object}} Tool definitions keyed by tool name.
 */
export function createClassifierTools(evalResultDir) {
    // Model-supplied numbers may be negative, fractional or non-finite; coerce
    // them to an integer of at least `min`, falling back to the default otherwise.
    const toCount = (value, fallback, min) => {
        const n = Math.trunc(value ?? fallback);
        return Number.isFinite(n) ? Math.max(min, n) : fallback;
    };
    return {
        list_files: tool({
            description: 'List files and directories at a path relative to the eval result root. Use "." for the root.',
            inputSchema: z.object({
                path: z
                    .string()
                    .describe('Relative path to list, e.g. "." or "run-1" or "run-1/outputs"'),
            }),
            execute: async ({ path: relPath }) => {
                const target = safePath(evalResultDir, relPath);
                if (!target)
                    return { error: 'Path outside allowed directory' };
                try {
                    const entries = readdirSync(target)
                        .sort()
                        .map((name) => {
                        try {
                            const info = statSync(join(target, name));
                            return { name, type: info.isDirectory() ? 'dir' : 'file' };
                        }
                        catch {
                            // One un-stat-able entry must not hide the rest of the listing.
                            return { name, type: 'unknown' };
                        }
                    });
                    return { entries };
                }
                catch {
                    return { error: `Cannot list: ${relPath}` };
                }
            },
        }),
        read_file: tool({
            description: 'Read a file relative to the eval result root. For large files, use offset/limit to paginate.',
            inputSchema: z.object({
                path: z
                    .string()
                    .describe('Relative path to the file, e.g. "run-1/result.json"'),
                offset: z
                    .number()
                    .describe('Line offset to start reading from (0-based)')
                    .optional(),
                limit: z
                    .number()
                    .describe('Max number of lines to return')
                    .optional(),
            }),
            execute: async ({ path: relPath, offset: rawOffset, limit: rawLimit }) => {
                const offset = toCount(rawOffset, 0, 0);
                const limit = toCount(rawLimit, 200, 1);
                const target = safePath(evalResultDir, relPath);
                if (!target)
                    return { error: 'Path outside allowed directory' };
                try {
                    const lines = readFileSync(target, 'utf-8').split('\n');
                    // Clamp the window so the reported range always matches what is returned.
                    const start = Math.min(offset, lines.length);
                    const end = Math.min(start + limit, lines.length);
                    return {
                        content: lines.slice(start, end).join('\n'),
                        totalLines: lines.length,
                        showing: `lines ${start}-${end} of ${lines.length}`,
                    };
                }
                catch {
                    return { error: `Cannot read: ${relPath}` };
                }
            },
        }),
        grep: tool({
            description: 'Search for a pattern in files under a directory. Returns matching lines with context.',
            inputSchema: z.object({
                pattern: z.string().describe('Text or regex pattern to search for'),
                path: z
                    .string()
                    .describe('Relative directory or file to search in, e.g. "." or "run-1"'),
                maxResults: z
                    .number()
                    .describe('Max number of matches to return')
                    .optional(),
            }),
            execute: async ({ pattern, path: relPath, maxResults: rawMax }) => {
                const maxResults = toCount(rawMax, 20, 1);
                const target = safePath(evalResultDir, relPath);
                if (!target)
                    return { error: 'Path outside allowed directory' };
                let regex;
                try {
                    regex = new RegExp(pattern, 'i');
                }
                catch {
                    // The pattern comes from the model and may not be a valid regex.
                    return { error: `Invalid pattern: ${pattern}` };
                }
                const matches = [];
                // Collect matching lines of one file until the result cap is reached.
                const searchFile = (filePath, relName) => {
                    try {
                        const lines = readFileSync(filePath, 'utf-8').split('\n');
                        for (let i = 0; i < lines.length && matches.length < maxResults; i++) {
                            if (regex.test(lines[i])) {
                                matches.push({
                                    file: relName,
                                    line: i + 1,
                                    text: lines[i].slice(0, 500),
                                });
                            }
                        }
                    }
                    catch {
                        // Skip unreadable files
                    }
                };
                // Depth-first walk; entries are sorted so truncated results are deterministic.
                const searchDir = (dirPath, prefix) => {
                    let entries;
                    try {
                        entries = readdirSync(dirPath).sort();
                    }
                    catch {
                        // Skip unreadable dirs
                        return;
                    }
                    for (const entry of entries) {
                        if (matches.length >= maxResults)
                            break;
                        const full = join(dirPath, entry);
                        const rel = prefix ? `${prefix}/${entry}` : entry;
                        let isDir;
                        try {
                            isDir = statSync(full).isDirectory();
                        }
                        catch {
                            // Skip just this entry (e.g. dangling symlink), not the whole directory.
                            continue;
                        }
                        if (isDir) {
                            searchDir(full, rel);
                        }
                        else {
                            searchFile(full, rel);
                        }
                    }
                };
                let targetIsDir;
                try {
                    targetIsDir = statSync(target).isDirectory();
                }
                catch {
                    return { error: `Path not found: ${relPath}` };
                }
                if (targetIsDir) {
                    searchDir(target, relPath === '.' ? '' : relPath);
                }
                else {
                    searchFile(target, relPath);
                }
                return {
                    matches,
                    totalFound: matches.length,
                    truncated: matches.length >= maxResults,
                };
            },
        }),
    };
}
|
|
187
|
+
/**
 * Classify a failure using AI via the Vercel AI Gateway.
 * Requires AI_GATEWAY_API_KEY in the environment.
 *
 * The model is given the read-only exploration tools plus a `classify` tool.
 * The run stops as soon as `classify` is called, or after a fixed number of
 * steps so a model that keeps exploring cannot loop (and bill) indefinitely.
 *
 * @param {string} evalResultDir - Directory holding the eval's result files; the tools are confined to it.
 * @param {string} evalName - Eval name, used only in the prompt.
 * @param {string} experimentName - Experiment name, used only in the prompt.
 * @returns {Promise<{failureType: 'model'|'infra'|'timeout', failureReason: string}|null>}
 *   The submitted verdict, or `null` when the model never called `classify`
 *   or the request failed.
 */
export async function classifyWithAI(evalResultDir, evalName, experimentName) {
    // Upper bound on agent steps (each step is one model call plus its tool calls).
    const MAX_CLASSIFIER_STEPS = 50;
    const { generateText, hasToolCall, stepCountIs, createGateway } = await import('ai');
    const gateway = createGateway({ apiKey: process.env.AI_GATEWAY_API_KEY ?? '' });
    // Written by the `classify` tool when the model submits its verdict.
    let classification = null;
    const allTools = {
        ...createClassifierTools(evalResultDir),
        classify: tool({
            description: 'Submit your final classification. Call this once you have enough evidence.',
            inputSchema: z.object({
                failureType: z
                    .enum(['model', 'infra', 'timeout'])
                    .describe('The failure category'),
                failureReason: z
                    .string()
                    .describe('Brief 1-2 sentence explanation of why'),
            }),
            execute: async ({ failureType, failureReason }) => {
                classification = { failureType, failureReason };
                return { ok: true };
            },
        }),
    };
    try {
        await generateText({
            model: gateway('anthropic/claude-sonnet-4-5'),
            system: CLASSIFIER_SYSTEM_PROMPT,
            prompt: `Classify the failure for eval "${evalName}" (experiment: ${experimentName}). Use the exploration tools to investigate, then call classify() with your verdict.`,
            tools: allTools,
            // Stop on whichever comes first: a verdict, or the step budget.
            stopWhen: [hasToolCall('classify'), stepCountIs(MAX_CLASSIFIER_STEPS)],
        });
        return classification;
    }
    catch {
        // Best-effort: a gateway/model error means "could not classify", not a crash.
        return null;
    }
}
|
|
228
|
+
/**
 * Classify a failed eval result using AI.
 * Requires AI_GATEWAY_API_KEY in the environment.
 *
 * Caches results in classification.json within the eval result directory.
 * A cached entry is only trusted when it has a known `failureType`
 * ("model", "infra" or "timeout") and a non-empty string `failureReason`;
 * anything else is treated as a cache miss and re-classified.
 *
 * @param {string} evalResultDir - Directory holding the eval's result files (and the cache file).
 * @param {string} evalName - Eval name, forwarded to the classifier.
 * @param {string} experimentName - Experiment name, forwarded to the classifier.
 * @returns {Promise<{failureType: string, failureReason: string}|null>}
 *   The cached or freshly computed classification, or `null` when classification failed.
 */
export async function classifyFailure(evalResultDir, evalName, experimentName) {
    const cachedPath = join(evalResultDir, 'classification.json');
    // Check for cached classification
    try {
        const cached = JSON.parse(readFileSync(cachedPath, 'utf-8'));
        const knownType = ['model', 'infra', 'timeout'].includes(cached?.failureType);
        const hasReason = typeof cached?.failureReason === 'string' && cached.failureReason !== '';
        if (knownType && hasReason) {
            return { failureType: cached.failureType, failureReason: cached.failureReason };
        }
    }
    catch {
        // No cache (missing or unparsable file)
    }
    // Classify with AI
    const classification = await classifyWithAI(evalResultDir, evalName, experimentName);
    // Cache the result
    if (classification) {
        try {
            writeFileSync(cachedPath, JSON.stringify(classification, null, 2));
        }
        catch {
            // Non-fatal: caching failed
        }
    }
    return classification;
}
|
|
259
|
+
/**
 * Decide whether an eval qualifies for auto-retry.
 *
 * Retry is appropriate only when there is at least one classification and
 * none of them is a "model" failure (i.e. every failure was infra/timeout).
 *
 * @param {Array<{failureType: string}>} classifications - One classification per failed run.
 * @returns {boolean} `true` when every classification is a non-model failure.
 */
export function shouldRetry(classifications) {
    const hasAny = classifications.length > 0;
    const hasModelFailure = () => classifications.some((entry) => entry.failureType === 'model');
    return hasAny && !hasModelFailure();
}
|
|
268
|
+
//# sourceMappingURL=classifier.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"classifier.js","sourceRoot":"","sources":["../../src/lib/classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,YAAY,EAAE,WAAW,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AACxE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AACrC,OAAO,EAAE,IAAI,EAAE,MAAM,IAAI,CAAC;AAC1B,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB,MAAM,wBAAwB,GAAG;;;;;;;;;;;;;;;;;qVAiBoT,CAAC;AAEtV;;GAEG;AACH,SAAS,QAAQ,CAAC,IAAY,EAAE,YAAoB;IAClD,MAAM,QAAQ,GAAG,OAAO,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;IAC7C,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IAC5C,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CAAC,aAAqB;IACzD,OAAO;QACL,UAAU,EAAE,IAAI,CAAC;YACf,WAAW,EACT,8FAA8F;YAChG,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;gBACpB,IAAI,EAAE,CAAC;qBACJ,MAAM,EAAE;qBACR,QAAQ,CAAC,+DAA+D,CAAC;aAC7E,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE;gBACnC,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;gBAChD,IAAI,CAAC,MAAM;oBAAE,OAAO,EAAE,KAAK,EAAE,gCAAgC,EAAE,CAAC;gBAChE,IAAI,CAAC;oBACH,MAAM,OAAO,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC;oBACpC,MAAM,OAAO,GAAkD,EAAE,CAAC;oBAClE,KAAK,MAAM,KAAK,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;wBACnC,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC;wBAC3C,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;oBAC3E,CAAC;oBACD,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC;gBAC9B,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,EAAE,KAAK,EAAE,gBAAgB,OAAO,EAAE,EAAE,CAAC;gBAC9C,CAAC;YACH,CAAC;SACF,CAAC;QAEF,SAAS,EAAE,IAAI,CAAC;YACd,WAAW,EACT,8FAA8F;YAChG,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;gBACpB,IAAI,EAAE,CAAC;qBACJ,MAAM,EAAE;qBACR,QAAQ,CAAC,qDAAqD,CAAC;gBAClE,MAAM,EAAE,CAAC;qBACN,MAAM,EAAE;qBACR,QAAQ,CAAC,6CAA6C,CAAC;qBACvD,QAAQ,EAAE;gBACb,KAAK,EAAE,CAAC;qBACL,MAAM,EAAE;qBACR,QAAQ,CAAC,+BAA+B,CAAC;qBACzC,QAAQ,EAAE;aACd,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,EAAE,EAAE;gBACvE,MAAM,MAAM,GAAG,SAAS,IAAI,CAAC,CAAC;gBAC9B,MAAM,KAAK,
GAAG,QAAQ,IAAI,GAAG,CAAC;gBAC9B,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;gBAChD,IAAI,CAAC,MAAM;oBAAE,OAAO,EAAE,KAAK,EAAE,gCAAgC,EAAE,CAAC;gBAChE,IAAI,CAAC;oBACH,MAAM,OAAO,GAAG,YAAY,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;oBAC9C,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;oBAClC,MAAM,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,GAAG,KAAK,CAAC,CAAC;oBACnD,OAAO;wBACL,OAAO,EAAE,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;wBAC1B,UAAU,EAAE,KAAK,CAAC,MAAM;wBACxB,OAAO,EAAE,SAAS,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,KAAK,EAAE,KAAK,CAAC,MAAM,CAAC,OAAO,KAAK,CAAC,MAAM,EAAE;qBACxF,CAAC;gBACJ,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,EAAE,KAAK,EAAE,gBAAgB,OAAO,EAAE,EAAE,CAAC;gBAC9C,CAAC;YACH,CAAC;SACF,CAAC;QAEF,IAAI,EAAE,IAAI,CAAC;YACT,WAAW,EACT,uFAAuF;YACzF,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;gBACpB,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,qCAAqC,CAAC;gBACnE,IAAI,EAAE,CAAC;qBACJ,MAAM,EAAE;qBACR,QAAQ,CAAC,8DAA8D,CAAC;gBAC3E,UAAU,EAAE,CAAC;qBACV,MAAM,EAAE;qBACR,QAAQ,CAAC,iCAAiC,CAAC;qBAC3C,QAAQ,EAAE;aACd,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,EAAE,EAAE,EAAE;gBAChE,MAAM,UAAU,GAAG,MAAM,IAAI,EAAE,CAAC;gBAChC,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;gBAChD,IAAI,CAAC,MAAM;oBAAE,OAAO,EAAE,KAAK,EAAE,gCAAgC,EAAE,CAAC;gBAChE,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;gBACvC,MAAM,OAAO,GAAwD,EAAE,CAAC;gBAExE,KAAK,UAAU,UAAU,CAAC,QAAgB,EAAE,OAAe;oBACzD,IAAI,CAAC;wBACH,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;wBAChD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;wBAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,GAAG,UAAU,EAAE,CAAC,EAAE,EAAE,CAAC;4BACrE,IAAI,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;gCACzB,OAAO,CAAC,IAAI,CAAC;oCACX,IAAI,EAAE,OAAO;oCACb,IAAI,EAAE,CAAC,GAAG,CAAC;oCACX,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;iCAC7B,CAAC,CAAC;4BACL,CAAC;wBACH,CAAC;oBACH,CAAC;oBAAC,MAAM,CAAC;wBACP,wBAAwB;oBAC1B,CAAC;gBACH,CAAC;gBAED,KAAK,UAAU,SAAS,CAAC,OAAe,EAAE,MAAc;oBACtD,IAA
I,CAAC;wBACH,MAAM,OAAO,GAAG,WAAW,CAAC,OAAO,CAAC,CAAC;wBACrC,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;4BAC5B,IAAI,OAAO,CAAC,MAAM,IAAI,UAAU;gCAAE,MAAM;4BACxC,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;4BAClC,MAAM,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,IAAI,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;4BAClD,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;4BAC5B,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;gCACvB,MAAM,SAAS,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;4BAC7B,CAAC;iCAAM,CAAC;gCACN,MAAM,UAAU,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;4BAC9B,CAAC;wBACH,CAAC;oBACH,CAAC;oBAAC,MAAM,CAAC;wBACP,uBAAuB;oBACzB,CAAC;gBACH,CAAC;gBAED,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC;oBAC9B,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;wBACvB,MAAM,SAAS,CAAC,MAAM,EAAE,OAAO,KAAK,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;oBAC1D,CAAC;yBAAM,CAAC;wBACN,MAAM,UAAU,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;oBACpC,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,EAAE,KAAK,EAAE,mBAAmB,OAAO,EAAE,EAAE,CAAC;gBACjD,CAAC;gBAED,OAAO;oBACL,OAAO;oBACP,UAAU,EAAE,OAAO,CAAC,MAAM;oBAC1B,SAAS,EAAE,OAAO,CAAC,MAAM,IAAI,UAAU;iBACxC,CAAC;YACJ,CAAC;SACF,CAAC;KACH,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,aAAqB,EACrB,QAAgB,EAChB,cAAsB;IAEtB,MAAM,EAAE,YAAY,EAAE,WAAW,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;IAExE,MAAM,OAAO,GAAG,aAAa,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,kBAAkB,IAAI,EAAE,EAAE,CAAC,CAAC;IAEhF,IAAI,cAAc,GAA0B,IAAI,CAAC;IAEjD,MAAM,gBAAgB,GAAG,qBAAqB,CAAC,aAAa,CAAC,CAAC;IAC9D,MAAM,QAAQ,GAAG;QACf,GAAG,gBAAgB;QACnB,QAAQ,EAAE,IAAI,CAAC;YACb,WAAW,EAAE,4EAA4E;YACzF,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;gBACpB,WAAW,EAAE,CAAC;qBACX,IAAI,CAAC,CAAC,OAAO,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;qBACnC,QAAQ,CAAC,sBAAsB,CAAC;gBACnC,aAAa,EAAE,CAAC;qBACb,MAAM,EAAE;qBACR,QAAQ,CAAC,uCAAuC,CAAC;aACrD,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,EAAE,WAAW,EAAE,aAAa,EAAE,EAAE,EAAE;gBAChD,cAAc,GAAG,EAAE,WAAW,EAAE,WAA0B,EAAE,aAAa,EAAE,CAAC;gBAC5E,OAAO,EAAE,EAAE,EAAE,IAAI,EAAE,CAAC;YACtB,CAAC;SACF,CAAC;KACH,CAAC;IAEF,IAAI,CAAC;QACH,MAAM,YAAY,CAAC;YACjB,KAAK,EAAE,OAAO,CAAC
,6BAA6B,CAAC;YAC7C,MAAM,EAAE,wBAAwB;YAChC,MAAM,EAAE,kCAAkC,QAAQ,kBAAkB,cAAc,sFAAsF;YACxK,KAAK,EAAE,QAAQ;YACf,QAAQ,EAAE,WAAW,CAAC,UAAU,CAAC;SAClC,CAAC,CAAC;QAEH,OAAO,cAAc,CAAC;IACxB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,aAAqB,EACrB,QAAgB,EAChB,cAAsB;IAEtB,kCAAkC;IAClC,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,EAAE,qBAAqB,CAAC,CAAC;IAC9D,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC,CAAC;QAC7D,IAAI,MAAM,CAAC,WAAW,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YAC/C,OAAO,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,aAAa,EAAE,MAAM,CAAC,aAAa,EAAE,CAAC;QAClF,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,WAAW;IACb,CAAC;IAED,mBAAmB;IACnB,MAAM,cAAc,GAAG,MAAM,cAAc,CAAC,aAAa,EAAE,QAAQ,EAAE,cAAc,CAAC,CAAC;IAErF,mBAAmB;IACnB,IAAI,cAAc,EAAE,CAAC;QACnB,IAAI,CAAC;YACH,aAAa,CAAC,UAAU,EAAE,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QACrE,CAAC;QAAC,MAAM,CAAC;YACP,4BAA4B;QAC9B,CAAC;IACH,CAAC;IAED,OAAO,cAAc,CAAC;AACxB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,WAAW,CAAC,eAAiC;IAC3D,IAAI,eAAe,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IAC/C,OAAO,eAAe,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,KAAK,OAAO,CAAC,CAAC;AACjE,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content fingerprinting for eval result reuse.
|
|
3
|
+
*
|
|
4
|
+
* A fingerprint captures the eval files + config fields that affect results.
|
|
5
|
+
* If the fingerprint matches and the result is valid, the eval can be skipped.
|
|
6
|
+
*/
|
|
7
|
+
import type { RunnableExperimentConfig } from './types.js';
|
|
8
|
+
/**
 * Compute a fingerprint for an (eval, config) pair.
 *
 * Hashes: all eval directory files + config fields that affect results.
 * Returns a hex SHA-256 digest.
 *
 * @param evalPath - Directory of the eval whose files are hashed.
 * @param config - Experiment configuration; only the fields that affect results contribute to the hash.
 * @returns The SHA-256 digest as a hex string.
 */
|
|
14
|
+
export declare function computeFingerprint(evalPath: string, config: RunnableExperimentConfig): string;
|
|
15
|
+
//# sourceMappingURL=fingerprint.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fingerprint.d.ts","sourceRoot":"","sources":["../../src/lib/fingerprint.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH,OAAO,KAAK,EAAE,wBAAwB,EAAE,MAAM,YAAY,CAAC;AAuC3D;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,wBAAwB,GAAG,MAAM,CAuB7F"}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content fingerprinting for eval result reuse.
|
|
3
|
+
*
|
|
4
|
+
* A fingerprint captures the eval files + config fields that affect results.
|
|
5
|
+
* If the fingerprint matches and the result is valid, the eval can be skipped.
|
|
6
|
+
*/
|
|
7
|
+
import { createHash } from 'crypto';
|
|
8
|
+
import { readFileSync, readdirSync, statSync } from 'fs';
|
|
9
|
+
import { join } from 'path';
|
|
10
|
+
/**
 * Recursively collects all files in a directory, sorted for deterministic hashing.
 * Skips node_modules and .git.
 *
 * File contents are returned as raw bytes rather than decoded text: decoding
 * as UTF-8 replaces every invalid byte sequence with U+FFFD, so two different
 * binary files could end up with identical "content" and therefore an
 * identical fingerprint. For valid UTF-8 files the bytes fed to the hash are
 * the same as before.
 *
 * @param {string} dir - Directory to walk.
 * @param {string} [basePath=''] - Path of `dir` relative to the walk root ('/'-separated); used to build `relativePath`.
 * @returns {Array<{relativePath: string, content: Buffer}>} Files in sorted, depth-first order.
 */
function collectFiles(dir, basePath = '') {
    const files = [];
    for (const entry of readdirSync(dir).sort()) {
        if (entry === 'node_modules' || entry === '.git')
            continue;
        const fullPath = join(dir, entry);
        // Always '/'-joined so the fingerprint does not depend on the platform separator.
        const relativePath = basePath ? `${basePath}/${entry}` : entry;
        if (statSync(fullPath).isDirectory()) {
            for (const nested of collectFiles(fullPath, relativePath)) {
                files.push(nested);
            }
        }
        else {
            files.push({ relativePath, content: readFileSync(fullPath) });
        }
    }
    return files;
}
|
|
32
|
+
/**
 * Derive the SHA-256 fingerprint (hex) of an eval directory plus the
 * result-affecting parts of its experiment config.
 *
 * Each collected file contributes `file:<relativePath>\n`, its content and a
 * NUL terminator, in the order returned by `collectFiles`. The config then
 * contributes `config:` followed by the JSON of agent, model, scripts
 * (sorted copy), timeout, earlyExit and runs. Other config fields are ignored.
 *
 * @param {string} evalPath - Directory of the eval to fingerprint.
 * @param {object} config - Experiment config; `config.scripts` must be iterable.
 * @returns {string} Hex-encoded SHA-256 digest.
 */
export function computeFingerprint(evalPath, config) {
    const digest = createHash('sha256');
    // Files first, in the deterministic order provided by collectFiles.
    for (const { relativePath, content } of collectFiles(evalPath)) {
        digest.update(`file:${relativePath}\n`).update(content).update('\0');
    }
    // Then the config subset; scripts are sorted on a copy so order does not matter
    // and the caller's array is left untouched.
    const sortedScripts = [...config.scripts];
    sortedScripts.sort();
    const relevantConfig = {
        agent: config.agent,
        model: config.model,
        scripts: sortedScripts,
        timeout: config.timeout,
        earlyExit: config.earlyExit,
        runs: config.runs,
    };
    digest.update(`config:${JSON.stringify(relevantConfig)}`);
    return digest.digest('hex');
}
|
|
59
|
+
//# sourceMappingURL=fingerprint.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fingerprint.js","sourceRoot":"","sources":["../../src/lib/fingerprint.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AACpC,OAAO,EAAE,YAAY,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,IAAI,CAAC;AACzD,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAgB5B;;;GAGG;AACH,SAAS,YAAY,CAAC,GAAW,EAAE,WAAmB,EAAE;IACtD,MAAM,KAAK,GAAqD,EAAE,CAAC;IACnE,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAExC,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;QAC5B,IAAI,KAAK,KAAK,cAAc,IAAI,KAAK,KAAK,MAAM;YAAE,SAAS;QAC3D,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QAClC,MAAM,YAAY,GAAG,QAAQ,CAAC,CAAC,CAAC,GAAG,QAAQ,IAAI,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;QAC/D,MAAM,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC;QAEhC,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;YACvB,KAAK,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC,CAAC;QACtD,CAAC;aAAM,CAAC;YACN,KAAK,CAAC,IAAI,CAAC,EAAE,YAAY,EAAE,OAAO,EAAE,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,EAAE,CAAC,CAAC;QACzE,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,kBAAkB,CAAC,QAAgB,EAAE,MAAgC;IACnF,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC;IAElC,gEAAgE;IAChE,MAAM,KAAK,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACrC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,MAAM,CAAC,QAAQ,IAAI,CAAC,YAAY,IAAI,CAAC,CAAC;QAC3C,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC1B,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACpB,CAAC;IAED,yCAAyC;IACzC,MAAM,aAAa,GAA0B;QAC3C,KAAK,EAAE,MAAM,CAAC,KAAK;QACnB,KAAK,EAAE,MAAM,CAAC,KAAK;QACnB,OAAO,EAAE,CAAC,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE;QACnC,OAAO,EAAE,MAAM,CAAC,OAAO;QACvB,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;KAClB,CAAC;IACF,IAAI,CAAC,MAAM,CAAC,UAAU,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC,EAAE,CAAC,CAAC;IAEvD,OAAO,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAC5B,CAAC"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Housekeeping for eval results.
|
|
3
|
+
*
|
|
4
|
+
* After experiments complete, consolidate results:
|
|
5
|
+
* - For each (experiment, eval) pair: keep only the latest valid result
|
|
6
|
+
* - Remove older duplicates and dangling/incomplete results
|
|
7
|
+
* - Remove empty timestamp directories
|
|
8
|
+
*/
|
|
9
|
+
/** Counters returned by `housekeep` describing what a pass removed. */
interface HousekeepingStats {
|
|
10
|
+
/** Eval result directories dropped because a newer complete result for the same eval exists. */
removedDuplicates: number;
|
|
11
|
+
/** Eval result directories dropped because they were not complete. */
removedIncomplete: number;
|
|
12
|
+
/** Timestamp directories dropped because no non-hidden entries were left in them. */
removedEmptyDirs: number;
|
|
13
|
+
}
|
|
14
|
+
/**
 * Run housekeeping on a single experiment's results directory.
 *
 * For each eval: keeps the newest complete result (has summary.json and
 * at least one transcript), removes older duplicates and incomplete results.
 * Removes empty timestamp directories afterward.
 *
 * @param resultsDir - Root results directory; the experiment's results live in `<resultsDir>/<experimentName>`.
 * @param experimentName - Name of the experiment subdirectory to clean up.
 * @param options - Optional settings; with `dry` set nothing is deleted from disk.
 * @returns Counters for removed duplicates, incomplete results and empty timestamp directories.
 */
|
|
21
|
+
export declare function housekeep(resultsDir: string, experimentName: string, options?: {
|
|
22
|
+
dry?: boolean;
|
|
23
|
+
}): HousekeepingStats;
|
|
24
|
+
export {};
|
|
25
|
+
//# sourceMappingURL=housekeeping.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"housekeeping.d.ts","sourceRoot":"","sources":["../../src/lib/housekeeping.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAKH,UAAU,iBAAiB;IACzB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,iBAAiB,EAAE,MAAM,CAAC;IAC1B,gBAAgB,EAAE,MAAM,CAAC;CAC1B;AAED;;;;;;GAMG;AACH,wBAAgB,SAAS,CACvB,UAAU,EAAE,MAAM,EAClB,cAAc,EAAE,MAAM,EACtB,OAAO,CAAC,EAAE;IAAE,GAAG,CAAC,EAAE,OAAO,CAAA;CAAE,GAC1B,iBAAiB,CAkFnB"}
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Housekeeping for eval results.
|
|
3
|
+
*
|
|
4
|
+
* After experiments complete, consolidate results:
|
|
5
|
+
* - For each (experiment, eval) pair: keep only the latest valid result
|
|
6
|
+
* - Remove older duplicates and dangling/incomplete results
|
|
7
|
+
* - Remove empty timestamp directories
|
|
8
|
+
*/
|
|
9
|
+
import { readdirSync, rmSync, existsSync, readFileSync, statSync } from 'fs';
|
|
10
|
+
import { join } from 'path';
|
|
11
|
+
/**
 * Run housekeeping on a single experiment's results directory.
 *
 * Timestamp directories under `<resultsDir>/<experimentName>` are visited
 * newest first (reverse lexicographic order of their names). For each eval:
 * keeps the newest complete result (see `isComplete`), removes older
 * duplicates and incomplete results. Removes empty timestamp directories
 * afterward. Entries whose name starts with "." are ignored throughout.
 *
 * With `options.dry` nothing is deleted, and the returned counters describe
 * what a real run would remove — including timestamp directories that would
 * become empty.
 *
 * @param {string} resultsDir - Root results directory.
 * @param {string} experimentName - Experiment subdirectory to clean up.
 * @param {{dry?: boolean}} [options] - Set `dry` to only count, not delete.
 * @returns {{removedDuplicates: number, removedIncomplete: number, removedEmptyDirs: number}}
 *   Counters for this pass; all zero when the experiment directory is missing or unreadable.
 */
export function housekeep(resultsDir, experimentName, options) {
    const stats = {
        removedDuplicates: 0,
        removedIncomplete: 0,
        removedEmptyDirs: 0,
    };
    const dryRun = Boolean(options?.dry);
    const isVisible = (name) => !name.startsWith('.');
    const experimentDir = join(resultsDir, experimentName);
    if (!existsSync(experimentDir))
        return stats;
    // Get all timestamps sorted newest first
    let timestamps;
    try {
        timestamps = readdirSync(experimentDir)
            .filter(isVisible)
            .filter((t) => {
            try {
                return statSync(join(experimentDir, t)).isDirectory();
            }
            catch {
                return false;
            }
        })
            .sort()
            .reverse();
    }
    catch {
        return stats;
    }
    // Track which evals we've already seen (newest wins)
    const seenEvals = new Set();
    for (const timestamp of timestamps) {
        const tsDir = join(experimentDir, timestamp);
        let evalDirs;
        try {
            evalDirs = readdirSync(tsDir).filter(isVisible);
        }
        catch {
            continue;
        }
        // Entries that would still be present after this pass; lets a dry run
        // tell whether the timestamp directory would end up empty.
        let remaining = evalDirs.length;
        for (const evalDir of evalDirs) {
            const evalResultDir = join(tsDir, evalDir);
            let isDir;
            try {
                isDir = statSync(evalResultDir).isDirectory();
            }
            catch {
                // Vanished or dangling entry: leave it alone instead of aborting the pass.
                continue;
            }
            if (!isDir)
                continue;
            const isDuplicate = seenEvals.has(evalDir);
            if (!isDuplicate && isComplete(evalResultDir)) {
                seenEvals.add(evalDir);
                continue;
            }
            // Older duplicate or incomplete result — remove
            if (!dryRun) {
                rmSync(evalResultDir, { recursive: true });
            }
            if (isDuplicate) {
                stats.removedDuplicates++;
            }
            else {
                stats.removedIncomplete++;
            }
            remaining--;
        }
        // Check if timestamp dir is now empty
        try {
            // A real run re-reads the directory so entries added meanwhile are respected;
            // a dry run deleted nothing, so it relies on the simulated count.
            const isEmpty = dryRun
                ? remaining === 0
                : readdirSync(tsDir).filter(isVisible).length === 0;
            if (isEmpty) {
                if (!dryRun) {
                    rmSync(tsDir, { recursive: true });
                }
                stats.removedEmptyDirs++;
            }
        }
        catch {
            // Directory already removed or inaccessible
        }
    }
    return stats;
}
|
|
97
|
+
/**
 * Decide whether an eval result directory holds a complete result.
 *
 * Requires summary.json. Beyond that, the result is complete when any
 * `run-*` entry contains `transcript-raw.jsonl` or `transcript.json`, or —
 * when no transcript exists — when the summary reports `totalRuns > 0`.
 *
 * @param {string} evalResultDir - Directory of a single eval result.
 * @returns {boolean} `true` when the result is complete; `false` otherwise,
 *   including when the directory or summary cannot be read or parsed.
 */
function isComplete(evalResultDir) {
    const summaryPath = join(evalResultDir, 'summary.json');
    if (!existsSync(summaryPath))
        return false;
    // Look for a transcript in any run directory.
    const transcriptNames = ['transcript-raw.jsonl', 'transcript.json'];
    let hasTranscript;
    try {
        hasTranscript = readdirSync(evalResultDir)
            .filter((entry) => entry.startsWith('run-'))
            .some((entry) => transcriptNames.some((name) => existsSync(join(evalResultDir, entry, name))));
    }
    catch {
        return false;
    }
    if (hasTranscript)
        return true;
    // No transcript, but a summary exists: fall back to the recorded run count.
    try {
        const { totalRuns } = JSON.parse(readFileSync(summaryPath, 'utf-8'));
        return totalRuns > 0;
    }
    catch {
        return false;
    }
}
|
|
131
|
+
//# sourceMappingURL=housekeeping.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"housekeeping.js","sourceRoot":"","sources":["../../src/lib/housekeeping.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,EAAE,UAAU,EAAE,YAAY,EAAE,QAAQ,EAAE,MAAM,IAAI,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAQ5B;;;;;;GAMG;AACH,MAAM,UAAU,SAAS,CACvB,UAAkB,EAClB,cAAsB,EACtB,OAA2B;IAE3B,MAAM,KAAK,GAAsB;QAC/B,iBAAiB,EAAE,CAAC;QACpB,iBAAiB,EAAE,CAAC;QACpB,gBAAgB,EAAE,CAAC;KACpB,CAAC;IAEF,MAAM,aAAa,GAAG,IAAI,CAAC,UAAU,EAAE,cAAc,CAAC,CAAC;IACvD,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;QAAE,OAAO,KAAK,CAAC;IAE7C,yCAAyC;IACzC,IAAI,UAAoB,CAAC;IACzB,IAAI,CAAC;QACH,UAAU,GAAG,WAAW,CAAC,aAAa,CAAC;aACpC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC;aACjC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACZ,IAAI,CAAC;gBACH,OAAO,QAAQ,CAAC,IAAI,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;YACxD,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC,CAAC;aACD,IAAI,EAAE;aACN,OAAO,EAAE,CAAC;IACf,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;IAED,qDAAqD;IACrD,MAAM,SAAS,GAAG,IAAI,GAAG,EAAU,CAAC;IAEpC,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,aAAa,EAAE,SAAS,CAAC,CAAC;QAE7C,IAAI,QAAkB,CAAC;QACvB,IAAI,CAAC;YACH,QAAQ,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;QAClE,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;QAED,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;YAE3C,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,WAAW,EAAE;gBAAE,SAAS;YAErD,IAAI,SAAS,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;gBAC3B,2BAA2B;gBAC3B,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC;oBAClB,MAAM,CAAC,aAAa,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;gBAC7C,CAAC;gBACD,KAAK,CAAC,iBAAiB,EAAE,CAAC;gBAC1B,SAAS;YACX,CAAC;YAED,mCAAmC;YACnC,IAAI,UAAU,CAAC,aAAa,CAAC,EAAE,CAAC;gBAC9B,SAAS,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YACzB,CAAC;iBAAM,CAAC;gBACN,sBAAsB;gBACtB,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC;oBAClB,MAAM,CAAC,aAAa,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;gBAC7C,CAAC;gBACD,K
AAK,CAAC,iBAAiB,EAAE,CAAC;YAC5B,CAAC;QACH,CAAC;QAED,sCAAsC;QACtC,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;YACvE,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC3B,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC;oBAClB,MAAM,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;gBACrC,CAAC;gBACD,KAAK,CAAC,gBAAgB,EAAE,CAAC;YAC3B,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,4CAA4C;QAC9C,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;GAGG;AACH,SAAS,UAAU,CAAC,aAAqB;IACvC,MAAM,WAAW,GAAG,IAAI,CAAC,aAAa,EAAE,cAAc,CAAC,CAAC;IACxD,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC;QAAE,OAAO,KAAK,CAAC;IAE3C,oCAAoC;IACpC,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,WAAW,CAAC,aAAa,CAAC,CAAC;QAC3C,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;YAC5B,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,MAAM,CAAC;gBAAE,SAAS;YACxC,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,EAAE,KAAK,CAAC,CAAC;YAC1C,IACE,UAAU,CAAC,IAAI,CAAC,MAAM,EAAE,sBAAsB,CAAC,CAAC;gBAChD,UAAU,CAAC,IAAI,CAAC,MAAM,EAAE,iBAAiB,CAAC,CAAC,EAC3C,CAAC;gBACD,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;IAED,iDAAiD;IACjD,wFAAwF;IACxF,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC;QAC/D,OAAO,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;IAC/B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC"}
|
package/dist/lib/results.d.ts
CHANGED
|
@@ -23,6 +23,15 @@ export interface SaveResultsOptions {
|
|
|
23
23
|
resultsDir: string;
|
|
24
24
|
/** Experiment name (used for subdirectory) */
|
|
25
25
|
experimentName: string;
|
|
26
|
+
/** Per-eval fingerprints (eval name -> fingerprint hash) */
|
|
27
|
+
fingerprints?: Record<string, string>;
|
|
28
|
+
/** Per-eval classification results (eval name -> classification) */
|
|
29
|
+
classifications?: Record<string, {
|
|
30
|
+
failureType: string;
|
|
31
|
+
failureReason: string;
|
|
32
|
+
}>;
|
|
33
|
+
/** Per-eval validity flags (eval name -> valid). Defaults to true. */
|
|
34
|
+
validity?: Record<string, boolean>;
|
|
26
35
|
}
|
|
27
36
|
/**
|
|
28
37
|
* Save experiment results to disk.
|
|
@@ -52,4 +61,24 @@ export declare function formatRunResult(evalName: string, runNumber: number, tot
|
|
|
52
61
|
* Create a progress indicator for running evals.
|
|
53
62
|
*/
|
|
54
63
|
export declare function createProgressDisplay(evalName: string, runNumber: number, totalRuns: number): string;
|
|
64
|
+
/**
|
|
65
|
+
* A reusable result found by the scanner.
|
|
66
|
+
*/
|
|
67
|
+
export interface ReusableResult {
|
|
68
|
+
evalName: string;
|
|
69
|
+
fingerprint: string;
|
|
70
|
+
passRate: string;
|
|
71
|
+
timestamp: string;
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Scan existing results for an experiment to find reusable eval results.
|
|
75
|
+
*
|
|
76
|
+
* A result is reusable if:
|
|
77
|
+
* 1. Its fingerprint matches the current fingerprint
|
|
78
|
+
* 2. It is "valid" (not marked as invalid by the classifier)
|
|
79
|
+
* 3. It has passedRuns > 0 (successful result worth reusing)
|
|
80
|
+
*
|
|
81
|
+
* Scans all timestamps newest-first and returns the latest match per eval.
|
|
82
|
+
*/
|
|
83
|
+
export declare function scanReusableResults(resultsDir: string, experimentName: string, fingerprints: Record<string, string>): Map<string, ReusableResult>;
|
|
55
84
|
//# sourceMappingURL=results.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"results.d.ts","sourceRoot":"","sources":["../../src/lib/results.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,KAAK,EACV,aAAa,EACb,WAAW,EACX,WAAW,EACX,iBAAiB,EACjB,wBAAwB,EACzB,MAAM,YAAY,CAAC;AACpB,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAGxD;;GAEG;AACH,wBAAgB,wBAAwB,CAAC,WAAW,EAAE,cAAc,GAAG,WAAW,CA4BjF;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,GAAG,WAAW,CAanF;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,wBAAwB,EAChC,KAAK,EAAE,WAAW,EAAE,EACpB,SAAS,EAAE,IAAI,EACf,WAAW,EAAE,IAAI,GAChB,iBAAiB,CAOnB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,iCAAiC;IACjC,UAAU,EAAE,MAAM,CAAC;IACnB,8CAA8C;IAC9C,cAAc,EAAE,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"results.d.ts","sourceRoot":"","sources":["../../src/lib/results.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,KAAK,EACV,aAAa,EACb,WAAW,EACX,WAAW,EACX,iBAAiB,EACjB,wBAAwB,EACzB,MAAM,YAAY,CAAC;AACpB,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAGxD;;GAEG;AACH,wBAAgB,wBAAwB,CAAC,WAAW,EAAE,cAAc,GAAG,WAAW,CA4BjF;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,GAAG,WAAW,CAanF;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,wBAAwB,EAChC,KAAK,EAAE,WAAW,EAAE,EACpB,SAAS,EAAE,IAAI,EACf,WAAW,EAAE,IAAI,GAChB,iBAAiB,CAOnB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,iCAAiC;IACjC,UAAU,EAAE,MAAM,CAAC;IACnB,8CAA8C;IAC9C,cAAc,EAAE,MAAM,CAAC;IACvB,4DAA4D;IAC5D,YAAY,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACtC,oEAAoE;IACpE,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE;QAAE,WAAW,EAAE,MAAM,CAAC;QAAC,aAAa,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACjF,sEAAsE;IACtE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,WAAW,CACzB,OAAO,EAAE,iBAAiB,EAC1B,OAAO,EAAE,kBAAkB,GAC1B,MAAM,CAkHR;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,OAAO,EAAE,iBAAiB,GAAG,MAAM,CAsCrE;AAED;;GAEG;AACH,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,aAAa,GACpB,MAAM,CAYR;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,SAAS,EAAE,MAAM,GAChB,MAAM,CAER;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;;;;;;;;GASG;AACH,wBAAgB,mBAAmB,CACjC,UAAU,EAAE,MAAM,EAClB,cAAc,EAAE,MAAM,EACtB,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GACnC,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC,CA8D7B"}
|