@vercel/agent-eval 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,268 @@
1
+ /**
2
+ * Failure classification for eval results.
3
+ *
4
+ * Classifies failed eval runs as:
5
+ * - "model" — the model tried but wrote incorrect code
6
+ * - "infra" — infrastructure broke (API errors, rate limits, crashes)
7
+ * - "timeout" — the run hit its time limit
8
+ *
9
+ * Uses AI classification via the Vercel AI Gateway. Requires AI_GATEWAY_API_KEY.
10
+ */
11
+ import { readFileSync, readdirSync, statSync, writeFileSync } from 'fs';
12
+ import { join, resolve } from 'path';
13
+ import { tool } from 'ai';
14
+ import { z } from 'zod';
15
+ const CLASSIFIER_SYSTEM_PROMPT = `You are a failure classifier for an AI coding agent benchmark.
16
+
17
+ Your job: figure out WHY a failed eval run failed. Each eval tests whether an AI model can complete a coding task (e.g. migrate to App Router, add a Next.js feature). You have tools to explore the result files.
18
+
19
+ Classify into one of:
20
+ - "model" — the model tried but wrote incorrect code
21
+ - "infra" — infrastructure broke (API errors, rate limits, crashes) and the model never got to do real work
22
+ - "timeout" — the run hit its time limit
23
+
24
+ The eval result directory contains run-1/ through run-N/ subdirectories (one per attempt, N depends on config), plus a summary.json. Each run directory has:
25
+ - result.json — status, error, duration
26
+ - transcript.json or transcript-raw.jsonl (or older results may have transcript.jsonl) — the agent's conversation log
27
+ - outputs/eval.txt — EVAL.ts test output
28
+ - outputs/scripts/*.txt — npm script outputs (e.g. build.txt), if the experiment configured scripts
29
+
30
+ IMPORTANT: The eval harness always runs EVAL.ts tests after the agent finishes, plus any npm scripts configured in the experiment's \`scripts\` array (e.g. \`["build"]\`). These run even if the model produced nothing — tests just run against unmodified scaffold code (TODO placeholders). So test/script failures alone do NOT mean the model wrote code.
31
+
32
+ The transcript is the key evidence. It records every action the model took. If there is no transcript file, or the transcript only shows errors (no tool calls or text output from the model), the model never actually ran — that's "infra". Only classify as "model" if you see evidence in the transcript that the model actually generated code.`;
33
/**
 * Validates and resolves a path, ensuring it stays within the allowed root.
 *
 * The check is purely lexical (based on `path.resolve`); symbolic links inside
 * the root are not followed or inspected here.
 *
 * @param {string} root - Directory that acts as the sandbox root (absolute or relative to cwd).
 * @param {string} relativePath - Path to resolve against `root`.
 * @returns {string | null} The absolute resolved path, or `null` if it escapes `root`.
 */
function safePath(root, relativePath) {
    // Normalise the root first so a relative or un-normalised root still compares correctly.
    const base = resolve(root);
    const resolved = resolve(base, relativePath);
    if (resolved === base)
        return resolved;
    // A bare prefix test would accept siblings such as "/data/root-evil" for root "/data/root".
    // join() yields "<base><sep>_"; dropping the last character leaves the base with exactly
    // one trailing separator, including when the base is itself a filesystem root.
    const basePrefix = join(base, '_').slice(0, -1);
    return resolved.startsWith(basePrefix) ? resolved : null;
}
42
/**
 * Creates sandboxed read-only tools for the AI classifier.
 *
 * Each tool resolves its `path` argument through `safePath`; a path that escapes
 * `evalResultDir` produces `{ error: 'Path outside allowed directory' }`.
 * Filesystem and pattern problems are returned as `{ error }` results so that a
 * bad tool call does not abort the surrounding generation.
 *
 * @param {string} evalResultDir - Root directory the tools are allowed to read.
 * @returns {object} An object with the `list_files`, `read_file` and `grep` tools.
 */
export function createClassifierTools(evalResultDir) {
    return {
        list_files: tool({
            description: 'List files and directories at a path relative to the eval result root. Use "." for the root.',
            inputSchema: z.object({
                path: z
                    .string()
                    .describe('Relative path to list, e.g. "." or "run-1" or "run-1/outputs"'),
            }),
            execute: async ({ path: relPath }) => {
                const target = safePath(evalResultDir, relPath);
                if (!target)
                    return { error: 'Path outside allowed directory' };
                try {
                    const entries = readdirSync(target)
                        .sort()
                        .map((name) => ({
                            name,
                            type: statSync(join(target, name)).isDirectory() ? 'dir' : 'file',
                        }));
                    return { entries };
                }
                catch {
                    return { error: `Cannot list: ${relPath}` };
                }
            },
        }),
        read_file: tool({
            description: 'Read a file relative to the eval result root. For large files, use offset/limit to paginate.',
            inputSchema: z.object({
                path: z
                    .string()
                    .describe('Relative path to the file, e.g. "run-1/result.json"'),
                offset: z
                    .number()
                    .describe('Line offset to start reading from (0-based)')
                    .optional(),
                limit: z
                    .number()
                    .describe('Max number of lines to return')
                    .optional(),
            }),
            execute: async ({ path: relPath, offset: rawOffset, limit: rawLimit }) => {
                // Clamp to non-negative integers: a negative value would make slice()
                // count from the end of the file and return an unrelated window.
                const offset = Math.max(0, Math.trunc(rawOffset ?? 0));
                const limit = Math.max(0, Math.trunc(rawLimit ?? 200));
                const target = safePath(evalResultDir, relPath);
                if (!target)
                    return { error: 'Path outside allowed directory' };
                try {
                    const lines = readFileSync(target, 'utf-8').split('\n');
                    const sliced = lines.slice(offset, offset + limit);
                    return {
                        content: sliced.join('\n'),
                        totalLines: lines.length,
                        showing: `lines ${offset}-${Math.min(offset + limit, lines.length)} of ${lines.length}`,
                    };
                }
                catch {
                    return { error: `Cannot read: ${relPath}` };
                }
            },
        }),
        grep: tool({
            description: 'Search for a pattern in files under a directory. Returns matching lines with context.',
            inputSchema: z.object({
                pattern: z.string().describe('Text or regex pattern to search for'),
                path: z
                    .string()
                    .describe('Relative directory or file to search in, e.g. "." or "run-1"'),
                maxResults: z
                    .number()
                    .describe('Max number of matches to return')
                    .optional(),
            }),
            execute: async ({ pattern, path: relPath, maxResults: rawMax }) => {
                const maxResults = rawMax ?? 20;
                const target = safePath(evalResultDir, relPath);
                if (!target)
                    return { error: 'Path outside allowed directory' };
                // The pattern comes from the model and may not be a valid regular
                // expression; report that as a tool error instead of throwing.
                let regex;
                try {
                    regex = new RegExp(pattern, 'i');
                }
                catch {
                    return { error: `Invalid pattern: ${pattern}` };
                }
                const matches = [];
                // Scans one file line by line, stopping once maxResults is reached.
                const searchFile = (filePath, relName) => {
                    let lines;
                    try {
                        lines = readFileSync(filePath, 'utf-8').split('\n');
                    }
                    catch {
                        // Skip unreadable files
                        return;
                    }
                    for (let i = 0; i < lines.length && matches.length < maxResults; i++) {
                        if (regex.test(lines[i])) {
                            matches.push({
                                file: relName,
                                line: i + 1,
                                // Cap each reported line at 500 characters.
                                text: lines[i].slice(0, 500),
                            });
                        }
                    }
                };
                // Walks a directory tree depth-first; an unreadable entry is skipped
                // without abandoning the rest of its directory.
                const searchDir = (dirPath, prefix) => {
                    let entries;
                    try {
                        entries = readdirSync(dirPath);
                    }
                    catch {
                        // Skip unreadable dirs
                        return;
                    }
                    for (const entry of entries) {
                        if (matches.length >= maxResults)
                            break;
                        const full = join(dirPath, entry);
                        const rel = prefix ? `${prefix}/${entry}` : entry;
                        let isDir;
                        try {
                            isDir = statSync(full).isDirectory();
                        }
                        catch {
                            // e.g. dangling symlink — ignore this entry only
                            continue;
                        }
                        if (isDir) {
                            searchDir(full, rel);
                        }
                        else {
                            searchFile(full, rel);
                        }
                    }
                };
                try {
                    if (statSync(target).isDirectory()) {
                        searchDir(target, relPath === '.' ? '' : relPath);
                    }
                    else {
                        searchFile(target, relPath);
                    }
                }
                catch {
                    return { error: `Path not found: ${relPath}` };
                }
                return {
                    matches,
                    totalFound: matches.length,
                    truncated: matches.length >= maxResults,
                };
            },
        }),
    };
}
187
/**
 * Classify a failure using AI via the Vercel AI Gateway.
 * Requires AI_GATEWAY_API_KEY in the environment (an empty key is passed when unset).
 *
 * The model is given the exploration tools plus a `classify` tool; the verdict is
 * whatever the model submits through `classify`.
 *
 * @param {string} evalResultDir - Directory holding the eval's result files.
 * @param {string} evalName - Eval name, interpolated into the prompt.
 * @param {string} experimentName - Experiment name, interpolated into the prompt.
 * @returns {Promise<{ failureType: string, failureReason: string } | null>}
 *   The submitted verdict, or `null` if generation threw or `classify` was never executed.
 */
export async function classifyWithAI(evalResultDir, evalName, experimentName) {
    const { generateText, hasToolCall, createGateway } = await import('ai');
    const apiKey = process.env.AI_GATEWAY_API_KEY ?? '';
    const gateway = createGateway({ apiKey });
    // Filled in by the classify tool below; stays null if the model never calls it.
    let verdict = null;
    const readOnlyTools = createClassifierTools(evalResultDir);
    const submitVerdict = tool({
        description: 'Submit your final classification. Call this once you have enough evidence.',
        inputSchema: z.object({
            failureType: z
                .enum(['model', 'infra', 'timeout'])
                .describe('The failure category'),
            failureReason: z
                .string()
                .describe('Brief 1-2 sentence explanation of why'),
        }),
        execute: async (submitted) => {
            verdict = {
                failureType: submitted.failureType,
                failureReason: submitted.failureReason,
            };
            return { ok: true };
        },
    });
    const request = {
        model: gateway('anthropic/claude-sonnet-4-5'),
        system: CLASSIFIER_SYSTEM_PROMPT,
        prompt: `Classify the failure for eval "${evalName}" (experiment: ${experimentName}). Use the exploration tools to investigate, then call classify() with your verdict.`,
        tools: { ...readOnlyTools, classify: submitVerdict },
        stopWhen: hasToolCall('classify'),
    };
    try {
        await generateText(request);
    }
    catch {
        // Any failure during generation means no classification.
        return null;
    }
    return verdict;
}
228
/**
 * Classify a failed eval result using AI.
 * Requires AI_GATEWAY_API_KEY in the environment.
 *
 * Caches results in classification.json within the eval result directory: a
 * cached file with both fields set is returned without calling the classifier,
 * and a fresh non-null classification is written back (write errors are ignored).
 *
 * @param {string} evalResultDir - Directory holding the eval's result files.
 * @param {string} evalName - Eval name passed through to the classifier.
 * @param {string} experimentName - Experiment name passed through to the classifier.
 * @returns {Promise<{ failureType: string, failureReason: string } | null>}
 */
export async function classifyFailure(evalResultDir, evalName, experimentName) {
    const cacheFile = join(evalResultDir, 'classification.json');
    // Try the cache first; a missing, unparsable or partial file counts as a miss.
    let cachedVerdict;
    try {
        const { failureType, failureReason } = JSON.parse(readFileSync(cacheFile, 'utf-8'));
        if (failureType && failureReason) {
            cachedVerdict = { failureType, failureReason };
        }
    }
    catch {
        // No cache
    }
    if (cachedVerdict) {
        return cachedVerdict;
    }
    const fresh = await classifyWithAI(evalResultDir, evalName, experimentName);
    if (!fresh) {
        return fresh;
    }
    try {
        writeFileSync(cacheFile, JSON.stringify(fresh, null, 2));
    }
    catch {
        // Non-fatal: caching failed
    }
    return fresh;
}
259
/**
 * Decide whether an auto-retry is appropriate for an eval.
 *
 * @param {Array<{ failureType: string }>} classifications - One entry per failed run.
 * @returns {boolean} `true` only when there is at least one classification and
 *   none of them has failureType "model"; `false` for an empty list.
 */
export function shouldRetry(classifications) {
    const nothingToJudge = classifications.length === 0;
    if (nothingToJudge) {
        return false;
    }
    const anyModelFailure = classifications.some((entry) => entry.failureType === 'model');
    return !anyModelFailure;
}
268
+ //# sourceMappingURL=classifier.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"classifier.js","sourceRoot":"","sources":["../../src/lib/classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,YAAY,EAAE,WAAW,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AACxE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AACrC,OAAO,EAAE,IAAI,EAAE,MAAM,IAAI,CAAC;AAC1B,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB,MAAM,wBAAwB,GAAG;;;;;;;;;;;;;;;;;qVAiBoT,CAAC;AAEtV;;GAEG;AACH,SAAS,QAAQ,CAAC,IAAY,EAAE,YAAoB;IAClD,MAAM,QAAQ,GAAG,OAAO,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;IAC7C,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IAC5C,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CAAC,aAAqB;IACzD,OAAO;QACL,UAAU,EAAE,IAAI,CAAC;YACf,WAAW,EACT,8FAA8F;YAChG,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;gBACpB,IAAI,EAAE,CAAC;qBACJ,MAAM,EAAE;qBACR,QAAQ,CAAC,+DAA+D,CAAC;aAC7E,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE;gBACnC,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;gBAChD,IAAI,CAAC,MAAM;oBAAE,OAAO,EAAE,KAAK,EAAE,gCAAgC,EAAE,CAAC;gBAChE,IAAI,CAAC;oBACH,MAAM,OAAO,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC;oBACpC,MAAM,OAAO,GAAkD,EAAE,CAAC;oBAClE,KAAK,MAAM,KAAK,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;wBACnC,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC;wBAC3C,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;oBAC3E,CAAC;oBACD,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC;gBAC9B,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,EAAE,KAAK,EAAE,gBAAgB,OAAO,EAAE,EAAE,CAAC;gBAC9C,CAAC;YACH,CAAC;SACF,CAAC;QAEF,SAAS,EAAE,IAAI,CAAC;YACd,WAAW,EACT,8FAA8F;YAChG,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;gBACpB,IAAI,EAAE,CAAC;qBACJ,MAAM,EAAE;qBACR,QAAQ,CAAC,qDAAqD,CAAC;gBAClE,MAAM,EAAE,CAAC;qBACN,MAAM,EAAE;qBACR,QAAQ,CAAC,6CAA6C,CAAC;qBACvD,QAAQ,EAAE;gBACb,KAAK,EAAE,CAAC;qBACL,MAAM,EAAE;qBACR,QAAQ,CAAC,+BAA+B,CAAC;qBACzC,QAAQ,EAAE;aACd,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,EAAE,EAAE;gBACvE,MAAM,MAAM,GAAG,SAAS,IAAI,CAAC,CAAC;gBAC9B,MAAM,KAA
K,GAAG,QAAQ,IAAI,GAAG,CAAC;gBAC9B,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;gBAChD,IAAI,CAAC,MAAM;oBAAE,OAAO,EAAE,KAAK,EAAE,gCAAgC,EAAE,CAAC;gBAChE,IAAI,CAAC;oBACH,MAAM,OAAO,GAAG,YAAY,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;oBAC9C,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;oBAClC,MAAM,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,GAAG,KAAK,CAAC,CAAC;oBACnD,OAAO;wBACL,OAAO,EAAE,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;wBAC1B,UAAU,EAAE,KAAK,CAAC,MAAM;wBACxB,OAAO,EAAE,SAAS,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,KAAK,EAAE,KAAK,CAAC,MAAM,CAAC,OAAO,KAAK,CAAC,MAAM,EAAE;qBACxF,CAAC;gBACJ,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,EAAE,KAAK,EAAE,gBAAgB,OAAO,EAAE,EAAE,CAAC;gBAC9C,CAAC;YACH,CAAC;SACF,CAAC;QAEF,IAAI,EAAE,IAAI,CAAC;YACT,WAAW,EACT,uFAAuF;YACzF,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;gBACpB,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,qCAAqC,CAAC;gBACnE,IAAI,EAAE,CAAC;qBACJ,MAAM,EAAE;qBACR,QAAQ,CAAC,8DAA8D,CAAC;gBAC3E,UAAU,EAAE,CAAC;qBACV,MAAM,EAAE;qBACR,QAAQ,CAAC,iCAAiC,CAAC;qBAC3C,QAAQ,EAAE;aACd,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,EAAE,EAAE,EAAE;gBAChE,MAAM,UAAU,GAAG,MAAM,IAAI,EAAE,CAAC;gBAChC,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;gBAChD,IAAI,CAAC,MAAM;oBAAE,OAAO,EAAE,KAAK,EAAE,gCAAgC,EAAE,CAAC;gBAChE,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;gBACvC,MAAM,OAAO,GAAwD,EAAE,CAAC;gBAExE,KAAK,UAAU,UAAU,CAAC,QAAgB,EAAE,OAAe;oBACzD,IAAI,CAAC;wBACH,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;wBAChD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;wBAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,GAAG,UAAU,EAAE,CAAC,EAAE,EAAE,CAAC;4BACrE,IAAI,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;gCACzB,OAAO,CAAC,IAAI,CAAC;oCACX,IAAI,EAAE,OAAO;oCACb,IAAI,EAAE,CAAC,GAAG,CAAC;oCACX,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;iCAC7B,CAAC,CAAC;4BACL,CAAC;wBACH,CAAC;oBACH,CAAC;oBAAC,MAAM,CAAC;wBACP,wBAAwB;oBAC1B,CAAC;gBACH,CAAC;gBAED,KAAK,UAAU,SAAS,CAAC,OAAe,EAAE,MAAc;oBACtD,I
AAI,CAAC;wBACH,MAAM,OAAO,GAAG,WAAW,CAAC,OAAO,CAAC,CAAC;wBACrC,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;4BAC5B,IAAI,OAAO,CAAC,MAAM,IAAI,UAAU;gCAAE,MAAM;4BACxC,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;4BAClC,MAAM,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,IAAI,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;4BAClD,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;4BAC5B,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;gCACvB,MAAM,SAAS,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;4BAC7B,CAAC;iCAAM,CAAC;gCACN,MAAM,UAAU,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;4BAC9B,CAAC;wBACH,CAAC;oBACH,CAAC;oBAAC,MAAM,CAAC;wBACP,uBAAuB;oBACzB,CAAC;gBACH,CAAC;gBAED,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC;oBAC9B,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;wBACvB,MAAM,SAAS,CAAC,MAAM,EAAE,OAAO,KAAK,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;oBAC1D,CAAC;yBAAM,CAAC;wBACN,MAAM,UAAU,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;oBACpC,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,EAAE,KAAK,EAAE,mBAAmB,OAAO,EAAE,EAAE,CAAC;gBACjD,CAAC;gBAED,OAAO;oBACL,OAAO;oBACP,UAAU,EAAE,OAAO,CAAC,MAAM;oBAC1B,SAAS,EAAE,OAAO,CAAC,MAAM,IAAI,UAAU;iBACxC,CAAC;YACJ,CAAC;SACF,CAAC;KACH,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,aAAqB,EACrB,QAAgB,EAChB,cAAsB;IAEtB,MAAM,EAAE,YAAY,EAAE,WAAW,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;IAExE,MAAM,OAAO,GAAG,aAAa,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,kBAAkB,IAAI,EAAE,EAAE,CAAC,CAAC;IAEhF,IAAI,cAAc,GAA0B,IAAI,CAAC;IAEjD,MAAM,gBAAgB,GAAG,qBAAqB,CAAC,aAAa,CAAC,CAAC;IAC9D,MAAM,QAAQ,GAAG;QACf,GAAG,gBAAgB;QACnB,QAAQ,EAAE,IAAI,CAAC;YACb,WAAW,EAAE,4EAA4E;YACzF,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;gBACpB,WAAW,EAAE,CAAC;qBACX,IAAI,CAAC,CAAC,OAAO,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;qBACnC,QAAQ,CAAC,sBAAsB,CAAC;gBACnC,aAAa,EAAE,CAAC;qBACb,MAAM,EAAE;qBACR,QAAQ,CAAC,uCAAuC,CAAC;aACrD,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,EAAE,WAAW,EAAE,aAAa,EAAE,EAAE,EAAE;gBAChD,cAAc,GAAG,EAAE,WAAW,EAAE,WAA0B,EAAE,aAAa,EAAE,CAAC;gBAC5E,OAAO,EAAE,EAAE,EAAE,IAAI,EAAE,CAAC;YACtB,CAAC;SACF,CAAC;KACH,CAAC;IAEF,IAAI,CAAC;QACH,MAAM,YAAY,CAAC;YACjB,KAAK,EAAE,OAAO,CA
AC,6BAA6B,CAAC;YAC7C,MAAM,EAAE,wBAAwB;YAChC,MAAM,EAAE,kCAAkC,QAAQ,kBAAkB,cAAc,sFAAsF;YACxK,KAAK,EAAE,QAAQ;YACf,QAAQ,EAAE,WAAW,CAAC,UAAU,CAAC;SAClC,CAAC,CAAC;QAEH,OAAO,cAAc,CAAC;IACxB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,aAAqB,EACrB,QAAgB,EAChB,cAAsB;IAEtB,kCAAkC;IAClC,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,EAAE,qBAAqB,CAAC,CAAC;IAC9D,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC,CAAC;QAC7D,IAAI,MAAM,CAAC,WAAW,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YAC/C,OAAO,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,aAAa,EAAE,MAAM,CAAC,aAAa,EAAE,CAAC;QAClF,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,WAAW;IACb,CAAC;IAED,mBAAmB;IACnB,MAAM,cAAc,GAAG,MAAM,cAAc,CAAC,aAAa,EAAE,QAAQ,EAAE,cAAc,CAAC,CAAC;IAErF,mBAAmB;IACnB,IAAI,cAAc,EAAE,CAAC;QACnB,IAAI,CAAC;YACH,aAAa,CAAC,UAAU,EAAE,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QACrE,CAAC;QAAC,MAAM,CAAC;YACP,4BAA4B;QAC9B,CAAC;IACH,CAAC;IAED,OAAO,cAAc,CAAC;AACxB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,WAAW,CAAC,eAAiC;IAC3D,IAAI,eAAe,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IAC/C,OAAO,eAAe,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,KAAK,OAAO,CAAC,CAAC;AACjE,CAAC"}
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Content fingerprinting for eval result reuse.
3
+ *
4
+ * A fingerprint captures the eval files + config fields that affect results.
5
+ * If the fingerprint matches and the result is valid, the eval can be skipped.
6
+ */
7
+ import type { RunnableExperimentConfig } from './types.js';
8
+ /**
9
+ * Compute a fingerprint for an (eval, config) pair.
10
+ *
11
+ * Hashes: all eval directory files + config fields that affect results.
12
+ * Returns a hex SHA-256 digest.
13
+ */
14
+ export declare function computeFingerprint(evalPath: string, config: RunnableExperimentConfig): string;
15
+ //# sourceMappingURL=fingerprint.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fingerprint.d.ts","sourceRoot":"","sources":["../../src/lib/fingerprint.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH,OAAO,KAAK,EAAE,wBAAwB,EAAE,MAAM,YAAY,CAAC;AAuC3D;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,wBAAwB,GAAG,MAAM,CAuB7F"}
@@ -0,0 +1,59 @@
1
+ /**
2
+ * Content fingerprinting for eval result reuse.
3
+ *
4
+ * A fingerprint captures the eval files + config fields that affect results.
5
+ * If the fingerprint matches and the result is valid, the eval can be skipped.
6
+ */
7
+ import { createHash } from 'crypto';
8
+ import { readFileSync, readdirSync, statSync } from 'fs';
9
+ import { join } from 'path';
10
/**
 * Recursively collects all files in a directory, sorted for deterministic hashing.
 * Skips node_modules and .git.
 *
 * File contents are returned as raw bytes (Buffer). Decoding as UTF-8 would map
 * every invalid byte sequence to U+FFFD, so two different binary files could end
 * up with identical content. For valid UTF-8 files the bytes are the same ones
 * that hashing the decoded string as UTF-8 would produce.
 *
 * @param {string} dir - Directory to walk.
 * @param {string} [basePath=''] - Relative prefix for entries of `dir` (used by the recursion).
 * @returns {Array<{ relativePath: string, content: Buffer }>} Files in sorted, depth-first
 *   order, with `/`-separated relative paths.
 */
function collectFiles(dir, basePath = '') {
    const files = [];
    for (const entry of readdirSync(dir).sort()) {
        if (entry === 'node_modules' || entry === '.git')
            continue;
        const fullPath = join(dir, entry);
        const relativePath = basePath ? `${basePath}/${entry}` : entry;
        if (statSync(fullPath).isDirectory()) {
            // Append one by one rather than spreading, so very large subtrees
            // cannot exceed the engine's argument-count limit.
            for (const nested of collectFiles(fullPath, relativePath)) {
                files.push(nested);
            }
        }
        else {
            files.push({ relativePath, content: readFileSync(fullPath) });
        }
    }
    return files;
}
32
/**
 * Compute a fingerprint for an (eval, config) pair.
 *
 * Feeds every file of the eval directory (path header, content, NUL terminator)
 * into a SHA-256 hash, followed by the result-affecting config fields serialised
 * as JSON. Scripts are sorted on a copy, so their order does not matter.
 *
 * @param {string} evalPath - Directory containing the eval's files.
 * @param {object} config - Experiment config; reads agent, model, scripts, timeout, earlyExit, runs.
 * @returns {string} Hex SHA-256 digest.
 */
export function computeFingerprint(evalPath, config) {
    const digest = createHash('sha256');
    // Files arrive already sorted, which keeps the hash deterministic.
    for (const { relativePath, content } of collectFiles(evalPath)) {
        digest.update(`file:${relativePath}\n`).update(content).update('\0');
    }
    // Only the fields that influence results take part in the hash.
    const { agent, model, scripts, timeout, earlyExit, runs } = config;
    const sortedScripts = [...scripts];
    sortedScripts.sort();
    const relevantConfig = {
        agent,
        model,
        scripts: sortedScripts,
        timeout,
        earlyExit,
        runs,
    };
    digest.update(`config:${JSON.stringify(relevantConfig)}`);
    return digest.digest('hex');
}
59
+ //# sourceMappingURL=fingerprint.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fingerprint.js","sourceRoot":"","sources":["../../src/lib/fingerprint.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AACpC,OAAO,EAAE,YAAY,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,IAAI,CAAC;AACzD,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAgB5B;;;GAGG;AACH,SAAS,YAAY,CAAC,GAAW,EAAE,WAAmB,EAAE;IACtD,MAAM,KAAK,GAAqD,EAAE,CAAC;IACnE,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAExC,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;QAC5B,IAAI,KAAK,KAAK,cAAc,IAAI,KAAK,KAAK,MAAM;YAAE,SAAS;QAC3D,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QAClC,MAAM,YAAY,GAAG,QAAQ,CAAC,CAAC,CAAC,GAAG,QAAQ,IAAI,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;QAC/D,MAAM,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC;QAEhC,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;YACvB,KAAK,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC,CAAC;QACtD,CAAC;aAAM,CAAC;YACN,KAAK,CAAC,IAAI,CAAC,EAAE,YAAY,EAAE,OAAO,EAAE,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,EAAE,CAAC,CAAC;QACzE,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,kBAAkB,CAAC,QAAgB,EAAE,MAAgC;IACnF,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC;IAElC,gEAAgE;IAChE,MAAM,KAAK,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACrC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,MAAM,CAAC,QAAQ,IAAI,CAAC,YAAY,IAAI,CAAC,CAAC;QAC3C,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC1B,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACpB,CAAC;IAED,yCAAyC;IACzC,MAAM,aAAa,GAA0B;QAC3C,KAAK,EAAE,MAAM,CAAC,KAAK;QACnB,KAAK,EAAE,MAAM,CAAC,KAAK;QACnB,OAAO,EAAE,CAAC,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE;QACnC,OAAO,EAAE,MAAM,CAAC,OAAO;QACvB,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;KAClB,CAAC;IACF,IAAI,CAAC,MAAM,CAAC,UAAU,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC,EAAE,CAAC,CAAC;IAEvD,OAAO,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAC5B,CAAC"}
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Housekeeping for eval results.
3
+ *
4
+ * After experiments complete, consolidate results:
5
+ * - For each (experiment, eval) pair: keep only the latest valid result
6
+ * - Remove older duplicates and dangling/incomplete results
7
+ * - Remove empty timestamp directories
8
+ */
9
+ interface HousekeepingStats {
10
+ removedDuplicates: number;
11
+ removedIncomplete: number;
12
+ removedEmptyDirs: number;
13
+ }
14
+ /**
15
+ * Run housekeeping on a single experiment's results directory.
16
+ *
17
+ * For each eval: keeps the newest complete result (has summary.json and
18
+ * at least one transcript), removes older duplicates and incomplete results.
19
+ * Removes empty timestamp directories afterward.
20
+ */
21
+ export declare function housekeep(resultsDir: string, experimentName: string, options?: {
22
+ dry?: boolean;
23
+ }): HousekeepingStats;
24
+ export {};
25
+ //# sourceMappingURL=housekeeping.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"housekeeping.d.ts","sourceRoot":"","sources":["../../src/lib/housekeeping.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAKH,UAAU,iBAAiB;IACzB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,iBAAiB,EAAE,MAAM,CAAC;IAC1B,gBAAgB,EAAE,MAAM,CAAC;CAC1B;AAED;;;;;;GAMG;AACH,wBAAgB,SAAS,CACvB,UAAU,EAAE,MAAM,EAClB,cAAc,EAAE,MAAM,EACtB,OAAO,CAAC,EAAE;IAAE,GAAG,CAAC,EAAE,OAAO,CAAA;CAAE,GAC1B,iBAAiB,CAwFnB"}
@@ -0,0 +1,148 @@
1
+ /**
2
+ * Housekeeping for eval results.
3
+ *
4
+ * After experiments complete, consolidate results:
5
+ * - For each (experiment, eval) pair: keep only the latest valid result
6
+ * - Remove older duplicates and dangling/incomplete results
7
+ * - Remove empty timestamp directories
8
+ */
9
+ import { readdirSync, rmSync, existsSync, readFileSync, statSync } from 'fs';
10
+ import { join } from 'path';
11
/**
 * Returns true when `path` can be stat'ed and is a directory; false otherwise
 * (including when stat fails, e.g. for a dangling symlink).
 */
function isDirectorySafe(path) {
    try {
        return statSync(path).isDirectory();
    }
    catch {
        return false;
    }
}
/**
 * Run housekeeping on a single experiment's results directory.
 *
 * For each eval: keeps the newest complete result (as judged by `isComplete`),
 * removes older duplicates and incomplete results. Removes empty timestamp
 * directories afterward. Entries whose names start with "." are ignored.
 *
 * @param {string} resultsDir - Root results directory.
 * @param {string} experimentName - Experiment subdirectory under `resultsDir`.
 * @param {{ dry?: boolean }} [options] - With `dry` set, nothing is deleted and the
 *   returned stats describe what would be removed.
 * @returns {{ removedDuplicates: number, removedIncomplete: number, removedEmptyDirs: number }}
 */
export function housekeep(resultsDir, experimentName, options) {
    const dryRun = Boolean(options?.dry);
    const stats = {
        removedDuplicates: 0,
        removedIncomplete: 0,
        removedEmptyDirs: 0,
    };
    const experimentDir = join(resultsDir, experimentName);
    if (!existsSync(experimentDir))
        return stats;
    // Get all timestamps sorted newest first (lexicographic order, reversed)
    let timestamps;
    try {
        timestamps = readdirSync(experimentDir)
            .filter((t) => !t.startsWith('.'))
            .filter((t) => isDirectorySafe(join(experimentDir, t)))
            .sort()
            .reverse();
    }
    catch {
        return stats;
    }
    // Track which (eval, fingerprint) pairs we've already seen (newest wins).
    // Results with different fingerprints (e.g. smoke vs full run) are not
    // duplicates of each other and should coexist.
    const seenEvals = new Set();
    for (const timestamp of timestamps) {
        const tsDir = join(experimentDir, timestamp);
        let evalDirs;
        try {
            evalDirs = readdirSync(tsDir).filter((d) => !d.startsWith('.'));
        }
        catch {
            continue;
        }
        // In dry-run mode nothing is deleted, so remember how many entries of this
        // timestamp directory would have been removed; needed for the emptiness check.
        let pendingRemovals = 0;
        for (const evalDir of evalDirs) {
            const evalResultDir = join(tsDir, evalDir);
            // A failing stat (e.g. dangling symlink) must not abort the whole run.
            if (!isDirectorySafe(evalResultDir))
                continue;
            // Read fingerprint to distinguish different configs (e.g. smoke vs full)
            const fingerprint = readFingerprint(evalResultDir);
            const dedupeKey = fingerprint ? `${evalDir}:${fingerprint}` : evalDir;
            const isDuplicate = seenEvals.has(dedupeKey);
            if (!isDuplicate && isComplete(evalResultDir)) {
                // Newest complete result for this key — keep it.
                seenEvals.add(dedupeKey);
                continue;
            }
            // Either an older duplicate with the same key, or an incomplete result.
            if (dryRun) {
                pendingRemovals++;
            }
            else {
                rmSync(evalResultDir, { recursive: true });
            }
            if (isDuplicate) {
                stats.removedDuplicates++;
            }
            else {
                stats.removedIncomplete++;
            }
        }
        // Check if timestamp dir is now empty (or would be, in dry-run mode)
        try {
            const remaining = readdirSync(tsDir).filter((d) => !d.startsWith('.'));
            if (remaining.length - pendingRemovals === 0) {
                if (!dryRun) {
                    rmSync(tsDir, { recursive: true });
                }
                stats.removedEmptyDirs++;
            }
        }
        catch {
            // Directory already removed or inaccessible
        }
    }
    return stats;
}
102
/**
 * Look up the `fingerprint` field stored in an eval result's summary.json.
 *
 * @param {string} evalResultDir - Eval result directory expected to contain summary.json.
 * @returns {*} The stored fingerprint value, or `undefined` when the file is
 *   missing, unreadable, not valid JSON, or has no such field.
 */
function readFingerprint(evalResultDir) {
    try {
        const summaryFile = join(evalResultDir, 'summary.json');
        const { fingerprint } = JSON.parse(readFileSync(summaryFile, 'utf-8'));
        return fingerprint;
    }
    catch {
        // Any read/parse problem is treated the same as "no fingerprint".
        return undefined;
    }
}
114
/**
 * Check if an eval result directory is complete.
 *
 * Complete means: summary.json exists AND either
 *  - at least one `run-*` entry contains a transcript file, or
 *  - summary.json parses and reports `totalRuns > 0` (runs were recorded even
 *    though no transcript was written).
 *
 * Recognised transcript names are transcript-raw.jsonl, transcript.json and the
 * legacy transcript.jsonl, so older results are not treated as incomplete.
 *
 * @param {string} evalResultDir - Eval result directory to inspect.
 * @returns {boolean} Whether the result should be kept as complete.
 */
function isComplete(evalResultDir) {
    const summaryPath = join(evalResultDir, 'summary.json');
    if (!existsSync(summaryPath))
        return false;
    const transcriptNames = ['transcript-raw.jsonl', 'transcript.json', 'transcript.jsonl'];
    // Check for at least one transcript
    try {
        const hasTranscript = readdirSync(evalResultDir)
            .filter((entry) => entry.startsWith('run-'))
            .some((entry) => transcriptNames.some((name) => existsSync(join(evalResultDir, entry, name))));
        if (hasTranscript)
            return true;
    }
    catch {
        return false;
    }
    // No transcript found — but summary.json exists.
    // Still consider it complete if the summary recorded at least one run.
    try {
        const summary = JSON.parse(readFileSync(summaryPath, 'utf-8'));
        return summary.totalRuns > 0;
    }
    catch {
        return false;
    }
}
148
+ //# sourceMappingURL=housekeeping.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"housekeeping.js","sourceRoot":"","sources":["../../src/lib/housekeeping.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,EAAE,UAAU,EAAE,YAAY,EAAE,QAAQ,EAAE,MAAM,IAAI,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAQ5B;;;;;;GAMG;AACH,MAAM,UAAU,SAAS,CACvB,UAAkB,EAClB,cAAsB,EACtB,OAA2B;IAE3B,MAAM,KAAK,GAAsB;QAC/B,iBAAiB,EAAE,CAAC;QACpB,iBAAiB,EAAE,CAAC;QACpB,gBAAgB,EAAE,CAAC;KACpB,CAAC;IAEF,MAAM,aAAa,GAAG,IAAI,CAAC,UAAU,EAAE,cAAc,CAAC,CAAC;IACvD,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;QAAE,OAAO,KAAK,CAAC;IAE7C,yCAAyC;IACzC,IAAI,UAAoB,CAAC;IACzB,IAAI,CAAC;QACH,UAAU,GAAG,WAAW,CAAC,aAAa,CAAC;aACpC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC;aACjC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACZ,IAAI,CAAC;gBACH,OAAO,QAAQ,CAAC,IAAI,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;YACxD,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC,CAAC;aACD,IAAI,EAAE;aACN,OAAO,EAAE,CAAC;IACf,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;IAED,0EAA0E;IAC1E,uEAAuE;IACvE,+CAA+C;IAC/C,MAAM,SAAS,GAAG,IAAI,GAAG,EAAU,CAAC;IAEpC,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,aAAa,EAAE,SAAS,CAAC,CAAC;QAE7C,IAAI,QAAkB,CAAC;QACvB,IAAI,CAAC;YACH,QAAQ,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;QAClE,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;QAED,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;YAE3C,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,WAAW,EAAE;gBAAE,SAAS;YAErD,yEAAyE;YACzE,MAAM,WAAW,GAAG,eAAe,CAAC,aAAa,CAAC,CAAC;YACnD,MAAM,SAAS,GAAG,WAAW,CAAC,CAAC,CAAC,GAAG,OAAO,IAAI,WAAW,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;YAEtE,IAAI,SAAS,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC7B,iDAAiD;gBACjD,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC;oBAClB,MAAM,CAAC,aAAa,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;gBAC7C,CAAC;gBACD,KAAK,CAAC,iBAAiB,EAAE,CAAC;gBAC1B,SAAS;YACX,CAAC;YAED,mCAAmC;YACnC,IAAI,UAAU,CAAC,aAAa,CAAC,EAAE,CAAC;gBAC9B,SAAS,CAAC,GAAG,
CAAC,SAAS,CAAC,CAAC;YAC3B,CAAC;iBAAM,CAAC;gBACN,sBAAsB;gBACtB,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC;oBAClB,MAAM,CAAC,aAAa,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;gBAC7C,CAAC;gBACD,KAAK,CAAC,iBAAiB,EAAE,CAAC;YAC5B,CAAC;QACH,CAAC;QAED,sCAAsC;QACtC,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;YACvE,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC3B,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC;oBAClB,MAAM,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;gBACrC,CAAC;gBACD,KAAK,CAAC,gBAAgB,EAAE,CAAC;YAC3B,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,4CAA4C;QAC9C,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,aAAqB;IAC5C,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,IAAI,CAAC,aAAa,EAAE,cAAc,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;QACvF,OAAO,OAAO,CAAC,WAAW,CAAC;IAC7B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,SAAS,UAAU,CAAC,aAAqB;IACvC,MAAM,WAAW,GAAG,IAAI,CAAC,aAAa,EAAE,cAAc,CAAC,CAAC;IACxD,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC;QAAE,OAAO,KAAK,CAAC;IAE3C,oCAAoC;IACpC,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,WAAW,CAAC,aAAa,CAAC,CAAC;QAC3C,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;YAC5B,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,MAAM,CAAC;gBAAE,SAAS;YACxC,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,EAAE,KAAK,CAAC,CAAC;YAC1C,IACE,UAAU,CAAC,IAAI,CAAC,MAAM,EAAE,sBAAsB,CAAC,CAAC;gBAChD,UAAU,CAAC,IAAI,CAAC,MAAM,EAAE,iBAAiB,CAAC,CAAC,EAC3C,CAAC;gBACD,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;IAED,iDAAiD;IACjD,wFAAwF;IACxF,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC;QAC/D,OAAO,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;IAC/B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC"}
@@ -23,6 +23,15 @@ export interface SaveResultsOptions {
23
23
  resultsDir: string;
24
24
  /** Experiment name (used for subdirectory) */
25
25
  experimentName: string;
26
+ /** Per-eval fingerprints (eval name -> fingerprint hash) */
27
+ fingerprints?: Record<string, string>;
28
+ /** Per-eval classification results (eval name -> classification) */
29
+ classifications?: Record<string, {
30
+ failureType: string;
31
+ failureReason: string;
32
+ }>;
33
+ /** Per-eval validity flags (eval name -> valid). Defaults to true. */
34
+ validity?: Record<string, boolean>;
26
35
  }
27
36
  /**
28
37
  * Save experiment results to disk.
@@ -52,4 +61,24 @@ export declare function formatRunResult(evalName: string, runNumber: number, tot
52
61
  * Create a progress indicator for running evals.
53
62
  */
54
63
  export declare function createProgressDisplay(evalName: string, runNumber: number, totalRuns: number): string;
64
+ /**
65
+ * A reusable result found by the scanner.
66
+ */
67
+ export interface ReusableResult {
68
+ evalName: string;
69
+ fingerprint: string;
70
+ passRate: string;
71
+ timestamp: string;
72
+ }
73
+ /**
74
+ * Scan existing results for an experiment to find reusable eval results.
75
+ *
76
+ * A result is reusable if:
77
+ * 1. Its fingerprint matches the current fingerprint
78
+ * 2. It is "valid" (not marked as invalid by the classifier)
79
+ * 3. It has passedRuns > 0 (successful result worth reusing)
80
+ *
81
+ * Scans all timestamps newest-first and returns the latest match per eval.
82
+ */
83
+ export declare function scanReusableResults(resultsDir: string, experimentName: string, fingerprints: Record<string, string>): Map<string, ReusableResult>;
55
84
  //# sourceMappingURL=results.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"results.d.ts","sourceRoot":"","sources":["../../src/lib/results.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,KAAK,EACV,aAAa,EACb,WAAW,EACX,WAAW,EACX,iBAAiB,EACjB,wBAAwB,EACzB,MAAM,YAAY,CAAC;AACpB,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAGxD;;GAEG;AACH,wBAAgB,wBAAwB,CAAC,WAAW,EAAE,cAAc,GAAG,WAAW,CA4BjF;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,GAAG,WAAW,CAanF;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,wBAAwB,EAChC,KAAK,EAAE,WAAW,EAAE,EACpB,SAAS,EAAE,IAAI,EACf,WAAW,EAAE,IAAI,GAChB,iBAAiB,CAOnB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,iCAAiC;IACjC,UAAU,EAAE,MAAM,CAAC;IACnB,8CAA8C;IAC9C,cAAc,EAAE,MAAM,CAAC;CACxB;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,WAAW,CACzB,OAAO,EAAE,iBAAiB,EAC1B,OAAO,EAAE,kBAAkB,GAC1B,MAAM,CAoGR;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,OAAO,EAAE,iBAAiB,GAAG,MAAM,CAsCrE;AAED;;GAEG;AACH,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,aAAa,GACpB,MAAM,CAYR;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,SAAS,EAAE,MAAM,GAChB,MAAM,CAER"}
1
+ {"version":3,"file":"results.d.ts","sourceRoot":"","sources":["../../src/lib/results.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,KAAK,EACV,aAAa,EACb,WAAW,EACX,WAAW,EACX,iBAAiB,EACjB,wBAAwB,EACzB,MAAM,YAAY,CAAC;AACpB,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAGxD;;GAEG;AACH,wBAAgB,wBAAwB,CAAC,WAAW,EAAE,cAAc,GAAG,WAAW,CA4BjF;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,GAAG,WAAW,CAanF;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,wBAAwB,EAChC,KAAK,EAAE,WAAW,EAAE,EACpB,SAAS,EAAE,IAAI,EACf,WAAW,EAAE,IAAI,GAChB,iBAAiB,CAOnB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,iCAAiC;IACjC,UAAU,EAAE,MAAM,CAAC;IACnB,8CAA8C;IAC9C,cAAc,EAAE,MAAM,CAAC;IACvB,4DAA4D;IAC5D,YAAY,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACtC,oEAAoE;IACpE,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE;QAAE,WAAW,EAAE,MAAM,CAAC;QAAC,aAAa,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACjF,sEAAsE;IACtE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,WAAW,CACzB,OAAO,EAAE,iBAAiB,EAC1B,OAAO,EAAE,kBAAkB,GAC1B,MAAM,CAkHR;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,OAAO,EAAE,iBAAiB,GAAG,MAAM,CAsCrE;AAED;;GAEG;AACH,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,aAAa,GACpB,MAAM,CAYR;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,SAAS,EAAE,MAAM,GAChB,MAAM,CAER;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;;;;;;;;GASG;AACH,wBAAgB,mBAAmB,CACjC,UAAU,EAAE,MAAM,EAClB,cAAc,EAAE,MAAM,EACtB,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GACnC,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC,CA8D7B"}