@ls-stack/agent-eval 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/app-CKa9TjXw.mjs +244 -0
- package/dist/apps/web/dist/assets/index-BUz24J7O.css +1 -0
- package/dist/apps/web/dist/assets/index-Dm50Ynbs.js +109 -0
- package/dist/apps/web/dist/favicon.svg +20 -0
- package/dist/apps/web/dist/index.html +34 -0
- package/dist/bin.d.mts +1 -0
- package/dist/bin.mjs +41 -0
- package/dist/cli-CwEFLP0w.mjs +3422 -0
- package/dist/index.d.mts +2043 -0
- package/dist/index.mjs +3 -0
- package/dist/runner-CD5aDJ0C.mjs +15 -0
- package/dist/runner-Ck4X0H3p.mjs +2 -0
- package/dist/src-BDRmaWFu.mjs +2 -0
- package/package.json +71 -0
|
@@ -0,0 +1,3422 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
|
|
3
|
+
import { dirname, extname, join, relative, resolve } from "node:path";
|
|
4
|
+
import { AsyncLocalStorage } from "node:async_hooks";
|
|
5
|
+
import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
|
|
6
|
+
import { z } from "zod/v4";
|
|
7
|
+
import { watch } from "chokidar";
|
|
8
|
+
import { glob } from "glob";
|
|
9
|
+
import { existsSync } from "node:fs";
|
|
10
|
+
import { resultify } from "t-result";
|
|
11
|
+
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
12
|
+
import { spawn, spawnSync } from "node:child_process";
|
|
13
|
+
//#region ../sdk/src/defineEval.ts
|
|
14
|
+
/** Process-wide table of registered evals, keyed by eval id. */
const evalRegistry = /* @__PURE__ */ new Map();

/**
 * Expose the in-memory registry of evals defined in the current process.
 *
 * @returns The live registry `Map` itself (not a copy).
 */
function getEvalRegistry() {
	return evalRegistry;
}
|
|
19
|
+
/**
 * Store an eval definition in the process-wide registry under its `id`, so it
 * can be looked up after the eval module has been imported.
 *
 * The stored entry exposes `id`, `title`, and a `use` accessor that hands the
 * full definition to a callback and returns the callback's result.
 *
 * @param definition Eval definition; `id` and `title` are read from it.
 */
function defineEval(definition) {
	const entry = {
		id: definition.id,
		title: definition.title,
		use: (callback) => callback(definition)
	};
	evalRegistry.set(definition.id, entry);
}
|
|
30
|
+
//#endregion
|
|
31
|
+
//#region ../sdk/src/repoFile.ts
|
|
32
|
+
/**
 * Build a reference to a repository file. The reference can be emitted via
 * `setEvalOutput(...)` and rendered by a column configured with
 * `format: 'image' | 'audio' | 'video' | 'file'`.
 *
 * @param path Relative or absolute path to the repository file.
 * @param mimeType Optional MIME type hint for UI rendering.
 * @returns An object of the form `{ source: "repo", path, mimeType }`.
 */
function repoFile(path, mimeType) {
	const reference = { source: "repo", path, mimeType };
	return reference;
}
|
|
47
|
+
//#endregion
|
|
48
|
+
//#region ../sdk/src/runtime.ts
|
|
49
|
+
/** Async-context storage holding the scope object of the running eval case. */
const scopeStorage = new AsyncLocalStorage();

/** Number of `runInEvalScope` calls currently in flight; 0 means no eval is running. */
let activeEvalScopeCount = 0;
|
|
51
|
+
/**
 * Error type raised when an eval assertion fails during case execution.
 * Instances carry `name === "EvalAssertionError"`.
 */
var EvalAssertionError = class extends Error {
	name = "EvalAssertionError";
	/** @param message Human-readable description of the failed assertion. */
	constructor(message) {
		super(message);
	}
};
|
|
58
|
+
/**
 * Look up the eval scope bound to the active async context.
 *
 * Short-circuits to `undefined` without consulting the async storage while no
 * eval scope is running anywhere in the process.
 *
 * @returns The current scope object, or `undefined` when there is none.
 */
function getCurrentScope() {
	return activeEvalScopeCount === 0 ? void 0 : scopeStorage.getStore();
}
|
|
63
|
+
/**
 * Tell whether the current async execution is inside an active eval case.
 *
 * Lets shared workflow code branch on eval-only behavior without importing or
 * inspecting the full eval scope.
 *
 * @returns `true` when a scope is available for the current async context.
 */
function isInEvalScope() {
	const scope = getCurrentScope();
	return scope !== void 0;
}
|
|
72
|
+
/**
 * Attach cache context (adapter, mode, eval id, fingerprint) to a scope by
 * overwriting its `cacheContext` property.
 *
 * Runner-internal helper called immediately before the user's `execute`
 * function runs inside `runInEvalScope`.
 *
 * @param scope Scope object to update in place.
 * @param context Cache context to store on the scope.
 */
function setScopeCacheContext(scope, context) {
	Object.assign(scope, { cacheContext: context });
}
|
|
81
|
+
/**
 * Run a callback inside a fresh eval case scope and report what happened.
 *
 * The callback never makes this function reject: a thrown value is captured
 * and returned in `error` (non-`Error` values are wrapped via `String(...)`).
 *
 * @param caseId Identifier stored on the new scope.
 * @param fn Callback executed with the new scope bound to the async context.
 * @param options Optional settings; only `cacheContext` is read.
 * @returns `{ result, scope, error }`; `result` is `undefined` on failure and
 * `error` is `undefined` on success.
 */
async function runInEvalScope(caseId, fn, options = {}) {
	const scope = {
		caseId,
		outputs: {},
		assertionFailures: [],
		spans: [],
		checkpoints: /* @__PURE__ */ new Map(),
		spanStack: [],
		activeSpanStack: [],
		recordingStack: [],
		replayingDepth: 0,
		cacheContext: options.cacheContext
	};
	const runCase = async () => {
		try {
			const result = await fn();
			return { result, scope, error: void 0 };
		} catch (thrown) {
			const error = thrown instanceof Error ? thrown : new Error(String(thrown));
			return { result: void 0, scope, error };
		}
	};
	// The counter is decremented in `finally` so it is restored even if the
	// storage call itself rejects.
	activeEvalScopeCount++;
	try {
		return await scopeStorage.run(scope, runCase);
	} finally {
		activeEvalScopeCount--;
	}
}
|
|
119
|
+
/**
 * Append an op to the innermost recording frame of a scope.
 *
 * Does nothing while the scope is replaying (`replayingDepth > 0`) or when no
 * recording frame is on the stack.
 *
 * @param scope Scope whose `recordingStack` is consulted.
 * @param op Op object pushed onto the top frame's `ops` array.
 */
function recordOpIfActive(scope, op) {
	const isReplaying = scope.replayingDepth > 0;
	if (isReplaying) return;
	const currentFrame = scope.recordingStack.at(-1);
	if (!currentFrame) return;
	currentFrame.ops.push(op);
}
|
|
124
|
+
/**
 * Build an assertion-failure record from a message and an optional error.
 *
 * @param message Failure message.
 * @param error Optional error; its `stack` is copied when truthy.
 * @returns `{ message, stack }` when a stack is available, else `{ message }`.
 */
function toAssertionFailure$1(message, error = void 0) {
	if (!error?.stack) return { message };
	return { message, stack: error.stack };
}
|
|
130
|
+
/**
 * Record or replace an output value for the current case scope.
 *
 * Supported values include scalars, JSON-safe objects/arrays, explicit file
 * refs, and native `Blob`/`File` instances for media or file columns.
 * The call is a no-op when there is no current eval scope.
 *
 * @param key Output key to write.
 * @param value Value stored under `key`.
 */
function setEvalOutput(key, value) {
	const scope = getCurrentScope();
	if (!scope) return;
	scope.outputs[key] = value;
	const op = { kind: "setOutput", key, value };
	recordOpIfActive(scope, op);
}
|
|
146
|
+
/**
 * Add a numeric delta to an output value in the current case scope.
 *
 * A missing output is initialised to `delta`. If the existing value is
 * non-numeric, an assertion failure is recorded and the output is left
 * untouched (and no op is recorded). The call is a no-op when there is no
 * current eval scope.
 *
 * @param key Output key to increment.
 * @param delta Amount to add.
 */
function incrementEvalOutput(key, delta) {
	const scope = getCurrentScope();
	if (!scope) return;
	const current = scope.outputs[key];
	const isMissing = current === void 0;
	if (!isMissing && typeof current !== "number") {
		const failure = toAssertionFailure$1(`incrementEvalOutput("${key}"): existing value is ${typeof current}, expected number`);
		scope.assertionFailures.push(failure);
		return;
	}
	scope.outputs[key] = isMissing ? delta : current + delta;
	recordOpIfActive(scope, {
		kind: "incrementOutput",
		key,
		delta
	});
}
|
|
176
|
+
/**
 * Assert a condition for the current eval case and throw on failure.
 *
 * Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
 * can safely reuse `evalAssert(...)` when it also runs outside an eval.
 *
 * @param condition Value tested for truthiness.
 * @param message Message used for the recorded failure and the thrown error.
 * @throws {EvalAssertionError} When `condition` is falsy inside an eval scope.
 */
function evalAssert(condition, message) {
	if (condition) return;
	const scope = getCurrentScope();
	if (!scope) return;
	const failure = new EvalAssertionError(message);
	// Record before throwing so the failure stays on the scope even if a
	// caller catches the error.
	scope.assertionFailures.push(toAssertionFailure$1(message, failure));
	throw failure;
}
|
|
190
|
+
//#endregion
|
|
191
|
+
//#region ../sdk/src/tracer.ts
|
|
192
|
+
/** Module-level sequence number appended to every generated span id. */
let spanIdCounter = 0;

/**
 * Produce a span id of the form `span_<epoch-ms>_<sequence>`.
 *
 * @returns The new id; the sequence part increases by one on every call.
 */
function generateSpanId() {
	spanIdCounter += 1;
	const parts = ["span", String(Date.now()), String(spanIdCounter)];
	return parts.join("_");
}
|
|
197
|
+
/**
 * Apply an updater to the innermost active span of the current eval scope.
 * Does nothing when there is no scope or no active span.
 *
 * @param update Callback invoked with the active span record.
 */
function updateCurrentSpan(update) {
	const scope = getCurrentScope();
	const innermost = scope?.activeSpanStack.at(-1);
	if (innermost) update(innermost);
}
|
|
202
|
+
/**
 * Create a span handle whose setters do nothing.
 *
 * @returns A fresh object with no-op `setName`, `setAttribute`, and
 * `setAttributes` members.
 */
function noopActiveSpan() {
	const handle = {
		setName: () => {},
		setAttribute: () => {},
		setAttributes: () => {}
	};
	return handle;
}
|
|
209
|
+
/**
 * Shallow-merge attributes into a span record.
 *
 * The span receives a new `attributes` object; the previous object is not
 * mutated. Keys in `attributes` win over existing keys.
 *
 * @param span Span record to update.
 * @param attributes Attributes to merge in.
 */
function mergeSpanAttributes(span, attributes) {
	const merged = { ...span.attributes, ...attributes };
	span.attributes = merged;
}
|
|
215
|
+
/**
 * Create a handle whose setters write straight to the given span record.
 *
 * @param span Span record the handle is bound to.
 * @returns Object exposing `setName`, `setAttribute`, and `setAttributes`.
 */
function createSpanHandle(span) {
	const handle = {
		setName(value) {
			span.name = value;
		},
		setAttribute(key, value) {
			const single = { [key]: value };
			mergeSpanAttributes(span, single);
		},
		setAttributes(value) {
			mergeSpanAttributes(span, value);
		}
	};
	return handle;
}
|
|
228
|
+
/**
 * Ambient handle for the active span in the current async context.
 *
 * Every setter targets the innermost active span of the current eval scope.
 * Calls are no-ops when executed outside of `evalTracer.span(...)`.
 */
const evalSpan = {
	/** Rename the active span. */
	setName(value) {
		updateCurrentSpan((active) => {
			active.name = value;
		});
	},
	/** Set a single attribute on the active span. */
	setAttribute(key, value) {
		const single = { [key]: value };
		updateCurrentSpan((active) => {
			mergeSpanAttributes(active, single);
		});
	},
	/** Merge several attributes into the active span. */
	setAttributes(value) {
		updateCurrentSpan((active) => {
			mergeSpanAttributes(active, value);
		});
	}
};
|
|
250
|
+
/**
 * Run `fn` inside a new trace span of the current eval scope.
 *
 * Outside an eval scope `fn` simply runs with a no-op span handle. Inside a
 * scope a span record is pushed, `fn` receives a handle bound to it, and the
 * record ends as `"ok"` or `"error"` (the original error is rethrown).
 *
 * When `info.cache` is set, the scope has a cache context, and the scope is
 * not replaying, the span is cache-aware:
 * - mode `"use"`: a cache hit replays the stored recording and returns its
 *   stored return value without calling `fn`; a miss runs `fn` and writes.
 * - mode `"refresh"`: always runs `fn` and writes a new entry.
 * - any other mode is tagged `"bypass"`; only the literal `"bypass"` skips the write.
 *
 * @param info Span description: `kind`, `name`, optional `attributes`, and
 * optional `cache` (`key`, optional `namespace`).
 * @param fn Async body; called with the span handle.
 * @returns Whatever `fn` returns, or the cached return value on a hit.
 */
async function traceSpan(info, fn) {
	const scope = getCurrentScope();
	if (!scope) return await fn(noopActiveSpan());
	const id = generateSpanId();
	const record = {
		id,
		parentId: scope.activeSpanStack.at(-1)?.id ?? null,
		caseId: scope.caseId,
		kind: info.kind,
		name: info.name,
		startedAt: new Date().toISOString(),
		endedAt: null,
		status: "running",
		attributes: info.attributes
	};
	scope.spans.push(record);
	scope.spanStack.push(id);
	scope.activeSpanStack.push(record);
	const handle = createSpanHandle(record);
	// Stamp the terminal status and end time on the record.
	const settle = (status) => {
		record.status = status;
		record.endedAt = new Date().toISOString();
	};
	try {
		const cacheOpts = info.cache;
		const ctx = scope.cacheContext;
		const cacheActive = cacheOpts !== void 0 && ctx !== void 0 && scope.replayingDepth === 0;
		if (!cacheActive) {
			const result = await fn(handle);
			settle("ok");
			return result;
		}
		const namespace = cacheOpts.namespace ?? `${ctx.evalId}__${info.name}`;
		const keyHash = hashCacheKey({
			namespace,
			codeFingerprint: ctx.codeFingerprint,
			key: cacheOpts.key
		});
		mergeSpanAttributes(record, {
			"cache.key": keyHash,
			"cache.namespace": namespace
		});
		if (ctx.mode === "use") {
			const hit = await ctx.adapter.lookup(namespace, keyHash);
			if (hit) {
				const storedAt = hit.storedAt;
				mergeSpanAttributes(record, {
					"cache.status": "hit",
					"cache.storedAt": storedAt,
					"cache.age": Date.now() - new Date(storedAt).getTime()
				});
				replayRecording(scope, record, hit.recording);
				settle("ok");
				return hit.recording.returnValue;
			}
			mergeSpanAttributes(record, { "cache.status": "miss" });
		} else {
			const status = ctx.mode === "refresh" ? "refresh" : "bypass";
			mergeSpanAttributes(record, { "cache.status": status });
		}
		// Record ops emitted while the body runs; the frame is popped even if
		// the body throws.
		// NOTE(review): ops recorded by a nested cached span land in that
		// span's own frame and do not appear to be copied into this outer
		// frame — confirm nested cached spans replay outputs as intended.
		const frame = {
			baseSpanIndex: scope.spans.length,
			cachedSpanId: id,
			ops: []
		};
		scope.recordingStack.push(frame);
		let bodyResult;
		try {
			bodyResult = await fn(handle);
		} finally {
			scope.recordingStack.pop();
		}
		appendSubSpanOps(scope, frame);
		if (ctx.mode !== "bypass") {
			const recording = {
				returnValue: toJsonSafe(bodyResult),
				finalAttributes: stripCacheAttributes(record.attributes),
				ops: frame.ops
			};
			await ctx.adapter.write({
				version: 1,
				key: keyHash,
				namespace,
				spanName: info.name,
				spanKind: info.kind,
				storedAt: new Date().toISOString(),
				codeFingerprint: ctx.codeFingerprint,
				recording
			});
		}
		settle("ok");
		return bodyResult;
	} catch (error) {
		settle("error");
		record.error = error instanceof Error ? {
			name: error.name,
			message: error.message,
			stack: error.stack
		} : { message: String(error) };
		throw error;
	} finally {
		scope.spanStack.pop();
		scope.activeSpanStack.pop();
	}
}
|
|
355
|
+
/**
 * Trace builder used to create hierarchical spans and checkpoints during eval
 * execution.
 */
const evalTracer = {
	/** Run a callback inside a new trace span and record its lifecycle. */
	span: traceSpan,
	/**
	 * Record a named point-in-time value alongside the trace.
	 *
	 * Stores `data` in the scope's checkpoint map, pushes a `"checkpoint"`
	 * span under the span on top of `spanStack`, and, unless the scope is
	 * replaying, appends a checkpoint op to the innermost recording frame.
	 * No-op outside an eval scope.
	 */
	checkpoint(name, data) {
		const scope = getCurrentScope();
		if (!scope) return;
		scope.checkpoints.set(name, data);
		const id = generateSpanId();
		const checkpointSpan = {
			id,
			parentId: scope.spanStack.at(-1) ?? null,
			caseId: scope.caseId,
			kind: "checkpoint",
			name,
			startedAt: new Date().toISOString(),
			endedAt: new Date().toISOString(),
			status: "ok",
			attributes: { value: data }
		};
		scope.spans.push(checkpointSpan);
		if (scope.replayingDepth !== 0) return;
		const currentFrame = scope.recordingStack.at(-1);
		if (!currentFrame) return;
		currentFrame.ops.push({
			kind: "checkpoint",
			name,
			data
		});
	}
};
|
|
390
|
+
/**
 * Build a queryable trace tree helper from a flat span list and checkpoints.
 *
 * @param spans Flat array of span records (`id`, `parentId`, `name`, `kind`).
 * Root spans have `parentId === null`.
 * @param checkpoints Checkpoint collection exposed unchanged on the result.
 * @returns Helper with `spans`, `rootSpans` (computed once at build time),
 * `findSpan(name)`, `findSpansByKind(kind)`, `flattenDfs()`, and `checkpoints`.
 */
function buildTraceTree(spans, checkpoints) {
	return {
		spans,
		rootSpans: spans.filter((span) => span.parentId === null),
		/** Return the first span with the given name, or `undefined`. */
		findSpan(name) {
			return spans.find((span) => span.name === name);
		},
		/** Return every span of the given kind, in list order. */
		findSpansByKind(kind) {
			return spans.filter((span) => span.kind === kind);
		},
		/**
		 * Return the spans reachable from the roots in depth-first pre-order;
		 * siblings keep their order from `spans`. Spans whose parent chain
		 * never reaches a root are omitted.
		 */
		flattenDfs() {
			// Index children by parent once per call so the walk is linear,
			// rather than rescanning the whole list for every visited span.
			// Built per call so spans appended after construction are seen.
			const childrenByParent = /* @__PURE__ */ new Map();
			for (const span of spans) {
				const siblings = childrenByParent.get(span.parentId);
				if (siblings) siblings.push(span);
				else childrenByParent.set(span.parentId, [span]);
			}
			const ordered = [];
			const visit = (parentId) => {
				const children = childrenByParent.get(parentId);
				if (!children) return;
				for (const child of children) {
					ordered.push(child);
					visit(child.id);
				}
			};
			visit(null);
			return ordered;
		},
		checkpoints
	};
}
|
|
415
|
+
/**
 * Hash the components of a cache key into a deterministic hex digest.
 *
 * @param input Key components; reduced to a composite key via
 * `getCompositeKey` before hashing.
 * @returns SHA-256 digest of the composite key, hex-encoded.
 */
function hashCacheKey(input) {
	const hasher = createHash("sha256");
	hasher.update(getCompositeKey(input));
	return hasher.digest("hex");
}
|
|
419
|
+
/**
 * Convert a value to its JSON-safe form by round-tripping it through
 * `JSON.stringify` / `JSON.parse`.
 *
 * @param value Any value.
 * @returns The parsed JSON copy, or `undefined` when the value has no JSON
 * representation (`undefined`, a function, or a symbol).
 * @throws Whatever `JSON.stringify` throws (e.g. on circular structures or
 * BigInt values).
 */
function toJsonSafe(value) {
	if (value === void 0) return void 0;
	const text = JSON.stringify(value);
	// JSON.stringify returns undefined (not a string) for functions and
	// symbols; passing that to JSON.parse would throw a SyntaxError.
	if (text === void 0) return void 0;
	return JSON.parse(text);
}
|
|
424
|
+
/**
 * Copy span attributes, leaving out every key that starts with `cache.`.
 *
 * @param attributes Attribute object, or a falsy value.
 * @returns A new object with the remaining attributes (empty for falsy input).
 */
function stripCacheAttributes(attributes) {
	const kept = {};
	if (!attributes) return kept;
	for (const key of Object.keys(attributes)) {
		if (key.startsWith("cache.")) continue;
		kept[key] = attributes[key];
	}
	return kept;
}
|
|
430
|
+
/**
 * Serialize a span and all of its descendants into a nested structure.
 *
 * Each node keeps `kind`, `name`, `attributes`, `status`, and `error`, plus a
 * `children` array built recursively from the spans whose `parentId` equals
 * the node's id. Ids and timestamps are not included.
 *
 * @param scope Scope whose `spans` list is searched.
 * @param spanId Id of the span to serialize.
 * @returns The serialized node; a `"custom"` / `"unknown"` placeholder with no
 * children when no span has the given id.
 */
function serializeSubSpanTree(scope, spanId) {
	const source = scope.spans.find((span) => span.id === spanId);
	if (!source) {
		return {
			kind: "custom",
			name: "unknown",
			attributes: void 0,
			status: "ok",
			error: void 0,
			children: []
		};
	}
	const children = [];
	for (const span of scope.spans) {
		if (span.parentId === spanId) children.push(serializeSubSpanTree(scope, span.id));
	}
	const { kind, name, attributes, status, error } = source;
	return { kind, name, attributes, status, error, children };
}
|
|
450
|
+
/**
 * Append one `"subSpan"` op to a recording frame for every direct child of the
 * frame's cached span.
 *
 * Only spans at or after `frame.baseSpanIndex` in `scope.spans` are
 * considered; each matching child is serialized together with its subtree.
 *
 * @param scope Scope whose spans are scanned.
 * @param frame Recording frame (`baseSpanIndex`, `cachedSpanId`, `ops`).
 */
function appendSubSpanOps(scope, frame) {
	for (let index = frame.baseSpanIndex; index < scope.spans.length; index += 1) {
		const span = scope.spans[index];
		if (span?.parentId !== frame.cachedSpanId) continue;
		const serialized = serializeSubSpanTree(scope, span.id);
		frame.ops.push({ kind: "subSpan", span: serialized });
	}
}
|
|
459
|
+
/**
 * Re-apply a stored recording to the scope on behalf of a span.
 *
 * Applies every recorded op in order and then merges the recording's final
 * attributes (when there are any) into `parentSpan`. `scope.replayingDepth` is
 * raised for the duration and restored even when an op throws.
 *
 * @param scope Scope receiving the replayed effects.
 * @param parentSpan Span record that owns the recording.
 * @param recording Stored recording (`ops`, `finalAttributes`).
 */
function replayRecording(scope, parentSpan, recording) {
	scope.replayingDepth += 1;
	try {
		for (const op of recording.ops) {
			applyRecordingOp(scope, parentSpan, op);
		}
		const { finalAttributes } = recording;
		const hasAttributes = Object.keys(finalAttributes).length > 0;
		if (hasAttributes) mergeSpanAttributes(parentSpan, finalAttributes);
	} finally {
		scope.replayingDepth -= 1;
	}
}
|
|
468
|
+
/**
 * Apply a single recorded op to the scope.
 *
 * - `"setOutput"` overwrites the output.
 * - `"incrementOutput"` initialises or adds to a numeric output; a non-numeric
 *   existing value is recorded as an assertion failure instead.
 * - `"checkpoint"` stores the value in the checkpoint map.
 * - any other op is treated as a serialized sub-span and replayed under
 *   `parentSpan`.
 *
 * @param scope Scope to mutate.
 * @param parentSpan Span record that replayed sub-spans are attached to.
 * @param op Recorded op.
 */
function applyRecordingOp(scope, parentSpan, op) {
	switch (op.kind) {
		case "setOutput":
			scope.outputs[op.key] = op.value;
			return;
		case "incrementOutput": {
			const current = scope.outputs[op.key];
			if (current === void 0) {
				scope.outputs[op.key] = op.delta;
			} else if (typeof current === "number") {
				scope.outputs[op.key] = current + op.delta;
			} else {
				scope.assertionFailures.push({ message: `replay incrementEvalOutput("${op.key}"): existing value is ${typeof current}, expected number` });
			}
			return;
		}
		case "checkpoint":
			scope.checkpoints.set(op.name, op.data);
			return;
		default:
			replaySerializedSpan(scope, parentSpan.id, op.span);
	}
}
|
|
486
|
+
/**
 * Recreate a serialized span, and recursively its children, in the scope.
 *
 * Each replayed span gets a freshly generated id and uses one timestamp,
 * taken at replay time, for both `startedAt` and `endedAt`.
 *
 * @param scope Scope whose `spans` list receives the replayed spans.
 * @param parentId Id assigned as the replayed span's parent.
 * @param serialized Serialized span node (`kind`, `name`, `status`,
 * `attributes`, `error`, `children`).
 */
function replaySerializedSpan(scope, parentId, serialized) {
	const id = generateSpanId();
	const timestamp = new Date().toISOString();
	const { kind, name, status, attributes, error, children } = serialized;
	scope.spans.push({
		id,
		parentId,
		caseId: scope.caseId,
		kind,
		name,
		startedAt: timestamp,
		endedAt: timestamp,
		status,
		attributes,
		error
	});
	for (const child of children) {
		replaySerializedSpan(scope, id, child);
	}
}
|
|
504
|
+
//#endregion
|
|
505
|
+
//#region ../shared/src/schemas/display.ts
|
|
506
|
+
/** Schema for a scalar cell: string, number, boolean, or null. */
const scalarCellSchema = z.union([z.string(), z.number(), z.boolean(), z.null()]);

/** Recursive schema for JSON-like cells: scalars, arrays, and string-keyed records. */
const jsonCellSchema = z.lazy(() => z.union([scalarCellSchema, z.array(jsonCellSchema), z.record(z.string(), jsonCellSchema)]));

/** Schema for a file reference whose `source` is `"repo"`. */
const repoFileRefSchema = z.object({
	source: z.literal("repo"),
	path: z.string(),
	mimeType: z.string().optional()
});

/** Schema for a file reference whose `source` is `"run"`. */
const runArtifactRefSchema = z.object({
	source: z.literal("run"),
	artifactId: z.string(),
	mimeType: z.string(),
	fileName: z.string().optional()
});

/** Schema accepting either kind of file reference. */
const fileRefSchema = z.union([repoFileRefSchema, runArtifactRefSchema]);

/** Schema for numeric presentation options used by number-formatted values. */
const numberDisplayOptionsSchema = z.object({
	notation: z.enum(["standard", "compact"]).optional(),
	compactDisplay: z.enum(["short", "long"]).optional(),
	prefix: z.string().optional(),
	suffix: z.string().optional(),
	decimalPlaces: z.number().int().min(0).optional()
});

/** Schema for the supported column rendering kinds in list views. */
const columnKindSchema = z.enum(["string", "number", "boolean"]);

/** Schema for the built-in column formatting presets. */
const columnFormatSchema = z.enum([
	"boolean", "markdown", "json",
	"image", "audio", "video", "file",
	"duration", "percent", "number",
	"passFail", "stars"
]);

/** Schema describing a rendered column in the eval results table. */
const columnDefSchema = z.object({
	key: z.string(),
	label: z.string(),
	kind: columnKindSchema,
	format: columnFormatSchema.optional(),
	numberFormat: numberDisplayOptionsSchema.optional(),
	isScore: z.boolean().optional(),
	isManualScore: z.boolean().optional(),
	passThreshold: z.number().optional(),
	maxStars: z.number().int().min(2).optional(),
	hideInTable: z.boolean().optional(),
	sortable: z.boolean().optional(),
	align: z.enum(["left", "center", "right"]).optional()
});

/** Schema for any supported value that can populate a table cell. */
const cellValueSchema = z.union([jsonCellSchema, fileRefSchema]);
|
|
579
|
+
//#endregion
|
|
580
|
+
//#region ../shared/src/schemas/trace.ts
|
|
581
|
+
/** Schema for the semantic categories used to classify trace spans. */
const traceSpanKindSchema = z.enum([
	"eval", "agent", "llm", "tool",
	"retrieval", "scorer", "checkpoint", "custom"
]);

/** Schema for the supported presentation formats of trace attributes. */
const traceAttributeDisplayFormatSchema = z.enum(["string", "number", "duration", "json"]);

/** Schema for the UI locations where a trace attribute can appear. */
const traceAttributeDisplayPlacementSchema = z.enum(["tree", "detail", "section"]);

/** Schema for resolved trace display rules sent to the UI. */
const traceAttributeDisplaySchema = z.object({
	key: z.string().optional(),
	path: z.string(),
	label: z.string().optional(),
	format: traceAttributeDisplayFormatSchema.optional(),
	numberFormat: numberDisplayOptionsSchema.optional(),
	placements: z.array(traceAttributeDisplayPlacementSchema).optional(),
	scope: z.enum(["self", "subtree"]).optional(),
	mode: z.enum(["all", "last", "sum"]).optional()
});

/** Schema for trace display config after transforms have been resolved. */
const traceDisplayConfigSchema = z.object({
	attributes: z.array(traceAttributeDisplaySchema).optional()
});

/**
 * Schema for authored trace display rules accepted from user config. Same
 * fields as the resolved rule, plus an optional `transform` function.
 */
const traceAttributeDisplayInputSchema = z.object({
	key: z.string().optional(),
	path: z.string(),
	label: z.string().optional(),
	format: traceAttributeDisplayFormatSchema.optional(),
	numberFormat: numberDisplayOptionsSchema.optional(),
	placements: z.array(traceAttributeDisplayPlacementSchema).optional(),
	scope: z.enum(["self", "subtree"]).optional(),
	mode: z.enum(["all", "last", "sum"]).optional(),
	transform: z.custom(
		(value) => value === void 0 || typeof value === "function",
		{ message: "Expected a transform function" }
	).optional()
});

/** Schema for authored trace display config in eval or workspace config. */
const traceDisplayInputConfigSchema = z.object({
	attributes: z.array(traceAttributeDisplayInputSchema).optional()
});

/** Schema for a persisted trace span captured during case execution. */
const traceSpanSchema = z.object({
	id: z.string(),
	parentId: z.string().nullable(),
	caseId: z.string(),
	kind: traceSpanKindSchema,
	name: z.string(),
	startedAt: z.string(),
	endedAt: z.string().nullable(),
	status: z.enum(["running", "ok", "error", "cancelled"]),
	attributes: z.record(z.string(), z.unknown()).optional(),
	error: z.object({
		name: z.string().optional(),
		message: z.string(),
		stack: z.string().optional()
	}).optional()
});
|
|
662
|
+
//#endregion
|
|
663
|
+
//#region ../shared/src/schemas/chart.ts
|
|
664
|
+
/** Chart type rendered for a single eval history chart. */
const evalChartTypeSchema = z.enum(["area", "line", "bar"]);

/**
 * Run-level metric sourced from the aggregated `RunSummary` for a run, rather
 * than from a per-case column.
 */
const evalChartBuiltinMetricSchema = z.enum(["passRate", "durationMs"]);

/** Reducer applied to a numeric column across all cases of a single run. */
const evalChartAggregateSchema = z.enum(["avg", "sum", "min", "max", "latest", "passThresholdRate"]);

/**
 * Semantic color token resolved to a theme color by the web UI. The SDK does
 * not emit raw hex so authored evals stay decoupled from the web theme.
 */
const evalChartColorSchema = z.enum(["accent", "accentDim", "success", "error", "warning", "textMuted"]);

/** Y-axis placement for a plotted series on a dual-axis chart. */
const evalChartAxisSchema = z.enum(["left", "right"]);

/**
 * One plotted series on an eval history chart. `builtin` metrics come from the
 * per-run `RunSummary`; `column` metrics aggregate a per-case score or
 * `setEvalOutput` column across the run using `aggregate`.
 */
const evalChartMetricSchema = z.discriminatedUnion("source", [
	z.object({
		source: z.literal("builtin"),
		metric: evalChartBuiltinMetricSchema,
		label: z.string().optional(),
		color: evalChartColorSchema.optional(),
		axis: evalChartAxisSchema.optional()
	}),
	z.object({
		source: z.literal("column"),
		/** Matches a declared score key or a `setEvalOutput` key on the eval. */
		key: z.string().min(1),
		aggregate: evalChartAggregateSchema,
		label: z.string().optional(),
		color: evalChartColorSchema.optional(),
		axis: evalChartAxisSchema.optional()
	})
]);

/** Extra field rendered only in the tooltip, not plotted as a series. */
const evalChartTooltipExtraSchema = z.discriminatedUnion("source", [
	z.object({
		source: z.literal("builtin"),
		metric: evalChartBuiltinMetricSchema,
		label: z.string().optional()
	}),
	z.object({
		source: z.literal("column"),
		key: z.string().min(1),
		aggregate: evalChartAggregateSchema,
		label: z.string().optional()
	})
]);

/**
 * Authored configuration for one eval history chart rendered in `EvalCard`.
 * Authors declare a list of these via `EvalDefinition.charts` — the UI renders
 * each entry as its own chart frame, stacked in authoring order.
 */
const evalChartConfigSchema = z.object({
	/** Optional heading shown above the chart frame in the UI. */
	heading: z.string().optional(),
	type: evalChartTypeSchema,
	/** At least one series must be declared. */
	metrics: z.array(evalChartMetricSchema).min(1),
	/**
	 * Per-axis Y domain. Omit either side for automatic scaling. When unset the
	 * chart auto-scales — there is no implicit `[0, 1]` clamp.
	 */
	yDomain: z.object({
		left: z.object({ min: z.number().optional(), max: z.number().optional() }).optional(),
		right: z.object({ min: z.number().optional(), max: z.number().optional() }).optional()
	}).optional(),
	tooltipExtras: z.array(evalChartTooltipExtraSchema).optional()
});

/**
 * Ordered list of history charts rendered for an eval. Opt-in: when omitted or
 * empty, the UI renders no history chart at all.
 */
const evalChartsConfigSchema = z.array(evalChartConfigSchema);
|
|
761
|
+
//#endregion
|
|
762
|
+
//#region ../shared/src/schemas/eval.ts
|
|
763
|
+
/** Freshness signal derived from the latest relevant run plus git state. */
|
|
764
|
+
const evalFreshnessStatusSchema = z.enum([
|
|
765
|
+
"fresh",
|
|
766
|
+
"stale",
|
|
767
|
+
"outdated"
|
|
768
|
+
]);
|
|
769
|
+
/** Reducer used to collapse a column's per-case values into a single stat. */
|
|
770
|
+
const evalStatAggregateSchema = z.enum([
|
|
771
|
+
"avg",
|
|
772
|
+
"min",
|
|
773
|
+
"max",
|
|
774
|
+
"sum",
|
|
775
|
+
"last"
|
|
776
|
+
]);
|
|
777
|
+
/**
 * One entry in the EvalCard stats row, discriminated on `kind`. The built-in
 * kinds (`cases`, `passRate`, `duration`) use latest run totals; `column`
 * aggregates a score or numeric output column across the latest run.
 */
const evalStatItemSchema = z.discriminatedUnion("kind", [
	z.object({ kind: z.literal("cases") }),
	z.object({ kind: z.literal("passRate"), accent: z.boolean().optional() }),
	z.object({ kind: z.literal("duration") }),
	z.object({
		kind: z.literal("column"),
		key: z.string(),
		label: z.string().optional(),
		aggregate: evalStatAggregateSchema,
		format: columnFormatSchema.optional(),
		accent: z.boolean().optional()
	})
]);
|
|
797
|
+
/** Ordered list of stat items rendered in the EvalCard stats row. */
const evalStatsConfigSchema = z.array(evalStatItemSchema);
|
|
799
|
+
/** Schema summarizing a discovered eval for list and overview screens. */
const evalSummarySchema = z.object({
	id: z.string(),
	title: z.string().optional(),
	/** Eval file path relative to the active workspace root. */
	filePath: z.string(),
	/** Indicates the eval file changed since the latest passing result. */
	stale: z.boolean(),
	/** Indicates the latest comparable run is from an older commit and too old. */
	outdated: z.boolean(),
	/** Latest derived freshness signal for this eval. */
	freshnessStatus: evalFreshnessStatusSchema,
	/** Timestamp for the latest run considered when deriving freshness. */
	latestRunAt: z.string().nullable(),
	/** Commit SHA recorded on the latest run considered for freshness. */
	latestRunCommitSha: z.string().nullable(),
	/** Current workspace commit SHA when the summary was requested. */
	currentCommitSha: z.string().nullable(),
	columnDefs: z.array(columnDefSchema),
	caseCount: z.number().nullable(),
	lastRunStatus: z.enum(["pass", "fail", "error", "running", "cancelled", "unscored"]).nullable(),
	/**
	 * Optional per-eval stats row configuration for the EvalCard. Opt-in: when
	 * omitted or empty, the UI renders no stats row at all.
	 */
	stats: evalStatsConfigSchema.optional(),
	/**
	 * Ordered per-eval history chart configuration for the EvalCard. Opt-in:
	 * when omitted or empty, the UI renders no history chart at all.
	 */
	charts: evalChartsConfigSchema.optional()
});
|
|
838
|
+
/** Schema for one case row in an eval run result table. */
const caseRowSchema = z.object({
	caseId: z.string(),
	evalId: z.string(),
	status: z.enum(["pending", "running", "pass", "fail", "error", "cancelled"]),
	latencyMs: z.number().nullable(),
	/** May be absent, `null`, or a number. */
	costUsd: z.number().nullable().optional(),
	/** Cell values keyed by column key. */
	columns: z.record(z.string(), cellValueSchema),
	/** Winning trial index for the persisted case result. */
	trial: z.number()
});
|
|
856
|
+
/** Structured assertion failure metadata captured for one case run. */
const assertionFailureSchema = z.object({
	/** Human-readable assertion failure message shown in the UI and artifacts. */
	message: z.string(),
	/** Stack trace captured from the originating error when available. */
	stack: z.string().optional()
});
|
|
863
|
+
/**
 * Accepts a bare string and converts it to `{ message }`, the same shape the
 * structured assertion failure schema produces when no stack is present.
 */
const legacyAssertionFailureSchema = z.string().transform((message) => {
	return { message };
});
|
|
864
|
+
/** Trace payload captured while computing one score for a case. */
const scoreTraceSchema = z.object({
	/** Spans recorded during the score computation. */
	trace: z.array(traceSpanSchema),
	traceDisplay: traceDisplayConfigSchema
});
|
|
869
|
+
/** Schema for the detailed payload shown when opening a specific case. */
const caseDetailSchema = z.object({
	caseId: z.string(),
	evalId: z.string(),
	status: z.enum(["pending", "running", "pass", "fail", "error", "cancelled"]),
	input: z.unknown(),
	trace: z.array(traceSpanSchema),
	traceDisplay: traceDisplayConfigSchema,
	/**
	 * Separate trace payloads emitted by score computation. These are kept out
	 * of `trace` so derive-from-execution metrics do not include judge/scorer
	 * work.
	 */
	scoringTraces: z.record(z.string(), scoreTraceSchema).optional(),
	columns: z.record(z.string(), cellValueSchema),
	/** Each item is either a structured failure or a bare string message. */
	assertionFailures: z.array(z.union([assertionFailureSchema, legacyAssertionFailureSchema])),
	/** Error details for the case, or `null`. */
	error: z.object({
		name: z.string().optional(),
		message: z.string(),
		stack: z.string().optional()
	}).nullable(),
	/** Winning trial index for the persisted case detail. */
	trial: z.number()
});
|
|
900
|
+
//#endregion
|
|
901
|
+
//#region ../shared/src/schemas/cache.ts
|
|
902
|
+
/**
 * Mode that controls how the cache is consulted for a given run.
 *
 * - `use`: read cache on hit, write on miss. Default.
 * - `bypass`: never read, never write.
 * - `refresh`: never read, always write (forces re-execution and overwrites).
 */
const cacheModeSchema = z.enum(["use", "bypass", "refresh"]);
|
|
914
|
+
/** Options accepted by an `evalTracer.span` call to opt the span into caching. */
const spanCacheOptionsSchema = z.object({
	/** Arbitrary JSON-safe value used to derive the cache key. */
	key: z.unknown(),
	/** Override the default namespace (`${evalId}__${spanName}`). */
	namespace: z.string().optional()
});
|
|
921
|
+
/** Summary of a single persisted cache entry, used by list/delete endpoints. */
const cacheListItemSchema = z.object({
	key: z.string(),
	namespace: z.string(),
	spanName: z.string(),
	spanKind: traceSpanKindSchema,
	storedAt: z.string(),
	codeFingerprint: z.string(),
	sizeBytes: z.number()
});
|
|
931
|
+
/**
 * Zod schema for `SerializedCacheSpan`. `children` is declared through
 * `z.lazy` so the schema can refer to itself for nested spans.
 */
const serializedCacheSpanSchema = z.object({
	kind: traceSpanKindSchema,
	name: z.string(),
	attributes: z.record(z.string(), z.unknown()).optional(),
	status: z.enum(["running", "ok", "error", "cancelled"]),
	error: z.object({
		name: z.string().optional(),
		message: z.string(),
		stack: z.string().optional()
	}).optional()
}).extend({
	children: z.lazy(() => z.array(serializedCacheSpanSchema))
});
|
|
948
|
+
/**
 * One captured operation performed while a cached span's body executed,
 * discriminated on `kind`.
 *
 * Operations are replayed in order against a fresh scope on cache hit to
 * reproduce the observable effects of the original run.
 */
const cacheRecordingOpSchema = z.discriminatedUnion("kind", [
	z.object({ kind: z.literal("setOutput"), key: z.string(), value: z.unknown() }),
	z.object({ kind: z.literal("incrementOutput"), key: z.string(), delta: z.number() }),
	z.object({ kind: z.literal("checkpoint"), name: z.string(), data: z.unknown() }),
	z.object({ kind: z.literal("subSpan"), span: serializedCacheSpanSchema })
]);
|
|
975
|
+
/** Captured observable effects + return value of a cached span body. */
const cacheRecordingSchema = z.object({
	returnValue: z.unknown(),
	finalAttributes: z.record(z.string(), z.unknown()),
	/** Recorded operations, in the order they are stored. */
	ops: z.array(cacheRecordingOpSchema)
});
|
|
981
|
+
/** Persisted cache file containing metadata and a recording. */
const cacheEntrySchema = z.object({
	/** Only the literal `1` is accepted. */
	version: z.literal(1),
	key: z.string(),
	namespace: z.string(),
	spanName: z.string(),
	spanKind: traceSpanKindSchema,
	storedAt: z.string(),
	codeFingerprint: z.string(),
	recording: cacheRecordingSchema
});
|
|
992
|
+
//#endregion
|
|
993
|
+
//#region ../shared/src/schemas/config.ts
|
|
994
|
+
/** Strategy used to collapse repeated trials into one stored case result. */
const trialSelectionModeSchema = z.enum([
	"lowestScore",
	"median"
]);
|
|
996
|
+
/** Zod schema for validating `agent-evals.config.ts` input. */
const agentEvalsConfigSchema = z.object({
	workspaceRoot: z.string().optional(),
	/** The only required field: a list of strings. */
	include: z.array(z.string()),
	defaultTrials: z.number().optional(),
	trialSelection: trialSelectionModeSchema.optional(),
	concurrency: z.number().optional(),
	staleAfterDays: z.number().optional(),
	traceDisplay: traceDisplayInputConfigSchema.optional(),
	cache: z.object({
		enabled: z.boolean().optional(),
		dir: z.string().optional()
	}).optional()
});
|
|
1010
|
+
//#endregion
|
|
1011
|
+
//#region ../shared/src/schemas/run.ts
|
|
1012
|
+
/** Schema for persisted metadata about a single run invocation. */
const runManifestSchema = z.object({
	id: z.string(),
	/**
	 * Short, human-readable run id (e.g. `r0`, `r1`). Monotonic global counter
	 * assigned at creation; oldest run is `r0`. Legacy persisted runs are
	 * migrated to have a `shortId` on load.
	 */
	shortId: z.string(),
	status: z.enum(["pending", "running", "completed", "cancelled", "error"]),
	startedAt: z.string(),
	endedAt: z.string().nullable(),
	/**
	 * Git commit SHA for the workspace when the run started. Older persisted
	 * runs may not include this field; it then parses to `null`.
	 */
	commitSha: z.string().nullable().optional().default(null),
	/**
	 * Eval-file fingerprints captured for this run, keyed by eval id. Older
	 * persisted runs may not include this field; it then parses to `{}`.
	 */
	evalSourceFingerprints: z.record(z.string(), z.string()).optional().default({}),
	target: z.object({
		mode: z.enum(["all", "evalIds", "caseIds"]),
		evalIds: z.array(z.string()).optional(),
		caseIds: z.array(z.string()).optional()
	}),
	/** Number of trial attempts executed for each case in this run. */
	trials: z.number(),
	/**
	 * Strategy used to collapse repeated trials into the single persisted case
	 * result for this run. Older persisted runs may not include this field; it
	 * then parses to `lowestScore`.
	 */
	trialSelection: trialSelectionModeSchema.optional().default("lowestScore"),
	/** Cache mode used for this run. Defaults to `use` when absent. */
	cacheMode: cacheModeSchema.optional()
});
|
|
1059
|
+
/** Schema for aggregate metrics computed over a completed or active run. */
const runSummarySchema = z.object({
	runId: z.string(),
	status: z.enum(["pending", "running", "completed", "cancelled", "error"]),
	totalCases: z.number(),
	passedCases: z.number(),
	failedCases: z.number(),
	errorCases: z.number(),
	cancelledCases: z.number(),
	totalDurationMs: z.number().nullable(),
	/** Parses to `null` when the field is absent. */
	errorMessage: z.string().nullable().default(null)
});
|
|
1077
|
+
//#endregion
|
|
1078
|
+
//#region ../shared/src/status.ts
|
|
1079
|
+
/**
 * Return `lifecycleStatus` unchanged when it is `pending`, `running`,
 * `cancelled`, or `error`; return `null` for every other value.
 */
function deriveLifecycleStatus(lifecycleStatus) {
	switch (lifecycleStatus) {
		case "pending":
		case "running":
		case "cancelled":
		case "error":
			return lifecycleStatus;
		default:
			return null;
	}
}
|
|
1083
|
+
/**
 * Derive one aggregate status from a collection of child statuses.
 *
 * When `params.lifecycleStatus` is `pending`, `running`, `cancelled`, or
 * `error` it is returned as-is. Otherwise the children decide, in this order
 * of precedence: `running`, `error`, `fail`, `cancelled`. If none of those is
 * present the result is `pass` only when every non-nullish child is `pass`
 * and there is at least one; anything else is `pending`.
 */
function deriveStatusFromChildStatuses(params) {
	const override = deriveLifecycleStatus(params.lifecycleStatus);
	if (override !== null) return override;
	const seen = /* @__PURE__ */ new Set();
	for (const status of params.statuses) {
		// Nullish children carry no information and are ignored.
		if (status === void 0 || status === null) continue;
		seen.add(status);
	}
	for (const dominant of ["running", "error", "fail", "cancelled"]) {
		if (seen.has(dominant)) return dominant;
	}
	// Past this point any member other than "pass" counts as pending.
	const onlyPasses = seen.size === 1 && seen.has("pass");
	return onlyPasses ? "pass" : "pending";
}
|
|
1113
|
+
/**
 * Derive an aggregate status from a scoped set of case rows by reading each
 * row's `status` and delegating to `deriveStatusFromChildStatuses`.
 *
 * Pass `lifecycleStatus` only when the parent scope's raw run lifecycle should
 * override the derived child result, such as for a whole-run display.
 */
function deriveStatusFromCaseRows(params) {
	const statuses = Array.from(params.caseRows).map((row) => row.status);
	const { lifecycleStatus } = params;
	return deriveStatusFromChildStatuses({ statuses, lifecycleStatus });
}
|
|
1125
|
+
/**
 * Derive per-status counts, the summed latency, and the display status for a
 * scoped set of case rows.
 *
 * Rows whose status is not `pass`, `fail`, `error`, `cancelled`, or `running`
 * are counted as pending. `totalDurationMs` is `null` when no row has a
 * non-null `latencyMs`.
 */
function deriveScopedSummaryFromCases(params) {
	const caseRows = [...params.caseRows];
	const tally = /* @__PURE__ */ new Map([
		["pass", 0],
		["fail", 0],
		["error", 0],
		["cancelled", 0],
		["running", 0],
		["pending", 0]
	]);
	let durationSum = null;
	for (const row of caseRows) {
		const bucket = tally.has(row.status) ? row.status : "pending";
		tally.set(bucket, tally.get(bucket) + 1);
		if (row.latencyMs !== null) durationSum = (durationSum ?? 0) + row.latencyMs;
	}
	const status = deriveStatusFromCaseRows({
		caseRows,
		lifecycleStatus: params.lifecycleStatus
	});
	return {
		status,
		totalCases: caseRows.length,
		passedCases: tally.get("pass"),
		failedCases: tally.get("fail"),
		errorCases: tally.get("error"),
		cancelledCases: tally.get("cancelled"),
		pendingCases: tally.get("pending"),
		runningCases: tally.get("running"),
		totalDurationMs: durationSum
	};
}
|
|
1166
|
+
//#endregion
|
|
1167
|
+
//#region ../shared/src/evalStatus.ts
|
|
1168
|
+
/**
 * Derive the user-facing eval status from the raw latest run result plus
 * freshness state.
 *
 * `running` wins whenever `isRunning` is truthy or the last run is running.
 * A passing last run is downgraded to `stale` (checked first) or `outdated`.
 * Any other last-run status is returned unchanged, with `pending` standing in
 * for a nullish one.
 */
function getEvalDisplayStatus(params) {
	const { lastRunStatus } = params;
	if (params.isRunning || lastRunStatus === "running") return "running";
	if (lastRunStatus !== "pass") return lastRunStatus ?? "pending";
	if (params.stale) return "stale";
	return params.outdated ? "outdated" : lastRunStatus;
}
|
|
1181
|
+
//#endregion
|
|
1182
|
+
//#region ../shared/src/evalTitle.ts
|
|
1183
|
+
/**
 * Turn an eval id into a display label: split camelCase boundaries, treat
 * runs of `-`, `_`, and whitespace as single spaces, and upper-case the first
 * character of each word. The id is returned untouched when nothing but
 * separators remains.
 */
function humanizeEvalId(id) {
	const spaced = id
		.replace(/([a-z0-9])([A-Z])/g, "$1 $2")
		.replace(/[-_\s]+/g, " ")
		.trim();
	if (spaced.length === 0) return id;
	const words = [];
	for (const word of spaced.split(" ")) {
		words.push(`${word.slice(0, 1).toUpperCase()}${word.slice(1)}`);
	}
	return words.join(" ");
}
|
|
1192
|
+
/**
 * Resolve the display title for an eval.
 *
 * The authored `title` is used whenever it is not `undefined`; otherwise a
 * human-readable label is derived from the eval `id`.
 */
function getEvalTitle(evalLike) {
	const { title } = evalLike;
	return title === void 0 ? humanizeEvalId(evalLike.id) : title;
}
|
|
1203
|
+
// Enum of event type names (presumably the SSE event types; confirm against
// the original source). The result is not bound to a name, so this statement
// only constructs the schema and discards it.
z.enum([
	"discovery.updated",
	"run.started",
	"run.summary",
	"case.started",
	"case.updated",
	"case.finished",
	"trace.span",
	"run.finished",
	"run.cancelled",
	"run.error"
]);
|
|
1215
|
+
/** Schema for the SSE envelope used to stream run updates to clients. */
const sseEnvelopeSchema = z.object({
	/** Event type; any string is accepted. */
	type: z.string(),
	runId: z.string().optional(),
	timestamp: z.string(),
	payload: z.unknown()
});
|
|
1222
|
+
//#endregion
|
|
1223
|
+
//#region ../shared/src/schemas/api.ts
|
|
1224
|
+
/** Schema for the API request that starts a new eval run. */
const createRunRequestSchema = z.object({
	target: z.object({
		mode: z.enum(["all", "evalIds", "caseIds"]),
		evalIds: z.array(z.string()).optional(),
		caseIds: z.array(z.string()).optional()
	}),
	/**
	 * Trial attempts per case. Must be a whole number of at least 1; a
	 * fractional count such as `1.5` is rejected rather than passed on.
	 */
	trials: z.number().int().min(1),
	/**
	 * Optional cache controls for the run. When omitted, the cache is used in
	 * its default read-through / write-on-miss mode.
	 */
	cache: z.object({ mode: cacheModeSchema.default("use") }).optional()
});
|
|
1242
|
+
/**
 * Schema for updating a UI-authored manual score on one persisted case.
 * `value` is either `null` or a number between 0 and 1 inclusive.
 */
const updateManualScoreRequestSchema = z.object({
	value: z.number().min(0).max(1).nullable()
});
|
|
1244
|
+
//#endregion
|
|
1245
|
+
//#region ../runner/src/cacheStore.ts
|
|
1246
|
+
/**
 * Create a filesystem-backed cache adapter rooted at `<workspaceRoot>/<dir>`
 * (`dir` defaults to `.agent-evals/cache`).
 *
 * Entries are stored as `<cacheDir>/<namespace dir>/<key>.json`. Writes go to
 * a uniquely named `*.tmp` sibling and are moved into place with `rename` to
 * avoid partial reads under concurrent access.
 *
 * @param options - `workspaceRoot` plus an optional `dir` override.
 * @returns An adapter exposing `dir`, `lookup`, `write`, `list`, and `clear`.
 */
function createFsCacheStore(options) {
	const cacheDir = resolve(options.workspaceRoot, options.dir ?? ".agent-evals/cache");
	// Uniqueness only (not security): keeps tmp names apart between store
	// instances and between concurrent writes of the same key in one process.
	const instanceToken = Math.random().toString(36).slice(2, 10);
	let tmpSequence = 0;
	/** Run `operation`, mapping "path no longer exists" errors to `null`. */
	async function nullIfMissing(operation) {
		try {
			return await operation();
		} catch (error) {
			if (error?.code === "ENOENT" || error?.code === "ENOTDIR") return null;
			throw error;
		}
	}
	/** Read and validate one entry file; `null` when absent or malformed. */
	async function readEntry(filePath) {
		const text = await nullIfMissing(() => readFile(filePath, "utf-8"));
		if (text === null) return null;
		const json = safeJsonParse(text);
		if (json === null) return null;
		const parsed = cacheEntrySchema.safeParse(json);
		return parsed.success ? parsed.data : null;
	}
	return {
		/** Absolute path of the cache root. */
		dir() {
			return cacheDir;
		},
		/** Return the stored entry, or `null` when missing or invalid. */
		async lookup(namespace, keyHash) {
			// Reading directly (instead of existsSync + read) means an entry
			// removed by a concurrent `clear` is a miss, not an exception.
			return readEntry(entryPath(cacheDir, namespace, keyHash));
		},
		/** Persist `entry` under its namespace and key. */
		async write(entry) {
			const filePath = entryPath(cacheDir, entry.namespace, entry.key);
			await mkdir(dirname(filePath), { recursive: true });
			tmpSequence += 1;
			const tmpPath = `${filePath}.${process.pid.toString()}.${instanceToken}.${tmpSequence.toString()}.tmp`;
			try {
				await writeFile(tmpPath, JSON.stringify(entry));
				await rename(tmpPath, filePath);
			} catch (error) {
				// Best-effort cleanup; the original failure is what callers need.
				await rm(tmpPath, { force: true }).catch(() => {});
				throw error;
			}
		},
		/** Summaries of every valid entry, newest `storedAt` first. */
		async list() {
			if (!existsSync(cacheDir)) return [];
			const namespaces = await nullIfMissing(() => readdir(cacheDir));
			if (namespaces === null) return [];
			const items = [];
			for (const namespace of namespaces) {
				const nsPath = join(cacheDir, namespace);
				const nsStat = await nullIfMissing(() => stat(nsPath));
				if (nsStat === null || !nsStat.isDirectory()) continue;
				const files = await nullIfMissing(() => readdir(nsPath));
				if (files === null) continue;
				for (const fileName of files) {
					if (!fileName.endsWith(".json")) continue;
					const filePath = join(nsPath, fileName);
					const entry = await readEntry(filePath);
					if (entry === null) continue;
					const fileStat = await nullIfMissing(() => stat(filePath));
					if (fileStat === null) continue;
					items.push({
						key: entry.key,
						namespace: entry.namespace,
						spanName: entry.spanName,
						spanKind: entry.spanKind,
						storedAt: entry.storedAt,
						codeFingerprint: entry.codeFingerprint,
						sizeBytes: fileStat.size
					});
				}
			}
			// Three-way comparison so equal timestamps compare as equal.
			items.sort((a, b) => {
				if (a.storedAt < b.storedAt) return 1;
				if (a.storedAt > b.storedAt) return -1;
				return 0;
			});
			return items;
		},
		/**
		 * Remove entries. No filter (or an empty one) wipes the whole cache;
		 * `namespace` alone removes that namespace; `namespace` + `key` removes
		 * one entry; `key` alone removes that key from every namespace.
		 */
		async clear(filter) {
			if (!existsSync(cacheDir)) return;
			const namespace = filter?.namespace;
			const key = filter?.key;
			if (!filter || namespace === void 0 && key === void 0) {
				await rm(cacheDir, {
					recursive: true,
					force: true
				});
				return;
			}
			if (namespace !== void 0 && key === void 0) {
				// Use the same sanitized directory name `write` stores under.
				const nsDir = join(cacheDir, sanitizeSegment$1(namespace));
				// Only ever delete a direct child of the cache root: "", "." and
				// ".." would otherwise resolve to the root itself or its parent.
				if (dirname(nsDir) !== cacheDir) return;
				await rm(nsDir, {
					recursive: true,
					force: true
				});
				return;
			}
			if (namespace !== void 0) {
				await rm(entryPath(cacheDir, namespace, key), { force: true });
				return;
			}
			const namespaces = await nullIfMissing(() => readdir(cacheDir));
			if (namespaces === null) return;
			for (const dirName of namespaces) {
				const filePath = entryPath(cacheDir, dirName, key);
				if (existsSync(filePath)) await rm(filePath, { force: true });
			}
		}
	};
}
|
|
1332
|
+
/**
 * Create a write-buffered cache adapter for one trial attempt.
 *
 * `lookup` answers from entries written earlier through this adapter and
 * otherwise defers to `backingStore`. `write` only records the entry in
 * memory. `commit()` forwards every buffered entry to `backingStore.write`,
 * one at a time; call it after selecting the winning trial so only that
 * trial's writes reach the shared cache.
 */
function createBufferedCacheStore(backingStore) {
	const buffer = /* @__PURE__ */ new Map();
	return {
		async lookup(namespace, keyHash) {
			const hit = buffer.get(toPendingKey(namespace, keyHash));
			return hit === void 0 ? backingStore.lookup(namespace, keyHash) : hit;
		},
		write(entry) {
			const pendingKey = toPendingKey(entry.namespace, entry.key);
			buffer.set(pendingKey, entry);
			return Promise.resolve();
		},
		async commit() {
			for (const [, entry] of buffer) await backingStore.write(entry);
		},
		getPendingEntries() {
			return Array.from(buffer.values());
		}
	};
}
|
|
1359
|
+
/**
 * Build the on-disk path of one cache entry:
 * `<cacheDir>/<sanitized namespace>/<sanitized key>.json`.
 *
 * Both segments are sanitized so that neither a namespace nor a key that
 * contains a path separator can address a file outside its namespace
 * directory. Keys made only of letters, digits, `_`, `.` and `-` are
 * unaffected.
 */
function entryPath(cacheDir, namespace, keyHash) {
	return join(cacheDir, sanitizeSegment$1(namespace), `${sanitizeSegment$1(keyHash)}.json`);
}
|
|
1362
|
+
/** Join a namespace and key hash into one string of the form `namespace::keyHash`. */
function toPendingKey(namespace, keyHash) {
	return "".concat(namespace, "::", keyHash);
}
|
|
1365
|
+
/**
 * Make `segment` safe to use as a single path component.
 *
 * Every character other than letters, digits, `_`, `.` and `-` becomes `_`.
 * Results that would still be special to the filesystem are rewritten too:
 * an empty segment becomes `_`, and a segment made only of dots (`.`, `..`,
 * ...) has each dot replaced by `_`, so the result can never resolve to the
 * containing directory or its parent.
 */
function sanitizeSegment$1(segment) {
	const cleaned = segment.replace(/[^a-zA-Z0-9_.-]/g, "_");
	if (cleaned.length === 0) return "_";
	if (/^\.+$/.test(cleaned)) return "_".repeat(cleaned.length);
	return cleaned;
}
|
|
1368
|
+
/**
 * Parse `text` as JSON via `resultify`, returning `null` instead of throwing
 * when parsing fails. JSON text that is literally `null` also yields `null`.
 */
function safeJsonParse(text) {
	const outcome = resultify(() => JSON.parse(text));
	return outcome.error ? null : outcome.value;
}
|
|
1373
|
+
//#endregion
|
|
1374
|
+
//#region ../runner/src/chartValidation.ts
|
|
1375
|
+
/**
 * Check one column-backed chart metric against the declared columns.
 *
 * The metric is rejected, with an explanatory message appended to `warnings`,
 * when its `key` is not in `columnsByKey`, or when it uses the
 * `passThresholdRate` aggregate on a column that is not a score with a
 * numeric `passThreshold`.
 *
 * @returns `true` when the metric can be kept, `false` when it was dropped.
 */
function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
	const column = columnsByKey.get(metric.key);
	if (!column) {
		warnings.push(`[${evalId}] chart metric references unknown column "${metric.key}" — dropped`);
		return false;
	}
	const needsThreshold = metric.aggregate === "passThresholdRate";
	const hasThreshold = column.isScore === true && typeof column.passThreshold === "number";
	if (needsThreshold && !hasThreshold) {
		warnings.push(`[${evalId}] chart metric "${metric.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
		return false;
	}
	return true;
}
|
|
1389
|
+
/**
 * Check one column-backed tooltip extra against the declared columns.
 *
 * The extra is rejected, with an explanatory message appended to `warnings`,
 * when its `key` is not in `columnsByKey`, or when it uses the
 * `passThresholdRate` aggregate on a column that is not a score with a
 * numeric `passThreshold`.
 *
 * @returns `true` when the extra can be kept, `false` when it was dropped.
 */
function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
	const column = columnsByKey.get(extra.key);
	if (!column) {
		warnings.push(`[${evalId}] chart tooltip extra references unknown column "${extra.key}" — dropped`);
		return false;
	}
	const needsThreshold = extra.aggregate === "passThresholdRate";
	const hasThreshold = column.isScore === true && typeof column.passThreshold === "number";
	if (needsThreshold && !hasThreshold) {
		warnings.push(`[${evalId}] chart tooltip extra "${extra.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
		return false;
	}
	return true;
}
|
|
1403
|
+
/**
 * Filter one chart's metrics and tooltip extras down to the valid ones.
 *
 * Items whose `source` is `builtin` are always kept; the rest are checked
 * against `columnsByKey`. Returns `null` (and records a warning) when no
 * metric survives; otherwise returns a copy of the chart with the filtered
 * `metrics` and with `tooltipExtras` set to `undefined` when none remain.
 */
function sanitizeChart(chart, columnsByKey, evalId, warnings) {
	const keptMetrics = chart.metrics.filter((metric) => metric.source === "builtin" || isValidColumnMetric(metric, columnsByKey, evalId, warnings));
	if (keptMetrics.length === 0) {
		warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
		return null;
	}
	const keptExtras = chart.tooltipExtras?.filter((extra) => extra.source === "builtin" || isValidTooltipExtra(extra, columnsByKey, evalId, warnings));
	const hasExtras = keptExtras !== void 0 && keptExtras.length > 0;
	return {
		...chart,
		metrics: keptMetrics,
		tooltipExtras: hasExtras ? keptExtras : void 0
	};
}
|
|
1422
|
+
/**
 * Validate and sanitize an authored `charts` config against the eval's
 * declared columns.
 *
 * Metrics and tooltip extras that reference unknown columns or misuse
 * `passThresholdRate` are dropped, as are charts left without any metric.
 * The result's `charts` is `undefined` when the input is missing or empty or
 * when nothing valid remains, so the UI falls back to rendering no chart
 * (matching the opt-in default). `warnings` lists every drop that happened.
 */
function validateCharts(params) {
	const { charts, columnDefs, evalId } = params;
	const warnings = [];
	if (!charts || charts.length === 0) return {
		charts: void 0,
		warnings
	};
	const columnsByKey = /* @__PURE__ */ new Map();
	for (const def of columnDefs) columnsByKey.set(def.key, def);
	const kept = Array.from(charts, (chart) => sanitizeChart(chart, columnsByKey, evalId, warnings)).filter((result) => Boolean(result));
	return {
		charts: kept.length === 0 ? void 0 : kept,
		warnings
	};
}
|
|
1447
|
+
//#endregion
|
|
1448
|
+
//#region ../runner/src/columnBuilder.ts
|
|
1449
|
+
/**
 * Normalize a user-provided score definition to the internal
 * `{ compute, passThreshold, label }` shape.
 *
 * A bare function becomes `compute` with the other two fields `undefined`;
 * an object literal contributes its own `compute`, `passThreshold`, and
 * `label`.
 */
function normalizeScoreDef(def) {
	const isBareFunction = typeof def === "function";
	return {
		compute: isBareFunction ? def : def.compute,
		passThreshold: isBareFunction ? void 0 : def.passThreshold,
		label: isBareFunction ? void 0 : def.label
	};
}
|
|
1466
|
+
/**
 * Extract the column display overrides carried by a score definition.
 *
 * Returns `undefined` for a missing definition or a bare function; for an
 * object definition returns its `label`, `format`, `numberFormat`,
 * `hideInTable`, `sortable`, `align`, and `maxStars` fields.
 */
function getScoreOverride(def) {
	if (def === void 0 || typeof def === "function") return void 0;
	const { label, format, numberFormat, hideInTable, sortable, align, maxStars } = def;
	return {
		label,
		format,
		numberFormat,
		hideInTable,
		sortable,
		align,
		maxStars
	};
}
|
|
1478
|
+
/**
 * Combine two column override objects field by field.
 *
 * When either side is `undefined` the other is returned as-is. Otherwise a
 * new object is built in which each field takes `override`'s value unless it
 * is nullish, in which case `base`'s value is used.
 */
function mergeOverrides(base, override) {
	if (base === void 0) return override;
	if (override === void 0) return base;
	const merged = {};
	for (const field of [
		"label",
		"format",
		"numberFormat",
		"hideInTable",
		"sortable",
		"align",
		"maxStars"
	]) merged[field] = override[field] ?? base[field];
	return merged;
}
|
|
1491
|
+
/**
 * Populate `target` with a column definition for every key of `columns` that
 * `target` does not already contain.
 *
 * For each new key the display override is the score definition's override
 * (or, failing that, the manual score definition) merged with the entry from
 * `overrides`. Keys declared in `scores` or `manualScores` are flagged as
 * scores and treated as numeric; other keys have their kind inferred from the
 * column value.
 */
function mergeColumnDefs(target, columns, overrides, scores, manualScores) {
	const scoreKeys = new Set(Object.keys(scores ?? {}));
	const manualScoreKeys = new Set(Object.keys(manualScores ?? {}));
	const overrideMap = overrides ?? {};
	for (const [key, value] of Object.entries(columns)) {
		if (target.has(key)) continue;
		const scoreDef = scores?.[key];
		const manualScoreDef = manualScores?.[key];
		const override = mergeOverrides(getScoreOverride(scoreDef) ?? manualScoreDef, overrideMap[key]);
		const isManualScore = manualScoreKeys.has(key);
		const isScore = scoreKeys.has(key) || isManualScore;
		const columnDef = createColumnDef({
			key,
			override,
			scoreDef,
			manualScoreDef,
			inferredKind: isScore ? "number" : inferKind(value),
			isScore,
			isManualScore
		});
		target.set(key, columnDef);
	}
}
|
|
1515
|
+
/**
 * Build column definitions from what an eval declares statically, without any
 * runtime output values: first every key in `overrides`, then any remaining
 * keys in `scores`, then any remaining keys in `manualScores`.
 *
 * Returns the definitions as an array in that insertion order.
 */
function buildDeclaredColumnDefs(overrides, scores, manualScores) {
	const defsByKey = /* @__PURE__ */ new Map();

	// 1. Explicit column overrides; these may also be score columns.
	for (const [key, columnOverride] of Object.entries(overrides ?? {})) {
		const scoreDef = scores?.[key];
		const manualScoreDef = manualScores?.[key];
		const isManualScore = manualScoreDef !== undefined;
		const override = mergeOverrides(getScoreOverride(scoreDef) ?? manualScoreDef, columnOverride);
		// Without a runtime value the kind can only come from the declared format.
		const kindFromFormat = inferKindFromFormat(override?.format);
		const kindFromNumberFormat = override?.numberFormat === undefined ? undefined : "number";
		defsByKey.set(key, createColumnDef({
			key,
			override,
			scoreDef,
			manualScoreDef,
			inferredKind: kindFromFormat ?? kindFromNumberFormat,
			isScore: scoreDef !== undefined || isManualScore,
			isManualScore,
		}));
	}

	// 2. Computed scores that had no explicit column override.
	for (const [key, scoreDef] of Object.entries(scores ?? {})) {
		if (defsByKey.has(key)) continue;
		defsByKey.set(key, createColumnDef({
			key,
			override: getScoreOverride(scoreDef),
			scoreDef,
			inferredKind: "number",
			isScore: true,
			isManualScore: false,
		}));
	}

	// 3. Manual scores not covered by either of the passes above.
	for (const [key, manualScoreDef] of Object.entries(manualScores ?? {})) {
		if (defsByKey.has(key)) continue;
		defsByKey.set(key, createColumnDef({
			key,
			override: manualScoreDef,
			manualScoreDef,
			inferredKind: "number",
			isScore: true,
			isManualScore: true,
		}));
	}

	return Array.from(defsByKey.values());
}
|
|
1559
|
+
/**
 * Infer a `ColumnKind` from a runtime value when no override is set:
 * numbers and booleans map to their own kind, everything else to `"string"`.
 */
function inferKind(value) {
	const runtimeType = typeof value;
	return runtimeType === "number" || runtimeType === "boolean" ? runtimeType : "string";
}
|
|
1565
|
+
/**
 * Coerce an arbitrary runtime value into a serializable `CellValue`.
 *
 * - `null`, `undefined`, strings, numbers and booleans are returned unchanged.
 * - For media/file formats the value is returned as parsed by `fileRefSchema`
 *   when it validates; for the `json` format, as parsed by `jsonCellSchema`.
 * - Anything else falls back to `JSON.stringify`. Values that
 *   `JSON.stringify` cannot serialize (BigInt, circular structures) are turned
 *   into a plain string instead of throwing, so one odd output cannot abort
 *   processing of the whole case.
 *
 * @param value Runtime output value.
 * @param override Optional column override; only its `format` is consulted.
 * @returns The cell value; `undefined` when nothing serializable results
 *   (e.g. `undefined` input or a function).
 */
function toCellValue(value, override = void 0) {
	if (value === null) return null;
	if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") return value;
	if (value === void 0) return void 0;
	if (override?.format === "image" || override?.format === "audio" || override?.format === "video" || override?.format === "file") {
		const parsed = fileRefSchema.safeParse(value);
		if (parsed.success) return parsed.data;
	}
	if (override?.format === "json") {
		const parsed = jsonCellSchema.safeParse(value);
		if (parsed.success) return parsed.data;
	}
	try {
		return JSON.stringify(value);
	} catch {
		// JSON.stringify throws a TypeError for BigInt values and circular
		// references. Object.prototype.toString is used for objects because
		// String() itself throws on null-prototype objects.
		return typeof value === "bigint" ? value.toString() : Object.prototype.toString.call(value);
	}
}
|
|
1583
|
+
/**
 * Map a declared column format to a column kind. Returns `undefined` when no
 * format is declared, `"boolean"` for the boolean format, `"number"` for the
 * numeric formats, and `"string"` for every other format.
 */
function inferKindFromFormat(format) {
	switch (format) {
		case undefined:
			return undefined;
		case "boolean":
			return "boolean";
		case "duration":
		case "percent":
		case "number":
		case "passFail":
		case "stars":
			return "number";
		default:
			return "string";
	}
}
|
|
1589
|
+
/**
 * Assemble one column definition from its key, merged override and score
 * metadata.
 *
 * The kind defaults to `"number"` for score columns and `"string"` otherwise
 * when `inferredKind` is not given. Optional presentation fields are copied
 * only when defined. Score columns additionally carry `isScore`, and either
 * the manual-score threshold or the computed-score threshold/label.
 */
function createColumnDef(params) {
	const { key, override, scoreDef, manualScoreDef, inferredKind, isScore, isManualScore } = params;
	const def = {
		key,
		label: override?.label ?? key,
		kind: inferredKind ?? (isScore ? "number" : "string"),
	};
	// Copy only the presentation fields that were actually provided.
	for (const field of ["format", "numberFormat", "maxStars", "hideInTable", "sortable", "align"]) {
		const fieldValue = override?.[field];
		if (fieldValue !== undefined) def[field] = fieldValue;
	}
	if (!isScore) return def;

	def.isScore = true;
	if (isManualScore) {
		def.isManualScore = true;
		const manualThreshold = manualScoreDef?.passThreshold;
		if (manualThreshold !== undefined) def.passThreshold = manualThreshold;
		return def;
	}

	// Bare compute functions carry no threshold or label.
	const hasObjectScoreDef = typeof scoreDef !== "function" && scoreDef !== undefined;
	if (!hasObjectScoreDef) return def;
	if (scoreDef.passThreshold !== undefined) def.passThreshold = scoreDef.passThreshold;
	// The score's own label applies only when the override did not set one.
	if (scoreDef.label !== undefined && override?.label === undefined) def.label = scoreDef.label;
	return def;
}
|
|
1615
|
+
//#endregion
|
|
1616
|
+
//#region ../runner/src/config.ts
|
|
1617
|
+
/**
 * Shape accepted from the imported config module: the config may be provided
 * as the default export or as a named `config` export, both optional.
 */
const configModuleSchema = z.object({
	default: agentEvalsConfigSchema.optional(),
	config: agentEvalsConfigSchema.optional(),
});

/** Configuration used when no user config exists or it fails to load. */
const defaultConfig = {
	include: ["**/*.eval.ts"],
	defaultTrials: 1,
	trialSelection: "lowestScore",
	concurrency: 2,
	staleAfterDays: 14,
	traceDisplay: {
		// `input` and `output` span attributes are both shown as JSON sections.
		attributes: [["input", "Input"], ["output", "Output"]].map(([path, label]) => ({
			path,
			label,
			format: "json",
			placements: ["section"],
		})),
	},
};
|
|
1639
|
+
/**
 * Load `agent-evals.config.ts` from the current working directory.
 *
 * Returns `defaultConfig` when the file does not exist, exports no config, or
 * fails to import/validate (the failure is logged to stderr). Otherwise the
 * user config is shallow-merged over `defaultConfig`.
 */
async function loadConfig() {
	const configPath = resolve(process.cwd(), "agent-evals.config.ts");
	const hasConfigFile = existsSync(configPath);
	if (!hasConfigFile) return defaultConfig;
	try {
		const configUrl = pathToFileURL(configPath).href;
		const { default: defaultExport, config: namedExport } = configModuleSchema.parse(await import(configUrl));
		// The default export takes precedence over the named `config` export.
		const userConfig = defaultExport ?? namedExport;
		return userConfig ? { ...defaultConfig, ...userConfig } : defaultConfig;
	} catch (error) {
		console.error("Failed to load agent-evals.config.ts:", error);
		return defaultConfig;
	}
}
|
|
1656
|
+
//#endregion
|
|
1657
|
+
//#region ../runner/src/discovery.ts
|
|
1658
|
+
/** Captures the first quoted `id: "..."` value in an object literal's text. */
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
/** Captures the first quoted `title: "..."` value in an object literal's text. */
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;

/**
 * Scan source text for `defineEval` occurrences and collect `{ filePath, id,
 * title? }` for each extracted object literal that contains a quoted id.
 *
 * This is a textual scan, not a parse: the id/title are whatever the regexes
 * match first inside the extracted object text.
 */
function parseEvalMetas(filePath, content) {
	const marker = "defineEval";
	const metas = [];
	let cursor = 0;
	while (cursor < content.length) {
		const markerIndex = content.indexOf(marker, cursor);
		if (markerIndex === -1) break;
		const extracted = extractDefineEvalObject(content, markerIndex);
		if (!extracted) {
			// No object literal found for this occurrence; resume just past it.
			cursor = markerIndex + marker.length;
			continue;
		}
		cursor = extracted.nextIndex;
		const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
		if (id === undefined) continue;
		const meta = { filePath, id };
		const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
		if (title !== undefined) meta.title = title;
		metas.push(meta);
	}
	return metas;
}
|
|
1685
|
+
/**
 * Starting at a `defineEval` occurrence, find the first `(` after it, then the
 * first `{` after that, and return the text of the brace-balanced object
 * beginning there.
 *
 * Braces inside line comments, block comments and quoted strings (single,
 * double or backtick) are ignored; backslash escapes are honoured inside
 * strings. Returns `{ nextIndex, objectText }`, or `undefined` when no opening
 * paren/brace exists or the braces never balance.
 */
function extractDefineEvalObject(content, defineEvalIndex) {
	const parenIndex = content.indexOf("(", defineEvalIndex);
	if (parenIndex === -1) return undefined;
	const braceIndex = content.indexOf("{", parenIndex);
	if (braceIndex === -1) return undefined;

	const CODE = 0;
	const LINE_COMMENT = 1;
	const BLOCK_COMMENT = 2;
	const STRING = 3;
	let mode = CODE;
	let closingQuote;
	let escaped = false;
	let depth = 0;

	for (let i = braceIndex; i < content.length; i += 1) {
		const ch = content[i];
		const pair = ch + (content[i + 1] ?? "");
		switch (mode) {
			case LINE_COMMENT:
				if (ch === "\n") mode = CODE;
				break;
			case BLOCK_COMMENT:
				if (pair === "*/") {
					mode = CODE;
					i += 1; // consume the "/" as well
				}
				break;
			case STRING:
				if (escaped) escaped = false;
				else if (ch === "\\") escaped = true;
				else if (ch === closingQuote) mode = CODE;
				break;
			default:
				if (pair === "//") {
					mode = LINE_COMMENT;
					i += 1;
				} else if (pair === "/*") {
					mode = BLOCK_COMMENT;
					i += 1;
				} else if (ch === "\"" || ch === "'" || ch === "`") {
					mode = STRING;
					closingQuote = ch;
				} else if (ch === "{") {
					depth += 1;
				} else if (ch === "}") {
					depth -= 1;
					if (depth === 0) {
						return {
							nextIndex: i + 1,
							objectText: content.slice(braceIndex, i + 1),
						};
					}
				}
		}
	}
	return undefined;
}
|
|
1748
|
+
//#endregion
|
|
1749
|
+
//#region ../runner/src/evalModuleLoader.ts
|
|
1750
|
+
/**
 * Dynamically import one eval module by file path. When `sourceFingerprint`
 * is provided it is appended as the `v` query parameter, so a changed
 * fingerprint yields a distinct module URL and the module is evaluated again
 * rather than served from the import cache.
 */
async function loadEvalModule(filePath, sourceFingerprint = void 0) {
	const moduleUrl = pathToFileURL(filePath);
	const hasFingerprint = sourceFingerprint !== undefined;
	if (hasFingerprint) moduleUrl.searchParams.set("v", sourceFingerprint);
	await import(moduleUrl.href);
}
|
|
1759
|
+
//#endregion
|
|
1760
|
+
//#region ../runner/src/freshness.ts
|
|
1761
|
+
/**
 * Derive eval freshness from the latest run, the current eval-file
 * fingerprint, the current git commit, and an age threshold.
 *
 * - `stale`: the latest run recorded a fingerprint and it differs from the
 *   current (non-null) fingerprint.
 * - `outdated`: the latest run was made on a different commit than the current
 *   one and started at least `staleAfterDays` days before `now`. It is never
 *   set when either commit is unknown or the start time cannot be parsed.
 * - `freshnessStatus`: `"stale"` wins over `"outdated"`, else `"fresh"`.
 */
function deriveEvalFreshness(params) {
	const { latestRun, gitState, currentEvalSourceFingerprint, staleAfterDays, now = /* @__PURE__ */ new Date() } = params;
	const recordedFingerprint = latestRun?.evalSourceFingerprint;
	const stale = recordedFingerprint !== undefined
		&& recordedFingerprint !== null
		&& currentEvalSourceFingerprint !== null
		&& currentEvalSourceFingerprint !== recordedFingerprint;
	const describe = (outdated) => ({
		freshnessStatus: stale ? "stale" : outdated ? "outdated" : "fresh",
		stale,
		outdated,
	});

	// Age only matters when both commits are known and differ.
	const recordedCommit = latestRun?.commitSha;
	if (recordedCommit === undefined || recordedCommit === null) return describe(false);
	const currentCommit = gitState.commitSha;
	if (currentCommit === null || recordedCommit === currentCommit) return describe(false);

	const startedAtMs = new Date(latestRun?.startedAt ?? "").getTime();
	if (!Number.isFinite(startedAtMs)) return describe(false);

	const maxAgeMs = staleAfterDays * 24 * 60 * 60 * 1e3;
	return describe(now.getTime() - startedAtMs >= maxAgeMs);
}
|
|
1797
|
+
/**
 * Return the timestamp used when ordering and displaying a run's recency:
 * the manifest's `endedAt` when set, otherwise its `startedAt`.
 */
function getRunFreshnessTimestamp(manifest) {
	const { endedAt, startedAt } = manifest;
	return endedAt ?? startedAt;
}
|
|
1801
|
+
//#endregion
|
|
1802
|
+
//#region ../runner/src/evalSummaries.ts
|
|
1803
|
+
/**
 * Build the API/UI summary payload for one discovered eval: the eval's meta
 * (minus `sourceFingerprint`) plus freshness flags, latest-run info and the
 * current commit.
 */
function buildEvalSummary(params) {
	const { meta, config, gitState, latestRun, lastRunStatus } = params;
	// The fingerprint feeds the freshness check but is not part of the payload.
	const { sourceFingerprint, ...summaryMeta } = meta;
	const { stale, outdated, freshnessStatus } = deriveEvalFreshness({
		latestRun,
		gitState,
		currentEvalSourceFingerprint: sourceFingerprint,
		staleAfterDays: config.staleAfterDays ?? 14,
	});
	return {
		...summaryMeta,
		stale,
		outdated,
		freshnessStatus,
		latestRunAt: latestRun?.startedAt ?? null,
		latestRunCommitSha: latestRun?.commitSha ?? null,
		currentCommitSha: gitState.commitSha,
		lastRunStatus,
	};
}
|
|
1824
|
+
/**
 * Resolve which eval ids a run request should mark as the latest run: the
 * requested ids that are known when the request names any, otherwise all of
 * `sortedEvalIds`.
 */
function getTargetEvalIds(params) {
	const { request, sortedEvalIds, knownEvalIds } = params;
	const requestedIds = request.target.evalIds;
	const hasExplicitTargets = Boolean(requestedIds) && requestedIds.length > 0;
	if (!hasExplicitTargets) return sortedEvalIds;
	return requestedIds.filter((evalId) => knownEvalIds.has(evalId));
}
|
|
1830
|
+
/**
 * Store the same latest-run snapshot under every targeted eval id,
 * overwriting any existing entry for that id.
 */
function setLatestRunInfoMap(params) {
	const { latestRunInfoMap: infoByEvalId, evalIds: targetIds, info: snapshot } = params;
	for (const targetId of targetIds) {
		infoByEvalId.set(targetId, snapshot);
	}
}
|
|
1835
|
+
//#endregion
|
|
1836
|
+
//#region ../runner/src/gitState.ts
|
|
1837
|
+
/**
 * Run `git` synchronously with the given arguments in `workspaceRoot`,
 * capturing stdout only (stdin and stderr are ignored).
 *
 * @param workspaceRoot Directory used as the child process's cwd.
 * @param args Arguments passed to `git`.
 * @returns `{ status, stdout }` where `status` is the exit code (`null` when
 *   the process could not be spawned or was killed by a signal) and `stdout`
 *   is the trimmed output, or `""` when no output was captured.
 */
function runGitCommand(workspaceRoot, args) {
	const result = spawnSync("git", args, {
		cwd: workspaceRoot,
		encoding: "utf8",
		stdio: ["ignore", "pipe", "ignore"],
	});
	return {
		status: result.status,
		// spawnSync reports `stdout: null` when the process cannot be spawned
		// (e.g. git is not installed or the cwd does not exist); treat that as
		// empty output instead of throwing on `.trim()`.
		stdout: (result.stdout ?? "").trim(),
	};
}
|
|
1852
|
+
/**
 * Read the current git commit for the workspace, if available. Returns
 * `{ commitSha: null }` when the directory is not inside a git work tree or
 * `HEAD` cannot be resolved.
 */
function readGitWorktreeState(workspaceRoot) {
	const worktreeProbe = runGitCommand(workspaceRoot, ["rev-parse", "--is-inside-work-tree"]);
	const isInsideWorktree = worktreeProbe.status === 0 && worktreeProbe.stdout === "true";
	if (!isInsideWorktree) return { commitSha: null };
	const head = runGitCommand(workspaceRoot, ["rev-parse", "HEAD"]);
	const commitSha = head.status === 0 ? head.stdout : null;
	return { commitSha };
}
|
|
1859
|
+
//#endregion
|
|
1860
|
+
//#region ../runner/src/outputArtifacts.ts
|
|
1861
|
+
/** File extension (including the leading dot) for each recognised MIME type. */
const mimeTypeExtensionMap = {
	// Documents and data
	"application/json": ".json",
	"application/pdf": ".pdf",
	// Audio
	"audio/mpeg": ".mp3",
	"audio/mp4": ".m4a",
	"audio/wav": ".wav",
	// Images
	"image/gif": ".gif",
	"image/jpeg": ".jpg",
	"image/png": ".png",
	"image/svg+xml": ".svg",
	"image/webp": ".webp",
	// Text
	"text/html": ".html",
	"text/markdown": ".md",
	"text/plain": ".txt",
	// Video
	"video/mp4": ".mp4",
	"video/webm": ".webm",
};
|
|
1878
|
+
/**
 * Persist a `Blob`/`File` emitted via `setEvalOutput(...)` into the current
 * run's artifact directory and return the resulting run artifact reference.
 *
 * The artifact id (also the on-disk file name) is built from the sanitized
 * run id, case id, trial number, output key and file name joined by `__`.
 */
async function persistInlineArtifact({ artifactDir, runId, caseId, outputKey, trial, value }) {
	await mkdir(artifactDir, { recursive: true });
	const mimeType = normalizeMimeType(value.type);
	const fileName = getArtifactFileName({ outputKey, mimeType, value });
	const idSegments = [
		sanitizeSegment(runId),
		sanitizeSegment(caseId),
		`t${String(trial)}`,
		sanitizeSegment(outputKey),
		sanitizeFileName(fileName),
	];
	const artifactId = idSegments.join("__");
	const bytes = new Uint8Array(await value.arrayBuffer());
	await writeFile(join(artifactDir, artifactId), bytes);
	return { source: "run", artifactId, mimeType, fileName };
}
|
|
1905
|
+
/**
 * Resolve a persisted run artifact path from its artifact id.
 *
 * The run id is the part of the artifact id before the first `__`; the
 * artifact lives at `<runsDir>/<runId>/artifacts/<artifactId>`.
 *
 * @param runsDir Directory containing one sub-directory per run.
 * @param artifactId Artifact id as produced when the artifact was persisted.
 * @returns The artifact's file path, or `undefined` when the id has no run id
 *   segment or could point outside `runsDir` (contains a path separator or
 *   NUL, or its run segment is `.`/`..`).
 */
function resolveArtifactPath(runsDir, artifactId) {
	// Ids are built from sanitized segments and never contain separators; an id
	// that does is not something this module produced, so refuse to turn it
	// into a path rather than risk traversal outside the runs directory.
	if (/[/\\\0]/.test(artifactId)) return void 0;
	const [runId] = artifactId.split("__", 1);
	if (!runId || runId === "." || runId === "..") return void 0;
	return join(runsDir, runId, "artifacts", artifactId);
}
|
|
1911
|
+
/**
 * Trim a MIME type string, falling back to `application/octet-stream` when
 * nothing remains after trimming.
 */
function normalizeMimeType(value) {
	return value.trim() || "application/octet-stream";
}
|
|
1915
|
+
/**
 * Choose the display file name for an artifact: the trimmed name of a `File`
 * when it has one, otherwise the sanitized output key plus an extension
 * derived from the MIME type (when one can be derived).
 */
function getArtifactFileName(params) {
	const { outputKey, mimeType, value } = params;
	if (isFile(value)) {
		const providedName = value.name.trim();
		if (providedName.length > 0) return providedName;
	}
	const extension = getExtensionForMimeType(mimeType);
	if (extension.length > 0) return `${sanitizeSegment(outputKey)}${extension}`;
	return sanitizeSegment(outputKey);
}
|
|
1921
|
+
/**
 * Derive a file extension (with leading dot) from a MIME type.
 *
 * MIME parameters (`; charset=...`) and letter case are ignored. Known types
 * come from `mimeTypeExtensionMap`; unknown types fall back to the subtype
 * with any structured-syntax suffix (`+xml`, `+json`, ...) removed.
 *
 * @param mimeType MIME type string, optionally with parameters.
 * @returns The extension, or `""` when the type has no usable subtype.
 */
function getExtensionForMimeType(mimeType) {
	// Only the "type/subtype" essence identifies the format; parameters would
	// otherwise end up inside the generated extension.
	const essence = mimeType.split(";", 1)[0].trim().toLowerCase();
	// Own-property check so inherited keys such as "constructor" never match.
	if (Object.hasOwn(mimeTypeExtensionMap, essence)) return mimeTypeExtensionMap[essence];
	const subtype = essence.split("/")[1];
	if (subtype === void 0 || subtype.length === 0) return "";
	const withoutSuffix = subtype.split("+")[0] ?? subtype;
	return withoutSuffix.length > 0 ? `.${withoutSuffix}` : "";
}
|
|
1929
|
+
/**
 * Make a string safe to use as one segment of an artifact id: trim it and
 * collapse every run of characters outside `A-Za-z0-9._-` into a single `-`.
 * Returns `"artifact"` when the result would be empty.
 */
function sanitizeSegment(value) {
	const trimmed = value.trim();
	const safe = trimmed.replace(/[^A-Za-z0-9._-]+/g, "-");
	return safe || "artifact";
}
|
|
1933
|
+
/**
 * Sanitize a file name for use in an artifact id. The name is passed through
 * `sanitizeSegment`, then every dot except the one starting the extension is
 * replaced with `-`. Names without an extension are returned as sanitized.
 */
function sanitizeFileName(value) {
	const safeName = sanitizeSegment(value);
	const extension = extname(safeName);
	if (extension === "") return safeName;
	const stem = safeName.slice(0, safeName.length - extension.length);
	return stem.split(".").join("-") + extension;
}
|
|
1939
|
+
/** Report whether `value` is an instance of the global `File` class. */
function isFile(value) {
	const matchesFile = value instanceof File;
	return matchesFile;
}
|
|
1942
|
+
//#endregion
|
|
1943
|
+
//#region ../runner/src/runMaintenance.ts
|
|
1944
|
+
/**
 * Write a run's state into its run directory, one file after another:
 * `summary.json` and `run.json` as pretty-printed JSON, then `cases.jsonl`
 * with one JSON document per case row separated by newlines.
 */
async function persistRunState(runState) {
	const { runDir } = runState;
	// Contents are produced lazily so each file is serialized right before it
	// is written, in this order.
	const outputs = [
		["summary.json", () => JSON.stringify(runState.summary, null, 2)],
		["run.json", () => JSON.stringify(runState.manifest, null, 2)],
		["cases.jsonl", () => runState.cases.map((caseRow) => JSON.stringify(caseRow)).join("\n")],
	];
	for (const [fileName, serialize] of outputs) {
		await writeFile(join(runDir, fileName), serialize());
	}
}
|
|
1950
|
+
/**
 * Recompute a persisted case's status after score definitions changed.
 *
 * Pass/fail gates are per-score: a case fails when any score with a declared
 * `passThreshold` reports a numeric value below that threshold. Scores
 * without a threshold are informational and never gate. Cancelled and
 * errored cases retain their terminal status.
 *
 * @param caseRow Persisted case row (`status`, `columns`).
 * @param caseDetail Persisted case detail, if available.
 * @param scoreThresholds Iterable of `[scoreKey, passThreshold]` pairs.
 * @returns `"cancelled"`, `"error"`, `"fail"` or `"pass"`.
 */
function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
	if (caseRow.status === "cancelled") return "cancelled";
	// An error recorded on either the row or the detail is terminal. Checking
	// the row here (before the gates) keeps an errored case from being turned
	// into "fail" when its detail record is unavailable.
	if (caseRow.status === "error") return "error";
	if (caseDetail?.error !== null && caseDetail?.error !== void 0) return "error";
	if ((caseDetail?.assertionFailures.length ?? 0) > 0) return "fail";
	for (const [key, passThreshold] of scoreThresholds) {
		// The row's value wins; the detail is only a fallback.
		const rawValue = caseRow.columns[key] ?? caseDetail?.columns[key];
		// Non-numeric or missing values never gate.
		if (typeof rawValue !== "number") continue;
		if (rawValue < passThreshold) return "fail";
	}
	return "pass";
}
|
|
1969
|
+
/**
 * Decide whether a run involved the given eval: true when any case row
 * belongs to it; otherwise decided by the run target — `"all"` counts when
 * the eval exists, `"evalIds"` when the eval id is listed.
 */
function runTouchesEval(params) {
	const hasCaseForEval = params.caseRows.some((caseRow) => caseRow.evalId === params.evalId);
	if (hasCaseForEval) return true;
	switch (params.target.mode) {
		case "all":
			return params.evalExists;
		case "evalIds":
			return params.target.evalIds?.includes(params.evalId) ?? false;
		default:
			return false;
	}
}
|
|
1975
|
+
/**
 * Re-derive the status of every persisted case of one eval across the given
 * runs, using the supplied score thresholds.
 *
 * Runs that do not involve the eval, or that are still running, are skipped.
 * For each case whose status changes, the row (and detail, when present) is
 * updated and the detail persisted. Runs with at least one change get their
 * summary counters refreshed and are written back to disk.
 *
 * @returns The number of runs that were updated.
 */
async function recomputeEvalStatusesInRuns(params) {
	let updatedRuns = 0;
	for (const run of params.runs) {
		const involvesEval = runTouchesEval({
			target: run.manifest.target,
			caseRows: run.cases,
			evalId: params.evalId,
			evalExists: params.evalExists,
		});
		if (!involvesEval || run.manifest.status === "running") continue;

		let anyStatusChanged = false;
		for (const caseRow of run.cases) {
			if (caseRow.evalId !== params.evalId) continue;
			const caseDetail = run.caseDetails.get(caseRow.caseId);
			const recomputedStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
			if (recomputedStatus === caseRow.status) continue;
			caseRow.status = recomputedStatus;
			if (caseDetail) {
				caseDetail.status = recomputedStatus;
				await params.persistCaseDetail(run.runDir, caseDetail);
			}
			anyStatusChanged = true;
		}
		if (!anyStatusChanged) continue;

		// Only the five counters are refreshed; other summary fields are kept.
		const { totalCases, passedCases, failedCases, errorCases, cancelledCases } = deriveScopedSummaryFromCases({ caseRows: run.cases });
		Object.assign(run.summary, { totalCases, passedCases, failedCases, errorCases, cancelledCases });
		await persistRunState(run);
		updatedRuns += 1;
	}
	return updatedRuns;
}
|
|
2010
|
+
//#endregion
|
|
2011
|
+
//#region ../runner/src/traceDisplay.ts
|
|
2012
|
+
/** Report whether `value` is a non-null value whose `typeof` is `"object"` (arrays included). */
function isRecord$1(value) {
	return value !== null && typeof value === "object";
}
|
|
2015
|
+
/**
 * Read a value at a dot-separated `path` inside `value`. Returns `undefined`
 * as soon as a step is not an object or does not have the next key.
 */
function getNestedAttribute(value, path) {
	const segments = path.split(".");
	let cursor = value;
	for (let i = 0; i < segments.length; i += 1) {
		const segment = segments[i];
		const canDescend = isRecord$1(cursor) && segment in cursor;
		if (!canDescend) return undefined;
		cursor = cursor[segment];
	}
	return cursor;
}
|
|
2024
|
+
/**
 * Return a copy of `value` with `attributeValue` set at the dot-separated
 * `path`. Every object along the path is shallow-copied (or created when the
 * existing step is not an object), so the input is not mutated.
 */
function mergeNestedAttribute(value, path, attributeValue) {
	const root = value === undefined ? {} : { ...value };
	const segments = path.split(".");
	// `split` always yields at least one segment; the last one is the leaf key.
	const leafKey = segments[segments.length - 1];
	let cursor = root;
	for (const segment of segments.slice(0, -1)) {
		const existing = cursor[segment];
		const copy = isRecord$1(existing) ? { ...existing } : {};
		cursor[segment] = copy;
		cursor = copy;
	}
	cursor[leafKey] = attributeValue;
	return root;
}
|
|
2040
|
+
/**
 * Resolve how a trace should be displayed.
 *
 * Display attributes from the global config and from the eval are merged by
 * `key ?? path`, with eval entries replacing global ones. Attributes with a
 * `transform` get their transformed value written to each span under
 * `__display.<key ?? path>`, and the resolved attribute points at that path.
 * Spans are copied first, so the input spans are left untouched.
 *
 * @returns `{ trace, traceDisplay }` with the copied spans and the resolved
 *   attribute list (without the `transform` functions).
 */
function resolveTracePresentation(spans, globalTraceDisplay, evalTraceDisplay) {
	const attributesById = /* @__PURE__ */ new Map();
	const declared = [...(globalTraceDisplay?.attributes ?? []), ...(evalTraceDisplay?.attributes ?? [])];
	for (const attribute of declared) attributesById.set(attribute.key ?? attribute.path, attribute);

	const trace = spans.map((span) => {
		const attributes = span.attributes === undefined ? undefined : { ...span.attributes };
		return { ...span, attributes };
	});

	const resolved = [];
	for (const attribute of attributesById.values()) {
		const hasTransform = Boolean(attribute.transform);
		const path = hasTransform ? `__display.${attribute.key ?? attribute.path}` : attribute.path;
		const { key, label, format, numberFormat, placements, scope, mode } = attribute;
		resolved.push({ key, path, label, format, numberFormat, placements, scope, mode });
		if (!hasTransform) continue;
		for (const span of trace) {
			const sourceValue = getNestedAttribute(span.attributes, attribute.path);
			if (sourceValue === undefined) continue;
			const displayValue = attribute.transform({ value: sourceValue, span });
			// A transform returning undefined leaves the span unchanged.
			if (displayValue === undefined) continue;
			span.attributes = mergeNestedAttribute(span.attributes, path, displayValue);
		}
	}

	return {
		trace,
		traceDisplay: { attributes: resolved },
	};
}
|
|
2078
|
+
//#endregion
|
|
2079
|
+
//#region ../runner/src/runExecution.ts
|
|
2080
|
+
/**
 * Narrow an eval's cases to the run target. Returns no cases when specific
 * eval ids were requested and this eval is not among them; returns all cases
 * when no case ids were requested; otherwise only the cases whose id was
 * requested.
 */
function filterEvalCases(cases, evalIds, caseIds, evalId) {
	const evalExcluded = Boolean(evalIds) && evalIds.length > 0 && !evalIds.includes(evalId);
	if (evalExcluded) return [];
	const hasCaseFilter = Boolean(caseIds) && caseIds.length !== 0;
	if (!hasCaseFilter) return cases;
	const requestedCaseIds = new Set(caseIds);
	return cases.filter(({ id }) => requestedCaseIds.has(id));
}
|
|
2086
|
+
/**
 * Return the cases to run for an eval. When the eval has no cases, a single
 * placeholder case with id `<evalId>-no-output` and an empty input is used.
 */
function resolveRunnableEvalCases(params) {
	const { cases, evalId } = params;
	const hasCases = cases.length > 0;
	return hasCases ? cases : [{ id: `${evalId}-no-output`, input: {} }];
}
|
|
2094
|
+
/**
 * Call `fn` with the given argument list and no `this` value, awaiting its
 * result. A synchronous throw from `fn` surfaces as a rejected promise.
 */
async function callWithUnknownResult(fn, args) {
	const pending = Reflect.apply(fn, undefined, args);
	return await pending;
}
|
|
2097
|
+
/**
 * Execute one eval case for one trial and assemble its persisted detail plus
 * the row update (status, latency, columns).
 *
 * Steps, in order:
 * 1. Run `evalDef.execute` inside an eval scope.
 * 2. Unless execution hit a non-assertion error, let `deriveFromTracing` add
 *    outputs that were not already set.
 * 3. Unless execution hit a non-assertion error, compute each declared score
 *    in its own scope; a throwing or non-numeric score is recorded as an
 *    assertion failure with value 0.
 * 4. Decide the status: `"error"` for a non-assertion error, otherwise
 *    `"pass"` only with no assertion failures and no score below its
 *    `passThreshold`.
 * 5. Convert outputs to cells (Blobs are persisted as run artifacts) and set
 *    every manual-score column to `null`.
 */
async function runCase(params) {
	const { evalDef, evalId, evalCase, globalTraceDisplay, trial, signal, startTime, cacheAdapter, cacheMode, codeFingerprint, artifactDir, runId } = params;

	// A cache context is only supplied when a cache adapter is configured.
	const cacheContextFor = (scopedEvalId) => cacheAdapter ? {
		adapter: cacheAdapter,
		mode: cacheMode,
		evalId: scopedEvalId,
		codeFingerprint,
	} : undefined;

	const execution = await runInEvalScope(evalCase.id, async () => {
		await Reflect.apply(evalDef.execute, evalDef, [{
			input: evalCase.input,
			signal,
		}]);
	}, { cacheContext: cacheContextFor(evalId) });
	const { scope } = execution;
	const executeError = execution.error;
	const traceTree = buildTraceTree(scope.spans, scope.checkpoints);

	// Assertion errors mean "fail"; any other error means "error".
	const isAssertionError = executeError instanceof EvalAssertionError;
	const nonAssertError = executeError && !isAssertionError ? executeError : null;
	if (isAssertionError && scope.assertionFailures.length === 0) {
		scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
	}

	if (!nonAssertError && evalDef.deriveFromTracing) {
		try {
			const derived = await callWithUnknownResult(evalDef.deriveFromTracing, [{
				trace: traceTree,
				input: evalCase.input,
				case: evalCase,
			}]);
			if (!isRecord(derived)) throw new Error("deriveFromTracing must return an object");
			for (const [key, value] of Object.entries(derived)) {
				// Outputs set during execution take precedence over derived ones.
				if (key in scope.outputs) continue;
				scope.outputs[key] = value;
			}
		} catch (e) {
			const thrownError = e instanceof Error ? e : undefined;
			const message = `deriveFromTracing threw: ${thrownError ? thrownError.message : String(e)}`;
			scope.assertionFailures.push(toAssertionFailure(message, thrownError));
		}
	}

	const scoreResults = /* @__PURE__ */ new Map();
	const scoringTraces = {};
	const scoreEntries = !nonAssertError && evalDef.scores ? Object.entries(evalDef.scores) : [];
	for (const [key, def] of scoreEntries) {
		const { compute, passThreshold, label } = normalizeScoreDef(def);
		const scoreRun = await runInEvalScope(evalCase.id, async () => await callWithUnknownResult(compute, [{
			input: evalCase.input,
			outputs: { ...scope.outputs },
			case: evalCase,
		}]), { cacheContext: cacheContextFor(`${evalId}__score__${key}`) });

		const scoreTrace = resolveTracePresentation(scoreRun.scope.spans, globalTraceDisplay, evalDef.traceDisplay);
		if (scoreTrace.trace.length > 0) {
			scoringTraces[key] = {
				trace: scoreTrace.trace,
				traceDisplay: scoreTrace.traceDisplay,
			};
		}

		// A failed or malformed score is recorded as 0 plus an assertion failure.
		let value = 0;
		if (scoreRun.error) {
			const message = `score "${key}" threw: ${scoreRun.error.message}`;
			scope.assertionFailures.push(toAssertionFailure(message, scoreRun.error));
		} else if (typeof scoreRun.result !== "number") {
			scope.assertionFailures.push(toAssertionFailure(`score "${key}" must return a number`));
		} else {
			value = scoreRun.result;
		}
		scope.outputs[key] = value;
		scoreResults.set(key, { value, passThreshold, label });
	}

	const meetsAllThresholds = () => [...scoreResults.values()].every((entry) => !(entry.passThreshold !== undefined && entry.value < entry.passThreshold));
	const passed = scope.assertionFailures.length === 0 && !nonAssertError && meetsAllThresholds();
	const status = nonAssertError ? "error" : passed ? "pass" : "fail";

	const { trace: displayTrace, traceDisplay } = resolveTracePresentation(scope.spans, globalTraceDisplay, evalDef.traceDisplay);

	const columns = {};
	for (const [key, value] of Object.entries(scope.outputs)) {
		let cell;
		if (isBlob(value)) {
			cell = await persistInlineArtifact({
				artifactDir,
				runId,
				caseId: evalCase.id,
				outputKey: key,
				trial,
				value,
			});
		} else {
			cell = toCellValue(value, evalDef.columns?.[key]);
		}
		if (cell !== undefined) columns[key] = cell;
	}
	// Manual scores start out empty, replacing any output with the same key.
	for (const key of Object.keys(evalDef.manualScores ?? {})) columns[key] = null;

	const errorInfo = nonAssertError ? {
		name: nonAssertError.name,
		message: nonAssertError.message,
		stack: nonAssertError.stack,
	} : null;
	const caseDetail = {
		caseId: evalCase.id,
		evalId,
		status,
		input: evalCase.input,
		trace: displayTrace,
		traceDisplay,
		columns,
		assertionFailures: scope.assertionFailures,
		error: errorInfo,
		trial,
	};
	if (Object.keys(scoringTraces).length > 0) caseDetail.scoringTraces = scoringTraces;

	return {
		caseDetail,
		caseRowUpdate: {
			status,
			latencyMs: Date.now() - startTime,
			columns,
		},
	};
}
|
|
2223
|
+
/** Report whether `value` is non-null and has `typeof value === "object"`. */
function isRecord(value) {
	if (value === null) return false;
	return typeof value === "object";
}
|
|
2226
|
+
/** Report whether `value` is an instance of the global `Blob` class. */
function isBlob(value) {
	const isBlobInstance = value instanceof Blob;
	return isBlobInstance;
}
|
|
2229
|
+
/**
 * Build an assertion-failure record.
 *
 * @param message Text stored under `message`.
 * @param error Optional error; its `stack` is copied onto the record only
 *   when it is truthy.
 * @returns `{ message }` or `{ message, stack }`.
 */
function toAssertionFailure(message, error = void 0) {
	const failure = { message };
	if (error?.stack) failure.stack = error.stack;
	return failure;
}
|
|
2235
|
+
//#endregion
|
|
2236
|
+
//#region ../runner/src/runPersistence.ts
|
|
2237
|
+
const SHORT_ID_PATTERN = /^r(\d+)$/;
|
|
2238
|
+
/**
 * Generate a filesystem-safe, sortable run id combining a UTC timestamp
 * with a short random suffix.
 *
 * @returns {string} An id shaped like `YYYY-MM-DDTHH-MM-SSZ_xxxxxx`, where
 *   the suffix is always exactly six base-36 characters.
 */
function generateRunId() {
	const now = new Date();
	const pad = (n) => String(n).padStart(2, "0");
	const datePart = `${String(now.getUTCFullYear())}-${pad(now.getUTCMonth() + 1)}-${pad(now.getUTCDate())}`;
	const timePart = `${pad(now.getUTCHours())}-${pad(now.getUTCMinutes())}-${pad(now.getUTCSeconds())}`;
	// `Math.random().toString(36)` may produce fewer than six fractional digits
	// (0.5 -> "0.i", 0 -> "0"), so pad to keep the suffix length fixed and the
	// id never ending in a bare underscore.
	const suffix = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
	return `${datePart}T${timePart}Z_${suffix}`;
}
|
|
2247
|
+
/**
 * Extract the number captured by `SHORT_ID_PATTERN` from a short id.
 *
 * @param shortId Short id string, or `undefined`.
 * @returns The captured number, or `null` when `shortId` is undefined, does
 *   not match the pattern, or converts to a non-finite number.
 */
function parseShortIdNum(shortId) {
	const matched = shortId === void 0 ? null : SHORT_ID_PATTERN.exec(shortId);
	if (matched === null) return null;
	const parsed = Number(matched[1]);
	return Number.isFinite(parsed) ? parsed : null;
}
|
|
2255
|
+
/**
 * Compute the next `shortId` number from the already loaded snapshots:
 * one more than the largest number parsed from any `manifest.shortId`.
 * Snapshots whose short id cannot be parsed (for example legacy runs not in
 * the `r\d+` format) are skipped; with no parseable ids the result is 0.
 */
function nextShortIdFromSnapshots(snapshots) {
	let highest = -1;
	for (const { manifest } of snapshots) {
		const parsed = parseShortIdNum(manifest.shortId);
		if (parsed === null) continue;
		if (parsed > highest) highest = parsed;
	}
	return highest + 1;
}
|
|
2268
|
+
/**
 * Load every run snapshot stored under `<localStateDir>/runs`.
 *
 * Sub-directories are visited one at a time in sorted path order, and any
 * directory for which `loadPersistedRunSnapshot` yields a falsy value is
 * skipped. Returns an empty array when listing the runs directory reports
 * an error.
 */
async function loadPersistedRunSnapshots(localStateDir) {
	const runsDir = join(localStateDir, "runs");
	const listing = await resultify(() => readdir(runsDir, { withFileTypes: true }));
	if (listing.error) return [];
	const sortedRunDirs = listing.value
		.filter((entry) => entry.isDirectory())
		.map((entry) => join(runsDir, entry.name))
		.toSorted();
	const loaded = [];
	for (const runDir of sortedRunDirs) {
		const snapshot = await loadPersistedRunSnapshot(runDir);
		if (snapshot) loaded.push(snapshot);
	}
	return loaded;
}
|
|
2281
|
+
/**
 * Write `caseDetail` as pretty-printed JSON (2-space indent) to
 * `<runDir>/case-details/<encoded caseId>.json`.
 */
async function persistCaseDetail(runDir, caseDetail) {
	const fileName = `${encodeCaseDetailFileName(caseDetail.caseId)}.json`;
	const targetPath = join(runDir, "case-details", fileName);
	const serialized = JSON.stringify(caseDetail, null, 2);
	await writeFile(targetPath, serialized);
}
|
|
2284
|
+
/**
 * Map each eval id to the `status` of its latest run info, as computed by
 * `getLatestRunInfos(params)`.
 */
function getLastRunStatuses(params) {
	const statuses = new Map();
	for (const [evalId, info] of getLatestRunInfos(params)) statuses.set(evalId, info.status);
	return statuses;
}
|
|
2288
|
+
/**
|
|
2289
|
+
* Return the latest scoped run metadata for each eval based on persisted and
|
|
2290
|
+
* in-memory runs.
|
|
2291
|
+
*/
|
|
2292
|
+
function getLatestRunInfos(params) {
|
|
2293
|
+
const { runs, knownEvals } = params;
|
|
2294
|
+
const knownEvalMetas = [...knownEvals];
|
|
2295
|
+
const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.id, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
|
|
2296
|
+
const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
|
|
2297
|
+
const latestRunInfos = /* @__PURE__ */ new Map();
|
|
2298
|
+
for (const run of orderedRuns) for (const evalId of getRunEvalIds(run, knownEvalMetas.map((evalMeta) => evalMeta.id))) latestRunInfos.set(evalId, {
|
|
2299
|
+
status: getEvalStatusForRun(run, evalId, manualScoreKeysByEval.get(evalId) ?? []),
|
|
2300
|
+
startedAt: getRunFreshnessTimestamp(run.manifest),
|
|
2301
|
+
commitSha: run.manifest.commitSha ?? null,
|
|
2302
|
+
evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalId] ?? null
|
|
2303
|
+
});
|
|
2304
|
+
return latestRunInfos;
|
|
2305
|
+
}
|
|
2306
|
+
/** Convert a derived status to a last-run status: "pending" becomes `null`, anything else is returned as is. */
function toLastRunStatus$1(status) {
	if (status === "pending") return null;
	return status;
}
|
|
2309
|
+
/**
 * Load one persisted run from `runDir`.
 *
 * Reads and validates `run.json` and then `summary.json`; returns `null` as
 * soon as either one is unavailable or invalid. Otherwise the case rows and
 * case details are read (in that order) and returned with the manifest and
 * summary.
 */
async function loadPersistedRunSnapshot(runDir) {
	const manifestPath = join(runDir, "run.json");
	const manifest = await readParsedJsonFile(manifestPath, { safeParse: runManifestSchema.safeParse.bind(runManifestSchema) });
	if (!manifest) return null;
	const summaryPath = join(runDir, "summary.json");
	const summary = await readParsedJsonFile(summaryPath, { safeParse: runSummarySchema.safeParse.bind(runSummarySchema) });
	if (!summary) return null;
	const cases = await readCaseRows(runDir);
	const caseDetails = await readCaseDetails(runDir);
	return {
		runDir,
		manifest,
		summary,
		cases,
		caseDetails
	};
}
|
|
2322
|
+
/**
 * Read a UTF-8 file, parse it as JSON and validate it with `schema.safeParse`.
 *
 * @param filePath Path of the file to read.
 * @param schema Object exposing `safeParse(value)` returning `{ success, data }`.
 * @returns The validated data, or `null` when reading, JSON parsing, or
 *   validation does not succeed.
 */
async function readParsedJsonFile(filePath, schema) {
	const raw = await resultify(() => readFile(filePath, "utf-8"));
	if (raw.error) return null;
	const decoded = resultify(() => JSON.parse(raw.value));
	if (decoded.error) return null;
	const validation = schema.safeParse(decoded.value);
	return validation.success ? validation.data : null;
}
|
|
2331
|
+
/**
 * Read the case rows stored in `<runDir>/cases.jsonl`, one JSON document per
 * line.
 *
 * Blank lines, lines that are not valid JSON, and rows rejected by
 * `caseRowSchema` are skipped. Returns an empty array when reading the file
 * reports an error.
 */
async function readCaseRows(runDir) {
	const contents = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
	if (contents.error) return [];
	const nonEmptyLines = contents.value
		.split("\n")
		.map((rawLine) => rawLine.trim())
		.filter((line) => line.length > 0);
	const rows = [];
	for (const line of nonEmptyLines) {
		const decoded = resultify(() => JSON.parse(line));
		if (decoded.error) continue;
		const validated = caseRowSchema.safeParse(decoded.value);
		if (validated.success) rows.push(validated.data);
	}
	return rows;
}
|
|
2346
|
+
/**
 * Read every `*.json` file in `<runDir>/case-details` into a Map keyed by
 * the `caseId` stored inside each validated detail.
 *
 * Non-file entries, other extensions, and files that fail to load or
 * validate are skipped. Returns an empty Map when listing the directory
 * reports an error.
 */
async function readCaseDetails(runDir) {
	const detailsDir = join(runDir, "case-details");
	const listing = await resultify(() => readdir(detailsDir, { withFileTypes: true }));
	const caseDetails = new Map();
	if (listing.error) return caseDetails;
	const jsonEntries = listing.value.filter((entry) => entry.isFile() && entry.name.endsWith(".json"));
	for (const entry of jsonEntries) {
		const detailPath = join(detailsDir, entry.name);
		const detail = await readParsedJsonFile(detailPath, { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
		if (detail) caseDetails.set(detail.caseId, detail);
	}
	return caseDetails;
}
|
|
2359
|
+
/**
 * List the distinct eval ids a run refers to.
 *
 * Starts from the eval ids of the run's case rows. For an "evalIds" target
 * the explicitly targeted ids are added; for an "all" target with no case
 * rows, every id in `knownEvalIds` is used instead.
 */
function getRunEvalIds(run, knownEvalIds) {
	const collected = new Set(run.cases.map((caseRow) => caseRow.evalId));
	const { target } = run.manifest;
	if (target.mode === "evalIds") {
		for (const evalId of target.evalIds ?? []) collected.add(evalId);
	} else if (target.mode === "all" && collected.size === 0) {
		for (const evalId of knownEvalIds) collected.add(evalId);
	}
	return Array.from(collected);
}
|
|
2365
|
+
/**
 * Derive the last-run status of one eval within a run.
 *
 * With no case rows for the eval, the status comes from the run's lifecycle
 * status alone. Otherwise the result is "unscored" while any manual score is
 * still pending, and the status derived from the eval's case rows when none is.
 */
function getEvalStatusForRun(run, evalId, manualScoreKeys) {
	const evalCases = run.cases.filter((caseRow) => caseRow.evalId === evalId);
	if (evalCases.length === 0) {
		const lifecycleDerived = deriveStatusFromChildStatuses({
			statuses: [],
			lifecycleStatus: run.manifest.status
		});
		return toLastRunStatus$1(lifecycleDerived);
	}
	if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
	return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
}
|
|
2376
|
+
/**
 * Report whether any case row lacks a finite numeric value for any of the
 * given manual score keys. Always `false` when there are no manual score keys.
 */
function hasPendingManualScores(caseRows, manualScoreKeys) {
	if (manualScoreKeys.length === 0) return false;
	const isScored = (value) => typeof value === "number" && Number.isFinite(value);
	for (const caseRow of caseRows) {
		for (const key of manualScoreKeys) {
			if (!isScored(caseRow.columns[key])) return true;
		}
	}
	return false;
}
|
|
2383
|
+
/** Turn a case id into a file-name stem by percent-encoding it with `encodeURIComponent`. */
function encodeCaseDetailFileName(caseId) {
	const encodedName = encodeURIComponent(caseId);
	return encodedName;
}
|
|
2386
|
+
//#endregion
|
|
2387
|
+
//#region ../runner/src/runQueue.ts
|
|
2388
|
+
/**
 * Run all queued cases with a bounded number of concurrent workers.
 *
 * Each worker repeatedly claims the next unclaimed case and runs it through
 * `executeQueuedCase`. Workers stop claiming cases once the run's abort
 * signal is set or any worker has failed.
 *
 * @param params.runState Run state; only `abortController.signal` is read here.
 * @param params.queuedCases Array of queued cases, executed in index order.
 * @param params.concurrency Requested worker count. Zero, negative, NaN or
 *   missing values fall back to a single worker so queued cases are never
 *   silently skipped; the count is capped at the number of queued cases.
 * @param params.globalTraceDisplay Forwarded unchanged to every case.
 * @throws {Error} The failure recorded from a worker; non-Error throwables
 *   are wrapped in an `Error` carrying their string form.
 */
async function executeQueuedCases(params) {
	const { runState, queuedCases, concurrency, globalTraceDisplay } = params;
	if (queuedCases.length === 0) return;
	const { signal } = runState.abortController;
	const requested = Number(concurrency);
	const workerCount = Math.min(
		Math.max(1, Math.floor(Number.isNaN(requested) ? 1 : requested)),
		queuedCases.length
	);
	let nextCaseIndex = 0;
	let workerError = void 0;
	const runWorker = async () => {
		while (!signal.aborted && workerError === void 0) {
			// Claiming is synchronous (no await between read and increment), so
			// two workers can never pick up the same case.
			const queuedCase = queuedCases[nextCaseIndex];
			nextCaseIndex += 1;
			if (queuedCase === void 0) return;
			try {
				await executeQueuedCase({
					queuedCase,
					runState,
					globalTraceDisplay
				});
			} catch (error) {
				workerError = error instanceof Error ? error : new Error(String(error));
				return;
			}
		}
	};
	await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
	// `workerError` is always an Error when set, so a single rethrow suffices.
	if (workerError !== void 0) throw workerError;
}
|
|
2414
|
+
/**
 * Execute a single queued case and hand its result to the case's
 * `onComplete` callback.
 *
 * The case's `execute` receives the global trace display, the run's abort
 * signal, and the wall-clock start time captured just before the call.
 */
async function executeQueuedCase(params) {
	const { queuedCase, runState, globalTraceDisplay } = params;
	const startTime = Date.now();
	const { signal } = runState.abortController;
	const outcome = await queuedCase.execute({
		globalTraceDisplay,
		signal,
		startTime
	});
	await queuedCase.onComplete(outcome);
}
|
|
2424
|
+
//#endregion
|
|
2425
|
+
//#region ../runner/src/runOrchestration.ts
|
|
2426
|
+
/**
 * Rank a case status from worst to best: `error` is 0, `pass` is 2, and
 * every other status shares rank 1 (it compares like `fail`). The ranking
 * orders trial attempts so the pessimistic (`lowestScore`) strategy can pick
 * the worst one.
 */
function statusRank(status) {
	switch (status) {
		case "error": return 0;
		case "pass": return 2;
		default: return 1;
	}
}
|
|
2437
|
+
/**
 * Find the smallest finite numeric value among a trial's declared score
 * columns. Yields `-Infinity` when none of the keys holds a finite number.
 * Serves as the tiebreaker between trials that share the same status.
 */
function minScoreValue(caseRow, scoreKeys) {
	const lowest = scoreKeys
		.map((key) => caseRow.columns[key])
		.filter((candidate) => typeof candidate === "number" && Number.isFinite(candidate))
		.reduce((least, candidate) => candidate < least ? candidate : least, Number.POSITIVE_INFINITY);
	return lowest === Number.POSITIVE_INFINITY ? Number.NEGATIVE_INFINITY : lowest;
}
|
|
2452
|
+
/**
 * Comparator ordering trial attempts from worst to best.
 *
 * Attempts are compared by status rank first, then by their minimum score
 * value, and finally by trial index so the ordering is deterministic.
 *
 * @param left Attempt whose `caseRow` is compared.
 * @param right Attempt whose `caseRow` is compared.
 * @param scoreKeys Column keys considered when computing the minimum score.
 * @returns Negative when `left` sorts first, positive when `right` does,
 *   0 when the attempts are indistinguishable.
 */
function compareTrialResults(left, right, scoreKeys) {
	const statusDiff = statusRank(left.caseRow.status) - statusRank(right.caseRow.status);
	if (statusDiff !== 0) return statusDiff;
	const leftMin = minScoreValue(left.caseRow, scoreKeys);
	const rightMin = minScoreValue(right.caseRow, scoreKeys);
	// Compare before subtracting: when neither attempt has a numeric score both
	// minimums are -Infinity and their difference is NaN, which would bypass
	// the trial-index tie-break below.
	if (leftMin !== rightMin) return leftMin - rightMin;
	return left.caseRow.trial - right.caseRow.trial;
}
|
|
2459
|
+
function pickWinningTrial(params) {
|
|
2460
|
+
const orderedAttempts = [...params.attempts].toSorted((left, right) => compareTrialResults(left, right, params.scoreKeys));
|
|
2461
|
+
if (params.strategy === "lowestScore") {
|
|
2462
|
+
const [lowestAttempt] = orderedAttempts;
|
|
2463
|
+
if (lowestAttempt === void 0) throw new Error("Expected at least one trial attempt");
|
|
2464
|
+
return lowestAttempt;
|
|
2465
|
+
}
|
|
2466
|
+
const medianAttempt = orderedAttempts[Math.floor((orderedAttempts.length - 1) / 2)];
|
|
2467
|
+
if (medianAttempt === void 0) throw new Error("Expected at least one trial attempt");
|
|
2468
|
+
return medianAttempt;
|
|
2469
|
+
}
|
|
2470
|
+
/**
 * Execute one run end to end: prepare the targeted evals, run every queued
 * case trial, pick a winning trial per case, persist the results, and emit
 * lifecycle events.
 *
 * Phases:
 * 1. For each target eval, fingerprint and load its source module, resolve
 *    its cases, and queue one job per case per trial. A failure while
 *    preparing an eval is recorded in `evalErrors` and does not stop the
 *    remaining evals.
 * 2. Run the queued jobs through `executeQueuedCases`.
 * 3. For each case with at least one finished trial, select the winning
 *    trial, commit its buffered cache (when present), update run state and
 *    summary counters, and write the trace and case-detail files.
 * 4. Finalize manifest/summary status, refresh the latest-run maps, emit the
 *    summary and terminal events, and persist the run state.
 *
 * Any error escaping those phases marks the run as "error", emits
 * `run.error`, and persists the run state; nothing is rethrown from that
 * handler itself.
 *
 * Trace files are named with `encodeCaseDetailFileName`, the same encoding
 * used for case-detail files, so a case id containing path separators
 * cannot escape or break the `traces` directory.
 * NOTE(review): if anything outside this function looks traces up by the raw
 * case id, it needs the same encoding — confirm against readers.
 */
async function executeRun({ runState, request, runDir, config, evals, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals }) {
	try {
		const targetEvals = getTargetEvals(request);
		emitEvent(runState, {
			type: "run.started",
			runId: runState.manifest.id,
			timestamp: (/* @__PURE__ */ new Date()).toISOString(),
			payload: runState.manifest
		});
		const evalErrors = [];
		const queuedCases = [];
		const preparedEvals = [];
		const cacheMode = runState.manifest.cacheMode ?? "use";
		const cacheEnabled = config.cache?.enabled !== false;
		for (const evalMeta of targetEvals) {
			if (runState.abortController.signal.aborted) break;
			const evalFilePath = evalMeta.sourceFilePath;
			// Best effort: an unreadable source file yields an empty fingerprint
			// and removes any fingerprint previously stored for this eval.
			let codeFingerprint = "";
			try {
				codeFingerprint = getSourceFingerprint(await readFile(evalFilePath, "utf-8"));
			} catch {
				codeFingerprint = "";
			}
			if (codeFingerprint.length > 0) runState.manifest.evalSourceFingerprints[evalMeta.id] = codeFingerprint;
			else delete runState.manifest.evalSourceFingerprints[evalMeta.id];
			try {
				const registry = getEvalRegistry();
				await loadEvalModule(evalFilePath, codeFingerprint);
				const entry = registry.get(evalMeta.id);
				if (!entry) {
					evalErrors.push({
						evalId: evalMeta.id,
						message: `Eval "${evalMeta.id}" was not registered after importing ${evalFilePath}`
					});
					continue;
				}
				await entry.use(async (evalDef) => {
					const cases = filterEvalCases(resolveRunnableEvalCases({
						cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
						evalId: evalMeta.id
					}), request.target.evalIds, request.target.caseIds, evalMeta.id);
					runState.summary.totalCases += cases.length;
					const accumulatedColumns = /* @__PURE__ */ new Map();
					const evalCaseRows = [];
					const preparedCases = [];
					const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
					const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
					preparedEvals.push({
						evalMeta,
						accumulatedColumns,
						evalCaseRows,
						preparedCases,
						scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
						mergeColumns: (columns) => {
							mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
						}
					});
					for (const evalCase of cases) {
						if (runState.abortController.signal.aborted) break;
						// Filled by each trial's `onComplete`; read after the queue drains.
						const trialResults = [];
						preparedCases.push({
							caseId: evalCase.id,
							trialResults
						});
						for (let trial = 0; trial < request.trials; trial++) {
							// Each trial gets its own buffered store; only the winning
							// trial's buffer is committed later.
							const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
							queuedCases.push({
								execute: async ({ startTime, signal, globalTraceDisplay }) => {
									const { caseDetail, caseRowUpdate } = await runCase({
										evalDef,
										evalId: evalMeta.id,
										evalCase,
										globalTraceDisplay,
										trial,
										signal,
										startTime,
										cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
										cacheMode,
										codeFingerprint,
										artifactDir: join(runDir, "artifacts"),
										runId: runState.manifest.id
									});
									return {
										caseDetail,
										caseRow: {
											caseId: evalCase.id,
											evalId: evalMeta.id,
											status: caseRowUpdate.status ?? "pending",
											latencyMs: caseRowUpdate.latencyMs ?? null,
											columns: caseRowUpdate.columns ?? {},
											trial
										}
									};
								},
								onComplete: ({ caseDetail, caseRow }) => {
									trialResults.push({
										caseDetail,
										caseRow,
										bufferedCacheStore
									});
								}
							});
						}
					}
				});
			} catch (error) {
				console.error(`Error running eval ${evalMeta.id}:`, error);
				evalErrors.push({
					evalId: evalMeta.id,
					message: error instanceof Error ? error.message : String(error)
				});
				lastRunStatusMap.set(evalMeta.id, "error");
				latestRunInfoMap.set(evalMeta.id, {
					status: "error",
					startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
					commitSha: runState.manifest.commitSha ?? null,
					evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalMeta.id] ?? null
				});
			}
		}
		await executeQueuedCases({
			runState,
			queuedCases,
			concurrency: getConfiguredConcurrency(),
			globalTraceDisplay: config.traceDisplay
		});
		for (const preparedEval of preparedEvals) {
			for (const preparedCase of preparedEval.preparedCases) {
				// No finished trial for this case (e.g. the queue stopped early).
				if (preparedCase.trialResults.length === 0) continue;
				const winningTrial = pickWinningTrial({
					strategy: runState.manifest.trialSelection,
					attempts: preparedCase.trialResults,
					scoreKeys: preparedEval.scoreKeys
				});
				if (winningTrial.bufferedCacheStore !== null) await winningTrial.bufferedCacheStore.commit();
				runState.cases.push(winningTrial.caseRow);
				runState.caseDetails.set(preparedCase.caseId, winningTrial.caseDetail);
				preparedEval.mergeColumns(winningTrial.caseDetail.columns);
				if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
				else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
				else runState.summary.failedCases++;
				// Encode the case id exactly like the case-detail file name.
				const traceFileName = `${encodeCaseDetailFileName(preparedCase.caseId)}.json`;
				await writeFile(join(runDir, "traces", traceFileName), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
				await persistCaseDetail(runDir, winningTrial.caseDetail);
				emitEvent(runState, {
					type: "case.finished",
					runId: runState.manifest.id,
					timestamp: (/* @__PURE__ */ new Date()).toISOString(),
					payload: winningTrial.caseRow
				});
				preparedEval.evalCaseRows.push(winningTrial.caseRow);
			}
			preparedEval.evalMeta.columnDefs = [...preparedEval.accumulatedColumns.values()];
			lastRunStatusMap.set(preparedEval.evalMeta.id, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
			const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.id) ?? null;
			latestRunInfoMap.set(preparedEval.evalMeta.id, {
				status: latestStatus,
				startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
				commitSha: runState.manifest.commitSha ?? null,
				evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.id] ?? null
			});
		}
		const endTime = /* @__PURE__ */ new Date();
		runState.summary.totalDurationMs = endTime.getTime() - new Date(runState.manifest.startedAt).getTime();
		// Cancellation takes precedence over eval preparation errors.
		const finalStatus = runState.abortController.signal.aborted ? "cancelled" : evalErrors.length > 0 ? "error" : "completed";
		runState.summary.status = finalStatus;
		runState.manifest.status = finalStatus;
		const completedRunAt = endTime.toISOString();
		runState.manifest.endedAt = completedRunAt;
		const evalErrorMessage = evalErrors.map((entry) => `[${entry.evalId}] ${entry.message}`).join("\n");
		runState.summary.errorMessage = evalErrors.length > 0 ? evalErrorMessage : null;
		for (const evalId of getTargetEvalIds({
			request,
			sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
			knownEvalIds: new Set(evals.keys())
		})) {
			const latestStatus = lastRunStatusMap.get(evalId) ?? toLastRunStatus(deriveStatusFromCaseRows({
				caseRows: [],
				lifecycleStatus: runState.manifest.status
			}));
			latestRunInfoMap.set(evalId, {
				status: latestStatus,
				startedAt: completedRunAt,
				commitSha: runState.manifest.commitSha ?? null,
				evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalId] ?? null
			});
		}
		emitEvent(runState, {
			type: "run.summary",
			runId: runState.manifest.id,
			timestamp: (/* @__PURE__ */ new Date()).toISOString(),
			payload: runState.summary
		});
		if (finalStatus === "error") emitEvent(runState, {
			type: "run.error",
			runId: runState.manifest.id,
			timestamp: (/* @__PURE__ */ new Date()).toISOString(),
			payload: { message: evalErrorMessage }
		});
		else emitEvent(runState, {
			type: "run.finished",
			runId: runState.manifest.id,
			timestamp: (/* @__PURE__ */ new Date()).toISOString(),
			payload: runState.summary
		});
		await persistRunState(runState);
		emitDiscoveryEvent();
	} catch (error) {
		const message = error instanceof Error ? error.message : String(error);
		runState.manifest.status = "error";
		runState.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
		runState.summary.status = "error";
		runState.summary.errorMessage = message;
		emitEvent(runState, {
			type: "run.error",
			runId: runState.manifest.id,
			timestamp: (/* @__PURE__ */ new Date()).toISOString(),
			payload: { message }
		});
		await persistRunState(runState);
		emitDiscoveryEvent();
	}
}
|
|
2693
|
+
/** Map a derived status to a last-run status: "pending" yields `null`, every other value passes through unchanged. */
function toLastRunStatus(status) {
	return status !== "pending" ? status : null;
}
|
|
2696
|
+
//#endregion
|
|
2697
|
+
//#region ../runner/src/runner.ts
|
|
2698
|
+
/** Create an in-memory eval runner bound to the current workspace config. */
|
|
2699
|
+
function createRunner({ watchForChanges = true } = {}) {
|
|
2700
|
+
let config;
|
|
2701
|
+
let workspaceRoot;
|
|
2702
|
+
let localStateDir;
|
|
2703
|
+
let cacheStore;
|
|
2704
|
+
const evals = /* @__PURE__ */ new Map();
|
|
2705
|
+
const runs = /* @__PURE__ */ new Map();
|
|
2706
|
+
const lastRunStatusMap = /* @__PURE__ */ new Map();
|
|
2707
|
+
const latestRunInfoMap = /* @__PURE__ */ new Map();
|
|
2708
|
+
const discoveryListeners = /* @__PURE__ */ new Set();
|
|
2709
|
+
let nextShortIdNum = 0;
|
|
2710
|
+
function toWorkspaceRelativePath(filePath) {
|
|
2711
|
+
return relative(workspaceRoot, filePath).replaceAll("\\", "/");
|
|
2712
|
+
}
|
|
2713
|
+
function getSortedEvalMetas() {
|
|
2714
|
+
return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
|
|
2715
|
+
}
|
|
2716
|
+
function getSourceFingerprint(source) {
|
|
2717
|
+
return createHash("sha256").update(source).digest("hex");
|
|
2718
|
+
}
|
|
2719
|
+
function getConfiguredConcurrency() {
|
|
2720
|
+
const configuredConcurrency = config.concurrency;
|
|
2721
|
+
if (typeof configuredConcurrency !== "number" || !Number.isFinite(configuredConcurrency)) return 1;
|
|
2722
|
+
return Math.max(1, Math.floor(configuredConcurrency));
|
|
2723
|
+
}
|
|
2724
|
+
const runner = {
|
|
2725
|
+
async init() {
|
|
2726
|
+
config = await loadConfig();
|
|
2727
|
+
workspaceRoot = config.workspaceRoot ?? process.cwd();
|
|
2728
|
+
localStateDir = resolve(workspaceRoot, ".agent-evals");
|
|
2729
|
+
await mkdir(localStateDir, { recursive: true });
|
|
2730
|
+
await mkdir(join(localStateDir, "runs"), { recursive: true });
|
|
2731
|
+
cacheStore = createFsCacheStore({
|
|
2732
|
+
workspaceRoot,
|
|
2733
|
+
dir: config.cache?.dir
|
|
2734
|
+
});
|
|
2735
|
+
await loadPersistedRuns();
|
|
2736
|
+
await runner.refreshDiscovery();
|
|
2737
|
+
if (watchForChanges) setupWatcher();
|
|
2738
|
+
},
|
|
2739
|
+
async listCache() {
|
|
2740
|
+
return cacheStore.list();
|
|
2741
|
+
},
|
|
2742
|
+
async clearCache(filter) {
|
|
2743
|
+
await cacheStore.clear(filter);
|
|
2744
|
+
},
|
|
2745
|
+
async recomputeStatusesForEval(evalId) {
|
|
2746
|
+
const evalMeta = evals.get(evalId);
|
|
2747
|
+
if (!evalMeta) return { updatedRuns: 0 };
|
|
2748
|
+
const registry = getEvalRegistry();
|
|
2749
|
+
await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? void 0);
|
|
2750
|
+
const entry = registry.get(evalId);
|
|
2751
|
+
if (!entry) return { updatedRuns: 0 };
|
|
2752
|
+
const scoreThresholds = /* @__PURE__ */ new Map();
|
|
2753
|
+
entry.use((evalDef) => {
|
|
2754
|
+
for (const [key, def] of Object.entries(evalDef.scores ?? {})) {
|
|
2755
|
+
const threshold = normalizeScoreDef(def).passThreshold;
|
|
2756
|
+
if (threshold !== void 0) scoreThresholds.set(key, threshold);
|
|
2757
|
+
}
|
|
2758
|
+
for (const [key, def] of Object.entries(evalDef.manualScores ?? {})) if (def.passThreshold !== void 0) scoreThresholds.set(key, def.passThreshold);
|
|
2759
|
+
});
|
|
2760
|
+
const updatedRuns = await recomputeEvalStatusesInRuns({
|
|
2761
|
+
runs: runs.values(),
|
|
2762
|
+
evalId,
|
|
2763
|
+
evalExists: evals.has(evalId),
|
|
2764
|
+
scoreThresholds,
|
|
2765
|
+
persistCaseDetail
|
|
2766
|
+
});
|
|
2767
|
+
emitDiscoveryEvent();
|
|
2768
|
+
return { updatedRuns };
|
|
2769
|
+
},
|
|
2770
|
+
async cleanRunsForEval(evalId) {
|
|
2771
|
+
let deletedRuns = 0;
|
|
2772
|
+
for (const [runId, run] of [...runs]) {
|
|
2773
|
+
if (!runTouchesEval({
|
|
2774
|
+
target: run.manifest.target,
|
|
2775
|
+
caseRows: run.cases,
|
|
2776
|
+
evalId,
|
|
2777
|
+
evalExists: evals.has(evalId)
|
|
2778
|
+
})) continue;
|
|
2779
|
+
if (run.manifest.status === "running") continue;
|
|
2780
|
+
runs.delete(runId);
|
|
2781
|
+
await rm(run.runDir, {
|
|
2782
|
+
recursive: true,
|
|
2783
|
+
force: true
|
|
2784
|
+
});
|
|
2785
|
+
deletedRuns += 1;
|
|
2786
|
+
}
|
|
2787
|
+
emitDiscoveryEvent();
|
|
2788
|
+
return { deletedRuns };
|
|
2789
|
+
},
|
|
2790
|
+
async updateManualScore({ runId, caseId, scoreKey, value }) {
|
|
2791
|
+
const run = runs.get(runId);
|
|
2792
|
+
if (!run) return {
|
|
2793
|
+
updated: false,
|
|
2794
|
+
reason: "Run not found"
|
|
2795
|
+
};
|
|
2796
|
+
if (run.manifest.status === "running") return {
|
|
2797
|
+
updated: false,
|
|
2798
|
+
reason: "Run is still running"
|
|
2799
|
+
};
|
|
2800
|
+
const caseRow = run.cases.find((row) => row.caseId === caseId);
|
|
2801
|
+
if (!caseRow) return {
|
|
2802
|
+
updated: false,
|
|
2803
|
+
reason: "Case not found"
|
|
2804
|
+
};
|
|
2805
|
+
const evalMeta = evals.get(caseRow.evalId);
|
|
2806
|
+
if (!evalMeta) return {
|
|
2807
|
+
updated: false,
|
|
2808
|
+
reason: "Eval not found"
|
|
2809
|
+
};
|
|
2810
|
+
if (evalMeta.columnDefs.find((def) => def.key === scoreKey)?.isManualScore !== true) return {
|
|
2811
|
+
updated: false,
|
|
2812
|
+
reason: "Manual score not found"
|
|
2813
|
+
};
|
|
2814
|
+
const caseDetail = run.caseDetails.get(caseId);
|
|
2815
|
+
if (!caseDetail) return {
|
|
2816
|
+
updated: false,
|
|
2817
|
+
reason: "Case detail not found"
|
|
2818
|
+
};
|
|
2819
|
+
caseRow.columns[scoreKey] = value;
|
|
2820
|
+
caseDetail.columns[scoreKey] = value;
|
|
2821
|
+
const scoreThresholds = /* @__PURE__ */ new Map();
|
|
2822
|
+
for (const def of evalMeta.columnDefs) {
|
|
2823
|
+
if (def.isScore !== true || def.passThreshold === void 0) continue;
|
|
2824
|
+
scoreThresholds.set(def.key, def.passThreshold);
|
|
2825
|
+
}
|
|
2826
|
+
const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds);
|
|
2827
|
+
caseRow.status = nextStatus;
|
|
2828
|
+
caseDetail.status = nextStatus;
|
|
2829
|
+
const derivedSummary = deriveScopedSummaryFromCases({ caseRows: run.cases });
|
|
2830
|
+
run.summary.totalCases = derivedSummary.totalCases;
|
|
2831
|
+
run.summary.passedCases = derivedSummary.passedCases;
|
|
2832
|
+
run.summary.failedCases = derivedSummary.failedCases;
|
|
2833
|
+
run.summary.errorCases = derivedSummary.errorCases;
|
|
2834
|
+
run.summary.cancelledCases = derivedSummary.cancelledCases;
|
|
2835
|
+
run.summary.totalDurationMs = derivedSummary.totalDurationMs;
|
|
2836
|
+
await persistCaseDetail(run.runDir, caseDetail);
|
|
2837
|
+
await persistRunState(run);
|
|
2838
|
+
emitDiscoveryEvent();
|
|
2839
|
+
return {
|
|
2840
|
+
updated: true,
|
|
2841
|
+
run: {
|
|
2842
|
+
manifest: run.manifest,
|
|
2843
|
+
summary: run.summary,
|
|
2844
|
+
cases: run.cases
|
|
2845
|
+
},
|
|
2846
|
+
caseDetail
|
|
2847
|
+
};
|
|
2848
|
+
},
|
|
2849
|
+
async deleteRun(runId) {
|
|
2850
|
+
const run = runs.get(runId);
|
|
2851
|
+
if (!run) return { deleted: false };
|
|
2852
|
+
if (run.manifest.status === "running") return { deleted: false };
|
|
2853
|
+
runs.delete(runId);
|
|
2854
|
+
await rm(run.runDir, {
|
|
2855
|
+
recursive: true,
|
|
2856
|
+
force: true
|
|
2857
|
+
});
|
|
2858
|
+
emitDiscoveryEvent();
|
|
2859
|
+
return { deleted: true };
|
|
2860
|
+
},
|
|
2861
|
+
getEvals() {
|
|
2862
|
+
const gitState = readGitWorktreeState(workspaceRoot);
|
|
2863
|
+
const result = [];
|
|
2864
|
+
for (const meta of getSortedEvalMetas()) result.push(buildEvalSummary({
|
|
2865
|
+
meta,
|
|
2866
|
+
config,
|
|
2867
|
+
gitState,
|
|
2868
|
+
latestRun: latestRunInfoMap.get(meta.id),
|
|
2869
|
+
lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
|
|
2870
|
+
}));
|
|
2871
|
+
return result;
|
|
2872
|
+
},
|
|
2873
|
+
getEval(id) {
|
|
2874
|
+
const meta = evals.get(id);
|
|
2875
|
+
if (!meta) return void 0;
|
|
2876
|
+
return buildEvalSummary({
|
|
2877
|
+
meta,
|
|
2878
|
+
config,
|
|
2879
|
+
gitState: readGitWorktreeState(workspaceRoot),
|
|
2880
|
+
latestRun: latestRunInfoMap.get(meta.id),
|
|
2881
|
+
lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
|
|
2882
|
+
});
|
|
2883
|
+
},
|
|
2884
|
+
async refreshDiscovery() {
|
|
2885
|
+
const patterns = config.include;
|
|
2886
|
+
const discovered = [];
|
|
2887
|
+
for (const pattern of patterns) {
|
|
2888
|
+
const files = await glob(pattern, {
|
|
2889
|
+
cwd: workspaceRoot,
|
|
2890
|
+
absolute: true
|
|
2891
|
+
});
|
|
2892
|
+
discovered.push(...files);
|
|
2893
|
+
}
|
|
2894
|
+
evals.clear();
|
|
2895
|
+
for (const filePath of discovered) try {
|
|
2896
|
+
const content = await readFile(filePath, "utf-8");
|
|
2897
|
+
const discoveredMetas = parseEvalMetas(filePath, content);
|
|
2898
|
+
const sourceFingerprint = getSourceFingerprint(content);
|
|
2899
|
+
const registry = getEvalRegistry();
|
|
2900
|
+
try {
|
|
2901
|
+
await loadEvalModule(filePath, sourceFingerprint);
|
|
2902
|
+
} catch {}
|
|
2903
|
+
for (const meta of discoveredMetas) {
|
|
2904
|
+
const discoveredEntry = registry.get(meta.id);
|
|
2905
|
+
const title = meta.title;
|
|
2906
|
+
let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
|
|
2907
|
+
let stats;
|
|
2908
|
+
let charts;
|
|
2909
|
+
discoveredEntry?.use((evalDef) => {
|
|
2910
|
+
columnDefs = buildDeclaredColumnDefs(evalDef.columns, evalDef.scores, evalDef.manualScores);
|
|
2911
|
+
stats = evalDef.stats;
|
|
2912
|
+
const validated = validateCharts({
|
|
2913
|
+
charts: evalDef.charts,
|
|
2914
|
+
columnDefs,
|
|
2915
|
+
evalId: meta.id
|
|
2916
|
+
});
|
|
2917
|
+
for (const warning of validated.warnings) console.warn(warning);
|
|
2918
|
+
charts = validated.charts;
|
|
2919
|
+
});
|
|
2920
|
+
evals.set(meta.id, {
|
|
2921
|
+
id: meta.id,
|
|
2922
|
+
title,
|
|
2923
|
+
filePath: toWorkspaceRelativePath(meta.filePath),
|
|
2924
|
+
sourceFilePath: meta.filePath,
|
|
2925
|
+
sourceFingerprint,
|
|
2926
|
+
columnDefs,
|
|
2927
|
+
caseCount: null,
|
|
2928
|
+
stats,
|
|
2929
|
+
charts
|
|
2930
|
+
});
|
|
2931
|
+
}
|
|
2932
|
+
} catch {}
|
|
2933
|
+
emitDiscoveryEvent();
|
|
2934
|
+
},
|
|
2935
|
+
async startRun(request) {
|
|
2936
|
+
const runId = generateRunId();
|
|
2937
|
+
const shortId = `r${String(nextShortIdNum++)}`;
|
|
2938
|
+
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
2939
|
+
const cacheMode = request.cache?.mode ?? "use";
|
|
2940
|
+
const runDir = join(localStateDir, "runs", runId);
|
|
2941
|
+
const manifest = {
|
|
2942
|
+
id: runId,
|
|
2943
|
+
shortId,
|
|
2944
|
+
status: "running",
|
|
2945
|
+
startedAt: now,
|
|
2946
|
+
endedAt: null,
|
|
2947
|
+
commitSha: readGitWorktreeState(workspaceRoot).commitSha,
|
|
2948
|
+
evalSourceFingerprints: {},
|
|
2949
|
+
target: request.target,
|
|
2950
|
+
trials: request.trials,
|
|
2951
|
+
trialSelection: config.trialSelection ?? "lowestScore",
|
|
2952
|
+
cacheMode
|
|
2953
|
+
};
|
|
2954
|
+
const summary = {
|
|
2955
|
+
runId,
|
|
2956
|
+
status: "running",
|
|
2957
|
+
totalCases: 0,
|
|
2958
|
+
passedCases: 0,
|
|
2959
|
+
failedCases: 0,
|
|
2960
|
+
errorCases: 0,
|
|
2961
|
+
cancelledCases: 0,
|
|
2962
|
+
totalDurationMs: null,
|
|
2963
|
+
errorMessage: null
|
|
2964
|
+
};
|
|
2965
|
+
const abortController = new AbortController();
|
|
2966
|
+
const runState = {
|
|
2967
|
+
runDir,
|
|
2968
|
+
manifest,
|
|
2969
|
+
summary,
|
|
2970
|
+
cases: [],
|
|
2971
|
+
caseDetails: /* @__PURE__ */ new Map(),
|
|
2972
|
+
listeners: /* @__PURE__ */ new Set(),
|
|
2973
|
+
abortController
|
|
2974
|
+
};
|
|
2975
|
+
runs.set(runId, runState);
|
|
2976
|
+
setLatestRunInfoMap({
|
|
2977
|
+
latestRunInfoMap,
|
|
2978
|
+
evalIds: getTargetEvalIds({
|
|
2979
|
+
request,
|
|
2980
|
+
sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
|
|
2981
|
+
knownEvalIds: new Set(evals.keys())
|
|
2982
|
+
}),
|
|
2983
|
+
info: {
|
|
2984
|
+
status: "running",
|
|
2985
|
+
startedAt: now,
|
|
2986
|
+
commitSha: manifest.commitSha ?? null,
|
|
2987
|
+
evalSourceFingerprint: null
|
|
2988
|
+
}
|
|
2989
|
+
});
|
|
2990
|
+
await mkdir(runDir, { recursive: true });
|
|
2991
|
+
await mkdir(join(runDir, "traces"), { recursive: true });
|
|
2992
|
+
await mkdir(join(runDir, "artifacts"), { recursive: true });
|
|
2993
|
+
await mkdir(join(runDir, "case-details"), { recursive: true });
|
|
2994
|
+
await writeFile(join(runDir, "run.json"), JSON.stringify(manifest, null, 2));
|
|
2995
|
+
executeRun({
|
|
2996
|
+
runState,
|
|
2997
|
+
request,
|
|
2998
|
+
runDir,
|
|
2999
|
+
config,
|
|
3000
|
+
evals,
|
|
3001
|
+
cacheStore,
|
|
3002
|
+
lastRunStatusMap,
|
|
3003
|
+
latestRunInfoMap,
|
|
3004
|
+
emitEvent,
|
|
3005
|
+
emitDiscoveryEvent,
|
|
3006
|
+
getSourceFingerprint,
|
|
3007
|
+
getConfiguredConcurrency,
|
|
3008
|
+
getSortedEvalMetas,
|
|
3009
|
+
getTargetEvals
|
|
3010
|
+
});
|
|
3011
|
+
return {
|
|
3012
|
+
manifest,
|
|
3013
|
+
summary,
|
|
3014
|
+
cases: []
|
|
3015
|
+
};
|
|
3016
|
+
},
|
|
3017
|
+
getRuns() {
|
|
3018
|
+
return [...runs.values()].map((r) => r.manifest);
|
|
3019
|
+
},
|
|
3020
|
+
getRun(id) {
|
|
3021
|
+
const run = runs.get(id);
|
|
3022
|
+
if (!run) return void 0;
|
|
3023
|
+
return {
|
|
3024
|
+
manifest: run.manifest,
|
|
3025
|
+
summary: run.summary,
|
|
3026
|
+
cases: run.cases
|
|
3027
|
+
};
|
|
3028
|
+
},
|
|
3029
|
+
cancelRun(id) {
|
|
3030
|
+
const run = runs.get(id);
|
|
3031
|
+
if (!run) return;
|
|
3032
|
+
run.abortController.abort();
|
|
3033
|
+
run.manifest.status = "cancelled";
|
|
3034
|
+
run.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
3035
|
+
run.summary.status = "cancelled";
|
|
3036
|
+
emitEvent(run, {
|
|
3037
|
+
type: "run.cancelled",
|
|
3038
|
+
runId: id,
|
|
3039
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3040
|
+
payload: run.summary
|
|
3041
|
+
});
|
|
3042
|
+
},
|
|
3043
|
+
getCaseDetail(runId, caseId) {
|
|
3044
|
+
const run = runs.get(runId);
|
|
3045
|
+
if (!run) return void 0;
|
|
3046
|
+
return run.caseDetails.get(caseId);
|
|
3047
|
+
},
|
|
3048
|
+
subscribe(runId, listener) {
|
|
3049
|
+
const run = runs.get(runId);
|
|
3050
|
+
if (!run) return () => {};
|
|
3051
|
+
run.listeners.add(listener);
|
|
3052
|
+
return () => {
|
|
3053
|
+
run.listeners.delete(listener);
|
|
3054
|
+
};
|
|
3055
|
+
},
|
|
3056
|
+
subscribeDiscovery(listener) {
|
|
3057
|
+
discoveryListeners.add(listener);
|
|
3058
|
+
return () => {
|
|
3059
|
+
discoveryListeners.delete(listener);
|
|
3060
|
+
};
|
|
3061
|
+
},
|
|
3062
|
+
getWorkspaceRoot() {
|
|
3063
|
+
return workspaceRoot;
|
|
3064
|
+
},
|
|
3065
|
+
getArtifactPath(artifactId_) {
|
|
3066
|
+
return resolveArtifactPath(join(localStateDir, "runs"), artifactId_);
|
|
3067
|
+
}
|
|
3068
|
+
};
|
|
3069
|
+
/**
 * Watch the configured include patterns and refresh discovery whenever a
 * matching file is changed, added or removed.
 *
 * Refresh failures and watcher errors are reported with console.warn instead
 * of surfacing as unhandled promise rejections or unhandled `error` events.
 */
function setupWatcher() {
  const watcher = watch(config.include.map((p) => resolve(workspaceRoot, p)), {
    ignoreInitial: true,
    persistent: true
  });
  const describe = (error) => error instanceof Error ? error.message : String(error);
  // refreshDiscovery() is async; attach a rejection handler so the promise is
  // never left floating.
  const refresh = () => {
    runner.refreshDiscovery().catch((error) => {
      console.warn(`Eval discovery refresh failed: ${describe(error)}`);
    });
  };
  for (const eventName of ["change", "add", "unlink"]) watcher.on(eventName, refresh);
  // An EventEmitter throws when "error" is emitted without a listener.
  watcher.on("error", (error) => {
    console.warn(`Eval file watcher error: ${describe(error)}`);
  });
}
|
|
3084
|
+
/**
 * Recompute the per-eval last-run status and latest-run info maps from the
 * current runs and evals, then send a `discovery.updated` event carrying the
 * current eval summaries to every discovery listener.
 *
 * Each listener is called in isolation, matching `emitEvent`: a listener that
 * throws neither stops delivery to the remaining listeners nor propagates
 * into the caller.
 */
function emitDiscoveryEvent() {
  const lastRunStatuses = getLastRunStatuses({
    runs: runs.values(),
    knownEvals: evals.values()
  });
  const latestRunInfos = getLatestRunInfos({
    runs: runs.values(),
    knownEvals: evals.values()
  });
  // Both maps are shared by reference, so they are refilled in place.
  lastRunStatusMap.clear();
  for (const [evalId, status] of lastRunStatuses) lastRunStatusMap.set(evalId, status);
  latestRunInfoMap.clear();
  for (const [evalId, info] of latestRunInfos) latestRunInfoMap.set(evalId, info);
  const event = {
    type: "discovery.updated",
    timestamp: new Date().toISOString(),
    payload: runner.getEvals()
  };
  for (const listener of discoveryListeners) {
    try {
      listener(event);
    } catch {
      // Same policy as emitEvent: a failing listener is ignored.
    }
  }
}
|
|
3104
|
+
/**
 * Resolve the evals a run request targets.
 *
 * @param request Run request; only `target.evalIds` is read.
 * @returns The known evals for the requested ids (unknown ids are dropped),
 *   or all evals from getSortedEvalMetas() when no ids were requested.
 */
function getTargetEvals(request) {
  const requestedIds = request.target.evalIds;
  if (!requestedIds || !(requestedIds.length > 0)) return getSortedEvalMetas();
  const matched = [];
  for (const id of requestedIds) {
    const meta = evals.get(id);
    if (meta !== void 0) matched.push(meta);
  }
  return matched;
}
|
|
3108
|
+
/**
 * Deliver an event to every listener of a run.
 *
 * @param runState Run state whose `listeners` collection is notified.
 * @param event Event object passed to each listener.
 */
function emitEvent(runState, event) {
  runState.listeners.forEach((notify) => {
    try {
      notify(event);
    } catch {}
  });
}
|
|
3113
|
+
/**
 * Replace the in-memory run map with the run snapshots loaded from the local
 * state directory and reset the short-id counter from those snapshots.
 * Each restored run gets an empty listener set and a new AbortController.
 */
async function loadPersistedRuns() {
  runs.clear();
  const snapshots = await loadPersistedRunSnapshots(localStateDir);
  nextShortIdNum = nextShortIdFromSnapshots(snapshots);
  for (const snapshot of snapshots) {
    const restored = {
      ...snapshot,
      listeners: new Set(),
      abortController: new AbortController()
    };
    runs.set(snapshot.manifest.id, restored);
  }
}
|
|
3123
|
+
return runner;
|
|
3124
|
+
}
|
|
3125
|
+
//#endregion
|
|
3126
|
+
//#region src/cli.ts
|
|
3127
|
+
/**
 * Parse raw CLI arguments into an options object.
 *
 * The first argument selects the command (`app`, `list`, `run`, `cache`,
 * `help`); anything else falls back to `help`. For `cache`, an optional
 * `list`/`clear` subcommand follows. Unknown flags are ignored.
 *
 * Numeric flags are validated: `--trials` must be an integer >= 1 and
 * `--port` an integer in 0..65535. An invalid value is reported with
 * console.warn and the previous value (initially the default) is kept, so
 * NaN, negative or fractional numbers never reach the runner or the server.
 * Empty ids produced by stray commas in `--eval` / `--case` are dropped.
 *
 * @param argv Raw command-line arguments excluding the executable name.
 * @returns Parsed arguments with defaults applied.
 */
function parseArgs(argv) {
  const args = {
    command: "help",
    subcommand: void 0,
    evalIds: [],
    caseIds: [],
    trials: 1,
    json: false,
    port: 4100,
    cacheMode: "use",
    clearCache: false,
    all: false
  };
  // Split a comma-separated id list, ignoring blanks such as in "a,,b".
  const splitIds = (raw) => raw.split(",").map((id) => id.trim()).filter((id) => id !== "");
  // Accept only integers within [min, max]; otherwise warn and keep `fallback`.
  const readInteger = (flag, raw, min, max, fallback) => {
    const parsed = Number(raw);
    if (Number.isInteger(parsed) && parsed >= min && parsed <= max) return parsed;
    console.warn(`Ignoring invalid value for ${flag}: ${raw}`);
    return fallback;
  };
  const command = argv[0];
  if (command === "app" || command === "list" || command === "run" || command === "cache" || command === "help") args.command = command;
  let cursor = 1;
  if (args.command === "cache") {
    const sub = argv[cursor];
    if (sub === "list" || sub === "clear") {
      args.subcommand = sub;
      cursor++;
    }
  }
  for (let i = cursor; i < argv.length; i++) {
    const arg = argv[i];
    const next = argv[i + 1];
    if (arg === "--eval" && next) {
      args.evalIds.push(...splitIds(next));
      i++;
    } else if (arg === "--case" && next) {
      args.caseIds.push(...splitIds(next));
      i++;
    } else if (arg === "--trials" && next) {
      args.trials = readInteger("--trials", next, 1, Number.MAX_SAFE_INTEGER, args.trials);
      i++;
    } else if (arg === "--json") args.json = true;
    else if (arg === "--port" && next) {
      args.port = readInteger("--port", next, 0, 65535, args.port);
      i++;
    } else if (arg === "--cache" && next) {
      // An unrecognized mode is skipped, but its value is still consumed.
      if (next === "use" || next === "bypass" || next === "refresh") args.cacheMode = next;
      i++;
    } else if (arg === "--no-cache") args.cacheMode = "bypass";
    else if (arg === "--refresh-cache") args.cacheMode = "refresh";
    else if (arg === "--clear-cache") args.clearCache = true;
    else if (arg === "--all") args.all = true;
  }
  return args;
}
|
|
3176
|
+
/**
 * Entry point of the Agent Evals CLI: parse the arguments and dispatch to the
 * matching command handler. Commands without a handler print the help text.
 *
 * @param argv Raw command-line arguments excluding the executable name.
 */
async function runCli(argv) {
  const args = parseArgs(argv);
  const handlers = new Map([
    ["app", commandApp],
    ["list", commandList],
    ["run", commandRun],
    ["cache", commandCache]
  ]);
  const handler = handlers.get(args.command);
  if (handler === void 0) {
    printHelp();
    return;
  }
  await handler(args);
}
|
|
3201
|
+
// Directory that contains this module file.
const currentDir = dirname(fileURLToPath(import.meta.url));
// Three directory levels above this module.
const repoRoot = resolve(currentDir, "..", "..", "..");
// pnpm executable name; Windows needs the .cmd suffix.
const pnpmCommand = process.platform !== "win32" ? "pnpm" : "pnpm.cmd";
|
|
3204
|
+
/** Report whether `apps/web/package.json` exists under `repoRoot`. */
function hasRepoWebWorkspace() {
  const webManifestPath = resolve(repoRoot, "apps/web/package.json");
  return existsSync(webManifestPath);
}
|
|
3207
|
+
/**
 * Build the web UI with pnpm when a repo web workspace is present; otherwise
 * do nothing.
 *
 * The returned promise rejects when the build process cannot be spawned, is
 * stopped by a signal, or exits with a non-zero code.
 */
async function ensureWebUiIsBuilt() {
  if (!hasRepoWebWorkspace()) return;
  console.info("Preparing web UI...");
  const buildArgs = ["--filter", "@agent-evals/web", "build"];
  await new Promise((resolvePromise, rejectPromise) => {
    const child = spawn(pnpmCommand, buildArgs, {
      cwd: repoRoot,
      stdio: "inherit"
    });
    child.once("error", (error) => rejectPromise(error));
    child.once("exit", (code, signal) => {
      let failure = null;
      if (signal) failure = new Error(`Web UI build stopped with signal ${signal}.`);
      else if (code !== 0) failure = new Error(`Web UI build failed with exit code ${String(code)}.`);
      if (failure) rejectPromise(failure);
      else resolvePromise();
    });
  });
}
|
|
3235
|
+
/**
 * Check that a module object has an `app` property that is a non-null object
 * with a `fetch` function.
 *
 * @param mod Value to inspect.
 * @returns `true` when the shape matches, `false` otherwise.
 */
function isHonoAppModule(mod) {
  const isObject = typeof mod === "object" && mod !== null;
  if (!isObject || !("app" in mod)) return false;
  const candidate = mod.app;
  if (typeof candidate !== "object" || candidate === null) return false;
  return "fetch" in candidate && typeof candidate.fetch === "function";
}
|
|
3240
|
+
/**
 * Check that a module object has an `initRunner` function.
 *
 * @param mod Value to inspect.
 * @returns `true` when `mod` is a non-null object whose `initRunner` is a function.
 */
function isServerRunnerModule(mod) {
  return typeof mod === "object"
    && mod !== null
    && "initRunner" in mod
    && typeof mod.initRunner === "function";
}
|
|
3244
|
+
/**
 * `app` command: make sure the web UI is built, load the server app and
 * runner modules, initialize the runner and start the HTTP server.
 *
 * @param args Parsed CLI arguments; only `port` is read.
 * @throws Error when the app or runner module does not have the expected shape.
 */
async function commandApp(args) {
  await ensureWebUiIsBuilt();
  const { serve: startServer } = await import("@hono/node-server");
  // Point the server at the bundled web assets when they exist next to this module.
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
  if (existsSync(bundledWebDist)) {
    process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
  }
  const appModule = await import("./app-CKa9TjXw.mjs");
  const runnerModule = await import("./runner-Ck4X0H3p.mjs");
  if (!isHonoAppModule(appModule)) {
    throw new Error("Server app module is invalid");
  }
  if (!isServerRunnerModule(runnerModule)) {
    throw new Error("Server runner module is invalid");
  }
  await runnerModule.initRunner();
  const { port } = args;
  console.info(`Agent Evals app: http://localhost:${String(port)}`);
  startServer({
    fetch: appModule.app.fetch,
    port
  });
}
|
|
3260
|
+
/**
 * `list` command: print every discovered eval with its id, file and, when
 * available, its status and case count.
 *
 * @param args_ Parsed CLI arguments (not used by this command).
 */
async function commandList(args_) {
  const runner = createRunner({ watchForChanges: false });
  await runner.init();
  const discovered = runner.getEvals();
  if (discovered.length === 0) {
    console.info("No eval files found.");
    return;
  }
  console.info("Discovered evals:\n");
  for (const ev of discovered) {
    const { freshnessStatus, stale, outdated, lastRunStatus } = ev;
    const displayStatus = getEvalDisplayStatus({
      freshnessStatus,
      stale,
      outdated,
      lastRunStatus
    });
    const title = getEvalTitle(ev);
    console.info(` ${title}`);
    console.info(` id: ${ev.id}`);
    console.info(` file: ${ev.filePath}`);
    // "pending" is the uninteresting status and is not printed.
    if (displayStatus !== "pending") {
      console.info(` status: ${displayStatus}`);
    }
    if (ev.caseCount !== null) {
      console.info(` cases: ${String(ev.caseCount)}`);
    }
    console.info("");
  }
}
|
|
3285
|
+
/**
 * `run` command: start a run for the selected target, wait until it reaches
 * a final status, print its summary and set the process exit code.
 *
 * Target selection: case ids (optionally scoped by eval ids) take precedence,
 * then eval ids, otherwise all evals are run.
 *
 * The process exits with code 1 when the run itself ended with status
 * "error", or when any case failed or errored. A run-level error message, if
 * present, is printed as part of the text summary.
 *
 * @param args Parsed CLI arguments.
 */
async function commandRun(args) {
  const runner = createRunner({ watchForChanges: false });
  await runner.init();
  if (args.clearCache) {
    await runner.clearCache();
    if (!args.json) {
      console.info("Cleared cache before run.");
      console.info("");
    }
  }
  let target;
  if (args.caseIds.length > 0) {
    target = {
      mode: "caseIds",
      caseIds: args.caseIds,
      evalIds: args.evalIds.length > 0 ? args.evalIds : void 0
    };
  } else if (args.evalIds.length > 0) {
    target = {
      mode: "evalIds",
      evalIds: args.evalIds
    };
  } else {
    target = { mode: "all" };
  }
  const run = await runner.startRun({
    target,
    trials: args.trials,
    cache: { mode: args.cacheMode }
  });
  if (!args.json) {
    console.info(`Run started: ${run.manifest.id}`);
    console.info(`Trials: ${String(args.trials)}`);
    if (args.cacheMode !== "use") console.info(`Cache mode: ${args.cacheMode}`);
    console.info("");
  }
  await waitForRunCompletion(runner, run.manifest.id);
  const finalRun = runner.getRun(run.manifest.id);
  if (!finalRun) {
    console.error(`Run ${run.manifest.id} is no longer available.`);
    process.exit(1);
    return;
  }
  const { summary } = finalRun;
  if (args.json) console.info(JSON.stringify(summary, null, 2));
  else {
    console.info("--- Run Summary ---");
    console.info(`Status: ${summary.status}`);
    console.info(`Total: ${String(summary.totalCases)}`);
    console.info(`Passed: ${String(summary.passedCases)}`);
    console.info(`Failed: ${String(summary.failedCases)}`);
    console.info(`Errors: ${String(summary.errorCases)}`);
    if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
    if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
    if (typeof summary.errorMessage === "string" && summary.errorMessage !== "") console.info(`Error: ${summary.errorMessage}`);
  }
  // A run can end in "error" without any case being counted as failed or
  // errored; that must not be reported as success to the calling shell.
  const runErrored = finalRun.manifest.status === "error" || summary.status === "error";
  if (runErrored || summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
}
|
|
3334
|
+
/**
 * `cache` command.
 *
 * - `cache` / `cache list`: print the cache entries (as JSON with `--json`).
 * - `cache clear --eval <id>`: clear entries whose namespace starts with
 *   `<id>__` for each given eval id.
 * - `cache clear --all`: clear every entry.
 * - `cache clear` without either flag refuses and exits with code 1.
 *
 * Any other subcommand prints the help text.
 *
 * @param args Parsed CLI arguments.
 */
async function commandCache(args) {
  const runner = createRunner({ watchForChanges: false });
  await runner.init();
  if (args.subcommand === "list" || args.subcommand === void 0) {
    const entries = await runner.listCache();
    if (args.json) {
      console.info(JSON.stringify(entries, null, 2));
      return;
    }
    if (entries.length === 0) {
      console.info("No cache entries.");
      return;
    }
    console.info(`Cache entries (${String(entries.length)}):\n`);
    for (const entry of entries) {
      console.info(` ${entry.namespace}`);
      console.info(` key: ${entry.key}`);
      console.info(` span: ${entry.spanName} (${entry.spanKind})`);
      console.info(` stored: ${entry.storedAt}`);
      console.info(` size: ${String(entry.sizeBytes)} bytes`);
      console.info("");
    }
    return;
  }
  if (args.subcommand === "clear") {
    if (args.evalIds.length > 0) {
      // List the cache once and clear every matching entry exactly once,
      // instead of re-listing per eval id (repeated ids would otherwise try
      // to clear the same entries again).
      const prefixes = args.evalIds.map((evalId) => `${evalId}__`);
      const entries = await runner.listCache();
      const matching = entries.filter((entry) => prefixes.some((prefix) => entry.namespace.startsWith(prefix)));
      for (const entry of matching) await runner.clearCache({
        namespace: entry.namespace,
        key: entry.key
      });
      console.info(`Cleared cache entries for: ${args.evalIds.join(", ")}`);
      return;
    }
    if (args.all) {
      await runner.clearCache();
      console.info("Cleared all cache entries.");
      return;
    }
    console.info("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
    process.exit(1);
    return;
  }
  printHelp();
}
|
|
3383
|
+
/**
 * Resolve once the given run is gone or its manifest status is "completed",
 * "cancelled" or "error". The run is checked immediately and then again
 * every 200 ms.
 *
 * @param runner Object exposing `getRun(runId)`.
 * @param runId Id of the run to wait for.
 */
async function waitForRunCompletion(runner, runId) {
  const finalStatuses = ["completed", "cancelled", "error"];
  const isFinished = () => {
    const current = runner.getRun(runId);
    return !current || finalStatuses.includes(current.manifest.status);
  };
  return new Promise((done) => {
    (function poll() {
      if (isFinished()) {
        done();
        return;
      }
      setTimeout(poll, 200);
    })();
  });
}
|
|
3396
|
+
/**
 * Print the CLI usage text (available commands and options) with
 * console.info. Takes no arguments and returns nothing.
 */
function printHelp() {
|
|
3397
|
+
console.info(`
|
|
3398
|
+
agent-evals - LLM/Agent eval runner
|
|
3399
|
+
|
|
3400
|
+
Commands:
|
|
3401
|
+
app Start server with UI
|
|
3402
|
+
list List discovered evals
|
|
3403
|
+
run Run evals
|
|
3404
|
+
cache list List cached operation entries
|
|
3405
|
+
cache clear --eval <id> Clear cache entries for one eval
|
|
3406
|
+
cache clear --all Clear every cached entry
|
|
3407
|
+
help Show this help
|
|
3408
|
+
|
|
3409
|
+
Options:
|
|
3410
|
+
--eval <id> Run specific eval(s) (comma-separated)
|
|
3411
|
+
--case <id> Run specific case(s) (comma-separated)
|
|
3412
|
+
--trials <n> Number of trials per case
|
|
3413
|
+
--json Output results as JSON
|
|
3414
|
+
--port <n> Server port (default: 4100)
|
|
3415
|
+
--cache <use|bypass|refresh> Cache mode for this run (default: use)
|
|
3416
|
+
--no-cache Shortcut for --cache bypass
|
|
3417
|
+
--refresh-cache Shortcut for --cache refresh
|
|
3418
|
+
--clear-cache Clear the cache before starting the run
|
|
3419
|
+
`);
|
|
3420
|
+
}
|
|
3421
|
+
//#endregion
|
|
3422
|
+
export { jsonCellSchema as $, scoreTraceSchema as A, traceAttributeDisplayFormatSchema as B, caseDetailSchema as C, evalStatItemSchema as D, evalStatAggregateSchema as E, evalChartConfigSchema as F, traceDisplayInputConfigSchema as G, traceAttributeDisplayPlacementSchema as H, evalChartMetricSchema as I, cellValueSchema as J, traceSpanKindSchema as K, evalChartTooltipExtraSchema as L, evalChartAxisSchema as M, evalChartBuiltinMetricSchema as N, evalStatsConfigSchema as O, evalChartColorSchema as P, fileRefSchema as Q, evalChartTypeSchema as R, assertionFailureSchema as S, evalFreshnessStatusSchema as T, traceAttributeDisplaySchema as U, traceAttributeDisplayInputSchema as V, traceDisplayConfigSchema as W, columnFormatSchema as X, columnDefSchema as Y, columnKindSchema as Z, cacheModeSchema as _, getEvalRegistry as _t, sseEnvelopeSchema as a, evalTracer as at, serializedCacheSpanSchema as b, deriveScopedSummaryFromCases as c, evalAssert as ct, runManifestSchema as d, isInEvalScope as dt, numberDisplayOptionsSchema as et, runSummarySchema as f, runInEvalScope as ft, cacheListItemSchema as g, defineEval as gt, cacheEntrySchema as h, repoFile as ht, updateManualScoreRequestSchema as i, evalSpan as it, evalChartAggregateSchema as j, evalSummarySchema as k, deriveStatusFromCaseRows as l, getCurrentScope as lt, trialSelectionModeSchema as m, setScopeCacheContext as mt, createRunner as n, runArtifactRefSchema as nt, getEvalTitle as o, hashCacheKey as ot, agentEvalsConfigSchema as p, setEvalOutput as pt, traceSpanSchema as q, createRunRequestSchema as r, buildTraceTree as rt, getEvalDisplayStatus as s, EvalAssertionError as st, runCli as t, repoFileRefSchema as tt, deriveStatusFromChildStatuses as u, incrementEvalOutput as ut, cacheRecordingOpSchema as v, caseRowSchema as w, spanCacheOptionsSchema as x, cacheRecordingSchema as y, evalChartsConfigSchema as z };
|