@ls-stack/agent-eval 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3422 @@
1
+ import { createHash } from "node:crypto";
2
+ import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
3
+ import { dirname, extname, join, relative, resolve } from "node:path";
4
+ import { AsyncLocalStorage } from "node:async_hooks";
5
+ import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
6
+ import { z } from "zod/v4";
7
+ import { watch } from "chokidar";
8
+ import { glob } from "glob";
9
+ import { existsSync } from "node:fs";
10
+ import { resultify } from "t-result";
11
+ import { fileURLToPath, pathToFileURL } from "node:url";
12
+ import { spawn, spawnSync } from "node:child_process";
13
+ //#region ../sdk/src/defineEval.ts
14
/** Module-level map from eval id to the entry stored by `defineEval`. */
const evalRegistry = /* @__PURE__ */ new Map();

/**
 * Expose the registry of evals defined so far in this process.
 *
 * @returns The live registry `Map` itself (not a copy).
 */
function getEvalRegistry() {
  return evalRegistry;
}
19
/**
 * Store an eval definition in the in-memory registry, keyed by its `id`, so
 * the runner can find it once the eval module has been imported.
 *
 * A later call with the same `id` replaces the earlier entry.
 *
 * @param definition Eval definition; `id` and `title` are copied onto the
 *   registry entry, and the full definition is reachable through `use`.
 */
function defineEval(definition) {
  const { id, title } = definition;
  const entry = {
    id,
    title,
    // Hands the complete definition to a visitor and returns its result.
    use: (fn) => fn(definition),
  };
  evalRegistry.set(id, entry);
}
30
+ //#endregion
31
+ //#region ../sdk/src/repoFile.ts
32
/**
 * Build a repo-backed file reference. The result can be emitted through
 * `setEvalOutput(...)` and rendered by a column configured with
 * `format: 'image' | 'audio' | 'video' | 'file'`.
 *
 * @param path Relative or absolute path to the repository file.
 * @param mimeType Optional MIME type hint for UI rendering.
 * @returns An object `{ source: "repo", path, mimeType }`.
 */
function repoFile(path, mimeType) {
  const ref = { source: "repo", path, mimeType };
  return ref;
}
47
+ //#endregion
48
+ //#region ../sdk/src/runtime.ts
49
/** Async-context storage holding the eval scope of the case being executed. */
const scopeStorage = new AsyncLocalStorage();

/**
 * Number of `runInEvalScope` calls currently in flight. While it is zero,
 * `getCurrentScope` skips the async-storage lookup entirely.
 */
let activeEvalScopeCount = 0;
51
/**
 * Error type raised by `evalAssert` when an assertion fails while a case is
 * executing. Its `name` is always `"EvalAssertionError"`.
 */
var EvalAssertionError = class EvalAssertionError extends Error {
  /** @param message Human-readable description of the failed assertion. */
  constructor(message) {
    super(message);
    this.name = "EvalAssertionError";
  }
};
58
/**
 * Look up the eval scope bound to the active async context.
 *
 * @returns The scope object, or `undefined` when no eval scope is running
 *   (the counter short-circuits the storage lookup in that case).
 */
function getCurrentScope() {
  return activeEvalScopeCount === 0 ? void 0 : scopeStorage.getStore();
}
63
/**
 * Tell whether the current async execution is inside an active eval case.
 *
 * Useful for shared workflow code that wants to branch on eval-only behavior
 * without importing or inspecting the full eval scope.
 *
 * @returns `true` when `getCurrentScope()` yields a scope, otherwise `false`.
 */
function isInEvalScope() {
  const scope = getCurrentScope();
  return scope !== void 0;
}
72
/**
 * Attach cache context (adapter, mode, eval id, fingerprint) to a scope.
 *
 * Runner-internal helper called immediately before the user's `execute`
 * function runs inside `runInEvalScope`.
 *
 * @param scope Scope object to mutate.
 * @param context Value stored on `scope.cacheContext`, replacing any previous one.
 */
function setScopeCacheContext(scope, context) {
  Object.assign(scope, { cacheContext: context });
}
81
/**
 * Run `fn` inside a brand-new eval case scope and report what happened.
 *
 * The scope collects outputs, assertion failures, spans and checkpoints while
 * `fn` executes. Errors thrown by `fn` are captured rather than propagated.
 *
 * @param caseId Identifier stored on the scope and on spans created within it.
 * @param fn Callback to execute; may be sync or async.
 * @param options Optional settings; `options.cacheContext` seeds the scope's cache context.
 * @returns `{ result, scope, error }` — `error` is `undefined` on success;
 *   on failure `result` is `undefined` and non-`Error` throwables are wrapped
 *   via `new Error(String(thrown))`.
 */
async function runInEvalScope(caseId, fn, options = {}) {
  const scope = {
    caseId,
    outputs: {},
    assertionFailures: [],
    spans: [],
    checkpoints: /* @__PURE__ */ new Map(),
    spanStack: [],
    activeSpanStack: [],
    recordingStack: [],
    replayingDepth: 0,
    cacheContext: options.cacheContext,
  };

  // Runs the user callback and converts a throw into a captured outcome.
  const captureOutcome = async () => {
    let result;
    try {
      result = await fn();
    } catch (thrown) {
      const error = thrown instanceof Error ? thrown : new Error(String(thrown));
      return { result: void 0, scope, error };
    }
    return { result, scope, error: void 0 };
  };

  activeEvalScopeCount++;
  try {
    return await scopeStorage.run(scope, captureOutcome);
  } finally {
    // Always balance the counter so getCurrentScope() can short-circuit again.
    activeEvalScopeCount--;
  }
}
119
/**
 * Append `op` to the innermost recording frame of `scope`, if there is one.
 * Nothing is recorded while the scope is replaying a cached recording.
 *
 * @param scope Eval scope; reads `replayingDepth` and `recordingStack`.
 * @param op Operation descriptor to append to the top frame's `ops`.
 */
function recordOpIfActive(scope, op) {
  const isReplaying = scope.replayingDepth > 0;
  if (isReplaying) return;
  const frames = scope.recordingStack;
  const innermost = frames[frames.length - 1];
  if (!innermost) return;
  innermost.ops.push(op);
}
124
/**
 * Build an assertion-failure record.
 *
 * @param message Failure message.
 * @param error Optional originating error; its `stack` is copied when truthy.
 * @returns `{ message, stack }` when a stack is available, otherwise `{ message }`.
 */
function toAssertionFailure$1(message, error = void 0) {
  const failure = { message };
  const stack = error?.stack;
  if (stack) failure.stack = stack;
  return failure;
}
130
/**
 * Record or replace an output value for the current case scope.
 *
 * Supported values include scalars, JSON-safe objects/arrays, explicit file
 * refs, and native `Blob`/`File` instances for media or file columns.
 * Calls made outside an eval scope are ignored.
 *
 * @param key Output name.
 * @param value Value stored under `key`; also recorded as a `setOutput` op
 *   when a cache recording is active.
 */
function setEvalOutput(key, value) {
  const scope = getCurrentScope();
  if (scope === void 0 || scope === null || !scope) return;
  scope.outputs[key] = value;
  const op = { kind: "setOutput", key, value };
  recordOpIfActive(scope, op);
}
146
/**
 * Add a numeric delta to an output value in the current case scope.
 *
 * A missing output is initialised to `delta`. If the existing value is
 * non-numeric, an assertion failure is recorded and the output is left
 * untouched (no op is recorded either). Calls outside an eval scope are ignored.
 *
 * @param key Output name.
 * @param delta Amount to add.
 */
function incrementEvalOutput(key, delta) {
  const scope = getCurrentScope();
  if (!scope) return;

  const current = scope.outputs[key];
  const isMissing = current === void 0;
  if (!isMissing && typeof current !== "number") {
    const failure = toAssertionFailure$1(`incrementEvalOutput("${key}"): existing value is ${typeof current}, expected number`);
    scope.assertionFailures.push(failure);
    return;
  }

  scope.outputs[key] = isMissing ? delta : current + delta;
  recordOpIfActive(scope, { kind: "incrementOutput", key, delta });
}
176
/**
 * Assert a condition for the current eval case and throw on failure.
 *
 * Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
 * can safely reuse `evalAssert(...)` when it also runs outside an eval.
 *
 * @param condition Truthy values pass.
 * @param message Failure message used for both the recorded failure and the error.
 * @throws {EvalAssertionError} When `condition` is falsy inside an eval scope;
 *   the failure (with stack) is pushed onto `scope.assertionFailures` first.
 */
function evalAssert(condition, message) {
  if (!condition) {
    const scope = getCurrentScope();
    if (scope) {
      const failure = new EvalAssertionError(message);
      scope.assertionFailures.push(toAssertionFailure$1(message, failure));
      throw failure;
    }
  }
}
190
+ //#endregion
191
+ //#region ../sdk/src/tracer.ts
192
/** Monotonic counter that keeps span ids unique within this process. */
let spanIdCounter = 0;

/**
 * Produce a new span id of the form `span_<epoch-ms>_<counter>`.
 *
 * @returns The generated id string.
 */
function generateSpanId() {
  spanIdCounter += 1;
  return ["span", String(Date.now()), String(spanIdCounter)].join("_");
}
197
/**
 * Apply `update` to the innermost active span of the current eval scope.
 * Does nothing when there is no scope or no active span.
 *
 * @param update Callback receiving the active span record.
 */
function updateCurrentSpan(update) {
  const scope = getCurrentScope();
  const innermost = scope?.activeSpanStack.at(-1);
  if (innermost) update(innermost);
}
202
/**
 * Create a span handle whose methods do nothing; `traceSpan` passes it to the
 * body when no eval scope is active.
 *
 * @returns A fresh object with no-op `setName`, `setAttribute` and `setAttributes`.
 */
function noopActiveSpan() {
  const handle = {
    setName() {},
    setAttribute() {},
    setAttributes() {},
  };
  return handle;
}
209
/**
 * Shallow-merge `attributes` into `span.attributes`.
 *
 * A new object is assigned each time, so the previous attributes object is
 * never mutated; keys in `attributes` override existing ones.
 *
 * @param span Span record to update.
 * @param attributes Attributes to merge in (may be `undefined`).
 */
function mergeSpanAttributes(span, attributes) {
  const merged = { ...span.attributes, ...attributes };
  span.attributes = merged;
}
215
/**
 * Create a handle bound to one specific span record.
 *
 * @param span Span record the handle mutates.
 * @returns Object with `setName`, `setAttribute` and `setAttributes`; the
 *   attribute setters merge into the span via `mergeSpanAttributes`.
 */
function createSpanHandle(span) {
  const setName = (value) => {
    span.name = value;
  };
  const setAttributes = (value) => {
    mergeSpanAttributes(span, value);
  };
  const setAttribute = (key, value) => {
    mergeSpanAttributes(span, { [key]: value });
  };
  return { setName, setAttribute, setAttributes };
}
228
/**
 * Ambient handle for the active span in the current async context.
 *
 * Every method resolves the innermost active span at call time; calls are
 * no-ops when executed outside of `evalTracer.span(...)`.
 */
const evalSpan = {
  /** Rename the active span. */
  setName(value) {
    updateCurrentSpan((span) => {
      span.name = value;
    });
  },
  /** Set a single attribute on the active span. */
  setAttribute(key, value) {
    const patch = { [key]: value };
    updateCurrentSpan((span) => {
      mergeSpanAttributes(span, patch);
    });
  },
  /** Merge several attributes into the active span. */
  setAttributes(value) {
    updateCurrentSpan((span) => {
      mergeSpanAttributes(span, value);
    });
  },
};
250
/** Remove the most recent occurrence of `entry` from `stack`; does nothing when it is absent. */
function removeTraceStackEntry(stack, entry) {
  const index = stack.lastIndexOf(entry);
  if (index !== -1) stack.splice(index, 1);
}

/**
 * Run `fn` inside a new trace span and record the span on the current eval scope.
 *
 * Outside an eval scope `fn` is invoked with a no-op span handle and nothing
 * is recorded. Inside a scope a span record is pushed, `fn` runs with a handle
 * bound to that record, and the span ends as `"ok"` or `"error"`.
 *
 * When `info.cache` is set, the scope has a cache context, and no replay is in
 * progress, the span is cache-aware:
 * - mode `"use"`: on a hit the stored recording is replayed and its return
 *   value is returned without calling `fn`; on a miss `fn` runs and the result
 *   is written.
 * - mode `"refresh"`: `fn` always runs and the result is written.
 * - any other mode is treated as bypass: `fn` runs, nothing is read or written.
 *
 * Output, increment and checkpoint ops produced inside this span (live or
 * replayed from a hit) are also forwarded to the recording frame of an
 * enclosing cached span, so a later hit on the outer span reproduces them.
 *
 * @param info Span descriptor; reads `kind`, `name`, `attributes` and the
 *   optional `cache` options (`key`, `namespace`).
 * @param fn Span body; receives a handle for renaming the span and setting attributes.
 * @returns The value returned by `fn`, or the recorded `returnValue` on a cache hit.
 * @throws Re-throws whatever `fn`, the cache adapter, or result serialization
 *   throws, after marking the span `"error"` and storing the error details.
 */
async function traceSpan(info, fn) {
  const scope = getCurrentScope();
  if (!scope) return await fn(noopActiveSpan());

  const id = generateSpanId();
  const spanRecord = {
    id,
    parentId: scope.activeSpanStack.at(-1)?.id ?? null,
    caseId: scope.caseId,
    kind: info.kind,
    name: info.name,
    startedAt: new Date().toISOString(),
    endedAt: null,
    status: "running",
    attributes: info.attributes,
  };
  scope.spans.push(spanRecord);
  scope.spanStack.push(id);
  scope.activeSpanStack.push(spanRecord);
  const activeSpan = createSpanHandle(spanRecord);

  const completeOk = () => {
    spanRecord.status = "ok";
    spanRecord.endedAt = new Date().toISOString();
  };

  try {
    const cacheOpts = info.cache;
    const ctx = scope.cacheContext;
    const cachingActive = cacheOpts !== void 0 && ctx !== void 0 && scope.replayingDepth === 0;
    if (!cachingActive) {
      const result = await fn(activeSpan);
      completeOk();
      return result;
    }

    // Frame of the enclosing cached span (if any), captured before this span
    // pushes its own frame. Ops from this span are forwarded to it below.
    const enclosingFrame = scope.recordingStack.at(-1);

    const namespace = cacheOpts.namespace ?? `${ctx.evalId}__${info.name}`;
    const keyHash = hashCacheKey({
      namespace,
      codeFingerprint: ctx.codeFingerprint,
      key: cacheOpts.key,
    });
    mergeSpanAttributes(spanRecord, {
      "cache.key": keyHash,
      "cache.namespace": namespace,
    });

    if (ctx.mode === "use") {
      const hit = await ctx.adapter.lookup(namespace, keyHash);
      if (hit) {
        const storedAt = hit.storedAt;
        mergeSpanAttributes(spanRecord, {
          "cache.status": "hit",
          "cache.storedAt": storedAt,
          "cache.age": Date.now() - new Date(storedAt).getTime(),
        });
        replayRecording(scope, spanRecord, hit.recording);
        if (enclosingFrame) {
          // Replay bypasses recordOpIfActive, so hand the replayed ops to the
          // outer recording explicitly. Sub-spans are excluded because the
          // outer frame serializes the span tree itself.
          for (const op of hit.recording.ops) {
            if (op.kind !== "subSpan") enclosingFrame.ops.push(op);
          }
        }
        completeOk();
        return hit.recording.returnValue;
      }
      mergeSpanAttributes(spanRecord, { "cache.status": "miss" });
    } else if (ctx.mode === "refresh") {
      mergeSpanAttributes(spanRecord, { "cache.status": "refresh" });
    } else {
      mergeSpanAttributes(spanRecord, { "cache.status": "bypass" });
    }

    const frame = {
      baseSpanIndex: scope.spans.length,
      cachedSpanId: id,
      ops: [],
    };
    scope.recordingStack.push(frame);
    let bodyResult;
    try {
      bodyResult = await fn(activeSpan);
    } finally {
      removeTraceStackEntry(scope.recordingStack, frame);
      if (enclosingFrame) {
        // At this point frame.ops holds only output/increment/checkpoint ops
        // (sub-span ops are appended afterwards), and their effects are
        // already applied to the scope, so the outer recording must keep them
        // even if the body threw.
        for (const op of frame.ops) enclosingFrame.ops.push(op);
      }
    }
    appendSubSpanOps(scope, frame);

    if (ctx.mode !== "bypass") {
      const recording = {
        returnValue: toJsonSafe(bodyResult),
        finalAttributes: stripCacheAttributes(spanRecord.attributes),
        ops: frame.ops,
      };
      const entry = {
        version: 1,
        key: keyHash,
        namespace,
        spanName: info.name,
        spanKind: info.kind,
        storedAt: new Date().toISOString(),
        codeFingerprint: ctx.codeFingerprint,
        recording,
      };
      await ctx.adapter.write(entry);
    }
    completeOk();
    return bodyResult;
  } catch (error) {
    spanRecord.status = "error";
    spanRecord.endedAt = new Date().toISOString();
    if (error instanceof Error) {
      spanRecord.error = {
        name: error.name,
        message: error.message,
        stack: error.stack,
      };
    } else {
      spanRecord.error = { message: String(error) };
    }
    throw error;
  } finally {
    // Remove this span's own entries rather than whatever is on top, so an
    // overlapping span that is still running keeps its place on the stacks.
    removeTraceStackEntry(scope.spanStack, id);
    removeTraceStackEntry(scope.activeSpanStack, spanRecord);
  }
}
355
/**
 * Trace builder used to create hierarchical spans and checkpoints during eval
 * execution.
 */
const evalTracer = {
  /** Run a callback inside a new trace span and record its lifecycle. */
  span: traceSpan,
  /**
   * Record a named point-in-time value alongside the trace.
   *
   * Stores `data` in the scope's checkpoint map, adds a zero-duration
   * `"checkpoint"` span under the innermost open span, and records the
   * checkpoint in the active cache recording (if any). Calls outside an eval
   * scope are ignored.
   *
   * @param name Checkpoint name; a later checkpoint with the same name
   *   replaces the stored value.
   * @param data Value to store.
   */
  checkpoint(name, data) {
    const scope = getCurrentScope();
    if (!scope) return;
    scope.checkpoints.set(name, data);
    const id = generateSpanId();
    const parentId = scope.spanStack.at(-1) ?? null;
    // One clock read for both ends: a checkpoint is instantaneous, and two
    // separate reads could straddle a millisecond boundary.
    const timestamp = new Date().toISOString();
    scope.spans.push({
      id,
      parentId,
      caseId: scope.caseId,
      kind: "checkpoint",
      name,
      startedAt: timestamp,
      endedAt: timestamp,
      status: "ok",
      attributes: { value: data },
    });
    recordOpIfActive(scope, { kind: "checkpoint", name, data });
  },
};
390
/**
 * Build a queryable trace tree helper from a flat span list and checkpoints.
 *
 * @param spans Flat list of span records linked by `id` / `parentId`
 *   (`parentId === null` marks a root).
 * @param checkpoints Checkpoint collection, exposed unchanged on the result.
 * @returns Helper exposing `spans`, `rootSpans`, `findSpan(name)`,
 *   `findSpansByKind(kind)`, `flattenDfs()` and `checkpoints`.
 */
function buildTraceTree(spans, checkpoints) {
  return {
    spans,
    rootSpans: spans.filter((s) => s.parentId === null),
    /** First span with the given name, or `undefined`. */
    findSpan(name) {
      return spans.find((s) => s.name === name);
    },
    /** All spans of the given kind, in list order. */
    findSpansByKind(kind) {
      return spans.filter((s) => s.kind === kind);
    },
    /**
     * Spans in depth-first pre-order starting from the roots; siblings keep
     * their relative order from `spans`. Spans not reachable from a root are
     * omitted.
     */
    flattenDfs() {
      // Index children once instead of rescanning `spans` for every node.
      // Built per call so spans appended after construction are still seen.
      const childrenByParent = new Map();
      for (const span of spans) {
        const siblings = childrenByParent.get(span.parentId);
        if (siblings) siblings.push(span);
        else childrenByParent.set(span.parentId, [span]);
      }
      const result = [];
      function visit(parentId) {
        const children = childrenByParent.get(parentId);
        if (!children) return;
        for (const child of children) {
          result.push(child);
          visit(child.id);
        }
      }
      visit(null);
      return result;
    },
    checkpoints,
  };
}
415
/**
 * Hash the components of a cache key into a deterministic hex digest.
 *
 * @param input Key components; serialized with `getCompositeKey` before hashing.
 * @returns SHA-256 digest of the serialized key, hex-encoded.
 */
function hashCacheKey(input) {
  const serialized = getCompositeKey(input);
  const hasher = createHash("sha256");
  hasher.update(serialized);
  return hasher.digest("hex");
}
419
/**
 * Convert a value to its JSON round-tripped form.
 *
 * @param value Any value.
 * @returns The result of `JSON.parse(JSON.stringify(value))`, or `undefined`
 *   when the value has no JSON representation (`undefined`, functions, symbols).
 * @throws Whatever `JSON.stringify` throws, e.g. on circular structures or BigInt.
 */
function toJsonSafe(value) {
  if (value === void 0) return void 0;
  const text = JSON.stringify(value);
  // JSON.stringify yields `undefined` (not a string) for functions and
  // symbols; passing that to JSON.parse would throw a SyntaxError.
  if (text === void 0) return void 0;
  return JSON.parse(text);
}
424
/**
 * Copy span attributes, leaving out every key that starts with `"cache."`.
 *
 * @param attributes Attribute object, or a falsy value.
 * @returns A new object with the remaining entries (`{}` for falsy input).
 */
function stripCacheAttributes(attributes) {
  const kept = {};
  if (!attributes) return kept;
  for (const key of Object.keys(attributes)) {
    if (key.startsWith("cache.")) continue;
    kept[key] = attributes[key];
  }
  return kept;
}
430
/**
 * Serialize the span with id `spanId`, and recursively all of its descendants,
 * into a nested `{ kind, name, attributes, status, error, children }` tree.
 *
 * @param scope Eval scope; only `scope.spans` is read.
 * @param spanId Id of the span to serialize.
 * @returns The serialized tree. When no span has that id, a placeholder
 *   (`kind: "custom"`, `name: "unknown"`, `status: "ok"`, no children) is returned.
 */
function serializeSubSpanTree(scope, spanId) {
  const source = scope.spans.find((span) => span.id === spanId);
  if (!source) {
    return {
      kind: "custom",
      name: "unknown",
      attributes: void 0,
      status: "ok",
      error: void 0,
      children: [],
    };
  }

  const children = [];
  for (const candidate of scope.spans) {
    if (candidate.parentId !== spanId) continue;
    children.push(serializeSubSpanTree(scope, candidate.id));
  }

  const { kind, name, attributes, status, error } = source;
  return { kind, name, attributes, status, error, children };
}
450
/**
 * Append one `subSpan` op to `frame.ops` for each direct child of the cached
 * span that was added to the scope at or after `frame.baseSpanIndex`.
 *
 * @param scope Eval scope; `scope.spans` is read.
 * @param frame Recording frame; reads `baseSpanIndex` and `cachedSpanId`,
 *   appends to `ops`.
 */
function appendSubSpanOps(scope, frame) {
  const addedSinceFrameStart = scope.spans.slice(frame.baseSpanIndex);
  for (const candidate of addedSinceFrameStart) {
    if (candidate?.parentId !== frame.cachedSpanId) continue;
    const span = serializeSubSpanTree(scope, candidate.id);
    frame.ops.push({ kind: "subSpan", span });
  }
}
459
/**
 * Re-apply a cached recording to `scope` under `parentSpan`.
 *
 * Ops are applied in order, then the recorded final attributes (if any) are
 * merged into the parent span. `scope.replayingDepth` is raised for the
 * duration and restored even if an op throws.
 *
 * @param scope Eval scope to mutate.
 * @param parentSpan Span record that replayed sub-spans attach to.
 * @param recording Recording with `ops` and `finalAttributes`.
 */
function replayRecording(scope, parentSpan, recording) {
  scope.replayingDepth += 1;
  try {
    recording.ops.forEach((op) => {
      applyRecordingOp(scope, parentSpan, op);
    });
    const { finalAttributes } = recording;
    const hasFinalAttributes = Object.keys(finalAttributes).length > 0;
    if (hasFinalAttributes) mergeSpanAttributes(parentSpan, finalAttributes);
  } finally {
    scope.replayingDepth -= 1;
  }
}
468
/**
 * Apply a single recorded op to `scope` during replay.
 *
 * - `setOutput`: overwrite the output.
 * - `incrementOutput`: add to a numeric (or missing) output; a non-numeric
 *   existing value is reported as an assertion failure instead.
 * - `checkpoint`: store the checkpoint value.
 * - anything else: treated as a sub-span and re-created under `parentSpan`.
 *
 * @param scope Eval scope to mutate.
 * @param parentSpan Span record that a replayed sub-span attaches to.
 * @param op Recorded operation.
 */
function applyRecordingOp(scope, parentSpan, op) {
  switch (op.kind) {
    case "setOutput":
      scope.outputs[op.key] = op.value;
      return;
    case "incrementOutput": {
      const current = scope.outputs[op.key];
      if (current === void 0) {
        scope.outputs[op.key] = op.delta;
      } else if (typeof current === "number") {
        scope.outputs[op.key] = current + op.delta;
      } else {
        scope.assertionFailures.push({ message: `replay incrementEvalOutput("${op.key}"): existing value is ${typeof current}, expected number` });
      }
      return;
    }
    case "checkpoint":
      scope.checkpoints.set(op.name, op.data);
      return;
    default:
      replaySerializedSpan(scope, parentSpan.id, op.span);
  }
}
486
/**
 * Re-create a serialized span, and recursively its children, on `scope.spans`.
 *
 * Each replayed span gets a fresh id and uses the replay time for both
 * `startedAt` and `endedAt`; kind, name, status, attributes and error are
 * copied from the serialized form.
 *
 * @param scope Eval scope whose `spans` list receives the new records.
 * @param parentId Id assigned as the replayed span's parent.
 * @param serialized Serialized span tree node.
 */
function replaySerializedSpan(scope, parentId, serialized) {
  const id = generateSpanId();
  const timestamp = new Date().toISOString();
  const { kind, name, status, attributes, error } = serialized;
  scope.spans.push({
    id,
    parentId,
    caseId: scope.caseId,
    kind,
    name,
    startedAt: timestamp,
    endedAt: timestamp,
    status,
    attributes,
    error,
  });
  serialized.children.forEach((child) => {
    replaySerializedSpan(scope, id, child);
  });
}
504
+ //#endregion
505
+ //#region ../shared/src/schemas/display.ts
506
/** Scalar cell value: string, number, boolean, or null. */
const scalarCellSchema = z.union([z.string(), z.number(), z.boolean(), z.null()]);

/** JSON-like cell value: a scalar, an array of such values, or a string-keyed record of them. */
const jsonCellSchema = z.lazy(() =>
  z.union([scalarCellSchema, z.array(jsonCellSchema), z.record(z.string(), jsonCellSchema)]),
);

/** Reference to a file in the repository (see `repoFile`). */
const repoFileRefSchema = z.object({
  source: z.literal("repo"),
  path: z.string(),
  mimeType: z.string().optional(),
});

/** Reference to an artifact produced by a run. */
const runArtifactRefSchema = z.object({
  source: z.literal("run"),
  artifactId: z.string(),
  mimeType: z.string(),
  fileName: z.string().optional(),
});

/** Either kind of file reference. */
const fileRefSchema = z.union([repoFileRefSchema, runArtifactRefSchema]);

/** Schema for numeric presentation options used by number-formatted values. */
const numberDisplayOptionsSchema = z.object({
  notation: z.enum(["standard", "compact"]).optional(),
  compactDisplay: z.enum(["short", "long"]).optional(),
  prefix: z.string().optional(),
  suffix: z.string().optional(),
  decimalPlaces: z.number().int().min(0).optional(),
});

/** Schema for the supported column rendering kinds in list views. */
const columnKindSchema = z.enum(["string", "number", "boolean"]);

/** Schema for the built-in column formatting presets. */
const columnFormatSchema = z.enum([
  "boolean",
  "markdown",
  "json",
  "image",
  "audio",
  "video",
  "file",
  "duration",
  "percent",
  "number",
  "passFail",
  "stars",
]);

/** Schema describing a rendered column in the eval results table. */
const columnDefSchema = z.object({
  key: z.string(),
  label: z.string(),
  kind: columnKindSchema,
  format: columnFormatSchema.optional(),
  numberFormat: numberDisplayOptionsSchema.optional(),
  isScore: z.boolean().optional(),
  isManualScore: z.boolean().optional(),
  passThreshold: z.number().optional(),
  maxStars: z.number().int().min(2).optional(),
  hideInTable: z.boolean().optional(),
  sortable: z.boolean().optional(),
  align: z.enum(["left", "center", "right"]).optional(),
});

/** Schema for any supported value that can populate a table cell. */
const cellValueSchema = z.union([jsonCellSchema, fileRefSchema]);
579
+ //#endregion
580
+ //#region ../shared/src/schemas/trace.ts
581
/** Schema for the semantic categories used to classify trace spans. */
const traceSpanKindSchema = z.enum([
  "eval",
  "agent",
  "llm",
  "tool",
  "retrieval",
  "scorer",
  "checkpoint",
  "custom",
]);

/** Schema for the supported presentation formats of trace attributes. */
const traceAttributeDisplayFormatSchema = z.enum(["string", "number", "duration", "json"]);

/** Schema for the UI locations where a trace attribute can appear. */
const traceAttributeDisplayPlacementSchema = z.enum(["tree", "detail", "section"]);

/** Schema for resolved trace display rules sent to the UI. */
const traceAttributeDisplaySchema = z.object({
  key: z.string().optional(),
  path: z.string(),
  label: z.string().optional(),
  format: traceAttributeDisplayFormatSchema.optional(),
  numberFormat: numberDisplayOptionsSchema.optional(),
  placements: z.array(traceAttributeDisplayPlacementSchema).optional(),
  scope: z.enum(["self", "subtree"]).optional(),
  mode: z.enum(["all", "last", "sum"]).optional(),
});

/** Schema for trace display config after transforms have been resolved. */
const traceDisplayConfigSchema = z.object({
  attributes: z.array(traceAttributeDisplaySchema).optional(),
});

/**
 * Schema for authored trace display rules accepted from user config. Same
 * fields as the resolved rule plus an optional `transform` function.
 */
const traceAttributeDisplayInputSchema = z.object({
  key: z.string().optional(),
  path: z.string(),
  label: z.string().optional(),
  format: traceAttributeDisplayFormatSchema.optional(),
  numberFormat: numberDisplayOptionsSchema.optional(),
  placements: z.array(traceAttributeDisplayPlacementSchema).optional(),
  scope: z.enum(["self", "subtree"]).optional(),
  mode: z.enum(["all", "last", "sum"]).optional(),
  transform: z
    .custom((value) => value === void 0 || typeof value === "function", {
      message: "Expected a transform function",
    })
    .optional(),
});

/** Schema for authored trace display config in eval or workspace config. */
const traceDisplayInputConfigSchema = z.object({
  attributes: z.array(traceAttributeDisplayInputSchema).optional(),
});

/** Schema for a persisted trace span captured during case execution. */
const traceSpanSchema = z.object({
  id: z.string(),
  parentId: z.string().nullable(),
  caseId: z.string(),
  kind: traceSpanKindSchema,
  name: z.string(),
  startedAt: z.string(),
  endedAt: z.string().nullable(),
  status: z.enum(["running", "ok", "error", "cancelled"]),
  attributes: z.record(z.string(), z.unknown()).optional(),
  error: z
    .object({
      name: z.string().optional(),
      message: z.string(),
      stack: z.string().optional(),
    })
    .optional(),
});
662
+ //#endregion
663
+ //#region ../shared/src/schemas/chart.ts
664
/** Chart type rendered for a single eval history chart. */
const evalChartTypeSchema = z.enum(["area", "line", "bar"]);

/**
 * Run-level metric sourced from the aggregated `RunSummary` for a run, rather
 * than from a per-case column.
 */
const evalChartBuiltinMetricSchema = z.enum(["passRate", "durationMs"]);

/** Reducer applied to a numeric column across all cases of a single run. */
const evalChartAggregateSchema = z.enum([
  "avg",
  "sum",
  "min",
  "max",
  "latest",
  "passThresholdRate",
]);

/**
 * Semantic color token resolved to a theme color by the web UI. The SDK does
 * not emit raw hex so authored evals stay decoupled from the web theme.
 */
const evalChartColorSchema = z.enum([
  "accent",
  "accentDim",
  "success",
  "error",
  "warning",
  "textMuted",
]);

/** Y-axis placement for a plotted series on a dual-axis chart. */
const evalChartAxisSchema = z.enum(["left", "right"]);

/**
 * One plotted series on an eval history chart. `builtin` metrics come from the
 * per-run `RunSummary`; `column` metrics aggregate a per-case score or
 * `setEvalOutput` column across the run using `aggregate`.
 */
const evalChartMetricSchema = z.discriminatedUnion("source", [
  z.object({
    source: z.literal("builtin"),
    metric: evalChartBuiltinMetricSchema,
    label: z.string().optional(),
    color: evalChartColorSchema.optional(),
    axis: evalChartAxisSchema.optional(),
  }),
  z.object({
    source: z.literal("column"),
    /** Matches a declared score key or a `setEvalOutput` key on the eval. */
    key: z.string().min(1),
    aggregate: evalChartAggregateSchema,
    label: z.string().optional(),
    color: evalChartColorSchema.optional(),
    axis: evalChartAxisSchema.optional(),
  }),
]);

/** Extra field rendered only in the tooltip, not plotted as a series. */
const evalChartTooltipExtraSchema = z.discriminatedUnion("source", [
  z.object({
    source: z.literal("builtin"),
    metric: evalChartBuiltinMetricSchema,
    label: z.string().optional(),
  }),
  z.object({
    source: z.literal("column"),
    key: z.string().min(1),
    aggregate: evalChartAggregateSchema,
    label: z.string().optional(),
  }),
]);

/**
 * Authored configuration for one eval history chart rendered in `EvalCard`.
 * Authors declare a list of these via `EvalDefinition.charts` — the UI renders
 * each entry as its own chart frame, stacked in authoring order.
 */
const evalChartConfigSchema = z.object({
  /** Optional heading shown above the chart frame in the UI. */
  heading: z.string().optional(),
  type: evalChartTypeSchema,
  /** At least one series must be declared. */
  metrics: z.array(evalChartMetricSchema).min(1),
  /**
   * Per-axis Y domain. Omit either side for automatic scaling. When unset the
   * chart auto-scales — there is no implicit `[0, 1]` clamp.
   */
  yDomain: z
    .object({
      left: z.object({ min: z.number().optional(), max: z.number().optional() }).optional(),
      right: z.object({ min: z.number().optional(), max: z.number().optional() }).optional(),
    })
    .optional(),
  tooltipExtras: z.array(evalChartTooltipExtraSchema).optional(),
});

/**
 * Ordered list of history charts rendered for an eval. Opt-in: when omitted or
 * empty, the UI renders no history chart at all.
 */
const evalChartsConfigSchema = z.array(evalChartConfigSchema);
761
+ //#endregion
762
+ //#region ../shared/src/schemas/eval.ts
763
/** Freshness signal derived from the latest relevant run plus git state. */
const evalFreshnessStatusSchema = z.enum(["fresh", "stale", "outdated"]);

/** Reducer used to collapse a column's per-case values into a single stat. */
const evalStatAggregateSchema = z.enum(["avg", "min", "max", "sum", "last"]);

/**
 * One entry in the EvalCard stats row. Built-in kinds use latest run totals;
 * `column` aggregates a score or numeric output column across the latest run.
 */
const evalStatItemSchema = z.discriminatedUnion("kind", [
  z.object({ kind: z.literal("cases") }),
  z.object({
    kind: z.literal("passRate"),
    accent: z.boolean().optional(),
  }),
  z.object({ kind: z.literal("duration") }),
  z.object({
    kind: z.literal("column"),
    key: z.string(),
    label: z.string().optional(),
    aggregate: evalStatAggregateSchema,
    format: columnFormatSchema.optional(),
    accent: z.boolean().optional(),
  }),
]);

/** Ordered list of stats rendered in the EvalCard stats row. */
const evalStatsConfigSchema = z.array(evalStatItemSchema);

/** Schema summarizing a discovered eval for list and overview screens. */
const evalSummarySchema = z.object({
  id: z.string(),
  title: z.string().optional(),
  /** Eval file path relative to the active workspace root. */
  filePath: z.string(),
  /** Indicates the eval file changed since the latest passing result. */
  stale: z.boolean(),
  /** Indicates the latest comparable run is from an older commit and too old. */
  outdated: z.boolean(),
  /** Latest derived freshness signal for this eval. */
  freshnessStatus: evalFreshnessStatusSchema,
  /** Timestamp for the latest run considered when deriving freshness. */
  latestRunAt: z.string().nullable(),
  /** Commit SHA recorded on the latest run considered for freshness. */
  latestRunCommitSha: z.string().nullable(),
  /** Current workspace commit SHA when the summary was requested. */
  currentCommitSha: z.string().nullable(),
  columnDefs: z.array(columnDefSchema),
  caseCount: z.number().nullable(),
  lastRunStatus: z
    .enum(["pass", "fail", "error", "running", "cancelled", "unscored"])
    .nullable(),
  /**
   * Optional per-eval stats row configuration for the EvalCard. Opt-in: when
   * omitted or empty, the UI renders no stats row at all.
   */
  stats: evalStatsConfigSchema.optional(),
  /**
   * Ordered per-eval history chart configuration for the EvalCard. Opt-in:
   * when omitted or empty, the UI renders no history chart at all.
   */
  charts: evalChartsConfigSchema.optional(),
});

/** Schema for one case row in an eval run result table. */
const caseRowSchema = z.object({
  caseId: z.string(),
  evalId: z.string(),
  status: z.enum(["pending", "running", "pass", "fail", "error", "cancelled"]),
  latencyMs: z.number().nullable(),
  costUsd: z.number().nullable().optional(),
  columns: z.record(z.string(), cellValueSchema),
  /** Winning trial index for the persisted case result. */
  trial: z.number(),
});

/** Structured assertion failure metadata captured for one case run. */
const assertionFailureSchema = z.object({
  /** Human-readable assertion failure message shown in the UI and artifacts. */
  message: z.string(),
  /** Stack trace captured from the originating error when available. */
  stack: z.string().optional(),
});

/** Accepts a bare string failure and converts it to `{ message }`. */
const legacyAssertionFailureSchema = z.string().transform((message) => ({ message }));

/** Trace payload captured while computing one score for a case. */
const scoreTraceSchema = z.object({
  trace: z.array(traceSpanSchema),
  traceDisplay: traceDisplayConfigSchema,
});

/** Schema for the detailed payload shown when opening a specific case. */
const caseDetailSchema = z.object({
  caseId: z.string(),
  evalId: z.string(),
  status: z.enum(["pending", "running", "pass", "fail", "error", "cancelled"]),
  input: z.unknown(),
  trace: z.array(traceSpanSchema),
  traceDisplay: traceDisplayConfigSchema,
  /**
   * Separate trace payloads emitted by score computation. These are kept out
   * of `trace` so derive-from-execution metrics do not include judge/scorer
   * work.
   */
  scoringTraces: z.record(z.string(), scoreTraceSchema).optional(),
  columns: z.record(z.string(), cellValueSchema),
  assertionFailures: z.array(z.union([assertionFailureSchema, legacyAssertionFailureSchema])),
  error: z
    .object({
      name: z.string().optional(),
      message: z.string(),
      stack: z.string().optional(),
    })
    .nullable(),
  /** Winning trial index for the persisted case detail. */
  trial: z.number(),
});
900
+ //#endregion
901
+ //#region ../shared/src/schemas/cache.ts
902
+ /**
903
+ * Mode that controls how the cache is consulted for a given run.
904
+ *
905
+ * - `use`: read cache on hit, write on miss. Default.
906
+ * - `bypass`: never read, never write.
907
+ * - `refresh`: never read, always write (forces re-execution and overwrites).
908
+ */
909
+ const cacheModeSchema = z.enum([
910
+ "use",
911
+ "bypass",
912
+ "refresh"
913
+ ]);
914
/** Options an `evalTracer.span` call passes to opt that span into caching. */
const spanCacheOptionsSchema = z.object({
  /** Arbitrary JSON-safe value the cache key is derived from. */
  key: z.unknown(),
  /** Replaces the default namespace (`${evalId}__${spanName}`). */
  namespace: z.string().optional(),
});
921
/**
 * Summary row for one persisted cache entry, as used by the list/delete
 * endpoints.
 */
const cacheListItemSchema = z.object({
  key: z.string(),
  namespace: z.string(),
  spanName: z.string(),
  spanKind: traceSpanKindSchema,
  storedAt: z.string(),
  codeFingerprint: z.string(),
  sizeBytes: z.number(),
});
931
/**
 * Zod schema for `SerializedCacheSpan`. The `children` field refers back to
 * this schema, so it is wrapped in `z.lazy` and only resolved at parse time.
 */
const serializedCacheSpanSchema = z
  .object({
    kind: traceSpanKindSchema,
    name: z.string(),
    attributes: z.record(z.string(), z.unknown()).optional(),
    status: z.enum(["running", "ok", "error", "cancelled"]),
    error: z
      .object({
        name: z.string().optional(),
        message: z.string(),
        stack: z.string().optional(),
      })
      .optional(),
  })
  .extend({
    children: z.lazy(() => z.array(serializedCacheSpanSchema)),
  });
948
/**
 * A single operation captured while the body of a cached span was running.
 *
 * On a cache hit the recorded operations are replayed, in order, against a
 * fresh scope so the observable effects of the original run are reproduced.
 * Variants are discriminated by their `kind` literal.
 */
const cacheRecordingOpSchema = z.discriminatedUnion("kind", [
  z.object({ kind: z.literal("setOutput"), key: z.string(), value: z.unknown() }),
  z.object({ kind: z.literal("incrementOutput"), key: z.string(), delta: z.number() }),
  z.object({ kind: z.literal("checkpoint"), name: z.string(), data: z.unknown() }),
  z.object({ kind: z.literal("subSpan"), span: serializedCacheSpanSchema }),
]);
975
/**
 * Everything recorded for a cached span body: its return value, its final
 * attributes, and the ordered list of captured operations.
 */
const cacheRecordingSchema = z.object({
  returnValue: z.unknown(),
  finalAttributes: z.record(z.string(), z.unknown()),
  ops: z.array(cacheRecordingOpSchema),
});
981
/** On-disk cache file: entry metadata plus the recording it stores. */
const cacheEntrySchema = z.object({
  // Only files carrying the literal version 1 pass validation.
  version: z.literal(1),
  key: z.string(),
  namespace: z.string(),
  spanName: z.string(),
  spanKind: traceSpanKindSchema,
  storedAt: z.string(),
  codeFingerprint: z.string(),
  recording: cacheRecordingSchema,
});
992
+ //#endregion
993
+ //#region ../shared/src/schemas/config.ts
994
/** How repeated trials are collapsed into the one stored case result. */
const trialSelectionModeSchema = z.enum([
  "lowestScore",
  "median",
]);
996
/**
 * Zod schema that validates the object authored in `agent-evals.config.ts`.
 * Only `include` is required; every other field is optional.
 */
const agentEvalsConfigSchema = z.object({
  workspaceRoot: z.string().optional(),
  include: z.array(z.string()),
  defaultTrials: z.number().optional(),
  trialSelection: trialSelectionModeSchema.optional(),
  concurrency: z.number().optional(),
  staleAfterDays: z.number().optional(),
  traceDisplay: traceDisplayInputConfigSchema.optional(),
  cache: z
    .object({
      enabled: z.boolean().optional(),
      dir: z.string().optional(),
    })
    .optional(),
});
1010
+ //#endregion
1011
+ //#region ../shared/src/schemas/run.ts
1012
/** Schema for the metadata persisted about one run invocation. */
const runManifestSchema = z.object({
  id: z.string(),
  /**
   * Short human-readable run id such as `r0` or `r1`. It comes from a
   * monotonic global counter assigned at creation, so the oldest run is
   * `r0`. Legacy persisted runs are migrated to carry a `shortId` on load.
   */
  shortId: z.string(),
  status: z.enum(["pending", "running", "completed", "cancelled", "error"]),
  startedAt: z.string(),
  endedAt: z.string().nullable(),
  /**
   * Git commit SHA of the workspace when the run started. Older persisted
   * runs may lack the field; it then parses to `null`.
   */
  commitSha: z.string().nullable().optional().default(null),
  /**
   * Eval-file fingerprints captured for the run, keyed by eval id. Older
   * persisted runs may lack the field; it then parses to `{}`.
   */
  evalSourceFingerprints: z
    .record(z.string(), z.string())
    .optional()
    .default({}),
  target: z.object({
    mode: z.enum(["all", "evalIds", "caseIds"]),
    evalIds: z.array(z.string()).optional(),
    caseIds: z.array(z.string()).optional(),
  }),
  /** Number of trial attempts executed for each case in this run. */
  trials: z.number(),
  /**
   * Strategy that collapsed repeated trials into the single persisted case
   * result. Older persisted runs may lack the field; it then parses to
   * `"lowestScore"`.
   */
  trialSelection: trialSelectionModeSchema.optional().default("lowestScore"),
  /** Cache mode used for this run. Treated as `use` when absent. */
  cacheMode: cacheModeSchema.optional(),
});
1059
/** Schema for the aggregate metrics of a run that is finished or in flight. */
const runSummarySchema = z.object({
  runId: z.string(),
  status: z.enum(["pending", "running", "completed", "cancelled", "error"]),
  totalCases: z.number(),
  passedCases: z.number(),
  failedCases: z.number(),
  errorCases: z.number(),
  cancelledCases: z.number(),
  totalDurationMs: z.number().nullable(),
  // Parses to `null` when the field is missing from the input.
  errorMessage: z.string().nullable().default(null),
});
1077
+ //#endregion
1078
+ //#region ../shared/src/status.ts
1079
/**
 * Return `lifecycleStatus` unchanged when it is one of the run lifecycle
 * states that overrides child-derived status (`pending`, `running`,
 * `cancelled`, `error`); return `null` for anything else.
 */
function deriveLifecycleStatus(lifecycleStatus) {
  switch (lifecycleStatus) {
    case "pending":
    case "running":
    case "cancelled":
    case "error":
      return lifecycleStatus;
    default:
      return null;
  }
}
1083
/**
 * Derive one aggregate status from a collection of child statuses.
 *
 * When `params.lifecycleStatus` is an overriding run lifecycle state
 * (`pending`, `running`, `cancelled`, `error`) it is returned directly.
 * Otherwise the children decide, with precedence
 * `running` > `error` > `fail` > `cancelled` > `pending` > `pass`.
 * `null`/`undefined` children are ignored, any unrecognised status counts as
 * `pending`, and a collection without a single `pass` is `pending`.
 */
function deriveStatusFromChildStatuses(params) {
  const lifecycle = deriveLifecycleStatus(params.lifecycleStatus);
  if (lifecycle !== null) return lifecycle;

  const recognised = ["running", "error", "fail", "cancelled", "pass"];
  const seen = new Set();
  for (const status of params.statuses) {
    if (status === void 0 || status === null) continue;
    seen.add(recognised.includes(status) ? status : "pending");
  }

  for (const candidate of ["running", "error", "fail", "cancelled"]) {
    if (seen.has(candidate)) return candidate;
  }
  return seen.has("pending") || !seen.has("pass") ? "pending" : "pass";
}
1113
/**
 * Derive an aggregate status for a scoped set of case rows by feeding each
 * row's `status` to `deriveStatusFromChildStatuses`.
 *
 * Supply `lifecycleStatus` only when the parent scope's raw run lifecycle
 * should override the child-derived result, e.g. for a whole-run display.
 */
function deriveStatusFromCaseRows(params) {
  const statuses = Array.from(params.caseRows).map((row) => row.status);
  const lifecycleStatus = params.lifecycleStatus;
  return deriveStatusFromChildStatuses({ statuses, lifecycleStatus });
}
1125
/**
 * Compute per-status counts, the summed latency, and the display status for
 * a scoped set of case rows.
 *
 * Rows whose status is not one of the five recognised values are counted as
 * pending. `totalDurationMs` is `null` unless at least one row has a
 * non-null `latencyMs`.
 */
function deriveScopedSummaryFromCases(params) {
  // Materialise once: the rows are both tallied here and passed on below.
  const caseRows = [...params.caseRows];
  const tally = { pass: 0, fail: 0, error: 0, cancelled: 0, running: 0, pending: 0 };
  let durationSum = 0;
  let sawDuration = false;

  for (const row of caseRows) {
    switch (row.status) {
      case "pass":
        tally.pass += 1;
        break;
      case "fail":
        tally.fail += 1;
        break;
      case "error":
        tally.error += 1;
        break;
      case "cancelled":
        tally.cancelled += 1;
        break;
      case "running":
        tally.running += 1;
        break;
      default:
        tally.pending += 1;
    }
    if (row.latencyMs !== null) {
      durationSum += row.latencyMs;
      sawDuration = true;
    }
  }

  const status = deriveStatusFromCaseRows({
    caseRows,
    lifecycleStatus: params.lifecycleStatus,
  });
  return {
    status,
    totalCases: caseRows.length,
    passedCases: tally.pass,
    failedCases: tally.fail,
    errorCases: tally.error,
    cancelledCases: tally.cancelled,
    pendingCases: tally.pending,
    runningCases: tally.running,
    totalDurationMs: sawDuration ? durationSum : null,
  };
}
1166
+ //#endregion
1167
+ //#region ../shared/src/evalStatus.ts
1168
/**
 * Map the raw result of the latest run plus freshness flags to the status
 * shown to the user.
 *
 * An active run always wins. A passing result is downgraded to `stale`, or
 * else `outdated`, when the matching flag is set. Any other result is shown
 * as-is, and a missing result is `pending`.
 */
function getEvalDisplayStatus(params) {
  const { stale, outdated, lastRunStatus, isRunning = false } = params;
  if (isRunning) return "running";
  switch (lastRunStatus) {
    case "running":
      return "running";
    case "pass":
      if (stale) return "stale";
      if (outdated) return "outdated";
      return "pass";
    default:
      return lastRunStatus ?? "pending";
  }
}
1181
+ //#endregion
1182
+ //#region ../shared/src/evalTitle.ts
1183
/**
 * Turn an eval id into a title-like label: camelCase boundaries and runs of
 * `-`, `_` or whitespace become single spaces, and the first character of
 * every word is upper-cased. Returns `id` untouched when nothing remains
 * after normalisation.
 */
function humanizeEvalId(id) {
  const camelSplit = id.replace(/([a-z0-9])([A-Z])/g, "$1 $2");
  const normalized = camelSplit.replace(/[-_\s]+/g, " ").trim();
  if (normalized.length === 0) return id;

  const words = [];
  for (const word of normalized.split(" ")) {
    words.push(word.charAt(0).toUpperCase() + word.slice(1));
  }
  return words.join(" ");
}
1192
/**
 * Resolve the title to display for an eval.
 *
 * An authored `title` (anything other than `undefined`) is returned as-is;
 * otherwise a readable label is derived from the stable eval `id`, so
 * display surfaces do not have to show both fields in the common case.
 */
function getEvalTitle(evalLike) {
  return evalLike.title === void 0
    ? humanizeEvalId(evalLike.id)
    : evalLike.title;
}
1203
// Enum of the event type names. The schema built here is not bound to any
// name in this bundle; the call is kept as a bare expression statement.
z.enum([
  "discovery.updated",
  "run.started",
  "run.summary",
  "case.started",
  "case.updated",
  "case.finished",
  "trace.span",
  "run.finished",
  "run.cancelled",
  "run.error",
]);

/**
 * Schema for the SSE envelope that streams run updates to clients. `type`
 * is validated as a plain string and `payload` is left unvalidated.
 */
const sseEnvelopeSchema = z.object({
  type: z.string(),
  runId: z.string().optional(),
  timestamp: z.string(),
  payload: z.unknown(),
});
1222
+ //#endregion
1223
+ //#region ../shared/src/schemas/api.ts
1224
/** Schema for the body of the API request that starts a new eval run. */
const createRunRequestSchema = z.object({
  target: z.object({
    mode: z.enum(["all", "evalIds", "caseIds"]),
    evalIds: z.array(z.string()).optional(),
    caseIds: z.array(z.string()).optional(),
  }),
  // At least one trial is required.
  trials: z.number().min(1),
  /**
   * Optional cache controls. Leaving this out means the cache runs in its
   * default read-through / write-on-miss mode; when the object is present
   * without a `mode`, the mode parses to `"use"`.
   */
  cache: z
    .object({
      mode: cacheModeSchema.default("use"),
    })
    .optional(),
});
1242
/**
 * Schema for the request that updates a UI-authored manual score on one
 * persisted case: `value` is a number within [0, 1], or `null`.
 */
const updateManualScoreRequestSchema = z.object({
  value: z.number().min(0).max(1).nullable(),
});
1244
+ //#endregion
1245
+ //#region ../runner/src/cacheStore.ts
1246
/**
 * Create a filesystem-backed cache adapter rooted at `<workspaceRoot>/<dir>`
 * (`dir` defaults to `.agent-evals/cache`).
 *
 * Entries live at `<cacheDir>/<sanitized namespace>/<key>.json`. Writes go
 * to a uniquely named `*.tmp` sibling followed by an atomic `rename`, so
 * readers never observe a partially written entry and concurrent writers of
 * the same key never share a temp file.
 *
 * @param options `{ workspaceRoot, dir? }`.
 * @returns Adapter exposing `dir`, `lookup`, `write`, `list` and `clear`.
 */
function createFsCacheStore(options) {
  const cacheDir = resolve(options.workspaceRoot, options.dir ?? ".agent-evals/cache");
  // Distinguishes temp files of overlapping writes issued by this process.
  let tmpSequence = 0;

  /** Run `operation`, resolving to `null` if the path no longer exists. */
  async function orNullIfMissing(operation) {
    try {
      return await operation();
    } catch (error) {
      if (error?.code === "ENOENT") return null;
      throw error;
    }
  }

  /** Read and validate one entry file; `null` if missing or invalid. */
  async function readEntry(filePath) {
    const text = await orNullIfMissing(() => readFile(filePath, "utf-8"));
    if (text === null) return null;
    const json = safeJsonParse(text);
    if (json === null) return null;
    const parsed = cacheEntrySchema.safeParse(json);
    return parsed.success ? parsed.data : null;
  }

  return {
    /** Absolute path of the cache root directory. */
    dir() {
      return cacheDir;
    },
    /** Resolve the stored entry, or `null` when absent or unreadable. */
    async lookup(namespace, keyHash) {
      const filePath = entryPath(cacheDir, namespace, keyHash);
      if (!existsSync(filePath)) return null;
      // The file may still vanish before the read; readEntry tolerates that.
      return readEntry(filePath);
    },
    /** Persist `entry` atomically under its namespace and key. */
    async write(entry) {
      const filePath = entryPath(cacheDir, entry.namespace, entry.key);
      await mkdir(dirname(filePath), { recursive: true });
      tmpSequence += 1;
      const tmpPath = `${filePath}.${process.pid.toString()}.${tmpSequence.toString()}.tmp`;
      try {
        await writeFile(tmpPath, JSON.stringify(entry));
        await rename(tmpPath, filePath);
      } catch (error) {
        // Best-effort cleanup so failed writes do not leave temp files
        // behind; the original failure is what the caller needs to see.
        await rm(tmpPath, { force: true }).catch(() => {});
        throw error;
      }
    },
    /** List valid entries, newest `storedAt` first. */
    async list() {
      if (!existsSync(cacheDir)) return [];
      const namespaces = await readdir(cacheDir);
      const items = [];
      for (const namespace of namespaces) {
        const nsPath = join(cacheDir, namespace);
        const nsStat = await orNullIfMissing(() => stat(nsPath));
        if (nsStat === null || !nsStat.isDirectory()) continue;
        const files = await orNullIfMissing(() => readdir(nsPath));
        if (files === null) continue;
        for (const fileName of files) {
          // Skips in-flight `*.tmp` files as well as unrelated files.
          if (!fileName.endsWith(".json")) continue;
          const filePath = join(nsPath, fileName);
          const entry = await readEntry(filePath);
          if (entry === null) continue;
          const fileStat = await orNullIfMissing(() => stat(filePath));
          if (fileStat === null) continue;
          items.push({
            key: entry.key,
            namespace: entry.namespace,
            spanName: entry.spanName,
            spanKind: entry.spanKind,
            storedAt: entry.storedAt,
            codeFingerprint: entry.codeFingerprint,
            sizeBytes: fileStat.size,
          });
        }
      }
      // Descending by `storedAt`; equal timestamps compare as equal so the
      // comparator stays consistent.
      items.sort((a, b) => {
        if (a.storedAt === b.storedAt) return 0;
        return a.storedAt < b.storedAt ? 1 : -1;
      });
      return items;
    },
    /**
     * Remove cache entries. No filter clears everything; `namespace` alone
     * removes that namespace; `namespace` + `key` removes one entry; `key`
     * alone removes that key from every namespace.
     */
    async clear(filter) {
      if (!existsSync(cacheDir)) return;
      if (!filter || filter.namespace === void 0 && filter.key === void 0) {
        await rm(cacheDir, { recursive: true, force: true });
        return;
      }
      if (filter.namespace !== void 0 && filter.key === void 0) {
        await rm(join(cacheDir, filter.namespace), { recursive: true, force: true });
        return;
      }
      if (filter.namespace !== void 0 && filter.key !== void 0) {
        await rm(entryPath(cacheDir, filter.namespace, filter.key), { force: true });
        return;
      }
      const namespaces = await readdir(cacheDir);
      for (const namespace of namespaces) {
        const filePath = entryPath(cacheDir, namespace, filter.key ?? "");
        if (existsSync(filePath)) await rm(filePath, { force: true });
      }
    },
  };
}
1332
/**
 * Wrap `backingStore` in a write-buffering adapter meant for a single trial
 * attempt.
 *
 * Reads consult entries written earlier through this adapter before falling
 * back to the backing store. Writes are only held in memory until `commit()`
 * is called, so that only the winning trial's writes reach the shared cache.
 */
function createBufferedCacheStore(backingStore) {
  const buffer = /* @__PURE__ */ new Map();
  return {
    async lookup(namespace, keyHash) {
      const hit = buffer.get(toPendingKey(namespace, keyHash));
      return hit !== void 0 ? hit : backingStore.lookup(namespace, keyHash);
    },
    write(entry) {
      const bufferKey = toPendingKey(entry.namespace, entry.key);
      buffer.set(bufferKey, entry);
      return Promise.resolve();
    },
    async commit() {
      // Flush one entry at a time, in insertion order.
      for (const [, entry] of buffer) {
        await backingStore.write(entry);
      }
    },
    getPendingEntries() {
      return Array.from(buffer.values());
    },
  };
}
1359
/**
 * Build the path of a cache entry file:
 * `<cacheDir>/<sanitized namespace>/<keyHash>.json`.
 */
function entryPath(cacheDir, namespace, keyHash) {
  const namespaceDir = sanitizeSegment$1(namespace);
  const fileName = `${keyHash}.json`;
  return join(cacheDir, namespaceDir, fileName);
}
1362
/** Compose the in-memory buffer key `<namespace>::<keyHash>`. */
function toPendingKey(namespace, keyHash) {
  return "".concat(namespace, "::", keyHash);
}
1365
/**
 * Replace every character other than ASCII letters, digits, `_`, `.` and
 * `-` with `_`.
 */
function sanitizeSegment$1(segment) {
  // Without the `u`/`i` flags `\w` is exactly [A-Za-z0-9_].
  return segment.replace(/[^\w.-]/g, "_");
}
1368
/**
 * Parse `text` as JSON without throwing: returns the parsed value, or
 * `null` when parsing fails.
 */
function safeJsonParse(text) {
  const outcome = resultify(() => JSON.parse(text));
  return outcome.error ? null : outcome.value;
}
1373
+ //#endregion
1374
+ //#region ../runner/src/chartValidation.ts
1375
/**
 * Check that a column-backed chart metric refers to a declared column and,
 * when it aggregates with `passThresholdRate`, that the column is a score
 * with a numeric `passThreshold`. A warning is appended to `warnings` for
 * each rejected metric.
 */
function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
  const column = columnsByKey.get(metric.key);
  if (!column) {
    warnings.push(`[${evalId}] chart metric references unknown column "${metric.key}" — dropped`);
    return false;
  }

  const needsThreshold = metric.aggregate === "passThresholdRate";
  const isThresholdScore = column.isScore === true && typeof column.passThreshold === "number";
  if (needsThreshold && !isThresholdScore) {
    warnings.push(`[${evalId}] chart metric "${metric.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
    return false;
  }
  return true;
}
1389
/**
 * Check that a column-backed tooltip extra refers to a declared column and,
 * when it aggregates with `passThresholdRate`, that the column is a score
 * with a numeric `passThreshold`. A warning is appended to `warnings` for
 * each rejected extra.
 */
function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
  const column = columnsByKey.get(extra.key);
  if (!column) {
    warnings.push(`[${evalId}] chart tooltip extra references unknown column "${extra.key}" — dropped`);
    return false;
  }

  const needsThreshold = extra.aggregate === "passThresholdRate";
  const isThresholdScore = column.isScore === true && typeof column.passThreshold === "number";
  if (needsThreshold && !isThresholdScore) {
    warnings.push(`[${evalId}] chart tooltip extra "${extra.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
    return false;
  }
  return true;
}
1403
/**
 * Strip invalid column-backed metrics and tooltip extras from one chart.
 * Built-in entries are always kept. Returns `null`, with a warning, when no
 * metric survives; otherwise returns a copy of the chart whose
 * `tooltipExtras` is `undefined` if none remain.
 */
function sanitizeChart(chart, columnsByKey, evalId, warnings) {
  const metrics = chart.metrics.filter(
    (metric) => metric.source === "builtin" || isValidColumnMetric(metric, columnsByKey, evalId, warnings),
  );
  if (metrics.length === 0) {
    warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
    return null;
  }

  const keptExtras = chart.tooltipExtras?.filter(
    (extra) => extra.source === "builtin" || isValidTooltipExtra(extra, columnsByKey, evalId, warnings),
  );
  const tooltipExtras = keptExtras?.length ? keptExtras : void 0;
  return { ...chart, metrics, tooltipExtras };
}
1422
/**
 * Validate an authored `charts` config against the columns an eval declares.
 *
 * Metrics and tooltip extras that point at unknown columns or misuse
 * `passThresholdRate` are removed, and charts left without any metric are
 * removed entirely. `charts` is `undefined` in the result when nothing valid
 * remains, so the UI falls back to rendering no chart (the opt-in default).
 *
 * @returns `{ charts, warnings }`, with one warning per dropped item.
 */
function validateCharts(params) {
  const { charts, columnDefs, evalId } = params;
  const warnings = [];
  if (!charts || charts.length === 0) return { charts: void 0, warnings };

  const columnsByKey = new Map();
  for (const def of columnDefs) columnsByKey.set(def.key, def);

  const sanitized = charts
    .map((chart) => sanitizeChart(chart, columnsByKey, evalId, warnings))
    .filter((result) => Boolean(result));
  return {
    charts: sanitized.length > 0 ? sanitized : void 0,
    warnings,
  };
}
1447
+ //#endregion
1448
+ //#region ../runner/src/columnBuilder.ts
1449
/**
 * Bring a user-authored score definition into one internal shape. The
 * definition is either a bare compute function or an object literal with
 * `compute` / `passThreshold` / `label`; the result always has exactly
 * those three fields.
 */
function normalizeScoreDef(def) {
  if (typeof def !== "function") {
    const { compute, passThreshold, label } = def;
    return { compute, passThreshold, label };
  }
  return { compute: def, passThreshold: void 0, label: void 0 };
}
1466
/**
 * Pick the column-display fields off an object-style score definition.
 * Returns `undefined` for a missing definition or a bare compute function,
 * since neither carries display fields.
 */
function getScoreOverride(def) {
  if (def === void 0 || typeof def === "function") return void 0;
  const displayFields = ["label", "format", "numberFormat", "hideInTable", "sortable", "align", "maxStars"];
  return Object.fromEntries(displayFields.map((field) => [field, def[field]]));
}
1478
/**
 * Combine two column overrides field by field, preferring `override` and
 * falling back to `base` wherever `override` is `null`/`undefined`. If
 * either argument is `undefined` the other is returned unchanged.
 */
function mergeOverrides(base, override) {
  if (base === void 0) return override;
  if (override === void 0) return base;
  const merged = {};
  for (const field of ["label", "format", "numberFormat", "hideInTable", "sortable", "align", "maxStars"]) {
    merged[field] = override[field] ?? base[field];
  }
  return merged;
}
1491
/**
 * Add a `ColumnDef` to `target` for every key of `columns` that `target`
 * does not hold yet. User-supplied `overrides` are applied on top of any
 * display fields coming from the score / manual-score definition, and keys
 * declared in `scores` or `manualScores` are flagged as score columns.
 */
function mergeColumnDefs(target, columns, overrides, scores, manualScores) {
  const computedScoreKeys = new Set(Object.keys(scores ?? {}));
  const manualKeys = new Set(Object.keys(manualScores ?? {}));
  const userOverrides = overrides ?? {};

  for (const [key, value] of Object.entries(columns)) {
    // Definitions already present are never replaced.
    if (target.has(key)) continue;

    const scoreDef = scores?.[key];
    const manualScoreDef = manualScores?.[key];
    const isManualScore = manualKeys.has(key);
    const isScore = computedScoreKeys.has(key) || isManualScore;
    const override = mergeOverrides(getScoreOverride(scoreDef) ?? manualScoreDef, userOverrides[key]);
    const columnDef = createColumnDef({
      key,
      override,
      scoreDef,
      manualScoreDef,
      // Score columns are numeric; everything else is inferred from the value.
      inferredKind: isScore ? "number" : inferKind(value),
      isScore,
      isManualScore,
    });
    target.set(key, columnDef);
  }
}
1515
/**
 * Build the column definitions an eval declares up front, before any runtime
 * output values exist. Discovery metadata can then describe authored rich
 * output columns even for runs created by another process.
 *
 * Keys are taken from `overrides` first, then from `scores`, then from
 * `manualScores`; a key already defined by an earlier source is not
 * redefined.
 */
function buildDeclaredColumnDefs(overrides, scores, manualScores) {
  const defsByKey = /* @__PURE__ */ new Map();

  for (const [key, columnOverride] of Object.entries(overrides ?? {})) {
    const scoreDef = scores?.[key];
    const manualScoreDef = manualScores?.[key];
    const isManualScore = manualScoreDef !== void 0;
    const override = mergeOverrides(getScoreOverride(scoreDef) ?? manualScoreDef, columnOverride);
    // Without a runtime value the kind can only come from the display format.
    const kindFromFormat = inferKindFromFormat(override?.format);
    const kindFromNumberFormat = override?.numberFormat === void 0 ? void 0 : "number";
    defsByKey.set(key, createColumnDef({
      key,
      override,
      scoreDef,
      manualScoreDef,
      inferredKind: kindFromFormat ?? kindFromNumberFormat,
      isScore: scoreDef !== void 0 || isManualScore,
      isManualScore,
    }));
  }

  for (const [key, scoreDef] of Object.entries(scores ?? {})) {
    if (defsByKey.has(key)) continue;
    defsByKey.set(key, createColumnDef({
      key,
      override: getScoreOverride(scoreDef),
      scoreDef,
      inferredKind: "number",
      isScore: true,
      isManualScore: false,
    }));
  }

  for (const [key, manualScoreDef] of Object.entries(manualScores ?? {})) {
    if (defsByKey.has(key)) continue;
    defsByKey.set(key, createColumnDef({
      key,
      override: manualScoreDef,
      manualScoreDef,
      inferredKind: "number",
      isScore: true,
      isManualScore: true,
    }));
  }

  return Array.from(defsByKey.values());
}
1559
/**
 * Infer a `ColumnKind` from a runtime value when no override decides it:
 * numbers and booleans map to their own kind, everything else to `string`.
 */
function inferKind(value) {
  switch (typeof value) {
    case "number":
      return "number";
    case "boolean":
      return "boolean";
    default:
      return "string";
  }
}
1565
/**
 * Coerce an arbitrary runtime value into a serializable `CellValue`.
 *
 * - `null`, `undefined`, strings, numbers and booleans pass through.
 * - A BigInt becomes its decimal string, since JSON cannot represent it.
 * - With a file/media `format` (`image`, `audio`, `video`, `file`) a value
 *   matching `fileRefSchema` is returned as parsed; with `format: 'json'` a
 *   value matching `jsonCellSchema` is returned as parsed.
 * - Anything else is JSON-stringified, with nested BigInts written as
 *   decimal strings. If stringification throws (for example on a circular
 *   structure) the value's `Object.prototype.toString` tag is returned
 *   instead, so this function does not throw on such values.
 *
 * @param value Runtime value emitted for a column.
 * @param override Optional column override; only `format` is consulted.
 * @returns The coerced cell value.
 */
function toCellValue(value, override = void 0) {
  if (value === null) return null;
  if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") return value;
  if (value === void 0) return void 0;
  // JSON.stringify throws a TypeError on BigInt, so handle it explicitly.
  if (typeof value === "bigint") return value.toString();
  if (override?.format === "image" || override?.format === "audio" || override?.format === "video" || override?.format === "file") {
    const parsed = fileRefSchema.safeParse(value);
    if (parsed.success) return parsed.data;
  }
  if (override?.format === "json") {
    const parsed = jsonCellSchema.safeParse(value);
    if (parsed.success) return parsed.data;
  }
  try {
    // The replacer only rewrites BigInts; all other values pass through.
    return JSON.stringify(value, (_key, nested) => typeof nested === "bigint" ? nested.toString() : nested);
  } catch {
    // Circular references or a throwing `toJSON`: degrade to a type tag
    // rather than failing on a single unserializable cell.
    return Object.prototype.toString.call(value);
  }
}
1583
/**
 * Infer a column kind from its display `format`: `boolean` stays boolean,
 * the numeric formats map to `number`, `undefined` yields `undefined`, and
 * every other format is treated as `string`.
 */
function inferKindFromFormat(format) {
  switch (format) {
    case "boolean":
      return "boolean";
    case "duration":
    case "percent":
    case "number":
    case "passFail":
    case "stars":
      return "number";
    case void 0:
      return void 0;
    default:
      return "string";
  }
}
1589
/**
 * Assemble one `ColumnDef` from a key, its merged override and optional
 * score metadata.
 *
 * The kind is `inferredKind` when given, else `number` for scores and
 * `string` otherwise. Display fields are copied only when the override
 * defines them. Score columns also receive `isScore`, `isManualScore` and
 * `passThreshold` where applicable. For an object-style score definition,
 * its `label` is used when the override has none.
 */
function createColumnDef(params) {
  const { key, override, scoreDef, manualScoreDef, inferredKind, isScore, isManualScore } = params;
  const def = {
    key,
    label: override?.label ?? key,
    kind: inferredKind ?? (isScore ? "number" : "string"),
  };

  for (const field of ["format", "numberFormat", "maxStars", "hideInTable", "sortable", "align"]) {
    const authored = override?.[field];
    if (authored !== void 0) def[field] = authored;
  }
  if (!isScore) return def;

  def.isScore = true;
  if (isManualScore) {
    def.isManualScore = true;
    if (manualScoreDef?.passThreshold !== void 0) def.passThreshold = manualScoreDef.passThreshold;
    return def;
  }

  // Bare compute functions carry neither a threshold nor a label.
  if (scoreDef === void 0 || typeof scoreDef === "function") return def;
  if (scoreDef.passThreshold !== void 0) def.passThreshold = scoreDef.passThreshold;
  if (scoreDef.label !== void 0 && override?.label === void 0) def.label = scoreDef.label;
  return def;
}
1615
+ //#endregion
1616
+ //#region ../runner/src/config.ts
1617
/**
 * Shape of the imported config module: the config object may be exported as
 * the `default` export or as a named `config` export, and both are optional.
 */
const configModuleSchema = z.object({
  default: agentEvalsConfigSchema.optional(),
  config: agentEvalsConfigSchema.optional(),
});
1621
/**
 * Configuration used when no config file is present or it fails to load.
 * It is also the base that a user config is spread on top of.
 */
const defaultConfig = {
  include: ["**/*.eval.ts"],
  defaultTrials: 1,
  trialSelection: "lowestScore",
  concurrency: 2,
  staleAfterDays: 14,
  traceDisplay: {
    // `input` and `output` are both shown as JSON sections by default.
    attributes: [
      ["input", "Input"],
      ["output", "Output"],
    ].map(([path, label]) => ({
      path,
      label,
      format: "json",
      placements: ["section"],
    })),
  },
};
1639
/**
 * Load `agent-evals.config.ts` from the current working directory.
 *
 * The module's `default` export is preferred over a named `config` export,
 * and the chosen object is shallow-merged over `defaultConfig`. The defaults
 * are returned when the file is missing or exports neither. If importing or
 * validating the module throws, the error is logged and the defaults are
 * returned.
 */
async function loadConfig() {
  const configPath = resolve(process.cwd(), "agent-evals.config.ts");
  if (!existsSync(configPath)) return defaultConfig;
  try {
    const { href } = pathToFileURL(configPath);
    const { default: defaultExport, config: namedExport } = configModuleSchema.parse(await import(href));
    const userConfig = defaultExport ?? namedExport;
    return userConfig ? { ...defaultConfig, ...userConfig } : defaultConfig;
  } catch (error) {
    console.error("Failed to load agent-evals.config.ts:", error);
    return defaultConfig;
  }
}
1656
+ //#endregion
1657
+ //#region ../runner/src/discovery.ts
1658
/**
 * Matches the first `id: '<value>'` / `id: "<value>"` pair in a piece of
 * source text and captures the value in group 1.
 */
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
/**
 * Matches the first `title: '<value>'` / `title: "<value>"` pair in a piece
 * of source text and captures the value in group 1.
 */
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
1660
/**
 * Statically scan eval source text for `defineEval` calls and collect
 * `{ filePath, id, title? }` for each call whose object literal contains a
 * quoted `id`. The file is never executed; ids and titles are read with
 * regexes from the extracted object text.
 */
function parseEvalMetas(filePath, content) {
  const marker = "defineEval";
  const metas = [];
  let cursor = 0;

  while (cursor < content.length) {
    const markerAt = content.indexOf(marker, cursor);
    if (markerAt === -1) break;

    const extracted = extractDefineEvalObject(content, markerAt);
    if (!extracted) {
      // No object literal found: step past this occurrence and keep looking.
      cursor = markerAt + marker.length;
      continue;
    }

    const { objectText, nextIndex } = extracted;
    const id = evalIdMatchRegex.exec(objectText)?.[1];
    if (id !== void 0) {
      const title = evalTitleMatchRegex.exec(objectText)?.[1];
      metas.push(title === void 0 ? { filePath, id } : { filePath, id, title });
    }
    cursor = nextIndex;
  }
  return metas;
}
1685
/**
 * Starting at a `defineEval` occurrence, locate the first `(` and then the
 * first `{` after it, and return the text of that brace-balanced object
 * together with the index just past its closing brace.
 *
 * Braces inside quoted strings (`'`, `"`, backtick), line comments and block
 * comments are not counted. Returns `undefined` when no `(` or `{` follows,
 * or when the braces never balance.
 */
function extractDefineEvalObject(content, defineEvalIndex) {
  const parenAt = content.indexOf("(", defineEvalIndex);
  if (parenAt === -1) return void 0;
  const objectStart = content.indexOf("{", parenAt);
  if (objectStart === -1) return void 0;

  // Scanner mode: "code" | "lineComment" | "blockComment" | "string".
  let mode = "code";
  let openQuote = "";
  let braceDepth = 0;
  let cursor = objectStart;

  while (cursor < content.length) {
    const ch = content[cursor];
    const pair = content.slice(cursor, cursor + 2);

    switch (mode) {
      case "lineComment":
        if (ch === "\n") mode = "code";
        cursor += 1;
        break;

      case "blockComment":
        if (pair === "*/") {
          mode = "code";
          cursor += 2;
        } else {
          cursor += 1;
        }
        break;

      case "string":
        if (ch === "\\") {
          // Skip the backslash together with the character it escapes.
          cursor += 2;
        } else {
          if (ch === openQuote) mode = "code";
          cursor += 1;
        }
        break;

      default:
        if (pair === "//") {
          mode = "lineComment";
          cursor += 2;
        } else if (pair === "/*") {
          mode = "blockComment";
          cursor += 2;
        } else if (ch === "\"" || ch === "'" || ch === "`") {
          mode = "string";
          openQuote = ch;
          cursor += 1;
        } else if (ch === "{") {
          braceDepth += 1;
          cursor += 1;
        } else if (ch === "}") {
          braceDepth -= 1;
          if (braceDepth === 0) {
            return {
              nextIndex: cursor + 1,
              objectText: content.slice(objectStart, cursor + 1),
            };
          }
          cursor += 1;
        } else {
          cursor += 1;
        }
    }
  }
  return void 0;
}
1748
+ //#endregion
1749
+ //#region ../runner/src/evalModuleLoader.ts
1750
/**
 * Import one eval module for its side effects.
 *
 * When `sourceFingerprint` is given it is attached to the module URL as the
 * `v` query parameter, so the import cache key follows the current source
 * and repeated discovery and runs see the latest authored definition.
 */
async function loadEvalModule(filePath, sourceFingerprint = void 0) {
  const { href: baseHref } = pathToFileURL(filePath);
  const moduleUrl = new URL(baseHref);
  if (sourceFingerprint !== void 0) {
    moduleUrl.searchParams.set("v", sourceFingerprint);
  }
  await import(moduleUrl.href);
}
1759
+ //#endregion
1760
+ //#region ../runner/src/freshness.ts
1761
/**
 * Derive eval freshness from the latest run, the current eval-file
 * fingerprint, the current git commit and an age threshold.
 *
 * - `stale`: the latest run recorded a fingerprint, a current fingerprint is
 *   known, and the two differ.
 * - `outdated`: both the run and the workspace have a commit SHA, the SHAs
 *   differ, the run's `startedAt` parses to a valid date, and the run is at
 *   least `staleAfterDays` old relative to `now`.
 *
 * `freshnessStatus` is `stale` when stale, else `outdated` when outdated,
 * else `fresh`.
 */
function deriveEvalFreshness(params) {
  const { latestRun, gitState, currentEvalSourceFingerprint, staleAfterDays, now = /* @__PURE__ */ new Date() } = params;

  const runFingerprint = latestRun?.evalSourceFingerprint;
  const stale = runFingerprint !== void 0
    && runFingerprint !== null
    && currentEvalSourceFingerprint !== null
    && currentEvalSourceFingerprint !== runFingerprint;

  const toResult = (outdated) => ({
    freshnessStatus: stale ? "stale" : outdated ? "outdated" : "fresh",
    stale,
    outdated,
  });

  // Age only matters once the run is known to be from a different commit.
  const runCommitSha = latestRun?.commitSha;
  if (runCommitSha === void 0 || runCommitSha === null) return toResult(false);
  if (gitState.commitSha === null || runCommitSha === gitState.commitSha) return toResult(false);

  const startedAtMs = new Date(latestRun?.startedAt ?? "").getTime();
  if (!Number.isFinite(startedAtMs)) return toResult(false);

  const ageMs = now.getTime() - startedAtMs;
  return toResult(ageMs >= staleAfterDays * 24 * 60 * 60 * 1e3);
}
1797
/**
 * Return the timestamp used when ordering and displaying a run's recency:
 * the run's end time when present, otherwise its start time.
 */
function getRunFreshnessTimestamp(manifest) {
  const { endedAt, startedAt } = manifest;
  return endedAt ?? startedAt;
}
1801
+ //#endregion
1802
+ //#region ../runner/src/evalSummaries.ts
1803
/**
 * Build the API/UI summary payload for one discovered eval.
 *
 * The eval's `sourceFingerprint` is used only to derive freshness and is left
 * out of the returned payload. `staleAfterDays` falls back to 14 when the
 * config does not set it.
 */
function buildEvalSummary(params) {
  const { meta, config, gitState, latestRun, lastRunStatus } = params;
  const { sourceFingerprint: currentEvalSourceFingerprint, ...publicMeta } = meta;
  const staleAfterDays = config.staleAfterDays ?? 14;
  const { stale, outdated, freshnessStatus } = deriveEvalFreshness({
    latestRun,
    gitState,
    currentEvalSourceFingerprint,
    staleAfterDays,
  });
  return {
    ...publicMeta,
    stale,
    outdated,
    freshnessStatus,
    latestRunAt: latestRun?.startedAt ?? null,
    latestRunCommitSha: latestRun?.commitSha ?? null,
    currentCommitSha: gitState.commitSha,
    lastRunStatus,
  };
}
1824
/**
 * Resolve which eval ids a run request should mark as the latest run.
 *
 * An explicit, non-empty `request.target.evalIds` is narrowed to the ids in
 * `knownEvalIds`; otherwise every id in `sortedEvalIds` is targeted.
 */
function getTargetEvalIds(params) {
  const { request, sortedEvalIds, knownEvalIds } = params;
  const requestedEvalIds = request.target.evalIds;
  const hasExplicitTargets = Boolean(requestedEvalIds) && requestedEvalIds.length > 0;
  return hasExplicitTargets
    ? requestedEvalIds.filter((evalId) => knownEvalIds.has(evalId))
    : sortedEvalIds;
}
1830
/**
 * Write one latest-run snapshot to each targeted eval id.
 * The same `info` object is stored under every id in `evalIds`.
 */
function setLatestRunInfoMap(params) {
  for (const targetEvalId of params.evalIds) {
    params.latestRunInfoMap.set(targetEvalId, params.info);
  }
}
1835
+ //#endregion
1836
+ //#region ../runner/src/gitState.ts
1837
/**
 * Run a git subcommand synchronously inside `workspaceRoot`.
 *
 * stdin and stderr are ignored; stdout is captured as UTF-8 text.
 *
 * @param workspaceRoot Directory used as the command's working directory.
 * @param args Arguments passed to the `git` executable.
 * @returns `{ status, stdout }` with the exit status reported by `spawnSync`
 *   (`null` when the process did not exit normally or could not be started)
 *   and the trimmed stdout (an empty string when nothing was captured).
 */
function runGitCommand(workspaceRoot, args) {
  const result = spawnSync("git", args, {
    cwd: workspaceRoot,
    encoding: "utf8",
    stdio: ["ignore", "pipe", "ignore"],
  });
  return {
    status: result.status,
    // When the process cannot be spawned (git not installed, missing cwd)
    // spawnSync reports `stdout: null`; treat that as empty output instead of
    // throwing so callers can fall back to "no git state".
    stdout: typeof result.stdout === "string" ? result.stdout.trim() : "",
  };
}
1852
/**
 * Read the current git commit for the workspace, if available.
 *
 * @returns `{ commitSha }` — the output of `git rev-parse HEAD`, or `null`
 *   when the directory is not inside a work tree or the command fails.
 */
function readGitWorktreeState(workspaceRoot) {
  const worktreeProbe = runGitCommand(workspaceRoot, ["rev-parse", "--is-inside-work-tree"]);
  const isInsideWorktree = worktreeProbe.status === 0 && worktreeProbe.stdout === "true";
  if (!isInsideWorktree) return { commitSha: null };

  const headProbe = runGitCommand(workspaceRoot, ["rev-parse", "HEAD"]);
  if (headProbe.status !== 0) return { commitSha: null };
  return { commitSha: headProbe.stdout };
}
1859
+ //#endregion
1860
+ //#region ../runner/src/outputArtifacts.ts
1861
/** Preferred file extension (including the leading dot) for well-known MIME types. */
const mimeTypeExtensionMap = Object.fromEntries([
  ["application/json", ".json"],
  ["application/pdf", ".pdf"],
  ["audio/mpeg", ".mp3"],
  ["audio/mp4", ".m4a"],
  ["audio/wav", ".wav"],
  ["image/gif", ".gif"],
  ["image/jpeg", ".jpg"],
  ["image/png", ".png"],
  ["image/svg+xml", ".svg"],
  ["image/webp", ".webp"],
  ["text/html", ".html"],
  ["text/markdown", ".md"],
  ["text/plain", ".txt"],
  ["video/mp4", ".mp4"],
  ["video/webm", ".webm"],
]);
1878
/**
 * Persist a `Blob`/`File` emitted via `setEvalOutput(...)` into the current
 * run's artifact directory and return the resulting run artifact reference.
 *
 * The artifact id is the `__`-joined sequence of the sanitized run id, case
 * id, `t<trial>`, output key and file name; it is also the name of the file
 * written inside `artifactDir` (created if missing).
 *
 * @returns `{ source: "run", artifactId, mimeType, fileName }`.
 */
async function persistInlineArtifact({ artifactDir, runId, caseId, outputKey, trial, value }) {
  await mkdir(artifactDir, { recursive: true });

  const mimeType = normalizeMimeType(value.type);
  const fileName = getArtifactFileName({ outputKey, mimeType, value });

  const idSegments = [
    sanitizeSegment(runId),
    sanitizeSegment(caseId),
    `t${String(trial)}`,
    sanitizeSegment(outputKey),
    sanitizeFileName(fileName),
  ];
  const artifactId = idSegments.join("__");

  const targetPath = join(artifactDir, artifactId);
  const bytes = new Uint8Array(await value.arrayBuffer());
  await writeFile(targetPath, bytes);

  return { source: "run", artifactId, mimeType, fileName };
}
1905
/**
 * Resolve a persisted run artifact path from its artifact id.
 *
 * The run id is the part of the artifact id before the first `__`; the file
 * is expected at `<runsDir>/<runId>/artifacts/<artifactId>`.
 *
 * Because the id is turned into path segments, ids that contain a path
 * separator or NUL byte, or whose run segment is `.`/`..`, are rejected so a
 * crafted id cannot resolve outside `runsDir`. Ids produced by
 * `persistInlineArtifact` never contain such characters.
 *
 * @param runsDir Directory holding one sub-directory per run.
 * @param artifactId Artifact id as returned when the artifact was persisted.
 * @returns The artifact's file path, or `undefined` when the id is empty or unsafe.
 */
function resolveArtifactPath(runsDir, artifactId) {
  if (/[\\/\0]/.test(artifactId)) return undefined;
  const [runId] = artifactId.split("__", 1);
  if (!runId || runId === "." || runId === "..") return undefined;
  return join(runsDir, runId, "artifacts", artifactId);
}
1911
/**
 * Trim a MIME type string, falling back to `application/octet-stream` when
 * nothing remains after trimming.
 */
function normalizeMimeType(value) {
  const trimmed = value.trim();
  if (trimmed.length === 0) return "application/octet-stream";
  return trimmed;
}
1915
/**
 * Choose the display file name for an inline artifact.
 *
 * A `File` with a non-blank name keeps that (trimmed) name. Otherwise the name
 * is the sanitized output key followed by the extension derived from the MIME
 * type, when one can be derived.
 */
function getArtifactFileName(params) {
  const { outputKey, mimeType, value } = params;
  if (isFile(value)) {
    const providedName = value.name.trim();
    if (providedName.length > 0) return providedName;
  }
  const extension = getExtensionForMimeType(mimeType);
  const baseName = sanitizeSegment(outputKey);
  return extension.length > 0 ? `${baseName}${extension}` : baseName;
}
1921
/**
 * Derive a file extension (with leading dot) from a MIME type.
 *
 * Parameters after `;` (for example `;charset=utf-8`) are ignored and the
 * type is compared case-insensitively, so `text/plain;charset=utf-8` maps to
 * `.txt` rather than leaking the parameter into the extension. Known types
 * use `mimeTypeExtensionMap`; unknown types fall back to the subtype with any
 * structured-syntax suffix (`+json`, `+xml`, ...) removed.
 *
 * @param mimeType MIME type string, optionally with parameters.
 * @returns The extension, or an empty string when none can be derived.
 */
function getExtensionForMimeType(mimeType) {
  const essence = mimeType.split(";", 1)[0].trim().toLowerCase();
  // Own-property check so values such as "constructor" never resolve to
  // members inherited from Object.prototype.
  if (Object.hasOwn(mimeTypeExtensionMap, essence)) return mimeTypeExtensionMap[essence];
  const subtype = essence.split("/")[1];
  if (subtype === undefined || subtype.length === 0) return "";
  const withoutSuffix = subtype.split("+")[0] ?? subtype;
  return withoutSuffix.length > 0 ? `.${withoutSuffix}` : "";
}
1929
/**
 * Make a string safe to use as one segment of an artifact id: trims it and
 * collapses every run of characters outside `A-Z a-z 0-9 . _ -` into a single
 * `-`. Returns `"artifact"` when the result would be empty.
 */
function sanitizeSegment(value) {
  const cleaned = value.trim().replace(/[^A-Za-z0-9._-]+/g, "-");
  if (cleaned.length === 0) return "artifact";
  return cleaned;
}
1933
/**
 * Sanitize a file name for use as the last segment of an artifact id.
 *
 * After segment sanitization, dots in the stem are replaced with `-` so only
 * the final extension (as reported by `extname`) keeps its dot.
 */
function sanitizeFileName(value) {
  const safeName = sanitizeSegment(value);
  const extension = extname(safeName);
  if (extension === "") return safeName;
  const stem = safeName.slice(0, safeName.length - extension.length);
  return `${stem.replaceAll(".", "-")}${extension}`;
}
1939
/** Return `true` when `value` is an instance of the global `File` class. */
function isFile(value) {
  const result = value instanceof File;
  return result;
}
1942
+ //#endregion
1943
+ //#region ../runner/src/runMaintenance.ts
1944
/**
 * Write a run's state into its run directory, one file after another:
 * `summary.json` and `run.json` (pretty-printed JSON) followed by
 * `cases.jsonl` (one JSON document per case row, newline separated).
 */
async function persistRunState(runState) {
  const toPrettyJson = (value) => JSON.stringify(value, null, 2);
  const pathInRunDir = (fileName) => join(runState.runDir, fileName);

  await writeFile(pathInRunDir("summary.json"), toPrettyJson(runState.summary));
  await writeFile(pathInRunDir("run.json"), toPrettyJson(runState.manifest));

  const serializedCases = runState.cases.map((caseRow) => JSON.stringify(caseRow));
  await writeFile(pathInRunDir("cases.jsonl"), serializedCases.join("\n"));
}
1950
/**
 * Recompute a persisted case's status after score definitions changed.
 *
 * Precedence, first match wins:
 * 1. a cancelled case stays `"cancelled"`;
 * 2. a case detail carrying an error is `"error"`;
 * 3. any recorded assertion failure makes the case `"fail"`;
 * 4. any score with a threshold whose numeric value (read from the case row,
 *    falling back to the case detail) is below that threshold makes it `"fail"`;
 * 5. otherwise a row already marked `"error"` stays `"error"`, and everything
 *    else is `"pass"`.
 *
 * Non-numeric score values are ignored, so scores without a usable value
 * never gate.
 *
 * @param caseRow Persisted case row (`status`, `columns`).
 * @param caseDetail Optional persisted case detail (`error`, `assertionFailures`, `columns`).
 * @param scoreThresholds Iterable of `[scoreKey, passThreshold]` pairs.
 */
function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
  if (caseRow.status === "cancelled") return "cancelled";

  const recordedError = caseDetail?.error;
  if (recordedError !== null && recordedError !== undefined) return "error";

  const assertionFailureCount = caseDetail?.assertionFailures.length ?? 0;
  if (assertionFailureCount > 0) return "fail";

  for (const [scoreKey, threshold] of scoreThresholds) {
    const score = caseRow.columns[scoreKey] ?? caseDetail?.columns[scoreKey];
    if (typeof score === "number" && score < threshold) return "fail";
  }

  if (caseRow.status === "error") return "error";
  return "pass";
}
1969
/**
 * Decide whether a run involved the given eval.
 *
 * A run touches an eval when one of its case rows belongs to it. Without such
 * a row, a run targeting `"all"` touches the eval only if the eval exists, and
 * a run targeting `"evalIds"` touches it only if the id was listed.
 */
function runTouchesEval(params) {
  const { caseRows, evalId, target } = params;
  const hasMatchingCase = caseRows.some((caseRow) => caseRow.evalId === evalId);
  if (hasMatchingCase) return true;

  switch (target.mode) {
    case "all":
      return params.evalExists;
    case "evalIds":
      return target.evalIds?.includes(evalId) ?? false;
    default:
      return false;
  }
}
1975
/**
 * Recompute the status of every persisted case of one eval across runs.
 *
 * Runs that do not touch the eval, and runs still marked `"running"`, are
 * left alone. For each remaining run, every case row of the eval gets its
 * status recomputed; changed rows (and their case details, when loaded) are
 * updated and the details persisted via `params.persistCaseDetail`. When at
 * least one case changed, the run's summary counters are re-derived from its
 * case rows and the run state is written back to disk.
 *
 * @param params `{ runs, evalId, evalExists, scoreThresholds, persistCaseDetail }`.
 * @returns The number of runs that were modified and persisted.
 */
async function recomputeEvalStatusesInRuns(params) {
  let updatedRuns = 0;
  for (const run of params.runs) {
    const touchesEval = runTouchesEval({
      target: run.manifest.target,
      caseRows: run.cases,
      evalId: params.evalId,
      evalExists: params.evalExists,
    });
    if (!touchesEval) continue;
    if (run.manifest.status === "running") continue;

    let anyCaseChanged = false;
    for (const caseRow of run.cases) {
      if (caseRow.evalId !== params.evalId) continue;
      const caseDetail = run.caseDetails.get(caseRow.caseId);
      const recomputedStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
      if (recomputedStatus === caseRow.status) continue;

      caseRow.status = recomputedStatus;
      if (caseDetail) {
        caseDetail.status = recomputedStatus;
        await params.persistCaseDetail(run.runDir, caseDetail);
      }
      anyCaseChanged = true;
    }
    if (!anyCaseChanged) continue;

    const { totalCases, passedCases, failedCases, errorCases, cancelledCases } = deriveScopedSummaryFromCases({
      caseRows: run.cases,
    });
    Object.assign(run.summary, { totalCases, passedCases, failedCases, errorCases, cancelledCases });
    await persistRunState(run);
    updatedRuns += 1;
  }
  return updatedRuns;
}
2010
+ //#endregion
2011
+ //#region ../runner/src/traceDisplay.ts
2012
/** Return `true` for any non-null value whose `typeof` is `"object"` (arrays included). */
function isRecord$1(value) {
  return value !== null && typeof value === "object";
}
2015
/**
 * Read a value from a nested object using a dot-separated path.
 *
 * @param value Root object (may be `undefined`).
 * @param path Dot-separated property path, e.g. `"llm.usage.total"`.
 * @returns The value at the path, or `undefined` as soon as a path segment is
 *   missing or the current value is not an object.
 */
function getNestedAttribute(value, path) {
  let cursor = value;
  for (const segment of path.split(".")) {
    const isObject = typeof cursor === "object" && cursor !== null;
    if (!isObject || !(segment in cursor)) return undefined;
    cursor = cursor[segment];
  }
  return cursor;
}
2024
/**
 * Return a copy of `value` with `attributeValue` set at a dot-separated path.
 *
 * The root and every object along the path are shallow-copied, so the input
 * is not mutated. Intermediate segments that are missing or not objects are
 * replaced by fresh empty objects.
 *
 * @param value Source object, or `undefined` to start from an empty object.
 * @param path Dot-separated property path.
 * @param attributeValue Value to store at the final path segment.
 * @returns The new root object.
 */
function mergeNestedAttribute(value, path, attributeValue) {
  const root = value === undefined ? {} : { ...value };
  const segments = path.split(".");
  const leafKey = segments.pop();

  let cursor = root;
  for (const segment of segments) {
    const existing = cursor[segment];
    const copy = typeof existing === "object" && existing !== null ? { ...existing } : {};
    cursor[segment] = copy;
    cursor = copy;
  }
  cursor[leafKey] = attributeValue;
  return root;
}
2040
/**
 * Combine global and eval-level trace display settings and apply attribute
 * transforms to a copy of the spans.
 *
 * Attributes are keyed by `key ?? path`; an eval-level attribute replaces a
 * global one with the same key. For an attribute with a `transform`, the
 * transformed value of each span is stored under `__display.<key>` in that
 * span's (copied) attributes and the resolved attribute's `path` points
 * there; spans lacking the source value, and transforms returning
 * `undefined`, are skipped.
 *
 * @param spans Recorded spans; they are shallow-copied, never mutated.
 * @param globalTraceDisplay Optional workspace-wide trace display config.
 * @param evalTraceDisplay Optional eval-specific trace display config.
 * @returns `{ trace, traceDisplay: { attributes } }`.
 */
function resolveTracePresentation(spans, globalTraceDisplay, evalTraceDisplay) {
  const attributesByKey = new Map();
  const configuredAttributes = [
    ...(globalTraceDisplay?.attributes ?? []),
    ...(evalTraceDisplay?.attributes ?? []),
  ];
  for (const attribute of configuredAttributes) {
    attributesByKey.set(attribute.key ?? attribute.path, attribute);
  }

  const transformedTrace = spans.map((span) => {
    const spanCopy = { ...span };
    spanCopy.attributes = span.attributes === undefined ? undefined : { ...span.attributes };
    return spanCopy;
  });

  const resolvedAttributes = [];
  for (const [displayKey, attribute] of attributesByKey) {
    const hasTransform = Boolean(attribute.transform);
    const resolvedPath = hasTransform ? `__display.${displayKey}` : attribute.path;
    resolvedAttributes.push({
      key: attribute.key,
      path: resolvedPath,
      label: attribute.label,
      format: attribute.format,
      numberFormat: attribute.numberFormat,
      placements: attribute.placements,
      scope: attribute.scope,
      mode: attribute.mode,
    });
    if (!hasTransform) continue;

    for (const span of transformedTrace) {
      const sourceValue = getNestedAttribute(span.attributes, attribute.path);
      if (sourceValue === undefined) continue;
      const transformedValue = attribute.transform({ value: sourceValue, span });
      if (transformedValue !== undefined) {
        span.attributes = mergeNestedAttribute(span.attributes, resolvedPath, transformedValue);
      }
    }
  }

  return {
    trace: transformedTrace,
    traceDisplay: { attributes: resolvedAttributes },
  };
}
2078
+ //#endregion
2079
+ //#region ../runner/src/runExecution.ts
2080
/**
 * Narrow an eval's cases to those selected by a run request.
 *
 * @param cases All cases of the eval.
 * @param evalIds Optional eval-id selection; when non-empty and it does not
 *   contain `evalId`, no case is selected.
 * @param caseIds Optional case-id selection; when empty or missing, every case is kept.
 * @param evalId Id of the eval the cases belong to.
 * @returns The selected cases (the original array when nothing filters).
 */
function filterEvalCases(cases, evalIds, caseIds, evalId) {
  const evalExcluded = Boolean(evalIds) && evalIds.length > 0 && !evalIds.includes(evalId);
  if (evalExcluded) return [];
  if (!caseIds || caseIds.length === 0) return cases;

  const wantedCaseIds = new Set(caseIds);
  return cases.filter(({ id }) => wantedCaseIds.has(id));
}
2086
/**
 * Ensure an eval has at least one case to run.
 *
 * When the eval declares no cases, a single placeholder case with the id
 * `<evalId>-no-output` and an empty input is returned instead.
 */
function resolveRunnableEvalCases(params) {
  const { cases, evalId } = params;
  if (cases.length === 0) {
    const placeholderCase = { id: `${evalId}-no-output`, input: {} };
    return [placeholderCase];
  }
  return cases;
}
2094
/**
 * Call `fn` with the given argument list (and an `undefined` `this`) and
 * resolve to whatever it returns, awaiting promises.
 */
async function callWithUnknownResult(fn, args) {
  const outcome = await Reflect.apply(fn, undefined, args);
  return outcome;
}
2097
/**
 * Execute one trial of an eval case and build its persisted result.
 *
 * Steps, in order:
 * 1. run `evalDef.execute` inside an eval scope (with a cache context when a
 *    cache adapter is supplied);
 * 2. classify a thrown error: an `EvalAssertionError` becomes an assertion
 *    failure, anything else marks the case as errored;
 * 3. unless the case errored, merge `deriveFromTracing` outputs (never
 *    overriding outputs already set) and compute every declared score in its
 *    own scope; a score that throws or returns a non-number is recorded as `0`
 *    together with an assertion failure;
 * 4. derive the status: `"error"` for non-assertion errors, `"fail"` when
 *    there is any assertion failure or a score below its `passThreshold`,
 *    otherwise `"pass"`;
 * 5. convert outputs into column cells (persisting `Blob` outputs as run
 *    artifacts) and initialise every manual score column to `null`.
 *
 * @param params Case execution context (`evalDef`, `evalId`, `evalCase`,
 *   `globalTraceDisplay`, `trial`, `signal`, `startTime`, `cacheAdapter`,
 *   `cacheMode`, `codeFingerprint`, `artifactDir`, `runId`).
 * @returns `{ caseDetail, caseRowUpdate }` where `caseRowUpdate` carries the
 *   status, the latency measured from `startTime`, and the columns.
 */
async function runCase(params) {
  const {
    evalDef,
    evalId,
    evalCase,
    globalTraceDisplay,
    trial,
    signal,
    startTime,
    cacheAdapter,
    cacheMode,
    codeFingerprint,
    artifactDir,
    runId,
  } = params;

  // Cache context for a scoped run; omitted when no adapter is configured.
  const buildCacheContext = (scopedEvalId) =>
    cacheAdapter
      ? {
          adapter: cacheAdapter,
          mode: cacheMode,
          evalId: scopedEvalId,
          codeFingerprint,
        }
      : undefined;

  const execution = await runInEvalScope(
    evalCase.id,
    async () => {
      await Reflect.apply(evalDef.execute, evalDef, [{ input: evalCase.input, signal }]);
    },
    { cacheContext: buildCacheContext(evalId) },
  );
  const { scope } = execution;
  const executeError = execution.error;
  const traceTree = buildTraceTree(scope.spans, scope.checkpoints);

  const isAssertionError = executeError instanceof EvalAssertionError;
  const nonAssertError = executeError && !isAssertionError ? executeError : null;
  if (isAssertionError && scope.assertionFailures.length === 0) {
    scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
  }

  if (!nonAssertError && evalDef.deriveFromTracing) {
    try {
      const derived = await callWithUnknownResult(evalDef.deriveFromTracing, [
        { trace: traceTree, input: evalCase.input, case: evalCase },
      ]);
      if (!isRecord(derived)) throw new Error("deriveFromTracing must return an object");
      for (const [key, value] of Object.entries(derived)) {
        // Outputs set explicitly during execution win over derived ones.
        if (key in scope.outputs) continue;
        scope.outputs[key] = value;
      }
    } catch (e) {
      const thrownError = e instanceof Error ? e : undefined;
      const reason = thrownError ? thrownError.message : String(e);
      scope.assertionFailures.push(toAssertionFailure(`deriveFromTracing threw: ${reason}`, thrownError));
    }
  }

  const scoreResults = new Map();
  const scoringTraces = {};
  const recordScore = (key, value, passThreshold, label) => {
    scope.outputs[key] = value;
    scoreResults.set(key, { value, passThreshold, label });
  };

  if (!nonAssertError && evalDef.scores) {
    for (const [key, def] of Object.entries(evalDef.scores)) {
      const { compute, passThreshold, label } = normalizeScoreDef(def);
      const scoreRun = await runInEvalScope(
        evalCase.id,
        async () =>
          await callWithUnknownResult(compute, [
            { input: evalCase.input, outputs: { ...scope.outputs }, case: evalCase },
          ]),
        { cacheContext: buildCacheContext(`${evalId}__score__${key}`) },
      );

      const scorePresentation = resolveTracePresentation(
        scoreRun.scope.spans,
        globalTraceDisplay,
        evalDef.traceDisplay,
      );
      if (scorePresentation.trace.length > 0) {
        scoringTraces[key] = {
          trace: scorePresentation.trace,
          traceDisplay: scorePresentation.traceDisplay,
        };
      }

      if (scoreRun.error) {
        const message = `score "${key}" threw: ${scoreRun.error.message}`;
        scope.assertionFailures.push(toAssertionFailure(message, scoreRun.error));
        recordScore(key, 0, passThreshold, label);
        continue;
      }

      const rawValue = scoreRun.result;
      if (typeof rawValue !== "number") {
        scope.assertionFailures.push(toAssertionFailure(`score "${key}" must return a number`));
        recordScore(key, 0, passThreshold, label);
        continue;
      }
      recordScore(key, rawValue, passThreshold, label);
    }
  }

  const hasScoreBelowThreshold = () =>
    [...scoreResults.values()].some(
      (scoreEntry) => scoreEntry.passThreshold !== undefined && scoreEntry.value < scoreEntry.passThreshold,
    );
  const passed = scope.assertionFailures.length === 0 && !nonAssertError && !hasScoreBelowThreshold();

  let status = "fail";
  if (nonAssertError) status = "error";
  else if (passed) status = "pass";

  const { trace: displayTrace, traceDisplay } = resolveTracePresentation(
    scope.spans,
    globalTraceDisplay,
    evalDef.traceDisplay,
  );

  const columns = {};
  for (const [key, value] of Object.entries(scope.outputs)) {
    let cell;
    if (isBlob(value)) {
      cell = await persistInlineArtifact({
        artifactDir,
        runId,
        caseId: evalCase.id,
        outputKey: key,
        trial,
        value,
      });
    } else {
      cell = toCellValue(value, evalDef.columns?.[key]);
    }
    if (cell !== undefined) columns[key] = cell;
  }
  // Manual scores start out unset until someone scores the case.
  for (const key of Object.keys(evalDef.manualScores ?? {})) {
    columns[key] = null;
  }

  const errorInfo = nonAssertError
    ? {
        name: nonAssertError.name,
        message: nonAssertError.message,
        stack: nonAssertError.stack,
      }
    : null;

  const caseDetail = {
    caseId: evalCase.id,
    evalId,
    status,
    input: evalCase.input,
    trace: displayTrace,
    traceDisplay,
    columns,
    assertionFailures: scope.assertionFailures,
    error: errorInfo,
    trial,
  };
  if (Object.keys(scoringTraces).length > 0) caseDetail.scoringTraces = scoringTraces;

  return {
    caseDetail,
    caseRowUpdate: {
      status,
      latencyMs: Date.now() - startTime,
      columns,
    },
  };
}
2223
/** Return `true` for any non-null value whose `typeof` is `"object"` (arrays included). */
function isRecord(value) {
  return value !== null && typeof value === "object";
}
2226
/** Return `true` when `value` is an instance of the global `Blob` class (which includes `File`). */
function isBlob(value) {
  const result = value instanceof Blob;
  return result;
}
2229
/**
 * Build an assertion-failure record.
 *
 * @param message Human-readable failure message.
 * @param error Optional error whose stack trace should be attached.
 * @returns `{ message, stack }` when the error has a non-empty stack,
 *   otherwise `{ message }`.
 */
function toAssertionFailure(message, error = undefined) {
  const stack = error?.stack;
  if (!stack) return { message };
  return { message, stack };
}
2235
+ //#endregion
2236
+ //#region ../runner/src/runPersistence.ts
2237
/** Matches short run ids of the form `r<digits>` and captures the digits. */
const SHORT_ID_PATTERN = /^r(\d+)$/;
/**
 * Generate a filesystem-safe, sortable run id combining a UTC timestamp
 * with a short random suffix.
 *
 * Format: `YYYY-MM-DDTHH-MM-SSZ_<suffix>`, where the suffix is up to six
 * base-36 characters taken from `Math.random()`.
 */
function generateRunId() {
  const now = new Date();
  const twoDigits = (n) => String(n).padStart(2, "0");

  const datePart = [
    String(now.getUTCFullYear()),
    twoDigits(now.getUTCMonth() + 1),
    twoDigits(now.getUTCDate()),
  ].join("-");
  const timePart = [
    twoDigits(now.getUTCHours()),
    twoDigits(now.getUTCMinutes()),
    twoDigits(now.getUTCSeconds()),
  ].join("-");
  const randomSuffix = Math.random().toString(36).slice(2, 8);

  return `${datePart}T${timePart}Z_${randomSuffix}`;
}
2247
/**
 * Extract the numeric part of a short run id such as `r12`.
 *
 * @param shortId Short id, or `undefined` for runs that have none.
 * @returns The parsed number, or `null` when the id is missing, does not
 *   match `SHORT_ID_PATTERN`, or is too large to be a finite number.
 */
function parseShortIdNum(shortId) {
  if (shortId === undefined) return null;
  const digits = SHORT_ID_PATTERN.exec(shortId)?.[1];
  if (digits === undefined) return null;
  const parsed = Number(digits);
  return Number.isFinite(parsed) ? parsed : null;
}
2255
/**
 * Return the next `shortId` number to assign based on the existing loaded
 * snapshots: one more than the highest `r<number>` found, or `0` when none
 * exists. Legacy runs that don't match the `r\d+` format are ignored.
 */
function nextShortIdFromSnapshots(snapshots) {
  let highest = -1;
  for (const { manifest } of snapshots) {
    const parsed = parseShortIdNum(manifest.shortId);
    if (parsed === null) continue;
    if (parsed > highest) highest = parsed;
  }
  return highest + 1;
}
2268
/**
 * Load every readable run snapshot stored under `<localStateDir>/runs`.
 *
 * Run directories are visited in sorted path order, one at a time; runs whose
 * snapshot cannot be loaded are skipped. An unreadable `runs` directory
 * yields an empty list.
 */
async function loadPersistedRunSnapshots(localStateDir) {
  const runsDir = join(localStateDir, "runs");
  const listing = await resultify(() => readdir(runsDir, { withFileTypes: true }));
  if (listing.error) return [];

  const runDirs = [];
  for (const entry of listing.value) {
    if (entry.isDirectory()) runDirs.push(join(runsDir, entry.name));
  }
  runDirs.sort();

  const snapshots = [];
  for (const runDir of runDirs) {
    const snapshot = await loadPersistedRunSnapshot(runDir);
    if (snapshot) snapshots.push(snapshot);
  }
  return snapshots;
}
2281
/**
 * Write one case detail as pretty-printed JSON to
 * `<runDir>/case-details/<encoded case id>.json`.
 */
async function persistCaseDetail(runDir, caseDetail) {
  const fileName = `${encodeCaseDetailFileName(caseDetail.caseId)}.json`;
  const detailPath = join(runDir, "case-details", fileName);
  await writeFile(detailPath, JSON.stringify(caseDetail, null, 2));
}
2284
/**
 * Return a map from eval id to the status of that eval's latest run, taken
 * from `getLatestRunInfos(params)`.
 */
function getLastRunStatuses(params) {
  const statusByEvalId = new Map();
  for (const [evalId, info] of getLatestRunInfos(params)) {
    statusByEvalId.set(evalId, info.status);
  }
  return statusByEvalId;
}
2288
+ /**
2289
+ * Return the latest scoped run metadata for each eval based on persisted and
2290
+ * in-memory runs.
2291
+ */
2292
+ function getLatestRunInfos(params) {
2293
+ const { runs, knownEvals } = params;
2294
+ const knownEvalMetas = [...knownEvals];
2295
+ const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.id, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
2296
+ const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
2297
+ const latestRunInfos = /* @__PURE__ */ new Map();
2298
+ for (const run of orderedRuns) for (const evalId of getRunEvalIds(run, knownEvalMetas.map((evalMeta) => evalMeta.id))) latestRunInfos.set(evalId, {
2299
+ status: getEvalStatusForRun(run, evalId, manualScoreKeysByEval.get(evalId) ?? []),
2300
+ startedAt: getRunFreshnessTimestamp(run.manifest),
2301
+ commitSha: run.manifest.commitSha ?? null,
2302
+ evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalId] ?? null
2303
+ });
2304
+ return latestRunInfos;
2305
+ }
2306
/** Map a derived status to a last-run status: `"pending"` becomes `null`, anything else is kept. */
function toLastRunStatus$1(status) {
  if (status === "pending") return null;
  return status;
}
2309
/**
 * Load one run snapshot from its directory.
 *
 * `run.json` and `summary.json` must both exist and validate against their
 * schemas; otherwise `null` is returned. Case rows and case details are then
 * read from the same directory.
 *
 * @returns `{ runDir, manifest, summary, cases, caseDetails }` or `null`.
 */
async function loadPersistedRunSnapshot(runDir) {
  const manifest = await readParsedJsonFile(join(runDir, "run.json"), {
    safeParse: (value) => runManifestSchema.safeParse(value),
  });
  if (!manifest) return null;

  const summary = await readParsedJsonFile(join(runDir, "summary.json"), {
    safeParse: (value) => runSummarySchema.safeParse(value),
  });
  if (!summary) return null;

  const cases = await readCaseRows(runDir);
  const caseDetails = await readCaseDetails(runDir);
  return { runDir, manifest, summary, cases, caseDetails };
}
2322
/**
 * Read a JSON file and validate it with `schema.safeParse`.
 *
 * @param filePath File to read as UTF-8.
 * @param schema Object exposing `safeParse(value)` returning `{ success, data }`.
 * @returns The validated data, or `null` when the file cannot be read, is not
 *   valid JSON, or fails validation.
 */
async function readParsedJsonFile(filePath, schema) {
  const readOutcome = await resultify(() => readFile(filePath, "utf-8"));
  if (readOutcome.error) return null;

  const parseOutcome = resultify(() => JSON.parse(readOutcome.value));
  if (parseOutcome.error) return null;

  const validation = schema.safeParse(parseOutcome.value);
  return validation.success ? validation.data : null;
}
2331
/**
 * Read the case rows of a run from `<runDir>/cases.jsonl`.
 *
 * Blank lines, lines that are not valid JSON, and rows that fail
 * `caseRowSchema` validation are skipped. A missing or unreadable file yields
 * an empty list.
 */
async function readCaseRows(runDir) {
  const readOutcome = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
  if (readOutcome.error) return [];

  const rows = [];
  for (const rawLine of readOutcome.value.split("\n")) {
    const line = rawLine.trim();
    if (line === "") continue;

    const parseOutcome = resultify(() => JSON.parse(line));
    if (parseOutcome.error) continue;

    const validation = caseRowSchema.safeParse(parseOutcome.value);
    if (validation.success) rows.push(validation.data);
  }
  return rows;
}
2346
/**
 * Read every case detail stored in `<runDir>/case-details`.
 *
 * Only regular `*.json` files are considered; files that cannot be read or do
 * not validate against `caseDetailSchema` are skipped.
 *
 * @returns Map from case id to case detail (empty when the directory cannot be listed).
 */
async function readCaseDetails(runDir) {
  const detailsDir = join(runDir, "case-details");
  const caseDetails = new Map();

  const listing = await resultify(() => readdir(detailsDir, { withFileTypes: true }));
  if (listing.error) return caseDetails;

  for (const entry of listing.value) {
    const isJsonFile = entry.isFile() && entry.name.endsWith(".json");
    if (!isJsonFile) continue;

    const detail = await readParsedJsonFile(join(detailsDir, entry.name), {
      safeParse: (value) => caseDetailSchema.safeParse(value),
    });
    if (detail) caseDetails.set(detail.caseId, detail);
  }
  return caseDetails;
}
2359
/**
 * List the eval ids a run applies to.
 *
 * Starts from the evals that have case rows in the run. A run targeting
 * explicit eval ids also includes those ids; a run targeting `"all"` that has
 * no case rows is attributed to every known eval.
 *
 * @returns De-duplicated eval ids in insertion order.
 */
function getRunEvalIds(run, knownEvalIds) {
  const evalIds = new Set();
  for (const caseRow of run.cases) evalIds.add(caseRow.evalId);

  const { target } = run.manifest;
  if (target.mode === "evalIds") {
    for (const targetedId of target.evalIds ?? []) evalIds.add(targetedId);
  } else if (target.mode === "all" && evalIds.size === 0) {
    for (const knownId of knownEvalIds) evalIds.add(knownId);
  }
  return Array.from(evalIds);
}
2365
/**
 * Determine the status of one eval within a run.
 *
 * With case rows for the eval: `"unscored"` while any manual score is still
 * missing, otherwise the status derived from those rows. Without case rows
 * the status is derived from the run's lifecycle status alone. A derived
 * `"pending"` is reported as `null`.
 */
function getEvalStatusForRun(run, evalId, manualScoreKeys) {
  const evalCases = run.cases.filter((caseRow) => caseRow.evalId === evalId);
  if (evalCases.length === 0) {
    const lifecycleDerived = deriveStatusFromChildStatuses({
      statuses: [],
      lifecycleStatus: run.manifest.status,
    });
    return toLastRunStatus$1(lifecycleDerived);
  }
  if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
  return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
}
2376
/**
 * Return `true` when at least one case row lacks a finite numeric value for
 * at least one of the given manual score columns. Always `false` when there
 * are no manual score keys.
 */
function hasPendingManualScores(caseRows, manualScoreKeys) {
  if (manualScoreKeys.length === 0) return false;
  // Number.isFinite is false for every non-number, so it covers both the
  // "not a number" and the "NaN/Infinity" cases.
  return caseRows.some(({ columns }) => manualScoreKeys.some((key) => !Number.isFinite(columns[key])));
}
2383
/** Encode a case id with `encodeURIComponent` so it can serve as a case-detail file name. */
function encodeCaseDetailFileName(caseId) {
  const encoded = encodeURIComponent(caseId);
  return encoded;
}
2386
+ //#endregion
2387
+ //#region ../runner/src/runQueue.ts
2388
/**
 * Run the queued cases with a bounded number of concurrent workers.
 *
 * Each worker repeatedly takes the next unclaimed case and executes it until
 * the queue is exhausted, the run's abort signal fires, or any worker has
 * failed. After all workers settle, the first recorded failure is rethrown;
 * thrown non-`Error` values are wrapped in an `Error`.
 *
 * A `concurrency` below 1 (or one that is not a number) is treated as 1, so a
 * misconfigured value cannot silently skip every queued case.
 *
 * @param params `{ runState, queuedCases, concurrency, globalTraceDisplay }`.
 * @throws The first error raised while executing a queued case.
 */
async function executeQueuedCases(params) {
  const { runState, queuedCases, concurrency, globalTraceDisplay } = params;
  const effectiveConcurrency = concurrency >= 1 ? concurrency : 1;
  const workerCount = Math.min(effectiveConcurrency, queuedCases.length);

  let nextCaseIndex = 0;
  let workerError;

  const runWorker = async () => {
    while (!runState.abortController.signal.aborted && workerError === undefined) {
      // Claiming a case is synchronous, so two workers never take the same index.
      const queuedCase = queuedCases[nextCaseIndex];
      nextCaseIndex += 1;
      if (queuedCase === undefined) return;
      try {
        await executeQueuedCase({ queuedCase, runState, globalTraceDisplay });
      } catch (error) {
        workerError = error instanceof Error ? error : new Error(String(error));
        return;
      }
    }
  };

  await Promise.all(Array.from({ length: workerCount }, runWorker));
  if (workerError !== undefined) throw workerError;
}
2414
/**
 * Execute a single queued case and hand its result to the case's
 * `onComplete` callback. The start time passed to `execute` is taken
 * immediately before the call.
 */
async function executeQueuedCase(params) {
  const { queuedCase, runState, globalTraceDisplay } = params;
  const startTime = Date.now();
  const executionContext = {
    globalTraceDisplay,
    signal: runState.abortController.signal,
    startTime,
  };
  const result = await queuedCase.execute(executionContext);
  await queuedCase.onComplete(result);
}
2424
+ //#endregion
2425
+ //#region ../runner/src/runOrchestration.ts
2426
/**
 * Ranks case statuses from worst to best: `error` (0), then everything that
 * is neither `error` nor `pass` (1), then `pass` (2). Used to order trial
 * attempts so the pessimistic (`lowestScore`) strategy can pick the worst
 * attempt; any other status is treated the same as `fail`.
 */
function statusRank(status) {
  switch (status) {
    case "error":
      return 0;
    case "pass":
      return 2;
    default:
      return 1;
  }
}
2437
/**
 * Returns the minimum finite numeric value across the given score columns of
 * a trial, or `-Infinity` when none of them holds a finite number. Used as a
 * tiebreaker between trials that share the same status.
 */
function minScoreValue(caseRow, scoreKeys) {
  let lowest = Number.POSITIVE_INFINITY;
  for (const scoreKey of scoreKeys) {
    const candidate = caseRow.columns[scoreKey];
    if (typeof candidate !== "number" || !Number.isFinite(candidate)) continue;
    if (candidate < lowest) lowest = candidate;
  }
  return lowest === Number.POSITIVE_INFINITY ? Number.NEGATIVE_INFINITY : lowest;
}
2452
+ function compareTrialResults(left, right, scoreKeys) {
2453
+ const statusDiff = statusRank(left.caseRow.status) - statusRank(right.caseRow.status);
2454
+ if (statusDiff !== 0) return statusDiff;
2455
+ const scoreDiff = minScoreValue(left.caseRow, scoreKeys) - minScoreValue(right.caseRow, scoreKeys);
2456
+ if (scoreDiff !== 0) return scoreDiff;
2457
+ return left.caseRow.trial - right.caseRow.trial;
2458
+ }
2459
/**
 * Pick the trial attempt that represents a case.
 *
 * Attempts are ordered from worst to best with `compareTrialResults`. The
 * `"lowestScore"` strategy returns the worst attempt; every other strategy
 * returns the median attempt (the lower of the two middle ones for an even
 * count).
 *
 * @param params `{ attempts, scoreKeys, strategy }`.
 * @throws Error when `attempts` is empty.
 */
function pickWinningTrial(params) {
  const { attempts, scoreKeys, strategy } = params;
  const rankedAttempts = [...attempts].sort((left, right) => compareTrialResults(left, right, scoreKeys));

  const winnerIndex = strategy === "lowestScore" ? 0 : Math.floor((rankedAttempts.length - 1) / 2);
  const winner = rankedAttempts[winnerIndex];
  if (winner === undefined) throw new Error("Expected at least one trial attempt");
  return winner;
}
2470
/**
 * Execute one run end to end and record the outcome on `runState`.
 *
 * Phases, in order:
 *  1. For every targeted eval: fingerprint its source file, import the module,
 *     resolve and filter its cases, and queue one task per (case, trial).
 *  2. Execute all queued tasks through `executeQueuedCases`.
 *  3. For every case that produced at least one trial result: pick the winning
 *     trial, commit its buffered cache writes, update the summary counters and
 *     persist its trace and case detail.
 *  4. Finalize manifest/summary, emit the terminal events and persist the run.
 *
 * The function never rejects: any failure is recorded on the run (status
 * "error") and reported via a `run.error` event, because the caller starts it
 * without awaiting the returned promise.
 *
 * @param options Run state, the run request, and the runner-provided
 *   collaborators (config, cache store, status maps, event emitters, helpers).
 * @returns A promise that resolves once the run has been finalized.
 */
async function executeRun({ runState, request, runDir, config, evals, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals }) {
	try {
		const targetEvals = getTargetEvals(request);
		emitEvent(runState, {
			type: "run.started",
			runId: runState.manifest.id,
			timestamp: new Date().toISOString(),
			payload: runState.manifest
		});
		const evalErrors = [];
		const queuedCases = [];
		const preparedEvals = [];
		const cacheMode = runState.manifest.cacheMode ?? "use";
		const cacheEnabled = config.cache?.enabled !== false;
		for (const evalMeta of targetEvals) {
			if (runState.abortController.signal.aborted) break;
			const evalFilePath = evalMeta.sourceFilePath;
			// An unreadable source file is not fatal here: the fingerprint is simply omitted.
			let codeFingerprint = "";
			try {
				codeFingerprint = getSourceFingerprint(await readFile(evalFilePath, "utf-8"));
			} catch {
				codeFingerprint = "";
			}
			if (codeFingerprint.length > 0) runState.manifest.evalSourceFingerprints[evalMeta.id] = codeFingerprint;
			else delete runState.manifest.evalSourceFingerprints[evalMeta.id];
			try {
				const registry = getEvalRegistry();
				await loadEvalModule(evalFilePath, codeFingerprint);
				const entry = registry.get(evalMeta.id);
				if (!entry) {
					evalErrors.push({
						evalId: evalMeta.id,
						message: `Eval "${evalMeta.id}" was not registered after importing ${evalFilePath}`
					});
					continue;
				}
				await entry.use(async (evalDef) => {
					const cases = filterEvalCases(resolveRunnableEvalCases({
						cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
						evalId: evalMeta.id
					}), request.target.evalIds, request.target.caseIds, evalMeta.id);
					runState.summary.totalCases += cases.length;
					const accumulatedColumns = new Map();
					const evalCaseRows = [];
					const preparedCases = [];
					const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
					const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
					preparedEvals.push({
						evalMeta,
						accumulatedColumns,
						evalCaseRows,
						preparedCases,
						scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
						mergeColumns: (columns) => {
							mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
						}
					});
					for (const evalCase of cases) {
						if (runState.abortController.signal.aborted) break;
						const trialResults = [];
						preparedCases.push({
							caseId: evalCase.id,
							trialResults
						});
						for (let trial = 0; trial < request.trials; trial++) {
							// Each trial buffers its cache writes; only the winning trial's buffer is committed later.
							const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
							queuedCases.push({
								execute: async ({ startTime, signal, globalTraceDisplay }) => {
									const { caseDetail, caseRowUpdate } = await runCase({
										evalDef,
										evalId: evalMeta.id,
										evalCase,
										globalTraceDisplay,
										trial,
										signal,
										startTime,
										cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
										cacheMode,
										codeFingerprint,
										artifactDir: join(runDir, "artifacts"),
										runId: runState.manifest.id
									});
									return {
										caseDetail,
										caseRow: {
											caseId: evalCase.id,
											evalId: evalMeta.id,
											status: caseRowUpdate.status ?? "pending",
											latencyMs: caseRowUpdate.latencyMs ?? null,
											columns: caseRowUpdate.columns ?? {},
											trial
										}
									};
								},
								onComplete: ({ caseDetail, caseRow }) => {
									trialResults.push({
										caseDetail,
										caseRow,
										bufferedCacheStore
									});
								}
							});
						}
					}
				});
			} catch (error) {
				console.error(`Error running eval ${evalMeta.id}:`, error);
				evalErrors.push({
					evalId: evalMeta.id,
					message: error instanceof Error ? error.message : String(error)
				});
				lastRunStatusMap.set(evalMeta.id, "error");
				latestRunInfoMap.set(evalMeta.id, {
					status: "error",
					startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
					commitSha: runState.manifest.commitSha ?? null,
					evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalMeta.id] ?? null
				});
			}
		}
		await executeQueuedCases({
			runState,
			queuedCases,
			concurrency: getConfiguredConcurrency(),
			globalTraceDisplay: config.traceDisplay
		});
		for (const preparedEval of preparedEvals) {
			for (const preparedCase of preparedEval.preparedCases) {
				// Cases with no completed trial (e.g. the run was aborted first) produce no row.
				if (preparedCase.trialResults.length === 0) continue;
				const winningTrial = pickWinningTrial({
					strategy: runState.manifest.trialSelection,
					attempts: preparedCase.trialResults,
					scoreKeys: preparedEval.scoreKeys
				});
				if (winningTrial.bufferedCacheStore !== null) await winningTrial.bufferedCacheStore.commit();
				runState.cases.push(winningTrial.caseRow);
				runState.caseDetails.set(preparedCase.caseId, winningTrial.caseDetail);
				preparedEval.mergeColumns(winningTrial.caseDetail.columns);
				if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
				else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
				else runState.summary.failedCases++;
				await writeFile(join(runDir, "traces", `${preparedCase.caseId}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
				await persistCaseDetail(runDir, winningTrial.caseDetail);
				emitEvent(runState, {
					type: "case.finished",
					runId: runState.manifest.id,
					timestamp: new Date().toISOString(),
					payload: winningTrial.caseRow
				});
				preparedEval.evalCaseRows.push(winningTrial.caseRow);
			}
			preparedEval.evalMeta.columnDefs = [...preparedEval.accumulatedColumns.values()];
			lastRunStatusMap.set(preparedEval.evalMeta.id, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
			const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.id) ?? null;
			latestRunInfoMap.set(preparedEval.evalMeta.id, {
				status: latestStatus,
				startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
				commitSha: runState.manifest.commitSha ?? null,
				evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.id] ?? null
			});
		}
		const endTime = new Date();
		runState.summary.totalDurationMs = endTime.getTime() - new Date(runState.manifest.startedAt).getTime();
		// Cancellation takes precedence over eval-level errors when deciding the final status.
		const finalStatus = runState.abortController.signal.aborted ? "cancelled" : evalErrors.length > 0 ? "error" : "completed";
		runState.summary.status = finalStatus;
		runState.manifest.status = finalStatus;
		const completedRunAt = endTime.toISOString();
		runState.manifest.endedAt = completedRunAt;
		const combinedErrorMessage = evalErrors.map((entry) => `[${entry.evalId}] ${entry.message}`).join("\n");
		runState.summary.errorMessage = evalErrors.length > 0 ? combinedErrorMessage : null;
		for (const evalId of getTargetEvalIds({
			request,
			sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
			knownEvalIds: new Set(evals.keys())
		})) {
			const latestStatus = lastRunStatusMap.get(evalId) ?? toLastRunStatus(deriveStatusFromCaseRows({
				caseRows: [],
				lifecycleStatus: runState.manifest.status
			}));
			latestRunInfoMap.set(evalId, {
				status: latestStatus,
				startedAt: completedRunAt,
				commitSha: runState.manifest.commitSha ?? null,
				evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalId] ?? null
			});
		}
		emitEvent(runState, {
			type: "run.summary",
			runId: runState.manifest.id,
			timestamp: new Date().toISOString(),
			payload: runState.summary
		});
		if (finalStatus === "error") emitEvent(runState, {
			type: "run.error",
			runId: runState.manifest.id,
			timestamp: new Date().toISOString(),
			payload: { message: combinedErrorMessage }
		});
		else emitEvent(runState, {
			type: "run.finished",
			runId: runState.manifest.id,
			timestamp: new Date().toISOString(),
			payload: runState.summary
		});
		await persistRunState(runState);
		emitDiscoveryEvent();
	} catch (error) {
		const message = error instanceof Error ? error.message : String(error);
		runState.manifest.status = "error";
		runState.manifest.endedAt = new Date().toISOString();
		runState.summary.status = "error";
		runState.summary.errorMessage = message;
		emitEvent(runState, {
			type: "run.error",
			runId: runState.manifest.id,
			timestamp: new Date().toISOString(),
			payload: { message }
		});
		// This function is started without being awaited, so nothing below may
		// throw: a failure here would surface as an unhandled promise rejection.
		try {
			await persistRunState(runState);
		} catch (persistError) {
			console.error(`Failed to persist errored run ${runState.manifest.id}:`, persistError);
		}
		try {
			emitDiscoveryEvent();
		} catch (notifyError) {
			console.error(`Failed to publish discovery update for run ${runState.manifest.id}:`, notifyError);
		}
	}
}
2693
/**
 * Map a derived status onto the "last run status" representation, in which
 * "pending" is expressed as `null` and every other value is passed through.
 *
 * @param status Derived status value.
 * @returns `null` for "pending", otherwise `status` unchanged.
 */
function toLastRunStatus(status) {
	if (status === "pending") return null;
	return status;
}
2696
+ //#endregion
2697
+ //#region ../runner/src/runner.ts
2698
/**
 * Create an in-memory eval runner bound to the current workspace config.
 *
 * The returned object must be initialized with `init()` before use: that call
 * loads the config, prepares the `.agent-evals` state directory, creates the
 * cache store, loads persisted runs and performs the first discovery pass.
 *
 * @param options.watchForChanges When true (default), `init()` also installs a
 *   file watcher on the configured include patterns that re-runs discovery.
 * @returns The runner API (discovery, run lifecycle, cache and subscriptions).
 */
function createRunner({ watchForChanges = true } = {}) {
	let config;
	let workspaceRoot;
	let localStateDir;
	let cacheStore;
	const evals = new Map();
	const runs = new Map();
	const lastRunStatusMap = new Map();
	const latestRunInfoMap = new Map();
	const discoveryListeners = new Set();
	let nextShortIdNum = 0;
	/** Convert an absolute path to a workspace-relative path with forward slashes. */
	function toWorkspaceRelativePath(filePath) {
		return relative(workspaceRoot, filePath).replaceAll("\\", "/");
	}
	/** Return the discovered eval metas ordered by workspace-relative file path. */
	function getSortedEvalMetas() {
		return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
	}
	/** Return the hex SHA-256 digest of a source string. */
	function getSourceFingerprint(source) {
		return createHash("sha256").update(source).digest("hex");
	}
	/** Return the configured concurrency as an integer >= 1 (1 when unset or invalid). */
	function getConfiguredConcurrency() {
		const configuredConcurrency = config.concurrency;
		if (typeof configuredConcurrency !== "number" || !Number.isFinite(configuredConcurrency)) return 1;
		return Math.max(1, Math.floor(configuredConcurrency));
	}
	const runner = {
		async init() {
			config = await loadConfig();
			workspaceRoot = config.workspaceRoot ?? process.cwd();
			localStateDir = resolve(workspaceRoot, ".agent-evals");
			await mkdir(localStateDir, { recursive: true });
			await mkdir(join(localStateDir, "runs"), { recursive: true });
			cacheStore = createFsCacheStore({
				workspaceRoot,
				dir: config.cache?.dir
			});
			await loadPersistedRuns();
			await runner.refreshDiscovery();
			if (watchForChanges) setupWatcher();
		},
		async listCache() {
			return cacheStore.list();
		},
		async clearCache(filter) {
			await cacheStore.clear(filter);
		},
		async recomputeStatusesForEval(evalId) {
			const evalMeta = evals.get(evalId);
			if (!evalMeta) return { updatedRuns: 0 };
			const registry = getEvalRegistry();
			await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? undefined);
			const entry = registry.get(evalId);
			if (!entry) return { updatedRuns: 0 };
			// Collect the pass thresholds declared for computed and manual scores.
			const scoreThresholds = new Map();
			entry.use((evalDef) => {
				for (const [key, def] of Object.entries(evalDef.scores ?? {})) {
					const threshold = normalizeScoreDef(def).passThreshold;
					if (threshold !== undefined) scoreThresholds.set(key, threshold);
				}
				for (const [key, def] of Object.entries(evalDef.manualScores ?? {})) if (def.passThreshold !== undefined) scoreThresholds.set(key, def.passThreshold);
			});
			const updatedRuns = await recomputeEvalStatusesInRuns({
				runs: runs.values(),
				evalId,
				evalExists: evals.has(evalId),
				scoreThresholds,
				persistCaseDetail
			});
			emitDiscoveryEvent();
			return { updatedRuns };
		},
		async cleanRunsForEval(evalId) {
			let deletedRuns = 0;
			// Iterate over a snapshot because entries are deleted during the loop.
			for (const [runId, run] of [...runs]) {
				if (!runTouchesEval({
					target: run.manifest.target,
					caseRows: run.cases,
					evalId,
					evalExists: evals.has(evalId)
				})) continue;
				if (run.manifest.status === "running") continue;
				runs.delete(runId);
				await rm(run.runDir, {
					recursive: true,
					force: true
				});
				deletedRuns += 1;
			}
			emitDiscoveryEvent();
			return { deletedRuns };
		},
		async updateManualScore({ runId, caseId, scoreKey, value }) {
			const run = runs.get(runId);
			if (!run) return {
				updated: false,
				reason: "Run not found"
			};
			if (run.manifest.status === "running") return {
				updated: false,
				reason: "Run is still running"
			};
			const caseRow = run.cases.find((row) => row.caseId === caseId);
			if (!caseRow) return {
				updated: false,
				reason: "Case not found"
			};
			const evalMeta = evals.get(caseRow.evalId);
			if (!evalMeta) return {
				updated: false,
				reason: "Eval not found"
			};
			if (evalMeta.columnDefs.find((def) => def.key === scoreKey)?.isManualScore !== true) return {
				updated: false,
				reason: "Manual score not found"
			};
			const caseDetail = run.caseDetails.get(caseId);
			if (!caseDetail) return {
				updated: false,
				reason: "Case detail not found"
			};
			caseRow.columns[scoreKey] = value;
			caseDetail.columns[scoreKey] = value;
			const scoreThresholds = new Map();
			for (const def of evalMeta.columnDefs) {
				if (def.isScore !== true || def.passThreshold === undefined) continue;
				scoreThresholds.set(def.key, def.passThreshold);
			}
			const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds);
			caseRow.status = nextStatus;
			caseDetail.status = nextStatus;
			const derivedSummary = deriveScopedSummaryFromCases({ caseRows: run.cases });
			run.summary.totalCases = derivedSummary.totalCases;
			run.summary.passedCases = derivedSummary.passedCases;
			run.summary.failedCases = derivedSummary.failedCases;
			run.summary.errorCases = derivedSummary.errorCases;
			run.summary.cancelledCases = derivedSummary.cancelledCases;
			run.summary.totalDurationMs = derivedSummary.totalDurationMs;
			await persistCaseDetail(run.runDir, caseDetail);
			await persistRunState(run);
			emitDiscoveryEvent();
			return {
				updated: true,
				run: {
					manifest: run.manifest,
					summary: run.summary,
					cases: run.cases
				},
				caseDetail
			};
		},
		async deleteRun(runId) {
			const run = runs.get(runId);
			if (!run) return { deleted: false };
			if (run.manifest.status === "running") return { deleted: false };
			runs.delete(runId);
			await rm(run.runDir, {
				recursive: true,
				force: true
			});
			emitDiscoveryEvent();
			return { deleted: true };
		},
		getEvals() {
			const gitState = readGitWorktreeState(workspaceRoot);
			const result = [];
			for (const meta of getSortedEvalMetas()) result.push(buildEvalSummary({
				meta,
				config,
				gitState,
				latestRun: latestRunInfoMap.get(meta.id),
				lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
			}));
			return result;
		},
		getEval(id) {
			const meta = evals.get(id);
			if (!meta) return undefined;
			return buildEvalSummary({
				meta,
				config,
				gitState: readGitWorktreeState(workspaceRoot),
				latestRun: latestRunInfoMap.get(meta.id),
				lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
			});
		},
		async refreshDiscovery() {
			const patterns = config.include;
			// A Set avoids processing a file twice when include patterns overlap.
			const discovered = new Set();
			for (const pattern of patterns) {
				const files = await glob(pattern, {
					cwd: workspaceRoot,
					absolute: true
				});
				for (const file of files) discovered.add(file);
			}
			evals.clear();
			for (const filePath of discovered) try {
				const content = await readFile(filePath, "utf-8");
				const discoveredMetas = parseEvalMetas(filePath, content);
				const sourceFingerprint = getSourceFingerprint(content);
				const registry = getEvalRegistry();
				try {
					await loadEvalModule(filePath, sourceFingerprint);
				} catch {
					// Best effort: when the module cannot be imported, the statically
					// parsed metas are still listed, just without declared columns.
				}
				for (const meta of discoveredMetas) {
					const discoveredEntry = registry.get(meta.id);
					const title = meta.title;
					let columnDefs = buildDeclaredColumnDefs(undefined, undefined, undefined);
					let stats;
					let charts;
					discoveredEntry?.use((evalDef) => {
						columnDefs = buildDeclaredColumnDefs(evalDef.columns, evalDef.scores, evalDef.manualScores);
						stats = evalDef.stats;
						const validated = validateCharts({
							charts: evalDef.charts,
							columnDefs,
							evalId: meta.id
						});
						for (const warning of validated.warnings) console.warn(warning);
						charts = validated.charts;
					});
					evals.set(meta.id, {
						id: meta.id,
						title,
						filePath: toWorkspaceRelativePath(meta.filePath),
						sourceFilePath: meta.filePath,
						sourceFingerprint,
						columnDefs,
						caseCount: null,
						stats,
						charts
					});
				}
			} catch (error) {
				// One broken file must not abort discovery of the others, but it
				// should not disappear without a trace either.
				console.warn(`Skipping eval file ${filePath} during discovery:`, error);
			}
			emitDiscoveryEvent();
		},
		async startRun(request) {
			const runId = generateRunId();
			const shortId = `r${String(nextShortIdNum++)}`;
			const now = new Date().toISOString();
			const cacheMode = request.cache?.mode ?? "use";
			const runDir = join(localStateDir, "runs", runId);
			const manifest = {
				id: runId,
				shortId,
				status: "running",
				startedAt: now,
				endedAt: null,
				commitSha: readGitWorktreeState(workspaceRoot).commitSha,
				evalSourceFingerprints: {},
				target: request.target,
				trials: request.trials,
				trialSelection: config.trialSelection ?? "lowestScore",
				cacheMode
			};
			const summary = {
				runId,
				status: "running",
				totalCases: 0,
				passedCases: 0,
				failedCases: 0,
				errorCases: 0,
				cancelledCases: 0,
				totalDurationMs: null,
				errorMessage: null
			};
			const abortController = new AbortController();
			const runState = {
				runDir,
				manifest,
				summary,
				cases: [],
				caseDetails: new Map(),
				listeners: new Set(),
				abortController
			};
			runs.set(runId, runState);
			setLatestRunInfoMap({
				latestRunInfoMap,
				evalIds: getTargetEvalIds({
					request,
					sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
					knownEvalIds: new Set(evals.keys())
				}),
				info: {
					status: "running",
					startedAt: now,
					commitSha: manifest.commitSha ?? null,
					evalSourceFingerprint: null
				}
			});
			await mkdir(runDir, { recursive: true });
			await mkdir(join(runDir, "traces"), { recursive: true });
			await mkdir(join(runDir, "artifacts"), { recursive: true });
			await mkdir(join(runDir, "case-details"), { recursive: true });
			await writeFile(join(runDir, "run.json"), JSON.stringify(manifest, null, 2));
			// Deliberately not awaited: the run executes in the background while the
			// caller gets the initial snapshot. The handler keeps an unexpected
			// rejection from becoming an unhandled promise rejection.
			executeRun({
				runState,
				request,
				runDir,
				config,
				evals,
				cacheStore,
				lastRunStatusMap,
				latestRunInfoMap,
				emitEvent,
				emitDiscoveryEvent,
				getSourceFingerprint,
				getConfiguredConcurrency,
				getSortedEvalMetas,
				getTargetEvals
			}).catch((error) => {
				console.error(`Run ${runId} failed unexpectedly:`, error);
			});
			return {
				manifest,
				summary,
				cases: []
			};
		},
		getRuns() {
			return [...runs.values()].map((r) => r.manifest);
		},
		getRun(id) {
			const run = runs.get(id);
			if (!run) return undefined;
			return {
				manifest: run.manifest,
				summary: run.summary,
				cases: run.cases
			};
		},
		cancelRun(id) {
			const run = runs.get(id);
			if (!run) return;
			// Only an in-flight run can be cancelled; a finished run keeps its
			// terminal status instead of being rewritten to "cancelled".
			if (run.manifest.status !== "running") return;
			run.abortController.abort();
			run.manifest.status = "cancelled";
			run.manifest.endedAt = new Date().toISOString();
			run.summary.status = "cancelled";
			emitEvent(run, {
				type: "run.cancelled",
				runId: id,
				timestamp: new Date().toISOString(),
				payload: run.summary
			});
		},
		getCaseDetail(runId, caseId) {
			const run = runs.get(runId);
			if (!run) return undefined;
			return run.caseDetails.get(caseId);
		},
		subscribe(runId, listener) {
			const run = runs.get(runId);
			if (!run) return () => {};
			run.listeners.add(listener);
			return () => {
				run.listeners.delete(listener);
			};
		},
		subscribeDiscovery(listener) {
			discoveryListeners.add(listener);
			return () => {
				discoveryListeners.delete(listener);
			};
		},
		getWorkspaceRoot() {
			return workspaceRoot;
		},
		getArtifactPath(artifactId_) {
			return resolveArtifactPath(join(localStateDir, "runs"), artifactId_);
		}
	};
	/** Watch the include patterns and re-run discovery on change, add and unlink. */
	function setupWatcher() {
		const watcher = watch(config.include.map((p) => resolve(workspaceRoot, p)), {
			ignoreInitial: true,
			persistent: true
		});
		const refresh = () => {
			runner.refreshDiscovery().catch((error) => {
				console.error("Failed to refresh eval discovery:", error);
			});
		};
		watcher.on("change", refresh);
		watcher.on("add", refresh);
		watcher.on("unlink", refresh);
	}
	/** Rebuild both status maps from the known runs and notify discovery listeners. */
	function emitDiscoveryEvent() {
		const lastRunStatuses = getLastRunStatuses({
			runs: runs.values(),
			knownEvals: evals.values()
		});
		const latestRunInfos = getLatestRunInfos({
			runs: runs.values(),
			knownEvals: evals.values()
		});
		lastRunStatusMap.clear();
		for (const [evalId, status] of lastRunStatuses) lastRunStatusMap.set(evalId, status);
		latestRunInfoMap.clear();
		for (const [evalId, info] of latestRunInfos) latestRunInfoMap.set(evalId, info);
		const event = {
			type: "discovery.updated",
			timestamp: new Date().toISOString(),
			payload: runner.getEvals()
		};
		// Listeners are isolated the same way as in emitEvent: one throwing
		// subscriber must not break the others or the operation that notified them.
		for (const listener of discoveryListeners) try {
			listener(event);
		} catch {}
	}
	/** Resolve the eval metas a request targets; all evals when no ids are given. */
	function getTargetEvals(request) {
		if (request.target.evalIds && request.target.evalIds.length > 0) return request.target.evalIds.map((id) => evals.get(id)).filter((e) => e !== undefined);
		return getSortedEvalMetas();
	}
	/** Deliver a run event to every listener of that run, ignoring listener errors. */
	function emitEvent(runState, event) {
		for (const listener of runState.listeners) try {
			listener(event);
		} catch {}
	}
	/** Replace the in-memory runs with the snapshots persisted in the state directory. */
	async function loadPersistedRuns() {
		runs.clear();
		const persistedRuns = await loadPersistedRunSnapshots(localStateDir);
		nextShortIdNum = nextShortIdFromSnapshots(persistedRuns);
		for (const persistedRun of persistedRuns) runs.set(persistedRun.manifest.id, {
			...persistedRun,
			listeners: new Set(),
			abortController: new AbortController()
		});
	}
	return runner;
}
3125
+ //#endregion
3126
+ //#region src/cli.ts
3127
/**
 * Parse raw CLI arguments into a normalized options object.
 *
 * The first argument selects the command ("app", "list", "run", "cache" or
 * "help"); anything else falls back to "help". For "cache", an optional
 * "list" / "clear" subcommand follows. Remaining arguments are flags.
 *
 * Invalid flag values are ignored and the default is kept (the value is still
 * consumed), so a typo such as `--trials abc` cannot silently produce a run
 * with zero trials.
 *
 * @param argv Raw command-line arguments excluding the executable name.
 * @returns Parsed options: command, subcommand, evalIds, caseIds, trials,
 *   json, port, cacheMode, clearCache and all.
 */
function parseArgs(argv) {
	const args = {
		command: "help",
		subcommand: undefined,
		evalIds: [],
		caseIds: [],
		trials: 1,
		json: false,
		port: 4100,
		cacheMode: "use",
		clearCache: false,
		all: false
	};
	// Split a comma-separated id list, dropping blanks left by stray commas.
	const splitIds = (value) => value.split(",").map((id) => id.trim()).filter((id) => id.length > 0);
	const command = argv[0];
	if (command === "app" || command === "list" || command === "run" || command === "cache" || command === "help") args.command = command;
	let cursor = 1;
	if (args.command === "cache") {
		const sub = argv[cursor];
		if (sub === "list" || sub === "clear") {
			args.subcommand = sub;
			cursor++;
		}
	}
	for (let i = cursor; i < argv.length; i++) {
		const arg = argv[i];
		const next = argv[i + 1];
		if (arg === "--eval" && next) {
			args.evalIds.push(...splitIds(next));
			i++;
		} else if (arg === "--case" && next) {
			args.caseIds.push(...splitIds(next));
			i++;
		} else if (arg === "--trials" && next) {
			const trials = Number(next);
			// Trials must be a whole number of at least one.
			if (Number.isInteger(trials) && trials >= 1) args.trials = trials;
			i++;
		} else if (arg === "--json") args.json = true;
		else if (arg === "--port" && next) {
			const port = Number(next);
			if (Number.isInteger(port) && port >= 0 && port <= 65535) args.port = port;
			i++;
		} else if (arg === "--cache" && next) {
			if (next === "use" || next === "bypass" || next === "refresh") args.cacheMode = next;
			i++;
		} else if (arg === "--no-cache") args.cacheMode = "bypass";
		else if (arg === "--refresh-cache") args.cacheMode = "refresh";
		else if (arg === "--clear-cache") args.clearCache = true;
		else if (arg === "--all") args.all = true;
	}
	return args;
}
3176
/**
 * Entry point of the Agent Evals CLI: parse the arguments and dispatch to the
 * handler of the selected command. Any command without a handler prints help.
 *
 * @param argv Raw command-line arguments excluding the executable name.
 */
async function runCli(argv) {
	const args = parseArgs(argv);
	const handlers = new Map([
		["app", commandApp],
		["list", commandList],
		["run", commandRun],
		["cache", commandCache]
	]);
	const handler = handlers.get(args.command);
	if (handler === undefined) {
		printHelp();
		return;
	}
	await handler(args);
}
3201
// Directory containing this module file.
const currentDir = dirname(fileURLToPath(import.meta.url));
// Directory three levels above this module; checked for a web workspace below.
const repoRoot = resolve(currentDir, "..", "..", "..");
// pnpm is invoked through its .cmd shim on Windows.
const pnpmCommand = `pnpm${process.platform === "win32" ? ".cmd" : ""}`;
3204
/**
 * Report whether `apps/web/package.json` exists under `repoRoot`.
 *
 * @returns True when that manifest file is present.
 */
function hasRepoWebWorkspace() {
	const webManifestPath = resolve(repoRoot, "apps/web/package.json");
	return existsSync(webManifestPath);
}
3207
/**
 * Build the web UI with pnpm when the web workspace is present; otherwise do
 * nothing. The child process inherits this process's stdio.
 *
 * @returns A promise that resolves when the build exits with code 0 and rejects
 *   when it cannot be spawned, is stopped by a signal, or exits non-zero.
 */
async function ensureWebUiIsBuilt() {
	if (!hasRepoWebWorkspace()) return;
	console.info("Preparing web UI...");
	const buildArgs = ["--filter", "@agent-evals/web", "build"];
	await new Promise((onSuccess, onFailure) => {
		const buildProcess = spawn(pnpmCommand, buildArgs, {
			cwd: repoRoot,
			stdio: "inherit"
		});
		buildProcess.once("error", (spawnError) => {
			onFailure(spawnError);
		});
		buildProcess.once("exit", (exitCode, exitSignal) => {
			if (exitSignal) {
				onFailure(new Error(`Web UI build stopped with signal ${exitSignal}.`));
			} else if (exitCode !== 0) {
				onFailure(new Error(`Web UI build failed with exit code ${String(exitCode)}.`));
			} else {
				onSuccess();
			}
		});
	});
}
3235
/**
 * Check that a dynamically imported module exposes an `app` object with a
 * callable `fetch` member.
 *
 * @param mod The imported module namespace (or any value).
 * @returns True when `mod.app` is a non-null object whose `fetch` is a function.
 */
function isHonoAppModule(mod) {
	const isNonNullObject = (value) => value !== null && typeof value === "object";
	if (!isNonNullObject(mod) || !("app" in mod)) return false;
	const candidate = mod.app;
	if (!isNonNullObject(candidate)) return false;
	return "fetch" in candidate && typeof candidate.fetch === "function";
}
3240
/**
 * Check that a dynamically imported module exposes a callable `initRunner`.
 *
 * @param mod The imported module namespace (or any value).
 * @returns True when `mod` is a non-null object whose `initRunner` is a function.
 */
function isServerRunnerModule(mod) {
	const isObjectLike = mod !== null && typeof mod === "object";
	return isObjectLike && "initRunner" in mod && typeof mod.initRunner === "function";
}
3244
/**
 * `app` command: build the web UI if needed, initialize the server-side runner
 * and start serving the app on the requested port.
 *
 * @param args Parsed CLI arguments; only `args.port` is read.
 * @throws Error when the app or runner module lacks the expected exports.
 */
async function commandApp(args) {
	await ensureWebUiIsBuilt();
	const nodeServer = await import("@hono/node-server");
	const { serve } = nodeServer;
	// Must be set before the app module is imported below.
	const bundledWebDist = resolve(currentDir, "apps/web/dist");
	const hasBundledWebDist = existsSync(bundledWebDist);
	if (hasBundledWebDist) {
		process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
	}
	const serverApp = await import("./app-CKa9TjXw.mjs");
	const serverRunner = await import("./runner-Ck4X0H3p.mjs");
	if (!isHonoAppModule(serverApp)) {
		throw new Error("Server app module is invalid");
	}
	if (!isServerRunnerModule(serverRunner)) {
		throw new Error("Server runner module is invalid");
	}
	await serverRunner.initRunner();
	const { port } = args;
	console.info(`Agent Evals app: http://localhost:${String(port)}`);
	serve({
		fetch: serverApp.app.fetch,
		port
	});
}
3260
/**
 * `list` command: discover evals without watching for changes and print one
 * entry per eval (title, id, file, and status / case count when available).
 *
 * @param args_ Parsed CLI arguments (not used by this command).
 */
async function commandList(args_) {
	const runner = createRunner({ watchForChanges: false });
	await runner.init();
	const discoveredEvals = runner.getEvals();
	if (discoveredEvals.length === 0) {
		console.info("No eval files found.");
		return;
	}
	console.info("Discovered evals:\n");
	for (const evalSummary of discoveredEvals) {
		const { freshnessStatus, stale, outdated, lastRunStatus } = evalSummary;
		const displayStatus = getEvalDisplayStatus({
			freshnessStatus,
			stale,
			outdated,
			lastRunStatus
		});
		const title = getEvalTitle(evalSummary);
		console.info(` ${title}`);
		console.info(` id: ${evalSummary.id}`);
		console.info(` file: ${evalSummary.filePath}`);
		// "pending" carries no information worth printing.
		if (displayStatus !== "pending") {
			console.info(` status: ${displayStatus}`);
		}
		if (evalSummary.caseCount !== null) {
			console.info(` cases: ${String(evalSummary.caseCount)}`);
		}
		console.info("");
	}
}
3285
/**
 * `run` command: optionally clear the cache, start a run for the selected
 * target, wait for it to reach a terminal status and print its summary.
 *
 * The process exits with code 1 when the run cannot be found afterwards, when
 * any case failed or errored, or when the run itself ended with status
 * "error" (for example because an eval module failed to load, which produces
 * no failing case rows).
 *
 * @param args Parsed CLI arguments (evalIds, caseIds, trials, cacheMode,
 *   clearCache, json).
 */
async function commandRun(args) {
	const runner = createRunner({ watchForChanges: false });
	await runner.init();
	if (args.clearCache) {
		await runner.clearCache();
		if (!args.json) {
			console.info("Cleared cache before run.");
			console.info("");
		}
	}
	// Case ids take precedence; eval ids then only narrow the case selection.
	const target = args.caseIds.length > 0 ? {
		mode: "caseIds",
		caseIds: args.caseIds,
		evalIds: args.evalIds.length > 0 ? args.evalIds : undefined
	} : args.evalIds.length > 0 ? {
		mode: "evalIds",
		evalIds: args.evalIds
	} : { mode: "all" };
	const run = await runner.startRun({
		target,
		trials: args.trials,
		cache: { mode: args.cacheMode }
	});
	if (!args.json) {
		console.info(`Run started: ${run.manifest.id}`);
		console.info(`Trials: ${String(args.trials)}`);
		if (args.cacheMode !== "use") console.info(`Cache mode: ${args.cacheMode}`);
		console.info("");
	}
	await waitForRunCompletion(runner, run.manifest.id);
	const finalRun = runner.getRun(run.manifest.id);
	if (!finalRun) {
		process.exit(1);
		return;
	}
	const { summary } = finalRun;
	if (args.json) console.info(JSON.stringify(summary, null, 2));
	else {
		console.info("--- Run Summary ---");
		console.info(`Status: ${summary.status}`);
		console.info(`Total: ${String(summary.totalCases)}`);
		console.info(`Passed: ${String(summary.passedCases)}`);
		console.info(`Failed: ${String(summary.failedCases)}`);
		console.info(`Errors: ${String(summary.errorCases)}`);
		if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
		if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
	}
	// A run-level error must fail the process even when no case row failed.
	const runFailed = summary.status === "error" || summary.failedCases > 0 || summary.errorCases > 0;
	if (runFailed) process.exit(1);
}
3334
/**
 * `cache` command.
 *
 * - `cache` / `cache list`: print the cache entries (as JSON with `--json`).
 * - `cache clear --eval <id>`: clear entries whose namespace starts with
 *   `<id>__`, for each given eval id.
 * - `cache clear --all`: clear every entry.
 * - `cache clear` with neither flag: refuse and exit with code 1.
 *
 * @param args Parsed CLI arguments (subcommand, evalIds, all, json).
 */
async function commandCache(args) {
	const runner = createRunner({ watchForChanges: false });
	await runner.init();
	const { subcommand } = args;
	if (subcommand === undefined || subcommand === "list") {
		const cached = await runner.listCache();
		if (args.json) {
			console.info(JSON.stringify(cached, null, 2));
			return;
		}
		if (cached.length === 0) {
			console.info("No cache entries.");
			return;
		}
		console.info(`Cache entries (${String(cached.length)}):\n`);
		for (const item of cached) {
			console.info(` ${item.namespace}`);
			console.info(` key: ${item.key}`);
			console.info(` span: ${item.spanName} (${item.spanKind})`);
			console.info(` stored: ${item.storedAt}`);
			console.info(` size: ${String(item.sizeBytes)} bytes`);
			console.info("");
		}
		return;
	}
	if (subcommand !== "clear") {
		printHelp();
		return;
	}
	const { evalIds } = args;
	if (evalIds.length > 0) {
		for (const evalId of evalIds) {
			// The listing is re-read per eval id, after the previous id's entries were removed.
			const cached = await runner.listCache();
			const namespacePrefix = `${evalId}__`;
			const owned = cached.filter((item) => item.namespace.startsWith(namespacePrefix));
			for (const item of owned) {
				await runner.clearCache({
					namespace: item.namespace,
					key: item.key
				});
			}
		}
		console.info(`Cleared cache entries for: ${evalIds.join(", ")}`);
		return;
	}
	if (args.all) {
		await runner.clearCache();
		console.info("Cleared all cache entries.");
		return;
	}
	console.info("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
	process.exit(1);
}
3383
/**
 * Poll the runner until the given run has reached a terminal state.
 *
 * A run counts as finished when `runner.getRun(runId)` returns nothing, or when
 * its `manifest.status` is `"completed"`, `"cancelled"` or `"error"`. The first
 * check happens synchronously; later checks are spaced by `pollIntervalMs`.
 *
 * If `runner.getRun` throws during any poll, the returned promise rejects with
 * that error instead of surfacing as an uncaught exception in a timer callback.
 *
 * @param runner Object exposing `getRun(runId)`.
 * @param runId Identifier of the run to wait for.
 * @param pollIntervalMs Delay between polls in milliseconds (default: 200).
 * @returns A promise that resolves with no value once the run has finished.
 */
async function waitForRunCompletion(runner, runId, pollIntervalMs = 200) {
	const terminalStatuses = new Set(["completed", "cancelled", "error"]);
	for (;;) {
		const run = runner.getRun(runId);
		if (!run || terminalStatuses.has(run.manifest.status)) return;
		await new Promise((resolveDelay) => {
			setTimeout(resolveDelay, pollIntervalMs);
		});
	}
}
3396
/**
 * Print the CLI usage text (commands and options) with a single
 * `console.info` call. The text starts and ends with a newline.
 */
function printHelp() {
	const usageLines = [
		"",
		"agent-evals - LLM/Agent eval runner",
		"",
		"Commands:",
		"app Start server with UI",
		"list List discovered evals",
		"run Run evals",
		"cache list List cached operation entries",
		"cache clear --eval <id> Clear cache entries for one eval",
		"cache clear --all Clear every cached entry",
		"help Show this help",
		"",
		"Options:",
		"--eval <id> Run specific eval(s) (comma-separated)",
		"--case <id> Run specific case(s) (comma-separated)",
		"--trials <n> Number of trials per case",
		"--json Output results as JSON",
		"--port <n> Server port (default: 4100)",
		"--cache <use|bypass|refresh> Cache mode for this run (default: use)",
		"--no-cache Shortcut for --cache bypass",
		"--refresh-cache Shortcut for --cache refresh",
		"--clear-cache Clear the cache before starting the run",
		""
	];
	const usageText = usageLines.join("\n");
	console.info(usageText);
}
3421
+ //#endregion
3422
+ export { jsonCellSchema as $, scoreTraceSchema as A, traceAttributeDisplayFormatSchema as B, caseDetailSchema as C, evalStatItemSchema as D, evalStatAggregateSchema as E, evalChartConfigSchema as F, traceDisplayInputConfigSchema as G, traceAttributeDisplayPlacementSchema as H, evalChartMetricSchema as I, cellValueSchema as J, traceSpanKindSchema as K, evalChartTooltipExtraSchema as L, evalChartAxisSchema as M, evalChartBuiltinMetricSchema as N, evalStatsConfigSchema as O, evalChartColorSchema as P, fileRefSchema as Q, evalChartTypeSchema as R, assertionFailureSchema as S, evalFreshnessStatusSchema as T, traceAttributeDisplaySchema as U, traceAttributeDisplayInputSchema as V, traceDisplayConfigSchema as W, columnFormatSchema as X, columnDefSchema as Y, columnKindSchema as Z, cacheModeSchema as _, getEvalRegistry as _t, sseEnvelopeSchema as a, evalTracer as at, serializedCacheSpanSchema as b, deriveScopedSummaryFromCases as c, evalAssert as ct, runManifestSchema as d, isInEvalScope as dt, numberDisplayOptionsSchema as et, runSummarySchema as f, runInEvalScope as ft, cacheListItemSchema as g, defineEval as gt, cacheEntrySchema as h, repoFile as ht, updateManualScoreRequestSchema as i, evalSpan as it, evalChartAggregateSchema as j, evalSummarySchema as k, deriveStatusFromCaseRows as l, getCurrentScope as lt, trialSelectionModeSchema as m, setScopeCacheContext as mt, createRunner as n, runArtifactRefSchema as nt, getEvalTitle as o, hashCacheKey as ot, agentEvalsConfigSchema as p, setEvalOutput as pt, traceSpanSchema as q, createRunRequestSchema as r, buildTraceTree as rt, getEvalDisplayStatus as s, EvalAssertionError as st, runCli as t, repoFileRefSchema as tt, deriveStatusFromChildStatuses as u, incrementEvalOutput as ut, cacheRecordingOpSchema as v, caseRowSchema as w, spanCacheOptionsSchema as x, cacheRecordingSchema as y, evalChartsConfigSchema as z };