@ls-stack/agent-eval 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,14 @@
1
+ import { createRequire, registerHooks } from "node:module";
1
2
  import { createHash } from "node:crypto";
2
3
  import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
3
- import { dirname, extname, join, relative, resolve } from "node:path";
4
+ import { extname, isAbsolute, join, relative, resolve } from "node:path";
5
+ import { z, z as z$1 } from "zod/v4";
4
6
  import { AsyncLocalStorage } from "node:async_hooks";
5
7
  import { Buffer as Buffer$1 } from "node:buffer";
6
8
  import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
7
- import { z } from "zod/v4";
8
- import { watch } from "chokidar";
9
- import { glob } from "glob";
10
9
  import { existsSync } from "node:fs";
11
10
  import { resultify } from "t-result";
12
11
  import { fileURLToPath, pathToFileURL } from "node:url";
13
- import { spawn, spawnSync } from "node:child_process";
14
12
  //#region ../sdk/src/defineEval.ts
15
13
  const evalRegistry = /* @__PURE__ */ new Map();
16
14
  /** Return the in-memory registry of evals defined in the current process. */
@@ -70,6 +68,27 @@ function getCurrentScope() {
70
68
  function isInEvalScope() {
71
69
  return getCurrentScope() !== void 0;
72
70
  }
71
+ function isObjectLike(value) {
72
+ return typeof value === "object" && value !== null;
73
+ }
74
+ function isObjectRecord(value) {
75
+ return typeof value === "object" && value !== null && !Array.isArray(value);
76
+ }
77
+ function copyArray$1(value) {
78
+ return value.map((item) => item);
79
+ }
80
+ function getEvalCaseInput(path = void 0) {
81
+ const scope = getCurrentScope();
82
+ if (!scope) return void 0;
83
+ if (path === void 0) return scope.input;
84
+ if (path.length === 0) return void 0;
85
+ let current = scope.input;
86
+ for (const segment of path.split(".")) {
87
+ if (segment.length === 0 || !isObjectLike(current)) return;
88
+ current = current[segment];
89
+ }
90
+ return current;
91
+ }
73
92
  /**
74
93
  * Attach cache context (adapter, mode, eval id, fingerprint) to a scope.
75
94
  *
@@ -86,6 +105,7 @@ function setScopeCacheContext(scope, context) {
86
105
  async function runInEvalScope(caseId, fn, options = {}) {
87
106
  const scope = {
88
107
  caseId,
108
+ input: options.input,
89
109
  outputs: {},
90
110
  assertionFailures: [],
91
111
  spans: [],
@@ -145,6 +165,58 @@ function setEvalOutput(key, value) {
145
165
  });
146
166
  }
147
167
  /**
168
+ * Append an item to an output array in the current case scope.
169
+ *
170
+ * Missing values become `[value]`, existing arrays receive the item, and
171
+ * existing scalar/object values are preserved as `[existing, value]`.
172
+ */
173
+ function appendToEvalOutput(key, value) {
174
+ const scope = getCurrentScope();
175
+ if (!scope) return;
176
+ const existing = scope.outputs[key];
177
+ if (existing === void 0) scope.outputs[key] = [value];
178
+ else if (Array.isArray(existing)) scope.outputs[key] = [...copyArray$1(existing), value];
179
+ else scope.outputs[key] = [existing, value];
180
+ recordOpIfActive(scope, {
181
+ kind: "appendOutput",
182
+ key,
183
+ value
184
+ });
185
+ }
186
+ /**
187
+ * Shallow-merge object fields into an output value in the current case scope.
188
+ *
189
+ * Missing values become a copy of `patch`. Non-object existing values are
190
+ * recorded as assertion failures instead of being replaced.
191
+ */
192
+ function mergeEvalOutput(key, patch) {
193
+ const scope = getCurrentScope();
194
+ if (!scope) return;
195
+ const existing = scope.outputs[key];
196
+ if (existing === void 0) {
197
+ scope.outputs[key] = { ...patch };
198
+ recordOpIfActive(scope, {
199
+ kind: "mergeOutput",
200
+ key,
201
+ patch
202
+ });
203
+ return;
204
+ }
205
+ if (!isObjectRecord(existing)) {
206
+ scope.assertionFailures.push(toAssertionFailure$1(`mergeEvalOutput("${key}"): existing value is ${Array.isArray(existing) ? "array" : typeof existing}, expected object`));
207
+ return;
208
+ }
209
+ scope.outputs[key] = {
210
+ ...existing,
211
+ ...patch
212
+ };
213
+ recordOpIfActive(scope, {
214
+ kind: "mergeOutput",
215
+ key,
216
+ patch
217
+ });
218
+ }
219
+ /**
148
220
  * Add a numeric delta to an output value in the current case scope.
149
221
  *
150
222
  * If the existing value is non-numeric, the operation is recorded as an
@@ -189,18 +261,267 @@ function evalAssert(condition, message) {
189
261
  throw error;
190
262
  }
191
263
  //#endregion
192
- //#region ../sdk/src/tracer.ts
193
- let spanIdCounter = 0;
264
+ //#region ../sdk/src/cacheKey.ts
265
+ var SerializedCacheKeyValue = class {
266
+ value;
267
+ constructor(value) {
268
+ this.value = value;
269
+ }
270
+ };
271
+ /**
272
+ * Hash the components of a cache key into a deterministic hex digest.
273
+ *
274
+ * Native `Blob` and `File` values are read asynchronously and hashed by
275
+ * content. Use `hashCacheKeySync` only when the key contains no async values.
276
+ */
277
+ async function hashCacheKey(input) {
278
+ return hashCacheKeySyncMaterialized(await materializeAsyncCacheKeyValue(input));
279
+ }
280
+ /**
281
+ * Synchronously hash cache key components. This supports JSON-like data and
282
+ * in-memory binary values such as `Buffer`, `ArrayBuffer`, and typed arrays,
283
+ * but cannot content-hash native `Blob` or `File` values.
284
+ */
285
+ function hashCacheKeySync(input) {
286
+ return hashCacheKeySyncMaterialized(input);
287
+ }
288
+ function hashCacheKeySyncMaterialized(input) {
289
+ return createHash("sha256").update(getCompositeKey(input, { stringify: stringifyCacheKeyValue })).digest("hex");
290
+ }
291
+ function stringifyCacheKeyValue(value) {
292
+ if (value instanceof SerializedCacheKeyValue) return value.value;
293
+ if (Buffer$1.isBuffer(value)) return `$buffer:${hashBytes(value)}`;
294
+ if (isArrayBuffer(value)) return `$arrayBuffer:${hashBytes(new Uint8Array(value))}`;
295
+ if (isSharedArrayBuffer(value)) return `$sharedArrayBuffer:${hashBytes(new Uint8Array(value))}`;
296
+ if (isArrayBufferView(value)) {
297
+ const bytes = new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
298
+ return `$${value.constructor.name}:${hashBytes(bytes)}`;
299
+ }
300
+ if (isFile$1(value)) return `$file:${getCompositeKey({
301
+ lastModified: value.lastModified,
302
+ name: value.name,
303
+ size: value.size,
304
+ type: value.type
305
+ })}`;
306
+ if (isBlob$1(value)) return `$blob:${getCompositeKey({
307
+ size: value.size,
308
+ type: value.type
309
+ })}`;
310
+ }
311
+ async function materializeAsyncCacheKeyValue(value, refs = /* @__PURE__ */ new WeakSet()) {
312
+ const serialized = await stringifyAsyncCacheKeyValue(value);
313
+ if (serialized !== void 0) return new SerializedCacheKeyValue(serialized);
314
+ if (stringifyCacheKeyValue(value) !== void 0) return value;
315
+ if (!value || typeof value !== "object") return value;
316
+ if (Array.isArray(value)) {
317
+ const items = [];
318
+ for (const item of value) items.push(await materializeAsyncCacheKeyValue(item, refs));
319
+ return items;
320
+ }
321
+ if (refs.has(value)) throw new Error("Circular reference detected");
322
+ refs.add(value);
323
+ const entries = [];
324
+ for (const [key, entryValue] of Object.entries(value)) entries.push([key, await materializeAsyncCacheKeyValue(entryValue, refs)]);
325
+ refs.delete(value);
326
+ return Object.fromEntries(entries);
327
+ }
328
+ async function stringifyAsyncCacheKeyValue(value) {
329
+ if (isFile$1(value)) return `$file:${getCompositeKey({
330
+ bytes: await hashBlobBytes(value),
331
+ lastModified: value.lastModified,
332
+ name: value.name,
333
+ size: value.size,
334
+ type: value.type
335
+ })}`;
336
+ if (isBlob$1(value)) return `$blob:${getCompositeKey({
337
+ bytes: await hashBlobBytes(value),
338
+ size: value.size,
339
+ type: value.type
340
+ })}`;
341
+ }
342
+ async function hashBlobBytes(value) {
343
+ return hashBytes(new Uint8Array(await value.arrayBuffer()));
344
+ }
345
+ function hashBytes(value) {
346
+ return createHash("sha256").update(value).digest("hex");
347
+ }
348
+ function isArrayBuffer(value) {
349
+ return value instanceof ArrayBuffer;
350
+ }
351
+ function isSharedArrayBuffer(value) {
352
+ return value instanceof SharedArrayBuffer;
353
+ }
354
+ function isArrayBufferView(value) {
355
+ return ArrayBuffer.isView(value);
356
+ }
357
+ function isBlob$1(value) {
358
+ return value instanceof Blob;
359
+ }
360
+ function isFile$1(value) {
361
+ return value instanceof File;
362
+ }
363
+ function toJsonSafe(value) {
364
+ if (value === void 0) return void 0;
365
+ const text = JSON.stringify(value);
366
+ return JSON.parse(text);
367
+ }
368
+ //#endregion
369
+ //#region ../sdk/src/cacheRecording.ts
370
+ function mergeSpanAttributes$1(span, attributes) {
371
+ span.attributes = {
372
+ ...span.attributes,
373
+ ...attributes
374
+ };
375
+ }
376
+ function isRecordLike$1(value) {
377
+ return typeof value === "object" && value !== null && !Array.isArray(value);
378
+ }
379
+ function valueKind$1(value) {
380
+ return Array.isArray(value) ? "array" : typeof value;
381
+ }
382
+ function copyArray(value) {
383
+ return value.map((item) => item);
384
+ }
385
+ function stripCacheAttributes(attributes) {
386
+ if (!attributes) return {};
387
+ const result = {};
388
+ for (const [key, value] of Object.entries(attributes)) if (!key.startsWith("cache.")) result[key] = value;
389
+ return result;
390
+ }
391
+ function snapshotNonCacheAttributes(span) {
392
+ const snapshot = toJsonSafe(stripCacheAttributes(span?.attributes));
393
+ return isRecordLike$1(snapshot) ? snapshot : {};
394
+ }
395
+ function diffNonCacheAttributes(before, after) {
396
+ const result = {};
397
+ for (const [key, value] of Object.entries(after)) if (!cacheAttributeValuesEqual(before[key], value)) result[key] = value;
398
+ return result;
399
+ }
400
+ function cacheAttributeValuesEqual(left, right) {
401
+ if (Object.is(left, right)) return true;
402
+ try {
403
+ return JSON.stringify(left) === JSON.stringify(right);
404
+ } catch {
405
+ return false;
406
+ }
407
+ }
408
+ function appendCacheRef(span, ref) {
409
+ if (span === void 0) return;
410
+ const existing = span.attributes?.["cache.refs"];
411
+ mergeSpanAttributes$1(span, { "cache.refs": [...Array.isArray(existing) ? copyArray(existing) : [], ref] });
412
+ }
413
+ function serializeSubSpanTree(scope, spanId) {
414
+ const original = scope.spans.find((s) => s.id === spanId);
415
+ if (!original) return {
416
+ kind: "custom",
417
+ name: "unknown",
418
+ attributes: void 0,
419
+ status: "ok",
420
+ error: void 0,
421
+ errors: void 0,
422
+ warning: void 0,
423
+ warnings: void 0,
424
+ children: []
425
+ };
426
+ const children = scope.spans.filter((s) => s.parentId === spanId).map((child) => serializeSubSpanTree(scope, child.id));
427
+ return {
428
+ kind: original.kind,
429
+ name: original.name,
430
+ attributes: original.attributes,
431
+ status: original.status,
432
+ error: original.error,
433
+ errors: original.errors,
434
+ warning: original.warning,
435
+ warnings: original.warnings,
436
+ children
437
+ };
438
+ }
439
+ function appendSubSpanOps(scope, frame) {
440
+ for (let i = frame.baseSpanIndex; i < scope.spans.length; i++) {
441
+ const candidate = scope.spans[i];
442
+ if (candidate?.parentId === frame.replayParentSpanId) frame.ops.push({
443
+ kind: "subSpan",
444
+ span: serializeSubSpanTree(scope, candidate.id)
445
+ });
446
+ }
447
+ }
448
+ function replayRecording(scope, parentSpan, recording, options) {
449
+ scope.replayingDepth++;
450
+ try {
451
+ for (const op of recording.ops) applyRecordingOp(scope, parentSpan, op, options);
452
+ if (parentSpan !== void 0 && Object.keys(recording.finalAttributes).length > 0) mergeSpanAttributes$1(parentSpan, recording.finalAttributes);
453
+ if (parentSpan !== void 0 && recording.finalError !== void 0) parentSpan.error = recording.finalError;
454
+ if (parentSpan !== void 0 && recording.finalErrors !== void 0) parentSpan.errors = recording.finalErrors;
455
+ if (parentSpan !== void 0 && recording.finalWarning !== void 0) parentSpan.warning = recording.finalWarning;
456
+ if (parentSpan !== void 0 && recording.finalWarnings !== void 0) parentSpan.warnings = recording.finalWarnings;
457
+ } finally {
458
+ scope.replayingDepth--;
459
+ }
460
+ }
461
+ function applyRecordingOp(scope, parentSpan, op, options) {
462
+ if (op.kind === "setOutput") {
463
+ scope.outputs[op.key] = op.value;
464
+ return;
465
+ }
466
+ if (op.kind === "appendOutput") {
467
+ const existing = scope.outputs[op.key];
468
+ if (existing === void 0) scope.outputs[op.key] = [op.value];
469
+ else if (Array.isArray(existing)) scope.outputs[op.key] = [...copyArray(existing), op.value];
470
+ else scope.outputs[op.key] = [existing, op.value];
471
+ return;
472
+ }
473
+ if (op.kind === "mergeOutput") {
474
+ const existing = scope.outputs[op.key];
475
+ if (existing === void 0) scope.outputs[op.key] = { ...op.patch };
476
+ else if (isRecordLike$1(existing)) scope.outputs[op.key] = {
477
+ ...existing,
478
+ ...op.patch
479
+ };
480
+ else scope.assertionFailures.push({ message: `replay mergeEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected object` });
481
+ return;
482
+ }
483
+ if (op.kind === "incrementOutput") {
484
+ const existing = scope.outputs[op.key];
485
+ if (existing === void 0) scope.outputs[op.key] = op.delta;
486
+ else if (typeof existing === "number") scope.outputs[op.key] = existing + op.delta;
487
+ else scope.assertionFailures.push({ message: `replay incrementEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected number` });
488
+ return;
489
+ }
490
+ if (op.kind === "checkpoint") {
491
+ scope.checkpoints.set(op.name, op.data);
492
+ return;
493
+ }
494
+ replaySerializedSpan(scope, parentSpan?.id ?? null, op.span, options);
495
+ }
496
+ function replaySerializedSpan(scope, parentId, serialized, options) {
497
+ const id = options.generateSpanId();
498
+ const now = (/* @__PURE__ */ new Date()).toISOString();
499
+ const replayed = {
500
+ id,
501
+ parentId,
502
+ caseId: scope.caseId,
503
+ kind: serialized.kind,
504
+ name: serialized.name,
505
+ startedAt: now,
506
+ endedAt: now,
507
+ status: serialized.status,
508
+ attributes: serialized.attributes,
509
+ error: serialized.error,
510
+ errors: serialized.errors,
511
+ warning: serialized.warning,
512
+ warnings: serialized.warnings
513
+ };
514
+ scope.spans.push(replayed);
515
+ for (const child of serialized.children) replaySerializedSpan(scope, id, child, options);
516
+ }
517
+ //#endregion
518
+ //#region ../sdk/src/traceDiagnostics.ts
194
519
  const errorCoreFields = new Set([
195
520
  "name",
196
521
  "message",
197
522
  "stack",
198
523
  "capturedAt"
199
524
  ]);
200
- function generateSpanId() {
201
- spanIdCounter++;
202
- return `span_${String(Date.now())}_${String(spanIdCounter)}`;
203
- }
204
525
  function isRecord$2(value) {
205
526
  return typeof value === "object" && value !== null && !Array.isArray(value);
206
527
  }
@@ -221,33 +542,6 @@ function formatUnknownErrorMessage(error) {
221
542
  function getErrorExtraFields(error) {
222
543
  return Object.fromEntries(Object.entries(error).filter(([key]) => !errorCoreFields.has(key)));
223
544
  }
224
- function updateCurrentSpan(update) {
225
- const currentSpan = getCurrentScope()?.activeSpanStack.at(-1);
226
- if (!currentSpan) return;
227
- update(currentSpan);
228
- }
229
- function noopActiveSpan() {
230
- return {
231
- setName() {},
232
- setAttribute() {},
233
- setAttributes() {}
234
- };
235
- }
236
- function noopExternalSpan(id) {
237
- return {
238
- id,
239
- setName() {},
240
- setAttribute() {},
241
- setAttributes() {},
242
- end() {}
243
- };
244
- }
245
- function mergeSpanAttributes(span, attributes) {
246
- span.attributes = {
247
- ...span.attributes,
248
- ...attributes
249
- };
250
- }
251
545
  function normalizeTraceError(error, capturedAt = void 0) {
252
546
  if (error instanceof Error) return {
253
547
  ...getErrorExtraFields(error),
@@ -274,19 +568,233 @@ function normalizeTraceError(error, capturedAt = void 0) {
274
568
  capturedAt
275
569
  };
276
570
  }
277
- function normalizeTraceErrors(errorOrErrors, additionalErrors, capturedAt) {
278
- return (additionalErrors.length > 0 ? [errorOrErrors, ...additionalErrors] : Array.isArray(errorOrErrors) ? errorOrErrors : [errorOrErrors]).map((error) => normalizeTraceError(error, capturedAt));
571
+ function normalizeTraceErrors(errorOrErrors, additionalErrors, capturedAt) {
572
+ return (additionalErrors.length > 0 ? [errorOrErrors, ...additionalErrors] : Array.isArray(errorOrErrors) ? errorOrErrors : [errorOrErrors]).map((error) => normalizeTraceError(error, capturedAt));
573
+ }
574
+ function normalizeTraceWarnings(warningOrWarnings, additionalWarnings, capturedAt) {
575
+ return (additionalWarnings.length > 0 ? [warningOrWarnings, ...additionalWarnings] : Array.isArray(warningOrWarnings) ? warningOrWarnings : [warningOrWarnings]).map((warning) => normalizeTraceError(warning, capturedAt));
576
+ }
577
+ function isCaptureEvalSpanErrorOptions(value) {
578
+ if (!isRecord$2(value)) return false;
579
+ const keys = Object.keys(value);
580
+ if (keys.length === 0) return false;
581
+ if (!keys.every((key) => key === "level")) return false;
582
+ return value.level === void 0 || isCaptureEvalSpanErrorLevel(value.level);
583
+ }
584
+ function isCaptureEvalSpanErrorLevel(value) {
585
+ return value === "error" || value === "warning";
586
+ }
587
+ function splitCaptureEvalSpanErrorArgs(additionalErrorsOrOptions) {
588
+ const lastArg = additionalErrorsOrOptions.at(-1);
589
+ if (isCaptureEvalSpanErrorLevel(lastArg)) return {
590
+ additionalErrors: additionalErrorsOrOptions.slice(0, -1),
591
+ options: { level: lastArg }
592
+ };
593
+ if (isCaptureEvalSpanErrorOptions(lastArg)) return {
594
+ additionalErrors: additionalErrorsOrOptions.slice(0, -1),
595
+ options: lastArg
596
+ };
597
+ return {
598
+ additionalErrors: additionalErrorsOrOptions,
599
+ options: {}
600
+ };
601
+ }
602
+ function appendSpanErrors(span, errors) {
603
+ if (errors.length === 0) return;
604
+ const latestError = errors.at(-1);
605
+ if (latestError === void 0) return;
606
+ span.errors = [...span.errors ?? [], ...errors];
607
+ span.error = latestError;
608
+ span.status = "error";
609
+ }
610
+ function appendSpanWarnings(span, warnings) {
611
+ if (warnings.length === 0) return;
612
+ const latestWarning = warnings.at(-1);
613
+ if (latestWarning === void 0) return;
614
+ span.warnings = [...span.warnings ?? [], ...warnings];
615
+ span.warning = latestWarning;
616
+ }
617
+ function hasSpanError(span) {
618
+ return span.error !== void 0 || (span.errors?.length ?? 0) > 0;
619
+ }
620
+ //#endregion
621
+ //#region ../sdk/src/valueCache.ts
622
+ function createTraceCache(generateSpanId) {
623
+ return async function traceCache(info, fn) {
624
+ const scope = getCurrentScope();
625
+ if (!scope) return await fn();
626
+ const cacheCtx = scope.cacheContext;
627
+ if (cacheCtx === void 0 || scope.replayingDepth > 0) return await fn();
628
+ const namespace = info.namespace ?? `${cacheCtx.evalId}__${info.name}`;
629
+ const keyHash = await hashCacheKey({
630
+ namespace,
631
+ codeFingerprint: cacheCtx.codeFingerprint,
632
+ key: info.key
633
+ });
634
+ const activeSpan = scope.activeSpanStack.at(-1);
635
+ if (cacheCtx.mode === "use") {
636
+ const hit = await cacheCtx.adapter.lookup(namespace, keyHash);
637
+ if (hit) {
638
+ const storedAt = hit.storedAt;
639
+ const age = Date.now() - new Date(storedAt).getTime();
640
+ appendCacheRef(activeSpan, {
641
+ type: "value",
642
+ name: info.name,
643
+ namespace,
644
+ key: keyHash,
645
+ status: "hit",
646
+ storedAt,
647
+ age
648
+ });
649
+ replayRecording(scope, activeSpan, hit.recording, { generateSpanId });
650
+ return hit.recording.returnValue;
651
+ }
652
+ appendCacheRef(activeSpan, {
653
+ type: "value",
654
+ name: info.name,
655
+ namespace,
656
+ key: keyHash,
657
+ status: "miss"
658
+ });
659
+ } else if (cacheCtx.mode === "refresh") appendCacheRef(activeSpan, {
660
+ type: "value",
661
+ name: info.name,
662
+ namespace,
663
+ key: keyHash,
664
+ status: "refresh"
665
+ });
666
+ else appendCacheRef(activeSpan, {
667
+ type: "value",
668
+ name: info.name,
669
+ namespace,
670
+ key: keyHash,
671
+ status: "bypass"
672
+ });
673
+ const beforeAttributes = snapshotNonCacheAttributes(activeSpan);
674
+ const frame = {
675
+ baseSpanIndex: scope.spans.length,
676
+ replayParentSpanId: activeSpan?.id ?? null,
677
+ ops: []
678
+ };
679
+ scope.recordingStack.push(frame);
680
+ let bodyResult;
681
+ try {
682
+ bodyResult = await fn();
683
+ } finally {
684
+ scope.recordingStack.pop();
685
+ }
686
+ appendSubSpanOps(scope, frame);
687
+ if (cacheCtx.mode !== "bypass") {
688
+ const finalAttributes = diffNonCacheAttributes(beforeAttributes, snapshotNonCacheAttributes(activeSpan));
689
+ const recording = {
690
+ returnValue: toJsonSafe(bodyResult),
691
+ finalAttributes,
692
+ ops: frame.ops
693
+ };
694
+ await cacheCtx.adapter.write({
695
+ version: 1,
696
+ key: keyHash,
697
+ namespace,
698
+ operationType: "value",
699
+ operationName: info.name,
700
+ storedAt: (/* @__PURE__ */ new Date()).toISOString(),
701
+ codeFingerprint: cacheCtx.codeFingerprint,
702
+ recording
703
+ });
704
+ }
705
+ return bodyResult;
706
+ };
707
+ }
708
+ //#endregion
709
+ //#region ../sdk/src/tracer.ts
710
+ let spanIdCounter = 0;
711
+ function generateSpanId() {
712
+ spanIdCounter++;
713
+ return `span_${String(Date.now())}_${String(spanIdCounter)}`;
714
+ }
715
+ function updateCurrentSpan(update) {
716
+ const currentSpan = getCurrentScope()?.activeSpanStack.at(-1);
717
+ if (!currentSpan) return;
718
+ update(currentSpan);
719
+ }
720
+ function noopActiveSpan() {
721
+ return {
722
+ setName() {},
723
+ setAttribute() {},
724
+ setAttributes() {},
725
+ incrementAttribute() {},
726
+ appendToAttribute() {},
727
+ mergeAttribute() {}
728
+ };
729
+ }
730
+ function noopExternalSpan(id) {
731
+ return {
732
+ id,
733
+ setName() {},
734
+ setAttribute() {},
735
+ setAttributes() {},
736
+ incrementAttribute() {},
737
+ appendToAttribute() {},
738
+ mergeAttribute() {},
739
+ end() {}
740
+ };
741
+ }
742
+ function mergeSpanAttributes(span, attributes) {
743
+ span.attributes = {
744
+ ...span.attributes,
745
+ ...attributes
746
+ };
279
747
  }
280
- function appendSpanErrors(span, errors) {
281
- if (errors.length === 0) return;
282
- const latestError = errors.at(-1);
283
- if (latestError === void 0) return;
284
- span.errors = [...span.errors ?? [], ...errors];
285
- span.error = latestError;
286
- span.status = "error";
748
+ function isRecordLike(value) {
749
+ return typeof value === "object" && value !== null && !Array.isArray(value);
287
750
  }
288
- function hasSpanError(span) {
289
- return span.error !== void 0 || (span.errors?.length ?? 0) > 0;
751
+ function valueKind(value) {
752
+ return Array.isArray(value) ? "array" : typeof value;
753
+ }
754
+ function recordSpanAttributeAssertion(message) {
755
+ const scope = getCurrentScope();
756
+ if (!scope) return;
757
+ scope.assertionFailures.push({ message });
758
+ }
759
+ function incrementSpanAttribute(span, key, delta) {
760
+ const existing = span.attributes?.[key];
761
+ if (existing === void 0) {
762
+ mergeSpanAttributes(span, { [key]: delta });
763
+ return;
764
+ }
765
+ if (typeof existing !== "number") {
766
+ recordSpanAttributeAssertion(`evalSpan.incrementAttribute("${key}"): existing value is ${valueKind(existing)}, expected number`);
767
+ return;
768
+ }
769
+ mergeSpanAttributes(span, { [key]: existing + delta });
770
+ }
771
+ function appendToSpanAttribute(span, key, value) {
772
+ const existing = span.attributes?.[key];
773
+ if (existing === void 0) {
774
+ mergeSpanAttributes(span, { [key]: [value] });
775
+ return;
776
+ }
777
+ if (Array.isArray(existing)) {
778
+ const items = existing.map((item) => item);
779
+ mergeSpanAttributes(span, { [key]: [...items, value] });
780
+ return;
781
+ }
782
+ mergeSpanAttributes(span, { [key]: [existing, value] });
783
+ }
784
+ function mergeSpanAttribute(span, key, patch) {
785
+ const existing = span.attributes?.[key];
786
+ if (existing === void 0) {
787
+ mergeSpanAttributes(span, { [key]: { ...patch } });
788
+ return;
789
+ }
790
+ if (!isRecordLike(existing)) {
791
+ recordSpanAttributeAssertion(`evalSpan.mergeAttribute("${key}"): existing value is ${valueKind(existing)}, expected object`);
792
+ return;
793
+ }
794
+ mergeSpanAttributes(span, { [key]: {
795
+ ...existing,
796
+ ...patch
797
+ } });
290
798
  }
291
799
  function finishSpanWithoutThrownError(span) {
292
800
  span.status = hasSpanError(span) ? "error" : "ok";
@@ -302,9 +810,25 @@ function createSpanHandle(span) {
302
810
  },
303
811
  setAttributes(value) {
304
812
  mergeSpanAttributes(span, value);
813
+ },
814
+ incrementAttribute(key, delta) {
815
+ incrementSpanAttribute(span, key, delta);
816
+ },
817
+ appendToAttribute(key, value) {
818
+ appendToSpanAttribute(span, key, value);
819
+ },
820
+ mergeAttribute(key, patch) {
821
+ mergeSpanAttribute(span, key, patch);
305
822
  }
306
823
  };
307
824
  }
825
+ function updateExternalSpanRecord(id, update) {
826
+ const scope = getCurrentScope();
827
+ if (!scope) return;
828
+ const span = findSpan(scope, id);
829
+ if (!span) return;
830
+ update(span);
831
+ }
308
832
  function createExternalSpanHandle(id) {
309
833
  return {
310
834
  id,
@@ -326,6 +850,21 @@ function createExternalSpanHandle(id) {
326
850
  attributes: value
327
851
  });
328
852
  },
853
+ incrementAttribute(key, delta) {
854
+ updateExternalSpanRecord(id, (span) => {
855
+ incrementSpanAttribute(span, key, delta);
856
+ });
857
+ },
858
+ appendToAttribute(key, value) {
859
+ updateExternalSpanRecord(id, (span) => {
860
+ appendToSpanAttribute(span, key, value);
861
+ });
862
+ },
863
+ mergeAttribute(key, patch) {
864
+ updateExternalSpanRecord(id, (span) => {
865
+ mergeSpanAttribute(span, key, patch);
866
+ });
867
+ },
329
868
  end(info = {}) {
330
869
  endExternalSpan({
331
870
  ...info,
@@ -382,6 +921,8 @@ function updateExternalSpan(info) {
382
921
  if (info.name !== void 0) span.name = info.name;
383
922
  if (info.status !== void 0) span.status = info.status;
384
923
  if (info.error !== void 0) span.error = info.error;
924
+ if (info.warning !== void 0) span.warning = info.warning;
925
+ if (info.warnings !== void 0) span.warnings = info.warnings;
385
926
  if (info.attributes !== void 0) mergeSpanAttributes(span, info.attributes);
386
927
  }
387
928
  function endExternalSpan(info) {
@@ -410,6 +951,8 @@ function recordExternalSpan(info) {
410
951
  existing.status = status;
411
952
  existing.attributes = info.attributes;
412
953
  existing.error = info.error;
954
+ existing.warning = info.warning;
955
+ existing.warnings = info.warnings;
413
956
  return id;
414
957
  }
415
958
  scope.spans.push({
@@ -422,7 +965,9 @@ function recordExternalSpan(info) {
422
965
  endedAt,
423
966
  status,
424
967
  attributes: info.attributes,
425
- error: info.error
968
+ error: info.error,
969
+ warning: info.warning,
970
+ warnings: info.warnings
426
971
  });
427
972
  return id;
428
973
  }
@@ -446,16 +991,42 @@ const evalSpan = {
446
991
  updateCurrentSpan((currentSpan) => {
447
992
  mergeSpanAttributes(currentSpan, value);
448
993
  });
994
+ },
995
+ incrementAttribute(key, delta) {
996
+ updateCurrentSpan((currentSpan) => {
997
+ incrementSpanAttribute(currentSpan, key, delta);
998
+ });
999
+ },
1000
+ appendToAttribute(key, value) {
1001
+ updateCurrentSpan((currentSpan) => {
1002
+ appendToSpanAttribute(currentSpan, key, value);
1003
+ });
1004
+ },
1005
+ mergeAttribute(key, patch) {
1006
+ updateCurrentSpan((currentSpan) => {
1007
+ mergeSpanAttribute(currentSpan, key, patch);
1008
+ });
449
1009
  }
450
1010
  };
451
1011
  /**
452
1012
  * Attach one or more recoverable errors to the active eval span.
453
1013
  *
454
- * The active span is marked as `error` even if its callback later completes
455
- * without throwing. Calls outside `evalTracer.span(...)` are ignored.
1014
+ * By default the active span is marked as `error` even if its callback later
1015
+ * completes without throwing. Pass `'warning'` or `{ level: 'warning' }` as the
1016
+ * final argument to record the diagnostic without changing span status. Calls
1017
+ * outside `evalTracer.span(...)` are ignored.
456
1018
  */
457
- function captureEvalSpanError(errorOrErrors, ...additionalErrors) {
458
- const errors = normalizeTraceErrors(errorOrErrors, additionalErrors, (/* @__PURE__ */ new Date()).toISOString());
1019
+ function captureEvalSpanError(errorOrErrors, ...additionalErrorsOrOptions) {
1020
+ const { additionalErrors, options } = splitCaptureEvalSpanErrorArgs(additionalErrorsOrOptions);
1021
+ const capturedAt = (/* @__PURE__ */ new Date()).toISOString();
1022
+ if ((options.level ?? "error") === "warning") {
1023
+ const warnings = normalizeTraceWarnings(errorOrErrors, additionalErrors, capturedAt);
1024
+ updateCurrentSpan((currentSpan) => {
1025
+ appendSpanWarnings(currentSpan, warnings);
1026
+ });
1027
+ return;
1028
+ }
1029
+ const errors = normalizeTraceErrors(errorOrErrors, additionalErrors, capturedAt);
459
1030
  updateCurrentSpan((currentSpan) => {
460
1031
  appendSpanErrors(currentSpan, errors);
461
1032
  });
@@ -503,7 +1074,7 @@ async function traceSpan(info, fn) {
503
1074
  "cache.storedAt": storedAt,
504
1075
  "cache.age": Date.now() - new Date(storedAt).getTime()
505
1076
  });
506
- replayRecording(scope, spanRecord, hit.recording);
1077
+ replayRecording(scope, spanRecord, hit.recording, { generateSpanId });
507
1078
  spanRecord.status = hit.recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
508
1079
  spanRecord.endedAt = (/* @__PURE__ */ new Date()).toISOString();
509
1080
  return hit.recording.returnValue;
@@ -513,7 +1084,7 @@ async function traceSpan(info, fn) {
513
1084
  else mergeSpanAttributes(spanRecord, { "cache.status": "bypass" });
514
1085
  const frame = {
515
1086
  baseSpanIndex: scope.spans.length,
516
- cachedSpanId: id,
1087
+ replayParentSpanId: id,
517
1088
  ops: []
518
1089
  };
519
1090
  scope.recordingStack.push(frame);
@@ -532,12 +1103,16 @@ async function traceSpan(info, fn) {
532
1103
  finalStatus: spanRecord.status,
533
1104
  finalError: spanRecord.error,
534
1105
  finalErrors: spanRecord.errors,
1106
+ finalWarning: spanRecord.warning,
1107
+ finalWarnings: spanRecord.warnings,
535
1108
  ops: frame.ops
536
1109
  };
537
1110
  const entry = {
538
1111
  version: 1,
539
1112
  key: keyHash,
540
1113
  namespace,
1114
+ operationType: "span",
1115
+ operationName: info.name,
541
1116
  spanName: info.name,
542
1117
  spanKind: info.kind,
543
1118
  storedAt: (/* @__PURE__ */ new Date()).toISOString(),
@@ -569,6 +1144,13 @@ const evalTracer = {
569
1144
  /** Run a callback inside a new trace span and record its lifecycle. */
570
1145
  span: traceSpan,
571
1146
  /**
1147
+ * Cache a pure value without creating a trace span.
1148
+ *
1149
+ * When called inside an active span, the span receives a `cache.refs` entry
1150
+ * describing the value cache status for this run.
1151
+ */
1152
+ cache: createTraceCache(generateSpanId),
1153
+ /**
572
1154
  * Start a span whose lifecycle is controlled by an external tracer/exporter.
573
1155
  *
574
1156
  * Calls are no-ops outside an eval case scope, except that a generated or
@@ -649,194 +1231,6 @@ function buildTraceTree(spans, checkpoints) {
649
1231
  checkpoints
650
1232
  };
651
1233
  }
652
/**
 * Marker wrapper for a cache-key component whose serialized form has already
 * been computed; `stringifyCacheKeyValue` unwraps it and returns `value` as is.
 */
var SerializedCacheKeyValue = class {
  /** @param {*} serialized Pre-computed serialized form to carry. */
  constructor(serialized) {
    this.value = serialized;
  }
};
658
/**
 * Compute a deterministic hex digest for the components of a cache key.
 *
 * The input is first materialized asynchronously so that native `Blob` and
 * `File` values are hashed by content; prefer `hashCacheKeySync` when the key
 * holds no such values.
 *
 * @param {*} input Cache key components.
 * @returns {Promise<string>} Hex-encoded digest.
 */
async function hashCacheKey(input) {
  const materialized = await materializeAsyncCacheKeyValue(input);
  return hashCacheKeySyncMaterialized(materialized);
}
667
/**
 * Synchronous variant of `hashCacheKey`.
 *
 * Handles JSON-like data and in-memory binary values (`Buffer`,
 * `ArrayBuffer`, typed arrays). Native `Blob` / `File` values cannot be
 * content-hashed on this path.
 *
 * @param {*} input Cache key components.
 * @returns {string} Hex-encoded digest.
 */
function hashCacheKeySync(input) {
  const digest = hashCacheKeySyncMaterialized(input);
  return digest;
}
675
/**
 * Hash an already-materialized cache key: build its composite key string
 * (using `stringifyCacheKeyValue` for special values) and SHA-256 it.
 *
 * @param {*} input Materialized cache key components.
 * @returns {string} Hex-encoded SHA-256 digest.
 */
function hashCacheKeySyncMaterialized(input) {
  const hasher = createHash("sha256");
  const compositeKey = getCompositeKey(input, { stringify: stringifyCacheKeyValue });
  hasher.update(compositeKey);
  return hasher.digest("hex");
}
678
/**
 * Custom stringifier for cache key components.
 *
 * Returns a tagged string for pre-serialized wrappers, binary containers
 * (hashed by bytes) and `File` / `Blob` values (described by metadata only).
 * Returns `undefined` for every other value.
 *
 * @param {*} value Candidate cache key component.
 * @returns {string | undefined}
 */
function stringifyCacheKeyValue(value) {
  if (value instanceof SerializedCacheKeyValue) {
    return value.value;
  }
  if (Buffer$1.isBuffer(value)) {
    return `$buffer:${hashBytes(value)}`;
  }
  if (isArrayBuffer(value)) {
    return `$arrayBuffer:${hashBytes(new Uint8Array(value))}`;
  }
  if (isSharedArrayBuffer(value)) {
    return `$sharedArrayBuffer:${hashBytes(new Uint8Array(value))}`;
  }
  if (isArrayBufferView(value)) {
    // Hash only the window the view exposes, not the whole backing buffer.
    const { buffer, byteOffset, byteLength } = value;
    const viewBytes = new Uint8Array(buffer, byteOffset, byteLength);
    return `$${value.constructor.name}:${hashBytes(viewBytes)}`;
  }
  // `File` is tested before `Blob` so files keep their own tag and metadata.
  if (isFile$1(value)) {
    const { lastModified, name, size, type } = value;
    return `$file:${getCompositeKey({
      lastModified,
      name,
      size,
      type
    })}`;
  }
  if (isBlob$1(value)) {
    const { size, type } = value;
    return `$blob:${getCompositeKey({
      size,
      type
    })}`;
  }
  return void 0;
}
698
/**
 * Recursively replace async-only cache key values (native `Blob` / `File`)
 * with `SerializedCacheKeyValue` wrappers so the result can be hashed
 * synchronously.
 *
 * Values the synchronous stringifier already handles, and primitives, are
 * returned unchanged. Arrays and plain objects are rebuilt with materialized
 * children.
 *
 * @param {*} value Cache key component to materialize.
 * @param {WeakSet<object>} [refs] Containers on the current traversal path,
 *   used for cycle detection.
 * @returns {Promise<*>} Materialized value.
 * @throws {Error} "Circular reference detected" when a container (object or
 *   array) contains itself, directly or indirectly.
 */
async function materializeAsyncCacheKeyValue(value, refs = /* @__PURE__ */ new WeakSet()) {
  const serialized = await stringifyAsyncCacheKeyValue(value);
  if (serialized !== void 0) return new SerializedCacheKeyValue(serialized);
  if (stringifyCacheKeyValue(value) !== void 0) return value;
  if (!value || typeof value !== "object") return value;
  // Arrays are tracked as well as objects: a self-containing array would
  // otherwise recurse forever instead of reporting the cycle.
  if (refs.has(value)) throw new Error("Circular reference detected");
  refs.add(value);
  try {
    if (Array.isArray(value)) {
      const items = [];
      for (const item of value) items.push(await materializeAsyncCacheKeyValue(item, refs));
      return items;
    }
    const entries = [];
    for (const [key, entryValue] of Object.entries(value)) entries.push([key, await materializeAsyncCacheKeyValue(entryValue, refs)]);
    return Object.fromEntries(entries);
  } finally {
    // Path-based tracking: a value may legitimately appear in sibling branches.
    refs.delete(value);
  }
}
715
/**
 * Content-aware stringifier for native `File` / `Blob` values: reads the
 * bytes, hashes them, and combines the digest with the value's metadata.
 * Resolves to `undefined` for any other value.
 *
 * @param {*} value Candidate cache key component.
 * @returns {Promise<string | undefined>}
 */
async function stringifyAsyncCacheKeyValue(value) {
  if (isFile$1(value)) {
    const bytes = await hashBlobBytes(value);
    return `$file:${getCompositeKey({
      bytes,
      lastModified: value.lastModified,
      name: value.name,
      size: value.size,
      type: value.type
    })}`;
  }
  if (isBlob$1(value)) {
    const bytes = await hashBlobBytes(value);
    return `$blob:${getCompositeKey({
      bytes,
      size: value.size,
      type: value.type
    })}`;
  }
  return void 0;
}
729
/**
 * Read a blob-like value fully into memory and hash its bytes.
 *
 * @param {{ arrayBuffer(): Promise<ArrayBuffer> }} value Blob-like value.
 * @returns {Promise<string>} Digest produced by `hashBytes`.
 */
async function hashBlobBytes(value) {
  const contents = await value.arrayBuffer();
  return hashBytes(new Uint8Array(contents));
}
732
/**
 * SHA-256 the given data and return the digest as lowercase hex.
 *
 * @param {string | Buffer | Uint8Array} value Data accepted by `Hash#update`.
 * @returns {string} Hex-encoded SHA-256 digest.
 */
function hashBytes(value) {
  const hasher = createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
}
735
- function isArrayBuffer(value) {
736
- return value instanceof ArrayBuffer;
737
- }
738
- function isSharedArrayBuffer(value) {
739
- return value instanceof SharedArrayBuffer;
740
- }
741
- function isArrayBufferView(value) {
742
- return ArrayBuffer.isView(value);
743
- }
744
- function isBlob$1(value) {
745
- return value instanceof Blob;
746
- }
747
- function isFile$1(value) {
748
- return value instanceof File;
749
- }
750
/**
 * Round-trip a value through JSON so only JSON-representable data remains.
 *
 * @param {*} value Value to normalize.
 * @returns {*} The parsed JSON copy, or `undefined` when `value` is
 *   `undefined` or has no JSON representation (functions, symbols).
 */
function toJsonSafe(value) {
  if (value === void 0) return void 0;
  const text = JSON.stringify(value);
  // JSON.stringify yields `undefined` (not a string) for functions and
  // symbols; JSON.parse(undefined) would throw a SyntaxError.
  if (text === void 0) return void 0;
  return JSON.parse(text);
}
755
/**
 * Copy span attributes, leaving out every key that starts with `cache.`.
 *
 * @param {Record<string, unknown> | undefined | null} attributes
 * @returns {Record<string, unknown>} New object; empty when `attributes` is falsy.
 */
function stripCacheAttributes(attributes) {
  const kept = {};
  if (!attributes) return kept;
  for (const [name, attributeValue] of Object.entries(attributes)) {
    if (name.startsWith("cache.")) continue;
    kept[name] = attributeValue;
  }
  return kept;
}
761
/**
 * Build a serializable snapshot of the span `spanId` and all its descendants
 * from `scope.spans`.
 *
 * When no span with that id exists, a placeholder node
 * (`kind: "custom"`, `name: "unknown"`, `status: "ok"`) is returned.
 *
 * @param {{ spans: Array<object> }} scope Scope holding the recorded spans.
 * @param {string} spanId Id of the subtree root.
 * @returns {object} Node with `kind`, `name`, `attributes`, `status`,
 *   `error`, `errors` and recursive `children`.
 */
function serializeSubSpanTree(scope, spanId) {
  const source = scope.spans.find((span) => span.id === spanId);
  if (!source) {
    return {
      kind: "custom",
      name: "unknown",
      attributes: void 0,
      status: "ok",
      error: void 0,
      errors: void 0,
      children: []
    };
  }
  // Direct children are collected first, then each is serialized in order.
  const directChildren = scope.spans.filter((span) => span.parentId === spanId);
  const children = [];
  for (const child of directChildren) children.push(serializeSubSpanTree(scope, child.id));
  const { kind, name, attributes, status, error, errors } = source;
  return {
    kind,
    name,
    attributes,
    status,
    error,
    errors,
    children
  };
}
783
/**
 * Append a `subSpan` op to `frame.ops` for every span recorded since the
 * frame was opened (index >= `frame.baseSpanIndex`) whose parent is the
 * cached span (`frame.cachedSpanId`).
 *
 * @param {{ spans: Array<object> }} scope Scope holding the recorded spans.
 * @param {{ baseSpanIndex: number, cachedSpanId: string, ops: Array<object> }} frame
 */
function appendSubSpanOps(scope, frame) {
  let index = frame.baseSpanIndex;
  while (index < scope.spans.length) {
    const span = scope.spans[index];
    index++;
    if (span?.parentId !== frame.cachedSpanId) continue;
    frame.ops.push({
      kind: "subSpan",
      span: serializeSubSpanTree(scope, span.id)
    });
  }
}
792
/**
 * Re-apply a cached recording onto the current scope and parent span.
 *
 * All recorded ops are applied in order, then the recording's final
 * attributes / error / errors are copied to `parentSpan`.
 * `scope.replayingDepth` is raised for the duration and always restored.
 *
 * @param {object} scope Active eval case scope.
 * @param {object} parentSpan Span that stands in for the cached span.
 * @param {object} recording Cached recording to replay.
 */
function replayRecording(scope, parentSpan, recording) {
  scope.replayingDepth++;
  try {
    for (const recordedOp of recording.ops) {
      applyRecordingOp(scope, parentSpan, recordedOp);
    }
    const hasFinalAttributes = Object.keys(recording.finalAttributes).length > 0;
    if (hasFinalAttributes) {
      mergeSpanAttributes(parentSpan, recording.finalAttributes);
    }
    if (recording.finalError !== void 0) {
      parentSpan.error = recording.finalError;
    }
    if (recording.finalErrors !== void 0) {
      parentSpan.errors = recording.finalErrors;
    }
  } finally {
    scope.replayingDepth--;
  }
}
803
/**
 * Apply one recorded operation to the scope.
 *
 * - `setOutput` overwrites `scope.outputs[key]`.
 * - `incrementOutput` adds `delta` to a numeric output (or initializes it);
 *   a non-numeric existing value is reported via `scope.assertionFailures`.
 * - `checkpoint` stores `data` under `name` in `scope.checkpoints`.
 * - any other op is treated as a sub-span and replayed under `parentSpan`.
 *
 * @param {object} scope Active eval case scope.
 * @param {{ id: string }} parentSpan Span that replayed sub-spans attach to.
 * @param {object} op Recorded operation.
 */
function applyRecordingOp(scope, parentSpan, op) {
  switch (op.kind) {
    case "setOutput":
      scope.outputs[op.key] = op.value;
      return;
    case "incrementOutput": {
      const current = scope.outputs[op.key];
      if (current === void 0) {
        scope.outputs[op.key] = op.delta;
      } else if (typeof current === "number") {
        scope.outputs[op.key] = current + op.delta;
      } else {
        scope.assertionFailures.push({ message: `replay incrementEvalOutput("${op.key}"): existing value is ${typeof current}, expected number` });
      }
      return;
    }
    case "checkpoint":
      scope.checkpoints.set(op.name, op.data);
      return;
    default:
      replaySerializedSpan(scope, parentSpan.id, op.span);
  }
}
821
/**
 * Recreate a serialized span (and, recursively, its children) in
 * `scope.spans` with a freshly generated id. Start and end timestamps are
 * both set to the moment of replay.
 *
 * @param {object} scope Active eval case scope (`spans`, `caseId`).
 * @param {string} parentId Id of the span the replayed span hangs under.
 * @param {object} serialized Serialized span node with `children`.
 */
function replaySerializedSpan(scope, parentId, serialized) {
  const replayedId = generateSpanId();
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  const { kind, name, status, attributes, error, errors } = serialized;
  scope.spans.push({
    id: replayedId,
    parentId,
    caseId: scope.caseId,
    kind,
    name,
    startedAt: timestamp,
    endedAt: timestamp,
    status,
    attributes,
    error,
    errors
  });
  for (const childNode of serialized.children) {
    replaySerializedSpan(scope, replayedId, childNode);
  }
}
840
1234
  //#endregion
841
1235
  //#region ../shared/src/schemas/display.ts
842
1236
  const scalarCellSchema = z.union([
@@ -976,6 +1370,8 @@ const traceSpanErrorSchema = z.object({
976
1370
  stack: z.string().optional(),
977
1371
  capturedAt: z.string().optional()
978
1372
  }).catchall(z.unknown());
1373
+ /** Schema for a warning attached to a trace span. */
1374
+ const traceSpanWarningSchema = traceSpanErrorSchema;
979
1375
  /** Schema for a persisted trace span captured during case execution. */
980
1376
  const traceSpanSchema = z.object({
981
1377
  id: z.string(),
@@ -993,7 +1389,9 @@ const traceSpanSchema = z.object({
993
1389
  ]),
994
1390
  attributes: z.record(z.string(), z.unknown()).optional(),
995
1391
  error: traceSpanErrorSchema.optional(),
996
- errors: z.array(traceSpanErrorSchema).optional()
1392
+ errors: z.array(traceSpanErrorSchema).optional(),
1393
+ warning: traceSpanWarningSchema.optional(),
1394
+ warnings: z.array(traceSpanWarningSchema).optional()
997
1395
  });
998
1396
  //#endregion
999
1397
  //#region ../shared/src/schemas/chart.ts
@@ -1254,12 +1652,16 @@ const spanCacheOptionsSchema = z.object({
1254
1652
  /** Override the default namespace (`${evalId}__${spanName}`). */
1255
1653
  namespace: z.string().optional()
1256
1654
  });
1655
+ /** Category of operation stored in the eval cache. */
1656
+ const cacheOperationTypeSchema = z.enum(["span", "value"]);
1257
1657
  /** Summary of a single persisted cache entry, used by list/delete endpoints. */
1258
1658
  const cacheListItemSchema = z.object({
1259
1659
  key: z.string(),
1260
1660
  namespace: z.string(),
1261
- spanName: z.string(),
1262
- spanKind: traceSpanKindSchema,
1661
+ operationType: cacheOperationTypeSchema,
1662
+ operationName: z.string(),
1663
+ spanName: z.string().optional(),
1664
+ spanKind: traceSpanKindSchema.optional(),
1263
1665
  storedAt: z.string(),
1264
1666
  codeFingerprint: z.string(),
1265
1667
  sizeBytes: z.number()
@@ -1276,7 +1678,9 @@ const serializedCacheSpanSchema = z.object({
1276
1678
  "cancelled"
1277
1679
  ]),
1278
1680
  error: traceSpanErrorSchema.optional(),
1279
- errors: z.array(traceSpanErrorSchema).optional()
1681
+ errors: z.array(traceSpanErrorSchema).optional(),
1682
+ warning: traceSpanWarningSchema.optional(),
1683
+ warnings: z.array(traceSpanWarningSchema).optional()
1280
1684
  }).extend({ children: z.lazy(() => z.array(serializedCacheSpanSchema)) });
1281
1685
  /**
1282
1686
  * One captured operation performed while a cached span's body executed.
@@ -1290,6 +1694,16 @@ const cacheRecordingOpSchema = z.discriminatedUnion("kind", [
1290
1694
  key: z.string(),
1291
1695
  value: z.unknown()
1292
1696
  }),
1697
+ z.object({
1698
+ kind: z.literal("appendOutput"),
1699
+ key: z.string(),
1700
+ value: z.unknown()
1701
+ }),
1702
+ z.object({
1703
+ kind: z.literal("mergeOutput"),
1704
+ key: z.string(),
1705
+ patch: z.record(z.string(), z.unknown())
1706
+ }),
1293
1707
  z.object({
1294
1708
  kind: z.literal("incrementOutput"),
1295
1709
  key: z.string(),
@@ -1317,6 +1731,8 @@ const cacheRecordingSchema = z.object({
1317
1731
  ]).optional(),
1318
1732
  finalError: traceSpanErrorSchema.optional(),
1319
1733
  finalErrors: z.array(traceSpanErrorSchema).optional(),
1734
+ finalWarning: traceSpanWarningSchema.optional(),
1735
+ finalWarnings: z.array(traceSpanWarningSchema).optional(),
1320
1736
  ops: z.array(cacheRecordingOpSchema)
1321
1737
  });
1322
1738
  /** Persisted cache file containing metadata and a recording. */
@@ -1324,8 +1740,10 @@ const cacheEntrySchema = z.object({
1324
1740
  version: z.literal(1),
1325
1741
  key: z.string(),
1326
1742
  namespace: z.string(),
1327
- spanName: z.string(),
1328
- spanKind: traceSpanKindSchema,
1743
+ operationType: cacheOperationTypeSchema.optional(),
1744
+ operationName: z.string().optional(),
1745
+ spanName: z.string().optional(),
1746
+ spanKind: traceSpanKindSchema.optional(),
1329
1747
  storedAt: z.string(),
1330
1748
  codeFingerprint: z.string(),
1331
1749
  recording: cacheRecordingSchema
@@ -1635,15 +2053,21 @@ function createFsCacheStore(options) {
1635
2053
  if (fileStatResult.error || !fileStatResult.value.isFile()) continue;
1636
2054
  const cacheFile = await readCacheFilePath(filePath);
1637
2055
  if (cacheFile === null) continue;
1638
- for (const entry of Object.values(cacheFile.entries)) items.push({
1639
- key: entry.key,
1640
- namespace: entry.namespace,
1641
- spanName: entry.spanName,
1642
- spanKind: entry.spanKind,
1643
- storedAt: entry.storedAt,
1644
- codeFingerprint: entry.codeFingerprint,
1645
- sizeBytes: Buffer.byteLength(JSON.stringify(entry), "utf8")
1646
- });
2056
+ for (const entry of Object.values(cacheFile.entries)) {
2057
+ const operationType = entry.operationType ?? "span";
2058
+ const operationName = entry.operationName ?? entry.spanName ?? entry.namespace;
2059
+ items.push({
2060
+ key: entry.key,
2061
+ namespace: entry.namespace,
2062
+ operationType,
2063
+ operationName,
2064
+ spanName: entry.spanName,
2065
+ spanKind: entry.spanKind,
2066
+ storedAt: entry.storedAt,
2067
+ codeFingerprint: entry.codeFingerprint,
2068
+ sizeBytes: Buffer.byteLength(JSON.stringify(entry), "utf8")
2069
+ });
2070
+ }
1647
2071
  }
1648
2072
  items.sort((a, b) => a.storedAt < b.storedAt ? 1 : -1);
1649
2073
  return items;
@@ -1804,80 +2228,6 @@ function safeJsonParse(text) {
1804
2228
  return parsed.value;
1805
2229
  }
1806
2230
  //#endregion
1807
- //#region ../runner/src/chartValidation.ts
1808
/**
 * Check that a column-backed chart metric refers to a declared column and,
 * when it aggregates with `passThresholdRate`, that the column is a score
 * with a numeric `passThreshold`. A warning is pushed for each rejection.
 *
 * @param {{ key: string, aggregate?: string }} metric Metric to validate.
 * @param {Map<string, object>} columnsByKey Declared columns keyed by `key`.
 * @param {string} evalId Eval id used as the warning prefix.
 * @param {string[]} warnings Collector that receives warning messages.
 * @returns {boolean} `true` when the metric may be kept.
 */
function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
  const column = columnsByKey.get(metric.key);
  if (!column) {
    warnings.push(`[${evalId}] chart metric references unknown column "${metric.key}" — dropped`);
    return false;
  }
  const needsThreshold = metric.aggregate === "passThresholdRate";
  const isThresholdScore = column.isScore === true && typeof column.passThreshold === "number";
  if (needsThreshold && !isThresholdScore) {
    warnings.push(`[${evalId}] chart metric "${metric.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
    return false;
  }
  return true;
}
1822
/**
 * Check that a column-backed tooltip extra refers to a declared column and,
 * when it aggregates with `passThresholdRate`, that the column is a score
 * with a numeric `passThreshold`. A warning is pushed for each rejection.
 *
 * @param {{ key: string, aggregate?: string }} extra Tooltip extra to validate.
 * @param {Map<string, object>} columnsByKey Declared columns keyed by `key`.
 * @param {string} evalId Eval id used as the warning prefix.
 * @param {string[]} warnings Collector that receives warning messages.
 * @returns {boolean} `true` when the extra may be kept.
 */
function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
  const column = columnsByKey.get(extra.key);
  if (!column) {
    warnings.push(`[${evalId}] chart tooltip extra references unknown column "${extra.key}" — dropped`);
    return false;
  }
  const needsThreshold = extra.aggregate === "passThresholdRate";
  const isThresholdScore = column.isScore === true && typeof column.passThreshold === "number";
  if (needsThreshold && !isThresholdScore) {
    warnings.push(`[${evalId}] chart tooltip extra "${extra.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
    return false;
  }
  return true;
}
1836
/**
 * Return a copy of `chart` containing only valid metrics and tooltip extras.
 *
 * Entries with `source === "builtin"` are always kept; others are validated
 * against the declared columns. Returns `null` (and records a warning) when
 * no metric survives. An empty or missing `tooltipExtras` becomes `undefined`.
 *
 * @param {object} chart Authored chart config.
 * @param {Map<string, object>} columnsByKey Declared columns keyed by `key`.
 * @param {string} evalId Eval id used as the warning prefix.
 * @param {string[]} warnings Collector that receives warning messages.
 * @returns {object | null}
 */
function sanitizeChart(chart, columnsByKey, evalId, warnings) {
  const keepMetric = (metric) => metric.source === "builtin" || isValidColumnMetric(metric, columnsByKey, evalId, warnings);
  const keepExtra = (extra) => extra.source === "builtin" || isValidTooltipExtra(extra, columnsByKey, evalId, warnings);
  const metrics = chart.metrics.filter(keepMetric);
  if (metrics.length === 0) {
    warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
    return null;
  }
  const keptExtras = chart.tooltipExtras?.filter(keepExtra);
  const tooltipExtras = keptExtras?.length ? keptExtras : void 0;
  return {
    ...chart,
    metrics,
    tooltipExtras
  };
}
1855
/**
 * Validate an authored `charts` config against the eval's declared columns.
 *
 * Each chart is passed through `sanitizeChart`; charts that come back empty
 * are dropped. `charts` is `undefined` in the result when the input is
 * missing/empty or when no chart survives, alongside the collected warnings.
 *
 * @param {{ charts?: object[], columnDefs: Array<{ key: string }>, evalId: string }} params
 * @returns {{ charts: object[] | undefined, warnings: string[] }}
 */
function validateCharts(params) {
  const { charts, columnDefs, evalId } = params;
  const warnings = [];
  if (!charts || charts.length === 0) {
    return {
      charts: void 0,
      warnings
    };
  }
  const columnsByKey = /* @__PURE__ */ new Map();
  for (const def of columnDefs) columnsByKey.set(def.key, def);
  const sanitized = [];
  for (const chart of charts) {
    const cleaned = sanitizeChart(chart, columnsByKey, evalId, warnings);
    if (!cleaned) continue;
    sanitized.push(cleaned);
  }
  return {
    charts: sanitized.length > 0 ? sanitized : void 0,
    warnings
  };
}
1880
- //#endregion
1881
2231
  //#region ../runner/src/columnBuilder.ts
1882
2232
  /**
1883
2233
  * Normalize a user-provided score definition (either a function or an
@@ -2087,98 +2437,6 @@ async function loadConfig() {
2087
2437
  }
2088
2438
  }
2089
2439
  //#endregion
2090
- //#region ../runner/src/discovery.ts
2091
- const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
2092
- const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
2093
/**
 * Scan source text for `defineEval` occurrences and extract the `id` (and
 * optional `title`) string literal from each call's object argument.
 *
 * Occurrences whose object literal cannot be extracted, or that have no
 * matching `id`, are skipped.
 *
 * @param {string} filePath Path recorded on every returned meta.
 * @param {string} content Source text to scan.
 * @returns {Array<{ filePath: string, id: string, title?: string }>}
 */
function parseEvalMetas(filePath, content) {
  const metas = [];
  for (let cursor = 0; cursor < content.length;) {
    const markerIndex = content.indexOf("defineEval", cursor);
    if (markerIndex === -1) break;
    const extracted = extractDefineEvalObject(content, markerIndex);
    if (!extracted) {
      // Step past this occurrence (10 === "defineEval".length) and keep scanning.
      cursor = markerIndex + 10;
      continue;
    }
    cursor = extracted.nextIndex;
    const idMatch = evalIdMatchRegex.exec(extracted.objectText);
    const id = idMatch?.[1];
    if (id === void 0) continue;
    const meta = {
      filePath,
      id
    };
    const titleMatch = evalTitleMatchRegex.exec(extracted.objectText);
    const title = titleMatch?.[1];
    if (title !== void 0) meta.title = title;
    metas.push(meta);
  }
  return metas;
}
2118
/**
 * Extract the text of the first `{ ... }` block that follows the first `(`
 * at or after `defineEvalIndex`, using a small scanner that ignores braces
 * inside line comments, block comments and quoted strings (`"`, `'`, `` ` ``).
 *
 * @param {string} content Source text.
 * @param {number} defineEvalIndex Index where the `defineEval` token starts.
 * @returns {{ nextIndex: number, objectText: string } | undefined}
 *   The balanced object text and the index just past its closing brace, or
 *   `undefined` when no `(`/`{` is found or the braces never balance.
 */
function extractDefineEvalObject(content, defineEvalIndex) {
  const parenAt = content.indexOf("(", defineEvalIndex);
  if (parenAt === -1) return void 0;
  const braceAt = content.indexOf("{", parenAt);
  if (braceAt === -1) return void 0;
  // Scanner state: exactly one mode is active at any time.
  let mode = "code";
  let openQuote;
  let escaped = false;
  let depth = 0;
  for (let i = braceAt; i < content.length; i++) {
    const ch = content[i];
    const ahead = content[i + 1];
    switch (mode) {
      case "lineComment":
        if (ch === "\n") mode = "code";
        break;
      case "blockComment":
        if (ch === "*" && ahead === "/") {
          mode = "code";
          i++; // consume the "/" of "*/"
        }
        break;
      case "string":
        if (escaped) escaped = false;
        else if (ch === "\\") escaped = true;
        else if (ch === openQuote) {
          openQuote = void 0;
          mode = "code";
        }
        break;
      default:
        if (ch === "/" && ahead === "/") {
          mode = "lineComment";
          i++;
        } else if (ch === "/" && ahead === "*") {
          mode = "blockComment";
          i++;
        } else if (ch === "\"" || ch === "'" || ch === "`") {
          openQuote = ch;
          mode = "string";
        } else if (ch === "{") {
          depth++;
        } else if (ch === "}") {
          depth--;
          if (depth === 0) {
            return {
              nextIndex: i + 1,
              objectText: content.slice(braceAt, i + 1)
            };
          }
        }
    }
  }
  return void 0;
}
2181
- //#endregion
2182
2440
  //#region ../runner/src/evalModuleLoader.ts
2183
2441
  /**
2184
2442
  * Import one eval module with a cache key derived from its current source so
@@ -2266,30 +2524,6 @@ function setLatestRunInfoMap(params) {
2266
2524
  for (const evalId of evalIds) latestRunInfoMap.set(evalId, info);
2267
2525
  }
2268
2526
  //#endregion
2269
- //#region ../runner/src/gitState.ts
2270
/**
 * Run `git` synchronously in `workspaceRoot` and capture its stdout.
 *
 * stdin and stderr are ignored. When the process cannot be spawned at all
 * (git not installed, `workspaceRoot` missing), `status` is `null` and
 * `stdout` is the empty string rather than throwing.
 *
 * @param {string} workspaceRoot Directory to run git in.
 * @param {string[]} args Arguments passed to git.
 * @returns {{ status: number | null, stdout: string }} Exit status and
 *   trimmed stdout.
 */
function runGitCommand(workspaceRoot, args) {
  const result = spawnSync("git", args, {
    cwd: workspaceRoot,
    encoding: "utf8",
    stdio: [
      "ignore",
      "pipe",
      "ignore"
    ]
  });
  // spawnSync leaves stdout as null when spawning fails; guard before trim.
  const stdout = result.stdout ?? "";
  return {
    status: result.status,
    stdout: stdout.trim()
  };
}
2285
/**
 * Read the current git commit for the workspace, if available.
 *
 * @param {string} workspaceRoot Directory to inspect.
 * @returns {{ commitSha: string | null }} `HEAD` commit sha, or `null` when
 *   the directory is not inside a work tree or `git rev-parse HEAD` fails.
 */
function readGitWorktreeState(workspaceRoot) {
  const probe = runGitCommand(workspaceRoot, ["rev-parse", "--is-inside-work-tree"]);
  const isInsideWorktree = probe.status === 0 && probe.stdout === "true";
  if (!isInsideWorktree) return { commitSha: null };
  const head = runGitCommand(workspaceRoot, ["rev-parse", "HEAD"]);
  if (head.status !== 0) return { commitSha: null };
  return { commitSha: head.stdout };
}
2292
- //#endregion
2293
2527
  //#region ../runner/src/outputArtifacts.ts
2294
2528
  const mimeTypeExtensionMap = {
2295
2529
  "application/json": ".json",
@@ -2345,100 +2579,329 @@ function normalizeMimeType(value) {
2345
2579
  const normalized = value.trim();
2346
2580
  return normalized.length > 0 ? normalized : "application/octet-stream";
2347
2581
  }
2348
- function getArtifactFileName(params) {
2349
- const { outputKey, mimeType, value } = params;
2350
- if (isFile(value) && value.name.trim().length > 0) return value.name.trim();
2351
- const extension = getExtensionForMimeType(mimeType);
2352
- return extension.length > 0 ? `${sanitizeSegment(outputKey)}${extension}` : sanitizeSegment(outputKey);
2582
/**
 * Pick a file name for an output artifact.
 *
 * A `File` value with a non-blank `name` keeps that (trimmed) name.
 * Otherwise the name is the sanitized output key plus the extension derived
 * from the MIME type, if any.
 *
 * @param {{ outputKey: string, mimeType: string, value: unknown }} params
 * @returns {string}
 */
function getArtifactFileName(params) {
  const { outputKey, mimeType, value } = params;
  if (isFile(value)) {
    const ownName = value.name.trim();
    if (ownName.length > 0) return ownName;
  }
  const extension = getExtensionForMimeType(mimeType);
  const baseName = sanitizeSegment(outputKey);
  if (extension.length === 0) return baseName;
  return `${baseName}${extension}`;
}
2588
/**
 * Map a MIME type to a file extension (including the leading dot).
 *
 * Known types come from `mimeTypeExtensionMap`; otherwise the subtype is
 * used with any `+suffix` removed (e.g. `image/svg+xml` → `.svg`). Returns
 * an empty string when no subtype can be derived.
 *
 * @param {string} mimeType
 * @returns {string}
 */
function getExtensionForMimeType(mimeType) {
  const known = mimeTypeExtensionMap[mimeType];
  if (known) return known;
  const [, subtype] = mimeType.split("/");
  if (subtype === void 0 || subtype.length === 0) return "";
  const [bareSubtype = subtype] = subtype.split("+");
  if (bareSubtype.length === 0) return "";
  return `.${bareSubtype}`;
}
2596
/**
 * Make a string safe to use as a single path segment: trim it and collapse
 * every run of characters outside `A-Za-z0-9._-` into one `-`.
 *
 * @param {string} value Raw segment.
 * @returns {string} Sanitized segment, or `"artifact"` when nothing remains.
 */
function sanitizeSegment(value) {
  const trimmed = value.trim();
  const cleaned = trimmed.replaceAll(/[^A-Za-z0-9._-]+/g, "-");
  if (cleaned.length === 0) return "artifact";
  return cleaned;
}
2600
/**
 * Sanitize a file name while keeping only its final extension: dots in the
 * stem are replaced with `-`, so `a.b.png` becomes `a-b.png`.
 *
 * @param {string} value Raw file name.
 * @returns {string}
 */
function sanitizeFileName(value) {
  const safeName = sanitizeSegment(value);
  const extension = extname(safeName);
  if (extension.length === 0) return safeName;
  const stem = safeName.slice(0, -extension.length);
  const flattenedStem = stem.replaceAll(".", "-");
  return `${flattenedStem}${extension}`;
}
2606
+ function isFile(value) {
2607
+ return value instanceof File;
2608
+ }
2609
+ //#endregion
2610
+ //#region ../runner/src/runMaintenance.ts
2611
/**
 * Write a run's state into its run directory: `summary.json` and `run.json`
 * (pretty-printed JSON) followed by `cases.jsonl` (one JSON document per
 * line, no trailing newline). Files are written one after another.
 *
 * @param {{ runDir: string, summary: object, manifest: object, cases: object[] }} runState
 * @returns {Promise<void>}
 */
async function persistRunState(runState) {
  // Contents are rendered lazily so each file is serialized right before it
  // is written, in the listed order.
  const files = [
    ["summary.json", () => JSON.stringify(runState.summary, null, 2)],
    ["run.json", () => JSON.stringify(runState.manifest, null, 2)],
    ["cases.jsonl", () => runState.cases.map((c) => JSON.stringify(c)).join("\n")]
  ];
  for (const [fileName, render] of files) {
    await writeFile(join(runState.runDir, fileName), render());
  }
}
2617
/**
 * Recompute a persisted case's status after score definitions changed.
 *
 * Checks run in this order:
 * 1. a `cancelled` row stays `cancelled`;
 * 2. a case detail carrying an error yields `error`;
 * 3. any recorded assertion failure yields `fail`;
 * 4. any thresholded score whose numeric value is below its `passThreshold`
 *    yields `fail` (non-numeric values are ignored; the row's column value
 *    takes precedence over the detail's);
 * 5. otherwise a row already marked `error` stays `error`, else `pass`.
 *
 * @param {{ status: string, columns: Record<string, unknown> }} caseRow
 * @param {object | undefined} caseDetail Persisted detail for the case, if any.
 * @param {Iterable<[string, number]>} scoreThresholds Score key → pass threshold.
 * @returns {"cancelled" | "error" | "fail" | "pass"}
 */
function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
  if (caseRow.status === "cancelled") return "cancelled";
  const detailError = caseDetail?.error;
  if (detailError !== null && detailError !== void 0) return "error";
  const assertionFailureCount = caseDetail?.assertionFailures.length ?? 0;
  if (assertionFailureCount > 0) return "fail";
  for (const [scoreKey, passThreshold] of scoreThresholds) {
    const score = caseRow.columns[scoreKey] ?? caseDetail?.columns[scoreKey];
    const isBelowThreshold = typeof score === "number" && score < passThreshold;
    if (isBelowThreshold) return "fail";
  }
  if (caseRow.status === "error") return "error";
  return "pass";
}
2636
/**
 * Decide whether a run involves the given eval.
 *
 * True when any case row belongs to the eval; otherwise it depends on the
 * run target: `all` → `evalExists`, `evalIds` → membership in
 * `target.evalIds`, anything else → false.
 *
 * @param {{ caseRows: Array<{ evalId: string }>, evalId: string, evalExists: boolean, target: { mode: string, evalIds?: string[] } }} params
 * @returns {boolean}
 */
function runTouchesEval(params) {
  const hasCaseForEval = params.caseRows.some((caseRow) => caseRow.evalId === params.evalId);
  if (hasCaseForEval) return true;
  switch (params.target.mode) {
    case "all":
      return params.evalExists;
    case "evalIds":
      return params.target.evalIds?.includes(params.evalId) ?? false;
    default:
      return false;
  }
}
2642
/**
 * Re-derive case statuses for one eval across the given runs and persist
 * every run whose statuses changed.
 *
 * Runs that do not touch the eval, or whose manifest is still `running`,
 * are skipped. For each changed case the row (and its detail, when present)
 * is updated and the detail persisted; afterwards the run summary counters
 * are refreshed from the case rows and the run state is written.
 *
 * @param {object} params `runs`, `evalId`, `evalExists`, `scoreThresholds`
 *   and a `persistCaseDetail(runDir, caseDetail)` callback.
 * @returns {Promise<number>} Number of runs that were updated.
 */
async function recomputeEvalStatusesInRuns(params) {
  let updatedRuns = 0;
  for (const run of params.runs) {
    const touchesEval = runTouchesEval({
      target: run.manifest.target,
      caseRows: run.cases,
      evalId: params.evalId,
      evalExists: params.evalExists
    });
    if (!touchesEval) continue;
    if (run.manifest.status === "running") continue;
    let changedCases = 0;
    for (const caseRow of run.cases) {
      if (caseRow.evalId !== params.evalId) continue;
      const caseDetail = run.caseDetails.get(caseRow.caseId);
      const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
      if (nextStatus === caseRow.status) continue;
      caseRow.status = nextStatus;
      if (caseDetail) {
        caseDetail.status = nextStatus;
        await params.persistCaseDetail(run.runDir, caseDetail);
      }
      changedCases += 1;
    }
    if (changedCases === 0) continue;
    const { totalCases, passedCases, failedCases, errorCases, cancelledCases } = deriveScopedSummaryFromCases({ caseRows: run.cases });
    Object.assign(run.summary, {
      totalCases,
      passedCases,
      failedCases,
      errorCases,
      cancelledCases
    });
    await persistRunState(run);
    updatedRuns += 1;
  }
  return updatedRuns;
}
2677
+ //#endregion
2678
+ //#region ../runner/src/runPersistence.ts
2679
+ const SHORT_ID_PATTERN = /^r(\d+)$/;
2680
/**
 * Generate a filesystem-safe, sortable run id combining a UTC timestamp
 * with a short random suffix.
 *
 * Format: `YYYY-MM-DDTHH-mm-ssZ_xxxxxx`, where the suffix is always exactly
 * six base-36 characters.
 *
 * @returns {string}
 */
function generateRunId() {
  const now = /* @__PURE__ */ new Date();
  const pad = (n) => String(n).padStart(2, "0");
  const date = `${String(now.getUTCFullYear())}-${pad(now.getUTCMonth() + 1)}-${pad(now.getUTCDate())}`;
  const time = `${pad(now.getUTCHours())}-${pad(now.getUTCMinutes())}-${pad(now.getUTCSeconds())}`;
  // toString(36) may produce fewer than six fractional digits (e.g. 0.5 ->
  // "0.i", 0 -> "0"); pad so the suffix is never short or empty.
  const suffix = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
  return `${date}T${time}Z_${suffix}`;
}
2689
/**
 * Extract the numeric part of a short run id matched by `SHORT_ID_PATTERN`.
 *
 * @param {string | undefined} shortId
 * @returns {number | null} The parsed number, or `null` when `shortId` is
 *   undefined, does not match the pattern, or is not a finite number.
 */
function parseShortIdNum(shortId) {
  if (shortId === void 0) return null;
  const digits = SHORT_ID_PATTERN.exec(shortId)?.[1];
  if (digits === void 0) return null;
  const parsed = Number(digits);
  return Number.isFinite(parsed) ? parsed : null;
}
2697
/**
 * Return the next `shortId` number to assign: one more than the highest
 * number found among the loaded snapshots, or `0` when none has a parsable
 * short id. Legacy runs whose short id does not match the `r\d+` format are
 * ignored.
 *
 * @param {Iterable<{ manifest: { shortId?: string } }>} snapshots
 * @returns {number}
 */
function nextShortIdFromSnapshots(snapshots) {
  let highest = -1;
  for (const { manifest } of snapshots) {
    const parsed = parseShortIdNum(manifest.shortId);
    if (parsed === null) continue;
    if (parsed > highest) highest = parsed;
  }
  return highest + 1;
}
2710
/**
 * Load every readable run snapshot under `<localStateDir>/runs`.
 *
 * Run directories are visited in sorted path order, one at a time; runs
 * that fail to load are skipped. Returns an empty list when the runs
 * directory cannot be read.
 *
 * @param {string} localStateDir
 * @returns {Promise<object[]>}
 */
async function loadPersistedRunSnapshots(localStateDir) {
  const runsDir = join(localStateDir, "runs");
  const listing = await resultify(() => readdir(runsDir, { withFileTypes: true }));
  if (listing.error) return [];
  const runDirs = [];
  for (const entry of listing.value) {
    if (entry.isDirectory()) runDirs.push(join(runsDir, entry.name));
  }
  runDirs.sort();
  const snapshots = [];
  for (const runDir of runDirs) {
    const snapshot = await loadPersistedRunSnapshot(runDir);
    if (snapshot) snapshots.push(snapshot);
  }
  return snapshots;
}
2723
/**
 * Write one case detail as pretty-printed JSON to
 * `<runDir>/case-details/<encoded caseId>.json`.
 *
 * @param {string} runDir Run directory.
 * @param {{ caseId: string }} caseDetail Detail to persist.
 * @returns {Promise<void>}
 */
async function persistCaseDetail(runDir, caseDetail) {
  const fileName = `${encodeCaseDetailFileName(caseDetail.caseId)}.json`;
  const targetPath = join(runDir, "case-details", fileName);
  await writeFile(targetPath, JSON.stringify(caseDetail, null, 2));
}
2726
/**
 * Project the latest run info of each eval down to just its status.
 *
 * @param {object} params Forwarded unchanged to `getLatestRunInfos`.
 * @returns {Map<string, unknown>} Eval id → latest status.
 */
function getLastRunStatuses(params) {
  const statuses = /* @__PURE__ */ new Map();
  for (const [evalId, info] of getLatestRunInfos(params)) {
    statuses.set(evalId, info.status);
  }
  return statuses;
}
2730
/**
 * Return the latest scoped run metadata for each eval based on persisted and
 * in-memory runs.
 *
 * Runs are processed oldest to newest by freshness timestamp, so for every
 * eval the entry from the most recent run that involves it wins.
 *
 * @param {{ runs: Iterable<object>, knownEvals: Iterable<object> }} params
 * @returns {Map<string, { status: unknown, startedAt: unknown, commitSha: unknown, evalSourceFingerprint: unknown }>}
 */
function getLatestRunInfos(params) {
  const { runs, knownEvals } = params;
  const knownEvalMetas = [...knownEvals];
  // Computed once: the list of known ids is the same for every run.
  const knownEvalIds = knownEvalMetas.map((evalMeta) => evalMeta.id);
  const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.id, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
  const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
  const latestRunInfos = /* @__PURE__ */ new Map();
  for (const run of orderedRuns) {
    for (const evalId of getRunEvalIds(run, knownEvalIds)) {
      latestRunInfos.set(evalId, {
        status: getEvalStatusForRun(run, evalId, manualScoreKeysByEval.get(evalId) ?? []),
        startedAt: getRunFreshnessTimestamp(run.manifest),
        commitSha: run.manifest.commitSha ?? null,
        evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalId] ?? null
      });
    }
  }
  return latestRunInfos;
}
2748
/**
 * Convert a derived status into a "last run" status: `pending` maps to
 * `null`, every other status is passed through.
 *
 * @param {string} status
 * @returns {string | null}
 */
function toLastRunStatus$1(status) {
  if (status === "pending") return null;
  return status;
}
2751
/**
 * Load one persisted run from its directory.
 *
 * `run.json` and `summary.json` must both be present and schema-valid,
 * otherwise `null` is returned. Case rows and case details are then read
 * (rows first).
 *
 * @param {string} runDir
 * @returns {Promise<{ runDir: string, manifest: object, summary: object, cases: object[], caseDetails: Map<string, object> } | null>}
 */
async function loadPersistedRunSnapshot(runDir) {
  const manifestParser = { safeParse: runManifestSchema.safeParse.bind(runManifestSchema) };
  const manifest = await readParsedJsonFile(join(runDir, "run.json"), manifestParser);
  if (!manifest) return null;
  const summaryParser = { safeParse: runSummarySchema.safeParse.bind(runSummarySchema) };
  const summary = await readParsedJsonFile(join(runDir, "summary.json"), summaryParser);
  if (!summary) return null;
  const cases = await readCaseRows(runDir);
  const caseDetails = await readCaseDetails(runDir);
  return {
    runDir,
    manifest,
    summary,
    cases,
    caseDetails
  };
}
2764
/**
 * Read a UTF-8 JSON file and validate it with `schema.safeParse`.
 *
 * @param {string} filePath File to read.
 * @param {{ safeParse(value: unknown): { success: boolean, data?: unknown } }} schema
 * @returns {Promise<unknown | null>} The validated data, or `null` when the
 *   file cannot be read, is not valid JSON, or fails validation.
 */
async function readParsedJsonFile(filePath, schema) {
  const fileResult = await resultify(() => readFile(filePath, "utf-8"));
  if (fileResult.error) return null;
  const jsonResult = resultify(() => JSON.parse(fileResult.value));
  if (jsonResult.error) return null;
  const validation = schema.safeParse(jsonResult.value);
  return validation.success ? validation.data : null;
}
2773
/**
 * Read `<runDir>/cases.jsonl` into validated case rows.
 *
 * Blank lines, lines that are not valid JSON and lines that fail
 * `caseRowSchema` are skipped. Returns an empty list when the file cannot
 * be read.
 *
 * @param {string} runDir
 * @returns {Promise<object[]>}
 */
async function readCaseRows(runDir) {
  const fileResult = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
  if (fileResult.error) return [];
  const rows = [];
  const lines = fileResult.value.split("\n").map((rawLine) => rawLine.trim());
  for (const line of lines) {
    if (line.length === 0) continue;
    const jsonResult = resultify(() => JSON.parse(line));
    if (jsonResult.error) continue;
    const validation = caseRowSchema.safeParse(jsonResult.value);
    if (validation.success) rows.push(validation.data);
  }
  return rows;
}
2788
/**
 * Read every `*.json` file in `<runDir>/case-details` into a map keyed by
 * the detail's `caseId`.
 *
 * Non-file entries and files that cannot be read or validated against
 * `caseDetailSchema` are skipped. Returns an empty map when the directory
 * cannot be listed.
 *
 * @param {string} runDir
 * @returns {Promise<Map<string, object>>}
 */
async function readCaseDetails(runDir) {
  const caseDetails = /* @__PURE__ */ new Map();
  const detailsDir = join(runDir, "case-details");
  const listing = await resultify(() => readdir(detailsDir, { withFileTypes: true }));
  if (listing.error) return caseDetails;
  const detailParser = { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) };
  for (const entry of listing.value) {
    const isJsonFile = entry.isFile() && entry.name.endsWith(".json");
    if (!isJsonFile) continue;
    const detail = await readParsedJsonFile(join(detailsDir, entry.name), detailParser);
    if (detail) caseDetails.set(detail.caseId, detail);
  }
  return caseDetails;
}
2354
- function getExtensionForMimeType(mimeType) {
2355
- const exactMatch = mimeTypeExtensionMap[mimeType];
2356
- if (exactMatch) return exactMatch;
2357
- const subtype = mimeType.split("/")[1];
2358
- if (subtype === void 0 || subtype.length === 0) return "";
2359
- const withoutSuffix = subtype.split("+")[0] ?? subtype;
2360
- return withoutSuffix.length > 0 ? `.${withoutSuffix}` : "";
2801
/**
 * Collect the eval ids a run is associated with.
 *
 * Starts from the eval ids of the run's case rows, then:
 * - target mode `"evalIds"`: adds every explicitly targeted eval id;
 * - target mode `"all"` with no case rows: falls back to `knownEvalIds`.
 *
 * @param {{ cases: Array<{ evalId: string }>, manifest: { target: { mode: string, evalIds?: string[] } } }} run
 * @param {Iterable<string>} knownEvalIds
 * @returns {string[]} Unique eval ids in first-seen order.
 */
function getRunEvalIds(run, knownEvalIds) {
	const collected = new Set();
	for (const caseRow of run.cases) collected.add(caseRow.evalId);
	const { target } = run.manifest;
	switch (target.mode) {
		case "evalIds":
			for (const evalId of target.evalIds ?? []) collected.add(evalId);
			break;
		case "all":
			if (collected.size === 0) {
				for (const evalId of knownEvalIds) collected.add(evalId);
			}
			break;
		default:
			break;
	}
	return Array.from(collected);
}
2362
- function sanitizeSegment(value) {
2363
- const normalized = value.trim().replaceAll(/[^A-Za-z0-9._-]+/g, "-");
2364
- return normalized.length > 0 ? normalized : "artifact";
2807
/**
 * Derive the last-run status of one eval within a run.
 *
 * - No case rows for the eval: the status comes from the run's lifecycle
 *   status via `deriveStatusFromChildStatuses` with no child statuses.
 * - Any case row lacking a finite manual score: `"unscored"`.
 * - Otherwise: the status derived from the eval's case rows.
 *
 * @param {{ cases: Array<{ evalId: string }>, manifest: { status: string } }} run
 * @param {string} evalId
 * @param {string[]} manualScoreKeys - Column keys of the eval's manual scores.
 */
function getEvalStatusForRun(run, evalId, manualScoreKeys) {
	const matchingRows = run.cases.filter((row) => row.evalId === evalId);
	if (matchingRows.length === 0) {
		const lifecycleDerived = deriveStatusFromChildStatuses({
			statuses: [],
			lifecycleStatus: run.manifest.status
		});
		return toLastRunStatus$1(lifecycleDerived);
	}
	if (hasPendingManualScores(matchingRows, manualScoreKeys)) return "unscored";
	return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: matchingRows }));
}
2366
- function sanitizeFileName(value) {
2367
- const normalized = sanitizeSegment(value);
2368
- const extension = extname(normalized);
2369
- if (extension.length === 0) return normalized;
2370
- return `${normalized.slice(0, -extension.length).replaceAll(".", "-")}${extension}`;
2818
/**
 * Tell whether any case row still lacks a manual score.
 *
 * A score counts as recorded only when the column value is a finite number.
 *
 * @param {Array<{ columns: Record<string, unknown> }>} caseRows
 * @param {string[]} manualScoreKeys - Column keys that hold manual scores.
 * @returns {boolean} `false` when there are no manual score keys.
 */
function hasPendingManualScores(caseRows, manualScoreKeys) {
	if (manualScoreKeys.length === 0) return false;
	const isRecorded = (value) => typeof value === "number" && Number.isFinite(value);
	return caseRows.some((row) => !manualScoreKeys.every((key) => isRecorded(row.columns[key])));
}
2372
- function isFile(value) {
2373
- return value instanceof File;
2825
/**
 * Turn a case id into a file-name stem by percent-encoding it with
 * `encodeURIComponent` (so `/`, spaces and similar characters are escaped).
 *
 * @param {string} caseId
 * @returns {string}
 */
function encodeCaseDetailFileName(caseId) {
	const encodedName = encodeURIComponent(caseId);
	return encodedName;
}
2375
2828
  //#endregion
2376
- //#region ../runner/src/runMaintenance.ts
2377
- async function persistRunState(runState) {
2378
- await writeFile(join(runState.runDir, "summary.json"), JSON.stringify(runState.summary, null, 2));
2379
- await writeFile(join(runState.runDir, "run.json"), JSON.stringify(runState.manifest, null, 2));
2380
- const casesJsonl = runState.cases.map((c) => JSON.stringify(c)).join("\n");
2381
- await writeFile(join(runState.runDir, "cases.jsonl"), casesJsonl);
2829
+ //#region ../runner/src/moduleIsolation.ts
2830
// Query-parameter name appended to workspace module URLs to scope them to a run.
const isolationParam = "agent-evals-isolate";
// Splits a relative path on runs of forward or backward slashes.
const pathSegmentSeparatorPattern = /[\\/]+/;
// Carries the active isolation context across async calls.
const isolationStorage = new AsyncLocalStorage();
// Isolation key -> workspace root, for every key that has been activated.
const activeIsolationRoots = new Map();
// Guards against registering the resolve hook more than once.
let hooksRegistered = false;
const requireFromRunner = createRequire(import.meta.url);
// Package specifier -> file URL as resolved from this module's location.
// Specifiers that cannot be resolved here are simply left out.
const agentPackageUrlBySpecifier = new Map();
for (const specifier of [
	"@ls-stack/agent-eval",
	"@agent-evals/sdk",
	"@agent-evals/shared",
	"@agent-evals/runner",
	"@agent-evals/runner/run-child"
]) {
	try {
		agentPackageUrlBySpecifier.set(specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href);
	} catch {
		// Not resolvable from here; leave the specifier to normal resolution.
	}
}
2849
/**
 * Tell whether an import specifier names one of the agent-evals packages,
 * either the package root or any subpath beneath it.
 *
 * @param {string} specifier
 * @returns {boolean}
 */
function isAgentEvalsPackageSpecifier(specifier) {
	const packageNames = [
		"@ls-stack/agent-eval",
		"@agent-evals/sdk",
		"@agent-evals/shared",
		"@agent-evals/runner"
	];
	return packageNames.some((packageName) => specifier === packageName || specifier.startsWith(`${packageName}/`));
}
2852
/**
 * Recover the isolation key carried by an importing module's URL.
 *
 * @param {string | undefined | null} parentURL - URL of the importing module.
 * @returns {string | null} The key from the `file:` URL's isolation query
 * parameter when that key is registered in `activeIsolationRoots`; otherwise
 * `null`.
 */
function getIsolationKeyFromParent(parentURL) {
	const isFileParent = parentURL?.startsWith("file:") === true;
	if (!isFileParent) return null;
	const candidate = new URL(parentURL).searchParams.get(isolationParam);
	if (candidate === null) return null;
	return activeIsolationRoots.has(candidate) ? candidate : null;
}
2857
/**
 * Tell whether a resolved module URL points at a workspace source file.
 *
 * A URL qualifies when it is a `file:` URL located strictly inside
 * `workspaceRoot` and none of its path segments is `node_modules` or
 * `.agent-evals`.
 *
 * @param {URL} url - Resolved module URL.
 * @param {string} workspaceRoot - Workspace root directory.
 * @returns {boolean}
 */
function isWorkspaceFile(url, workspaceRoot) {
	if (url.protocol !== "file:") return false;
	const relativePath = relative(workspaceRoot, fileURLToPath(url));
	// "" is the root itself; an absolute result means no relative path exists
	// between the two locations.
	if (relativePath === "" || isAbsolute(relativePath)) return false;
	const segments = relativePath.split(pathSegmentSeparatorPattern);
	// Only an exact ".." first segment leaves the workspace. A plain prefix
	// check would also reject in-workspace names such as "..generated".
	if (segments[0] === "..") return false;
	return !segments.includes("node_modules") && !segments.includes(".agent-evals");
}
2864
/**
 * Return `url` with the isolation query parameter set to `key`.
 *
 * The input string is returned untouched when it already carries that exact
 * key; otherwise the parameter is added or overwritten and the re-serialized
 * URL is returned.
 *
 * @param {string} url - Module URL.
 * @param {string} key - Isolation key.
 * @returns {string}
 */
function addIsolationParam(url, key) {
	const parsed = new URL(url);
	const { searchParams } = parsed;
	if (searchParams.get(isolationParam) !== key) {
		searchParams.set(isolationParam, key);
		return parsed.href;
	}
	return url;
}
2870
/**
 * Install the module `resolve` hook that implements run isolation. Calls after
 * the first are no-ops.
 *
 * The hook:
 * 1. short-circuits to the pre-resolved URL for specifiers listed in
 *    `agentPackageUrlBySpecifier`;
 * 2. leaves any other agent-evals package specifier as resolved;
 * 3. otherwise, when an isolation key is available (from the active async
 *    context, else inferred from the parent URL) and the resolved URL is a
 *    workspace file, tags the URL with the isolation query parameter.
 */
function registerModuleIsolationHooks() {
	if (hooksRegistered) return;
	hooksRegistered = true;
	const resolveWithIsolation = (specifier, context, nextResolve) => {
		const pinnedUrl = agentPackageUrlBySpecifier.get(specifier);
		if (pinnedUrl !== void 0) {
			return {
				url: pinnedUrl,
				shortCircuit: true
			};
		}
		const resolved = nextResolve(specifier, context);
		if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
		const store = isolationStorage.getStore();
		const keyFromParent = getIsolationKeyFromParent(context.parentURL);
		const isolationKey = store?.key ?? keyFromParent;
		if (isolationKey === null) return resolved;
		const workspaceRoot = store?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
		if (workspaceRoot === void 0) return resolved;
		return isWorkspaceFile(new URL(resolved.url), workspaceRoot)
			? {
				...resolved,
				url: addIsolationParam(resolved.url, isolationKey)
			}
			: resolved;
	};
	registerHooks({ resolve: resolveWithIsolation });
}
2383
2894
  /**
2384
- * Recompute a persisted case's status after score definitions changed.
2895
+ * Execute module loading and eval code with fresh workspace module URLs.
2385
2896
  *
2386
- * Pass/fail gates are per-score: a case fails when any score with a declared
2387
- * `passThreshold` reports a numeric value below that threshold. Scores
2388
- * without a threshold are informational and never gate. Cancelled and
2389
- * errored cases retain their terminal status.
2897
+ * Node does not expose an ESM cache reset API, so the runner appends a
2898
+ * run-scoped query parameter to workspace file imports. Package imports are
2899
+ * left alone so SDK singletons, such as the eval registry, remain shared.
2390
2900
  */
2391
- function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
2392
- if (caseRow.status === "cancelled") return "cancelled";
2393
- if (caseDetail?.error !== null && caseDetail?.error !== void 0) return "error";
2394
- if ((caseDetail?.assertionFailures.length ?? 0) > 0) return "fail";
2395
- for (const [key, passThreshold] of scoreThresholds) {
2396
- const rawValue = caseRow.columns[key] ?? caseDetail?.columns[key];
2397
- if (typeof rawValue !== "number") continue;
2398
- if (rawValue < passThreshold) return "fail";
2399
- }
2400
- return caseRow.status === "error" ? "error" : "pass";
2401
- }
2402
- function runTouchesEval(params) {
2403
- if (params.caseRows.some((caseRow) => caseRow.evalId === params.evalId)) return true;
2404
- if (params.target.mode === "all") return params.evalExists;
2405
- if (params.target.mode === "evalIds") return params.target.evalIds?.includes(params.evalId) ?? false;
2406
- return false;
2407
- }
2408
- async function recomputeEvalStatusesInRuns(params) {
2409
- let updatedRuns = 0;
2410
- for (const run of params.runs) {
2411
- if (!runTouchesEval({
2412
- target: run.manifest.target,
2413
- caseRows: run.cases,
2414
- evalId: params.evalId,
2415
- evalExists: params.evalExists
2416
- })) continue;
2417
- if (run.manifest.status === "running") continue;
2418
- let changed = false;
2419
- for (const caseRow of run.cases) {
2420
- if (caseRow.evalId !== params.evalId) continue;
2421
- const caseDetail = run.caseDetails.get(caseRow.caseId);
2422
- const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
2423
- if (caseRow.status === nextStatus) continue;
2424
- caseRow.status = nextStatus;
2425
- if (caseDetail) {
2426
- caseDetail.status = nextStatus;
2427
- await params.persistCaseDetail(run.runDir, caseDetail);
2428
- }
2429
- changed = true;
2430
- }
2431
- if (!changed) continue;
2432
- const derivedSummary = deriveScopedSummaryFromCases({ caseRows: run.cases });
2433
- run.summary.totalCases = derivedSummary.totalCases;
2434
- run.summary.passedCases = derivedSummary.passedCases;
2435
- run.summary.failedCases = derivedSummary.failedCases;
2436
- run.summary.errorCases = derivedSummary.errorCases;
2437
- run.summary.cancelledCases = derivedSummary.cancelledCases;
2438
- await persistRunState(run);
2439
- updatedRuns += 1;
2440
- }
2441
- return updatedRuns;
2901
// Ensures the resolve hook is installed, records the context's workspace root
// under its key, then runs `fn` with `context` as the async-local isolation
// store and resolves to whatever `fn` produces.
// NOTE(review): entries added to `activeIsolationRoots` are not removed in
// this function; confirm whether that is intentional.
async function runWithModuleIsolation(context, fn) {
	registerModuleIsolationHooks();
	const { key, workspaceRoot } = context;
	activeIsolationRoots.set(key, workspaceRoot);
	const outcome = await isolationStorage.run(context, fn);
	return outcome;
}
2443
2906
  //#endregion
2444
2907
  //#region ../runner/src/traceDisplay.ts
@@ -2528,18 +2991,25 @@ async function callWithUnknownResult(fn, args) {
2528
2991
  return await Reflect.apply(fn, void 0, args);
2529
2992
  }
2530
2993
  async function runCase(params) {
2531
- const { evalDef, evalId, evalCase, globalTraceDisplay, trial, signal, startTime, cacheAdapter, cacheMode, codeFingerprint, artifactDir, runId } = params;
2994
+ const { evalDef, evalId, evalCase, globalTraceDisplay, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, artifactDir, runId } = params;
2532
2995
  const { scope, error: executeError } = await runInEvalScope(evalCase.id, async () => {
2533
- await Reflect.apply(evalDef.execute, evalDef, [{
2534
- input: evalCase.input,
2535
- signal
2536
- }]);
2537
- }, { cacheContext: cacheAdapter ? {
2538
- adapter: cacheAdapter,
2539
- mode: cacheMode,
2540
- evalId,
2541
- codeFingerprint
2542
- } : void 0 });
2996
+ const execute = async () => {
2997
+ await Reflect.apply(evalDef.execute, evalDef, [{ input: evalCase.input }]);
2998
+ };
2999
+ if (moduleIsolation === void 0) {
3000
+ await execute();
3001
+ return;
3002
+ }
3003
+ await runWithModuleIsolation(moduleIsolation, execute);
3004
+ }, {
3005
+ input: evalCase.input,
3006
+ cacheContext: cacheAdapter ? {
3007
+ adapter: cacheAdapter,
3008
+ mode: cacheMode,
3009
+ evalId,
3010
+ codeFingerprint
3011
+ } : void 0
3012
+ });
2543
3013
  const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
2544
3014
  const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
2545
3015
  if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
@@ -2555,20 +3025,35 @@ async function runCase(params) {
2555
3025
  const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
2556
3026
  scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
2557
3027
  }
3028
+ if (!nonAssertError && evalDef.outputsSchema) {
3029
+ const parsedOutputs = evalDef.outputsSchema.safeParse(getOutputsSchemaInput(evalDef.outputsSchema, scope.outputs));
3030
+ if (parsedOutputs.success) scope.outputs = {
3031
+ ...scope.outputs,
3032
+ ...parsedOutputs.data
3033
+ };
3034
+ else scope.assertionFailures.push(toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error)));
3035
+ }
2558
3036
  const scoreResults = /* @__PURE__ */ new Map();
2559
3037
  const scoringTraces = {};
2560
- if (!nonAssertError && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
3038
+ if (!nonAssertError && scope.assertionFailures.length === 0 && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
2561
3039
  const { compute, passThreshold, label } = normalizeScoreDef(def);
2562
- const scoreRun = await runInEvalScope(evalCase.id, async () => await callWithUnknownResult(compute, [{
3040
+ const scoreRun = await runInEvalScope(evalCase.id, async () => {
3041
+ const computeScore = async () => await callWithUnknownResult(compute, [{
3042
+ input: evalCase.input,
3043
+ outputs: { ...scope.outputs },
3044
+ case: evalCase
3045
+ }]);
3046
+ if (moduleIsolation === void 0) return await computeScore();
3047
+ return await runWithModuleIsolation(moduleIsolation, computeScore);
3048
+ }, {
2563
3049
  input: evalCase.input,
2564
- outputs: { ...scope.outputs },
2565
- case: evalCase
2566
- }]), { cacheContext: cacheAdapter ? {
2567
- adapter: cacheAdapter,
2568
- mode: cacheMode,
2569
- evalId: `${evalId}__score__${key}`,
2570
- codeFingerprint
2571
- } : void 0 });
3050
+ cacheContext: cacheAdapter ? {
3051
+ adapter: cacheAdapter,
3052
+ mode: cacheMode,
3053
+ evalId: `${evalId}__score__${key}`,
3054
+ codeFingerprint
3055
+ } : void 0
3056
+ });
2572
3057
  const { trace, traceDisplay } = resolveTracePresentation(scoreRun.scope.spans, globalTraceDisplay, evalDef.traceDisplay);
2573
3058
  if (trace.length > 0) scoringTraces[key] = {
2574
3059
  trace,
@@ -2638,200 +3123,61 @@ async function runCase(params) {
2638
3123
  input: evalCase.input,
2639
3124
  trace: displayTrace,
2640
3125
  traceDisplay,
2641
- columns,
2642
- assertionFailures: scope.assertionFailures,
2643
- error: errorInfo,
2644
- trial
2645
- };
2646
- if (Object.keys(scoringTraces).length > 0) caseDetail.scoringTraces = scoringTraces;
2647
- return {
2648
- caseDetail,
2649
- caseRowUpdate: {
2650
- status,
2651
- latencyMs: Date.now() - startTime,
2652
- columns
2653
- }
2654
- };
2655
- }
2656
- function isRecord(value) {
2657
- return typeof value === "object" && value !== null;
2658
- }
2659
- function isBlob(value) {
2660
- return value instanceof Blob;
2661
- }
2662
- function toAssertionFailure(message, error = void 0) {
2663
- return error?.stack ? {
2664
- message,
2665
- stack: error.stack
2666
- } : { message };
2667
- }
2668
- //#endregion
2669
- //#region ../runner/src/runPersistence.ts
2670
- const SHORT_ID_PATTERN = /^r(\d+)$/;
2671
- /**
2672
- * Generate a filesystem-safe, sortable run id combining a UTC timestamp
2673
- * with a short random suffix.
2674
- */
2675
- function generateRunId() {
2676
- const now = /* @__PURE__ */ new Date();
2677
- const pad = (n) => String(n).padStart(2, "0");
2678
- return `${`${String(now.getUTCFullYear())}-${pad(now.getUTCMonth() + 1)}-${pad(now.getUTCDate())}T${pad(now.getUTCHours())}-${pad(now.getUTCMinutes())}-${pad(now.getUTCSeconds())}Z`}_${Math.random().toString(36).slice(2, 8)}`;
2679
- }
2680
- function parseShortIdNum(shortId) {
2681
- if (shortId === void 0) return null;
2682
- const match = SHORT_ID_PATTERN.exec(shortId);
2683
- if (!match) return null;
2684
- const num = Number(match[1]);
2685
- if (!Number.isFinite(num)) return null;
2686
- return num;
2687
- }
2688
- /**
2689
- * Return the next `shortId` number to assign based on the existing
2690
- * loaded snapshots. Legacy runs that don't match the `r\d+` format are
2691
- * ignored.
2692
- */
2693
- function nextShortIdFromSnapshots(snapshots) {
2694
- let maxNum = -1;
2695
- for (const snapshot of snapshots) {
2696
- const num = parseShortIdNum(snapshot.manifest.shortId);
2697
- if (num !== null && num > maxNum) maxNum = num;
2698
- }
2699
- return maxNum + 1;
2700
- }
2701
- async function loadPersistedRunSnapshots(localStateDir) {
2702
- const runsDir = join(localStateDir, "runs");
2703
- const entriesResult = await resultify(() => readdir(runsDir, { withFileTypes: true }));
2704
- if (entriesResult.error) return [];
2705
- const snapshots = [];
2706
- const runDirs = entriesResult.value.filter((entry) => entry.isDirectory()).map((entry) => join(runsDir, entry.name)).toSorted();
2707
- for (const runDir of runDirs) {
2708
- const snapshot = await loadPersistedRunSnapshot(runDir);
2709
- if (!snapshot) continue;
2710
- snapshots.push(snapshot);
2711
- }
2712
- return snapshots;
2713
- }
2714
- async function persistCaseDetail(runDir, caseDetail) {
2715
- await writeFile(join(runDir, "case-details", `${encodeCaseDetailFileName(caseDetail.caseId)}.json`), JSON.stringify(caseDetail, null, 2));
2716
- }
2717
- function getLastRunStatuses(params) {
2718
- const latestRunInfos = getLatestRunInfos(params);
2719
- return new Map([...latestRunInfos].map(([evalId, info]) => [evalId, info.status]));
2720
- }
2721
- /**
2722
- * Return the latest scoped run metadata for each eval based on persisted and
2723
- * in-memory runs.
2724
- */
2725
- function getLatestRunInfos(params) {
2726
- const { runs, knownEvals } = params;
2727
- const knownEvalMetas = [...knownEvals];
2728
- const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.id, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
2729
- const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
2730
- const latestRunInfos = /* @__PURE__ */ new Map();
2731
- for (const run of orderedRuns) for (const evalId of getRunEvalIds(run, knownEvalMetas.map((evalMeta) => evalMeta.id))) latestRunInfos.set(evalId, {
2732
- status: getEvalStatusForRun(run, evalId, manualScoreKeysByEval.get(evalId) ?? []),
2733
- startedAt: getRunFreshnessTimestamp(run.manifest),
2734
- commitSha: run.manifest.commitSha ?? null,
2735
- evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalId] ?? null
2736
- });
2737
- return latestRunInfos;
2738
- }
2739
- function toLastRunStatus$1(status) {
2740
- return status === "pending" ? null : status;
2741
- }
2742
- async function loadPersistedRunSnapshot(runDir) {
2743
- const manifest = await readParsedJsonFile(join(runDir, "run.json"), { safeParse: runManifestSchema.safeParse.bind(runManifestSchema) });
2744
- if (!manifest) return null;
2745
- const summary = await readParsedJsonFile(join(runDir, "summary.json"), { safeParse: runSummarySchema.safeParse.bind(runSummarySchema) });
2746
- if (!summary) return null;
3126
+ columns,
3127
+ assertionFailures: scope.assertionFailures,
3128
+ error: errorInfo,
3129
+ trial
3130
+ };
3131
+ if (Object.keys(scoringTraces).length > 0) caseDetail.scoringTraces = scoringTraces;
2747
3132
  return {
2748
- runDir,
2749
- manifest,
2750
- summary,
2751
- cases: await readCaseRows(runDir),
2752
- caseDetails: await readCaseDetails(runDir)
3133
+ caseDetail,
3134
+ caseRowUpdate: {
3135
+ status,
3136
+ latencyMs: Date.now() - startTime,
3137
+ columns
3138
+ }
2753
3139
  };
2754
3140
  }
2755
- async function readParsedJsonFile(filePath, schema) {
2756
- const fileResult = await resultify(() => readFile(filePath, "utf-8"));
2757
- if (fileResult.error) return null;
2758
- const jsonResult = resultify(() => JSON.parse(fileResult.value));
2759
- if (jsonResult.error) return null;
2760
- const parsed = schema.safeParse(jsonResult.value);
2761
- if (!parsed.success) return null;
2762
- return parsed.data;
2763
- }
2764
- async function readCaseRows(runDir) {
2765
- const fileResult = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
2766
- if (fileResult.error) return [];
2767
- const rows = [];
2768
- for (const rawLine of fileResult.value.split("\n")) {
2769
- const line = rawLine.trim();
2770
- if (line.length === 0) continue;
2771
- const jsonResult = resultify(() => JSON.parse(line));
2772
- if (jsonResult.error) continue;
2773
- const parsed = caseRowSchema.safeParse(jsonResult.value);
2774
- if (!parsed.success) continue;
2775
- rows.push(parsed.data);
2776
- }
2777
- return rows;
2778
- }
2779
- async function readCaseDetails(runDir) {
2780
- const detailsDir = join(runDir, "case-details");
2781
- const entriesResult = await resultify(() => readdir(detailsDir, { withFileTypes: true }));
2782
- if (entriesResult.error) return /* @__PURE__ */ new Map();
2783
- const caseDetails = /* @__PURE__ */ new Map();
2784
- for (const entry of entriesResult.value) {
2785
- if (!entry.isFile() || !entry.name.endsWith(".json")) continue;
2786
- const detail = await readParsedJsonFile(join(detailsDir, entry.name), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
2787
- if (!detail) continue;
2788
- caseDetails.set(detail.caseId, detail);
2789
- }
2790
- return caseDetails;
3141
/**
 * Tell whether `value` is a non-null object. Arrays count; functions and
 * primitives do not.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isRecord(value) {
	return value !== null && typeof value === "object";
}
2792
- function getRunEvalIds(run, knownEvalIds) {
2793
- const evalIds = new Set(run.cases.map((caseRow) => caseRow.evalId));
2794
- if (run.manifest.target.mode === "evalIds") for (const evalId of run.manifest.target.evalIds ?? []) evalIds.add(evalId);
2795
- else if (run.manifest.target.mode === "all" && evalIds.size === 0) for (const evalId of knownEvalIds) evalIds.add(evalId);
2796
- return [...evalIds];
3144
/**
 * Tell whether `value` is an instance of the global `Blob` class.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isBlob(value) {
	const matchesBlob = value instanceof Blob;
	return matchesBlob;
}
2798
- function getEvalStatusForRun(run, evalId, manualScoreKeys) {
2799
- const evalCases = run.cases.filter((caseRow) => caseRow.evalId === evalId);
2800
- if (evalCases.length > 0) {
2801
- if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
2802
- return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
2803
- }
2804
- return toLastRunStatus$1(deriveStatusFromChildStatuses({
2805
- statuses: [],
2806
- lifecycleStatus: run.manifest.status
2807
- }));
3147
/**
 * Select the part of a case's outputs that should be validated by
 * `outputsSchema`.
 *
 * For a `z.ZodObject` schema only the outputs whose keys are declared in the
 * schema's shape are passed on; any other schema receives `outputs` as-is.
 *
 * @param {unknown} schema - The eval's `outputsSchema`.
 * @param {Record<string, unknown>} outputs - Outputs recorded for the case.
 * @returns {Record<string, unknown>} The object to validate.
 */
function getOutputsSchemaInput(schema, outputs) {
	if (!(schema instanceof z.ZodObject)) return outputs;
	// Check own properties only: `in` would also match inherited members such
	// as `toString` and feed Object.prototype functions into validation.
	// Object.fromEntries defines plain data properties, so a key like
	// "__proto__" cannot alter the result's prototype.
	const declaredKeys = Object.keys(schema.shape).filter((key) => Object.hasOwn(outputs, key));
	return Object.fromEntries(declaredKeys.map((key) => [key, outputs[key]]));
}
2809
- function hasPendingManualScores(caseRows, manualScoreKeys) {
2810
- if (manualScoreKeys.length === 0) return false;
2811
- return caseRows.some((caseRow) => manualScoreKeys.some((key) => {
2812
- const value = caseRow.columns[key];
2813
- return typeof value !== "number" || !Number.isFinite(value);
2814
- }));
3153
/**
 * Render a schema validation error as a multi-line message.
 *
 * Each issue becomes `<dotted path>: <message>`, with `<root>` standing in
 * for an empty path. With no issues only the headline is returned.
 *
 * @param {{ issues: Array<{ path: Array<string | number>, message: string }> }} error
 * @returns {string}
 */
function formatOutputsSchemaError(error) {
	const describeIssue = ({ path, message }) => {
		const location = path.length > 0 ? path.join(".") : "<root>";
		return `${location}: ${message}`;
	};
	const issueLines = error.issues.map(describeIssue);
	if (issueLines.length > 0) {
		return `outputsSchema validation failed:\n${issueLines.join("\n")}`;
	}
	return "outputsSchema validation failed";
}
2816
- function encodeCaseDetailFileName(caseId) {
2817
- return encodeURIComponent(caseId);
3160
/**
 * Build an assertion-failure record.
 *
 * @param {string} message - Failure message.
 * @param {{ stack?: string } | undefined} [error] - Source error, if any.
 * @returns {{ message: string, stack?: string }} `stack` is included only
 * when the error has a truthy `stack`.
 */
function toAssertionFailure(message, error = void 0) {
	const failure = { message };
	if (error?.stack) {
		failure.stack = error.stack;
	}
	return failure;
}
2819
3166
  //#endregion
2820
3167
  //#region ../runner/src/runQueue.ts
2821
3168
  async function executeQueuedCases(params) {
2822
- const { runState, queuedCases, concurrency, globalTraceDisplay } = params;
3169
+ const { queuedCases, concurrency, globalTraceDisplay } = params;
2823
3170
  let nextCaseIndex = 0;
2824
3171
  let workerError = void 0;
2825
3172
  const workerCount = Math.min(concurrency, queuedCases.length);
2826
3173
  const workers = Array.from({ length: workerCount }, async () => {
2827
- while (!runState.abortController.signal.aborted && workerError === void 0) {
3174
+ while (workerError === void 0) {
2828
3175
  const queuedCase = queuedCases[nextCaseIndex];
2829
3176
  nextCaseIndex += 1;
2830
3177
  if (queuedCase === void 0) return;
2831
3178
  try {
2832
3179
  await executeQueuedCase({
2833
3180
  queuedCase,
2834
- runState,
2835
3181
  globalTraceDisplay
2836
3182
  });
2837
3183
  } catch (error) {
@@ -2845,11 +3191,10 @@ async function executeQueuedCases(params) {
2845
3191
  if (workerError !== void 0) throw new Error(typeof workerError === "string" ? workerError : typeof workerError === "number" || typeof workerError === "boolean" || typeof workerError === "bigint" ? String(workerError) : workerError === null ? "null" : "Unknown queue worker error");
2846
3192
  }
2847
3193
  async function executeQueuedCase(params) {
2848
- const { queuedCase, runState, globalTraceDisplay } = params;
3194
+ const { queuedCase, globalTraceDisplay } = params;
2849
3195
  const startTime = Date.now();
2850
3196
  const result = await queuedCase.execute({
2851
3197
  globalTraceDisplay,
2852
- signal: runState.abortController.signal,
2853
3198
  startTime
2854
3199
  });
2855
3200
  await queuedCase.onComplete(result);
@@ -2900,7 +3245,48 @@ function pickWinningTrial(params) {
2900
3245
  if (medianAttempt === void 0) throw new Error("Expected at least one trial attempt");
2901
3246
  return medianAttempt;
2902
3247
  }
2903
- async function executeRun({ runState, request, runDir, config, evals, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals }) {
3248
/**
 * Finalize a prepared case once: pick its winning trial, record it in the run
 * state, persist it and announce it.
 *
 * Does nothing when the case is already finalized or has no trial results.
 * Otherwise, in order: marks the case finalized, selects the winning trial
 * using the manifest's `trialSelection`, commits that trial's buffered cache
 * store (if any), appends the row and detail to the run state, merges the
 * detail's columns, bumps the matching summary counter (`pass`, `error`,
 * anything else counts as failed), writes the trace JSON and the case detail,
 * invokes `onCaseFinished`, emits `case.finished`, and appends the row to the
 * eval's case rows.
 *
 * NOTE(review): the trace file name is built from the raw `caseId`; confirm
 * that ids containing path separators are handled as intended.
 *
 * @param {object} params - `runState`, `runDir`, `preparedEval`,
 * `preparedCase`, optional `onCaseFinished`, and `emitEvent`.
 * @returns {Promise<void>}
 */
async function finalizePreparedCase(params) {
	const { runState, runDir, preparedEval, preparedCase, onCaseFinished, emitEvent } = params;
	const hasNothingToFinalize = preparedCase.finalized || preparedCase.trialResults.length === 0;
	if (hasNothingToFinalize) return;
	// Flag first so a repeated call cannot finalize the same case twice.
	preparedCase.finalized = true;
	const winner = pickWinningTrial({
		strategy: runState.manifest.trialSelection,
		attempts: preparedCase.trialResults,
		scoreKeys: preparedEval.scoreKeys
	});
	if (winner.bufferedCacheStore !== null) {
		await winner.bufferedCacheStore.commit();
	}
	runState.cases.push(winner.caseRow);
	runState.caseDetails.set(preparedCase.caseId, winner.caseDetail);
	preparedEval.mergeColumns(winner.caseDetail.columns);
	switch (winner.caseRow.status) {
		case "pass":
			runState.summary.passedCases++;
			break;
		case "error":
			runState.summary.errorCases++;
			break;
		default:
			runState.summary.failedCases++;
			break;
	}
	const tracePath = join(runDir, "traces", `${preparedCase.caseId}.json`);
	await writeFile(tracePath, JSON.stringify(winner.caseDetail.trace, null, 2));
	await persistCaseDetail(runDir, winner.caseDetail);
	onCaseFinished?.(winner.caseDetail, winner.caseRow);
	emitEvent(runState, {
		type: "case.finished",
		runId: runState.manifest.id,
		timestamp: new Date().toISOString(),
		payload: winner.caseRow
	});
	preparedEval.evalCaseRows.push(winner.caseRow);
}
3275
/**
 * Build the lookup key for a case row: its eval id and case id joined by a
 * NUL character.
 *
 * @param {{ evalId: string, caseId: string }} caseRow
 * @returns {string}
 */
function getPreparedCaseOrderKey(caseRow) {
	const { evalId, caseId } = caseRow;
	return `${evalId}\u0000${caseId}`;
}
3278
/**
 * Sort `caseRows` in place so they follow the order in which cases were
 * prepared (eval by eval, case by case). Rows with no matching prepared case
 * sort after all known ones.
 *
 * @param {Array<{ evalId: string, caseId: string }>} caseRows - Mutated in place.
 * @param {Array<{ evalMeta: { id: string }, preparedCases: Array<{ caseId: string }> }>} preparedEvals
 * @returns {void}
 */
function sortCaseRowsByPreparedOrder(caseRows, preparedEvals) {
	const positionByKey = new Map();
	let nextPosition = 0;
	for (const preparedEval of preparedEvals) {
		for (const { caseId } of preparedEval.preparedCases) {
			const key = getPreparedCaseOrderKey({
				evalId: preparedEval.evalMeta.id,
				caseId
			});
			positionByKey.set(key, nextPosition);
			nextPosition += 1;
		}
	}
	const positionOf = (row) => positionByKey.get(getPreparedCaseOrderKey(row)) ?? Number.MAX_SAFE_INTEGER;
	caseRows.sort((left, right) => positionOf(left) - positionOf(right));
}
3289
+ async function executeRun({ runState, request, runDir, config, evals, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, workspaceRoot, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals, onCaseFinished }) {
2904
3290
  try {
2905
3291
  const targetEvals = getTargetEvals(request);
2906
3292
  emitEvent(runState, {
@@ -2909,14 +3295,16 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
2909
3295
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2910
3296
  payload: runState.manifest
2911
3297
  });
2912
- const allCaseRows = [];
2913
3298
  const evalErrors = [];
2914
3299
  const queuedCases = [];
2915
3300
  const preparedEvals = [];
2916
3301
  const cacheMode = runState.manifest.cacheMode ?? "use";
2917
3302
  const cacheEnabled = config.cache?.enabled !== false;
3303
+ const moduleIsolation = {
3304
+ key: runState.manifest.id,
3305
+ workspaceRoot
3306
+ };
2918
3307
  for (const evalMeta of targetEvals) {
2919
- if (runState.abortController.signal.aborted) break;
2920
3308
  const evalFilePath = evalMeta.sourceFilePath;
2921
3309
  let codeFingerprint = "";
2922
3310
  try {
@@ -2928,7 +3316,9 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
2928
3316
  else delete runState.manifest.evalSourceFingerprints[evalMeta.id];
2929
3317
  try {
2930
3318
  const registry = getEvalRegistry();
2931
- await loadEvalModule(evalFilePath, codeFingerprint);
3319
+ await runWithModuleIsolation(moduleIsolation, async () => {
3320
+ await loadEvalModule(evalFilePath, codeFingerprint);
3321
+ });
2932
3322
  const entry = registry.get(evalMeta.id);
2933
3323
  if (!entry) {
2934
3324
  evalErrors.push({
@@ -2937,74 +3327,87 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
2937
3327
  });
2938
3328
  continue;
2939
3329
  }
2940
- await entry.use(async (evalDef) => {
2941
- const cases = filterEvalCases(resolveRunnableEvalCases({
2942
- cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
2943
- evalId: evalMeta.id
2944
- }), request.target.evalIds, request.target.caseIds, evalMeta.id);
2945
- runState.summary.totalCases += cases.length;
2946
- const accumulatedColumns = /* @__PURE__ */ new Map();
2947
- const evalCaseRows = [];
2948
- const preparedCases = [];
2949
- const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
2950
- const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
2951
- preparedEvals.push({
2952
- evalMeta,
2953
- accumulatedColumns,
2954
- evalCaseRows,
2955
- preparedCases,
2956
- scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
2957
- mergeColumns: (columns) => {
2958
- mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
2959
- }
2960
- });
2961
- for (const evalCase of cases) {
2962
- if (runState.abortController.signal.aborted) break;
2963
- const trialResults = [];
2964
- preparedCases.push({
2965
- caseId: evalCase.id,
2966
- trialResults
2967
- });
2968
- for (let trial = 0; trial < request.trials; trial++) {
2969
- const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
2970
- queuedCases.push({
2971
- execute: async ({ startTime, signal, globalTraceDisplay }) => {
2972
- const { caseDetail, caseRowUpdate } = await runCase({
2973
- evalDef,
2974
- evalId: evalMeta.id,
2975
- evalCase,
2976
- globalTraceDisplay,
2977
- trial,
2978
- signal,
2979
- startTime,
2980
- cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
2981
- cacheMode,
2982
- codeFingerprint,
2983
- artifactDir: join(runDir, "artifacts"),
2984
- runId: runState.manifest.id
2985
- });
2986
- return {
2987
- caseDetail,
2988
- caseRow: {
2989
- caseId: evalCase.id,
3330
+ await runWithModuleIsolation(moduleIsolation, async () => {
3331
+ await entry.use(async (evalDef) => {
3332
+ const cases = filterEvalCases(resolveRunnableEvalCases({
3333
+ cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
3334
+ evalId: evalMeta.id
3335
+ }), request.target.evalIds, request.target.caseIds, evalMeta.id);
3336
+ runState.summary.totalCases += cases.length;
3337
+ const accumulatedColumns = /* @__PURE__ */ new Map();
3338
+ const evalCaseRows = [];
3339
+ const preparedCases = [];
3340
+ const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
3341
+ const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
3342
+ const preparedEval = {
3343
+ evalMeta,
3344
+ accumulatedColumns,
3345
+ evalCaseRows,
3346
+ preparedCases,
3347
+ scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
3348
+ mergeColumns: (columns) => {
3349
+ mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
3350
+ }
3351
+ };
3352
+ preparedEvals.push(preparedEval);
3353
+ for (const evalCase of cases) {
3354
+ const trialResults = [];
3355
+ const preparedCase = {
3356
+ caseId: evalCase.id,
3357
+ trialResults,
3358
+ finalized: false
3359
+ };
3360
+ preparedCases.push(preparedCase);
3361
+ for (let trial = 0; trial < request.trials; trial++) {
3362
+ const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
3363
+ queuedCases.push({
3364
+ execute: async ({ startTime, globalTraceDisplay }) => {
3365
+ const { caseDetail, caseRowUpdate } = await runCase({
3366
+ evalDef,
2990
3367
  evalId: evalMeta.id,
2991
- status: caseRowUpdate.status ?? "pending",
2992
- latencyMs: caseRowUpdate.latencyMs ?? null,
2993
- columns: caseRowUpdate.columns ?? {},
2994
- trial
2995
- }
2996
- };
2997
- },
2998
- onComplete: ({ caseDetail, caseRow }) => {
2999
- trialResults.push({
3000
- caseDetail,
3001
- caseRow,
3002
- bufferedCacheStore
3003
- });
3004
- }
3005
- });
3368
+ evalCase,
3369
+ globalTraceDisplay,
3370
+ trial,
3371
+ startTime,
3372
+ cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
3373
+ cacheMode,
3374
+ codeFingerprint,
3375
+ moduleIsolation,
3376
+ artifactDir: join(runDir, "artifacts"),
3377
+ runId: runState.manifest.id
3378
+ });
3379
+ return {
3380
+ caseDetail,
3381
+ caseRow: {
3382
+ caseId: evalCase.id,
3383
+ evalId: evalMeta.id,
3384
+ status: caseRowUpdate.status ?? "pending",
3385
+ latencyMs: caseRowUpdate.latencyMs ?? null,
3386
+ columns: caseRowUpdate.columns ?? {},
3387
+ trial
3388
+ }
3389
+ };
3390
+ },
3391
+ onComplete: async ({ caseDetail, caseRow }) => {
3392
+ trialResults.push({
3393
+ caseDetail,
3394
+ caseRow,
3395
+ bufferedCacheStore
3396
+ });
3397
+ if (trialResults.length !== request.trials) return;
3398
+ await finalizePreparedCase({
3399
+ runState,
3400
+ runDir,
3401
+ preparedEval,
3402
+ preparedCase,
3403
+ onCaseFinished,
3404
+ emitEvent
3405
+ });
3406
+ }
3407
+ });
3408
+ }
3006
3409
  }
3007
- }
3410
+ });
3008
3411
  });
3009
3412
  } catch (error) {
3010
3413
  console.error(`Error running eval ${evalMeta.id}:`, error);
@@ -3022,37 +3425,19 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3022
3425
  }
3023
3426
  }
3024
3427
  await executeQueuedCases({
3025
- runState,
3026
3428
  queuedCases,
3027
3429
  concurrency: getConfiguredConcurrency(),
3028
3430
  globalTraceDisplay: config.traceDisplay
3029
3431
  });
3030
3432
  for (const preparedEval of preparedEvals) {
3031
- for (const preparedCase of preparedEval.preparedCases) {
3032
- if (preparedCase.trialResults.length === 0) continue;
3033
- const winningTrial = pickWinningTrial({
3034
- strategy: runState.manifest.trialSelection,
3035
- attempts: preparedCase.trialResults,
3036
- scoreKeys: preparedEval.scoreKeys
3037
- });
3038
- if (winningTrial.bufferedCacheStore !== null) await winningTrial.bufferedCacheStore.commit();
3039
- runState.cases.push(winningTrial.caseRow);
3040
- runState.caseDetails.set(preparedCase.caseId, winningTrial.caseDetail);
3041
- preparedEval.mergeColumns(winningTrial.caseDetail.columns);
3042
- if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
3043
- else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
3044
- else runState.summary.failedCases++;
3045
- await writeFile(join(runDir, "traces", `${preparedCase.caseId}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
3046
- await persistCaseDetail(runDir, winningTrial.caseDetail);
3047
- emitEvent(runState, {
3048
- type: "case.finished",
3049
- runId: runState.manifest.id,
3050
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3051
- payload: winningTrial.caseRow
3052
- });
3053
- preparedEval.evalCaseRows.push(winningTrial.caseRow);
3054
- allCaseRows.push(winningTrial.caseRow);
3055
- }
3433
+ for (const preparedCase of preparedEval.preparedCases) await finalizePreparedCase({
3434
+ runState,
3435
+ runDir,
3436
+ preparedEval,
3437
+ preparedCase,
3438
+ onCaseFinished,
3439
+ emitEvent
3440
+ });
3056
3441
  preparedEval.evalMeta.columnDefs = [...preparedEval.accumulatedColumns.values()];
3057
3442
  lastRunStatusMap.set(preparedEval.evalMeta.id, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
3058
3443
  const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.id) ?? null;
@@ -3063,9 +3448,11 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3063
3448
  evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.id] ?? null
3064
3449
  });
3065
3450
  }
3451
+ sortCaseRowsByPreparedOrder(runState.cases, preparedEvals);
3452
+ for (const preparedEval of preparedEvals) sortCaseRowsByPreparedOrder(preparedEval.evalCaseRows, preparedEvals);
3066
3453
  const endTime = /* @__PURE__ */ new Date();
3067
3454
  runState.summary.totalDurationMs = endTime.getTime() - new Date(runState.manifest.startedAt).getTime();
3068
- const finalStatus = runState.abortController.signal.aborted ? "cancelled" : evalErrors.length > 0 ? "error" : "completed";
3455
+ const finalStatus = evalErrors.length > 0 ? "error" : "completed";
3069
3456
  runState.summary.status = finalStatus;
3070
3457
  runState.manifest.status = finalStatus;
3071
3458
  const completedRunAt = endTime.toISOString();
@@ -3087,6 +3474,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3087
3474
  evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalId] ?? null
3088
3475
  });
3089
3476
  }
3477
+ await persistRunState(runState);
3090
3478
  emitEvent(runState, {
3091
3479
  type: "run.summary",
3092
3480
  runId: runState.manifest.id,
@@ -3105,7 +3493,6 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3105
3493
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3106
3494
  payload: runState.summary
3107
3495
  });
3108
- await persistRunState(runState);
3109
3496
  emitDiscoveryEvent();
3110
3497
  } catch (error) {
3111
3498
  const message = error instanceof Error ? error.message : String(error);
@@ -3113,13 +3500,13 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3113
3500
  runState.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
3114
3501
  runState.summary.status = "error";
3115
3502
  runState.summary.errorMessage = message;
3503
+ await persistRunState(runState);
3116
3504
  emitEvent(runState, {
3117
3505
  type: "run.error",
3118
3506
  runId: runState.manifest.id,
3119
3507
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3120
3508
  payload: { message }
3121
3509
  });
3122
- await persistRunState(runState);
3123
3510
  emitDiscoveryEvent();
3124
3511
  }
3125
3512
  }
@@ -3127,872 +3514,4 @@ function toLastRunStatus(status) {
3127
3514
  return status === "pending" ? null : status;
3128
3515
  }
3129
3516
  //#endregion
3130
- //#region ../runner/src/runner.ts
3131
- const globMagicCharacters = new Set([
3132
- "*",
3133
- "?",
3134
- "[",
3135
- "]",
3136
- "{",
3137
- "}",
3138
- "(",
3139
- ")",
3140
- "!",
3141
- "+",
3142
- "@"
3143
- ]);
3144
- function hasGlobMagic(value) {
3145
- for (const char of value) if (globMagicCharacters.has(char)) return true;
3146
- return false;
3147
- }
3148
- function getWatchRootForIncludePattern(params) {
3149
- const segments = params.pattern.replaceAll("\\", "/").split("/").filter((part) => part !== "");
3150
- const firstGlobSegmentIndex = segments.findIndex(hasGlobMagic);
3151
- if (firstGlobSegmentIndex === -1) return dirname(resolve(params.workspaceRoot, params.pattern));
3152
- if (firstGlobSegmentIndex === 0) return params.workspaceRoot;
3153
- return resolve(params.workspaceRoot, segments.slice(0, firstGlobSegmentIndex).join("/"));
3154
- }
3155
- function getWatchRootsForIncludePatterns(params) {
3156
- const roots = /* @__PURE__ */ new Set();
3157
- for (const pattern of params.patterns) roots.add(getWatchRootForIncludePattern({
3158
- pattern,
3159
- workspaceRoot: params.workspaceRoot
3160
- }));
3161
- if (roots.size === 0) return [params.workspaceRoot];
3162
- return [...roots];
3163
- }
3164
- /** Create an in-memory eval runner bound to the current workspace config. */
3165
- function createRunner({ watchForChanges = true } = {}) {
3166
- let config;
3167
- let workspaceRoot;
3168
- let localStateDir;
3169
- let cacheStore;
3170
- const evals = /* @__PURE__ */ new Map();
3171
- const runs = /* @__PURE__ */ new Map();
3172
- const lastRunStatusMap = /* @__PURE__ */ new Map();
3173
- const latestRunInfoMap = /* @__PURE__ */ new Map();
3174
- const discoveryListeners = /* @__PURE__ */ new Set();
3175
- let nextShortIdNum = 0;
3176
- let discoveryWatcher;
3177
- let discoveryRefreshTimer;
3178
- function toWorkspaceRelativePath(filePath) {
3179
- return relative(workspaceRoot, filePath).replaceAll("\\", "/");
3180
- }
3181
- function getSortedEvalMetas() {
3182
- return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
3183
- }
3184
- function getSourceFingerprint(source) {
3185
- return createHash("sha256").update(source).digest("hex");
3186
- }
3187
- function getConfiguredConcurrency() {
3188
- const configuredConcurrency = config.concurrency;
3189
- if (typeof configuredConcurrency !== "number" || !Number.isFinite(configuredConcurrency)) return 1;
3190
- return Math.max(1, Math.floor(configuredConcurrency));
3191
- }
3192
- const runner = {
3193
- async init() {
3194
- config = await loadConfig();
3195
- workspaceRoot = config.workspaceRoot ?? process.cwd();
3196
- localStateDir = resolve(workspaceRoot, ".agent-evals");
3197
- await mkdir(localStateDir, { recursive: true });
3198
- await mkdir(join(localStateDir, "runs"), { recursive: true });
3199
- cacheStore = createFsCacheStore({
3200
- workspaceRoot,
3201
- dir: config.cache?.dir,
3202
- maxEntriesPerEval: config.cache?.maxEntriesPerEval
3203
- });
3204
- await loadPersistedRuns();
3205
- await runner.refreshDiscovery();
3206
- if (watchForChanges) await setupWatcher();
3207
- },
3208
- async listCache() {
3209
- return cacheStore.list();
3210
- },
3211
- async clearCache(filter) {
3212
- await cacheStore.clear(filter);
3213
- },
3214
- async recomputeStatusesForEval(evalId) {
3215
- const evalMeta = evals.get(evalId);
3216
- if (!evalMeta) return { updatedRuns: 0 };
3217
- const registry = getEvalRegistry();
3218
- await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? void 0);
3219
- const entry = registry.get(evalId);
3220
- if (!entry) return { updatedRuns: 0 };
3221
- const scoreThresholds = /* @__PURE__ */ new Map();
3222
- entry.use((evalDef) => {
3223
- for (const [key, def] of Object.entries(evalDef.scores ?? {})) {
3224
- const threshold = normalizeScoreDef(def).passThreshold;
3225
- if (threshold !== void 0) scoreThresholds.set(key, threshold);
3226
- }
3227
- for (const [key, def] of Object.entries(evalDef.manualScores ?? {})) if (def.passThreshold !== void 0) scoreThresholds.set(key, def.passThreshold);
3228
- });
3229
- const updatedRuns = await recomputeEvalStatusesInRuns({
3230
- runs: runs.values(),
3231
- evalId,
3232
- evalExists: evals.has(evalId),
3233
- scoreThresholds,
3234
- persistCaseDetail
3235
- });
3236
- emitDiscoveryEvent();
3237
- return { updatedRuns };
3238
- },
3239
- async cleanRunsForEval(evalId) {
3240
- let deletedRuns = 0;
3241
- for (const [runId, run] of [...runs]) {
3242
- if (!runTouchesEval({
3243
- target: run.manifest.target,
3244
- caseRows: run.cases,
3245
- evalId,
3246
- evalExists: evals.has(evalId)
3247
- })) continue;
3248
- if (run.manifest.status === "running") continue;
3249
- runs.delete(runId);
3250
- await rm(run.runDir, {
3251
- recursive: true,
3252
- force: true
3253
- });
3254
- deletedRuns += 1;
3255
- }
3256
- emitDiscoveryEvent();
3257
- return { deletedRuns };
3258
- },
3259
- async updateManualScore({ runId, caseId, scoreKey, value }) {
3260
- const run = runs.get(runId);
3261
- if (!run) return {
3262
- updated: false,
3263
- reason: "Run not found"
3264
- };
3265
- if (run.manifest.status === "running") return {
3266
- updated: false,
3267
- reason: "Run is still running"
3268
- };
3269
- const caseRow = run.cases.find((row) => row.caseId === caseId);
3270
- if (!caseRow) return {
3271
- updated: false,
3272
- reason: "Case not found"
3273
- };
3274
- const evalMeta = evals.get(caseRow.evalId);
3275
- if (!evalMeta) return {
3276
- updated: false,
3277
- reason: "Eval not found"
3278
- };
3279
- if (evalMeta.columnDefs.find((def) => def.key === scoreKey)?.isManualScore !== true) return {
3280
- updated: false,
3281
- reason: "Manual score not found"
3282
- };
3283
- const caseDetail = run.caseDetails.get(caseId);
3284
- if (!caseDetail) return {
3285
- updated: false,
3286
- reason: "Case detail not found"
3287
- };
3288
- caseRow.columns[scoreKey] = value;
3289
- caseDetail.columns[scoreKey] = value;
3290
- const scoreThresholds = /* @__PURE__ */ new Map();
3291
- for (const def of evalMeta.columnDefs) {
3292
- if (def.isScore !== true || def.passThreshold === void 0) continue;
3293
- scoreThresholds.set(def.key, def.passThreshold);
3294
- }
3295
- const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds);
3296
- caseRow.status = nextStatus;
3297
- caseDetail.status = nextStatus;
3298
- const derivedSummary = deriveScopedSummaryFromCases({ caseRows: run.cases });
3299
- run.summary.totalCases = derivedSummary.totalCases;
3300
- run.summary.passedCases = derivedSummary.passedCases;
3301
- run.summary.failedCases = derivedSummary.failedCases;
3302
- run.summary.errorCases = derivedSummary.errorCases;
3303
- run.summary.cancelledCases = derivedSummary.cancelledCases;
3304
- run.summary.totalDurationMs = derivedSummary.totalDurationMs;
3305
- await persistCaseDetail(run.runDir, caseDetail);
3306
- await persistRunState(run);
3307
- emitDiscoveryEvent();
3308
- return {
3309
- updated: true,
3310
- run: {
3311
- manifest: run.manifest,
3312
- summary: run.summary,
3313
- cases: run.cases
3314
- },
3315
- caseDetail
3316
- };
3317
- },
3318
- async deleteRun(runId) {
3319
- const run = runs.get(runId);
3320
- if (!run) return { deleted: false };
3321
- if (run.manifest.status === "running") return { deleted: false };
3322
- runs.delete(runId);
3323
- await rm(run.runDir, {
3324
- recursive: true,
3325
- force: true
3326
- });
3327
- emitDiscoveryEvent();
3328
- return { deleted: true };
3329
- },
3330
- getEvals() {
3331
- const gitState = readGitWorktreeState(workspaceRoot);
3332
- const result = [];
3333
- for (const meta of getSortedEvalMetas()) result.push(buildEvalSummary({
3334
- meta,
3335
- config,
3336
- gitState,
3337
- latestRun: latestRunInfoMap.get(meta.id),
3338
- lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
3339
- }));
3340
- return result;
3341
- },
3342
- getEval(id) {
3343
- const meta = evals.get(id);
3344
- if (!meta) return void 0;
3345
- return buildEvalSummary({
3346
- meta,
3347
- config,
3348
- gitState: readGitWorktreeState(workspaceRoot),
3349
- latestRun: latestRunInfoMap.get(meta.id),
3350
- lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
3351
- });
3352
- },
3353
- async refreshDiscovery() {
3354
- const patterns = config.include;
3355
- const discovered = [];
3356
- for (const pattern of patterns) {
3357
- const files = await glob(pattern, {
3358
- cwd: workspaceRoot,
3359
- absolute: true
3360
- });
3361
- discovered.push(...files);
3362
- }
3363
- evals.clear();
3364
- for (const filePath of discovered) try {
3365
- const content = await readFile(filePath, "utf-8");
3366
- const discoveredMetas = parseEvalMetas(filePath, content);
3367
- const sourceFingerprint = getSourceFingerprint(content);
3368
- const registry = getEvalRegistry();
3369
- try {
3370
- await loadEvalModule(filePath, sourceFingerprint);
3371
- } catch {}
3372
- for (const meta of discoveredMetas) {
3373
- const discoveredEntry = registry.get(meta.id);
3374
- const title = meta.title;
3375
- let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
3376
- let stats;
3377
- let charts;
3378
- discoveredEntry?.use((evalDef) => {
3379
- columnDefs = buildDeclaredColumnDefs(evalDef.columns, evalDef.scores, evalDef.manualScores);
3380
- stats = evalDef.stats;
3381
- const validated = validateCharts({
3382
- charts: evalDef.charts,
3383
- columnDefs,
3384
- evalId: meta.id
3385
- });
3386
- for (const warning of validated.warnings) console.warn(warning);
3387
- charts = validated.charts;
3388
- });
3389
- evals.set(meta.id, {
3390
- id: meta.id,
3391
- title,
3392
- filePath: toWorkspaceRelativePath(meta.filePath),
3393
- sourceFilePath: meta.filePath,
3394
- sourceFingerprint,
3395
- columnDefs,
3396
- caseCount: null,
3397
- stats,
3398
- charts
3399
- });
3400
- }
3401
- } catch {}
3402
- emitDiscoveryEvent();
3403
- },
3404
- async startRun(request) {
3405
- const runId = generateRunId();
3406
- const shortId = `r${String(nextShortIdNum++)}`;
3407
- const now = (/* @__PURE__ */ new Date()).toISOString();
3408
- const cacheMode = request.cache?.mode ?? "use";
3409
- const runDir = join(localStateDir, "runs", runId);
3410
- const manifest = {
3411
- id: runId,
3412
- shortId,
3413
- status: "running",
3414
- startedAt: now,
3415
- endedAt: null,
3416
- commitSha: readGitWorktreeState(workspaceRoot).commitSha,
3417
- evalSourceFingerprints: {},
3418
- target: request.target,
3419
- trials: request.trials,
3420
- trialSelection: config.trialSelection ?? "lowestScore",
3421
- cacheMode
3422
- };
3423
- const summary = {
3424
- runId,
3425
- status: "running",
3426
- totalCases: 0,
3427
- passedCases: 0,
3428
- failedCases: 0,
3429
- errorCases: 0,
3430
- cancelledCases: 0,
3431
- totalDurationMs: null,
3432
- errorMessage: null
3433
- };
3434
- const abortController = new AbortController();
3435
- const runState = {
3436
- runDir,
3437
- manifest,
3438
- summary,
3439
- cases: [],
3440
- caseDetails: /* @__PURE__ */ new Map(),
3441
- listeners: /* @__PURE__ */ new Set(),
3442
- abortController
3443
- };
3444
- runs.set(runId, runState);
3445
- setLatestRunInfoMap({
3446
- latestRunInfoMap,
3447
- evalIds: getTargetEvalIds({
3448
- request,
3449
- sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
3450
- knownEvalIds: new Set(evals.keys())
3451
- }),
3452
- info: {
3453
- status: "running",
3454
- startedAt: now,
3455
- commitSha: manifest.commitSha ?? null,
3456
- evalSourceFingerprint: null
3457
- }
3458
- });
3459
- await mkdir(runDir, { recursive: true });
3460
- await mkdir(join(runDir, "traces"), { recursive: true });
3461
- await mkdir(join(runDir, "artifacts"), { recursive: true });
3462
- await mkdir(join(runDir, "case-details"), { recursive: true });
3463
- await writeFile(join(runDir, "run.json"), JSON.stringify(manifest, null, 2));
3464
- executeRun({
3465
- runState,
3466
- request,
3467
- runDir,
3468
- config,
3469
- evals,
3470
- cacheStore,
3471
- lastRunStatusMap,
3472
- latestRunInfoMap,
3473
- emitEvent,
3474
- emitDiscoveryEvent,
3475
- getSourceFingerprint,
3476
- getConfiguredConcurrency,
3477
- getSortedEvalMetas,
3478
- getTargetEvals
3479
- });
3480
- return {
3481
- manifest,
3482
- summary,
3483
- cases: []
3484
- };
3485
- },
3486
- getRuns() {
3487
- return [...runs.values()].map((r) => r.manifest);
3488
- },
3489
- getRun(id) {
3490
- const run = runs.get(id);
3491
- if (!run) return void 0;
3492
- return {
3493
- manifest: run.manifest,
3494
- summary: run.summary,
3495
- cases: run.cases
3496
- };
3497
- },
3498
- cancelRun(id) {
3499
- const run = runs.get(id);
3500
- if (!run) return;
3501
- run.abortController.abort();
3502
- run.manifest.status = "cancelled";
3503
- run.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
3504
- run.summary.status = "cancelled";
3505
- emitEvent(run, {
3506
- type: "run.cancelled",
3507
- runId: id,
3508
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3509
- payload: run.summary
3510
- });
3511
- },
3512
- getCaseDetail(runId, caseId) {
3513
- const run = runs.get(runId);
3514
- if (!run) return void 0;
3515
- return run.caseDetails.get(caseId);
3516
- },
3517
- subscribe(runId, listener) {
3518
- const run = runs.get(runId);
3519
- if (!run) return () => {};
3520
- run.listeners.add(listener);
3521
- return () => {
3522
- run.listeners.delete(listener);
3523
- };
3524
- },
3525
- subscribeDiscovery(listener) {
3526
- discoveryListeners.add(listener);
3527
- return () => {
3528
- discoveryListeners.delete(listener);
3529
- };
3530
- },
3531
- async close() {
3532
- if (discoveryRefreshTimer !== void 0) {
3533
- clearTimeout(discoveryRefreshTimer);
3534
- discoveryRefreshTimer = void 0;
3535
- }
3536
- const watcher = discoveryWatcher;
3537
- if (watcher === void 0) return;
3538
- discoveryWatcher = void 0;
3539
- await watcher.close();
3540
- },
3541
- getWorkspaceRoot() {
3542
- return workspaceRoot;
3543
- },
3544
- getArtifactPath(artifactId_) {
3545
- return resolveArtifactPath(join(localStateDir, "runs"), artifactId_);
3546
- }
3547
- };
3548
- async function setupWatcher() {
3549
- const watcher = watch(getWatchRootsForIncludePatterns({
3550
- patterns: config.include,
3551
- workspaceRoot
3552
- }), {
3553
- ignoreInitial: true,
3554
- persistent: true
3555
- });
3556
- discoveryWatcher = watcher;
3557
- const scheduleRefresh = () => {
3558
- if (discoveryRefreshTimer !== void 0) clearTimeout(discoveryRefreshTimer);
3559
- discoveryRefreshTimer = setTimeout(() => {
3560
- discoveryRefreshTimer = void 0;
3561
- runner.refreshDiscovery();
3562
- }, 50);
3563
- };
3564
- watcher.on("change", scheduleRefresh);
3565
- watcher.on("add", scheduleRefresh);
3566
- watcher.on("unlink", scheduleRefresh);
3567
- watcher.on("addDir", scheduleRefresh);
3568
- watcher.on("unlinkDir", scheduleRefresh);
3569
- await new Promise((ready) => {
3570
- watcher.once("ready", ready);
3571
- });
3572
- }
3573
- function emitDiscoveryEvent() {
3574
- const lastRunStatuses = getLastRunStatuses({
3575
- runs: runs.values(),
3576
- knownEvals: evals.values()
3577
- });
3578
- const latestRunInfos = getLatestRunInfos({
3579
- runs: runs.values(),
3580
- knownEvals: evals.values()
3581
- });
3582
- lastRunStatusMap.clear();
3583
- for (const [evalId, status] of lastRunStatuses) lastRunStatusMap.set(evalId, status);
3584
- latestRunInfoMap.clear();
3585
- for (const [evalId, info] of latestRunInfos) latestRunInfoMap.set(evalId, info);
3586
- const event = {
3587
- type: "discovery.updated",
3588
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3589
- payload: runner.getEvals()
3590
- };
3591
- for (const listener of discoveryListeners) listener(event);
3592
- }
3593
- function getTargetEvals(request) {
3594
- if (request.target.evalIds && request.target.evalIds.length > 0) return request.target.evalIds.map((id) => evals.get(id)).filter((e) => e !== void 0);
3595
- return getSortedEvalMetas();
3596
- }
3597
- function emitEvent(runState, event) {
3598
- for (const listener of runState.listeners) try {
3599
- listener(event);
3600
- } catch {}
3601
- }
3602
- async function loadPersistedRuns() {
3603
- runs.clear();
3604
- const persistedRuns = await loadPersistedRunSnapshots(localStateDir);
3605
- nextShortIdNum = nextShortIdFromSnapshots(persistedRuns);
3606
- for (const persistedRun of persistedRuns) runs.set(persistedRun.manifest.id, {
3607
- ...persistedRun,
3608
- listeners: /* @__PURE__ */ new Set(),
3609
- abortController: new AbortController()
3610
- });
3611
- }
3612
- return runner;
3613
- }
3614
- //#endregion
3615
- //#region src/cli.ts
3616
- function parseArgs(argv) {
3617
- const args = {
3618
- command: "help",
3619
- subcommand: void 0,
3620
- showHelp: false,
3621
- helpTopic: "global",
3622
- unknownHelpTarget: void 0,
3623
- evalIds: [],
3624
- caseIds: [],
3625
- trials: 1,
3626
- json: false,
3627
- port: 4100,
3628
- cacheMode: "use",
3629
- clearCache: false,
3630
- all: false
3631
- };
3632
- const command = argv[0];
3633
- if (command === "--help" || command === "-h") {
3634
- args.showHelp = true;
3635
- return args;
3636
- }
3637
- if (isCliCommand(command)) {
3638
- args.command = command;
3639
- args.helpTopic = command === "help" ? "global" : command;
3640
- } else if (command !== void 0 && !command.startsWith("-")) args.unknownHelpTarget = command;
3641
- let cursor = 1;
3642
- if (args.command === "cache") {
3643
- const sub = argv[cursor];
3644
- if (sub === "list" || sub === "clear") {
3645
- args.subcommand = sub;
3646
- args.helpTopic = `cache ${sub}`;
3647
- cursor++;
3648
- } else if (sub !== void 0 && !sub.startsWith("-")) args.unknownHelpTarget = `cache ${sub}`;
3649
- }
3650
- for (let i = cursor; i < argv.length; i++) {
3651
- const arg = argv[i];
3652
- const next = argv[i + 1];
3653
- if (arg === "--help" || arg === "-h") args.showHelp = true;
3654
- else if (arg === "--eval" && next) {
3655
- args.evalIds.push(...next.split(","));
3656
- i++;
3657
- } else if (arg === "--case" && next) {
3658
- args.caseIds.push(...next.split(","));
3659
- i++;
3660
- } else if (arg === "--trials" && next) {
3661
- args.trials = Number(next);
3662
- i++;
3663
- } else if (arg === "--json") args.json = true;
3664
- else if (arg === "--port" && next) {
3665
- args.port = Number(next);
3666
- i++;
3667
- } else if (arg === "--cache" && next) {
3668
- if (next === "use" || next === "bypass" || next === "refresh") args.cacheMode = next;
3669
- i++;
3670
- } else if (arg === "--no-cache") args.cacheMode = "bypass";
3671
- else if (arg === "--refresh-cache") args.cacheMode = "refresh";
3672
- else if (arg === "--clear-cache") args.clearCache = true;
3673
- else if (arg === "--all") args.all = true;
3674
- }
3675
- return args;
3676
- }
3677
- /**
3678
- * Run the Agent Evals CLI against the current workspace.
3679
- *
3680
- * @param argv Raw command-line arguments excluding the executable name.
3681
- */
3682
- async function runCli(argv) {
3683
- const args = parseArgs(argv);
3684
- if (args.showHelp) {
3685
- if (args.unknownHelpTarget !== void 0) {
3686
- console.error(`No help found for "${args.unknownHelpTarget}".`);
3687
- process.exit(1);
3688
- return;
3689
- }
3690
- printHelp(args.helpTopic);
3691
- return;
3692
- }
3693
- switch (args.command) {
3694
- case "app":
3695
- await commandApp(args);
3696
- break;
3697
- case "list":
3698
- await commandList(args);
3699
- break;
3700
- case "run":
3701
- await commandRun(args);
3702
- break;
3703
- case "cache":
3704
- await commandCache(args);
3705
- break;
3706
- default:
3707
- printHelp(args.helpTopic);
3708
- break;
3709
- }
3710
- }
3711
- function isCliCommand(command) {
3712
- return command === "app" || command === "list" || command === "run" || command === "cache" || command === "help";
3713
- }
3714
- const currentDir = dirname(fileURLToPath(import.meta.url));
3715
- const repoRoot = resolve(currentDir, "../../..");
3716
- const pnpmCommand = process.platform === "win32" ? "pnpm.cmd" : "pnpm";
3717
- function hasRepoWebWorkspace() {
3718
- return existsSync(resolve(repoRoot, "apps/web/package.json"));
3719
- }
3720
- async function ensureWebUiIsBuilt() {
3721
- if (!hasRepoWebWorkspace()) return;
3722
- console.info("Preparing web UI...");
3723
- await new Promise((resolvePromise, rejectPromise) => {
3724
- const child = spawn(pnpmCommand, [
3725
- "--filter",
3726
- "@agent-evals/web",
3727
- "build"
3728
- ], {
3729
- cwd: repoRoot,
3730
- stdio: "inherit"
3731
- });
3732
- child.once("error", (error) => {
3733
- rejectPromise(error);
3734
- });
3735
- child.once("exit", (code, signal) => {
3736
- if (signal) {
3737
- rejectPromise(/* @__PURE__ */ new Error(`Web UI build stopped with signal ${signal}.`));
3738
- return;
3739
- }
3740
- if (code !== 0) {
3741
- rejectPromise(/* @__PURE__ */ new Error(`Web UI build failed with exit code ${String(code)}.`));
3742
- return;
3743
- }
3744
- resolvePromise();
3745
- });
3746
- });
3747
- }
3748
- function isHonoAppModule(mod) {
3749
- if (typeof mod !== "object" || mod === null || !("app" in mod)) return false;
3750
- const { app } = mod;
3751
- return typeof app === "object" && app !== null && "fetch" in app && typeof app.fetch === "function";
3752
- }
3753
- function isServerRunnerModule(mod) {
3754
- if (typeof mod !== "object" || mod === null || !("initRunner" in mod)) return false;
3755
- return typeof mod.initRunner === "function";
3756
- }
3757
- async function commandApp(args) {
3758
- await ensureWebUiIsBuilt();
3759
- const { serve } = await import("@hono/node-server");
3760
- const bundledWebDist = resolve(currentDir, "apps/web/dist");
3761
- if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
3762
- const appModule = await import("./app-C5CJ1sX6.mjs");
3763
- const runnerModule = await import("./runner-Cdlvk56X.mjs");
3764
- if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
3765
- if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
3766
- await runnerModule.initRunner();
3767
- console.info(`Agent Evals app: http://localhost:${String(args.port)}`);
3768
- serve({
3769
- fetch: appModule.app.fetch,
3770
- port: args.port
3771
- });
3772
- }
3773
- async function commandList(args_) {
3774
- const runner = createRunner({ watchForChanges: false });
3775
- await runner.init();
3776
- const evals = runner.getEvals();
3777
- if (evals.length === 0) {
3778
- console.info("No eval files found.");
3779
- return;
3780
- }
3781
- console.info("Discovered evals:\n");
3782
- for (const ev of evals) {
3783
- const displayStatus = getEvalDisplayStatus({
3784
- freshnessStatus: ev.freshnessStatus,
3785
- stale: ev.stale,
3786
- outdated: ev.outdated,
3787
- lastRunStatus: ev.lastRunStatus
3788
- });
3789
- const title = getEvalTitle(ev);
3790
- console.info(` ${title}`);
3791
- console.info(` id: ${ev.id}`);
3792
- console.info(` file: ${ev.filePath}`);
3793
- if (displayStatus !== "pending") console.info(` status: ${displayStatus}`);
3794
- if (ev.caseCount !== null) console.info(` cases: ${String(ev.caseCount)}`);
3795
- console.info("");
3796
- }
3797
- }
3798
- async function commandRun(args) {
3799
- const runner = createRunner({ watchForChanges: false });
3800
- await runner.init();
3801
- if (args.clearCache) {
3802
- await runner.clearCache();
3803
- if (!args.json) {
3804
- console.info("Cleared cache before run.");
3805
- console.info("");
3806
- }
3807
- }
3808
- const target = args.caseIds.length > 0 ? {
3809
- mode: "caseIds",
3810
- caseIds: args.caseIds,
3811
- evalIds: args.evalIds.length > 0 ? args.evalIds : void 0
3812
- } : args.evalIds.length > 0 ? {
3813
- mode: "evalIds",
3814
- evalIds: args.evalIds
3815
- } : { mode: "all" };
3816
- const run = await runner.startRun({
3817
- target,
3818
- trials: args.trials,
3819
- cache: { mode: args.cacheMode }
3820
- });
3821
- if (!args.json) {
3822
- console.info(`Run started: ${run.manifest.id}`);
3823
- console.info(`Trials: ${String(args.trials)}`);
3824
- if (args.cacheMode !== "use") console.info(`Cache mode: ${args.cacheMode}`);
3825
- console.info("");
3826
- }
3827
- await waitForRunCompletion(runner, run.manifest.id);
3828
- const finalRun = runner.getRun(run.manifest.id);
3829
- if (!finalRun) {
3830
- process.exit(1);
3831
- return;
3832
- }
3833
- const { summary } = finalRun;
3834
- if (args.json) console.info(JSON.stringify(summary, null, 2));
3835
- else {
3836
- console.info("--- Run Summary ---");
3837
- console.info(`Status: ${summary.status}`);
3838
- console.info(`Total: ${String(summary.totalCases)}`);
3839
- console.info(`Passed: ${String(summary.passedCases)}`);
3840
- console.info(`Failed: ${String(summary.failedCases)}`);
3841
- console.info(`Errors: ${String(summary.errorCases)}`);
3842
- if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
3843
- if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
3844
- }
3845
- if (summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
3846
- }
3847
- async function commandCache(args) {
3848
- const runner = createRunner({ watchForChanges: false });
3849
- await runner.init();
3850
- if (args.subcommand === "list" || args.subcommand === void 0) {
3851
- const entries = await runner.listCache();
3852
- if (args.json) {
3853
- console.info(JSON.stringify(entries, null, 2));
3854
- return;
3855
- }
3856
- if (entries.length === 0) {
3857
- console.info("No cache entries.");
3858
- return;
3859
- }
3860
- console.info(`Cache entries (${String(entries.length)}):\n`);
3861
- for (const entry of entries) {
3862
- console.info(` ${entry.namespace}`);
3863
- console.info(` key: ${entry.key}`);
3864
- console.info(` span: ${entry.spanName} (${entry.spanKind})`);
3865
- console.info(` stored: ${entry.storedAt}`);
3866
- console.info(` size: ${String(entry.sizeBytes)} bytes`);
3867
- console.info("");
3868
- }
3869
- return;
3870
- }
3871
- if (args.subcommand === "clear") {
3872
- if (args.evalIds.length > 0) {
3873
- for (const evalId of args.evalIds) {
3874
- const entries = await runner.listCache();
3875
- const prefix = `${evalId}__`;
3876
- const matching = entries.filter((entry) => entry.namespace.startsWith(prefix));
3877
- for (const entry of matching) await runner.clearCache({
3878
- namespace: entry.namespace,
3879
- key: entry.key
3880
- });
3881
- }
3882
- console.info(`Cleared cache entries for: ${args.evalIds.join(", ")}`);
3883
- return;
3884
- }
3885
- if (args.all) {
3886
- await runner.clearCache();
3887
- console.info("Cleared all cache entries.");
3888
- return;
3889
- }
3890
- console.info("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
3891
- process.exit(1);
3892
- return;
3893
- }
3894
- printHelp(args.helpTopic);
3895
- }
3896
- async function waitForRunCompletion(runner, runId) {
3897
- return new Promise((resolvePromise) => {
3898
- const check = () => {
3899
- const run = runner.getRun(runId);
3900
- if (!run || run.manifest.status === "completed" || run.manifest.status === "cancelled" || run.manifest.status === "error") {
3901
- resolvePromise();
3902
- return;
3903
- }
3904
- setTimeout(check, 200);
3905
- };
3906
- check();
3907
- });
3908
- }
3909
- function printHelp(topic = "global") {
3910
- if (topic === "app") {
3911
- console.info(`
3912
- agent-evals app - Start server with UI
3913
-
3914
- Usage:
3915
- agent-evals app [flags]
3916
-
3917
- Flags:
3918
- --port <n> Server port (default: 4100)
3919
- --help, -h Show this help
3920
- `);
3921
- return;
3922
- }
3923
- if (topic === "list") {
3924
- console.info(`
3925
- agent-evals list - List discovered evals
3926
-
3927
- Usage:
3928
- agent-evals list [flags]
3929
-
3930
- Flags:
3931
- --help, -h Show this help
3932
- `);
3933
- return;
3934
- }
3935
- if (topic === "run") {
3936
- console.info(`
3937
- agent-evals run - Run evals
3938
-
3939
- Usage:
3940
- agent-evals run [flags]
3941
-
3942
- Flags:
3943
- --eval <id> Run specific eval(s) (comma-separated)
3944
- --case <id> Run specific case(s) (comma-separated)
3945
- --trials <n> Number of trials per case
3946
- --json Output run summary as JSON
3947
- --cache <use|bypass|refresh> Cache mode for this run (default: use)
3948
- --no-cache Shortcut for --cache bypass
3949
- --refresh-cache Shortcut for --cache refresh
3950
- --clear-cache Clear the cache before starting the run
3951
- --help, -h Show this help
3952
- `);
3953
- return;
3954
- }
3955
- if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
3956
- console.info(`
3957
- agent-evals cache - Manage cached operation entries
3958
-
3959
- Usage:
3960
- agent-evals cache list [flags]
3961
- agent-evals cache clear --eval <id>
3962
- agent-evals cache clear --all
3963
-
3964
- Flags:
3965
- --eval <id> Clear entries for specific eval(s) (comma-separated)
3966
- --all Confirm clearing every cached entry
3967
- --json Output cache listing as JSON
3968
- --help, -h Show this help
3969
- `);
3970
- return;
3971
- }
3972
- console.info(`
3973
- agent-evals - LLM/Agent eval runner
3974
-
3975
- Commands:
3976
- app Start server with UI
3977
- list List discovered evals
3978
- run Run evals
3979
- cache list List cached operation entries
3980
- cache clear --eval <id> Clear cache entries for one eval
3981
- cache clear --all Clear every cached entry
3982
- help Show this help
3983
-
3984
- Options:
3985
- --eval <id> Run specific eval(s) (comma-separated)
3986
- --case <id> Run specific case(s) (comma-separated)
3987
- --trials <n> Number of trials per case
3988
- --json Output results as JSON
3989
- --port <n> Server port (default: 4100)
3990
- --cache <use|bypass|refresh> Cache mode for this run (default: use)
3991
- --no-cache Shortcut for --cache bypass
3992
- --refresh-cache Shortcut for --cache refresh
3993
- --clear-cache Clear the cache before starting the run
3994
- --help, -h Show help
3995
- `);
3996
- }
3997
- //#endregion
3998
- export { columnKindSchema as $, evalSummarySchema as A, evalChartsConfigSchema as B, assertionFailureSchema as C, evalStatAggregateSchema as D, evalFreshnessStatusSchema as E, evalChartColorSchema as F, traceDisplayConfigSchema as G, traceAttributeDisplayInputSchema as H, evalChartConfigSchema as I, traceSpanKindSchema as J, traceDisplayInputConfigSchema as K, evalChartMetricSchema as L, evalChartAggregateSchema as M, evalChartAxisSchema as N, evalStatItemSchema as O, evalChartBuiltinMetricSchema as P, columnFormatSchema as Q, evalChartTooltipExtraSchema as R, spanCacheOptionsSchema as S, caseRowSchema as T, traceAttributeDisplayPlacementSchema as U, traceAttributeDisplayFormatSchema as V, traceAttributeDisplaySchema as W, cellValueSchema as X, traceSpanSchema as Y, columnDefSchema as Z, cacheListItemSchema as _, setEvalOutput as _t, sseEnvelopeSchema as a, buildTraceTree as at, cacheRecordingSchema as b, defineEval as bt, deriveScopedSummaryFromCases as c, evalTracer as ct, runManifestSchema as d, EvalAssertionError as dt, fileRefSchema as et, runSummarySchema as f, evalAssert as ft, cacheFileSchema as g, runInEvalScope as gt, cacheEntrySchema as h, isInEvalScope as ht, updateManualScoreRequestSchema as i, runArtifactRefSchema as it, scoreTraceSchema as j, evalStatsConfigSchema as k, deriveStatusFromCaseRows as l, hashCacheKey as lt, trialSelectionModeSchema as m, incrementEvalOutput as mt, createRunner as n, numberDisplayOptionsSchema as nt, getEvalTitle as o, captureEvalSpanError as ot, agentEvalsConfigSchema as p, getCurrentScope as pt, traceSpanErrorSchema as q, createRunRequestSchema as r, repoFileRefSchema as rt, getEvalDisplayStatus as s, evalSpan as st, runCli as t, jsonCellSchema as tt, deriveStatusFromChildStatuses as u, hashCacheKeySync as ut, cacheModeSchema as v, setScopeCacheContext as vt, caseDetailSchema as w, serializedCacheSpanSchema as x, getEvalRegistry as xt, cacheRecordingOpSchema as y, repoFile as yt, evalChartTypeSchema as z };
3517
+ export { evalChartAxisSchema as $, runManifestSchema as A, evalTracer as At, cacheRecordingSchema as B, mergeEvalOutput as Bt, updateManualScoreRequestSchema as C, numberDisplayOptionsSchema as Ct, deriveScopedSummaryFromCases as D, buildTraceTree as Dt, getEvalDisplayStatus as E, z$1 as Et, cacheFileSchema as F, evalAssert as Ft, caseRowSchema as G, defineEval as Gt, spanCacheOptionsSchema as H, setEvalOutput as Ht, cacheListItemSchema as I, getCurrentScope as It, evalStatItemSchema as J, evalFreshnessStatusSchema as K, getEvalRegistry as Kt, cacheModeSchema as L, getEvalCaseInput as Lt, agentEvalsConfigSchema as M, hashCacheKeySync as Mt, trialSelectionModeSchema as N, EvalAssertionError as Nt, deriveStatusFromCaseRows as O, captureEvalSpanError as Ot, cacheEntrySchema as P, appendToEvalOutput as Pt, evalChartAggregateSchema as Q, cacheOperationTypeSchema as R, incrementEvalOutput as Rt, createRunRequestSchema as S, jsonCellSchema as St, getEvalTitle as T, runArtifactRefSchema as Tt, assertionFailureSchema as U, setScopeCacheContext as Ut, serializedCacheSpanSchema as V, runInEvalScope as Vt, caseDetailSchema as W, repoFile as Wt, evalSummarySchema as X, evalStatsConfigSchema as Y, scoreTraceSchema as Z, loadEvalModule as _, cellValueSchema as _t, loadPersistedRunSnapshot as a, evalChartTypeSchema as at, normalizeScoreDef as b, columnKindSchema as bt, persistCaseDetail as c, traceAttributeDisplayInputSchema as ct, recomputePersistedCaseStatus as d, traceDisplayConfigSchema as dt, evalChartBuiltinMetricSchema as et, runTouchesEval as f, traceDisplayInputConfigSchema as ft, setLatestRunInfoMap as g, traceSpanWarningSchema as gt, getTargetEvalIds as h, traceSpanSchema as ht, getLatestRunInfos as i, evalChartTooltipExtraSchema as it, runSummarySchema as j, hashCacheKey as jt, deriveStatusFromChildStatuses as k, evalSpan as kt, persistRunState as l, traceAttributeDisplayPlacementSchema as lt, buildEvalSummary as m, traceSpanKindSchema as mt, generateRunId as n, 
evalChartConfigSchema as nt, loadPersistedRunSnapshots as o, evalChartsConfigSchema as ot, resolveArtifactPath as p, traceSpanErrorSchema as pt, evalStatAggregateSchema as q, getLastRunStatuses as r, evalChartMetricSchema as rt, nextShortIdFromSnapshots as s, traceAttributeDisplayFormatSchema as st, executeRun as t, evalChartColorSchema as tt, recomputeEvalStatusesInRuns as u, traceAttributeDisplaySchema as ut, loadConfig as v, columnDefSchema as vt, sseEnvelopeSchema as w, repoFileRefSchema as wt, createFsCacheStore as x, fileRefSchema as xt, buildDeclaredColumnDefs as y, columnFormatSchema as yt, cacheRecordingOpSchema as z, isInEvalScope as zt };