@ls-stack/agent-eval 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-C5CJ1sX6.mjs → app-TjV5nDMM.mjs} +5 -5
- package/dist/apps/web/dist/assets/index-ClE28i5w.css +1 -0
- package/dist/apps/web/dist/assets/index-gGumCEnD.js +112 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +39 -6
- package/dist/cli-BTtgQLjB.mjs +1285 -0
- package/dist/index.d.mts +1072 -829
- package/dist/index.mjs +4 -3
- package/dist/runChild.d.mts +1 -0
- package/dist/runChild.mjs +107 -0
- package/dist/{cli-C5FL7C4G.mjs → runOrchestration-HaMahl6b.mjs} +1216 -1697
- package/dist/{runner-Cdlvk56X.mjs → runner-CBDZos0Z.mjs} +1 -1
- package/dist/{runner-K2bN8KRS.mjs → runner-DGVoOyJt.mjs} +2 -2
- package/dist/src-Bt5Fz9HS.mjs +3 -0
- package/package.json +3 -2
- package/dist/apps/web/dist/assets/index-CBvHVkE7.js +0 -109
- package/dist/apps/web/dist/assets/index-Dd7I28ts.css +0 -1
- package/dist/src-gqm1z1Nu.mjs +0 -2
|
@@ -1,16 +1,14 @@
|
|
|
1
|
+
import { createRequire, registerHooks } from "node:module";
|
|
1
2
|
import { createHash } from "node:crypto";
|
|
2
3
|
import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
|
|
3
|
-
import {
|
|
4
|
+
import { extname, isAbsolute, join, relative, resolve } from "node:path";
|
|
5
|
+
import { z, z as z$1 } from "zod/v4";
|
|
4
6
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
5
7
|
import { Buffer as Buffer$1 } from "node:buffer";
|
|
6
8
|
import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
|
|
7
|
-
import { z } from "zod/v4";
|
|
8
|
-
import { watch } from "chokidar";
|
|
9
|
-
import { glob } from "glob";
|
|
10
9
|
import { existsSync } from "node:fs";
|
|
11
10
|
import { resultify } from "t-result";
|
|
12
11
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
13
|
-
import { spawn, spawnSync } from "node:child_process";
|
|
14
12
|
//#region ../sdk/src/defineEval.ts
|
|
15
13
|
const evalRegistry = /* @__PURE__ */ new Map();
|
|
16
14
|
/** Return the in-memory registry of evals defined in the current process. */
|
|
@@ -70,6 +68,27 @@ function getCurrentScope() {
|
|
|
70
68
|
/** Report whether `getCurrentScope()` currently yields a scope. */
function isInEvalScope() {
	const scope = getCurrentScope();
	return scope !== void 0;
}
|
|
71
|
+
/**
 * True for any non-null value whose `typeof` is "object" (arrays included).
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isObjectLike(value) {
	if (value === null) return false;
	return typeof value === "object";
}
|
|
74
|
+
/**
 * True for non-null, non-array values whose `typeof` is "object".
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isObjectRecord(value) {
	if (value === null || typeof value !== "object") return false;
	return !Array.isArray(value);
}
|
|
77
|
+
/**
 * Return a shallow copy of an array.
 *
 * @param {unknown[]} value
 * @returns {unknown[]} A new array holding the same items.
 */
function copyArray$1(value) {
	const duplicate = value.slice();
	return duplicate;
}
|
|
80
|
+
/**
 * Read the input stored on the current scope, optionally following a
 * dot-separated property path.
 *
 * @param {string} [path] Dot-separated path such as `"a.b.c"`. When omitted the
 *   whole input is returned.
 * @returns {unknown} The resolved value, or `undefined` when there is no scope,
 *   the path is empty, a segment is empty, or a non-object is reached before
 *   the path is exhausted.
 */
function getEvalCaseInput(path = void 0) {
	const scope = getCurrentScope();
	if (!scope) return void 0;
	if (path === void 0) return scope.input;
	if (path.length === 0) return void 0;
	let cursor = scope.input;
	for (const segment of path.split(".")) {
		const canDescend = segment.length > 0 && isObjectLike(cursor);
		if (!canDescend) return void 0;
		cursor = cursor[segment];
	}
	return cursor;
}
|
|
73
92
|
/**
|
|
74
93
|
* Attach cache context (adapter, mode, eval id, fingerprint) to a scope.
|
|
75
94
|
*
|
|
@@ -86,6 +105,7 @@ function setScopeCacheContext(scope, context) {
|
|
|
86
105
|
async function runInEvalScope(caseId, fn, options = {}) {
|
|
87
106
|
const scope = {
|
|
88
107
|
caseId,
|
|
108
|
+
input: options.input,
|
|
89
109
|
outputs: {},
|
|
90
110
|
assertionFailures: [],
|
|
91
111
|
spans: [],
|
|
@@ -145,6 +165,58 @@ function setEvalOutput(key, value) {
|
|
|
145
165
|
});
|
|
146
166
|
}
|
|
147
167
|
/**
 * Append an item to an output array in the current case scope.
 *
 * - no existing value: the output becomes `[value]`
 * - existing array: a new array with `value` added at the end
 * - any other existing value: `[existing, value]`
 *
 * Does nothing when there is no current scope.
 */
function appendToEvalOutput(key, value) {
	const scope = getCurrentScope();
	if (!scope) return;
	const previous = scope.outputs[key];
	let next;
	if (previous === void 0) next = [value];
	else if (Array.isArray(previous)) next = [...copyArray$1(previous), value];
	else next = [previous, value];
	scope.outputs[key] = next;
	recordOpIfActive(scope, {
		kind: "appendOutput",
		key,
		value
	});
}
|
|
186
|
+
/**
 * Shallow-merge object fields into an output value in the current case scope.
 *
 * A missing output becomes a copy of `patch`. If the existing output is not a
 * plain (non-array) object, an assertion failure is pushed onto the scope and
 * the output is left untouched. Does nothing when there is no current scope.
 */
function mergeEvalOutput(key, patch) {
	const scope = getCurrentScope();
	if (!scope) return;
	const previous = scope.outputs[key];
	if (previous !== void 0 && !isObjectRecord(previous)) {
		const actualKind = Array.isArray(previous) ? "array" : typeof previous;
		scope.assertionFailures.push(toAssertionFailure$1(`mergeEvalOutput("${key}"): existing value is ${actualKind}, expected object`));
		return;
	}
	scope.outputs[key] = previous === void 0 ? { ...patch } : {
		...previous,
		...patch
	};
	recordOpIfActive(scope, {
		kind: "mergeOutput",
		key,
		patch
	});
}
|
|
219
|
+
/**
|
|
148
220
|
* Add a numeric delta to an output value in the current case scope.
|
|
149
221
|
*
|
|
150
222
|
* If the existing value is non-numeric, the operation is recorded as an
|
|
@@ -189,18 +261,267 @@ function evalAssert(condition, message) {
|
|
|
189
261
|
throw error;
|
|
190
262
|
}
|
|
191
263
|
//#endregion
|
|
192
|
-
//#region ../sdk/src/
|
|
193
|
-
|
|
264
|
+
//#region ../sdk/src/cacheKey.ts
|
|
265
|
+
/**
 * Wrapper around a value stored in `value`; `stringifyCacheKeyValue` returns
 * that stored value unchanged for instances of this class.
 */
var SerializedCacheKeyValue = class {
	/** The wrapped value. */
	value;
	constructor(serialized) {
		this.value = serialized;
	}
};
|
|
271
|
+
/**
 * Hash the components of a cache key into a hex digest.
 *
 * The input is first passed through `materializeAsyncCacheKeyValue`, which
 * reads native `Blob`/`File` values asynchronously so they are hashed by
 * content. Prefer `hashCacheKeySync` when the key holds no such values.
 *
 * @returns {Promise<string>} SHA-256 hex digest of the materialized key.
 */
async function hashCacheKey(input) {
	const materialized = await materializeAsyncCacheKeyValue(input);
	return hashCacheKeySyncMaterialized(materialized);
}
|
|
280
|
+
/**
 * Synchronous counterpart of `hashCacheKey`.
 *
 * Handles JSON-like data plus in-memory binary values (`Buffer`,
 * `ArrayBuffer`, typed arrays). Native `Blob`/`File` values are not read here,
 * so they cannot be hashed by content.
 *
 * @returns {string} SHA-256 hex digest of the key.
 */
function hashCacheKeySync(input) {
	const digest = hashCacheKeySyncMaterialized(input);
	return digest;
}
|
|
288
|
+
/**
 * Build a composite key string for `input` (using `stringifyCacheKeyValue` as
 * the custom stringifier) and return its SHA-256 hex digest.
 */
function hashCacheKeySyncMaterialized(input) {
	const compositeKey = getCompositeKey(input, { stringify: stringifyCacheKeyValue });
	const hasher = createHash("sha256");
	hasher.update(compositeKey);
	return hasher.digest("hex");
}
|
|
291
|
+
/**
 * Custom stringifier for special cache key values.
 *
 * Returns a tagged string for pre-serialized wrappers, binary buffers/views
 * (hashed by content), and `File`/`Blob` values (described by metadata only,
 * since their bytes cannot be read synchronously). Returns `undefined` for
 * everything else. The order of the checks matters: `Buffer` is tested before
 * the generic view check and `File` before `Blob`.
 */
function stringifyCacheKeyValue(value) {
	if (value instanceof SerializedCacheKeyValue) {
		return value.value;
	}
	if (Buffer$1.isBuffer(value)) {
		return `$buffer:${hashBytes(value)}`;
	}
	if (isArrayBuffer(value)) {
		return `$arrayBuffer:${hashBytes(new Uint8Array(value))}`;
	}
	if (isSharedArrayBuffer(value)) {
		return `$sharedArrayBuffer:${hashBytes(new Uint8Array(value))}`;
	}
	if (isArrayBufferView(value)) {
		// Hash only the window the view covers, not the whole backing buffer.
		const viewBytes = new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
		const viewType = value.constructor.name;
		return `$${viewType}:${hashBytes(viewBytes)}`;
	}
	if (isFile$1(value)) {
		const fileMeta = getCompositeKey({
			lastModified: value.lastModified,
			name: value.name,
			size: value.size,
			type: value.type
		});
		return `$file:${fileMeta}`;
	}
	if (isBlob$1(value)) {
		const blobMeta = getCompositeKey({
			size: value.size,
			type: value.type
		});
		return `$blob:${blobMeta}`;
	}
	return void 0;
}
|
|
311
|
+
/**
 * Recursively replace values that need asynchronous serialization
 * (`Blob`/`File`) with `SerializedCacheKeyValue` wrappers so the result can be
 * hashed synchronously.
 *
 * Values that `stringifyCacheKeyValue` already handles, and primitives, are
 * returned unchanged. Arrays and plain objects are rebuilt with their children
 * materialized.
 *
 * @param {unknown} value Cache key value to materialize.
 * @param {WeakSet<object>} [refs] Containers on the current traversal path,
 *   used for cycle detection.
 * @returns {Promise<unknown>} The materialized value.
 * @throws {Error} "Circular reference detected" when a container (object or
 *   array) contains itself, directly or indirectly.
 */
async function materializeAsyncCacheKeyValue(value, refs = /* @__PURE__ */ new WeakSet()) {
	const serialized = await stringifyAsyncCacheKeyValue(value);
	if (serialized !== void 0) return new SerializedCacheKeyValue(serialized);
	if (stringifyCacheKeyValue(value) !== void 0) return value;
	if (!value || typeof value !== "object") return value;
	// Track arrays as well as objects: the previous version only registered
	// objects, so an array that contained itself recursed without end.
	if (refs.has(value)) throw new Error("Circular reference detected");
	refs.add(value);
	try {
		if (Array.isArray(value)) {
			const items = [];
			for (const item of value) items.push(await materializeAsyncCacheKeyValue(item, refs));
			return items;
		}
		const entries = [];
		for (const [key, entryValue] of Object.entries(value)) entries.push([key, await materializeAsyncCacheKeyValue(entryValue, refs)]);
		return Object.fromEntries(entries);
	} finally {
		// Only the current path is tracked, so shared (non-cyclic) references
		// to the same container remain legal.
		refs.delete(value);
	}
}
|
|
328
|
+
/**
 * Serialize a `File` or `Blob` into a tagged string that includes a hash of its
 * bytes alongside its metadata. Resolves to `undefined` for any other value.
 * `File` is checked before `Blob`.
 */
async function stringifyAsyncCacheKeyValue(value) {
	if (isFile$1(value)) {
		const fileKey = getCompositeKey({
			bytes: await hashBlobBytes(value),
			lastModified: value.lastModified,
			name: value.name,
			size: value.size,
			type: value.type
		});
		return `$file:${fileKey}`;
	}
	if (isBlob$1(value)) {
		const blobKey = getCompositeKey({
			bytes: await hashBlobBytes(value),
			size: value.size,
			type: value.type
		});
		return `$blob:${blobKey}`;
	}
	return void 0;
}
|
|
342
|
+
/**
 * Read the full contents via `value.arrayBuffer()` and return their hash.
 *
 * @returns {Promise<string>} Result of `hashBytes` over the bytes read.
 */
async function hashBlobBytes(value) {
	const contents = await value.arrayBuffer();
	return hashBytes(new Uint8Array(contents));
}
|
|
345
|
+
/**
 * Compute the SHA-256 hex digest of the given data.
 *
 * @param {string | Buffer | ArrayBufferView} value Data accepted by `Hash#update`.
 * @returns {string} Lowercase hex digest.
 */
function hashBytes(value) {
	const hasher = createHash("sha256");
	hasher.update(value);
	return hasher.digest("hex");
}
|
|
348
|
+
/** True when `value` is an `ArrayBuffer` instance. */
function isArrayBuffer(value) {
	const matches = value instanceof ArrayBuffer;
	return matches;
}
|
|
351
|
+
/** True when `value` is a `SharedArrayBuffer` instance. */
function isSharedArrayBuffer(value) {
	const matches = value instanceof SharedArrayBuffer;
	return matches;
}
|
|
354
|
+
/** True when `ArrayBuffer.isView` accepts `value` (typed arrays, `DataView`). */
function isArrayBufferView(value) {
	const isView = ArrayBuffer.isView(value);
	return isView;
}
|
|
357
|
+
/** True when `value` is an instance of the global `Blob`. */
function isBlob$1(value) {
	const matches = value instanceof Blob;
	return matches;
}
|
|
360
|
+
/**
 * True when `value` is an instance of the global `File`.
 *
 * `File` is not defined as a global on every supported runtime, and a bare
 * `instanceof File` would throw a ReferenceError there. The `typeof` guard
 * makes the check report `false` instead.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isFile$1(value) {
	return typeof File === "function" && value instanceof File;
}
|
|
363
|
+
/**
 * Round-trip a value through JSON to obtain a plain, serializable copy.
 *
 * @param {unknown} value Value to convert.
 * @returns {unknown} The JSON round-tripped copy, or `undefined` when the value
 *   has no JSON representation (`undefined`, functions, symbols).
 * @throws {TypeError} Propagated from `JSON.stringify` for circular structures
 *   or BigInt values.
 */
function toJsonSafe(value) {
	if (value === void 0) return void 0;
	const text = JSON.stringify(value);
	// JSON.stringify yields undefined (not a string) for functions and symbols;
	// feeding that to JSON.parse would throw a SyntaxError.
	if (text === void 0) return void 0;
	return JSON.parse(text);
}
|
|
368
|
+
//#endregion
|
|
369
|
+
//#region ../sdk/src/cacheRecording.ts
|
|
370
|
+
/**
 * Replace `span.attributes` with a new object holding the previous attributes
 * overlaid by `attributes` (later keys win).
 */
function mergeSpanAttributes$1(span, attributes) {
	const previous = span.attributes;
	span.attributes = { ...previous, ...attributes };
}
|
|
376
|
+
/** True for non-null, non-array values whose `typeof` is "object". */
function isRecordLike$1(value) {
	if (value === null || typeof value !== "object") return false;
	return !Array.isArray(value);
}
|
|
379
|
+
/** Describe a value as `"array"` for arrays, otherwise by its `typeof`. */
function valueKind$1(value) {
	if (Array.isArray(value)) return "array";
	return typeof value;
}
|
|
382
|
+
/**
 * Return a shallow copy of an array.
 *
 * @param {unknown[]} value
 * @returns {unknown[]} A new array holding the same items.
 */
function copyArray(value) {
	const duplicate = value.slice();
	return duplicate;
}
|
|
385
|
+
/**
 * Copy `attributes` into a new object, leaving out every key that starts with
 * `"cache."`. A missing `attributes` value yields an empty object.
 */
function stripCacheAttributes(attributes) {
	const kept = {};
	if (!attributes) return kept;
	for (const [name, attributeValue] of Object.entries(attributes)) {
		if (name.startsWith("cache.")) continue;
		kept[name] = attributeValue;
	}
	return kept;
}
|
|
391
|
+
/**
 * Take a JSON-safe snapshot of the span's attributes with `cache.*` keys
 * removed. Returns an empty object when the span is absent or the snapshot is
 * not a plain object.
 */
function snapshotNonCacheAttributes(span) {
	const withoutCacheKeys = stripCacheAttributes(span?.attributes);
	const snapshot = toJsonSafe(withoutCacheKeys);
	if (!isRecordLike$1(snapshot)) return {};
	return snapshot;
}
|
|
395
|
+
/**
 * Collect the entries of `after` whose values differ from the same key in
 * `before`, as judged by `cacheAttributeValuesEqual`. Keys present only in
 * `before` are not reported.
 */
function diffNonCacheAttributes(before, after) {
	const changed = {};
	for (const [name, nextValue] of Object.entries(after)) {
		if (cacheAttributeValuesEqual(before[name], nextValue)) continue;
		changed[name] = nextValue;
	}
	return changed;
}
|
|
400
|
+
/**
 * Compare two attribute values: identical per `Object.is`, or equal once both
 * are serialized with `JSON.stringify`. If serialization throws, the values
 * are treated as different.
 */
function cacheAttributeValuesEqual(left, right) {
	if (Object.is(left, right)) return true;
	let leftText;
	let rightText;
	try {
		leftText = JSON.stringify(left);
		rightText = JSON.stringify(right);
	} catch {
		return false;
	}
	return leftText === rightText;
}
|
|
408
|
+
/**
 * Append `ref` to the span's `"cache.refs"` attribute, starting a new list
 * when the attribute is missing or not an array. Does nothing when `span` is
 * `undefined`.
 */
function appendCacheRef(span, ref) {
	if (span === void 0) return;
	const previous = span.attributes?.["cache.refs"];
	const refs = Array.isArray(previous) ? copyArray(previous) : [];
	mergeSpanAttributes$1(span, { "cache.refs": [...refs, ref] });
}
|
|
413
|
+
/**
 * Serialize the span with id `spanId`, together with all spans that descend
 * from it via `parentId`, into a nested structure without ids or timestamps.
 *
 * @param {{ spans: object[] }} scope Scope whose `spans` list is searched.
 * @param {string} spanId Id of the span to serialize.
 * @returns {object} Serialized span with a `children` array. A placeholder
 *   span named "unknown" is returned when no span has that id.
 */
function serializeSubSpanTree(scope, spanId) {
	const source = scope.spans.find((span) => span.id === spanId);
	if (!source) {
		return {
			kind: "custom",
			name: "unknown",
			attributes: void 0,
			status: "ok",
			error: void 0,
			errors: void 0,
			warning: void 0,
			warnings: void 0,
			children: []
		};
	}
	const children = [];
	for (const span of scope.spans) {
		if (span.parentId === spanId) children.push(serializeSubSpanTree(scope, span.id));
	}
	const { kind, name, attributes, status, error, errors, warning, warnings } = source;
	return {
		kind,
		name,
		attributes,
		status,
		error,
		errors,
		warning,
		warnings,
		children
	};
}
|
|
439
|
+
/**
 * Scan the spans added since `frame.baseSpanIndex` and, for each one whose
 * parent is `frame.replayParentSpanId`, push a `"subSpan"` op holding its
 * serialized subtree onto `frame.ops`.
 */
function appendSubSpanOps(scope, frame) {
	const { spans } = scope;
	for (let index = frame.baseSpanIndex; index < spans.length; index += 1) {
		const span = spans[index];
		if (span?.parentId !== frame.replayParentSpanId) continue;
		frame.ops.push({
			kind: "subSpan",
			span: serializeSubSpanTree(scope, span.id)
		});
	}
}
|
|
448
|
+
/**
 * Re-apply a cached recording to the scope.
 *
 * Every recorded op is applied in order. When a parent span is given, the
 * recording's final attributes are merged into it and any recorded final
 * error/warning fields overwrite the span's. `scope.replayingDepth` is raised
 * for the duration and always restored.
 */
function replayRecording(scope, parentSpan, recording, options) {
	scope.replayingDepth++;
	try {
		for (const op of recording.ops) {
			applyRecordingOp(scope, parentSpan, op, options);
		}
		if (parentSpan !== void 0) {
			const hasFinalAttributes = Object.keys(recording.finalAttributes).length > 0;
			if (hasFinalAttributes) mergeSpanAttributes$1(parentSpan, recording.finalAttributes);
			if (recording.finalError !== void 0) parentSpan.error = recording.finalError;
			if (recording.finalErrors !== void 0) parentSpan.errors = recording.finalErrors;
			if (recording.finalWarning !== void 0) parentSpan.warning = recording.finalWarning;
			if (recording.finalWarnings !== void 0) parentSpan.warnings = recording.finalWarnings;
		}
	} finally {
		scope.replayingDepth--;
	}
}
|
|
461
|
+
/**
 * Apply a single recorded op to the scope during replay.
 *
 * Output ops (`setOutput`, `appendOutput`, `mergeOutput`, `incrementOutput`)
 * update `scope.outputs`, pushing an assertion failure when the existing value
 * has the wrong type for merge/increment. `checkpoint` stores checkpoint data.
 * Any other op is replayed as a serialized sub-span under `parentSpan`.
 */
function applyRecordingOp(scope, parentSpan, op, options) {
	const { outputs } = scope;
	switch (op.kind) {
		case "setOutput":
			outputs[op.key] = op.value;
			return;
		case "appendOutput": {
			const previous = outputs[op.key];
			if (previous === void 0) outputs[op.key] = [op.value];
			else if (Array.isArray(previous)) outputs[op.key] = [...copyArray(previous), op.value];
			else outputs[op.key] = [previous, op.value];
			return;
		}
		case "mergeOutput": {
			const previous = outputs[op.key];
			if (previous === void 0) outputs[op.key] = { ...op.patch };
			else if (isRecordLike$1(previous)) outputs[op.key] = {
				...previous,
				...op.patch
			};
			else scope.assertionFailures.push({ message: `replay mergeEvalOutput("${op.key}"): existing value is ${valueKind$1(previous)}, expected object` });
			return;
		}
		case "incrementOutput": {
			const previous = outputs[op.key];
			if (previous === void 0) outputs[op.key] = op.delta;
			else if (typeof previous === "number") outputs[op.key] = previous + op.delta;
			else scope.assertionFailures.push({ message: `replay incrementEvalOutput("${op.key}"): existing value is ${valueKind$1(previous)}, expected number` });
			return;
		}
		case "checkpoint":
			scope.checkpoints.set(op.name, op.data);
			return;
		default:
			replaySerializedSpan(scope, parentSpan?.id ?? null, op.span, options);
	}
}
|
|
496
|
+
/**
 * Recreate a serialized span (and, recursively, its children) on the scope.
 *
 * Each replayed span gets a fresh id from `options.generateSpanId()`, and its
 * `startedAt` and `endedAt` are both set to the moment of replay. Parents are
 * pushed onto `scope.spans` before their children.
 */
function replaySerializedSpan(scope, parentId, serialized, options) {
	const id = options.generateSpanId();
	const replayedAt = (/* @__PURE__ */ new Date()).toISOString();
	const { kind, name, status, attributes, error, errors, warning, warnings } = serialized;
	scope.spans.push({
		id,
		parentId,
		caseId: scope.caseId,
		kind,
		name,
		startedAt: replayedAt,
		endedAt: replayedAt,
		status,
		attributes,
		error,
		errors,
		warning,
		warnings
	});
	for (const child of serialized.children) {
		replaySerializedSpan(scope, id, child, options);
	}
}
|
|
517
|
+
//#endregion
|
|
518
|
+
//#region ../sdk/src/traceDiagnostics.ts
|
|
194
519
|
/** Field names that `getErrorExtraFields` leaves out of its result. */
const errorCoreFields = new Set(["name", "message", "stack", "capturedAt"]);
|
|
200
|
-
function generateSpanId() {
|
|
201
|
-
spanIdCounter++;
|
|
202
|
-
return `span_${String(Date.now())}_${String(spanIdCounter)}`;
|
|
203
|
-
}
|
|
204
525
|
/** True for non-null, non-array values whose `typeof` is "object". */
function isRecord$2(value) {
	if (value === null || typeof value !== "object") return false;
	return !Array.isArray(value);
}
|
|
@@ -221,33 +542,6 @@ function formatUnknownErrorMessage(error) {
|
|
|
221
542
|
/**
 * Collect the own enumerable properties of `error` whose names are not in
 * `errorCoreFields`.
 */
function getErrorExtraFields(error) {
	const extraEntries = [];
	for (const entry of Object.entries(error)) {
		if (!errorCoreFields.has(entry[0])) extraEntries.push(entry);
	}
	return Object.fromEntries(extraEntries);
}
|
|
224
|
-
function updateCurrentSpan(update) {
|
|
225
|
-
const currentSpan = getCurrentScope()?.activeSpanStack.at(-1);
|
|
226
|
-
if (!currentSpan) return;
|
|
227
|
-
update(currentSpan);
|
|
228
|
-
}
|
|
229
|
-
function noopActiveSpan() {
|
|
230
|
-
return {
|
|
231
|
-
setName() {},
|
|
232
|
-
setAttribute() {},
|
|
233
|
-
setAttributes() {}
|
|
234
|
-
};
|
|
235
|
-
}
|
|
236
|
-
function noopExternalSpan(id) {
|
|
237
|
-
return {
|
|
238
|
-
id,
|
|
239
|
-
setName() {},
|
|
240
|
-
setAttribute() {},
|
|
241
|
-
setAttributes() {},
|
|
242
|
-
end() {}
|
|
243
|
-
};
|
|
244
|
-
}
|
|
245
|
-
function mergeSpanAttributes(span, attributes) {
|
|
246
|
-
span.attributes = {
|
|
247
|
-
...span.attributes,
|
|
248
|
-
...attributes
|
|
249
|
-
};
|
|
250
|
-
}
|
|
251
545
|
function normalizeTraceError(error, capturedAt = void 0) {
|
|
252
546
|
if (error instanceof Error) return {
|
|
253
547
|
...getErrorExtraFields(error),
|
|
@@ -274,19 +568,233 @@ function normalizeTraceError(error, capturedAt = void 0) {
|
|
|
274
568
|
capturedAt
|
|
275
569
|
};
|
|
276
570
|
}
|
|
277
|
-
function normalizeTraceErrors(errorOrErrors, additionalErrors, capturedAt) {
|
|
278
|
-
return (additionalErrors.length > 0 ? [errorOrErrors, ...additionalErrors] : Array.isArray(errorOrErrors) ? errorOrErrors : [errorOrErrors]).map((error) => normalizeTraceError(error, capturedAt));
|
|
571
|
+
/**
 * Normalize one or more errors with `normalizeTraceError`.
 *
 * When `additionalErrors` is non-empty, the first argument is treated as a
 * single error and placed before them. Otherwise an array first argument is
 * used as the list, and anything else is wrapped as a one-item list.
 */
function normalizeTraceErrors(errorOrErrors, additionalErrors, capturedAt) {
	let collected;
	if (additionalErrors.length > 0) collected = [errorOrErrors, ...additionalErrors];
	else if (Array.isArray(errorOrErrors)) collected = errorOrErrors;
	else collected = [errorOrErrors];
	return collected.map((error) => normalizeTraceError(error, capturedAt));
}
|
|
574
|
+
/**
 * Normalize one or more warnings with `normalizeTraceError`.
 *
 * When `additionalWarnings` is non-empty, the first argument is treated as a
 * single warning and placed before them. Otherwise an array first argument is
 * used as the list, and anything else is wrapped as a one-item list.
 */
function normalizeTraceWarnings(warningOrWarnings, additionalWarnings, capturedAt) {
	let collected;
	if (additionalWarnings.length > 0) collected = [warningOrWarnings, ...additionalWarnings];
	else if (Array.isArray(warningOrWarnings)) collected = warningOrWarnings;
	else collected = [warningOrWarnings];
	return collected.map((warning) => normalizeTraceError(warning, capturedAt));
}
|
|
577
|
+
/**
 * True when `value` is a plain object whose only key is `level`, with that
 * level either `undefined` or a valid capture level. Empty objects are
 * rejected.
 */
function isCaptureEvalSpanErrorOptions(value) {
	if (!isRecord$2(value)) return false;
	const keys = Object.keys(value);
	const onlyLevelKey = keys.length > 0 && keys.every((key) => key === "level");
	if (!onlyLevelKey) return false;
	return value.level === void 0 || isCaptureEvalSpanErrorLevel(value.level);
}
|
|
584
|
+
/** True when `value` is exactly `"error"` or `"warning"`. */
function isCaptureEvalSpanErrorLevel(value) {
	switch (value) {
		case "error":
		case "warning":
			return true;
		default:
			return false;
	}
}
|
|
587
|
+
/**
 * Separate a trailing level/options argument from a list of extra errors.
 *
 * If the last item is a level string it becomes `{ level }`. If it is an
 * options object it is used as-is. Otherwise all items are treated as errors
 * and the options are empty.
 *
 * @returns {{ additionalErrors: unknown[], options: object }}
 */
function splitCaptureEvalSpanErrorArgs(additionalErrorsOrOptions) {
	const trailing = additionalErrorsOrOptions.at(-1);
	const withoutTrailing = () => additionalErrorsOrOptions.slice(0, -1);
	if (isCaptureEvalSpanErrorLevel(trailing)) {
		return {
			additionalErrors: withoutTrailing(),
			options: { level: trailing }
		};
	}
	if (isCaptureEvalSpanErrorOptions(trailing)) {
		return {
			additionalErrors: withoutTrailing(),
			options: trailing
		};
	}
	return {
		additionalErrors: additionalErrorsOrOptions,
		options: {}
	};
}
|
|
602
|
+
/**
 * Append `errors` to `span.errors`, set `span.error` to the last of them, and
 * mark the span's status as `"error"`. Nothing happens when the list is empty
 * or its last entry is `undefined`.
 */
function appendSpanErrors(span, errors) {
	const latestError = errors.at(-1);
	if (errors.length === 0 || latestError === void 0) return;
	const previousErrors = span.errors ?? [];
	span.errors = [...previousErrors, ...errors];
	span.error = latestError;
	span.status = "error";
}
|
|
610
|
+
/**
 * Append `warnings` to `span.warnings` and set `span.warning` to the last of
 * them. The span's status is left unchanged. Nothing happens when the list is
 * empty or its last entry is `undefined`.
 */
function appendSpanWarnings(span, warnings) {
	const latestWarning = warnings.at(-1);
	if (warnings.length === 0 || latestWarning === void 0) return;
	const previousWarnings = span.warnings ?? [];
	span.warnings = [...previousWarnings, ...warnings];
	span.warning = latestWarning;
}
|
|
617
|
+
/** True when the span has an `error` set or a non-empty `errors` list. */
function hasSpanError(span) {
	if (span.error !== void 0) return true;
	const errorCount = span.errors?.length ?? 0;
	return errorCount > 0;
}
|
|
620
|
+
//#endregion
|
|
621
|
+
//#region ../sdk/src/valueCache.ts
|
|
622
|
+
/**
 * Build the `traceCache(info, fn)` helper, which caches the result of `fn`
 * together with a recording of its effects on the current eval scope.
 *
 * `fn` runs directly when there is no scope, no cache context, or a replay is
 * in progress. Otherwise the behavior depends on `cacheContext.mode`:
 * - `"use"`: look up the entry; on a hit, replay the recording and return the
 *   stored value without calling `fn`; on a miss, run `fn` and store the result
 * - `"refresh"`: always run `fn` and overwrite the stored entry
 * - anything else: run `fn` and store nothing (recorded as `"bypass"`)
 *
 * In every mode a cache reference describing the outcome is appended to the
 * active span, if any.
 *
 * @param {() => string} generateSpanId Id generator used for replayed spans.
 * @returns {(info: object, fn: () => unknown) => Promise<unknown>}
 */
function createTraceCache(generateSpanId) {
	return async function traceCache(info, fn) {
		const scope = getCurrentScope();
		if (!scope) return await fn();
		const cacheCtx = scope.cacheContext;
		if (cacheCtx === void 0 || scope.replayingDepth > 0) return await fn();
		const namespace = info.namespace ?? `${cacheCtx.evalId}__${info.name}`;
		const keyHash = await hashCacheKey({
			namespace,
			codeFingerprint: cacheCtx.codeFingerprint,
			key: info.key
		});
		const activeSpan = scope.activeSpanStack.at(-1);
		// Every outcome is reported with the same reference shape; only the
		// status (and, for hits, the age details) differs.
		const noteCacheRef = (status, details = {}) => appendCacheRef(activeSpan, {
			type: "value",
			name: info.name,
			namespace,
			key: keyHash,
			status,
			...details
		});
		if (cacheCtx.mode === "use") {
			const hit = await cacheCtx.adapter.lookup(namespace, keyHash);
			if (hit) {
				const storedAt = hit.storedAt;
				const age = Date.now() - new Date(storedAt).getTime();
				noteCacheRef("hit", {
					storedAt,
					age
				});
				replayRecording(scope, activeSpan, hit.recording, { generateSpanId });
				return hit.recording.returnValue;
			}
			noteCacheRef("miss");
		} else {
			noteCacheRef(cacheCtx.mode === "refresh" ? "refresh" : "bypass");
		}
		const beforeAttributes = snapshotNonCacheAttributes(activeSpan);
		const frame = {
			baseSpanIndex: scope.spans.length,
			replayParentSpanId: activeSpan?.id ?? null,
			ops: []
		};
		scope.recordingStack.push(frame);
		let bodyResult;
		try {
			bodyResult = await fn();
		} finally {
			// Always unwind the recording frame, even when `fn` throws.
			scope.recordingStack.pop();
		}
		appendSubSpanOps(scope, frame);
		if (cacheCtx.mode === "bypass") return bodyResult;
		const afterAttributes = snapshotNonCacheAttributes(activeSpan);
		const recording = {
			returnValue: toJsonSafe(bodyResult),
			finalAttributes: diffNonCacheAttributes(beforeAttributes, afterAttributes),
			ops: frame.ops
		};
		await cacheCtx.adapter.write({
			version: 1,
			key: keyHash,
			namespace,
			operationType: "value",
			operationName: info.name,
			storedAt: (/* @__PURE__ */ new Date()).toISOString(),
			codeFingerprint: cacheCtx.codeFingerprint,
			recording
		});
		return bodyResult;
	};
}
|
|
708
|
+
//#endregion
|
|
709
|
+
//#region ../sdk/src/tracer.ts
|
|
710
|
+
/** Number of span ids handed out so far by `generateSpanId`. */
let spanIdCounter = 0;
/**
 * Produce a span id of the form `span_<epoch ms>_<counter>`, where the counter
 * increases by one on every call.
 */
function generateSpanId() {
	spanIdCounter += 1;
	const timestamp = Date.now();
	return `span_${timestamp}_${spanIdCounter}`;
}
|
|
715
|
+
/**
 * Invoke `update` with the span at the top of the current scope's active span
 * stack. Does nothing when there is no scope or no active span.
 */
function updateCurrentSpan(update) {
	const activeSpan = getCurrentScope()?.activeSpanStack.at(-1);
	if (activeSpan) update(activeSpan);
}
|
|
720
|
+
/**
 * Create a span handle whose methods all do nothing.
 *
 * @returns {object} Handle with no-op `setName`, `setAttribute`,
 *   `setAttributes`, `incrementAttribute`, `appendToAttribute` and
 *   `mergeAttribute` methods.
 */
function noopActiveSpan() {
	const handle = {
		setName() {},
		setAttribute() {},
		setAttributes() {},
		incrementAttribute() {},
		appendToAttribute() {},
		mergeAttribute() {}
	};
	return handle;
}
|
|
730
|
+
/**
 * Create an external span handle that carries `id` but whose methods all do
 * nothing.
 *
 * @param {string} id Span id exposed on the handle.
 * @returns {object} Handle with `id` plus no-op `setName`, `setAttribute`,
 *   `setAttributes`, `incrementAttribute`, `appendToAttribute`,
 *   `mergeAttribute` and `end` methods.
 */
function noopExternalSpan(id) {
	const handle = {
		id,
		setName() {},
		setAttribute() {},
		setAttributes() {},
		incrementAttribute() {},
		appendToAttribute() {},
		mergeAttribute() {},
		end() {}
	};
	return handle;
}
|
|
742
|
+
/**
 * Replace `span.attributes` with a new object holding the previous attributes
 * overlaid by `attributes` (later keys win).
 */
function mergeSpanAttributes(span, attributes) {
	const previous = span.attributes;
	span.attributes = { ...previous, ...attributes };
}
|
|
280
|
-
function
|
|
281
|
-
|
|
282
|
-
const latestError = errors.at(-1);
|
|
283
|
-
if (latestError === void 0) return;
|
|
284
|
-
span.errors = [...span.errors ?? [], ...errors];
|
|
285
|
-
span.error = latestError;
|
|
286
|
-
span.status = "error";
|
|
748
|
+
/** True for non-null, non-array values whose `typeof` is "object". */
function isRecordLike(value) {
	if (value === null || typeof value !== "object") return false;
	return !Array.isArray(value);
}
|
|
288
|
-
function
|
|
289
|
-
return
|
|
751
|
+
/** Describe a value as `"array"` for arrays, otherwise by its `typeof`. */
function valueKind(value) {
	if (Array.isArray(value)) return "array";
	return typeof value;
}
|
|
754
|
+
/**
 * Push `{ message }` onto the current scope's assertion failures. Does nothing
 * when there is no current scope.
 */
function recordSpanAttributeAssertion(message) {
	const scope = getCurrentScope();
	if (scope) scope.assertionFailures.push({ message });
}
|
|
759
|
+
/**
 * Add `delta` to a numeric span attribute.
 *
 * A missing attribute is initialised to `delta`. If the existing value is not
 * a number, an assertion failure is recorded and the attribute is left as is.
 */
function incrementSpanAttribute(span, key, delta) {
	const current = span.attributes?.[key];
	if (current !== void 0 && typeof current !== "number") {
		recordSpanAttributeAssertion(`evalSpan.incrementAttribute("${key}"): existing value is ${valueKind(current)}, expected number`);
		return;
	}
	const next = current === void 0 ? delta : current + delta;
	mergeSpanAttributes(span, { [key]: next });
}
|
|
771
|
+
/**
 * Append `value` to a list-valued span attribute.
 *
 * A missing attribute becomes `[value]`, an existing array gets a new array
 * with `value` at the end, and any other existing value becomes
 * `[existing, value]`.
 */
function appendToSpanAttribute(span, key, value) {
	const current = span.attributes?.[key];
	let items;
	if (current === void 0) items = [value];
	else if (Array.isArray(current)) items = [...current.map((item) => item), value];
	else items = [current, value];
	mergeSpanAttributes(span, { [key]: items });
}
|
|
784
|
+
/**
 * Shallow-merge `patch` into an object-valued span attribute.
 *
 * A missing attribute becomes a copy of `patch`. If the existing value is not
 * a plain (non-array) object, an assertion failure is recorded and the
 * attribute is left as is.
 */
function mergeSpanAttribute(span, key, patch) {
	const current = span.attributes?.[key];
	if (current !== void 0 && !isRecordLike(current)) {
		recordSpanAttributeAssertion(`evalSpan.mergeAttribute("${key}"): existing value is ${valueKind(current)}, expected object`);
		return;
	}
	const merged = current === void 0 ? { ...patch } : {
		...current,
		...patch
	};
	mergeSpanAttributes(span, { [key]: merged });
}
|
|
291
799
|
function finishSpanWithoutThrownError(span) {
|
|
292
800
|
span.status = hasSpanError(span) ? "error" : "ok";
|
|
@@ -302,9 +810,25 @@ function createSpanHandle(span) {
|
|
|
302
810
|
},
|
|
303
811
|
setAttributes(value) {
|
|
304
812
|
mergeSpanAttributes(span, value);
|
|
813
|
+
},
|
|
814
|
+
incrementAttribute(key, delta) {
|
|
815
|
+
incrementSpanAttribute(span, key, delta);
|
|
816
|
+
},
|
|
817
|
+
appendToAttribute(key, value) {
|
|
818
|
+
appendToSpanAttribute(span, key, value);
|
|
819
|
+
},
|
|
820
|
+
mergeAttribute(key, patch) {
|
|
821
|
+
mergeSpanAttribute(span, key, patch);
|
|
305
822
|
}
|
|
306
823
|
};
|
|
307
824
|
}
|
|
825
|
+
/**
 * Look up the span with the given id in the current scope via `findSpan` and
 * pass it to `update`. Does nothing when there is no scope or no such span.
 */
function updateExternalSpanRecord(id, update) {
	const scope = getCurrentScope();
	if (!scope) return;
	const target = findSpan(scope, id);
	if (target) update(target);
}
|
|
308
832
|
function createExternalSpanHandle(id) {
|
|
309
833
|
return {
|
|
310
834
|
id,
|
|
@@ -326,6 +850,21 @@ function createExternalSpanHandle(id) {
|
|
|
326
850
|
attributes: value
|
|
327
851
|
});
|
|
328
852
|
},
|
|
853
|
+
incrementAttribute(key, delta) {
|
|
854
|
+
updateExternalSpanRecord(id, (span) => {
|
|
855
|
+
incrementSpanAttribute(span, key, delta);
|
|
856
|
+
});
|
|
857
|
+
},
|
|
858
|
+
appendToAttribute(key, value) {
|
|
859
|
+
updateExternalSpanRecord(id, (span) => {
|
|
860
|
+
appendToSpanAttribute(span, key, value);
|
|
861
|
+
});
|
|
862
|
+
},
|
|
863
|
+
mergeAttribute(key, patch) {
|
|
864
|
+
updateExternalSpanRecord(id, (span) => {
|
|
865
|
+
mergeSpanAttribute(span, key, patch);
|
|
866
|
+
});
|
|
867
|
+
},
|
|
329
868
|
end(info = {}) {
|
|
330
869
|
endExternalSpan({
|
|
331
870
|
...info,
|
|
@@ -382,6 +921,8 @@ function updateExternalSpan(info) {
|
|
|
382
921
|
if (info.name !== void 0) span.name = info.name;
|
|
383
922
|
if (info.status !== void 0) span.status = info.status;
|
|
384
923
|
if (info.error !== void 0) span.error = info.error;
|
|
924
|
+
if (info.warning !== void 0) span.warning = info.warning;
|
|
925
|
+
if (info.warnings !== void 0) span.warnings = info.warnings;
|
|
385
926
|
if (info.attributes !== void 0) mergeSpanAttributes(span, info.attributes);
|
|
386
927
|
}
|
|
387
928
|
function endExternalSpan(info) {
|
|
@@ -410,6 +951,8 @@ function recordExternalSpan(info) {
|
|
|
410
951
|
existing.status = status;
|
|
411
952
|
existing.attributes = info.attributes;
|
|
412
953
|
existing.error = info.error;
|
|
954
|
+
existing.warning = info.warning;
|
|
955
|
+
existing.warnings = info.warnings;
|
|
413
956
|
return id;
|
|
414
957
|
}
|
|
415
958
|
scope.spans.push({
|
|
@@ -422,7 +965,9 @@ function recordExternalSpan(info) {
|
|
|
422
965
|
endedAt,
|
|
423
966
|
status,
|
|
424
967
|
attributes: info.attributes,
|
|
425
|
-
error: info.error
|
|
968
|
+
error: info.error,
|
|
969
|
+
warning: info.warning,
|
|
970
|
+
warnings: info.warnings
|
|
426
971
|
});
|
|
427
972
|
return id;
|
|
428
973
|
}
|
|
@@ -446,16 +991,42 @@ const evalSpan = {
|
|
|
446
991
|
updateCurrentSpan((currentSpan) => {
|
|
447
992
|
mergeSpanAttributes(currentSpan, value);
|
|
448
993
|
});
|
|
994
|
+
},
|
|
995
|
+
incrementAttribute(key, delta) {
|
|
996
|
+
updateCurrentSpan((currentSpan) => {
|
|
997
|
+
incrementSpanAttribute(currentSpan, key, delta);
|
|
998
|
+
});
|
|
999
|
+
},
|
|
1000
|
+
appendToAttribute(key, value) {
|
|
1001
|
+
updateCurrentSpan((currentSpan) => {
|
|
1002
|
+
appendToSpanAttribute(currentSpan, key, value);
|
|
1003
|
+
});
|
|
1004
|
+
},
|
|
1005
|
+
mergeAttribute(key, patch) {
|
|
1006
|
+
updateCurrentSpan((currentSpan) => {
|
|
1007
|
+
mergeSpanAttribute(currentSpan, key, patch);
|
|
1008
|
+
});
|
|
449
1009
|
}
|
|
450
1010
|
};
|
|
451
1011
|
/**
|
|
452
1012
|
* Attach one or more recoverable errors to the active eval span.
|
|
453
1013
|
*
|
|
454
|
-
*
|
|
455
|
-
* without throwing.
|
|
1014
|
+
* By default the active span is marked as `error` even if its callback later
|
|
1015
|
+
* completes without throwing. Pass `'warning'` or `{ level: 'warning' }` as the
|
|
1016
|
+
* final argument to record the diagnostic without changing span status. Calls
|
|
1017
|
+
* outside `evalTracer.span(...)` are ignored.
|
|
456
1018
|
*/
|
|
457
|
-
function captureEvalSpanError(errorOrErrors, ...
|
|
458
|
-
const
|
|
1019
|
+
function captureEvalSpanError(errorOrErrors, ...additionalErrorsOrOptions) {
|
|
1020
|
+
const { additionalErrors, options } = splitCaptureEvalSpanErrorArgs(additionalErrorsOrOptions);
|
|
1021
|
+
const capturedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
1022
|
+
if ((options.level ?? "error") === "warning") {
|
|
1023
|
+
const warnings = normalizeTraceWarnings(errorOrErrors, additionalErrors, capturedAt);
|
|
1024
|
+
updateCurrentSpan((currentSpan) => {
|
|
1025
|
+
appendSpanWarnings(currentSpan, warnings);
|
|
1026
|
+
});
|
|
1027
|
+
return;
|
|
1028
|
+
}
|
|
1029
|
+
const errors = normalizeTraceErrors(errorOrErrors, additionalErrors, capturedAt);
|
|
459
1030
|
updateCurrentSpan((currentSpan) => {
|
|
460
1031
|
appendSpanErrors(currentSpan, errors);
|
|
461
1032
|
});
|
|
@@ -503,7 +1074,7 @@ async function traceSpan(info, fn) {
|
|
|
503
1074
|
"cache.storedAt": storedAt,
|
|
504
1075
|
"cache.age": Date.now() - new Date(storedAt).getTime()
|
|
505
1076
|
});
|
|
506
|
-
replayRecording(scope, spanRecord, hit.recording);
|
|
1077
|
+
replayRecording(scope, spanRecord, hit.recording, { generateSpanId });
|
|
507
1078
|
spanRecord.status = hit.recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
|
|
508
1079
|
spanRecord.endedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
509
1080
|
return hit.recording.returnValue;
|
|
@@ -513,7 +1084,7 @@ async function traceSpan(info, fn) {
|
|
|
513
1084
|
else mergeSpanAttributes(spanRecord, { "cache.status": "bypass" });
|
|
514
1085
|
const frame = {
|
|
515
1086
|
baseSpanIndex: scope.spans.length,
|
|
516
|
-
|
|
1087
|
+
replayParentSpanId: id,
|
|
517
1088
|
ops: []
|
|
518
1089
|
};
|
|
519
1090
|
scope.recordingStack.push(frame);
|
|
@@ -532,12 +1103,16 @@ async function traceSpan(info, fn) {
|
|
|
532
1103
|
finalStatus: spanRecord.status,
|
|
533
1104
|
finalError: spanRecord.error,
|
|
534
1105
|
finalErrors: spanRecord.errors,
|
|
1106
|
+
finalWarning: spanRecord.warning,
|
|
1107
|
+
finalWarnings: spanRecord.warnings,
|
|
535
1108
|
ops: frame.ops
|
|
536
1109
|
};
|
|
537
1110
|
const entry = {
|
|
538
1111
|
version: 1,
|
|
539
1112
|
key: keyHash,
|
|
540
1113
|
namespace,
|
|
1114
|
+
operationType: "span",
|
|
1115
|
+
operationName: info.name,
|
|
541
1116
|
spanName: info.name,
|
|
542
1117
|
spanKind: info.kind,
|
|
543
1118
|
storedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -569,6 +1144,13 @@ const evalTracer = {
|
|
|
569
1144
|
/** Run a callback inside a new trace span and record its lifecycle. */
|
|
570
1145
|
span: traceSpan,
|
|
571
1146
|
/**
|
|
1147
|
+
* Cache a pure value without creating a trace span.
|
|
1148
|
+
*
|
|
1149
|
+
* When called inside an active span, the span receives a `cache.refs` entry
|
|
1150
|
+
* describing the value cache status for this run.
|
|
1151
|
+
*/
|
|
1152
|
+
cache: createTraceCache(generateSpanId),
|
|
1153
|
+
/**
|
|
572
1154
|
* Start a span whose lifecycle is controlled by an external tracer/exporter.
|
|
573
1155
|
*
|
|
574
1156
|
* Calls are no-ops outside an eval case scope, except that a generated or
|
|
@@ -649,194 +1231,6 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
649
1231
|
checkpoints
|
|
650
1232
|
};
|
|
651
1233
|
}
|
|
652
|
-
var SerializedCacheKeyValue = class {
|
|
653
|
-
value;
|
|
654
|
-
constructor(value) {
|
|
655
|
-
this.value = value;
|
|
656
|
-
}
|
|
657
|
-
};
|
|
658
|
-
/**
|
|
659
|
-
* Hash the components of a cache key into a deterministic hex digest.
|
|
660
|
-
*
|
|
661
|
-
* Native `Blob` and `File` values are read asynchronously and hashed by
|
|
662
|
-
* content. Use `hashCacheKeySync` only when the key contains no async values.
|
|
663
|
-
*/
|
|
664
|
-
async function hashCacheKey(input) {
|
|
665
|
-
return hashCacheKeySyncMaterialized(await materializeAsyncCacheKeyValue(input));
|
|
666
|
-
}
|
|
667
|
-
/**
|
|
668
|
-
* Synchronously hash cache key components. This supports JSON-like data and
|
|
669
|
-
* in-memory binary values such as `Buffer`, `ArrayBuffer`, and typed arrays,
|
|
670
|
-
* but cannot content-hash native `Blob` or `File` values.
|
|
671
|
-
*/
|
|
672
|
-
function hashCacheKeySync(input) {
|
|
673
|
-
return hashCacheKeySyncMaterialized(input);
|
|
674
|
-
}
|
|
675
|
-
function hashCacheKeySyncMaterialized(input) {
|
|
676
|
-
return createHash("sha256").update(getCompositeKey(input, { stringify: stringifyCacheKeyValue })).digest("hex");
|
|
677
|
-
}
|
|
678
|
-
function stringifyCacheKeyValue(value) {
|
|
679
|
-
if (value instanceof SerializedCacheKeyValue) return value.value;
|
|
680
|
-
if (Buffer$1.isBuffer(value)) return `$buffer:${hashBytes(value)}`;
|
|
681
|
-
if (isArrayBuffer(value)) return `$arrayBuffer:${hashBytes(new Uint8Array(value))}`;
|
|
682
|
-
if (isSharedArrayBuffer(value)) return `$sharedArrayBuffer:${hashBytes(new Uint8Array(value))}`;
|
|
683
|
-
if (isArrayBufferView(value)) {
|
|
684
|
-
const bytes = new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
|
|
685
|
-
return `$${value.constructor.name}:${hashBytes(bytes)}`;
|
|
686
|
-
}
|
|
687
|
-
if (isFile$1(value)) return `$file:${getCompositeKey({
|
|
688
|
-
lastModified: value.lastModified,
|
|
689
|
-
name: value.name,
|
|
690
|
-
size: value.size,
|
|
691
|
-
type: value.type
|
|
692
|
-
})}`;
|
|
693
|
-
if (isBlob$1(value)) return `$blob:${getCompositeKey({
|
|
694
|
-
size: value.size,
|
|
695
|
-
type: value.type
|
|
696
|
-
})}`;
|
|
697
|
-
}
|
|
698
|
-
async function materializeAsyncCacheKeyValue(value, refs = /* @__PURE__ */ new WeakSet()) {
|
|
699
|
-
const serialized = await stringifyAsyncCacheKeyValue(value);
|
|
700
|
-
if (serialized !== void 0) return new SerializedCacheKeyValue(serialized);
|
|
701
|
-
if (stringifyCacheKeyValue(value) !== void 0) return value;
|
|
702
|
-
if (!value || typeof value !== "object") return value;
|
|
703
|
-
if (Array.isArray(value)) {
|
|
704
|
-
const items = [];
|
|
705
|
-
for (const item of value) items.push(await materializeAsyncCacheKeyValue(item, refs));
|
|
706
|
-
return items;
|
|
707
|
-
}
|
|
708
|
-
if (refs.has(value)) throw new Error("Circular reference detected");
|
|
709
|
-
refs.add(value);
|
|
710
|
-
const entries = [];
|
|
711
|
-
for (const [key, entryValue] of Object.entries(value)) entries.push([key, await materializeAsyncCacheKeyValue(entryValue, refs)]);
|
|
712
|
-
refs.delete(value);
|
|
713
|
-
return Object.fromEntries(entries);
|
|
714
|
-
}
|
|
715
|
-
async function stringifyAsyncCacheKeyValue(value) {
|
|
716
|
-
if (isFile$1(value)) return `$file:${getCompositeKey({
|
|
717
|
-
bytes: await hashBlobBytes(value),
|
|
718
|
-
lastModified: value.lastModified,
|
|
719
|
-
name: value.name,
|
|
720
|
-
size: value.size,
|
|
721
|
-
type: value.type
|
|
722
|
-
})}`;
|
|
723
|
-
if (isBlob$1(value)) return `$blob:${getCompositeKey({
|
|
724
|
-
bytes: await hashBlobBytes(value),
|
|
725
|
-
size: value.size,
|
|
726
|
-
type: value.type
|
|
727
|
-
})}`;
|
|
728
|
-
}
|
|
729
|
-
async function hashBlobBytes(value) {
|
|
730
|
-
return hashBytes(new Uint8Array(await value.arrayBuffer()));
|
|
731
|
-
}
|
|
732
|
-
function hashBytes(value) {
|
|
733
|
-
return createHash("sha256").update(value).digest("hex");
|
|
734
|
-
}
|
|
735
|
-
function isArrayBuffer(value) {
|
|
736
|
-
return value instanceof ArrayBuffer;
|
|
737
|
-
}
|
|
738
|
-
function isSharedArrayBuffer(value) {
|
|
739
|
-
return value instanceof SharedArrayBuffer;
|
|
740
|
-
}
|
|
741
|
-
function isArrayBufferView(value) {
|
|
742
|
-
return ArrayBuffer.isView(value);
|
|
743
|
-
}
|
|
744
|
-
function isBlob$1(value) {
|
|
745
|
-
return value instanceof Blob;
|
|
746
|
-
}
|
|
747
|
-
function isFile$1(value) {
|
|
748
|
-
return value instanceof File;
|
|
749
|
-
}
|
|
750
|
-
function toJsonSafe(value) {
|
|
751
|
-
if (value === void 0) return void 0;
|
|
752
|
-
const text = JSON.stringify(value);
|
|
753
|
-
return JSON.parse(text);
|
|
754
|
-
}
|
|
755
|
-
function stripCacheAttributes(attributes) {
|
|
756
|
-
if (!attributes) return {};
|
|
757
|
-
const result = {};
|
|
758
|
-
for (const [key, value] of Object.entries(attributes)) if (!key.startsWith("cache.")) result[key] = value;
|
|
759
|
-
return result;
|
|
760
|
-
}
|
|
761
|
-
function serializeSubSpanTree(scope, spanId) {
|
|
762
|
-
const original = scope.spans.find((s) => s.id === spanId);
|
|
763
|
-
if (!original) return {
|
|
764
|
-
kind: "custom",
|
|
765
|
-
name: "unknown",
|
|
766
|
-
attributes: void 0,
|
|
767
|
-
status: "ok",
|
|
768
|
-
error: void 0,
|
|
769
|
-
errors: void 0,
|
|
770
|
-
children: []
|
|
771
|
-
};
|
|
772
|
-
const children = scope.spans.filter((s) => s.parentId === spanId).map((child) => serializeSubSpanTree(scope, child.id));
|
|
773
|
-
return {
|
|
774
|
-
kind: original.kind,
|
|
775
|
-
name: original.name,
|
|
776
|
-
attributes: original.attributes,
|
|
777
|
-
status: original.status,
|
|
778
|
-
error: original.error,
|
|
779
|
-
errors: original.errors,
|
|
780
|
-
children
|
|
781
|
-
};
|
|
782
|
-
}
|
|
783
|
-
function appendSubSpanOps(scope, frame) {
|
|
784
|
-
for (let i = frame.baseSpanIndex; i < scope.spans.length; i++) {
|
|
785
|
-
const candidate = scope.spans[i];
|
|
786
|
-
if (candidate?.parentId === frame.cachedSpanId) frame.ops.push({
|
|
787
|
-
kind: "subSpan",
|
|
788
|
-
span: serializeSubSpanTree(scope, candidate.id)
|
|
789
|
-
});
|
|
790
|
-
}
|
|
791
|
-
}
|
|
792
|
-
function replayRecording(scope, parentSpan, recording) {
|
|
793
|
-
scope.replayingDepth++;
|
|
794
|
-
try {
|
|
795
|
-
for (const op of recording.ops) applyRecordingOp(scope, parentSpan, op);
|
|
796
|
-
if (Object.keys(recording.finalAttributes).length > 0) mergeSpanAttributes(parentSpan, recording.finalAttributes);
|
|
797
|
-
if (recording.finalError !== void 0) parentSpan.error = recording.finalError;
|
|
798
|
-
if (recording.finalErrors !== void 0) parentSpan.errors = recording.finalErrors;
|
|
799
|
-
} finally {
|
|
800
|
-
scope.replayingDepth--;
|
|
801
|
-
}
|
|
802
|
-
}
|
|
803
|
-
function applyRecordingOp(scope, parentSpan, op) {
|
|
804
|
-
if (op.kind === "setOutput") {
|
|
805
|
-
scope.outputs[op.key] = op.value;
|
|
806
|
-
return;
|
|
807
|
-
}
|
|
808
|
-
if (op.kind === "incrementOutput") {
|
|
809
|
-
const existing = scope.outputs[op.key];
|
|
810
|
-
if (existing === void 0) scope.outputs[op.key] = op.delta;
|
|
811
|
-
else if (typeof existing === "number") scope.outputs[op.key] = existing + op.delta;
|
|
812
|
-
else scope.assertionFailures.push({ message: `replay incrementEvalOutput("${op.key}"): existing value is ${typeof existing}, expected number` });
|
|
813
|
-
return;
|
|
814
|
-
}
|
|
815
|
-
if (op.kind === "checkpoint") {
|
|
816
|
-
scope.checkpoints.set(op.name, op.data);
|
|
817
|
-
return;
|
|
818
|
-
}
|
|
819
|
-
replaySerializedSpan(scope, parentSpan.id, op.span);
|
|
820
|
-
}
|
|
821
|
-
function replaySerializedSpan(scope, parentId, serialized) {
|
|
822
|
-
const id = generateSpanId();
|
|
823
|
-
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
824
|
-
const replayed = {
|
|
825
|
-
id,
|
|
826
|
-
parentId,
|
|
827
|
-
caseId: scope.caseId,
|
|
828
|
-
kind: serialized.kind,
|
|
829
|
-
name: serialized.name,
|
|
830
|
-
startedAt: now,
|
|
831
|
-
endedAt: now,
|
|
832
|
-
status: serialized.status,
|
|
833
|
-
attributes: serialized.attributes,
|
|
834
|
-
error: serialized.error,
|
|
835
|
-
errors: serialized.errors
|
|
836
|
-
};
|
|
837
|
-
scope.spans.push(replayed);
|
|
838
|
-
for (const child of serialized.children) replaySerializedSpan(scope, id, child);
|
|
839
|
-
}
|
|
840
1234
|
//#endregion
|
|
841
1235
|
//#region ../shared/src/schemas/display.ts
|
|
842
1236
|
const scalarCellSchema = z.union([
|
|
@@ -976,6 +1370,8 @@ const traceSpanErrorSchema = z.object({
|
|
|
976
1370
|
stack: z.string().optional(),
|
|
977
1371
|
capturedAt: z.string().optional()
|
|
978
1372
|
}).catchall(z.unknown());
|
|
1373
|
+
/** Schema for a warning attached to a trace span. */
|
|
1374
|
+
const traceSpanWarningSchema = traceSpanErrorSchema;
|
|
979
1375
|
/** Schema for a persisted trace span captured during case execution. */
|
|
980
1376
|
const traceSpanSchema = z.object({
|
|
981
1377
|
id: z.string(),
|
|
@@ -993,7 +1389,9 @@ const traceSpanSchema = z.object({
|
|
|
993
1389
|
]),
|
|
994
1390
|
attributes: z.record(z.string(), z.unknown()).optional(),
|
|
995
1391
|
error: traceSpanErrorSchema.optional(),
|
|
996
|
-
errors: z.array(traceSpanErrorSchema).optional()
|
|
1392
|
+
errors: z.array(traceSpanErrorSchema).optional(),
|
|
1393
|
+
warning: traceSpanWarningSchema.optional(),
|
|
1394
|
+
warnings: z.array(traceSpanWarningSchema).optional()
|
|
997
1395
|
});
|
|
998
1396
|
//#endregion
|
|
999
1397
|
//#region ../shared/src/schemas/chart.ts
|
|
@@ -1254,12 +1652,16 @@ const spanCacheOptionsSchema = z.object({
|
|
|
1254
1652
|
/** Override the default namespace (`${evalId}__${spanName}`). */
|
|
1255
1653
|
namespace: z.string().optional()
|
|
1256
1654
|
});
|
|
1655
|
+
/** Category of operation stored in the eval cache. */
|
|
1656
|
+
const cacheOperationTypeSchema = z.enum(["span", "value"]);
|
|
1257
1657
|
/** Summary of a single persisted cache entry, used by list/delete endpoints. */
|
|
1258
1658
|
const cacheListItemSchema = z.object({
|
|
1259
1659
|
key: z.string(),
|
|
1260
1660
|
namespace: z.string(),
|
|
1261
|
-
|
|
1262
|
-
|
|
1661
|
+
operationType: cacheOperationTypeSchema,
|
|
1662
|
+
operationName: z.string(),
|
|
1663
|
+
spanName: z.string().optional(),
|
|
1664
|
+
spanKind: traceSpanKindSchema.optional(),
|
|
1263
1665
|
storedAt: z.string(),
|
|
1264
1666
|
codeFingerprint: z.string(),
|
|
1265
1667
|
sizeBytes: z.number()
|
|
@@ -1276,7 +1678,9 @@ const serializedCacheSpanSchema = z.object({
|
|
|
1276
1678
|
"cancelled"
|
|
1277
1679
|
]),
|
|
1278
1680
|
error: traceSpanErrorSchema.optional(),
|
|
1279
|
-
errors: z.array(traceSpanErrorSchema).optional()
|
|
1681
|
+
errors: z.array(traceSpanErrorSchema).optional(),
|
|
1682
|
+
warning: traceSpanWarningSchema.optional(),
|
|
1683
|
+
warnings: z.array(traceSpanWarningSchema).optional()
|
|
1280
1684
|
}).extend({ children: z.lazy(() => z.array(serializedCacheSpanSchema)) });
|
|
1281
1685
|
/**
|
|
1282
1686
|
* One captured operation performed while a cached span's body executed.
|
|
@@ -1290,6 +1694,16 @@ const cacheRecordingOpSchema = z.discriminatedUnion("kind", [
|
|
|
1290
1694
|
key: z.string(),
|
|
1291
1695
|
value: z.unknown()
|
|
1292
1696
|
}),
|
|
1697
|
+
z.object({
|
|
1698
|
+
kind: z.literal("appendOutput"),
|
|
1699
|
+
key: z.string(),
|
|
1700
|
+
value: z.unknown()
|
|
1701
|
+
}),
|
|
1702
|
+
z.object({
|
|
1703
|
+
kind: z.literal("mergeOutput"),
|
|
1704
|
+
key: z.string(),
|
|
1705
|
+
patch: z.record(z.string(), z.unknown())
|
|
1706
|
+
}),
|
|
1293
1707
|
z.object({
|
|
1294
1708
|
kind: z.literal("incrementOutput"),
|
|
1295
1709
|
key: z.string(),
|
|
@@ -1317,6 +1731,8 @@ const cacheRecordingSchema = z.object({
|
|
|
1317
1731
|
]).optional(),
|
|
1318
1732
|
finalError: traceSpanErrorSchema.optional(),
|
|
1319
1733
|
finalErrors: z.array(traceSpanErrorSchema).optional(),
|
|
1734
|
+
finalWarning: traceSpanWarningSchema.optional(),
|
|
1735
|
+
finalWarnings: z.array(traceSpanWarningSchema).optional(),
|
|
1320
1736
|
ops: z.array(cacheRecordingOpSchema)
|
|
1321
1737
|
});
|
|
1322
1738
|
/** Persisted cache file containing metadata and a recording. */
|
|
@@ -1324,8 +1740,10 @@ const cacheEntrySchema = z.object({
|
|
|
1324
1740
|
version: z.literal(1),
|
|
1325
1741
|
key: z.string(),
|
|
1326
1742
|
namespace: z.string(),
|
|
1327
|
-
|
|
1328
|
-
|
|
1743
|
+
operationType: cacheOperationTypeSchema.optional(),
|
|
1744
|
+
operationName: z.string().optional(),
|
|
1745
|
+
spanName: z.string().optional(),
|
|
1746
|
+
spanKind: traceSpanKindSchema.optional(),
|
|
1329
1747
|
storedAt: z.string(),
|
|
1330
1748
|
codeFingerprint: z.string(),
|
|
1331
1749
|
recording: cacheRecordingSchema
|
|
@@ -1635,15 +2053,21 @@ function createFsCacheStore(options) {
|
|
|
1635
2053
|
if (fileStatResult.error || !fileStatResult.value.isFile()) continue;
|
|
1636
2054
|
const cacheFile = await readCacheFilePath(filePath);
|
|
1637
2055
|
if (cacheFile === null) continue;
|
|
1638
|
-
for (const entry of Object.values(cacheFile.entries))
|
|
1639
|
-
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
2056
|
+
for (const entry of Object.values(cacheFile.entries)) {
|
|
2057
|
+
const operationType = entry.operationType ?? "span";
|
|
2058
|
+
const operationName = entry.operationName ?? entry.spanName ?? entry.namespace;
|
|
2059
|
+
items.push({
|
|
2060
|
+
key: entry.key,
|
|
2061
|
+
namespace: entry.namespace,
|
|
2062
|
+
operationType,
|
|
2063
|
+
operationName,
|
|
2064
|
+
spanName: entry.spanName,
|
|
2065
|
+
spanKind: entry.spanKind,
|
|
2066
|
+
storedAt: entry.storedAt,
|
|
2067
|
+
codeFingerprint: entry.codeFingerprint,
|
|
2068
|
+
sizeBytes: Buffer.byteLength(JSON.stringify(entry), "utf8")
|
|
2069
|
+
});
|
|
2070
|
+
}
|
|
1647
2071
|
}
|
|
1648
2072
|
items.sort((a, b) => a.storedAt < b.storedAt ? 1 : -1);
|
|
1649
2073
|
return items;
|
|
@@ -1804,80 +2228,6 @@ function safeJsonParse(text) {
|
|
|
1804
2228
|
return parsed.value;
|
|
1805
2229
|
}
|
|
1806
2230
|
//#endregion
|
|
1807
|
-
//#region ../runner/src/chartValidation.ts
|
|
1808
|
-
function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
|
|
1809
|
-
const columnDef = columnsByKey.get(metric.key);
|
|
1810
|
-
if (!columnDef) {
|
|
1811
|
-
warnings.push(`[${evalId}] chart metric references unknown column "${metric.key}" — dropped`);
|
|
1812
|
-
return false;
|
|
1813
|
-
}
|
|
1814
|
-
if (metric.aggregate === "passThresholdRate") {
|
|
1815
|
-
if (columnDef.isScore !== true || typeof columnDef.passThreshold !== "number") {
|
|
1816
|
-
warnings.push(`[${evalId}] chart metric "${metric.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
|
|
1817
|
-
return false;
|
|
1818
|
-
}
|
|
1819
|
-
}
|
|
1820
|
-
return true;
|
|
1821
|
-
}
|
|
1822
|
-
function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
|
|
1823
|
-
const columnDef = columnsByKey.get(extra.key);
|
|
1824
|
-
if (!columnDef) {
|
|
1825
|
-
warnings.push(`[${evalId}] chart tooltip extra references unknown column "${extra.key}" — dropped`);
|
|
1826
|
-
return false;
|
|
1827
|
-
}
|
|
1828
|
-
if (extra.aggregate === "passThresholdRate") {
|
|
1829
|
-
if (columnDef.isScore !== true || typeof columnDef.passThreshold !== "number") {
|
|
1830
|
-
warnings.push(`[${evalId}] chart tooltip extra "${extra.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
|
|
1831
|
-
return false;
|
|
1832
|
-
}
|
|
1833
|
-
}
|
|
1834
|
-
return true;
|
|
1835
|
-
}
|
|
1836
|
-
function sanitizeChart(chart, columnsByKey, evalId, warnings) {
|
|
1837
|
-
const metrics = chart.metrics.filter((metric) => {
|
|
1838
|
-
if (metric.source === "builtin") return true;
|
|
1839
|
-
return isValidColumnMetric(metric, columnsByKey, evalId, warnings);
|
|
1840
|
-
});
|
|
1841
|
-
if (metrics.length === 0) {
|
|
1842
|
-
warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
|
|
1843
|
-
return null;
|
|
1844
|
-
}
|
|
1845
|
-
const tooltipExtras = chart.tooltipExtras?.filter((extra) => {
|
|
1846
|
-
if (extra.source === "builtin") return true;
|
|
1847
|
-
return isValidTooltipExtra(extra, columnsByKey, evalId, warnings);
|
|
1848
|
-
});
|
|
1849
|
-
return {
|
|
1850
|
-
...chart,
|
|
1851
|
-
metrics,
|
|
1852
|
-
tooltipExtras: tooltipExtras?.length ? tooltipExtras : void 0
|
|
1853
|
-
};
|
|
1854
|
-
}
|
|
1855
|
-
/**
|
|
1856
|
-
* Validate and sanitize an authored `charts` config against the eval's
|
|
1857
|
-
* declared columns. Drops metrics/extras that reference unknown columns or
|
|
1858
|
-
* misuse `passThresholdRate`, and drops entire charts whose metrics are all
|
|
1859
|
-
* invalid. Returns `charts: undefined` when nothing valid remains so the UI
|
|
1860
|
-
* falls back to rendering no chart (matching the opt-in default).
|
|
1861
|
-
*/
|
|
1862
|
-
function validateCharts(params) {
|
|
1863
|
-
const { charts, columnDefs, evalId } = params;
|
|
1864
|
-
if (!charts || charts.length === 0) return {
|
|
1865
|
-
charts: void 0,
|
|
1866
|
-
warnings: []
|
|
1867
|
-
};
|
|
1868
|
-
const columnsByKey = new Map(columnDefs.map((def) => [def.key, def]));
|
|
1869
|
-
const warnings = [];
|
|
1870
|
-
const sanitized = [];
|
|
1871
|
-
for (const chart of charts) {
|
|
1872
|
-
const result = sanitizeChart(chart, columnsByKey, evalId, warnings);
|
|
1873
|
-
if (result) sanitized.push(result);
|
|
1874
|
-
}
|
|
1875
|
-
return {
|
|
1876
|
-
charts: sanitized.length > 0 ? sanitized : void 0,
|
|
1877
|
-
warnings
|
|
1878
|
-
};
|
|
1879
|
-
}
|
|
1880
|
-
//#endregion
|
|
1881
2231
|
//#region ../runner/src/columnBuilder.ts
|
|
1882
2232
|
/**
|
|
1883
2233
|
* Normalize a user-provided score definition (either a function or an
|
|
@@ -2087,98 +2437,6 @@ async function loadConfig() {
|
|
|
2087
2437
|
}
|
|
2088
2438
|
}
|
|
2089
2439
|
//#endregion
|
|
2090
|
-
//#region ../runner/src/discovery.ts
|
|
2091
|
-
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
|
|
2092
|
-
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
|
|
2093
|
-
function parseEvalMetas(filePath, content) {
|
|
2094
|
-
const metas = [];
|
|
2095
|
-
let searchIndex = 0;
|
|
2096
|
-
while (searchIndex < content.length) {
|
|
2097
|
-
const defineEvalIndex = content.indexOf("defineEval", searchIndex);
|
|
2098
|
-
if (defineEvalIndex === -1) break;
|
|
2099
|
-
const extracted = extractDefineEvalObject(content, defineEvalIndex);
|
|
2100
|
-
if (!extracted) {
|
|
2101
|
-
searchIndex = defineEvalIndex + 10;
|
|
2102
|
-
continue;
|
|
2103
|
-
}
|
|
2104
|
-
const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
|
|
2105
|
-
if (id !== void 0) {
|
|
2106
|
-
const result = {
|
|
2107
|
-
filePath,
|
|
2108
|
-
id
|
|
2109
|
-
};
|
|
2110
|
-
const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
|
|
2111
|
-
if (title !== void 0) result.title = title;
|
|
2112
|
-
metas.push(result);
|
|
2113
|
-
}
|
|
2114
|
-
searchIndex = extracted.nextIndex;
|
|
2115
|
-
}
|
|
2116
|
-
return metas;
|
|
2117
|
-
}
|
|
2118
|
-
function extractDefineEvalObject(content, defineEvalIndex) {
|
|
2119
|
-
const openParenIndex = content.indexOf("(", defineEvalIndex);
|
|
2120
|
-
if (openParenIndex === -1) return void 0;
|
|
2121
|
-
const objectStartIndex = content.indexOf("{", openParenIndex);
|
|
2122
|
-
if (objectStartIndex === -1) return void 0;
|
|
2123
|
-
let depth = 0;
|
|
2124
|
-
let quote;
|
|
2125
|
-
let inBlockComment = false;
|
|
2126
|
-
let inLineComment = false;
|
|
2127
|
-
let isEscaped = false;
|
|
2128
|
-
for (let index = objectStartIndex; index < content.length; index++) {
|
|
2129
|
-
const currentChar = content[index];
|
|
2130
|
-
const nextChar = content[index + 1];
|
|
2131
|
-
if (inLineComment) {
|
|
2132
|
-
if (currentChar === "\n") inLineComment = false;
|
|
2133
|
-
continue;
|
|
2134
|
-
}
|
|
2135
|
-
if (inBlockComment) {
|
|
2136
|
-
if (currentChar === "*" && nextChar === "/") {
|
|
2137
|
-
inBlockComment = false;
|
|
2138
|
-
index++;
|
|
2139
|
-
}
|
|
2140
|
-
continue;
|
|
2141
|
-
}
|
|
2142
|
-
if (quote) {
|
|
2143
|
-
if (isEscaped) {
|
|
2144
|
-
isEscaped = false;
|
|
2145
|
-
continue;
|
|
2146
|
-
}
|
|
2147
|
-
if (currentChar === "\\") {
|
|
2148
|
-
isEscaped = true;
|
|
2149
|
-
continue;
|
|
2150
|
-
}
|
|
2151
|
-
if (currentChar === quote) quote = void 0;
|
|
2152
|
-
continue;
|
|
2153
|
-
}
|
|
2154
|
-
if (currentChar === "/" && nextChar === "/") {
|
|
2155
|
-
inLineComment = true;
|
|
2156
|
-
index++;
|
|
2157
|
-
continue;
|
|
2158
|
-
}
|
|
2159
|
-
if (currentChar === "/" && nextChar === "*") {
|
|
2160
|
-
inBlockComment = true;
|
|
2161
|
-
index++;
|
|
2162
|
-
continue;
|
|
2163
|
-
}
|
|
2164
|
-
if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
|
|
2165
|
-
quote = currentChar;
|
|
2166
|
-
continue;
|
|
2167
|
-
}
|
|
2168
|
-
if (currentChar === "{") {
|
|
2169
|
-
depth++;
|
|
2170
|
-
continue;
|
|
2171
|
-
}
|
|
2172
|
-
if (currentChar === "}") {
|
|
2173
|
-
depth--;
|
|
2174
|
-
if (depth === 0) return {
|
|
2175
|
-
nextIndex: index + 1,
|
|
2176
|
-
objectText: content.slice(objectStartIndex, index + 1)
|
|
2177
|
-
};
|
|
2178
|
-
}
|
|
2179
|
-
}
|
|
2180
|
-
}
|
|
2181
|
-
//#endregion
|
|
2182
2440
|
//#region ../runner/src/evalModuleLoader.ts
|
|
2183
2441
|
/**
|
|
2184
2442
|
* Import one eval module with a cache key derived from its current source so
|
|
@@ -2266,30 +2524,6 @@ function setLatestRunInfoMap(params) {
|
|
|
2266
2524
|
for (const evalId of evalIds) latestRunInfoMap.set(evalId, info);
|
|
2267
2525
|
}
|
|
2268
2526
|
//#endregion
|
|
2269
|
-
//#region ../runner/src/gitState.ts
|
|
2270
|
-
function runGitCommand(workspaceRoot, args) {
|
|
2271
|
-
const result = spawnSync("git", args, {
|
|
2272
|
-
cwd: workspaceRoot,
|
|
2273
|
-
encoding: "utf8",
|
|
2274
|
-
stdio: [
|
|
2275
|
-
"ignore",
|
|
2276
|
-
"pipe",
|
|
2277
|
-
"ignore"
|
|
2278
|
-
]
|
|
2279
|
-
});
|
|
2280
|
-
return {
|
|
2281
|
-
status: result.status,
|
|
2282
|
-
stdout: result.stdout.trim()
|
|
2283
|
-
};
|
|
2284
|
-
}
|
|
2285
|
-
/** Read the current git commit for the workspace, if available. */
|
|
2286
|
-
function readGitWorktreeState(workspaceRoot) {
|
|
2287
|
-
const insideWorktree = runGitCommand(workspaceRoot, ["rev-parse", "--is-inside-work-tree"]);
|
|
2288
|
-
if (insideWorktree.status !== 0 || insideWorktree.stdout !== "true") return { commitSha: null };
|
|
2289
|
-
const commitResult = runGitCommand(workspaceRoot, ["rev-parse", "HEAD"]);
|
|
2290
|
-
return { commitSha: commitResult.status === 0 ? commitResult.stdout : null };
|
|
2291
|
-
}
|
|
2292
|
-
//#endregion
|
|
2293
2527
|
//#region ../runner/src/outputArtifacts.ts
|
|
2294
2528
|
const mimeTypeExtensionMap = {
|
|
2295
2529
|
"application/json": ".json",
|
|
@@ -2345,100 +2579,329 @@ function normalizeMimeType(value) {
|
|
|
2345
2579
|
const normalized = value.trim();
|
|
2346
2580
|
return normalized.length > 0 ? normalized : "application/octet-stream";
|
|
2347
2581
|
}
|
|
2348
|
-
function getArtifactFileName(params) {
|
|
2349
|
-
const { outputKey, mimeType, value } = params;
|
|
2350
|
-
if (isFile(value) && value.name.trim().length > 0) return value.name.trim();
|
|
2351
|
-
const extension = getExtensionForMimeType(mimeType);
|
|
2352
|
-
return extension.length > 0 ? `${sanitizeSegment(outputKey)}${extension}` : sanitizeSegment(outputKey);
|
|
2582
|
+
function getArtifactFileName(params) {
|
|
2583
|
+
const { outputKey, mimeType, value } = params;
|
|
2584
|
+
if (isFile(value) && value.name.trim().length > 0) return value.name.trim();
|
|
2585
|
+
const extension = getExtensionForMimeType(mimeType);
|
|
2586
|
+
return extension.length > 0 ? `${sanitizeSegment(outputKey)}${extension}` : sanitizeSegment(outputKey);
|
|
2587
|
+
}
|
|
2588
|
+
/**
 * Resolve the file extension (including the leading dot) for a MIME type.
 *
 * The type is looked up in `mimeTypeExtensionMap` first. When it is not
 * listed, the extension is derived from the subtype with any `+suffix`
 * removed (a subtype of `svg+xml` yields `.svg`). Parameters such as
 * `; charset=utf-8` are ignored so they never end up in a file name.
 *
 * @param {string} mimeType MIME type, optionally followed by parameters.
 * @returns {string} The extension, or "" when none can be derived.
 */
function getExtensionForMimeType(mimeType) {
	// Strip parameters: "text/plain; charset=utf-8" -> "text/plain".
	const essence = mimeType.split(";")[0].trim();
	// Try the raw value first so existing map keys keep matching, then the
	// parameter-free form. Own properties only: inherited members such as
	// "constructor" are not extensions.
	for (const candidate of [mimeType, essence]) {
		const exactMatch = Object.hasOwn(mimeTypeExtensionMap, candidate) ? mimeTypeExtensionMap[candidate] : void 0;
		if (exactMatch) return exactMatch;
	}
	const subtype = essence.split("/")[1];
	if (subtype === void 0 || subtype.length === 0) return "";
	const withoutSuffix = subtype.split("+")[0] ?? subtype;
	return withoutSuffix.length > 0 ? `.${withoutSuffix}` : "";
}
|
|
2596
|
+
/**
 * Turn an arbitrary string into a safe single path segment.
 *
 * The value is trimmed and every run of characters outside
 * `A-Z a-z 0-9 . _ -` is collapsed into one `-`. An empty result falls back
 * to "artifact". So do the special segments "." and "..", which would
 * otherwise resolve to the current or parent directory when joined onto a
 * path.
 *
 * @param {string} value Raw segment text.
 * @returns {string} A non-empty segment that is never "." or "..".
 */
function sanitizeSegment(value) {
	const normalized = value.trim().replaceAll(/[^A-Za-z0-9._-]+/g, "-");
	if (normalized.length === 0) return "artifact";
	// "." and ".." contain only allowed characters but are directory references.
	if (normalized === "." || normalized === "..") return "artifact";
	return normalized;
}
|
|
2600
|
+
/**
 * Sanitize a file name while keeping only its final extension.
 *
 * The name goes through `sanitizeSegment` first. If the result has an
 * extension, every other dot in the name is replaced with `-`, so
 * `a.b.txt` becomes `a-b.txt`. Names without an extension are returned
 * as sanitized.
 */
function sanitizeFileName(value) {
	const safeName = sanitizeSegment(value);
	const suffix = extname(safeName);
	if (suffix === "") return safeName;
	const stem = safeName.slice(0, safeName.length - suffix.length);
	const flattenedStem = stem.split(".").join("-");
	return `${flattenedStem}${suffix}`;
}
|
|
2606
|
+
function isFile(value) {
|
|
2607
|
+
return value instanceof File;
|
|
2608
|
+
}
|
|
2609
|
+
//#endregion
|
|
2610
|
+
//#region ../runner/src/runMaintenance.ts
|
|
2611
|
+
/**
 * Write a run's state into its run directory, one file after another:
 * `summary.json` and `run.json` as 2-space-indented JSON, then
 * `cases.jsonl` with one JSON-serialized case per line.
 */
async function persistRunState(runState) {
	const pathInRunDir = (fileName) => join(runState.runDir, fileName);
	const toIndentedJson = (data) => JSON.stringify(data, null, 2);
	await writeFile(pathInRunDir("summary.json"), toIndentedJson(runState.summary));
	await writeFile(pathInRunDir("run.json"), toIndentedJson(runState.manifest));
	const serializedCases = runState.cases.map((caseRow) => JSON.stringify(caseRow));
	await writeFile(pathInRunDir("cases.jsonl"), serializedCases.join("\n"));
}
|
|
2617
|
+
/**
 * Recompute a persisted case's status after score definitions changed.
 *
 * Pass/fail gates are per-score: a case fails when any score with a declared
 * `passThreshold` reports a numeric value below that threshold. Scores
 * without a threshold are informational and never gate. Cancelled and
 * errored cases retain their terminal status: a row already marked
 * `error`, or a detail carrying an error, is never downgraded to `fail`.
 *
 * @param caseRow Persisted case row; `status` and `columns` are read.
 * @param caseDetail Optional case detail; `error`, `assertionFailures` and
 *   `columns` are read when present.
 * @param scoreThresholds Iterable of `[columnKey, passThreshold]` pairs.
 * @returns The recomputed status: "cancelled", "error", "fail" or "pass".
 */
function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
	if (caseRow.status === "cancelled") return "cancelled";
	// Terminal error wins over every gate below, whether it was recorded on
	// the row or on the detail.
	if (caseRow.status === "error" || caseDetail?.error != null) return "error";
	if ((caseDetail?.assertionFailures.length ?? 0) > 0) return "fail";
	for (const [key, passThreshold] of scoreThresholds) {
		// The row's value takes precedence; the detail is only a fallback.
		const rawValue = caseRow.columns[key] ?? caseDetail?.columns[key];
		// Missing or non-numeric scores do not gate.
		if (typeof rawValue !== "number") continue;
		if (rawValue < passThreshold) return "fail";
	}
	return "pass";
}
|
|
2636
|
+
/**
 * Decide whether a run is relevant to a given eval.
 *
 * A run touches the eval when it has a case row for it. Without such a row,
 * an `"all"`-mode target touches the eval only if `evalExists` is set, and
 * an `"evalIds"`-mode target touches it only when the id is listed.
 *
 * @param {{ caseRows: Array<{ evalId: string }>, evalId: string, evalExists: boolean, target: { mode: string, evalIds?: string[] } }} params
 * @returns {boolean}
 */
function runTouchesEval(params) {
	const { caseRows, evalId, target } = params;
	const hasMatchingCase = caseRows.some((caseRow) => caseRow.evalId === evalId);
	if (hasMatchingCase) return true;
	switch (target.mode) {
		case "all": return params.evalExists;
		case "evalIds": return target.evalIds?.includes(evalId) ?? false;
		default: return false;
	}
}
|
|
2642
|
+
/**
 * Recompute case statuses for one eval across a set of runs and persist
 * every run whose data changed.
 *
 * Runs that do not touch the eval (per `runTouchesEval`) or whose manifest
 * status is `"running"` are skipped. For each remaining run, every case row
 * of the eval is re-evaluated with `recomputePersistedCaseStatus`; changed
 * rows are updated in place, the matching case detail (if any) is updated
 * and written through `params.persistCaseDetail`, and finally the run's
 * summary counters are re-derived and the run is written with
 * `persistRunState`.
 *
 * @param params `runs`, `evalId`, `evalExists`, `scoreThresholds`, `persistCaseDetail`.
 * @returns {Promise<number>} Number of runs that were rewritten.
 */
async function recomputeEvalStatusesInRuns(params) {
	const summaryCounters = ["totalCases", "passedCases", "failedCases", "errorCases", "cancelledCases"];
	let updatedRuns = 0;
	for (const run of params.runs) {
		const isRelevant = runTouchesEval({
			target: run.manifest.target,
			caseRows: run.cases,
			evalId: params.evalId,
			evalExists: params.evalExists
		});
		if (!isRelevant) continue;
		if (run.manifest.status === "running") continue;
		let statusChanged = false;
		for (const caseRow of run.cases) {
			if (caseRow.evalId !== params.evalId) continue;
			const caseDetail = run.caseDetails.get(caseRow.caseId);
			const recomputed = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
			if (recomputed === caseRow.status) continue;
			caseRow.status = recomputed;
			statusChanged = true;
			if (!caseDetail) continue;
			caseDetail.status = recomputed;
			await params.persistCaseDetail(run.runDir, caseDetail);
		}
		if (!statusChanged) continue;
		const derivedSummary = deriveScopedSummaryFromCases({ caseRows: run.cases });
		for (const counter of summaryCounters) run.summary[counter] = derivedSummary[counter];
		await persistRunState(run);
		updatedRuns += 1;
	}
	return updatedRuns;
}
|
|
2677
|
+
//#endregion
|
|
2678
|
+
//#region ../runner/src/runPersistence.ts
|
|
2679
|
+
/** Matches short run ids of the form `r<digits>`; group 1 is the digits. */
const SHORT_ID_PATTERN = /^r(\d+)$/;
/**
 * Generate a filesystem-safe, sortable run id combining a UTC timestamp
 * (`YYYY-MM-DDTHH-MM-SSZ`) with a random base-36 suffix.
 *
 * The suffix is always exactly six characters. `Math.random()` can
 * serialise to fewer base-36 digits (`0.5` becomes `"0.i"`, `0` becomes
 * `"0"`), so the slice is right-padded with `"0"` to keep ids fixed-width
 * and never ending in a bare `_`.
 *
 * @returns {string} For example `2024-05-01T12-30-05Z_k3x9q0`.
 */
function generateRunId() {
	const now = /* @__PURE__ */ new Date();
	const pad = (n) => String(n).padStart(2, "0");
	const datePart = `${String(now.getUTCFullYear())}-${pad(now.getUTCMonth() + 1)}-${pad(now.getUTCDate())}`;
	const timePart = `${pad(now.getUTCHours())}-${pad(now.getUTCMinutes())}-${pad(now.getUTCSeconds())}`;
	const suffix = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
	return `${datePart}T${timePart}Z_${suffix}`;
}
|
|
2689
|
+
/**
 * Extract the numeric part of a short run id.
 *
 * @param {string | undefined} shortId Candidate id such as `"r12"`.
 * @returns {number | null} The parsed number, or `null` when the id is
 * undefined, does not match `SHORT_ID_PATTERN`, or parses to a non-finite
 * number.
 */
function parseShortIdNum(shortId) {
	if (shortId === void 0) return null;
	const captured = SHORT_ID_PATTERN.exec(shortId);
	if (captured === null) return null;
	const parsed = Number(captured[1]);
	return Number.isFinite(parsed) ? parsed : null;
}
|
|
2697
|
+
/**
 * Compute the next `shortId` number from already-loaded run snapshots.
 *
 * The result is one more than the largest number found among the
 * snapshots' `manifest.shortId` values; ids that `parseShortIdNum` cannot
 * parse (for example legacy ids not in `r\d+` form) are ignored. With no
 * parsable ids the result is `0`.
 *
 * @param {Iterable<{ manifest: { shortId?: string } }>} snapshots
 * @returns {number}
 */
function nextShortIdFromSnapshots(snapshots) {
	let highest = -1;
	for (const { manifest } of snapshots) {
		const parsed = parseShortIdNum(manifest.shortId);
		if (parsed === null) continue;
		highest = Math.max(highest, parsed);
	}
	return highest + 1;
}
|
|
2710
|
+
/**
 * Load every persisted run snapshot found under `<localStateDir>/runs`.
 *
 * Sub-directories are visited in sorted path order, one at a time. A
 * directory for which `loadPersistedRunSnapshot` yields nothing is skipped.
 * If the `runs` directory cannot be listed, an empty array is returned.
 *
 * @param {string} localStateDir
 * @returns {Promise<Array<object>>} Loaded snapshots in directory order.
 */
async function loadPersistedRunSnapshots(localStateDir) {
	const runsDir = join(localStateDir, "runs");
	const listing = await resultify(() => readdir(runsDir, { withFileTypes: true }));
	if (listing.error) return [];
	const runDirs = [];
	for (const entry of listing.value) {
		if (entry.isDirectory()) runDirs.push(join(runsDir, entry.name));
	}
	const loaded = [];
	for (const runDir of runDirs.toSorted()) {
		const snapshot = await loadPersistedRunSnapshot(runDir);
		if (snapshot) loaded.push(snapshot);
	}
	return loaded;
}
|
|
2723
|
+
/**
 * Write one case detail as pretty-printed JSON to
 * `<runDir>/case-details/<encoded caseId>.json`.
 *
 * @param {string} runDir Run directory.
 * @param {{ caseId: string }} caseDetail Detail record to serialise.
 * @returns {Promise<void>}
 */
async function persistCaseDetail(runDir, caseDetail) {
	const fileName = `${encodeCaseDetailFileName(caseDetail.caseId)}.json`;
	const targetPath = join(runDir, "case-details", fileName);
	await writeFile(targetPath, JSON.stringify(caseDetail, null, 2));
}
|
|
2726
|
+
/**
 * Project the result of `getLatestRunInfos` down to a map of
 * eval id to latest status.
 *
 * @param params Forwarded unchanged to `getLatestRunInfos`.
 * @returns {Map<string, unknown>}
 */
function getLastRunStatuses(params) {
	const statusByEvalId = new Map();
	for (const [evalId, info] of getLatestRunInfos(params)) statusByEvalId.set(evalId, info.status);
	return statusByEvalId;
}
|
|
2730
|
+
/**
|
|
2731
|
+
* Return the latest scoped run metadata for each eval based on persisted and
|
|
2732
|
+
* in-memory runs.
|
|
2733
|
+
*/
|
|
2734
|
+
function getLatestRunInfos(params) {
|
|
2735
|
+
const { runs, knownEvals } = params;
|
|
2736
|
+
const knownEvalMetas = [...knownEvals];
|
|
2737
|
+
const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.id, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
|
|
2738
|
+
const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
|
|
2739
|
+
const latestRunInfos = /* @__PURE__ */ new Map();
|
|
2740
|
+
for (const run of orderedRuns) for (const evalId of getRunEvalIds(run, knownEvalMetas.map((evalMeta) => evalMeta.id))) latestRunInfos.set(evalId, {
|
|
2741
|
+
status: getEvalStatusForRun(run, evalId, manualScoreKeysByEval.get(evalId) ?? []),
|
|
2742
|
+
startedAt: getRunFreshnessTimestamp(run.manifest),
|
|
2743
|
+
commitSha: run.manifest.commitSha ?? null,
|
|
2744
|
+
evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalId] ?? null
|
|
2745
|
+
});
|
|
2746
|
+
return latestRunInfos;
|
|
2747
|
+
}
|
|
2748
|
+
/**
 * Map a derived status onto a "last run" status: `"pending"` becomes
 * `null`, every other value is returned unchanged.
 *
 * @param {string} status
 * @returns {string | null}
 */
function toLastRunStatus$1(status) {
	if (status === "pending") return null;
	return status;
}
|
|
2751
|
+
/**
 * Load one run directory into an in-memory snapshot.
 *
 * `run.json` and `summary.json` must both be readable and pass their
 * schemas; if either does not, `null` is returned. Case rows and case
 * details are then read, in that order, via `readCaseRows` and
 * `readCaseDetails`.
 *
 * @param {string} runDir
 * @returns {Promise<{ runDir: string, manifest: object, summary: object, cases: object[], caseDetails: Map<string, object> } | null>}
 */
async function loadPersistedRunSnapshot(runDir) {
	const manifestParser = { safeParse: runManifestSchema.safeParse.bind(runManifestSchema) };
	const manifest = await readParsedJsonFile(join(runDir, "run.json"), manifestParser);
	if (!manifest) return null;
	const summaryParser = { safeParse: runSummarySchema.safeParse.bind(runSummarySchema) };
	const summary = await readParsedJsonFile(join(runDir, "summary.json"), summaryParser);
	if (!summary) return null;
	const cases = await readCaseRows(runDir);
	const caseDetails = await readCaseDetails(runDir);
	return {
		runDir,
		manifest,
		summary,
		cases,
		caseDetails
	};
}
|
|
2764
|
+
/**
 * Read a UTF-8 file, parse it as JSON and validate it.
 *
 * Any failure — unreadable file, invalid JSON, or a `safeParse` result
 * whose `success` is falsy — yields `null`.
 *
 * @param {string} filePath
 * @param {{ safeParse: (value: unknown) => { success: boolean, data?: unknown } }} schema
 * @returns {Promise<unknown | null>} The validated data, or `null`.
 */
async function readParsedJsonFile(filePath, schema) {
	const contents = await resultify(() => readFile(filePath, "utf-8"));
	if (contents.error) return null;
	const decoded = resultify(() => JSON.parse(contents.value));
	if (decoded.error) return null;
	const validation = schema.safeParse(decoded.value);
	return validation.success ? validation.data : null;
}
|
|
2773
|
+
/**
 * Read the case rows stored in `<runDir>/cases.jsonl`.
 *
 * Each line is trimmed; blank lines, lines that are not valid JSON and
 * lines rejected by `caseRowSchema` are skipped. An unreadable file yields
 * an empty array.
 *
 * @param {string} runDir
 * @returns {Promise<object[]>} Validated rows in file order.
 */
async function readCaseRows(runDir) {
	const contents = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
	if (contents.error) return [];
	const validRows = [];
	for (const rawLine of contents.value.split("\n")) {
		const line = rawLine.trim();
		if (line.length === 0) continue;
		const decoded = resultify(() => JSON.parse(line));
		if (decoded.error) continue;
		const validation = caseRowSchema.safeParse(decoded.value);
		if (validation.success) validRows.push(validation.data);
	}
	return validRows;
}
|
|
2788
|
+
/**
 * Read every case detail stored under `<runDir>/case-details`.
 *
 * Only regular files whose name ends in `.json` are considered; files that
 * cannot be read or validated against `caseDetailSchema` are skipped. The
 * map is keyed by the `caseId` found inside each file, not by file name.
 * An unlistable directory yields an empty map.
 *
 * @param {string} runDir
 * @returns {Promise<Map<string, object>>}
 */
async function readCaseDetails(runDir) {
	const detailsDir = join(runDir, "case-details");
	const listing = await resultify(() => readdir(detailsDir, { withFileTypes: true }));
	if (listing.error) return /* @__PURE__ */ new Map();
	const detailsByCaseId = /* @__PURE__ */ new Map();
	for (const entry of listing.value) {
		const isJsonFile = entry.isFile() && entry.name.endsWith(".json");
		if (!isJsonFile) continue;
		const detail = await readParsedJsonFile(join(detailsDir, entry.name), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
		if (detail) detailsByCaseId.set(detail.caseId, detail);
	}
	return detailsByCaseId;
}
|
|
2354
|
-
function
|
|
2355
|
-
const
|
|
2356
|
-
if (
|
|
2357
|
-
const
|
|
2358
|
-
|
|
2359
|
-
const withoutSuffix = subtype.split("+")[0] ?? subtype;
|
|
2360
|
-
return withoutSuffix.length > 0 ? `.${withoutSuffix}` : "";
|
|
2801
|
+
/**
 * Collect the eval ids a run relates to.
 *
 * Starts from the eval ids of the run's case rows. An `"evalIds"` target
 * additionally contributes its listed ids. An `"all"` target contributes
 * every known eval id, but only when the run has no case rows at all.
 *
 * @param {{ cases: Array<{ evalId: string }>, manifest: { target: { mode: string, evalIds?: string[] } } }} run
 * @param {Iterable<string>} knownEvalIds
 * @returns {string[]} Unique eval ids in first-seen order.
 */
function getRunEvalIds(run, knownEvalIds) {
	const collected = new Set();
	for (const caseRow of run.cases) collected.add(caseRow.evalId);
	const { target } = run.manifest;
	if (target.mode === "evalIds") {
		for (const evalId of target.evalIds ?? []) collected.add(evalId);
	} else if (target.mode === "all" && collected.size === 0) {
		for (const evalId of knownEvalIds) collected.add(evalId);
	}
	return Array.from(collected);
}
|
|
2362
|
-
function
|
|
2363
|
-
const
|
|
2364
|
-
|
|
2807
|
+
/**
 * Derive the status of one eval within one run.
 *
 * With no case rows for the eval, the status is derived from the run's
 * lifecycle status alone. Otherwise the eval is `"unscored"` while any
 * manual score is still pending, and else takes the status derived from
 * its case rows. Derived statuses pass through `toLastRunStatus$1`.
 *
 * @param run Run snapshot (`cases`, `manifest.status`).
 * @param {string} evalId
 * @param {string[]} manualScoreKeys Column keys of the eval's manual scores.
 */
function getEvalStatusForRun(run, evalId, manualScoreKeys) {
	const evalCases = run.cases.filter((caseRow) => caseRow.evalId === evalId);
	if (evalCases.length === 0) {
		const lifecycleDerived = deriveStatusFromChildStatuses({
			statuses: [],
			lifecycleStatus: run.manifest.status
		});
		return toLastRunStatus$1(lifecycleDerived);
	}
	if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
	return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
}
|
|
2366
|
-
function
|
|
2367
|
-
|
|
2368
|
-
|
|
2369
|
-
|
|
2370
|
-
|
|
2818
|
+
/**
 * Report whether any case row still lacks a manual score.
 *
 * A score counts as present only when its column value is a finite number.
 * With no manual score keys the answer is always `false`.
 *
 * @param {Array<{ columns: Record<string, unknown> }>} caseRows
 * @param {string[]} manualScoreKeys
 * @returns {boolean}
 */
function hasPendingManualScores(caseRows, manualScoreKeys) {
	if (manualScoreKeys.length === 0) return false;
	const isScored = (value) => typeof value === "number" && Number.isFinite(value);
	const everyRowFullyScored = caseRows.every((caseRow) => manualScoreKeys.every((key) => isScored(caseRow.columns[key])));
	return !everyRowFullyScored;
}
|
|
2372
|
-
function
|
|
2373
|
-
return
|
|
2825
|
+
/**
 * Turn a case id into the base name (without extension) of its case-detail
 * file by percent-encoding it with `encodeURIComponent`.
 *
 * @param {string} caseId
 * @returns {string}
 */
function encodeCaseDetailFileName(caseId) {
	const encodedName = encodeURIComponent(caseId);
	return encodedName;
}
|
|
2375
2828
|
//#endregion
|
|
2376
|
-
//#region ../runner/src/
|
|
2377
|
-
|
|
2378
|
-
|
|
2379
|
-
|
|
2380
|
-
|
|
2381
|
-
|
|
2829
|
+
//#region ../runner/src/moduleIsolation.ts
|
|
2830
|
+
/** Name of the URL query parameter that carries a module isolation key. */
const isolationParam = "agent-evals-isolate";
/** Splits a path on runs of `/` or `\`. */
const pathSegmentSeparatorPattern = /[\\/]+/;
/** Async-local store holding the isolation context of the current call chain. */
const isolationStorage = new AsyncLocalStorage();
/** Isolation key -> workspace root, for every key registered so far. */
const activeIsolationRoots = /* @__PURE__ */ new Map();
/** Guards `registerModuleIsolationHooks` so hooks are installed only once. */
let hooksRegistered = false;
const requireFromRunner = createRequire(import.meta.url);
/**
 * Agent-eval package specifier -> file URL as resolved from this module.
 * Specifiers that cannot be resolved from here are simply left out.
 */
const agentPackageUrlBySpecifier = /* @__PURE__ */ new Map();
for (const specifier of [
	"@ls-stack/agent-eval",
	"@agent-evals/sdk",
	"@agent-evals/shared",
	"@agent-evals/runner",
	"@agent-evals/runner/run-child"
]) {
	try {
		agentPackageUrlBySpecifier.set(specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href);
	} catch {
		// Best effort: an unresolvable specifier gets no pinned URL.
	}
}
|
|
2849
|
+
/**
 * Report whether an import specifier names one of the agent-eval packages
 * (`@ls-stack/agent-eval`, `@agent-evals/sdk`, `@agent-evals/shared`,
 * `@agent-evals/runner`) or a sub-path of one of them.
 *
 * @param {string} specifier
 * @returns {boolean}
 */
function isAgentEvalsPackageSpecifier(specifier) {
	const packageNames = [
		"@ls-stack/agent-eval",
		"@agent-evals/sdk",
		"@agent-evals/shared",
		"@agent-evals/runner"
	];
	const isExactMatch = packageNames.includes(specifier);
	if (isExactMatch) return true;
	return packageNames.some((packageName) => specifier.startsWith(`${packageName}/`));
}
|
|
2852
|
+
/**
 * Recover an isolation key from the URL of the importing module.
 *
 * Only `file:` parents are inspected. The key is taken from the parent's
 * isolation query parameter and is returned only when it is currently
 * registered in `activeIsolationRoots`.
 *
 * @param {string | undefined} parentURL
 * @returns {string | null}
 */
function getIsolationKeyFromParent(parentURL) {
	const isFileParent = parentURL?.startsWith("file:") === true;
	if (!isFileParent) return null;
	const candidateKey = new URL(parentURL).searchParams.get(isolationParam);
	if (!activeIsolationRoots.has(candidateKey ?? "")) return null;
	return candidateKey;
}
|
|
2857
|
+
/**
 * Report whether a module URL points at a workspace source file.
 *
 * The URL must use the `file:` protocol and resolve to a path strictly
 * inside `workspaceRoot` (the root itself does not count). Paths that pass
 * through a `node_modules` or `.agent-evals` directory are excluded.
 *
 * "Outside the root" is decided from the first path segment being exactly
 * `..`, not from a string prefix, so in-workspace entries whose name merely
 * starts with two dots (for example `..cache/x.js`) are still recognised.
 *
 * @param {URL} url Resolved module URL.
 * @param {string} workspaceRoot Workspace root path.
 * @returns {boolean}
 */
function isWorkspaceFile(url, workspaceRoot) {
	if (url.protocol !== "file:") return false;
	const relativePath = relative(workspaceRoot, fileURLToPath(url));
	// An absolute result means there is no relative route from the root to the file.
	if (relativePath === "" || isAbsolute(relativePath)) return false;
	const segments = relativePath.split(pathSegmentSeparatorPattern);
	if (segments[0] === "..") return false;
	return !segments.includes("node_modules") && !segments.includes(".agent-evals");
}
|
|
2864
|
+
/**
 * Return `url` with the isolation query parameter set to `key`.
 *
 * When the parameter already holds `key`, the input string is returned
 * untouched; otherwise the parameter is set (replacing any previous value)
 * and the re-serialised URL is returned.
 *
 * @param {string} url Absolute module URL.
 * @param {string} key Isolation key.
 * @returns {string}
 */
function addIsolationParam(url, key) {
	const parsed = new URL(url);
	const { searchParams } = parsed;
	if (searchParams.get(isolationParam) !== key) {
		searchParams.set(isolationParam, key);
		return parsed.href;
	}
	return url;
}
|
|
2870
|
+
/**
 * Install the module `resolve` hook used for run-scoped isolation. Calls
 * after the first are no-ops.
 *
 * The hook behaves as follows:
 * - a specifier with a pinned URL in `agentPackageUrlBySpecifier` resolves
 *   straight to that URL;
 * - any other agent-eval package specifier keeps its default resolution;
 * - otherwise, when an isolation key is available (from the async-local
 *   context, else from the parent URL) and the resolved URL is a workspace
 *   file, the isolation parameter is appended to the resolved URL.
 */
function registerModuleIsolationHooks() {
	if (hooksRegistered) return;
	hooksRegistered = true;
	const resolveWithIsolation = (specifier, context, nextResolve) => {
		const pinnedUrl = agentPackageUrlBySpecifier.get(specifier);
		if (pinnedUrl !== void 0) {
			return {
				url: pinnedUrl,
				shortCircuit: true
			};
		}
		const resolved = nextResolve(specifier, context);
		if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
		const activeContext = isolationStorage.getStore();
		// Computed unconditionally, before the async-local key is consulted.
		const inferredKey = getIsolationKeyFromParent(context.parentURL);
		const isolationKey = activeContext?.key ?? inferredKey;
		if (isolationKey === null) return resolved;
		const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
		if (workspaceRoot === void 0) return resolved;
		const targetsWorkspace = isWorkspaceFile(new URL(resolved.url), workspaceRoot);
		if (!targetsWorkspace) return resolved;
		return {
			...resolved,
			url: addIsolationParam(resolved.url, isolationKey)
		};
	};
	registerHooks({ resolve: resolveWithIsolation });
}
|
|
2383
2894
|
/**
|
|
2384
|
-
*
|
|
2895
|
+
* Execute module loading and eval code with fresh workspace module URLs.
|
|
2385
2896
|
*
|
|
2386
|
-
*
|
|
2387
|
-
*
|
|
2388
|
-
*
|
|
2389
|
-
* errored cases retain their terminal status.
|
|
2897
|
+
* Node does not expose an ESM cache reset API, so the runner appends a
|
|
2898
|
+
* run-scoped query parameter to workspace file imports. Package imports are
|
|
2899
|
+
* left alone so SDK singletons, such as the eval registry, remain shared.
|
|
2390
2900
|
*/
|
|
2391
|
-
function
|
|
2392
|
-
|
|
2393
|
-
|
|
2394
|
-
|
|
2395
|
-
for (const [key, passThreshold] of scoreThresholds) {
|
|
2396
|
-
const rawValue = caseRow.columns[key] ?? caseDetail?.columns[key];
|
|
2397
|
-
if (typeof rawValue !== "number") continue;
|
|
2398
|
-
if (rawValue < passThreshold) return "fail";
|
|
2399
|
-
}
|
|
2400
|
-
return caseRow.status === "error" ? "error" : "pass";
|
|
2401
|
-
}
|
|
2402
|
-
function runTouchesEval(params) {
|
|
2403
|
-
if (params.caseRows.some((caseRow) => caseRow.evalId === params.evalId)) return true;
|
|
2404
|
-
if (params.target.mode === "all") return params.evalExists;
|
|
2405
|
-
if (params.target.mode === "evalIds") return params.target.evalIds?.includes(params.evalId) ?? false;
|
|
2406
|
-
return false;
|
|
2407
|
-
}
|
|
2408
|
-
async function recomputeEvalStatusesInRuns(params) {
|
|
2409
|
-
let updatedRuns = 0;
|
|
2410
|
-
for (const run of params.runs) {
|
|
2411
|
-
if (!runTouchesEval({
|
|
2412
|
-
target: run.manifest.target,
|
|
2413
|
-
caseRows: run.cases,
|
|
2414
|
-
evalId: params.evalId,
|
|
2415
|
-
evalExists: params.evalExists
|
|
2416
|
-
})) continue;
|
|
2417
|
-
if (run.manifest.status === "running") continue;
|
|
2418
|
-
let changed = false;
|
|
2419
|
-
for (const caseRow of run.cases) {
|
|
2420
|
-
if (caseRow.evalId !== params.evalId) continue;
|
|
2421
|
-
const caseDetail = run.caseDetails.get(caseRow.caseId);
|
|
2422
|
-
const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
|
|
2423
|
-
if (caseRow.status === nextStatus) continue;
|
|
2424
|
-
caseRow.status = nextStatus;
|
|
2425
|
-
if (caseDetail) {
|
|
2426
|
-
caseDetail.status = nextStatus;
|
|
2427
|
-
await params.persistCaseDetail(run.runDir, caseDetail);
|
|
2428
|
-
}
|
|
2429
|
-
changed = true;
|
|
2430
|
-
}
|
|
2431
|
-
if (!changed) continue;
|
|
2432
|
-
const derivedSummary = deriveScopedSummaryFromCases({ caseRows: run.cases });
|
|
2433
|
-
run.summary.totalCases = derivedSummary.totalCases;
|
|
2434
|
-
run.summary.passedCases = derivedSummary.passedCases;
|
|
2435
|
-
run.summary.failedCases = derivedSummary.failedCases;
|
|
2436
|
-
run.summary.errorCases = derivedSummary.errorCases;
|
|
2437
|
-
run.summary.cancelledCases = derivedSummary.cancelledCases;
|
|
2438
|
-
await persistRunState(run);
|
|
2439
|
-
updatedRuns += 1;
|
|
2440
|
-
}
|
|
2441
|
-
return updatedRuns;
|
|
2901
|
+
async function runWithModuleIsolation(context, fn) {
	// Make sure the resolve hook exists before any import can happen in `fn`.
	registerModuleIsolationHooks();
	const { key, workspaceRoot } = context;
	activeIsolationRoots.set(key, workspaceRoot);
	const outcome = await isolationStorage.run(context, fn);
	return outcome;
}
|
|
2443
2906
|
//#endregion
|
|
2444
2907
|
//#region ../runner/src/traceDisplay.ts
|
|
@@ -2528,18 +2991,25 @@ async function callWithUnknownResult(fn, args) {
|
|
|
2528
2991
|
return await Reflect.apply(fn, void 0, args);
|
|
2529
2992
|
}
|
|
2530
2993
|
async function runCase(params) {
|
|
2531
|
-
const { evalDef, evalId, evalCase, globalTraceDisplay, trial,
|
|
2994
|
+
const { evalDef, evalId, evalCase, globalTraceDisplay, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, artifactDir, runId } = params;
|
|
2532
2995
|
const { scope, error: executeError } = await runInEvalScope(evalCase.id, async () => {
|
|
2533
|
-
|
|
2534
|
-
input: evalCase.input
|
|
2535
|
-
|
|
2536
|
-
|
|
2537
|
-
|
|
2538
|
-
|
|
2539
|
-
|
|
2540
|
-
|
|
2541
|
-
|
|
2542
|
-
|
|
2996
|
+
const execute = async () => {
|
|
2997
|
+
await Reflect.apply(evalDef.execute, evalDef, [{ input: evalCase.input }]);
|
|
2998
|
+
};
|
|
2999
|
+
if (moduleIsolation === void 0) {
|
|
3000
|
+
await execute();
|
|
3001
|
+
return;
|
|
3002
|
+
}
|
|
3003
|
+
await runWithModuleIsolation(moduleIsolation, execute);
|
|
3004
|
+
}, {
|
|
3005
|
+
input: evalCase.input,
|
|
3006
|
+
cacheContext: cacheAdapter ? {
|
|
3007
|
+
adapter: cacheAdapter,
|
|
3008
|
+
mode: cacheMode,
|
|
3009
|
+
evalId,
|
|
3010
|
+
codeFingerprint
|
|
3011
|
+
} : void 0
|
|
3012
|
+
});
|
|
2543
3013
|
const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
|
|
2544
3014
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
2545
3015
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
|
|
@@ -2555,20 +3025,35 @@ async function runCase(params) {
|
|
|
2555
3025
|
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
2556
3026
|
scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
2557
3027
|
}
|
|
3028
|
+
if (!nonAssertError && evalDef.outputsSchema) {
|
|
3029
|
+
const parsedOutputs = evalDef.outputsSchema.safeParse(getOutputsSchemaInput(evalDef.outputsSchema, scope.outputs));
|
|
3030
|
+
if (parsedOutputs.success) scope.outputs = {
|
|
3031
|
+
...scope.outputs,
|
|
3032
|
+
...parsedOutputs.data
|
|
3033
|
+
};
|
|
3034
|
+
else scope.assertionFailures.push(toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error)));
|
|
3035
|
+
}
|
|
2558
3036
|
const scoreResults = /* @__PURE__ */ new Map();
|
|
2559
3037
|
const scoringTraces = {};
|
|
2560
|
-
if (!nonAssertError && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
|
|
3038
|
+
if (!nonAssertError && scope.assertionFailures.length === 0 && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
|
|
2561
3039
|
const { compute, passThreshold, label } = normalizeScoreDef(def);
|
|
2562
|
-
const scoreRun = await runInEvalScope(evalCase.id, async () =>
|
|
3040
|
+
const scoreRun = await runInEvalScope(evalCase.id, async () => {
|
|
3041
|
+
const computeScore = async () => await callWithUnknownResult(compute, [{
|
|
3042
|
+
input: evalCase.input,
|
|
3043
|
+
outputs: { ...scope.outputs },
|
|
3044
|
+
case: evalCase
|
|
3045
|
+
}]);
|
|
3046
|
+
if (moduleIsolation === void 0) return await computeScore();
|
|
3047
|
+
return await runWithModuleIsolation(moduleIsolation, computeScore);
|
|
3048
|
+
}, {
|
|
2563
3049
|
input: evalCase.input,
|
|
2564
|
-
|
|
2565
|
-
|
|
2566
|
-
|
|
2567
|
-
|
|
2568
|
-
|
|
2569
|
-
|
|
2570
|
-
|
|
2571
|
-
} : void 0 });
|
|
3050
|
+
cacheContext: cacheAdapter ? {
|
|
3051
|
+
adapter: cacheAdapter,
|
|
3052
|
+
mode: cacheMode,
|
|
3053
|
+
evalId: `${evalId}__score__${key}`,
|
|
3054
|
+
codeFingerprint
|
|
3055
|
+
} : void 0
|
|
3056
|
+
});
|
|
2572
3057
|
const { trace, traceDisplay } = resolveTracePresentation(scoreRun.scope.spans, globalTraceDisplay, evalDef.traceDisplay);
|
|
2573
3058
|
if (trace.length > 0) scoringTraces[key] = {
|
|
2574
3059
|
trace,
|
|
@@ -2638,200 +3123,61 @@ async function runCase(params) {
|
|
|
2638
3123
|
input: evalCase.input,
|
|
2639
3124
|
trace: displayTrace,
|
|
2640
3125
|
traceDisplay,
|
|
2641
|
-
columns,
|
|
2642
|
-
assertionFailures: scope.assertionFailures,
|
|
2643
|
-
error: errorInfo,
|
|
2644
|
-
trial
|
|
2645
|
-
};
|
|
2646
|
-
if (Object.keys(scoringTraces).length > 0) caseDetail.scoringTraces = scoringTraces;
|
|
2647
|
-
return {
|
|
2648
|
-
caseDetail,
|
|
2649
|
-
caseRowUpdate: {
|
|
2650
|
-
status,
|
|
2651
|
-
latencyMs: Date.now() - startTime,
|
|
2652
|
-
columns
|
|
2653
|
-
}
|
|
2654
|
-
};
|
|
2655
|
-
}
|
|
2656
|
-
function isRecord(value) {
|
|
2657
|
-
return typeof value === "object" && value !== null;
|
|
2658
|
-
}
|
|
2659
|
-
function isBlob(value) {
|
|
2660
|
-
return value instanceof Blob;
|
|
2661
|
-
}
|
|
2662
|
-
function toAssertionFailure(message, error = void 0) {
|
|
2663
|
-
return error?.stack ? {
|
|
2664
|
-
message,
|
|
2665
|
-
stack: error.stack
|
|
2666
|
-
} : { message };
|
|
2667
|
-
}
|
|
2668
|
-
//#endregion
|
|
2669
|
-
//#region ../runner/src/runPersistence.ts
|
|
2670
|
-
const SHORT_ID_PATTERN = /^r(\d+)$/;
|
|
2671
|
-
/**
|
|
2672
|
-
* Generate a filesystem-safe, sortable run id combining a UTC timestamp
|
|
2673
|
-
* with a short random suffix.
|
|
2674
|
-
*/
|
|
2675
|
-
function generateRunId() {
|
|
2676
|
-
const now = /* @__PURE__ */ new Date();
|
|
2677
|
-
const pad = (n) => String(n).padStart(2, "0");
|
|
2678
|
-
return `${`${String(now.getUTCFullYear())}-${pad(now.getUTCMonth() + 1)}-${pad(now.getUTCDate())}T${pad(now.getUTCHours())}-${pad(now.getUTCMinutes())}-${pad(now.getUTCSeconds())}Z`}_${Math.random().toString(36).slice(2, 8)}`;
|
|
2679
|
-
}
|
|
2680
|
-
function parseShortIdNum(shortId) {
|
|
2681
|
-
if (shortId === void 0) return null;
|
|
2682
|
-
const match = SHORT_ID_PATTERN.exec(shortId);
|
|
2683
|
-
if (!match) return null;
|
|
2684
|
-
const num = Number(match[1]);
|
|
2685
|
-
if (!Number.isFinite(num)) return null;
|
|
2686
|
-
return num;
|
|
2687
|
-
}
|
|
2688
|
-
/**
|
|
2689
|
-
* Return the next `shortId` number to assign based on the existing
|
|
2690
|
-
* loaded snapshots. Legacy runs that don't match the `r\d+` format are
|
|
2691
|
-
* ignored.
|
|
2692
|
-
*/
|
|
2693
|
-
function nextShortIdFromSnapshots(snapshots) {
|
|
2694
|
-
let maxNum = -1;
|
|
2695
|
-
for (const snapshot of snapshots) {
|
|
2696
|
-
const num = parseShortIdNum(snapshot.manifest.shortId);
|
|
2697
|
-
if (num !== null && num > maxNum) maxNum = num;
|
|
2698
|
-
}
|
|
2699
|
-
return maxNum + 1;
|
|
2700
|
-
}
|
|
2701
|
-
async function loadPersistedRunSnapshots(localStateDir) {
|
|
2702
|
-
const runsDir = join(localStateDir, "runs");
|
|
2703
|
-
const entriesResult = await resultify(() => readdir(runsDir, { withFileTypes: true }));
|
|
2704
|
-
if (entriesResult.error) return [];
|
|
2705
|
-
const snapshots = [];
|
|
2706
|
-
const runDirs = entriesResult.value.filter((entry) => entry.isDirectory()).map((entry) => join(runsDir, entry.name)).toSorted();
|
|
2707
|
-
for (const runDir of runDirs) {
|
|
2708
|
-
const snapshot = await loadPersistedRunSnapshot(runDir);
|
|
2709
|
-
if (!snapshot) continue;
|
|
2710
|
-
snapshots.push(snapshot);
|
|
2711
|
-
}
|
|
2712
|
-
return snapshots;
|
|
2713
|
-
}
|
|
2714
|
-
async function persistCaseDetail(runDir, caseDetail) {
|
|
2715
|
-
await writeFile(join(runDir, "case-details", `${encodeCaseDetailFileName(caseDetail.caseId)}.json`), JSON.stringify(caseDetail, null, 2));
|
|
2716
|
-
}
|
|
2717
|
-
function getLastRunStatuses(params) {
|
|
2718
|
-
const latestRunInfos = getLatestRunInfos(params);
|
|
2719
|
-
return new Map([...latestRunInfos].map(([evalId, info]) => [evalId, info.status]));
|
|
2720
|
-
}
|
|
2721
|
-
/**
|
|
2722
|
-
* Return the latest scoped run metadata for each eval based on persisted and
|
|
2723
|
-
* in-memory runs.
|
|
2724
|
-
*/
|
|
2725
|
-
function getLatestRunInfos(params) {
|
|
2726
|
-
const { runs, knownEvals } = params;
|
|
2727
|
-
const knownEvalMetas = [...knownEvals];
|
|
2728
|
-
const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.id, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
|
|
2729
|
-
const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
|
|
2730
|
-
const latestRunInfos = /* @__PURE__ */ new Map();
|
|
2731
|
-
for (const run of orderedRuns) for (const evalId of getRunEvalIds(run, knownEvalMetas.map((evalMeta) => evalMeta.id))) latestRunInfos.set(evalId, {
|
|
2732
|
-
status: getEvalStatusForRun(run, evalId, manualScoreKeysByEval.get(evalId) ?? []),
|
|
2733
|
-
startedAt: getRunFreshnessTimestamp(run.manifest),
|
|
2734
|
-
commitSha: run.manifest.commitSha ?? null,
|
|
2735
|
-
evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalId] ?? null
|
|
2736
|
-
});
|
|
2737
|
-
return latestRunInfos;
|
|
2738
|
-
}
|
|
2739
|
-
function toLastRunStatus$1(status) {
|
|
2740
|
-
return status === "pending" ? null : status;
|
|
2741
|
-
}
|
|
2742
|
-
async function loadPersistedRunSnapshot(runDir) {
|
|
2743
|
-
const manifest = await readParsedJsonFile(join(runDir, "run.json"), { safeParse: runManifestSchema.safeParse.bind(runManifestSchema) });
|
|
2744
|
-
if (!manifest) return null;
|
|
2745
|
-
const summary = await readParsedJsonFile(join(runDir, "summary.json"), { safeParse: runSummarySchema.safeParse.bind(runSummarySchema) });
|
|
2746
|
-
if (!summary) return null;
|
|
3126
|
+
columns,
|
|
3127
|
+
assertionFailures: scope.assertionFailures,
|
|
3128
|
+
error: errorInfo,
|
|
3129
|
+
trial
|
|
3130
|
+
};
|
|
3131
|
+
if (Object.keys(scoringTraces).length > 0) caseDetail.scoringTraces = scoringTraces;
|
|
2747
3132
|
return {
|
|
2748
|
-
|
|
2749
|
-
|
|
2750
|
-
|
|
2751
|
-
|
|
2752
|
-
|
|
3133
|
+
caseDetail,
|
|
3134
|
+
caseRowUpdate: {
|
|
3135
|
+
status,
|
|
3136
|
+
latencyMs: Date.now() - startTime,
|
|
3137
|
+
columns
|
|
3138
|
+
}
|
|
2753
3139
|
};
|
|
2754
3140
|
}
|
|
2755
|
-
|
|
2756
|
-
|
|
2757
|
-
if (fileResult.error) return null;
|
|
2758
|
-
const jsonResult = resultify(() => JSON.parse(fileResult.value));
|
|
2759
|
-
if (jsonResult.error) return null;
|
|
2760
|
-
const parsed = schema.safeParse(jsonResult.value);
|
|
2761
|
-
if (!parsed.success) return null;
|
|
2762
|
-
return parsed.data;
|
|
2763
|
-
}
|
|
2764
|
-
async function readCaseRows(runDir) {
|
|
2765
|
-
const fileResult = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
|
|
2766
|
-
if (fileResult.error) return [];
|
|
2767
|
-
const rows = [];
|
|
2768
|
-
for (const rawLine of fileResult.value.split("\n")) {
|
|
2769
|
-
const line = rawLine.trim();
|
|
2770
|
-
if (line.length === 0) continue;
|
|
2771
|
-
const jsonResult = resultify(() => JSON.parse(line));
|
|
2772
|
-
if (jsonResult.error) continue;
|
|
2773
|
-
const parsed = caseRowSchema.safeParse(jsonResult.value);
|
|
2774
|
-
if (!parsed.success) continue;
|
|
2775
|
-
rows.push(parsed.data);
|
|
2776
|
-
}
|
|
2777
|
-
return rows;
|
|
2778
|
-
}
|
|
2779
|
-
async function readCaseDetails(runDir) {
|
|
2780
|
-
const detailsDir = join(runDir, "case-details");
|
|
2781
|
-
const entriesResult = await resultify(() => readdir(detailsDir, { withFileTypes: true }));
|
|
2782
|
-
if (entriesResult.error) return /* @__PURE__ */ new Map();
|
|
2783
|
-
const caseDetails = /* @__PURE__ */ new Map();
|
|
2784
|
-
for (const entry of entriesResult.value) {
|
|
2785
|
-
if (!entry.isFile() || !entry.name.endsWith(".json")) continue;
|
|
2786
|
-
const detail = await readParsedJsonFile(join(detailsDir, entry.name), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
|
|
2787
|
-
if (!detail) continue;
|
|
2788
|
-
caseDetails.set(detail.caseId, detail);
|
|
2789
|
-
}
|
|
2790
|
-
return caseDetails;
|
|
3141
|
+
/**
 * Report whether `value` is a non-null value whose `typeof` is `"object"`
 * (this includes arrays).
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isRecord(value) {
	if (value === null) return false;
	return typeof value === "object";
}
|
|
2792
|
-
function
|
|
2793
|
-
|
|
2794
|
-
if (run.manifest.target.mode === "evalIds") for (const evalId of run.manifest.target.evalIds ?? []) evalIds.add(evalId);
|
|
2795
|
-
else if (run.manifest.target.mode === "all" && evalIds.size === 0) for (const evalId of knownEvalIds) evalIds.add(evalId);
|
|
2796
|
-
return [...evalIds];
|
|
3144
|
+
function isBlob(value) {
|
|
3145
|
+
return value instanceof Blob;
|
|
2797
3146
|
}
|
|
2798
|
-
function
|
|
2799
|
-
|
|
2800
|
-
|
|
2801
|
-
|
|
2802
|
-
|
|
2803
|
-
}
|
|
2804
|
-
return toLastRunStatus$1(deriveStatusFromChildStatuses({
|
|
2805
|
-
statuses: [],
|
|
2806
|
-
lifecycleStatus: run.manifest.status
|
|
2807
|
-
}));
|
|
3147
|
+
/**
 * Select the part of `outputs` that an outputs schema should validate.
 *
 * For a `z.ZodObject` schema only the keys declared in `schema.shape` are
 * forwarded, and only when `outputs` has them as own properties. Own-
 * property lookup matters: with the `in` operator a schema key such as
 * `toString` or `constructor` would pick up the inherited
 * `Object.prototype` member instead of being treated as absent. Any other
 * schema receives `outputs` as-is.
 *
 * @param schema The eval's outputs schema.
 * @param {Record<string, unknown>} outputs Outputs recorded for the case.
 * @returns {Record<string, unknown>} Value to pass to `schema.safeParse`.
 */
function getOutputsSchemaInput(schema, outputs) {
	if (!(schema instanceof z.ZodObject)) return outputs;
	const configuredOutputs = {};
	for (const key of Object.keys(schema.shape)) {
		if (Object.hasOwn(outputs, key)) configuredOutputs[key] = outputs[key];
	}
	return configuredOutputs;
}
|
|
2809
|
-
function
|
|
2810
|
-
|
|
2811
|
-
|
|
2812
|
-
|
|
2813
|
-
|
|
2814
|
-
}
|
|
3153
|
+
/**
 * Render an outputs-schema validation error as a readable message.
 *
 * Each issue becomes one `<path>: <message>` line, where the path segments
 * are joined with `.` and an empty path is shown as `<root>`. Without
 * issues only the headline is returned.
 *
 * @param {{ issues: Array<{ path: Array<string | number>, message: string }> }} error
 * @returns {string}
 */
function formatOutputsSchemaError(error) {
	const issueLines = [];
	for (const issue of error.issues) {
		const location = issue.path.length > 0 ? issue.path.join(".") : "<root>";
		issueLines.push(`${location}: ${issue.message}`);
	}
	if (issueLines.length === 0) return "outputsSchema validation failed";
	return `outputsSchema validation failed:\n${issueLines.join("\n")}`;
}
|
|
2816
|
-
function
|
|
2817
|
-
return
|
|
3160
|
+
function toAssertionFailure(message, error = void 0) {
|
|
3161
|
+
return error?.stack ? {
|
|
3162
|
+
message,
|
|
3163
|
+
stack: error.stack
|
|
3164
|
+
} : { message };
|
|
2818
3165
|
}
|
|
2819
3166
|
//#endregion
|
|
2820
3167
|
//#region ../runner/src/runQueue.ts
|
|
2821
3168
|
async function executeQueuedCases(params) {
|
|
2822
|
-
const {
|
|
3169
|
+
const { queuedCases, concurrency, globalTraceDisplay } = params;
|
|
2823
3170
|
let nextCaseIndex = 0;
|
|
2824
3171
|
let workerError = void 0;
|
|
2825
3172
|
const workerCount = Math.min(concurrency, queuedCases.length);
|
|
2826
3173
|
const workers = Array.from({ length: workerCount }, async () => {
|
|
2827
|
-
while (
|
|
3174
|
+
while (workerError === void 0) {
|
|
2828
3175
|
const queuedCase = queuedCases[nextCaseIndex];
|
|
2829
3176
|
nextCaseIndex += 1;
|
|
2830
3177
|
if (queuedCase === void 0) return;
|
|
2831
3178
|
try {
|
|
2832
3179
|
await executeQueuedCase({
|
|
2833
3180
|
queuedCase,
|
|
2834
|
-
runState,
|
|
2835
3181
|
globalTraceDisplay
|
|
2836
3182
|
});
|
|
2837
3183
|
} catch (error) {
|
|
@@ -2845,11 +3191,10 @@ async function executeQueuedCases(params) {
|
|
|
2845
3191
|
if (workerError !== void 0) throw new Error(typeof workerError === "string" ? workerError : typeof workerError === "number" || typeof workerError === "boolean" || typeof workerError === "bigint" ? String(workerError) : workerError === null ? "null" : "Unknown queue worker error");
|
|
2846
3192
|
}
|
|
2847
3193
|
async function executeQueuedCase(params) {
|
|
2848
|
-
const { queuedCase,
|
|
3194
|
+
const { queuedCase, globalTraceDisplay } = params;
|
|
2849
3195
|
const startTime = Date.now();
|
|
2850
3196
|
const result = await queuedCase.execute({
|
|
2851
3197
|
globalTraceDisplay,
|
|
2852
|
-
signal: runState.abortController.signal,
|
|
2853
3198
|
startTime
|
|
2854
3199
|
});
|
|
2855
3200
|
await queuedCase.onComplete(result);
|
|
@@ -2900,7 +3245,48 @@ function pickWinningTrial(params) {
|
|
|
2900
3245
|
if (medianAttempt === void 0) throw new Error("Expected at least one trial attempt");
|
|
2901
3246
|
return medianAttempt;
|
|
2902
3247
|
}
|
|
2903
|
-
async function
|
|
3248
|
+
async function finalizePreparedCase(params) {
|
|
3249
|
+
const { runState, runDir, preparedEval, preparedCase, onCaseFinished, emitEvent } = params;
|
|
3250
|
+
if (preparedCase.finalized || preparedCase.trialResults.length === 0) return;
|
|
3251
|
+
preparedCase.finalized = true;
|
|
3252
|
+
const winningTrial = pickWinningTrial({
|
|
3253
|
+
strategy: runState.manifest.trialSelection,
|
|
3254
|
+
attempts: preparedCase.trialResults,
|
|
3255
|
+
scoreKeys: preparedEval.scoreKeys
|
|
3256
|
+
});
|
|
3257
|
+
if (winningTrial.bufferedCacheStore !== null) await winningTrial.bufferedCacheStore.commit();
|
|
3258
|
+
runState.cases.push(winningTrial.caseRow);
|
|
3259
|
+
runState.caseDetails.set(preparedCase.caseId, winningTrial.caseDetail);
|
|
3260
|
+
preparedEval.mergeColumns(winningTrial.caseDetail.columns);
|
|
3261
|
+
if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
|
|
3262
|
+
else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
|
|
3263
|
+
else runState.summary.failedCases++;
|
|
3264
|
+
await writeFile(join(runDir, "traces", `${preparedCase.caseId}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
|
|
3265
|
+
await persistCaseDetail(runDir, winningTrial.caseDetail);
|
|
3266
|
+
onCaseFinished?.(winningTrial.caseDetail, winningTrial.caseRow);
|
|
3267
|
+
emitEvent(runState, {
|
|
3268
|
+
type: "case.finished",
|
|
3269
|
+
runId: runState.manifest.id,
|
|
3270
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3271
|
+
payload: winningTrial.caseRow
|
|
3272
|
+
});
|
|
3273
|
+
preparedEval.evalCaseRows.push(winningTrial.caseRow);
|
|
3274
|
+
}
|
|
3275
|
+
function getPreparedCaseOrderKey(caseRow) {
|
|
3276
|
+
return `${caseRow.evalId}\u0000${caseRow.caseId}`;
|
|
3277
|
+
}
|
|
3278
|
+
function sortCaseRowsByPreparedOrder(caseRows, preparedEvals) {
|
|
3279
|
+
const orderByCase = /* @__PURE__ */ new Map();
|
|
3280
|
+
let order = 0;
|
|
3281
|
+
for (const preparedEval of preparedEvals) for (const preparedCase of preparedEval.preparedCases) {
|
|
3282
|
+
orderByCase.set(`${preparedEval.evalMeta.id}\u0000${preparedCase.caseId}`, order);
|
|
3283
|
+
order++;
|
|
3284
|
+
}
|
|
3285
|
+
caseRows.sort((left, right) => {
|
|
3286
|
+
return (orderByCase.get(getPreparedCaseOrderKey(left)) ?? Number.MAX_SAFE_INTEGER) - (orderByCase.get(getPreparedCaseOrderKey(right)) ?? Number.MAX_SAFE_INTEGER);
|
|
3287
|
+
});
|
|
3288
|
+
}
|
|
3289
|
+
async function executeRun({ runState, request, runDir, config, evals, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, workspaceRoot, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals, onCaseFinished }) {
|
|
2904
3290
|
try {
|
|
2905
3291
|
const targetEvals = getTargetEvals(request);
|
|
2906
3292
|
emitEvent(runState, {
|
|
@@ -2909,14 +3295,16 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
2909
3295
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2910
3296
|
payload: runState.manifest
|
|
2911
3297
|
});
|
|
2912
|
-
const allCaseRows = [];
|
|
2913
3298
|
const evalErrors = [];
|
|
2914
3299
|
const queuedCases = [];
|
|
2915
3300
|
const preparedEvals = [];
|
|
2916
3301
|
const cacheMode = runState.manifest.cacheMode ?? "use";
|
|
2917
3302
|
const cacheEnabled = config.cache?.enabled !== false;
|
|
3303
|
+
const moduleIsolation = {
|
|
3304
|
+
key: runState.manifest.id,
|
|
3305
|
+
workspaceRoot
|
|
3306
|
+
};
|
|
2918
3307
|
for (const evalMeta of targetEvals) {
|
|
2919
|
-
if (runState.abortController.signal.aborted) break;
|
|
2920
3308
|
const evalFilePath = evalMeta.sourceFilePath;
|
|
2921
3309
|
let codeFingerprint = "";
|
|
2922
3310
|
try {
|
|
@@ -2928,7 +3316,9 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
2928
3316
|
else delete runState.manifest.evalSourceFingerprints[evalMeta.id];
|
|
2929
3317
|
try {
|
|
2930
3318
|
const registry = getEvalRegistry();
|
|
2931
|
-
await
|
|
3319
|
+
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
3320
|
+
await loadEvalModule(evalFilePath, codeFingerprint);
|
|
3321
|
+
});
|
|
2932
3322
|
const entry = registry.get(evalMeta.id);
|
|
2933
3323
|
if (!entry) {
|
|
2934
3324
|
evalErrors.push({
|
|
@@ -2937,74 +3327,87 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
2937
3327
|
});
|
|
2938
3328
|
continue;
|
|
2939
3329
|
}
|
|
2940
|
-
await
|
|
2941
|
-
|
|
2942
|
-
|
|
2943
|
-
|
|
2944
|
-
|
|
2945
|
-
|
|
2946
|
-
|
|
2947
|
-
|
|
2948
|
-
|
|
2949
|
-
|
|
2950
|
-
|
|
2951
|
-
|
|
2952
|
-
|
|
2953
|
-
|
|
2954
|
-
|
|
2955
|
-
|
|
2956
|
-
|
|
2957
|
-
|
|
2958
|
-
|
|
2959
|
-
|
|
2960
|
-
|
|
2961
|
-
|
|
2962
|
-
|
|
2963
|
-
const
|
|
2964
|
-
|
|
2965
|
-
|
|
2966
|
-
|
|
2967
|
-
|
|
2968
|
-
|
|
2969
|
-
|
|
2970
|
-
|
|
2971
|
-
|
|
2972
|
-
|
|
2973
|
-
|
|
2974
|
-
|
|
2975
|
-
|
|
2976
|
-
|
|
2977
|
-
trial,
|
|
2978
|
-
signal,
|
|
2979
|
-
startTime,
|
|
2980
|
-
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
2981
|
-
cacheMode,
|
|
2982
|
-
codeFingerprint,
|
|
2983
|
-
artifactDir: join(runDir, "artifacts"),
|
|
2984
|
-
runId: runState.manifest.id
|
|
2985
|
-
});
|
|
2986
|
-
return {
|
|
2987
|
-
caseDetail,
|
|
2988
|
-
caseRow: {
|
|
2989
|
-
caseId: evalCase.id,
|
|
3330
|
+
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
3331
|
+
await entry.use(async (evalDef) => {
|
|
3332
|
+
const cases = filterEvalCases(resolveRunnableEvalCases({
|
|
3333
|
+
cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
|
|
3334
|
+
evalId: evalMeta.id
|
|
3335
|
+
}), request.target.evalIds, request.target.caseIds, evalMeta.id);
|
|
3336
|
+
runState.summary.totalCases += cases.length;
|
|
3337
|
+
const accumulatedColumns = /* @__PURE__ */ new Map();
|
|
3338
|
+
const evalCaseRows = [];
|
|
3339
|
+
const preparedCases = [];
|
|
3340
|
+
const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
|
|
3341
|
+
const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
|
|
3342
|
+
const preparedEval = {
|
|
3343
|
+
evalMeta,
|
|
3344
|
+
accumulatedColumns,
|
|
3345
|
+
evalCaseRows,
|
|
3346
|
+
preparedCases,
|
|
3347
|
+
scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
|
|
3348
|
+
mergeColumns: (columns) => {
|
|
3349
|
+
mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
|
|
3350
|
+
}
|
|
3351
|
+
};
|
|
3352
|
+
preparedEvals.push(preparedEval);
|
|
3353
|
+
for (const evalCase of cases) {
|
|
3354
|
+
const trialResults = [];
|
|
3355
|
+
const preparedCase = {
|
|
3356
|
+
caseId: evalCase.id,
|
|
3357
|
+
trialResults,
|
|
3358
|
+
finalized: false
|
|
3359
|
+
};
|
|
3360
|
+
preparedCases.push(preparedCase);
|
|
3361
|
+
for (let trial = 0; trial < request.trials; trial++) {
|
|
3362
|
+
const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
|
|
3363
|
+
queuedCases.push({
|
|
3364
|
+
execute: async ({ startTime, globalTraceDisplay }) => {
|
|
3365
|
+
const { caseDetail, caseRowUpdate } = await runCase({
|
|
3366
|
+
evalDef,
|
|
2990
3367
|
evalId: evalMeta.id,
|
|
2991
|
-
|
|
2992
|
-
|
|
2993
|
-
|
|
2994
|
-
|
|
2995
|
-
|
|
2996
|
-
|
|
2997
|
-
|
|
2998
|
-
|
|
2999
|
-
|
|
3000
|
-
|
|
3001
|
-
|
|
3002
|
-
|
|
3003
|
-
|
|
3004
|
-
|
|
3005
|
-
|
|
3368
|
+
evalCase,
|
|
3369
|
+
globalTraceDisplay,
|
|
3370
|
+
trial,
|
|
3371
|
+
startTime,
|
|
3372
|
+
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
3373
|
+
cacheMode,
|
|
3374
|
+
codeFingerprint,
|
|
3375
|
+
moduleIsolation,
|
|
3376
|
+
artifactDir: join(runDir, "artifacts"),
|
|
3377
|
+
runId: runState.manifest.id
|
|
3378
|
+
});
|
|
3379
|
+
return {
|
|
3380
|
+
caseDetail,
|
|
3381
|
+
caseRow: {
|
|
3382
|
+
caseId: evalCase.id,
|
|
3383
|
+
evalId: evalMeta.id,
|
|
3384
|
+
status: caseRowUpdate.status ?? "pending",
|
|
3385
|
+
latencyMs: caseRowUpdate.latencyMs ?? null,
|
|
3386
|
+
columns: caseRowUpdate.columns ?? {},
|
|
3387
|
+
trial
|
|
3388
|
+
}
|
|
3389
|
+
};
|
|
3390
|
+
},
|
|
3391
|
+
onComplete: async ({ caseDetail, caseRow }) => {
|
|
3392
|
+
trialResults.push({
|
|
3393
|
+
caseDetail,
|
|
3394
|
+
caseRow,
|
|
3395
|
+
bufferedCacheStore
|
|
3396
|
+
});
|
|
3397
|
+
if (trialResults.length !== request.trials) return;
|
|
3398
|
+
await finalizePreparedCase({
|
|
3399
|
+
runState,
|
|
3400
|
+
runDir,
|
|
3401
|
+
preparedEval,
|
|
3402
|
+
preparedCase,
|
|
3403
|
+
onCaseFinished,
|
|
3404
|
+
emitEvent
|
|
3405
|
+
});
|
|
3406
|
+
}
|
|
3407
|
+
});
|
|
3408
|
+
}
|
|
3006
3409
|
}
|
|
3007
|
-
}
|
|
3410
|
+
});
|
|
3008
3411
|
});
|
|
3009
3412
|
} catch (error) {
|
|
3010
3413
|
console.error(`Error running eval ${evalMeta.id}:`, error);
|
|
@@ -3022,37 +3425,19 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3022
3425
|
}
|
|
3023
3426
|
}
|
|
3024
3427
|
await executeQueuedCases({
|
|
3025
|
-
runState,
|
|
3026
3428
|
queuedCases,
|
|
3027
3429
|
concurrency: getConfiguredConcurrency(),
|
|
3028
3430
|
globalTraceDisplay: config.traceDisplay
|
|
3029
3431
|
});
|
|
3030
3432
|
for (const preparedEval of preparedEvals) {
|
|
3031
|
-
for (const preparedCase of preparedEval.preparedCases) {
|
|
3032
|
-
|
|
3033
|
-
|
|
3034
|
-
|
|
3035
|
-
|
|
3036
|
-
|
|
3037
|
-
|
|
3038
|
-
|
|
3039
|
-
runState.cases.push(winningTrial.caseRow);
|
|
3040
|
-
runState.caseDetails.set(preparedCase.caseId, winningTrial.caseDetail);
|
|
3041
|
-
preparedEval.mergeColumns(winningTrial.caseDetail.columns);
|
|
3042
|
-
if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
|
|
3043
|
-
else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
|
|
3044
|
-
else runState.summary.failedCases++;
|
|
3045
|
-
await writeFile(join(runDir, "traces", `${preparedCase.caseId}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
|
|
3046
|
-
await persistCaseDetail(runDir, winningTrial.caseDetail);
|
|
3047
|
-
emitEvent(runState, {
|
|
3048
|
-
type: "case.finished",
|
|
3049
|
-
runId: runState.manifest.id,
|
|
3050
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3051
|
-
payload: winningTrial.caseRow
|
|
3052
|
-
});
|
|
3053
|
-
preparedEval.evalCaseRows.push(winningTrial.caseRow);
|
|
3054
|
-
allCaseRows.push(winningTrial.caseRow);
|
|
3055
|
-
}
|
|
3433
|
+
for (const preparedCase of preparedEval.preparedCases) await finalizePreparedCase({
|
|
3434
|
+
runState,
|
|
3435
|
+
runDir,
|
|
3436
|
+
preparedEval,
|
|
3437
|
+
preparedCase,
|
|
3438
|
+
onCaseFinished,
|
|
3439
|
+
emitEvent
|
|
3440
|
+
});
|
|
3056
3441
|
preparedEval.evalMeta.columnDefs = [...preparedEval.accumulatedColumns.values()];
|
|
3057
3442
|
lastRunStatusMap.set(preparedEval.evalMeta.id, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
|
|
3058
3443
|
const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.id) ?? null;
|
|
@@ -3063,9 +3448,11 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3063
3448
|
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.id] ?? null
|
|
3064
3449
|
});
|
|
3065
3450
|
}
|
|
3451
|
+
sortCaseRowsByPreparedOrder(runState.cases, preparedEvals);
|
|
3452
|
+
for (const preparedEval of preparedEvals) sortCaseRowsByPreparedOrder(preparedEval.evalCaseRows, preparedEvals);
|
|
3066
3453
|
const endTime = /* @__PURE__ */ new Date();
|
|
3067
3454
|
runState.summary.totalDurationMs = endTime.getTime() - new Date(runState.manifest.startedAt).getTime();
|
|
3068
|
-
const finalStatus =
|
|
3455
|
+
const finalStatus = evalErrors.length > 0 ? "error" : "completed";
|
|
3069
3456
|
runState.summary.status = finalStatus;
|
|
3070
3457
|
runState.manifest.status = finalStatus;
|
|
3071
3458
|
const completedRunAt = endTime.toISOString();
|
|
@@ -3087,6 +3474,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3087
3474
|
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalId] ?? null
|
|
3088
3475
|
});
|
|
3089
3476
|
}
|
|
3477
|
+
await persistRunState(runState);
|
|
3090
3478
|
emitEvent(runState, {
|
|
3091
3479
|
type: "run.summary",
|
|
3092
3480
|
runId: runState.manifest.id,
|
|
@@ -3105,7 +3493,6 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3105
3493
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3106
3494
|
payload: runState.summary
|
|
3107
3495
|
});
|
|
3108
|
-
await persistRunState(runState);
|
|
3109
3496
|
emitDiscoveryEvent();
|
|
3110
3497
|
} catch (error) {
|
|
3111
3498
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -3113,13 +3500,13 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3113
3500
|
runState.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
3114
3501
|
runState.summary.status = "error";
|
|
3115
3502
|
runState.summary.errorMessage = message;
|
|
3503
|
+
await persistRunState(runState);
|
|
3116
3504
|
emitEvent(runState, {
|
|
3117
3505
|
type: "run.error",
|
|
3118
3506
|
runId: runState.manifest.id,
|
|
3119
3507
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3120
3508
|
payload: { message }
|
|
3121
3509
|
});
|
|
3122
|
-
await persistRunState(runState);
|
|
3123
3510
|
emitDiscoveryEvent();
|
|
3124
3511
|
}
|
|
3125
3512
|
}
|
|
@@ -3127,872 +3514,4 @@ function toLastRunStatus(status) {
|
|
|
3127
3514
|
return status === "pending" ? null : status;
|
|
3128
3515
|
}
|
|
3129
3516
|
//#endregion
|
|
3130
|
-
|
|
3131
|
-
const globMagicCharacters = new Set([
|
|
3132
|
-
"*",
|
|
3133
|
-
"?",
|
|
3134
|
-
"[",
|
|
3135
|
-
"]",
|
|
3136
|
-
"{",
|
|
3137
|
-
"}",
|
|
3138
|
-
"(",
|
|
3139
|
-
")",
|
|
3140
|
-
"!",
|
|
3141
|
-
"+",
|
|
3142
|
-
"@"
|
|
3143
|
-
]);
|
|
3144
|
-
function hasGlobMagic(value) {
|
|
3145
|
-
for (const char of value) if (globMagicCharacters.has(char)) return true;
|
|
3146
|
-
return false;
|
|
3147
|
-
}
|
|
3148
|
-
function getWatchRootForIncludePattern(params) {
|
|
3149
|
-
const segments = params.pattern.replaceAll("\\", "/").split("/").filter((part) => part !== "");
|
|
3150
|
-
const firstGlobSegmentIndex = segments.findIndex(hasGlobMagic);
|
|
3151
|
-
if (firstGlobSegmentIndex === -1) return dirname(resolve(params.workspaceRoot, params.pattern));
|
|
3152
|
-
if (firstGlobSegmentIndex === 0) return params.workspaceRoot;
|
|
3153
|
-
return resolve(params.workspaceRoot, segments.slice(0, firstGlobSegmentIndex).join("/"));
|
|
3154
|
-
}
|
|
3155
|
-
function getWatchRootsForIncludePatterns(params) {
|
|
3156
|
-
const roots = /* @__PURE__ */ new Set();
|
|
3157
|
-
for (const pattern of params.patterns) roots.add(getWatchRootForIncludePattern({
|
|
3158
|
-
pattern,
|
|
3159
|
-
workspaceRoot: params.workspaceRoot
|
|
3160
|
-
}));
|
|
3161
|
-
if (roots.size === 0) return [params.workspaceRoot];
|
|
3162
|
-
return [...roots];
|
|
3163
|
-
}
|
|
3164
|
-
/** Create an in-memory eval runner bound to the current workspace config. */
|
|
3165
|
-
function createRunner({ watchForChanges = true } = {}) {
|
|
3166
|
-
let config;
|
|
3167
|
-
let workspaceRoot;
|
|
3168
|
-
let localStateDir;
|
|
3169
|
-
let cacheStore;
|
|
3170
|
-
const evals = /* @__PURE__ */ new Map();
|
|
3171
|
-
const runs = /* @__PURE__ */ new Map();
|
|
3172
|
-
const lastRunStatusMap = /* @__PURE__ */ new Map();
|
|
3173
|
-
const latestRunInfoMap = /* @__PURE__ */ new Map();
|
|
3174
|
-
const discoveryListeners = /* @__PURE__ */ new Set();
|
|
3175
|
-
let nextShortIdNum = 0;
|
|
3176
|
-
let discoveryWatcher;
|
|
3177
|
-
let discoveryRefreshTimer;
|
|
3178
|
-
function toWorkspaceRelativePath(filePath) {
|
|
3179
|
-
return relative(workspaceRoot, filePath).replaceAll("\\", "/");
|
|
3180
|
-
}
|
|
3181
|
-
function getSortedEvalMetas() {
|
|
3182
|
-
return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
|
|
3183
|
-
}
|
|
3184
|
-
function getSourceFingerprint(source) {
|
|
3185
|
-
return createHash("sha256").update(source).digest("hex");
|
|
3186
|
-
}
|
|
3187
|
-
function getConfiguredConcurrency() {
|
|
3188
|
-
const configuredConcurrency = config.concurrency;
|
|
3189
|
-
if (typeof configuredConcurrency !== "number" || !Number.isFinite(configuredConcurrency)) return 1;
|
|
3190
|
-
return Math.max(1, Math.floor(configuredConcurrency));
|
|
3191
|
-
}
|
|
3192
|
-
const runner = {
|
|
3193
|
-
async init() {
|
|
3194
|
-
config = await loadConfig();
|
|
3195
|
-
workspaceRoot = config.workspaceRoot ?? process.cwd();
|
|
3196
|
-
localStateDir = resolve(workspaceRoot, ".agent-evals");
|
|
3197
|
-
await mkdir(localStateDir, { recursive: true });
|
|
3198
|
-
await mkdir(join(localStateDir, "runs"), { recursive: true });
|
|
3199
|
-
cacheStore = createFsCacheStore({
|
|
3200
|
-
workspaceRoot,
|
|
3201
|
-
dir: config.cache?.dir,
|
|
3202
|
-
maxEntriesPerEval: config.cache?.maxEntriesPerEval
|
|
3203
|
-
});
|
|
3204
|
-
await loadPersistedRuns();
|
|
3205
|
-
await runner.refreshDiscovery();
|
|
3206
|
-
if (watchForChanges) await setupWatcher();
|
|
3207
|
-
},
|
|
3208
|
-
async listCache() {
|
|
3209
|
-
return cacheStore.list();
|
|
3210
|
-
},
|
|
3211
|
-
async clearCache(filter) {
|
|
3212
|
-
await cacheStore.clear(filter);
|
|
3213
|
-
},
|
|
3214
|
-
async recomputeStatusesForEval(evalId) {
|
|
3215
|
-
const evalMeta = evals.get(evalId);
|
|
3216
|
-
if (!evalMeta) return { updatedRuns: 0 };
|
|
3217
|
-
const registry = getEvalRegistry();
|
|
3218
|
-
await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? void 0);
|
|
3219
|
-
const entry = registry.get(evalId);
|
|
3220
|
-
if (!entry) return { updatedRuns: 0 };
|
|
3221
|
-
const scoreThresholds = /* @__PURE__ */ new Map();
|
|
3222
|
-
entry.use((evalDef) => {
|
|
3223
|
-
for (const [key, def] of Object.entries(evalDef.scores ?? {})) {
|
|
3224
|
-
const threshold = normalizeScoreDef(def).passThreshold;
|
|
3225
|
-
if (threshold !== void 0) scoreThresholds.set(key, threshold);
|
|
3226
|
-
}
|
|
3227
|
-
for (const [key, def] of Object.entries(evalDef.manualScores ?? {})) if (def.passThreshold !== void 0) scoreThresholds.set(key, def.passThreshold);
|
|
3228
|
-
});
|
|
3229
|
-
const updatedRuns = await recomputeEvalStatusesInRuns({
|
|
3230
|
-
runs: runs.values(),
|
|
3231
|
-
evalId,
|
|
3232
|
-
evalExists: evals.has(evalId),
|
|
3233
|
-
scoreThresholds,
|
|
3234
|
-
persistCaseDetail
|
|
3235
|
-
});
|
|
3236
|
-
emitDiscoveryEvent();
|
|
3237
|
-
return { updatedRuns };
|
|
3238
|
-
},
|
|
3239
|
-
async cleanRunsForEval(evalId) {
|
|
3240
|
-
let deletedRuns = 0;
|
|
3241
|
-
for (const [runId, run] of [...runs]) {
|
|
3242
|
-
if (!runTouchesEval({
|
|
3243
|
-
target: run.manifest.target,
|
|
3244
|
-
caseRows: run.cases,
|
|
3245
|
-
evalId,
|
|
3246
|
-
evalExists: evals.has(evalId)
|
|
3247
|
-
})) continue;
|
|
3248
|
-
if (run.manifest.status === "running") continue;
|
|
3249
|
-
runs.delete(runId);
|
|
3250
|
-
await rm(run.runDir, {
|
|
3251
|
-
recursive: true,
|
|
3252
|
-
force: true
|
|
3253
|
-
});
|
|
3254
|
-
deletedRuns += 1;
|
|
3255
|
-
}
|
|
3256
|
-
emitDiscoveryEvent();
|
|
3257
|
-
return { deletedRuns };
|
|
3258
|
-
},
|
|
3259
|
-
async updateManualScore({ runId, caseId, scoreKey, value }) {
|
|
3260
|
-
const run = runs.get(runId);
|
|
3261
|
-
if (!run) return {
|
|
3262
|
-
updated: false,
|
|
3263
|
-
reason: "Run not found"
|
|
3264
|
-
};
|
|
3265
|
-
if (run.manifest.status === "running") return {
|
|
3266
|
-
updated: false,
|
|
3267
|
-
reason: "Run is still running"
|
|
3268
|
-
};
|
|
3269
|
-
const caseRow = run.cases.find((row) => row.caseId === caseId);
|
|
3270
|
-
if (!caseRow) return {
|
|
3271
|
-
updated: false,
|
|
3272
|
-
reason: "Case not found"
|
|
3273
|
-
};
|
|
3274
|
-
const evalMeta = evals.get(caseRow.evalId);
|
|
3275
|
-
if (!evalMeta) return {
|
|
3276
|
-
updated: false,
|
|
3277
|
-
reason: "Eval not found"
|
|
3278
|
-
};
|
|
3279
|
-
if (evalMeta.columnDefs.find((def) => def.key === scoreKey)?.isManualScore !== true) return {
|
|
3280
|
-
updated: false,
|
|
3281
|
-
reason: "Manual score not found"
|
|
3282
|
-
};
|
|
3283
|
-
const caseDetail = run.caseDetails.get(caseId);
|
|
3284
|
-
if (!caseDetail) return {
|
|
3285
|
-
updated: false,
|
|
3286
|
-
reason: "Case detail not found"
|
|
3287
|
-
};
|
|
3288
|
-
caseRow.columns[scoreKey] = value;
|
|
3289
|
-
caseDetail.columns[scoreKey] = value;
|
|
3290
|
-
const scoreThresholds = /* @__PURE__ */ new Map();
|
|
3291
|
-
for (const def of evalMeta.columnDefs) {
|
|
3292
|
-
if (def.isScore !== true || def.passThreshold === void 0) continue;
|
|
3293
|
-
scoreThresholds.set(def.key, def.passThreshold);
|
|
3294
|
-
}
|
|
3295
|
-
const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds);
|
|
3296
|
-
caseRow.status = nextStatus;
|
|
3297
|
-
caseDetail.status = nextStatus;
|
|
3298
|
-
const derivedSummary = deriveScopedSummaryFromCases({ caseRows: run.cases });
|
|
3299
|
-
run.summary.totalCases = derivedSummary.totalCases;
|
|
3300
|
-
run.summary.passedCases = derivedSummary.passedCases;
|
|
3301
|
-
run.summary.failedCases = derivedSummary.failedCases;
|
|
3302
|
-
run.summary.errorCases = derivedSummary.errorCases;
|
|
3303
|
-
run.summary.cancelledCases = derivedSummary.cancelledCases;
|
|
3304
|
-
run.summary.totalDurationMs = derivedSummary.totalDurationMs;
|
|
3305
|
-
await persistCaseDetail(run.runDir, caseDetail);
|
|
3306
|
-
await persistRunState(run);
|
|
3307
|
-
emitDiscoveryEvent();
|
|
3308
|
-
return {
|
|
3309
|
-
updated: true,
|
|
3310
|
-
run: {
|
|
3311
|
-
manifest: run.manifest,
|
|
3312
|
-
summary: run.summary,
|
|
3313
|
-
cases: run.cases
|
|
3314
|
-
},
|
|
3315
|
-
caseDetail
|
|
3316
|
-
};
|
|
3317
|
-
},
|
|
3318
|
-
async deleteRun(runId) {
|
|
3319
|
-
const run = runs.get(runId);
|
|
3320
|
-
if (!run) return { deleted: false };
|
|
3321
|
-
if (run.manifest.status === "running") return { deleted: false };
|
|
3322
|
-
runs.delete(runId);
|
|
3323
|
-
await rm(run.runDir, {
|
|
3324
|
-
recursive: true,
|
|
3325
|
-
force: true
|
|
3326
|
-
});
|
|
3327
|
-
emitDiscoveryEvent();
|
|
3328
|
-
return { deleted: true };
|
|
3329
|
-
},
|
|
3330
|
-
getEvals() {
|
|
3331
|
-
const gitState = readGitWorktreeState(workspaceRoot);
|
|
3332
|
-
const result = [];
|
|
3333
|
-
for (const meta of getSortedEvalMetas()) result.push(buildEvalSummary({
|
|
3334
|
-
meta,
|
|
3335
|
-
config,
|
|
3336
|
-
gitState,
|
|
3337
|
-
latestRun: latestRunInfoMap.get(meta.id),
|
|
3338
|
-
lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
|
|
3339
|
-
}));
|
|
3340
|
-
return result;
|
|
3341
|
-
},
|
|
3342
|
-
getEval(id) {
|
|
3343
|
-
const meta = evals.get(id);
|
|
3344
|
-
if (!meta) return void 0;
|
|
3345
|
-
return buildEvalSummary({
|
|
3346
|
-
meta,
|
|
3347
|
-
config,
|
|
3348
|
-
gitState: readGitWorktreeState(workspaceRoot),
|
|
3349
|
-
latestRun: latestRunInfoMap.get(meta.id),
|
|
3350
|
-
lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
|
|
3351
|
-
});
|
|
3352
|
-
},
|
|
3353
|
-
async refreshDiscovery() {
|
|
3354
|
-
const patterns = config.include;
|
|
3355
|
-
const discovered = [];
|
|
3356
|
-
for (const pattern of patterns) {
|
|
3357
|
-
const files = await glob(pattern, {
|
|
3358
|
-
cwd: workspaceRoot,
|
|
3359
|
-
absolute: true
|
|
3360
|
-
});
|
|
3361
|
-
discovered.push(...files);
|
|
3362
|
-
}
|
|
3363
|
-
evals.clear();
|
|
3364
|
-
for (const filePath of discovered) try {
|
|
3365
|
-
const content = await readFile(filePath, "utf-8");
|
|
3366
|
-
const discoveredMetas = parseEvalMetas(filePath, content);
|
|
3367
|
-
const sourceFingerprint = getSourceFingerprint(content);
|
|
3368
|
-
const registry = getEvalRegistry();
|
|
3369
|
-
try {
|
|
3370
|
-
await loadEvalModule(filePath, sourceFingerprint);
|
|
3371
|
-
} catch {}
|
|
3372
|
-
for (const meta of discoveredMetas) {
|
|
3373
|
-
const discoveredEntry = registry.get(meta.id);
|
|
3374
|
-
const title = meta.title;
|
|
3375
|
-
let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
|
|
3376
|
-
let stats;
|
|
3377
|
-
let charts;
|
|
3378
|
-
discoveredEntry?.use((evalDef) => {
|
|
3379
|
-
columnDefs = buildDeclaredColumnDefs(evalDef.columns, evalDef.scores, evalDef.manualScores);
|
|
3380
|
-
stats = evalDef.stats;
|
|
3381
|
-
const validated = validateCharts({
|
|
3382
|
-
charts: evalDef.charts,
|
|
3383
|
-
columnDefs,
|
|
3384
|
-
evalId: meta.id
|
|
3385
|
-
});
|
|
3386
|
-
for (const warning of validated.warnings) console.warn(warning);
|
|
3387
|
-
charts = validated.charts;
|
|
3388
|
-
});
|
|
3389
|
-
evals.set(meta.id, {
|
|
3390
|
-
id: meta.id,
|
|
3391
|
-
title,
|
|
3392
|
-
filePath: toWorkspaceRelativePath(meta.filePath),
|
|
3393
|
-
sourceFilePath: meta.filePath,
|
|
3394
|
-
sourceFingerprint,
|
|
3395
|
-
columnDefs,
|
|
3396
|
-
caseCount: null,
|
|
3397
|
-
stats,
|
|
3398
|
-
charts
|
|
3399
|
-
});
|
|
3400
|
-
}
|
|
3401
|
-
} catch {}
|
|
3402
|
-
emitDiscoveryEvent();
|
|
3403
|
-
},
|
|
3404
|
-
async startRun(request) {
|
|
3405
|
-
const runId = generateRunId();
|
|
3406
|
-
const shortId = `r${String(nextShortIdNum++)}`;
|
|
3407
|
-
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
3408
|
-
const cacheMode = request.cache?.mode ?? "use";
|
|
3409
|
-
const runDir = join(localStateDir, "runs", runId);
|
|
3410
|
-
const manifest = {
|
|
3411
|
-
id: runId,
|
|
3412
|
-
shortId,
|
|
3413
|
-
status: "running",
|
|
3414
|
-
startedAt: now,
|
|
3415
|
-
endedAt: null,
|
|
3416
|
-
commitSha: readGitWorktreeState(workspaceRoot).commitSha,
|
|
3417
|
-
evalSourceFingerprints: {},
|
|
3418
|
-
target: request.target,
|
|
3419
|
-
trials: request.trials,
|
|
3420
|
-
trialSelection: config.trialSelection ?? "lowestScore",
|
|
3421
|
-
cacheMode
|
|
3422
|
-
};
|
|
3423
|
-
const summary = {
|
|
3424
|
-
runId,
|
|
3425
|
-
status: "running",
|
|
3426
|
-
totalCases: 0,
|
|
3427
|
-
passedCases: 0,
|
|
3428
|
-
failedCases: 0,
|
|
3429
|
-
errorCases: 0,
|
|
3430
|
-
cancelledCases: 0,
|
|
3431
|
-
totalDurationMs: null,
|
|
3432
|
-
errorMessage: null
|
|
3433
|
-
};
|
|
3434
|
-
const abortController = new AbortController();
|
|
3435
|
-
const runState = {
|
|
3436
|
-
runDir,
|
|
3437
|
-
manifest,
|
|
3438
|
-
summary,
|
|
3439
|
-
cases: [],
|
|
3440
|
-
caseDetails: /* @__PURE__ */ new Map(),
|
|
3441
|
-
listeners: /* @__PURE__ */ new Set(),
|
|
3442
|
-
abortController
|
|
3443
|
-
};
|
|
3444
|
-
runs.set(runId, runState);
|
|
3445
|
-
setLatestRunInfoMap({
|
|
3446
|
-
latestRunInfoMap,
|
|
3447
|
-
evalIds: getTargetEvalIds({
|
|
3448
|
-
request,
|
|
3449
|
-
sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
|
|
3450
|
-
knownEvalIds: new Set(evals.keys())
|
|
3451
|
-
}),
|
|
3452
|
-
info: {
|
|
3453
|
-
status: "running",
|
|
3454
|
-
startedAt: now,
|
|
3455
|
-
commitSha: manifest.commitSha ?? null,
|
|
3456
|
-
evalSourceFingerprint: null
|
|
3457
|
-
}
|
|
3458
|
-
});
|
|
3459
|
-
await mkdir(runDir, { recursive: true });
|
|
3460
|
-
await mkdir(join(runDir, "traces"), { recursive: true });
|
|
3461
|
-
await mkdir(join(runDir, "artifacts"), { recursive: true });
|
|
3462
|
-
await mkdir(join(runDir, "case-details"), { recursive: true });
|
|
3463
|
-
await writeFile(join(runDir, "run.json"), JSON.stringify(manifest, null, 2));
|
|
3464
|
-
executeRun({
|
|
3465
|
-
runState,
|
|
3466
|
-
request,
|
|
3467
|
-
runDir,
|
|
3468
|
-
config,
|
|
3469
|
-
evals,
|
|
3470
|
-
cacheStore,
|
|
3471
|
-
lastRunStatusMap,
|
|
3472
|
-
latestRunInfoMap,
|
|
3473
|
-
emitEvent,
|
|
3474
|
-
emitDiscoveryEvent,
|
|
3475
|
-
getSourceFingerprint,
|
|
3476
|
-
getConfiguredConcurrency,
|
|
3477
|
-
getSortedEvalMetas,
|
|
3478
|
-
getTargetEvals
|
|
3479
|
-
});
|
|
3480
|
-
return {
|
|
3481
|
-
manifest,
|
|
3482
|
-
summary,
|
|
3483
|
-
cases: []
|
|
3484
|
-
};
|
|
3485
|
-
},
|
|
3486
|
-
getRuns() {
|
|
3487
|
-
return [...runs.values()].map((r) => r.manifest);
|
|
3488
|
-
},
|
|
3489
|
-
getRun(id) {
|
|
3490
|
-
const run = runs.get(id);
|
|
3491
|
-
if (!run) return void 0;
|
|
3492
|
-
return {
|
|
3493
|
-
manifest: run.manifest,
|
|
3494
|
-
summary: run.summary,
|
|
3495
|
-
cases: run.cases
|
|
3496
|
-
};
|
|
3497
|
-
},
|
|
3498
|
-
cancelRun(id) {
|
|
3499
|
-
const run = runs.get(id);
|
|
3500
|
-
if (!run) return;
|
|
3501
|
-
run.abortController.abort();
|
|
3502
|
-
run.manifest.status = "cancelled";
|
|
3503
|
-
run.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
3504
|
-
run.summary.status = "cancelled";
|
|
3505
|
-
emitEvent(run, {
|
|
3506
|
-
type: "run.cancelled",
|
|
3507
|
-
runId: id,
|
|
3508
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3509
|
-
payload: run.summary
|
|
3510
|
-
});
|
|
3511
|
-
},
|
|
3512
|
-
getCaseDetail(runId, caseId) {
|
|
3513
|
-
const run = runs.get(runId);
|
|
3514
|
-
if (!run) return void 0;
|
|
3515
|
-
return run.caseDetails.get(caseId);
|
|
3516
|
-
},
|
|
3517
|
-
subscribe(runId, listener) {
|
|
3518
|
-
const run = runs.get(runId);
|
|
3519
|
-
if (!run) return () => {};
|
|
3520
|
-
run.listeners.add(listener);
|
|
3521
|
-
return () => {
|
|
3522
|
-
run.listeners.delete(listener);
|
|
3523
|
-
};
|
|
3524
|
-
},
|
|
3525
|
-
subscribeDiscovery(listener) {
|
|
3526
|
-
discoveryListeners.add(listener);
|
|
3527
|
-
return () => {
|
|
3528
|
-
discoveryListeners.delete(listener);
|
|
3529
|
-
};
|
|
3530
|
-
},
|
|
3531
|
-
async close() {
|
|
3532
|
-
if (discoveryRefreshTimer !== void 0) {
|
|
3533
|
-
clearTimeout(discoveryRefreshTimer);
|
|
3534
|
-
discoveryRefreshTimer = void 0;
|
|
3535
|
-
}
|
|
3536
|
-
const watcher = discoveryWatcher;
|
|
3537
|
-
if (watcher === void 0) return;
|
|
3538
|
-
discoveryWatcher = void 0;
|
|
3539
|
-
await watcher.close();
|
|
3540
|
-
},
|
|
3541
|
-
getWorkspaceRoot() {
|
|
3542
|
-
return workspaceRoot;
|
|
3543
|
-
},
|
|
3544
|
-
getArtifactPath(artifactId_) {
|
|
3545
|
-
return resolveArtifactPath(join(localStateDir, "runs"), artifactId_);
|
|
3546
|
-
}
|
|
3547
|
-
};
|
|
3548
|
-
async function setupWatcher() {
|
|
3549
|
-
const watcher = watch(getWatchRootsForIncludePatterns({
|
|
3550
|
-
patterns: config.include,
|
|
3551
|
-
workspaceRoot
|
|
3552
|
-
}), {
|
|
3553
|
-
ignoreInitial: true,
|
|
3554
|
-
persistent: true
|
|
3555
|
-
});
|
|
3556
|
-
discoveryWatcher = watcher;
|
|
3557
|
-
const scheduleRefresh = () => {
|
|
3558
|
-
if (discoveryRefreshTimer !== void 0) clearTimeout(discoveryRefreshTimer);
|
|
3559
|
-
discoveryRefreshTimer = setTimeout(() => {
|
|
3560
|
-
discoveryRefreshTimer = void 0;
|
|
3561
|
-
runner.refreshDiscovery();
|
|
3562
|
-
}, 50);
|
|
3563
|
-
};
|
|
3564
|
-
watcher.on("change", scheduleRefresh);
|
|
3565
|
-
watcher.on("add", scheduleRefresh);
|
|
3566
|
-
watcher.on("unlink", scheduleRefresh);
|
|
3567
|
-
watcher.on("addDir", scheduleRefresh);
|
|
3568
|
-
watcher.on("unlinkDir", scheduleRefresh);
|
|
3569
|
-
await new Promise((ready) => {
|
|
3570
|
-
watcher.once("ready", ready);
|
|
3571
|
-
});
|
|
3572
|
-
}
|
|
3573
|
-
function emitDiscoveryEvent() {
|
|
3574
|
-
const lastRunStatuses = getLastRunStatuses({
|
|
3575
|
-
runs: runs.values(),
|
|
3576
|
-
knownEvals: evals.values()
|
|
3577
|
-
});
|
|
3578
|
-
const latestRunInfos = getLatestRunInfos({
|
|
3579
|
-
runs: runs.values(),
|
|
3580
|
-
knownEvals: evals.values()
|
|
3581
|
-
});
|
|
3582
|
-
lastRunStatusMap.clear();
|
|
3583
|
-
for (const [evalId, status] of lastRunStatuses) lastRunStatusMap.set(evalId, status);
|
|
3584
|
-
latestRunInfoMap.clear();
|
|
3585
|
-
for (const [evalId, info] of latestRunInfos) latestRunInfoMap.set(evalId, info);
|
|
3586
|
-
const event = {
|
|
3587
|
-
type: "discovery.updated",
|
|
3588
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3589
|
-
payload: runner.getEvals()
|
|
3590
|
-
};
|
|
3591
|
-
for (const listener of discoveryListeners) listener(event);
|
|
3592
|
-
}
|
|
3593
|
-
function getTargetEvals(request) {
|
|
3594
|
-
if (request.target.evalIds && request.target.evalIds.length > 0) return request.target.evalIds.map((id) => evals.get(id)).filter((e) => e !== void 0);
|
|
3595
|
-
return getSortedEvalMetas();
|
|
3596
|
-
}
|
|
3597
|
-
function emitEvent(runState, event) {
|
|
3598
|
-
for (const listener of runState.listeners) try {
|
|
3599
|
-
listener(event);
|
|
3600
|
-
} catch {}
|
|
3601
|
-
}
|
|
3602
|
-
async function loadPersistedRuns() {
|
|
3603
|
-
runs.clear();
|
|
3604
|
-
const persistedRuns = await loadPersistedRunSnapshots(localStateDir);
|
|
3605
|
-
nextShortIdNum = nextShortIdFromSnapshots(persistedRuns);
|
|
3606
|
-
for (const persistedRun of persistedRuns) runs.set(persistedRun.manifest.id, {
|
|
3607
|
-
...persistedRun,
|
|
3608
|
-
listeners: /* @__PURE__ */ new Set(),
|
|
3609
|
-
abortController: new AbortController()
|
|
3610
|
-
});
|
|
3611
|
-
}
|
|
3612
|
-
return runner;
|
|
3613
|
-
}
|
|
3614
|
-
//#endregion
|
|
3615
|
-
//#region src/cli.ts
|
|
3616
|
-
function parseArgs(argv) {
|
|
3617
|
-
const args = {
|
|
3618
|
-
command: "help",
|
|
3619
|
-
subcommand: void 0,
|
|
3620
|
-
showHelp: false,
|
|
3621
|
-
helpTopic: "global",
|
|
3622
|
-
unknownHelpTarget: void 0,
|
|
3623
|
-
evalIds: [],
|
|
3624
|
-
caseIds: [],
|
|
3625
|
-
trials: 1,
|
|
3626
|
-
json: false,
|
|
3627
|
-
port: 4100,
|
|
3628
|
-
cacheMode: "use",
|
|
3629
|
-
clearCache: false,
|
|
3630
|
-
all: false
|
|
3631
|
-
};
|
|
3632
|
-
const command = argv[0];
|
|
3633
|
-
if (command === "--help" || command === "-h") {
|
|
3634
|
-
args.showHelp = true;
|
|
3635
|
-
return args;
|
|
3636
|
-
}
|
|
3637
|
-
if (isCliCommand(command)) {
|
|
3638
|
-
args.command = command;
|
|
3639
|
-
args.helpTopic = command === "help" ? "global" : command;
|
|
3640
|
-
} else if (command !== void 0 && !command.startsWith("-")) args.unknownHelpTarget = command;
|
|
3641
|
-
let cursor = 1;
|
|
3642
|
-
if (args.command === "cache") {
|
|
3643
|
-
const sub = argv[cursor];
|
|
3644
|
-
if (sub === "list" || sub === "clear") {
|
|
3645
|
-
args.subcommand = sub;
|
|
3646
|
-
args.helpTopic = `cache ${sub}`;
|
|
3647
|
-
cursor++;
|
|
3648
|
-
} else if (sub !== void 0 && !sub.startsWith("-")) args.unknownHelpTarget = `cache ${sub}`;
|
|
3649
|
-
}
|
|
3650
|
-
for (let i = cursor; i < argv.length; i++) {
|
|
3651
|
-
const arg = argv[i];
|
|
3652
|
-
const next = argv[i + 1];
|
|
3653
|
-
if (arg === "--help" || arg === "-h") args.showHelp = true;
|
|
3654
|
-
else if (arg === "--eval" && next) {
|
|
3655
|
-
args.evalIds.push(...next.split(","));
|
|
3656
|
-
i++;
|
|
3657
|
-
} else if (arg === "--case" && next) {
|
|
3658
|
-
args.caseIds.push(...next.split(","));
|
|
3659
|
-
i++;
|
|
3660
|
-
} else if (arg === "--trials" && next) {
|
|
3661
|
-
args.trials = Number(next);
|
|
3662
|
-
i++;
|
|
3663
|
-
} else if (arg === "--json") args.json = true;
|
|
3664
|
-
else if (arg === "--port" && next) {
|
|
3665
|
-
args.port = Number(next);
|
|
3666
|
-
i++;
|
|
3667
|
-
} else if (arg === "--cache" && next) {
|
|
3668
|
-
if (next === "use" || next === "bypass" || next === "refresh") args.cacheMode = next;
|
|
3669
|
-
i++;
|
|
3670
|
-
} else if (arg === "--no-cache") args.cacheMode = "bypass";
|
|
3671
|
-
else if (arg === "--refresh-cache") args.cacheMode = "refresh";
|
|
3672
|
-
else if (arg === "--clear-cache") args.clearCache = true;
|
|
3673
|
-
else if (arg === "--all") args.all = true;
|
|
3674
|
-
}
|
|
3675
|
-
return args;
|
|
3676
|
-
}
|
|
3677
|
-
/**
|
|
3678
|
-
* Run the Agent Evals CLI against the current workspace.
|
|
3679
|
-
*
|
|
3680
|
-
* @param argv Raw command-line arguments excluding the executable name.
|
|
3681
|
-
*/
|
|
3682
|
-
async function runCli(argv) {
|
|
3683
|
-
const args = parseArgs(argv);
|
|
3684
|
-
if (args.showHelp) {
|
|
3685
|
-
if (args.unknownHelpTarget !== void 0) {
|
|
3686
|
-
console.error(`No help found for "${args.unknownHelpTarget}".`);
|
|
3687
|
-
process.exit(1);
|
|
3688
|
-
return;
|
|
3689
|
-
}
|
|
3690
|
-
printHelp(args.helpTopic);
|
|
3691
|
-
return;
|
|
3692
|
-
}
|
|
3693
|
-
switch (args.command) {
|
|
3694
|
-
case "app":
|
|
3695
|
-
await commandApp(args);
|
|
3696
|
-
break;
|
|
3697
|
-
case "list":
|
|
3698
|
-
await commandList(args);
|
|
3699
|
-
break;
|
|
3700
|
-
case "run":
|
|
3701
|
-
await commandRun(args);
|
|
3702
|
-
break;
|
|
3703
|
-
case "cache":
|
|
3704
|
-
await commandCache(args);
|
|
3705
|
-
break;
|
|
3706
|
-
default:
|
|
3707
|
-
printHelp(args.helpTopic);
|
|
3708
|
-
break;
|
|
3709
|
-
}
|
|
3710
|
-
}
|
|
3711
|
-
function isCliCommand(command) {
|
|
3712
|
-
return command === "app" || command === "list" || command === "run" || command === "cache" || command === "help";
|
|
3713
|
-
}
|
|
3714
|
-
const currentDir = dirname(fileURLToPath(import.meta.url));
|
|
3715
|
-
const repoRoot = resolve(currentDir, "../../..");
|
|
3716
|
-
const pnpmCommand = process.platform === "win32" ? "pnpm.cmd" : "pnpm";
|
|
3717
|
-
function hasRepoWebWorkspace() {
|
|
3718
|
-
return existsSync(resolve(repoRoot, "apps/web/package.json"));
|
|
3719
|
-
}
|
|
3720
|
-
async function ensureWebUiIsBuilt() {
|
|
3721
|
-
if (!hasRepoWebWorkspace()) return;
|
|
3722
|
-
console.info("Preparing web UI...");
|
|
3723
|
-
await new Promise((resolvePromise, rejectPromise) => {
|
|
3724
|
-
const child = spawn(pnpmCommand, [
|
|
3725
|
-
"--filter",
|
|
3726
|
-
"@agent-evals/web",
|
|
3727
|
-
"build"
|
|
3728
|
-
], {
|
|
3729
|
-
cwd: repoRoot,
|
|
3730
|
-
stdio: "inherit"
|
|
3731
|
-
});
|
|
3732
|
-
child.once("error", (error) => {
|
|
3733
|
-
rejectPromise(error);
|
|
3734
|
-
});
|
|
3735
|
-
child.once("exit", (code, signal) => {
|
|
3736
|
-
if (signal) {
|
|
3737
|
-
rejectPromise(/* @__PURE__ */ new Error(`Web UI build stopped with signal ${signal}.`));
|
|
3738
|
-
return;
|
|
3739
|
-
}
|
|
3740
|
-
if (code !== 0) {
|
|
3741
|
-
rejectPromise(/* @__PURE__ */ new Error(`Web UI build failed with exit code ${String(code)}.`));
|
|
3742
|
-
return;
|
|
3743
|
-
}
|
|
3744
|
-
resolvePromise();
|
|
3745
|
-
});
|
|
3746
|
-
});
|
|
3747
|
-
}
|
|
3748
|
-
function isHonoAppModule(mod) {
|
|
3749
|
-
if (typeof mod !== "object" || mod === null || !("app" in mod)) return false;
|
|
3750
|
-
const { app } = mod;
|
|
3751
|
-
return typeof app === "object" && app !== null && "fetch" in app && typeof app.fetch === "function";
|
|
3752
|
-
}
|
|
3753
|
-
function isServerRunnerModule(mod) {
|
|
3754
|
-
if (typeof mod !== "object" || mod === null || !("initRunner" in mod)) return false;
|
|
3755
|
-
return typeof mod.initRunner === "function";
|
|
3756
|
-
}
|
|
3757
|
-
async function commandApp(args) {
|
|
3758
|
-
await ensureWebUiIsBuilt();
|
|
3759
|
-
const { serve } = await import("@hono/node-server");
|
|
3760
|
-
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
3761
|
-
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
3762
|
-
const appModule = await import("./app-C5CJ1sX6.mjs");
|
|
3763
|
-
const runnerModule = await import("./runner-Cdlvk56X.mjs");
|
|
3764
|
-
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
3765
|
-
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
3766
|
-
await runnerModule.initRunner();
|
|
3767
|
-
console.info(`Agent Evals app: http://localhost:${String(args.port)}`);
|
|
3768
|
-
serve({
|
|
3769
|
-
fetch: appModule.app.fetch,
|
|
3770
|
-
port: args.port
|
|
3771
|
-
});
|
|
3772
|
-
}
|
|
3773
|
-
async function commandList(args_) {
|
|
3774
|
-
const runner = createRunner({ watchForChanges: false });
|
|
3775
|
-
await runner.init();
|
|
3776
|
-
const evals = runner.getEvals();
|
|
3777
|
-
if (evals.length === 0) {
|
|
3778
|
-
console.info("No eval files found.");
|
|
3779
|
-
return;
|
|
3780
|
-
}
|
|
3781
|
-
console.info("Discovered evals:\n");
|
|
3782
|
-
for (const ev of evals) {
|
|
3783
|
-
const displayStatus = getEvalDisplayStatus({
|
|
3784
|
-
freshnessStatus: ev.freshnessStatus,
|
|
3785
|
-
stale: ev.stale,
|
|
3786
|
-
outdated: ev.outdated,
|
|
3787
|
-
lastRunStatus: ev.lastRunStatus
|
|
3788
|
-
});
|
|
3789
|
-
const title = getEvalTitle(ev);
|
|
3790
|
-
console.info(` ${title}`);
|
|
3791
|
-
console.info(` id: ${ev.id}`);
|
|
3792
|
-
console.info(` file: ${ev.filePath}`);
|
|
3793
|
-
if (displayStatus !== "pending") console.info(` status: ${displayStatus}`);
|
|
3794
|
-
if (ev.caseCount !== null) console.info(` cases: ${String(ev.caseCount)}`);
|
|
3795
|
-
console.info("");
|
|
3796
|
-
}
|
|
3797
|
-
}
|
|
3798
|
-
async function commandRun(args) {
|
|
3799
|
-
const runner = createRunner({ watchForChanges: false });
|
|
3800
|
-
await runner.init();
|
|
3801
|
-
if (args.clearCache) {
|
|
3802
|
-
await runner.clearCache();
|
|
3803
|
-
if (!args.json) {
|
|
3804
|
-
console.info("Cleared cache before run.");
|
|
3805
|
-
console.info("");
|
|
3806
|
-
}
|
|
3807
|
-
}
|
|
3808
|
-
const target = args.caseIds.length > 0 ? {
|
|
3809
|
-
mode: "caseIds",
|
|
3810
|
-
caseIds: args.caseIds,
|
|
3811
|
-
evalIds: args.evalIds.length > 0 ? args.evalIds : void 0
|
|
3812
|
-
} : args.evalIds.length > 0 ? {
|
|
3813
|
-
mode: "evalIds",
|
|
3814
|
-
evalIds: args.evalIds
|
|
3815
|
-
} : { mode: "all" };
|
|
3816
|
-
const run = await runner.startRun({
|
|
3817
|
-
target,
|
|
3818
|
-
trials: args.trials,
|
|
3819
|
-
cache: { mode: args.cacheMode }
|
|
3820
|
-
});
|
|
3821
|
-
if (!args.json) {
|
|
3822
|
-
console.info(`Run started: ${run.manifest.id}`);
|
|
3823
|
-
console.info(`Trials: ${String(args.trials)}`);
|
|
3824
|
-
if (args.cacheMode !== "use") console.info(`Cache mode: ${args.cacheMode}`);
|
|
3825
|
-
console.info("");
|
|
3826
|
-
}
|
|
3827
|
-
await waitForRunCompletion(runner, run.manifest.id);
|
|
3828
|
-
const finalRun = runner.getRun(run.manifest.id);
|
|
3829
|
-
if (!finalRun) {
|
|
3830
|
-
process.exit(1);
|
|
3831
|
-
return;
|
|
3832
|
-
}
|
|
3833
|
-
const { summary } = finalRun;
|
|
3834
|
-
if (args.json) console.info(JSON.stringify(summary, null, 2));
|
|
3835
|
-
else {
|
|
3836
|
-
console.info("--- Run Summary ---");
|
|
3837
|
-
console.info(`Status: ${summary.status}`);
|
|
3838
|
-
console.info(`Total: ${String(summary.totalCases)}`);
|
|
3839
|
-
console.info(`Passed: ${String(summary.passedCases)}`);
|
|
3840
|
-
console.info(`Failed: ${String(summary.failedCases)}`);
|
|
3841
|
-
console.info(`Errors: ${String(summary.errorCases)}`);
|
|
3842
|
-
if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
|
|
3843
|
-
if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
|
|
3844
|
-
}
|
|
3845
|
-
if (summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
|
|
3846
|
-
}
|
|
3847
|
-
async function commandCache(args) {
|
|
3848
|
-
const runner = createRunner({ watchForChanges: false });
|
|
3849
|
-
await runner.init();
|
|
3850
|
-
if (args.subcommand === "list" || args.subcommand === void 0) {
|
|
3851
|
-
const entries = await runner.listCache();
|
|
3852
|
-
if (args.json) {
|
|
3853
|
-
console.info(JSON.stringify(entries, null, 2));
|
|
3854
|
-
return;
|
|
3855
|
-
}
|
|
3856
|
-
if (entries.length === 0) {
|
|
3857
|
-
console.info("No cache entries.");
|
|
3858
|
-
return;
|
|
3859
|
-
}
|
|
3860
|
-
console.info(`Cache entries (${String(entries.length)}):\n`);
|
|
3861
|
-
for (const entry of entries) {
|
|
3862
|
-
console.info(` ${entry.namespace}`);
|
|
3863
|
-
console.info(` key: ${entry.key}`);
|
|
3864
|
-
console.info(` span: ${entry.spanName} (${entry.spanKind})`);
|
|
3865
|
-
console.info(` stored: ${entry.storedAt}`);
|
|
3866
|
-
console.info(` size: ${String(entry.sizeBytes)} bytes`);
|
|
3867
|
-
console.info("");
|
|
3868
|
-
}
|
|
3869
|
-
return;
|
|
3870
|
-
}
|
|
3871
|
-
if (args.subcommand === "clear") {
|
|
3872
|
-
if (args.evalIds.length > 0) {
|
|
3873
|
-
for (const evalId of args.evalIds) {
|
|
3874
|
-
const entries = await runner.listCache();
|
|
3875
|
-
const prefix = `${evalId}__`;
|
|
3876
|
-
const matching = entries.filter((entry) => entry.namespace.startsWith(prefix));
|
|
3877
|
-
for (const entry of matching) await runner.clearCache({
|
|
3878
|
-
namespace: entry.namespace,
|
|
3879
|
-
key: entry.key
|
|
3880
|
-
});
|
|
3881
|
-
}
|
|
3882
|
-
console.info(`Cleared cache entries for: ${args.evalIds.join(", ")}`);
|
|
3883
|
-
return;
|
|
3884
|
-
}
|
|
3885
|
-
if (args.all) {
|
|
3886
|
-
await runner.clearCache();
|
|
3887
|
-
console.info("Cleared all cache entries.");
|
|
3888
|
-
return;
|
|
3889
|
-
}
|
|
3890
|
-
console.info("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
|
|
3891
|
-
process.exit(1);
|
|
3892
|
-
return;
|
|
3893
|
-
}
|
|
3894
|
-
printHelp(args.helpTopic);
|
|
3895
|
-
}
|
|
3896
|
-
async function waitForRunCompletion(runner, runId) {
|
|
3897
|
-
return new Promise((resolvePromise) => {
|
|
3898
|
-
const check = () => {
|
|
3899
|
-
const run = runner.getRun(runId);
|
|
3900
|
-
if (!run || run.manifest.status === "completed" || run.manifest.status === "cancelled" || run.manifest.status === "error") {
|
|
3901
|
-
resolvePromise();
|
|
3902
|
-
return;
|
|
3903
|
-
}
|
|
3904
|
-
setTimeout(check, 200);
|
|
3905
|
-
};
|
|
3906
|
-
check();
|
|
3907
|
-
});
|
|
3908
|
-
}
|
|
3909
|
-
function printHelp(topic = "global") {
|
|
3910
|
-
if (topic === "app") {
|
|
3911
|
-
console.info(`
|
|
3912
|
-
agent-evals app - Start server with UI
|
|
3913
|
-
|
|
3914
|
-
Usage:
|
|
3915
|
-
agent-evals app [flags]
|
|
3916
|
-
|
|
3917
|
-
Flags:
|
|
3918
|
-
--port <n> Server port (default: 4100)
|
|
3919
|
-
--help, -h Show this help
|
|
3920
|
-
`);
|
|
3921
|
-
return;
|
|
3922
|
-
}
|
|
3923
|
-
if (topic === "list") {
|
|
3924
|
-
console.info(`
|
|
3925
|
-
agent-evals list - List discovered evals
|
|
3926
|
-
|
|
3927
|
-
Usage:
|
|
3928
|
-
agent-evals list [flags]
|
|
3929
|
-
|
|
3930
|
-
Flags:
|
|
3931
|
-
--help, -h Show this help
|
|
3932
|
-
`);
|
|
3933
|
-
return;
|
|
3934
|
-
}
|
|
3935
|
-
if (topic === "run") {
|
|
3936
|
-
console.info(`
|
|
3937
|
-
agent-evals run - Run evals
|
|
3938
|
-
|
|
3939
|
-
Usage:
|
|
3940
|
-
agent-evals run [flags]
|
|
3941
|
-
|
|
3942
|
-
Flags:
|
|
3943
|
-
--eval <id> Run specific eval(s) (comma-separated)
|
|
3944
|
-
--case <id> Run specific case(s) (comma-separated)
|
|
3945
|
-
--trials <n> Number of trials per case
|
|
3946
|
-
--json Output run summary as JSON
|
|
3947
|
-
--cache <use|bypass|refresh> Cache mode for this run (default: use)
|
|
3948
|
-
--no-cache Shortcut for --cache bypass
|
|
3949
|
-
--refresh-cache Shortcut for --cache refresh
|
|
3950
|
-
--clear-cache Clear the cache before starting the run
|
|
3951
|
-
--help, -h Show this help
|
|
3952
|
-
`);
|
|
3953
|
-
return;
|
|
3954
|
-
}
|
|
3955
|
-
if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
|
|
3956
|
-
console.info(`
|
|
3957
|
-
agent-evals cache - Manage cached operation entries
|
|
3958
|
-
|
|
3959
|
-
Usage:
|
|
3960
|
-
agent-evals cache list [flags]
|
|
3961
|
-
agent-evals cache clear --eval <id>
|
|
3962
|
-
agent-evals cache clear --all
|
|
3963
|
-
|
|
3964
|
-
Flags:
|
|
3965
|
-
--eval <id> Clear entries for specific eval(s) (comma-separated)
|
|
3966
|
-
--all Confirm clearing every cached entry
|
|
3967
|
-
--json Output cache listing as JSON
|
|
3968
|
-
--help, -h Show this help
|
|
3969
|
-
`);
|
|
3970
|
-
return;
|
|
3971
|
-
}
|
|
3972
|
-
console.info(`
|
|
3973
|
-
agent-evals - LLM/Agent eval runner
|
|
3974
|
-
|
|
3975
|
-
Commands:
|
|
3976
|
-
app Start server with UI
|
|
3977
|
-
list List discovered evals
|
|
3978
|
-
run Run evals
|
|
3979
|
-
cache list List cached operation entries
|
|
3980
|
-
cache clear --eval <id> Clear cache entries for one eval
|
|
3981
|
-
cache clear --all Clear every cached entry
|
|
3982
|
-
help Show this help
|
|
3983
|
-
|
|
3984
|
-
Options:
|
|
3985
|
-
--eval <id> Run specific eval(s) (comma-separated)
|
|
3986
|
-
--case <id> Run specific case(s) (comma-separated)
|
|
3987
|
-
--trials <n> Number of trials per case
|
|
3988
|
-
--json Output results as JSON
|
|
3989
|
-
--port <n> Server port (default: 4100)
|
|
3990
|
-
--cache <use|bypass|refresh> Cache mode for this run (default: use)
|
|
3991
|
-
--no-cache Shortcut for --cache bypass
|
|
3992
|
-
--refresh-cache Shortcut for --cache refresh
|
|
3993
|
-
--clear-cache Clear the cache before starting the run
|
|
3994
|
-
--help, -h Show help
|
|
3995
|
-
`);
|
|
3996
|
-
}
|
|
3997
|
-
//#endregion
|
|
3998
|
-
export { columnKindSchema as $, evalSummarySchema as A, evalChartsConfigSchema as B, assertionFailureSchema as C, evalStatAggregateSchema as D, evalFreshnessStatusSchema as E, evalChartColorSchema as F, traceDisplayConfigSchema as G, traceAttributeDisplayInputSchema as H, evalChartConfigSchema as I, traceSpanKindSchema as J, traceDisplayInputConfigSchema as K, evalChartMetricSchema as L, evalChartAggregateSchema as M, evalChartAxisSchema as N, evalStatItemSchema as O, evalChartBuiltinMetricSchema as P, columnFormatSchema as Q, evalChartTooltipExtraSchema as R, spanCacheOptionsSchema as S, caseRowSchema as T, traceAttributeDisplayPlacementSchema as U, traceAttributeDisplayFormatSchema as V, traceAttributeDisplaySchema as W, cellValueSchema as X, traceSpanSchema as Y, columnDefSchema as Z, cacheListItemSchema as _, setEvalOutput as _t, sseEnvelopeSchema as a, buildTraceTree as at, cacheRecordingSchema as b, defineEval as bt, deriveScopedSummaryFromCases as c, evalTracer as ct, runManifestSchema as d, EvalAssertionError as dt, fileRefSchema as et, runSummarySchema as f, evalAssert as ft, cacheFileSchema as g, runInEvalScope as gt, cacheEntrySchema as h, isInEvalScope as ht, updateManualScoreRequestSchema as i, runArtifactRefSchema as it, scoreTraceSchema as j, evalStatsConfigSchema as k, deriveStatusFromCaseRows as l, hashCacheKey as lt, trialSelectionModeSchema as m, incrementEvalOutput as mt, createRunner as n, numberDisplayOptionsSchema as nt, getEvalTitle as o, captureEvalSpanError as ot, agentEvalsConfigSchema as p, getCurrentScope as pt, traceSpanErrorSchema as q, createRunRequestSchema as r, repoFileRefSchema as rt, getEvalDisplayStatus as s, evalSpan as st, runCli as t, jsonCellSchema as tt, deriveStatusFromChildStatuses as u, hashCacheKeySync as ut, cacheModeSchema as v, setScopeCacheContext as vt, caseDetailSchema as w, serializedCacheSpanSchema as x, getEvalRegistry as xt, cacheRecordingOpSchema as y, repoFile as yt, evalChartTypeSchema as z };
|
|
3517
|
+
export { evalChartAxisSchema as $, runManifestSchema as A, evalTracer as At, cacheRecordingSchema as B, mergeEvalOutput as Bt, updateManualScoreRequestSchema as C, numberDisplayOptionsSchema as Ct, deriveScopedSummaryFromCases as D, buildTraceTree as Dt, getEvalDisplayStatus as E, z$1 as Et, cacheFileSchema as F, evalAssert as Ft, caseRowSchema as G, defineEval as Gt, spanCacheOptionsSchema as H, setEvalOutput as Ht, cacheListItemSchema as I, getCurrentScope as It, evalStatItemSchema as J, evalFreshnessStatusSchema as K, getEvalRegistry as Kt, cacheModeSchema as L, getEvalCaseInput as Lt, agentEvalsConfigSchema as M, hashCacheKeySync as Mt, trialSelectionModeSchema as N, EvalAssertionError as Nt, deriveStatusFromCaseRows as O, captureEvalSpanError as Ot, cacheEntrySchema as P, appendToEvalOutput as Pt, evalChartAggregateSchema as Q, cacheOperationTypeSchema as R, incrementEvalOutput as Rt, createRunRequestSchema as S, jsonCellSchema as St, getEvalTitle as T, runArtifactRefSchema as Tt, assertionFailureSchema as U, setScopeCacheContext as Ut, serializedCacheSpanSchema as V, runInEvalScope as Vt, caseDetailSchema as W, repoFile as Wt, evalSummarySchema as X, evalStatsConfigSchema as Y, scoreTraceSchema as Z, loadEvalModule as _, cellValueSchema as _t, loadPersistedRunSnapshot as a, evalChartTypeSchema as at, normalizeScoreDef as b, columnKindSchema as bt, persistCaseDetail as c, traceAttributeDisplayInputSchema as ct, recomputePersistedCaseStatus as d, traceDisplayConfigSchema as dt, evalChartBuiltinMetricSchema as et, runTouchesEval as f, traceDisplayInputConfigSchema as ft, setLatestRunInfoMap as g, traceSpanWarningSchema as gt, getTargetEvalIds as h, traceSpanSchema as ht, getLatestRunInfos as i, evalChartTooltipExtraSchema as it, runSummarySchema as j, hashCacheKey as jt, deriveStatusFromChildStatuses as k, evalSpan as kt, persistRunState as l, traceAttributeDisplayPlacementSchema as lt, buildEvalSummary as m, traceSpanKindSchema as mt, generateRunId as n, 
evalChartConfigSchema as nt, loadPersistedRunSnapshots as o, evalChartsConfigSchema as ot, resolveArtifactPath as p, traceSpanErrorSchema as pt, evalStatAggregateSchema as q, getLastRunStatuses as r, evalChartMetricSchema as rt, nextShortIdFromSnapshots as s, traceAttributeDisplayFormatSchema as st, executeRun as t, evalChartColorSchema as tt, recomputeEvalStatusesInRuns as u, traceAttributeDisplaySchema as ut, loadConfig as v, columnDefSchema as vt, sseEnvelopeSchema as w, repoFileRefSchema as wt, createFsCacheStore as x, fileRefSchema as xt, buildDeclaredColumnDefs as y, columnFormatSchema as yt, cacheRecordingOpSchema as z, isInEvalScope as zt };
|