@ls-stack/agent-eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CKa9TjXw.mjs → app-DXYLqlWb.mjs} +3 -3
- package/dist/apps/web/dist/assets/{index-Dm50Ynbs.js → index-Bq4Dz6AV.js} +30 -30
- package/dist/apps/web/dist/assets/index-b2k20tzL.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-CwEFLP0w.mjs → cli-Dw9et3_Z.mjs} +499 -66
- package/dist/index.d.mts +140 -5
- package/dist/index.mjs +3 -3
- package/dist/{runner-Ck4X0H3p.mjs → runner-CToL8eJs.mjs} +1 -1
- package/dist/{runner-CD5aDJ0C.mjs → runner-kSiHsl91.mjs} +2 -2
- package/dist/src-CXclO9ZI.mjs +2 -0
- package/package.json +5 -5
- package/dist/apps/web/dist/assets/index-BUz24J7O.css +0 -1
- package/dist/src-BDRmaWFu.mjs +0 -2
|
@@ -2,6 +2,7 @@ import { createHash } from "node:crypto";
|
|
|
2
2
|
import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
|
|
3
3
|
import { dirname, extname, join, relative, resolve } from "node:path";
|
|
4
4
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
5
|
+
import { Buffer as Buffer$1 } from "node:buffer";
|
|
5
6
|
import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
|
|
6
7
|
import { z } from "zod/v4";
|
|
7
8
|
import { watch } from "chokidar";
|
|
@@ -206,6 +207,15 @@ function noopActiveSpan() {
|
|
|
206
207
|
setAttributes() {}
|
|
207
208
|
};
|
|
208
209
|
}
|
|
210
|
+
/**
 * Build an external span handle whose mutators do nothing.
 *
 * @param {string} id - Span id exposed on the returned handle.
 * @returns {object} Handle with `id` plus empty `setName`, `setAttribute`,
 *   `setAttributes` and `end` methods.
 */
function noopExternalSpan(id) {
	const inertHandle = {
		id,
		setName() {},
		setAttribute() {},
		setAttributes() {},
		end() {}
	};
	return inertHandle;
}
|
|
209
219
|
function mergeSpanAttributes(span, attributes) {
|
|
210
220
|
span.attributes = {
|
|
211
221
|
...span.attributes,
|
|
@@ -225,6 +235,127 @@ function createSpanHandle(span) {
|
|
|
225
235
|
}
|
|
226
236
|
};
|
|
227
237
|
}
|
|
238
|
+
/**
 * Build a handle bound to one external span id.
 *
 * Each mutator forwards to `updateExternalSpan`, and `end` forwards to
 * `endExternalSpan`; the bound `id` always wins over any `id` in `info`.
 *
 * @param {string} id - Span id every call on the handle is routed to.
 * @returns {object} Handle exposing `id`, `setName`, `setAttribute`,
 *   `setAttributes` and `end`.
 */
function createExternalSpanHandle(id) {
	const patch = (changes) => {
		updateExternalSpan({
			id,
			...changes
		});
	};
	return {
		id,
		setName(value) {
			patch({ name: value });
		},
		setAttribute(key, value) {
			patch({ attributes: { [key]: value } });
		},
		setAttributes(value) {
			patch({ attributes: value });
		},
		end(info = {}) {
			// Spread first so the handle's own id cannot be overridden.
			endExternalSpan({
				...info,
				id
			});
		}
	};
}
|
|
267
|
+
/**
 * Normalize a timestamp input to an ISO-8601 string.
 *
 * @param {string | number | Date | undefined} value - `undefined` means "now";
 *   strings are returned unchanged (not validated); numbers are treated as
 *   epoch milliseconds; anything else must expose `toISOString()`.
 * @returns {string} ISO timestamp string.
 * @throws {RangeError} When a number or Date does not represent a valid time.
 */
function toIsoTimestamp(value) {
	if (value === void 0) return (/* @__PURE__ */ new Date()).toISOString();
	if (typeof value === "string") return value;
	// Epoch milliseconds previously fell through and crashed with
	// "value.toISOString is not a function".
	if (typeof value === "number") return new Date(value).toISOString();
	return value.toISOString();
}
|
|
272
|
+
/**
 * Look up a span by id in a scope's span list.
 *
 * @param {{spans: Array<{id: string}>}} scope - Scope whose `spans` are searched.
 * @param {string} id - Span id to match with strict equality.
 * @returns {object | undefined} First matching span, or `undefined`.
 */
function findSpan(scope, id) {
	for (const candidate of scope.spans) {
		if (candidate.id === id) return candidate;
	}
	return undefined;
}
|
|
275
|
+
/**
 * Pick the parent id for an external span.
 *
 * @param {{activeSpanStack: Array<{id: string}>}} scope - Scope providing the
 *   stack of currently active spans.
 * @param {string | null | undefined} parentId - Explicit parent; any value other
 *   than `undefined` (including `null`) is returned as-is.
 * @returns {string | null} The explicit parent, else the id of the last entry
 *   on `activeSpanStack`, else `null`.
 */
function resolveExternalParentId(scope, parentId) {
	if (parentId === void 0) {
		const innermost = scope.activeSpanStack.at(-1);
		return innermost?.id ?? null;
	}
	return parentId;
}
|
|
279
|
+
/**
 * Start (or restart) an externally managed span in the current eval scope.
 *
 * Outside a scope a no-op handle carrying the id is returned. If a span with
 * the same id already exists it is reset to "running" in place; its attributes
 * are replaced only when `info.attributes` is provided.
 *
 * @param {object} info - Span description: optional `id`, `parentId`,
 *   `startedAt`, `attributes`, plus `kind` and `name`.
 * @returns {object} Handle for updating and ending the span.
 */
function startExternalSpan(info) {
	const id = info.id ?? generateSpanId();
	const scope = getCurrentScope();
	if (!scope) return noopExternalSpan(id);
	const span = findSpan(scope, id);
	if (!span) {
		scope.spans.push({
			id,
			parentId: resolveExternalParentId(scope, info.parentId),
			caseId: scope.caseId,
			kind: info.kind,
			name: info.name,
			startedAt: toIsoTimestamp(info.startedAt),
			endedAt: null,
			status: "running",
			attributes: info.attributes
		});
		return createExternalSpanHandle(id);
	}
	// Same id seen again: rewrite the existing record instead of duplicating it.
	span.parentId = resolveExternalParentId(scope, info.parentId);
	span.kind = info.kind;
	span.name = info.name;
	span.startedAt = toIsoTimestamp(info.startedAt);
	span.status = "running";
	span.endedAt = null;
	if (info.attributes !== void 0) span.attributes = info.attributes;
	return createExternalSpanHandle(id);
}
|
|
307
|
+
/**
 * Apply a partial update to an existing external span.
 *
 * Does nothing when there is no current scope or no span with `info.id`.
 * `name`, `status` and `error` overwrite when defined; `attributes` are merged
 * through `mergeSpanAttributes`.
 *
 * @param {object} info - Update carrying `id` and any of `name`, `status`,
 *   `error`, `attributes`.
 * @returns {void}
 */
function updateExternalSpan(info) {
	const scope = getCurrentScope();
	if (!scope) return;
	const span = findSpan(scope, info.id);
	if (!span) return;
	for (const field of ["name", "status", "error"]) {
		if (info[field] !== void 0) span[field] = info[field];
	}
	if (info.attributes !== void 0) mergeSpanAttributes(span, info.attributes);
}
|
|
317
|
+
/**
 * Finish an external span in the current scope.
 *
 * Does nothing without a scope or a matching span. Otherwise applies `info`
 * through `updateExternalSpan`, then sets the final status and end time.
 *
 * @param {object} info - Carries `id` and optionally `status`, `error`,
 *   `endedAt`, `name`, `attributes`.
 * @returns {void}
 */
function endExternalSpan(info) {
	const scope = getCurrentScope();
	const span = scope ? findSpan(scope, info.id) : void 0;
	if (!span) return;
	updateExternalSpan(info);
	// An explicit status wins; otherwise the presence of an error decides.
	const inferredStatus = info.error ? "error" : "ok";
	span.status = info.status ?? inferredStatus;
	span.endedAt = toIsoTimestamp(info.endedAt);
}
|
|
326
|
+
/**
 * Record a complete external span in one call.
 *
 * Outside a scope only the (given or generated) id is returned. An existing
 * span with the same id is overwritten field by field, including `attributes`
 * and `error`; otherwise a new span is appended.
 *
 * @param {object} info - Span data: optional `id`, `parentId`, `startedAt`,
 *   `endedAt`, `status`, `attributes`, `error`, plus `kind` and `name`.
 * @returns {string} Id of the recorded span.
 */
function recordExternalSpan(info) {
	const id = info.id ?? generateSpanId();
	const scope = getCurrentScope();
	if (!scope) return id;
	const startedAt = toIsoTimestamp(info.startedAt);
	let endedAt;
	if (info.endedAt === null) endedAt = null;
	else if (info.endedAt) endedAt = toIsoTimestamp(info.endedAt);
	else endedAt = startedAt;
	const existing = findSpan(scope, id);
	const status = info.status ?? (info.error ? "error" : "ok");
	const fields = {
		parentId: resolveExternalParentId(scope, info.parentId),
		kind: info.kind,
		name: info.name,
		startedAt,
		endedAt,
		status,
		attributes: info.attributes,
		error: info.error
	};
	if (existing) {
		Object.assign(existing, fields);
		return id;
	}
	scope.spans.push({
		id,
		// Declared here only to keep key order id, parentId, caseId, …;
		// the spread below rewrites the same value in place.
		parentId: fields.parentId,
		caseId: scope.caseId,
		...fields
	});
	return id;
}
|
|
228
359
|
/**
|
|
229
360
|
* Ambient handle for the active span in the current async context.
|
|
230
361
|
*
|
|
@@ -272,7 +403,7 @@ async function traceSpan(info, fn) {
|
|
|
272
403
|
if (cacheOpts !== void 0 && cacheCtx !== void 0 && scope.replayingDepth === 0) {
|
|
273
404
|
const ctx = cacheCtx;
|
|
274
405
|
const namespace = cacheOpts.namespace ?? `${ctx.evalId}__${info.name}`;
|
|
275
|
-
const keyHash = hashCacheKey({
|
|
406
|
+
const keyHash = await hashCacheKey({
|
|
276
407
|
namespace,
|
|
277
408
|
codeFingerprint: ctx.codeFingerprint,
|
|
278
409
|
key: cacheOpts.key
|
|
@@ -359,6 +490,34 @@ async function traceSpan(info, fn) {
|
|
|
359
490
|
const evalTracer = {
|
|
360
491
|
/** Run a callback inside a new trace span and record its lifecycle. */
|
|
361
492
|
span: traceSpan,
|
|
493
|
+
/**
|
|
494
|
+
* Start a span whose lifecycle is controlled by an external tracer/exporter.
|
|
495
|
+
*
|
|
496
|
+
* Calls are no-ops outside an eval case scope, except that a generated or
|
|
497
|
+
* caller-provided id is still returned for ergonomic adapter code.
|
|
498
|
+
*/
|
|
499
|
+
startSpan: startExternalSpan,
|
|
500
|
+
/**
|
|
501
|
+
* Merge updates into an externally managed span that was started earlier.
|
|
502
|
+
*
|
|
503
|
+
* This is intended for observability exporters that receive span update
|
|
504
|
+
* events before the final end event.
|
|
505
|
+
*/
|
|
506
|
+
updateSpan: updateExternalSpan,
|
|
507
|
+
/**
|
|
508
|
+
* Finish an externally managed span and attach final attributes or errors.
|
|
509
|
+
*
|
|
510
|
+
* Missing spans are ignored so exporter adapters can safely forward events
|
|
511
|
+
* even when they are emitted outside an eval case scope.
|
|
512
|
+
*/
|
|
513
|
+
endSpan: endExternalSpan,
|
|
514
|
+
/**
|
|
515
|
+
* Record a complete external span in one call.
|
|
516
|
+
*
|
|
517
|
+
* Use this when an upstream tracer only exposes completed spans rather than
|
|
518
|
+
* start/update/end events.
|
|
519
|
+
*/
|
|
520
|
+
recordSpan: recordExternalSpan,
|
|
362
521
|
/** Record a named point-in-time value alongside the trace. */
|
|
363
522
|
checkpoint(name, data) {
|
|
364
523
|
const scope = getCurrentScope();
|
|
@@ -412,9 +571,103 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
412
571
|
checkpoints
|
|
413
572
|
};
|
|
414
573
|
}
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
574
|
+
/**
 * Wrapper marking a value as an already-serialized cache-key string.
 * `stringifyCacheKeyValue` returns the wrapped `value` for instances of it.
 */
var SerializedCacheKeyValue = class {
	/** The pre-computed serialized form. */
	value;
	constructor(serialized) {
		this.value = serialized;
	}
};
|
|
580
|
+
/**
 * Hash the components of a cache key into a deterministic hex digest.
 *
 * The input is first passed through `materializeAsyncCacheKeyValue`, which
 * reads native `Blob` and `File` values asynchronously so they are hashed by
 * content. Use `hashCacheKeySync` only when the key contains no async values.
 *
 * @param {unknown} input - Cache key components.
 * @returns {Promise<string>} Hex digest of the materialized key.
 */
async function hashCacheKey(input) {
	const materialized = await materializeAsyncCacheKeyValue(input);
	return hashCacheKeySyncMaterialized(materialized);
}
|
|
589
|
+
/**
 * Synchronously hash cache key components.
 *
 * Supports JSON-like data and in-memory binary values such as `Buffer`,
 * `ArrayBuffer`, and typed arrays, but cannot content-hash native `Blob` or
 * `File` values.
 *
 * @param {unknown} input - Cache key components.
 * @returns {string} Hex digest of the key.
 */
function hashCacheKeySync(input) {
	const digest = hashCacheKeySyncMaterialized(input);
	return digest;
}
|
|
597
|
+
/**
 * SHA-256 the composite key of an already-materialized input.
 *
 * @param {unknown} input - Value handed to `getCompositeKey` with
 *   `stringifyCacheKeyValue` as its `stringify` option.
 * @returns {string} Hex digest.
 */
function hashCacheKeySyncMaterialized(input) {
	const compositeKey = getCompositeKey(input, { stringify: stringifyCacheKeyValue });
	const hasher = createHash("sha256");
	hasher.update(compositeKey);
	return hasher.digest("hex");
}
|
|
600
|
+
/**
 * Synchronous serializer for special cache-key values.
 *
 * Checks run in a fixed order: pre-serialized wrapper, `Buffer`, `ArrayBuffer`,
 * `SharedArrayBuffer`, other buffer views, `File`, `Blob`. Binary data is
 * tagged and hashed by content; `File`/`Blob` are described by metadata only
 * here (no byte content).
 *
 * @param {unknown} value - Candidate value.
 * @returns {string | undefined} Serialized form, or `undefined` when the value
 *   is none of the handled kinds.
 */
function stringifyCacheKeyValue(value) {
	if (value instanceof SerializedCacheKeyValue) return value.value;
	if (Buffer$1.isBuffer(value)) return `$buffer:${hashBytes(value)}`;
	if (isArrayBuffer(value)) {
		const bytes = new Uint8Array(value);
		return `$arrayBuffer:${hashBytes(bytes)}`;
	}
	if (isSharedArrayBuffer(value)) {
		const bytes = new Uint8Array(value);
		return `$sharedArrayBuffer:${hashBytes(bytes)}`;
	}
	if (isArrayBufferView(value)) {
		// Hash only the window the view covers, not the whole backing buffer.
		const { buffer, byteOffset, byteLength } = value;
		const bytes = new Uint8Array(buffer, byteOffset, byteLength);
		return `$${value.constructor.name}:${hashBytes(bytes)}`;
	}
	if (isFile$1(value)) {
		const metadata = {
			lastModified: value.lastModified,
			name: value.name,
			size: value.size,
			type: value.type
		};
		return `$file:${getCompositeKey(metadata)}`;
	}
	if (isBlob$1(value)) {
		const metadata = {
			size: value.size,
			type: value.type
		};
		return `$blob:${getCompositeKey(metadata)}`;
	}
	return undefined;
}
|
|
620
|
+
/**
 * Recursively replace async-only values (those `stringifyAsyncCacheKeyValue`
 * can serialize) with `SerializedCacheKeyValue` wrappers so the result can be
 * hashed synchronously.
 *
 * Values the sync serializer already handles, primitives, and `Date`/`Map`/`Set`
 * instances are returned untouched; arrays and other objects are copied with
 * their elements / own enumerable entries materialized in order.
 *
 * @param {unknown} value - Value to materialize.
 * @param {WeakSet<object>} [refs] - Containers on the current recursion path,
 *   used for cycle detection. Callers normally omit it.
 * @returns {Promise<unknown>} Materialized copy (or the original value).
 * @throws {Error} "Circular reference detected" when an array or object
 *   contains itself.
 */
async function materializeAsyncCacheKeyValue(value, refs = /* @__PURE__ */ new WeakSet()) {
	const serialized = await stringifyAsyncCacheKeyValue(value);
	if (serialized !== void 0) return new SerializedCacheKeyValue(serialized);
	if (stringifyCacheKeyValue(value) !== void 0) return value;
	if (!value || typeof value !== "object") return value;
	// These expose no own enumerable entries, so copying them via Object.entries
	// would collapse every instance to `{}`. Hand them through unchanged, exactly
	// as the synchronous hashing path does.
	// NOTE(review): how getCompositeKey treats Date/Map/Set is not visible here — confirm.
	if (value instanceof Date || value instanceof Map || value instanceof Set) return value;
	// Track arrays as well as objects; a self-referencing array previously
	// recursed without ever terminating.
	if (refs.has(value)) throw new Error("Circular reference detected");
	refs.add(value);
	try {
		if (Array.isArray(value)) {
			const items = [];
			for (const item of value) items.push(await materializeAsyncCacheKeyValue(item, refs));
			return items;
		}
		const entries = [];
		for (const [key, entryValue] of Object.entries(value)) entries.push([key, await materializeAsyncCacheKeyValue(entryValue, refs)]);
		return Object.fromEntries(entries);
	} finally {
		// Only the current path is tracked, so shared (non-circular) references stay legal.
		refs.delete(value);
	}
}
|
|
637
|
+
/**
 * Serialize a native `File` or `Blob` including a hash of its bytes.
 *
 * `File` is checked before `Blob`. The byte hash is awaited before the
 * metadata properties are read.
 *
 * @param {unknown} value - Candidate value.
 * @returns {Promise<string | undefined>} `$file:`/`$blob:` prefixed key, or
 *   `undefined` for any other value.
 */
async function stringifyAsyncCacheKeyValue(value) {
	if (isFile$1(value)) {
		const bytes = await hashBlobBytes(value);
		const descriptor = {
			bytes,
			lastModified: value.lastModified,
			name: value.name,
			size: value.size,
			type: value.type
		};
		return `$file:${getCompositeKey(descriptor)}`;
	}
	if (isBlob$1(value)) {
		const bytes = await hashBlobBytes(value);
		const descriptor = {
			bytes,
			size: value.size,
			type: value.type
		};
		return `$blob:${getCompositeKey(descriptor)}`;
	}
	return undefined;
}
|
|
651
|
+
/**
 * Read a blob-like value fully and hash its bytes.
 *
 * @param {{arrayBuffer(): Promise<ArrayBuffer>}} value - Object exposing `arrayBuffer()`.
 * @returns {Promise<string>} Result of `hashBytes` over the content.
 */
async function hashBlobBytes(value) {
	const content = await value.arrayBuffer();
	const bytes = new Uint8Array(content);
	return hashBytes(bytes);
}
|
|
654
|
+
/**
 * SHA-256 hash of the given data, hex encoded.
 *
 * @param {string | Buffer | NodeJS.TypedArray | DataView} value - Data passed to `Hash#update`.
 * @returns {string} 64-character lowercase hex digest.
 */
function hashBytes(value) {
	const hasher = createHash("sha256");
	hasher.update(value);
	return hasher.digest("hex");
}
|
|
657
|
+
/**
 * @param {unknown} value
 * @returns {boolean} Whether `value` is an `ArrayBuffer` instance.
 */
function isArrayBuffer(value) {
	return value instanceof ArrayBuffer;
}
/**
 * @param {unknown} value
 * @returns {boolean} Whether `value` is a `SharedArrayBuffer` instance; `false`
 *   when the runtime does not expose `SharedArrayBuffer`.
 */
function isSharedArrayBuffer(value) {
	// `instanceof` against an undeclared global throws a ReferenceError, so probe first.
	return typeof SharedArrayBuffer !== "undefined" && value instanceof SharedArrayBuffer;
}
/**
 * @param {unknown} value
 * @returns {boolean} Whether `value` is a typed array or `DataView`.
 */
function isArrayBufferView(value) {
	return ArrayBuffer.isView(value);
}
/**
 * @param {unknown} value
 * @returns {boolean} Whether `value` is a `Blob` instance; `false` when the
 *   runtime has no global `Blob`.
 */
function isBlob$1(value) {
	return typeof Blob !== "undefined" && value instanceof Blob;
}
/**
 * @param {unknown} value
 * @returns {boolean} Whether `value` is a `File` instance; `false` when the
 *   runtime has no global `File`.
 */
function isFile$1(value) {
	return typeof File !== "undefined" && value instanceof File;
}
|
|
419
672
|
function toJsonSafe(value) {
|
|
420
673
|
if (value === void 0) return void 0;
|
|
@@ -989,6 +1242,12 @@ const cacheEntrySchema = z.object({
|
|
|
989
1242
|
codeFingerprint: z.string(),
|
|
990
1243
|
recording: cacheRecordingSchema
|
|
991
1244
|
});
|
|
1245
|
+
/**
 * Persisted per-owner cache file containing multiple cache entries.
 *
 * Shape: a literal `version` of 1, the `owner` string, and `entries`, a record
 * from string keys to `cacheEntrySchema` values.
 */
const cacheFileSchema = z.object({ version: z.literal(1), owner: z.string(), entries: z.record(z.string(), cacheEntrySchema) });
|
|
992
1251
|
//#endregion
|
|
993
1252
|
//#region ../shared/src/schemas/config.ts
|
|
994
1253
|
/** Strategy used to collapse repeated trials into one stored case result. */
|
|
@@ -1004,7 +1263,8 @@ const agentEvalsConfigSchema = z.object({
|
|
|
1004
1263
|
traceDisplay: traceDisplayInputConfigSchema.optional(),
|
|
1005
1264
|
cache: z.object({
|
|
1006
1265
|
enabled: z.boolean().optional(),
|
|
1007
|
-
dir: z.string().optional()
|
|
1266
|
+
dir: z.string().optional(),
|
|
1267
|
+
maxEntriesPerEval: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional())
|
|
1008
1268
|
}).optional()
|
|
1009
1269
|
});
|
|
1010
1270
|
//#endregion
|
|
@@ -1243,60 +1503,59 @@ const createRunRequestSchema = z.object({
|
|
|
1243
1503
|
const updateManualScoreRequestSchema = z.object({ value: z.number().min(0).max(1).nullable() });
|
|
1244
1504
|
//#endregion
|
|
1245
1505
|
//#region ../runner/src/cacheStore.ts
|
|
1506
|
+
const defaultMaxEntriesPerEval = 100;
|
|
1246
1507
|
/**
|
|
1247
1508
|
* Create a filesystem-backed cache adapter rooted at `<workspaceRoot>/<dir>`.
|
|
1248
1509
|
*
|
|
1249
|
-
*
|
|
1250
|
-
*
|
|
1510
|
+
* Cache entries are grouped into one inspectable JSON file per eval/cache
|
|
1511
|
+
* owner. Writes use a short-lived lock directory plus `<name>.tmp` + atomic
|
|
1512
|
+
* `rename` to avoid partial reads and lost updates under concurrent access.
|
|
1251
1513
|
*/
|
|
1252
1514
|
function createFsCacheStore(options) {
|
|
1253
1515
|
const cacheDir = resolve(options.workspaceRoot, options.dir ?? ".agent-evals/cache");
|
|
1516
|
+
const maxEntriesPerEval = normalizeMaxEntries(options.maxEntriesPerEval);
|
|
1254
1517
|
return {
|
|
1255
1518
|
dir() {
|
|
1256
1519
|
return cacheDir;
|
|
1257
1520
|
},
|
|
1258
1521
|
async lookup(namespace, keyHash) {
|
|
1259
|
-
|
|
1260
|
-
if (!existsSync(filePath)) return null;
|
|
1261
|
-
const json = safeJsonParse(await readFile(filePath, "utf-8"));
|
|
1262
|
-
if (json === null) return null;
|
|
1263
|
-
const parsed = cacheEntrySchema.safeParse(json);
|
|
1264
|
-
if (!parsed.success) return null;
|
|
1265
|
-
return parsed.data;
|
|
1522
|
+
return (await readCacheFile(cacheDir, ownerFromNamespace(namespace)))?.entries[keyHash] ?? null;
|
|
1266
1523
|
},
|
|
1267
1524
|
async write(entry) {
|
|
1268
|
-
const
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
await
|
|
1272
|
-
|
|
1525
|
+
const owner = ownerFromNamespace(entry.namespace);
|
|
1526
|
+
const filePath = ownerPath(cacheDir, owner);
|
|
1527
|
+
await mkdir(cacheDir, { recursive: true });
|
|
1528
|
+
await withCacheFileLock(filePath, async () => {
|
|
1529
|
+
await writeCacheFile(cacheDir, {
|
|
1530
|
+
version: 1,
|
|
1531
|
+
owner,
|
|
1532
|
+
entries: pruneEntries({
|
|
1533
|
+
...(await readCacheFile(cacheDir, owner))?.entries ?? {},
|
|
1534
|
+
[entry.key]: entry
|
|
1535
|
+
}, maxEntriesPerEval, entry.key)
|
|
1536
|
+
});
|
|
1537
|
+
});
|
|
1273
1538
|
},
|
|
1274
1539
|
async list() {
|
|
1275
1540
|
if (!existsSync(cacheDir)) return [];
|
|
1276
|
-
const
|
|
1541
|
+
const files = await readdir(cacheDir);
|
|
1277
1542
|
const items = [];
|
|
1278
|
-
for (const
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
const
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
spanKind: parsed.data.spanKind,
|
|
1295
|
-
storedAt: parsed.data.storedAt,
|
|
1296
|
-
codeFingerprint: parsed.data.codeFingerprint,
|
|
1297
|
-
sizeBytes: fileStat.size
|
|
1298
|
-
});
|
|
1299
|
-
}
|
|
1543
|
+
for (const fileName of files) {
|
|
1544
|
+
if (!fileName.endsWith(".json")) continue;
|
|
1545
|
+
const filePath = join(cacheDir, fileName);
|
|
1546
|
+
const fileStatResult = await resultify(() => stat(filePath));
|
|
1547
|
+
if (fileStatResult.error || !fileStatResult.value.isFile()) continue;
|
|
1548
|
+
const cacheFile = await readCacheFilePath(filePath);
|
|
1549
|
+
if (cacheFile === null) continue;
|
|
1550
|
+
for (const entry of Object.values(cacheFile.entries)) items.push({
|
|
1551
|
+
key: entry.key,
|
|
1552
|
+
namespace: entry.namespace,
|
|
1553
|
+
spanName: entry.spanName,
|
|
1554
|
+
spanKind: entry.spanKind,
|
|
1555
|
+
storedAt: entry.storedAt,
|
|
1556
|
+
codeFingerprint: entry.codeFingerprint,
|
|
1557
|
+
sizeBytes: Buffer.byteLength(JSON.stringify(entry), "utf8")
|
|
1558
|
+
});
|
|
1300
1559
|
}
|
|
1301
1560
|
items.sort((a, b) => a.storedAt < b.storedAt ? 1 : -1);
|
|
1302
1561
|
return items;
|
|
@@ -1310,21 +1569,36 @@ function createFsCacheStore(options) {
|
|
|
1310
1569
|
});
|
|
1311
1570
|
return;
|
|
1312
1571
|
}
|
|
1313
|
-
if (filter.namespace !== void 0
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1572
|
+
if (filter.namespace !== void 0) {
|
|
1573
|
+
const owner = ownerFromNamespace(filter.namespace);
|
|
1574
|
+
await withCacheFileLock(ownerPath(cacheDir, owner), async () => {
|
|
1575
|
+
const cacheFile = await readCacheFile(cacheDir, owner);
|
|
1576
|
+
if (cacheFile === null) return;
|
|
1577
|
+
await writeOrRemoveCacheFile(cacheDir, {
|
|
1578
|
+
version: 1,
|
|
1579
|
+
owner,
|
|
1580
|
+
entries: Object.fromEntries(Object.entries(cacheFile.entries).filter(([key, entry]) => {
|
|
1581
|
+
if (filter.key !== void 0) return key !== filter.key;
|
|
1582
|
+
return entry.namespace !== filter.namespace;
|
|
1583
|
+
}))
|
|
1584
|
+
});
|
|
1317
1585
|
});
|
|
1318
1586
|
return;
|
|
1319
1587
|
}
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1588
|
+
const files = await readdir(cacheDir);
|
|
1589
|
+
for (const fileName of files) {
|
|
1590
|
+
if (!fileName.endsWith(".json")) continue;
|
|
1591
|
+
const filePath = join(cacheDir, fileName);
|
|
1592
|
+
await withCacheFileLock(filePath, async () => {
|
|
1593
|
+
const cacheFile = await readCacheFilePath(filePath);
|
|
1594
|
+
if (cacheFile === null) return;
|
|
1595
|
+
const entries = Object.fromEntries(Object.entries(cacheFile.entries).filter(([key]) => key !== filter.key));
|
|
1596
|
+
await writeOrRemoveCacheFile(cacheDir, {
|
|
1597
|
+
version: 1,
|
|
1598
|
+
owner: cacheFile.owner,
|
|
1599
|
+
entries
|
|
1600
|
+
});
|
|
1601
|
+
});
|
|
1328
1602
|
}
|
|
1329
1603
|
}
|
|
1330
1604
|
};
|
|
@@ -1356,8 +1630,16 @@ function createBufferedCacheStore(backingStore) {
|
|
|
1356
1630
|
}
|
|
1357
1631
|
};
|
|
1358
1632
|
}
|
|
1359
|
-
function
|
|
1360
|
-
|
|
1633
|
+
/**
 * Normalize the configured per-eval cache entry cap.
 *
 * @param {number | undefined} value - Requested cap.
 * @returns {number} The cap rounded down to a whole number, or
 *   `defaultMaxEntriesPerEval` when the input is missing, not a finite number,
 *   or rounds down to less than 1.
 */
function normalizeMaxEntries(value) {
	if (typeof value !== "number" || !Number.isFinite(value)) return defaultMaxEntriesPerEval;
	// Floor before validating: a fraction such as 0.5 used to pass the `> 0`
	// check and then become a cap of 0.
	const wholeEntries = Math.floor(value);
	return wholeEntries >= 1 ? wholeEntries : defaultMaxEntriesPerEval;
}
|
|
1637
|
+
/**
 * Derive the cache owner from a namespace.
 *
 * @param {string} namespace - Cache namespace string.
 * @returns {string} The text before the first `__`, or the whole namespace
 *   when there is no `__` or that prefix is empty.
 */
function ownerFromNamespace(namespace) {
	const separatorIndex = namespace.indexOf("__");
	const prefix = separatorIndex === -1 ? namespace : namespace.slice(0, separatorIndex);
	return prefix.length > 0 ? prefix : namespace;
}
|
|
1641
|
+
/**
 * Path of the JSON cache file for an owner.
 *
 * @param {string} cacheDir - Cache directory.
 * @param {string} owner - Owner name; passed through `sanitizeSegment$1`.
 * @returns {string} `<cacheDir>/<sanitized owner>.json`.
 */
function ownerPath(cacheDir, owner) {
	const fileName = `${sanitizeSegment$1(owner)}.json`;
	return join(cacheDir, fileName);
}
|
|
1362
1644
|
function toPendingKey(namespace, keyHash) {
|
|
1363
1645
|
return `${namespace}::${keyHash}`;
|
|
@@ -1365,6 +1647,69 @@ function toPendingKey(namespace, keyHash) {
|
|
|
1365
1647
|
/**
 * Replace every character outside `a-z`, `A-Z`, `0-9`, `_`, `.` and `-` with `_`.
 *
 * @param {string} segment - Raw segment.
 * @returns {string} Sanitized segment of the same length.
 */
function sanitizeSegment$1(segment) {
	const disallowedCharacters = /[^a-zA-Z0-9_.-]/g;
	return segment.replace(disallowedCharacters, "_");
}
|
|
1650
|
+
/**
 * Read the cache file belonging to an owner.
 *
 * @param {string} cacheDir - Cache directory.
 * @param {string} owner - Cache owner.
 * @returns {Promise<object | null>} Whatever `readCacheFilePath` yields for
 *   the owner's file path.
 */
async function readCacheFile(cacheDir, owner) {
	const filePath = ownerPath(cacheDir, owner);
	return readCacheFilePath(filePath);
}
|
|
1653
|
+
/**
 * Load and validate a cache file from disk.
 *
 * @param {string} filePath - Path of the cache file.
 * @returns {Promise<object | null>} Data validated by `cacheFileSchema`, or
 *   `null` when the file is missing, cannot be read, is not parseable JSON, or
 *   fails schema validation.
 */
async function readCacheFilePath(filePath) {
	if (!existsSync(filePath)) return null;
	const read = await resultify(() => readFile(filePath, "utf-8"));
	if (read.error) return null;
	const parsedJson = safeJsonParse(read.value);
	if (parsedJson === null) return null;
	const validation = cacheFileSchema.safeParse(parsedJson);
	return validation.success ? validation.data : null;
}
|
|
1663
|
+
/**
 * Persist a cache file, or delete it when it holds no entries.
 *
 * @param {string} cacheDir - Cache directory.
 * @param {{owner: string, entries: object}} cacheFile - File content; `owner`
 *   selects the path.
 * @returns {Promise<void>}
 */
async function writeOrRemoveCacheFile(cacheDir, cacheFile) {
	const hasEntries = Object.keys(cacheFile.entries).length > 0;
	if (hasEntries) {
		await writeCacheFile(cacheDir, cacheFile);
		return;
	}
	// `force` makes removal a no-op when the file is already gone.
	await rm(ownerPath(cacheDir, cacheFile.owner), { force: true });
}
|
|
1670
|
+
/**
 * Write a cache file via a temporary file followed by a rename.
 *
 * Ensures `cacheDir` exists, writes pretty-printed JSON to
 * `<target>.<pid>.tmp`, then renames it onto the target path.
 *
 * @param {string} cacheDir - Cache directory (created recursively if needed).
 * @param {{owner: string}} cacheFile - Content to serialize; `owner` selects the path.
 * @returns {Promise<void>}
 */
async function writeCacheFile(cacheDir, cacheFile) {
	await mkdir(cacheDir, { recursive: true });
	const destination = ownerPath(cacheDir, cacheFile.owner);
	const staging = `${destination}.${process.pid.toString()}.tmp`;
	const payload = JSON.stringify(cacheFile, null, 2);
	await writeFile(staging, payload);
	await rename(staging, destination);
}
|
|
1677
|
+
/**
 * Trim a cache entry map down to at most `maxEntries` entries.
 *
 * The entry stored under `protectedKey` (if it is an own property) is always
 * kept; remaining slots go to the entries with the greatest `storedAt`, with
 * ties resolved in the map's original order. The result is keyed by
 * `entry.key` and ordered by key.
 *
 * @param {Record<string, {key: string, storedAt: string}>} entries - Candidate entries.
 * @param {number} maxEntries - Maximum number of entries to keep.
 * @param {string} protectedKey - Key that must survive pruning.
 * @returns {Record<string, object>} New pruned map; `entries` is not mutated.
 */
function pruneEntries(entries, maxEntries, protectedKey) {
	// Object.values returns a fresh array, so in-place sort is safe. Returning 0
	// for equal timestamps keeps the comparator consistent and the order stable.
	const newestFirst = Object.values(entries).sort((a, b) => {
		if (a.storedAt === b.storedAt) return 0;
		return a.storedAt < b.storedAt ? 1 : -1;
	});
	const kept = /* @__PURE__ */ new Map();
	// Own-property check so keys like "toString" never resolve to inherited members.
	if (Object.hasOwn(entries, protectedKey)) {
		const protectedEntry = entries[protectedKey];
		if (protectedEntry !== void 0) kept.set(protectedEntry.key, protectedEntry);
	}
	for (const entry of newestFirst) {
		if (kept.size >= maxEntries) break;
		kept.set(entry.key, entry);
	}
	const byKey = [...kept.values()].sort((a, b) => {
		if (a.key === b.key) return 0;
		return a.key < b.key ? -1 : 1;
	});
	return Object.fromEntries(byKey.map((entry) => [entry.key, entry]));
}
|
|
1688
|
+
/**
 * Run `fn` while holding the lock directory `<filePath>.lock`.
 *
 * The lock is removed whether `fn` resolves or rejects, and whatever `fn`
 * throws is propagated as-is.
 *
 * @param {string} filePath - File the lock protects.
 * @param {() => Promise<unknown>} fn - Critical section.
 * @returns {Promise<unknown>} The value `fn` resolves to.
 * @throws Whatever `acquireLock`, `fn`, or the lock removal throws.
 */
async function withCacheFileLock(filePath, fn) {
	const lockPath = `${filePath}.lock`;
	await acquireLock(lockPath);
	try {
		return await fn();
	} finally {
		// Release in `finally` so no failure mode of `fn` can leave the lock behind.
		await rm(lockPath, {
			recursive: true,
			force: true
		});
	}
}
|
|
1698
|
+
/**
 * Acquire a lock by creating the directory `lockPath`.
 *
 * Only "already exists" (`EEXIST`) is treated as contention and retried every
 * 20 ms for up to 5 seconds. Any other `mkdir` failure (missing parent,
 * permissions, …) is thrown immediately.
 *
 * @param {string} lockPath - Directory whose creation represents the lock.
 * @returns {Promise<void>} Resolves once the directory has been created.
 * @throws {Error} A timeout error (with the last `EEXIST` error as `cause`)
 *   when the lock stays held, or the underlying `mkdir` error otherwise.
 */
async function acquireLock(lockPath) {
	const timeoutMs = 5e3;
	const retryDelayMs = 20;
	const startedAt = Date.now();
	let lastError;
	do {
		try {
			await mkdir(lockPath, { recursive: false });
			return;
		} catch (error) {
			// Waiting cannot fix anything but contention, so fail fast otherwise.
			if (error?.code !== "EEXIST") throw error;
			lastError = error;
		}
		await sleep(retryDelayMs);
	} while (Date.now() - startedAt < timeoutMs);
	throw new Error(`Timed out acquiring cache lock at ${lockPath}`, { cause: lastError });
}
|
|
1710
|
+
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>} Resolves when the timer fires.
 */
function sleep(ms) {
	return new Promise((wake) => {
		setTimeout(wake, ms);
	});
}
|
|
1368
1713
|
function safeJsonParse(text) {
|
|
1369
1714
|
const parsed = resultify(() => JSON.parse(text));
|
|
1370
1715
|
if (parsed.error) return null;
|
|
@@ -2730,7 +3075,8 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
2730
3075
|
await mkdir(join(localStateDir, "runs"), { recursive: true });
|
|
2731
3076
|
cacheStore = createFsCacheStore({
|
|
2732
3077
|
workspaceRoot,
|
|
2733
|
-
dir: config.cache?.dir
|
|
3078
|
+
dir: config.cache?.dir,
|
|
3079
|
+
maxEntriesPerEval: config.cache?.maxEntriesPerEval
|
|
2734
3080
|
});
|
|
2735
3081
|
await loadPersistedRuns();
|
|
2736
3082
|
await runner.refreshDiscovery();
|
|
@@ -3128,6 +3474,9 @@ function parseArgs(argv) {
|
|
|
3128
3474
|
const args = {
|
|
3129
3475
|
command: "help",
|
|
3130
3476
|
subcommand: void 0,
|
|
3477
|
+
showHelp: false,
|
|
3478
|
+
helpTopic: "global",
|
|
3479
|
+
unknownHelpTarget: void 0,
|
|
3131
3480
|
evalIds: [],
|
|
3132
3481
|
caseIds: [],
|
|
3133
3482
|
trials: 1,
|
|
@@ -3138,19 +3487,28 @@ function parseArgs(argv) {
|
|
|
3138
3487
|
all: false
|
|
3139
3488
|
};
|
|
3140
3489
|
const command = argv[0];
|
|
3141
|
-
if (command === "
|
|
3490
|
+
if (command === "--help" || command === "-h") {
|
|
3491
|
+
args.showHelp = true;
|
|
3492
|
+
return args;
|
|
3493
|
+
}
|
|
3494
|
+
if (isCliCommand(command)) {
|
|
3495
|
+
args.command = command;
|
|
3496
|
+
args.helpTopic = command === "help" ? "global" : command;
|
|
3497
|
+
} else if (command !== void 0 && !command.startsWith("-")) args.unknownHelpTarget = command;
|
|
3142
3498
|
let cursor = 1;
|
|
3143
3499
|
if (args.command === "cache") {
|
|
3144
3500
|
const sub = argv[cursor];
|
|
3145
3501
|
if (sub === "list" || sub === "clear") {
|
|
3146
3502
|
args.subcommand = sub;
|
|
3503
|
+
args.helpTopic = `cache ${sub}`;
|
|
3147
3504
|
cursor++;
|
|
3148
|
-
}
|
|
3505
|
+
} else if (sub !== void 0 && !sub.startsWith("-")) args.unknownHelpTarget = `cache ${sub}`;
|
|
3149
3506
|
}
|
|
3150
3507
|
for (let i = cursor; i < argv.length; i++) {
|
|
3151
3508
|
const arg = argv[i];
|
|
3152
3509
|
const next = argv[i + 1];
|
|
3153
|
-
if (arg === "--
|
|
3510
|
+
if (arg === "--help" || arg === "-h") args.showHelp = true;
|
|
3511
|
+
else if (arg === "--eval" && next) {
|
|
3154
3512
|
args.evalIds.push(...next.split(","));
|
|
3155
3513
|
i++;
|
|
3156
3514
|
} else if (arg === "--case" && next) {
|
|
@@ -3180,6 +3538,15 @@ function parseArgs(argv) {
|
|
|
3180
3538
|
*/
|
|
3181
3539
|
async function runCli(argv) {
|
|
3182
3540
|
const args = parseArgs(argv);
|
|
3541
|
+
if (args.showHelp) {
|
|
3542
|
+
if (args.unknownHelpTarget !== void 0) {
|
|
3543
|
+
console.error(`No help found for "${args.unknownHelpTarget}".`);
|
|
3544
|
+
process.exit(1);
|
|
3545
|
+
return;
|
|
3546
|
+
}
|
|
3547
|
+
printHelp(args.helpTopic);
|
|
3548
|
+
return;
|
|
3549
|
+
}
|
|
3183
3550
|
switch (args.command) {
|
|
3184
3551
|
case "app":
|
|
3185
3552
|
await commandApp(args);
|
|
@@ -3194,10 +3561,13 @@ async function runCli(argv) {
|
|
|
3194
3561
|
await commandCache(args);
|
|
3195
3562
|
break;
|
|
3196
3563
|
default:
|
|
3197
|
-
printHelp();
|
|
3564
|
+
printHelp(args.helpTopic);
|
|
3198
3565
|
break;
|
|
3199
3566
|
}
|
|
3200
3567
|
}
|
|
3568
|
+
/**
 * Check whether a CLI token is one of the known top-level commands.
 *
 * @param {unknown} command - First CLI argument (may be `undefined`).
 * @returns {boolean} `true` for exactly "app", "list", "run", "cache" or "help".
 */
function isCliCommand(command) {
	switch (command) {
		case "app":
		case "list":
		case "run":
		case "cache":
		case "help":
			return true;
		default:
			return false;
	}
}
|
|
3201
3571
|
const currentDir = dirname(fileURLToPath(import.meta.url));
|
|
3202
3572
|
const repoRoot = resolve(currentDir, "../../..");
|
|
3203
3573
|
const pnpmCommand = process.platform === "win32" ? "pnpm.cmd" : "pnpm";
|
|
@@ -3246,8 +3616,8 @@ async function commandApp(args) {
|
|
|
3246
3616
|
const { serve } = await import("@hono/node-server");
|
|
3247
3617
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
3248
3618
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
3249
|
-
const appModule = await import("./app-
|
|
3250
|
-
const runnerModule = await import("./runner-
|
|
3619
|
+
const appModule = await import("./app-DXYLqlWb.mjs");
|
|
3620
|
+
const runnerModule = await import("./runner-CToL8eJs.mjs");
|
|
3251
3621
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
3252
3622
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
3253
3623
|
await runnerModule.initRunner();
|
|
@@ -3378,7 +3748,7 @@ async function commandCache(args) {
|
|
|
3378
3748
|
process.exit(1);
|
|
3379
3749
|
return;
|
|
3380
3750
|
}
|
|
3381
|
-
printHelp();
|
|
3751
|
+
printHelp(args.helpTopic);
|
|
3382
3752
|
}
|
|
3383
3753
|
async function waitForRunCompletion(runner, runId) {
|
|
3384
3754
|
return new Promise((resolvePromise) => {
|
|
@@ -3393,7 +3763,69 @@ async function waitForRunCompletion(runner, runId) {
|
|
|
3393
3763
|
check();
|
|
3394
3764
|
});
|
|
3395
3765
|
}
|
|
3396
|
-
function printHelp() {
|
|
3766
|
+
function printHelp(topic = "global") {
|
|
3767
|
+
if (topic === "app") {
|
|
3768
|
+
console.info(`
|
|
3769
|
+
agent-evals app - Start server with UI
|
|
3770
|
+
|
|
3771
|
+
Usage:
|
|
3772
|
+
agent-evals app [flags]
|
|
3773
|
+
|
|
3774
|
+
Flags:
|
|
3775
|
+
--port <n> Server port (default: 4100)
|
|
3776
|
+
--help, -h Show this help
|
|
3777
|
+
`);
|
|
3778
|
+
return;
|
|
3779
|
+
}
|
|
3780
|
+
if (topic === "list") {
|
|
3781
|
+
console.info(`
|
|
3782
|
+
agent-evals list - List discovered evals
|
|
3783
|
+
|
|
3784
|
+
Usage:
|
|
3785
|
+
agent-evals list [flags]
|
|
3786
|
+
|
|
3787
|
+
Flags:
|
|
3788
|
+
--help, -h Show this help
|
|
3789
|
+
`);
|
|
3790
|
+
return;
|
|
3791
|
+
}
|
|
3792
|
+
if (topic === "run") {
|
|
3793
|
+
console.info(`
|
|
3794
|
+
agent-evals run - Run evals
|
|
3795
|
+
|
|
3796
|
+
Usage:
|
|
3797
|
+
agent-evals run [flags]
|
|
3798
|
+
|
|
3799
|
+
Flags:
|
|
3800
|
+
--eval <id> Run specific eval(s) (comma-separated)
|
|
3801
|
+
--case <id> Run specific case(s) (comma-separated)
|
|
3802
|
+
--trials <n> Number of trials per case
|
|
3803
|
+
--json Output run summary as JSON
|
|
3804
|
+
--cache <use|bypass|refresh> Cache mode for this run (default: use)
|
|
3805
|
+
--no-cache Shortcut for --cache bypass
|
|
3806
|
+
--refresh-cache Shortcut for --cache refresh
|
|
3807
|
+
--clear-cache Clear the cache before starting the run
|
|
3808
|
+
--help, -h Show this help
|
|
3809
|
+
`);
|
|
3810
|
+
return;
|
|
3811
|
+
}
|
|
3812
|
+
if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
|
|
3813
|
+
console.info(`
|
|
3814
|
+
agent-evals cache - Manage cached operation entries
|
|
3815
|
+
|
|
3816
|
+
Usage:
|
|
3817
|
+
agent-evals cache list [flags]
|
|
3818
|
+
agent-evals cache clear --eval <id>
|
|
3819
|
+
agent-evals cache clear --all
|
|
3820
|
+
|
|
3821
|
+
Flags:
|
|
3822
|
+
--eval <id> Clear entries for specific eval(s) (comma-separated)
|
|
3823
|
+
--all Confirm clearing every cached entry
|
|
3824
|
+
--json Output cache listing as JSON
|
|
3825
|
+
--help, -h Show this help
|
|
3826
|
+
`);
|
|
3827
|
+
return;
|
|
3828
|
+
}
|
|
3397
3829
|
console.info(`
|
|
3398
3830
|
agent-evals - LLM/Agent eval runner
|
|
3399
3831
|
|
|
@@ -3416,7 +3848,8 @@ Options:
|
|
|
3416
3848
|
--no-cache Shortcut for --cache bypass
|
|
3417
3849
|
--refresh-cache Shortcut for --cache refresh
|
|
3418
3850
|
--clear-cache Clear the cache before starting the run
|
|
3851
|
+
--help, -h Show help
|
|
3419
3852
|
`);
|
|
3420
3853
|
}
|
|
3421
3854
|
//#endregion
|
|
3422
|
-
export {
|
|
3855
|
+
export { fileRefSchema as $, evalSummarySchema as A, evalChartsConfigSchema as B, assertionFailureSchema as C, evalStatAggregateSchema as D, evalFreshnessStatusSchema as E, evalChartColorSchema as F, traceDisplayConfigSchema as G, traceAttributeDisplayInputSchema as H, evalChartConfigSchema as I, traceSpanSchema as J, traceDisplayInputConfigSchema as K, evalChartMetricSchema as L, evalChartAggregateSchema as M, evalChartAxisSchema as N, evalStatItemSchema as O, evalChartBuiltinMetricSchema as P, columnKindSchema as Q, evalChartTooltipExtraSchema as R, spanCacheOptionsSchema as S, caseRowSchema as T, traceAttributeDisplayPlacementSchema as U, traceAttributeDisplayFormatSchema as V, traceAttributeDisplaySchema as W, columnDefSchema as X, cellValueSchema as Y, columnFormatSchema as Z, cacheListItemSchema as _, repoFile as _t, sseEnvelopeSchema as a, evalSpan as at, cacheRecordingSchema as b, deriveScopedSummaryFromCases as c, hashCacheKeySync as ct, runManifestSchema as d, getCurrentScope as dt, jsonCellSchema as et, runSummarySchema as f, incrementEvalOutput as ft, cacheFileSchema as g, setScopeCacheContext as gt, cacheEntrySchema as h, setEvalOutput as ht, updateManualScoreRequestSchema as i, buildTraceTree as it, scoreTraceSchema as j, evalStatsConfigSchema as k, deriveStatusFromCaseRows as l, EvalAssertionError as lt, trialSelectionModeSchema as m, runInEvalScope as mt, createRunner as n, repoFileRefSchema as nt, getEvalTitle as o, evalTracer as ot, agentEvalsConfigSchema as p, isInEvalScope as pt, traceSpanKindSchema as q, createRunRequestSchema as r, runArtifactRefSchema as rt, getEvalDisplayStatus as s, hashCacheKey as st, runCli as t, numberDisplayOptionsSchema as tt, deriveStatusFromChildStatuses as u, evalAssert as ut, cacheModeSchema as v, defineEval as vt, caseDetailSchema as w, serializedCacheSpanSchema as x, cacheRecordingOpSchema as y, getEvalRegistry as yt, evalChartTypeSchema as z };
|