@ls-stack/agent-eval 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CKa9TjXw.mjs → app-CljutWb7.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-B2GWGl5i.css +1 -0
- package/dist/apps/web/dist/assets/index-ibhQ_P7i.js +109 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-CwEFLP0w.mjs → cli-B0QmsWCU.mjs} +506 -77
- package/dist/index.d.mts +147 -81
- package/dist/index.mjs +3 -3
- package/dist/{runner-CD5aDJ0C.mjs → runner-BY-y4OzF.mjs} +2 -2
- package/dist/{runner-Ck4X0H3p.mjs → runner-CsSJwWE4.mjs} +1 -1
- package/dist/src-Bivx1C6b.mjs +2 -0
- package/package.json +3 -3
- package/dist/apps/web/dist/assets/index-BUz24J7O.css +0 -1
- package/dist/apps/web/dist/assets/index-Dm50Ynbs.js +0 -109
- package/dist/src-BDRmaWFu.mjs +0 -2
|
@@ -2,6 +2,7 @@ import { createHash } from "node:crypto";
|
|
|
2
2
|
import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
|
|
3
3
|
import { dirname, extname, join, relative, resolve } from "node:path";
|
|
4
4
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
5
|
+
import { Buffer as Buffer$1 } from "node:buffer";
|
|
5
6
|
import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
|
|
6
7
|
import { z } from "zod/v4";
|
|
7
8
|
import { watch } from "chokidar";
|
|
@@ -206,6 +207,15 @@ function noopActiveSpan() {
|
|
|
206
207
|
setAttributes() {}
|
|
207
208
|
};
|
|
208
209
|
}
|
|
210
|
+
/**
 * Build an inert external span handle.
 *
 * Every mutator on the returned handle does nothing; only `id` carries
 * information. `startExternalSpan` returns this when there is no current
 * eval scope.
 *
 * @param id Span id to expose on the handle.
 * @returns Handle with `id` plus no-op `setName`, `setAttribute`,
 * `setAttributes` and `end`.
 */
function noopExternalSpan(id) {
	const handle = {
		id,
		setName: () => {},
		setAttribute: () => {},
		setAttributes: () => {},
		end: () => {}
	};
	return handle;
}
|
|
209
219
|
function mergeSpanAttributes(span, attributes) {
|
|
210
220
|
span.attributes = {
|
|
211
221
|
...span.attributes,
|
|
@@ -225,6 +235,127 @@ function createSpanHandle(span) {
|
|
|
225
235
|
}
|
|
226
236
|
};
|
|
227
237
|
}
|
|
238
|
+
/**
 * Build a live handle for an externally managed span.
 *
 * Each method forwards to `updateExternalSpan` / `endExternalSpan` with the
 * captured `id`, so the span is looked up again on every call.
 *
 * @param id Id of the span the handle controls.
 * @returns Handle exposing `id`, `setName`, `setAttribute`, `setAttributes`
 * and `end`.
 */
function createExternalSpanHandle(id) {
	const pushAttributes = (attributes) => {
		updateExternalSpan({
			id,
			attributes
		});
	};
	return {
		id,
		setName(value) {
			updateExternalSpan({
				id,
				name: value
			});
		},
		setAttribute(key, value) {
			pushAttributes({ [key]: value });
		},
		setAttributes(value) {
			pushAttributes(value);
		},
		end(info = {}) {
			// The handle's own id always wins over any id present in `info`.
			const payload = {
				...info,
				id
			};
			endExternalSpan(payload);
		}
	};
}
|
|
267
|
+
/**
 * Normalize a caller-supplied timestamp to an ISO-8601 string.
 *
 * @param value `undefined`/`null` (use the current time), a string (returned
 * unchanged, not validated), a number (interpreted as epoch milliseconds), or
 * an object exposing `toISOString()` such as a `Date`.
 * @returns ISO timestamp string.
 * @throws RangeError when a number or `Date` does not represent a valid time.
 */
function toIsoTimestamp(value) {
	// `null` previously fell through to `value.toISOString()` and crashed.
	if (value == null) return (/* @__PURE__ */ new Date()).toISOString();
	if (typeof value === "string") return value;
	// Numeric epoch-millisecond timestamps have no `toISOString` of their own.
	if (typeof value === "number") return new Date(value).toISOString();
	return value.toISOString();
}
|
|
272
|
+
/**
 * Look up a span by id among the spans recorded on a scope.
 *
 * @param scope Object whose `spans` array is searched.
 * @param id Span id to match with strict equality.
 * @returns The first matching span, or `undefined` when none matches.
 */
function findSpan(scope, id) {
	for (const candidate of scope.spans) {
		if (candidate.id === id) return candidate;
	}
	return void 0;
}
|
|
275
|
+
/**
 * Decide which parent id an external span should be attached to.
 *
 * @param scope Object whose `activeSpanStack` holds the currently open spans.
 * @param parentId Explicit parent id; any value other than `undefined`
 * (including `null`) is returned as-is.
 * @returns The explicit parent id, else the id of the last entry on the
 * active span stack, else `null`.
 */
function resolveExternalParentId(scope, parentId) {
	if (parentId === void 0) {
		const innermost = scope.activeSpanStack.at(-1);
		return innermost?.id ?? null;
	}
	return parentId;
}
|
|
279
|
+
/**
 * Open an externally managed span in the current eval scope.
 *
 * Without a current scope nothing is recorded and a no-op handle carrying the
 * id is returned. When a span with the same id already exists in the scope it
 * is reset to the "running" state in place instead of being duplicated.
 *
 * @param info Span description: optional `id`, `parentId`, `startedAt`,
 * `attributes`, plus `kind` and `name`.
 * @returns Handle for updating and ending the span.
 */
function startExternalSpan(info) {
	const id = info.id ?? generateSpanId();
	const scope = getCurrentScope();
	if (!scope) return noopExternalSpan(id);
	const existing = findSpan(scope, id);
	if (!existing) {
		scope.spans.push({
			id,
			parentId: resolveExternalParentId(scope, info.parentId),
			caseId: scope.caseId,
			kind: info.kind,
			name: info.name,
			startedAt: toIsoTimestamp(info.startedAt),
			endedAt: null,
			status: "running",
			attributes: info.attributes
		});
		return createExternalSpanHandle(id);
	}
	existing.parentId = resolveExternalParentId(scope, info.parentId);
	existing.kind = info.kind;
	existing.name = info.name;
	existing.startedAt = toIsoTimestamp(info.startedAt);
	existing.status = "running";
	existing.endedAt = null;
	// Attributes already on the span are kept unless the caller sends new ones.
	if (info.attributes !== void 0) existing.attributes = info.attributes;
	return createExternalSpanHandle(id);
}
|
|
307
|
+
/**
 * Apply a partial update to an externally managed span.
 *
 * Does nothing when there is no current scope or no span with `info.id`.
 * `name`, `status` and `error` overwrite the stored value when defined;
 * `attributes` are merged through `mergeSpanAttributes`.
 *
 * @param info Update payload carrying `id` and any fields to change.
 */
function updateExternalSpan(info) {
	const scope = getCurrentScope();
	const span = scope ? findSpan(scope, info.id) : void 0;
	if (!span) return;
	for (const field of [
		"name",
		"status",
		"error"
	]) {
		if (info[field] !== void 0) span[field] = info[field];
	}
	if (info.attributes !== void 0) mergeSpanAttributes(span, info.attributes);
}
|
|
317
|
+
/**
 * Finish an externally managed span.
 *
 * Does nothing when there is no current scope or no span with `info.id`.
 * Otherwise the payload is first applied through `updateExternalSpan`, then
 * the final status and end timestamp are written.
 *
 * @param info End payload: `id`, plus optional `status`, `error`,
 * `attributes`, `name` and `endedAt`.
 */
function endExternalSpan(info) {
	const scope = getCurrentScope();
	if (!scope) return;
	const span = findSpan(scope, info.id);
	if (!span) return;
	updateExternalSpan(info);
	// Derive the default status from the span's stored error rather than only
	// from this payload: an error recorded by an earlier update must not be
	// reported as "ok" just because the end event omitted it.
	const failed = Boolean(info.error) || Boolean(span.error);
	span.status = info.status ?? (failed ? "error" : "ok");
	span.endedAt = toIsoTimestamp(info.endedAt);
}
|
|
326
|
+
/**
 * Record a complete external span in one call.
 *
 * Without a current scope nothing is stored and only the id is returned.
 * A span with the same id already in the scope is overwritten in place,
 * including its `attributes` and `error`.
 *
 * `endedAt` resolution: `null` stays `null`, a truthy value is normalized
 * with `toIsoTimestamp`, anything else falls back to the start timestamp.
 *
 * @param info Span description: optional `id`, `parentId`, `startedAt`,
 * `endedAt`, `status`, `attributes`, `error`, plus `kind` and `name`.
 * @returns The caller-provided or generated span id.
 */
function recordExternalSpan(info) {
	const id = info.id ?? generateSpanId();
	const scope = getCurrentScope();
	if (!scope) return id;
	const startedAt = toIsoTimestamp(info.startedAt);
	let endedAt;
	if (info.endedAt === null) endedAt = null;
	else if (info.endedAt) endedAt = toIsoTimestamp(info.endedAt);
	else endedAt = startedAt;
	const fields = {
		parentId: resolveExternalParentId(scope, info.parentId),
		kind: info.kind,
		name: info.name,
		startedAt,
		endedAt,
		status: info.status ?? (info.error ? "error" : "ok"),
		attributes: info.attributes,
		error: info.error
	};
	const existing = findSpan(scope, id);
	if (existing) {
		Object.assign(existing, fields);
		return id;
	}
	const { parentId, ...rest } = fields;
	scope.spans.push({
		id,
		parentId,
		caseId: scope.caseId,
		...rest
	});
	return id;
}
|
|
228
359
|
/**
|
|
229
360
|
* Ambient handle for the active span in the current async context.
|
|
230
361
|
*
|
|
@@ -272,7 +403,7 @@ async function traceSpan(info, fn) {
|
|
|
272
403
|
if (cacheOpts !== void 0 && cacheCtx !== void 0 && scope.replayingDepth === 0) {
|
|
273
404
|
const ctx = cacheCtx;
|
|
274
405
|
const namespace = cacheOpts.namespace ?? `${ctx.evalId}__${info.name}`;
|
|
275
|
-
const keyHash = hashCacheKey({
|
|
406
|
+
const keyHash = await hashCacheKey({
|
|
276
407
|
namespace,
|
|
277
408
|
codeFingerprint: ctx.codeFingerprint,
|
|
278
409
|
key: cacheOpts.key
|
|
@@ -359,6 +490,34 @@ async function traceSpan(info, fn) {
|
|
|
359
490
|
const evalTracer = {
|
|
360
491
|
/** Run a callback inside a new trace span and record its lifecycle. */
|
|
361
492
|
span: traceSpan,
|
|
493
|
+
/**
|
|
494
|
+
* Start a span whose lifecycle is controlled by an external tracer/exporter.
|
|
495
|
+
*
|
|
496
|
+
* Calls are no-ops outside an eval case scope, except that a generated or
|
|
497
|
+
* caller-provided id is still returned for ergonomic adapter code.
|
|
498
|
+
*/
|
|
499
|
+
startSpan: startExternalSpan,
|
|
500
|
+
/**
|
|
501
|
+
* Merge updates into an externally managed span that was started earlier.
|
|
502
|
+
*
|
|
503
|
+
* This is intended for observability exporters that receive span update
|
|
504
|
+
* events before the final end event.
|
|
505
|
+
*/
|
|
506
|
+
updateSpan: updateExternalSpan,
|
|
507
|
+
/**
|
|
508
|
+
* Finish an externally managed span and attach final attributes or errors.
|
|
509
|
+
*
|
|
510
|
+
* Missing spans are ignored so exporter adapters can safely forward events
|
|
511
|
+
* even when they are emitted outside an eval case scope.
|
|
512
|
+
*/
|
|
513
|
+
endSpan: endExternalSpan,
|
|
514
|
+
/**
|
|
515
|
+
* Record a complete external span in one call.
|
|
516
|
+
*
|
|
517
|
+
* Use this when an upstream tracer only exposes completed spans rather than
|
|
518
|
+
* start/update/end events.
|
|
519
|
+
*/
|
|
520
|
+
recordSpan: recordExternalSpan,
|
|
362
521
|
/** Record a named point-in-time value alongside the trace. */
|
|
363
522
|
checkpoint(name, data) {
|
|
364
523
|
const scope = getCurrentScope();
|
|
@@ -412,9 +571,103 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
412
571
|
checkpoints
|
|
413
572
|
};
|
|
414
573
|
}
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
574
|
+
/**
 * Marker wrapper around an already-serialized cache key fragment.
 *
 * `stringifyCacheKeyValue` returns the wrapped `value` unchanged when it sees
 * an instance of this class.
 */
var SerializedCacheKeyValue = class {
	constructor(value) {
		this.value = value;
	}
};
|
|
580
|
+
/**
 * Hash the components of a cache key into a hex SHA-256 digest.
 *
 * The input is first passed through `materializeAsyncCacheKeyValue`, which
 * reads native `Blob` and `File` values asynchronously so they are hashed by
 * content. Use `hashCacheKeySync` only when the key contains no async values.
 */
async function hashCacheKey(input) {
	const materialized = await materializeAsyncCacheKeyValue(input);
	return hashCacheKeySyncMaterialized(materialized);
}
|
|
589
|
+
/**
 * Synchronously hash cache key components into a hex SHA-256 digest.
 *
 * Supports JSON-like data and in-memory binary values such as `Buffer`,
 * `ArrayBuffer`, and typed arrays. Native `Blob` or `File` values cannot be
 * content-hashed here; they are keyed by their metadata only.
 */
function hashCacheKeySync(input) {
	const digest = hashCacheKeySyncMaterialized(input);
	return digest;
}
|
|
597
|
+
/**
 * Hash an already-materialized cache key.
 *
 * The key is serialized with `getCompositeKey`, using
 * `stringifyCacheKeyValue` as its `stringify` option, and the resulting
 * string is digested with SHA-256.
 *
 * @param input Cache key components.
 * @returns Hex-encoded SHA-256 digest.
 */
function hashCacheKeySyncMaterialized(input) {
	const compositeKey = getCompositeKey(input, { stringify: stringifyCacheKeyValue });
	const hasher = createHash("sha256");
	hasher.update(compositeKey);
	return hasher.digest("hex");
}
|
|
600
|
+
/**
 * Custom serializer for cache key values that need special handling.
 *
 * Handles, in this order: pre-serialized wrappers, `Buffer`, `ArrayBuffer`,
 * `SharedArrayBuffer`, array buffer views (tagged with their constructor
 * name), `File`, and `Blob`. Binary values are represented by a SHA-256 of
 * their bytes; `File`/`Blob` are represented by metadata only because their
 * content cannot be read synchronously.
 *
 * @param value Any cache key value.
 * @returns A tagged string for the handled types, otherwise `undefined`.
 */
function stringifyCacheKeyValue(value) {
	const tagged = (tag, payload) => `$${tag}:${payload}`;
	if (value instanceof SerializedCacheKeyValue) return value.value;
	if (Buffer$1.isBuffer(value)) return tagged("buffer", hashBytes(value));
	if (isArrayBuffer(value)) return tagged("arrayBuffer", hashBytes(new Uint8Array(value)));
	if (isSharedArrayBuffer(value)) return tagged("sharedArrayBuffer", hashBytes(new Uint8Array(value)));
	if (isArrayBufferView(value)) {
		// Hash only the window this view covers, not the whole backing buffer.
		const { buffer, byteOffset, byteLength } = value;
		const bytes = new Uint8Array(buffer, byteOffset, byteLength);
		return tagged(value.constructor.name, hashBytes(bytes));
	}
	// `File` must be tested before `Blob`: the `$file` branch has to win for
	// values that satisfy both checks.
	if (isFile$1(value)) {
		const { lastModified, name, size, type } = value;
		return tagged("file", getCompositeKey({
			lastModified,
			name,
			size,
			type
		}));
	}
	if (isBlob$1(value)) {
		const { size, type } = value;
		return tagged("blob", getCompositeKey({
			size,
			type
		}));
	}
	return void 0;
}
|
|
620
|
+
/**
 * Recursively replace values that need asynchronous serialization (native
 * `Blob`/`File`) with `SerializedCacheKeyValue` wrappers, so the result can be
 * hashed synchronously.
 *
 * Only arrays and plain objects (prototype `Object.prototype` or `null`) are
 * traversed. Any other object, for example a `Date`, is returned untouched so
 * it reaches the serializer exactly as it would on the synchronous path.
 * Rebuilding such objects from `Object.entries` would turn them into `{}` and
 * make distinct keys hash identically.
 *
 * @param value Cache key value to materialize.
 * @param refs Containers currently being traversed, used for cycle detection.
 * @returns The value with async-only leaves replaced.
 * @throws Error "Circular reference detected" when an array or plain object
 * contains itself.
 */
async function materializeAsyncCacheKeyValue(value, refs = /* @__PURE__ */ new WeakSet()) {
	const serialized = await stringifyAsyncCacheKeyValue(value);
	if (serialized !== void 0) return new SerializedCacheKeyValue(serialized);
	// Values the sync serializer already understands are passed through as-is.
	if (stringifyCacheKeyValue(value) !== void 0) return value;
	if (!value || typeof value !== "object") return value;
	const isArray = Array.isArray(value);
	if (!isArray) {
		const prototype = Object.getPrototypeOf(value);
		if (prototype !== Object.prototype && prototype !== null) return value;
	}
	// Arrays are tracked too: a self-containing array used to recurse forever.
	if (refs.has(value)) throw new Error("Circular reference detected");
	refs.add(value);
	try {
		if (isArray) {
			const items = [];
			for (const item of value) items.push(await materializeAsyncCacheKeyValue(item, refs));
			return items;
		}
		const entries = [];
		for (const [key, entryValue] of Object.entries(value)) entries.push([key, await materializeAsyncCacheKeyValue(entryValue, refs)]);
		return Object.fromEntries(entries);
	} finally {
		// Only ancestors count as cycles; the same object may appear twice as a
		// sibling.
		refs.delete(value);
	}
}
|
|
637
|
+
/**
 * Serialize native `File`/`Blob` values by content.
 *
 * The bytes are read asynchronously and hashed; the digest is combined with
 * the value's metadata through `getCompositeKey`.
 *
 * @param value Any cache key value.
 * @returns A `$file:`/`$blob:` tagged string, or `undefined` for every other
 * value.
 */
async function stringifyAsyncCacheKeyValue(value) {
	const isFileValue = isFile$1(value);
	if (!isFileValue && !isBlob$1(value)) return void 0;
	const bytes = await hashBlobBytes(value);
	if (isFileValue) {
		const { lastModified, name, size, type } = value;
		return `$file:${getCompositeKey({
			bytes,
			lastModified,
			name,
			size,
			type
		})}`;
	}
	const { size, type } = value;
	return `$blob:${getCompositeKey({
		bytes,
		size,
		type
	})}`;
}
|
|
651
|
+
/**
 * Read a value's full content through `arrayBuffer()` and hash the bytes.
 *
 * @param value Object exposing an async `arrayBuffer()` method.
 * @returns Hex digest produced by `hashBytes`.
 */
async function hashBlobBytes(value) {
	const content = await value.arrayBuffer();
	const bytes = new Uint8Array(content);
	return hashBytes(bytes);
}
|
|
654
|
+
/**
 * Compute the SHA-256 digest of a byte sequence.
 *
 * @param value Data accepted by `Hash.update` (string, `Buffer`, typed array).
 * @returns Hex-encoded digest.
 */
function hashBytes(value) {
	const hasher = createHash("sha256");
	hasher.update(value);
	return hasher.digest("hex");
}
|
|
657
|
+
/**
 * Type predicate for `ArrayBuffer` instances.
 *
 * @param value Any value.
 * @returns `true` when `value instanceof ArrayBuffer`.
 */
function isArrayBuffer(value) {
	const matches = value instanceof ArrayBuffer;
	return matches;
}
|
|
660
|
+
/**
 * Type predicate for `SharedArrayBuffer` instances.
 *
 * @param value Any value.
 * @returns `true` when `value instanceof SharedArrayBuffer`.
 */
function isSharedArrayBuffer(value) {
	const matches = value instanceof SharedArrayBuffer;
	return matches;
}
|
|
663
|
+
/**
 * Type predicate for array buffer views.
 *
 * @param value Any value.
 * @returns The result of `ArrayBuffer.isView(value)`.
 */
function isArrayBufferView(value) {
	const isView = ArrayBuffer.isView(value);
	return isView;
}
|
|
666
|
+
/**
 * Type predicate for native `Blob` instances.
 *
 * @param value Any value.
 * @returns `true` when `value instanceof Blob`.
 */
function isBlob$1(value) {
	const matches = value instanceof Blob;
	return matches;
}
|
|
669
|
+
/**
 * Type predicate for native `File` instances.
 *
 * `File` is not a global in every supported runtime (Node.js only exposes it
 * globally from v20). This predicate is reached for ordinary cache key
 * values, so an unguarded `instanceof File` would throw a ReferenceError for
 * any key on such runtimes.
 *
 * @param value Any value.
 * @returns `true` when a global `File` exists and `value` is an instance of
 * it, otherwise `false`.
 */
function isFile$1(value) {
	return typeof File !== "undefined" && value instanceof File;
}
|
|
419
672
|
function toJsonSafe(value) {
|
|
420
673
|
if (value === void 0) return void 0;
|
|
@@ -578,17 +831,13 @@ const columnDefSchema = z.object({
|
|
|
578
831
|
const cellValueSchema = z.union([jsonCellSchema, fileRefSchema]);
|
|
579
832
|
//#endregion
|
|
580
833
|
//#region ../shared/src/schemas/trace.ts
|
|
581
|
-
/**
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
"scorer",
|
|
589
|
-
"checkpoint",
|
|
590
|
-
"custom"
|
|
591
|
-
]);
|
|
834
|
+
/**
 * Schema for the category ("kind") of a span recorded in a trace.
 *
 * Any non-empty string is accepted. The value is intentionally open-ended so
 * external tracers can preserve their native span kinds instead of collapsing
 * them into the built-in categories.
 */
const traceSpanKindSchema = z.string().min(1);
|
|
592
841
|
/** Schema for the supported presentation formats of trace attributes. */
|
|
593
842
|
const traceAttributeDisplayFormatSchema = z.enum([
|
|
594
843
|
"string",
|
|
@@ -989,6 +1238,12 @@ const cacheEntrySchema = z.object({
|
|
|
989
1238
|
codeFingerprint: z.string(),
|
|
990
1239
|
recording: cacheRecordingSchema
|
|
991
1240
|
});
|
|
1241
|
+
/**
 * Schema for a persisted per-owner cache file.
 *
 * One file holds every cache entry of a single owner, stored in `entries`
 * under string keys, together with a literal format `version` of 1.
 */
const cacheFileSchema = z.object({
	version: z.literal(1),
	owner: z.string(),
	entries: z.record(z.string(), cacheEntrySchema)
});
|
|
992
1247
|
//#endregion
|
|
993
1248
|
//#region ../shared/src/schemas/config.ts
|
|
994
1249
|
/** Strategy used to collapse repeated trials into one stored case result. */
|
|
@@ -1004,7 +1259,8 @@ const agentEvalsConfigSchema = z.object({
|
|
|
1004
1259
|
traceDisplay: traceDisplayInputConfigSchema.optional(),
|
|
1005
1260
|
cache: z.object({
|
|
1006
1261
|
enabled: z.boolean().optional(),
|
|
1007
|
-
dir: z.string().optional()
|
|
1262
|
+
dir: z.string().optional(),
|
|
1263
|
+
maxEntriesPerEval: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional())
|
|
1008
1264
|
}).optional()
|
|
1009
1265
|
});
|
|
1010
1266
|
//#endregion
|
|
@@ -1243,60 +1499,59 @@ const createRunRequestSchema = z.object({
|
|
|
1243
1499
|
const updateManualScoreRequestSchema = z.object({ value: z.number().min(0).max(1).nullable() });
|
|
1244
1500
|
//#endregion
|
|
1245
1501
|
//#region ../runner/src/cacheStore.ts
|
|
1502
|
+
const defaultMaxEntriesPerEval = 100;
|
|
1246
1503
|
/**
|
|
1247
1504
|
* Create a filesystem-backed cache adapter rooted at `<workspaceRoot>/<dir>`.
|
|
1248
1505
|
*
|
|
1249
|
-
*
|
|
1250
|
-
*
|
|
1506
|
+
* Cache entries are grouped into one inspectable JSON file per eval/cache
|
|
1507
|
+
* owner. Writes use a short-lived lock directory plus `<name>.tmp` + atomic
|
|
1508
|
+
* `rename` to avoid partial reads and lost updates under concurrent access.
|
|
1251
1509
|
*/
|
|
1252
1510
|
function createFsCacheStore(options) {
|
|
1253
1511
|
const cacheDir = resolve(options.workspaceRoot, options.dir ?? ".agent-evals/cache");
|
|
1512
|
+
const maxEntriesPerEval = normalizeMaxEntries(options.maxEntriesPerEval);
|
|
1254
1513
|
return {
|
|
1255
1514
|
dir() {
|
|
1256
1515
|
return cacheDir;
|
|
1257
1516
|
},
|
|
1258
1517
|
async lookup(namespace, keyHash) {
|
|
1259
|
-
|
|
1260
|
-
if (!existsSync(filePath)) return null;
|
|
1261
|
-
const json = safeJsonParse(await readFile(filePath, "utf-8"));
|
|
1262
|
-
if (json === null) return null;
|
|
1263
|
-
const parsed = cacheEntrySchema.safeParse(json);
|
|
1264
|
-
if (!parsed.success) return null;
|
|
1265
|
-
return parsed.data;
|
|
1518
|
+
return (await readCacheFile(cacheDir, ownerFromNamespace(namespace)))?.entries[keyHash] ?? null;
|
|
1266
1519
|
},
|
|
1267
1520
|
async write(entry) {
|
|
1268
|
-
const
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
await
|
|
1272
|
-
|
|
1521
|
+
const owner = ownerFromNamespace(entry.namespace);
|
|
1522
|
+
const filePath = ownerPath(cacheDir, owner);
|
|
1523
|
+
await mkdir(cacheDir, { recursive: true });
|
|
1524
|
+
await withCacheFileLock(filePath, async () => {
|
|
1525
|
+
await writeCacheFile(cacheDir, {
|
|
1526
|
+
version: 1,
|
|
1527
|
+
owner,
|
|
1528
|
+
entries: pruneEntries({
|
|
1529
|
+
...(await readCacheFile(cacheDir, owner))?.entries ?? {},
|
|
1530
|
+
[entry.key]: entry
|
|
1531
|
+
}, maxEntriesPerEval, entry.key)
|
|
1532
|
+
});
|
|
1533
|
+
});
|
|
1273
1534
|
},
|
|
1274
1535
|
async list() {
|
|
1275
1536
|
if (!existsSync(cacheDir)) return [];
|
|
1276
|
-
const
|
|
1537
|
+
const files = await readdir(cacheDir);
|
|
1277
1538
|
const items = [];
|
|
1278
|
-
for (const
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
const
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
spanKind: parsed.data.spanKind,
|
|
1295
|
-
storedAt: parsed.data.storedAt,
|
|
1296
|
-
codeFingerprint: parsed.data.codeFingerprint,
|
|
1297
|
-
sizeBytes: fileStat.size
|
|
1298
|
-
});
|
|
1299
|
-
}
|
|
1539
|
+
for (const fileName of files) {
|
|
1540
|
+
if (!fileName.endsWith(".json")) continue;
|
|
1541
|
+
const filePath = join(cacheDir, fileName);
|
|
1542
|
+
const fileStatResult = await resultify(() => stat(filePath));
|
|
1543
|
+
if (fileStatResult.error || !fileStatResult.value.isFile()) continue;
|
|
1544
|
+
const cacheFile = await readCacheFilePath(filePath);
|
|
1545
|
+
if (cacheFile === null) continue;
|
|
1546
|
+
for (const entry of Object.values(cacheFile.entries)) items.push({
|
|
1547
|
+
key: entry.key,
|
|
1548
|
+
namespace: entry.namespace,
|
|
1549
|
+
spanName: entry.spanName,
|
|
1550
|
+
spanKind: entry.spanKind,
|
|
1551
|
+
storedAt: entry.storedAt,
|
|
1552
|
+
codeFingerprint: entry.codeFingerprint,
|
|
1553
|
+
sizeBytes: Buffer.byteLength(JSON.stringify(entry), "utf8")
|
|
1554
|
+
});
|
|
1300
1555
|
}
|
|
1301
1556
|
items.sort((a, b) => a.storedAt < b.storedAt ? 1 : -1);
|
|
1302
1557
|
return items;
|
|
@@ -1310,21 +1565,36 @@ function createFsCacheStore(options) {
|
|
|
1310
1565
|
});
|
|
1311
1566
|
return;
|
|
1312
1567
|
}
|
|
1313
|
-
if (filter.namespace !== void 0
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1568
|
+
if (filter.namespace !== void 0) {
|
|
1569
|
+
const owner = ownerFromNamespace(filter.namespace);
|
|
1570
|
+
await withCacheFileLock(ownerPath(cacheDir, owner), async () => {
|
|
1571
|
+
const cacheFile = await readCacheFile(cacheDir, owner);
|
|
1572
|
+
if (cacheFile === null) return;
|
|
1573
|
+
await writeOrRemoveCacheFile(cacheDir, {
|
|
1574
|
+
version: 1,
|
|
1575
|
+
owner,
|
|
1576
|
+
entries: Object.fromEntries(Object.entries(cacheFile.entries).filter(([key, entry]) => {
|
|
1577
|
+
if (filter.key !== void 0) return key !== filter.key;
|
|
1578
|
+
return entry.namespace !== filter.namespace;
|
|
1579
|
+
}))
|
|
1580
|
+
});
|
|
1317
1581
|
});
|
|
1318
1582
|
return;
|
|
1319
1583
|
}
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1584
|
+
const files = await readdir(cacheDir);
|
|
1585
|
+
for (const fileName of files) {
|
|
1586
|
+
if (!fileName.endsWith(".json")) continue;
|
|
1587
|
+
const filePath = join(cacheDir, fileName);
|
|
1588
|
+
await withCacheFileLock(filePath, async () => {
|
|
1589
|
+
const cacheFile = await readCacheFilePath(filePath);
|
|
1590
|
+
if (cacheFile === null) return;
|
|
1591
|
+
const entries = Object.fromEntries(Object.entries(cacheFile.entries).filter(([key]) => key !== filter.key));
|
|
1592
|
+
await writeOrRemoveCacheFile(cacheDir, {
|
|
1593
|
+
version: 1,
|
|
1594
|
+
owner: cacheFile.owner,
|
|
1595
|
+
entries
|
|
1596
|
+
});
|
|
1597
|
+
});
|
|
1328
1598
|
}
|
|
1329
1599
|
}
|
|
1330
1600
|
};
|
|
@@ -1356,8 +1626,16 @@ function createBufferedCacheStore(backingStore) {
|
|
|
1356
1626
|
}
|
|
1357
1627
|
};
|
|
1358
1628
|
}
|
|
1359
|
-
function
|
|
1360
|
-
|
|
1629
|
+
/**
 * Normalize the configured per-eval cache entry limit to a positive integer.
 *
 * @param value Configured limit; may be `undefined` or any number.
 * @returns `defaultMaxEntriesPerEval` when the value is missing, not a finite
 * number, or below 1; otherwise the value rounded down.
 */
function normalizeMaxEntries(value) {
	// `value < 1` instead of `value <= 0`: a fraction such as 0.5 used to floor
	// to 0, which is not a usable limit.
	if (typeof value !== "number" || !Number.isFinite(value) || value < 1) return defaultMaxEntriesPerEval;
	return Math.floor(value);
}
|
|
1633
|
+
/**
 * Derive the cache owner from a namespace.
 *
 * The owner is the text before the first `__` separator. When there is no
 * separator, or the namespace starts with it, the whole namespace is the
 * owner.
 *
 * @param namespace Cache namespace string.
 * @returns Owner name used to pick the cache file.
 */
function ownerFromNamespace(namespace) {
	const separatorIndex = namespace.indexOf("__");
	// -1: no separator; 0: empty prefix. Both fall back to the full namespace.
	if (separatorIndex <= 0) return namespace;
	return namespace.slice(0, separatorIndex);
}
|
|
1637
|
+
/**
 * Build the path of the JSON cache file that belongs to an owner.
 *
 * @param cacheDir Cache directory.
 * @param owner Owner name; passed through `sanitizeSegment$1` before being
 * used as a file name.
 * @returns `<cacheDir>/<sanitized owner>.json`.
 */
function ownerPath(cacheDir, owner) {
	const fileName = `${sanitizeSegment$1(owner)}.json`;
	return join(cacheDir, fileName);
}
|
|
1362
1640
|
function toPendingKey(namespace, keyHash) {
|
|
1363
1641
|
return `${namespace}::${keyHash}`;
|
|
@@ -1365,6 +1643,69 @@ function toPendingKey(namespace, keyHash) {
|
|
|
1365
1643
|
function sanitizeSegment$1(segment) {
|
|
1366
1644
|
return segment.replace(/[^a-zA-Z0-9_.-]/g, "_");
|
|
1367
1645
|
}
|
|
1646
|
+
/**
 * Read and validate the cache file of an owner.
 *
 * @param cacheDir Cache directory.
 * @param owner Owner whose file is read.
 * @returns The result of `readCacheFilePath` for the owner's file path.
 */
async function readCacheFile(cacheDir, owner) {
	const filePath = ownerPath(cacheDir, owner);
	return readCacheFilePath(filePath);
}
|
|
1649
|
+
/**
 * Read a cache file from disk and validate it against `cacheFileSchema`.
 *
 * A file that is missing, unreadable, not valid JSON, or not matching the
 * schema is treated as absent.
 *
 * @param filePath Path of the cache file.
 * @returns The parsed cache file, or `null`.
 */
async function readCacheFilePath(filePath) {
	// No `existsSync` pre-check: it blocks the event loop and the file can
	// still disappear before the read. A failed read already yields `null`.
	const rawResult = await resultify(() => readFile(filePath, "utf-8"));
	if (rawResult.error) return null;
	const json = safeJsonParse(rawResult.value);
	if (json === null) return null;
	const parsed = cacheFileSchema.safeParse(json);
	if (!parsed.success) return null;
	return parsed.data;
}
|
|
1659
|
+
/**
 * Persist a cache file, or delete it from disk when it has no entries left.
 *
 * @param cacheDir Cache directory.
 * @param cacheFile Cache file content with `owner` and `entries`.
 */
async function writeOrRemoveCacheFile(cacheDir, cacheFile) {
	const hasEntries = Object.keys(cacheFile.entries).length !== 0;
	if (hasEntries) {
		await writeCacheFile(cacheDir, cacheFile);
		return;
	}
	// `force` makes removal a no-op when the file is already gone.
	await rm(ownerPath(cacheDir, cacheFile.owner), { force: true });
}
|
|
1666
|
+
/**
 * Write a cache file through a temporary file and a rename.
 *
 * The content is first written to `<file>.<pid>.tmp` and then renamed over
 * the target, so readers never observe a partially written file.
 *
 * @param cacheDir Cache directory; created when missing.
 * @param cacheFile Cache file content; `owner` selects the target file.
 * @throws Any serialization, write or rename error, after the temporary file
 * has been removed.
 */
async function writeCacheFile(cacheDir, cacheFile) {
	await mkdir(cacheDir, { recursive: true });
	const filePath = ownerPath(cacheDir, cacheFile.owner);
	const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
	try {
		await writeFile(tmpPath, JSON.stringify(cacheFile, null, 2));
		await rename(tmpPath, filePath);
	} catch (error) {
		// Best-effort cleanup so a failed write does not leave the temp file
		// behind; a cleanup failure must not mask the original error.
		await rm(tmpPath, { force: true }).catch(() => {});
		throw error;
	}
}
|
|
1673
|
+
/**
 * Trim a set of cache entries down to the newest `maxEntries`.
 *
 * The entry stored under `protectedKey` (normally the one just written) is
 * always kept, even if it is not among the newest. The remaining slots are
 * filled newest-first by `storedAt`; entries with equal `storedAt` are
 * ordered by `key` so the outcome is deterministic.
 *
 * @param entries Record of cache entries; each entry has `key` and `storedAt`.
 * @param maxEntries Maximum number of entries to keep.
 * @param protectedKey Key in `entries` of the entry that must survive.
 * @returns New record of the kept entries, keyed by `entry.key` and built in
 * ascending key order.
 */
function pruneEntries(entries, maxEntries, protectedKey) {
	// Comparators must return 0 for equal elements; the previous `? 1 : -1`
	// form left tie order up to the engine.
	const byKey = (a, b) => a.key < b.key ? -1 : a.key > b.key ? 1 : 0;
	const newestFirst = Object.values(entries).sort((a, b) => {
		if (a.storedAt !== b.storedAt) return a.storedAt < b.storedAt ? 1 : -1;
		return byKey(a, b);
	});
	const kept = /* @__PURE__ */ new Map();
	const protectedEntry = entries[protectedKey];
	if (protectedEntry !== void 0) kept.set(protectedEntry.key, protectedEntry);
	for (const entry of newestFirst) {
		if (kept.size >= maxEntries) break;
		kept.set(entry.key, entry);
	}
	return Object.fromEntries([...kept.values()].sort(byKey).map((entry) => [entry.key, entry]));
}
|
|
1684
|
+
/**
 * Run a callback while holding the lock directory `<filePath>.lock`.
 *
 * The lock is released whether the callback resolves or rejects.
 *
 * @param filePath Path of the cache file being protected.
 * @param fn Async callback to run under the lock.
 * @returns Whatever `fn` resolves to.
 * @throws The error from `acquireLock` or from `fn`.
 */
async function withCacheFileLock(filePath, fn) {
	const lockPath = `${filePath}.lock`;
	await acquireLock(lockPath);
	try {
		return await fn();
	} finally {
		await rm(lockPath, {
			recursive: true,
			force: true
		});
	}
}
|
|
1694
|
+
/**
 * Acquire a lock by creating the directory `lockPath`.
 *
 * The non-recursive `mkdir` fails with `EEXIST` while the directory exists,
 * so the attempt is repeated until the directory can be created or the
 * timeout expires.
 *
 * @param lockPath Path of the lock directory.
 * @param options Optional timing overrides.
 * @param options.timeoutMs How long to keep retrying (default 5000 ms).
 * @param options.retryDelayMs Pause between attempts (default 20 ms).
 * @throws Error "Timed out acquiring cache lock at <path>" (with the last
 * `EEXIST` error as `cause`) when the lock stays held.
 * @throws Any other `mkdir` error immediately, for example `ENOENT` or
 * `EACCES`; retrying cannot fix those.
 */
async function acquireLock(lockPath, { timeoutMs = 5e3, retryDelayMs = 20 } = {}) {
	const deadline = Date.now() + timeoutMs;
	for (;;) {
		try {
			await mkdir(lockPath, { recursive: false });
			return;
		} catch (error) {
			// Only "already exists" means the lock is held by someone else.
			if (error?.code !== "EEXIST") throw error;
			if (Date.now() >= deadline) throw new Error(`Timed out acquiring cache lock at ${lockPath}`, { cause: error });
		}
		await new Promise((resolveDelay) => setTimeout(resolveDelay, retryDelayMs));
	}
}
|
|
1706
|
+
/**
 * Pause for a number of milliseconds.
 *
 * @param ms Delay passed to `setTimeout`.
 * @returns Promise that resolves with `undefined` once the timer fires.
 */
function sleep(ms) {
	return new Promise((done) => {
		setTimeout(done, ms);
	});
}
|
|
1368
1709
|
function safeJsonParse(text) {
|
|
1369
1710
|
const parsed = resultify(() => JSON.parse(text));
|
|
1370
1711
|
if (parsed.error) return null;
|
|
@@ -2730,7 +3071,8 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
2730
3071
|
await mkdir(join(localStateDir, "runs"), { recursive: true });
|
|
2731
3072
|
cacheStore = createFsCacheStore({
|
|
2732
3073
|
workspaceRoot,
|
|
2733
|
-
dir: config.cache?.dir
|
|
3074
|
+
dir: config.cache?.dir,
|
|
3075
|
+
maxEntriesPerEval: config.cache?.maxEntriesPerEval
|
|
2734
3076
|
});
|
|
2735
3077
|
await loadPersistedRuns();
|
|
2736
3078
|
await runner.refreshDiscovery();
|
|
@@ -3128,6 +3470,9 @@ function parseArgs(argv) {
|
|
|
3128
3470
|
const args = {
|
|
3129
3471
|
command: "help",
|
|
3130
3472
|
subcommand: void 0,
|
|
3473
|
+
showHelp: false,
|
|
3474
|
+
helpTopic: "global",
|
|
3475
|
+
unknownHelpTarget: void 0,
|
|
3131
3476
|
evalIds: [],
|
|
3132
3477
|
caseIds: [],
|
|
3133
3478
|
trials: 1,
|
|
@@ -3138,19 +3483,28 @@ function parseArgs(argv) {
|
|
|
3138
3483
|
all: false
|
|
3139
3484
|
};
|
|
3140
3485
|
const command = argv[0];
|
|
3141
|
-
if (command === "
|
|
3486
|
+
if (command === "--help" || command === "-h") {
|
|
3487
|
+
args.showHelp = true;
|
|
3488
|
+
return args;
|
|
3489
|
+
}
|
|
3490
|
+
if (isCliCommand(command)) {
|
|
3491
|
+
args.command = command;
|
|
3492
|
+
args.helpTopic = command === "help" ? "global" : command;
|
|
3493
|
+
} else if (command !== void 0 && !command.startsWith("-")) args.unknownHelpTarget = command;
|
|
3142
3494
|
let cursor = 1;
|
|
3143
3495
|
if (args.command === "cache") {
|
|
3144
3496
|
const sub = argv[cursor];
|
|
3145
3497
|
if (sub === "list" || sub === "clear") {
|
|
3146
3498
|
args.subcommand = sub;
|
|
3499
|
+
args.helpTopic = `cache ${sub}`;
|
|
3147
3500
|
cursor++;
|
|
3148
|
-
}
|
|
3501
|
+
} else if (sub !== void 0 && !sub.startsWith("-")) args.unknownHelpTarget = `cache ${sub}`;
|
|
3149
3502
|
}
|
|
3150
3503
|
for (let i = cursor; i < argv.length; i++) {
|
|
3151
3504
|
const arg = argv[i];
|
|
3152
3505
|
const next = argv[i + 1];
|
|
3153
|
-
if (arg === "--
|
|
3506
|
+
if (arg === "--help" || arg === "-h") args.showHelp = true;
|
|
3507
|
+
else if (arg === "--eval" && next) {
|
|
3154
3508
|
args.evalIds.push(...next.split(","));
|
|
3155
3509
|
i++;
|
|
3156
3510
|
} else if (arg === "--case" && next) {
|
|
@@ -3180,6 +3534,15 @@ function parseArgs(argv) {
|
|
|
3180
3534
|
*/
|
|
3181
3535
|
async function runCli(argv) {
|
|
3182
3536
|
const args = parseArgs(argv);
|
|
3537
|
+
if (args.showHelp) {
|
|
3538
|
+
if (args.unknownHelpTarget !== void 0) {
|
|
3539
|
+
console.error(`No help found for "${args.unknownHelpTarget}".`);
|
|
3540
|
+
process.exit(1);
|
|
3541
|
+
return;
|
|
3542
|
+
}
|
|
3543
|
+
printHelp(args.helpTopic);
|
|
3544
|
+
return;
|
|
3545
|
+
}
|
|
3183
3546
|
switch (args.command) {
|
|
3184
3547
|
case "app":
|
|
3185
3548
|
await commandApp(args);
|
|
@@ -3194,10 +3557,13 @@ async function runCli(argv) {
|
|
|
3194
3557
|
await commandCache(args);
|
|
3195
3558
|
break;
|
|
3196
3559
|
default:
|
|
3197
|
-
printHelp();
|
|
3560
|
+
printHelp(args.helpTopic);
|
|
3198
3561
|
break;
|
|
3199
3562
|
}
|
|
3200
3563
|
}
|
|
3564
|
+
/**
 * Check whether a CLI argument names one of the known top-level commands.
 *
 * @param command First CLI argument, possibly `undefined`.
 * @returns `true` for "app", "list", "run", "cache" or "help".
 */
function isCliCommand(command) {
	const knownCommands = [
		"app",
		"list",
		"run",
		"cache",
		"help"
	];
	return knownCommands.includes(command);
}
|
|
3201
3567
|
const currentDir = dirname(fileURLToPath(import.meta.url));
|
|
3202
3568
|
const repoRoot = resolve(currentDir, "../../..");
|
|
3203
3569
|
const pnpmCommand = process.platform === "win32" ? "pnpm.cmd" : "pnpm";
|
|
@@ -3246,8 +3612,8 @@ async function commandApp(args) {
|
|
|
3246
3612
|
const { serve } = await import("@hono/node-server");
|
|
3247
3613
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
3248
3614
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
3249
|
-
const appModule = await import("./app-
|
|
3250
|
-
const runnerModule = await import("./runner-
|
|
3615
|
+
const appModule = await import("./app-CljutWb7.mjs");
|
|
3616
|
+
const runnerModule = await import("./runner-CsSJwWE4.mjs");
|
|
3251
3617
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
3252
3618
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
3253
3619
|
await runnerModule.initRunner();
|
|
@@ -3378,7 +3744,7 @@ async function commandCache(args) {
|
|
|
3378
3744
|
process.exit(1);
|
|
3379
3745
|
return;
|
|
3380
3746
|
}
|
|
3381
|
-
printHelp();
|
|
3747
|
+
printHelp(args.helpTopic);
|
|
3382
3748
|
}
|
|
3383
3749
|
async function waitForRunCompletion(runner, runId) {
|
|
3384
3750
|
return new Promise((resolvePromise) => {
|
|
@@ -3393,7 +3759,69 @@ async function waitForRunCompletion(runner, runId) {
|
|
|
3393
3759
|
check();
|
|
3394
3760
|
});
|
|
3395
3761
|
}
|
|
3396
|
-
function printHelp() {
|
|
3762
|
+
function printHelp(topic = "global") {
|
|
3763
|
+
if (topic === "app") {
|
|
3764
|
+
console.info(`
|
|
3765
|
+
agent-evals app - Start server with UI
|
|
3766
|
+
|
|
3767
|
+
Usage:
|
|
3768
|
+
agent-evals app [flags]
|
|
3769
|
+
|
|
3770
|
+
Flags:
|
|
3771
|
+
--port <n> Server port (default: 4100)
|
|
3772
|
+
--help, -h Show this help
|
|
3773
|
+
`);
|
|
3774
|
+
return;
|
|
3775
|
+
}
|
|
3776
|
+
if (topic === "list") {
|
|
3777
|
+
console.info(`
|
|
3778
|
+
agent-evals list - List discovered evals
|
|
3779
|
+
|
|
3780
|
+
Usage:
|
|
3781
|
+
agent-evals list [flags]
|
|
3782
|
+
|
|
3783
|
+
Flags:
|
|
3784
|
+
--help, -h Show this help
|
|
3785
|
+
`);
|
|
3786
|
+
return;
|
|
3787
|
+
}
|
|
3788
|
+
if (topic === "run") {
|
|
3789
|
+
console.info(`
|
|
3790
|
+
agent-evals run - Run evals
|
|
3791
|
+
|
|
3792
|
+
Usage:
|
|
3793
|
+
agent-evals run [flags]
|
|
3794
|
+
|
|
3795
|
+
Flags:
|
|
3796
|
+
--eval <id> Run specific eval(s) (comma-separated)
|
|
3797
|
+
--case <id> Run specific case(s) (comma-separated)
|
|
3798
|
+
--trials <n> Number of trials per case
|
|
3799
|
+
--json Output run summary as JSON
|
|
3800
|
+
--cache <use|bypass|refresh> Cache mode for this run (default: use)
|
|
3801
|
+
--no-cache Shortcut for --cache bypass
|
|
3802
|
+
--refresh-cache Shortcut for --cache refresh
|
|
3803
|
+
--clear-cache Clear the cache before starting the run
|
|
3804
|
+
--help, -h Show this help
|
|
3805
|
+
`);
|
|
3806
|
+
return;
|
|
3807
|
+
}
|
|
3808
|
+
if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
|
|
3809
|
+
console.info(`
|
|
3810
|
+
agent-evals cache - Manage cached operation entries
|
|
3811
|
+
|
|
3812
|
+
Usage:
|
|
3813
|
+
agent-evals cache list [flags]
|
|
3814
|
+
agent-evals cache clear --eval <id>
|
|
3815
|
+
agent-evals cache clear --all
|
|
3816
|
+
|
|
3817
|
+
Flags:
|
|
3818
|
+
--eval <id> Clear entries for specific eval(s) (comma-separated)
|
|
3819
|
+
--all Confirm clearing every cached entry
|
|
3820
|
+
--json Output cache listing as JSON
|
|
3821
|
+
--help, -h Show this help
|
|
3822
|
+
`);
|
|
3823
|
+
return;
|
|
3824
|
+
}
|
|
3397
3825
|
console.info(`
|
|
3398
3826
|
agent-evals - LLM/Agent eval runner
|
|
3399
3827
|
|
|
@@ -3416,7 +3844,8 @@ Options:
|
|
|
3416
3844
|
--no-cache Shortcut for --cache bypass
|
|
3417
3845
|
--refresh-cache Shortcut for --cache refresh
|
|
3418
3846
|
--clear-cache Clear the cache before starting the run
|
|
3847
|
+
--help, -h Show help
|
|
3419
3848
|
`);
|
|
3420
3849
|
}
|
|
3421
3850
|
//#endregion
|
|
3422
|
-
export {
|
|
3851
|
+
export { fileRefSchema as $, evalSummarySchema as A, evalChartsConfigSchema as B, assertionFailureSchema as C, evalStatAggregateSchema as D, evalFreshnessStatusSchema as E, evalChartColorSchema as F, traceDisplayConfigSchema as G, traceAttributeDisplayInputSchema as H, evalChartConfigSchema as I, traceSpanSchema as J, traceDisplayInputConfigSchema as K, evalChartMetricSchema as L, evalChartAggregateSchema as M, evalChartAxisSchema as N, evalStatItemSchema as O, evalChartBuiltinMetricSchema as P, columnKindSchema as Q, evalChartTooltipExtraSchema as R, spanCacheOptionsSchema as S, caseRowSchema as T, traceAttributeDisplayPlacementSchema as U, traceAttributeDisplayFormatSchema as V, traceAttributeDisplaySchema as W, columnDefSchema as X, cellValueSchema as Y, columnFormatSchema as Z, cacheListItemSchema as _, repoFile as _t, sseEnvelopeSchema as a, evalSpan as at, cacheRecordingSchema as b, deriveScopedSummaryFromCases as c, hashCacheKeySync as ct, runManifestSchema as d, getCurrentScope as dt, jsonCellSchema as et, runSummarySchema as f, incrementEvalOutput as ft, cacheFileSchema as g, setScopeCacheContext as gt, cacheEntrySchema as h, setEvalOutput as ht, updateManualScoreRequestSchema as i, buildTraceTree as it, scoreTraceSchema as j, evalStatsConfigSchema as k, deriveStatusFromCaseRows as l, EvalAssertionError as lt, trialSelectionModeSchema as m, runInEvalScope as mt, createRunner as n, repoFileRefSchema as nt, getEvalTitle as o, evalTracer as ot, agentEvalsConfigSchema as p, isInEvalScope as pt, traceSpanKindSchema as q, createRunRequestSchema as r, runArtifactRefSchema as rt, getEvalDisplayStatus as s, hashCacheKey as st, runCli as t, numberDisplayOptionsSchema as tt, deriveStatusFromChildStatuses as u, evalAssert as ut, cacheModeSchema as v, defineEval as vt, caseDetailSchema as w, serializedCacheSpanSchema as x, cacheRecordingOpSchema as y, getEvalRegistry as yt, evalChartTypeSchema as z };
|