@ls-stack/agent-eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ import { createHash } from "node:crypto";
2
2
  import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
3
3
  import { dirname, extname, join, relative, resolve } from "node:path";
4
4
  import { AsyncLocalStorage } from "node:async_hooks";
5
+ import { Buffer as Buffer$1 } from "node:buffer";
5
6
  import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
6
7
  import { z } from "zod/v4";
7
8
  import { watch } from "chokidar";
@@ -206,6 +207,15 @@ function noopActiveSpan() {
206
207
  setAttributes() {}
207
208
  };
208
209
  }
210
/**
 * Build an inert external-span handle.
 *
 * The handle carries the given id and exposes the same methods as a live
 * external span handle, but every method does nothing. `startExternalSpan`
 * returns it when there is no current eval scope.
 *
 * @param id Span id to expose on the handle.
 * @returns A handle whose mutators are all no-ops.
 */
function noopExternalSpan(id) {
  return {
    id,
    setName: () => {},
    setAttribute: () => {},
    setAttributes: () => {},
    end: () => {}
  };
}
209
219
  function mergeSpanAttributes(span, attributes) {
210
220
  span.attributes = {
211
221
  ...span.attributes,
@@ -225,6 +235,127 @@ function createSpanHandle(span) {
225
235
  }
226
236
  };
227
237
  }
238
/**
 * Create a handle bound to one externally managed span id.
 *
 * Every method forwards to `updateExternalSpan` or `endExternalSpan` with the
 * bound id filled in, so callers never have to repeat it.
 *
 * @param id Span id the handle operates on.
 * @returns Handle with `id`, `setName`, `setAttribute`, `setAttributes`, `end`.
 */
function createExternalSpanHandle(id) {
  // Shared forwarding helper: merges the bound id with the given update fields.
  const patch = (fields) => {
    updateExternalSpan({ id, ...fields });
  };
  return {
    id,
    setName(value) {
      patch({ name: value });
    },
    setAttribute(key, value) {
      patch({ attributes: { [key]: value } });
    },
    setAttributes(value) {
      patch({ attributes: value });
    },
    end(info = {}) {
      // The bound id is spread last so it always wins over `info.id`.
      endExternalSpan({ ...info, id });
    }
  };
}
267
/**
 * Normalize a timestamp input to a string.
 *
 * - `undefined` yields the current time as an ISO string.
 * - Strings are returned unchanged (they are not validated).
 * - Anything else is expected to expose `toISOString()` (for example a `Date`).
 */
function toIsoTimestamp(value) {
  if (typeof value === "string") return value;
  const date = value === undefined ? new Date() : value;
  return date.toISOString();
}
272
/**
 * Look up a span by id in the scope's span list.
 *
 * @returns The first span whose `id` strictly equals `id`, or `undefined`.
 */
function findSpan(scope, id) {
  for (const candidate of scope.spans) {
    if (candidate.id === id) return candidate;
  }
  return undefined;
}
275
/**
 * Decide which parent id an external span should be attached to.
 *
 * An explicitly provided `parentId` (including `null`) is used as-is. Only
 * when it is `undefined` does the id of the last span on the scope's active
 * span stack apply, falling back to `null` when there is none.
 */
function resolveExternalParentId(scope, parentId) {
  if (parentId === undefined) {
    const innermost = scope.activeSpanStack.at(-1);
    return innermost?.id ?? null;
  }
  return parentId;
}
279
/**
 * Start (or restart) an externally managed span in the current eval scope.
 *
 * Outside a scope nothing is recorded and a no-op handle is returned. When a
 * span with the same id already exists it is reset to a running state and its
 * attributes are replaced only if new ones were provided; otherwise a new
 * span record is appended to the scope.
 *
 * @param info Span description: optional `id`, `parentId`, `startedAt`,
 *   `attributes`, plus `kind` and `name`.
 * @returns A handle bound to the span id.
 */
function startExternalSpan(info) {
  const id = info.id ?? generateSpanId();
  const scope = getCurrentScope();
  if (!scope) return noopExternalSpan(id);

  const existing = findSpan(scope, id);
  const parentId = resolveExternalParentId(scope, info.parentId);
  const startedAt = toIsoTimestamp(info.startedAt);

  if (existing) {
    Object.assign(existing, {
      parentId,
      kind: info.kind,
      name: info.name,
      startedAt,
      status: "running",
      endedAt: null
    });
    // Attributes from an earlier start are kept unless new ones are supplied.
    if (info.attributes !== undefined) existing.attributes = info.attributes;
  } else {
    scope.spans.push({
      id,
      parentId,
      caseId: scope.caseId,
      kind: info.kind,
      name: info.name,
      startedAt,
      endedAt: null,
      status: "running",
      attributes: info.attributes
    });
  }
  return createExternalSpanHandle(id);
}
307
/**
 * Apply a partial update to an externally managed span.
 *
 * Does nothing when there is no current scope or no span with `info.id`.
 * `name`, `status` and `error` overwrite the stored values when provided;
 * `attributes` are merged via `mergeSpanAttributes`.
 */
function updateExternalSpan(info) {
  const scope = getCurrentScope();
  const span = scope ? findSpan(scope, info.id) : undefined;
  if (!span) return;

  const { name, status, error, attributes } = info;
  if (name !== undefined) span.name = name;
  if (status !== undefined) span.status = status;
  if (error !== undefined) span.error = error;
  if (attributes !== undefined) mergeSpanAttributes(span, attributes);
}
317
/**
 * Finish an externally managed span.
 *
 * Does nothing when there is no current scope or the span is unknown.
 * Otherwise the update fields in `info` are applied first, then the final
 * status and end timestamp are written. The status defaults to `"error"` when
 * `info.error` is truthy and `"ok"` otherwise.
 */
function endExternalSpan(info) {
  const scope = getCurrentScope();
  const span = scope ? findSpan(scope, info.id) : undefined;
  if (!span) return;

  updateExternalSpan(info);

  const fallbackStatus = info.error ? "error" : "ok";
  span.status = info.status ?? fallbackStatus;
  span.endedAt = toIsoTimestamp(info.endedAt);
}
326
/**
 * Record a complete external span in one call.
 *
 * Outside a scope nothing is stored, but the (provided or generated) id is
 * still returned. `endedAt: null` keeps the span open, a missing `endedAt`
 * reuses the start timestamp, and the status defaults to `"error"` when
 * `info.error` is truthy and `"ok"` otherwise. When a span with the same id
 * already exists its fields are overwritten, including `attributes` and
 * `error`.
 *
 * @returns The span id.
 */
function recordExternalSpan(info) {
  const id = info.id ?? generateSpanId();
  const scope = getCurrentScope();
  if (!scope) return id;

  const startedAt = toIsoTimestamp(info.startedAt);
  let endedAt;
  if (info.endedAt === null) {
    endedAt = null;
  } else if (info.endedAt) {
    endedAt = toIsoTimestamp(info.endedAt);
  } else {
    endedAt = startedAt;
  }

  const existing = findSpan(scope, id);
  const status = info.status ?? (info.error ? "error" : "ok");
  const parentId = resolveExternalParentId(scope, info.parentId);

  if (existing) {
    Object.assign(existing, {
      parentId,
      kind: info.kind,
      name: info.name,
      startedAt,
      endedAt,
      status,
      attributes: info.attributes,
      error: info.error
    });
    return id;
  }

  scope.spans.push({
    id,
    parentId,
    caseId: scope.caseId,
    kind: info.kind,
    name: info.name,
    startedAt,
    endedAt,
    status,
    attributes: info.attributes,
    error: info.error
  });
  return id;
}
228
359
  /**
229
360
  * Ambient handle for the active span in the current async context.
230
361
  *
@@ -272,7 +403,7 @@ async function traceSpan(info, fn) {
272
403
  if (cacheOpts !== void 0 && cacheCtx !== void 0 && scope.replayingDepth === 0) {
273
404
  const ctx = cacheCtx;
274
405
  const namespace = cacheOpts.namespace ?? `${ctx.evalId}__${info.name}`;
275
- const keyHash = hashCacheKey({
406
+ const keyHash = await hashCacheKey({
276
407
  namespace,
277
408
  codeFingerprint: ctx.codeFingerprint,
278
409
  key: cacheOpts.key
@@ -359,6 +490,34 @@ async function traceSpan(info, fn) {
359
490
  const evalTracer = {
360
491
  /** Run a callback inside a new trace span and record its lifecycle. */
361
492
  span: traceSpan,
493
+ /**
494
+ * Start a span whose lifecycle is controlled by an external tracer/exporter.
495
+ *
496
+ * Calls are no-ops outside an eval case scope, except that a generated or
497
+ * caller-provided id is still returned for ergonomic adapter code.
498
+ */
499
+ startSpan: startExternalSpan,
500
+ /**
501
+ * Merge updates into an externally managed span that was started earlier.
502
+ *
503
+ * This is intended for observability exporters that receive span update
504
+ * events before the final end event.
505
+ */
506
+ updateSpan: updateExternalSpan,
507
+ /**
508
+ * Finish an externally managed span and attach final attributes or errors.
509
+ *
510
+ * Missing spans are ignored so exporter adapters can safely forward events
511
+ * even when they are emitted outside an eval case scope.
512
+ */
513
+ endSpan: endExternalSpan,
514
+ /**
515
+ * Record a complete external span in one call.
516
+ *
517
+ * Use this when an upstream tracer only exposes completed spans rather than
518
+ * start/update/end events.
519
+ */
520
+ recordSpan: recordExternalSpan,
362
521
  /** Record a named point-in-time value alongside the trace. */
363
522
  checkpoint(name, data) {
364
523
  const scope = getCurrentScope();
@@ -412,9 +571,103 @@ function buildTraceTree(spans, checkpoints) {
412
571
  checkpoints
413
572
  };
414
573
  }
415
- /** Hash the components of a cache key into a deterministic hex digest. */
416
- function hashCacheKey(input) {
417
- return createHash("sha256").update(getCompositeKey(input)).digest("hex");
574
/**
 * Marker wrapper around an already-serialized cache key fragment.
 *
 * `stringifyCacheKeyValue` returns the wrapped `value` verbatim for instances
 * of this class.
 */
var SerializedCacheKeyValue = class {
  constructor(value) {
    this.value = value;
  }
};
580
/**
 * Hash the components of a cache key into a deterministic hex digest.
 *
 * The input is first materialized asynchronously so that native `Blob` and
 * `File` values are read and hashed by content. Use `hashCacheKeySync` only
 * when the key contains no async values.
 */
async function hashCacheKey(input) {
  const materialized = await materializeAsyncCacheKeyValue(input);
  return hashCacheKeySyncMaterialized(materialized);
}
589
/**
 * Synchronously hash cache key components.
 *
 * Supports JSON-like data and in-memory binary values such as `Buffer`,
 * `ArrayBuffer`, and typed arrays. Native `Blob` and `File` values cannot be
 * content-hashed on this path.
 */
function hashCacheKeySync(input) {
  const digest = hashCacheKeySyncMaterialized(input);
  return digest;
}
597
/**
 * Hash an already-materialized cache key.
 *
 * The input is flattened with `getCompositeKey`, using
 * `stringifyCacheKeyValue` as the custom stringifier, and the resulting
 * string is SHA-256 hashed to a hex digest.
 */
function hashCacheKeySyncMaterialized(input) {
  const compositeKey = getCompositeKey(input, { stringify: stringifyCacheKeyValue });
  const hasher = createHash("sha256");
  hasher.update(compositeKey);
  return hasher.digest("hex");
}
600
/**
 * Custom stringifier for cache key values that need special treatment.
 *
 * Returns a tagged string for pre-serialized wrappers, Node buffers,
 * `ArrayBuffer`/`SharedArrayBuffer`, typed-array views, and `File`/`Blob`
 * values, or `undefined` for anything else. Binary values are identified by a
 * hash of their bytes; `File` and `Blob` are identified by metadata only on
 * this synchronous path.
 */
function stringifyCacheKeyValue(value) {
  if (value instanceof SerializedCacheKeyValue) {
    return value.value;
  }
  // Buffers are checked before generic views so they keep the `$buffer` tag.
  if (Buffer$1.isBuffer(value)) {
    return `$buffer:${hashBytes(value)}`;
  }
  if (isArrayBuffer(value)) {
    return `$arrayBuffer:${hashBytes(new Uint8Array(value))}`;
  }
  if (isSharedArrayBuffer(value)) {
    return `$sharedArrayBuffer:${hashBytes(new Uint8Array(value))}`;
  }
  if (isArrayBufferView(value)) {
    // Hash only the window of the underlying buffer that the view covers.
    const { buffer, byteOffset, byteLength } = value;
    const bytes = new Uint8Array(buffer, byteOffset, byteLength);
    return `$${value.constructor.name}:${hashBytes(bytes)}`;
  }
  // File is checked before Blob so files keep the `$file` tag.
  if (isFile$1(value)) {
    const { lastModified, name, size, type } = value;
    return `$file:${getCompositeKey({ lastModified, name, size, type })}`;
  }
  if (isBlob$1(value)) {
    const { size, type } = value;
    return `$blob:${getCompositeKey({ size, type })}`;
  }
  return undefined;
}
620
/**
 * Recursively replace async-only cache key values with pre-serialized ones.
 *
 * `Blob`/`File` values (anything `stringifyAsyncCacheKeyValue` handles) are
 * wrapped in `SerializedCacheKeyValue`. Values the synchronous stringifier
 * already understands, and primitives, are returned unchanged. Arrays and
 * objects are rebuilt with their children materialized.
 *
 * Both arrays and objects take part in cycle detection, so a self-containing
 * array is rejected instead of recursing without end. A value that merely
 * appears more than once (shared, not circular) is still accepted because
 * each container is removed from the tracking set once it has been processed.
 *
 * @param value Cache key component to materialize.
 * @param refs Containers currently being visited on this recursion path.
 * @returns The materialized value.
 * @throws {Error} "Circular reference detected" when a container contains itself.
 */
async function materializeAsyncCacheKeyValue(value, refs = /* @__PURE__ */ new WeakSet()) {
  const serialized = await stringifyAsyncCacheKeyValue(value);
  if (serialized !== undefined) return new SerializedCacheKeyValue(serialized);
  if (stringifyCacheKeyValue(value) !== undefined) return value;
  if (!value || typeof value !== "object") return value;

  if (refs.has(value)) throw new Error("Circular reference detected");
  refs.add(value);
  try {
    if (Array.isArray(value)) {
      const items = [];
      for (const item of value) items.push(await materializeAsyncCacheKeyValue(item, refs));
      return items;
    }
    const entries = [];
    for (const [key, entryValue] of Object.entries(value)) {
      entries.push([key, await materializeAsyncCacheKeyValue(entryValue, refs)]);
    }
    return Object.fromEntries(entries);
  } finally {
    // Leaving the container: siblings may legitimately reference it again.
    refs.delete(value);
  }
}
637
/**
 * Serialize `File` and `Blob` values by content.
 *
 * The bytes are read and hashed via `hashBlobBytes`, then combined with the
 * value's metadata into a tagged string. Any other value resolves to
 * `undefined`.
 */
async function stringifyAsyncCacheKeyValue(value) {
  // File is checked before Blob so files keep the `$file` tag and extra metadata.
  if (isFile$1(value)) {
    const bytes = await hashBlobBytes(value);
    const { lastModified, name, size, type } = value;
    return `$file:${getCompositeKey({ bytes, lastModified, name, size, type })}`;
  }
  if (isBlob$1(value)) {
    const bytes = await hashBlobBytes(value);
    const { size, type } = value;
    return `$blob:${getCompositeKey({ bytes, size, type })}`;
  }
  return undefined;
}
651
/** Read a Blob-like value fully into memory and hash its bytes. */
async function hashBlobBytes(value) {
  const buffer = await value.arrayBuffer();
  const bytes = new Uint8Array(buffer);
  return hashBytes(bytes);
}
654
/** Compute the SHA-256 hex digest of the given bytes. */
function hashBytes(value) {
  const hasher = createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
}
657
/** Type guard: is `value` an `ArrayBuffer` instance? */
function isArrayBuffer(value) {
  const matches = value instanceof ArrayBuffer;
  return matches;
}
660
/** Type guard: is `value` a `SharedArrayBuffer` instance? */
function isSharedArrayBuffer(value) {
  const matches = value instanceof SharedArrayBuffer;
  return matches;
}
663
/** Type guard: is `value` a typed array or `DataView` over an array buffer? */
function isArrayBufferView(value) {
  const isView = ArrayBuffer.isView(value);
  return isView;
}
666
/** Type guard: is `value` a native `Blob` (which includes `File`) instance? */
function isBlob$1(value) {
  const matches = value instanceof Blob;
  return matches;
}
669
+ function isFile$1(value) {
670
+ return value instanceof File;
418
671
  }
419
672
  function toJsonSafe(value) {
420
673
  if (value === void 0) return void 0;
@@ -989,6 +1242,12 @@ const cacheEntrySchema = z.object({
989
1242
  codeFingerprint: z.string(),
990
1243
  recording: cacheRecordingSchema
991
1244
  });
1245
/**
 * Persisted per-owner cache file containing multiple cache entries.
 *
 * - `version`: format marker, currently always `1`.
 * - `owner`: the owner the file belongs to.
 * - `entries`: cache entries keyed by string, each validated by `cacheEntrySchema`.
 */
const cacheFileSchema = z.object({
  version: z.literal(1),
  owner: z.string(),
  entries: z.record(z.string(), cacheEntrySchema)
});
992
1251
  //#endregion
993
1252
  //#region ../shared/src/schemas/config.ts
994
1253
  /** Strategy used to collapse repeated trials into one stored case result. */
@@ -1004,7 +1263,8 @@ const agentEvalsConfigSchema = z.object({
1004
1263
  traceDisplay: traceDisplayInputConfigSchema.optional(),
1005
1264
  cache: z.object({
1006
1265
  enabled: z.boolean().optional(),
1007
- dir: z.string().optional()
1266
+ dir: z.string().optional(),
1267
+ maxEntriesPerEval: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional())
1008
1268
  }).optional()
1009
1269
  });
1010
1270
  //#endregion
@@ -1243,60 +1503,59 @@ const createRunRequestSchema = z.object({
1243
1503
  const updateManualScoreRequestSchema = z.object({ value: z.number().min(0).max(1).nullable() });
1244
1504
  //#endregion
1245
1505
  //#region ../runner/src/cacheStore.ts
1506
+ const defaultMaxEntriesPerEval = 100;
1246
1507
  /**
1247
1508
  * Create a filesystem-backed cache adapter rooted at `<workspaceRoot>/<dir>`.
1248
1509
  *
1249
- * Writes use `<name>.tmp` + atomic `rename` to avoid partial reads under
1250
- * concurrent access.
1510
+ * Cache entries are grouped into one inspectable JSON file per eval/cache
1511
+ * owner. Writes use a short-lived lock directory plus `<name>.tmp` + atomic
1512
+ * `rename` to avoid partial reads and lost updates under concurrent access.
1251
1513
  */
1252
1514
  function createFsCacheStore(options) {
1253
1515
  const cacheDir = resolve(options.workspaceRoot, options.dir ?? ".agent-evals/cache");
1516
+ const maxEntriesPerEval = normalizeMaxEntries(options.maxEntriesPerEval);
1254
1517
  return {
1255
1518
  dir() {
1256
1519
  return cacheDir;
1257
1520
  },
1258
1521
  async lookup(namespace, keyHash) {
1259
- const filePath = entryPath(cacheDir, namespace, keyHash);
1260
- if (!existsSync(filePath)) return null;
1261
- const json = safeJsonParse(await readFile(filePath, "utf-8"));
1262
- if (json === null) return null;
1263
- const parsed = cacheEntrySchema.safeParse(json);
1264
- if (!parsed.success) return null;
1265
- return parsed.data;
1522
+ return (await readCacheFile(cacheDir, ownerFromNamespace(namespace)))?.entries[keyHash] ?? null;
1266
1523
  },
1267
1524
  async write(entry) {
1268
- const filePath = entryPath(cacheDir, entry.namespace, entry.key);
1269
- await mkdir(dirname(filePath), { recursive: true });
1270
- const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
1271
- await writeFile(tmpPath, JSON.stringify(entry));
1272
- await rename(tmpPath, filePath);
1525
+ const owner = ownerFromNamespace(entry.namespace);
1526
+ const filePath = ownerPath(cacheDir, owner);
1527
+ await mkdir(cacheDir, { recursive: true });
1528
+ await withCacheFileLock(filePath, async () => {
1529
+ await writeCacheFile(cacheDir, {
1530
+ version: 1,
1531
+ owner,
1532
+ entries: pruneEntries({
1533
+ ...(await readCacheFile(cacheDir, owner))?.entries ?? {},
1534
+ [entry.key]: entry
1535
+ }, maxEntriesPerEval, entry.key)
1536
+ });
1537
+ });
1273
1538
  },
1274
1539
  async list() {
1275
1540
  if (!existsSync(cacheDir)) return [];
1276
- const namespaces = await readdir(cacheDir);
1541
+ const files = await readdir(cacheDir);
1277
1542
  const items = [];
1278
- for (const namespace of namespaces) {
1279
- const nsPath = join(cacheDir, namespace);
1280
- if (!(await stat(nsPath)).isDirectory()) continue;
1281
- const files = await readdir(nsPath);
1282
- for (const fileName of files) {
1283
- if (!fileName.endsWith(".json")) continue;
1284
- const filePath = join(nsPath, fileName);
1285
- const json = safeJsonParse(await readFile(filePath, "utf-8"));
1286
- if (json === null) continue;
1287
- const parsed = cacheEntrySchema.safeParse(json);
1288
- if (!parsed.success) continue;
1289
- const fileStat = await stat(filePath);
1290
- items.push({
1291
- key: parsed.data.key,
1292
- namespace: parsed.data.namespace,
1293
- spanName: parsed.data.spanName,
1294
- spanKind: parsed.data.spanKind,
1295
- storedAt: parsed.data.storedAt,
1296
- codeFingerprint: parsed.data.codeFingerprint,
1297
- sizeBytes: fileStat.size
1298
- });
1299
- }
1543
+ for (const fileName of files) {
1544
+ if (!fileName.endsWith(".json")) continue;
1545
+ const filePath = join(cacheDir, fileName);
1546
+ const fileStatResult = await resultify(() => stat(filePath));
1547
+ if (fileStatResult.error || !fileStatResult.value.isFile()) continue;
1548
+ const cacheFile = await readCacheFilePath(filePath);
1549
+ if (cacheFile === null) continue;
1550
+ for (const entry of Object.values(cacheFile.entries)) items.push({
1551
+ key: entry.key,
1552
+ namespace: entry.namespace,
1553
+ spanName: entry.spanName,
1554
+ spanKind: entry.spanKind,
1555
+ storedAt: entry.storedAt,
1556
+ codeFingerprint: entry.codeFingerprint,
1557
+ sizeBytes: Buffer.byteLength(JSON.stringify(entry), "utf8")
1558
+ });
1300
1559
  }
1301
1560
  items.sort((a, b) => a.storedAt < b.storedAt ? 1 : -1);
1302
1561
  return items;
@@ -1310,21 +1569,36 @@ function createFsCacheStore(options) {
1310
1569
  });
1311
1570
  return;
1312
1571
  }
1313
- if (filter.namespace !== void 0 && filter.key === void 0) {
1314
- await rm(join(cacheDir, filter.namespace), {
1315
- recursive: true,
1316
- force: true
1572
+ if (filter.namespace !== void 0) {
1573
+ const owner = ownerFromNamespace(filter.namespace);
1574
+ await withCacheFileLock(ownerPath(cacheDir, owner), async () => {
1575
+ const cacheFile = await readCacheFile(cacheDir, owner);
1576
+ if (cacheFile === null) return;
1577
+ await writeOrRemoveCacheFile(cacheDir, {
1578
+ version: 1,
1579
+ owner,
1580
+ entries: Object.fromEntries(Object.entries(cacheFile.entries).filter(([key, entry]) => {
1581
+ if (filter.key !== void 0) return key !== filter.key;
1582
+ return entry.namespace !== filter.namespace;
1583
+ }))
1584
+ });
1317
1585
  });
1318
1586
  return;
1319
1587
  }
1320
- if (filter.namespace !== void 0 && filter.key !== void 0) {
1321
- await rm(entryPath(cacheDir, filter.namespace, filter.key), { force: true });
1322
- return;
1323
- }
1324
- const namespaces = await readdir(cacheDir);
1325
- for (const namespace of namespaces) {
1326
- const filePath = entryPath(cacheDir, namespace, filter.key ?? "");
1327
- if (existsSync(filePath)) await rm(filePath, { force: true });
1588
+ const files = await readdir(cacheDir);
1589
+ for (const fileName of files) {
1590
+ if (!fileName.endsWith(".json")) continue;
1591
+ const filePath = join(cacheDir, fileName);
1592
+ await withCacheFileLock(filePath, async () => {
1593
+ const cacheFile = await readCacheFilePath(filePath);
1594
+ if (cacheFile === null) return;
1595
+ const entries = Object.fromEntries(Object.entries(cacheFile.entries).filter(([key]) => key !== filter.key));
1596
+ await writeOrRemoveCacheFile(cacheDir, {
1597
+ version: 1,
1598
+ owner: cacheFile.owner,
1599
+ entries
1600
+ });
1601
+ });
1328
1602
  }
1329
1603
  }
1330
1604
  };
@@ -1356,8 +1630,16 @@ function createBufferedCacheStore(backingStore) {
1356
1630
  }
1357
1631
  };
1358
1632
  }
1359
- function entryPath(cacheDir, namespace, keyHash) {
1360
- return join(cacheDir, sanitizeSegment$1(namespace), `${keyHash}.json`);
1633
/**
 * Normalize the configured per-eval entry limit.
 *
 * Missing, non-finite, or non-positive values fall back to
 * `defaultMaxEntriesPerEval`; any other value is rounded down.
 */
function normalizeMaxEntries(value) {
  const usable = value !== undefined && Number.isFinite(value) && value > 0;
  return usable ? Math.floor(value) : defaultMaxEntriesPerEval;
}
1637
/**
 * Derive the cache owner from a namespace.
 *
 * The owner is the text before the first `__` separator. When there is no
 * separator, or nothing precedes it, the whole namespace is the owner.
 */
function ownerFromNamespace(namespace) {
  const separatorAt = namespace.indexOf("__");
  const owner = separatorAt === -1 ? namespace : namespace.slice(0, separatorAt);
  return owner.length === 0 ? namespace : owner;
}
1641
/** Build the path of the JSON cache file that stores all entries of `owner`. */
function ownerPath(cacheDir, owner) {
  const fileName = `${sanitizeSegment$1(owner)}.json`;
  return join(cacheDir, fileName);
}
1362
1644
  function toPendingKey(namespace, keyHash) {
1363
1645
  return `${namespace}::${keyHash}`;
@@ -1365,6 +1647,69 @@ function toPendingKey(namespace, keyHash) {
1365
1647
  function sanitizeSegment$1(segment) {
1366
1648
  return segment.replace(/[^a-zA-Z0-9_.-]/g, "_");
1367
1649
  }
1650
/** Read and validate the cache file of `owner`; resolves to `null` when unavailable. */
async function readCacheFile(cacheDir, owner) {
  const filePath = ownerPath(cacheDir, owner);
  return readCacheFilePath(filePath);
}
1653
/**
 * Read a cache file from disk and validate it against `cacheFileSchema`.
 *
 * Every failure mode (missing file, read error, invalid JSON, schema
 * mismatch) resolves to `null` rather than throwing.
 */
async function readCacheFilePath(filePath) {
  if (!existsSync(filePath)) return null;

  const read = await resultify(() => readFile(filePath, "utf-8"));
  if (read.error) return null;

  const payload = safeJsonParse(read.value);
  if (payload === null) return null;

  const validation = cacheFileSchema.safeParse(payload);
  return validation.success ? validation.data : null;
}
1663
/**
 * Persist a cache file, or delete it when it no longer has any entries.
 *
 * Deletion uses `force: true`, so a file that is already gone is not an error.
 */
async function writeOrRemoveCacheFile(cacheDir, cacheFile) {
  const hasEntries = Object.keys(cacheFile.entries).length > 0;
  if (hasEntries) {
    await writeCacheFile(cacheDir, cacheFile);
    return;
  }
  await rm(ownerPath(cacheDir, cacheFile.owner), { force: true });
}
1670
/**
 * Write a cache file for `cacheFile.owner` inside `cacheDir`.
 *
 * The directory is created if needed. The JSON (pretty-printed with two
 * spaces) is written to a process-specific `.tmp` sibling first and then
 * renamed over the final path, so readers never observe a partially written file.
 */
async function writeCacheFile(cacheDir, cacheFile) {
  await mkdir(cacheDir, { recursive: true });

  const filePath = ownerPath(cacheDir, cacheFile.owner);
  const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
  const serialized = JSON.stringify(cacheFile, null, 2);

  await writeFile(tmpPath, serialized);
  await rename(tmpPath, filePath);
}
1677
+ function pruneEntries(entries, maxEntries, protectedKey) {
1678
+ const sorted = Object.values(entries).toSorted((a, b) => a.storedAt < b.storedAt ? 1 : -1);
1679
+ const kept = /* @__PURE__ */ new Map();
1680
+ const protectedEntry = entries[protectedKey];
1681
+ if (protectedEntry !== void 0) kept.set(protectedEntry.key, protectedEntry);
1682
+ for (const entry of sorted) {
1683
+ if (kept.size >= maxEntries) break;
1684
+ kept.set(entry.key, entry);
1685
+ }
1686
+ return Object.fromEntries([...kept.values()].toSorted((a, b) => a.key < b.key ? -1 : 1).map((entry) => [entry.key, entry]));
1687
+ }
1688
/**
 * Run `fn` while holding the lock directory `<filePath>.lock`.
 *
 * The lock is acquired first and removed again after `fn` settles, whether it
 * succeeded or failed; a failure from `fn` is rethrown only after the lock has
 * been released. The callback's return value is not passed on.
 */
async function withCacheFileLock(filePath, fn) {
  const lockPath = `${filePath}.lock`;
  await acquireLock(lockPath);

  // Capture the outcome instead of letting it throw so the lock is always released.
  const { error } = await resultify(fn);
  await rm(lockPath, { recursive: true, force: true });

  if (error) throw error;
}
1698
/**
 * Acquire a lock by creating the directory `lockPath`.
 *
 * Creating a directory without `recursive` fails when it already exists, so
 * whoever creates it owns the lock. Creation is retried every `retryDelayMs`
 * until `timeoutMs` has elapsed.
 *
 * On timeout a descriptive error is thrown, with the last `mkdir` failure
 * attached as `cause`. Rethrowing that raw failure instead would surface a
 * bare "EEXIST ... mkdir" message with no hint that a lock wait timed out.
 *
 * @param lockPath Directory path used as the lock.
 * @param options Optional tuning; the defaults match the previous fixed values.
 * @param options.timeoutMs Maximum time to keep retrying (default 5000).
 * @param options.retryDelayMs Pause between attempts (default 20).
 * @throws {Error} When the lock could not be acquired within `timeoutMs`.
 */
async function acquireLock(lockPath, { timeoutMs = 5e3, retryDelayMs = 20 } = {}) {
  const deadline = Date.now() + timeoutMs;
  let lastError;
  while (Date.now() < deadline) {
    try {
      await mkdir(lockPath, { recursive: false });
      return;
    } catch (error) {
      // Usually EEXIST (someone else holds the lock); remember it for diagnostics.
      lastError = error;
    }
    await sleep(retryDelayMs);
  }
  throw new Error(`Timed out acquiring cache lock at ${lockPath}`, { cause: lastError });
}
1710
/** Return a promise that resolves (with no value) after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
1368
1713
  function safeJsonParse(text) {
1369
1714
  const parsed = resultify(() => JSON.parse(text));
1370
1715
  if (parsed.error) return null;
@@ -2730,7 +3075,8 @@ function createRunner({ watchForChanges = true } = {}) {
2730
3075
  await mkdir(join(localStateDir, "runs"), { recursive: true });
2731
3076
  cacheStore = createFsCacheStore({
2732
3077
  workspaceRoot,
2733
- dir: config.cache?.dir
3078
+ dir: config.cache?.dir,
3079
+ maxEntriesPerEval: config.cache?.maxEntriesPerEval
2734
3080
  });
2735
3081
  await loadPersistedRuns();
2736
3082
  await runner.refreshDiscovery();
@@ -3128,6 +3474,9 @@ function parseArgs(argv) {
3128
3474
  const args = {
3129
3475
  command: "help",
3130
3476
  subcommand: void 0,
3477
+ showHelp: false,
3478
+ helpTopic: "global",
3479
+ unknownHelpTarget: void 0,
3131
3480
  evalIds: [],
3132
3481
  caseIds: [],
3133
3482
  trials: 1,
@@ -3138,19 +3487,28 @@ function parseArgs(argv) {
3138
3487
  all: false
3139
3488
  };
3140
3489
  const command = argv[0];
3141
- if (command === "app" || command === "list" || command === "run" || command === "cache" || command === "help") args.command = command;
3490
+ if (command === "--help" || command === "-h") {
3491
+ args.showHelp = true;
3492
+ return args;
3493
+ }
3494
+ if (isCliCommand(command)) {
3495
+ args.command = command;
3496
+ args.helpTopic = command === "help" ? "global" : command;
3497
+ } else if (command !== void 0 && !command.startsWith("-")) args.unknownHelpTarget = command;
3142
3498
  let cursor = 1;
3143
3499
  if (args.command === "cache") {
3144
3500
  const sub = argv[cursor];
3145
3501
  if (sub === "list" || sub === "clear") {
3146
3502
  args.subcommand = sub;
3503
+ args.helpTopic = `cache ${sub}`;
3147
3504
  cursor++;
3148
- }
3505
+ } else if (sub !== void 0 && !sub.startsWith("-")) args.unknownHelpTarget = `cache ${sub}`;
3149
3506
  }
3150
3507
  for (let i = cursor; i < argv.length; i++) {
3151
3508
  const arg = argv[i];
3152
3509
  const next = argv[i + 1];
3153
- if (arg === "--eval" && next) {
3510
+ if (arg === "--help" || arg === "-h") args.showHelp = true;
3511
+ else if (arg === "--eval" && next) {
3154
3512
  args.evalIds.push(...next.split(","));
3155
3513
  i++;
3156
3514
  } else if (arg === "--case" && next) {
@@ -3180,6 +3538,15 @@ function parseArgs(argv) {
3180
3538
  */
3181
3539
  async function runCli(argv) {
3182
3540
  const args = parseArgs(argv);
3541
+ if (args.showHelp) {
3542
+ if (args.unknownHelpTarget !== void 0) {
3543
+ console.error(`No help found for "${args.unknownHelpTarget}".`);
3544
+ process.exit(1);
3545
+ return;
3546
+ }
3547
+ printHelp(args.helpTopic);
3548
+ return;
3549
+ }
3183
3550
  switch (args.command) {
3184
3551
  case "app":
3185
3552
  await commandApp(args);
@@ -3194,10 +3561,13 @@ async function runCli(argv) {
3194
3561
  await commandCache(args);
3195
3562
  break;
3196
3563
  default:
3197
- printHelp();
3564
+ printHelp(args.helpTopic);
3198
3565
  break;
3199
3566
  }
3200
3567
  }
3568
/** Check whether `command` is one of the CLI's top-level commands. */
function isCliCommand(command) {
  switch (command) {
    case "app":
    case "list":
    case "run":
    case "cache":
    case "help":
      return true;
    default:
      return false;
  }
}
3201
3571
  const currentDir = dirname(fileURLToPath(import.meta.url));
3202
3572
  const repoRoot = resolve(currentDir, "../../..");
3203
3573
  const pnpmCommand = process.platform === "win32" ? "pnpm.cmd" : "pnpm";
@@ -3246,8 +3616,8 @@ async function commandApp(args) {
3246
3616
  const { serve } = await import("@hono/node-server");
3247
3617
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
3248
3618
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
3249
- const appModule = await import("./app-CKa9TjXw.mjs");
3250
- const runnerModule = await import("./runner-Ck4X0H3p.mjs");
3619
+ const appModule = await import("./app-DXYLqlWb.mjs");
3620
+ const runnerModule = await import("./runner-CToL8eJs.mjs");
3251
3621
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
3252
3622
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
3253
3623
  await runnerModule.initRunner();
@@ -3378,7 +3748,7 @@ async function commandCache(args) {
3378
3748
  process.exit(1);
3379
3749
  return;
3380
3750
  }
3381
- printHelp();
3751
+ printHelp(args.helpTopic);
3382
3752
  }
3383
3753
  async function waitForRunCompletion(runner, runId) {
3384
3754
  return new Promise((resolvePromise) => {
@@ -3393,7 +3763,69 @@ async function waitForRunCompletion(runner, runId) {
3393
3763
  check();
3394
3764
  });
3395
3765
  }
3396
- function printHelp() {
3766
+ function printHelp(topic = "global") {
3767
+ if (topic === "app") {
3768
+ console.info(`
3769
+ agent-evals app - Start server with UI
3770
+
3771
+ Usage:
3772
+ agent-evals app [flags]
3773
+
3774
+ Flags:
3775
+ --port <n> Server port (default: 4100)
3776
+ --help, -h Show this help
3777
+ `);
3778
+ return;
3779
+ }
3780
+ if (topic === "list") {
3781
+ console.info(`
3782
+ agent-evals list - List discovered evals
3783
+
3784
+ Usage:
3785
+ agent-evals list [flags]
3786
+
3787
+ Flags:
3788
+ --help, -h Show this help
3789
+ `);
3790
+ return;
3791
+ }
3792
+ if (topic === "run") {
3793
+ console.info(`
3794
+ agent-evals run - Run evals
3795
+
3796
+ Usage:
3797
+ agent-evals run [flags]
3798
+
3799
+ Flags:
3800
+ --eval <id> Run specific eval(s) (comma-separated)
3801
+ --case <id> Run specific case(s) (comma-separated)
3802
+ --trials <n> Number of trials per case
3803
+ --json Output run summary as JSON
3804
+ --cache <use|bypass|refresh> Cache mode for this run (default: use)
3805
+ --no-cache Shortcut for --cache bypass
3806
+ --refresh-cache Shortcut for --cache refresh
3807
+ --clear-cache Clear the cache before starting the run
3808
+ --help, -h Show this help
3809
+ `);
3810
+ return;
3811
+ }
3812
+ if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
3813
+ console.info(`
3814
+ agent-evals cache - Manage cached operation entries
3815
+
3816
+ Usage:
3817
+ agent-evals cache list [flags]
3818
+ agent-evals cache clear --eval <id>
3819
+ agent-evals cache clear --all
3820
+
3821
+ Flags:
3822
+ --eval <id> Clear entries for specific eval(s) (comma-separated)
3823
+ --all Confirm clearing every cached entry
3824
+ --json Output cache listing as JSON
3825
+ --help, -h Show this help
3826
+ `);
3827
+ return;
3828
+ }
3397
3829
  console.info(`
3398
3830
  agent-evals - LLM/Agent eval runner
3399
3831
 
@@ -3416,7 +3848,8 @@ Options:
3416
3848
  --no-cache Shortcut for --cache bypass
3417
3849
  --refresh-cache Shortcut for --cache refresh
3418
3850
  --clear-cache Clear the cache before starting the run
3851
+ --help, -h Show help
3419
3852
  `);
3420
3853
  }
3421
3854
  //#endregion
3422
- export { jsonCellSchema as $, scoreTraceSchema as A, traceAttributeDisplayFormatSchema as B, caseDetailSchema as C, evalStatItemSchema as D, evalStatAggregateSchema as E, evalChartConfigSchema as F, traceDisplayInputConfigSchema as G, traceAttributeDisplayPlacementSchema as H, evalChartMetricSchema as I, cellValueSchema as J, traceSpanKindSchema as K, evalChartTooltipExtraSchema as L, evalChartAxisSchema as M, evalChartBuiltinMetricSchema as N, evalStatsConfigSchema as O, evalChartColorSchema as P, fileRefSchema as Q, evalChartTypeSchema as R, assertionFailureSchema as S, evalFreshnessStatusSchema as T, traceAttributeDisplaySchema as U, traceAttributeDisplayInputSchema as V, traceDisplayConfigSchema as W, columnFormatSchema as X, columnDefSchema as Y, columnKindSchema as Z, cacheModeSchema as _, getEvalRegistry as _t, sseEnvelopeSchema as a, evalTracer as at, serializedCacheSpanSchema as b, deriveScopedSummaryFromCases as c, evalAssert as ct, runManifestSchema as d, isInEvalScope as dt, numberDisplayOptionsSchema as et, runSummarySchema as f, runInEvalScope as ft, cacheListItemSchema as g, defineEval as gt, cacheEntrySchema as h, repoFile as ht, updateManualScoreRequestSchema as i, evalSpan as it, evalChartAggregateSchema as j, evalSummarySchema as k, deriveStatusFromCaseRows as l, getCurrentScope as lt, trialSelectionModeSchema as m, setScopeCacheContext as mt, createRunner as n, runArtifactRefSchema as nt, getEvalTitle as o, hashCacheKey as ot, agentEvalsConfigSchema as p, setEvalOutput as pt, traceSpanSchema as q, createRunRequestSchema as r, buildTraceTree as rt, getEvalDisplayStatus as s, EvalAssertionError as st, runCli as t, repoFileRefSchema as tt, deriveStatusFromChildStatuses as u, incrementEvalOutput as ut, cacheRecordingOpSchema as v, caseRowSchema as w, spanCacheOptionsSchema as x, cacheRecordingSchema as y, evalChartsConfigSchema as z };
3855
+ export { fileRefSchema as $, evalSummarySchema as A, evalChartsConfigSchema as B, assertionFailureSchema as C, evalStatAggregateSchema as D, evalFreshnessStatusSchema as E, evalChartColorSchema as F, traceDisplayConfigSchema as G, traceAttributeDisplayInputSchema as H, evalChartConfigSchema as I, traceSpanSchema as J, traceDisplayInputConfigSchema as K, evalChartMetricSchema as L, evalChartAggregateSchema as M, evalChartAxisSchema as N, evalStatItemSchema as O, evalChartBuiltinMetricSchema as P, columnKindSchema as Q, evalChartTooltipExtraSchema as R, spanCacheOptionsSchema as S, caseRowSchema as T, traceAttributeDisplayPlacementSchema as U, traceAttributeDisplayFormatSchema as V, traceAttributeDisplaySchema as W, columnDefSchema as X, cellValueSchema as Y, columnFormatSchema as Z, cacheListItemSchema as _, repoFile as _t, sseEnvelopeSchema as a, evalSpan as at, cacheRecordingSchema as b, deriveScopedSummaryFromCases as c, hashCacheKeySync as ct, runManifestSchema as d, getCurrentScope as dt, jsonCellSchema as et, runSummarySchema as f, incrementEvalOutput as ft, cacheFileSchema as g, setScopeCacheContext as gt, cacheEntrySchema as h, setEvalOutput as ht, updateManualScoreRequestSchema as i, buildTraceTree as it, scoreTraceSchema as j, evalStatsConfigSchema as k, deriveStatusFromCaseRows as l, EvalAssertionError as lt, trialSelectionModeSchema as m, runInEvalScope as mt, createRunner as n, repoFileRefSchema as nt, getEvalTitle as o, evalTracer as ot, agentEvalsConfigSchema as p, isInEvalScope as pt, traceSpanKindSchema as q, createRunRequestSchema as r, runArtifactRefSchema as rt, getEvalDisplayStatus as s, hashCacheKey as st, runCli as t, numberDisplayOptionsSchema as tt, deriveStatusFromChildStatuses as u, evalAssert as ut, cacheModeSchema as v, defineEval as vt, caseDetailSchema as w, serializedCacheSpanSchema as x, cacheRecordingOpSchema as y, getEvalRegistry as yt, evalChartTypeSchema as z };