@ls-stack/agent-eval 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,8 +25,8 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-AUDD3rNB.js"></script>
29
- <link rel="stylesheet" crossorigin href="/assets/index-r0dVFK0B.css">
28
+ <script type="module" crossorigin src="/assets/index-DqR1YaMG.js"></script>
29
+ <link rel="stylesheet" crossorigin href="/assets/index-DNsZjOms.css">
30
30
  </head>
31
31
  <body>
32
32
  <div id="root"></div>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-D3QNOcPN.mjs";
2
+ import { t as runCli } from "./cli-ETfZ15RB.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { C as validateCharts, F as deriveScopedSummaryFromCases, Ln as getEvalRegistry, N as getEvalTitle, P as getEvalDisplayStatus, S as normalizeScoreDef, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as loadConfig, c as persistCaseDetail, d as recomputePersistedCaseStatus, et as resolveApiCallsConfig, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, tt as resolveLlmCallsConfig, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig, z as runSummarySchema } from "./runOrchestration-CimthgI7.mjs";
1
+ import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Un as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-B31SV_Bq.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { dirname, join, relative, resolve } from "node:path";
@@ -8,6 +8,34 @@ import { existsSync } from "node:fs";
8
8
  import { resultify } from "t-result";
9
9
  import { fileURLToPath } from "node:url";
10
10
  import { spawn, spawnSync } from "node:child_process";
11
+ //#region ../runner/src/evalSummaries.ts
12
+ /** Build the API/UI summary payload for one discovered eval: spreads `meta` minus `sourceFingerprint` (which is only handed to `deriveEvalFreshness`), adds the freshness flags (the stale window falls back to 14 days when `config.staleAfterDays` is nullish), and reports `null` for the latest-run fields when `latestRun` is absent. */
13
+ function buildEvalSummary(params) {
14
+ const { meta, config, gitState, latestRun, lastRunStatus } = params;
15
+ const { sourceFingerprint, ...summaryMeta } = meta;
16
+ const freshness = deriveEvalFreshness({
17
+ latestRun,
18
+ gitState,
19
+ currentEvalSourceFingerprint: sourceFingerprint,
20
+ staleAfterDays: config.staleAfterDays ?? 14
21
+ });
22
+ return {
23
+ ...summaryMeta,
24
+ stale: freshness.stale,
25
+ outdated: freshness.outdated,
26
+ freshnessStatus: freshness.freshnessStatus,
27
+ latestRunAt: latestRun?.startedAt ?? null,
28
+ latestRunCommitSha: latestRun?.commitSha ?? null,
29
+ currentCommitSha: gitState.commitSha,
30
+ lastRunStatus
31
+ };
32
+ }
33
+ /** Store the same `info` object in `latestRunInfoMap` under every entry of `evalIds` (entries share one reference, not copies). NOTE(review): createRunner fills `evalIds` from `getTargetEvalKeys(...)` and reads the map back with `meta.key`, so the entries are presumably eval keys rather than bare eval ids — confirm and consider renaming the field. */
34
+ function setLatestRunInfoMap(params) {
35
+ const { latestRunInfoMap, evalIds, info } = params;
36
+ for (const evalId of evalIds) latestRunInfoMap.set(evalId, info);
37
+ }
38
+ //#endregion
11
39
  //#region ../runner/src/gitState.ts
12
40
  function runGitCommand(workspaceRoot, args) {
13
41
  const result = spawnSync("git", args, {
@@ -142,16 +170,16 @@ function handleRunChildMessage(params) {
142
170
  handleRunChildEvent(runState, message.event, managerContext);
143
171
  }
144
172
  function upsertFinishedCase(runState, caseDetail, caseRow) {
145
- const existingIndex = runState.cases.findIndex((row) => row.evalId === caseRow.evalId && row.caseId === caseRow.caseId && row.trial === caseRow.trial);
173
+ const existingIndex = runState.cases.findIndex((row) => getCaseRowCaseKey(row) === getCaseRowCaseKey(caseRow) && row.trial === caseRow.trial);
146
174
  if (existingIndex === -1) runState.cases.push(caseRow);
147
175
  else runState.cases[existingIndex] = caseRow;
148
- runState.caseDetails.set(caseDetail.caseId, caseDetail);
176
+ runState.caseDetails.set(caseDetail.caseKey ?? caseDetail.caseId, caseDetail);
149
177
  }
150
178
  function applyChildEvalMetas(evals, childMetas) {
151
179
  for (const childMeta of childMetas) {
152
- const evalMeta = evals.get(childMeta.id);
180
+ const evalMeta = evals.get(childMeta.key);
153
181
  if (evalMeta === void 0) {
154
- evals.set(childMeta.id, childMeta);
182
+ evals.set(childMeta.key, childMeta);
155
183
  continue;
156
184
  }
157
185
  evalMeta.columnDefs = childMeta.columnDefs;
@@ -266,6 +294,7 @@ function createRunner({ watchForChanges = true } = {}) {
266
294
  let llmCallsConfig = resolveLlmCallsConfig(void 0);
267
295
  let apiCallsConfig = resolveApiCallsConfig(void 0);
268
296
  const evals = /* @__PURE__ */ new Map();
297
+ let discoveryIssues = [];
269
298
  const runs = /* @__PURE__ */ new Map();
270
299
  const lastRunStatusMap = /* @__PURE__ */ new Map();
271
300
  const latestRunInfoMap = /* @__PURE__ */ new Map();
@@ -279,7 +308,13 @@ function createRunner({ watchForChanges = true } = {}) {
279
308
  return relative(workspaceRoot, filePath).replaceAll("\\", "/");
280
309
  }
281
310
  function getSortedEvalMetas() {
282
- return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
311
+ return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath) || a.id.localeCompare(b.id));
312
+ }
313
+ function resolveEvalMeta(evalRef) {
314
+ const exactMatch = evals.get(evalRef);
315
+ if (exactMatch !== void 0) return exactMatch;
316
+ const matches = getSortedEvalMetas().filter((ev) => ev.id === evalRef);
317
+ return matches.length === 1 ? matches[0] : void 0;
283
318
  }
284
319
  function getSourceFingerprint(source) {
285
320
  return createHash("sha256").update(source).digest("hex");
@@ -312,12 +347,12 @@ function createRunner({ watchForChanges = true } = {}) {
312
347
  async clearCache(filter) {
313
348
  await cacheStore.clear(filter);
314
349
  },
315
- async recomputeStatusesForEval(evalId) {
316
- const evalMeta = evals.get(evalId);
350
+ async recomputeStatusesForEval(evalKey) {
351
+ const evalMeta = resolveEvalMeta(evalKey);
317
352
  if (!evalMeta) return { updatedRuns: 0 };
318
353
  const registry = getEvalRegistry();
319
354
  await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? void 0);
320
- const entry = registry.get(evalId);
355
+ const entry = registry.get(evalMeta.id);
321
356
  if (!entry) return { updatedRuns: 0 };
322
357
  const scoreThresholds = /* @__PURE__ */ new Map();
323
358
  entry.use((evalDef) => {
@@ -329,22 +364,25 @@ function createRunner({ watchForChanges = true } = {}) {
329
364
  });
330
365
  const updatedRuns = await recomputeEvalStatusesInRuns({
331
366
  runs: runs.values(),
332
- evalId,
333
- evalExists: evals.has(evalId),
367
+ evalKey: evalMeta.key,
368
+ evalId: evalMeta.id,
369
+ evalExists: evals.has(evalMeta.key),
334
370
  scoreThresholds,
335
371
  persistCaseDetail
336
372
  });
337
373
  emitDiscoveryEvent();
338
374
  return { updatedRuns };
339
375
  },
340
- async cleanRunsForEval(evalId) {
376
+ async cleanRunsForEval(evalKey) {
377
+ const evalMeta = resolveEvalMeta(evalKey);
341
378
  let deletedRuns = 0;
342
379
  for (const [runId, run] of [...runs]) {
343
380
  if (!runTouchesEval({
344
381
  target: run.manifest.target,
345
382
  caseRows: run.cases,
346
- evalId,
347
- evalExists: evals.has(evalId)
383
+ evalKey: evalMeta?.key ?? evalKey,
384
+ evalId: evalMeta?.id,
385
+ evalExists: evalMeta !== void 0
348
386
  })) continue;
349
387
  if (run.manifest.status === "running") continue;
350
388
  runs.delete(runId);
@@ -367,12 +405,12 @@ function createRunner({ watchForChanges = true } = {}) {
367
405
  updated: false,
368
406
  reason: "Run is still running"
369
407
  };
370
- const caseRow = run.cases.find((row) => row.caseId === caseId);
408
+ const caseRow = run.cases.find((row) => getCaseRowCaseKey(row) === caseId || row.caseId === caseId);
371
409
  if (!caseRow) return {
372
410
  updated: false,
373
411
  reason: "Case not found"
374
412
  };
375
- const evalMeta = evals.get(caseRow.evalId);
413
+ const evalMeta = evals.get(getCaseRowEvalKey(caseRow));
376
414
  if (!evalMeta) return {
377
415
  updated: false,
378
416
  reason: "Eval not found"
@@ -381,7 +419,7 @@ function createRunner({ watchForChanges = true } = {}) {
381
419
  updated: false,
382
420
  reason: "Manual score not found"
383
421
  };
384
- const caseDetail = run.caseDetails.get(caseId);
422
+ const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
385
423
  if (!caseDetail) return {
386
424
  updated: false,
387
425
  reason: "Case detail not found"
@@ -435,22 +473,25 @@ function createRunner({ watchForChanges = true } = {}) {
435
473
  meta,
436
474
  config,
437
475
  gitState,
438
- latestRun: latestRunInfoMap.get(meta.id),
439
- lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
476
+ latestRun: latestRunInfoMap.get(meta.key),
477
+ lastRunStatus: lastRunStatusMap.get(meta.key) ?? null
440
478
  }));
441
479
  return result;
442
480
  },
443
481
  getEval(id) {
444
- const meta = evals.get(id);
482
+ const meta = resolveEvalMeta(id);
445
483
  if (!meta) return void 0;
446
484
  return buildEvalSummary({
447
485
  meta,
448
486
  config,
449
487
  gitState: readGitWorktreeState(workspaceRoot),
450
- latestRun: latestRunInfoMap.get(meta.id),
451
- lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
488
+ latestRun: latestRunInfoMap.get(meta.key),
489
+ lastRunStatus: lastRunStatusMap.get(meta.key) ?? null
452
490
  });
453
491
  },
492
+ getDiscoveryIssues() {
493
+ return discoveryIssues;
494
+ },
454
495
  async refreshDiscovery() {
455
496
  const patterns = config.include;
456
497
  const discovered = [];
@@ -462,16 +503,25 @@ function createRunner({ watchForChanges = true } = {}) {
462
503
  discovered.push(...files);
463
504
  }
464
505
  evals.clear();
506
+ discoveryIssues = [];
465
507
  for (const filePath of discovered) try {
466
508
  const content = await readFile(filePath, "utf-8");
467
- const discoveredMetas = parseEvalMetas(filePath, content);
509
+ const discovery = parseEvalDiscovery(filePath, content);
510
+ const discoveredMetas = discovery.metas;
511
+ discoveryIssues.push(...discovery.issues.map((issue) => ({
512
+ ...issue,
513
+ filePath: toWorkspaceRelativePath(issue.filePath),
514
+ message: `Duplicate eval id "${issue.evalId}" in ${toWorkspaceRelativePath(issue.filePath)}. Eval ids must be unique within one file.`
515
+ })));
468
516
  const sourceFingerprint = getSourceFingerprint(content);
469
517
  const registry = getEvalRegistry();
518
+ let moduleLoaded = false;
470
519
  try {
471
520
  await loadEvalModule(filePath, sourceFingerprint);
521
+ moduleLoaded = true;
472
522
  } catch {}
473
523
  for (const meta of discoveredMetas) {
474
- const discoveredEntry = registry.get(meta.id);
524
+ const discoveredEntry = moduleLoaded ? registry.get(meta.id) : void 0;
475
525
  const title = meta.title;
476
526
  let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
477
527
  let stats;
@@ -491,10 +541,16 @@ function createRunner({ watchForChanges = true } = {}) {
491
541
  for (const warning of validated.warnings) console.warn(warning);
492
542
  charts = validated.charts;
493
543
  });
494
- evals.set(meta.id, {
544
+ const relativeFilePath = toWorkspaceRelativePath(meta.filePath);
545
+ const key = buildEvalKey({
546
+ filePath: relativeFilePath,
547
+ evalId: meta.id
548
+ });
549
+ evals.set(key, {
550
+ key,
495
551
  id: meta.id,
496
552
  title,
497
- filePath: toWorkspaceRelativePath(meta.filePath),
553
+ filePath: relativeFilePath,
498
554
  sourceFilePath: meta.filePath,
499
555
  sourceFingerprint,
500
556
  columnDefs,
@@ -549,10 +605,9 @@ function createRunner({ watchForChanges = true } = {}) {
549
605
  runs.set(runId, runState);
550
606
  setLatestRunInfoMap({
551
607
  latestRunInfoMap,
552
- evalIds: getTargetEvalIds({
608
+ evalIds: getTargetEvalKeys({
553
609
  request,
554
- sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
555
- knownEvalIds: new Set(evals.keys())
610
+ sortedEvals: getSortedEvalMetas()
556
611
  }),
557
612
  info: {
558
613
  status: "running",
@@ -633,7 +688,7 @@ function createRunner({ watchForChanges = true } = {}) {
633
688
  getCaseDetail(runId, caseId) {
634
689
  const run = runs.get(runId);
635
690
  if (!run) return void 0;
636
- return run.caseDetails.get(caseId);
691
+ return run.caseDetails.get(caseId) ?? run.caseDetails.get(getCaseRowCaseKey(run.cases.find((caseRow) => getCaseRowCaseKey(caseRow) === caseId || caseRow.caseId === caseId) ?? { caseId }));
637
692
  },
638
693
  subscribe(runId, listener) {
639
694
  const run = runs.get(runId);
@@ -799,6 +854,7 @@ function parseArgs(argv) {
799
854
  helpTopic: "global",
800
855
  unknownHelpTarget: void 0,
801
856
  evalIds: [],
857
+ files: [],
802
858
  caseIds: [],
803
859
  trials: 1,
804
860
  json: false,
@@ -834,6 +890,9 @@ function parseArgs(argv) {
834
890
  else if (arg === "--eval" && next) {
835
891
  args.evalIds.push(...next.split(","));
836
892
  i++;
893
+ } else if (arg === "--file" && next) {
894
+ args.files.push(...next.split(","));
895
+ i++;
837
896
  } else if (arg === "--case" && next) {
838
897
  args.caseIds.push(...next.split(","));
839
898
  i++;
@@ -899,6 +958,28 @@ async function runCli(argv) {
899
958
  function isCliCommand(command) {
900
959
  return command === "app" || command === "list" || command === "run" || command === "show-runs" || command === "cache" || command === "help";
901
960
  }
/**
 * Backslash-escape regular-expression metacharacters so `value` matches literally.
 * @param {string} value - Literal text to embed in a regular expression.
 * @returns {string} `value` with each metacharacter prefixed by a backslash.
 */
function escapeRegex(value) {
  // `*` is included so the helper is safe for arbitrary strings, not only the
  // single non-wildcard characters globToRegex passes in.
  return value.replace(/[|\\{}()[\]^$+*?.]/g, "\\$&");
}
/**
 * Compile a glob pattern into an anchored RegExp for "/"-separated paths.
 *
 * Supported syntax: `?` matches one non-separator character, `*` matches any run of
 * non-separator characters, and `**` matches across separators. A `**` that
 * forms a whole leading or middle path segment (followed by "/") matches zero
 * or more directories, so a pattern beginning with a globstar segment also
 * matches files in the root. Backslashes in the pattern are treated as path
 * separators, not escapes.
 * @param {string} pattern - Glob pattern; "\\" is normalized to "/".
 * @returns {RegExp} Regular expression anchored at both ends.
 */
function globToRegex(pattern) {
  const normalized = pattern.replaceAll("\\", "/");
  let regex = "^";
  for (let i = 0; i < normalized.length; i++) {
    const char = normalized[i];
    if (char === "*" && normalized[i + 1] === "*") {
      const atSegmentStart = i === 0 || normalized[i - 1] === "/";
      if (atSegmentStart && normalized[i + 2] === "/") {
        // Globstar segment: optional "<anything>/" so zero directories also match.
        regex += "(?:.*/)?";
        i += 2;
      } else {
        regex += ".*";
        i += 1;
      }
    } else if (char === "*") regex += "[^/]*";
    else if (char === "?") regex += "[^/]";
    else regex += escapeRegex(char);
  }
  return new RegExp(`${regex}$`);
}
/**
 * Check whether a path matches a literal path or glob pattern.
 * @param {string} pattern - Literal path or glob; "\\" is normalized to "/".
 * @param {string} filePath - Path to test; expected to use "/" separators already.
 * @returns {boolean} True on an exact match or when the compiled glob matches.
 */
function fileMatches(pattern, filePath) {
  const normalized = pattern.replaceAll("\\", "/");
  // Exact comparison first so literal paths containing no wildcards short-circuit.
  return normalized === filePath || globToRegex(normalized).test(filePath);
}
902
983
  function loadWorkspaceEnv() {
903
984
  const envPath = resolve(process.cwd(), ".env");
904
985
  if (!existsSync(envPath)) return true;
@@ -959,8 +1040,8 @@ async function commandApp(args) {
959
1040
  const { serve } = await import("@hono/node-server");
960
1041
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
961
1042
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
962
- const appModule = await import("./app-Cw79dJDr.mjs");
963
- const runnerModule = await import("./runner-B-SYzW8w.mjs");
1043
+ const appModule = await import("./app-DS3j_AyX.mjs");
1044
+ const runnerModule = await import("./runner-B2f2TEjp.mjs");
964
1045
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
965
1046
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
966
1047
  await runnerModule.initRunner();
@@ -973,9 +1054,16 @@ async function commandApp(args) {
973
1054
  async function commandList(args_) {
974
1055
  const runner = createRunner({ watchForChanges: false });
975
1056
  await runner.init();
1057
+ const discoveryIssues = runner.getDiscoveryIssues();
1058
+ if (discoveryIssues.length > 0) {
1059
+ console.error("Discovery errors:\n");
1060
+ for (const issue of discoveryIssues) console.error(` ${issue.message}`);
1061
+ console.error("");
1062
+ }
976
1063
  const evals = runner.getEvals();
977
1064
  if (evals.length === 0) {
978
1065
  console.info("No eval files found.");
1066
+ if (discoveryIssues.length > 0) process.exit(1);
979
1067
  return;
980
1068
  }
981
1069
  console.info("Discovered evals:\n");
@@ -994,12 +1082,13 @@ async function commandList(args_) {
994
1082
  if (ev.caseCount !== null) console.info(` cases: ${String(ev.caseCount)}`);
995
1083
  console.info("");
996
1084
  }
1085
+ if (discoveryIssues.length > 0) process.exit(1);
997
1086
  }
998
1087
  async function commandRun(args) {
999
1088
  const runner = createRunner({ watchForChanges: false });
1000
1089
  await runner.init();
1001
- if (args.evalIds.length === 0 && args.caseIds.length === 0 && !runner.getAllowCliRunAll()) {
1002
- console.error("This workspace disables running all evals from the CLI. Pass --eval <id> or --case <id> to run a targeted subset.");
1090
+ if (args.evalIds.length === 0 && args.caseIds.length === 0 && args.files.length === 0 && !runner.getAllowCliRunAll()) {
1091
+ console.error("This workspace disables running all evals from the CLI. Pass --eval <id>, --file <path|glob>, or --case <id> to run a targeted subset.");
1003
1092
  process.exit(1);
1004
1093
  return;
1005
1094
  }
@@ -1013,10 +1102,15 @@ async function commandRun(args) {
1013
1102
  const target = args.caseIds.length > 0 ? {
1014
1103
  mode: "caseIds",
1015
1104
  caseIds: args.caseIds,
1016
- evalIds: args.evalIds.length > 0 ? args.evalIds : void 0
1105
+ evalIds: args.evalIds.length > 0 ? args.evalIds : void 0,
1106
+ files: args.files.length > 0 ? args.files : void 0
1017
1107
  } : args.evalIds.length > 0 ? {
1018
1108
  mode: "evalIds",
1019
- evalIds: args.evalIds
1109
+ evalIds: args.evalIds,
1110
+ files: args.files.length > 0 ? args.files : void 0
1111
+ } : args.files.length > 0 ? {
1112
+ mode: "evalIds",
1113
+ files: args.files
1020
1114
  } : { mode: "all" };
1021
1115
  const run = await runner.startRun({
1022
1116
  target,
@@ -1046,8 +1140,12 @@ async function commandRun(args) {
1046
1140
  console.info(`Errors: ${String(summary.errorCases)}`);
1047
1141
  if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
1048
1142
  if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
1143
+ if (summary.errorMessage !== null) {
1144
+ console.info("");
1145
+ console.info(summary.errorMessage);
1146
+ }
1049
1147
  }
1050
- if (summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
1148
+ if (summary.status === "error" || summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
1051
1149
  }
1052
1150
  async function commandShowRuns(args) {
1053
1151
  const runner = createRunner({ watchForChanges: false });
@@ -1101,8 +1199,9 @@ async function commandCache(args) {
1101
1199
  return;
1102
1200
  }
1103
1201
  if (args.subcommand === "clear") {
1104
- if (args.evalIds.length > 0) {
1105
- for (const evalId of args.evalIds) {
1202
+ if (args.evalIds.length > 0 || args.files.length > 0) {
1203
+ const evalIds = runner.getEvals().filter((ev) => (args.evalIds.length === 0 || args.evalIds.includes(ev.id)) && (args.files.length === 0 || args.files.some((file) => fileMatches(file, ev.filePath)))).map((ev) => ev.id);
1204
+ for (const evalId of evalIds) {
1106
1205
  const entries = await runner.listCache();
1107
1206
  const prefix = `${evalId}__`;
1108
1207
  const matching = entries.filter((entry) => entry.namespace.startsWith(prefix));
@@ -1111,7 +1210,7 @@ async function commandCache(args) {
1111
1210
  key: entry.key
1112
1211
  });
1113
1212
  }
1114
- console.info(`Cleared cache entries for: ${args.evalIds.join(", ")}`);
1213
+ console.info(`Cleared cache entries for: ${evalIds.join(", ")}`);
1115
1214
  return;
1116
1215
  }
1117
1216
  if (args.all) {
@@ -1130,6 +1229,9 @@ function getSortedRunSnapshots(runner) {
1130
1229
  }
1131
1230
  function buildRunFileIndex(workspaceRoot, run) {
1132
1231
  const runDir = join(workspaceRoot, ".agent-evals", "runs", run.manifest.id);
1232
+ const caseIdCounts = /* @__PURE__ */ new Map();
1233
+ for (const caseRow of run.cases) caseIdCounts.set(caseRow.caseId, (caseIdCounts.get(caseRow.caseId) ?? 0) + 1);
1234
+ const seenCaseIds = /* @__PURE__ */ new Set();
1133
1235
  return {
1134
1236
  id: run.manifest.id,
1135
1237
  shortId: run.manifest.shortId,
@@ -1147,10 +1249,16 @@ function buildRunFileIndex(workspaceRoot, run) {
1147
1249
  tracesDir: join(runDir, "traces")
1148
1250
  },
1149
1251
  cases: run.cases.map((caseRow) => {
1150
- const fileName = `${encodeURIComponent(caseRow.caseId)}.json`;
1252
+ const duplicateCaseIdCount = caseIdCounts.get(caseRow.caseId) ?? 0;
1253
+ const hasPreviousCaseWithId = seenCaseIds.has(caseRow.caseId);
1254
+ const fileId = duplicateCaseIdCount > 1 && hasPreviousCaseWithId ? caseRow.caseKey ?? caseRow.caseId : caseRow.caseId;
1255
+ seenCaseIds.add(caseRow.caseId);
1256
+ const fileName = `${encodeURIComponent(fileId)}.json`;
1151
1257
  return {
1152
1258
  caseId: caseRow.caseId,
1259
+ caseKey: caseRow.caseKey,
1153
1260
  evalId: caseRow.evalId,
1261
+ evalKey: caseRow.evalKey,
1154
1262
  status: caseRow.status,
1155
1263
  files: {
1156
1264
  caseDetail: join(runDir, "case-details", fileName),
@@ -1262,7 +1370,8 @@ Usage:
1262
1370
 
1263
1371
  Flags:
1264
1372
  --eval <id> Run specific eval(s) (comma-separated)
1265
- --case <id> Run specific case(s) (comma-separated)
1373
+ --file <path|glob> Run eval files matching path/glob (comma-separated)
1374
+ --case <id> Run case(s); combine with --file/--eval if ambiguous
1266
1375
  --trials <n> Number of trials per case
1267
1376
  --inspect[=host:port] Run with the Node.js inspector enabled
1268
1377
  --inspect-brk[=host:port] Enable inspector and pause before startup