npm - @ls-stack/agent-eval - Versions diffs - 0.23.0 → 0.25.0 - Mend

@ls-stack/agent-eval 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/dist/{app-Cw79dJDr.mjs → app-DS3j_AyX.mjs} +6 -3
package/dist/apps/web/dist/assets/index-DNsZjOms.css +1 -0
package/dist/apps/web/dist/assets/index-DqR1YaMG.js +118 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/{cli-D3QNOcPN.mjs → cli-ETfZ15RB.mjs} +151 -42
package/dist/index.d.mts +111 -12
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +28 -18
package/dist/{runOrchestration-CimthgI7.mjs → runOrchestration-B31SV_Bq.mjs} +364 -117
package/dist/{runner-B-SYzW8w.mjs → runner-B2f2TEjp.mjs} +1 -1
package/dist/{runner-4yNYRvmF.mjs → runner-cj1TkR-H.mjs} +2 -2
package/dist/src-CyNb2ycA.mjs +3 -0
package/package.json +1 -1
package/skills/agent-eval/SKILL.md +14 -6
package/dist/apps/web/dist/assets/index-AUDD3rNB.js +0 -118
package/dist/apps/web/dist/assets/index-r0dVFK0B.css +0 -1
package/dist/src-CcXfWT4M.mjs +0 -3

package/dist/apps/web/dist/index.html CHANGED Viewed

@@ -25,8 +25,8 @@
       href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
       rel="stylesheet"
     />
-    <script type="module" crossorigin src="/assets/index-AUDD3rNB.js"></script>
-    <link rel="stylesheet" crossorigin href="/assets/index-r0dVFK0B.css">
+    <script type="module" crossorigin src="/assets/index-DqR1YaMG.js"></script>
+    <link rel="stylesheet" crossorigin href="/assets/index-DNsZjOms.css">
   </head>
   <body>
     <div id="root"></div>

package/dist/bin.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
 #!/usr/bin/env node
-import { t as runCli } from "./cli-D3QNOcPN.mjs";
+import { t as runCli } from "./cli-ETfZ15RB.mjs";
 import { spawn } from "node:child_process";
 //#region src/bin.ts
 const moduleMocksFlag = "--experimental-test-module-mocks";

package/dist/{cli-D3QNOcPN.mjs → cli-ETfZ15RB.mjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { C as validateCharts, F as deriveScopedSummaryFromCases, Ln as getEvalRegistry, N as getEvalTitle, P as getEvalDisplayStatus, S as normalizeScoreDef, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as loadConfig, c as persistCaseDetail, d as recomputePersistedCaseStatus, et as resolveApiCallsConfig, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, tt as resolveLlmCallsConfig, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig, z as runSummarySchema } from "./runOrchestration-CimthgI7.mjs";
+import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Un as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-B31SV_Bq.mjs";
 import { createHash } from "node:crypto";
 import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
 import { dirname, join, relative, resolve } from "node:path";
@@ -8,6 +8,34 @@ import { existsSync } from "node:fs";
 import { resultify } from "t-result";
 import { fileURLToPath } from "node:url";
 import { spawn, spawnSync } from "node:child_process";
+//#region ../runner/src/evalSummaries.ts
+/** Build the API/UI summary payload for one discovered eval. */
+function buildEvalSummary(params) {
+	const { meta, config, gitState, latestRun, lastRunStatus } = params;
+	const { sourceFingerprint, ...summaryMeta } = meta;
+	const freshness = deriveEvalFreshness({
+		latestRun,
+		gitState,
+		currentEvalSourceFingerprint: sourceFingerprint,
+		staleAfterDays: config.staleAfterDays ?? 14
+	});
+	return {
+		...summaryMeta,
+		stale: freshness.stale,
+		outdated: freshness.outdated,
+		freshnessStatus: freshness.freshnessStatus,
+		latestRunAt: latestRun?.startedAt ?? null,
+		latestRunCommitSha: latestRun?.commitSha ?? null,
+		currentCommitSha: gitState.commitSha,
+		lastRunStatus
+	};
+}
+/** Write one latest-run snapshot to each targeted eval id. */
+function setLatestRunInfoMap(params) {
+	const { latestRunInfoMap, evalIds, info } = params;
+	for (const evalId of evalIds) latestRunInfoMap.set(evalId, info);
+}
+//#endregion
 //#region ../runner/src/gitState.ts
 function runGitCommand(workspaceRoot, args) {
 	const result = spawnSync("git", args, {
@@ -142,16 +170,16 @@ function handleRunChildMessage(params) {
 	handleRunChildEvent(runState, message.event, managerContext);
 }
 function upsertFinishedCase(runState, caseDetail, caseRow) {
-	const existingIndex = runState.cases.findIndex((row) => row.evalId === caseRow.evalId && row.caseId === caseRow.caseId && row.trial === caseRow.trial);
+	const existingIndex = runState.cases.findIndex((row) => getCaseRowCaseKey(row) === getCaseRowCaseKey(caseRow) && row.trial === caseRow.trial);
 	if (existingIndex === -1) runState.cases.push(caseRow);
 	else runState.cases[existingIndex] = caseRow;
-	runState.caseDetails.set(caseDetail.caseId, caseDetail);
+	runState.caseDetails.set(caseDetail.caseKey ?? caseDetail.caseId, caseDetail);
 }
 function applyChildEvalMetas(evals, childMetas) {
 	for (const childMeta of childMetas) {
-		const evalMeta = evals.get(childMeta.id);
+		const evalMeta = evals.get(childMeta.key);
 		if (evalMeta === void 0) {
-			evals.set(childMeta.id, childMeta);
+			evals.set(childMeta.key, childMeta);
 			continue;
 		}
 		evalMeta.columnDefs = childMeta.columnDefs;
@@ -266,6 +294,7 @@ function createRunner({ watchForChanges = true } = {}) {
 	let llmCallsConfig = resolveLlmCallsConfig(void 0);
 	let apiCallsConfig = resolveApiCallsConfig(void 0);
 	const evals = /* @__PURE__ */ new Map();
+	let discoveryIssues = [];
 	const runs = /* @__PURE__ */ new Map();
 	const lastRunStatusMap = /* @__PURE__ */ new Map();
 	const latestRunInfoMap = /* @__PURE__ */ new Map();
@@ -279,7 +308,13 @@ function createRunner({ watchForChanges = true } = {}) {
 		return relative(workspaceRoot, filePath).replaceAll("\\", "/");
 	}
 	function getSortedEvalMetas() {
-		return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
+		return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath) || a.id.localeCompare(b.id));
+	}
+	function resolveEvalMeta(evalRef) {
+		const exactMatch = evals.get(evalRef);
+		if (exactMatch !== void 0) return exactMatch;
+		const matches = getSortedEvalMetas().filter((ev) => ev.id === evalRef);
+		return matches.length === 1 ? matches[0] : void 0;
 	}
 	function getSourceFingerprint(source) {
 		return createHash("sha256").update(source).digest("hex");
@@ -312,12 +347,12 @@ function createRunner({ watchForChanges = true } = {}) {
 		async clearCache(filter) {
 			await cacheStore.clear(filter);
 		},
-		async recomputeStatusesForEval(evalId) {
-			const evalMeta = evals.get(evalId);
+		async recomputeStatusesForEval(evalKey) {
+			const evalMeta = resolveEvalMeta(evalKey);
 			if (!evalMeta) return { updatedRuns: 0 };
 			const registry = getEvalRegistry();
 			await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? void 0);
-			const entry = registry.get(evalId);
+			const entry = registry.get(evalMeta.id);
 			if (!entry) return { updatedRuns: 0 };
 			const scoreThresholds = /* @__PURE__ */ new Map();
 			entry.use((evalDef) => {
@@ -329,22 +364,25 @@ function createRunner({ watchForChanges = true } = {}) {
 			});
 			const updatedRuns = await recomputeEvalStatusesInRuns({
 				runs: runs.values(),
-				evalId,
-				evalExists: evals.has(evalId),
+				evalKey: evalMeta.key,
+				evalId: evalMeta.id,
+				evalExists: evals.has(evalMeta.key),
 				scoreThresholds,
 				persistCaseDetail
 			});
 			emitDiscoveryEvent();
 			return { updatedRuns };
 		},
-		async cleanRunsForEval(evalId) {
+		async cleanRunsForEval(evalKey) {
+			const evalMeta = resolveEvalMeta(evalKey);
 			let deletedRuns = 0;
 			for (const [runId, run] of [...runs]) {
 				if (!runTouchesEval({
 					target: run.manifest.target,
 					caseRows: run.cases,
-					evalId,
-					evalExists: evals.has(evalId)
+					evalKey: evalMeta?.key ?? evalKey,
+					evalId: evalMeta?.id,
+					evalExists: evalMeta !== void 0
 				})) continue;
 				if (run.manifest.status === "running") continue;
 				runs.delete(runId);
@@ -367,12 +405,12 @@ function createRunner({ watchForChanges = true } = {}) {
 				updated: false,
 				reason: "Run is still running"
 			};
-			const caseRow = run.cases.find((row) => row.caseId === caseId);
+			const caseRow = run.cases.find((row) => getCaseRowCaseKey(row) === caseId || row.caseId === caseId);
 			if (!caseRow) return {
 				updated: false,
 				reason: "Case not found"
 			};
-			const evalMeta = evals.get(caseRow.evalId);
+			const evalMeta = evals.get(getCaseRowEvalKey(caseRow));
 			if (!evalMeta) return {
 				updated: false,
 				reason: "Eval not found"
@@ -381,7 +419,7 @@ function createRunner({ watchForChanges = true } = {}) {
 				updated: false,
 				reason: "Manual score not found"
 			};
-			const caseDetail = run.caseDetails.get(caseId);
+			const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
 			if (!caseDetail) return {
 				updated: false,
 				reason: "Case detail not found"
@@ -435,22 +473,25 @@ function createRunner({ watchForChanges = true } = {}) {
 				meta,
 				config,
 				gitState,
-				latestRun: latestRunInfoMap.get(meta.id),
-				lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
+				latestRun: latestRunInfoMap.get(meta.key),
+				lastRunStatus: lastRunStatusMap.get(meta.key) ?? null
 			}));
 			return result;
 		},
 		getEval(id) {
-			const meta = evals.get(id);
+			const meta = resolveEvalMeta(id);
 			if (!meta) return void 0;
 			return buildEvalSummary({
 				meta,
 				config,
 				gitState: readGitWorktreeState(workspaceRoot),
-				latestRun: latestRunInfoMap.get(meta.id),
-				lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
+				latestRun: latestRunInfoMap.get(meta.key),
+				lastRunStatus: lastRunStatusMap.get(meta.key) ?? null
 			});
 		},
+		getDiscoveryIssues() {
+			return discoveryIssues;
+		},
 		async refreshDiscovery() {
 			const patterns = config.include;
 			const discovered = [];
@@ -462,16 +503,25 @@ function createRunner({ watchForChanges = true } = {}) {
 				discovered.push(...files);
 			}
 			evals.clear();
+			discoveryIssues = [];
 			for (const filePath of discovered) try {
 				const content = await readFile(filePath, "utf-8");
-				const discoveredMetas = parseEvalMetas(filePath, content);
+				const discovery = parseEvalDiscovery(filePath, content);
+				const discoveredMetas = discovery.metas;
+				discoveryIssues.push(...discovery.issues.map((issue) => ({
+					...issue,
+					filePath: toWorkspaceRelativePath(issue.filePath),
+					message: `Duplicate eval id "${issue.evalId}" in ${toWorkspaceRelativePath(issue.filePath)}. Eval ids must be unique within one file.`
+				})));
 				const sourceFingerprint = getSourceFingerprint(content);
 				const registry = getEvalRegistry();
+				let moduleLoaded = false;
 				try {
 					await loadEvalModule(filePath, sourceFingerprint);
+					moduleLoaded = true;
 				} catch {}
 				for (const meta of discoveredMetas) {
-					const discoveredEntry = registry.get(meta.id);
+					const discoveredEntry = moduleLoaded ? registry.get(meta.id) : void 0;
 					const title = meta.title;
 					let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
 					let stats;
@@ -491,10 +541,16 @@ function createRunner({ watchForChanges = true } = {}) {
 						for (const warning of validated.warnings) console.warn(warning);
 						charts = validated.charts;
 					});
-					evals.set(meta.id, {
+					const relativeFilePath = toWorkspaceRelativePath(meta.filePath);
+					const key = buildEvalKey({
+						filePath: relativeFilePath,
+						evalId: meta.id
+					});
+					evals.set(key, {
+						key,
 						id: meta.id,
 						title,
-						filePath: toWorkspaceRelativePath(meta.filePath),
+						filePath: relativeFilePath,
 						sourceFilePath: meta.filePath,
 						sourceFingerprint,
 						columnDefs,
@@ -549,10 +605,9 @@ function createRunner({ watchForChanges = true } = {}) {
 			runs.set(runId, runState);
 			setLatestRunInfoMap({
 				latestRunInfoMap,
-				evalIds: getTargetEvalIds({
+				evalIds: getTargetEvalKeys({
 					request,
-					sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
-					knownEvalIds: new Set(evals.keys())
+					sortedEvals: getSortedEvalMetas()
 				}),
 				info: {
 					status: "running",
@@ -633,7 +688,7 @@ function createRunner({ watchForChanges = true } = {}) {
 		getCaseDetail(runId, caseId) {
 			const run = runs.get(runId);
 			if (!run) return void 0;
-			return run.caseDetails.get(caseId);
+			return run.caseDetails.get(caseId) ?? run.caseDetails.get(getCaseRowCaseKey(run.cases.find((caseRow) => getCaseRowCaseKey(caseRow) === caseId || caseRow.caseId === caseId) ?? { caseId }));
 		},
 		subscribe(runId, listener) {
 			const run = runs.get(runId);
@@ -799,6 +854,7 @@ function parseArgs(argv) {
 		helpTopic: "global",
 		unknownHelpTarget: void 0,
 		evalIds: [],
+		files: [],
 		caseIds: [],
 		trials: 1,
 		json: false,
@@ -834,6 +890,9 @@ function parseArgs(argv) {
 		else if (arg === "--eval" && next) {
 			args.evalIds.push(...next.split(","));
 			i++;
+		} else if (arg === "--file" && next) {
+			args.files.push(...next.split(","));
+			i++;
 		} else if (arg === "--case" && next) {
 			args.caseIds.push(...next.split(","));
 			i++;
@@ -899,6 +958,28 @@ async function runCli(argv) {
 function isCliCommand(command) {
 	return command === "app" || command === "list" || command === "run" || command === "show-runs" || command === "cache" || command === "help";
 }
+function escapeRegex(value) {
+	return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
+}
+function globToRegex(pattern) {
+	const normalized = pattern.replaceAll("\\", "/");
+	let regex = "^";
+	for (let i = 0; i < normalized.length; i++) {
+		const char = normalized[i];
+		const next = normalized[i + 1];
+		if (char === "*" && next === "*") {
+			regex += ".*";
+			i++;
+		} else if (char === "*") regex += "[^/]*";
+		else if (char === "?") regex += "[^/]";
+		else regex += escapeRegex(char ?? "");
+	}
+	return new RegExp(`${regex}$`);
+}
+function fileMatches(pattern, filePath) {
+	const normalized = pattern.replaceAll("\\", "/");
+	return normalized === filePath || globToRegex(normalized).test(filePath);
+}
 function loadWorkspaceEnv() {
 	const envPath = resolve(process.cwd(), ".env");
 	if (!existsSync(envPath)) return true;
@@ -959,8 +1040,8 @@ async function commandApp(args) {
 	const { serve } = await import("@hono/node-server");
 	const bundledWebDist = resolve(currentDir, "apps/web/dist");
 	if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
-	const appModule = await import("./app-Cw79dJDr.mjs");
-	const runnerModule = await import("./runner-B-SYzW8w.mjs");
+	const appModule = await import("./app-DS3j_AyX.mjs");
+	const runnerModule = await import("./runner-B2f2TEjp.mjs");
 	if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
 	if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
 	await runnerModule.initRunner();
@@ -973,9 +1054,16 @@ async function commandApp(args) {
 async function commandList(args_) {
 	const runner = createRunner({ watchForChanges: false });
 	await runner.init();
+	const discoveryIssues = runner.getDiscoveryIssues();
+	if (discoveryIssues.length > 0) {
+		console.error("Discovery errors:\n");
+		for (const issue of discoveryIssues) console.error(`  ${issue.message}`);
+		console.error("");
+	}
 	const evals = runner.getEvals();
 	if (evals.length === 0) {
 		console.info("No eval files found.");
+		if (discoveryIssues.length > 0) process.exit(1);
 		return;
 	}
 	console.info("Discovered evals:\n");
@@ -994,12 +1082,13 @@ async function commandList(args_) {
 		if (ev.caseCount !== null) console.info(`    cases: ${String(ev.caseCount)}`);
 		console.info("");
 	}
+	if (discoveryIssues.length > 0) process.exit(1);
 }
 async function commandRun(args) {
 	const runner = createRunner({ watchForChanges: false });
 	await runner.init();
-	if (args.evalIds.length === 0 && args.caseIds.length === 0 && !runner.getAllowCliRunAll()) {
-		console.error("This workspace disables running all evals from the CLI. Pass --eval <id> or --case <id> to run a targeted subset.");
+	if (args.evalIds.length === 0 && args.caseIds.length === 0 && args.files.length === 0 && !runner.getAllowCliRunAll()) {
+		console.error("This workspace disables running all evals from the CLI. Pass --eval <id>, --file <path|glob>, or --case <id> to run a targeted subset.");
 		process.exit(1);
 		return;
 	}
@@ -1013,10 +1102,15 @@ async function commandRun(args) {
 	const target = args.caseIds.length > 0 ? {
 		mode: "caseIds",
 		caseIds: args.caseIds,
-		evalIds: args.evalIds.length > 0 ? args.evalIds : void 0
+		evalIds: args.evalIds.length > 0 ? args.evalIds : void 0,
+		files: args.files.length > 0 ? args.files : void 0
 	} : args.evalIds.length > 0 ? {
 		mode: "evalIds",
-		evalIds: args.evalIds
+		evalIds: args.evalIds,
+		files: args.files.length > 0 ? args.files : void 0
+	} : args.files.length > 0 ? {
+		mode: "evalIds",
+		files: args.files
 	} : { mode: "all" };
 	const run = await runner.startRun({
 		target,
@@ -1046,8 +1140,12 @@ async function commandRun(args) {
 		console.info(`Errors: ${String(summary.errorCases)}`);
 		if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
 		if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
+		if (summary.errorMessage !== null) {
+			console.info("");
+			console.info(summary.errorMessage);
+		}
 	}
-	if (summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
+	if (summary.status === "error" || summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
 }
 async function commandShowRuns(args) {
 	const runner = createRunner({ watchForChanges: false });
@@ -1101,8 +1199,9 @@ async function commandCache(args) {
 		return;
 	}
 	if (args.subcommand === "clear") {
-		if (args.evalIds.length > 0) {
-			for (const evalId of args.evalIds) {
+		if (args.evalIds.length > 0 || args.files.length > 0) {
+			const evalIds = runner.getEvals().filter((ev) => (args.evalIds.length === 0 || args.evalIds.includes(ev.id)) && (args.files.length === 0 || args.files.some((file) => fileMatches(file, ev.filePath)))).map((ev) => ev.id);
+			for (const evalId of evalIds) {
 				const entries = await runner.listCache();
 				const prefix = `${evalId}__`;
 				const matching = entries.filter((entry) => entry.namespace.startsWith(prefix));
@@ -1111,7 +1210,7 @@ async function commandCache(args) {
 					key: entry.key
 				});
 			}
-			console.info(`Cleared cache entries for: ${args.evalIds.join(", ")}`);
+			console.info(`Cleared cache entries for: ${evalIds.join(", ")}`);
 			return;
 		}
 		if (args.all) {
@@ -1130,6 +1229,9 @@ function getSortedRunSnapshots(runner) {
 }
 function buildRunFileIndex(workspaceRoot, run) {
 	const runDir = join(workspaceRoot, ".agent-evals", "runs", run.manifest.id);
+	const caseIdCounts = /* @__PURE__ */ new Map();
+	for (const caseRow of run.cases) caseIdCounts.set(caseRow.caseId, (caseIdCounts.get(caseRow.caseId) ?? 0) + 1);
+	const seenCaseIds = /* @__PURE__ */ new Set();
 	return {
 		id: run.manifest.id,
 		shortId: run.manifest.shortId,
@@ -1147,10 +1249,16 @@ function buildRunFileIndex(workspaceRoot, run) {
 			tracesDir: join(runDir, "traces")
 		},
 		cases: run.cases.map((caseRow) => {
-			const fileName = `${encodeURIComponent(caseRow.caseId)}.json`;
+			const duplicateCaseIdCount = caseIdCounts.get(caseRow.caseId) ?? 0;
+			const hasPreviousCaseWithId = seenCaseIds.has(caseRow.caseId);
+			const fileId = duplicateCaseIdCount > 1 && hasPreviousCaseWithId ? caseRow.caseKey ?? caseRow.caseId : caseRow.caseId;
+			seenCaseIds.add(caseRow.caseId);
+			const fileName = `${encodeURIComponent(fileId)}.json`;
 			return {
 				caseId: caseRow.caseId,
+				caseKey: caseRow.caseKey,
 				evalId: caseRow.evalId,
+				evalKey: caseRow.evalKey,
 				status: caseRow.status,
 				files: {
 					caseDetail: join(runDir, "case-details", fileName),
@@ -1262,7 +1370,8 @@ Usage:
 Flags:
   --eval <id>                Run specific eval(s) (comma-separated)
-  --case <id>                Run specific case(s) (comma-separated)
+  --file <path|glob>         Run eval files matching path/glob (comma-separated)
+  --case <id>                Run case(s); combine with --file/--eval if ambiguous
   --trials <n>               Number of trials per case
   --inspect[=host:port]      Run with the Node.js inspector enabled
   --inspect-brk[=host:port]  Enable inspector and pause before startup