npm - @ls-stack/agent-eval - Version diff - 0.24.0 → 0.25.0 - Mend

@ls-stack/agent-eval 0.24.0 → 0.25.0

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (18)

package/dist/{app-DYRmucgj.mjs → app-DS3j_AyX.mjs} +6 -3
package/dist/apps/web/dist/assets/index-DNsZjOms.css +1 -0
package/dist/apps/web/dist/assets/index-DqR1YaMG.js +118 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/{cli-Be0x8CS3.mjs → cli-ETfZ15RB.mjs} +151 -42
package/dist/index.d.mts +61 -6
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +28 -18
package/dist/{runOrchestration-D697g6Qe.mjs → runOrchestration-B31SV_Bq.mjs} +222 -87
package/dist/{runner-jSujaSKt.mjs → runner-B2f2TEjp.mjs} +1 -1
package/dist/{runner-B4SosWgD.mjs → runner-cj1TkR-H.mjs} +2 -2
package/dist/src-CyNb2ycA.mjs +3 -0
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +6 -1
package/dist/apps/web/dist/assets/index-KbbX3NYr.js +0 -118
package/dist/apps/web/dist/assets/index-r0dVFK0B.css +0 -1
package/dist/src-D6cettg0.mjs +0 -3

package/dist/apps/web/dist/index.html CHANGED Viewed

@@ -25,8 +25,8 @@
       href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
       rel="stylesheet"
     />
-    <script type="module" crossorigin src="/assets/index-KbbX3NYr.js"></script>
-    <link rel="stylesheet" crossorigin href="/assets/index-r0dVFK0B.css">
+    <script type="module" crossorigin src="/assets/index-DqR1YaMG.js"></script>
+    <link rel="stylesheet" crossorigin href="/assets/index-DNsZjOms.css">
   </head>
   <body>
     <div id="root"></div>

package/dist/bin.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
 #!/usr/bin/env node
-import { t as runCli } from "./cli-Be0x8CS3.mjs";
+import { t as runCli } from "./cli-ETfZ15RB.mjs";
 import { spawn } from "node:child_process";
 //#region src/bin.ts
 const moduleMocksFlag = "--experimental-test-module-mocks";

package/dist/{cli-Be0x8CS3.mjs → cli-ETfZ15RB.mjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, Rn as getEvalRegistry, S as normalizeScoreDef, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as loadConfig, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, l as persistRunState, m as buildEvalSummary, n as generateRunId, nt as resolveLlmCallsConfig, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, tt as resolveApiCallsConfig, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-D697g6Qe.mjs";
+import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Un as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-B31SV_Bq.mjs";
 import { createHash } from "node:crypto";
 import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
 import { dirname, join, relative, resolve } from "node:path";
@@ -8,6 +8,34 @@ import { existsSync } from "node:fs";
 import { resultify } from "t-result";
 import { fileURLToPath } from "node:url";
 import { spawn, spawnSync } from "node:child_process";
+//#region ../runner/src/evalSummaries.ts
+/** Build the API/UI summary payload for one discovered eval. */
+function buildEvalSummary(params) {
+	const { meta, config, gitState, latestRun, lastRunStatus } = params;
+	const { sourceFingerprint, ...summaryMeta } = meta;
+	const freshness = deriveEvalFreshness({
+		latestRun,
+		gitState,
+		currentEvalSourceFingerprint: sourceFingerprint,
+		staleAfterDays: config.staleAfterDays ?? 14
+	});
+	return {
+		...summaryMeta,
+		stale: freshness.stale,
+		outdated: freshness.outdated,
+		freshnessStatus: freshness.freshnessStatus,
+		latestRunAt: latestRun?.startedAt ?? null,
+		latestRunCommitSha: latestRun?.commitSha ?? null,
+		currentCommitSha: gitState.commitSha,
+		lastRunStatus
+	};
+}
+/** Write one latest-run snapshot to each targeted eval id. */
+function setLatestRunInfoMap(params) {
+	const { latestRunInfoMap, evalIds, info } = params;
+	for (const evalId of evalIds) latestRunInfoMap.set(evalId, info);
+}
+//#endregion
 //#region ../runner/src/gitState.ts
 function runGitCommand(workspaceRoot, args) {
 	const result = spawnSync("git", args, {
@@ -142,16 +170,16 @@ function handleRunChildMessage(params) {
 	handleRunChildEvent(runState, message.event, managerContext);
 }
 function upsertFinishedCase(runState, caseDetail, caseRow) {
-	const existingIndex = runState.cases.findIndex((row) => row.evalId === caseRow.evalId && row.caseId === caseRow.caseId && row.trial === caseRow.trial);
+	const existingIndex = runState.cases.findIndex((row) => getCaseRowCaseKey(row) === getCaseRowCaseKey(caseRow) && row.trial === caseRow.trial);
 	if (existingIndex === -1) runState.cases.push(caseRow);
 	else runState.cases[existingIndex] = caseRow;
-	runState.caseDetails.set(caseDetail.caseId, caseDetail);
+	runState.caseDetails.set(caseDetail.caseKey ?? caseDetail.caseId, caseDetail);
 }
 function applyChildEvalMetas(evals, childMetas) {
 	for (const childMeta of childMetas) {
-		const evalMeta = evals.get(childMeta.id);
+		const evalMeta = evals.get(childMeta.key);
 		if (evalMeta === void 0) {
-			evals.set(childMeta.id, childMeta);
+			evals.set(childMeta.key, childMeta);
 			continue;
 		}
 		evalMeta.columnDefs = childMeta.columnDefs;
@@ -266,6 +294,7 @@ function createRunner({ watchForChanges = true } = {}) {
 	let llmCallsConfig = resolveLlmCallsConfig(void 0);
 	let apiCallsConfig = resolveApiCallsConfig(void 0);
 	const evals = /* @__PURE__ */ new Map();
+	let discoveryIssues = [];
 	const runs = /* @__PURE__ */ new Map();
 	const lastRunStatusMap = /* @__PURE__ */ new Map();
 	const latestRunInfoMap = /* @__PURE__ */ new Map();
@@ -279,7 +308,13 @@ function createRunner({ watchForChanges = true } = {}) {
 		return relative(workspaceRoot, filePath).replaceAll("\\", "/");
 	}
 	function getSortedEvalMetas() {
-		return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
+		return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath) || a.id.localeCompare(b.id));
+	}
+	function resolveEvalMeta(evalRef) {
+		const exactMatch = evals.get(evalRef);
+		if (exactMatch !== void 0) return exactMatch;
+		const matches = getSortedEvalMetas().filter((ev) => ev.id === evalRef);
+		return matches.length === 1 ? matches[0] : void 0;
 	}
 	function getSourceFingerprint(source) {
 		return createHash("sha256").update(source).digest("hex");
@@ -312,12 +347,12 @@ function createRunner({ watchForChanges = true } = {}) {
 		async clearCache(filter) {
 			await cacheStore.clear(filter);
 		},
-		async recomputeStatusesForEval(evalId) {
-			const evalMeta = evals.get(evalId);
+		async recomputeStatusesForEval(evalKey) {
+			const evalMeta = resolveEvalMeta(evalKey);
 			if (!evalMeta) return { updatedRuns: 0 };
 			const registry = getEvalRegistry();
 			await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? void 0);
-			const entry = registry.get(evalId);
+			const entry = registry.get(evalMeta.id);
 			if (!entry) return { updatedRuns: 0 };
 			const scoreThresholds = /* @__PURE__ */ new Map();
 			entry.use((evalDef) => {
@@ -329,22 +364,25 @@ function createRunner({ watchForChanges = true } = {}) {
 			});
 			const updatedRuns = await recomputeEvalStatusesInRuns({
 				runs: runs.values(),
-				evalId,
-				evalExists: evals.has(evalId),
+				evalKey: evalMeta.key,
+				evalId: evalMeta.id,
+				evalExists: evals.has(evalMeta.key),
 				scoreThresholds,
 				persistCaseDetail
 			});
 			emitDiscoveryEvent();
 			return { updatedRuns };
 		},
-		async cleanRunsForEval(evalId) {
+		async cleanRunsForEval(evalKey) {
+			const evalMeta = resolveEvalMeta(evalKey);
 			let deletedRuns = 0;
 			for (const [runId, run] of [...runs]) {
 				if (!runTouchesEval({
 					target: run.manifest.target,
 					caseRows: run.cases,
-					evalId,
-					evalExists: evals.has(evalId)
+					evalKey: evalMeta?.key ?? evalKey,
+					evalId: evalMeta?.id,
+					evalExists: evalMeta !== void 0
 				})) continue;
 				if (run.manifest.status === "running") continue;
 				runs.delete(runId);
@@ -367,12 +405,12 @@ function createRunner({ watchForChanges = true } = {}) {
 				updated: false,
 				reason: "Run is still running"
 			};
-			const caseRow = run.cases.find((row) => row.caseId === caseId);
+			const caseRow = run.cases.find((row) => getCaseRowCaseKey(row) === caseId || row.caseId === caseId);
 			if (!caseRow) return {
 				updated: false,
 				reason: "Case not found"
 			};
-			const evalMeta = evals.get(caseRow.evalId);
+			const evalMeta = evals.get(getCaseRowEvalKey(caseRow));
 			if (!evalMeta) return {
 				updated: false,
 				reason: "Eval not found"
@@ -381,7 +419,7 @@ function createRunner({ watchForChanges = true } = {}) {
 				updated: false,
 				reason: "Manual score not found"
 			};
-			const caseDetail = run.caseDetails.get(caseId);
+			const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
 			if (!caseDetail) return {
 				updated: false,
 				reason: "Case detail not found"
@@ -435,22 +473,25 @@ function createRunner({ watchForChanges = true } = {}) {
 				meta,
 				config,
 				gitState,
-				latestRun: latestRunInfoMap.get(meta.id),
-				lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
+				latestRun: latestRunInfoMap.get(meta.key),
+				lastRunStatus: lastRunStatusMap.get(meta.key) ?? null
 			}));
 			return result;
 		},
 		getEval(id) {
-			const meta = evals.get(id);
+			const meta = resolveEvalMeta(id);
 			if (!meta) return void 0;
 			return buildEvalSummary({
 				meta,
 				config,
 				gitState: readGitWorktreeState(workspaceRoot),
-				latestRun: latestRunInfoMap.get(meta.id),
-				lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
+				latestRun: latestRunInfoMap.get(meta.key),
+				lastRunStatus: lastRunStatusMap.get(meta.key) ?? null
 			});
 		},
+		getDiscoveryIssues() {
+			return discoveryIssues;
+		},
 		async refreshDiscovery() {
 			const patterns = config.include;
 			const discovered = [];
@@ -462,16 +503,25 @@ function createRunner({ watchForChanges = true } = {}) {
 				discovered.push(...files);
 			}
 			evals.clear();
+			discoveryIssues = [];
 			for (const filePath of discovered) try {
 				const content = await readFile(filePath, "utf-8");
-				const discoveredMetas = parseEvalMetas(filePath, content);
+				const discovery = parseEvalDiscovery(filePath, content);
+				const discoveredMetas = discovery.metas;
+				discoveryIssues.push(...discovery.issues.map((issue) => ({
+					...issue,
+					filePath: toWorkspaceRelativePath(issue.filePath),
+					message: `Duplicate eval id "${issue.evalId}" in ${toWorkspaceRelativePath(issue.filePath)}. Eval ids must be unique within one file.`
+				})));
 				const sourceFingerprint = getSourceFingerprint(content);
 				const registry = getEvalRegistry();
+				let moduleLoaded = false;
 				try {
 					await loadEvalModule(filePath, sourceFingerprint);
+					moduleLoaded = true;
 				} catch {}
 				for (const meta of discoveredMetas) {
-					const discoveredEntry = registry.get(meta.id);
+					const discoveredEntry = moduleLoaded ? registry.get(meta.id) : void 0;
 					const title = meta.title;
 					let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
 					let stats;
@@ -491,10 +541,16 @@ function createRunner({ watchForChanges = true } = {}) {
 						for (const warning of validated.warnings) console.warn(warning);
 						charts = validated.charts;
 					});
-					evals.set(meta.id, {
+					const relativeFilePath = toWorkspaceRelativePath(meta.filePath);
+					const key = buildEvalKey({
+						filePath: relativeFilePath,
+						evalId: meta.id
+					});
+					evals.set(key, {
+						key,
 						id: meta.id,
 						title,
-						filePath: toWorkspaceRelativePath(meta.filePath),
+						filePath: relativeFilePath,
 						sourceFilePath: meta.filePath,
 						sourceFingerprint,
 						columnDefs,
@@ -549,10 +605,9 @@ function createRunner({ watchForChanges = true } = {}) {
 			runs.set(runId, runState);
 			setLatestRunInfoMap({
 				latestRunInfoMap,
-				evalIds: getTargetEvalIds({
+				evalIds: getTargetEvalKeys({
 					request,
-					sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
-					knownEvalIds: new Set(evals.keys())
+					sortedEvals: getSortedEvalMetas()
 				}),
 				info: {
 					status: "running",
@@ -633,7 +688,7 @@ function createRunner({ watchForChanges = true } = {}) {
 		getCaseDetail(runId, caseId) {
 			const run = runs.get(runId);
 			if (!run) return void 0;
-			return run.caseDetails.get(caseId);
+			return run.caseDetails.get(caseId) ?? run.caseDetails.get(getCaseRowCaseKey(run.cases.find((caseRow) => getCaseRowCaseKey(caseRow) === caseId || caseRow.caseId === caseId) ?? { caseId }));
 		},
 		subscribe(runId, listener) {
 			const run = runs.get(runId);
@@ -799,6 +854,7 @@ function parseArgs(argv) {
 		helpTopic: "global",
 		unknownHelpTarget: void 0,
 		evalIds: [],
+		files: [],
 		caseIds: [],
 		trials: 1,
 		json: false,
@@ -834,6 +890,9 @@ function parseArgs(argv) {
 		else if (arg === "--eval" && next) {
 			args.evalIds.push(...next.split(","));
 			i++;
+		} else if (arg === "--file" && next) {
+			args.files.push(...next.split(","));
+			i++;
 		} else if (arg === "--case" && next) {
 			args.caseIds.push(...next.split(","));
 			i++;
@@ -899,6 +958,28 @@ async function runCli(argv) {
 function isCliCommand(command) {
 	return command === "app" || command === "list" || command === "run" || command === "show-runs" || command === "cache" || command === "help";
 }
+function escapeRegex(value) {
+	return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
+}
+function globToRegex(pattern) {
+	const normalized = pattern.replaceAll("\\", "/");
+	let regex = "^";
+	for (let i = 0; i < normalized.length; i++) {
+		const char = normalized[i];
+		const next = normalized[i + 1];
+		if (char === "*" && next === "*") {
+			regex += ".*";
+			i++;
+		} else if (char === "*") regex += "[^/]*";
+		else if (char === "?") regex += "[^/]";
+		else regex += escapeRegex(char ?? "");
+	}
+	return new RegExp(`${regex}$`);
+}
+function fileMatches(pattern, filePath) {
+	const normalized = pattern.replaceAll("\\", "/");
+	return normalized === filePath || globToRegex(normalized).test(filePath);
+}
 function loadWorkspaceEnv() {
 	const envPath = resolve(process.cwd(), ".env");
 	if (!existsSync(envPath)) return true;
@@ -959,8 +1040,8 @@ async function commandApp(args) {
 	const { serve } = await import("@hono/node-server");
 	const bundledWebDist = resolve(currentDir, "apps/web/dist");
 	if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
-	const appModule = await import("./app-DYRmucgj.mjs");
-	const runnerModule = await import("./runner-jSujaSKt.mjs");
+	const appModule = await import("./app-DS3j_AyX.mjs");
+	const runnerModule = await import("./runner-B2f2TEjp.mjs");
 	if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
 	if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
 	await runnerModule.initRunner();
@@ -973,9 +1054,16 @@ async function commandApp(args) {
 async function commandList(args_) {
 	const runner = createRunner({ watchForChanges: false });
 	await runner.init();
+	const discoveryIssues = runner.getDiscoveryIssues();
+	if (discoveryIssues.length > 0) {
+		console.error("Discovery errors:\n");
+		for (const issue of discoveryIssues) console.error(`  ${issue.message}`);
+		console.error("");
+	}
 	const evals = runner.getEvals();
 	if (evals.length === 0) {
 		console.info("No eval files found.");
+		if (discoveryIssues.length > 0) process.exit(1);
 		return;
 	}
 	console.info("Discovered evals:\n");
@@ -994,12 +1082,13 @@ async function commandList(args_) {
 		if (ev.caseCount !== null) console.info(`    cases: ${String(ev.caseCount)}`);
 		console.info("");
 	}
+	if (discoveryIssues.length > 0) process.exit(1);
 }
 async function commandRun(args) {
 	const runner = createRunner({ watchForChanges: false });
 	await runner.init();
-	if (args.evalIds.length === 0 && args.caseIds.length === 0 && !runner.getAllowCliRunAll()) {
-		console.error("This workspace disables running all evals from the CLI. Pass --eval <id> or --case <id> to run a targeted subset.");
+	if (args.evalIds.length === 0 && args.caseIds.length === 0 && args.files.length === 0 && !runner.getAllowCliRunAll()) {
+		console.error("This workspace disables running all evals from the CLI. Pass --eval <id>, --file <path|glob>, or --case <id> to run a targeted subset.");
 		process.exit(1);
 		return;
 	}
@@ -1013,10 +1102,15 @@ async function commandRun(args) {
 	const target = args.caseIds.length > 0 ? {
 		mode: "caseIds",
 		caseIds: args.caseIds,
-		evalIds: args.evalIds.length > 0 ? args.evalIds : void 0
+		evalIds: args.evalIds.length > 0 ? args.evalIds : void 0,
+		files: args.files.length > 0 ? args.files : void 0
 	} : args.evalIds.length > 0 ? {
 		mode: "evalIds",
-		evalIds: args.evalIds
+		evalIds: args.evalIds,
+		files: args.files.length > 0 ? args.files : void 0
+	} : args.files.length > 0 ? {
+		mode: "evalIds",
+		files: args.files
 	} : { mode: "all" };
 	const run = await runner.startRun({
 		target,
@@ -1046,8 +1140,12 @@ async function commandRun(args) {
 		console.info(`Errors: ${String(summary.errorCases)}`);
 		if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
 		if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
+		if (summary.errorMessage !== null) {
+			console.info("");
+			console.info(summary.errorMessage);
+		}
 	}
-	if (summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
+	if (summary.status === "error" || summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
 }
 async function commandShowRuns(args) {
 	const runner = createRunner({ watchForChanges: false });
@@ -1101,8 +1199,9 @@ async function commandCache(args) {
 		return;
 	}
 	if (args.subcommand === "clear") {
-		if (args.evalIds.length > 0) {
-			for (const evalId of args.evalIds) {
+		if (args.evalIds.length > 0 || args.files.length > 0) {
+			const evalIds = runner.getEvals().filter((ev) => (args.evalIds.length === 0 || args.evalIds.includes(ev.id)) && (args.files.length === 0 || args.files.some((file) => fileMatches(file, ev.filePath)))).map((ev) => ev.id);
+			for (const evalId of evalIds) {
 				const entries = await runner.listCache();
 				const prefix = `${evalId}__`;
 				const matching = entries.filter((entry) => entry.namespace.startsWith(prefix));
@@ -1111,7 +1210,7 @@ async function commandCache(args) {
 					key: entry.key
 				});
 			}
-			console.info(`Cleared cache entries for: ${args.evalIds.join(", ")}`);
+			console.info(`Cleared cache entries for: ${evalIds.join(", ")}`);
 			return;
 		}
 		if (args.all) {
@@ -1130,6 +1229,9 @@ function getSortedRunSnapshots(runner) {
 }
 function buildRunFileIndex(workspaceRoot, run) {
 	const runDir = join(workspaceRoot, ".agent-evals", "runs", run.manifest.id);
+	const caseIdCounts = /* @__PURE__ */ new Map();
+	for (const caseRow of run.cases) caseIdCounts.set(caseRow.caseId, (caseIdCounts.get(caseRow.caseId) ?? 0) + 1);
+	const seenCaseIds = /* @__PURE__ */ new Set();
 	return {
 		id: run.manifest.id,
 		shortId: run.manifest.shortId,
@@ -1147,10 +1249,16 @@ function buildRunFileIndex(workspaceRoot, run) {
 			tracesDir: join(runDir, "traces")
 		},
 		cases: run.cases.map((caseRow) => {
-			const fileName = `${encodeURIComponent(caseRow.caseId)}.json`;
+			const duplicateCaseIdCount = caseIdCounts.get(caseRow.caseId) ?? 0;
+			const hasPreviousCaseWithId = seenCaseIds.has(caseRow.caseId);
+			const fileId = duplicateCaseIdCount > 1 && hasPreviousCaseWithId ? caseRow.caseKey ?? caseRow.caseId : caseRow.caseId;
+			seenCaseIds.add(caseRow.caseId);
+			const fileName = `${encodeURIComponent(fileId)}.json`;
 			return {
 				caseId: caseRow.caseId,
+				caseKey: caseRow.caseKey,
 				evalId: caseRow.evalId,
+				evalKey: caseRow.evalKey,
 				status: caseRow.status,
 				files: {
 					caseDetail: join(runDir, "case-details", fileName),
@@ -1262,7 +1370,8 @@ Usage:
 Flags:
   --eval <id>                Run specific eval(s) (comma-separated)
-  --case <id>                Run specific case(s) (comma-separated)
+  --file <path|glob>         Run eval files matching path/glob (comma-separated)
+  --case <id>                Run case(s); combine with --file/--eval if ambiguous
   --trials <n>               Number of trials per case
   --inspect[=host:port]      Run with the Node.js inspector enabled
   --inspect-brk[=host:port]  Enable inspector and pause before startup

package/dist/index.d.mts CHANGED Viewed

@@ -457,6 +457,7 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
 type EvalStatsConfig = z$1.infer<typeof evalStatsConfigSchema>;
 /** Schema summarizing a discovered eval for list and overview screens. */
 declare const evalSummarySchema: z$1.ZodObject<{
+  key: z$1.ZodDefault<z$1.ZodString>;
   id: z$1.ZodString;
   title: z$1.ZodOptional<z$1.ZodString>;
   filePath: z$1.ZodString;
@@ -635,6 +636,8 @@ declare const evalSummarySchema: z$1.ZodObject<{
 type EvalSummary = z$1.infer<typeof evalSummarySchema>;
 /** Schema for one case row in an eval run result table. */
 declare const caseRowSchema: z$1.ZodObject<{
+  evalKey: z$1.ZodOptional<z$1.ZodString>;
+  caseKey: z$1.ZodOptional<z$1.ZodString>;
   caseId: z$1.ZodString;
   evalId: z$1.ZodString;
   status: z$1.ZodEnum<{
@@ -796,6 +799,8 @@ declare const scoreTraceSchema: z$1.ZodObject<{
 type ScoreTrace = z$1.infer<typeof scoreTraceSchema>;
 /** Schema for the detailed payload shown when opening a specific case. */
 declare const caseDetailSchema: z$1.ZodObject<{
+  evalKey: z$1.ZodOptional<z$1.ZodString>;
+  caseKey: z$1.ZodOptional<z$1.ZodString>;
   caseId: z$1.ZodString;
   evalId: z$1.ZodString;
   status: z$1.ZodEnum<{
@@ -1009,6 +1014,43 @@ declare const caseDetailSchema: z$1.ZodObject<{
 }, z$1.core.$strip>;
 /** Full case payload including inputs, trace, outputs, and failures. */
 type CaseDetail = z$1.infer<typeof caseDetailSchema>;
+/** Schema for discovery problems that should be shown before running evals. */
+declare const discoveryIssueSchema: z$1.ZodObject<{
+  type: z$1.ZodEnum<{
+    "duplicate-eval-id": "duplicate-eval-id";
+  }>;
+  severity: z$1.ZodEnum<{
+    error: "error";
+  }>;
+  filePath: z$1.ZodString;
+  evalId: z$1.ZodString;
+  message: z$1.ZodString;
+}, z$1.core.$strip>;
+/** Discovery problem found while scanning eval files. */
+type DiscoveryIssue = z$1.infer<typeof discoveryIssueSchema>;
+//#endregion
+//#region ../shared/src/evalIdentity.d.ts
+/** Build the stable identity for one eval inside a workspace. */
+declare function buildEvalKey(params: {
+  filePath: string;
+  evalId: string;
+}): string;
+/** Build the stable identity for one eval case inside a workspace. */
+declare function buildCaseKey(params: {
+  filePath: string;
+  evalId: string;
+  caseId: string;
+}): string;
+/** Return the collision-safe eval key stored on a row, falling back for legacy data. */
+declare function getCaseRowEvalKey(row: {
+  evalKey?: string;
+  evalId: string;
+}): string;
+/** Return the collision-safe case key stored on a row, falling back for legacy data. */
+declare function getCaseRowCaseKey(row: {
+  caseKey?: string;
+  caseId: string;
+}): string;
 //#endregion
 //#region ../shared/src/schemas/chart.d.ts
 /** Chart type rendered for a single eval history chart. */
@@ -1338,6 +1380,8 @@ declare const runManifestSchema: z$1.ZodObject<{
       evalIds: "evalIds";
       caseIds: "caseIds";
     }>;
+    evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
+    files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
     evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
     caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
   }, z$1.core.$strip>;
@@ -2772,6 +2816,8 @@ declare const createRunRequestSchema: z$1.ZodObject<{
       evalIds: "evalIds";
       caseIds: "caseIds";
     }>;
+    evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
+    files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
     evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
     caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
   }, z$1.core.$strip>;
@@ -3564,7 +3610,8 @@ type CacheClearFilter = {
 type EvalRunner = {
   /** Load workspace config, discover evals, and start file watching when enabled. */init(): Promise<void>; /** Return the currently discovered eval summaries for the active workspace. */
   getEvals(): EvalSummary[]; /** Look up one discovered eval by id. */
-  getEval(id: string): EvalSummary | undefined; /** Re-scan configured eval files and emit a discovery update to listeners. */
+  getEval(id: string): EvalSummary | undefined; /** Return discovery errors that should be shown before running evals. */
+  getDiscoveryIssues(): DiscoveryIssue[]; /** Re-scan configured eval files and emit a discovery update to listeners. */
   refreshDiscovery(): Promise<void>;
   startRun(request: CreateRunRequest): Promise<{
     manifest: RunManifest;
@@ -3617,11 +3664,19 @@ type EvalRunner = {
    * Remove cache entries matching `filter`, or all entries when no filter is
    * supplied.
    */
-  clearCache(filter?: CacheClearFilter): Promise<void>; /** Recompute persisted case and run statuses for terminal runs touching one eval. */
-  recomputeStatusesForEval(evalId: string): Promise<{
+  clearCache(filter?: CacheClearFilter): Promise<void>;
+  /**
+   * Recompute persisted case and run statuses for terminal runs touching one
+   * eval. Accepts the exact eval key, with a legacy fallback for unique eval ids.
+   */
+  recomputeStatusesForEval(evalKey: string): Promise<{
     updatedRuns: number;
-  }>; /** Delete terminal persisted runs that touch one eval from in-memory history and disk. */
-  cleanRunsForEval(evalId: string): Promise<{
+  }>;
+  /**
+   * Delete terminal persisted runs that touch one eval from memory and disk.
+   * Accepts the exact eval key, with a legacy fallback for unique eval ids.
+   */
+  cleanRunsForEval(evalKey: string): Promise<{
     deletedRuns: number;
   }>; /** Persist a UI-authored manual score for one case and recompute affected summaries. */
   updateManualScore(params: {
@@ -3667,4 +3722,4 @@ declare function createRunner({
  */
 declare function runCli(argv: string[]): Promise<void>;
 //#endregion
-export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type 
NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, 
evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
+export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type 
LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, 
evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as llmCallsConfigSchema, $t as columnFormatSchema, A as extractApiCalls, An as runInEvalRuntimeScope, At as cacheEntryWithDebugKeySchema, B as runSummarySchema, Bt as traceCacheRefSchema, Cn as getCurrentScope, Ct as evalChartMetricSchema, D as sseEnvelopeSchema, Dn as isInEvalScope, Dt as cacheDebugKeyEntrySchema, E as updateManualScoreRequestSchema, En as incrementEvalOutput, Et as evalChartsConfigSchema, F as getEvalDisplayStatus, Fn as startEvalBackgroundJob, Ft as cacheRecordingOpSchema, G as apiCallMetricPlacementSchema, Gt as traceDisplayConfigSchema, H as DEFAULT_LLM_CALLS_CONFIG, Ht as traceAttributeDisplayInputSchema, I as deriveScopedSummaryFromCases, In as repoFile, It as cacheRecordingSchema, J as defaultConfigKeySchema, Jt as traceSpanKindSchema, K as apiCallMetricSchema, Kt as traceDisplayInputConfigSchema, L as deriveStatusFromCaseRows, Ln as defineEval, Lt as cacheStatusSchema, M as applyDerivedCallAttributes, Mn as runInExistingEvalScope, Mt as cacheListItemSchema, N as getNestedAttribute, Nn as setEvalOutput, Nt as cacheModeSchema, O as extractCacheEntries, On as mergeEvalOutput, Ot as cacheDebugKeyFileSchema, P as getEvalTitle, Pn as setScopeCacheContext, Pt as cacheOperationTypeSchema, Q as llmCallPricingSchema, Qt as columnDefSchema, R as deriveStatusFromChildStatuses, Rn as getEvalRegistry, Rt as serializedCacheSpanSchema, Sn as evalLog, St as evalChartConfigSchema, T as createRunRequestSchema, Tn as getEvalStartTime, Tt as evalChartTypeSchema, U as agentEvalsConfigSchema, Ut as traceAttributeDisplayPlacementSchema, V as DEFAULT_API_CALLS_CONFIG, Vt as traceAttributeDisplayFormatSchema, W as apiCallMetricFormatSchema, Wt as traceAttributeDisplaySchema, X as llmCallMetricPlacementSchema, Xt as traceSpanWarningSchema, Y as llmCallMetricFormatSchema, Yt as traceSpanSchema, Z as llmCallMetricSchema, Zt as cellValueSchema, _n as EvalAssertionError, _t as scoreTraceSchema, an as runArtifactRefSchema, at as assertionFailureSchema, bt as 
evalChartBuiltinMetricSchema, cn as captureEvalSpanError, ct as evalFreshnessStatusSchema, dn as hashCacheKey, dt as evalStatsConfigSchema, en as columnKindSchema, et as removeDefaultConfigSchema, fn as hashCacheKeySync, ft as evalSummarySchema, gn as serializeCacheValue, gt as runLogPhaseSchema, hn as serializeCacheRecording, ht as runLogLocationSchema, in as repoFileRefSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as runInEvalScope, jt as cacheFileSchema, k as extractCacheHits, kn as nextEvalId, kt as cacheEntrySchema, ln as evalSpan, lt as evalStatAggregateSchema, mn as deserializeCacheValue, mt as runLogLevelSchema, nn as jsonCellSchema, nt as resolveLlmCallsConfig, on as z, ot as caseDetailSchema, pn as deserializeCacheRecording, pt as runLogEntrySchema, q as apiCallsConfigSchema, qt as traceSpanErrorSchema, rn as numberDisplayOptionsSchema, rt as runLogsConfigSchema, sn as buildTraceTree, st as caseRowSchema, tn as fileRefSchema, tt as resolveApiCallsConfig, un as evalTracer, ut as evalStatItemSchema, vn as advanceEvalTime, vt as evalChartAggregateSchema, wn as getEvalCaseInput, wt as evalChartTooltipExtraSchema, xn as evalAssert, xt as evalChartColorSchema, yn as appendToEvalOutput, yt as evalChartAxisSchema, z as runManifestSchema, zt as spanCacheOptionsSchema } from "./runOrchestration-D697g6Qe.mjs";
-import { n as createRunner, t as runCli } from "./cli-Be0x8CS3.mjs";
-import "./src-D6cettg0.mjs";
-export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, 
runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
+import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getEvalStartTime, At as evalChartTypeSchema, B as runSummarySchema, Bn as startEvalBackgroundJob, Bt as cacheRecordingOpSchema, Cn as advanceEvalTime, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dn as evalLog, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as evalAssert, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as runInEvalRuntimeScope, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as defineEval, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as runInEvalScope, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInExistingEvalScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as isInEvalScope, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as mergeEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as getCurrentScope, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as nextEvalId, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as setEvalOutput, Rt as cacheModeSchema, Sn as EvalAssertionError, St as scoreTraceSchema, T as createRunRequestSchema, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as getEvalRegistry, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as repoFile, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as evalSummarySchema, an as 
columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as incrementEvalOutput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as getEvalCaseInput, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as appendToEvalOutput, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as setScopeCacheContext, zt as cacheOperationTypeSchema } from "./runOrchestration-B31SV_Bq.mjs";
+import { n as createRunner, t as runCli } from "./cli-ETfZ15RB.mjs";
+import "./src-CyNb2ycA.mjs";
+export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, 
runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };