npm - @ls-stack/agent-eval - Versions diffs - 0.42.1 → 0.42.3 - Mend

@ls-stack/agent-eval 0.42.1 → 0.42.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (15)

package/dist/{app-mOYjX9zq.mjs → app-DPamBr5R.mjs} +4 -4
package/dist/apps/web/dist/assets/{index-eFM9VIsz.css → index-S3J5Nm0o.css} +1 -1
package/dist/apps/web/dist/assets/{index-CANDLTsq.js → index-XLJByNnS.js} +2 -2
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/{cli-DbVfkr9T.mjs → cli-BeJCJMQo.mjs} +49 -23
package/dist/index.d.mts +42 -43
package/dist/index.mjs +3 -3
package/dist/runChild.mjs +1 -1
package/dist/{runOrchestration-SPaHx-SC.mjs → runOrchestration-OVUFw1fL.mjs} +12 -23
package/dist/{runner-DiCQ57JQ.mjs → runner-B1KygirW.mjs} +2 -2
package/dist/{runner-BYOdLBle.mjs → runner-BJQq7cpd.mjs} +1 -1
package/dist/{src-CANi3gpd.mjs → src-D7_xKo7h.mjs} +2 -2
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +15 -8

package/dist/apps/web/dist/index.html CHANGED Viewed

@@ -25,8 +25,8 @@
       href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
       rel="stylesheet"
     />
-    <script type="module" crossorigin src="/assets/index-CANDLTsq.js"></script>
-    <link rel="stylesheet" crossorigin href="/assets/index-eFM9VIsz.css">
+    <script type="module" crossorigin src="/assets/index-XLJByNnS.js"></script>
+    <link rel="stylesheet" crossorigin href="/assets/index-S3J5Nm0o.css">
   </head>
   <body>
     <div id="root"></div>

package/dist/bin.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
 #!/usr/bin/env node
-import { t as runCli } from "./cli-DbVfkr9T.mjs";
+import { t as runCli } from "./cli-BeJCJMQo.mjs";
 import { spawn } from "node:child_process";
 //#region src/bin.ts
 const moduleMocksFlag = "--experimental-test-module-mocks";

package/dist/{cli-DbVfkr9T.mjs → cli-BeJCJMQo.mjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Mt as getCaseRowEvalKey, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, Rt as getEvalRegistry, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-SPaHx-SC.mjs";
+import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-OVUFw1fL.mjs";
 import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
 import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
 import { createHash, randomUUID } from "node:crypto";
@@ -476,6 +476,11 @@ function validateManualInputsForRequest(params) {
 	};
 }
 //#endregion
+//#region ../runner/src/objectUtils.ts
+function isRecord(value) {
+	return typeof value === "object" && value !== null && !Array.isArray(value);
+}
+//#endregion
 //#region ../runner/src/recalculateDerivedAttributes.ts
 function getCaseArtifactFileIdForExistingRun(runState, caseRow) {
 	const caseKey = getCaseRowCaseKey(caseRow);
@@ -505,7 +510,7 @@ async function recalculateDerivedAttributesForCase(params) {
 	});
 	let nextTrace = spansWithDerivedAttributes;
 	let nextTraceDisplay = caseDetail.traceDisplay;
-	const evalMeta = params.evals.get(getCaseRowEvalKey(caseRow));
+	const evalMeta = caseRow.evalKey === void 0 ? void 0 : params.evals.get(caseRow.evalKey);
 	const entry = evalMeta === void 0 ? void 0 : getEvalRegistry().get(evalMeta.id);
 	if (entry !== void 0) entry.use((evalDef) => {
 		const resolved = resolveTracePresentation(spansWithDerivedAttributes, params.traceDisplayConfig, evalDef.traceDisplay);
@@ -787,6 +792,36 @@ async function markRunTerminalFromChild(runState, event, managerContext) {
 	managerContext.emitDiscoveryEvent();
 }
 //#endregion
+//#region ../runner/src/runnerStateHydration.ts
+/** Rehydrate a persisted run while preserving live listeners/process handles. */
+function toRunnerRunState(snapshot, existing) {
+	return {
+		...snapshot,
+		listeners: existing?.listeners ?? /* @__PURE__ */ new Set(),
+		childProcess: existing?.childProcess,
+		childTerminalReceived: existing?.childTerminalReceived ?? false
+	};
+}
+//#endregion
+//#region ../runner/src/runTargetPersistence.ts
+/** Build the exact-key run target persisted in run history. */
+function buildPersistedRunTarget(params) {
+	const { target, evalKeys } = params;
+	if (target.mode === "all") return { mode: "all" };
+	const persistEvalKeys = (target.evalKeys?.length ?? 0) > 0 || (target.evalIds?.length ?? 0) > 0 || (target.files?.length ?? 0) > 0;
+	const keyedTarget = {
+		mode: target.mode,
+		evalKeys: persistEvalKeys && evalKeys.length > 0 ? evalKeys : void 0,
+		files: target.files,
+		tagsFilter: target.tagsFilter
+	};
+	if (target.mode === "caseIds") return {
+		...keyedTarget,
+		caseIds: target.caseIds
+	};
+	return keyedTarget;
+}
+//#endregion
 //#region ../runner/src/watchRoots.ts
 const globMagicCharacters = new Set([
 	"*",
@@ -823,9 +858,6 @@ function getWatchRootsForIncludePatterns(params) {
 }
 //#endregion
 //#region ../runner/src/runner.ts
-function isRecord(value) {
-	return typeof value === "object" && value !== null && !Array.isArray(value);
-}
 /** Create an in-memory eval runner bound to the current workspace config. */
 function createRunner({ watchForChanges = true } = {}) {
 	let config;
@@ -898,7 +930,6 @@ function createRunner({ watchForChanges = true } = {}) {
 			const updatedRuns = await recomputeEvalStatusesInRuns({
 				runs: runs.values(),
 				evalKey: evalMeta.key,
-				evalId: evalMeta.id,
 				evalExists: evals.has(evalMeta.key),
 				scoreThresholds,
 				persistCaseDetail
@@ -930,7 +961,6 @@ function createRunner({ watchForChanges = true } = {}) {
 					target: run.manifest.target,
 					caseRows: run.cases,
 					evalKey: evalMeta?.key ?? evalKey,
-					evalId: evalMeta?.id,
 					evalExists: evalMeta !== void 0
 				})) continue;
 				if (run.manifest.status === "running") continue;
@@ -959,7 +989,7 @@ function createRunner({ watchForChanges = true } = {}) {
 				updated: false,
 				reason: "Case not found"
 			};
-			const evalMeta = evals.get(getCaseRowEvalKey(caseRow));
+			const evalMeta = caseRow.evalKey === void 0 ? void 0 : evals.get(caseRow.evalKey);
 			if (!evalMeta) return {
 				updated: false,
 				reason: "Eval not found"
@@ -1170,6 +1200,10 @@ function createRunner({ watchForChanges = true } = {}) {
 			const cacheMode = request.cache?.mode ?? "use";
 			const runDir = join(localStateDir, "runs", runId);
 			const gitState = readGitWorktreeState(workspaceRoot);
+			const targetEvalKeys = getTargetEvalKeys({
+				request,
+				sortedEvals: getSortedEvalMetas()
+			});
 			const manifest = {
 				id: runId,
 				shortId,
@@ -1179,7 +1213,10 @@ function createRunner({ watchForChanges = true } = {}) {
 				endedAt: null,
 				commitSha: gitState.commitSha,
 				evalSourceFingerprints: {},
-				target: request.target,
+				target: buildPersistedRunTarget({
+					target: request.target,
+					evalKeys: targetEvalKeys
+				}),
 				trials: request.trials,
 				trialSelection: config.trialSelection ?? "lowestScore",
 				cacheMode
@@ -1224,10 +1261,7 @@ function createRunner({ watchForChanges = true } = {}) {
 			runs.set(runId, runState);
 			setLatestRunInfoMap({
 				latestRunInfoMap,
-				evalIds: getTargetEvalKeys({
-					request: materializedRequest,
-					sortedEvals: getSortedEvalMetas()
-				}),
+				evalIds: targetEvalKeys,
 				info: {
 					status: "running",
 					startedAt: now,
@@ -1478,14 +1512,6 @@ function createRunner({ watchForChanges = true } = {}) {
 		nextShortIdNum = Math.max(nextShortIdNum, nextShortIdFromSnapshots(persistedRuns));
 		if (changed) emitDiscoveryEvent();
 	}
-	function toRunnerRunState(snapshot, existing) {
-		return {
-			...snapshot,
-			listeners: existing?.listeners ?? /* @__PURE__ */ new Set(),
-			childProcess: existing?.childProcess,
-			childTerminalReceived: existing?.childTerminalReceived ?? false
-		};
-	}
 	return runner;
 }
 //#endregion
@@ -2046,8 +2072,8 @@ async function commandApp(args) {
 	const { serve } = await import("@hono/node-server");
 	const bundledWebDist = resolve(currentDir, "apps/web/dist");
 	if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
-	const appModule = await import("./app-mOYjX9zq.mjs");
-	const runnerModule = await import("./runner-BYOdLBle.mjs");
+	const appModule = await import("./app-DPamBr5R.mjs");
+	const runnerModule = await import("./runner-BJQq7cpd.mjs");
 	if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
 	if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
 	await runnerModule.initRunner();

package/dist/index.d.mts CHANGED Viewed

@@ -1717,13 +1717,13 @@ type ColumnKind = z$1.infer<typeof columnKindSchema>;
 declare const columnFormatSchema: z$1.ZodEnum<{
   number: "number";
   boolean: "boolean";
-  duration: "duration";
-  json: "json";
   file: "file";
   markdown: "markdown";
+  json: "json";
   image: "image";
   audio: "audio";
   video: "video";
+  duration: "duration";
   percent: "percent";
   passFail: "passFail";
   stars: "stars";
@@ -1742,13 +1742,13 @@ declare const columnDefSchema: z$1.ZodObject<{
   format: z$1.ZodOptional<z$1.ZodEnum<{
     number: "number";
     boolean: "boolean";
-    duration: "duration";
-    json: "json";
     file: "file";
     markdown: "markdown";
+    json: "json";
     image: "image";
     audio: "audio";
     video: "video";
+    duration: "duration";
     percent: "percent";
     passFail: "passFail";
     stars: "stars";
@@ -1762,8 +1762,8 @@ declare const columnDefSchema: z$1.ZodObject<{
   hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
   align: z$1.ZodOptional<z$1.ZodEnum<{
     left: "left";
-    right: "right";
     center: "center";
+    right: "right";
   }>>;
 }, z$1.core.$strip>;
 /** Column definition exposed to the UI for eval and case tables. */
@@ -1792,8 +1792,8 @@ type CellValue = z$1.infer<typeof cellValueSchema>; //#endregion
 declare const traceAttributeDisplayFormatSchema: z$1.ZodEnum<{
   string: "string";
   number: "number";
-  duration: "duration";
   json: "json";
+  duration: "duration";
 }>;
 /**
  * Formatting hint for trace attribute values rendered by the UI.
@@ -1817,8 +1817,8 @@ declare const traceAttributeDisplaySchema: z$1.ZodObject<{
   format: z$1.ZodOptional<z$1.ZodEnum<{
     string: "string";
     number: "number";
-    duration: "duration";
     json: "json";
+    duration: "duration";
   }>>;
   numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
   placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
@@ -1853,8 +1853,8 @@ declare const traceDisplayConfigSchema: z$1.ZodObject<{
     format: z$1.ZodOptional<z$1.ZodEnum<{
       string: "string";
       number: "number";
-      duration: "duration";
       json: "json";
+      duration: "duration";
     }>>;
     numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
     placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
@@ -1893,8 +1893,8 @@ declare const traceAttributeDisplayInputSchema: z$1.ZodObject<{
   format: z$1.ZodOptional<z$1.ZodEnum<{
     string: "string";
     number: "number";
-    duration: "duration";
     json: "json";
+    duration: "duration";
   }>>;
   numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
   placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
@@ -1931,8 +1931,8 @@ declare const traceDisplayInputConfigSchema: z$1.ZodObject<{
     format: z$1.ZodOptional<z$1.ZodEnum<{
       string: "string";
       number: "number";
-      duration: "duration";
       json: "json";
+      duration: "duration";
     }>>;
     numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
     placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
@@ -2063,13 +2063,13 @@ declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
   format: z$1.ZodOptional<z$1.ZodEnum<{
     number: "number";
     boolean: "boolean";
-    duration: "duration";
-    json: "json";
     file: "file";
     markdown: "markdown";
+    json: "json";
     image: "image";
     audio: "audio";
     video: "video";
+    duration: "duration";
     percent: "percent";
     passFail: "passFail";
     stars: "stars";
@@ -2105,13 +2105,13 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
   format: z$1.ZodOptional<z$1.ZodEnum<{
     number: "number";
     boolean: "boolean";
-    duration: "duration";
-    json: "json";
     file: "file";
     markdown: "markdown";
+    json: "json";
     image: "image";
     audio: "audio";
     video: "video";
+    duration: "duration";
     percent: "percent";
     passFail: "passFail";
     stars: "stars";
@@ -2149,13 +2149,13 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
     format: z$1.ZodOptional<z$1.ZodEnum<{
       number: "number";
       boolean: "boolean";
-      duration: "duration";
-      json: "json";
       file: "file";
       markdown: "markdown";
+      json: "json";
       image: "image";
       audio: "audio";
       video: "video";
+      duration: "duration";
       percent: "percent";
       passFail: "passFail";
       stars: "stars";
@@ -2169,8 +2169,8 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
     hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
     align: z$1.ZodOptional<z$1.ZodEnum<{
       left: "left";
-      right: "right";
       center: "center";
+      right: "right";
     }>>;
   }, z$1.core.$strip>>;
   caseCount: z$1.ZodNullable<z$1.ZodNumber>;
@@ -2208,13 +2208,13 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
     format: z$1.ZodOptional<z$1.ZodEnum<{
       number: "number";
       boolean: "boolean";
-      duration: "duration";
-      json: "json";
       file: "file";
       markdown: "markdown";
+      json: "json";
       image: "image";
       audio: "audio";
       video: "video";
+      duration: "duration";
       percent: "percent";
       passFail: "passFail";
       stars: "stars";
@@ -2239,8 +2239,8 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
       }>;
       label: z$1.ZodOptional<z$1.ZodString>;
       color: z$1.ZodOptional<z$1.ZodEnum<{
-        error: "error";
         success: "success";
+        error: "error";
         warning: "warning";
         accent: "accent";
         accentDim: "accentDim";
@@ -2263,8 +2263,8 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
       }>;
       label: z$1.ZodOptional<z$1.ZodString>;
       color: z$1.ZodOptional<z$1.ZodEnum<{
-        error: "error";
         success: "success";
+        error: "error";
         warning: "warning";
         accent: "accent";
         accentDim: "accentDim";
@@ -2529,8 +2529,8 @@ declare const scoreTraceSchema: z$1.ZodObject<{
       format: z$1.ZodOptional<z$1.ZodEnum<{
         string: "string";
         number: "number";
-        duration: "duration";
         json: "json";
+        duration: "duration";
       }>>;
       numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
       placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
@@ -2616,8 +2616,8 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
       format: z$1.ZodOptional<z$1.ZodEnum<{
         string: "string";
         number: "number";
-        duration: "duration";
         json: "json";
+        duration: "duration";
       }>>;
       numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
       placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
@@ -2685,8 +2685,8 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
         format: z$1.ZodOptional<z$1.ZodEnum<{
           string: "string";
           number: "number";
-          duration: "duration";
           json: "json";
+          duration: "duration";
         }>>;
         numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
         placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
@@ -2831,8 +2831,8 @@ type EvalChartAggregate = z$1.infer<typeof evalChartAggregateSchema>;
  * not emit raw hex so authored evals stay decoupled from the web theme.
  */
 declare const evalChartColorSchema: z$1.ZodEnum<{
-  error: "error";
   success: "success";
+  error: "error";
   warning: "warning";
   accent: "accent";
   accentDim: "accentDim";
@@ -2860,8 +2860,8 @@ declare const evalChartMetricSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
   }>;
   label: z$1.ZodOptional<z$1.ZodString>;
   color: z$1.ZodOptional<z$1.ZodEnum<{
-    error: "error";
     success: "success";
+    error: "error";
     warning: "warning";
     accent: "accent";
     accentDim: "accentDim";
@@ -2884,8 +2884,8 @@ declare const evalChartMetricSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
   }>;
   label: z$1.ZodOptional<z$1.ZodString>;
   color: z$1.ZodOptional<z$1.ZodEnum<{
-    error: "error";
     success: "success";
+    error: "error";
     warning: "warning";
     accent: "accent";
     accentDim: "accentDim";
@@ -2943,8 +2943,8 @@ declare const evalChartConfigSchema: z$1.ZodObject<{
     }>;
     label: z$1.ZodOptional<z$1.ZodString>;
     color: z$1.ZodOptional<z$1.ZodEnum<{
-      error: "error";
       success: "success";
+      error: "error";
       warning: "warning";
       accent: "accent";
       accentDim: "accentDim";
@@ -2967,8 +2967,8 @@ declare const evalChartConfigSchema: z$1.ZodObject<{
     }>;
     label: z$1.ZodOptional<z$1.ZodString>;
     color: z$1.ZodOptional<z$1.ZodEnum<{
-      error: "error";
       success: "success";
+      error: "error";
       warning: "warning";
       accent: "accent";
       accentDim: "accentDim";
@@ -3033,8 +3033,8 @@ declare const evalChartsConfigSchema: z$1.ZodArray<z$1.ZodObject<{
     }>;
     label: z$1.ZodOptional<z$1.ZodString>;
     color: z$1.ZodOptional<z$1.ZodEnum<{
-      error: "error";
       success: "success";
+      error: "error";
       warning: "warning";
       accent: "accent";
       accentDim: "accentDim";
@@ -3057,8 +3057,8 @@ declare const evalChartsConfigSchema: z$1.ZodArray<z$1.ZodObject<{
     }>;
     label: z$1.ZodOptional<z$1.ZodString>;
     color: z$1.ZodOptional<z$1.ZodEnum<{
-      error: "error";
       success: "success";
+      error: "error";
       warning: "warning";
       accent: "accent";
       accentDim: "accentDim";
@@ -3310,8 +3310,8 @@ declare const llmCallMetricFormatSchema$1: z$1.ZodEnum<{
   string: "string";
   number: "number";
   boolean: "boolean";
-  duration: "duration";
   json: "json";
+  duration: "duration";
 }>;
 /** Render format applied to an LLM-call metric value. */
 type LlmCallMetricFormat = z$1.infer<typeof llmCallMetricFormatSchema$1>;
@@ -3320,8 +3320,8 @@ declare const apiCallMetricFormatSchema$1: z$1.ZodEnum<{
   string: "string";
   number: "number";
   boolean: "boolean";
-  duration: "duration";
   json: "json";
+  duration: "duration";
 }>;
 /** Render format applied to an API-call metric value. */
 type ApiCallMetricFormat = z$1.infer<typeof apiCallMetricFormatSchema$1>;
@@ -3390,8 +3390,8 @@ declare const llmCallMetricSchema: z$1.ZodObject<{
     string: "string";
     number: "number";
     boolean: "boolean";
-    duration: "duration";
     json: "json";
+    duration: "duration";
   }>>;
   numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
   placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
@@ -3419,8 +3419,8 @@ declare const apiCallMetricSchema: z$1.ZodObject<{
     string: "string";
     number: "number";
     boolean: "boolean";
-    duration: "duration";
     json: "json";
+    duration: "duration";
   }>>;
   numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
   placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
@@ -3533,8 +3533,8 @@ declare const llmCallsConfigSchema: z$1.ZodObject<{
       string: "string";
       number: "number";
       boolean: "boolean";
-      duration: "duration";
       json: "json";
+      duration: "duration";
     }>>;
     numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
     placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
@@ -3569,8 +3569,8 @@ declare const apiCallsConfigSchema: z$1.ZodObject<{
       string: "string";
       number: "number";
       boolean: "boolean";
-      duration: "duration";
       json: "json";
+      duration: "duration";
     }>>;
     numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
     placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
@@ -5886,7 +5886,7 @@ type EvalRunner = {
   clearCache(filter?: CacheClearFilter): Promise<void>;
   /**
    * Recompute persisted case and run statuses for terminal runs touching one
-   * eval. Accepts the exact eval key, with a legacy fallback for unique eval ids.
+   * eval. Accepts the exact eval key.
    */
   recomputeStatusesForEval(evalKey: string): Promise<{
     updatedRuns: number;
@@ -5897,7 +5897,7 @@ type EvalRunner = {
   }): Promise<RecalculateDerivedAttributesResult>;
   /**
    * Delete terminal persisted runs that touch one eval from memory and disk.
-   * Accepts the exact eval key, with a legacy fallback for unique eval ids.
+   * Accepts the exact eval key.
    */
   cleanRunsForEval(evalKey: string): Promise<{
     deletedRuns: number;
@@ -5937,13 +5937,12 @@ type EvalRunner = {
   validateManualInputs(request: CreateRunRequest$1): ManualInputValidationResult;
 }; //#endregion
 //#region src/runner.d.ts
-type CreateRunnerOptions = {
-  watchForChanges?: boolean;
-};
 /** Create an in-memory eval runner bound to the current workspace config. */
 declare function createRunner({
   watchForChanges
-}?: CreateRunnerOptions): EvalRunner; //#endregion
+}?: {
+  watchForChanges?: boolean;
+}): EvalRunner; //#endregion
 //#region src/manualInput/files.d.ts
 type StageManualInputFileParams = {
   workspaceRoot: string;

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, Rt as getEvalRegistry, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-SPaHx-SC.mjs";
-import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-DbVfkr9T.mjs";
-import { n as matchesEvalTags, t as defineEval } from "./src-CANi3gpd.mjs";
+import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-OVUFw1fL.mjs";
+import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-BeJCJMQo.mjs";
+import { n as matchesEvalTags, t as defineEval } from "./src-D7_xKo7h.mjs";
 export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };

package/dist/runChild.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as evalChartsConfigSchema, It as columnDefSchema, Nt as evalStatsConfigSchema, Pt as manualInputDescriptorSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-SPaHx-SC.mjs";
+import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-OVUFw1fL.mjs";
 import { z } from "zod/v4";
 import { readFile } from "node:fs/promises";
 import { relative } from "node:path";

package/dist/{runOrchestration-SPaHx-SC.mjs → runOrchestration-OVUFw1fL.mjs} RENAMED Viewed

@@ -866,10 +866,6 @@ function buildCaseKey(params) {
 		encodeURIComponent(params.caseId)
 	].join("#");
 }
-/** Return the collision-safe eval key stored on a row, falling back for legacy data. */
-function getCaseRowEvalKey(row) {
-	return row.evalKey ?? row.evalId;
-}
 /** Return the collision-safe case key stored on a row, falling back for legacy data. */
 function getCaseRowCaseKey(row) {
 	return row.caseKey ?? row.caseId;
@@ -6586,9 +6582,9 @@ function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
 	return caseRow.status === "error" ? "error" : "pass";
 }
 function runTouchesEval(params) {
-	if (params.caseRows.some((caseRow) => getCaseRowEvalKey(caseRow) === params.evalKey || caseRow.evalKey === void 0 && caseRow.evalId === params.evalId)) return true;
+	if (params.caseRows.some((caseRow) => caseRow.evalKey === params.evalKey)) return true;
 	if (params.target.mode === "all") return params.evalExists;
-	if (params.target.mode === "evalIds") return params.target.evalKeys?.includes(params.evalKey) ?? params.target.evalIds?.includes(params.evalId ?? params.evalKey) ?? false;
+	if (params.target.mode === "evalIds") return params.target.evalKeys?.includes(params.evalKey) ?? false;
 	return false;
 }
 async function deleteTemporaryRuns(params) {
@@ -6619,13 +6615,12 @@ async function recomputeEvalStatusesInRuns(params) {
 			target: run.manifest.target,
 			caseRows: run.cases,
 			evalKey: params.evalKey,
-			evalId: params.evalId,
 			evalExists: params.evalExists
 		})) continue;
 		if (run.manifest.status === "running") continue;
 		let changed = false;
 		for (const caseRow of run.cases) {
-			if (getCaseRowEvalKey(caseRow) !== params.evalKey && !(caseRow.evalKey === void 0 && caseRow.evalId === params.evalId)) continue;
+			if (caseRow.evalKey !== params.evalKey) continue;
 			const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
 			const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
 			if (caseRow.status === nextStatus) continue;
@@ -6708,15 +6703,14 @@ function getLastRunStatuses(params) {
 function getLatestRunInfos(params) {
 	const { runs, knownEvals } = params;
 	const knownEvalMetas = [...knownEvals];
-	const evalIdByKey = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.key, evalMeta.id]));
 	const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.key, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
 	const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
 	const latestRunInfos = /* @__PURE__ */ new Map();
 	for (const run of orderedRuns) for (const evalKey of getRunEvalKeys(run, knownEvalMetas)) latestRunInfos.set(evalKey, {
-		status: getEvalStatusForRun(run, evalKey, evalIdByKey.get(evalKey), manualScoreKeysByEval.get(evalKey) ?? []),
+		status: getEvalStatusForRun(run, evalKey, manualScoreKeysByEval.get(evalKey) ?? []),
 		startedAt: getRunFreshnessTimestamp(run.manifest),
 		commitSha: run.manifest.commitSha ?? null,
-		evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalKey] ?? run.manifest.evalSourceFingerprints[evalIdByKey.get(evalKey) ?? ""] ?? null
+		evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalKey] ?? null
 	});
 	return latestRunInfos;
 }
@@ -6775,19 +6769,14 @@ async function readCaseDetails(runDir) {
 }
 function getRunEvalKeys(run, knownEvals) {
 	const knownEvalMetas = [...knownEvals];
-	const evalKeys = new Set(run.cases.map(getCaseRowEvalKey));
-	for (const caseRow of run.cases) {
-		if (caseRow.evalKey !== void 0) continue;
-		for (const evalMeta of knownEvalMetas) if (evalMeta.id === caseRow.evalId) evalKeys.add(evalMeta.key);
-	}
-	if (run.manifest.target.mode === "evalIds") {
-		for (const evalKey of run.manifest.target.evalKeys ?? []) evalKeys.add(evalKey);
-		for (const evalId of run.manifest.target.evalIds ?? []) for (const evalMeta of knownEvalMetas) if (evalMeta.id === evalId) evalKeys.add(evalMeta.key);
-	} else if (run.manifest.target.mode === "all" && evalKeys.size === 0) for (const evalMeta of knownEvalMetas) evalKeys.add(evalMeta.key);
+	const evalKeys = /* @__PURE__ */ new Set();
+	for (const caseRow of run.cases) if (caseRow.evalKey !== void 0) evalKeys.add(caseRow.evalKey);
+	if (run.manifest.target.mode === "evalIds") for (const evalKey of run.manifest.target.evalKeys ?? []) evalKeys.add(evalKey);
+	else if (run.manifest.target.mode === "all" && evalKeys.size === 0) for (const evalMeta of knownEvalMetas) evalKeys.add(evalMeta.key);
 	return [...evalKeys];
 }
-function getEvalStatusForRun(run, evalKey, evalId, manualScoreKeys) {
-	const evalCases = run.cases.filter((caseRow) => getCaseRowEvalKey(caseRow) === evalKey || caseRow.evalKey === void 0 && caseRow.evalId === evalId);
+function getEvalStatusForRun(run, evalKey, manualScoreKeys) {
+	const evalCases = run.cases.filter((caseRow) => caseRow.evalKey === evalKey);
 	if (evalCases.length > 0) {
 		if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
 		return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
@@ -7809,4 +7798,4 @@ function toLastRunStatus(status) {
 	return status === "pending" ? null : status;
 }
 //#endregion
-export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, evalChartsConfigSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, columnDefSchema as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, defineEval as Lt, z$1 as M, getCaseRowEvalKey as Mt, buildTraceTree as N, evalStatsConfigSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, manualInputDescriptorSchema as Pt, evalTime as Q, hashCacheKeySync as R, getEvalRegistry as Rt, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, incrementEvalOutput 
as tt, loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadEvalModule as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };
+export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, columnDefSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, defineEval as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, getEvalRegistry as Lt, z$1 as M, evalStatsConfigSchema as Mt, buildTraceTree as N, manualInputDescriptorSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, evalChartsConfigSchema as Pt, evalTime as Q, hashCacheKeySync as R, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, incrementEvalOutput as tt, 
loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadEvalModule as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };