npm - @ls-stack/agent-eval - Versions diffs - 0.53.0 → 0.55.0 - Mend

@ls-stack/agent-eval 0.53.0 → 0.55.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (17)

package/dist/{app-D0a57pVo.mjs → app-CunZ8Dku.mjs} +12 -4
package/dist/apps/web/dist/assets/index-2I-eWzVL.css +1 -0
package/dist/apps/web/dist/assets/index-CvsPmlHl.js +377 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/{cli-3FrKBc9l.mjs → cli-rvPrUj6S.mjs} +51 -11
package/dist/index.d.mts +305 -133
package/dist/index.mjs +3 -3
package/dist/runChild.mjs +2 -1
package/dist/{runOrchestration-Cn6fGL2s.mjs → runOrchestration-BWyE5lRX.mjs} +236 -134
package/dist/{runner-Dsqj431i.mjs → runner-C2fvjKZP.mjs} +1 -1
package/dist/{runner-C0qdoRSi.mjs → runner-CFQ8LZmY.mjs} +2 -2
package/dist/{src-BNmtaqeC.mjs → src-DEENkbkn.mjs} +2 -2
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +14 -3
package/dist/apps/web/dist/assets/index-RNejIyap.js +0 -375
package/dist/apps/web/dist/assets/index-vaLgWG8j.css +0 -1

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-Cn6fGL2s.mjs";
-import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-3FrKBc9l.mjs";
-import { n as matchesEvalTags, t as defineEval } from "./src-BNmtaqeC.mjs";
+import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, Rt as getEvalRegistry, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-BWyE5lRX.mjs";
+import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-rvPrUj6S.mjs";
+import { n as matchesEvalTags, t as defineEval } from "./src-DEENkbkn.mjs";
 export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };

package/dist/runChild.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-Cn6fGL2s.mjs";
+import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as evalChartsConfigSchema, It as columnDefSchema, Mt as evalStatAggregateSchema, Nt as evalStatsConfigSchema, Pt as manualInputDescriptorSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BWyE5lRX.mjs";
 import { z } from "zod/v4";
 import { readFile } from "node:fs/promises";
 import { relative } from "node:path";
@@ -17,6 +17,7 @@ const evalMetaSchema = z.object({
 	caseCount: z.number().nullable(),
 	caseIds: z.array(z.string()).optional(),
 	stats: evalStatsConfigSchema.optional(),
+	defaultStatAggregate: evalStatAggregateSchema.optional(),
 	charts: evalChartsConfigSchema.optional(),
 	manualInputDescriptor: manualInputDescriptorSchema.optional(),
 	requiresManualInput: z.boolean().optional()

package/dist/{runOrchestration-Cn6fGL2s.mjs → runOrchestration-BWyE5lRX.mjs} RENAMED Viewed

@@ -1,6 +1,6 @@
 import { createRequire, registerHooks } from "node:module";
-import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
 import { AsyncLocalStorage } from "node:async_hooks";
+import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
 import { z, z as z$1 } from "zod/v4";
 import dayjs from "dayjs";
 import { Blob as Blob$1, Buffer as Buffer$1, File as File$1 } from "node:buffer";
@@ -14,16 +14,31 @@ import { Result, resultify } from "t-result";
 import { fileURLToPath, pathToFileURL } from "node:url";
 //#region ../sdk/src/defineEval.ts
 const evalRegistry = /* @__PURE__ */ new Map();
+const evalRegistryStorage = new AsyncLocalStorage();
 /** Return the in-memory registry of evals defined in the current process. */
 function getEvalRegistry() {
-	return evalRegistry;
+	return evalRegistryStorage.getStore() ?? evalRegistry;
+}
+/**
+* Execute a callback with an empty async-local eval registry.
+*
+* Runner internals use this when importing eval modules concurrently so
+* `defineEval(...)` calls from one import cannot overwrite another import's
+* registered definitions. The callback receives the scoped registry populated
+* during its async execution.
+*/
+async function runWithEvalRegistry(fn) {
+	const scopedRegistry = /* @__PURE__ */ new Map();
+	return await evalRegistryStorage.run(scopedRegistry, async () => {
+		return await fn(scopedRegistry);
+	});
 }
 /**
 * Register an eval definition with the SDK so the runner can discover it
 * after importing the eval module.
 */
 function defineEval(definition) {
-	evalRegistry.set(definition.id, {
+	getEvalRegistry().set(definition.id, {
 		id: definition.id,
 		title: definition.title,
 		use: (fn) => fn(definition)
@@ -608,13 +623,18 @@ const evalFreshnessStatusSchema = z.enum([
 	"stale",
 	"outdated"
 ]);
-/** Reducer used to collapse a column's per-case values into a single stat. */
+/**
+* Reducer used to collapse per-case values into a single duration or column
+* stat.
+* `best` selects the highest finite value and `worst` selects the lowest.
+*/
 const evalStatAggregateSchema = z.enum([
 	"avg",
 	"min",
 	"max",
 	"sum",
-	"last"
+	"best",
+	"worst"
 ]);
 const hideIfNoValueShape = {
 /**
@@ -624,10 +644,12 @@ const hideIfNoValueShape = {
 */
 hideIfNoValue: z.boolean().optional() };
 /**
-* One entry in the EvalCard stats row. Built-in kinds use latest run totals;
-* `cacheHits` counts Agent Eval operation-level cache hits from spans and
-* `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens.
-* `column` aggregates a score or numeric output column across the latest run.
+* One entry in the EvalCard stats row. Built-in kinds read from the latest run;
+* `duration` aggregates per-case durations, `cacheHits` counts Agent Eval
+* operation-level cache hits from spans and `evalTracer.cache(...)` refs, not
+* LLM provider prompt-cache read tokens. Cache hits use an independent
+* aggregate mode and default to `sum`. `column` aggregates a score or numeric
+* output column across the latest run.
 */
 const evalStatItemSchema = z.discriminatedUnion("kind", [
 	z.object({
@@ -641,10 +663,12 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
 	}),
 	z.object({
 		kind: z.literal("duration"),
+		aggregate: evalStatAggregateSchema.optional(),
 		...hideIfNoValueShape
 	}),
 	z.object({
 		kind: z.literal("cacheHits"),
+		aggregate: evalStatAggregateSchema.optional(),
 		...hideIfNoValueShape
 	}),
 	z.object({
@@ -704,6 +728,11 @@ z.object({
 	*/
 	stats: evalStatsConfigSchema.optional(),
 	/**
+	* Initial aggregate mode used for duration and column stats on this eval
+	* card. Overrides workspace-level `defaultStatAggregate` when present.
+	*/
+	defaultStatAggregate: evalStatAggregateSchema.optional(),
+	/**
 	* Ordered per-eval history chart configuration for the EvalCard. Opt-in:
 	* when omitted or empty, the UI renders no history chart at all.
 	*/
@@ -1338,6 +1367,7 @@ const agentEvalsConfigSchema = z.object({
 	columns: evalColumnsSchema.optional(),
 	deriveFromTracing: evalDeriveConfigSchema.optional(),
 	stats: evalStatsConfigSchema.optional(),
+	defaultStatAggregate: evalStatAggregateSchema.optional(),
 	llmCalls: llmCallsConfigSchema.optional(),
 	removeDefaultConfig: removeDefaultConfigSchema.optional(),
 	apiCalls: apiCallsConfigSchema.optional(),
@@ -4924,12 +4954,13 @@ function createFsCacheStore(options) {
 		},
 		async lookup(namespace, keyHash) {
 			const entry = await readCacheEntry(cacheDir, namespace, keyHash);
-			return entry === null ? null : await materializeExternalJsonCacheEntry(entry, externalJsonStore);
+			return entry === null ? null : await materializeExternalJsonCacheEntryOrNull(entry, externalJsonStore);
 		},
 		async lookupWithDebug(namespace, keyHash) {
 			const rawEntry = await readCacheEntry(cacheDir, namespace, keyHash);
 			if (rawEntry === null) return null;
-			const entry = await materializeExternalJsonCacheEntry(rawEntry, externalJsonStore);
+			const entry = await materializeExternalJsonCacheEntryOrNull(rawEntry, externalJsonStore);
+			if (entry === null) return null;
 			const debugKey = await readDebugEntry(debugDir, namespace, keyHash);
 			const deserializedEntry = {
 				...entry,
@@ -5026,7 +5057,7 @@ function createBufferedCacheStore(backingStore) {
 		externalJsonStore: backingStore.externalJsonStore,
 		async lookup(namespace, keyHash) {
 			const buffered = pendingEntries.get(toPendingKey(namespace, keyHash));
-			if (buffered !== void 0) return backingStore.externalJsonStore === void 0 ? buffered.entry : await materializeExternalJsonCacheEntry(buffered.entry, backingStore.externalJsonStore);
+			if (buffered !== void 0) return backingStore.externalJsonStore === void 0 ? buffered.entry : await materializeExternalJsonCacheEntryOrNull(buffered.entry, backingStore.externalJsonStore);
 			return backingStore.lookup(namespace, keyHash);
 		},
 		write(entry, debugKey) {
@@ -5273,6 +5304,10 @@ async function materializeExternalJsonCacheEntry(entry, store) {
 		recording: cacheRecordingSchema.parse(await materializeExternalJsonValues(entry.recording, store))
 	};
 }
+async function materializeExternalJsonCacheEntryOrNull(entry, store) {
+	const result = await resultify(() => materializeExternalJsonCacheEntry(entry, store));
+	return result.error ? null : result.value;
+}
 async function pruneExternalJsonBlobs(cacheDir, blobDir) {
 	if (!existsSync(blobDir)) return;
 	const referenced = await collectReferencedExternalJsonBlobPaths(cacheDir);
@@ -5864,6 +5899,7 @@ function resolveEvalDefaultConfig(params) {
 			globalRemove: params.globalRemove,
 			evalRemove
 		}),
+		defaultStatAggregate: params.evalDef.defaultStatAggregate ?? params.globalDefaultStatAggregate,
 		charts: appendDefaultCharts({
 			charts: params.evalDef.charts,
 			globalRemove: params.globalRemove,
@@ -6072,6 +6108,122 @@ async function loadEvalModule(filePath, sourceFingerprint = void 0) {
 	await import(moduleUrl.href);
 }
 //#endregion
+//#region ../runner/src/moduleIsolation.ts
+const isolationParam = "agent-evals-isolate";
+const pathSegmentSeparatorPattern = /[\\/]+/;
+const isolationStorage = new AsyncLocalStorage();
+const activeIsolationRoots = /* @__PURE__ */ new Map();
+const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
+let hooksRegistered = false;
+const requireFromRunner = createRequire(import.meta.url);
+const agentPackageUrlBySpecifier = new Map([
+	"@ls-stack/agent-eval",
+	"@agent-evals/sdk",
+	"@agent-evals/shared",
+	"@agent-evals/runner",
+	"@agent-evals/runner/run-child"
+].flatMap((specifier) => {
+	try {
+		return [[specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href]];
+	} catch {
+		return [];
+	}
+}));
+const agentPackageDirectoryPaths = [...new Set([...agentPackageUrlBySpecifier.values()].map((packageUrl) => dirname(fileURLToPath(packageUrl))))];
+function isAgentEvalsPackageSpecifier(specifier) {
+	return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
+}
+function getIsolationKeyFromParent(parentURL) {
+	if (!parentURL?.startsWith("file:")) return null;
+	const value = new URL(parentURL).searchParams.get(isolationParam);
+	return activeIsolationRoots.has(value ?? "") ? value : null;
+}
+function isIsolatableFile(url, workspaceRoot) {
+	if (url.protocol !== "file:") return false;
+	return isIsolatableFilePath(fileURLToPath(url), workspaceRoot);
+}
+function isIsolatableFilePath(filePath, workspaceRoot) {
+	if (isAgentEvalsPackageFilePath(filePath)) return false;
+	const relativePath = relative(workspaceRoot, filePath);
+	if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
+	return !relativePath.split(pathSegmentSeparatorPattern).includes(".agent-evals");
+}
+function isAgentEvalsPackageFilePath(filePath) {
+	return agentPackageDirectoryPaths.some((packageDirectoryPath) => {
+		const packageRelativePath = relative(packageDirectoryPath, filePath);
+		return packageRelativePath === "" || !packageRelativePath.startsWith("..") && !isAbsolute(packageRelativePath);
+	});
+}
+function addIsolationParam(url, key) {
+	const moduleUrl = new URL(url);
+	if (moduleUrl.searchParams.get(isolationParam) === key) return url;
+	moduleUrl.searchParams.set(isolationParam, key);
+	return moduleUrl.href;
+}
+function registerModuleIsolationHooks() {
+	if (hooksRegistered) return;
+	hooksRegistered = true;
+	registerHooks({ resolve(specifier, context, nextResolve) {
+		const agentPackageUrl = agentPackageUrlBySpecifier.get(specifier);
+		if (agentPackageUrl !== void 0) return {
+			url: agentPackageUrl,
+			shortCircuit: true
+		};
+		const resolved = nextResolve(specifier, context);
+		if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
+		const activeContext = isolationStorage.getStore();
+		const inferredKey = getIsolationKeyFromParent(context.parentURL);
+		const isolationKey = activeContext?.key ?? inferredKey;
+		if (isolationKey === null) return resolved;
+		const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
+		if (workspaceRoot === void 0) return resolved;
+		if (!isIsolatableFile(new URL(resolved.url), workspaceRoot)) return resolved;
+		return {
+			...resolved,
+			url: addIsolationParam(resolved.url, isolationKey)
+		};
+	} });
+}
+function clearWorkspaceRequireCacheOnce(context) {
+	if (clearedRequireCacheKeys.has(context.key)) return;
+	clearedRequireCacheKeys.add(context.key);
+	for (const filePath of Object.keys(requireFromRunner.cache)) if (isIsolatableFilePath(filePath, context.workspaceRoot)) delete requireFromRunner.cache[filePath];
+}
+/**
+* Execute module loading and eval code with fresh module URLs.
+*
+* Node does not expose an ESM cache reset API, so the runner appends a
+* scoped query parameter to workspace and dependency file imports. CommonJS
+* modules use `require.cache` behind ESM imports, so isolatable entries are
+* cleared once per scope. Agent Evals package imports are left alone so SDK
+* singletons, such as the eval registry, remain shared.
+*/
+async function runWithModuleIsolation(context, fn) {
+	registerModuleIsolationHooks();
+	clearWorkspaceRequireCacheOnce(context);
+	activeIsolationRoots.set(context.key, context.workspaceRoot);
+	return await isolationStorage.run(context, fn);
+}
+//#endregion
+//#region ../runner/src/evalRegistryLoader.ts
+async function loadIsolatedEvalRegistry(params) {
+	return await runWithEvalRegistry(async (registry) => {
+		await runWithModuleIsolation(params.moduleIsolation, async () => {
+			await runInEvalRuntimeScope(params.runtimeScope, async () => {
+				await loadEvalModule(params.evalFilePath, params.sourceFingerprint);
+			});
+		});
+		return registry;
+	});
+}
+async function useIsolatedEvalDefinition(params) {
+	const entry = (await loadIsolatedEvalRegistry(params)).get(params.evalId);
+	if (entry === void 0) throw new Error(`Eval "${params.evalId}" was not registered after importing ${params.evalFilePath}`);
+	return await entry.use(async (evalDef) => {
+		return await params.use(evalDef);
+	});
+}
+//#endregion
 //#region ../runner/src/freshness.ts
 /**
 * Derive eval freshness from the latest run, current eval-file fingerprint,
@@ -6872,96 +7024,6 @@ function stripTerminalControlCodes(value) {
 	return stripVTControlCharacters(value).replaceAll(orphanedAnsiSgrPattern, "");
 }
 //#endregion
-//#region ../runner/src/moduleIsolation.ts
-const isolationParam = "agent-evals-isolate";
-const pathSegmentSeparatorPattern = /[\\/]+/;
-const isolationStorage = new AsyncLocalStorage();
-const activeIsolationRoots = /* @__PURE__ */ new Map();
-const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
-let hooksRegistered = false;
-const requireFromRunner = createRequire(import.meta.url);
-const agentPackageUrlBySpecifier = new Map([
-	"@ls-stack/agent-eval",
-	"@agent-evals/sdk",
-	"@agent-evals/shared",
-	"@agent-evals/runner",
-	"@agent-evals/runner/run-child"
-].flatMap((specifier) => {
-	try {
-		return [[specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href]];
-	} catch {
-		return [];
-	}
-}));
-function isAgentEvalsPackageSpecifier(specifier) {
-	return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
-}
-function getIsolationKeyFromParent(parentURL) {
-	if (!parentURL?.startsWith("file:")) return null;
-	const value = new URL(parentURL).searchParams.get(isolationParam);
-	return activeIsolationRoots.has(value ?? "") ? value : null;
-}
-function isWorkspaceFile(url, workspaceRoot) {
-	if (url.protocol !== "file:") return false;
-	return isWorkspaceFilePath(fileURLToPath(url), workspaceRoot);
-}
-function isWorkspaceFilePath(filePath, workspaceRoot) {
-	const relativePath = relative(workspaceRoot, filePath);
-	if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
-	const segments = relativePath.split(pathSegmentSeparatorPattern);
-	return !segments.includes("node_modules") && !segments.includes(".agent-evals");
-}
-function addIsolationParam(url, key) {
-	const moduleUrl = new URL(url);
-	if (moduleUrl.searchParams.get(isolationParam) === key) return url;
-	moduleUrl.searchParams.set(isolationParam, key);
-	return moduleUrl.href;
-}
-function registerModuleIsolationHooks() {
-	if (hooksRegistered) return;
-	hooksRegistered = true;
-	registerHooks({ resolve(specifier, context, nextResolve) {
-		const agentPackageUrl = agentPackageUrlBySpecifier.get(specifier);
-		if (agentPackageUrl !== void 0) return {
-			url: agentPackageUrl,
-			shortCircuit: true
-		};
-		const resolved = nextResolve(specifier, context);
-		if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
-		const activeContext = isolationStorage.getStore();
-		const inferredKey = getIsolationKeyFromParent(context.parentURL);
-		const isolationKey = activeContext?.key ?? inferredKey;
-		if (isolationKey === null) return resolved;
-		const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
-		if (workspaceRoot === void 0) return resolved;
-		if (!isWorkspaceFile(new URL(resolved.url), workspaceRoot)) return resolved;
-		return {
-			...resolved,
-			url: addIsolationParam(resolved.url, isolationKey)
-		};
-	} });
-}
-function clearWorkspaceRequireCacheOnce(context) {
-	if (clearedRequireCacheKeys.has(context.key)) return;
-	clearedRequireCacheKeys.add(context.key);
-	for (const filePath of Object.keys(requireFromRunner.cache)) if (isWorkspaceFilePath(filePath, context.workspaceRoot)) delete requireFromRunner.cache[filePath];
-}
-/**
-* Execute module loading and eval code with fresh workspace module URLs.
-*
-* Node does not expose an ESM cache reset API, so the runner appends a
-* run-scoped query parameter to workspace file imports. CommonJS modules use
-* `require.cache` behind ESM imports, so workspace entries are cleared once per
-* run. Package imports are left alone so SDK singletons, such as the eval
-* registry, remain shared.
-*/
-async function runWithModuleIsolation(context, fn) {
-	registerModuleIsolationHooks();
-	clearWorkspaceRequireCacheOnce(context);
-	activeIsolationRoots.set(context.key, context.workspaceRoot);
-	return await isolationStorage.run(context, fn);
-}
-//#endregion
 //#region ../runner/src/runExecution.ts
 function filterEvalCases(cases, caseIds) {
 	if (!caseIds || caseIds.length === 0) return cases;
@@ -7432,6 +7494,30 @@ function getTargetEvalKeys(params) {
 }
 //#endregion
 //#region ../runner/src/runOrchestration.ts
+function toOptionalSourceFingerprint(sourceFingerprint) {
+	return sourceFingerprint.length > 0 ? sourceFingerprint : void 0;
+}
+function buildCaseModuleIsolation(params) {
+	return {
+		key: [
+			params.runId,
+			params.evalKey,
+			params.caseId,
+			`trial-${String(params.trial)}`
+		].join(":"),
+		workspaceRoot: params.workspaceRoot
+	};
+}
+function buildEvalPreparationModuleIsolation(params) {
+	return {
+		key: [
+			params.runId,
+			params.evalKey,
+			"prepare"
+		].join(":"),
+		workspaceRoot: params.workspaceRoot
+	};
+}
 /**
 * Ranks case statuses from worst to best. Used to order trial attempts so the
 * pessimistic (`lowestScore`) strategy can pick the worst attempt. Any
@@ -7569,14 +7655,15 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 		const preparedEvals = [];
 		const cacheMode = runState.manifest.cacheMode ?? "use";
 		const cacheEnabled = config.cache?.enabled !== false;
-		const moduleIsolation = {
-			key: runState.manifest.id,
-			workspaceRoot
-		};
 		const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
 		const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
 		for (const evalMeta of targetEvals) {
 			const evalFilePath = evalMeta.sourceFilePath;
+			const evalModuleIsolation = buildEvalPreparationModuleIsolation({
+				runId: runState.manifest.id,
+				evalKey: evalMeta.key,
+				workspaceRoot
+			});
 			let sourceFingerprint = "";
 			try {
 				sourceFingerprint = getSourceFingerprint(await readFile(evalFilePath, "utf-8"));
@@ -7591,13 +7678,12 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 				evalMeta.sourceFingerprint = null;
 			}
 			try {
-				const registry = getEvalRegistry();
-				await runWithModuleIsolation(moduleIsolation, async () => {
-					await runInEvalRuntimeScope("env", async () => {
-						await loadEvalModule(evalFilePath, sourceFingerprint);
-					});
-				});
-				const entry = registry.get(evalMeta.id);
+				const entry = (await loadIsolatedEvalRegistry({
+					evalFilePath,
+					sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
+					moduleIsolation: evalModuleIsolation,
+					runtimeScope: "env"
+				})).get(evalMeta.id);
 				if (!entry) {
 					evalErrors.push({
 						evalId: evalMeta.id,
@@ -7605,7 +7691,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 					});
 					continue;
 				}
-				await runWithModuleIsolation(moduleIsolation, async () => {
+				await runWithModuleIsolation(evalModuleIsolation, async () => {
 					await runInEvalRuntimeScope("cases", async () => {
 						await entry.use(async (evalDef) => {
 							const evalTagsResult = resolveEvalTags({
@@ -7668,6 +7754,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 								evalDef,
 								globalColumns: config.columns,
 								globalStats: config.stats,
+								globalDefaultStatAggregate: config.defaultStatAggregate,
 								globalRemove: config.removeDefaultConfig
 							});
 							const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
@@ -7679,6 +7766,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 							for (const warning of validatedCharts.warnings) console.warn(warning);
 							evalMeta.columnDefs = declaredColumnDefs;
 							evalMeta.stats = defaultConfig.stats;
+							evalMeta.defaultStatAggregate = defaultConfig.defaultStatAggregate;
 							evalMeta.charts = validatedCharts.charts;
 							const evalCaseRows = [];
 							const preparedCases = [];
@@ -7701,29 +7789,43 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 								preparedCases.push(preparedCase);
 								for (let trial = 0; trial < request.trials; trial++) {
 									const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
+									const caseModuleIsolation = buildCaseModuleIsolation({
+										runId: runState.manifest.id,
+										evalKey: evalMeta.key,
+										caseId: evalCase.id,
+										trial,
+										workspaceRoot
+									});
 									queuedCases.push({
 										execute: async ({ startTime, globalTraceDisplay }) => {
-											const { caseDetail, caseRowUpdate } = await runCase({
-												evalDef,
+											const { caseDetail, caseRowUpdate } = await useIsolatedEvalDefinition({
 												evalId: evalMeta.id,
-												evalKey: evalMeta.key,
-												evalCase,
-												globalTraceDisplay,
-												globalColumns: config.columns,
-												globalDeriveFromTracing: config.deriveFromTracing,
-												llmCallsConfig,
-												apiCallsConfig,
-												globalRemoveDefaultConfig: config.removeDefaultConfig,
-												trial,
-												startTime,
-												cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
-												cacheMode,
-												moduleIsolation,
 												evalFilePath,
-												evalFileRelativePath: evalMeta.filePath,
-												workspaceRoot,
-												artifactDir: join(runDir, "artifacts"),
-												runId: runState.manifest.id
+												sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
+												moduleIsolation: caseModuleIsolation,
+												runtimeScope: "env",
+												use: async (isolatedEvalDef) => await runCase({
+													evalDef: isolatedEvalDef,
+													evalId: evalMeta.id,
+													evalKey: evalMeta.key,
+													evalCase,
+													globalTraceDisplay,
+													globalColumns: config.columns,
+													globalDeriveFromTracing: config.deriveFromTracing,
+													llmCallsConfig,
+													apiCallsConfig,
+													globalRemoveDefaultConfig: config.removeDefaultConfig,
+													trial,
+													startTime,
+													cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
+													cacheMode,
+													moduleIsolation: caseModuleIsolation,
+													evalFilePath,
+													evalFileRelativePath: evalMeta.filePath,
+													workspaceRoot,
+													artifactDir: join(runDir, "artifacts"),
+													runId: runState.manifest.id
+												})
 											});
 											return {
 												caseDetail,
@@ -7874,4 +7976,4 @@ function toLastRunStatus(status) {
 	return status === "pending" ? null : status;
 }
 //#endregion
-export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, columnDefSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, defineEval as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, getEvalRegistry as Lt, z$1 as M, evalStatsConfigSchema as Mt, buildTraceTree as N, manualInputDescriptorSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, evalChartsConfigSchema as Pt, evalTime as Q, hashCacheKeySync as R, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, incrementEvalOutput as tt, 
loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadEvalModule as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };
+export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, evalChartsConfigSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, columnDefSchema as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, defineEval as Lt, z$1 as M, evalStatAggregateSchema as Mt, buildTraceTree as N, evalStatsConfigSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, manualInputDescriptorSchema as Pt, evalTime as Q, hashCacheKeySync as R, getEvalRegistry as Rt, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, 
incrementEvalOutput as tt, loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadIsolatedEvalRegistry as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };

package/dist/{runner-Dsqj431i.mjs → runner-C2fvjKZP.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-C0qdoRSi.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-CFQ8LZmY.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{runner-C0qdoRSi.mjs → runner-CFQ8LZmY.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-3FrKBc9l.mjs";
-import "./src-BNmtaqeC.mjs";
+import { n as createRunner } from "./cli-rvPrUj6S.mjs";
+import "./src-DEENkbkn.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{src-BNmtaqeC.mjs → src-DEENkbkn.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-Cn6fGL2s.mjs";
-import "./cli-3FrKBc9l.mjs";
+import { Lt as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-BWyE5lRX.mjs";
+import "./cli-rvPrUj6S.mjs";
 //#region src/index.ts
 /** Register an eval definition with typed tag support. */
 function defineEval(definition) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.53.0",
+  "version": "0.55.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
     "@types/node": "^24.7.2",
     "typescript": "^5.9.2",
     "@agent-evals/runner": "0.0.1",
-    "@agent-evals/sdk": "0.0.1",
-    "@agent-evals/shared": "0.0.1"
+    "@agent-evals/shared": "0.0.1",
+    "@agent-evals/sdk": "0.0.1"
   },
   "scripts": {
     "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -30,7 +30,8 @@ display rules), read the TypeScript declarations shipped with the package:
   `agent-evals.config.ts` to opt into run-all CLI behavior.
 - `agent-evals run --temporary` persists a run like normal history, but deletes
   it before the next run starts. Temporary runs appear in `show-runs` while
-  present; normal runs are never deleted by temporary-run cleanup.
+  present; normal runs are never deleted by temporary-run cleanup. In the app,
+  the run drawer can promote a temporary run to durable history.
 - `agent-evals app` watches `agent-evals.config.ts` and reloads config in
   place when the runner is idle. If config changes during an active run, the
   reload applies after the current run reaches a terminal state.
@@ -415,10 +416,17 @@ definition. Global `stats` in `agent-evals.config.ts` combine with eval-level
 stats. Native stat kinds include `cases`, `passRate`, `duration`, and
 `cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total
 cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not
-LLM provider prompt-cache read tokens such as `cachedInputTokens`. Usage stats
+LLM provider prompt-cache read tokens such as `cachedInputTokens`. Cache-hit
+stats use a separate aggregate control and default to `sum`; `avg` is average
+per-case hit rate, and min/max/best/worst select cases by hit rate. `duration`
+aggregates per-case durations using the same modes as column stats. Usage stats
 and LLM usage charts are added by default unless removed with
 `removeDefaultConfig`. Column stats can override `format` and `numberFormat`,
-otherwise they inherit from the matching column. Number formats use
+otherwise they inherit from the matching column. Duration and column stat
+aggregates support `avg`, `min`, `max`, `sum`, `best` (highest finite value),
+and `worst` (lowest finite value). Use `defaultStatAggregate` in
+`agent-evals.config.ts` to set the workspace-wide initial duration/column stat
+mode, or on an eval definition to override it for that eval. Number formats use
 `maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing
 zeroes. Without `maxDecimalPlaces`, the default cap is 3 decimal places. Stats
 and charts support `hideIfNoValue: true`. Charts support
@@ -566,6 +574,9 @@ For true module replacement inside an eval, register `mock.module(...)` from
 Node's `--experimental-test-module-mocks` flag automatically for CLI and app
 runs. Use dynamic
 `import(...)` inside `execute` — static imports happen too early.
+Each case/trial reloads the eval module graph in its own isolation scope, so
+module-level mock state in workspace files and ESM dependencies does not leak
+between concurrent cases.
 ```ts
 import { mock } from 'node:test';