npm - @ls-stack/agent-eval - Versions diffs - 0.52.3 → 0.54.0 - Mend

@ls-stack/agent-eval 0.52.3 → 0.54.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/dist/{app-BsFcUIQp.mjs → app-CMJtftUB.mjs} +12 -4
package/dist/apps/web/dist/assets/index-BcLH_Wme.css +1 -0
package/dist/apps/web/dist/assets/index-Bx1VZctJ.js +377 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/{cli-DbVFgRO3.mjs → cli-BEtk5skO.mjs} +51 -11
package/dist/index.d.mts +339 -134
package/dist/index.mjs +3 -3
package/dist/runChild.mjs +2 -1
package/dist/{runOrchestration-DT6cje9E.mjs → runOrchestration-C78U4Ir0.mjs} +245 -141
package/dist/{runner-WRQdfG0r.mjs → runner-5y6nEBZM.mjs} +2 -2
package/dist/{runner-CyTUvbHE.mjs → runner-pmIYculz.mjs} +1 -1
package/dist/{src-DlvYXPxG.mjs → src-BM6LW4ou.mjs} +2 -2
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +20 -4
package/dist/apps/web/dist/assets/index-BHc4gfUO.css +0 -1
package/dist/apps/web/dist/assets/index-D9HUAH8K.js +0 -373

package/dist/index.mjs CHANGED

@@ -1,4 +1,4 @@
-import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-DT6cje9E.mjs";
-import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-DbVFgRO3.mjs";
-import { n as matchesEvalTags, t as defineEval } from "./src-DlvYXPxG.mjs";
+import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, Rt as getEvalRegistry, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-C78U4Ir0.mjs";
+import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-BEtk5skO.mjs";
+import { n as matchesEvalTags, t as defineEval } from "./src-BM6LW4ou.mjs";
 export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };

package/dist/runChild.mjs CHANGED

@@ -1,4 +1,4 @@
-import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-DT6cje9E.mjs";
+import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as evalChartsConfigSchema, It as columnDefSchema, Mt as evalStatAggregateSchema, Nt as evalStatsConfigSchema, Pt as manualInputDescriptorSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-C78U4Ir0.mjs";
 import { z } from "zod/v4";
 import { readFile } from "node:fs/promises";
 import { relative } from "node:path";
@@ -17,6 +17,7 @@ const evalMetaSchema = z.object({
 	caseCount: z.number().nullable(),
 	caseIds: z.array(z.string()).optional(),
 	stats: evalStatsConfigSchema.optional(),
+	defaultStatAggregate: evalStatAggregateSchema.optional(),
 	charts: evalChartsConfigSchema.optional(),
 	manualInputDescriptor: manualInputDescriptorSchema.optional(),
 	requiresManualInput: z.boolean().optional()

package/dist/{runOrchestration-DT6cje9E.mjs → runOrchestration-C78U4Ir0.mjs} RENAMED

@@ -1,6 +1,6 @@
 import { createRequire, registerHooks } from "node:module";
-import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
 import { AsyncLocalStorage } from "node:async_hooks";
+import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
 import { z, z as z$1 } from "zod/v4";
 import dayjs from "dayjs";
 import { Blob as Blob$1, Buffer as Buffer$1, File as File$1 } from "node:buffer";
@@ -14,16 +14,31 @@ import { Result, resultify } from "t-result";
 import { fileURLToPath, pathToFileURL } from "node:url";
 //#region ../sdk/src/defineEval.ts
 const evalRegistry = /* @__PURE__ */ new Map();
+const evalRegistryStorage = new AsyncLocalStorage();
 /** Return the in-memory registry of evals defined in the current process. */
 function getEvalRegistry() {
-	return evalRegistry;
+	return evalRegistryStorage.getStore() ?? evalRegistry;
+}
+/**
+* Execute a callback with an empty async-local eval registry.
+*
+* Runner internals use this when importing eval modules concurrently so
+* `defineEval(...)` calls from one import cannot overwrite another import's
+* registered definitions. The callback receives the scoped registry populated
+* during its async execution.
+*/
+async function runWithEvalRegistry(fn) {
+	const scopedRegistry = /* @__PURE__ */ new Map();
+	return await evalRegistryStorage.run(scopedRegistry, async () => {
+		return await fn(scopedRegistry);
+	});
 }
 /**
 * Register an eval definition with the SDK so the runner can discover it
 * after importing the eval module.
 */
 function defineEval(definition) {
-	evalRegistry.set(definition.id, {
+	getEvalRegistry().set(definition.id, {
 		id: definition.id,
 		title: definition.title,
 		use: (fn) => fn(definition)
@@ -45,13 +60,15 @@ const jsonCellSchema = z.lazy(() => z.union([
 const repoFileRefSchema = z.object({
 	source: z.literal("repo"),
 	path: z.string(),
-	mimeType: z.string().optional()
+	mimeType: z.string().optional(),
+	sizeBytes: z.number().int().nonnegative().optional()
 });
 const runArtifactRefSchema = z.object({
 	source: z.literal("run"),
 	artifactId: z.string(),
 	mimeType: z.string(),
-	fileName: z.string().optional()
+	fileName: z.string().optional(),
+	sizeBytes: z.number().int().nonnegative().optional()
 });
 const fileRefSchema = z.union([repoFileRefSchema, runArtifactRefSchema]);
 /** Schema for numeric presentation options used by number-formatted values. */
@@ -82,6 +99,8 @@ const columnFormatSchema = z.enum([
 	"markdown",
 	"json",
 	"image",
+	"html",
+	"pdf",
 	"audio",
 	"video",
 	"file",
@@ -604,13 +623,18 @@ const evalFreshnessStatusSchema = z.enum([
 	"stale",
 	"outdated"
 ]);
-/** Reducer used to collapse a column's per-case values into a single stat. */
+/**
+* Reducer used to collapse per-case values into a single duration or column
+* stat.
+* `best` selects the highest finite value and `worst` selects the lowest.
+*/
 const evalStatAggregateSchema = z.enum([
 	"avg",
 	"min",
 	"max",
 	"sum",
-	"last"
+	"best",
+	"worst"
 ]);
 const hideIfNoValueShape = {
 /**
@@ -620,10 +644,12 @@ const hideIfNoValueShape = {
 */
 hideIfNoValue: z.boolean().optional() };
 /**
-* One entry in the EvalCard stats row. Built-in kinds use latest run totals;
-* `cacheHits` counts Agent Eval operation-level cache hits from spans and
-* `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens.
-* `column` aggregates a score or numeric output column across the latest run.
+* One entry in the EvalCard stats row. Built-in kinds read from the latest run;
+* `duration` aggregates per-case durations, `cacheHits` counts Agent Eval
+* operation-level cache hits from spans and `evalTracer.cache(...)` refs, not
+* LLM provider prompt-cache read tokens. Cache hits use an independent
+* aggregate mode and default to `sum`. `column` aggregates a score or numeric
+* output column across the latest run.
 */
 const evalStatItemSchema = z.discriminatedUnion("kind", [
 	z.object({
@@ -637,10 +663,12 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
 	}),
 	z.object({
 		kind: z.literal("duration"),
+		aggregate: evalStatAggregateSchema.optional(),
 		...hideIfNoValueShape
 	}),
 	z.object({
 		kind: z.literal("cacheHits"),
+		aggregate: evalStatAggregateSchema.optional(),
 		...hideIfNoValueShape
 	}),
 	z.object({
@@ -700,6 +728,11 @@ z.object({
 	*/
 	stats: evalStatsConfigSchema.optional(),
 	/**
+	* Initial aggregate mode used for duration and column stats on this eval
+	* card. Overrides workspace-level `defaultStatAggregate` when present.
+	*/
+	defaultStatAggregate: evalStatAggregateSchema.optional(),
+	/**
 	* Ordered per-eval history chart configuration for the EvalCard. Opt-in:
 	* when omitted or empty, the UI renders no history chart at all.
 	*/
@@ -1334,6 +1367,7 @@ const agentEvalsConfigSchema = z.object({
 	columns: evalColumnsSchema.optional(),
 	deriveFromTracing: evalDeriveConfigSchema.optional(),
 	stats: evalStatsConfigSchema.optional(),
+	defaultStatAggregate: evalStatAggregateSchema.optional(),
 	llmCalls: llmCallsConfigSchema.optional(),
 	removeDefaultConfig: removeDefaultConfigSchema.optional(),
 	apiCalls: apiCallsConfigSchema.optional(),
@@ -3372,17 +3406,20 @@ async function readManualInputFile(value, options = {}) {
 //#region ../sdk/src/repoFile.ts
 /**
 * Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
-* by a column configured with `format: 'image' | 'audio' | 'video' | 'file'`.
+* by a column configured with `format: 'image' | 'html' | 'pdf' | 'audio' |
+* 'video' | 'file'`.
 *
 * @param path Relative or absolute path to the repository file.
 * @param mimeType Optional MIME type hint for UI rendering.
+* @param sizeBytes Optional file size hint shown by artifact cards in the UI.
 * @returns A repo-backed file reference suitable for file/media columns.
 */
-function repoFile(path, mimeType) {
+function repoFile(path, mimeType, sizeBytes) {
 	return {
 		source: "repo",
 		path,
-		mimeType
+		mimeType,
+		sizeBytes
 	};
 }
 //#endregion
@@ -4917,12 +4954,13 @@ function createFsCacheStore(options) {
 		},
 		async lookup(namespace, keyHash) {
 			const entry = await readCacheEntry(cacheDir, namespace, keyHash);
-			return entry === null ? null : await materializeExternalJsonCacheEntry(entry, externalJsonStore);
+			return entry === null ? null : await materializeExternalJsonCacheEntryOrNull(entry, externalJsonStore);
 		},
 		async lookupWithDebug(namespace, keyHash) {
 			const rawEntry = await readCacheEntry(cacheDir, namespace, keyHash);
 			if (rawEntry === null) return null;
-			const entry = await materializeExternalJsonCacheEntry(rawEntry, externalJsonStore);
+			const entry = await materializeExternalJsonCacheEntryOrNull(rawEntry, externalJsonStore);
+			if (entry === null) return null;
 			const debugKey = await readDebugEntry(debugDir, namespace, keyHash);
 			const deserializedEntry = {
 				...entry,
@@ -5019,7 +5057,7 @@ function createBufferedCacheStore(backingStore) {
 		externalJsonStore: backingStore.externalJsonStore,
 		async lookup(namespace, keyHash) {
 			const buffered = pendingEntries.get(toPendingKey(namespace, keyHash));
-			if (buffered !== void 0) return backingStore.externalJsonStore === void 0 ? buffered.entry : await materializeExternalJsonCacheEntry(buffered.entry, backingStore.externalJsonStore);
+			if (buffered !== void 0) return backingStore.externalJsonStore === void 0 ? buffered.entry : await materializeExternalJsonCacheEntryOrNull(buffered.entry, backingStore.externalJsonStore);
 			return backingStore.lookup(namespace, keyHash);
 		},
 		write(entry, debugKey) {
@@ -5266,6 +5304,10 @@ async function materializeExternalJsonCacheEntry(entry, store) {
 		recording: cacheRecordingSchema.parse(await materializeExternalJsonValues(entry.recording, store))
 	};
 }
+async function materializeExternalJsonCacheEntryOrNull(entry, store) {
+	const result = await resultify(() => materializeExternalJsonCacheEntry(entry, store));
+	return result.error ? null : result.value;
+}
 async function pruneExternalJsonBlobs(cacheDir, blobDir) {
 	if (!existsSync(blobDir)) return;
 	const referenced = await collectReferencedExternalJsonBlobPaths(cacheDir);
@@ -5857,6 +5899,7 @@ function resolveEvalDefaultConfig(params) {
 			globalRemove: params.globalRemove,
 			evalRemove
 		}),
+		defaultStatAggregate: params.evalDef.defaultStatAggregate ?? params.globalDefaultStatAggregate,
 		charts: appendDefaultCharts({
 			charts: params.evalDef.charts,
 			globalRemove: params.globalRemove,
@@ -6065,6 +6108,114 @@ async function loadEvalModule(filePath, sourceFingerprint = void 0) {
 	await import(moduleUrl.href);
 }
 //#endregion
+//#region ../runner/src/moduleIsolation.ts
+const isolationParam = "agent-evals-isolate";
+const pathSegmentSeparatorPattern = /[\\/]+/;
+const isolationStorage = new AsyncLocalStorage();
+const activeIsolationRoots = /* @__PURE__ */ new Map();
+const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
+let hooksRegistered = false;
+const requireFromRunner = createRequire(import.meta.url);
+const agentPackageUrlBySpecifier = new Map([
+	"@ls-stack/agent-eval",
+	"@agent-evals/sdk",
+	"@agent-evals/shared",
+	"@agent-evals/runner",
+	"@agent-evals/runner/run-child"
+].flatMap((specifier) => {
+	try {
+		return [[specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href]];
+	} catch {
+		return [];
+	}
+}));
+function isAgentEvalsPackageSpecifier(specifier) {
+	return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
+}
+function getIsolationKeyFromParent(parentURL) {
+	if (!parentURL?.startsWith("file:")) return null;
+	const value = new URL(parentURL).searchParams.get(isolationParam);
+	return activeIsolationRoots.has(value ?? "") ? value : null;
+}
+function isIsolatableFile(url, workspaceRoot) {
+	if (url.protocol !== "file:") return false;
+	return isIsolatableFilePath(fileURLToPath(url), workspaceRoot);
+}
+function isIsolatableFilePath(filePath, workspaceRoot) {
+	const relativePath = relative(workspaceRoot, filePath);
+	if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
+	return !relativePath.split(pathSegmentSeparatorPattern).includes(".agent-evals");
+}
+function addIsolationParam(url, key) {
+	const moduleUrl = new URL(url);
+	if (moduleUrl.searchParams.get(isolationParam) === key) return url;
+	moduleUrl.searchParams.set(isolationParam, key);
+	return moduleUrl.href;
+}
+function registerModuleIsolationHooks() {
+	if (hooksRegistered) return;
+	hooksRegistered = true;
+	registerHooks({ resolve(specifier, context, nextResolve) {
+		const agentPackageUrl = agentPackageUrlBySpecifier.get(specifier);
+		if (agentPackageUrl !== void 0) return {
+			url: agentPackageUrl,
+			shortCircuit: true
+		};
+		const resolved = nextResolve(specifier, context);
+		if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
+		const activeContext = isolationStorage.getStore();
+		const inferredKey = getIsolationKeyFromParent(context.parentURL);
+		const isolationKey = activeContext?.key ?? inferredKey;
+		if (isolationKey === null) return resolved;
+		const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
+		if (workspaceRoot === void 0) return resolved;
+		if (!isIsolatableFile(new URL(resolved.url), workspaceRoot)) return resolved;
+		return {
+			...resolved,
+			url: addIsolationParam(resolved.url, isolationKey)
+		};
+	} });
+}
+function clearWorkspaceRequireCacheOnce(context) {
+	if (clearedRequireCacheKeys.has(context.key)) return;
+	clearedRequireCacheKeys.add(context.key);
+	for (const filePath of Object.keys(requireFromRunner.cache)) if (isIsolatableFilePath(filePath, context.workspaceRoot)) delete requireFromRunner.cache[filePath];
+}
+/**
+* Execute module loading and eval code with fresh module URLs.
+*
+* Node does not expose an ESM cache reset API, so the runner appends a
+* scoped query parameter to workspace and dependency file imports. CommonJS
+* modules use `require.cache` behind ESM imports, so isolatable entries are
+* cleared once per scope. Agent Evals package imports are left alone so SDK
+* singletons, such as the eval registry, remain shared.
+*/
+async function runWithModuleIsolation(context, fn) {
+	registerModuleIsolationHooks();
+	clearWorkspaceRequireCacheOnce(context);
+	activeIsolationRoots.set(context.key, context.workspaceRoot);
+	return await isolationStorage.run(context, fn);
+}
+//#endregion
+//#region ../runner/src/evalRegistryLoader.ts
+async function loadIsolatedEvalRegistry(params) {
+	return await runWithEvalRegistry(async (registry) => {
+		await runWithModuleIsolation(params.moduleIsolation, async () => {
+			await runInEvalRuntimeScope(params.runtimeScope, async () => {
+				await loadEvalModule(params.evalFilePath, params.sourceFingerprint);
+			});
+		});
+		return registry;
+	});
+}
+async function useIsolatedEvalDefinition(params) {
+	const entry = (await loadIsolatedEvalRegistry(params)).get(params.evalId);
+	if (entry === void 0) throw new Error(`Eval "${params.evalId}" was not registered after importing ${params.evalFilePath}`);
+	return await entry.use(async (evalDef) => {
+		return await params.use(evalDef);
+	});
+}
+//#endregion
 //#region ../runner/src/freshness.ts
 /**
 * Derive eval freshness from the latest run, current eval-file fingerprint,
@@ -6509,12 +6660,15 @@ async function persistInlineArtifact({ artifactDir, runId, caseId, outputKey, tr
 		sanitizeSegment(outputKey),
 		sanitizeFileName(fileName)
 	].join("__");
-	await writeFile(join(artifactDir, artifactId), new Uint8Array(await value.arrayBuffer()));
+	const targetPath = join(artifactDir, artifactId);
+	const bytes = new Uint8Array(await value.arrayBuffer());
+	await writeFile(targetPath, bytes);
 	return {
 		source: "run",
 		artifactId,
 		mimeType,
-		fileName
+		fileName,
+		sizeBytes: bytes.byteLength
 	};
 }
 /** Resolve a persisted run artifact path from its artifact id. */
@@ -6862,96 +7016,6 @@ function stripTerminalControlCodes(value) {
 	return stripVTControlCharacters(value).replaceAll(orphanedAnsiSgrPattern, "");
 }
 //#endregion
-//#region ../runner/src/moduleIsolation.ts
-const isolationParam = "agent-evals-isolate";
-const pathSegmentSeparatorPattern = /[\\/]+/;
-const isolationStorage = new AsyncLocalStorage();
-const activeIsolationRoots = /* @__PURE__ */ new Map();
-const clearedRequireCacheKeys = /* @__PURE__ */ new Set();
-let hooksRegistered = false;
-const requireFromRunner = createRequire(import.meta.url);
-const agentPackageUrlBySpecifier = new Map([
-	"@ls-stack/agent-eval",
-	"@agent-evals/sdk",
-	"@agent-evals/shared",
-	"@agent-evals/runner",
-	"@agent-evals/runner/run-child"
-].flatMap((specifier) => {
-	try {
-		return [[specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href]];
-	} catch {
-		return [];
-	}
-}));
-function isAgentEvalsPackageSpecifier(specifier) {
-	return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
-}
-function getIsolationKeyFromParent(parentURL) {
-	if (!parentURL?.startsWith("file:")) return null;
-	const value = new URL(parentURL).searchParams.get(isolationParam);
-	return activeIsolationRoots.has(value ?? "") ? value : null;
-}
-function isWorkspaceFile(url, workspaceRoot) {
-	if (url.protocol !== "file:") return false;
-	return isWorkspaceFilePath(fileURLToPath(url), workspaceRoot);
-}
-function isWorkspaceFilePath(filePath, workspaceRoot) {
-	const relativePath = relative(workspaceRoot, filePath);
-	if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
-	const segments = relativePath.split(pathSegmentSeparatorPattern);
-	return !segments.includes("node_modules") && !segments.includes(".agent-evals");
-}
-function addIsolationParam(url, key) {
-	const moduleUrl = new URL(url);
-	if (moduleUrl.searchParams.get(isolationParam) === key) return url;
-	moduleUrl.searchParams.set(isolationParam, key);
-	return moduleUrl.href;
-}
-function registerModuleIsolationHooks() {
-	if (hooksRegistered) return;
-	hooksRegistered = true;
-	registerHooks({ resolve(specifier, context, nextResolve) {
-		const agentPackageUrl = agentPackageUrlBySpecifier.get(specifier);
-		if (agentPackageUrl !== void 0) return {
-			url: agentPackageUrl,
-			shortCircuit: true
-		};
-		const resolved = nextResolve(specifier, context);
-		if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
-		const activeContext = isolationStorage.getStore();
-		const inferredKey = getIsolationKeyFromParent(context.parentURL);
-		const isolationKey = activeContext?.key ?? inferredKey;
-		if (isolationKey === null) return resolved;
-		const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
-		if (workspaceRoot === void 0) return resolved;
-		if (!isWorkspaceFile(new URL(resolved.url), workspaceRoot)) return resolved;
-		return {
-			...resolved,
-			url: addIsolationParam(resolved.url, isolationKey)
-		};
-	} });
-}
-function clearWorkspaceRequireCacheOnce(context) {
-	if (clearedRequireCacheKeys.has(context.key)) return;
-	clearedRequireCacheKeys.add(context.key);
-	for (const filePath of Object.keys(requireFromRunner.cache)) if (isWorkspaceFilePath(filePath, context.workspaceRoot)) delete requireFromRunner.cache[filePath];
-}
-/**
-* Execute module loading and eval code with fresh workspace module URLs.
-*
-* Node does not expose an ESM cache reset API, so the runner appends a
-* run-scoped query parameter to workspace file imports. CommonJS modules use
-* `require.cache` behind ESM imports, so workspace entries are cleared once per
-* run. Package imports are left alone so SDK singletons, such as the eval
-* registry, remain shared.
-*/
-async function runWithModuleIsolation(context, fn) {
-	registerModuleIsolationHooks();
-	clearWorkspaceRequireCacheOnce(context);
-	activeIsolationRoots.set(context.key, context.workspaceRoot);
-	return await isolationStorage.run(context, fn);
-}
-//#endregion
 //#region ../runner/src/runExecution.ts
 function filterEvalCases(cases, caseIds) {
 	if (!caseIds || caseIds.length === 0) return cases;
@@ -7422,6 +7486,30 @@ function getTargetEvalKeys(params) {
 }
 //#endregion
 //#region ../runner/src/runOrchestration.ts
+function toOptionalSourceFingerprint(sourceFingerprint) {
+	return sourceFingerprint.length > 0 ? sourceFingerprint : void 0;
+}
+function buildCaseModuleIsolation(params) {
+	return {
+		key: [
+			params.runId,
+			params.evalKey,
+			params.caseId,
+			`trial-${String(params.trial)}`
+		].join(":"),
+		workspaceRoot: params.workspaceRoot
+	};
+}
+function buildEvalPreparationModuleIsolation(params) {
+	return {
+		key: [
+			params.runId,
+			params.evalKey,
+			"prepare"
+		].join(":"),
+		workspaceRoot: params.workspaceRoot
+	};
+}
 /**
 * Ranks case statuses from worst to best. Used to order trial attempts so the
 * pessimistic (`lowestScore`) strategy can pick the worst attempt. Any
@@ -7559,14 +7647,15 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 		const preparedEvals = [];
 		const cacheMode = runState.manifest.cacheMode ?? "use";
 		const cacheEnabled = config.cache?.enabled !== false;
-		const moduleIsolation = {
-			key: runState.manifest.id,
-			workspaceRoot
-		};
 		const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
 		const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
 		for (const evalMeta of targetEvals) {
 			const evalFilePath = evalMeta.sourceFilePath;
+			const evalModuleIsolation = buildEvalPreparationModuleIsolation({
+				runId: runState.manifest.id,
+				evalKey: evalMeta.key,
+				workspaceRoot
+			});
 			let sourceFingerprint = "";
 			try {
 				sourceFingerprint = getSourceFingerprint(await readFile(evalFilePath, "utf-8"));
@@ -7581,13 +7670,12 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 				evalMeta.sourceFingerprint = null;
 			}
 			try {
-				const registry = getEvalRegistry();
-				await runWithModuleIsolation(moduleIsolation, async () => {
-					await runInEvalRuntimeScope("env", async () => {
-						await loadEvalModule(evalFilePath, sourceFingerprint);
-					});
-				});
-				const entry = registry.get(evalMeta.id);
+				const entry = (await loadIsolatedEvalRegistry({
+					evalFilePath,
+					sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
+					moduleIsolation: evalModuleIsolation,
+					runtimeScope: "env"
+				})).get(evalMeta.id);
 				if (!entry) {
 					evalErrors.push({
 						evalId: evalMeta.id,
@@ -7595,7 +7683,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 					});
 					continue;
 				}
-				await runWithModuleIsolation(moduleIsolation, async () => {
+				await runWithModuleIsolation(evalModuleIsolation, async () => {
 					await runInEvalRuntimeScope("cases", async () => {
 						await entry.use(async (evalDef) => {
 							const evalTagsResult = resolveEvalTags({
@@ -7658,6 +7746,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 								evalDef,
 								globalColumns: config.columns,
 								globalStats: config.stats,
+								globalDefaultStatAggregate: config.defaultStatAggregate,
 								globalRemove: config.removeDefaultConfig
 							});
 							const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
@@ -7669,6 +7758,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 							for (const warning of validatedCharts.warnings) console.warn(warning);
 							evalMeta.columnDefs = declaredColumnDefs;
 							evalMeta.stats = defaultConfig.stats;
+							evalMeta.defaultStatAggregate = defaultConfig.defaultStatAggregate;
 							evalMeta.charts = validatedCharts.charts;
 							const evalCaseRows = [];
 							const preparedCases = [];
@@ -7691,29 +7781,43 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 								preparedCases.push(preparedCase);
 								for (let trial = 0; trial < request.trials; trial++) {
 									const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
+									const caseModuleIsolation = buildCaseModuleIsolation({
+										runId: runState.manifest.id,
+										evalKey: evalMeta.key,
+										caseId: evalCase.id,
+										trial,
+										workspaceRoot
+									});
 									queuedCases.push({
 										execute: async ({ startTime, globalTraceDisplay }) => {
-											const { caseDetail, caseRowUpdate } = await runCase({
-												evalDef,
+											const { caseDetail, caseRowUpdate } = await useIsolatedEvalDefinition({
 												evalId: evalMeta.id,
-												evalKey: evalMeta.key,
-												evalCase,
-												globalTraceDisplay,
-												globalColumns: config.columns,
-												globalDeriveFromTracing: config.deriveFromTracing,
-												llmCallsConfig,
-												apiCallsConfig,
-												globalRemoveDefaultConfig: config.removeDefaultConfig,
-												trial,
-												startTime,
-												cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
-												cacheMode,
-												moduleIsolation,
 												evalFilePath,
-												evalFileRelativePath: evalMeta.filePath,
-												workspaceRoot,
-												artifactDir: join(runDir, "artifacts"),
-												runId: runState.manifest.id
+												sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
+												moduleIsolation: caseModuleIsolation,
+												runtimeScope: "env",
+												use: async (isolatedEvalDef) => await runCase({
+													evalDef: isolatedEvalDef,
+													evalId: evalMeta.id,
+													evalKey: evalMeta.key,
+													evalCase,
+													globalTraceDisplay,
+													globalColumns: config.columns,
+													globalDeriveFromTracing: config.deriveFromTracing,
+													llmCallsConfig,
+													apiCallsConfig,
+													globalRemoveDefaultConfig: config.removeDefaultConfig,
+													trial,
+													startTime,
+													cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
+													cacheMode,
+													moduleIsolation: caseModuleIsolation,
+													evalFilePath,
+													evalFileRelativePath: evalMeta.filePath,
+													workspaceRoot,
+													artifactDir: join(runDir, "artifacts"),
+													runId: runState.manifest.id
+												})
 											});
 											return {
 												caseDetail,
@@ -7864,4 +7968,4 @@ function toLastRunStatus(status) {
 	return status === "pending" ? null : status;
 }
 //#endregion
-export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, columnDefSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, defineEval as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, getEvalRegistry as Lt, z$1 as M, evalStatsConfigSchema as Mt, buildTraceTree as N, manualInputDescriptorSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, evalChartsConfigSchema as Pt, evalTime as Q, hashCacheKeySync as R, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, incrementEvalOutput as tt, 
loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadEvalModule as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };
+export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, evalChartsConfigSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, columnDefSchema as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, defineEval as Lt, z$1 as M, evalStatAggregateSchema as Mt, buildTraceTree as N, evalStatsConfigSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, manualInputDescriptorSchema as Pt, evalTime as Q, hashCacheKeySync as R, getEvalRegistry as Rt, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, 
incrementEvalOutput as tt, loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadIsolatedEvalRegistry as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };

package/dist/{runner-WRQdfG0r.mjs → runner-5y6nEBZM.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-DbVFgRO3.mjs";
-import "./src-DlvYXPxG.mjs";
+import { n as createRunner } from "./cli-BEtk5skO.mjs";
+import "./src-BM6LW4ou.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{runner-CyTUvbHE.mjs → runner-pmIYculz.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-WRQdfG0r.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-5y6nEBZM.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{src-DlvYXPxG.mjs → src-BM6LW4ou.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-DT6cje9E.mjs";
-import "./cli-DbVFgRO3.mjs";
+import { Lt as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-C78U4Ir0.mjs";
+import "./cli-BEtk5skO.mjs";
 //#region src/index.ts
 /** Register an eval definition with typed tag support. */
 function defineEval(definition) {