npm - vieval - Versions diffs - 0.0.1 → 0.0.3 - Mend

vieval 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)

package/README.md +8 -5
package/dist/cli/index.d.mts +1 -1
package/dist/cli/index.mjs +1204 -61
package/dist/cli/index.mjs.map +1 -1
package/dist/{config-D2fe1SnT.mjs → config-CHN24egi.mjs} +1 -1
package/dist/{config-D2fe1SnT.mjs.map → config-CHN24egi.mjs.map} +1 -1
package/dist/config.d.mts +2 -3
package/dist/config.mjs +2 -2
package/dist/core/assertions/index.d.mts +1 -1
package/dist/core/inference-executors/index.d.mts +1 -45
package/dist/core/inference-executors/index.mjs +1 -38
package/dist/core/inference-executors/index.mjs.map +1 -1
package/dist/core/processors/results/index.d.mts +1 -1
package/dist/core/runner/index.d.mts +2 -2
package/dist/core/runner/index.mjs +2 -2
package/dist/env-C7X81PWa.mjs +41 -0
package/dist/env-C7X81PWa.mjs.map +1 -0
package/dist/env-DtpjACOW.d.mts +47 -0
package/dist/expect-B2vaoRVZ.d.mts +10 -0
package/dist/{expect-i9WZWGrA.mjs → expect-CaXiUkwY.mjs} +3 -3
package/dist/expect-CaXiUkwY.mjs.map +1 -0
package/dist/expect-extensions-BOzwV5EJ.mjs +197 -0
package/dist/expect-extensions-BOzwV5EJ.mjs.map +1 -0
package/dist/expect.d.mts +1 -1
package/dist/expect.mjs +1 -1
package/dist/{index-DP7jsORl.d.mts → index-BDMEAmf2.d.mts} +246 -3
package/dist/{index-oSXhM1zx.d.mts → index-C3gPFmcR.d.mts} +2 -2
package/dist/index.d.mts +326 -6
package/dist/index.mjs +65 -23
package/dist/index.mjs.map +1 -1
package/dist/{models-D_MsBtYw.mjs → models-DIGdOUpJ.mjs} +1 -1
package/dist/{models-D_MsBtYw.mjs.map → models-DIGdOUpJ.mjs.map} +1 -1
package/dist/plugins/chat-models/index.d.mts +465 -6
package/dist/plugins/chat-models/index.mjs +469 -6
package/dist/plugins/chat-models/index.mjs.map +1 -1
package/dist/{registry-ChOjjdEC.mjs → registry-CHJcTN2W.mjs} +75 -16
package/dist/registry-CHJcTN2W.mjs.map +1 -0
package/dist/{runner-4ZsOveoY.mjs → runner-Dpy-eivM.mjs} +177 -21
package/dist/runner-Dpy-eivM.mjs.map +1 -0
package/dist/testing/expect-extensions.d.mts +44 -38
package/dist/testing/expect-extensions.mjs +1 -1
package/package.json +11 -4
package/dist/expect-0jPJ7Zio.d.mts +0 -2318
package/dist/expect-extensions-CwPtgTz8.mjs +0 -13471
package/dist/expect-extensions-CwPtgTz8.mjs.map +0 -1
package/dist/expect-i9WZWGrA.mjs.map +0 -1
package/dist/magic-string.es-CH1jwzMg.mjs +0 -1013
package/dist/magic-string.es-CH1jwzMg.mjs.map +0 -1
package/dist/plugin-DVaRZY2x.d.mts +0 -84
package/dist/registry-ChOjjdEC.mjs.map +0 -1
package/dist/runner-4ZsOveoY.mjs.map +0 -1

package/dist/cli/index.mjs CHANGED

@@ -1,18 +1,196 @@
 #!/usr/bin/env node
-import { n as consumeModuleRegistrations, r as endModuleRegistration, s as loadVievalCliConfig, t as beginModuleRegistration } from "../registry-ChOjjdEC.mjs";
-import { a as runScheduledTasks, i as RunnerExecutionError, n as createRunnerSchedule, r as createRunnerRuntimeContext, s as collectEvalEntries, t as createTaskExecutionContext } from "../runner-4ZsOveoY.mjs";
+import { c as loadRawVievalConfig, l as loadVievalCliConfig, n as consumeModuleRegistrations, o as detectCliConfigMode, r as endModuleRegistration, t as beginModuleRegistration } from "../registry-CHJcTN2W.mjs";
+import { a as runScheduledTasks, i as RunnerExecutionError, l as createFilesystemTaskCacheRuntime, n as createRunnerSchedule, r as createRunnerRuntimeContext, s as collectEvalEntries, t as createTaskExecutionContext } from "../runner-Dpy-eivM.mjs";
 import process from "node:process";
-import path from "node:path";
+import { access, mkdir, writeFile } from "node:fs/promises";
+import path, { basename, dirname, isAbsolute, join, relative, resolve } from "node:path";
 import { fileURLToPath, pathToFileURL } from "node:url";
 import { errorMessageFrom } from "@moeru/std";
-import c from "tinyrainbow";
+import { existsSync, readFileSync, realpathSync } from "node:fs";
 import meow from "meow";
-import { uniq } from "es-toolkit";
 import { glob } from "tinyglobby";
+import { randomUUID } from "node:crypto";
+import c from "tinyrainbow";
+import { uniq } from "es-toolkit";
+import { createVitest } from "vitest/node";
 import { stripVTControlCharacters } from "node:util";
 import stringWidth from "fast-string-width";
-//#region package.json
-var name = "vieval";
+//#region src/cli/comparison-config.ts
+const supportedWorkspaceConfigFileNames = [
+	"vieval.config.ts",
+	"vieval.config.mts",
+	"vieval.config.cts",
+	"vieval.config.js",
+	"vieval.config.mjs",
+	"vieval.config.cjs",
+	"vieval.config.json"
+];
+async function isReadableFile(filePath) {
+	try {
+		await access(filePath);
+		return true;
+	} catch {
+		return false;
+	}
+}
+function normalizeGlobInput(patterns) {
+	if (patterns == null) return [];
+	return (typeof patterns === "string" ? [patterns] : patterns).map((pattern) => pattern.trim()).filter((pattern) => pattern.length > 0);
+}
+function normalizeMethodShape(method, configDirectory, index) {
+	const id = method.id.trim();
+	const workspace = method.workspace.trim();
+	const project = method.project.trim();
+	const configFilePath = method.configFilePath?.trim();
+	if (id.length === 0) throw new Error(`Comparison method #${index + 1} is missing id.`);
+	if (workspace.length === 0) throw new Error(`Comparison method "${id}" is missing workspace.`);
+	if (project.length === 0) throw new Error(`Comparison method "${id}" is missing project.`);
+	const resolvedWorkspace = isAbsolute(workspace) ? workspace : resolve(configDirectory, workspace);
+	return {
+		configFilePath: configFilePath == null || configFilePath.length === 0 ? void 0 : isAbsolute(configFilePath) ? configFilePath : resolve(configDirectory, configFilePath),
+		id,
+		project,
+		workspace: resolvedWorkspace
+	};
+}
+async function findWorkspaceConfigFile(workspaceDirectory) {
+	for (const fileName of supportedWorkspaceConfigFileNames) {
+		const candidate = join(workspaceDirectory, fileName);
+		if (await isReadableFile(candidate)) return candidate;
+	}
+	return null;
+}
+function createDiscoveredMethodId(configDirectory, workspace, projectName) {
+	const relativeWorkspace = relative(configDirectory, workspace);
+	return `${(relativeWorkspace.length > 0 ? relativeWorkspace : basename(workspace)).replaceAll("\\", "/")}:${projectName}`;
+}
+async function discoverMethodsFromWorkspaceGlobs(args) {
+	const includes = normalizeGlobInput(args.comparison.includesWorkspaces);
+	if (includes.length === 0) return [];
+	const discoveredWorkspaceDirectories = await glob(includes, {
+		absolute: true,
+		cwd: args.configDirectory,
+		ignore: normalizeGlobInput(args.comparison.excludesWorkspaces),
+		onlyDirectories: true
+	});
+	const methods = [];
+	for (const workspaceDirectory of discoveredWorkspaceDirectories.sort((left, right) => left.localeCompare(right))) {
+		const configFilePath = await findWorkspaceConfigFile(workspaceDirectory);
+		if (configFilePath == null) continue;
+		const loadedWorkspaceConfig = await loadVievalCliConfig({
+			configFilePath,
+			cwd: workspaceDirectory
+		});
+		for (const project of loadedWorkspaceConfig.projects) methods.push({
+			configFilePath,
+			id: createDiscoveredMethodId(args.configDirectory, workspaceDirectory, project.name),
+			project: project.name,
+			workspace: workspaceDirectory
+		});
+	}
+	return methods;
+}
+function validateMethodIdsAreUnique(methods) {
+	const methodIds = methods.map((method) => method.id);
+	const duplicatedMethodId = methodIds.find((methodId, index) => methodIds.indexOf(methodId) !== index);
+	if (duplicatedMethodId != null) throw new Error(`Duplicate comparison method id "${duplicatedMethodId}".`);
+}
+function assertComparisonMode(config) {
+	const mode = detectCliConfigMode(config);
+	if (mode !== "comparisons") throw new Error(`Expected comparison-mode config, but received ${mode}-mode config.`);
+}
+function selectComparisonConfig(comparisons, comparisonId) {
+	if (comparisons.length === 0) throw new Error("Comparison config requires at least one comparisons entry.");
+	if (comparisonId == null || comparisonId.trim().length === 0) {
+		if (comparisons.length > 1) throw new Error(`Multiple comparisons found. Provide --comparison. Available ids: ${comparisons.map((item) => item.id).join(", ")}`);
+		return comparisons[0];
+	}
+	const selected = comparisons.find((item) => item.id === comparisonId);
+	if (selected == null) throw new Error(`Unknown comparison id "${comparisonId}".`);
+	return selected;
+}
+function normalizeBenchmark(comparison) {
+	const benchmarkId = comparison.benchmark.id.trim();
+	const sharedCaseNamespace = comparison.benchmark.sharedCaseNamespace.trim();
+	if (benchmarkId.length === 0) throw new Error("Comparison config requires benchmark.id.");
+	if (sharedCaseNamespace.length === 0) throw new Error("Comparison config requires benchmark.sharedCaseNamespace.");
+	return {
+		id: benchmarkId,
+		sharedCaseNamespace
+	};
+}
+/**
+* Loads and validates comparison-mode data from `vieval.config.*`.
+*/
+async function loadVievalComparisonConfig(options = {}) {
+	const cwd = options.cwd ?? process.cwd();
+	try {
+		const loaded = await loadRawVievalConfig({
+			configFilePath: options.configFilePath,
+			cwd
+		});
+		if (loaded.configFilePath == null || loaded.config == null) throw new Error("Failed to find vieval config. Expected vieval.config.*");
+		assertComparisonMode(loaded.config);
+		const selectedComparison = selectComparisonConfig(loaded.config.comparisons, options.comparisonId);
+		const configDirectory = dirname(loaded.configFilePath);
+		const explicitMethods = (selectedComparison.methods ?? []).map((method, index) => normalizeMethodShape(method, configDirectory, index));
+		const discoveredMethods = await discoverMethodsFromWorkspaceGlobs({
+			comparison: selectedComparison,
+			configDirectory
+		});
+		const methods = [...explicitMethods, ...discoveredMethods];
+		if (methods.length === 0) throw new Error("Comparison config resolved zero methods. Configure methods or includesWorkspaces.");
+		validateMethodIdsAreUnique(methods);
+		return {
+			config: {
+				benchmark: normalizeBenchmark(selectedComparison),
+				methods
+			},
+			configFilePath: loaded.configFilePath
+		};
+	} catch (error) {
+		const errorMessage = errorMessageFrom(error) ?? "Unknown comparison config loading error.";
+		const resolvedPath = options.configFilePath ?? "vieval.config";
+		throw new Error(`Failed to load comparison config "${resolvedPath}": ${errorMessage}`);
+	}
+}
+//#endregion
+//#region src/cli/report-compare.ts
+/**
+* Builds a compact compare report sorted by hybrid/exact score.
+*/
+function buildCompareReportArtifact(args) {
+	const rows = args.methods.map((method) => {
+		const overall = method.output.projects[0]?.result?.overall;
+		return {
+			exactAverage: overall?.exactAverage ?? null,
+			hybridAverage: overall?.hybridAverage ?? null,
+			methodId: method.methodId,
+			runCount: overall?.runCount ?? 0
+		};
+	});
+	rows.sort((left, right) => {
+		const leftHybrid = left.hybridAverage ?? Number.NEGATIVE_INFINITY;
+		const rightHybrid = right.hybridAverage ?? Number.NEGATIVE_INFINITY;
+		if (leftHybrid !== rightHybrid) return rightHybrid - leftHybrid;
+		const leftExact = left.exactAverage ?? Number.NEGATIVE_INFINITY;
+		return (right.exactAverage ?? Number.NEGATIVE_INFINITY) - leftExact;
+	});
+	return {
+		benchmarkId: args.benchmarkId,
+		methods: rows,
+		reportPath: args.reportPath
+	};
+}
+/**
+* Writes compare report artifact as JSON.
+*/
+async function writeCompareReportArtifact(args) {
+	const outputPath = resolve(args.outputPath);
+	await mkdir(dirname(outputPath), { recursive: true });
+	await writeFile(outputPath, `${JSON.stringify(args.artifact, null, 2)}\n`, "utf-8");
+	return outputPath;
+}
 //#endregion
 //#region src/cli/discovery.ts
 /**
@@ -33,6 +211,58 @@ async function discoverEvalFiles(options) {
 	})).sort((left, right) => left.localeCompare(right));
 }
 //#endregion
+//#region src/cli/module-runtime.ts
+/**
+* Loads eval modules and returns a normalized eval-module map.
+*
+* Use when:
+* - CLI collection needs Vite/Vitest-powered module resolution and transforms
+* - eval files should be imported with the same runtime semantics as Vitest
+*
+* Expects:
+* - `projectRoot` points at the project that owns the eval files
+* - each `evalFilePaths` entry is an absolute file path
+*
+* Returns:
+* - eval modules keyed by stable file href + optional registration suffixes
+*/
+async function loadEvalModulesWithVitestRuntime(evalFilePaths, projectRoot) {
+	const loadedModules = {};
+	const runtime = await createVitest("test", {
+		config: false,
+		root: projectRoot,
+		run: false,
+		silent: true,
+		watch: false
+	});
+	try {
+		for (const evalFilePath of evalFilePaths) {
+			const moduleHref = pathToFileURL(evalFilePath).href;
+			beginModuleRegistration(moduleHref);
+			try {
+				const moduleValue = await runtime.import(moduleHref);
+				const registeredDefinitions = consumeModuleRegistrations(moduleHref);
+				const defaultDefinition = moduleValue.default;
+				const definitions = [...registeredDefinitions, ...defaultDefinition == null ? [] : [defaultDefinition]];
+				const deduplicatedDefinitions = definitions.filter((definition, index) => {
+					const key = `${definition.name}::${definition.description}::${definition.task?.id ?? ""}`;
+					return definitions.findIndex((candidate) => `${candidate.name}::${candidate.description}::${candidate.task?.id ?? ""}` === key) === index;
+				});
+				if (deduplicatedDefinitions.length === 0) continue;
+				for (const [definitionIndex, definition] of deduplicatedDefinitions.entries()) {
+					const moduleKey = definitionIndex === 0 ? moduleHref : `${moduleHref}#registration-${definitionIndex + 1}`;
+					loadedModules[moduleKey] = { default: definition };
+				}
+			} finally {
+				endModuleRegistration();
+			}
+		}
+	} finally {
+		await runtime.close();
+	}
+	return loadedModules;
+}
+//#endregion
 //#region src/cli/reporters/noop-reporter.ts
 /**
 * Creates a reporter that intentionally does nothing.
@@ -684,7 +914,144 @@ function getTextDisplayWidth(text) {
 	return stringWidth(stripVTControlCharacters(text));
 }
 //#endregion
+//#region src/cli/reporters/vitest-compat-reporter.ts
+function isReporterReferenceTuple(reference) {
+	return Array.isArray(reference);
+}
+function isAbsoluteLikePath(value) {
+	return value.startsWith("/") || value.startsWith("./") || value.startsWith("../") || /^[A-Z]:[\\/]/i.test(value);
+}
+async function loadReporterModule(path) {
+	if (isAbsoluteLikePath(path)) return import(pathToFileURL(path).href);
+	return import(path);
+}
+function normalizeReporterReference(reference) {
+	if (isReporterReferenceTuple(reference)) return {
+		options: reference[1],
+		value: reference[0]
+	};
+	return {
+		options: void 0,
+		value: reference
+	};
+}
+function createReporterInstance(moduleValue, options) {
+	const value = moduleValue.default ?? moduleValue;
+	if (value == null) return null;
+	if (typeof value === "function") return new value(options);
+	if (typeof value === "object") return value;
+	return null;
+}
+async function emitToReporters(reporters, callback) {
+	await Promise.all(reporters.map(async (reporter) => {
+		try {
+			await callback(reporter);
+		} catch {}
+	}));
+}
+/**
+* Creates a project-level vitest-compatible reporter bridge.
+*
+* Use when:
+* - `vieval` should reuse vitest-like reporter callbacks without changing CLI output contracts
+*
+* Expects:
+* - references point to modules whose default export is a reporter instance or constructor
+*
+* Returns:
+* - `null` when no reporter references are configured
+*/
+async function createVievalVitestCompatReporterBridge(options) {
+	if (options.references.length === 0) return null;
+	const loadedReporters = [];
+	for (const reference of options.references) {
+		const normalized = normalizeReporterReference(reference);
+		try {
+			const instance = createReporterInstance(typeof normalized.value === "string" ? await loadReporterModule(normalized.value) : normalized.value, normalized.options);
+			if (instance != null) loadedReporters.push(instance);
+		} catch {}
+	}
+	if (loadedReporters.length === 0) return null;
+	const modulesByTaskId = /* @__PURE__ */ new Map();
+	const casesByCompositeId = /* @__PURE__ */ new Map();
+	function getOrCreateModule(taskId) {
+		const existing = modulesByTaskId.get(taskId);
+		if (existing != null) return existing;
+		const created = {
+			id: taskId,
+			name: taskId,
+			projectName: options.projectName
+		};
+		modulesByTaskId.set(taskId, created);
+		return created;
+	}
+	function getOrCreateCase(taskId, caseId) {
+		const compositeId = `${taskId}::${caseId}`;
+		const existing = casesByCompositeId.get(compositeId);
+		if (existing != null) return existing;
+		const created = {
+			id: caseId,
+			module: getOrCreateModule(taskId),
+			name: caseId,
+			state: "pending"
+		};
+		casesByCompositeId.set(compositeId, created);
+		return created;
+	}
+	return {
+		async onCaseEnd(payload) {
+			const taskCase = getOrCreateCase(payload.taskId, payload.caseId);
+			taskCase.state = payload.state;
+			await emitToReporters(loadedReporters, (reporter) => reporter.onTestCaseResult?.(taskCase));
+		},
+		async onCaseStart(payload) {
+			const taskCase = getOrCreateCase(payload.taskId, payload.caseId);
+			await emitToReporters(loadedReporters, (reporter) => reporter.onTestCaseReady?.(taskCase));
+		},
+		async onRunEnd(run) {
+			const modules = [...modulesByTaskId.values()];
+			const errors = run.failed ? [{ message: "vieval run failed" }] : [];
+			await emitToReporters(loadedReporters, (reporter) => reporter.onTestRunEnd?.(modules, errors, run.failed ? "failed" : "passed"));
+		},
+		async onRunStart() {
+			const specifications = [...modulesByTaskId.values()].map((module) => ({
+				moduleId: module.id,
+				projectName: module.projectName
+			}));
+			await emitToReporters(loadedReporters, (reporter) => reporter.onTestRunStart?.(specifications));
+		},
+		async onTaskEnd(payload) {
+			const module = getOrCreateModule(payload.taskId);
+			if (payload.state === "failed") {
+				const syntheticCase = getOrCreateCase(payload.taskId, `${payload.taskId}:task`);
+				syntheticCase.state = "failed";
+				await emitToReporters(loadedReporters, (reporter) => reporter.onTestCaseResult?.(syntheticCase));
+			}
+			await emitToReporters(loadedReporters, (reporter) => reporter.onTestModuleEnd?.(module));
+		},
+		async onTaskQueued(payload) {
+			const module = getOrCreateModule(payload.taskId);
+			await emitToReporters(loadedReporters, (reporter) => reporter.onTestModuleQueued?.(module));
+			await emitToReporters(loadedReporters, (reporter) => reporter.onTestModuleCollected?.(module));
+		},
+		async onTaskStart(payload) {
+			const module = getOrCreateModule(payload.taskId);
+			await emitToReporters(loadedReporters, (reporter) => reporter.onTestModuleStart?.(module));
+		}
+	};
+}
+//#endregion
 //#region src/cli/run.ts
+/**
+* Returns true when output contains at least one failing project/task/case outcome.
+*/
+function hasRunFailures(output) {
+	return output.projects.some((project) => {
+		if (project.errorMessage != null) return true;
+		if (project.caseSummary != null && project.caseSummary.failed > 0) return true;
+		return (project.caseFailures?.length ?? 0) > 0;
+	});
+}
 function shouldUseColor() {
 	if (process.env.NO_COLOR != null) return false;
 	const forceColor = process.env.FORCE_COLOR;
@@ -733,37 +1100,94 @@ function formatDuration(durationMs, colors) {
 	const rounded = Math.round(durationMs);
 	return (rounded > 1e3 ? colors.yellow : colors.green)(` ${rounded}${colors.dim("ms")}`);
 }
-async function loadEvalModules(evalFilePaths) {
-	const loadedModules = {};
-	for (const [moduleIndex, evalFilePath] of evalFilePaths.entries()) {
-		const moduleHref = pathToFileURL(evalFilePath).href;
-		const importHref = `${moduleHref}?vieval_load=${Date.now()}_${moduleIndex}`;
-		beginModuleRegistration(importHref);
-		try {
-			const moduleValue = await import(importHref);
-			const registeredDefinitions = consumeModuleRegistrations(importHref);
-			const defaultDefinition = moduleValue.default;
-			const definitions = [...registeredDefinitions, ...defaultDefinition == null ? [] : [defaultDefinition]];
-			const deduplicatedDefinitions = definitions.filter((definition, index) => {
-				const key = `${definition.name}::${definition.description}::${definition.task?.id ?? ""}`;
-				return definitions.findIndex((candidate) => `${candidate.name}::${candidate.description}::${candidate.task?.id ?? ""}` === key) === index;
-			});
-			if (deduplicatedDefinitions.length === 0) continue;
-			for (const [definitionIndex, definition] of deduplicatedDefinitions.entries()) {
-				const moduleKey = definitionIndex === 0 ? moduleHref : `${moduleHref}#registration-${definitionIndex + 1}`;
-				loadedModules[moduleKey] = { default: definition };
-			}
-		} finally {
-			endModuleRegistration();
-		}
-	}
-	return loadedModules;
-}
 function filterProjectsByName(projects, names) {
 	if (names.length === 0) return [...projects];
 	const nameSet = new Set(names);
 	return projects.filter((project) => nameSet.has(project.name));
 }
+function sanitizeIdentitySegment(value) {
+	const normalized = value.trim();
+	if (normalized.length === 0) return "default";
+	return normalized.replace(/[^\w.-]+/g, "-");
+}
+function createRunIdentity(options) {
+	const workspaceId = sanitizeIdentitySegment(options.workspace ?? "default-workspace");
+	const experimentId = sanitizeIdentitySegment(options.experiment ?? "default-experiment");
+	return {
+		attemptId: sanitizeIdentitySegment(options.attempt ?? `attempt-${(/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-")}`),
+		experimentId,
+		runId: `run-${Date.now()}-${randomUUID().slice(0, 8)}`,
+		workspaceId
+	};
+}
+function deriveReportProjectId(output) {
+	const uniqueProjectNames = [...new Set(output.projects.map((project) => project.name))];
+	if (uniqueProjectNames.length === 1) return sanitizeIdentitySegment(uniqueProjectNames[0] ?? "default-project");
+	return "multi-project";
+}
+function createEventRecorder(identity) {
+	const events = [];
+	const taskProjectMap = /* @__PURE__ */ new Map();
+	return {
+		events,
+		record(event, payload, metadata) {
+			const maybeTaskPayload = payload;
+			const taskId = metadata?.taskId ?? maybeTaskPayload?.taskId;
+			const caseId = metadata?.caseId ?? payload?.caseId;
+			const projectName = metadata?.projectName ?? maybeTaskPayload?.projectName;
+			if (taskId != null && projectName != null) taskProjectMap.set(taskId, projectName);
+			events.push({
+				attemptId: identity.attemptId,
+				caseId,
+				data: payload,
+				event,
+				experimentId: identity.experimentId,
+				projectId: taskId == null ? void 0 : taskProjectMap.get(taskId),
+				runId: identity.runId,
+				schemaVersion: 1,
+				taskId,
+				timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+				version: 1,
+				workspaceId: identity.workspaceId
+			});
+		}
+	};
+}
+function createReporterWithEventCapture(reporter, recordEvent) {
+	return {
+		dispose() {
+			reporter.dispose();
+		},
+		onCaseEnd(payload) {
+			recordEvent("CaseEnded", payload);
+			reporter.onCaseEnd(payload);
+		},
+		onCaseStart(payload) {
+			recordEvent("CaseStarted", payload);
+			reporter.onCaseStart(payload);
+		},
+		onRunEnd(payload) {
+			recordEvent("RunEnded", payload);
+			reporter.onRunEnd(payload);
+		},
+		onRunStart(payload) {
+			recordEvent("RunStarted", payload);
+			reporter.onRunStart(payload);
+		},
+		onTaskEnd(payload) {
+			recordEvent("TaskEnded", payload);
+			reporter.onTaskEnd(payload);
+		},
+		onTaskQueued(payload) {
+			recordEvent("TaskQueued", payload);
+			reporter.onTaskQueued(payload);
+		},
+		onTaskStart(payload) {
+			recordEvent("TaskStarted", payload);
+			reporter.onTaskStart(payload);
+		}
+	};
+}
 function applyRunEnvironment(env) {
 	const envEntries = Object.entries(env);
 	if (envEntries.length === 0) return () => {};
@@ -876,7 +1300,7 @@ function createTaskQueuePayload(task, projectName) {
 function createTaskCaseReporterId(payload) {
 	return `${payload.index}:${encodeURIComponent(payload.name)}`;
 }
-function createTaskReporterHooks(task, reporter, projectCaseCounters) {
+function createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
 	function syncCaseTotal(total) {
 		reporter.onTaskQueued({
 			taskId: task.id,
@@ -896,8 +1320,21 @@ function createTaskReporterHooks(task, reporter, projectCaseCounters) {
 				}
 			}
 			syncCaseTotal(payload.total);
+			if (payload.state === "failed" && payload.errorMessage != null && projectCaseFailures != null) projectCaseFailures.push({
+				caseId,
+				caseName: payload.name,
+				errorMessage: payload.errorMessage,
+				taskId: task.id
+			});
 			reporter.onCaseEnd({
 				caseId,
+				errorMessage: payload.errorMessage,
+				state: payload.state,
+				taskId: task.id
+			});
+			vitestCompatReporter?.onCaseEnd({
+				caseId,
+				errorMessage: payload.errorMessage,
 				state: payload.state,
 				taskId: task.id
 			});
@@ -910,32 +1347,49 @@ function createTaskReporterHooks(task, reporter, projectCaseCounters) {
 				caseName: payload.name,
 				taskId: task.id
 			});
+			vitestCompatReporter?.onCaseStart({
+				caseId,
+				taskId: task.id
+			});
+		},
+		onEvent(payload) {
+			recordEvent(payload.event, payload.data, {
+				caseId: payload.caseId,
+				projectName,
+				taskId: task.id
+			});
 		}
 	};
 }
-function createCliTaskExecutionContext(task, models, reporter, projectCaseCounters) {
+function createCliTaskExecutionContext(task, models, cacheRootDirectory, cacheProjectName, workspaceId, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
 	return {
 		...createTaskExecutionContext({
+			cache: createFilesystemTaskCacheRuntime({
+				cacheRootDirectory,
+				projectName: cacheProjectName,
+				workspaceId
+			}),
 			models,
 			task
 		}),
-		reporterHooks: createTaskReporterHooks(task, reporter, projectCaseCounters)
+		reporterHooks: createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter)
 	};
 }
-function resolveTaskReporterHooks(task, context, reporter, projectCaseCounters) {
-	return context.reporterHooks ?? createTaskReporterHooks(task, reporter, projectCaseCounters);
+function resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
+	return context.reporterHooks ?? createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
 }
 function getFailedTaskId(error) {
 	if (error instanceof RunnerExecutionError) return error.taskId;
 	return null;
 }
-function createAutoTaskExecutor(reporter, projectCaseCounters) {
+function createAutoTaskExecutor(reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
 	return async (task, context) => {
 		const taskDefinition = task.entry.task;
 		if (taskDefinition == null) throw new Error(`Missing eval task definition for entry "${task.entry.id}".`);
 		const output = await taskDefinition.run({
+			cache: context.cache,
 			model: context.model,
-			reporterHooks: resolveTaskReporterHooks(task, context, reporter, projectCaseCounters),
+			reporterHooks: resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
 			task
 		});
 		return {
@@ -985,7 +1439,7 @@ async function prepareProject(project) {
 			include: project.include,
 			root: project.root
 		});
-		const entries = collectEvalEntries(await loadEvalModules(evalFilePaths), runtimeContext);
+		const entries = collectEvalEntries(await loadEvalModulesWithVitestRuntime(evalFilePaths, project.root), runtimeContext);
 		const tasks = createRunnerSchedule({
 			evalMatrix: project.evalMatrix,
 			entries,
@@ -997,6 +1451,7 @@ async function prepareProject(project) {
 			kind: "summary",
 			summary: {
 				caseSummary: null,
+				caseFailures: [],
 				discoveredEvalFileCount: evalFilePaths.length,
 				durationMs: Date.now() - startedAt,
 				entryCount: entries.length,
@@ -1024,6 +1479,7 @@ async function prepareProject(project) {
 			kind: "summary",
 			summary: {
 				caseSummary: null,
+				caseFailures: [],
 				discoveredEvalFileCount: 0,
 				durationMs: Date.now() - startedAt,
 				entryCount: 0,
@@ -1037,7 +1493,7 @@ async function prepareProject(project) {
 		};
 	}
 }
-async function executePreparedProject(prepared, reporter, counters) {
+async function executePreparedProject(prepared, identity, cacheProjectName, reporter, counters, recordEvent) {
 	const settledTaskIds = /* @__PURE__ */ new Set();
 	const projectCaseCounters = {
 		failed: 0,
@@ -1045,17 +1501,24 @@ async function executePreparedProject(prepared, reporter, counters) {
 		seenCaseIds: /* @__PURE__ */ new Set(),
 		skipped: 0
 	};
-	const rawTaskExecutor = prepared.project.executor ?? createAutoTaskExecutor(reporter, projectCaseCounters);
+	const projectCaseFailures = [];
+	const vitestCompatReporter = await createVievalVitestCompatReporterBridge({
+		projectName: prepared.name,
+		references: prepared.project.reporters
+	});
+	const rawTaskExecutor = prepared.project.executor ?? createAutoTaskExecutor(reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
 	const taskExecutor = async (task, context) => {
 		return {
 			...await rawTaskExecutor(task, context),
 			matrix: cloneScheduledTaskMatrix(task)
 		};
 	};
+	for (const task of prepared.tasks) await vitestCompatReporter?.onTaskQueued({ taskId: task.id });
+	await vitestCompatReporter?.onRunStart();
 	try {
 		const aggregated = await runScheduledTasks(prepared.tasks, taskExecutor, {
 			createExecutionContext(task) {
-				return createCliTaskExecutionContext(task, prepared.project.models, reporter, projectCaseCounters);
+				return createCliTaskExecutionContext(task, prepared.project.models, resolve(prepared.project.root, ".vieval", "cache"), cacheProjectName ?? prepared.name, identity.workspaceId, reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
 			},
 			onTaskEnd(task, state) {
 				settledTaskIds.add(task.id);
@@ -1063,6 +1526,10 @@ async function executePreparedProject(prepared, reporter, counters) {
 					state,
 					taskId: task.id
 				});
+				vitestCompatReporter?.onTaskEnd({
+					state,
+					taskId: task.id
+				});
 				if (state === "passed") {
 					counters.passedTasks += 1;
 					return;
@@ -1071,8 +1538,10 @@ async function executePreparedProject(prepared, reporter, counters) {
 			},
 			onTaskStart(task) {
 				reporter.onTaskStart({ taskId: task.id });
+				vitestCompatReporter?.onTaskStart({ taskId: task.id });
 			}
 		});
+		await vitestCompatReporter?.onRunEnd({ failed: false });
 		return {
 			caseSummary: {
 				failed: projectCaseCounters.failed,
@@ -1080,6 +1549,7 @@ async function executePreparedProject(prepared, reporter, counters) {
 				skipped: projectCaseCounters.skipped,
 				total: projectCaseCounters.seenCaseIds.size
 			},
+			caseFailures: projectCaseFailures,
 			discoveredEvalFileCount: prepared.discoveredEvalFileCount,
 			durationMs: Date.now() - prepared.startedAt,
 			entryCount: prepared.entryCount,
@@ -1099,6 +1569,10 @@ async function executePreparedProject(prepared, reporter, counters) {
 				state: "failed",
 				taskId: failedTaskId
 			});
+			await vitestCompatReporter?.onTaskEnd({
+				state: "failed",
+				taskId: failedTaskId
+			});
 		}
 		for (const task of prepared.tasks) {
 			if (settledTaskIds.has(task.id)) continue;
@@ -1108,7 +1582,12 @@ async function executePreparedProject(prepared, reporter, counters) {
 				state: "skipped",
 				taskId: task.id
 			});
+			await vitestCompatReporter?.onTaskEnd({
+				state: "skipped",
+				taskId: task.id
+			});
 		}
+		await vitestCompatReporter?.onRunEnd({ failed: true });
 		return {
 			caseSummary: {
 				failed: projectCaseCounters.failed,
@@ -1116,6 +1595,7 @@ async function executePreparedProject(prepared, reporter, counters) {
 				skipped: projectCaseCounters.skipped,
 				total: projectCaseCounters.seenCaseIds.size
 			},
+			caseFailures: projectCaseFailures,
 			discoveredEvalFileCount: prepared.discoveredEvalFileCount,
 			durationMs: Date.now() - prepared.startedAt,
 			entryCount: prepared.entryCount,
@@ -1128,6 +1608,14 @@ async function executePreparedProject(prepared, reporter, counters) {
 		};
 	}
 }
/**
* Persists one run's summary and captured events beneath `reportOut`.
*
* The target directory is resolved from `reportOut`, the workspace id, the
* project id derived from `output`, and the experiment, attempt, and run ids.
* It is created recursively before `run-summary.json` (pretty-printed `output`)
* and `events.jsonl` (one JSON document per line) are written as UTF-8.
*
* Returns:
* - the resolved report directory path
*/
async function writeRunReportArtifacts(output, events, identity, reportOut) {
	const projectId = deriveReportProjectId(output);
	const { attemptId, experimentId, runId, workspaceId } = identity;
	const reportDirectory = resolve(reportOut, workspaceId, projectId, experimentId, attemptId, runId);
	await mkdir(reportDirectory, { recursive: true });

	const summaryText = `${JSON.stringify(output, null, 2)}\n`;
	await writeFile(resolve(reportDirectory, "run-summary.json"), summaryText, "utf-8");

	// An empty event list produces an empty file rather than a lone newline.
	const eventLines = events.map((event) => JSON.stringify(event));
	const eventsText = eventLines.length > 0 ? `${eventLines.join("\n")}\n` : "";
	await writeFile(resolve(reportDirectory, "events.jsonl"), eventsText, "utf-8");

	return reportDirectory;
}
 /**
 * Runs vieval orchestration from config and returns project-level summaries.
 *
@@ -1145,12 +1633,14 @@ async function executePreparedProject(prepared, reporter, counters) {
 * - keeping business-agent eval files near their implementation packages
 */
 async function runVievalCli(options = {}) {
+	const identity = createRunIdentity(options);
 	const loadedConfig = await loadVievalCliConfig({
 		configFilePath: options.configFilePath,
 		cwd: options.cwd
 	});
 	const restoreEnvironment = applyRunEnvironment(loadedConfig.env);
-	const reporter = createRunReporter(options.reporter);
+	const eventRecorder = createEventRecorder(identity);
+	const reporter = createReporterWithEventCapture(createRunReporter(options.reporter), eventRecorder.record);
 	try {
 		const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
 		const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
@@ -1176,7 +1666,7 @@ async function runVievalCli(options = {}) {
 				projectSummaries.push(preparedProject.summary);
 				continue;
 			}
-			projectSummaries.push(await executePreparedProject(preparedProject.prepared, reporter, reporterCounters));
+			projectSummaries.push(await executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, reporter, reporterCounters, eventRecorder.record));
 		}
 		reporter.onRunEnd({
 			failedTasks: reporterCounters.failedTasks,
@@ -1184,10 +1674,17 @@ async function runVievalCli(options = {}) {
 			skippedTasks: reporterCounters.skippedTasks + skippedSummaryTasks,
 			totalTasks
 		});
-		return {
+		const output = {
+			attemptId: identity.attemptId,
 			configFilePath: loadedConfig.configFilePath,
-			projects: projectSummaries
+			experimentId: identity.experimentId,
+			projects: projectSummaries,
+			reportDirectory: null,
+			runId: identity.runId,
+			workspaceId: identity.workspaceId
 		};
+		if (options.reportOut != null) output.reportDirectory = await writeRunReportArtifacts(output, eventRecorder.events, identity, options.reportOut);
+		return output;
 	} finally {
 		reporter.dispose();
 		restoreEnvironment();
@@ -1239,7 +1736,9 @@ function formatVievalCliRunOutput(output) {
 		totalTasks += project.taskCount;
 		executedTasks += project.result?.overall.runCount ?? 0;
 		const badge = createProjectBadge(project.name, colors, colorEnabled);
-		if (project.errorMessage != null) {
+		const isFailed = project.errorMessage != null;
+		const hasFailedCases = (project.caseSummary?.failed ?? 0) > 0 || (project.caseFailures?.length ?? 0) > 0;
+		if (isFailed) {
 			failedProjects += 1;
 			lines.push(` ${colors.red("❯")} ${badge}${formatDuration(project.durationMs, colors)}`);
 			lines.push(`   ${project.errorMessage}`);
@@ -1256,7 +1755,8 @@ function formatVievalCliRunOutput(output) {
 			if (scheduleBreakdown != null) lines.push(`   ${scheduleBreakdown}`);
 			continue;
 		}
-		passedProjects += 1;
+		if (hasFailedCases) failedProjects += 1;
+		else passedProjects += 1;
 		const hybridAverage = project.result?.overall.hybridAverage;
 		const hybridAverageLabel = hybridAverage == null ? "n/a" : String(hybridAverage);
 		const runCount = project.result?.overall.runCount ?? 0;
@@ -1264,10 +1764,18 @@ function formatVievalCliRunOutput(output) {
 		const caseSummaryLabel = project.caseSummary == null ? "" : `, cases ${project.caseSummary.passed} passed | ${project.caseSummary.failed} failed`;
 		const detailsLabel = colors.dim(` ${project.discoveredEvalFileCount} files, ${project.entryCount} entries, ${runCount} runs${caseSummaryLabel}, hybrid ${hybridAverageLabel}`);
 		const matrixSummary = formatMatrixSummary(project.matrixSummary);
-		lines.push(` ${colors.green("✓")} ${badge}${countLabel}${detailsLabel}${formatDuration(project.durationMs, colors)}`);
+		lines.push(` ${hasFailedCases ? colors.red("❯") : colors.green("✓")} ${badge}${countLabel}${detailsLabel}${formatDuration(project.durationMs, colors)}`);
 		if (matrixSummary != null) lines.push(`   ${colors.dim(matrixSummary)}`);
 		const scheduleBreakdown = formatScheduleBreakdown(project);
 		if (scheduleBreakdown != null) lines.push(`   ${scheduleBreakdown}`);
+		if ((project.caseFailures?.length ?? 0) > 0) {
+			lines.push(`   ${colors.red("Failed cases:")}`);
+			for (const failure of project.caseFailures.slice(0, 5)) {
+				lines.push(`   ${colors.red(`- ${failure.caseName} (${failure.taskId})`)}`);
+				for (const line of failure.errorMessage.split("\n")) lines.push(`     ${colors.red(line)}`);
+			}
+			if (project.caseFailures.length > 5) lines.push(`   ${colors.dim(`... ${project.caseFailures.length - 5} more failed cases`)}`);
+		}
 	}
 	lines.push("");
 	if (failedProjects > 0 || skippedProjects > 0) {
@@ -1280,19 +1788,127 @@ function formatVievalCliRunOutput(output) {
 	return lines.join("\n");
 }
 //#endregion
+//#region src/cli/compare.ts
/** Help text handed to meow for the `vieval compare` command. */
const compareHelpText = `
  Compare multiple methods on one benchmark.
  Usage
    $ vieval compare [--config <path>] [--comparison <id>] [--output <path>] [--format <format>]
  Options
    --config      Config file path (default: nearest vieval.config.*)
    --comparison  Comparison entry id from config.comparisons
    --output      Optional output artifact path
    --format      Console output format: table | json (default: table)
`;
/**
* Strips an optional leading `--` separator and an optional `compare`
* command token from `argv`.
*
* Returns:
* - a new array; the input array is never returned or modified
*/
function normalizeCliArgv$4(argv) {
	let start = 0;
	if (argv[start] === "--") start += 1;
	if (argv[start] === "compare") start += 1;
	return argv.slice(start);
}
/**
* Parses `vieval compare` arguments with meow.
*
* Returns:
* - `comparisonId`, `configFilePath`, and `output` taken from the matching flags
* - `format`, which is `"json"` when `--format` equals "json" in any letter
*   case and `"table"` for every other value
*/
function parseCompareCliArguments(argv) {
	const { flags } = meow(compareHelpText, {
		argv: normalizeCliArgv$4(argv),
		flags: {
			config: { type: "string" },
			comparison: { type: "string" },
			format: {
				default: "table",
				type: "string"
			},
			output: { type: "string" }
		},
		importMeta: import.meta
	});
	// Lowercase first so `--format JSON` is honoured, matching the report
	// analyze/index parsers instead of silently falling back to the table view.
	const normalizedFormat = flags.format.toLowerCase();
	return {
		comparisonId: flags.comparison,
		configFilePath: flags.config,
		format: normalizedFormat === "json" ? "json" : "table",
		output: flags.output
	};
}
/**
* Runs one compare session from `vieval.config.*` comparison-mode config.
*
* Each configured method is executed sequentially through `runVievalCli`; the
* first method whose run contains a project with an `errorMessage` aborts the
* session with an error. A compare artifact is then built, optionally written
* to `--output`, and printed to stdout as JSON or as a ranked text listing.
*
* Returns:
* - `{ benchmarkId, methods }`, where each method entry holds its raw run output
*/
async function runCompareCli(argv) {
	const parsed = parseCompareCliArguments(argv);
	const loaded = await loadVievalComparisonConfig({
		comparisonId: parsed.comparisonId,
		configFilePath: parsed.configFilePath,
		// NOTE(review): the compare argument parser in this file returns no `cwd`
		// field, so this looks like it is always undefined — confirm intent.
		cwd: parsed.cwd
	});

	const methodResults = [];
	for (const method of loaded.config.methods) {
		const methodWorkspace = resolve(method.workspace);
		const output = await runVievalCli({
			cacheProjectName: loaded.config.benchmark.sharedCaseNamespace,
			configFilePath: method.configFilePath ?? resolve(methodWorkspace, "vieval.config.ts"),
			cwd: methodWorkspace,
			project: [method.project],
			workspace: loaded.config.benchmark.id
		});
		const failedProject = output.projects.find((project) => project.errorMessage != null);
		if (failedProject != null) throw new Error(`Comparison method "${method.id}" failed: ${failedProject.errorMessage}`);
		methodResults.push({
			methodId: method.id,
			output
		});
	}

	const runOutput = {
		benchmarkId: loaded.config.benchmark.id,
		methods: methodResults
	};
	const artifact = buildCompareReportArtifact({
		benchmarkId: runOutput.benchmarkId,
		methods: runOutput.methods,
		reportPath: loaded.configFilePath
	});
	if (parsed.output != null) {
		await writeCompareReportArtifact({
			artifact,
			outputPath: parsed.output
		});
	}

	if (parsed.format === "json") {
		process.stdout.write(`${JSON.stringify(artifact, null, 2)}\n`);
		return runOutput;
	}

	// Averages may be absent; those render as "n/a" instead of a number.
	const formatAverage = (value) => value == null ? "n/a" : value.toFixed(3);
	const lines = ["COMPARE  vieval", `Benchmark  ${artifact.benchmarkId}`];
	artifact.methods.forEach((method, index) => {
		const hybrid = formatAverage(method.hybridAverage);
		const exact = formatAverage(method.exactAverage);
		lines.push(`${index + 1}. ${method.methodId}  hybrid=${hybrid} exact=${exact} runs=${method.runCount}`);
	});
	process.stdout.write(`${lines.join("\n")}\n`);
	return runOutput;
}
/**
* Runs the compare command and converts any failure into a stderr message
* plus a non-zero `process.exitCode` instead of letting the error propagate.
*/
async function runCompareCliOrExit(argv) {
	try {
		await runCompareCli(argv);
	} catch (failure) {
		const reason = errorMessageFrom(failure) ?? "Unknown compare command failure.";
		process.stderr.write(`[vieval compare] ${reason}\n`);
		process.exitCode = 1;
	}
}
+//#endregion
+//#region package.json
// Package name emitted under the bundle's `package.json` region.
var name = "vieval";
+//#endregion
 //#region src/cli/eval-run.ts
 const evalRunHelpText = `
   Execute vieval projects from discovered or explicit config.
   Usage
-    $ vieval run [--config <path>] [--project <name>] [--json]
+    $ vieval run [--config <path>] [--project <name>] [--json] [--report-out <path>]
   Options
     --config     Config file path
     --project    Project name to execute; may be repeated
+    --workspace  Workspace id used in report artifacts
+    --experiment Experiment id used in report artifacts
+    --attempt    Attempt id used in report artifacts
+    --report-out Report output root directory
     --json       Print machine-readable JSON output
 `;
-function normalizeCliArgv$1(argv) {
+function normalizeCliArgv$3(argv) {
 	const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
 	return normalizedArgv[0] === "run" ? normalizedArgv.slice(1) : normalizedArgv;
 }
@@ -1302,7 +1918,7 @@ function normalizeProjectNames(projectNames) {
 }
 function parseCliArguments(argv) {
 	const cli = meow(evalRunHelpText, {
-		argv: normalizeCliArgv$1(argv),
+		argv: normalizeCliArgv$3(argv),
 		importMeta: import.meta,
 		flags: {
 			config: { type: "string" },
@@ -1313,13 +1929,21 @@ function parseCliArguments(argv) {
 			project: {
 				isMultiple: true,
 				type: "string"
-			}
+			},
+			workspace: { type: "string" },
+			experiment: { type: "string" },
+			attempt: { type: "string" },
+			reportOut: { type: "string" }
 		}
 	});
 	return {
+		attempt: cli.flags.attempt,
 		configFilePath: cli.flags.config,
+		experiment: cli.flags.experiment,
 		json: cli.flags.json === true,
-		project: normalizeProjectNames(cli.flags.project)
+		project: normalizeProjectNames(cli.flags.project),
+		reportOut: cli.flags.reportOut,
+		workspace: cli.flags.workspace
 	};
 }
 function isDirectExecution$1() {
@@ -1345,14 +1969,20 @@ async function main$1() {
 	const parsed = parseCliArguments(process.argv.slice(2));
 	try {
 		const output = await runVievalCli({
+			attempt: parsed.attempt,
 			configFilePath: parsed.configFilePath,
-			project: parsed.project
+			experiment: parsed.experiment,
+			project: parsed.project,
+			reportOut: parsed.reportOut,
+			workspace: parsed.workspace
 		});
 		if (parsed.json) {
 			process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
+			if (hasRunFailures(output)) process.exitCode = 1;
 			return;
 		}
 		process.stdout.write(`${formatVievalCliRunOutput(output)}\n`);
+		if (hasRunFailures(output)) process.exitCode = 1;
 	} catch (error) {
 		const errorMessage = errorMessageFrom(error) ?? "Unknown CLI failure.";
 		process.stderr.write(`[${name}] ${errorMessage}\n`);
@@ -1361,6 +1991,485 @@ async function main$1() {
 }
 if (isDirectExecution$1()) await main$1();
 //#endregion
+//#region src/cli/report-artifacts.ts
/**
* Resolves one or more `run-summary.json` paths from a report location.
*
* Use when:
* - callers may pass a run directory, summary file path, or a report root
*
* Resolution order:
* - an existing path ending in `.json` is returned as the only entry
* - otherwise `<reportPath>/run-summary.json` is returned when it exists
* - otherwise the directory tree is globbed for `run-summary.json` files
*
* Returns:
* - sorted absolute summary file paths
*/
async function resolveRunSummaryPaths(reportPath) {
	const absoluteReportPath = resolve(reportPath);
	if (absoluteReportPath.endsWith(".json") && existsSync(absoluteReportPath)) return [absoluteReportPath];

	const directSummaryPath = resolve(absoluteReportPath, "run-summary.json");
	if (existsSync(directSummaryPath)) return [directSummaryPath];

	const discoveredPaths = await glob("**/run-summary.json", {
		absolute: true,
		cwd: absoluteReportPath
	});
	discoveredPaths.sort((left, right) => left.localeCompare(right));
	return discoveredPaths;
}
/**
* Reads one run report artifact set from `run-summary.json` and sibling `events.jsonl`.
*
* Use when:
* - report analysis needs both run aggregate output and event count metadata
*
* Blank lines in `events.jsonl` are ignored, and only the `caseId`, `data`,
* `event`, and `taskId` fields of each event are kept. A missing events file
* yields an empty event list.
*/
function readReportRunArtifact(summaryFilePath) {
	const reportDirectory = resolve(summaryFilePath, "..");
	const summary = JSON.parse(readFileSync(summaryFilePath, "utf-8"));
	const eventsFilePath = resolve(reportDirectory, "events.jsonl");

	const events = [];
	if (existsSync(eventsFilePath)) {
		for (const line of readFileSync(eventsFilePath, "utf-8").split("\n")) {
			if (line.trim().length === 0) continue;
			const { caseId, data, event, taskId } = JSON.parse(line);
			events.push({
				caseId,
				data,
				event,
				taskId
			});
		}
	}

	return {
		events,
		eventsCount: events.length,
		reportDirectory,
		summary,
		summaryFilePath
	};
}
/**
* Reads all run artifacts found under `reportPath`.
*
* Use when:
* - callers need multi-run analysis from a directory root
*
* Returns:
* - one artifact per resolved summary file, in the resolver's order
*/
async function readReportArtifacts(reportPath) {
	const summaryFilePaths = await resolveRunSummaryPaths(reportPath);
	return summaryFilePaths.map((summaryFilePath) => readReportRunArtifact(summaryFilePath));
}
/**
* Creates a compact summary row for one run artifact.
*
* Use when:
* - table/csv/jsonl exports should stay stable and cheap to parse
*
* A project counts as failed when it carries an `errorMessage` and as executed
* when its `executed` flag is truthy. Identity fields missing from the summary
* are reported as `null`.
*/
function summarizeReportRunArtifact(artifact) {
	const { eventsCount, reportDirectory, summary } = artifact;
	const projectNames = [];
	let executedProjects = 0;
	let failedProjects = 0;
	let totalTasks = 0;
	for (const project of summary.projects) {
		projectNames.push(project.name);
		if (project.executed) executedProjects += 1;
		if (project.errorMessage != null) failedProjects += 1;
		totalTasks += project.taskCount;
	}
	// Key order is kept stable because rows are serialized as JSON lines.
	return {
		attemptId: summary.attemptId ?? null,
		eventsCount,
		executedProjects,
		experimentId: summary.experimentId ?? null,
		failedProjects,
		projectNames,
		reportDirectory,
		runId: summary.runId ?? null,
		totalProjects: summary.projects.length,
		totalTasks,
		workspaceId: summary.workspaceId ?? null
	};
}
+//#endregion
+//#region src/cli/report-analyze.ts
/** Help text handed to meow for the `vieval report analyze` command. */
const reportAnalyzeHelpText = `
  Analyze generated vieval report artifacts.
  Usage
    $ vieval report analyze <reportPath> [options]
  Options
    --format       Output format: table | json | jsonl | csv (default: table)
    --workspace    Workspace id filter
    --project      Project name filter (exact)
    --experiment   Experiment id filter
    --attempt      Attempt id filter
    --run          Run id filter
    --task-state   Keep runs containing at least one task in this state
    --case-state   Keep runs containing at least one case in this state
    --contains     Keep runs containing this text in event name or payload
    --error-contains Keep runs containing this text in project errors or event payload
    --run-matrix   Keep runs matching run-matrix selector "key=value[,key=value]"
    --eval-matrix  Keep runs matching eval-matrix selector "key=value[,key=value]"
`;
/**
* Strips an optional leading `--` separator, then an optional `report analyze`
* or bare `analyze` command prefix, from `argv`.
*
* Returns:
* - a new array holding the remaining arguments
*/
function normalizeCliArgv$2(argv) {
	const args = argv[0] === "--" ? argv.slice(1) : [...argv];
	const [first, second] = args;
	if (first === "analyze") return args.slice(1);
	if (first === "report" && second === "analyze") return args.slice(2);
	return args;
}
/**
* Parses `vieval report analyze` arguments with meow.
*
* The first positional input is the required report path. `--format` is
* matched case-insensitively and anything other than json, jsonl, or csv
* becomes "table". State filters and matrix selectors are validated by their
* dedicated normalizers, which throw on malformed input.
*
* Throws:
* - Error when the report path is missing or empty
*/
function parseReportAnalyzeCliArguments(argv) {
	const { flags, input } = meow(reportAnalyzeHelpText, {
		argv: normalizeCliArgv$2(argv),
		flags: {
			attempt: { type: "string" },
			caseState: { type: "string" },
			contains: { type: "string" },
			evalMatrix: { type: "string" },
			errorContains: { type: "string" },
			experiment: { type: "string" },
			format: {
				default: "table",
				type: "string"
			},
			project: { type: "string" },
			runMatrix: { type: "string" },
			run: { type: "string" },
			taskState: { type: "string" },
			workspace: { type: "string" }
		},
		importMeta: import.meta
	});

	const [reportPath] = input;
	if (reportPath == null || reportPath.length === 0) throw new Error("Missing required <reportPath> argument.");

	const requestedFormat = flags.format.toLowerCase();
	const format = ["json", "jsonl", "csv"].includes(requestedFormat) ? requestedFormat : "table";

	return {
		attempt: flags.attempt,
		caseState: normalizeStateFilter(flags.caseState),
		contains: flags.contains,
		evalMatrix: parseMatrixSelector(flags.evalMatrix),
		errorContains: flags.errorContains,
		experiment: flags.experiment,
		format,
		project: flags.project,
		reportPath,
		runMatrix: parseMatrixSelector(flags.runMatrix),
		run: flags.run,
		taskState: normalizeStateFilter(flags.taskState),
		workspace: flags.workspace
	};
}
/**
* Normalizes a task/case state filter to lowercase.
*
* Returns:
* - `undefined` when no filter was supplied
* - "passed", "failed", or "skipped" after trimming and lowercasing
*
* Throws:
* - Error for any other value
*/
function normalizeStateFilter(value) {
	if (value == null) return;
	const candidate = value.trim().toLowerCase();
	switch (candidate) {
		case "passed":
		case "failed":
		case "skipped":
			return candidate;
		default:
			throw new Error(`Unsupported state filter "${value}". Expected "passed", "failed", or "skipped".`);
	}
}
/**
* Parses a matrix selector of the form "key=value[,key=value]".
*
* Segments are split on commas and trimmed; empty segments are ignored. Each
* remaining segment is split on its first "=", and both sides are trimmed. A
* key that appears more than once keeps its last value.
*
* Returns:
* - `undefined` when no selector was supplied
* - otherwise a plain object mapping each key to its string value
*
* Throws:
* - Error when a segment has no "=", an empty key, or an empty value
*/
function parseMatrixSelector(value) {
	if (value == null) return;
	const pairs = [];
	for (const rawSegment of value.split(",")) {
		const segment = rawSegment.trim();
		if (segment.length === 0) continue;
		const separatorIndex = segment.indexOf("=");
		const key = separatorIndex < 0 ? "" : segment.slice(0, separatorIndex).trim();
		const parsedValue = separatorIndex < 0 ? "" : segment.slice(separatorIndex + 1).trim();
		if (key.length === 0 || parsedValue.length === 0) throw new Error(`Invalid matrix selector segment "${segment}". Expected "key=value".`);
		pairs.push([key, parsedValue]);
	}
	// Object.fromEntries defines own data properties, so a key such as
	// "__proto__" is kept as a real selector entry. Plain assignment on `{}`
	// would silently discard it and the filter would then match every run.
	return Object.fromEntries(pairs);
}
/**
* Keeps the rows that satisfy every identity filter present in `parsed`.
*
* `workspace`, `experiment`, `attempt`, and `run` are compared for strict
* equality against the row's matching id; `project` must appear in the row's
* `projectNames`. Filters that are null or undefined are not applied.
*/
function filterAnalyzeRows(rows, parsed) {
	const identityChecks = [
		["workspace", "workspaceId"],
		["experiment", "experimentId"],
		["attempt", "attemptId"],
		["run", "runId"]
	];
	return rows.filter((row) => {
		const identityMatches = identityChecks.every(([filterKey, rowKey]) => parsed[filterKey] == null || row[rowKey] === parsed[filterKey]);
		if (!identityMatches) return false;
		return parsed.project == null || row.projectNames.includes(parsed.project);
	});
}
/**
* Case-insensitively checks whether the JSON serialization of `value`
* contains `needle`.
*
* Returns:
* - `true` when the trimmed needle is empty
* - `false` when `value` has no JSON representation (for example `undefined`)
* - otherwise whether the lowercased JSON text includes the lowercased needle
*/
function includesNeedle(value, needle) {
	const normalizedNeedle = needle.trim().toLowerCase();
	if (normalizedNeedle.length === 0) return true;
	// JSON.stringify returns undefined for values such as `undefined`, which is
	// what an event without a `data` payload supplies; treat that as "no match"
	// rather than crashing on `.toLowerCase()`.
	const serialized = JSON.stringify(value);
	if (serialized === undefined) return false;
	return serialized.toLowerCase().includes(normalizedNeedle);
}
/**
* Reports whether the artifact recorded at least one "TaskEnded" event whose
* `data.state` equals `targetState`.
*/
function hasTaskState(artifact, targetState) {
	for (const { data, event } of artifact.events) {
		if (event === "TaskEnded" && data?.state === targetState) return true;
	}
	return false;
}
/**
* Reports whether the artifact recorded at least one "CaseEnded" event whose
* `data.state` equals `targetState`.
*/
function hasCaseState(artifact, targetState) {
	for (const { data, event } of artifact.events) {
		if (event === "CaseEnded" && data?.state === targetState) return true;
	}
	return false;
}
/**
* Checks whether every selector entry matches the stringified matrix value
* stored under the same key.
*
* Returns:
* - `true` when the selector is empty
* - `false` when the matrix is missing or lacks one of the selector keys
* - otherwise whether `String(matrix[key])` equals each expected value
*/
function matchesMatrixSelector(matrix, selector) {
	const selectorEntries = Object.entries(selector);
	if (selectorEntries.length === 0) return true;
	// A run without this matrix cannot satisfy a non-empty selector; report a
	// mismatch instead of throwing on the property access below.
	if (matrix == null) return false;
	return selectorEntries.every(([key, expectedValue]) => {
		// Require the key to exist so an absent value is not stringified to
		// "undefined" and mistaken for a selector asking for that literal text.
		return Object.hasOwn(matrix, key) && String(matrix[key]) === expectedValue;
	});
}
/**
* Reports whether any run of any project that has a result matches
* `selector` on its `matrix.run` values.
*/
function hasRunMatrixMatch(artifact, selector) {
	for (const project of artifact.summary.projects) {
		// Projects without a result contribute no runs to inspect.
		if (project.result == null) continue;
		if (project.result.runs.some((run) => matchesMatrixSelector(run.matrix.run, selector))) return true;
	}
	return false;
}
/**
* Reports whether any run of any project that has a result matches
* `selector` on its `matrix.eval` values.
*/
function hasEvalMatrixMatch(artifact, selector) {
	for (const project of artifact.summary.projects) {
		// Projects without a result contribute no runs to inspect.
		if (project.result == null) continue;
		if (project.result.runs.some((run) => matchesMatrixSelector(run.matrix.eval, selector))) return true;
	}
	return false;
}
/**
* Applies the outcome-oriented filters (matrix selectors, task/case states,
* and text searches) to one artifact.
*
* Filters that are null or undefined are skipped; the artifact must satisfy
* every filter that is present. `contains` searches each event's name and
* payload, while `errorContains` searches project error messages first and
* then event payloads.
*/
function matchesOutcomeFilters(artifact, parsed) {
	const { caseState, contains, errorContains, evalMatrix, runMatrix, taskState } = parsed;
	if (runMatrix != null && !hasRunMatrixMatch(artifact, runMatrix)) return false;
	if (evalMatrix != null && !hasEvalMatrixMatch(artifact, evalMatrix)) return false;
	if (taskState != null && !hasTaskState(artifact, taskState)) return false;
	if (caseState != null && !hasCaseState(artifact, caseState)) return false;

	if (contains != null) {
		const foundInEvents = artifact.events.some(({ data, event }) => includesNeedle({
			data,
			event
		}, contains));
		if (!foundInEvents) return false;
	}

	if (errorContains != null) {
		const foundInProjectErrors = artifact.summary.projects.some((project) => project.errorMessage != null && includesNeedle(project.errorMessage, errorContains));
		const found = foundInProjectErrors || artifact.events.some((event) => includesNeedle(event.data, errorContains));
		if (!found) return false;
	}

	return true;
}
/**
* Loads every run artifact under `parsed.reportPath`, applies the identity
* filters and then the outcome filters, and builds the analyze output.
*
* Returns:
* - `runs`: the summary rows that survived both filter passes
* - `filteredRunCount` / `totalRunCount`: row counts after and before filtering
* - `experimentSummaries`: rollups built from the surviving rows
*/
async function readReportAnalyzeOutput(parsed) {
	const artifacts = await readReportArtifacts(parsed.reportPath);
	const rows = artifacts.map((artifact) => summarizeReportRunArtifact(artifact));

	// Rows and artifacts are joined through their shared report directory.
	const rowByDirectory = new Map();
	for (const row of filterAnalyzeRows(rows, parsed)) rowByDirectory.set(row.reportDirectory, row);

	const filteredRows = [];
	for (const artifact of artifacts) {
		if (!rowByDirectory.has(artifact.reportDirectory)) continue;
		if (!matchesOutcomeFilters(artifact, parsed)) continue;
		const row = rowByDirectory.get(artifact.reportDirectory);
		if (row != null) filteredRows.push(row);
	}

	return {
		experimentSummaries: buildExperimentSummaries(filteredRows),
		filteredRunCount: filteredRows.length,
		runs: filteredRows,
		totalRunCount: rows.length
	};
}
/** Rounds `value` to at most six decimal places. */
function roundMetric(value) {
	const fixedText = value.toFixed(6);
	return Number.parseFloat(fixedText);
}
/** Returns the arithmetic mean of `values`, or 0 for an empty list. */
function computeAverage(values) {
	const count = values.length;
	if (count === 0) return 0;
	let total = 0;
	for (const value of values) total += value;
	return total / count;
}
/**
* Returns the standard deviation of `values`, dividing the summed squared
* deviations by the number of values. An empty list yields 0.
*/
function computeStandardDeviation(values) {
	const count = values.length;
	if (count === 0) return 0;
	let total = 0;
	for (const value of values) total += value;
	const mean = total / count;
	let squaredDeviationTotal = 0;
	for (const value of values) squaredDeviationTotal += (value - mean) ** 2;
	return Math.sqrt(squaredDeviationTotal / count);
}
/**
* Builds the `workspace::experiment` grouping key for a run row, substituting
* "unknown-workspace" / "unknown-experiment" for missing ids.
*/
function createExperimentGroupKey(row) {
	const workspace = row.workspaceId ?? "unknown-workspace";
	const experiment = row.experimentId ?? "unknown-experiment";
	return [workspace, experiment].join("::");
}
/**
* Builds experiment-level rollups from filtered run rows.
*
* Use when:
* - CLI consumers need stability and reliability summaries above per-run data
*
* Rows with a missing workspace, experiment, or attempt id are grouped under
* "unknown-workspace", "unknown-experiment", or "unknown-attempt". A run counts
* as successful when it has no failed projects.
*
* Returns:
* - one summary row per `workspaceId + experimentId` group, sorted by
*   workspace id and then experiment id; attempt summaries are sorted by
*   attempt id
*/
function buildExperimentSummaries(rows) {
	const sumOf = (items, pick) => items.reduce((sum, item) => sum + pick(item), 0);
	const countSuccessfulRuns = (items) => items.filter((row) => row.failedProjects === 0).length;

	const groups = /* @__PURE__ */ new Map();
	for (const row of rows) {
		const workspaceId = row.workspaceId ?? "unknown-workspace";
		const experimentId = row.experimentId ?? "unknown-experiment";
		// Key on the JSON-encoded id pair and keep the ids on the group itself.
		// Re-splitting a "workspace::experiment" string would corrupt ids that
		// contain "::" and could merge distinct id pairs into one group.
		const groupKey = JSON.stringify([workspaceId, experimentId]);
		const group = groups.get(groupKey);
		if (group == null) {
			groups.set(groupKey, {
				experimentId,
				rows: [row],
				workspaceId
			});
			continue;
		}
		group.rows.push(row);
	}

	return [...groups.values()].map(({ experimentId, rows: groupRows, workspaceId }) => {
		const attemptToRuns = /* @__PURE__ */ new Map();
		for (const row of groupRows) {
			const attemptId = row.attemptId ?? "unknown-attempt";
			const attemptRows = attemptToRuns.get(attemptId);
			if (attemptRows == null) {
				attemptToRuns.set(attemptId, [row]);
				continue;
			}
			attemptRows.push(row);
		}

		// Every group and every attempt holds at least one row, so the
		// divisions below never divide by zero.
		const attemptSummaries = [...attemptToRuns.entries()].map(([attemptId, attemptRows]) => {
			return {
				attemptId,
				failedProjects: sumOf(attemptRows, (row) => row.failedProjects),
				runCount: attemptRows.length,
				runIds: attemptRows.map((row) => row.runId).filter((runId) => runId != null).sort((left, right) => left.localeCompare(right)),
				successRate: roundMetric(countSuccessfulRuns(attemptRows) / attemptRows.length),
				totalEvents: sumOf(attemptRows, (row) => row.eventsCount),
				totalTasks: sumOf(attemptRows, (row) => row.totalTasks)
			};
		}).sort((left, right) => left.attemptId.localeCompare(right.attemptId));

		const attemptSuccessRates = attemptSummaries.map((summary) => summary.successRate);
		return {
			attemptCount: attemptToRuns.size,
			attemptSummaries,
			attemptSuccessRateStats: {
				avg: roundMetric(computeAverage(attemptSuccessRates)),
				max: roundMetric(Math.max(...attemptSuccessRates)),
				min: roundMetric(Math.min(...attemptSuccessRates)),
				stdev: roundMetric(computeStandardDeviation(attemptSuccessRates))
			},
			experimentId,
			failedProjects: sumOf(groupRows, (row) => row.failedProjects),
			runCount: groupRows.length,
			successRate: roundMetric(countSuccessfulRuns(groupRows) / groupRows.length),
			totalEvents: sumOf(groupRows, (row) => row.eventsCount),
			totalTasks: sumOf(groupRows, (row) => row.totalTasks),
			workspaceId
		};
	}).sort((left, right) => {
		const workspaceCompare = left.workspaceId.localeCompare(right.workspaceId);
		if (workspaceCompare !== 0) return workspaceCompare;
		return left.experimentId.localeCompare(right.experimentId);
	});
}
/**
* Renders the analyze output as pipe-separated text: a title line with the
* filtered/total run counts and experiment group count, a header line, and one
* line per run. Missing ids are shown as "n/a".
*/
function formatTableOutput$1(output) {
	const title = `ANALYZE vieval report: ${output.filteredRunCount}/${output.totalRunCount} runs (${output.experimentSummaries.length} experiment groups)`;
	const header = "Run ID | Workspace | Experiment | Attempt | Projects(executed/total) | FailedProjects | Tasks | Events";
	const tableLines = [title, header];
	for (const row of output.runs) {
		const cells = [
			row.runId ?? "n/a",
			row.workspaceId ?? "n/a",
			row.experimentId ?? "n/a",
			row.attemptId ?? "n/a",
			`${row.executedProjects}/${row.totalProjects}`,
			row.failedProjects,
			row.totalTasks,
			row.eventsCount
		];
		tableLines.push(cells.join(" | "));
	}
	return tableLines.join("\n");
}
/**
* Renders the analyze output as CSV: one header line followed by one line per
* run, joined with "\n" and without a trailing newline.
*
* `reportDirectory` and `projectNames` (joined with "|") are always quoted.
* The run, workspace, experiment, and attempt ids are quoted only when they
* contain a comma, double quote, or line break; missing ids become an empty
* field. Embedded double quotes are doubled.
*/
function formatCsvOutput(output) {
	const quoteField = (text) => `"${text.replaceAll("\"", "\"\"")}"`;
	// Ids can be supplied by users (for example through --workspace), so they
	// must not be able to break the column layout.
	const escapeIdentifier = (value) => {
		const text = String(value ?? "");
		return /[",\r\n]/.test(text) ? quoteField(text) : text;
	};

	const headerLine = [
		"runId",
		"workspaceId",
		"experimentId",
		"attemptId",
		"totalProjects",
		"executedProjects",
		"failedProjects",
		"totalTasks",
		"eventsCount",
		"reportDirectory",
		"projectNames"
	].join(",");
	const dataLines = output.runs.map((row) => [
		escapeIdentifier(row.runId),
		escapeIdentifier(row.workspaceId),
		escapeIdentifier(row.experimentId),
		escapeIdentifier(row.attemptId),
		row.totalProjects.toString(),
		row.executedProjects.toString(),
		row.failedProjects.toString(),
		row.totalTasks.toString(),
		row.eventsCount.toString(),
		quoteField(row.reportDirectory),
		quoteField(row.projectNames.join("|"))
	].join(","));
	return [headerLine, ...dataLines].join("\n");
}
/**
* Entry point for `vieval report analyze`: parses arguments, loads and filters
* the report artifacts, and writes the result to stdout in the requested
* format. Any failure is reported on stderr and sets `process.exitCode` to 1.
*/
async function runReportAnalyzeCli(argv) {
	try {
		const parsed = parseReportAnalyzeCliArguments(argv);
		const output = await readReportAnalyzeOutput(parsed);
		let rendered;
		switch (parsed.format) {
			case "json":
				rendered = `${JSON.stringify(output, null, 2)}\n`;
				break;
			case "jsonl": {
				// No trailing newline is emitted when there are no runs.
				const jsonl = output.runs.map((run) => JSON.stringify(run)).join("\n");
				rendered = `${jsonl}${jsonl.length > 0 ? "\n" : ""}`;
				break;
			}
			case "csv":
				rendered = `${formatCsvOutput(output)}\n`;
				break;
			default:
				rendered = `${formatTableOutput$1(output)}\n`;
		}
		process.stdout.write(rendered);
	} catch (failure) {
		const reason = errorMessageFrom(failure) ?? "Unknown report analyze failure.";
		process.stderr.write(`[vieval report analyze] ${reason}\n`);
		process.exitCode = 1;
	}
}
+//#endregion
+//#region src/cli/report-index.ts
/** Help text handed to meow for the `vieval report index` command. */
const reportIndexHelpText = `
  Build report indexes from generated vieval artifacts.
  Usage
    $ vieval report index <reportPath> [--output <path>] [--format <format>]
  Options
    --output      Output file path (default: <reportPath>/index/runs.jsonl)
    --format      Console output format: table | json | jsonl (default: table)
`;
/**
* Strips an optional leading `--` separator, then an optional `report index`
* or bare `index` command prefix, from `argv`.
*
* Returns:
* - a new array holding the remaining arguments
*/
function normalizeCliArgv$1(argv) {
	const args = argv[0] === "--" ? argv.slice(1) : [...argv];
	const [first, second] = args;
	if (first === "index") return args.slice(1);
	if (first === "report" && second === "index") return args.slice(2);
	return args;
}
/**
* Parses `vieval report index` arguments with meow.
*
* The first positional input is the required report path. `--format` is
* matched case-insensitively; anything other than json or jsonl becomes
* "table".
*
* Throws:
* - Error when the report path is missing or empty
*/
function parseReportIndexCliArguments(argv) {
	const { flags, input } = meow(reportIndexHelpText, {
		argv: normalizeCliArgv$1(argv),
		flags: {
			format: {
				default: "table",
				type: "string"
			},
			output: { type: "string" }
		},
		importMeta: import.meta
	});

	const [reportPath] = input;
	if (reportPath == null || reportPath.length === 0) throw new Error("Missing required <reportPath> argument.");

	const requestedFormat = flags.format.toLowerCase();
	const format = ["json", "jsonl"].includes(requestedFormat) ? requestedFormat : "table";
	return {
		format,
		output: flags.output,
		reportPath
	};
}
/**
* Summarizes every run artifact under `parsed.reportPath` and writes the rows
* as JSON lines to the index file.
*
* The index path is `parsed.output` when provided, otherwise
* `<reportPath>/index/runs.jsonl`; its parent directory is created recursively.
*
* Returns:
* - the resolved index file path, the number of indexed runs, and the rows
*/
async function writeIndexFile(parsed) {
	const artifacts = await readReportArtifacts(parsed.reportPath);
	const rows = artifacts.map((artifact) => summarizeReportRunArtifact(artifact));

	const indexFilePath = resolve(parsed.output ?? resolve(parsed.reportPath, "index", "runs.jsonl"));
	await mkdir(dirname(indexFilePath), { recursive: true });

	// An empty index is written as an empty file rather than a lone newline.
	const serializedRows = rows.map((row) => JSON.stringify(row)).join("\n");
	const trailingNewline = serializedRows.length > 0 ? "\n" : "";
	await writeFile(indexFilePath, `${serializedRows}${trailingNewline}`, "utf-8");

	return {
		indexFilePath,
		indexedRunCount: rows.length,
		rows
	};
}
/**
* Renders the index result as three text lines: a title, the index file path,
* and the number of indexed runs.
*/
function formatTableOutput(output) {
	const { indexFilePath, indexedRunCount } = output;
	return `INDEX  vieval report\nPath      ${indexFilePath}\nRun count ${indexedRunCount}`;
}
/**
* Entry point for `vieval report index`: parses arguments, writes the index
* file, and prints the result to stdout in the requested format. Any failure
* is reported on stderr and sets `process.exitCode` to 1.
*/
async function runReportIndexCli(argv) {
	try {
		const parsed = parseReportIndexCliArguments(argv);
		const output = await writeIndexFile(parsed);
		let rendered;
		switch (parsed.format) {
			case "json":
				rendered = `${JSON.stringify(output, null, 2)}\n`;
				break;
			case "jsonl": {
				// No trailing newline is emitted when there are no rows.
				const jsonl = output.rows.map((row) => JSON.stringify(row)).join("\n");
				rendered = `${jsonl}${jsonl.length > 0 ? "\n" : ""}`;
				break;
			}
			default:
				rendered = `${formatTableOutput(output)}\n`;
		}
		process.stdout.write(rendered);
	} catch (failure) {
		const reason = errorMessageFrom(failure) ?? "Unknown report index failure.";
		process.stderr.write(`[vieval report index] ${reason}\n`);
		process.exitCode = 1;
	}
}
+//#endregion
 //#region src/cli/index.ts
 const topLevelHelpText = `
   Execute and report evaluation projects.
@@ -1370,10 +2479,15 @@ const topLevelHelpText = `
   Commands
     run            Discover and execute eval projects
+    compare        Compare multiple workspaces/methods on one benchmark
+    report         Analyze and index generated report artifacts
   Examples
     $ vieval run
-    $ vieval run --config vieval.config.ts --project chess --json
+    $ vieval run --config vieval.config.ts --project chess --json --report-out .vieval/reports
+    $ vieval compare --config vieval.config.ts --comparison agent-memory
+    $ vieval report analyze .vieval/reports/my-run
+    $ vieval report index .vieval/reports --output .vieval/reports/index/runs.jsonl
 `;
 function normalizeCliArgv(argv) {
 	return argv[0] === "--" ? argv.slice(1) : [...argv];
@@ -1391,7 +2505,7 @@ function parseTopLevelCliArguments(argv) {
 		command: "help",
 		commandArgv: []
 	};
-	if (command !== "run") throw new Error(`Unsupported vieval command "${command ?? "(none)"}". Expected "run".`);
+	if (command !== "run" && command !== "report" && command !== "compare") throw new Error(`Unsupported vieval command "${command ?? "(none)"}". Expected "run", "compare", or "report".`);
 	return {
 		command,
 		commandArgv: normalizedArgv.slice(1)
@@ -1403,20 +2517,49 @@ async function runTopLevelCli(argv) {
 		process.stdout.write(`${topLevelHelpText.trim()}\n`);
 		return;
 	}
+	if (parsed.command === "report") {
+		const reportSubcommand = parsed.commandArgv[0];
+		if (reportSubcommand === "analyze") {
+			await runReportAnalyzeCli(parsed.commandArgv);
+			return;
+		}
+		if (reportSubcommand === "index") {
+			await runReportIndexCli(parsed.commandArgv);
+			return;
+		}
+		throw new Error(`Unsupported vieval report command "${reportSubcommand ?? "(none)"}". Expected "analyze" or "index".`);
+	}
+	if (parsed.command === "compare") {
+		await runCompareCliOrExit(parsed.commandArgv);
+		return;
+	}
 	const runArguments = parseCliArguments(parsed.commandArgv);
 	const output = await runVievalCli({
+		attempt: runArguments.attempt,
 		configFilePath: runArguments.configFilePath,
-		project: runArguments.project
+		experiment: runArguments.experiment,
+		project: runArguments.project,
+		reportOut: runArguments.reportOut,
+		workspace: runArguments.workspace
 	});
 	if (runArguments.json) {
 		process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
+		if (hasRunFailures(output)) process.exitCode = 1;
 		return;
 	}
 	process.stdout.write(`${formatVievalCliRunOutput(output)}\n`);
+	if (hasRunFailures(output)) process.exitCode = 1;
 }
 function isDirectExecution() { // Reports whether this module looks like the script the process was launched with (process.argv[1]).
 	if (!process.argv[1]) return false;
-	return path.resolve(process.argv[1]) === fileURLToPath(import.meta.url);
+	const resolvedArgvPath = path.resolve(process.argv[1]);
+	const currentModulePath = fileURLToPath(import.meta.url);
+	try {
+		if (realpathSync.native(resolvedArgvPath) === realpathSync.native(currentModulePath)) return true; // compare canonical (realpath) forms so a symlinked launch path can still match
+	} catch { // realpath threw (e.g. a path that does not exist): fall back to comparing the unresolved paths
+		if (resolvedArgvPath === currentModulePath) return true;
+	}
+	return resolvedArgvPath.replaceAll("\\", "/").endsWith("/.bin/vieval"); // last resort: accept any launch path ending in /.bin/vieval, with backslashes normalized to forward slashes
 }
 async function main() {
 	try {