npm - vieval - Versions diffs - 0.0.8 → 0.0.10 - Mend

vieval 0.0.8 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

package/README.md +51 -4
package/dist/bin/vieval.mjs +1 -1
package/dist/cli/index.mjs +1 -1
package/dist/{cli-Dao25VxV.mjs → cli-DTDgaqeI.mjs} +669 -599
package/dist/cli-DTDgaqeI.mjs.map +1 -0
package/dist/config.d.mts +1 -1
package/dist/core/assertions/index.d.mts +1 -1
package/dist/core/inference-executors/index.d.mts +1 -1
package/dist/core/inference-executors/index.mjs +10 -4
package/dist/core/inference-executors/index.mjs.map +1 -1
package/dist/core/processors/results/index.d.mts +1 -1
package/dist/core/runner/index.d.mts +2 -2
package/dist/core/runner/index.mjs +2 -2
package/dist/core/scheduler/index.d.mts +1 -1
package/dist/core/scheduler/index.mjs +2 -2
package/dist/core/scheduler/index.mjs.map +1 -1
package/dist/{env-BeHv_5mo.d.mts → env-DfWZy_n4.d.mts} +14 -9
package/dist/env-nV5rVErX.mjs +35 -0
package/dist/env-nV5rVErX.mjs.map +1 -0
package/dist/{index-fakXoZEe.d.mts → index-Bg0atWBF.d.mts} +4 -3
package/dist/{index-BkjyCInx.d.mts → index-D_aMeWqO.d.mts} +2 -2
package/dist/index.d.mts +2 -2
package/dist/index.mjs +21 -26
package/dist/index.mjs.map +1 -1
package/dist/plugins/chat-models/index.d.mts +1 -1
package/dist/plugins/chat-models/index.mjs +15 -13
package/dist/plugins/chat-models/index.mjs.map +1 -1
package/dist/{registry-BHGMxjpA.mjs → registry-DMnwE_mY.mjs} +54 -10
package/dist/registry-DMnwE_mY.mjs.map +1 -0
package/package.json +1 -1
package/dist/cli-Dao25VxV.mjs.map +0 -1
package/dist/env-BFSjny07.mjs +0 -41
package/dist/env-BFSjny07.mjs.map +0 -1
package/dist/registry-BHGMxjpA.mjs.map +0 -1

package/dist/{cli-Dao25VxV.mjs → cli-DTDgaqeI.mjs} RENAMED Viewed

@@ -1,16 +1,17 @@
-import { a as createOpenTelemetryRuntime, c as detectCliConfigMode, d as loadVievalCliConfig, n as consumeModuleRegistrations, o as createNoopTelemetryRuntime, r as endModuleRegistration, t as beginModuleRegistration, u as loadRawVievalConfig } from "./registry-BHGMxjpA.mjs";
+import { a as createOpenTelemetryRuntime, c as detectCliConfigMode, d as loadVievalCliConfig, n as consumeModuleRegistrations, o as createNoopTelemetryRuntime, r as endModuleRegistration, t as beginModuleRegistration, u as loadRawVievalConfig } from "./registry-DMnwE_mY.mjs";
 import { createSchedulerRuntime } from "./core/scheduler/index.mjs";
 import { RunnerExecutionError, collectEvalEntries, createFilesystemTaskCacheRuntime, createRunnerRuntimeContext, createRunnerSchedule, createTaskExecutionContext, runScheduledTasks } from "./core/runner/index.mjs";
 import process from "node:process";
 import { errorMessageFrom } from "@moeru/std";
 import meow from "meow";
+import { access, mkdir, mkdtemp, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
 import { basename, dirname, isAbsolute, join, relative, resolve } from "node:path";
-import { access, mkdir, writeFile } from "node:fs/promises";
 import { glob } from "tinyglobby";
 import { pathToFileURL } from "node:url";
+import { existsSync, readFileSync } from "node:fs";
 import { randomUUID } from "node:crypto";
 import c from "tinyrainbow";
-import { existsSync, readFileSync } from "node:fs";
 import { uniq } from "es-toolkit";
 import { createVitest } from "vitest/node";
 import { formatDuration, intervalToDuration } from "date-fns";
@@ -156,393 +157,99 @@ async function loadVievalComparisonConfig(options = {}) {
 	}
 }
 //#endregion
-//#region src/cli/report-compare.ts
-/**
-* Builds a compact compare report sorted by hybrid/exact score.
-*/
-function buildCompareReportArtifact(args) {
-	const rows = args.methods.map((method) => {
-		const overall = method.output.projects[0]?.result?.overall;
-		return {
-			exactAverage: overall?.exactAverage ?? null,
-			hybridAverage: overall?.hybridAverage ?? null,
-			methodId: method.methodId,
-			runCount: overall?.runCount ?? 0
-		};
-	});
-	rows.sort((left, right) => {
-		const leftHybrid = left.hybridAverage ?? Number.NEGATIVE_INFINITY;
-		const rightHybrid = right.hybridAverage ?? Number.NEGATIVE_INFINITY;
-		if (leftHybrid !== rightHybrid) return rightHybrid - leftHybrid;
-		const leftExact = left.exactAverage ?? Number.NEGATIVE_INFINITY;
-		return (right.exactAverage ?? Number.NEGATIVE_INFINITY) - leftExact;
-	});
-	return {
-		benchmarkId: args.benchmarkId,
-		methods: rows,
-		reportPath: args.reportPath
-	};
-}
-/**
-* Writes compare report artifact as JSON.
-*/
-async function writeCompareReportArtifact(args) {
-	const outputPath = resolve(args.outputPath);
-	await mkdir(dirname(outputPath), { recursive: true });
-	await writeFile(outputPath, `${JSON.stringify(args.artifact, null, 2)}\n`, "utf-8");
-	return outputPath;
-}
-//#endregion
-//#region src/cli/discovery.ts
-/**
-* Discovers eval files using include/exclude globs relative to project root.
-*
-* Before:
-* - Absolute path file list from recursive filesystem walk
-*
-* After:
-* - Filtered absolute path list matching include/exclude rules
-*/
-async function discoverEvalFiles(options) {
-	return uniq(await glob([...options.include], {
-		absolute: true,
-		cwd: options.root,
-		ignore: [...options.exclude],
-		onlyFiles: true
-	})).sort((left, right) => left.localeCompare(right));
-}
-//#endregion
-//#region src/cli/module-runtime.ts
+//#region src/cli/report-records.ts
 /**
-* Loads eval modules and returns a normalized eval-module map.
+* Builds normalized case records from lifecycle, metric, and score events.
 *
 * Use when:
-* - CLI collection needs Vite/Vitest-powered module resolution and transforms
-* - eval files should be imported with the same runtime semantics as Vitest
+* - `events.jsonl` should be projected into `cases.jsonl`
+* - report commands need one final record per observed case outcome
 *
 * Expects:
-* - `projectRoot` points at the project that owns the eval files
-* - each `evalFilePaths` entry is an absolute file path
+* - events are ordered by occurrence where possible
+* - lifecycle events use either `task.case.start`/`task.case.end` or current CLI `CaseStarted`/`CaseEnded` names
 *
 * Returns:
-* - eval modules keyed by stable file href + optional registration suffixes
+* - records for cases that emitted an end lifecycle event
 */
-async function loadEvalModulesWithVitestRuntime(evalFilePaths, projectRoot) {
-	const loadedModules = {};
-	const runtime = await createVitest("test", {
-		config: false,
-		root: projectRoot,
-		run: false,
-		silent: true,
-		watch: false
-	});
-	try {
-		for (const evalFilePath of evalFilePaths) {
-			const moduleHref = pathToFileURL(evalFilePath).href;
-			beginModuleRegistration(moduleHref);
-			try {
-				const moduleValue = await runtime.import(moduleHref);
-				const registeredDefinitions = consumeModuleRegistrations(moduleHref);
-				const defaultDefinition = moduleValue.default;
-				const definitions = [...registeredDefinitions, ...defaultDefinition == null ? [] : [defaultDefinition]];
-				const deduplicatedDefinitions = definitions.filter((definition, index) => {
-					const key = `${definition.name}::${definition.description}::${definition.task?.id ?? ""}`;
-					return definitions.findIndex((candidate) => `${candidate.name}::${candidate.description}::${candidate.task?.id ?? ""}` === key) === index;
-				});
-				if (deduplicatedDefinitions.length === 0) continue;
-				for (const [definitionIndex, definition] of deduplicatedDefinitions.entries()) {
-					const moduleKey = definitionIndex === 0 ? moduleHref : `${moduleHref}#registration-${definitionIndex + 1}`;
-					loadedModules[moduleKey] = { default: definition };
-				}
-			} finally {
-				endModuleRegistration();
-			}
+function buildCaseRecords(args) {
+	const drafts = /* @__PURE__ */ new Map();
+	const completedKeys = [];
+	for (const event of args.events) {
+		const normalizedEvent = normalizeCaseEventName(event.event);
+		if (normalizedEvent == null) continue;
+		const ids = extractEventIds(event, args);
+		if (ids.caseId.length === 0 || ids.taskId.length === 0) continue;
+		const draft = getOrCreateDraft(drafts, ids, event, args);
+		applyIdentity(draft, ids, event, args);
+		if (normalizedEvent === "start") applyCaseStart(draft, event);
+		else if (normalizedEvent === "metric") applyCaseMetric(draft, event);
+		else if (normalizedEvent === "score") applyCaseScore(draft, event);
+		else {
+			applyCaseEnd(draft, event);
+			const key = createCaseKey(ids.taskId, ids.caseId);
+			if (!completedKeys.includes(key)) completedKeys.push(key);
 		}
-	} finally {
-		await runtime.close();
 	}
-	return loadedModules;
+	return completedKeys.map((key) => drafts.get(key)).filter((draft) => draft != null && draft.endedAt != null).map(toCaseRecord);
 }
-//#endregion
-//#region src/cli/report-selectors.ts
 /**
-* Resolves a generic case selector from metrics, scores, then direct fields.
+* Builds generic score summaries overall and grouped by arbitrary keys.
 *
 * Use when:
-* - report commands accept benchmark-neutral selectors such as `benchmark.case.id`
-* - comparisons need the same lookup semantics as filtering and grouping
+* - report artifacts need benchmark-neutral aggregate score views
+* - callers want to group by metrics such as `benchmark.category` or direct record fields such as `taskId`
 *
 * Expects:
-* - `key` is a direct `CaseRecord` field, score key, `scores.<key>`, or metric key
+* - `groupByKeys` are stable metric names or direct `CaseRecord` field names
+* - record score values are normalized numeric scores
 *
 * Returns:
-* - existence flag plus matched value when present
+* - overall score buckets and group buckets keyed by `<key>=<value>`
 */
-function getCaseSelectorValue(record, key) {
-	if (Object.hasOwn(record.metrics, key)) return {
-		exists: true,
-		value: record.metrics[key]
-	};
-	if (key.startsWith("scores.") && Object.hasOwn(record.scores, key.slice(7))) return {
-		exists: true,
-		value: record.scores[key.slice(7)]
-	};
-	if (Object.hasOwn(record.scores, key)) return {
-		exists: true,
-		value: record.scores[key]
-	};
-	if (Object.hasOwn(record, key)) return {
-		exists: true,
-		value: record[key]
+function buildMetricsSummary(records, groupByKeys) {
+	const overall = {};
+	const groups = {};
+	for (const record of records) {
+		addRecordScores(overall, record);
+		for (const groupByKey of groupByKeys) {
+			const groupValue = getGroupValue(record, groupByKey);
+			if (!groupValue.exists) continue;
+			const groupKey = `${groupByKey}=${String(groupValue.value)}`;
+			groups[groupKey] ??= {};
+			addRecordScores(groups[groupKey], record);
+		}
+	}
+	return {
+		groups: finalizeSummaryGroups(groups),
+		overall: finalizeScoreSummary(overall)
 	};
-	return { exists: false };
-}
-/**
-* Stable-stringifies JSON-like values for report comparisons.
-*
-* Before:
-* - `{ b: 1, a: true }`
-*
-* After:
-* - `{"a":true,"b":1}`
-*/
-function stableStringify(value) {
-	if (value == null || typeof value !== "object") return JSON.stringify(value);
-	if (Array.isArray(value)) return `[${value.map((item) => stableStringify(item)).join(",")}]`;
-	const record = value;
-	return `{${Object.keys(record).sort((left, right) => left.localeCompare(right)).map((key) => `${JSON.stringify(key)}:${stableStringify(record[key])}`).join(",")}}`;
 }
-//#endregion
-//#region src/cli/report-otlp.ts
 /**
-* Builds local OTLP-shaped JSON projections from normalized case records.
+* Encodes records as newline-delimited JSON.
 *
 * Use when:
-* - writing deterministic report artifacts without requiring an OpenTelemetry Collector
-* - future tools need trace/log/metric-shaped JSON files
+* - writing `cases.jsonl` for command-line tools, dataframes, or streaming parsers
+* - each record should occupy exactly one JSON line
 *
 * Expects:
-* - records belong to one Vieval run
+* - records are JSON-serializable case records
 *
 * Returns:
-* - trace, log, and metric containers shaped after OTLP JSON concepts
+* - one JSON object per line with a trailing newline for non-empty input
 */
-function buildLocalOtlpProjection(args) {
-	const projectSpans = collectProjectNames(args.records).map((projectName) => ({
-		attributes: toAttributes({
-			"vieval.project.name": projectName,
-			"vieval.run.id": args.runId
-		}),
-		name: "vieval.project"
-	}));
-	const taskSpans = collectTasks(args.records).map((task) => ({
-		attributes: toAttributes({
-			"vieval.project.name": task.projectName,
-			"vieval.run.id": args.runId,
-			"vieval.task.id": task.taskId
-		}),
-		name: "vieval.task"
-	}));
-	const caseSpans = args.records.map((record) => ({
-		attributes: toAttributes({
-			...record.metrics,
-			"vieval.case.duration_ms": record.durationMs,
-			"vieval.case.id": record.caseId,
-			"vieval.case.name": record.caseName,
-			"vieval.case.retry_count": record.retryCount,
-			"vieval.case.state": record.state,
-			"vieval.project.name": record.projectName,
-			"vieval.task.id": record.taskId
-		}),
-		endTimeUnixNano: isoToUnixNano(record.endedAt),
-		name: "vieval.case",
-		startTimeUnixNano: isoToUnixNano(record.startedAt)
-	}));
-	return {
-		logs: { resourceLogs: [{ scopeLogs: [{
-			logRecords: args.records.map((record) => ({
-				attributes: toAttributes(record.metrics),
-				body: { stringValue: JSON.stringify({
-					caseId: record.caseId,
-					scores: record.scores,
-					state: record.state
-				}) },
-				eventName: "vieval.case",
-				timeUnixNano: isoToUnixNano(record.endedAt)
-			})),
-			scope: { name: "vieval" }
-		}] }] },
-		metrics: { resourceMetrics: [{ scopeMetrics: [{
-			metrics: collectScoreKinds(args.records).map((kind) => ({
-				gauge: { dataPoints: args.records.filter((record) => typeof record.scores[kind] === "number").map((record) => ({
-					asDouble: record.scores[kind],
-					attributes: toAttributes({
-						...record.metrics,
-						"vieval.case.id": record.caseId,
-						"vieval.task.id": record.taskId
-					}),
-					timeUnixNano: isoToUnixNano(record.endedAt)
-				})) },
-				name: `vieval.score.${kind}`
-			})),
-			scope: { name: "vieval" }
-		}] }] },
-		traces: { resourceSpans: [{ scopeSpans: [{
-			scope: { name: "vieval" },
-			spans: [
-				{
-					attributes: toAttributes({ "vieval.run.id": args.runId }),
-					name: "vieval.run"
-				},
-				...projectSpans,
-				...taskSpans,
-				...caseSpans
-			]
-		}] }] }
-	};
-}
-function toAttributes(attributes) {
-	return Object.entries(attributes).filter(([, value]) => value !== void 0).sort(([leftKey], [rightKey]) => leftKey.localeCompare(rightKey)).map(([key, value]) => ({
-		key,
-		value: toAnyValue(value)
-	}));
-}
-function toAnyValue(value) {
-	if (Array.isArray(value)) return { arrayValue: { values: value.map((item) => toAnyValue(item)) } };
-	if (isAttributeScalar(value)) {
-		if (typeof value === "boolean") return { boolValue: value };
-		if (typeof value === "number") return Number.isFinite(value) ? { doubleValue: value } : { stringValue: String(value) };
-		if (value == null) return { stringValue: "null" };
-		return { stringValue: value };
-	}
-	return { stringValue: stableStringify(value) };
-}
-function isAttributeScalar(value) {
-	return value == null || typeof value === "boolean" || typeof value === "number" || typeof value === "string";
-}
-function isoToUnixNano(value) {
-	const preciseMatch = /^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.(\d{1,9}))?(Z|[+-]\d{2}:\d{2})$/.exec(value);
-	if (preciseMatch != null) {
-		const [, secondsPart, fraction = "", zone] = preciseMatch;
-		const unixMilliseconds = Date.parse(`${secondsPart}.000${zone}`);
-		if (!Number.isFinite(unixMilliseconds)) return "0";
-		return String(BigInt(unixMilliseconds) * 1000000n + BigInt(fraction.padEnd(9, "0").slice(0, 9)));
-	}
-	const unixMilliseconds = Date.parse(value);
-	if (!Number.isFinite(unixMilliseconds)) return "0";
-	return String(BigInt(unixMilliseconds) * 1000000n);
-}
-function collectScoreKinds(records) {
-	return [...new Set(records.flatMap((record) => Object.keys(record.scores)))].sort((left, right) => left.localeCompare(right));
-}
-function collectProjectNames(records) {
-	return [...new Set(records.map((record) => record.projectName))].sort((left, right) => left.localeCompare(right));
-}
-function collectTasks(records) {
-	const tasks = /* @__PURE__ */ new Map();
-	for (const record of records) tasks.set(`${record.projectName}\0${record.taskId}`, {
-		projectName: record.projectName,
-		taskId: record.taskId
-	});
-	return [...tasks.values()].sort((left, right) => {
-		const projectOrder = left.projectName.localeCompare(right.projectName);
-		return projectOrder === 0 ? left.taskId.localeCompare(right.taskId) : projectOrder;
-	});
-}
-//#endregion
-//#region src/cli/report-records.ts
-/**
-* Builds normalized case records from lifecycle, metric, and score events.
-*
-* Use when:
-* - `events.jsonl` should be projected into `cases.jsonl`
-* - report commands need one final record per observed case outcome
-*
-* Expects:
-* - events are ordered by occurrence where possible
-* - lifecycle events use either `task.case.start`/`task.case.end` or current CLI `CaseStarted`/`CaseEnded` names
-*
-* Returns:
-* - records for cases that emitted an end lifecycle event
-*/
-function buildCaseRecords(args) {
-	const drafts = /* @__PURE__ */ new Map();
-	const completedKeys = [];
-	for (const event of args.events) {
-		const normalizedEvent = normalizeCaseEventName(event.event);
-		if (normalizedEvent == null) continue;
-		const ids = extractEventIds(event, args);
-		if (ids.caseId.length === 0 || ids.taskId.length === 0) continue;
-		const draft = getOrCreateDraft(drafts, ids, event, args);
-		applyIdentity(draft, ids, event, args);
-		if (normalizedEvent === "start") applyCaseStart(draft, event);
-		else if (normalizedEvent === "metric") applyCaseMetric(draft, event);
-		else if (normalizedEvent === "score") applyCaseScore(draft, event);
-		else {
-			applyCaseEnd(draft, event);
-			const key = createCaseKey(ids.taskId, ids.caseId);
-			if (!completedKeys.includes(key)) completedKeys.push(key);
-		}
-	}
-	return completedKeys.map((key) => drafts.get(key)).filter((draft) => draft != null && draft.endedAt != null).map(toCaseRecord);
-}
-/**
-* Builds generic score summaries overall and grouped by arbitrary keys.
-*
-* Use when:
-* - report artifacts need benchmark-neutral aggregate score views
-* - callers want to group by metrics such as `benchmark.category` or direct record fields such as `taskId`
-*
-* Expects:
-* - `groupByKeys` are stable metric names or direct `CaseRecord` field names
-* - record score values are normalized numeric scores
-*
-* Returns:
-* - overall score buckets and group buckets keyed by `<key>=<value>`
-*/
-function buildMetricsSummary(records, groupByKeys) {
-	const overall = {};
-	const groups = {};
-	for (const record of records) {
-		addRecordScores(overall, record);
-		for (const groupByKey of groupByKeys) {
-			const groupValue = getGroupValue(record, groupByKey);
-			if (!groupValue.exists) continue;
-			const groupKey = `${groupByKey}=${String(groupValue.value)}`;
-			groups[groupKey] ??= {};
-			addRecordScores(groups[groupKey], record);
-		}
-	}
-	return {
-		groups: finalizeSummaryGroups(groups),
-		overall: finalizeScoreSummary(overall)
-	};
-}
-/**
-* Encodes records as newline-delimited JSON.
-*
-* Use when:
-* - writing `cases.jsonl` for command-line tools, dataframes, or streaming parsers
-* - each record should occupy exactly one JSON line
-*
-* Expects:
-* - records are JSON-serializable case records
-*
-* Returns:
-* - one JSON object per line with a trailing newline for non-empty input
-*/
-function encodeJsonl(records) {
-	if (records.length === 0) return "";
-	return `${records.map((record) => JSON.stringify(record)).join("\n")}\n`;
-}
-function normalizeCaseEventName(eventName) {
-	if (eventName === "task.case.start" || eventName === "CaseStarted") return "start";
-	if (eventName === "task.case.metric") return "metric";
-	if (eventName === "task.case.score") return "score";
-	if (eventName === "task.case.end" || eventName === "CaseEnded") return "end";
-}
-function extractEventIds(event, args) {
-	const data = asRecord(event.data);
+function encodeJsonl(records) {
+	if (records.length === 0) return "";
+	return `${records.map((record) => JSON.stringify(record)).join("\n")}\n`;
+}
+function normalizeCaseEventName(eventName) {
+	if (eventName === "task.case.start" || eventName === "CaseStarted") return "start";
+	if (eventName === "task.case.metric") return "metric";
+	if (eventName === "task.case.score") return "score";
+	if (eventName === "task.case.end" || eventName === "CaseEnded") return "end";
+}
+function extractEventIds(event, args) {
+	const data = asRecord(event.data);
 	return {
 		attemptId: stringFrom(data?.attemptId) ?? event.attemptId ?? args.attemptId,
 		caseId: stringFrom(data?.caseId) ?? event.caseId ?? "",
@@ -713,15 +420,560 @@ function isCaseMetricValue(value) {
 	if (value == null || typeof value === "boolean" || typeof value === "number" || typeof value === "string") return true;
 	return Array.isArray(value);
 }
-function asRecord(value) {
-	if (value == null || typeof value !== "object" || Array.isArray(value)) return;
-	return value;
+function asRecord(value) {
+	if (value == null || typeof value !== "object" || Array.isArray(value)) return;
+	return value;
+}
+function stringFrom(value) {
+	return typeof value === "string" ? value : void 0;
+}
+function numberFrom(value) {
+	return typeof value === "number" && Number.isFinite(value) ? value : void 0;
+}
+//#endregion
+//#region src/cli/report-selectors.ts
+/**
+* Resolves a generic case selector from metrics, scores, then direct fields.
+*
+* Use when:
+* - report commands accept benchmark-neutral selectors such as `benchmark.case.id`
+* - comparisons need the same lookup semantics as filtering and grouping
+*
+* Expects:
+* - `key` is a direct `CaseRecord` field, score key, `scores.<key>`, or metric key
+*
+* Returns:
+* - existence flag plus matched value when present
+*/
+function getCaseSelectorValue(record, key) {
+	if (Object.hasOwn(record.metrics, key)) return {
+		exists: true,
+		value: record.metrics[key]
+	};
+	if (key.startsWith("scores.") && Object.hasOwn(record.scores, key.slice(7))) return {
+		exists: true,
+		value: record.scores[key.slice(7)]
+	};
+	if (Object.hasOwn(record.scores, key)) return {
+		exists: true,
+		value: record.scores[key]
+	};
+	if (Object.hasOwn(record, key)) return {
+		exists: true,
+		value: record[key]
+	};
+	return { exists: false };
+}
+/**
+* Stable-stringifies JSON-like values for report comparisons.
+*
+* Before:
+* - `{ b: 1, a: true }`
+*
+* After:
+* - `{"a":true,"b":1}`
+*/
+function stableStringify(value) {
+	if (value == null || typeof value !== "object") return JSON.stringify(value);
+	if (Array.isArray(value)) return `[${value.map((item) => stableStringify(item)).join(",")}]`;
+	const record = value;
+	return `{${Object.keys(record).sort((left, right) => left.localeCompare(right)).map((key) => `${JSON.stringify(key)}:${stableStringify(record[key])}`).join(",")}}`;
+}
+//#endregion
+//#region src/cli/report-cases.ts
+const reportCasesHelpText = `
+  Inspect normalized case records from generated vieval report artifacts.
+  Usage
+    $ vieval report cases <reportPath> [options]
+  Options
+    --format       Output format: table | json | jsonl (default: table)
+    --where        Equality filter "key=value"; repeatable
+    --group-by     Case field, score name, or metric name used for grouped score summaries
+`;
+/**
+* Reads normalized case records from one report run directory or report root.
+*
+* Use when:
+* - CLI tools need case-level inspection from local report artifacts
+* - callers may pass a run directory, a `cases.jsonl` file, or a report root
+*
+* Expects:
+* - discovered `cases.jsonl` files contain one `CaseRecord` JSON object per line
+*
+* Returns:
+* - all parsed case records sorted by discovered file path order
+*/
+async function readCaseRecordsFromReport(reportPath) {
+	const caseFilePaths = await resolveCaseRecordPaths(reportPath);
+	if (caseFilePaths.length === 0) throw new Error(`No cases.jsonl files found under "${resolve(reportPath)}".`);
+	const records = [];
+	for (const caseFilePath of caseFilePaths) {
+		const lines = readFileSync(caseFilePath, "utf-8").split("\n");
+		for (const [index, line] of lines.entries()) {
+			const trimmed = line.trim();
+			if (trimmed.length === 0) continue;
+			try {
+				records.push(JSON.parse(trimmed));
+			} catch (error) {
+				throw new Error(`Invalid cases.jsonl line ${index + 1} in "${caseFilePath}": ${errorMessageFrom(error) ?? "Unknown JSON parse failure."}`);
+			}
+		}
+	}
+	return records;
+}
+/**
+* Builds filtered case inspection output.
+*
+* Use when:
+* - `vieval report cases` needs deterministic JSON/table output
+* - tests need pure filtering and grouping behavior without process I/O
+*
+* Expects:
+* - `where` filters use `key=value`
+* - lookup keys may target direct case fields, score names, or metric names
+*
+* Returns:
+* - filtered records plus grouped score summaries when `groupBy` is present
+*/
+function buildReportCasesOutput(records, options) {
+	const whereFilters = (options.where ?? []).map(parseSelector);
+	const filteredRecords = records.filter((record) => matchesWhereFilters(record, whereFilters));
+	return {
+		groups: options.groupBy == null ? void 0 : buildCaseGroups(filteredRecords, options.groupBy),
+		records: [...filteredRecords]
+	};
+}
+/**
+* Runs the `vieval report cases` command.
+*
+* Call stack:
+*
+* published executable (`../bin/vieval`)
+*   -> {@link import('./index').runTopLevelCli}
+*     -> {@link runReportCasesCli}
+*       -> {@link readCaseRecordsFromReport}
+*
+* Use when:
+* - the top-level CLI dispatches local case artifact inspection
+*
+* Expects:
+* - argv is either `cases <reportPath> ...` or `<reportPath> ...`
+*
+* Returns:
+* - resolves after writing the requested output to stdout
+*/
+async function runReportCasesCli(argv) {
+	try {
+		const parsed = parseReportCasesCliArguments(argv);
+		const output = buildReportCasesOutput(await readCaseRecordsFromReport(parsed.reportPath), parsed);
+		if (parsed.format === "json") {
+			process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
+			return;
+		}
+		if (parsed.format === "jsonl") {
+			process.stdout.write(encodeJsonl(output.records));
+			return;
+		}
+		process.stdout.write(`${formatCasesTable(output)}\n`);
+	} catch (error) {
+		const errorMessage = errorMessageFrom(error) ?? "Unknown report cases failure.";
+		process.stderr.write(`[vieval report cases] ${errorMessage}\n`);
+		process.exitCode = 1;
+	}
+}
+function normalizeCliArgv$6(argv) {
+	const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
+	if (normalizedArgv[0] === "report" && normalizedArgv[1] === "cases") return normalizedArgv.slice(2);
+	if (normalizedArgv[0] === "cases") return normalizedArgv.slice(1);
+	return normalizedArgv;
+}
+function parseReportCasesCliArguments(argv) {
+	const cli = meow(reportCasesHelpText, {
+		argv: normalizeCliArgv$6(argv),
+		flags: {
+			format: {
+				default: "table",
+				type: "string"
+			},
+			groupBy: { type: "string" },
+			where: {
+				isMultiple: true,
+				type: "string"
+			}
+		},
+		importMeta: import.meta
+	});
+	const reportPath = cli.input[0];
+	if (reportPath == null || reportPath.length === 0) throw new Error("Missing required <reportPath> argument.");
+	return {
+		format: normalizeReportCasesFormat(cli.flags.format),
+		groupBy: cli.flags.groupBy,
+		reportPath,
+		where: cli.flags.where
+	};
+}
+function normalizeReportCasesFormat(value) {
+	const normalized = value.toLowerCase();
+	if (normalized === "json") return "json";
+	if (normalized === "jsonl") return "jsonl";
+	return "table";
+}
+async function resolveCaseRecordPaths(reportPath) {
+	const absoluteReportPath = resolve(reportPath);
+	const directCaseFilePath = resolve(absoluteReportPath, "cases.jsonl");
+	if (existsSync(absoluteReportPath) && absoluteReportPath.endsWith(".jsonl")) return [absoluteReportPath];
+	if (existsSync(directCaseFilePath)) return [directCaseFilePath];
+	return (await glob("**/cases.jsonl", {
+		absolute: true,
+		cwd: absoluteReportPath
+	})).sort((left, right) => left.localeCompare(right));
+}
+function matchesWhereFilters(record, whereFilters) {
+	return whereFilters.every((parsed) => {
+		const resolved = getCaseSelectorValue(record, parsed.key);
+		return resolved.exists && String(resolved.value) === parsed.value;
+	});
+}
+function parseSelector(selector) {
+	const separatorIndex = selector.indexOf("=");
+	if (separatorIndex <= 0 || separatorIndex === selector.length - 1) throw new Error(`Invalid selector "${selector}". Expected "key=value".`);
+	return {
+		key: selector.slice(0, separatorIndex).trim(),
+		value: selector.slice(separatorIndex + 1).trim()
+	};
+}
+function buildCaseGroups(records, groupBy) {
+	const groups = {};
+	for (const record of records) {
+		const resolved = getCaseSelectorValue(record, groupBy);
+		if (!resolved.exists) continue;
+		const groupKey = `${groupBy}=${String(resolved.value)}`;
+		groups[groupKey] ??= {
+			count: 0,
+			scores: {}
+		};
+		groups[groupKey].count += 1;
+		addScores(groups[groupKey].scores, record.scores);
+	}
+	return Object.fromEntries(Object.entries(groups).sort(([left], [right]) => left.localeCompare(right)).map(([groupKey, group]) => [groupKey, {
+		count: group.count,
+		scores: finalizeScores(group.scores)
+	}]));
+}
+function addScores(summary, scores) {
+	for (const [scoreName, value] of Object.entries(scores)) {
+		summary[scoreName] ??= {
+			average: 0,
+			count: 0,
+			sum: 0
+		};
+		summary[scoreName].count += 1;
+		summary[scoreName].sum += value;
+	}
+}
+function finalizeScores(summary) {
+	return Object.fromEntries(Object.entries(summary).sort(([left], [right]) => left.localeCompare(right)).map(([scoreName, bucket]) => [scoreName, {
+		average: bucket.count === 0 ? 0 : bucket.sum / bucket.count,
+		count: bucket.count,
+		sum: bucket.sum
+	}]));
+}
+function formatCasesTable(output) {
+	const lines = ["CASES  vieval report", `Case count ${output.records.length}`];
+	if (output.groups != null) {
+		lines.push("Groups");
+		for (const [groupKey, group] of Object.entries(output.groups)) {
+			const scoreText = Object.entries(group.scores).map(([scoreName, bucket]) => `${scoreName}=${bucket.average.toFixed(3)}`).join(" ");
+			lines.push(`${groupKey}  count=${group.count}${scoreText.length > 0 ? ` ${scoreText}` : ""}`);
+		}
+	}
+	return lines.join("\n");
+}
+//#endregion
+//#region src/cli/report-compare.ts
+/**
+* Builds a compact compare report sorted by hybrid/exact score.
+*/
+function buildCompareReportArtifact(args) { // Returns benchmarkId, reportPath and one ranked summary row per entry of args.methods.
+	const rows = args.methods.map((method) => {
+		const caseRecords = method.caseRecords ?? []; // a method without caseRecords is treated as having zero cases
+		const projects = method.output.projects.map((project) => ({
+			caseCount: countCasesForProject(caseRecords, project.name),
+			distinctCaseCount: countDistinctCasesForProject(caseRecords, project.name),
+			exactAverage: project.result?.overall.exactAverage ?? null, // null when the result or the average is missing
+			executed: project.executed,
+			hybridAverage: project.result?.overall.hybridAverage ?? null,
+			name: project.name,
+			runCount: project.result?.overall.runCount ?? 0, // missing run count falls back to 0, which excludes the project from weighted averages
+			taskCount: project.taskCount
+		}));
+		return {
+			caseCount: caseRecords.length,
+			distinctCaseCount: countDistinctCases(caseRecords),
+			exactAverage: createWeightedAverage(projects, (project) => project.exactAverage), // method-level average weighted by each project runCount
+			executedProjectCount: projects.filter((project) => project.executed).length,
+			hybridAverage: createWeightedAverage(projects, (project) => project.hybridAverage),
+			methodId: method.methodId,
+			projectCount: projects.length,
+			projects,
+			runCount: projects.reduce((sum, project) => sum + project.runCount, 0),
+			taskCount: projects.reduce((sum, project) => sum + project.taskCount, 0)
+		};
+	});
+	rows.sort((left, right) => { // in-place sort: descending hybrid average, ties broken by descending exact average
+		const leftHybrid = left.hybridAverage ?? Number.NEGATIVE_INFINITY; // missing averages rank below every finite score
+		const rightHybrid = right.hybridAverage ?? Number.NEGATIVE_INFINITY;
+		if (leftHybrid !== rightHybrid) return rightHybrid - leftHybrid;
+		const leftExact = left.exactAverage ?? Number.NEGATIVE_INFINITY;
+		return (right.exactAverage ?? Number.NEGATIVE_INFINITY) - leftExact; // NOTE(review): evaluates to NaN when both exact averages are missing (-Infinity minus -Infinity)
+	});
+	return {
+		benchmarkId: args.benchmarkId,
+		methods: rows,
+		reportPath: args.reportPath
+	};
+}
+function countCasesForProject(caseRecords, projectName) { // Counts the records whose projectName strictly equals the given name.
+	return caseRecords.filter((record) => record.projectName === projectName).length;
+}
+function countDistinctCasesForProject(caseRecords, projectName) { // Distinct-case count restricted to the records of one project.
+	return countDistinctCases(caseRecords.filter((record) => record.projectName === projectName));
+}
+function countDistinctCases(caseRecords) { // Counts unique projectName/taskId/caseId triples among the records.
+	const caseKeys = /* @__PURE__ */ new Set();
+	for (const record of caseRecords) caseKeys.add(`${record.projectName}:${record.taskId}:${record.caseId}`); // records repeating the same triple collapse into one key
+	return caseKeys.size;
+}
+function createWeightedAverage(projects, selectAverage) { // Averages selectAverage(project) across projects, weighting each by project.runCount.
+	let weightedScoreTotal = 0;
+	let weightTotal = 0;
+	for (const project of projects) {
+		const average = selectAverage(project);
+		if (average == null || project.runCount <= 0) continue; // projects with no average or no runs do not contribute
+		weightedScoreTotal += average * project.runCount;
+		weightTotal += project.runCount;
+	}
+	if (weightTotal === 0) return null; // nothing contributed, so there is no average to report
+	return weightedScoreTotal / weightTotal;
+}
+/**
+* Writes compare report artifact as JSON.
+*/
+async function writeCompareReportArtifact(args) { // Serializes args.artifact to args.outputPath and resolves with the resolved path.
+	const outputPath = resolve(args.outputPath);
+	await mkdir(dirname(outputPath), { recursive: true }); // the containing directory is created before writing (recursive option set)
+	await writeFile(outputPath, `${JSON.stringify(args.artifact, null, 2)}\n`, "utf-8"); // two-space indented JSON followed by one trailing newline
+	return outputPath;
+}
+//#endregion
+//#region src/cli/discovery.ts
+/**
+* Discovers eval files using include/exclude globs relative to project root.
+*
+* Before:
+* - Absolute path file list from recursive filesystem walk
+*
+* After:
+* - Filtered absolute path list matching include/exclude rules
+*/
+async function discoverEvalFiles(options) { // Globs options.include under options.root, minus options.exclude, and returns a de-duplicated sorted list.
+	return uniq(await glob([...options.include], { // include and exclude are spread into fresh arrays before being handed to glob
+		absolute: true,
+		cwd: options.root,
+		ignore: [...options.exclude],
+		onlyFiles: true
+	})).sort((left, right) => left.localeCompare(right)); // localeCompare ordering is applied after uniq
+}
+//#endregion
+//#region src/cli/module-runtime.ts
+/**
+* Loads eval modules and returns a normalized eval-module map.
+*
+* Use when:
+* - CLI collection needs Vite/Vitest-powered module resolution and transforms
+* - eval files should be imported with the same runtime semantics as Vitest
+*
+* Expects:
+* - `projectRoot` points at the project that owns the eval files
+* - each `evalFilePaths` entry is an absolute file path
+*
+* Returns:
+* - eval modules keyed by stable file href + optional registration suffixes
+*/
+async function loadEvalModulesWithVitestRuntime(evalFilePaths, projectRoot) { // Imports each eval file through one createVitest runtime and collects its definitions into a keyed map.
+	const loadedModules = {};
+	const runtime = await createVitest("test", { // a single runtime is created up front and reused for every file
+		config: false,
+		root: projectRoot,
+		run: false,
+		silent: true,
+		watch: false
+	});
+	try {
+		for (const evalFilePath of evalFilePaths) { // files are imported one at a time, in the order given
+			const moduleHref = pathToFileURL(evalFilePath).href;
+			beginModuleRegistration(moduleHref); // presumably opens a registration scope for this href; confirm against the registry helper
+			try {
+				const moduleValue = await runtime.import(moduleHref);
+				const registeredDefinitions = consumeModuleRegistrations(moduleHref);
+				const defaultDefinition = moduleValue.default;
+				const definitions = [...registeredDefinitions, ...defaultDefinition == null ? [] : [defaultDefinition]]; // registered definitions first, then the default export when present
+				const deduplicatedDefinitions = definitions.filter((definition, index) => { // keeps the first definition for each name/description/task-id key
+					const key = `${definition.name}::${definition.description}::${definition.task?.id ?? ""}`;
+					return definitions.findIndex((candidate) => `${candidate.name}::${candidate.description}::${candidate.task?.id ?? ""}` === key) === index;
+				});
+				if (deduplicatedDefinitions.length === 0) continue; // the inner finally still runs endModuleRegistration for this file
+				for (const [definitionIndex, definition] of deduplicatedDefinitions.entries()) {
+					const moduleKey = definitionIndex === 0 ? moduleHref : `${moduleHref}#registration-${definitionIndex + 1}`; // first definition uses the bare href; later ones get a suffix starting at 2
+					loadedModules[moduleKey] = { default: definition };
+				}
+			} finally {
+				endModuleRegistration(); // runs whether the import succeeded or threw
+			}
+		}
+	} finally {
+		await runtime.close(); // the runtime is closed even when an import fails
+	}
+	return loadedModules;
+}
+//#endregion
+//#region src/cli/report-otlp.ts
+/**
+* Builds local OTLP-shaped JSON projections from normalized case records.
+*
+* Use when:
+* - writing deterministic report artifacts without requiring an OpenTelemetry Collector
+* - future tools need trace/log/metric-shaped JSON files
+*
+* Expects:
+* - records belong to one Vieval run
+*
+* Returns:
+* - trace, log, and metric containers shaped after OTLP JSON concepts
+*/
+function buildLocalOtlpProjection(args) { // Builds logs, metrics and traces containers from args.records and args.runId.
+	const projectSpans = collectProjectNames(args.records).map((projectName) => ({ // one span per distinct project name
+		attributes: toAttributes({
+			"vieval.project.name": projectName,
+			"vieval.run.id": args.runId
+		}),
+		name: "vieval.project"
+	}));
+	const taskSpans = collectTasks(args.records).map((task) => ({ // one span per distinct project/task pair
+		attributes: toAttributes({
+			"vieval.project.name": task.projectName,
+			"vieval.run.id": args.runId,
+			"vieval.task.id": task.taskId
+		}),
+		name: "vieval.task"
+	}));
+	const caseSpans = args.records.map((record) => ({ // one span per record; only case spans carry start and end times
+		attributes: toAttributes({
+			...record.metrics, // spread first, so the vieval.* keys below win over a metric with the same name
+			"vieval.case.duration_ms": record.durationMs,
+			"vieval.case.id": record.caseId,
+			"vieval.case.name": record.caseName,
+			"vieval.case.retry_count": record.retryCount,
+			"vieval.case.state": record.state,
+			"vieval.project.name": record.projectName,
+			"vieval.task.id": record.taskId
+		}),
+		endTimeUnixNano: isoToUnixNano(record.endedAt),
+		name: "vieval.case",
+		startTimeUnixNano: isoToUnixNano(record.startedAt)
+	}));
+	return {
+		logs: { resourceLogs: [{ scopeLogs: [{
+			logRecords: args.records.map((record) => ({ // one log record per case record
+				attributes: toAttributes(record.metrics), // NOTE(review): assumes record.metrics is always an object; Object.entries would throw on undefined
+				body: { stringValue: JSON.stringify({ // the body is a JSON string holding caseId, scores and state
+					caseId: record.caseId,
+					scores: record.scores,
+					state: record.state
+				}) },
+				eventName: "vieval.case",
+				timeUnixNano: isoToUnixNano(record.endedAt)
+			})),
+			scope: { name: "vieval" }
+		}] }] },
+		metrics: { resourceMetrics: [{ scopeMetrics: [{
+			metrics: collectScoreKinds(args.records).map((kind) => ({ // one gauge metric per score name found across the records
+				gauge: { dataPoints: args.records.filter((record) => typeof record.scores[kind] === "number").map((record) => ({ // only records whose score for this kind is a number produce a data point
+					asDouble: record.scores[kind],
+					attributes: toAttributes({
+						...record.metrics,
+						"vieval.case.id": record.caseId,
+						"vieval.task.id": record.taskId
+					}),
+					timeUnixNano: isoToUnixNano(record.endedAt)
+				})) },
+				name: `vieval.score.${kind}`
+			})),
+			scope: { name: "vieval" }
+		}] }] },
+		traces: { resourceSpans: [{ scopeSpans: [{
+			scope: { name: "vieval" },
+			spans: [ // order: run span, project spans, task spans, case spans; no span or parent ids are set here
+				{
+					attributes: toAttributes({ "vieval.run.id": args.runId }),
+					name: "vieval.run"
+				},
+				...projectSpans,
+				...taskSpans,
+				...caseSpans
+			]
+		}] }] }
+	};
+}
+function toAttributes(attributes) { // Converts a plain object into a key-sorted list of key/value attribute entries.
+	return Object.entries(attributes).filter(([, value]) => value !== void 0).sort(([leftKey], [rightKey]) => leftKey.localeCompare(rightKey)).map(([key, value]) => ({ // undefined values are dropped; null values are kept
+		key,
+		value: toAnyValue(value)
+	}));
+}
+function toAnyValue(value) { // Wraps a value in a boolValue, doubleValue, stringValue or arrayValue holder.
+	if (Array.isArray(value)) return { arrayValue: { values: value.map((item) => toAnyValue(item)) } }; // arrays are converted element by element, recursively
+	if (isAttributeScalar(value)) {
+		if (typeof value === "boolean") return { boolValue: value };
+		if (typeof value === "number") return Number.isFinite(value) ? { doubleValue: value } : { stringValue: String(value) }; // NaN and infinities are emitted as strings
+		if (value == null) return { stringValue: "null" }; // covers both null and undefined
+		return { stringValue: value }; // only strings reach this line
+	}
+	return { stringValue: stableStringify(value) }; // any other value goes through stableStringify, defined outside this section
+}
+function isAttributeScalar(value) { // True for null, undefined, booleans, numbers and strings.
+	return value == null || typeof value === "boolean" || typeof value === "number" || typeof value === "string";
+}
+function isoToUnixNano(value) { // Converts a timestamp string into a decimal string of nanoseconds since the Unix epoch.
+	const preciseMatch = /^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.(\d{1,9}))?(Z|[+-]\d{2}:\d{2})$/.exec(value); // matches date-time with optional 1-9 fraction digits and a Z or +hh:mm style zone
+	if (preciseMatch != null) {
+		const [, secondsPart, fraction = "", zone] = preciseMatch;
+		const unixMilliseconds = Date.parse(`${secondsPart}.000${zone}`); // only whole seconds are parsed here; the fraction is added separately below
+		if (!Number.isFinite(unixMilliseconds)) return "0"; // unparseable input yields the string 0
+		return String(BigInt(unixMilliseconds) * 1000000n + BigInt(fraction.padEnd(9, "0").slice(0, 9))); // fraction is right-padded to nine digits so it counts nanoseconds
+	}
+	const unixMilliseconds = Date.parse(value); // fallback for strings that do not match the pattern above
+	if (!Number.isFinite(unixMilliseconds)) return "0";
+	return String(BigInt(unixMilliseconds) * 1000000n); // millisecond precision only on this path
+}
+function collectScoreKinds(records) { // Returns the distinct score names used by any record, sorted with localeCompare.
+	return [...new Set(records.flatMap((record) => Object.keys(record.scores)))].sort((left, right) => left.localeCompare(right));
 }
-function stringFrom(value) {
-	return typeof value === "string" ? value : void 0;
+function collectProjectNames(records) { // Returns the distinct projectName values of the records, sorted with localeCompare.
+	return [...new Set(records.map((record) => record.projectName))].sort((left, right) => left.localeCompare(right));
 }
-function numberFrom(value) {
-	return typeof value === "number" && Number.isFinite(value) ? value : void 0;
+function collectTasks(records) { // Returns the distinct projectName/taskId pairs of the records, sorted by project then task.
+	const tasks = /* @__PURE__ */ new Map();
+	for (const record of records) tasks.set(`${record.projectName}\0${record.taskId}`, { // map key joins both parts with a NUL character
+		projectName: record.projectName,
+		taskId: record.taskId
+	});
+	return [...tasks.values()].sort((left, right) => {
+		const projectOrder = left.projectName.localeCompare(right.projectName);
+		return projectOrder === 0 ? left.taskId.localeCompare(right.taskId) : projectOrder; // task id only decides the order within the same project
+	});
 }
 //#endregion
 //#region src/cli/report-artifacts.ts
@@ -1893,9 +2145,30 @@ function sanitizeIdentitySegment(value) {
 	if (normalized.length === 0) return "default";
 	return normalized.replace(/[^\w.-]+/g, "-");
 }
-function createRunIdentity(options) {
+function createExperimentMatrixRows(tasks) { // Collects the distinct non-default matrix row labels used by the tasks, sorted with localeCompare.
+	const rows = /* @__PURE__ */ new Set();
+	for (const task of tasks) {
+		const runRowId = task.matrix.meta.runRowId;
+		const evalRowId = task.matrix.meta.evalRowId;
+		if (runRowId !== "default" && evalRowId !== "default") { // both rows customised: emit one combined label
+			rows.add(`run:${runRowId}+eval:${evalRowId}`);
+			continue;
+		}
+		if (runRowId !== "default") rows.add(`run:${runRowId}`); // at most one of these two lines adds a label
+		if (evalRowId !== "default") rows.add(`eval:${evalRowId}`); // a task with both rows at default adds nothing
+	}
+	return [...rows].sort((left, right) => left.localeCompare(right));
+}
+function resolveExperimentId(options, preparedProjects) { // Picks the experiment id: explicit option, else a matrix-derived id, else a fixed default.
+	if (options.experiment != null) return sanitizeIdentitySegment(options.experiment); // an explicit experiment option always wins
+	const matrixRows = /* @__PURE__ */ new Set();
+	for (const project of preparedProjects) project.experimentMatrixRows.forEach((row) => matrixRows.add(row)); // union of row labels across all prepared projects
+	if (matrixRows.size === 0) return "default-experiment"; // returned as-is, without passing through sanitizeIdentitySegment
+	return sanitizeIdentitySegment(`matrix-${[...matrixRows].sort().join("--")}`); // NOTE(review): default sort() here, while other sorts in this file use localeCompare
+}
+function createRunIdentity(options, preparedProjects) {
 	const workspaceId = sanitizeIdentitySegment(options.workspace ?? "default-workspace");
-	const experimentId = sanitizeIdentitySegment(options.experiment ?? "default-experiment");
+	const experimentId = resolveExperimentId(options, preparedProjects);
 	return {
 		attemptId: sanitizeIdentitySegment(options.attempt ?? `attempt-${(/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-")}`),
 		experimentId,
@@ -2251,6 +2524,7 @@ async function prepareProject(project) {
 		});
 		const canAutoExecuteEntryTasks = entries.some((entry) => entry.task != null) && project.models.length > 0;
 		if (project.executor == null && !canAutoExecuteEntryTasks) return {
+			experimentMatrixRows: createExperimentMatrixRows(tasks),
 			kind: "summary",
 			summary: {
 				caseSummary: null,
@@ -2267,6 +2541,7 @@ async function prepareProject(project) {
 			}
 		};
 		return {
+			experimentMatrixRows: createExperimentMatrixRows(tasks),
 			kind: "prepared",
 			prepared: {
 				discoveredEvalFileCount: evalFilePaths.length,
@@ -2279,6 +2554,7 @@ async function prepareProject(project) {
 		};
 	} catch (error) {
 		return {
+			experimentMatrixRows: [],
 			kind: "summary",
 			summary: {
 				caseSummary: null,
@@ -2439,7 +2715,6 @@ async function executePreparedProject(prepared, identity, cacheProjectName, tele
 * - keeping business-agent eval files near their implementation packages
 */
 async function runVievalCli(options = {}) {
-	const identity = createRunIdentity(options);
 	const loadedConfig = await loadVievalCliConfig({
 		configFilePath: options.configFilePath,
 		cwd: options.cwd
@@ -2447,21 +2722,24 @@ async function runVievalCli(options = {}) {
 	const telemetry = loadedConfig.reporting?.openTelemetry?.enabled === true ? createOpenTelemetryRuntime() : createNoopTelemetryRuntime();
 	const onOpenTelemetryRunEnd = loadedConfig.reporting?.openTelemetry?.enabled === true ? loadedConfig.reporting.openTelemetry.onRunEnd : void 0;
 	const restoreEnvironment = applyRunEnvironment(loadedConfig.env);
-	const eventRecorder = createEventRecorder(identity);
-	const reporter = createReporterWithEventCapture(createRunReporter(options.reporter), eventRecorder.record);
 	let runError;
 	let runEndError;
 	let output;
+	let reporter;
 	try {
+		const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
+		const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
+		const identity = createRunIdentity(options, preparedProjects);
+		const eventRecorder = createEventRecorder(identity);
+		const runReporter = createReporterWithEventCapture(createRunReporter(options.reporter), eventRecorder.record);
+		reporter = runReporter;
 		output = await telemetry.withSpan("vieval.run", {
 			"vieval.attempt.id": identity.attemptId,
 			"vieval.experiment.id": identity.experimentId,
 			"vieval.run.id": identity.runId,
 			"vieval.workspace.id": identity.workspaceId
 		}, async () => {
-			const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
 			const workspaceScheduler = createSchedulerRuntime({ concurrency: { workspace: resolveWorkspaceConcurrency(loadedConfig, options) } });
-			const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
 			const executableProjects = preparedProjects.filter((project) => project.kind === "prepared").map((project) => project.prepared);
 			const totalTasks = preparedProjects.reduce((sum, project) => {
 				if (project.kind === "prepared") return sum + project.prepared.tasks.length;
@@ -2476,8 +2754,8 @@ async function runVievalCli(options = {}) {
 				passedTasks: 0,
 				skippedTasks: 0
 			};
-			reporter.onRunStart({ totalTasks });
-			for (const project of executableProjects) for (const task of project.tasks) reporter.onTaskQueued(createTaskQueuePayload(task, project.name));
+			runReporter.onRunStart({ totalTasks });
+			for (const project of executableProjects) for (const task of project.tasks) runReporter.onTaskQueued(createTaskQueuePayload(task, project.name));
 			const projectSummaries = (await Promise.all(preparedProjects.map(async (preparedProject, index) => {
 				if (preparedProject.kind === "summary") return {
 					index,
@@ -2493,10 +2771,10 @@ async function runVievalCli(options = {}) {
 						projectName: preparedProject.prepared.name,
 						scope: "workspace",
 						workspaceId: identity.workspaceId
-					}, async () => executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, telemetry, reporter, reporterCounters, eventRecorder.record, options)))
+					}, async () => executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, telemetry, runReporter, reporterCounters, eventRecorder.record, options)))
 				};
 			}))).sort((left, right) => left.index - right.index).map((item) => item.summary);
-			reporter.onRunEnd({
+			runReporter.onRunEnd({
 				failedTasks: reporterCounters.failedTasks,
 				passedTasks: reporterCounters.passedTasks,
 				skippedTasks: reporterCounters.skippedTasks + skippedSummaryTasks,
@@ -2522,7 +2800,7 @@ async function runVievalCli(options = {}) {
 		} catch (error) {
 			if (runError == null) runEndError = error;
 		}
-		reporter.dispose();
+		reporter?.dispose();
 		restoreEnvironment();
 	}
 	if (runError != null) throw runError;
@@ -2640,14 +2918,14 @@ const compareHelpText = `
     --output      Optional output artifact path
     --format      Console output format: table | json (default: table)
 `;
-function normalizeCliArgv$6(argv) {
+function normalizeCliArgv$5(argv) { // Strips a leading -- separator and then a leading compare subcommand token from argv.
 	const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv]; // both branches yield a new array, so argv itself is not mutated
 	if (normalizedArgv[0] === "compare") return normalizedArgv.slice(1);
 	return normalizedArgv;
 }
 function parseCompareCliArguments(argv) {
 	const cli = meow(compareHelpText, {
-		argv: normalizeCliArgv$6(argv),
+		argv: normalizeCliArgv$5(argv),
 		flags: {
 			config: { type: "string" },
 			comparison: { type: "string" },
@@ -2677,18 +2955,22 @@ async function runCompareCli(argv) {
 		cwd: parsed.cwd
 	});
 	const methodResults = [];
+	const reportRoot = await mkdtemp(join(tmpdir(), "vieval-compare-"));
 	for (const method of loaded.config.methods) {
 		const methodWorkspace = resolve(method.workspace);
+		const methodReportOut = join(reportRoot, method.id);
 		const output = await runVievalCli({
 			cacheProjectName: loaded.config.benchmark.sharedCaseNamespace,
 			configFilePath: method.configFilePath ?? resolve(methodWorkspace, "vieval.config.ts"),
 			cwd: methodWorkspace,
 			project: [method.project],
+			reportOut: methodReportOut,
 			workspace: loaded.config.benchmark.id
 		});
 		const failedProject = output.projects.find((project) => project.errorMessage != null);
 		if (failedProject != null) throw new Error(`Comparison method "${method.id}" failed: ${failedProject.errorMessage}`);
 		methodResults.push({
+			caseRecords: await readCaseRecordsFromReport(methodReportOut),
 			methodId: method.id,
 			output
 		});
@@ -2752,7 +3034,7 @@ const evalRunHelpText = `
     --report-out Report output root directory
     --json       Print machine-readable JSON output
 `;
-function normalizeCliArgv$5(argv) {
+function normalizeCliArgv$4(argv) { // Strips a leading -- separator and then a leading run subcommand token from argv.
 	const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv]; // both branches yield a new array, so argv itself is not mutated
 	return normalizedArgv[0] === "run" ? normalizedArgv.slice(1) : normalizedArgv;
 }
@@ -2775,7 +3057,7 @@ function normalizeProjectNames(projectNames) {
 */
 function parseCliArguments(argv) {
 	const cli = meow(evalRunHelpText, {
-		argv: normalizeCliArgv$5(argv),
+		argv: normalizeCliArgv$4(argv),
 		importMeta: import.meta,
 		flags: {
 			config: { type: "string" },
@@ -2892,7 +3174,7 @@ const reportAnalyzeHelpText = `
     --run-matrix   Keep runs matching run-matrix selector "key=value[,key=value]"
     --eval-matrix  Keep runs matching eval-matrix selector "key=value[,key=value]"
 `;
-function normalizeCliArgv$4(argv) {
+function normalizeCliArgv$3(argv) {
 	const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
 	if (normalizedArgv[0] === "report" && normalizedArgv[1] === "analyze") return normalizedArgv.slice(2);
 	if (normalizedArgv[0] === "analyze") return normalizedArgv.slice(1);
@@ -2900,7 +3182,7 @@ function normalizeCliArgv$4(argv) {
 }
 function parseReportAnalyzeCliArguments(argv) {
 	const cli = meow(reportAnalyzeHelpText, {
-		argv: normalizeCliArgv$4(argv),
+		argv: normalizeCliArgv$3(argv),
 		flags: {
 			attempt: { type: "string" },
 			caseState: { type: "string" },
@@ -3189,218 +3471,6 @@ async function runReportAnalyzeCli(argv) {
 	}
 }
 //#endregion
-//#region src/cli/report-cases.ts
-const reportCasesHelpText = `
-  Inspect normalized case records from generated vieval report artifacts.
-  Usage
-    $ vieval report cases <reportPath> [options]
-  Options
-    --format       Output format: table | json | jsonl (default: table)
-    --where        Equality filter "key=value"; repeatable
-    --group-by     Case field, score name, or metric name used for grouped score summaries
-`;
-/**
-* Reads normalized case records from one report run directory or report root.
-*
-* Use when:
-* - CLI tools need case-level inspection from local report artifacts
-* - callers may pass a run directory, a `cases.jsonl` file, or a report root
-*
-* Expects:
-* - discovered `cases.jsonl` files contain one `CaseRecord` JSON object per line
-*
-* Returns:
-* - all parsed case records sorted by discovered file path order
-*/
-async function readCaseRecordsFromReport(reportPath) {
-	const caseFilePaths = await resolveCaseRecordPaths(reportPath);
-	if (caseFilePaths.length === 0) throw new Error(`No cases.jsonl files found under "${resolve(reportPath)}".`);
-	const records = [];
-	for (const caseFilePath of caseFilePaths) {
-		const lines = readFileSync(caseFilePath, "utf-8").split("\n");
-		for (const [index, line] of lines.entries()) {
-			const trimmed = line.trim();
-			if (trimmed.length === 0) continue;
-			try {
-				records.push(JSON.parse(trimmed));
-			} catch (error) {
-				throw new Error(`Invalid cases.jsonl line ${index + 1} in "${caseFilePath}": ${errorMessageFrom(error) ?? "Unknown JSON parse failure."}`);
-			}
-		}
-	}
-	return records;
-}
-/**
-* Builds filtered case inspection output.
-*
-* Use when:
-* - `vieval report cases` needs deterministic JSON/table output
-* - tests need pure filtering and grouping behavior without process I/O
-*
-* Expects:
-* - `where` filters use `key=value`
-* - lookup keys may target direct case fields, score names, or metric names
-*
-* Returns:
-* - filtered records plus grouped score summaries when `groupBy` is present
-*/
-function buildReportCasesOutput(records, options) {
-	const whereFilters = (options.where ?? []).map(parseSelector);
-	const filteredRecords = records.filter((record) => matchesWhereFilters(record, whereFilters));
-	return {
-		groups: options.groupBy == null ? void 0 : buildCaseGroups(filteredRecords, options.groupBy),
-		records: [...filteredRecords]
-	};
-}
-/**
-* Runs the `vieval report cases` command.
-*
-* Call stack:
-*
-* published executable (`../bin/vieval`)
-*   -> {@link import('./index').runTopLevelCli}
-*     -> {@link runReportCasesCli}
-*       -> {@link readCaseRecordsFromReport}
-*
-* Use when:
-* - the top-level CLI dispatches local case artifact inspection
-*
-* Expects:
-* - argv is either `cases <reportPath> ...` or `<reportPath> ...`
-*
-* Returns:
-* - resolves after writing the requested output to stdout
-*/
-async function runReportCasesCli(argv) {
-	try {
-		const parsed = parseReportCasesCliArguments(argv);
-		const output = buildReportCasesOutput(await readCaseRecordsFromReport(parsed.reportPath), parsed);
-		if (parsed.format === "json") {
-			process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
-			return;
-		}
-		if (parsed.format === "jsonl") {
-			process.stdout.write(encodeJsonl(output.records));
-			return;
-		}
-		process.stdout.write(`${formatCasesTable(output)}\n`);
-	} catch (error) {
-		const errorMessage = errorMessageFrom(error) ?? "Unknown report cases failure.";
-		process.stderr.write(`[vieval report cases] ${errorMessage}\n`);
-		process.exitCode = 1;
-	}
-}
-function normalizeCliArgv$3(argv) {
-	const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
-	if (normalizedArgv[0] === "report" && normalizedArgv[1] === "cases") return normalizedArgv.slice(2);
-	if (normalizedArgv[0] === "cases") return normalizedArgv.slice(1);
-	return normalizedArgv;
-}
-function parseReportCasesCliArguments(argv) {
-	const cli = meow(reportCasesHelpText, {
-		argv: normalizeCliArgv$3(argv),
-		flags: {
-			format: {
-				default: "table",
-				type: "string"
-			},
-			groupBy: { type: "string" },
-			where: {
-				isMultiple: true,
-				type: "string"
-			}
-		},
-		importMeta: import.meta
-	});
-	const reportPath = cli.input[0];
-	if (reportPath == null || reportPath.length === 0) throw new Error("Missing required <reportPath> argument.");
-	return {
-		format: normalizeReportCasesFormat(cli.flags.format),
-		groupBy: cli.flags.groupBy,
-		reportPath,
-		where: cli.flags.where
-	};
-}
-function normalizeReportCasesFormat(value) {
-	const normalized = value.toLowerCase();
-	if (normalized === "json") return "json";
-	if (normalized === "jsonl") return "jsonl";
-	return "table";
-}
-async function resolveCaseRecordPaths(reportPath) {
-	const absoluteReportPath = resolve(reportPath);
-	const directCaseFilePath = resolve(absoluteReportPath, "cases.jsonl");
-	if (existsSync(absoluteReportPath) && absoluteReportPath.endsWith(".jsonl")) return [absoluteReportPath];
-	if (existsSync(directCaseFilePath)) return [directCaseFilePath];
-	return (await glob("**/cases.jsonl", {
-		absolute: true,
-		cwd: absoluteReportPath
-	})).sort((left, right) => left.localeCompare(right));
-}
-function matchesWhereFilters(record, whereFilters) {
-	return whereFilters.every((parsed) => {
-		const resolved = getCaseSelectorValue(record, parsed.key);
-		return resolved.exists && String(resolved.value) === parsed.value;
-	});
-}
-function parseSelector(selector) {
-	const separatorIndex = selector.indexOf("=");
-	if (separatorIndex <= 0 || separatorIndex === selector.length - 1) throw new Error(`Invalid selector "${selector}". Expected "key=value".`);
-	return {
-		key: selector.slice(0, separatorIndex).trim(),
-		value: selector.slice(separatorIndex + 1).trim()
-	};
-}
-function buildCaseGroups(records, groupBy) {
-	const groups = {};
-	for (const record of records) {
-		const resolved = getCaseSelectorValue(record, groupBy);
-		if (!resolved.exists) continue;
-		const groupKey = `${groupBy}=${String(resolved.value)}`;
-		groups[groupKey] ??= {
-			count: 0,
-			scores: {}
-		};
-		groups[groupKey].count += 1;
-		addScores(groups[groupKey].scores, record.scores);
-	}
-	return Object.fromEntries(Object.entries(groups).sort(([left], [right]) => left.localeCompare(right)).map(([groupKey, group]) => [groupKey, {
-		count: group.count,
-		scores: finalizeScores(group.scores)
-	}]));
-}
-function addScores(summary, scores) {
-	for (const [scoreName, value] of Object.entries(scores)) {
-		summary[scoreName] ??= {
-			average: 0,
-			count: 0,
-			sum: 0
-		};
-		summary[scoreName].count += 1;
-		summary[scoreName].sum += value;
-	}
-}
-function finalizeScores(summary) {
-	return Object.fromEntries(Object.entries(summary).sort(([left], [right]) => left.localeCompare(right)).map(([scoreName, bucket]) => [scoreName, {
-		average: bucket.count === 0 ? 0 : bucket.sum / bucket.count,
-		count: bucket.count,
-		sum: bucket.sum
-	}]));
-}
-function formatCasesTable(output) {
-	const lines = ["CASES  vieval report", `Case count ${output.records.length}`];
-	if (output.groups != null) {
-		lines.push("Groups");
-		for (const [groupKey, group] of Object.entries(output.groups)) {
-			const scoreText = Object.entries(group.scores).map(([scoreName, bucket]) => `${scoreName}=${bucket.average.toFixed(3)}`).join(" ");
-			lines.push(`${groupKey}  count=${group.count}${scoreText.length > 0 ? ` ${scoreText}` : ""}`);
-		}
-	}
-	return lines.join("\n");
-}
-//#endregion
 //#region src/cli/report-case-compare.ts
 const reportCompareHelpText = `
   Compare normalized case records from two generated vieval reports.
@@ -3842,4 +3912,4 @@ async function runTopLevelCli(argv) {
 //#endregion
 export { runTopLevelCli as n, parseTopLevelCliArguments as t };
-//# sourceMappingURL=cli-Dao25VxV.mjs.map
+//# sourceMappingURL=cli-DTDgaqeI.mjs.map