vieval 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/README.md +8 -5
  2. package/dist/cli/index.d.mts +1 -1
  3. package/dist/cli/index.mjs +1204 -61
  4. package/dist/cli/index.mjs.map +1 -1
  5. package/dist/{config-D2fe1SnT.mjs → config-CHN24egi.mjs} +1 -1
  6. package/dist/{config-D2fe1SnT.mjs.map → config-CHN24egi.mjs.map} +1 -1
  7. package/dist/config.d.mts +2 -3
  8. package/dist/config.mjs +2 -2
  9. package/dist/core/assertions/index.d.mts +1 -1
  10. package/dist/core/inference-executors/index.d.mts +1 -45
  11. package/dist/core/inference-executors/index.mjs +1 -38
  12. package/dist/core/inference-executors/index.mjs.map +1 -1
  13. package/dist/core/processors/results/index.d.mts +1 -1
  14. package/dist/core/runner/index.d.mts +2 -2
  15. package/dist/core/runner/index.mjs +2 -2
  16. package/dist/env-C7X81PWa.mjs +41 -0
  17. package/dist/env-C7X81PWa.mjs.map +1 -0
  18. package/dist/env-DtpjACOW.d.mts +47 -0
  19. package/dist/expect-B2vaoRVZ.d.mts +10 -0
  20. package/dist/{expect-i9WZWGrA.mjs → expect-CaXiUkwY.mjs} +3 -3
  21. package/dist/expect-CaXiUkwY.mjs.map +1 -0
  22. package/dist/expect-extensions-BOzwV5EJ.mjs +197 -0
  23. package/dist/expect-extensions-BOzwV5EJ.mjs.map +1 -0
  24. package/dist/expect.d.mts +1 -1
  25. package/dist/expect.mjs +1 -1
  26. package/dist/{index-DP7jsORl.d.mts → index-BDMEAmf2.d.mts} +246 -3
  27. package/dist/{index-oSXhM1zx.d.mts → index-C3gPFmcR.d.mts} +2 -2
  28. package/dist/index.d.mts +326 -6
  29. package/dist/index.mjs +65 -23
  30. package/dist/index.mjs.map +1 -1
  31. package/dist/{models-D_MsBtYw.mjs → models-DIGdOUpJ.mjs} +1 -1
  32. package/dist/{models-D_MsBtYw.mjs.map → models-DIGdOUpJ.mjs.map} +1 -1
  33. package/dist/plugins/chat-models/index.d.mts +465 -6
  34. package/dist/plugins/chat-models/index.mjs +469 -6
  35. package/dist/plugins/chat-models/index.mjs.map +1 -1
  36. package/dist/{registry-ChOjjdEC.mjs → registry-CHJcTN2W.mjs} +75 -16
  37. package/dist/registry-CHJcTN2W.mjs.map +1 -0
  38. package/dist/{runner-4ZsOveoY.mjs → runner-Dpy-eivM.mjs} +177 -21
  39. package/dist/runner-Dpy-eivM.mjs.map +1 -0
  40. package/dist/testing/expect-extensions.d.mts +44 -38
  41. package/dist/testing/expect-extensions.mjs +1 -1
  42. package/package.json +11 -4
  43. package/dist/expect-0jPJ7Zio.d.mts +0 -2318
  44. package/dist/expect-extensions-CwPtgTz8.mjs +0 -13471
  45. package/dist/expect-extensions-CwPtgTz8.mjs.map +0 -1
  46. package/dist/expect-i9WZWGrA.mjs.map +0 -1
  47. package/dist/magic-string.es-CH1jwzMg.mjs +0 -1013
  48. package/dist/magic-string.es-CH1jwzMg.mjs.map +0 -1
  49. package/dist/plugin-DVaRZY2x.d.mts +0 -84
  50. package/dist/registry-ChOjjdEC.mjs.map +0 -1
  51. package/dist/runner-4ZsOveoY.mjs.map +0 -1
@@ -1,18 +1,196 @@
1
1
  #!/usr/bin/env node
2
- import { n as consumeModuleRegistrations, r as endModuleRegistration, s as loadVievalCliConfig, t as beginModuleRegistration } from "../registry-ChOjjdEC.mjs";
3
- import { a as runScheduledTasks, i as RunnerExecutionError, n as createRunnerSchedule, r as createRunnerRuntimeContext, s as collectEvalEntries, t as createTaskExecutionContext } from "../runner-4ZsOveoY.mjs";
2
+ import { c as loadRawVievalConfig, l as loadVievalCliConfig, n as consumeModuleRegistrations, o as detectCliConfigMode, r as endModuleRegistration, t as beginModuleRegistration } from "../registry-CHJcTN2W.mjs";
3
+ import { a as runScheduledTasks, i as RunnerExecutionError, l as createFilesystemTaskCacheRuntime, n as createRunnerSchedule, r as createRunnerRuntimeContext, s as collectEvalEntries, t as createTaskExecutionContext } from "../runner-Dpy-eivM.mjs";
4
4
  import process from "node:process";
5
- import path from "node:path";
5
+ import { access, mkdir, writeFile } from "node:fs/promises";
6
+ import path, { basename, dirname, isAbsolute, join, relative, resolve } from "node:path";
6
7
  import { fileURLToPath, pathToFileURL } from "node:url";
7
8
  import { errorMessageFrom } from "@moeru/std";
8
- import c from "tinyrainbow";
9
+ import { existsSync, readFileSync, realpathSync } from "node:fs";
9
10
  import meow from "meow";
10
- import { uniq } from "es-toolkit";
11
11
  import { glob } from "tinyglobby";
12
+ import { randomUUID } from "node:crypto";
13
+ import c from "tinyrainbow";
14
+ import { uniq } from "es-toolkit";
15
+ import { createVitest } from "vitest/node";
12
16
  import { stripVTControlCharacters } from "node:util";
13
17
  import stringWidth from "fast-string-width";
14
- //#region package.json
15
- var name = "vieval";
18
+ //#region src/cli/comparison-config.ts
19
+ const supportedWorkspaceConfigFileNames = [
20
+ "vieval.config.ts",
21
+ "vieval.config.mts",
22
+ "vieval.config.cts",
23
+ "vieval.config.js",
24
+ "vieval.config.mjs",
25
+ "vieval.config.cjs",
26
+ "vieval.config.json"
27
+ ];
28
+ async function isReadableFile(filePath) {
29
+ try {
30
+ await access(filePath);
31
+ return true;
32
+ } catch {
33
+ return false;
34
+ }
35
+ }
36
+ function normalizeGlobInput(patterns) {
37
+ if (patterns == null) return [];
38
+ return (typeof patterns === "string" ? [patterns] : patterns).map((pattern) => pattern.trim()).filter((pattern) => pattern.length > 0);
39
+ }
40
+ function normalizeMethodShape(method, configDirectory, index) {
41
+ const id = method.id.trim();
42
+ const workspace = method.workspace.trim();
43
+ const project = method.project.trim();
44
+ const configFilePath = method.configFilePath?.trim();
45
+ if (id.length === 0) throw new Error(`Comparison method #${index + 1} is missing id.`);
46
+ if (workspace.length === 0) throw new Error(`Comparison method "${id}" is missing workspace.`);
47
+ if (project.length === 0) throw new Error(`Comparison method "${id}" is missing project.`);
48
+ const resolvedWorkspace = isAbsolute(workspace) ? workspace : resolve(configDirectory, workspace);
49
+ return {
50
+ configFilePath: configFilePath == null || configFilePath.length === 0 ? void 0 : isAbsolute(configFilePath) ? configFilePath : resolve(configDirectory, configFilePath),
51
+ id,
52
+ project,
53
+ workspace: resolvedWorkspace
54
+ };
55
+ }
56
+ async function findWorkspaceConfigFile(workspaceDirectory) {
57
+ for (const fileName of supportedWorkspaceConfigFileNames) {
58
+ const candidate = join(workspaceDirectory, fileName);
59
+ if (await isReadableFile(candidate)) return candidate;
60
+ }
61
+ return null;
62
+ }
63
+ function createDiscoveredMethodId(configDirectory, workspace, projectName) {
64
+ const relativeWorkspace = relative(configDirectory, workspace);
65
+ return `${(relativeWorkspace.length > 0 ? relativeWorkspace : basename(workspace)).replaceAll("\\", "/")}:${projectName}`;
66
+ }
67
+ async function discoverMethodsFromWorkspaceGlobs(args) {
68
+ const includes = normalizeGlobInput(args.comparison.includesWorkspaces);
69
+ if (includes.length === 0) return [];
70
+ const discoveredWorkspaceDirectories = await glob(includes, {
71
+ absolute: true,
72
+ cwd: args.configDirectory,
73
+ ignore: normalizeGlobInput(args.comparison.excludesWorkspaces),
74
+ onlyDirectories: true
75
+ });
76
+ const methods = [];
77
+ for (const workspaceDirectory of discoveredWorkspaceDirectories.sort((left, right) => left.localeCompare(right))) {
78
+ const configFilePath = await findWorkspaceConfigFile(workspaceDirectory);
79
+ if (configFilePath == null) continue;
80
+ const loadedWorkspaceConfig = await loadVievalCliConfig({
81
+ configFilePath,
82
+ cwd: workspaceDirectory
83
+ });
84
+ for (const project of loadedWorkspaceConfig.projects) methods.push({
85
+ configFilePath,
86
+ id: createDiscoveredMethodId(args.configDirectory, workspaceDirectory, project.name),
87
+ project: project.name,
88
+ workspace: workspaceDirectory
89
+ });
90
+ }
91
+ return methods;
92
+ }
93
+ function validateMethodIdsAreUnique(methods) {
94
+ const methodIds = methods.map((method) => method.id);
95
+ const duplicatedMethodId = methodIds.find((methodId, index) => methodIds.indexOf(methodId) !== index);
96
+ if (duplicatedMethodId != null) throw new Error(`Duplicate comparison method id "${duplicatedMethodId}".`);
97
+ }
98
+ function assertComparisonMode(config) {
99
+ const mode = detectCliConfigMode(config);
100
+ if (mode !== "comparisons") throw new Error(`Expected comparison-mode config, but received ${mode}-mode config.`);
101
+ }
102
+ function selectComparisonConfig(comparisons, comparisonId) {
103
+ if (comparisons.length === 0) throw new Error("Comparison config requires at least one comparisons entry.");
104
+ if (comparisonId == null || comparisonId.trim().length === 0) {
105
+ if (comparisons.length > 1) throw new Error(`Multiple comparisons found. Provide --comparison. Available ids: ${comparisons.map((item) => item.id).join(", ")}`);
106
+ return comparisons[0];
107
+ }
108
+ const selected = comparisons.find((item) => item.id === comparisonId);
109
+ if (selected == null) throw new Error(`Unknown comparison id "${comparisonId}".`);
110
+ return selected;
111
+ }
112
+ function normalizeBenchmark(comparison) {
113
+ const benchmarkId = comparison.benchmark.id.trim();
114
+ const sharedCaseNamespace = comparison.benchmark.sharedCaseNamespace.trim();
115
+ if (benchmarkId.length === 0) throw new Error("Comparison config requires benchmark.id.");
116
+ if (sharedCaseNamespace.length === 0) throw new Error("Comparison config requires benchmark.sharedCaseNamespace.");
117
+ return {
118
+ id: benchmarkId,
119
+ sharedCaseNamespace
120
+ };
121
+ }
122
+ /**
123
+ * Loads and validates comparison-mode data from `vieval.config.*`.
124
+ */
125
+ async function loadVievalComparisonConfig(options = {}) {
126
+ const cwd = options.cwd ?? process.cwd();
127
+ try {
128
+ const loaded = await loadRawVievalConfig({
129
+ configFilePath: options.configFilePath,
130
+ cwd
131
+ });
132
+ if (loaded.configFilePath == null || loaded.config == null) throw new Error("Failed to find vieval config. Expected vieval.config.*");
133
+ assertComparisonMode(loaded.config);
134
+ const selectedComparison = selectComparisonConfig(loaded.config.comparisons, options.comparisonId);
135
+ const configDirectory = dirname(loaded.configFilePath);
136
+ const explicitMethods = (selectedComparison.methods ?? []).map((method, index) => normalizeMethodShape(method, configDirectory, index));
137
+ const discoveredMethods = await discoverMethodsFromWorkspaceGlobs({
138
+ comparison: selectedComparison,
139
+ configDirectory
140
+ });
141
+ const methods = [...explicitMethods, ...discoveredMethods];
142
+ if (methods.length === 0) throw new Error("Comparison config resolved zero methods. Configure methods or includesWorkspaces.");
143
+ validateMethodIdsAreUnique(methods);
144
+ return {
145
+ config: {
146
+ benchmark: normalizeBenchmark(selectedComparison),
147
+ methods
148
+ },
149
+ configFilePath: loaded.configFilePath
150
+ };
151
+ } catch (error) {
152
+ const errorMessage = errorMessageFrom(error) ?? "Unknown comparison config loading error.";
153
+ const resolvedPath = options.configFilePath ?? "vieval.config";
154
+ throw new Error(`Failed to load comparison config "${resolvedPath}": ${errorMessage}`);
155
+ }
156
+ }
157
+ //#endregion
158
+ //#region src/cli/report-compare.ts
159
+ /**
160
+ * Builds a compact compare report sorted by hybrid/exact score.
161
+ */
162
+ function buildCompareReportArtifact(args) {
163
+ const rows = args.methods.map((method) => {
164
+ const overall = method.output.projects[0]?.result?.overall;
165
+ return {
166
+ exactAverage: overall?.exactAverage ?? null,
167
+ hybridAverage: overall?.hybridAverage ?? null,
168
+ methodId: method.methodId,
169
+ runCount: overall?.runCount ?? 0
170
+ };
171
+ });
172
+ rows.sort((left, right) => {
173
+ const leftHybrid = left.hybridAverage ?? Number.NEGATIVE_INFINITY;
174
+ const rightHybrid = right.hybridAverage ?? Number.NEGATIVE_INFINITY;
175
+ if (leftHybrid !== rightHybrid) return rightHybrid - leftHybrid;
176
+ const leftExact = left.exactAverage ?? Number.NEGATIVE_INFINITY;
177
+ return (right.exactAverage ?? Number.NEGATIVE_INFINITY) - leftExact;
178
+ });
179
+ return {
180
+ benchmarkId: args.benchmarkId,
181
+ methods: rows,
182
+ reportPath: args.reportPath
183
+ };
184
+ }
185
+ /**
186
+ * Writes compare report artifact as JSON.
187
+ */
188
+ async function writeCompareReportArtifact(args) {
189
+ const outputPath = resolve(args.outputPath);
190
+ await mkdir(dirname(outputPath), { recursive: true });
191
+ await writeFile(outputPath, `${JSON.stringify(args.artifact, null, 2)}\n`, "utf-8");
192
+ return outputPath;
193
+ }
16
194
  //#endregion
17
195
  //#region src/cli/discovery.ts
18
196
  /**
@@ -33,6 +211,58 @@ async function discoverEvalFiles(options) {
33
211
  })).sort((left, right) => left.localeCompare(right));
34
212
  }
35
213
  //#endregion
214
+ //#region src/cli/module-runtime.ts
215
+ /**
216
+ * Loads eval modules and returns a normalized eval-module map.
217
+ *
218
+ * Use when:
219
+ * - CLI collection needs Vite/Vitest-powered module resolution and transforms
220
+ * - eval files should be imported with the same runtime semantics as Vitest
221
+ *
222
+ * Expects:
223
+ * - `projectRoot` points at the project that owns the eval files
224
+ * - each `evalFilePaths` entry is an absolute file path
225
+ *
226
+ * Returns:
227
+ * - eval modules keyed by stable file href + optional registration suffixes
228
+ */
229
+ async function loadEvalModulesWithVitestRuntime(evalFilePaths, projectRoot) {
230
+ const loadedModules = {};
231
+ const runtime = await createVitest("test", {
232
+ config: false,
233
+ root: projectRoot,
234
+ run: false,
235
+ silent: true,
236
+ watch: false
237
+ });
238
+ try {
239
+ for (const evalFilePath of evalFilePaths) {
240
+ const moduleHref = pathToFileURL(evalFilePath).href;
241
+ beginModuleRegistration(moduleHref);
242
+ try {
243
+ const moduleValue = await runtime.import(moduleHref);
244
+ const registeredDefinitions = consumeModuleRegistrations(moduleHref);
245
+ const defaultDefinition = moduleValue.default;
246
+ const definitions = [...registeredDefinitions, ...defaultDefinition == null ? [] : [defaultDefinition]];
247
+ const deduplicatedDefinitions = definitions.filter((definition, index) => {
248
+ const key = `${definition.name}::${definition.description}::${definition.task?.id ?? ""}`;
249
+ return definitions.findIndex((candidate) => `${candidate.name}::${candidate.description}::${candidate.task?.id ?? ""}` === key) === index;
250
+ });
251
+ if (deduplicatedDefinitions.length === 0) continue;
252
+ for (const [definitionIndex, definition] of deduplicatedDefinitions.entries()) {
253
+ const moduleKey = definitionIndex === 0 ? moduleHref : `${moduleHref}#registration-${definitionIndex + 1}`;
254
+ loadedModules[moduleKey] = { default: definition };
255
+ }
256
+ } finally {
257
+ endModuleRegistration();
258
+ }
259
+ }
260
+ } finally {
261
+ await runtime.close();
262
+ }
263
+ return loadedModules;
264
+ }
265
+ //#endregion
36
266
  //#region src/cli/reporters/noop-reporter.ts
37
267
  /**
38
268
  * Creates a reporter that intentionally does nothing.
@@ -684,7 +914,144 @@ function getTextDisplayWidth(text) {
684
914
  return stringWidth(stripVTControlCharacters(text));
685
915
  }
686
916
  //#endregion
917
+ //#region src/cli/reporters/vitest-compat-reporter.ts
918
+ function isReporterReferenceTuple(reference) {
919
+ return Array.isArray(reference);
920
+ }
921
+ function isAbsoluteLikePath(value) {
922
+ return value.startsWith("/") || value.startsWith("./") || value.startsWith("../") || /^[A-Z]:[\\/]/i.test(value);
923
+ }
924
+ async function loadReporterModule(path) {
925
+ if (isAbsoluteLikePath(path)) return import(pathToFileURL(path).href);
926
+ return import(path);
927
+ }
928
+ function normalizeReporterReference(reference) {
929
+ if (isReporterReferenceTuple(reference)) return {
930
+ options: reference[1],
931
+ value: reference[0]
932
+ };
933
+ return {
934
+ options: void 0,
935
+ value: reference
936
+ };
937
+ }
938
+ function createReporterInstance(moduleValue, options) {
939
+ const value = moduleValue.default ?? moduleValue;
940
+ if (value == null) return null;
941
+ if (typeof value === "function") return new value(options);
942
+ if (typeof value === "object") return value;
943
+ return null;
944
+ }
945
+ async function emitToReporters(reporters, callback) {
946
+ await Promise.all(reporters.map(async (reporter) => {
947
+ try {
948
+ await callback(reporter);
949
+ } catch {}
950
+ }));
951
+ }
952
+ /**
953
+ * Creates a project-level vitest-compatible reporter bridge.
954
+ *
955
+ * Use when:
956
+ * - `vieval` should reuse vitest-like reporter callbacks without changing CLI output contracts
957
+ *
958
+ * Expects:
959
+ * - references point to modules whose default export is a reporter instance or constructor
960
+ *
961
+ * Returns:
962
+ * - `null` when no reporter references are configured
963
+ */
964
+ async function createVievalVitestCompatReporterBridge(options) {
965
+ if (options.references.length === 0) return null;
966
+ const loadedReporters = [];
967
+ for (const reference of options.references) {
968
+ const normalized = normalizeReporterReference(reference);
969
+ try {
970
+ const instance = createReporterInstance(typeof normalized.value === "string" ? await loadReporterModule(normalized.value) : normalized.value, normalized.options);
971
+ if (instance != null) loadedReporters.push(instance);
972
+ } catch {}
973
+ }
974
+ if (loadedReporters.length === 0) return null;
975
+ const modulesByTaskId = /* @__PURE__ */ new Map();
976
+ const casesByCompositeId = /* @__PURE__ */ new Map();
977
+ function getOrCreateModule(taskId) {
978
+ const existing = modulesByTaskId.get(taskId);
979
+ if (existing != null) return existing;
980
+ const created = {
981
+ id: taskId,
982
+ name: taskId,
983
+ projectName: options.projectName
984
+ };
985
+ modulesByTaskId.set(taskId, created);
986
+ return created;
987
+ }
988
+ function getOrCreateCase(taskId, caseId) {
989
+ const compositeId = `${taskId}::${caseId}`;
990
+ const existing = casesByCompositeId.get(compositeId);
991
+ if (existing != null) return existing;
992
+ const created = {
993
+ id: caseId,
994
+ module: getOrCreateModule(taskId),
995
+ name: caseId,
996
+ state: "pending"
997
+ };
998
+ casesByCompositeId.set(compositeId, created);
999
+ return created;
1000
+ }
1001
+ return {
1002
+ async onCaseEnd(payload) {
1003
+ const taskCase = getOrCreateCase(payload.taskId, payload.caseId);
1004
+ taskCase.state = payload.state;
1005
+ await emitToReporters(loadedReporters, (reporter) => reporter.onTestCaseResult?.(taskCase));
1006
+ },
1007
+ async onCaseStart(payload) {
1008
+ const taskCase = getOrCreateCase(payload.taskId, payload.caseId);
1009
+ await emitToReporters(loadedReporters, (reporter) => reporter.onTestCaseReady?.(taskCase));
1010
+ },
1011
+ async onRunEnd(run) {
1012
+ const modules = [...modulesByTaskId.values()];
1013
+ const errors = run.failed ? [{ message: "vieval run failed" }] : [];
1014
+ await emitToReporters(loadedReporters, (reporter) => reporter.onTestRunEnd?.(modules, errors, run.failed ? "failed" : "passed"));
1015
+ },
1016
+ async onRunStart() {
1017
+ const specifications = [...modulesByTaskId.values()].map((module) => ({
1018
+ moduleId: module.id,
1019
+ projectName: module.projectName
1020
+ }));
1021
+ await emitToReporters(loadedReporters, (reporter) => reporter.onTestRunStart?.(specifications));
1022
+ },
1023
+ async onTaskEnd(payload) {
1024
+ const module = getOrCreateModule(payload.taskId);
1025
+ if (payload.state === "failed") {
1026
+ const syntheticCase = getOrCreateCase(payload.taskId, `${payload.taskId}:task`);
1027
+ syntheticCase.state = "failed";
1028
+ await emitToReporters(loadedReporters, (reporter) => reporter.onTestCaseResult?.(syntheticCase));
1029
+ }
1030
+ await emitToReporters(loadedReporters, (reporter) => reporter.onTestModuleEnd?.(module));
1031
+ },
1032
+ async onTaskQueued(payload) {
1033
+ const module = getOrCreateModule(payload.taskId);
1034
+ await emitToReporters(loadedReporters, (reporter) => reporter.onTestModuleQueued?.(module));
1035
+ await emitToReporters(loadedReporters, (reporter) => reporter.onTestModuleCollected?.(module));
1036
+ },
1037
+ async onTaskStart(payload) {
1038
+ const module = getOrCreateModule(payload.taskId);
1039
+ await emitToReporters(loadedReporters, (reporter) => reporter.onTestModuleStart?.(module));
1040
+ }
1041
+ };
1042
+ }
1043
+ //#endregion
687
1044
  //#region src/cli/run.ts
1045
+ /**
1046
+ * Returns true when output contains at least one failing project/task/case outcome.
1047
+ */
1048
+ function hasRunFailures(output) {
1049
+ return output.projects.some((project) => {
1050
+ if (project.errorMessage != null) return true;
1051
+ if (project.caseSummary != null && project.caseSummary.failed > 0) return true;
1052
+ return (project.caseFailures?.length ?? 0) > 0;
1053
+ });
1054
+ }
688
1055
  function shouldUseColor() {
689
1056
  if (process.env.NO_COLOR != null) return false;
690
1057
  const forceColor = process.env.FORCE_COLOR;
@@ -733,37 +1100,94 @@ function formatDuration(durationMs, colors) {
733
1100
  const rounded = Math.round(durationMs);
734
1101
  return (rounded > 1e3 ? colors.yellow : colors.green)(` ${rounded}${colors.dim("ms")}`);
735
1102
  }
736
- async function loadEvalModules(evalFilePaths) {
737
- const loadedModules = {};
738
- for (const [moduleIndex, evalFilePath] of evalFilePaths.entries()) {
739
- const moduleHref = pathToFileURL(evalFilePath).href;
740
- const importHref = `${moduleHref}?vieval_load=${Date.now()}_${moduleIndex}`;
741
- beginModuleRegistration(importHref);
742
- try {
743
- const moduleValue = await import(importHref);
744
- const registeredDefinitions = consumeModuleRegistrations(importHref);
745
- const defaultDefinition = moduleValue.default;
746
- const definitions = [...registeredDefinitions, ...defaultDefinition == null ? [] : [defaultDefinition]];
747
- const deduplicatedDefinitions = definitions.filter((definition, index) => {
748
- const key = `${definition.name}::${definition.description}::${definition.task?.id ?? ""}`;
749
- return definitions.findIndex((candidate) => `${candidate.name}::${candidate.description}::${candidate.task?.id ?? ""}` === key) === index;
750
- });
751
- if (deduplicatedDefinitions.length === 0) continue;
752
- for (const [definitionIndex, definition] of deduplicatedDefinitions.entries()) {
753
- const moduleKey = definitionIndex === 0 ? moduleHref : `${moduleHref}#registration-${definitionIndex + 1}`;
754
- loadedModules[moduleKey] = { default: definition };
755
- }
756
- } finally {
757
- endModuleRegistration();
758
- }
759
- }
760
- return loadedModules;
761
- }
762
1103
  function filterProjectsByName(projects, names) {
763
1104
  if (names.length === 0) return [...projects];
764
1105
  const nameSet = new Set(names);
765
1106
  return projects.filter((project) => nameSet.has(project.name));
766
1107
  }
1108
+ function sanitizeIdentitySegment(value) {
1109
+ const normalized = value.trim();
1110
+ if (normalized.length === 0) return "default";
1111
+ return normalized.replace(/[^\w.-]+/g, "-");
1112
+ }
1113
+ function createRunIdentity(options) {
1114
+ const workspaceId = sanitizeIdentitySegment(options.workspace ?? "default-workspace");
1115
+ const experimentId = sanitizeIdentitySegment(options.experiment ?? "default-experiment");
1116
+ return {
1117
+ attemptId: sanitizeIdentitySegment(options.attempt ?? `attempt-${(/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-")}`),
1118
+ experimentId,
1119
+ runId: `run-${Date.now()}-${randomUUID().slice(0, 8)}`,
1120
+ workspaceId
1121
+ };
1122
+ }
1123
+ function deriveReportProjectId(output) {
1124
+ const uniqueProjectNames = [...new Set(output.projects.map((project) => project.name))];
1125
+ if (uniqueProjectNames.length === 1) return sanitizeIdentitySegment(uniqueProjectNames[0] ?? "default-project");
1126
+ return "multi-project";
1127
+ }
1128
+ function createEventRecorder(identity) {
1129
+ const events = [];
1130
+ const taskProjectMap = /* @__PURE__ */ new Map();
1131
+ return {
1132
+ events,
1133
+ record(event, payload, metadata) {
1134
+ const maybeTaskPayload = payload;
1135
+ const taskId = metadata?.taskId ?? maybeTaskPayload?.taskId;
1136
+ const caseId = metadata?.caseId ?? payload?.caseId;
1137
+ const projectName = metadata?.projectName ?? maybeTaskPayload?.projectName;
1138
+ if (taskId != null && projectName != null) taskProjectMap.set(taskId, projectName);
1139
+ events.push({
1140
+ attemptId: identity.attemptId,
1141
+ caseId,
1142
+ data: payload,
1143
+ event,
1144
+ experimentId: identity.experimentId,
1145
+ projectId: taskId == null ? void 0 : taskProjectMap.get(taskId),
1146
+ runId: identity.runId,
1147
+ schemaVersion: 1,
1148
+ taskId,
1149
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1150
+ version: 1,
1151
+ workspaceId: identity.workspaceId
1152
+ });
1153
+ }
1154
+ };
1155
+ }
1156
+ function createReporterWithEventCapture(reporter, recordEvent) {
1157
+ return {
1158
+ dispose() {
1159
+ reporter.dispose();
1160
+ },
1161
+ onCaseEnd(payload) {
1162
+ recordEvent("CaseEnded", payload);
1163
+ reporter.onCaseEnd(payload);
1164
+ },
1165
+ onCaseStart(payload) {
1166
+ recordEvent("CaseStarted", payload);
1167
+ reporter.onCaseStart(payload);
1168
+ },
1169
+ onRunEnd(payload) {
1170
+ recordEvent("RunEnded", payload);
1171
+ reporter.onRunEnd(payload);
1172
+ },
1173
+ onRunStart(payload) {
1174
+ recordEvent("RunStarted", payload);
1175
+ reporter.onRunStart(payload);
1176
+ },
1177
+ onTaskEnd(payload) {
1178
+ recordEvent("TaskEnded", payload);
1179
+ reporter.onTaskEnd(payload);
1180
+ },
1181
+ onTaskQueued(payload) {
1182
+ recordEvent("TaskQueued", payload);
1183
+ reporter.onTaskQueued(payload);
1184
+ },
1185
+ onTaskStart(payload) {
1186
+ recordEvent("TaskStarted", payload);
1187
+ reporter.onTaskStart(payload);
1188
+ }
1189
+ };
1190
+ }
767
1191
  function applyRunEnvironment(env) {
768
1192
  const envEntries = Object.entries(env);
769
1193
  if (envEntries.length === 0) return () => {};
@@ -876,7 +1300,7 @@ function createTaskQueuePayload(task, projectName) {
876
1300
  function createTaskCaseReporterId(payload) {
877
1301
  return `${payload.index}:${encodeURIComponent(payload.name)}`;
878
1302
  }
879
- function createTaskReporterHooks(task, reporter, projectCaseCounters) {
1303
+ function createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
880
1304
  function syncCaseTotal(total) {
881
1305
  reporter.onTaskQueued({
882
1306
  taskId: task.id,
@@ -896,8 +1320,21 @@ function createTaskReporterHooks(task, reporter, projectCaseCounters) {
896
1320
  }
897
1321
  }
898
1322
  syncCaseTotal(payload.total);
1323
+ if (payload.state === "failed" && payload.errorMessage != null && projectCaseFailures != null) projectCaseFailures.push({
1324
+ caseId,
1325
+ caseName: payload.name,
1326
+ errorMessage: payload.errorMessage,
1327
+ taskId: task.id
1328
+ });
899
1329
  reporter.onCaseEnd({
900
1330
  caseId,
1331
+ errorMessage: payload.errorMessage,
1332
+ state: payload.state,
1333
+ taskId: task.id
1334
+ });
1335
+ vitestCompatReporter?.onCaseEnd({
1336
+ caseId,
1337
+ errorMessage: payload.errorMessage,
901
1338
  state: payload.state,
902
1339
  taskId: task.id
903
1340
  });
@@ -910,32 +1347,49 @@ function createTaskReporterHooks(task, reporter, projectCaseCounters) {
910
1347
  caseName: payload.name,
911
1348
  taskId: task.id
912
1349
  });
1350
+ vitestCompatReporter?.onCaseStart({
1351
+ caseId,
1352
+ taskId: task.id
1353
+ });
1354
+ },
1355
+ onEvent(payload) {
1356
+ recordEvent(payload.event, payload.data, {
1357
+ caseId: payload.caseId,
1358
+ projectName,
1359
+ taskId: task.id
1360
+ });
913
1361
  }
914
1362
  };
915
1363
  }
916
- function createCliTaskExecutionContext(task, models, reporter, projectCaseCounters) {
1364
+ function createCliTaskExecutionContext(task, models, cacheRootDirectory, cacheProjectName, workspaceId, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
917
1365
  return {
918
1366
  ...createTaskExecutionContext({
1367
+ cache: createFilesystemTaskCacheRuntime({
1368
+ cacheRootDirectory,
1369
+ projectName: cacheProjectName,
1370
+ workspaceId
1371
+ }),
919
1372
  models,
920
1373
  task
921
1374
  }),
922
- reporterHooks: createTaskReporterHooks(task, reporter, projectCaseCounters)
1375
+ reporterHooks: createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter)
923
1376
  };
924
1377
  }
925
- function resolveTaskReporterHooks(task, context, reporter, projectCaseCounters) {
926
- return context.reporterHooks ?? createTaskReporterHooks(task, reporter, projectCaseCounters);
1378
+ function resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
1379
+ return context.reporterHooks ?? createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
927
1380
  }
928
1381
  function getFailedTaskId(error) {
929
1382
  if (error instanceof RunnerExecutionError) return error.taskId;
930
1383
  return null;
931
1384
  }
932
- function createAutoTaskExecutor(reporter, projectCaseCounters) {
1385
+ function createAutoTaskExecutor(reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
933
1386
  return async (task, context) => {
934
1387
  const taskDefinition = task.entry.task;
935
1388
  if (taskDefinition == null) throw new Error(`Missing eval task definition for entry "${task.entry.id}".`);
936
1389
  const output = await taskDefinition.run({
1390
+ cache: context.cache,
937
1391
  model: context.model,
938
- reporterHooks: resolveTaskReporterHooks(task, context, reporter, projectCaseCounters),
1392
+ reporterHooks: resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
939
1393
  task
940
1394
  });
941
1395
  return {
@@ -985,7 +1439,7 @@ async function prepareProject(project) {
985
1439
  include: project.include,
986
1440
  root: project.root
987
1441
  });
988
- const entries = collectEvalEntries(await loadEvalModules(evalFilePaths), runtimeContext);
1442
+ const entries = collectEvalEntries(await loadEvalModulesWithVitestRuntime(evalFilePaths, project.root), runtimeContext);
989
1443
  const tasks = createRunnerSchedule({
990
1444
  evalMatrix: project.evalMatrix,
991
1445
  entries,
@@ -997,6 +1451,7 @@ async function prepareProject(project) {
997
1451
  kind: "summary",
998
1452
  summary: {
999
1453
  caseSummary: null,
1454
+ caseFailures: [],
1000
1455
  discoveredEvalFileCount: evalFilePaths.length,
1001
1456
  durationMs: Date.now() - startedAt,
1002
1457
  entryCount: entries.length,
@@ -1024,6 +1479,7 @@ async function prepareProject(project) {
1024
1479
  kind: "summary",
1025
1480
  summary: {
1026
1481
  caseSummary: null,
1482
+ caseFailures: [],
1027
1483
  discoveredEvalFileCount: 0,
1028
1484
  durationMs: Date.now() - startedAt,
1029
1485
  entryCount: 0,
@@ -1037,7 +1493,7 @@ async function prepareProject(project) {
1037
1493
  };
1038
1494
  }
1039
1495
  }
1040
- async function executePreparedProject(prepared, reporter, counters) {
1496
+ async function executePreparedProject(prepared, identity, cacheProjectName, reporter, counters, recordEvent) {
1041
1497
  const settledTaskIds = /* @__PURE__ */ new Set();
1042
1498
  const projectCaseCounters = {
1043
1499
  failed: 0,
@@ -1045,17 +1501,24 @@ async function executePreparedProject(prepared, reporter, counters) {
1045
1501
  seenCaseIds: /* @__PURE__ */ new Set(),
1046
1502
  skipped: 0
1047
1503
  };
1048
- const rawTaskExecutor = prepared.project.executor ?? createAutoTaskExecutor(reporter, projectCaseCounters);
1504
+ const projectCaseFailures = [];
1505
+ const vitestCompatReporter = await createVievalVitestCompatReporterBridge({
1506
+ projectName: prepared.name,
1507
+ references: prepared.project.reporters
1508
+ });
1509
+ const rawTaskExecutor = prepared.project.executor ?? createAutoTaskExecutor(reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
1049
1510
  const taskExecutor = async (task, context) => {
1050
1511
  return {
1051
1512
  ...await rawTaskExecutor(task, context),
1052
1513
  matrix: cloneScheduledTaskMatrix(task)
1053
1514
  };
1054
1515
  };
1516
+ for (const task of prepared.tasks) await vitestCompatReporter?.onTaskQueued({ taskId: task.id });
1517
+ await vitestCompatReporter?.onRunStart();
1055
1518
  try {
1056
1519
  const aggregated = await runScheduledTasks(prepared.tasks, taskExecutor, {
1057
1520
  createExecutionContext(task) {
1058
- return createCliTaskExecutionContext(task, prepared.project.models, reporter, projectCaseCounters);
1521
+ return createCliTaskExecutionContext(task, prepared.project.models, resolve(prepared.project.root, ".vieval", "cache"), cacheProjectName ?? prepared.name, identity.workspaceId, reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
1059
1522
  },
1060
1523
  onTaskEnd(task, state) {
1061
1524
  settledTaskIds.add(task.id);
@@ -1063,6 +1526,10 @@ async function executePreparedProject(prepared, reporter, counters) {
1063
1526
  state,
1064
1527
  taskId: task.id
1065
1528
  });
1529
+ vitestCompatReporter?.onTaskEnd({
1530
+ state,
1531
+ taskId: task.id
1532
+ });
1066
1533
  if (state === "passed") {
1067
1534
  counters.passedTasks += 1;
1068
1535
  return;
@@ -1071,8 +1538,10 @@ async function executePreparedProject(prepared, reporter, counters) {
1071
1538
  },
1072
1539
  onTaskStart(task) {
1073
1540
  reporter.onTaskStart({ taskId: task.id });
1541
+ vitestCompatReporter?.onTaskStart({ taskId: task.id });
1074
1542
  }
1075
1543
  });
1544
+ await vitestCompatReporter?.onRunEnd({ failed: false });
1076
1545
  return {
1077
1546
  caseSummary: {
1078
1547
  failed: projectCaseCounters.failed,
@@ -1080,6 +1549,7 @@ async function executePreparedProject(prepared, reporter, counters) {
1080
1549
  skipped: projectCaseCounters.skipped,
1081
1550
  total: projectCaseCounters.seenCaseIds.size
1082
1551
  },
1552
+ caseFailures: projectCaseFailures,
1083
1553
  discoveredEvalFileCount: prepared.discoveredEvalFileCount,
1084
1554
  durationMs: Date.now() - prepared.startedAt,
1085
1555
  entryCount: prepared.entryCount,
@@ -1099,6 +1569,10 @@ async function executePreparedProject(prepared, reporter, counters) {
1099
1569
  state: "failed",
1100
1570
  taskId: failedTaskId
1101
1571
  });
1572
+ await vitestCompatReporter?.onTaskEnd({
1573
+ state: "failed",
1574
+ taskId: failedTaskId
1575
+ });
1102
1576
  }
1103
1577
  for (const task of prepared.tasks) {
1104
1578
  if (settledTaskIds.has(task.id)) continue;
@@ -1108,7 +1582,12 @@ async function executePreparedProject(prepared, reporter, counters) {
1108
1582
  state: "skipped",
1109
1583
  taskId: task.id
1110
1584
  });
1585
+ await vitestCompatReporter?.onTaskEnd({
1586
+ state: "skipped",
1587
+ taskId: task.id
1588
+ });
1111
1589
  }
1590
+ await vitestCompatReporter?.onRunEnd({ failed: true });
1112
1591
  return {
1113
1592
  caseSummary: {
1114
1593
  failed: projectCaseCounters.failed,
@@ -1116,6 +1595,7 @@ async function executePreparedProject(prepared, reporter, counters) {
1116
1595
  skipped: projectCaseCounters.skipped,
1117
1596
  total: projectCaseCounters.seenCaseIds.size
1118
1597
  },
1598
+ caseFailures: projectCaseFailures,
1119
1599
  discoveredEvalFileCount: prepared.discoveredEvalFileCount,
1120
1600
  durationMs: Date.now() - prepared.startedAt,
1121
1601
  entryCount: prepared.entryCount,
@@ -1128,6 +1608,14 @@ async function executePreparedProject(prepared, reporter, counters) {
1128
1608
  };
1129
1609
  }
1130
1610
  }
1611
/**
 * Persists the artifacts of one run below the report root.
 *
 * The target directory is
 * `<reportOut>/<workspaceId>/<projectId>/<experimentId>/<attemptId>/<runId>`,
 * created recursively. It receives `run-summary.json` (pretty-printed
 * `output` followed by a newline) and `events.jsonl` (one serialized event
 * per line; an empty file when there are no events).
 *
 * Returns:
 * - the resolved report directory
 */
async function writeRunReportArtifacts(output, events, identity, reportOut) {
	const projectId = deriveReportProjectId(output);
	const { workspaceId, experimentId, attemptId, runId } = identity;
	const reportDirectory = resolve(reportOut, workspaceId, projectId, experimentId, attemptId, runId);
	await mkdir(reportDirectory, { recursive: true });

	const summaryPayload = `${JSON.stringify(output, null, 2)}\n`;
	await writeFile(resolve(reportDirectory, "run-summary.json"), summaryPayload, "utf-8");

	// Every line, including the last one, is newline-terminated.
	const serializedEvents = events.map((event) => JSON.stringify(event));
	const eventsPayload = events.length > 0 ? `${serializedEvents.join("\n")}\n` : "";
	await writeFile(resolve(reportDirectory, "events.jsonl"), eventsPayload, "utf-8");

	return reportDirectory;
}
1131
1619
  /**
1132
1620
  * Runs vieval orchestration from config and returns project-level summaries.
1133
1621
  *
@@ -1145,12 +1633,14 @@ async function executePreparedProject(prepared, reporter, counters) {
1145
1633
  * - keeping business-agent eval files near their implementation packages
1146
1634
  */
1147
1635
  async function runVievalCli(options = {}) {
1636
+ const identity = createRunIdentity(options);
1148
1637
  const loadedConfig = await loadVievalCliConfig({
1149
1638
  configFilePath: options.configFilePath,
1150
1639
  cwd: options.cwd
1151
1640
  });
1152
1641
  const restoreEnvironment = applyRunEnvironment(loadedConfig.env);
1153
- const reporter = createRunReporter(options.reporter);
1642
+ const eventRecorder = createEventRecorder(identity);
1643
+ const reporter = createReporterWithEventCapture(createRunReporter(options.reporter), eventRecorder.record);
1154
1644
  try {
1155
1645
  const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
1156
1646
  const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
@@ -1176,7 +1666,7 @@ async function runVievalCli(options = {}) {
1176
1666
  projectSummaries.push(preparedProject.summary);
1177
1667
  continue;
1178
1668
  }
1179
- projectSummaries.push(await executePreparedProject(preparedProject.prepared, reporter, reporterCounters));
1669
+ projectSummaries.push(await executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, reporter, reporterCounters, eventRecorder.record));
1180
1670
  }
1181
1671
  reporter.onRunEnd({
1182
1672
  failedTasks: reporterCounters.failedTasks,
@@ -1184,10 +1674,17 @@ async function runVievalCli(options = {}) {
1184
1674
  skippedTasks: reporterCounters.skippedTasks + skippedSummaryTasks,
1185
1675
  totalTasks
1186
1676
  });
1187
- return {
1677
+ const output = {
1678
+ attemptId: identity.attemptId,
1188
1679
  configFilePath: loadedConfig.configFilePath,
1189
- projects: projectSummaries
1680
+ experimentId: identity.experimentId,
1681
+ projects: projectSummaries,
1682
+ reportDirectory: null,
1683
+ runId: identity.runId,
1684
+ workspaceId: identity.workspaceId
1190
1685
  };
1686
+ if (options.reportOut != null) output.reportDirectory = await writeRunReportArtifacts(output, eventRecorder.events, identity, options.reportOut);
1687
+ return output;
1191
1688
  } finally {
1192
1689
  reporter.dispose();
1193
1690
  restoreEnvironment();
@@ -1239,7 +1736,9 @@ function formatVievalCliRunOutput(output) {
1239
1736
  totalTasks += project.taskCount;
1240
1737
  executedTasks += project.result?.overall.runCount ?? 0;
1241
1738
  const badge = createProjectBadge(project.name, colors, colorEnabled);
1242
- if (project.errorMessage != null) {
1739
+ const isFailed = project.errorMessage != null;
1740
+ const hasFailedCases = (project.caseSummary?.failed ?? 0) > 0 || (project.caseFailures?.length ?? 0) > 0;
1741
+ if (isFailed) {
1243
1742
  failedProjects += 1;
1244
1743
  lines.push(` ${colors.red("❯")} ${badge}${formatDuration(project.durationMs, colors)}`);
1245
1744
  lines.push(` ${project.errorMessage}`);
@@ -1256,7 +1755,8 @@ function formatVievalCliRunOutput(output) {
1256
1755
  if (scheduleBreakdown != null) lines.push(` ${scheduleBreakdown}`);
1257
1756
  continue;
1258
1757
  }
1259
- passedProjects += 1;
1758
+ if (hasFailedCases) failedProjects += 1;
1759
+ else passedProjects += 1;
1260
1760
  const hybridAverage = project.result?.overall.hybridAverage;
1261
1761
  const hybridAverageLabel = hybridAverage == null ? "n/a" : String(hybridAverage);
1262
1762
  const runCount = project.result?.overall.runCount ?? 0;
@@ -1264,10 +1764,18 @@ function formatVievalCliRunOutput(output) {
1264
1764
  const caseSummaryLabel = project.caseSummary == null ? "" : `, cases ${project.caseSummary.passed} passed | ${project.caseSummary.failed} failed`;
1265
1765
  const detailsLabel = colors.dim(` ${project.discoveredEvalFileCount} files, ${project.entryCount} entries, ${runCount} runs${caseSummaryLabel}, hybrid ${hybridAverageLabel}`);
1266
1766
  const matrixSummary = formatMatrixSummary(project.matrixSummary);
1267
- lines.push(` ${colors.green("✓")} ${badge}${countLabel}${detailsLabel}${formatDuration(project.durationMs, colors)}`);
1767
+ lines.push(` ${hasFailedCases ? colors.red("❯") : colors.green("✓")} ${badge}${countLabel}${detailsLabel}${formatDuration(project.durationMs, colors)}`);
1268
1768
  if (matrixSummary != null) lines.push(` ${colors.dim(matrixSummary)}`);
1269
1769
  const scheduleBreakdown = formatScheduleBreakdown(project);
1270
1770
  if (scheduleBreakdown != null) lines.push(` ${scheduleBreakdown}`);
1771
+ if ((project.caseFailures?.length ?? 0) > 0) {
1772
+ lines.push(` ${colors.red("Failed cases:")}`);
1773
+ for (const failure of project.caseFailures.slice(0, 5)) {
1774
+ lines.push(` ${colors.red(`- ${failure.caseName} (${failure.taskId})`)}`);
1775
+ for (const line of failure.errorMessage.split("\n")) lines.push(` ${colors.red(line)}`);
1776
+ }
1777
+ if (project.caseFailures.length > 5) lines.push(` ${colors.dim(`... ${project.caseFailures.length - 5} more failed cases`)}`);
1778
+ }
1271
1779
  }
1272
1780
  lines.push("");
1273
1781
  if (failedProjects > 0 || skippedProjects > 0) {
@@ -1280,19 +1788,127 @@ function formatVievalCliRunOutput(output) {
1280
1788
  return lines.join("\n");
1281
1789
  }
1282
1790
  //#endregion
1791
+ //#region src/cli/compare.ts
1792
+ const compareHelpText = `
1793
+ Compare multiple methods on one benchmark.
1794
+
1795
+ Usage
1796
+ $ vieval compare [--config <path>] [--comparison <id>] [--output <path>] [--format <format>]
1797
+
1798
+ Options
1799
+ --config Config file path (default: nearest vieval.config.*)
1800
+ --comparison Comparison entry id from config.comparisons
1801
+ --output Optional output artifact path
1802
+ --format Console output format: table | json (default: table)
1803
+ `;
1804
+ function normalizeCliArgv$4(argv) {
1805
+ const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
1806
+ if (normalizedArgv[0] === "compare") return normalizedArgv.slice(1);
1807
+ return normalizedArgv;
1808
+ }
1809
+ function parseCompareCliArguments(argv) {
1810
+ const cli = meow(compareHelpText, {
1811
+ argv: normalizeCliArgv$4(argv),
1812
+ flags: {
1813
+ config: { type: "string" },
1814
+ comparison: { type: "string" },
1815
+ format: {
1816
+ default: "table",
1817
+ type: "string"
1818
+ },
1819
+ output: { type: "string" }
1820
+ },
1821
+ importMeta: import.meta
1822
+ });
1823
+ return {
1824
+ comparisonId: cli.flags.comparison,
1825
+ configFilePath: cli.flags.config,
1826
+ format: cli.flags.format === "json" ? "json" : "table",
1827
+ output: cli.flags.output
1828
+ };
1829
+ }
1830
+ /**
1831
+ * Runs one compare session from `vieval.config.*` comparison-mode config.
1832
+ */
1833
+ async function runCompareCli(argv) {
1834
+ const parsed = parseCompareCliArguments(argv);
1835
+ const loaded = await loadVievalComparisonConfig({
1836
+ comparisonId: parsed.comparisonId,
1837
+ configFilePath: parsed.configFilePath,
1838
+ cwd: parsed.cwd
1839
+ });
1840
+ const methodResults = [];
1841
+ for (const method of loaded.config.methods) {
1842
+ const methodWorkspace = resolve(method.workspace);
1843
+ const output = await runVievalCli({
1844
+ cacheProjectName: loaded.config.benchmark.sharedCaseNamespace,
1845
+ configFilePath: method.configFilePath ?? resolve(methodWorkspace, "vieval.config.ts"),
1846
+ cwd: methodWorkspace,
1847
+ project: [method.project],
1848
+ workspace: loaded.config.benchmark.id
1849
+ });
1850
+ const failedProject = output.projects.find((project) => project.errorMessage != null);
1851
+ if (failedProject != null) throw new Error(`Comparison method "${method.id}" failed: ${failedProject.errorMessage}`);
1852
+ methodResults.push({
1853
+ methodId: method.id,
1854
+ output
1855
+ });
1856
+ }
1857
+ const runOutput = {
1858
+ benchmarkId: loaded.config.benchmark.id,
1859
+ methods: methodResults
1860
+ };
1861
+ const artifact = buildCompareReportArtifact({
1862
+ benchmarkId: runOutput.benchmarkId,
1863
+ methods: runOutput.methods,
1864
+ reportPath: loaded.configFilePath
1865
+ });
1866
+ if (parsed.output != null) await writeCompareReportArtifact({
1867
+ artifact,
1868
+ outputPath: parsed.output
1869
+ });
1870
+ if (parsed.format === "json") process.stdout.write(`${JSON.stringify(artifact, null, 2)}\n`);
1871
+ else process.stdout.write([
1872
+ "COMPARE vieval",
1873
+ `Benchmark ${artifact.benchmarkId}`,
1874
+ ...artifact.methods.map((method, index) => {
1875
+ const hybrid = method.hybridAverage == null ? "n/a" : method.hybridAverage.toFixed(3);
1876
+ const exact = method.exactAverage == null ? "n/a" : method.exactAverage.toFixed(3);
1877
+ return `${index + 1}. ${method.methodId} hybrid=${hybrid} exact=${exact} runs=${method.runCount}`;
1878
+ })
1879
+ ].join("\n").concat("\n"));
1880
+ return runOutput;
1881
+ }
1882
+ async function runCompareCliOrExit(argv) {
1883
+ try {
1884
+ await runCompareCli(argv);
1885
+ } catch (error) {
1886
+ const errorMessage = errorMessageFrom(error) ?? "Unknown compare command failure.";
1887
+ process.stderr.write(`[vieval compare] ${errorMessage}\n`);
1888
+ process.exitCode = 1;
1889
+ }
1890
+ }
1891
+ //#endregion
1892
+ //#region package.json
1893
+ var name = "vieval";
1894
+ //#endregion
1283
1895
  //#region src/cli/eval-run.ts
1284
1896
  const evalRunHelpText = `
1285
1897
  Execute vieval projects from discovered or explicit config.
1286
1898
 
1287
1899
  Usage
1288
- $ vieval run [--config <path>] [--project <name>] [--json]
1900
+ $ vieval run [--config <path>] [--project <name>] [--json] [--report-out <path>]
1289
1901
 
1290
1902
  Options
1291
1903
  --config Config file path
1292
1904
  --project Project name to execute; may be repeated
1905
+ --workspace Workspace id used in report artifacts
1906
+ --experiment Experiment id used in report artifacts
1907
+ --attempt Attempt id used in report artifacts
1908
+ --report-out Report output root directory
1293
1909
  --json Print machine-readable JSON output
1294
1910
  `;
1295
- function normalizeCliArgv$1(argv) {
1911
/**
 * Returns a copy of `argv` without a leading `--` separator and without the
 * `run` subcommand token that may follow it.
 */
function normalizeCliArgv$3(argv) {
	const startsWithSeparator = argv[0] === "--";
	const withoutSeparator = startsWithSeparator ? argv.slice(1) : Array.from(argv);
	if (withoutSeparator[0] !== "run") return withoutSeparator;
	return withoutSeparator.slice(1);
}
@@ -1302,7 +1918,7 @@ function normalizeProjectNames(projectNames) {
1302
1918
  }
1303
1919
  function parseCliArguments(argv) {
1304
1920
  const cli = meow(evalRunHelpText, {
1305
- argv: normalizeCliArgv$1(argv),
1921
+ argv: normalizeCliArgv$3(argv),
1306
1922
  importMeta: import.meta,
1307
1923
  flags: {
1308
1924
  config: { type: "string" },
@@ -1313,13 +1929,21 @@ function parseCliArguments(argv) {
1313
1929
  project: {
1314
1930
  isMultiple: true,
1315
1931
  type: "string"
1316
- }
1932
+ },
1933
+ workspace: { type: "string" },
1934
+ experiment: { type: "string" },
1935
+ attempt: { type: "string" },
1936
+ reportOut: { type: "string" }
1317
1937
  }
1318
1938
  });
1319
1939
  return {
1940
+ attempt: cli.flags.attempt,
1320
1941
  configFilePath: cli.flags.config,
1942
+ experiment: cli.flags.experiment,
1321
1943
  json: cli.flags.json === true,
1322
- project: normalizeProjectNames(cli.flags.project)
1944
+ project: normalizeProjectNames(cli.flags.project),
1945
+ reportOut: cli.flags.reportOut,
1946
+ workspace: cli.flags.workspace
1323
1947
  };
1324
1948
  }
1325
1949
  function isDirectExecution$1() {
@@ -1345,14 +1969,20 @@ async function main$1() {
1345
1969
  const parsed = parseCliArguments(process.argv.slice(2));
1346
1970
  try {
1347
1971
  const output = await runVievalCli({
1972
+ attempt: parsed.attempt,
1348
1973
  configFilePath: parsed.configFilePath,
1349
- project: parsed.project
1974
+ experiment: parsed.experiment,
1975
+ project: parsed.project,
1976
+ reportOut: parsed.reportOut,
1977
+ workspace: parsed.workspace
1350
1978
  });
1351
1979
  if (parsed.json) {
1352
1980
  process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
1981
+ if (hasRunFailures(output)) process.exitCode = 1;
1353
1982
  return;
1354
1983
  }
1355
1984
  process.stdout.write(`${formatVievalCliRunOutput(output)}\n`);
1985
+ if (hasRunFailures(output)) process.exitCode = 1;
1356
1986
  } catch (error) {
1357
1987
  const errorMessage = errorMessageFrom(error) ?? "Unknown CLI failure.";
1358
1988
  process.stderr.write(`[${name}] ${errorMessage}\n`);
@@ -1361,6 +1991,485 @@ async function main$1() {
1361
1991
  }
1362
1992
  if (isDirectExecution$1()) await main$1();
1363
1993
  //#endregion
1994
+ //#region src/cli/report-artifacts.ts
1995
/**
 * Resolves one or more `run-summary.json` paths from a report location.
 *
 * Use when:
 * - callers may pass a run directory, summary file path, or a report root
 *
 * Resolution order:
 * - an existing path ending in `.json` is returned as the only entry
 * - otherwise `<reportPath>/run-summary.json` is returned when it exists
 * - otherwise every `run-summary.json` found below `reportPath` is collected
 *
 * Returns:
 * - sorted absolute summary file paths
 */
async function resolveRunSummaryPaths(reportPath) {
	const absoluteReportPath = resolve(reportPath);

	const pointsAtJsonFile = existsSync(absoluteReportPath) && absoluteReportPath.endsWith(".json");
	if (pointsAtJsonFile) return [absoluteReportPath];

	const directSummaryPath = resolve(absoluteReportPath, "run-summary.json");
	if (existsSync(directSummaryPath)) return [directSummaryPath];

	const discoveredPaths = await glob("**/run-summary.json", {
		absolute: true,
		cwd: absoluteReportPath
	});
	discoveredPaths.sort((left, right) => left.localeCompare(right));
	return discoveredPaths;
}
2014
/**
 * Reads one run report artifact set from `run-summary.json` and sibling `events.jsonl`.
 *
 * Use when:
 * - report analysis needs both run aggregate output and event count metadata
 *
 * Blank lines in `events.jsonl` are ignored and only `caseId`, `data`,
 * `event` and `taskId` are kept from each event. A missing events file
 * yields an empty event list.
 *
 * Returns:
 * - `{ events, eventsCount, reportDirectory, summary, summaryFilePath }`
 */
function readReportRunArtifact(summaryFilePath) {
	const reportDirectory = resolve(summaryFilePath, "..");
	const summary = JSON.parse(readFileSync(summaryFilePath, "utf-8"));
	const eventsFilePath = resolve(reportDirectory, "events.jsonl");

	const events = [];
	if (existsSync(eventsFilePath)) {
		const rawLines = readFileSync(eventsFilePath, "utf-8").split("\n");
		for (const rawLine of rawLines) {
			if (rawLine.trim().length === 0) continue;
			const { caseId, data, event, taskId } = JSON.parse(rawLine);
			events.push({
				caseId,
				data,
				event,
				taskId
			});
		}
	}

	return {
		events,
		eventsCount: events.length,
		reportDirectory,
		summary,
		summaryFilePath
	};
}
2041
/**
 * Reads all run artifacts found under `reportPath`.
 *
 * Use when:
 * - callers need multi-run analysis from a directory root
 *
 * Returns:
 * - one artifact per resolved summary path, in the order the paths were resolved
 */
async function readReportArtifacts(reportPath) {
	const summaryFilePaths = await resolveRunSummaryPaths(reportPath);
	const artifacts = [];
	for (const summaryFilePath of summaryFilePaths) {
		artifacts.push(readReportRunArtifact(summaryFilePath));
	}
	return artifacts;
}
2050
/**
 * Creates a compact summary row for one run artifact.
 *
 * Use when:
 * - table/csv/jsonl exports should stay stable and cheap to parse
 *
 * A project counts as failed when it carries an `errorMessage` and as
 * executed when its `executed` flag is truthy. Identity fields missing from
 * the summary are reported as `null`.
 */
function summarizeReportRunArtifact(artifact) {
	const { summary } = artifact;
	const { projects } = summary;

	let failedProjects = 0;
	let executedProjects = 0;
	let totalTasks = 0;
	const projectNames = [];
	for (const project of projects) {
		if (project.errorMessage != null) failedProjects += 1;
		if (project.executed) executedProjects += 1;
		totalTasks += project.taskCount;
		projectNames.push(project.name);
	}

	return {
		attemptId: summary.attemptId ?? null,
		eventsCount: artifact.eventsCount,
		executedProjects,
		experimentId: summary.experimentId ?? null,
		failedProjects,
		projectNames,
		reportDirectory: artifact.reportDirectory,
		runId: summary.runId ?? null,
		totalProjects: projects.length,
		totalTasks,
		workspaceId: summary.workspaceId ?? null
	};
}
2076
+ //#endregion
2077
+ //#region src/cli/report-analyze.ts
2078
+ const reportAnalyzeHelpText = `
2079
+ Analyze generated vieval report artifacts.
2080
+
2081
+ Usage
2082
+ $ vieval report analyze <reportPath> [options]
2083
+
2084
+ Options
2085
+ --format Output format: table | json | jsonl | csv (default: table)
2086
+ --workspace Workspace id filter
2087
+ --project Project name filter (exact)
2088
+ --experiment Experiment id filter
2089
+ --attempt Attempt id filter
2090
+ --run Run id filter
2091
+ --task-state Keep runs containing at least one task in this state
2092
+ --case-state Keep runs containing at least one case in this state
2093
+ --contains Keep runs containing this text in event name or payload
2094
+ --error-contains Keep runs containing this text in project errors or event payload
2095
+ --run-matrix Keep runs matching run-matrix selector "key=value[,key=value]"
2096
+ --eval-matrix Keep runs matching eval-matrix selector "key=value[,key=value]"
2097
+ `;
2098
/**
 * Returns a copy of `argv` without a leading `--` separator and without the
 * `report analyze` (or bare `analyze`) subcommand tokens that may follow it.
 */
function normalizeCliArgv$2(argv) {
	const withoutSeparator = argv[0] === "--" ? argv.slice(1) : Array.from(argv);
	const [first, second] = withoutSeparator;
	let tokensToDrop = 0;
	if (first === "report" && second === "analyze") tokensToDrop = 2;
	else if (first === "analyze") tokensToDrop = 1;
	return tokensToDrop === 0 ? withoutSeparator : withoutSeparator.slice(tokensToDrop);
}
2104
/**
 * Parses `vieval report analyze` arguments with meow.
 *
 * The first positional argument is the required report path. `--format` is
 * lower-cased and any value other than `json`, `jsonl` or `csv` resolves to
 * `table`. State and matrix flags are validated by `normalizeStateFilter`
 * and `parseMatrixSelector`.
 *
 * Throws:
 * - when the `<reportPath>` argument is missing or empty
 */
function parseReportAnalyzeCliArguments(argv) {
	const stringFlag = () => ({ type: "string" });
	const { flags, input } = meow(reportAnalyzeHelpText, {
		argv: normalizeCliArgv$2(argv),
		flags: {
			attempt: stringFlag(),
			caseState: stringFlag(),
			contains: stringFlag(),
			evalMatrix: stringFlag(),
			errorContains: stringFlag(),
			experiment: stringFlag(),
			format: {
				default: "table",
				type: "string"
			},
			project: stringFlag(),
			runMatrix: stringFlag(),
			run: stringFlag(),
			taskState: stringFlag(),
			workspace: stringFlag()
		},
		importMeta: import.meta
	});

	const [reportPath] = input;
	if (reportPath == null || reportPath.length === 0) throw new Error("Missing required <reportPath> argument.");

	const requestedFormat = flags.format.toLowerCase();
	const format = ["json", "jsonl", "csv"].includes(requestedFormat) ? requestedFormat : "table";

	// Property order matters: the validating helpers run in this order, which
	// decides which error surfaces when several flags are invalid.
	return {
		attempt: flags.attempt,
		caseState: normalizeStateFilter(flags.caseState),
		contains: flags.contains,
		evalMatrix: parseMatrixSelector(flags.evalMatrix),
		errorContains: flags.errorContains,
		experiment: flags.experiment,
		format,
		project: flags.project,
		reportPath,
		runMatrix: parseMatrixSelector(flags.runMatrix),
		run: flags.run,
		taskState: normalizeStateFilter(flags.taskState),
		workspace: flags.workspace
	};
}
2146
/**
 * Normalizes a state filter flag to `"passed"`, `"failed"` or `"skipped"`.
 *
 * Surrounding whitespace and letter case are ignored. A nullish value means
 * "no filter" and yields `undefined`.
 *
 * Throws:
 * - when the value is none of the three supported states
 */
function normalizeStateFilter(value) {
	if (value == null) return;
	const candidate = value.trim().toLowerCase();
	switch (candidate) {
		case "passed":
		case "failed":
		case "skipped":
			return candidate;
		default:
			throw new Error(`Unsupported state filter "${value}". Expected "passed", "failed", or "skipped".`);
	}
}
2152
/**
 * Parses a matrix selector of the form `key=value[,key=value]`.
 *
 * Segments are trimmed and empty segments are skipped. Only the first `=`
 * separates key from value, so values may contain `=`. When a key is
 * repeated, the last value wins. A nullish input means "no selector" and
 * yields `undefined`.
 *
 * Returns:
 * - a plain object mapping each key to its string value
 *
 * Throws:
 * - when a segment has no `=`, an empty key, or an empty value
 */
function parseMatrixSelector(value) {
	if (value == null) return;
	const pairs = [];
	for (const rawSegment of value.split(",")) {
		const segment = rawSegment.trim();
		if (segment.length === 0) continue;
		const separatorIndex = segment.indexOf("=");
		// The segment is already trimmed, so a separator strictly inside it
		// guarantees a non-empty key and value; no second check is needed.
		if (separatorIndex <= 0 || separatorIndex === segment.length - 1) throw new Error(`Invalid matrix selector segment "${segment}". Expected "key=value".`);
		pairs.push([segment.slice(0, separatorIndex).trim(), segment.slice(separatorIndex + 1).trim()]);
	}
	// Object.fromEntries defines own data properties, so a key such as
	// "__proto__" is kept as a real selector entry. Plain assignment would
	// drop it silently and leave an empty selector that matches every run.
	return Object.fromEntries(pairs);
}
2166
/**
 * Keeps the run rows matching every identity filter present on `parsed`.
 *
 * `workspace`, `experiment`, `attempt` and `run` are compared by strict
 * equality against the row ids. `project` must appear in the row's
 * `projectNames`. Filters that are nullish are ignored.
 */
function filterAnalyzeRows(rows, parsed) {
	const matchesId = (expected, actual) => expected == null || actual === expected;
	return rows.filter((row) => {
		const identityMatches = matchesId(parsed.workspace, row.workspaceId)
			&& matchesId(parsed.experiment, row.experimentId)
			&& matchesId(parsed.attempt, row.attemptId)
			&& matchesId(parsed.run, row.runId);
		if (!identityMatches) return false;
		return parsed.project == null || row.projectNames.includes(parsed.project);
	});
}
2176
/**
 * Case-insensitive substring search over a value's JSON serialization.
 *
 * The needle is trimmed first; an empty needle matches everything. String
 * values are also searched in raw form, because their JSON form escapes
 * quotes and newlines and would hide a needle that contains them. A value
 * with no JSON representation (such as `undefined`, which
 * `JSON.stringify` maps to `undefined`) never matches a non-empty needle.
 *
 * Returns:
 * - `true` when the needle occurs in the value, `false` otherwise
 */
function includesNeedle(value, needle) {
	const normalizedNeedle = needle.trim().toLowerCase();
	if (normalizedNeedle.length === 0) return true;
	const haystacks = [];
	if (typeof value === "string") haystacks.push(value);
	const serialized = JSON.stringify(value);
	// Guard: calling .toLowerCase() on a missing serialization would throw.
	if (typeof serialized === "string") haystacks.push(serialized);
	return haystacks.some((haystack) => haystack.toLowerCase().includes(normalizedNeedle));
}
2181
/**
 * Tells whether the artifact recorded at least one `TaskEnded` event whose
 * `data.state` equals `targetState`.
 */
function hasTaskState(artifact, targetState) {
	for (const { event, data } of artifact.events) {
		if (event === "TaskEnded" && data?.state === targetState) return true;
	}
	return false;
}
2187
/**
 * Tells whether the artifact recorded at least one `CaseEnded` event whose
 * `data.state` equals `targetState`.
 */
function hasCaseState(artifact, targetState) {
	for (const { event, data } of artifact.events) {
		if (event === "CaseEnded" && data?.state === targetState) return true;
	}
	return false;
}
2193
/**
 * Tells whether `matrix` satisfies every `key=value` pair of `selector`.
 *
 * Matrix values are compared by their string form, so a numeric `3` matches
 * the selector value `"3"`. A key the matrix does not own never matches;
 * without that check a missing key would stringify to `"undefined"` and
 * match that literal selector value. A nullish matrix matches only an empty
 * selector instead of throwing.
 */
function matchesMatrixSelector(matrix, selector) {
	return Object.entries(selector).every(([key, expectedValue]) => {
		if (matrix == null) return false;
		if (!Object.prototype.hasOwnProperty.call(matrix, key)) return false;
		return String(matrix[key]) === expectedValue;
	});
}
2196
/**
 * Tells whether any project result of the artifact contains a run whose
 * `matrix.run` satisfies `selector`. Projects without a `result` are skipped.
 */
function hasRunMatrixMatch(artifact, selector) {
	for (const { result } of artifact.summary.projects) {
		if (result == null) continue;
		if (result.runs.some((run) => matchesMatrixSelector(run.matrix.run, selector))) return true;
	}
	return false;
}
2199
/**
 * Tells whether any project result of the artifact contains a run whose
 * `matrix.eval` satisfies `selector`. Projects without a `result` are skipped.
 */
function hasEvalMatrixMatch(artifact, selector) {
	for (const { result } of artifact.summary.projects) {
		if (result == null) continue;
		if (result.runs.some((run) => matchesMatrixSelector(run.matrix.eval, selector))) return true;
	}
	return false;
}
2202
/**
 * Tells whether one run artifact satisfies every outcome filter present on
 * `parsed`. Filters that are nullish are ignored.
 *
 * Checks run in this order and stop at the first failure: run matrix, eval
 * matrix, task state, case state, `contains` (event name or payload), and
 * `errorContains` (project error messages first, then event payloads).
 */
function matchesOutcomeFilters(artifact, parsed) {
	const eventMentions = (needle) => artifact.events.some(({ data, event }) => includesNeedle({
		data,
		event
	}, needle));
	const errorMentions = (needle) => {
		const inProjectErrors = artifact.summary.projects.some((project) => project.errorMessage != null && includesNeedle(project.errorMessage, needle));
		return inProjectErrors || artifact.events.some((event) => includesNeedle(event.data, needle));
	};
	const checks = [
		() => parsed.runMatrix == null || hasRunMatrixMatch(artifact, parsed.runMatrix),
		() => parsed.evalMatrix == null || hasEvalMatrixMatch(artifact, parsed.evalMatrix),
		() => parsed.taskState == null || hasTaskState(artifact, parsed.taskState),
		() => parsed.caseState == null || hasCaseState(artifact, parsed.caseState),
		() => parsed.contains == null || eventMentions(parsed.contains),
		() => parsed.errorContains == null || errorMentions(parsed.errorContains)
	];
	// `every` evaluates lazily, so later checks are skipped after a failure.
	return checks.every((check) => check());
}
2218
/**
 * Loads every run artifact below `parsed.reportPath` and applies the
 * identity and outcome filters from `parsed`.
 *
 * Returns:
 * - `runs`: summary rows of the runs passing all filters, in artifact order
 * - `filteredRunCount`: number of those rows
 * - `totalRunCount`: number of runs found before filtering
 * - `experimentSummaries`: rollups built from the filtered rows
 */
async function readReportAnalyzeOutput(parsed) {
	const artifacts = await readReportArtifacts(parsed.reportPath);
	const rows = artifacts.map((artifact) => summarizeReportRunArtifact(artifact));

	// Rows are matched back to their artifact through the report directory.
	const rowByDirectory = /* @__PURE__ */ new Map();
	for (const row of filterAnalyzeRows(rows, parsed)) rowByDirectory.set(row.reportDirectory, row);

	const filteredRows = [];
	for (const artifact of artifacts) {
		if (!rowByDirectory.has(artifact.reportDirectory)) continue;
		if (!matchesOutcomeFilters(artifact, parsed)) continue;
		const row = rowByDirectory.get(artifact.reportDirectory);
		if (row != null) filteredRows.push(row);
	}

	return {
		experimentSummaries: buildExperimentSummaries(filteredRows),
		filteredRunCount: filteredRows.length,
		runs: filteredRows,
		totalRunCount: rows.length
	};
}
2231
/**
 * Rounds a metric to six decimal places and returns it as a number.
 */
function roundMetric(value) {
	const fixedText = value.toFixed(6);
	return Number(fixedText);
}
2234
/**
 * Returns the arithmetic mean of `values`, or `0` for an empty list.
 */
function computeAverage(values) {
	if (values.length === 0) return 0;
	let total = 0;
	for (const value of values) total += value;
	return total / values.length;
}
2238
/**
 * Returns the population standard deviation of `values` (the square root of
 * the mean squared deviation from the mean), or `0` for an empty list.
 */
function computeStandardDeviation(values) {
	if (values.length === 0) return 0;
	const mean = computeAverage(values);
	const squaredDeviations = [];
	for (const value of values) squaredDeviations.push((value - mean) ** 2);
	return Math.sqrt(computeAverage(squaredDeviations));
}
2244
/**
 * Builds the `workspace::experiment` grouping key for one run row, using
 * `unknown-workspace` / `unknown-experiment` when an id is nullish.
 */
function createExperimentGroupKey(row) {
	const workspacePart = row.workspaceId ?? "unknown-workspace";
	const experimentPart = row.experimentId ?? "unknown-experiment";
	return [workspacePart, experimentPart].join("::");
}
2247
/**
 * Builds experiment-level rollups from filtered run rows.
 *
 * Use when:
 * - CLI consumers need stability and reliability summaries above per-run data
 *
 * Rows are grouped by workspace and experiment id; nullish ids fall back to
 * `unknown-workspace` / `unknown-experiment`. Within a group, rows are
 * further grouped by attempt id (`unknown-attempt` when nullish). A run
 * counts as successful when its `failedProjects` is `0`.
 *
 * Returns:
 * - one summary row per `workspaceId + experimentId` group, sorted by
 *   workspace id and then experiment id
 */
function buildExperimentSummaries(rows) {
	const sumBy = (items, pick) => items.reduce((sum, item) => sum + pick(item), 0);
	const countSuccessful = (items) => items.filter((row) => row.failedProjects === 0).length;

	// The ids are stored on the group record instead of being re-parsed from a
	// "workspace::experiment" string: splitting on "::" mis-assigns ids that
	// themselves contain "::" and lets distinct id pairs share one key.
	const groups = /* @__PURE__ */ new Map();
	for (const row of rows) {
		const workspaceId = String(row.workspaceId ?? "unknown-workspace");
		const experimentId = String(row.experimentId ?? "unknown-experiment");
		const groupKey = JSON.stringify([workspaceId, experimentId]);
		const group = groups.get(groupKey);
		if (group == null) {
			groups.set(groupKey, {
				experimentId,
				rows: [row],
				workspaceId
			});
			continue;
		}
		group.rows.push(row);
	}

	return [...groups.values()].map(({ experimentId, rows: groupRows, workspaceId }) => {
		const successRate = groupRows.length === 0 ? 0 : countSuccessful(groupRows) / groupRows.length;

		const attemptToRuns = /* @__PURE__ */ new Map();
		for (const row of groupRows) {
			const attemptId = row.attemptId ?? "unknown-attempt";
			const attemptRows = attemptToRuns.get(attemptId);
			if (attemptRows == null) {
				attemptToRuns.set(attemptId, [row]);
				continue;
			}
			attemptRows.push(row);
		}

		const attemptSummaries = [...attemptToRuns.entries()].map(([attemptId, attemptRows]) => {
			const runCount = attemptRows.length;
			return {
				attemptId,
				failedProjects: sumBy(attemptRows, (row) => row.failedProjects),
				runCount,
				runIds: attemptRows.map((row) => row.runId).filter((runId) => runId != null).sort((left, right) => left.localeCompare(right)),
				successRate: roundMetric(runCount === 0 ? 0 : countSuccessful(attemptRows) / runCount),
				totalEvents: sumBy(attemptRows, (row) => row.eventsCount),
				totalTasks: sumBy(attemptRows, (row) => row.totalTasks)
			};
		}).sort((left, right) => left.attemptId.localeCompare(right.attemptId));

		const attemptSuccessRates = attemptSummaries.map((summary) => summary.successRate);
		const hasAttempts = attemptSuccessRates.length > 0;
		return {
			attemptCount: attemptToRuns.size,
			attemptSummaries,
			attemptSuccessRateStats: {
				avg: roundMetric(computeAverage(attemptSuccessRates)),
				max: roundMetric(hasAttempts ? Math.max(...attemptSuccessRates) : 0),
				min: roundMetric(hasAttempts ? Math.min(...attemptSuccessRates) : 0),
				stdev: roundMetric(computeStandardDeviation(attemptSuccessRates))
			},
			experimentId,
			failedProjects: sumBy(groupRows, (row) => row.failedProjects),
			runCount: groupRows.length,
			successRate: roundMetric(successRate),
			totalEvents: sumBy(groupRows, (row) => row.eventsCount),
			totalTasks: sumBy(groupRows, (row) => row.totalTasks),
			workspaceId
		};
	}).sort((left, right) => {
		const workspaceCompare = left.workspaceId.localeCompare(right.workspaceId);
		if (workspaceCompare !== 0) return workspaceCompare;
		return left.experimentId.localeCompare(right.experimentId);
	});
}
2328
/**
 * Renders the `report analyze` result as a pipe-separated text table.
 *
 * The first line is a summary (filtered/total run counts and the number of
 * experiment groups), the second is the column header, followed by one line
 * per run. Missing identifiers are printed as "n/a".
 *
 * @param {object} output - Analyze output with `runs`, `filteredRunCount`,
 *   `totalRunCount`, and `experimentSummaries`.
 * @returns {string} Newline-joined table without a trailing newline.
 */
function formatTableOutput$1(output) {
  const orPlaceholder = (value) => value ?? "n/a";
  const columnTitles = "Run ID | Workspace | Experiment | Attempt | Projects(executed/total) | FailedProjects | Tasks | Events";
  const tableRows = [];
  for (const run of output.runs) {
    const cells = [
      orPlaceholder(run.runId),
      orPlaceholder(run.workspaceId),
      orPlaceholder(run.experimentId),
      orPlaceholder(run.attemptId),
      `${run.executedProjects}/${run.totalProjects}`,
      run.failedProjects,
      run.totalTasks,
      run.eventsCount
    ];
    // Stringify explicitly: Array#join alone would print null/undefined as "".
    tableRows.push(cells.map((cell) => String(cell)).join(" | "));
  }
  const summaryLine = `ANALYZE vieval report: ${output.filteredRunCount}/${output.totalRunCount} runs (${output.experimentSummaries.length} experiment groups)`;
  return [summaryLine, columnTitles, ...tableRows].join("\n");
}
2339
/**
 * Renders the `report analyze` runs as CSV (header line plus one line per run).
 *
 * `reportDirectory` and `projectNames` are always emitted as quoted fields
 * (project names are joined with "|"). The identifier columns (`runId`,
 * `workspaceId`, `experimentId`, `attemptId`) are emitted bare when they are
 * plain text, and quoted with doubled inner quotes when they contain a comma,
 * a double quote, or a line break, so such a value cannot shift columns or
 * split a record. Missing identifiers become empty fields.
 *
 * @param {object} output - Analyze output; only `output.runs` is read.
 * @returns {string} CSV text joined with "\n", without a trailing newline.
 */
function formatCsvOutput(output) {
  // Always wrap in quotes, doubling embedded quotes.
  const quoteField = (text) => `"${text.replaceAll("\"", "\"\"")}"`;
  // Quote only when the value would otherwise break the CSV structure.
  const escapeIfNeeded = (value) => {
    const text = String(value ?? "");
    return /[",\r\n]/.test(text) ? quoteField(text) : text;
  };
  const headerLine = [
    "runId",
    "workspaceId",
    "experimentId",
    "attemptId",
    "totalProjects",
    "executedProjects",
    "failedProjects",
    "totalTasks",
    "eventsCount",
    "reportDirectory",
    "projectNames"
  ].join(",");
  const dataLines = output.runs.map((row) => [
    escapeIfNeeded(row.runId),
    escapeIfNeeded(row.workspaceId),
    escapeIfNeeded(row.experimentId),
    escapeIfNeeded(row.attemptId),
    row.totalProjects.toString(),
    row.executedProjects.toString(),
    row.failedProjects.toString(),
    row.totalTasks.toString(),
    row.eventsCount.toString(),
    quoteField(row.reportDirectory),
    quoteField(row.projectNames.join("|"))
  ].join(","));
  return [headerLine, ...dataLines].join("\n");
}
2370
/**
 * Entry point for `vieval report analyze`.
 *
 * Parses the arguments, reads the analyze output, and writes it to stdout in
 * the requested format ("json", "jsonl", "csv"; anything else renders the text
 * table). Any failure is reported on stderr with a
 * "[vieval report analyze]" prefix and sets `process.exitCode` to 1 instead of
 * throwing.
 *
 * @param {string[]} argv - Arguments for the analyze subcommand.
 * @returns {Promise<void>}
 */
async function runReportAnalyzeCli(argv) {
  try {
    const options = parseReportAnalyzeCliArguments(argv);
    const report = await readReportAnalyzeOutput(options);
    let rendered;
    switch (options.format) {
      case "json":
        rendered = `${JSON.stringify(report, null, 2)}\n`;
        break;
      case "jsonl": {
        const serializedRuns = report.runs.map((run) => JSON.stringify(run)).join("\n");
        // No trailing newline when there is nothing to print.
        rendered = serializedRuns.length > 0 ? `${serializedRuns}\n` : serializedRuns;
        break;
      }
      case "csv":
        rendered = `${formatCsvOutput(report)}\n`;
        break;
      default:
        rendered = `${formatTableOutput$1(report)}\n`;
        break;
    }
    process.stdout.write(rendered);
  } catch (error) {
    const reason = errorMessageFrom(error) ?? "Unknown report analyze failure.";
    process.stderr.write(`[vieval report analyze] ${reason}\n`);
    process.exitCode = 1;
  }
}
2394
+ //#endregion
2395
+ //#region src/cli/report-index.ts
2396
// Help text for the `vieval report index` subcommand; passed to meow in parseReportIndexCliArguments.
+ const reportIndexHelpText = `
2397
+ Build report indexes from generated vieval artifacts.
2398
+
2399
+ Usage
2400
+ $ vieval report index <reportPath> [--output <path>] [--format <format>]
2401
+
2402
+ Options
2403
+ --output Output file path (default: <reportPath>/index/runs.jsonl)
2404
+ --format Console output format: table | json | jsonl (default: table)
2405
+ `;
2406
/**
 * Strips routing tokens from the argument list of `report index`.
 *
 * A leading "--" is dropped first; then a leading "report index" pair or a
 * leading "index" token is removed. The caller's array is never mutated and a
 * new array is always returned.
 *
 * @param {string[]} argv - Raw arguments.
 * @returns {string[]} Arguments that follow the subcommand tokens.
 */
function normalizeCliArgv$1(argv) {
  const args = argv[0] === "--" ? argv.slice(1) : [...argv];
  const [first, second] = args;
  if (first === "index") {
    return args.slice(1);
  }
  if (first === "report" && second === "index") {
    return args.slice(2);
  }
  return args;
}
2412
/**
 * Parses the arguments of `vieval report index` with meow.
 *
 * The first positional argument is the report path and is required. The
 * `--format` flag is lower-cased; "json" and "jsonl" are kept, every other
 * value resolves to "table". `--output` is passed through unchanged.
 *
 * @param {string[]} argv - Raw arguments (normalized via `normalizeCliArgv$1`).
 * @returns {{format: "json"|"jsonl"|"table", output: (string|undefined), reportPath: string}}
 * @throws {Error} When the `<reportPath>` argument is missing or empty.
 */
function parseReportIndexCliArguments(argv) {
  const flagDefinitions = {
    format: {
      default: "table",
      type: "string"
    },
    output: { type: "string" }
  };
  const { input, flags } = meow(reportIndexHelpText, {
    argv: normalizeCliArgv$1(argv),
    flags: flagDefinitions,
    importMeta: import.meta
  });
  const reportPath = input[0];
  if (reportPath == null || reportPath.length === 0) {
    throw new Error("Missing required <reportPath> argument.");
  }
  const requestedFormat = flags.format.toLowerCase();
  const isJsonFamily = requestedFormat === "json" || requestedFormat === "jsonl";
  return {
    format: isJsonFamily ? requestedFormat : "table",
    output: flags.output,
    reportPath
  };
}
2433
/**
 * Builds the run index for a report path and writes it as JSON Lines.
 *
 * Every artifact returned by `readReportArtifacts` is summarized with
 * `summarizeReportRunArtifact`; each summary becomes one JSON line. The file
 * goes to `parsed.output` when given, otherwise to
 * `<reportPath>/index/runs.jsonl`, and its parent directory is created if
 * needed.
 *
 * @param {{reportPath: string, output?: string}} parsed - Parsed CLI arguments.
 * @returns {Promise<{indexFilePath: string, indexedRunCount: number, rows: object[]}>}
 */
async function writeIndexFile(parsed) {
  const artifacts = await readReportArtifacts(parsed.reportPath);
  const rows = artifacts.map((artifact) => summarizeReportRunArtifact(artifact));

  let targetPath = parsed.output;
  if (targetPath == null) {
    targetPath = resolve(parsed.reportPath, "index", "runs.jsonl");
  }
  const indexFilePath = resolve(targetPath);
  await mkdir(dirname(indexFilePath), { recursive: true });

  const serializedRows = rows.map((row) => JSON.stringify(row)).join("\n");
  // Terminate the file with a newline only when it has content.
  const lineTerminator = serializedRows.length > 0 ? "\n" : "";
  await writeFile(indexFilePath, `${serializedRows}${lineTerminator}`, "utf-8");

  return {
    indexFilePath,
    indexedRunCount: rows.length,
    rows
  };
}
2445
/**
 * Renders the `report index` result as a three-line text summary: a title,
 * the index file path, and the number of indexed runs.
 *
 * @param {{indexFilePath: string, indexedRunCount: number}} output
 * @returns {string} Newline-joined summary without a trailing newline.
 */
function formatTableOutput(output) {
  const pathLine = `Path ${output.indexFilePath}`;
  const countLine = `Run count ${output.indexedRunCount}`;
  return `INDEX vieval report\n${pathLine}\n${countLine}`;
}
2452
/**
 * Entry point for `vieval report index`.
 *
 * Parses the arguments, writes the index file, and prints the result to stdout
 * as pretty JSON ("json"), one JSON object per row ("jsonl"), or the text
 * summary (any other format). Failures are reported on stderr with a
 * "[vieval report index]" prefix and set `process.exitCode` to 1 instead of
 * throwing.
 *
 * @param {string[]} argv - Arguments for the index subcommand.
 * @returns {Promise<void>}
 */
async function runReportIndexCli(argv) {
  try {
    const options = parseReportIndexCliArguments(argv);
    const result = await writeIndexFile(options);
    let rendered;
    switch (options.format) {
      case "json":
        rendered = `${JSON.stringify(result, null, 2)}\n`;
        break;
      case "jsonl": {
        const serializedRows = result.rows.map((row) => JSON.stringify(row)).join("\n");
        // No trailing newline when there is nothing to print.
        rendered = serializedRows.length > 0 ? `${serializedRows}\n` : serializedRows;
        break;
      }
      default:
        rendered = `${formatTableOutput(result)}\n`;
        break;
    }
    process.stdout.write(rendered);
  } catch (error) {
    const reason = errorMessageFrom(error) ?? "Unknown report index failure.";
    process.stderr.write(`[vieval report index] ${reason}\n`);
    process.exitCode = 1;
  }
}
2472
+ //#endregion
1364
2473
  //#region src/cli/index.ts
1365
2474
  const topLevelHelpText = `
1366
2475
  Execute and report evaluation projects.
@@ -1370,10 +2479,15 @@ const topLevelHelpText = `
1370
2479
 
1371
2480
  Commands
1372
2481
  run Discover and execute eval projects
2482
+ compare Compare multiple workspaces/methods on one benchmark
2483
+ report Analyze and index generated report artifacts
1373
2484
 
1374
2485
  Examples
1375
2486
  $ vieval run
1376
- $ vieval run --config vieval.config.ts --project chess --json
2487
+ $ vieval run --config vieval.config.ts --project chess --json --report-out .vieval/reports
2488
+ $ vieval compare --config vieval.config.ts --comparison agent-memory
2489
+ $ vieval report analyze .vieval/reports/my-run
2490
+ $ vieval report index .vieval/reports --output .vieval/reports/index/runs.jsonl
1377
2491
  `;
1378
2492
  function normalizeCliArgv(argv) {
1379
2493
  return argv[0] === "--" ? argv.slice(1) : [...argv];
@@ -1391,7 +2505,7 @@ function parseTopLevelCliArguments(argv) {
1391
2505
  command: "help",
1392
2506
  commandArgv: []
1393
2507
  };
1394
- if (command !== "run") throw new Error(`Unsupported vieval command "${command ?? "(none)"}". Expected "run".`);
2508
+ if (command !== "run" && command !== "report" && command !== "compare") throw new Error(`Unsupported vieval command "${command ?? "(none)"}". Expected "run", "compare", or "report".`);
1395
2509
  return {
1396
2510
  command,
1397
2511
  commandArgv: normalizedArgv.slice(1)
@@ -1403,20 +2517,49 @@ async function runTopLevelCli(argv) {
1403
2517
  process.stdout.write(`${topLevelHelpText.trim()}\n`);
1404
2518
  return;
1405
2519
  }
2520
+ if (parsed.command === "report") {
2521
+ const reportSubcommand = parsed.commandArgv[0];
2522
+ if (reportSubcommand === "analyze") {
2523
+ await runReportAnalyzeCli(parsed.commandArgv);
2524
+ return;
2525
+ }
2526
+ if (reportSubcommand === "index") {
2527
+ await runReportIndexCli(parsed.commandArgv);
2528
+ return;
2529
+ }
2530
+ throw new Error(`Unsupported vieval report command "${reportSubcommand ?? "(none)"}". Expected "analyze" or "index".`);
2531
+ }
2532
+ if (parsed.command === "compare") {
2533
+ await runCompareCliOrExit(parsed.commandArgv);
2534
+ return;
2535
+ }
1406
2536
  const runArguments = parseCliArguments(parsed.commandArgv);
1407
2537
  const output = await runVievalCli({
2538
+ attempt: runArguments.attempt,
1408
2539
  configFilePath: runArguments.configFilePath,
1409
- project: runArguments.project
2540
+ experiment: runArguments.experiment,
2541
+ project: runArguments.project,
2542
+ reportOut: runArguments.reportOut,
2543
+ workspace: runArguments.workspace
1410
2544
  });
1411
2545
  if (runArguments.json) {
1412
2546
  process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
2547
+ if (hasRunFailures(output)) process.exitCode = 1;
1413
2548
  return;
1414
2549
  }
1415
2550
  process.stdout.write(`${formatVievalCliRunOutput(output)}\n`);
2551
+ if (hasRunFailures(output)) process.exitCode = 1;
1416
2552
  }
1417
2553
  function isDirectExecution() {
1418
2554
  if (!process.argv[1]) return false;
1419
- return path.resolve(process.argv[1]) === fileURLToPath(import.meta.url);
2555
+ const resolvedArgvPath = path.resolve(process.argv[1]);
2556
+ const currentModulePath = fileURLToPath(import.meta.url);
2557
+ try {
2558
+ if (realpathSync.native(resolvedArgvPath) === realpathSync.native(currentModulePath)) return true;
2559
+ } catch {
2560
+ if (resolvedArgvPath === currentModulePath) return true;
2561
+ }
2562
+ return resolvedArgvPath.replaceAll("\\", "/").endsWith("/.bin/vieval");
1420
2563
  }
1421
2564
  async function main() {
1422
2565
  try {