vieval 0.0.1 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -5
- package/dist/cli/index.d.mts +1 -1
- package/dist/cli/index.mjs +1232 -83
- package/dist/cli/index.mjs.map +1 -1
- package/dist/{config-D2fe1SnT.mjs → config-CHN24egi.mjs} +1 -1
- package/dist/{config-D2fe1SnT.mjs.map → config-CHN24egi.mjs.map} +1 -1
- package/dist/config.d.mts +2 -3
- package/dist/config.mjs +2 -2
- package/dist/core/assertions/index.d.mts +1 -1
- package/dist/core/inference-executors/index.d.mts +1 -45
- package/dist/core/inference-executors/index.mjs +1 -38
- package/dist/core/inference-executors/index.mjs.map +1 -1
- package/dist/core/processors/results/index.d.mts +1 -1
- package/dist/core/runner/index.d.mts +2 -2
- package/dist/core/runner/index.mjs +2 -2
- package/dist/env-C7X81PWa.mjs +41 -0
- package/dist/env-C7X81PWa.mjs.map +1 -0
- package/dist/env-DtpjACOW.d.mts +47 -0
- package/dist/expect-B2vaoRVZ.d.mts +10 -0
- package/dist/{expect-i9WZWGrA.mjs → expect-CaXiUkwY.mjs} +3 -3
- package/dist/expect-CaXiUkwY.mjs.map +1 -0
- package/dist/expect-extensions-BOzwV5EJ.mjs +197 -0
- package/dist/expect-extensions-BOzwV5EJ.mjs.map +1 -0
- package/dist/expect.d.mts +1 -1
- package/dist/expect.mjs +1 -1
- package/dist/{index-DP7jsORl.d.mts → index-BDMEAmf2.d.mts} +246 -3
- package/dist/{index-oSXhM1zx.d.mts → index-C3gPFmcR.d.mts} +2 -2
- package/dist/index.d.mts +326 -6
- package/dist/index.mjs +65 -23
- package/dist/index.mjs.map +1 -1
- package/dist/{models-D_MsBtYw.mjs → models-DIGdOUpJ.mjs} +1 -1
- package/dist/{models-D_MsBtYw.mjs.map → models-DIGdOUpJ.mjs.map} +1 -1
- package/dist/plugins/chat-models/index.d.mts +465 -6
- package/dist/plugins/chat-models/index.mjs +469 -6
- package/dist/plugins/chat-models/index.mjs.map +1 -1
- package/dist/{registry-ChOjjdEC.mjs → registry-CHJcTN2W.mjs} +75 -16
- package/dist/registry-CHJcTN2W.mjs.map +1 -0
- package/dist/{runner-4ZsOveoY.mjs → runner-Dpy-eivM.mjs} +177 -21
- package/dist/runner-Dpy-eivM.mjs.map +1 -0
- package/dist/testing/expect-extensions.d.mts +44 -38
- package/dist/testing/expect-extensions.mjs +1 -1
- package/package.json +11 -4
- package/dist/expect-0jPJ7Zio.d.mts +0 -2318
- package/dist/expect-extensions-CwPtgTz8.mjs +0 -13471
- package/dist/expect-extensions-CwPtgTz8.mjs.map +0 -1
- package/dist/expect-i9WZWGrA.mjs.map +0 -1
- package/dist/magic-string.es-CH1jwzMg.mjs +0 -1013
- package/dist/magic-string.es-CH1jwzMg.mjs.map +0 -1
- package/dist/plugin-DVaRZY2x.d.mts +0 -84
- package/dist/registry-ChOjjdEC.mjs.map +0 -1
- package/dist/runner-4ZsOveoY.mjs.map +0 -1
package/dist/cli/index.mjs
CHANGED
|
@@ -1,18 +1,196 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { n as consumeModuleRegistrations,
|
|
3
|
-
import { a as runScheduledTasks, i as RunnerExecutionError, n as createRunnerSchedule, r as createRunnerRuntimeContext, s as collectEvalEntries, t as createTaskExecutionContext } from "../runner-
|
|
2
|
+
import { c as loadRawVievalConfig, l as loadVievalCliConfig, n as consumeModuleRegistrations, o as detectCliConfigMode, r as endModuleRegistration, t as beginModuleRegistration } from "../registry-CHJcTN2W.mjs";
|
|
3
|
+
import { a as runScheduledTasks, i as RunnerExecutionError, l as createFilesystemTaskCacheRuntime, n as createRunnerSchedule, r as createRunnerRuntimeContext, s as collectEvalEntries, t as createTaskExecutionContext } from "../runner-Dpy-eivM.mjs";
|
|
4
4
|
import process from "node:process";
|
|
5
|
-
import
|
|
5
|
+
import { access, mkdir, writeFile } from "node:fs/promises";
|
|
6
|
+
import path, { basename, dirname, isAbsolute, join, relative, resolve } from "node:path";
|
|
6
7
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
7
8
|
import { errorMessageFrom } from "@moeru/std";
|
|
8
|
-
import
|
|
9
|
+
import { existsSync, readFileSync, realpathSync } from "node:fs";
|
|
9
10
|
import meow from "meow";
|
|
10
|
-
import { uniq } from "es-toolkit";
|
|
11
11
|
import { glob } from "tinyglobby";
|
|
12
|
+
import { randomUUID } from "node:crypto";
|
|
13
|
+
import c from "tinyrainbow";
|
|
14
|
+
import { uniq } from "es-toolkit";
|
|
15
|
+
import { createVitest } from "vitest/node";
|
|
12
16
|
import { stripVTControlCharacters } from "node:util";
|
|
13
17
|
import stringWidth from "fast-string-width";
|
|
14
|
-
//#region
|
|
15
|
-
|
|
18
|
+
//#region src/cli/comparison-config.ts
|
|
19
|
+
const supportedWorkspaceConfigFileNames = [
|
|
20
|
+
"vieval.config.ts",
|
|
21
|
+
"vieval.config.mts",
|
|
22
|
+
"vieval.config.cts",
|
|
23
|
+
"vieval.config.js",
|
|
24
|
+
"vieval.config.mjs",
|
|
25
|
+
"vieval.config.cjs",
|
|
26
|
+
"vieval.config.json"
|
|
27
|
+
];
|
|
28
|
+
async function isReadableFile(filePath) {
|
|
29
|
+
try {
|
|
30
|
+
await access(filePath);
|
|
31
|
+
return true;
|
|
32
|
+
} catch {
|
|
33
|
+
return false;
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
function normalizeGlobInput(patterns) {
|
|
37
|
+
if (patterns == null) return [];
|
|
38
|
+
return (typeof patterns === "string" ? [patterns] : patterns).map((pattern) => pattern.trim()).filter((pattern) => pattern.length > 0);
|
|
39
|
+
}
|
|
40
|
+
function normalizeMethodShape(method, configDirectory, index) {
|
|
41
|
+
const id = method.id.trim();
|
|
42
|
+
const workspace = method.workspace.trim();
|
|
43
|
+
const project = method.project.trim();
|
|
44
|
+
const configFilePath = method.configFilePath?.trim();
|
|
45
|
+
if (id.length === 0) throw new Error(`Comparison method #${index + 1} is missing id.`);
|
|
46
|
+
if (workspace.length === 0) throw new Error(`Comparison method "${id}" is missing workspace.`);
|
|
47
|
+
if (project.length === 0) throw new Error(`Comparison method "${id}" is missing project.`);
|
|
48
|
+
const resolvedWorkspace = isAbsolute(workspace) ? workspace : resolve(configDirectory, workspace);
|
|
49
|
+
return {
|
|
50
|
+
configFilePath: configFilePath == null || configFilePath.length === 0 ? void 0 : isAbsolute(configFilePath) ? configFilePath : resolve(configDirectory, configFilePath),
|
|
51
|
+
id,
|
|
52
|
+
project,
|
|
53
|
+
workspace: resolvedWorkspace
|
|
54
|
+
};
|
|
55
|
+
}
|
|
56
|
+
async function findWorkspaceConfigFile(workspaceDirectory) {
|
|
57
|
+
for (const fileName of supportedWorkspaceConfigFileNames) {
|
|
58
|
+
const candidate = join(workspaceDirectory, fileName);
|
|
59
|
+
if (await isReadableFile(candidate)) return candidate;
|
|
60
|
+
}
|
|
61
|
+
return null;
|
|
62
|
+
}
|
|
63
|
+
function createDiscoveredMethodId(configDirectory, workspace, projectName) {
|
|
64
|
+
const relativeWorkspace = relative(configDirectory, workspace);
|
|
65
|
+
return `${(relativeWorkspace.length > 0 ? relativeWorkspace : basename(workspace)).replaceAll("\\", "/")}:${projectName}`;
|
|
66
|
+
}
|
|
67
|
+
async function discoverMethodsFromWorkspaceGlobs(args) {
|
|
68
|
+
const includes = normalizeGlobInput(args.comparison.includesWorkspaces);
|
|
69
|
+
if (includes.length === 0) return [];
|
|
70
|
+
const discoveredWorkspaceDirectories = await glob(includes, {
|
|
71
|
+
absolute: true,
|
|
72
|
+
cwd: args.configDirectory,
|
|
73
|
+
ignore: normalizeGlobInput(args.comparison.excludesWorkspaces),
|
|
74
|
+
onlyDirectories: true
|
|
75
|
+
});
|
|
76
|
+
const methods = [];
|
|
77
|
+
for (const workspaceDirectory of discoveredWorkspaceDirectories.sort((left, right) => left.localeCompare(right))) {
|
|
78
|
+
const configFilePath = await findWorkspaceConfigFile(workspaceDirectory);
|
|
79
|
+
if (configFilePath == null) continue;
|
|
80
|
+
const loadedWorkspaceConfig = await loadVievalCliConfig({
|
|
81
|
+
configFilePath,
|
|
82
|
+
cwd: workspaceDirectory
|
|
83
|
+
});
|
|
84
|
+
for (const project of loadedWorkspaceConfig.projects) methods.push({
|
|
85
|
+
configFilePath,
|
|
86
|
+
id: createDiscoveredMethodId(args.configDirectory, workspaceDirectory, project.name),
|
|
87
|
+
project: project.name,
|
|
88
|
+
workspace: workspaceDirectory
|
|
89
|
+
});
|
|
90
|
+
}
|
|
91
|
+
return methods;
|
|
92
|
+
}
|
|
93
|
+
function validateMethodIdsAreUnique(methods) {
|
|
94
|
+
const methodIds = methods.map((method) => method.id);
|
|
95
|
+
const duplicatedMethodId = methodIds.find((methodId, index) => methodIds.indexOf(methodId) !== index);
|
|
96
|
+
if (duplicatedMethodId != null) throw new Error(`Duplicate comparison method id "${duplicatedMethodId}".`);
|
|
97
|
+
}
|
|
98
|
+
function assertComparisonMode(config) {
|
|
99
|
+
const mode = detectCliConfigMode(config);
|
|
100
|
+
if (mode !== "comparisons") throw new Error(`Expected comparison-mode config, but received ${mode}-mode config.`);
|
|
101
|
+
}
|
|
102
|
+
function selectComparisonConfig(comparisons, comparisonId) {
|
|
103
|
+
if (comparisons.length === 0) throw new Error("Comparison config requires at least one comparisons entry.");
|
|
104
|
+
if (comparisonId == null || comparisonId.trim().length === 0) {
|
|
105
|
+
if (comparisons.length > 1) throw new Error(`Multiple comparisons found. Provide --comparison. Available ids: ${comparisons.map((item) => item.id).join(", ")}`);
|
|
106
|
+
return comparisons[0];
|
|
107
|
+
}
|
|
108
|
+
const selected = comparisons.find((item) => item.id === comparisonId);
|
|
109
|
+
if (selected == null) throw new Error(`Unknown comparison id "${comparisonId}".`);
|
|
110
|
+
return selected;
|
|
111
|
+
}
|
|
112
|
+
function normalizeBenchmark(comparison) {
|
|
113
|
+
const benchmarkId = comparison.benchmark.id.trim();
|
|
114
|
+
const sharedCaseNamespace = comparison.benchmark.sharedCaseNamespace.trim();
|
|
115
|
+
if (benchmarkId.length === 0) throw new Error("Comparison config requires benchmark.id.");
|
|
116
|
+
if (sharedCaseNamespace.length === 0) throw new Error("Comparison config requires benchmark.sharedCaseNamespace.");
|
|
117
|
+
return {
|
|
118
|
+
id: benchmarkId,
|
|
119
|
+
sharedCaseNamespace
|
|
120
|
+
};
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* Loads and validates comparison-mode data from `vieval.config.*`.
|
|
124
|
+
*/
|
|
125
|
+
async function loadVievalComparisonConfig(options = {}) {
|
|
126
|
+
const cwd = options.cwd ?? process.cwd();
|
|
127
|
+
try {
|
|
128
|
+
const loaded = await loadRawVievalConfig({
|
|
129
|
+
configFilePath: options.configFilePath,
|
|
130
|
+
cwd
|
|
131
|
+
});
|
|
132
|
+
if (loaded.configFilePath == null || loaded.config == null) throw new Error("Failed to find vieval config. Expected vieval.config.*");
|
|
133
|
+
assertComparisonMode(loaded.config);
|
|
134
|
+
const selectedComparison = selectComparisonConfig(loaded.config.comparisons, options.comparisonId);
|
|
135
|
+
const configDirectory = dirname(loaded.configFilePath);
|
|
136
|
+
const explicitMethods = (selectedComparison.methods ?? []).map((method, index) => normalizeMethodShape(method, configDirectory, index));
|
|
137
|
+
const discoveredMethods = await discoverMethodsFromWorkspaceGlobs({
|
|
138
|
+
comparison: selectedComparison,
|
|
139
|
+
configDirectory
|
|
140
|
+
});
|
|
141
|
+
const methods = [...explicitMethods, ...discoveredMethods];
|
|
142
|
+
if (methods.length === 0) throw new Error("Comparison config resolved zero methods. Configure methods or includesWorkspaces.");
|
|
143
|
+
validateMethodIdsAreUnique(methods);
|
|
144
|
+
return {
|
|
145
|
+
config: {
|
|
146
|
+
benchmark: normalizeBenchmark(selectedComparison),
|
|
147
|
+
methods
|
|
148
|
+
},
|
|
149
|
+
configFilePath: loaded.configFilePath
|
|
150
|
+
};
|
|
151
|
+
} catch (error) {
|
|
152
|
+
const errorMessage = errorMessageFrom(error) ?? "Unknown comparison config loading error.";
|
|
153
|
+
const resolvedPath = options.configFilePath ?? "vieval.config";
|
|
154
|
+
throw new Error(`Failed to load comparison config "${resolvedPath}": ${errorMessage}`);
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
//#endregion
|
|
158
|
+
//#region src/cli/report-compare.ts
|
|
159
|
+
/**
|
|
160
|
+
* Builds a compact compare report sorted by hybrid/exact score.
|
|
161
|
+
*/
|
|
162
|
+
function buildCompareReportArtifact(args) {
|
|
163
|
+
const rows = args.methods.map((method) => {
|
|
164
|
+
const overall = method.output.projects[0]?.result?.overall;
|
|
165
|
+
return {
|
|
166
|
+
exactAverage: overall?.exactAverage ?? null,
|
|
167
|
+
hybridAverage: overall?.hybridAverage ?? null,
|
|
168
|
+
methodId: method.methodId,
|
|
169
|
+
runCount: overall?.runCount ?? 0
|
|
170
|
+
};
|
|
171
|
+
});
|
|
172
|
+
rows.sort((left, right) => {
|
|
173
|
+
const leftHybrid = left.hybridAverage ?? Number.NEGATIVE_INFINITY;
|
|
174
|
+
const rightHybrid = right.hybridAverage ?? Number.NEGATIVE_INFINITY;
|
|
175
|
+
if (leftHybrid !== rightHybrid) return rightHybrid - leftHybrid;
|
|
176
|
+
const leftExact = left.exactAverage ?? Number.NEGATIVE_INFINITY;
|
|
177
|
+
return (right.exactAverage ?? Number.NEGATIVE_INFINITY) - leftExact;
|
|
178
|
+
});
|
|
179
|
+
return {
|
|
180
|
+
benchmarkId: args.benchmarkId,
|
|
181
|
+
methods: rows,
|
|
182
|
+
reportPath: args.reportPath
|
|
183
|
+
};
|
|
184
|
+
}
|
|
185
|
+
/**
|
|
186
|
+
* Writes compare report artifact as JSON.
|
|
187
|
+
*/
|
|
188
|
+
async function writeCompareReportArtifact(args) {
|
|
189
|
+
const outputPath = resolve(args.outputPath);
|
|
190
|
+
await mkdir(dirname(outputPath), { recursive: true });
|
|
191
|
+
await writeFile(outputPath, `${JSON.stringify(args.artifact, null, 2)}\n`, "utf-8");
|
|
192
|
+
return outputPath;
|
|
193
|
+
}
|
|
16
194
|
//#endregion
|
|
17
195
|
//#region src/cli/discovery.ts
|
|
18
196
|
/**
|
|
@@ -33,6 +211,58 @@ async function discoverEvalFiles(options) {
|
|
|
33
211
|
})).sort((left, right) => left.localeCompare(right));
|
|
34
212
|
}
|
|
35
213
|
//#endregion
|
|
214
|
+
//#region src/cli/module-runtime.ts
|
|
215
|
+
/**
|
|
216
|
+
* Loads eval modules and returns a normalized eval-module map.
|
|
217
|
+
*
|
|
218
|
+
* Use when:
|
|
219
|
+
* - CLI collection needs Vite/Vitest-powered module resolution and transforms
|
|
220
|
+
* - eval files should be imported with the same runtime semantics as Vitest
|
|
221
|
+
*
|
|
222
|
+
* Expects:
|
|
223
|
+
* - `projectRoot` points at the project that owns the eval files
|
|
224
|
+
* - each `evalFilePaths` entry is an absolute file path
|
|
225
|
+
*
|
|
226
|
+
* Returns:
|
|
227
|
+
* - eval modules keyed by stable file href + optional registration suffixes
|
|
228
|
+
*/
|
|
229
|
+
async function loadEvalModulesWithVitestRuntime(evalFilePaths, projectRoot) {
|
|
230
|
+
const loadedModules = {};
|
|
231
|
+
const runtime = await createVitest("test", {
|
|
232
|
+
config: false,
|
|
233
|
+
root: projectRoot,
|
|
234
|
+
run: false,
|
|
235
|
+
silent: true,
|
|
236
|
+
watch: false
|
|
237
|
+
});
|
|
238
|
+
try {
|
|
239
|
+
for (const evalFilePath of evalFilePaths) {
|
|
240
|
+
const moduleHref = pathToFileURL(evalFilePath).href;
|
|
241
|
+
beginModuleRegistration(moduleHref);
|
|
242
|
+
try {
|
|
243
|
+
const moduleValue = await runtime.import(moduleHref);
|
|
244
|
+
const registeredDefinitions = consumeModuleRegistrations(moduleHref);
|
|
245
|
+
const defaultDefinition = moduleValue.default;
|
|
246
|
+
const definitions = [...registeredDefinitions, ...defaultDefinition == null ? [] : [defaultDefinition]];
|
|
247
|
+
const deduplicatedDefinitions = definitions.filter((definition, index) => {
|
|
248
|
+
const key = `${definition.name}::${definition.description}::${definition.task?.id ?? ""}`;
|
|
249
|
+
return definitions.findIndex((candidate) => `${candidate.name}::${candidate.description}::${candidate.task?.id ?? ""}` === key) === index;
|
|
250
|
+
});
|
|
251
|
+
if (deduplicatedDefinitions.length === 0) continue;
|
|
252
|
+
for (const [definitionIndex, definition] of deduplicatedDefinitions.entries()) {
|
|
253
|
+
const moduleKey = definitionIndex === 0 ? moduleHref : `${moduleHref}#registration-${definitionIndex + 1}`;
|
|
254
|
+
loadedModules[moduleKey] = { default: definition };
|
|
255
|
+
}
|
|
256
|
+
} finally {
|
|
257
|
+
endModuleRegistration();
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
} finally {
|
|
261
|
+
await runtime.close();
|
|
262
|
+
}
|
|
263
|
+
return loadedModules;
|
|
264
|
+
}
|
|
265
|
+
//#endregion
|
|
36
266
|
//#region src/cli/reporters/noop-reporter.ts
|
|
37
267
|
/**
|
|
38
268
|
* Creates a reporter that intentionally does nothing.
|
|
@@ -684,7 +914,144 @@ function getTextDisplayWidth(text) {
|
|
|
684
914
|
return stringWidth(stripVTControlCharacters(text));
|
|
685
915
|
}
|
|
686
916
|
//#endregion
|
|
917
|
+
//#region src/cli/reporters/vitest-compat-reporter.ts
|
|
918
|
+
function isReporterReferenceTuple(reference) {
|
|
919
|
+
return Array.isArray(reference);
|
|
920
|
+
}
|
|
921
|
+
function isAbsoluteLikePath(value) {
|
|
922
|
+
return value.startsWith("/") || value.startsWith("./") || value.startsWith("../") || /^[A-Z]:[\\/]/i.test(value);
|
|
923
|
+
}
|
|
924
|
+
async function loadReporterModule(path) {
|
|
925
|
+
if (isAbsoluteLikePath(path)) return import(pathToFileURL(path).href);
|
|
926
|
+
return import(path);
|
|
927
|
+
}
|
|
928
|
+
function normalizeReporterReference(reference) {
|
|
929
|
+
if (isReporterReferenceTuple(reference)) return {
|
|
930
|
+
options: reference[1],
|
|
931
|
+
value: reference[0]
|
|
932
|
+
};
|
|
933
|
+
return {
|
|
934
|
+
options: void 0,
|
|
935
|
+
value: reference
|
|
936
|
+
};
|
|
937
|
+
}
|
|
938
|
+
function createReporterInstance(moduleValue, options) {
|
|
939
|
+
const value = moduleValue.default ?? moduleValue;
|
|
940
|
+
if (value == null) return null;
|
|
941
|
+
if (typeof value === "function") return new value(options);
|
|
942
|
+
if (typeof value === "object") return value;
|
|
943
|
+
return null;
|
|
944
|
+
}
|
|
945
|
+
async function emitToReporters(reporters, callback) {
|
|
946
|
+
await Promise.all(reporters.map(async (reporter) => {
|
|
947
|
+
try {
|
|
948
|
+
await callback(reporter);
|
|
949
|
+
} catch {}
|
|
950
|
+
}));
|
|
951
|
+
}
|
|
952
|
+
/**
|
|
953
|
+
* Creates a project-level vitest-compatible reporter bridge.
|
|
954
|
+
*
|
|
955
|
+
* Use when:
|
|
956
|
+
* - `vieval` should reuse vitest-like reporter callbacks without changing CLI output contracts
|
|
957
|
+
*
|
|
958
|
+
* Expects:
|
|
959
|
+
* - references point to modules whose default export is a reporter instance or constructor
|
|
960
|
+
*
|
|
961
|
+
* Returns:
|
|
962
|
+
* - `null` when no reporter references are configured
|
|
963
|
+
*/
|
|
964
|
+
async function createVievalVitestCompatReporterBridge(options) {
|
|
965
|
+
if (options.references.length === 0) return null;
|
|
966
|
+
const loadedReporters = [];
|
|
967
|
+
for (const reference of options.references) {
|
|
968
|
+
const normalized = normalizeReporterReference(reference);
|
|
969
|
+
try {
|
|
970
|
+
const instance = createReporterInstance(typeof normalized.value === "string" ? await loadReporterModule(normalized.value) : normalized.value, normalized.options);
|
|
971
|
+
if (instance != null) loadedReporters.push(instance);
|
|
972
|
+
} catch {}
|
|
973
|
+
}
|
|
974
|
+
if (loadedReporters.length === 0) return null;
|
|
975
|
+
const modulesByTaskId = /* @__PURE__ */ new Map();
|
|
976
|
+
const casesByCompositeId = /* @__PURE__ */ new Map();
|
|
977
|
+
function getOrCreateModule(taskId) {
|
|
978
|
+
const existing = modulesByTaskId.get(taskId);
|
|
979
|
+
if (existing != null) return existing;
|
|
980
|
+
const created = {
|
|
981
|
+
id: taskId,
|
|
982
|
+
name: taskId,
|
|
983
|
+
projectName: options.projectName
|
|
984
|
+
};
|
|
985
|
+
modulesByTaskId.set(taskId, created);
|
|
986
|
+
return created;
|
|
987
|
+
}
|
|
988
|
+
function getOrCreateCase(taskId, caseId) {
|
|
989
|
+
const compositeId = `${taskId}::${caseId}`;
|
|
990
|
+
const existing = casesByCompositeId.get(compositeId);
|
|
991
|
+
if (existing != null) return existing;
|
|
992
|
+
const created = {
|
|
993
|
+
id: caseId,
|
|
994
|
+
module: getOrCreateModule(taskId),
|
|
995
|
+
name: caseId,
|
|
996
|
+
state: "pending"
|
|
997
|
+
};
|
|
998
|
+
casesByCompositeId.set(compositeId, created);
|
|
999
|
+
return created;
|
|
1000
|
+
}
|
|
1001
|
+
return {
|
|
1002
|
+
async onCaseEnd(payload) {
|
|
1003
|
+
const taskCase = getOrCreateCase(payload.taskId, payload.caseId);
|
|
1004
|
+
taskCase.state = payload.state;
|
|
1005
|
+
await emitToReporters(loadedReporters, (reporter) => reporter.onTestCaseResult?.(taskCase));
|
|
1006
|
+
},
|
|
1007
|
+
async onCaseStart(payload) {
|
|
1008
|
+
const taskCase = getOrCreateCase(payload.taskId, payload.caseId);
|
|
1009
|
+
await emitToReporters(loadedReporters, (reporter) => reporter.onTestCaseReady?.(taskCase));
|
|
1010
|
+
},
|
|
1011
|
+
async onRunEnd(run) {
|
|
1012
|
+
const modules = [...modulesByTaskId.values()];
|
|
1013
|
+
const errors = run.failed ? [{ message: "vieval run failed" }] : [];
|
|
1014
|
+
await emitToReporters(loadedReporters, (reporter) => reporter.onTestRunEnd?.(modules, errors, run.failed ? "failed" : "passed"));
|
|
1015
|
+
},
|
|
1016
|
+
async onRunStart() {
|
|
1017
|
+
const specifications = [...modulesByTaskId.values()].map((module) => ({
|
|
1018
|
+
moduleId: module.id,
|
|
1019
|
+
projectName: module.projectName
|
|
1020
|
+
}));
|
|
1021
|
+
await emitToReporters(loadedReporters, (reporter) => reporter.onTestRunStart?.(specifications));
|
|
1022
|
+
},
|
|
1023
|
+
async onTaskEnd(payload) {
|
|
1024
|
+
const module = getOrCreateModule(payload.taskId);
|
|
1025
|
+
if (payload.state === "failed") {
|
|
1026
|
+
const syntheticCase = getOrCreateCase(payload.taskId, `${payload.taskId}:task`);
|
|
1027
|
+
syntheticCase.state = "failed";
|
|
1028
|
+
await emitToReporters(loadedReporters, (reporter) => reporter.onTestCaseResult?.(syntheticCase));
|
|
1029
|
+
}
|
|
1030
|
+
await emitToReporters(loadedReporters, (reporter) => reporter.onTestModuleEnd?.(module));
|
|
1031
|
+
},
|
|
1032
|
+
async onTaskQueued(payload) {
|
|
1033
|
+
const module = getOrCreateModule(payload.taskId);
|
|
1034
|
+
await emitToReporters(loadedReporters, (reporter) => reporter.onTestModuleQueued?.(module));
|
|
1035
|
+
await emitToReporters(loadedReporters, (reporter) => reporter.onTestModuleCollected?.(module));
|
|
1036
|
+
},
|
|
1037
|
+
async onTaskStart(payload) {
|
|
1038
|
+
const module = getOrCreateModule(payload.taskId);
|
|
1039
|
+
await emitToReporters(loadedReporters, (reporter) => reporter.onTestModuleStart?.(module));
|
|
1040
|
+
}
|
|
1041
|
+
};
|
|
1042
|
+
}
|
|
1043
|
+
//#endregion
|
|
687
1044
|
//#region src/cli/run.ts
|
|
1045
|
+
/**
|
|
1046
|
+
* Returns true when output contains at least one failing project/task/case outcome.
|
|
1047
|
+
*/
|
|
1048
|
+
function hasRunFailures(output) {
|
|
1049
|
+
return output.projects.some((project) => {
|
|
1050
|
+
if (project.errorMessage != null) return true;
|
|
1051
|
+
if (project.caseSummary != null && project.caseSummary.failed > 0) return true;
|
|
1052
|
+
return (project.caseFailures?.length ?? 0) > 0;
|
|
1053
|
+
});
|
|
1054
|
+
}
|
|
688
1055
|
function shouldUseColor() {
|
|
689
1056
|
if (process.env.NO_COLOR != null) return false;
|
|
690
1057
|
const forceColor = process.env.FORCE_COLOR;
|
|
@@ -733,37 +1100,94 @@ function formatDuration(durationMs, colors) {
|
|
|
733
1100
|
const rounded = Math.round(durationMs);
|
|
734
1101
|
return (rounded > 1e3 ? colors.yellow : colors.green)(` ${rounded}${colors.dim("ms")}`);
|
|
735
1102
|
}
|
|
736
|
-
async function loadEvalModules(evalFilePaths) {
|
|
737
|
-
const loadedModules = {};
|
|
738
|
-
for (const [moduleIndex, evalFilePath] of evalFilePaths.entries()) {
|
|
739
|
-
const moduleHref = pathToFileURL(evalFilePath).href;
|
|
740
|
-
const importHref = `${moduleHref}?vieval_load=${Date.now()}_${moduleIndex}`;
|
|
741
|
-
beginModuleRegistration(importHref);
|
|
742
|
-
try {
|
|
743
|
-
const moduleValue = await import(importHref);
|
|
744
|
-
const registeredDefinitions = consumeModuleRegistrations(importHref);
|
|
745
|
-
const defaultDefinition = moduleValue.default;
|
|
746
|
-
const definitions = [...registeredDefinitions, ...defaultDefinition == null ? [] : [defaultDefinition]];
|
|
747
|
-
const deduplicatedDefinitions = definitions.filter((definition, index) => {
|
|
748
|
-
const key = `${definition.name}::${definition.description}::${definition.task?.id ?? ""}`;
|
|
749
|
-
return definitions.findIndex((candidate) => `${candidate.name}::${candidate.description}::${candidate.task?.id ?? ""}` === key) === index;
|
|
750
|
-
});
|
|
751
|
-
if (deduplicatedDefinitions.length === 0) continue;
|
|
752
|
-
for (const [definitionIndex, definition] of deduplicatedDefinitions.entries()) {
|
|
753
|
-
const moduleKey = definitionIndex === 0 ? moduleHref : `${moduleHref}#registration-${definitionIndex + 1}`;
|
|
754
|
-
loadedModules[moduleKey] = { default: definition };
|
|
755
|
-
}
|
|
756
|
-
} finally {
|
|
757
|
-
endModuleRegistration();
|
|
758
|
-
}
|
|
759
|
-
}
|
|
760
|
-
return loadedModules;
|
|
761
|
-
}
|
|
762
1103
|
function filterProjectsByName(projects, names) {
|
|
763
1104
|
if (names.length === 0) return [...projects];
|
|
764
1105
|
const nameSet = new Set(names);
|
|
765
1106
|
return projects.filter((project) => nameSet.has(project.name));
|
|
766
1107
|
}
|
|
1108
|
+
function sanitizeIdentitySegment(value) {
|
|
1109
|
+
const normalized = value.trim();
|
|
1110
|
+
if (normalized.length === 0) return "default";
|
|
1111
|
+
return normalized.replace(/[^\w.-]+/g, "-");
|
|
1112
|
+
}
|
|
1113
|
+
function createRunIdentity(options) {
|
|
1114
|
+
const workspaceId = sanitizeIdentitySegment(options.workspace ?? "default-workspace");
|
|
1115
|
+
const experimentId = sanitizeIdentitySegment(options.experiment ?? "default-experiment");
|
|
1116
|
+
return {
|
|
1117
|
+
attemptId: sanitizeIdentitySegment(options.attempt ?? `attempt-${(/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-")}`),
|
|
1118
|
+
experimentId,
|
|
1119
|
+
runId: `run-${Date.now()}-${randomUUID().slice(0, 8)}`,
|
|
1120
|
+
workspaceId
|
|
1121
|
+
};
|
|
1122
|
+
}
|
|
1123
|
+
function deriveReportProjectId(output) {
|
|
1124
|
+
const uniqueProjectNames = [...new Set(output.projects.map((project) => project.name))];
|
|
1125
|
+
if (uniqueProjectNames.length === 1) return sanitizeIdentitySegment(uniqueProjectNames[0] ?? "default-project");
|
|
1126
|
+
return "multi-project";
|
|
1127
|
+
}
|
|
1128
|
+
function createEventRecorder(identity) {
|
|
1129
|
+
const events = [];
|
|
1130
|
+
const taskProjectMap = /* @__PURE__ */ new Map();
|
|
1131
|
+
return {
|
|
1132
|
+
events,
|
|
1133
|
+
record(event, payload, metadata) {
|
|
1134
|
+
const maybeTaskPayload = payload;
|
|
1135
|
+
const taskId = metadata?.taskId ?? maybeTaskPayload?.taskId;
|
|
1136
|
+
const caseId = metadata?.caseId ?? payload?.caseId;
|
|
1137
|
+
const projectName = metadata?.projectName ?? maybeTaskPayload?.projectName;
|
|
1138
|
+
if (taskId != null && projectName != null) taskProjectMap.set(taskId, projectName);
|
|
1139
|
+
events.push({
|
|
1140
|
+
attemptId: identity.attemptId,
|
|
1141
|
+
caseId,
|
|
1142
|
+
data: payload,
|
|
1143
|
+
event,
|
|
1144
|
+
experimentId: identity.experimentId,
|
|
1145
|
+
projectId: taskId == null ? void 0 : taskProjectMap.get(taskId),
|
|
1146
|
+
runId: identity.runId,
|
|
1147
|
+
schemaVersion: 1,
|
|
1148
|
+
taskId,
|
|
1149
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1150
|
+
version: 1,
|
|
1151
|
+
workspaceId: identity.workspaceId
|
|
1152
|
+
});
|
|
1153
|
+
}
|
|
1154
|
+
};
|
|
1155
|
+
}
|
|
1156
|
+
function createReporterWithEventCapture(reporter, recordEvent) {
|
|
1157
|
+
return {
|
|
1158
|
+
dispose() {
|
|
1159
|
+
reporter.dispose();
|
|
1160
|
+
},
|
|
1161
|
+
onCaseEnd(payload) {
|
|
1162
|
+
recordEvent("CaseEnded", payload);
|
|
1163
|
+
reporter.onCaseEnd(payload);
|
|
1164
|
+
},
|
|
1165
|
+
onCaseStart(payload) {
|
|
1166
|
+
recordEvent("CaseStarted", payload);
|
|
1167
|
+
reporter.onCaseStart(payload);
|
|
1168
|
+
},
|
|
1169
|
+
onRunEnd(payload) {
|
|
1170
|
+
recordEvent("RunEnded", payload);
|
|
1171
|
+
reporter.onRunEnd(payload);
|
|
1172
|
+
},
|
|
1173
|
+
onRunStart(payload) {
|
|
1174
|
+
recordEvent("RunStarted", payload);
|
|
1175
|
+
reporter.onRunStart(payload);
|
|
1176
|
+
},
|
|
1177
|
+
onTaskEnd(payload) {
|
|
1178
|
+
recordEvent("TaskEnded", payload);
|
|
1179
|
+
reporter.onTaskEnd(payload);
|
|
1180
|
+
},
|
|
1181
|
+
onTaskQueued(payload) {
|
|
1182
|
+
recordEvent("TaskQueued", payload);
|
|
1183
|
+
reporter.onTaskQueued(payload);
|
|
1184
|
+
},
|
|
1185
|
+
onTaskStart(payload) {
|
|
1186
|
+
recordEvent("TaskStarted", payload);
|
|
1187
|
+
reporter.onTaskStart(payload);
|
|
1188
|
+
}
|
|
1189
|
+
};
|
|
1190
|
+
}
|
|
767
1191
|
function applyRunEnvironment(env) {
|
|
768
1192
|
const envEntries = Object.entries(env);
|
|
769
1193
|
if (envEntries.length === 0) return () => {};
|
|
@@ -876,7 +1300,7 @@ function createTaskQueuePayload(task, projectName) {
|
|
|
876
1300
|
function createTaskCaseReporterId(payload) {
|
|
877
1301
|
return `${payload.index}:${encodeURIComponent(payload.name)}`;
|
|
878
1302
|
}
|
|
879
|
-
function createTaskReporterHooks(task, reporter, projectCaseCounters) {
|
|
1303
|
+
function createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
|
|
880
1304
|
function syncCaseTotal(total) {
|
|
881
1305
|
reporter.onTaskQueued({
|
|
882
1306
|
taskId: task.id,
|
|
@@ -896,8 +1320,21 @@ function createTaskReporterHooks(task, reporter, projectCaseCounters) {
|
|
|
896
1320
|
}
|
|
897
1321
|
}
|
|
898
1322
|
syncCaseTotal(payload.total);
|
|
1323
|
+
if (payload.state === "failed" && payload.errorMessage != null && projectCaseFailures != null) projectCaseFailures.push({
|
|
1324
|
+
caseId,
|
|
1325
|
+
caseName: payload.name,
|
|
1326
|
+
errorMessage: payload.errorMessage,
|
|
1327
|
+
taskId: task.id
|
|
1328
|
+
});
|
|
899
1329
|
reporter.onCaseEnd({
|
|
900
1330
|
caseId,
|
|
1331
|
+
errorMessage: payload.errorMessage,
|
|
1332
|
+
state: payload.state,
|
|
1333
|
+
taskId: task.id
|
|
1334
|
+
});
|
|
1335
|
+
vitestCompatReporter?.onCaseEnd({
|
|
1336
|
+
caseId,
|
|
1337
|
+
errorMessage: payload.errorMessage,
|
|
901
1338
|
state: payload.state,
|
|
902
1339
|
taskId: task.id
|
|
903
1340
|
});
|
|
@@ -910,32 +1347,49 @@ function createTaskReporterHooks(task, reporter, projectCaseCounters) {
|
|
|
910
1347
|
caseName: payload.name,
|
|
911
1348
|
taskId: task.id
|
|
912
1349
|
});
|
|
1350
|
+
vitestCompatReporter?.onCaseStart({
|
|
1351
|
+
caseId,
|
|
1352
|
+
taskId: task.id
|
|
1353
|
+
});
|
|
1354
|
+
},
|
|
1355
|
+
onEvent(payload) {
|
|
1356
|
+
recordEvent(payload.event, payload.data, {
|
|
1357
|
+
caseId: payload.caseId,
|
|
1358
|
+
projectName,
|
|
1359
|
+
taskId: task.id
|
|
1360
|
+
});
|
|
913
1361
|
}
|
|
914
1362
|
};
|
|
915
1363
|
}
|
|
916
|
-
function createCliTaskExecutionContext(task, models, reporter, projectCaseCounters) {
|
|
1364
|
+
function createCliTaskExecutionContext(task, models, cacheRootDirectory, cacheProjectName, workspaceId, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
|
|
917
1365
|
return {
|
|
918
1366
|
...createTaskExecutionContext({
|
|
1367
|
+
cache: createFilesystemTaskCacheRuntime({
|
|
1368
|
+
cacheRootDirectory,
|
|
1369
|
+
projectName: cacheProjectName,
|
|
1370
|
+
workspaceId
|
|
1371
|
+
}),
|
|
919
1372
|
models,
|
|
920
1373
|
task
|
|
921
1374
|
}),
|
|
922
|
-
reporterHooks: createTaskReporterHooks(task, reporter, projectCaseCounters)
|
|
1375
|
+
reporterHooks: createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter)
|
|
923
1376
|
};
|
|
924
1377
|
}
|
|
925
|
-
function resolveTaskReporterHooks(task, context, reporter, projectCaseCounters) {
|
|
926
|
-
return context.reporterHooks ?? createTaskReporterHooks(task, reporter, projectCaseCounters);
|
|
1378
|
+
/**
 * Returns the reporter hooks already attached to `context`, or creates a fresh
 * set via `createTaskReporterHooks` when the context has none (null/undefined).
 */
function resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
	const providedHooks = context.reporterHooks;
	if (providedHooks != null) return providedHooks;
	return createTaskReporterHooks(
		task,
		reporter,
		projectName,
		recordEvent,
		projectCaseCounters,
		projectCaseFailures,
		vitestCompatReporter
	);
}
|
|
928
1381
|
/**
 * Extracts the failing task id from a runner error.
 *
 * Returns `error.taskId` when `error` is a `RunnerExecutionError`, otherwise `null`.
 */
function getFailedTaskId(error) {
	return error instanceof RunnerExecutionError ? error.taskId : null;
}
|
|
932
|
-
function createAutoTaskExecutor(reporter, projectCaseCounters) {
|
|
1385
|
+
function createAutoTaskExecutor(reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
|
|
933
1386
|
return async (task, context) => {
|
|
934
1387
|
const taskDefinition = task.entry.task;
|
|
935
1388
|
if (taskDefinition == null) throw new Error(`Missing eval task definition for entry "${task.entry.id}".`);
|
|
936
1389
|
const output = await taskDefinition.run({
|
|
1390
|
+
cache: context.cache,
|
|
937
1391
|
model: context.model,
|
|
938
|
-
reporterHooks: resolveTaskReporterHooks(task, context, reporter, projectCaseCounters),
|
|
1392
|
+
reporterHooks: resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
|
|
939
1393
|
task
|
|
940
1394
|
});
|
|
941
1395
|
return {
|
|
@@ -985,7 +1439,7 @@ async function prepareProject(project) {
|
|
|
985
1439
|
include: project.include,
|
|
986
1440
|
root: project.root
|
|
987
1441
|
});
|
|
988
|
-
const entries = collectEvalEntries(await
|
|
1442
|
+
const entries = collectEvalEntries(await loadEvalModulesWithVitestRuntime(evalFilePaths, project.root), runtimeContext);
|
|
989
1443
|
const tasks = createRunnerSchedule({
|
|
990
1444
|
evalMatrix: project.evalMatrix,
|
|
991
1445
|
entries,
|
|
@@ -997,6 +1451,7 @@ async function prepareProject(project) {
|
|
|
997
1451
|
kind: "summary",
|
|
998
1452
|
summary: {
|
|
999
1453
|
caseSummary: null,
|
|
1454
|
+
caseFailures: [],
|
|
1000
1455
|
discoveredEvalFileCount: evalFilePaths.length,
|
|
1001
1456
|
durationMs: Date.now() - startedAt,
|
|
1002
1457
|
entryCount: entries.length,
|
|
@@ -1024,6 +1479,7 @@ async function prepareProject(project) {
|
|
|
1024
1479
|
kind: "summary",
|
|
1025
1480
|
summary: {
|
|
1026
1481
|
caseSummary: null,
|
|
1482
|
+
caseFailures: [],
|
|
1027
1483
|
discoveredEvalFileCount: 0,
|
|
1028
1484
|
durationMs: Date.now() - startedAt,
|
|
1029
1485
|
entryCount: 0,
|
|
@@ -1037,7 +1493,7 @@ async function prepareProject(project) {
|
|
|
1037
1493
|
};
|
|
1038
1494
|
}
|
|
1039
1495
|
}
|
|
1040
|
-
async function executePreparedProject(prepared, reporter, counters) {
|
|
1496
|
+
async function executePreparedProject(prepared, identity, cacheProjectName, reporter, counters, recordEvent) {
|
|
1041
1497
|
const settledTaskIds = /* @__PURE__ */ new Set();
|
|
1042
1498
|
const projectCaseCounters = {
|
|
1043
1499
|
failed: 0,
|
|
@@ -1045,17 +1501,24 @@ async function executePreparedProject(prepared, reporter, counters) {
|
|
|
1045
1501
|
seenCaseIds: /* @__PURE__ */ new Set(),
|
|
1046
1502
|
skipped: 0
|
|
1047
1503
|
};
|
|
1048
|
-
const
|
|
1504
|
+
const projectCaseFailures = [];
|
|
1505
|
+
const vitestCompatReporter = await createVievalVitestCompatReporterBridge({
|
|
1506
|
+
projectName: prepared.name,
|
|
1507
|
+
references: prepared.project.reporters
|
|
1508
|
+
});
|
|
1509
|
+
const rawTaskExecutor = prepared.project.executor ?? createAutoTaskExecutor(reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
|
|
1049
1510
|
const taskExecutor = async (task, context) => {
|
|
1050
1511
|
return {
|
|
1051
1512
|
...await rawTaskExecutor(task, context),
|
|
1052
1513
|
matrix: cloneScheduledTaskMatrix(task)
|
|
1053
1514
|
};
|
|
1054
1515
|
};
|
|
1516
|
+
for (const task of prepared.tasks) await vitestCompatReporter?.onTaskQueued({ taskId: task.id });
|
|
1517
|
+
await vitestCompatReporter?.onRunStart();
|
|
1055
1518
|
try {
|
|
1056
1519
|
const aggregated = await runScheduledTasks(prepared.tasks, taskExecutor, {
|
|
1057
1520
|
createExecutionContext(task) {
|
|
1058
|
-
return createCliTaskExecutionContext(task, prepared.project.models, reporter, projectCaseCounters);
|
|
1521
|
+
return createCliTaskExecutionContext(task, prepared.project.models, resolve(prepared.project.root, ".vieval", "cache"), cacheProjectName ?? prepared.name, identity.workspaceId, reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
|
|
1059
1522
|
},
|
|
1060
1523
|
onTaskEnd(task, state) {
|
|
1061
1524
|
settledTaskIds.add(task.id);
|
|
@@ -1063,6 +1526,10 @@ async function executePreparedProject(prepared, reporter, counters) {
|
|
|
1063
1526
|
state,
|
|
1064
1527
|
taskId: task.id
|
|
1065
1528
|
});
|
|
1529
|
+
vitestCompatReporter?.onTaskEnd({
|
|
1530
|
+
state,
|
|
1531
|
+
taskId: task.id
|
|
1532
|
+
});
|
|
1066
1533
|
if (state === "passed") {
|
|
1067
1534
|
counters.passedTasks += 1;
|
|
1068
1535
|
return;
|
|
@@ -1071,8 +1538,10 @@ async function executePreparedProject(prepared, reporter, counters) {
|
|
|
1071
1538
|
},
|
|
1072
1539
|
onTaskStart(task) {
|
|
1073
1540
|
reporter.onTaskStart({ taskId: task.id });
|
|
1541
|
+
vitestCompatReporter?.onTaskStart({ taskId: task.id });
|
|
1074
1542
|
}
|
|
1075
1543
|
});
|
|
1544
|
+
await vitestCompatReporter?.onRunEnd({ failed: false });
|
|
1076
1545
|
return {
|
|
1077
1546
|
caseSummary: {
|
|
1078
1547
|
failed: projectCaseCounters.failed,
|
|
@@ -1080,6 +1549,7 @@ async function executePreparedProject(prepared, reporter, counters) {
|
|
|
1080
1549
|
skipped: projectCaseCounters.skipped,
|
|
1081
1550
|
total: projectCaseCounters.seenCaseIds.size
|
|
1082
1551
|
},
|
|
1552
|
+
caseFailures: projectCaseFailures,
|
|
1083
1553
|
discoveredEvalFileCount: prepared.discoveredEvalFileCount,
|
|
1084
1554
|
durationMs: Date.now() - prepared.startedAt,
|
|
1085
1555
|
entryCount: prepared.entryCount,
|
|
@@ -1099,6 +1569,10 @@ async function executePreparedProject(prepared, reporter, counters) {
|
|
|
1099
1569
|
state: "failed",
|
|
1100
1570
|
taskId: failedTaskId
|
|
1101
1571
|
});
|
|
1572
|
+
await vitestCompatReporter?.onTaskEnd({
|
|
1573
|
+
state: "failed",
|
|
1574
|
+
taskId: failedTaskId
|
|
1575
|
+
});
|
|
1102
1576
|
}
|
|
1103
1577
|
for (const task of prepared.tasks) {
|
|
1104
1578
|
if (settledTaskIds.has(task.id)) continue;
|
|
@@ -1108,7 +1582,12 @@ async function executePreparedProject(prepared, reporter, counters) {
|
|
|
1108
1582
|
state: "skipped",
|
|
1109
1583
|
taskId: task.id
|
|
1110
1584
|
});
|
|
1585
|
+
await vitestCompatReporter?.onTaskEnd({
|
|
1586
|
+
state: "skipped",
|
|
1587
|
+
taskId: task.id
|
|
1588
|
+
});
|
|
1111
1589
|
}
|
|
1590
|
+
await vitestCompatReporter?.onRunEnd({ failed: true });
|
|
1112
1591
|
return {
|
|
1113
1592
|
caseSummary: {
|
|
1114
1593
|
failed: projectCaseCounters.failed,
|
|
@@ -1116,6 +1595,7 @@ async function executePreparedProject(prepared, reporter, counters) {
|
|
|
1116
1595
|
skipped: projectCaseCounters.skipped,
|
|
1117
1596
|
total: projectCaseCounters.seenCaseIds.size
|
|
1118
1597
|
},
|
|
1598
|
+
caseFailures: projectCaseFailures,
|
|
1119
1599
|
discoveredEvalFileCount: prepared.discoveredEvalFileCount,
|
|
1120
1600
|
durationMs: Date.now() - prepared.startedAt,
|
|
1121
1601
|
entryCount: prepared.entryCount,
|
|
@@ -1128,6 +1608,14 @@ async function executePreparedProject(prepared, reporter, counters) {
|
|
|
1128
1608
|
};
|
|
1129
1609
|
}
|
|
1130
1610
|
}
|
|
1611
|
+
/**
 * Persists one run's report artifacts and returns the directory they were written to.
 *
 * Layout: `<reportOut>/<workspaceId>/<projectId>/<experimentId>/<attemptId>/<runId>/`
 * containing `run-summary.json` (pretty-printed `output`) and `events.jsonl`
 * (one JSON document per line; empty file when there are no events).
 */
async function writeRunReportArtifacts(output, events, identity, reportOut) {
	const projectId = deriveReportProjectId(output);
	const pathSegments = [
		identity.workspaceId,
		projectId,
		identity.experimentId,
		identity.attemptId,
		identity.runId
	];
	const reportDirectory = resolve(reportOut, ...pathSegments);
	await mkdir(reportDirectory, { recursive: true });

	const summaryText = `${JSON.stringify(output, null, 2)}\n`;
	await writeFile(resolve(reportDirectory, "run-summary.json"), summaryText, "utf-8");

	// Terminate the last line with a newline only when there is at least one event.
	const serializedEvents = events.map((event) => JSON.stringify(event));
	const eventsText = serializedEvents.length > 0 ? `${serializedEvents.join("\n")}\n` : "";
	await writeFile(resolve(reportDirectory, "events.jsonl"), eventsText, "utf-8");

	return reportDirectory;
}
|
|
1131
1619
|
/**
|
|
1132
1620
|
* Runs vieval orchestration from config and returns project-level summaries.
|
|
1133
1621
|
*
|
|
@@ -1145,12 +1633,14 @@ async function executePreparedProject(prepared, reporter, counters) {
|
|
|
1145
1633
|
* - keeping business-agent eval files near their implementation packages
|
|
1146
1634
|
*/
|
|
1147
1635
|
async function runVievalCli(options = {}) {
|
|
1636
|
+
const identity = createRunIdentity(options);
|
|
1148
1637
|
const loadedConfig = await loadVievalCliConfig({
|
|
1149
1638
|
configFilePath: options.configFilePath,
|
|
1150
1639
|
cwd: options.cwd
|
|
1151
1640
|
});
|
|
1152
1641
|
const restoreEnvironment = applyRunEnvironment(loadedConfig.env);
|
|
1153
|
-
const
|
|
1642
|
+
const eventRecorder = createEventRecorder(identity);
|
|
1643
|
+
const reporter = createReporterWithEventCapture(createRunReporter(options.reporter), eventRecorder.record);
|
|
1154
1644
|
try {
|
|
1155
1645
|
const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
|
|
1156
1646
|
const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
|
|
@@ -1176,7 +1666,7 @@ async function runVievalCli(options = {}) {
|
|
|
1176
1666
|
projectSummaries.push(preparedProject.summary);
|
|
1177
1667
|
continue;
|
|
1178
1668
|
}
|
|
1179
|
-
projectSummaries.push(await executePreparedProject(preparedProject.prepared, reporter, reporterCounters));
|
|
1669
|
+
projectSummaries.push(await executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, reporter, reporterCounters, eventRecorder.record));
|
|
1180
1670
|
}
|
|
1181
1671
|
reporter.onRunEnd({
|
|
1182
1672
|
failedTasks: reporterCounters.failedTasks,
|
|
@@ -1184,10 +1674,17 @@ async function runVievalCli(options = {}) {
|
|
|
1184
1674
|
skippedTasks: reporterCounters.skippedTasks + skippedSummaryTasks,
|
|
1185
1675
|
totalTasks
|
|
1186
1676
|
});
|
|
1187
|
-
|
|
1677
|
+
const output = {
|
|
1678
|
+
attemptId: identity.attemptId,
|
|
1188
1679
|
configFilePath: loadedConfig.configFilePath,
|
|
1189
|
-
|
|
1680
|
+
experimentId: identity.experimentId,
|
|
1681
|
+
projects: projectSummaries,
|
|
1682
|
+
reportDirectory: null,
|
|
1683
|
+
runId: identity.runId,
|
|
1684
|
+
workspaceId: identity.workspaceId
|
|
1190
1685
|
};
|
|
1686
|
+
if (options.reportOut != null) output.reportDirectory = await writeRunReportArtifacts(output, eventRecorder.events, identity, options.reportOut);
|
|
1687
|
+
return output;
|
|
1191
1688
|
} finally {
|
|
1192
1689
|
reporter.dispose();
|
|
1193
1690
|
restoreEnvironment();
|
|
@@ -1239,7 +1736,9 @@ function formatVievalCliRunOutput(output) {
|
|
|
1239
1736
|
totalTasks += project.taskCount;
|
|
1240
1737
|
executedTasks += project.result?.overall.runCount ?? 0;
|
|
1241
1738
|
const badge = createProjectBadge(project.name, colors, colorEnabled);
|
|
1242
|
-
|
|
1739
|
+
const isFailed = project.errorMessage != null;
|
|
1740
|
+
const hasFailedCases = (project.caseSummary?.failed ?? 0) > 0 || (project.caseFailures?.length ?? 0) > 0;
|
|
1741
|
+
if (isFailed) {
|
|
1243
1742
|
failedProjects += 1;
|
|
1244
1743
|
lines.push(` ${colors.red("❯")} ${badge}${formatDuration(project.durationMs, colors)}`);
|
|
1245
1744
|
lines.push(` ${project.errorMessage}`);
|
|
@@ -1256,7 +1755,8 @@ function formatVievalCliRunOutput(output) {
|
|
|
1256
1755
|
if (scheduleBreakdown != null) lines.push(` ${scheduleBreakdown}`);
|
|
1257
1756
|
continue;
|
|
1258
1757
|
}
|
|
1259
|
-
|
|
1758
|
+
if (hasFailedCases) failedProjects += 1;
|
|
1759
|
+
else passedProjects += 1;
|
|
1260
1760
|
const hybridAverage = project.result?.overall.hybridAverage;
|
|
1261
1761
|
const hybridAverageLabel = hybridAverage == null ? "n/a" : String(hybridAverage);
|
|
1262
1762
|
const runCount = project.result?.overall.runCount ?? 0;
|
|
@@ -1264,10 +1764,18 @@ function formatVievalCliRunOutput(output) {
|
|
|
1264
1764
|
const caseSummaryLabel = project.caseSummary == null ? "" : `, cases ${project.caseSummary.passed} passed | ${project.caseSummary.failed} failed`;
|
|
1265
1765
|
const detailsLabel = colors.dim(` ${project.discoveredEvalFileCount} files, ${project.entryCount} entries, ${runCount} runs${caseSummaryLabel}, hybrid ${hybridAverageLabel}`);
|
|
1266
1766
|
const matrixSummary = formatMatrixSummary(project.matrixSummary);
|
|
1267
|
-
lines.push(` ${colors.green("✓")} ${badge}${countLabel}${detailsLabel}${formatDuration(project.durationMs, colors)}`);
|
|
1767
|
+
lines.push(` ${hasFailedCases ? colors.red("❯") : colors.green("✓")} ${badge}${countLabel}${detailsLabel}${formatDuration(project.durationMs, colors)}`);
|
|
1268
1768
|
if (matrixSummary != null) lines.push(` ${colors.dim(matrixSummary)}`);
|
|
1269
1769
|
const scheduleBreakdown = formatScheduleBreakdown(project);
|
|
1270
1770
|
if (scheduleBreakdown != null) lines.push(` ${scheduleBreakdown}`);
|
|
1771
|
+
if ((project.caseFailures?.length ?? 0) > 0) {
|
|
1772
|
+
lines.push(` ${colors.red("Failed cases:")}`);
|
|
1773
|
+
for (const failure of project.caseFailures.slice(0, 5)) {
|
|
1774
|
+
lines.push(` ${colors.red(`- ${failure.caseName} (${failure.taskId})`)}`);
|
|
1775
|
+
for (const line of failure.errorMessage.split("\n")) lines.push(` ${colors.red(line)}`);
|
|
1776
|
+
}
|
|
1777
|
+
if (project.caseFailures.length > 5) lines.push(` ${colors.dim(`... ${project.caseFailures.length - 5} more failed cases`)}`);
|
|
1778
|
+
}
|
|
1271
1779
|
}
|
|
1272
1780
|
lines.push("");
|
|
1273
1781
|
if (failedProjects > 0 || skippedProjects > 0) {
|
|
@@ -1280,19 +1788,127 @@ function formatVievalCliRunOutput(output) {
|
|
|
1280
1788
|
return lines.join("\n");
|
|
1281
1789
|
}
|
|
1282
1790
|
//#endregion
|
|
1791
|
+
//#region src/cli/compare.ts
|
|
1792
|
+
const compareHelpText = `
|
|
1793
|
+
Compare multiple methods on one benchmark.
|
|
1794
|
+
|
|
1795
|
+
Usage
|
|
1796
|
+
$ vieval compare [--config <path>] [--comparison <id>] [--output <path>] [--format <format>]
|
|
1797
|
+
|
|
1798
|
+
Options
|
|
1799
|
+
--config Config file path (default: nearest vieval.config.*)
|
|
1800
|
+
--comparison Comparison entry id from config.comparisons
|
|
1801
|
+
--output Optional output artifact path
|
|
1802
|
+
--format Console output format: table | json (default: table)
|
|
1803
|
+
`;
|
|
1804
|
+
/**
 * Strips an optional leading `--` separator and an optional `compare`
 * subcommand token from `argv`, returning a new array.
 */
function normalizeCliArgv$4(argv) {
	let start = argv[0] === "--" ? 1 : 0;
	if (argv[start] === "compare") start += 1;
	return argv.slice(start);
}
|
|
1809
|
+
/**
 * Parses `vieval compare` CLI arguments.
 *
 * Returns `{ comparisonId, configFilePath, format, output }` where `format`
 * is `"json"` or `"table"`. The `--format` value is matched
 * case-insensitively (as `vieval report analyze` does); any value other than
 * `json` falls back to `"table"`.
 */
function parseCompareCliArguments(argv) {
	const cli = meow(compareHelpText, {
		argv: normalizeCliArgv$4(argv),
		flags: {
			config: { type: "string" },
			comparison: { type: "string" },
			format: {
				default: "table",
				type: "string"
			},
			output: { type: "string" }
		},
		importMeta: import.meta
	});
	// Previously `--format JSON` silently produced table output because the
	// comparison was case-sensitive.
	const requestedFormat = String(cli.flags.format ?? "table").toLowerCase();
	return {
		comparisonId: cli.flags.comparison,
		configFilePath: cli.flags.config,
		format: requestedFormat === "json" ? "json" : "table",
		output: cli.flags.output
	};
}
|
|
1830
|
+
/**
 * Runs one compare session from `vieval.config.*` comparison-mode config.
 *
 * Each configured method is executed sequentially through `runVievalCli`; the
 * first project error aborts the whole comparison with an `Error`. The
 * resulting report artifact is optionally written to `--output` and printed
 * to stdout as JSON or as a plain ranked list.
 *
 * Returns `{ benchmarkId, methods }` with the raw per-method run outputs.
 */
async function runCompareCli(argv) {
	const parsed = parseCompareCliArguments(argv);
	const loaded = await loadVievalComparisonConfig({
		comparisonId: parsed.comparisonId,
		configFilePath: parsed.configFilePath,
		// NOTE(review): parseCompareCliArguments does not return a `cwd` field,
		// so this is always undefined here.
		cwd: parsed.cwd
	});
	const { benchmark, methods } = loaded.config;

	const methodResults = [];
	for (const method of methods) {
		const workspaceDirectory = resolve(method.workspace);
		const runOptions = {
			cacheProjectName: benchmark.sharedCaseNamespace,
			configFilePath: method.configFilePath ?? resolve(workspaceDirectory, "vieval.config.ts"),
			cwd: workspaceDirectory,
			project: [method.project],
			workspace: benchmark.id
		};
		const output = await runVievalCli(runOptions);
		for (const project of output.projects) {
			if (project.errorMessage != null) throw new Error(`Comparison method "${method.id}" failed: ${project.errorMessage}`);
		}
		methodResults.push({
			methodId: method.id,
			output
		});
	}

	const runOutput = {
		benchmarkId: benchmark.id,
		methods: methodResults
	};
	const artifact = buildCompareReportArtifact({
		benchmarkId: runOutput.benchmarkId,
		methods: runOutput.methods,
		reportPath: loaded.configFilePath
	});
	if (parsed.output != null) {
		await writeCompareReportArtifact({
			artifact,
			outputPath: parsed.output
		});
	}

	if (parsed.format === "json") {
		process.stdout.write(`${JSON.stringify(artifact, null, 2)}\n`);
		return runOutput;
	}

	const formatScore = (score) => score == null ? "n/a" : score.toFixed(3);
	const reportLines = ["COMPARE vieval", `Benchmark ${artifact.benchmarkId}`];
	artifact.methods.forEach((method, index) => {
		const hybrid = formatScore(method.hybridAverage);
		const exact = formatScore(method.exactAverage);
		reportLines.push(`${index + 1}. ${method.methodId} hybrid=${hybrid} exact=${exact} runs=${method.runCount}`);
	});
	process.stdout.write(`${reportLines.join("\n")}\n`);
	return runOutput;
}
|
|
1882
|
+
/**
 * Runs the compare command and converts any failure into a stderr message
 * plus a non-zero `process.exitCode` instead of a rejected promise.
 */
async function runCompareCliOrExit(argv) {
	try {
		await runCompareCli(argv);
	} catch (failure) {
		const reason = errorMessageFrom(failure) ?? "Unknown compare command failure.";
		process.stderr.write(`[vieval compare] ${reason}\n`);
		process.exitCode = 1;
	}
}
|
|
1891
|
+
//#endregion
|
|
1892
|
+
//#region package.json
|
|
1893
|
+
var name = "vieval";
|
|
1894
|
+
//#endregion
|
|
1283
1895
|
//#region src/cli/eval-run.ts
|
|
1284
1896
|
const evalRunHelpText = `
|
|
1285
1897
|
Execute vieval projects from discovered or explicit config.
|
|
1286
1898
|
|
|
1287
1899
|
Usage
|
|
1288
|
-
$ vieval run [--config <path>] [--project <name>] [--json]
|
|
1900
|
+
$ vieval run [--config <path>] [--project <name>] [--json] [--report-out <path>]
|
|
1289
1901
|
|
|
1290
1902
|
Options
|
|
1291
1903
|
--config Config file path
|
|
1292
1904
|
--project Project name to execute; may be repeated
|
|
1905
|
+
--workspace Workspace id used in report artifacts
|
|
1906
|
+
--experiment Experiment id used in report artifacts
|
|
1907
|
+
--attempt Attempt id used in report artifacts
|
|
1908
|
+
--report-out Report output root directory
|
|
1293
1909
|
--json Print machine-readable JSON output
|
|
1294
1910
|
`;
|
|
1295
|
-
function normalizeCliArgv$
|
|
1911
|
+
/**
 * Strips an optional leading `--` separator and an optional `run` subcommand
 * token from `argv`, returning a new array.
 */
function normalizeCliArgv$3(argv) {
	const withoutSeparator = argv[0] === "--" ? argv.slice(1) : [...argv];
	if (withoutSeparator[0] !== "run") return withoutSeparator;
	return withoutSeparator.slice(1);
}
|
|
@@ -1300,9 +1916,22 @@ function normalizeProjectNames(projectNames) {
|
|
|
1300
1916
|
if (typeof projectNames === "string") return [projectNames];
|
|
1301
1917
|
return projectNames ?? [];
|
|
1302
1918
|
}
|
|
1919
|
+
/**
|
|
1920
|
+
* Parses `vieval run` CLI arguments into one normalized execution payload.
|
|
1921
|
+
*
|
|
1922
|
+
* Use when:
|
|
1923
|
+
* - the top-level CLI forwards `run` subcommand arguments
|
|
1924
|
+
* - tests need stable flag normalization without executing the runner
|
|
1925
|
+
*
|
|
1926
|
+
* Expects:
|
|
1927
|
+
* - argv in either direct `run` form or forwarded `-- ...` form
|
|
1928
|
+
*
|
|
1929
|
+
* Returns:
|
|
1930
|
+
* - normalized run options ready for {@link runVievalCli}
|
|
1931
|
+
*/
|
|
1303
1932
|
function parseCliArguments(argv) {
|
|
1304
1933
|
const cli = meow(evalRunHelpText, {
|
|
1305
|
-
argv: normalizeCliArgv$
|
|
1934
|
+
argv: normalizeCliArgv$3(argv),
|
|
1306
1935
|
importMeta: import.meta,
|
|
1307
1936
|
flags: {
|
|
1308
1937
|
config: { type: "string" },
|
|
@@ -1313,53 +1942,554 @@ function parseCliArguments(argv) {
|
|
|
1313
1942
|
project: {
|
|
1314
1943
|
isMultiple: true,
|
|
1315
1944
|
type: "string"
|
|
1316
|
-
}
|
|
1945
|
+
},
|
|
1946
|
+
workspace: { type: "string" },
|
|
1947
|
+
experiment: { type: "string" },
|
|
1948
|
+
attempt: { type: "string" },
|
|
1949
|
+
reportOut: { type: "string" }
|
|
1317
1950
|
}
|
|
1318
1951
|
});
|
|
1319
1952
|
return {
|
|
1953
|
+
attempt: cli.flags.attempt,
|
|
1320
1954
|
configFilePath: cli.flags.config,
|
|
1955
|
+
experiment: cli.flags.experiment,
|
|
1321
1956
|
json: cli.flags.json === true,
|
|
1322
|
-
project: normalizeProjectNames(cli.flags.project)
|
|
1957
|
+
project: normalizeProjectNames(cli.flags.project),
|
|
1958
|
+
reportOut: cli.flags.reportOut,
|
|
1959
|
+
workspace: cli.flags.workspace
|
|
1323
1960
|
};
|
|
1324
1961
|
}
|
|
1325
|
-
function isDirectExecution$1() {
|
|
1326
|
-
if (!process.argv[1]) return false;
|
|
1327
|
-
return path.resolve(process.argv[1]) === fileURLToPath(import.meta.url);
|
|
1328
|
-
}
|
|
1329
1962
|
/**
|
|
1330
|
-
*
|
|
1963
|
+
* Executes the `vieval run` subcommand.
|
|
1331
1964
|
*
|
|
1332
1965
|
* Call stack:
|
|
1333
1966
|
*
|
|
1334
|
-
*
|
|
1335
|
-
* -> {@link
|
|
1336
|
-
*
|
|
1337
|
-
*
|
|
1338
|
-
*
|
|
1967
|
+
* top-level `vieval` CLI
|
|
1968
|
+
* -> {@link runTopLevelCli} (`./index`)
|
|
1969
|
+
* -> {@link runEvalRunCli}
|
|
1970
|
+
* -> {@link parseCliArguments}
|
|
1971
|
+
* -> {@link runVievalCli}
|
|
1972
|
+
* -> `process.stdout.write(...)` / `process.stderr.write(...)`
|
|
1973
|
+
* -> `process.exitCode`
|
|
1339
1974
|
*
|
|
1340
1975
|
* Use when:
|
|
1341
|
-
* -
|
|
1342
|
-
* -
|
|
1976
|
+
* - the published `vieval` binary needs to execute the `run` subcommand
|
|
1977
|
+
* - callers want one reusable implementation without a second bundled entrypoint
|
|
1978
|
+
*
|
|
1979
|
+
* Expects:
|
|
1980
|
+
* - argv that belongs to the `run` subcommand only
|
|
1981
|
+
*
|
|
1982
|
+
* Returns:
|
|
1983
|
+
* - resolves after writing CLI output and updating `process.exitCode`
|
|
1984
|
+
*
|
|
1985
|
+
* NOTICE:
|
|
1986
|
+
* - `src/cli/index.ts` is the only direct-execution entrypoint for the bundled
|
|
1987
|
+
* CLI artifact. Keeping `eval-run.ts` reusable avoids duplicate top-level
|
|
1988
|
+
* await guards once tsdown inlines both modules into `dist/cli/index.mjs`.
|
|
1343
1989
|
*/
|
|
1344
|
-
async function
|
|
1345
|
-
const parsed = parseCliArguments(
|
|
1990
|
+
async function runEvalRunCli(argv) {
	// Argument parsing stays outside the try block, so parse errors propagate
	// to the caller instead of being reported as a run failure.
	const parsed = parseCliArguments(argv);
	try {
		const { attempt, configFilePath, experiment, project, reportOut, workspace } = parsed;
		const output = await runVievalCli({
			attempt,
			configFilePath,
			experiment,
			project,
			reportOut,
			workspace
		});
		// Both output modes print first and then flag failures through the exit code.
		const rendered = parsed.json ? JSON.stringify(output, null, 2) : formatVievalCliRunOutput(output);
		process.stdout.write(`${rendered}\n`);
		if (hasRunFailures(output)) process.exitCode = 1;
	} catch (failure) {
		const reason = errorMessageFrom(failure) ?? "Unknown CLI failure.";
		process.stderr.write(`[${name}] ${reason}\n`);
		process.exitCode = 1;
	}
}
|
|
1362
|
-
|
|
2014
|
+
//#endregion
|
|
2015
|
+
//#region src/cli/report-artifacts.ts
|
|
2016
|
+
/**
 * Resolves one or more `run-summary.json` paths from a report location.
 *
 * Resolution order:
 * 1. `reportPath` itself, when it exists and ends with `.json`
 * 2. `<reportPath>/run-summary.json`, when that file exists
 * 3. every `run-summary.json` found recursively below `reportPath`
 *
 * Returns:
 * - absolute summary file paths (sorted with `localeCompare` in case 3)
 */
async function resolveRunSummaryPaths(reportPath) {
	const absoluteReportPath = resolve(reportPath);
	const pointsAtJsonFile = absoluteReportPath.endsWith(".json") && existsSync(absoluteReportPath);
	if (pointsAtJsonFile) return [absoluteReportPath];

	const directSummaryPath = resolve(absoluteReportPath, "run-summary.json");
	if (existsSync(directSummaryPath)) return [directSummaryPath];

	const discoveredPaths = await glob("**/run-summary.json", {
		absolute: true,
		cwd: absoluteReportPath
	});
	return discoveredPaths.sort((left, right) => left.localeCompare(right));
}
|
|
2035
|
+
/**
 * Reads one run report artifact set from `run-summary.json` and sibling `events.jsonl`.
 *
 * The events file is optional; when present, blank lines are skipped and each
 * remaining line is parsed as JSON and reduced to `caseId`, `data`, `event`
 * and `taskId`.
 *
 * Returns:
 * - `{ events, eventsCount, reportDirectory, summary, summaryFilePath }`
 */
function readReportRunArtifact(summaryFilePath) {
	const reportDirectory = resolve(summaryFilePath, "..");
	const summary = JSON.parse(readFileSync(summaryFilePath, "utf-8"));

	const events = [];
	const eventsFilePath = resolve(reportDirectory, "events.jsonl");
	if (existsSync(eventsFilePath)) {
		for (const line of readFileSync(eventsFilePath, "utf-8").split("\n")) {
			if (line.trim().length === 0) continue;
			const { caseId, data, event, taskId } = JSON.parse(line);
			events.push({
				caseId,
				data,
				event,
				taskId
			});
		}
	}

	return {
		events,
		eventsCount: events.length,
		reportDirectory,
		summary,
		summaryFilePath
	};
}
|
|
2062
|
+
/**
 * Reads all run artifacts found under `reportPath`.
 *
 * Summary paths are resolved first, then each one is loaded in order.
 *
 * Use when:
 * - callers need multi-run analysis from a directory root
 */
async function readReportArtifacts(reportPath) {
	const summaryFilePaths = await resolveRunSummaryPaths(reportPath);
	const artifacts = [];
	for (const summaryFilePath of summaryFilePaths) artifacts.push(readReportRunArtifact(summaryFilePath));
	return artifacts;
}
|
|
2071
|
+
/**
 * Creates a compact summary row for one run artifact.
 *
 * Counts are derived from `artifact.summary.projects`: a project is "failed"
 * when its `errorMessage` is non-null and "executed" when `executed` is
 * truthy. Missing identity fields are reported as `null`.
 *
 * Use when:
 * - table/csv/jsonl exports should stay stable and cheap to parse
 */
function summarizeReportRunArtifact(artifact) {
	const { summary } = artifact;
	const projectNames = [];
	let failedProjects = 0;
	let executedProjects = 0;
	let totalTasks = 0;
	for (const project of summary.projects) {
		if (project.errorMessage != null) failedProjects += 1;
		if (project.executed) executedProjects += 1;
		totalTasks += project.taskCount;
		projectNames.push(project.name);
	}
	return {
		attemptId: summary.attemptId ?? null,
		eventsCount: artifact.eventsCount,
		executedProjects,
		experimentId: summary.experimentId ?? null,
		failedProjects,
		projectNames,
		reportDirectory: artifact.reportDirectory,
		runId: summary.runId ?? null,
		totalProjects: summary.projects.length,
		totalTasks,
		workspaceId: summary.workspaceId ?? null
	};
}
|
|
2097
|
+
//#endregion
|
|
2098
|
+
//#region src/cli/report-analyze.ts
|
|
2099
|
+
const reportAnalyzeHelpText = `
|
|
2100
|
+
Analyze generated vieval report artifacts.
|
|
2101
|
+
|
|
2102
|
+
Usage
|
|
2103
|
+
$ vieval report analyze <reportPath> [options]
|
|
2104
|
+
|
|
2105
|
+
Options
|
|
2106
|
+
--format Output format: table | json | jsonl | csv (default: table)
|
|
2107
|
+
--workspace Workspace id filter
|
|
2108
|
+
--project Project name filter (exact)
|
|
2109
|
+
--experiment Experiment id filter
|
|
2110
|
+
--attempt Attempt id filter
|
|
2111
|
+
--run Run id filter
|
|
2112
|
+
--task-state Keep runs containing at least one task in this state
|
|
2113
|
+
--case-state Keep runs containing at least one case in this state
|
|
2114
|
+
--contains Keep runs containing this text in event name or payload
|
|
2115
|
+
--error-contains Keep runs containing this text in project errors or event payload
|
|
2116
|
+
--run-matrix Keep runs matching run-matrix selector "key=value[,key=value]"
|
|
2117
|
+
--eval-matrix Keep runs matching eval-matrix selector "key=value[,key=value]"
|
|
2118
|
+
`;
|
|
2119
|
+
/**
 * Strips an optional leading `--` separator and an optional `report analyze`
 * (or bare `analyze`) subcommand prefix from `argv`, returning a new array.
 */
function normalizeCliArgv$2(argv) {
	const start = argv[0] === "--" ? 1 : 0;
	const first = argv[start];
	if (first === "report" && argv[start + 1] === "analyze") return argv.slice(start + 2);
	if (first === "analyze") return argv.slice(start + 1);
	return argv.slice(start);
}
|
|
2125
|
+
/**
 * Parses `vieval report analyze` CLI arguments into a normalized filter payload.
 *
 * Throws when the positional `<reportPath>` is missing, when a state filter is
 * not one of passed/failed/skipped, or when a matrix selector is malformed.
 * An unrecognized `--format` value falls back to `"table"`.
 */
function parseReportAnalyzeCliArguments(argv) {
	const flagDefinitions = {
		attempt: { type: "string" },
		caseState: { type: "string" },
		contains: { type: "string" },
		evalMatrix: { type: "string" },
		errorContains: { type: "string" },
		experiment: { type: "string" },
		format: {
			default: "table",
			type: "string"
		},
		project: { type: "string" },
		runMatrix: { type: "string" },
		run: { type: "string" },
		taskState: { type: "string" },
		workspace: { type: "string" }
	};
	const { flags, input } = meow(reportAnalyzeHelpText, {
		argv: normalizeCliArgv$2(argv),
		flags: flagDefinitions,
		importMeta: import.meta
	});

	const reportPath = input[0];
	if (reportPath == null || reportPath.length === 0) throw new Error("Missing required <reportPath> argument.");

	// The format is matched case-insensitively; anything unknown means "table".
	const requestedFormat = flags.format.toLowerCase();
	let format = "table";
	if (requestedFormat === "json" || requestedFormat === "jsonl" || requestedFormat === "csv") format = requestedFormat;

	return {
		attempt: flags.attempt,
		caseState: normalizeStateFilter(flags.caseState),
		contains: flags.contains,
		evalMatrix: parseMatrixSelector(flags.evalMatrix),
		errorContains: flags.errorContains,
		experiment: flags.experiment,
		format,
		project: flags.project,
		reportPath,
		runMatrix: parseMatrixSelector(flags.runMatrix),
		run: flags.run,
		taskState: normalizeStateFilter(flags.taskState),
		workspace: flags.workspace
	};
}
|
|
2167
|
+
/**
 * Normalizes a state filter flag value.
 *
 * Returns:
 * - `undefined` when no value was given
 * - the trimmed, lower-cased value when it is `passed`, `failed`, or `skipped`
 *
 * Throws:
 * - for any other value; the message quotes the value exactly as supplied
 */
function normalizeStateFilter(value) {
  if (value == null) return undefined;
  const candidate = value.trim().toLowerCase();
  switch (candidate) {
    case "passed":
    case "failed":
    case "skipped":
      return candidate;
    default:
      throw new Error(`Unsupported state filter "${value}". Expected "passed", "failed", or "skipped".`);
  }
}
|
|
2173
|
+
/**
 * Parses a comma-separated `key=value` list into a plain selector object.
 *
 * Returns:
 * - `undefined` when no value was given
 * - otherwise an object with one string entry per non-empty segment; keys and values are
 *   trimmed, and a later duplicate key overwrites an earlier one
 *
 * Throws:
 * - when a segment has no `=`, or has an empty key or an empty value
 */
function parseMatrixSelector(value) {
  if (value == null) return undefined;
  const selector = {};
  for (const rawSegment of value.split(",")) {
    const segment = rawSegment.trim();
    // Blank segments (e.g. a trailing comma) are ignored rather than rejected.
    if (segment.length === 0) continue;
    const separatorIndex = segment.indexOf("=");
    const hasKeyText = separatorIndex > 0;
    const hasValueText = separatorIndex !== segment.length - 1;
    if (!hasKeyText || !hasValueText) {
      throw new Error(`Invalid matrix selector segment "${segment}". Expected "key=value".`);
    }
    const key = segment.slice(0, separatorIndex).trim();
    const parsedValue = segment.slice(separatorIndex + 1).trim();
    if (key.length === 0 || parsedValue.length === 0) {
      throw new Error(`Invalid matrix selector segment "${segment}". Expected "key=value".`);
    }
    selector[key] = parsedValue;
  }
  return selector;
}
|
|
2187
|
+
/**
 * Keeps the rows whose identity fields match every identity filter that was supplied.
 *
 * A filter that is `null`/`undefined` is not applied. `workspace`, `experiment`, `attempt`
 * and `run` are compared with strict equality; `project` must appear in `row.projectNames`.
 */
function filterAnalyzeRows(rows, parsed) {
  const { workspace, experiment, attempt, run, project } = parsed;
  const matchesId = (expected, actual) => expected == null || actual === expected;
  return rows.filter((row) =>
    matchesId(workspace, row.workspaceId)
    && matchesId(experiment, row.experimentId)
    && matchesId(attempt, row.attemptId)
    && matchesId(run, row.runId)
    && (project == null || row.projectNames.includes(project)));
}
|
|
2197
|
+
/**
 * Case-insensitive substring search over the JSON serialization of `value`.
 *
 * Returns:
 * - `true` when the trimmed needle is empty
 * - `false` when `value` has no JSON representation (`JSON.stringify` returns `undefined`,
 *   e.g. for `undefined` or a function); this case previously threw a TypeError
 * - otherwise whether the lower-cased JSON text contains the lower-cased, trimmed needle
 */
function includesNeedle(value, needle) {
  const normalizedNeedle = needle.trim().toLowerCase();
  if (normalizedNeedle.length === 0) return true;
  const serialized = JSON.stringify(value);
  // Nothing to search in: treat as "not found" instead of crashing on `.toLowerCase()`.
  if (serialized === undefined) return false;
  return serialized.toLowerCase().includes(normalizedNeedle);
}
|
|
2202
|
+
/**
 * Reports whether the artifact has at least one `TaskEnded` event whose
 * `data.state` equals `targetState`. Events without `data` never match.
 */
function hasTaskState(artifact, targetState) {
  const isMatchingTaskEnd = ({ event, data }) => event === "TaskEnded" && data?.state === targetState;
  return artifact.events.some(isMatchingTaskEnd);
}
|
|
2208
|
+
/**
 * Reports whether the artifact has at least one `CaseEnded` event whose
 * `data.state` equals `targetState`. Events without `data` never match.
 */
function hasCaseState(artifact, targetState) {
  const isMatchingCaseEnd = ({ event, data }) => event === "CaseEnded" && data?.state === targetState;
  return artifact.events.some(isMatchingCaseEnd);
}
|
|
2214
|
+
/**
 * Checks that every `key=value` pair of `selector` is satisfied by `matrix`.
 *
 * Matrix values are compared by their string form, so a numeric axis value `1`
 * matches the selector value `"1"`.
 *
 * Returns:
 * - `true` for an empty selector
 * - `false` when `matrix` is `null`/`undefined` and the selector is non-empty
 *   (previously a TypeError)
 * - `false` for any selector key the matrix does not own; previously a missing axis was
 *   stringified to `"undefined"` and could match a selector value of `undefined`
 */
function matchesMatrixSelector(matrix, selector) {
  const expectations = Object.entries(selector);
  if (matrix == null) return expectations.length === 0;
  return expectations.every(([key, expectedValue]) =>
    Object.prototype.hasOwnProperty.call(matrix, key) && String(matrix[key]) === expectedValue);
}
|
|
2217
|
+
/**
 * Reports whether any run of any summarized project has a `matrix.run`
 * that satisfies `selector`. Projects without a `result` are skipped.
 */
function hasRunMatrixMatch(artifact, selector) {
  return artifact.summary.projects.some((project) => {
    const { result } = project;
    if (result == null) return false;
    return result.runs.some((run) => matchesMatrixSelector(run.matrix.run, selector));
  });
}
|
|
2220
|
+
/**
 * Reports whether any run of any summarized project has a `matrix.eval`
 * that satisfies `selector`. Projects without a `result` are skipped.
 */
function hasEvalMatrixMatch(artifact, selector) {
  return artifact.summary.projects.some((project) => {
    const { result } = project;
    if (result == null) return false;
    return result.runs.some((run) => matchesMatrixSelector(run.matrix.eval, selector));
  });
}
|
|
2223
|
+
/**
 * Applies the outcome-level filters to a single report artifact.
 *
 * Filters are checked in this order and each one is skipped when not supplied:
 * run matrix, eval matrix, task state, case state, `contains`, `errorContains`.
 *
 * - `contains` is searched in the `{ data, event }` view of every event
 * - `errorContains` is searched in the non-null project `errorMessage`s first,
 *   then in every event's `data`
 *
 * Returns:
 * - `true` only when every supplied filter matches
 */
function matchesOutcomeFilters(artifact, parsed) {
  const { runMatrix, evalMatrix, taskState, caseState, contains, errorContains } = parsed;
  if (runMatrix != null && !hasRunMatrixMatch(artifact, runMatrix)) return false;
  if (evalMatrix != null && !hasEvalMatrixMatch(artifact, evalMatrix)) return false;
  if (taskState != null && !hasTaskState(artifact, taskState)) return false;
  if (caseState != null && !hasCaseState(artifact, caseState)) return false;
  if (contains != null) {
    const foundInEvents = artifact.events.some(({ data, event }) => includesNeedle({ data, event }, contains));
    if (!foundInEvents) return false;
  }
  if (errorContains != null) {
    const projectErrors = artifact.summary.projects
      .map((project) => project.errorMessage)
      .filter((errorMessage) => errorMessage != null);
    const foundInProjectErrors = projectErrors.some((errorMessage) => includesNeedle(errorMessage, errorContains));
    const foundAnywhere = foundInProjectErrors
      || artifact.events.some((event) => includesNeedle(event.data, errorContains));
    if (!foundAnywhere) return false;
  }
  return true;
}
|
|
2239
|
+
/**
 * Loads the report artifacts under `parsed.reportPath`, summarizes each one into a row,
 * and applies the identity filters followed by the outcome filters.
 *
 * Returns:
 * - `runs`: the surviving rows, in artifact order
 * - `filteredRunCount`: number of surviving rows
 * - `totalRunCount`: number of rows before any filtering
 * - `experimentSummaries`: rollups built from the surviving rows
 */
async function readReportAnalyzeOutput(parsed) {
  const artifacts = await readReportArtifacts(parsed.reportPath);
  const rows = artifacts.map((artifact) => summarizeReportRunArtifact(artifact));
  // Rows and artifacts are joined through their report directory.
  const rowByDirectory = new Map();
  for (const row of filterAnalyzeRows(rows, parsed)) {
    rowByDirectory.set(row.reportDirectory, row);
  }
  const filteredRows = artifacts.flatMap((artifact) => {
    if (!rowByDirectory.has(artifact.reportDirectory)) return [];
    if (!matchesOutcomeFilters(artifact, parsed)) return [];
    const row = rowByDirectory.get(artifact.reportDirectory);
    return row != null ? [row] : [];
  });
  return {
    experimentSummaries: buildExperimentSummaries(filteredRows),
    filteredRunCount: filteredRows.length,
    runs: filteredRows,
    totalRunCount: rows.length
  };
}
|
|
2252
|
+
/**
 * Rounds a number to at most six decimal places by formatting it with
 * `toFixed(6)` and converting the text back to a number.
 */
function roundMetric(value) {
  const sixDecimalText = value.toFixed(6);
  return Number(sixDecimalText);
}
|
|
2255
|
+
/**
 * Arithmetic mean of `values`, summed left to right; an empty list yields `0`.
 */
function computeAverage(values) {
  const count = values.length;
  if (count === 0) return 0;
  let total = 0;
  for (const value of values) {
    total += value;
  }
  return total / count;
}
|
|
2259
|
+
/**
 * Standard deviation of `values` computed as the square root of the mean
 * squared deviation from the mean (divides by `values.length`).
 * An empty list yields `0`.
 */
function computeStandardDeviation(values) {
  if (values.length === 0) return 0;
  const mean = computeAverage(values);
  const squaredDeviations = values.map((value) => (value - mean) ** 2);
  return Math.sqrt(computeAverage(squaredDeviations));
}
|
|
2265
|
+
/**
 * Builds the `workspace::experiment` grouping key for a run row, substituting
 * `unknown-workspace` / `unknown-experiment` for missing ids.
 */
function createExperimentGroupKey(row) {
  const workspacePart = row.workspaceId ?? "unknown-workspace";
  const experimentPart = row.experimentId ?? "unknown-experiment";
  return [workspacePart, experimentPart].join("::");
}
|
|
2268
|
+
/**
 * Builds experiment-level rollups from filtered run rows.
 *
 * Use when:
 * - CLI consumers need stability and reliability summaries above per-run data
 *
 * Returns:
 * - one summary row per `workspaceId + experimentId` group, sorted by workspace id and
 *   then experiment id; missing ids are reported as `unknown-workspace` /
 *   `unknown-experiment`, and a missing attempt id as `unknown-attempt`
 *
 * Notes:
 * - a run counts as successful when `failedProjects === 0`
 * - the ids are stored next to each group's rows instead of being recovered by splitting a
 *   `workspace::experiment` key, so ids that themselves contain `::` are reported intact
 *   and cannot be merged into another group
 */
function buildExperimentSummaries(rows) {
  const grouped = /* @__PURE__ */ new Map();
  for (const row of rows) {
    const workspaceId = String(row.workspaceId ?? "unknown-workspace");
    const experimentId = String(row.experimentId ?? "unknown-experiment");
    // A JSON-encoded pair is unambiguous whatever characters the ids contain.
    const groupKey = JSON.stringify([workspaceId, experimentId]);
    const existing = grouped.get(groupKey);
    if (existing == null) {
      grouped.set(groupKey, {
        experimentId,
        groupRows: [row],
        workspaceId
      });
      continue;
    }
    existing.groupRows.push(row);
  }
  return [...grouped.values()].map(({ experimentId, groupRows, workspaceId }) => {
    const failedProjects = groupRows.reduce((sum, row) => sum + row.failedProjects, 0);
    const totalTasks = groupRows.reduce((sum, row) => sum + row.totalTasks, 0);
    const totalEvents = groupRows.reduce((sum, row) => sum + row.eventsCount, 0);
    const successfulRunCount = groupRows.filter((row) => row.failedProjects === 0).length;
    const successRate = groupRows.length === 0 ? 0 : successfulRunCount / groupRows.length;
    const attemptToRuns = /* @__PURE__ */ new Map();
    for (const row of groupRows) {
      const attemptId = row.attemptId ?? "unknown-attempt";
      const attemptRows = attemptToRuns.get(attemptId);
      if (attemptRows == null) {
        attemptToRuns.set(attemptId, [row]);
        continue;
      }
      attemptRows.push(row);
    }
    const attemptSummaries = [...attemptToRuns.entries()].map(([attemptId, attemptRows]) => {
      const successCount = attemptRows.filter((row) => row.failedProjects === 0).length;
      const runCount = attemptRows.length;
      const failedProjectCount = attemptRows.reduce((sum, row) => sum + row.failedProjects, 0);
      const totalTaskCount = attemptRows.reduce((sum, row) => sum + row.totalTasks, 0);
      const totalEventCount = attemptRows.reduce((sum, row) => sum + row.eventsCount, 0);
      return {
        attemptId,
        failedProjects: failedProjectCount,
        runCount,
        runIds: attemptRows.map((row) => row.runId).filter((runId) => runId != null).sort((left, right) => left.localeCompare(right)),
        successRate: roundMetric(runCount === 0 ? 0 : successCount / runCount),
        totalEvents: totalEventCount,
        totalTasks: totalTaskCount
      };
    }).sort((left, right) => left.attemptId.localeCompare(right.attemptId));
    // Spread across attempts: how consistent the per-attempt success rate is.
    const attemptSuccessRates = attemptSummaries.map((summary) => summary.successRate);
    const minAttemptSuccessRate = attemptSuccessRates.length === 0 ? 0 : Math.min(...attemptSuccessRates);
    const maxAttemptSuccessRate = attemptSuccessRates.length === 0 ? 0 : Math.max(...attemptSuccessRates);
    const avgAttemptSuccessRate = computeAverage(attemptSuccessRates);
    const stdevAttemptSuccessRate = computeStandardDeviation(attemptSuccessRates);
    return {
      attemptCount: attemptToRuns.size,
      attemptSummaries,
      attemptSuccessRateStats: {
        avg: roundMetric(avgAttemptSuccessRate),
        max: roundMetric(maxAttemptSuccessRate),
        min: roundMetric(minAttemptSuccessRate),
        stdev: roundMetric(stdevAttemptSuccessRate)
      },
      experimentId,
      failedProjects,
      runCount: groupRows.length,
      successRate: roundMetric(successRate),
      totalEvents,
      totalTasks,
      workspaceId
    };
  }).sort((left, right) => {
    const workspaceCompare = left.workspaceId.localeCompare(right.workspaceId);
    if (workspaceCompare !== 0) return workspaceCompare;
    return left.experimentId.localeCompare(right.experimentId);
  });
}
|
|
2349
|
+
/**
 * Renders the analyze output as pipe-separated text: a summary line, a header
 * line, then one line per run. Missing ids are shown as `n/a`.
 */
function formatTableOutput$1(output) {
  const summaryLine = `ANALYZE vieval report: ${output.filteredRunCount}/${output.totalRunCount} runs (${output.experimentSummaries.length} experiment groups)`;
  const headerLine = "Run ID | Workspace | Experiment | Attempt | Projects(executed/total) | FailedProjects | Tasks | Events";
  const runLines = output.runs.map((row) => {
    const cells = [
      String(row.runId ?? "n/a"),
      String(row.workspaceId ?? "n/a"),
      String(row.experimentId ?? "n/a"),
      String(row.attemptId ?? "n/a"),
      `${row.executedProjects}/${row.totalProjects}`,
      String(row.failedProjects),
      String(row.totalTasks),
      String(row.eventsCount)
    ];
    return cells.join(" | ");
  });
  return [summaryLine, headerLine].concat(runLines).join("\n");
}
|
|
2360
|
+
/**
 * Renders the analyze runs as CSV: a header line followed by one line per run.
 *
 * - `reportDirectory` and `projectNames` (joined with `|`) are always quoted
 * - the id columns are quoted only when they contain a comma, a double quote or a line
 *   break; previously they were emitted verbatim, so such a value shifted or split the
 *   row's columns
 * - missing ids become empty fields
 */
function formatCsvOutput(output) {
  const quoteField = (text) => `"${text.replaceAll("\"", "\"\"")}"`;
  const escapeIdField = (value) => {
    const text = String(value ?? "");
    return /[",\r\n]/.test(text) ? quoteField(text) : text;
  };
  const headerLine = [
    "runId",
    "workspaceId",
    "experimentId",
    "attemptId",
    "totalProjects",
    "executedProjects",
    "failedProjects",
    "totalTasks",
    "eventsCount",
    "reportDirectory",
    "projectNames"
  ].join(",");
  const runLines = output.runs.map((row) => [
    escapeIdField(row.runId),
    escapeIdField(row.workspaceId),
    escapeIdField(row.experimentId),
    escapeIdField(row.attemptId),
    row.totalProjects.toString(),
    row.executedProjects.toString(),
    row.failedProjects.toString(),
    row.totalTasks.toString(),
    row.eventsCount.toString(),
    quoteField(row.reportDirectory),
    quoteField(row.projectNames.join("|"))
  ].join(","));
  return [headerLine, ...runLines].join("\n");
}
|
|
2391
|
+
/**
 * Entry point for `vieval report analyze`: parses the arguments, builds the analyze
 * output and writes it to stdout in the requested format (`json`, `jsonl`, `csv`,
 * otherwise the table layout).
 *
 * Any failure is reported on stderr with a `[vieval report analyze]` prefix and sets
 * `process.exitCode` to `1`; nothing is rethrown.
 */
async function runReportAnalyzeCli(argv) {
  try {
    const parsed = parseReportAnalyzeCliArguments(argv);
    const output = await readReportAnalyzeOutput(parsed);
    switch (parsed.format) {
      case "json":
        process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
        break;
      case "jsonl": {
        const jsonl = output.runs.map((run) => JSON.stringify(run)).join("\n");
        // No trailing newline when there is nothing to print.
        process.stdout.write(`${jsonl}${jsonl.length > 0 ? "\n" : ""}`);
        break;
      }
      case "csv":
        process.stdout.write(`${formatCsvOutput(output)}\n`);
        break;
      default:
        process.stdout.write(`${formatTableOutput$1(output)}\n`);
    }
  } catch (error) {
    const errorMessage = errorMessageFrom(error) ?? "Unknown report analyze failure.";
    process.stderr.write(`[vieval report analyze] ${errorMessage}\n`);
    process.exitCode = 1;
  }
}
|
|
2415
|
+
//#endregion
|
|
2416
|
+
//#region src/cli/report-index.ts
|
|
2417
|
+
const reportIndexHelpText = `
|
|
2418
|
+
Build report indexes from generated vieval artifacts.
|
|
2419
|
+
|
|
2420
|
+
Usage
|
|
2421
|
+
$ vieval report index <reportPath> [--output <path>] [--format <format>]
|
|
2422
|
+
|
|
2423
|
+
Options
|
|
2424
|
+
--output Output file path (default: <reportPath>/index/runs.jsonl)
|
|
2425
|
+
--format Console output format: table | json | jsonl (default: table)
|
|
2426
|
+
`;
|
|
2427
|
+
/**
 * Strips a leading `--` separator and then a leading `report index` or `index`
 * command prefix, so only the subcommand's own arguments remain.
 * Always returns a new array.
 */
function normalizeCliArgv$1(argv) {
  const args = argv[0] === "--" ? argv.slice(1) : [...argv];
  const [first, second] = args;
  if (first === "report" && second === "index") return args.slice(2);
  return first === "index" ? args.slice(1) : args;
}
|
|
2433
|
+
/**
 * Parses `vieval report index` arguments.
 *
 * Returns:
 * - `reportPath`: the first positional argument
 * - `output`: the `--output` flag value, if any
 * - `format`: `json` or `jsonl` when requested (case-insensitive), otherwise `table`
 *
 * Throws:
 * - when the `<reportPath>` positional argument is missing or empty
 */
function parseReportIndexCliArguments(argv) {
  const { flags, input } = meow(reportIndexHelpText, {
    argv: normalizeCliArgv$1(argv),
    flags: {
      format: {
        default: "table",
        type: "string"
      },
      output: { type: "string" }
    },
    importMeta: import.meta
  });
  const [reportPath] = input;
  if (reportPath == null || reportPath.length === 0) {
    throw new Error("Missing required <reportPath> argument.");
  }
  const requestedFormat = flags.format.toLowerCase();
  const format = ["json", "jsonl"].includes(requestedFormat) ? requestedFormat : "table";
  return {
    format,
    output: flags.output,
    reportPath
  };
}
|
|
2454
|
+
/**
 * Summarizes every report artifact under `parsed.reportPath` and writes the rows as
 * JSON Lines to `parsed.output`, or to `<reportPath>/index/runs.jsonl` when no output
 * path was given. The parent directory is created when missing.
 *
 * Returns:
 * - the resolved index file path, the number of indexed runs, and the rows themselves
 */
async function writeIndexFile(parsed) {
  const artifacts = await readReportArtifacts(parsed.reportPath);
  const rows = artifacts.map((artifact) => summarizeReportRunArtifact(artifact));
  const indexFilePath = parsed.output != null
    ? resolve(parsed.output)
    : resolve(parsed.reportPath, "index", "runs.jsonl");
  await mkdir(dirname(indexFilePath), { recursive: true });
  const serializedRows = rows.map((row) => JSON.stringify(row)).join("\n");
  // An empty index is written as an empty file, without a lone newline.
  const fileContents = serializedRows.length > 0 ? `${serializedRows}\n` : serializedRows;
  await writeFile(indexFilePath, fileContents, "utf-8");
  return {
    indexFilePath,
    indexedRunCount: rows.length,
    rows
  };
}
|
|
2466
|
+
/**
 * Renders the three-line console summary for `vieval report index`:
 * a title, the index file path, and the number of indexed runs.
 */
function formatTableOutput(output) {
  const { indexFilePath, indexedRunCount } = output;
  return `INDEX vieval report\nPath ${indexFilePath}\nRun count ${indexedRunCount}`;
}
|
|
2473
|
+
/**
 * Entry point for `vieval report index`: parses the arguments, writes the index file
 * and prints the result to stdout as `json`, `jsonl` (one indexed row per line) or the
 * table layout.
 *
 * Any failure is reported on stderr with a `[vieval report index]` prefix and sets
 * `process.exitCode` to `1`; nothing is rethrown.
 */
async function runReportIndexCli(argv) {
  try {
    const parsed = parseReportIndexCliArguments(argv);
    const output = await writeIndexFile(parsed);
    switch (parsed.format) {
      case "json":
        process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
        break;
      case "jsonl": {
        const jsonl = output.rows.map((row) => JSON.stringify(row)).join("\n");
        // No trailing newline when there is nothing to print.
        process.stdout.write(`${jsonl}${jsonl.length > 0 ? "\n" : ""}`);
        break;
      }
      default:
        process.stdout.write(`${formatTableOutput(output)}\n`);
    }
  } catch (error) {
    const errorMessage = errorMessageFrom(error) ?? "Unknown report index failure.";
    process.stderr.write(`[vieval report index] ${errorMessage}\n`);
    process.exitCode = 1;
  }
}
|
|
1363
2493
|
//#endregion
|
|
1364
2494
|
//#region src/cli/index.ts
|
|
1365
2495
|
const topLevelHelpText = `
|
|
@@ -1370,10 +2500,15 @@ const topLevelHelpText = `
|
|
|
1370
2500
|
|
|
1371
2501
|
Commands
|
|
1372
2502
|
run Discover and execute eval projects
|
|
2503
|
+
compare Compare multiple workspaces/methods on one benchmark
|
|
2504
|
+
report Analyze and index generated report artifacts
|
|
1373
2505
|
|
|
1374
2506
|
Examples
|
|
1375
2507
|
$ vieval run
|
|
1376
|
-
$ vieval run --config vieval.config.ts --project chess --json
|
|
2508
|
+
$ vieval run --config vieval.config.ts --project chess --json --report-out .vieval/reports
|
|
2509
|
+
$ vieval compare --config vieval.config.ts --comparison agent-memory
|
|
2510
|
+
$ vieval report analyze .vieval/reports/my-run
|
|
2511
|
+
$ vieval report index .vieval/reports --output .vieval/reports/index/runs.jsonl
|
|
1377
2512
|
`;
|
|
1378
2513
|
function normalizeCliArgv(argv) {
|
|
1379
2514
|
return argv[0] === "--" ? argv.slice(1) : [...argv];
|
|
@@ -1391,7 +2526,7 @@ function parseTopLevelCliArguments(argv) {
|
|
|
1391
2526
|
command: "help",
|
|
1392
2527
|
commandArgv: []
|
|
1393
2528
|
};
|
|
1394
|
-
if (command !== "run") throw new Error(`Unsupported vieval command "${command ?? "(none)"}". Expected "run".`);
|
|
2529
|
+
if (command !== "run" && command !== "report" && command !== "compare") throw new Error(`Unsupported vieval command "${command ?? "(none)"}". Expected "run", "compare", or "report".`);
|
|
1395
2530
|
return {
|
|
1396
2531
|
command,
|
|
1397
2532
|
commandArgv: normalizedArgv.slice(1)
|
|
@@ -1403,20 +2538,34 @@ async function runTopLevelCli(argv) {
|
|
|
1403
2538
|
process.stdout.write(`${topLevelHelpText.trim()}\n`);
|
|
1404
2539
|
return;
|
|
1405
2540
|
}
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
2541
|
+
if (parsed.command === "report") {
|
|
2542
|
+
const reportSubcommand = parsed.commandArgv[0];
|
|
2543
|
+
if (reportSubcommand === "analyze") {
|
|
2544
|
+
await runReportAnalyzeCli(parsed.commandArgv);
|
|
2545
|
+
return;
|
|
2546
|
+
}
|
|
2547
|
+
if (reportSubcommand === "index") {
|
|
2548
|
+
await runReportIndexCli(parsed.commandArgv);
|
|
2549
|
+
return;
|
|
2550
|
+
}
|
|
2551
|
+
throw new Error(`Unsupported vieval report command "${reportSubcommand ?? "(none)"}". Expected "analyze" or "index".`);
|
|
2552
|
+
}
|
|
2553
|
+
if (parsed.command === "compare") {
|
|
2554
|
+
await runCompareCliOrExit(parsed.commandArgv);
|
|
1413
2555
|
return;
|
|
1414
2556
|
}
|
|
1415
|
-
|
|
2557
|
+
await runEvalRunCli(parsed.commandArgv);
|
|
1416
2558
|
}
|
|
1417
2559
|
/**
 * Reports whether this module is the script Node was started with.
 *
 * Returns:
 * - `false` when `process.argv[1]` is empty
 * - `true` when `process.argv[1]` and this module resolve to the same real path; if
 *   resolving either real path throws, the unresolved absolute paths are compared instead
 * - otherwise whether `process.argv[1]` (with backslashes normalized to `/`) ends with
 *   `/.bin/vieval`
 */
function isDirectExecution() {
  const entryArgument = process.argv[1];
  if (!entryArgument) return false;
  const entryPath = path.resolve(entryArgument);
  const modulePath = fileURLToPath(import.meta.url);
  let isSameFile;
  try {
    isSameFile = realpathSync.native(entryPath) === realpathSync.native(modulePath);
  } catch {
    isSameFile = entryPath === modulePath;
  }
  if (isSameFile) return true;
  return entryPath.replaceAll("\\", "/").endsWith("/.bin/vieval");
}
|
|
1421
2570
|
async function main() {
|
|
1422
2571
|
try {
|