vieval 0.0.10 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/README.md +31 -31
  2. package/dist/bin/vieval.mjs +1 -1
  3. package/dist/bin/vieval.mjs.map +1 -1
  4. package/dist/cli/index.d.mts +1 -1
  5. package/dist/cli/index.mjs +1 -1
  6. package/dist/{cli-DTDgaqeI.mjs → cli-uzS81IPd.mjs} +1483 -1483
  7. package/dist/cli-uzS81IPd.mjs.map +1 -0
  8. package/dist/config.d.mts +1 -1
  9. package/dist/config.mjs +1 -1
  10. package/dist/config.mjs.map +1 -1
  11. package/dist/core/assertions/index.d.mts +156 -156
  12. package/dist/core/assertions/index.mjs +82 -82
  13. package/dist/core/assertions/index.mjs.map +1 -1
  14. package/dist/core/inference-executors/index.d.mts +37 -37
  15. package/dist/core/inference-executors/index.mjs +54 -53
  16. package/dist/core/inference-executors/index.mjs.map +1 -1
  17. package/dist/core/processors/results/index.d.mts +18 -18
  18. package/dist/core/processors/results/index.mjs.map +1 -1
  19. package/dist/core/runner/index.d.mts +2 -2
  20. package/dist/core/runner/index.mjs +259 -259
  21. package/dist/core/runner/index.mjs.map +1 -1
  22. package/dist/core/scheduler/index.d.mts +1 -1
  23. package/dist/core/scheduler/index.mjs +65 -65
  24. package/dist/core/scheduler/index.mjs.map +1 -1
  25. package/dist/{env-DfWZy_n4.d.mts → env-Br6jaWGL.d.mts} +9 -9
  26. package/dist/{env-nV5rVErX.mjs → env-egxaJtNn.mjs} +8 -8
  27. package/dist/env-egxaJtNn.mjs.map +1 -0
  28. package/dist/{expect-extensions-DCSqlneN.mjs → expect-extensions-BKdEPt3h.mjs} +46 -46
  29. package/dist/expect-extensions-BKdEPt3h.mjs.map +1 -0
  30. package/dist/expect.d.mts +1 -3
  31. package/dist/expect.mjs +1 -1
  32. package/dist/expect.mjs.map +1 -1
  33. package/dist/{index-D_aMeWqO.d.mts → index-BLIlhiWT.d.mts} +565 -565
  34. package/dist/{index-Bg0atWBF.d.mts → index-CIaJClcC.d.mts} +48 -48
  35. package/dist/index.d.mts +208 -197
  36. package/dist/index.mjs +148 -148
  37. package/dist/index.mjs.map +1 -1
  38. package/dist/{models-pBSRUZhY.mjs → models-CaCOUPZw.mjs} +1 -1
  39. package/dist/{models-pBSRUZhY.mjs.map → models-CaCOUPZw.mjs.map} +1 -1
  40. package/dist/plugins/chat-models/index.d.mts +279 -279
  41. package/dist/plugins/chat-models/index.mjs +360 -360
  42. package/dist/plugins/chat-models/index.mjs.map +1 -1
  43. package/dist/{queue-DsZQkZO_.mjs → queue-BL86z2W_.mjs} +1 -1
  44. package/dist/{queue-DsZQkZO_.mjs.map → queue-BL86z2W_.mjs.map} +1 -1
  45. package/dist/{registry-DMnwE_mY.mjs → registry-BK7k6X81.mjs} +294 -294
  46. package/dist/registry-BK7k6X81.mjs.map +1 -0
  47. package/dist/testing/expect-extensions.d.mts +27 -27
  48. package/dist/testing/expect-extensions.mjs +1 -1
  49. package/package.json +12 -12
  50. package/dist/cli-DTDgaqeI.mjs.map +0 -1
  51. package/dist/env-nV5rVErX.mjs.map +0 -1
  52. package/dist/expect-extensions-DCSqlneN.mjs.map +0 -1
  53. package/dist/registry-DMnwE_mY.mjs.map +0 -1
@@ -1,4 +1,4 @@
1
- import { a as createOpenTelemetryRuntime, c as detectCliConfigMode, d as loadVievalCliConfig, n as consumeModuleRegistrations, o as createNoopTelemetryRuntime, r as endModuleRegistration, t as beginModuleRegistration, u as loadRawVievalConfig } from "./registry-DMnwE_mY.mjs";
1
+ import { a as createOpenTelemetryRuntime, c as detectCliConfigMode, d as loadVievalCliConfig, n as consumeModuleRegistrations, o as createNoopTelemetryRuntime, r as endModuleRegistration, t as beginModuleRegistration, u as loadRawVievalConfig } from "./registry-BK7k6X81.mjs";
2
2
  import { createSchedulerRuntime } from "./core/scheduler/index.mjs";
3
3
  import { RunnerExecutionError, collectEvalEntries, createFilesystemTaskCacheRuntime, createRunnerRuntimeContext, createRunnerSchedule, createTaskExecutionContext, runScheduledTasks } from "./core/runner/index.mjs";
4
4
  import process from "node:process";
@@ -27,40 +27,44 @@ const supportedWorkspaceConfigFileNames = [
27
27
  "vieval.config.cjs",
28
28
  "vieval.config.json"
29
29
  ];
30
- async function isReadableFile(filePath) {
30
+ /**
31
+ * Loads and validates comparison-mode data from `vieval.config.*`.
32
+ */
33
+ async function loadVievalComparisonConfig(options = {}) {
34
+ const cwd = options.cwd ?? process.cwd();
31
35
  try {
32
- await access(filePath);
33
- return true;
34
- } catch {
35
- return false;
36
+ const loaded = await loadRawVievalConfig({
37
+ configFilePath: options.configFilePath,
38
+ cwd
39
+ });
40
+ if (loaded.configFilePath == null || loaded.config == null) throw new Error("Failed to find vieval config. Expected vieval.config.*");
41
+ assertComparisonMode(loaded.config);
42
+ const selectedComparison = selectComparisonConfig(loaded.config.comparisons, options.comparisonId);
43
+ const configDirectory = dirname(loaded.configFilePath);
44
+ const explicitMethods = (selectedComparison.methods ?? []).map((method, index) => normalizeMethodShape(method, configDirectory, index));
45
+ const discoveredMethods = await discoverMethodsFromWorkspaceGlobs({
46
+ comparison: selectedComparison,
47
+ configDirectory
48
+ });
49
+ const methods = [...explicitMethods, ...discoveredMethods];
50
+ if (methods.length === 0) throw new Error("Comparison config resolved zero methods. Configure methods or includesWorkspaces.");
51
+ validateMethodIdsAreUnique(methods);
52
+ return {
53
+ config: {
54
+ benchmark: normalizeBenchmark(selectedComparison),
55
+ methods
56
+ },
57
+ configFilePath: loaded.configFilePath
58
+ };
59
+ } catch (error) {
60
+ const errorMessage = errorMessageFrom(error) ?? "Unknown comparison config loading error.";
61
+ const resolvedPath = options.configFilePath ?? "vieval.config";
62
+ throw new Error(`Failed to load comparison config "${resolvedPath}": ${errorMessage}`);
36
63
  }
37
64
  }
38
- function normalizeGlobInput(patterns) {
39
- if (patterns == null) return [];
40
- return (typeof patterns === "string" ? [patterns] : patterns).map((pattern) => pattern.trim()).filter((pattern) => pattern.length > 0);
41
- }
42
- function normalizeMethodShape(method, configDirectory, index) {
43
- const id = method.id.trim();
44
- const workspace = method.workspace.trim();
45
- const project = method.project.trim();
46
- const configFilePath = method.configFilePath?.trim();
47
- if (id.length === 0) throw new Error(`Comparison method #${index + 1} is missing id.`);
48
- if (workspace.length === 0) throw new Error(`Comparison method "${id}" is missing workspace.`);
49
- if (project.length === 0) throw new Error(`Comparison method "${id}" is missing project.`);
50
- const resolvedWorkspace = isAbsolute(workspace) ? workspace : resolve(configDirectory, workspace);
51
- return {
52
- configFilePath: configFilePath == null || configFilePath.length === 0 ? void 0 : isAbsolute(configFilePath) ? configFilePath : resolve(configDirectory, configFilePath),
53
- id,
54
- project,
55
- workspace: resolvedWorkspace
56
- };
57
- }
58
- async function findWorkspaceConfigFile(workspaceDirectory) {
59
- for (const fileName of supportedWorkspaceConfigFileNames) {
60
- const candidate = join(workspaceDirectory, fileName);
61
- if (await isReadableFile(candidate)) return candidate;
62
- }
63
- return null;
65
+ function assertComparisonMode(config) {
66
+ const mode = detectCliConfigMode(config);
67
+ if (mode !== "comparisons") throw new Error(`Expected comparison-mode config, but received ${mode}-mode config.`);
64
68
  }
65
69
  function createDiscoveredMethodId(configDirectory, workspace, projectName) {
66
70
  const relativeWorkspace = relative(configDirectory, workspace);
@@ -92,24 +96,20 @@ async function discoverMethodsFromWorkspaceGlobs(args) {
92
96
  }
93
97
  return methods;
94
98
  }
95
- function validateMethodIdsAreUnique(methods) {
96
- const methodIds = methods.map((method) => method.id);
97
- const duplicatedMethodId = methodIds.find((methodId, index) => methodIds.indexOf(methodId) !== index);
98
- if (duplicatedMethodId != null) throw new Error(`Duplicate comparison method id "${duplicatedMethodId}".`);
99
- }
100
- function assertComparisonMode(config) {
101
- const mode = detectCliConfigMode(config);
102
- if (mode !== "comparisons") throw new Error(`Expected comparison-mode config, but received ${mode}-mode config.`);
99
+ async function findWorkspaceConfigFile(workspaceDirectory) {
100
+ for (const fileName of supportedWorkspaceConfigFileNames) {
101
+ const candidate = join(workspaceDirectory, fileName);
102
+ if (await isReadableFile(candidate)) return candidate;
103
+ }
104
+ return null;
103
105
  }
104
- function selectComparisonConfig(comparisons, comparisonId) {
105
- if (comparisons.length === 0) throw new Error("Comparison config requires at least one comparisons entry.");
106
- if (comparisonId == null || comparisonId.trim().length === 0) {
107
- if (comparisons.length > 1) throw new Error(`Multiple comparisons found. Provide --comparison. Available ids: ${comparisons.map((item) => item.id).join(", ")}`);
108
- return comparisons[0];
106
+ async function isReadableFile(filePath) {
107
+ try {
108
+ await access(filePath);
109
+ return true;
110
+ } catch {
111
+ return false;
109
112
  }
110
- const selected = comparisons.find((item) => item.id === comparisonId);
111
- if (selected == null) throw new Error(`Unknown comparison id "${comparisonId}".`);
112
- return selected;
113
113
  }
114
114
  function normalizeBenchmark(comparison) {
115
115
  const benchmarkId = comparison.benchmark.id.trim();
@@ -121,40 +121,40 @@ function normalizeBenchmark(comparison) {
121
121
  sharedCaseNamespace
122
122
  };
123
123
  }
124
- /**
125
- * Loads and validates comparison-mode data from `vieval.config.*`.
126
- */
127
- async function loadVievalComparisonConfig(options = {}) {
128
- const cwd = options.cwd ?? process.cwd();
129
- try {
130
- const loaded = await loadRawVievalConfig({
131
- configFilePath: options.configFilePath,
132
- cwd
133
- });
134
- if (loaded.configFilePath == null || loaded.config == null) throw new Error("Failed to find vieval config. Expected vieval.config.*");
135
- assertComparisonMode(loaded.config);
136
- const selectedComparison = selectComparisonConfig(loaded.config.comparisons, options.comparisonId);
137
- const configDirectory = dirname(loaded.configFilePath);
138
- const explicitMethods = (selectedComparison.methods ?? []).map((method, index) => normalizeMethodShape(method, configDirectory, index));
139
- const discoveredMethods = await discoverMethodsFromWorkspaceGlobs({
140
- comparison: selectedComparison,
141
- configDirectory
142
- });
143
- const methods = [...explicitMethods, ...discoveredMethods];
144
- if (methods.length === 0) throw new Error("Comparison config resolved zero methods. Configure methods or includesWorkspaces.");
145
- validateMethodIdsAreUnique(methods);
146
- return {
147
- config: {
148
- benchmark: normalizeBenchmark(selectedComparison),
149
- methods
150
- },
151
- configFilePath: loaded.configFilePath
152
- };
153
- } catch (error) {
154
- const errorMessage = errorMessageFrom(error) ?? "Unknown comparison config loading error.";
155
- const resolvedPath = options.configFilePath ?? "vieval.config";
156
- throw new Error(`Failed to load comparison config "${resolvedPath}": ${errorMessage}`);
124
+ function normalizeGlobInput(patterns) {
125
+ if (patterns == null) return [];
126
+ return (typeof patterns === "string" ? [patterns] : patterns).map((pattern) => pattern.trim()).filter((pattern) => pattern.length > 0);
127
+ }
128
+ function normalizeMethodShape(method, configDirectory, index) {
129
+ const id = method.id.trim();
130
+ const workspace = method.workspace.trim();
131
+ const project = method.project.trim();
132
+ const configFilePath = method.configFilePath?.trim();
133
+ if (id.length === 0) throw new Error(`Comparison method #${index + 1} is missing id.`);
134
+ if (workspace.length === 0) throw new Error(`Comparison method "${id}" is missing workspace.`);
135
+ if (project.length === 0) throw new Error(`Comparison method "${id}" is missing project.`);
136
+ const resolvedWorkspace = isAbsolute(workspace) ? workspace : resolve(configDirectory, workspace);
137
+ return {
138
+ configFilePath: configFilePath == null || configFilePath.length === 0 ? void 0 : isAbsolute(configFilePath) ? configFilePath : resolve(configDirectory, configFilePath),
139
+ id,
140
+ project,
141
+ workspace: resolvedWorkspace
142
+ };
143
+ }
144
+ function selectComparisonConfig(comparisons, comparisonId) {
145
+ if (comparisons.length === 0) throw new Error("Comparison config requires at least one comparisons entry.");
146
+ if (comparisonId == null || comparisonId.trim().length === 0) {
147
+ if (comparisons.length > 1) throw new Error(`Multiple comparisons found. Provide --comparison. Available ids: ${comparisons.map((item) => item.id).join(", ")}`);
148
+ return comparisons[0];
157
149
  }
150
+ const selected = comparisons.find((item) => item.id === comparisonId);
151
+ if (selected == null) throw new Error(`Unknown comparison id "${comparisonId}".`);
152
+ return selected;
153
+ }
154
+ function validateMethodIdsAreUnique(methods) {
155
+ const methodIds = methods.map((method) => method.id);
156
+ const duplicatedMethodId = methodIds.find((methodId, index) => methodIds.indexOf(methodId) !== index);
157
+ if (duplicatedMethodId != null) throw new Error(`Duplicate comparison method id "${duplicatedMethodId}".`);
158
158
  }
159
159
  //#endregion
160
160
  //#region src/cli/report-records.ts
@@ -242,51 +242,39 @@ function encodeJsonl(records) {
242
242
  if (records.length === 0) return "";
243
243
  return `${records.map((record) => JSON.stringify(record)).join("\n")}\n`;
244
244
  }
245
- function normalizeCaseEventName(eventName) {
246
- if (eventName === "task.case.start" || eventName === "CaseStarted") return "start";
247
- if (eventName === "task.case.metric") return "metric";
248
- if (eventName === "task.case.score") return "score";
249
- if (eventName === "task.case.end" || eventName === "CaseEnded") return "end";
245
+ function addRecordScores(summary, record) {
246
+ for (const [kind, score] of Object.entries(record.scores)) {
247
+ if (!Number.isFinite(score)) continue;
248
+ summary[kind] ??= {
249
+ average: 0,
250
+ count: 0,
251
+ sum: 0
252
+ };
253
+ summary[kind].count += 1;
254
+ summary[kind].sum += score;
255
+ }
250
256
  }
251
- function extractEventIds(event, args) {
257
+ function applyCaseEnd(draft, event) {
252
258
  const data = asRecord(event.data);
253
- return {
254
- attemptId: stringFrom(data?.attemptId) ?? event.attemptId ?? args.attemptId,
255
- caseId: stringFrom(data?.caseId) ?? event.caseId ?? "",
256
- experimentId: stringFrom(data?.experimentId) ?? event.experimentId ?? args.experimentId,
257
- projectName: stringFrom(data?.projectName) ?? event.projectName ?? event.projectId ?? args.projectName,
258
- runId: stringFrom(data?.runId) ?? event.runId ?? args.runId,
259
- taskId: stringFrom(data?.taskId) ?? event.taskId ?? "",
260
- workspaceId: stringFrom(data?.workspaceId) ?? event.workspaceId ?? args.workspaceId
261
- };
259
+ draft.caseName = extractCaseName(event) ?? draft.caseName;
260
+ draft.endedAt = stringFrom(data?.endedAt) ?? event.timestamp ?? draft.endedAt;
261
+ draft.output = data != null && "output" in data ? data.output : draft.output;
262
+ draft.state = normalizeState(stringFrom(data?.state)) ?? "failed";
263
+ draft.scores.exact ??= draft.state === "passed" ? 1 : 0;
262
264
  }
263
- function getOrCreateDraft(drafts, ids, event, args) {
264
- const key = createCaseKey(ids.taskId, ids.caseId);
265
- const existing = drafts.get(key);
266
- if (existing != null) return existing;
267
- const draft = {
268
- attemptId: ids.attemptId,
269
- caseId: ids.caseId,
270
- caseName: extractCaseName(event) ?? ids.caseId,
271
- experimentId: ids.experimentId,
272
- metrics: {},
273
- projectName: ids.projectName || args.projectName,
274
- retryCount: 0,
275
- runId: ids.runId,
276
- scores: {},
277
- startCount: 0,
278
- taskId: ids.taskId,
279
- workspaceId: ids.workspaceId
280
- };
281
- drafts.set(key, draft);
282
- return draft;
265
+ function applyCaseMetric(draft, event) {
266
+ const data = asRecord(event.data);
267
+ const name = stringFrom(data?.name);
268
+ if (name == null) return;
269
+ const value = data?.value;
270
+ if (isCaseMetricValue(value)) draft.metrics[name] = value;
283
271
  }
284
- function applyIdentity(draft, ids, event, args) {
285
- draft.attemptId = ids.attemptId || args.attemptId;
286
- draft.experimentId = ids.experimentId || args.experimentId;
287
- draft.projectName = extractExplicitProjectName(event) ?? draft.projectName;
288
- draft.runId = ids.runId || args.runId;
289
- draft.workspaceId = ids.workspaceId || args.workspaceId;
272
+ function applyCaseScore(draft, event) {
273
+ const data = asRecord(event.data);
274
+ const kind = stringFrom(data?.kind) ?? stringFrom(data?.name) ?? stringFrom(data?.["vieval.score.kind"]);
275
+ const score = numberFrom(data?.score) ?? numberFrom(data?.value) ?? numberFrom(data?.["vieval.score.value"]);
276
+ if (kind == null || score == null) return;
277
+ draft.scores[kind] = score;
290
278
  }
291
279
  function applyCaseStart(draft, event) {
292
280
  const data = asRecord(event.data);
@@ -307,66 +295,55 @@ function applyCaseStart(draft, event) {
307
295
  }
308
296
  draft.retryCount = Math.max(draft.retryCount, draft.startCount - 1);
309
297
  }
310
- function applyCaseMetric(draft, event) {
311
- const data = asRecord(event.data);
312
- const name = stringFrom(data?.name);
313
- if (name == null) return;
314
- const value = data?.value;
315
- if (isCaseMetricValue(value)) draft.metrics[name] = value;
298
+ function applyIdentity(draft, ids, event, args) {
299
+ draft.attemptId = ids.attemptId || args.attemptId;
300
+ draft.experimentId = ids.experimentId || args.experimentId;
301
+ draft.projectName = extractExplicitProjectName(event) ?? draft.projectName;
302
+ draft.runId = ids.runId || args.runId;
303
+ draft.workspaceId = ids.workspaceId || args.workspaceId;
316
304
  }
317
- function applyCaseScore(draft, event) {
318
- const data = asRecord(event.data);
319
- const kind = stringFrom(data?.kind) ?? stringFrom(data?.name) ?? stringFrom(data?.["vieval.score.kind"]);
320
- const score = numberFrom(data?.score) ?? numberFrom(data?.value) ?? numberFrom(data?.["vieval.score.value"]);
321
- if (kind == null || score == null) return;
322
- draft.scores[kind] = score;
305
+ function asRecord(value) {
306
+ if (value == null || typeof value !== "object" || Array.isArray(value)) return;
307
+ return value;
323
308
  }
324
- function applyCaseEnd(draft, event) {
325
- const data = asRecord(event.data);
326
- draft.caseName = extractCaseName(event) ?? draft.caseName;
327
- draft.endedAt = stringFrom(data?.endedAt) ?? event.timestamp ?? draft.endedAt;
328
- draft.output = data != null && "output" in data ? data.output : draft.output;
329
- draft.state = normalizeState(stringFrom(data?.state)) ?? "failed";
330
- draft.scores.exact ??= draft.state === "passed" ? 1 : 0;
309
+ /**
310
+ * Normalizes duration timestamps.
311
+ *
312
+ * Before:
313
+ * - `startedAt="2026-05-08T00:00:00.000Z"`, `endedAt="2026-05-08T00:00:01.250Z"`
314
+ * - `startedAt="bad"`, `endedAt="2026-05-08T00:00:01.250Z"`
315
+ *
316
+ * After:
317
+ * - `1250`
318
+ * - `0`
319
+ */
320
+ function calculateDurationMs(startedAt, endedAt) {
321
+ const started = Date.parse(startedAt);
322
+ const ended = Date.parse(endedAt);
323
+ if (!Number.isFinite(started) || !Number.isFinite(ended)) return 0;
324
+ return Math.max(0, ended - started);
331
325
  }
332
- function toCaseRecord(draft) {
333
- const startedAt = draft.startedAt ?? draft.endedAt ?? "";
334
- const endedAt = draft.endedAt ?? startedAt;
326
+ function createCaseKey(taskId, caseId) {
327
+ return `${taskId}\u0000${caseId}`;
328
+ }
329
+ function extractCaseName(event) {
330
+ const data = asRecord(event.data);
331
+ return stringFrom(data?.caseName) ?? stringFrom(data?.name);
332
+ }
333
+ function extractEventIds(event, args) {
334
+ const data = asRecord(event.data);
335
335
  return {
336
- attemptId: draft.attemptId,
337
- caseId: draft.caseId,
338
- caseName: draft.caseName,
339
- durationMs: calculateDurationMs(startedAt, endedAt),
340
- endedAt,
341
- experimentId: draft.experimentId,
342
- ...draft.input === void 0 ? {} : { input: draft.input },
343
- metrics: draft.metrics,
344
- ...draft.output === void 0 ? {} : { output: draft.output },
345
- projectName: draft.projectName,
346
- retryCount: draft.retryCount,
347
- runId: draft.runId,
348
- schemaVersion: 1,
349
- scores: draft.scores,
350
- startedAt,
351
- state: draft.state ?? "failed",
352
- taskId: draft.taskId,
353
- workspaceId: draft.workspaceId
336
+ attemptId: stringFrom(data?.attemptId) ?? event.attemptId ?? args.attemptId,
337
+ caseId: stringFrom(data?.caseId) ?? event.caseId ?? "",
338
+ experimentId: stringFrom(data?.experimentId) ?? event.experimentId ?? args.experimentId,
339
+ projectName: stringFrom(data?.projectName) ?? event.projectName ?? event.projectId ?? args.projectName,
340
+ runId: stringFrom(data?.runId) ?? event.runId ?? args.runId,
341
+ taskId: stringFrom(data?.taskId) ?? event.taskId ?? "",
342
+ workspaceId: stringFrom(data?.workspaceId) ?? event.workspaceId ?? args.workspaceId
354
343
  };
355
344
  }
356
- function addRecordScores(summary, record) {
357
- for (const [kind, score] of Object.entries(record.scores)) {
358
- if (!Number.isFinite(score)) continue;
359
- summary[kind] ??= {
360
- average: 0,
361
- count: 0,
362
- sum: 0
363
- };
364
- summary[kind].count += 1;
365
- summary[kind].sum += score;
366
- }
367
- }
368
- function finalizeSummaryGroups(groups) {
369
- return Object.fromEntries(Object.entries(groups).map(([key, summary]) => [key, finalizeScoreSummary(summary)]));
345
+ function extractExplicitProjectName(event) {
346
+ return stringFrom(asRecord(event.data)?.projectName) ?? event.projectName ?? event.projectId;
370
347
  }
371
348
  function finalizeScoreSummary(summary) {
372
349
  return Object.fromEntries(Object.entries(summary).map(([kind, bucket]) => [kind, {
@@ -375,6 +352,9 @@ function finalizeScoreSummary(summary) {
375
352
  sum: bucket.sum
376
353
  }]));
377
354
  }
355
+ function finalizeSummaryGroups(groups) {
356
+ return Object.fromEntries(Object.entries(groups).map(([key, summary]) => [key, finalizeScoreSummary(summary)]));
357
+ }
378
358
  function getGroupValue(record, key) {
379
359
  if (Object.hasOwn(record.metrics, key)) return {
380
360
  exists: true,
@@ -386,49 +366,69 @@ function getGroupValue(record, key) {
386
366
  value: directValue
387
367
  } : { exists: false };
388
368
  }
389
- function extractCaseName(event) {
390
- const data = asRecord(event.data);
391
- return stringFrom(data?.caseName) ?? stringFrom(data?.name);
392
- }
393
- function extractExplicitProjectName(event) {
394
- return stringFrom(asRecord(event.data)?.projectName) ?? event.projectName ?? event.projectId;
369
+ function getOrCreateDraft(drafts, ids, event, args) {
370
+ const key = createCaseKey(ids.taskId, ids.caseId);
371
+ const existing = drafts.get(key);
372
+ if (existing != null) return existing;
373
+ const draft = {
374
+ attemptId: ids.attemptId,
375
+ caseId: ids.caseId,
376
+ caseName: extractCaseName(event) ?? ids.caseId,
377
+ experimentId: ids.experimentId,
378
+ metrics: {},
379
+ projectName: ids.projectName || args.projectName,
380
+ retryCount: 0,
381
+ runId: ids.runId,
382
+ scores: {},
383
+ startCount: 0,
384
+ taskId: ids.taskId,
385
+ workspaceId: ids.workspaceId
386
+ };
387
+ drafts.set(key, draft);
388
+ return draft;
395
389
  }
396
- function createCaseKey(taskId, caseId) {
397
- return `${taskId}\u0000${caseId}`;
390
+ function isCaseMetricValue(value) {
391
+ if (value == null || typeof value === "boolean" || typeof value === "number" || typeof value === "string") return true;
392
+ return Array.isArray(value);
398
393
  }
399
- /**
400
- * Normalizes duration timestamps.
401
- *
402
- * Before:
403
- * - `startedAt="2026-05-08T00:00:00.000Z"`, `endedAt="2026-05-08T00:00:01.250Z"`
404
- * - `startedAt="bad"`, `endedAt="2026-05-08T00:00:01.250Z"`
405
- *
406
- * After:
407
- * - `1250`
408
- * - `0`
409
- */
410
- function calculateDurationMs(startedAt, endedAt) {
411
- const started = Date.parse(startedAt);
412
- const ended = Date.parse(endedAt);
413
- if (!Number.isFinite(started) || !Number.isFinite(ended)) return 0;
414
- return Math.max(0, ended - started);
394
+ function normalizeCaseEventName(eventName) {
395
+ if (eventName === "task.case.start" || eventName === "CaseStarted") return "start";
396
+ if (eventName === "task.case.metric") return "metric";
397
+ if (eventName === "task.case.score") return "score";
398
+ if (eventName === "task.case.end" || eventName === "CaseEnded") return "end";
415
399
  }
416
400
  function normalizeState(value) {
417
401
  if (value === "failed" || value === "passed" || value === "skipped" || value === "timeout") return value;
418
402
  }
419
- function isCaseMetricValue(value) {
420
- if (value == null || typeof value === "boolean" || typeof value === "number" || typeof value === "string") return true;
421
- return Array.isArray(value);
422
- }
423
- function asRecord(value) {
424
- if (value == null || typeof value !== "object" || Array.isArray(value)) return;
425
- return value;
403
+ function numberFrom(value) {
404
+ return typeof value === "number" && Number.isFinite(value) ? value : void 0;
426
405
  }
427
406
  function stringFrom(value) {
428
407
  return typeof value === "string" ? value : void 0;
429
408
  }
430
- function numberFrom(value) {
431
- return typeof value === "number" && Number.isFinite(value) ? value : void 0;
409
+ function toCaseRecord(draft) {
410
+ const startedAt = draft.startedAt ?? draft.endedAt ?? "";
411
+ const endedAt = draft.endedAt ?? startedAt;
412
+ return {
413
+ attemptId: draft.attemptId,
414
+ caseId: draft.caseId,
415
+ caseName: draft.caseName,
416
+ durationMs: calculateDurationMs(startedAt, endedAt),
417
+ endedAt,
418
+ experimentId: draft.experimentId,
419
+ ...draft.input === void 0 ? {} : { input: draft.input },
420
+ metrics: draft.metrics,
421
+ ...draft.output === void 0 ? {} : { output: draft.output },
422
+ projectName: draft.projectName,
423
+ retryCount: draft.retryCount,
424
+ runId: draft.runId,
425
+ schemaVersion: 1,
426
+ scores: draft.scores,
427
+ startedAt,
428
+ state: draft.state ?? "failed",
429
+ taskId: draft.taskId,
430
+ workspaceId: draft.workspaceId
431
+ };
432
432
  }
433
433
  //#endregion
434
434
  //#region src/cli/report-selectors.ts
@@ -493,6 +493,28 @@ const reportCasesHelpText = `
493
493
  --group-by Case field, score name, or metric name used for grouped score summaries
494
494
  `;
495
495
  /**
496
+ * Builds filtered case inspection output.
497
+ *
498
+ * Use when:
499
+ * - `vieval report cases` needs deterministic JSON/table output
500
+ * - tests need pure filtering and grouping behavior without process I/O
501
+ *
502
+ * Expects:
503
+ * - `where` filters use `key=value`
504
+ * - lookup keys may target direct case fields, score names, or metric names
505
+ *
506
+ * Returns:
507
+ * - filtered records plus grouped score summaries when `groupBy` is present
508
+ */
509
+ function buildReportCasesOutput(records, options) {
510
+ const whereFilters = (options.where ?? []).map(parseSelector);
511
+ const filteredRecords = records.filter((record) => matchesWhereFilters(record, whereFilters));
512
+ return {
513
+ groups: options.groupBy == null ? void 0 : buildCaseGroups(filteredRecords, options.groupBy),
514
+ records: [...filteredRecords]
515
+ };
516
+ }
517
+ /**
496
518
  * Reads normalized case records from one report run directory or report root.
497
519
  *
498
520
  * Use when:
@@ -524,28 +546,6 @@ async function readCaseRecordsFromReport(reportPath) {
524
546
  return records;
525
547
  }
526
548
  /**
527
- * Builds filtered case inspection output.
528
- *
529
- * Use when:
530
- * - `vieval report cases` needs deterministic JSON/table output
531
- * - tests need pure filtering and grouping behavior without process I/O
532
- *
533
- * Expects:
534
- * - `where` filters use `key=value`
535
- * - lookup keys may target direct case fields, score names, or metric names
536
- *
537
- * Returns:
538
- * - filtered records plus grouped score summaries when `groupBy` is present
539
- */
540
- function buildReportCasesOutput(records, options) {
541
- const whereFilters = (options.where ?? []).map(parseSelector);
542
- const filteredRecords = records.filter((record) => matchesWhereFilters(record, whereFilters));
543
- return {
544
- groups: options.groupBy == null ? void 0 : buildCaseGroups(filteredRecords, options.groupBy),
545
- records: [...filteredRecords]
546
- };
547
- }
548
- /**
549
549
  * Runs the `vieval report cases` command.
550
550
  *
551
551
  * Call stack:
@@ -583,66 +583,16 @@ async function runReportCasesCli(argv) {
583
583
  process.exitCode = 1;
584
584
  }
585
585
  }
586
- function normalizeCliArgv$6(argv) {
587
- const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
588
- if (normalizedArgv[0] === "report" && normalizedArgv[1] === "cases") return normalizedArgv.slice(2);
589
- if (normalizedArgv[0] === "cases") return normalizedArgv.slice(1);
590
- return normalizedArgv;
591
- }
592
- function parseReportCasesCliArguments(argv) {
593
- const cli = meow(reportCasesHelpText, {
594
- argv: normalizeCliArgv$6(argv),
595
- flags: {
596
- format: {
597
- default: "table",
598
- type: "string"
599
- },
600
- groupBy: { type: "string" },
601
- where: {
602
- isMultiple: true,
603
- type: "string"
604
- }
605
- },
606
- importMeta: import.meta
607
- });
608
- const reportPath = cli.input[0];
609
- if (reportPath == null || reportPath.length === 0) throw new Error("Missing required <reportPath> argument.");
610
- return {
611
- format: normalizeReportCasesFormat(cli.flags.format),
612
- groupBy: cli.flags.groupBy,
613
- reportPath,
614
- where: cli.flags.where
615
- };
616
- }
617
- function normalizeReportCasesFormat(value) {
618
- const normalized = value.toLowerCase();
619
- if (normalized === "json") return "json";
620
- if (normalized === "jsonl") return "jsonl";
621
- return "table";
622
- }
623
- async function resolveCaseRecordPaths(reportPath) {
624
- const absoluteReportPath = resolve(reportPath);
625
- const directCaseFilePath = resolve(absoluteReportPath, "cases.jsonl");
626
- if (existsSync(absoluteReportPath) && absoluteReportPath.endsWith(".jsonl")) return [absoluteReportPath];
627
- if (existsSync(directCaseFilePath)) return [directCaseFilePath];
628
- return (await glob("**/cases.jsonl", {
629
- absolute: true,
630
- cwd: absoluteReportPath
631
- })).sort((left, right) => left.localeCompare(right));
632
- }
633
- function matchesWhereFilters(record, whereFilters) {
634
- return whereFilters.every((parsed) => {
635
- const resolved = getCaseSelectorValue(record, parsed.key);
636
- return resolved.exists && String(resolved.value) === parsed.value;
637
- });
638
- }
639
- function parseSelector(selector) {
640
- const separatorIndex = selector.indexOf("=");
641
- if (separatorIndex <= 0 || separatorIndex === selector.length - 1) throw new Error(`Invalid selector "${selector}". Expected "key=value".`);
642
- return {
643
- key: selector.slice(0, separatorIndex).trim(),
644
- value: selector.slice(separatorIndex + 1).trim()
645
- };
586
+ function addScores(summary, scores) {
587
+ for (const [scoreName, value] of Object.entries(scores)) {
588
+ summary[scoreName] ??= {
589
+ average: 0,
590
+ count: 0,
591
+ sum: 0
592
+ };
593
+ summary[scoreName].count += 1;
594
+ summary[scoreName].sum += value;
595
+ }
646
596
  }
647
597
  function buildCaseGroups(records, groupBy) {
648
598
  const groups = {};
@@ -662,17 +612,6 @@ function buildCaseGroups(records, groupBy) {
662
612
  scores: finalizeScores(group.scores)
663
613
  }]));
664
614
  }
665
- function addScores(summary, scores) {
666
- for (const [scoreName, value] of Object.entries(scores)) {
667
- summary[scoreName] ??= {
668
- average: 0,
669
- count: 0,
670
- sum: 0
671
- };
672
- summary[scoreName].count += 1;
673
- summary[scoreName].sum += value;
674
- }
675
- }
676
615
  function finalizeScores(summary) {
677
616
  return Object.fromEntries(Object.entries(summary).sort(([left], [right]) => left.localeCompare(right)).map(([scoreName, bucket]) => [scoreName, {
678
617
  average: bucket.count === 0 ? 0 : bucket.sum / bucket.count,
@@ -691,6 +630,67 @@ function formatCasesTable(output) {
691
630
  }
692
631
  return lines.join("\n");
693
632
  }
633
+ function matchesWhereFilters(record, whereFilters) {
634
+ return whereFilters.every((parsed) => {
635
+ const resolved = getCaseSelectorValue(record, parsed.key);
636
+ return resolved.exists && String(resolved.value) === parsed.value;
637
+ });
638
+ }
639
+ function normalizeCliArgv$6(argv) {
640
+ const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
641
+ if (normalizedArgv[0] === "report" && normalizedArgv[1] === "cases") return normalizedArgv.slice(2);
642
+ if (normalizedArgv[0] === "cases") return normalizedArgv.slice(1);
643
+ return normalizedArgv;
644
+ }
645
+ function normalizeReportCasesFormat(value) {
646
+ const normalized = value.toLowerCase();
647
+ if (normalized === "json") return "json";
648
+ if (normalized === "jsonl") return "jsonl";
649
+ return "table";
650
+ }
651
+ function parseReportCasesCliArguments(argv) {
652
+ const cli = meow(reportCasesHelpText, {
653
+ argv: normalizeCliArgv$6(argv),
654
+ flags: {
655
+ format: {
656
+ default: "table",
657
+ type: "string"
658
+ },
659
+ groupBy: { type: "string" },
660
+ where: {
661
+ isMultiple: true,
662
+ type: "string"
663
+ }
664
+ },
665
+ importMeta: import.meta
666
+ });
667
+ const reportPath = cli.input[0];
668
+ if (reportPath == null || reportPath.length === 0) throw new Error("Missing required <reportPath> argument.");
669
+ return {
670
+ format: normalizeReportCasesFormat(cli.flags.format),
671
+ groupBy: cli.flags.groupBy,
672
+ reportPath,
673
+ where: cli.flags.where
674
+ };
675
+ }
676
+ function parseSelector(selector) {
677
+ const separatorIndex = selector.indexOf("=");
678
+ if (separatorIndex <= 0 || separatorIndex === selector.length - 1) throw new Error(`Invalid selector "${selector}". Expected "key=value".`);
679
+ return {
680
+ key: selector.slice(0, separatorIndex).trim(),
681
+ value: selector.slice(separatorIndex + 1).trim()
682
+ };
683
+ }
684
+ async function resolveCaseRecordPaths(reportPath) {
685
+ const absoluteReportPath = resolve(reportPath);
686
+ const directCaseFilePath = resolve(absoluteReportPath, "cases.jsonl");
687
+ if (existsSync(absoluteReportPath) && absoluteReportPath.endsWith(".jsonl")) return [absoluteReportPath];
688
+ if (existsSync(directCaseFilePath)) return [directCaseFilePath];
689
+ return (await glob("**/cases.jsonl", {
690
+ absolute: true,
691
+ cwd: absoluteReportPath
692
+ })).sort((left, right) => left.localeCompare(right));
693
+ }
694
694
  //#endregion
695
695
  //#region src/cli/report-compare.ts
696
696
  /**
@@ -735,17 +735,26 @@ function buildCompareReportArtifact(args) {
735
735
  reportPath: args.reportPath
736
736
  };
737
737
  }
738
+ /**
739
+ * Writes compare report artifact as JSON.
740
+ */
741
+ async function writeCompareReportArtifact(args) {
742
+ const outputPath = resolve(args.outputPath);
743
+ await mkdir(dirname(outputPath), { recursive: true });
744
+ await writeFile(outputPath, `${JSON.stringify(args.artifact, null, 2)}\n`, "utf-8");
745
+ return outputPath;
746
+ }
738
747
  function countCasesForProject(caseRecords, projectName) {
739
748
  return caseRecords.filter((record) => record.projectName === projectName).length;
740
749
  }
741
- function countDistinctCasesForProject(caseRecords, projectName) {
742
- return countDistinctCases(caseRecords.filter((record) => record.projectName === projectName));
743
- }
744
750
  function countDistinctCases(caseRecords) {
745
751
  const caseKeys = /* @__PURE__ */ new Set();
746
752
  for (const record of caseRecords) caseKeys.add(`${record.projectName}:${record.taskId}:${record.caseId}`);
747
753
  return caseKeys.size;
748
754
  }
755
+ function countDistinctCasesForProject(caseRecords, projectName) {
756
+ return countDistinctCases(caseRecords.filter((record) => record.projectName === projectName));
757
+ }
749
758
  function createWeightedAverage(projects, selectAverage) {
750
759
  let weightedScoreTotal = 0;
751
760
  let weightTotal = 0;
@@ -758,15 +767,6 @@ function createWeightedAverage(projects, selectAverage) {
758
767
  if (weightTotal === 0) return null;
759
768
  return weightedScoreTotal / weightTotal;
760
769
  }
761
- /**
762
- * Writes compare report artifact as JSON.
763
- */
764
- async function writeCompareReportArtifact(args) {
765
- const outputPath = resolve(args.outputPath);
766
- await mkdir(dirname(outputPath), { recursive: true });
767
- await writeFile(outputPath, `${JSON.stringify(args.artifact, null, 2)}\n`, "utf-8");
768
- return outputPath;
769
- }
770
770
  //#endregion
771
771
  //#region src/cli/discovery.ts
772
772
  /**
@@ -927,21 +927,22 @@ function buildLocalOtlpProjection(args) {
927
927
  }] }] }
928
928
  };
929
929
  }
930
- function toAttributes(attributes) {
931
- return Object.entries(attributes).filter(([, value]) => value !== void 0).sort(([leftKey], [rightKey]) => leftKey.localeCompare(rightKey)).map(([key, value]) => ({
932
- key,
933
- value: toAnyValue(value)
934
- }));
930
+ function collectProjectNames(records) {
931
+ return [...new Set(records.map((record) => record.projectName))].sort((left, right) => left.localeCompare(right));
935
932
  }
936
- function toAnyValue(value) {
937
- if (Array.isArray(value)) return { arrayValue: { values: value.map((item) => toAnyValue(item)) } };
938
- if (isAttributeScalar(value)) {
939
- if (typeof value === "boolean") return { boolValue: value };
940
- if (typeof value === "number") return Number.isFinite(value) ? { doubleValue: value } : { stringValue: String(value) };
941
- if (value == null) return { stringValue: "null" };
942
- return { stringValue: value };
943
- }
944
- return { stringValue: stableStringify(value) };
933
+ function collectScoreKinds(records) {
934
+ return [...new Set(records.flatMap((record) => Object.keys(record.scores)))].sort((left, right) => left.localeCompare(right));
935
+ }
936
+ function collectTasks(records) {
937
+ const tasks = /* @__PURE__ */ new Map();
938
+ for (const record of records) tasks.set(`${record.projectName}\0${record.taskId}`, {
939
+ projectName: record.projectName,
940
+ taskId: record.taskId
941
+ });
942
+ return [...tasks.values()].sort((left, right) => {
943
+ const projectOrder = left.projectName.localeCompare(right.projectName);
944
+ return projectOrder === 0 ? left.taskId.localeCompare(right.taskId) : projectOrder;
945
+ });
945
946
  }
946
947
  function isAttributeScalar(value) {
947
948
  return value == null || typeof value === "boolean" || typeof value === "number" || typeof value === "string";
@@ -958,43 +959,32 @@ function isoToUnixNano(value) {
958
959
  if (!Number.isFinite(unixMilliseconds)) return "0";
959
960
  return String(BigInt(unixMilliseconds) * 1000000n);
960
961
  }
961
- function collectScoreKinds(records) {
962
- return [...new Set(records.flatMap((record) => Object.keys(record.scores)))].sort((left, right) => left.localeCompare(right));
963
- }
964
- function collectProjectNames(records) {
965
- return [...new Set(records.map((record) => record.projectName))].sort((left, right) => left.localeCompare(right));
962
+ function toAnyValue(value) {
963
+ if (Array.isArray(value)) return { arrayValue: { values: value.map((item) => toAnyValue(item)) } };
964
+ if (isAttributeScalar(value)) {
965
+ if (typeof value === "boolean") return { boolValue: value };
966
+ if (typeof value === "number") return Number.isFinite(value) ? { doubleValue: value } : { stringValue: String(value) };
967
+ if (value == null) return { stringValue: "null" };
968
+ return { stringValue: value };
969
+ }
970
+ return { stringValue: stableStringify(value) };
966
971
  }
967
- function collectTasks(records) {
968
- const tasks = /* @__PURE__ */ new Map();
969
- for (const record of records) tasks.set(`${record.projectName}\0${record.taskId}`, {
970
- projectName: record.projectName,
971
- taskId: record.taskId
972
- });
973
- return [...tasks.values()].sort((left, right) => {
974
- const projectOrder = left.projectName.localeCompare(right.projectName);
975
- return projectOrder === 0 ? left.taskId.localeCompare(right.taskId) : projectOrder;
976
- });
972
+ function toAttributes(attributes) {
973
+ return Object.entries(attributes).filter(([, value]) => value !== void 0).sort(([leftKey], [rightKey]) => leftKey.localeCompare(rightKey)).map(([key, value]) => ({
974
+ key,
975
+ value: toAnyValue(value)
976
+ }));
977
977
  }
978
978
  //#endregion
979
979
  //#region src/cli/report-artifacts.ts
980
980
  /**
981
- * Resolves one or more `run-summary.json` paths from a report location.
981
+ * Reads all run artifacts found under `reportPath`.
982
982
  *
983
983
  * Use when:
984
- * - callers may pass a run directory, summary file path, or a report root
985
- *
986
- * Returns:
987
- * - sorted absolute summary file paths
984
+ * - callers need multi-run analysis from a directory root
988
985
  */
989
- async function resolveRunSummaryPaths(reportPath) {
990
- const absoluteReportPath = resolve(reportPath);
991
- const directSummaryPath = resolve(absoluteReportPath, "run-summary.json");
992
- if (existsSync(absoluteReportPath) && absoluteReportPath.endsWith(".json")) return [absoluteReportPath];
993
- if (existsSync(directSummaryPath)) return [directSummaryPath];
994
- return (await glob("**/run-summary.json", {
995
- absolute: true,
996
- cwd: absoluteReportPath
997
- })).sort((left, right) => left.localeCompare(right));
986
+ async function readReportArtifacts(reportPath) {
987
+ return (await resolveRunSummaryPaths(reportPath)).map((summaryFilePath) => readReportRunArtifact(summaryFilePath));
998
988
  }
999
989
  /**
1000
990
  * Reads one run report artifact set from `run-summary.json` and sibling `events.jsonl`.
@@ -1031,13 +1021,23 @@ function readReportRunArtifact(summaryFilePath) {
1031
1021
  };
1032
1022
  }
1033
1023
  /**
1034
- * Reads all run artifacts found under `reportPath`.
1024
+ * Resolves one or more `run-summary.json` paths from a report location.
1035
1025
  *
1036
1026
  * Use when:
1037
- * - callers need multi-run analysis from a directory root
1027
+ * - callers may pass a run directory, summary file path, or a report root
1028
+ *
1029
+ * Returns:
1030
+ * - sorted absolute summary file paths
1038
1031
  */
1039
- async function readReportArtifacts(reportPath) {
1040
- return (await resolveRunSummaryPaths(reportPath)).map((summaryFilePath) => readReportRunArtifact(summaryFilePath));
1032
+ async function resolveRunSummaryPaths(reportPath) {
1033
+ const absoluteReportPath = resolve(reportPath);
1034
+ const directSummaryPath = resolve(absoluteReportPath, "run-summary.json");
1035
+ if (existsSync(absoluteReportPath) && absoluteReportPath.endsWith(".json")) return [absoluteReportPath];
1036
+ if (existsSync(directSummaryPath)) return [directSummaryPath];
1037
+ return (await glob("**/run-summary.json", {
1038
+ absolute: true,
1039
+ cwd: absoluteReportPath
1040
+ })).sort((left, right) => left.localeCompare(right));
1041
1041
  }
1042
1042
  /**
1043
1043
  * Creates a compact summary row for one run artifact.
@@ -1138,14 +1138,14 @@ function sanitizeIdentitySegment$1(value) {
1138
1138
  */
1139
1139
  function createNoopReporter() {
1140
1140
  return {
1141
- onRunStart(_payload) {},
1142
- onTaskQueued(_payload) {},
1143
- onTaskStart(_payload) {},
1144
- onCaseStart(_payload) {},
1141
+ dispose() {},
1145
1142
  onCaseEnd(_payload) {},
1146
- onTaskEnd(_payload) {},
1143
+ onCaseStart(_payload) {},
1147
1144
  onRunEnd(_payload) {},
1148
- dispose() {}
1145
+ onRunStart(_payload) {},
1146
+ onTaskEnd(_payload) {},
1147
+ onTaskQueued(_payload) {},
1148
+ onTaskStart(_payload) {}
1149
1149
  };
1150
1150
  }
1151
1151
  //#endregion
@@ -1154,72 +1154,91 @@ const POINTER = "❯";
1154
1154
  const TREE_NODE_END = "└";
1155
1155
  const TREE_NODE_MIDDLE = "├";
1156
1156
  var SummaryReporterStateMachine = class {
1157
- options;
1158
- taskCounters = createCounterState();
1159
1157
  caseCounters = createCounterState();
1160
- tasks = /* @__PURE__ */ new Map();
1158
+ options;
1161
1159
  queueOrderCounter = 0;
1162
1160
  startedAtMs = 0;
1163
1161
  startTime = "";
1162
+ taskCounters = createCounterState();
1163
+ tasks = /* @__PURE__ */ new Map();
1164
1164
  constructor(options) {
1165
1165
  this.options = options;
1166
1166
  }
1167
1167
  /**
1168
- * Handles run startup.
1168
+ * Releases reporter resources.
1169
1169
  *
1170
1170
  * Use when:
1171
- * - a new CLI run is starting and the summary state must reset
1171
+ * - CLI cleanup runs from a `finally` block
1172
1172
  *
1173
1173
  * Expects:
1174
- * - `totalTasks` matches the scheduled task count for the run
1174
+ * - repeated calls are safe
1175
1175
  *
1176
1176
  * Returns:
1177
1177
  * - no direct value
1178
1178
  */
1179
- onRunStart(payload) {
1180
- this.tasks.clear();
1181
- this.queueOrderCounter = 0;
1182
- resetCounterState(this.taskCounters, payload.totalTasks);
1183
- resetCounterState(this.caseCounters, 0);
1184
- this.startedAtMs = this.options.getNow();
1185
- this.startTime = formatTimeString(new Date(this.options.getWallClockNow()));
1186
- }
1179
+ dispose() {}
1187
1180
  /**
1188
- * Handles task queue events.
1181
+ * Builds the current live summary window rows.
1189
1182
  *
1190
1183
  * Use when:
1191
- * - a scheduled task becomes visible in the live summary before it starts
1184
+ * - the live reporter or tests need a snapshot of the active window
1192
1185
  *
1193
1186
  * Expects:
1194
- * - `taskId` is stable across later lifecycle events
1187
+ * - `maxRows`, when present, keeps footer rows visible
1195
1188
  *
1196
1189
  * Returns:
1197
- * - no direct value
1190
+ * - terminal rows in display order
1198
1191
  */
1199
- onTaskQueued(payload) {
1200
- const task = this.getOrCreateTaskState(payload.taskId);
1201
- if (task.state === "finished") return;
1202
- task.displayName = payload.displayName ?? task.displayName;
1203
- task.projectName = payload.projectName ?? task.projectName;
1204
- this.syncTaskTotalCases(task, payload.totalCases);
1205
- }
1206
- /**
1207
- * Handles task start events.
1208
- *
1209
- * Use when:
1210
- * - a queued task begins executing
1192
+ getWindowRows(options) {
1193
+ const activeRows = this.createActiveRows();
1194
+ const footerRows = this.createFooterRows();
1195
+ const maxRows = options?.maxRows;
1196
+ const footerBlock = [...footerRows, ""];
1197
+ if (maxRows == null || maxRows <= 0) return [...[
1198
+ "",
1199
+ ...activeRows,
1200
+ ...activeRows.length > 0 ? [""] : []
1201
+ ], ...footerBlock];
1202
+ if (maxRows <= footerBlock.length) return footerBlock.slice(-maxRows);
1203
+ return [...createBoundedActiveBlock(activeRows, Math.max(0, maxRows - footerBlock.length)), ...footerBlock];
1204
+ }
1205
+ /**
1206
+ * Handles case completion.
1207
+ *
1208
+ * Use when:
1209
+ * - a running case settles and counters must advance
1211
1210
  *
1212
1211
  * Expects:
1213
- * - the task was previously queued or can be synthesized from its identifier
1212
+ * - duplicate completion for the same `caseId` is ignored
1214
1213
  *
1215
1214
  * Returns:
1216
1215
  * - no direct value
1217
1216
  */
1218
- onTaskStart(payload) {
1217
+ onCaseEnd(payload) {
1219
1218
  const task = this.getOrCreateTaskState(payload.taskId);
1220
1219
  if (task.state === "finished") return;
1221
- task.state = "running";
1222
- task.startedAt ??= this.options.getNow();
1220
+ if (task.settledCaseIds.has(payload.caseId)) {
1221
+ task.runningCases.delete(payload.caseId);
1222
+ return;
1223
+ }
1224
+ task.settledCaseIds.add(payload.caseId);
1225
+ task.runningCases.delete(payload.caseId);
1226
+ task.completedCases += 1;
1227
+ this.syncTaskTotalCases(task);
1228
+ this.caseCounters.completed += 1;
1229
+ if (payload.state === "passed") {
1230
+ this.caseCounters.passed += 1;
1231
+ return;
1232
+ }
1233
+ if (payload.state === "failed") {
1234
+ this.caseCounters.failed += 1;
1235
+ return;
1236
+ }
1237
+ if (payload.state === "timeout") {
1238
+ this.caseCounters.timeout += 1;
1239
+ return;
1240
+ }
1241
+ this.caseCounters.skipped += 1;
1223
1242
  }
1224
1243
  /**
1225
1244
  * Handles case start events.
@@ -1258,42 +1277,43 @@ var SummaryReporterStateMachine = class {
1258
1277
  this.syncTaskTotalCases(task);
1259
1278
  }
1260
1279
  /**
1261
- * Handles case completion.
1280
+ * Handles run completion.
1262
1281
  *
1263
1282
  * Use when:
1264
- * - a running case settles and counters must advance
1283
+ * - the caller has final task totals and wants the footer normalized
1265
1284
  *
1266
1285
  * Expects:
1267
- * - duplicate completion for the same `caseId` is ignored
1286
+ * - payload counters are final terminal task totals
1268
1287
  *
1269
1288
  * Returns:
1270
1289
  * - no direct value
1271
1290
  */
1272
- onCaseEnd(payload) {
1273
- const task = this.getOrCreateTaskState(payload.taskId);
1274
- if (task.state === "finished") return;
1275
- if (task.settledCaseIds.has(payload.caseId)) {
1276
- task.runningCases.delete(payload.caseId);
1277
- return;
1278
- }
1279
- task.settledCaseIds.add(payload.caseId);
1280
- task.runningCases.delete(payload.caseId);
1281
- task.completedCases += 1;
1282
- this.syncTaskTotalCases(task);
1283
- this.caseCounters.completed += 1;
1284
- if (payload.state === "passed") {
1285
- this.caseCounters.passed += 1;
1286
- return;
1287
- }
1288
- if (payload.state === "failed") {
1289
- this.caseCounters.failed += 1;
1290
- return;
1291
- }
1292
- if (payload.state === "timeout") {
1293
- this.caseCounters.timeout += 1;
1294
- return;
1295
- }
1296
- this.caseCounters.skipped += 1;
1291
+ onRunEnd(payload) {
1292
+ this.taskCounters.total = payload.totalTasks;
1293
+ this.taskCounters.passed = payload.passedTasks;
1294
+ this.taskCounters.failed = payload.failedTasks;
1295
+ this.taskCounters.skipped = payload.skippedTasks;
1296
+ this.taskCounters.completed = payload.passedTasks + payload.failedTasks + payload.skippedTasks;
1297
+ }
1298
+ /**
1299
+ * Handles run startup.
1300
+ *
1301
+ * Use when:
1302
+ * - a new CLI run is starting and the summary state must reset
1303
+ *
1304
+ * Expects:
1305
+ * - `totalTasks` matches the scheduled task count for the run
1306
+ *
1307
+ * Returns:
1308
+ * - no direct value
1309
+ */
1310
+ onRunStart(payload) {
1311
+ this.tasks.clear();
1312
+ this.queueOrderCounter = 0;
1313
+ resetCounterState(this.taskCounters, payload.totalTasks);
1314
+ resetCounterState(this.caseCounters, 0);
1315
+ this.startedAtMs = this.options.getNow();
1316
+ this.startTime = formatTimeString(new Date(this.options.getWallClockNow()));
1297
1317
  }
1298
1318
  /**
1299
1319
  * Handles task completion.
@@ -1326,61 +1346,41 @@ var SummaryReporterStateMachine = class {
1326
1346
  this.taskCounters.skipped += 1;
1327
1347
  }
1328
1348
  /**
1329
- * Handles run completion.
1349
+ * Handles task queue events.
1330
1350
  *
1331
1351
  * Use when:
1332
- * - the caller has final task totals and wants the footer normalized
1352
+ * - a scheduled task becomes visible in the live summary before it starts
1333
1353
  *
1334
1354
  * Expects:
1335
- * - payload counters are final terminal task totals
1355
+ * - `taskId` is stable across later lifecycle events
1336
1356
  *
1337
1357
  * Returns:
1338
1358
  * - no direct value
1339
1359
  */
1340
- onRunEnd(payload) {
1341
- this.taskCounters.total = payload.totalTasks;
1342
- this.taskCounters.passed = payload.passedTasks;
1343
- this.taskCounters.failed = payload.failedTasks;
1344
- this.taskCounters.skipped = payload.skippedTasks;
1345
- this.taskCounters.completed = payload.passedTasks + payload.failedTasks + payload.skippedTasks;
1360
+ onTaskQueued(payload) {
1361
+ const task = this.getOrCreateTaskState(payload.taskId);
1362
+ if (task.state === "finished") return;
1363
+ task.displayName = payload.displayName ?? task.displayName;
1364
+ task.projectName = payload.projectName ?? task.projectName;
1365
+ this.syncTaskTotalCases(task, payload.totalCases);
1346
1366
  }
1347
1367
  /**
1348
- * Releases reporter resources.
1368
+ * Handles task start events.
1349
1369
  *
1350
1370
  * Use when:
1351
- * - CLI cleanup runs from a `finally` block
1371
+ * - a queued task begins executing
1352
1372
  *
1353
1373
  * Expects:
1354
- * - repeated calls are safe
1374
+ * - the task was previously queued or can be synthesized from its identifier
1355
1375
  *
1356
1376
  * Returns:
1357
1377
  * - no direct value
1358
1378
  */
1359
- dispose() {}
1360
- /**
1361
- * Builds the current live summary window rows.
1362
- *
1363
- * Use when:
1364
- * - the live reporter or tests need a snapshot of the active window
1365
- *
1366
- * Expects:
1367
- * - `maxRows`, when present, keeps footer rows visible
1368
- *
1369
- * Returns:
1370
- * - terminal rows in display order
1371
- */
1372
- getWindowRows(options) {
1373
- const activeRows = this.createActiveRows();
1374
- const footerRows = this.createFooterRows();
1375
- const maxRows = options?.maxRows;
1376
- const footerBlock = [...footerRows, ""];
1377
- if (maxRows == null || maxRows <= 0) return [...[
1378
- "",
1379
- ...activeRows,
1380
- ...activeRows.length > 0 ? [""] : []
1381
- ], ...footerBlock];
1382
- if (maxRows <= footerBlock.length) return footerBlock.slice(-maxRows);
1383
- return [...createBoundedActiveBlock(activeRows, Math.max(0, maxRows - footerBlock.length)), ...footerBlock];
1379
+ onTaskStart(payload) {
1380
+ const task = this.getOrCreateTaskState(payload.taskId);
1381
+ if (task.state === "finished") return;
1382
+ task.state = "running";
1383
+ task.startedAt ??= this.options.getNow();
1384
1384
  }
1385
1385
  createActiveRows() {
1386
1386
  const activeTasks = Array.from(this.tasks.values()).filter((task) => task.state !== "finished").sort(compareActiveTasks);
@@ -1449,6 +1449,49 @@ var SummaryReporterStateMachine = class {
1449
1449
  }
1450
1450
  };
1451
1451
  /**
1452
+ * Creates the live summary reporter state machine for `vieval` CLI runs.
1453
+ *
1454
+ * Use when:
1455
+ * - the CLI wants Vitest-style active rows and live counters
1456
+ * - tests need a deterministic reporter surface without touching the terminal
1457
+ *
1458
+ * Expects:
1459
+ * - queue/start/end events describe task lifecycle in order
1460
+ * - `getNow()` remains monotonic within one run
1461
+ * - `getWallClockNow()` returns the wall-clock run start timestamp
1462
+ *
1463
+ * Returns:
1464
+ * - a reporter compatible with the base CLI lifecycle plus `getWindowRows()`
1465
+ *
1466
+ * Call stack:
1467
+ *
1468
+ * {@link createSummaryReporter}
1469
+ * -> {@link SummaryReporterStateMachine.onTaskQueued}
1470
+ * -> {@link SummaryReporterStateMachine.onCaseStart}
1471
+ * -> {@link SummaryReporterStateMachine.getWindowRows}
1472
+ */
1473
+ function createSummaryReporter(options) {
1474
+ return new SummaryReporterStateMachine(options);
1475
+ }
1476
+ function compareActiveTasks(left, right) {
1477
+ const leftProject = left.projectName ?? "";
1478
+ const rightProject = right.projectName ?? "";
1479
+ if (leftProject !== rightProject) return leftProject.localeCompare(rightProject);
1480
+ const displayNameOrder = left.displayName.localeCompare(right.displayName);
1481
+ if (displayNameOrder !== 0) return displayNameOrder;
1482
+ return left.queueOrder - right.queueOrder;
1483
+ }
1484
+ function countRunningCases(tasks) {
1485
+ let runningCount = 0;
1486
+ for (const task of tasks) runningCount += task.runningCases.size;
1487
+ return runningCount;
1488
+ }
1489
+ function countRunningTasks(tasks) {
1490
+ let runningCount = 0;
1491
+ for (const task of tasks) if (task.state === "running") runningCount += 1;
1492
+ return runningCount;
1493
+ }
1494
+ /**
1452
1495
  * Creates the active task block while keeping room for summary footer rows.
1453
1496
  *
1454
1497
  * Use when:
@@ -1481,31 +1524,6 @@ function createBoundedActiveBlock(activeRows, maxRows) {
1481
1524
  c.dim(` ${TREE_NODE_END} ... ${hiddenRows} more running rows hidden`)
1482
1525
  ];
1483
1526
  }
1484
- /**
1485
- * Creates the live summary reporter state machine for `vieval` CLI runs.
1486
- *
1487
- * Use when:
1488
- * - the CLI wants Vitest-style active rows and live counters
1489
- * - tests need a deterministic reporter surface without touching the terminal
1490
- *
1491
- * Expects:
1492
- * - queue/start/end events describe task lifecycle in order
1493
- * - `getNow()` remains monotonic within one run
1494
- * - `getWallClockNow()` returns the wall-clock run start timestamp
1495
- *
1496
- * Returns:
1497
- * - a reporter compatible with the base CLI lifecycle plus `getWindowRows()`
1498
- *
1499
- * Call stack:
1500
- *
1501
- * {@link createSummaryReporter}
1502
- * -> {@link SummaryReporterStateMachine.onTaskQueued}
1503
- * -> {@link SummaryReporterStateMachine.onCaseStart}
1504
- * -> {@link SummaryReporterStateMachine.getWindowRows}
1505
- */
1506
- function createSummaryReporter(options) {
1507
- return new SummaryReporterStateMachine(options);
1508
- }
1509
1527
  function createCounterState() {
1510
1528
  return {
1511
1529
  completed: 0,
@@ -1516,29 +1534,17 @@ function createCounterState() {
1516
1534
  total: 0
1517
1535
  };
1518
1536
  }
1519
- function resetCounterState(counter, total) {
1520
- counter.completed = 0;
1521
- counter.failed = 0;
1522
- counter.passed = 0;
1523
- counter.skipped = 0;
1524
- counter.timeout = 0;
1525
- counter.total = total;
1526
- }
1527
- function sumTaskCaseTotals(tasks) {
1528
- let total = 0;
1529
- for (const task of tasks) total += task.totalCases;
1530
- return total;
1537
+ function estimateTaskDurationMs(task, now) {
1538
+ if (task.startedAt == null) return;
1539
+ return estimateTotalDurationMs(task.completedCases, task.totalCases, Math.max(0, now - task.startedAt));
1531
1540
  }
1532
- function compareActiveTasks(left, right) {
1533
- const leftProject = left.projectName ?? "";
1534
- const rightProject = right.projectName ?? "";
1535
- if (leftProject !== rightProject) return leftProject.localeCompare(rightProject);
1536
- const displayNameOrder = left.displayName.localeCompare(right.displayName);
1537
- if (displayNameOrder !== 0) return displayNameOrder;
1538
- return left.queueOrder - right.queueOrder;
1541
+ function estimateTotalDurationMs(completedCount, totalCount, elapsedDurationMs) {
1542
+ if (completedCount === 0 || totalCount === 0) return;
1543
+ const averageDurationMs = elapsedDurationMs / completedCount;
1544
+ return Math.round(averageDurationMs * totalCount);
1539
1545
  }
1540
- function padSummaryTitle(label) {
1541
- return `${c.dim(label.padEnd(8))} `;
1546
+ function formatActiveConcurrencyState(options) {
1547
+ return [options.taskRunningCount > 0 ? c.bold(c.yellow(`${options.taskRunningCount} ${pluralize("task", options.taskRunningCount)} running`)) : c.dim("0 tasks running"), options.caseRunningCount > 0 ? c.bold(c.yellow(`${options.caseRunningCount} ${pluralize("case", options.caseRunningCount)} running`)) : c.dim("0 cases running")].join(c.dim(" | "));
1542
1548
  }
1543
1549
  function formatCounterState(counter, runningCount, timing) {
1544
1550
  const plannedCount = Math.max(0, counter.total - counter.completed - runningCount);
@@ -1551,19 +1557,6 @@ function formatCounterState(counter, runningCount, timing) {
1551
1557
  counter.skipped > 0 ? c.yellow(`${counter.skipped} skipped`) : c.dim(`${counter.skipped} skipped`)
1552
1558
  ].join(c.dim(" | ")) + c.gray(` (${counter.total})`) + formatTimingSuffix(timing);
1553
1559
  }
1554
- function formatActiveConcurrencyState(options) {
1555
- return [options.taskRunningCount > 0 ? c.bold(c.yellow(`${options.taskRunningCount} ${pluralize("task", options.taskRunningCount)} running`)) : c.dim("0 tasks running"), options.caseRunningCount > 0 ? c.bold(c.yellow(`${options.caseRunningCount} ${pluralize("case", options.caseRunningCount)} running`)) : c.dim("0 cases running")].join(c.dim(" | "));
1556
- }
1557
- function pluralize(noun, count) {
1558
- return count === 1 ? noun : `${noun}s`;
1559
- }
1560
- function formatRetrySuffix(activeCase) {
1561
- if (activeCase.retryIndex == null || activeCase.retryIndex <= 0 || activeCase.autoRetry == null || activeCase.autoRetry <= 0) return "";
1562
- return c.dim(` retry ${activeCase.retryIndex}/${activeCase.autoRetry}`);
1563
- }
1564
- function formatTimeString(date) {
1565
- return date.toTimeString().split(" ")[0] ?? "";
1566
- }
1567
1560
  function formatDuration$2(durationMs) {
1568
1561
  return formatHumanDuration(durationMs);
1569
1562
  }
@@ -1595,24 +1588,9 @@ function formatProjectBadge(projectName, isTTY) {
1595
1588
  const background = backgroundPool[projectName.split("").reduce((accumulator, character, index) => accumulator + character.charCodeAt(0) + index, 0) % backgroundPool.length];
1596
1589
  return `${c.black(background(` ${projectName} `))} `;
1597
1590
  }
1598
- function countRunningCases(tasks) {
1599
- let runningCount = 0;
1600
- for (const task of tasks) runningCount += task.runningCases.size;
1601
- return runningCount;
1602
- }
1603
- function countRunningTasks(tasks) {
1604
- let runningCount = 0;
1605
- for (const task of tasks) if (task.state === "running") runningCount += 1;
1606
- return runningCount;
1607
- }
1608
- function estimateTaskDurationMs(task, now) {
1609
- if (task.startedAt == null) return;
1610
- return estimateTotalDurationMs(task.completedCases, task.totalCases, Math.max(0, now - task.startedAt));
1611
- }
1612
- function estimateTotalDurationMs(completedCount, totalCount, elapsedDurationMs) {
1613
- if (completedCount === 0 || totalCount === 0) return;
1614
- const averageDurationMs = elapsedDurationMs / completedCount;
1615
- return Math.round(averageDurationMs * totalCount);
1591
+ function formatRetrySuffix(activeCase) {
1592
+ if (activeCase.retryIndex == null || activeCase.retryIndex <= 0 || activeCase.autoRetry == null || activeCase.autoRetry <= 0) return "";
1593
+ return c.dim(` retry ${activeCase.retryIndex}/${activeCase.autoRetry}`);
1616
1594
  }
1617
1595
  function formatTaskProgressSuffix(task, now) {
1618
1596
  const elapsedDurationMs = task.startedAt == null ? 0 : Math.max(0, now - task.startedAt);
@@ -1621,11 +1599,33 @@ function formatTaskProgressSuffix(task, now) {
1621
1599
  estimatedDurationMs: estimateTaskDurationMs(task, now)
1622
1600
  })}`;
1623
1601
  }
1602
+ function formatTimeString(date) {
1603
+ return date.toTimeString().split(" ")[0] ?? "";
1604
+ }
1624
1605
  function formatTimingSuffix(timing) {
1625
1606
  const parts = [`elapsed ${formatHumanDuration(timing.elapsedDurationMs)}`];
1626
1607
  if (timing.estimatedDurationMs != null) parts.push(`estimated ${formatHumanDuration(timing.estimatedDurationMs)}`);
1627
1608
  return ` (${parts.join(", ")})`;
1628
1609
  }
1610
+ function padSummaryTitle(label) {
1611
+ return `${c.dim(label.padEnd(8))} `;
1612
+ }
1613
+ function pluralize(noun, count) {
1614
+ return count === 1 ? noun : `${noun}s`;
1615
+ }
1616
+ function resetCounterState(counter, total) {
1617
+ counter.completed = 0;
1618
+ counter.failed = 0;
1619
+ counter.passed = 0;
1620
+ counter.skipped = 0;
1621
+ counter.timeout = 0;
1622
+ counter.total = total;
1623
+ }
1624
+ function sumTaskCaseTotals(tasks) {
1625
+ let total = 0;
1626
+ for (const task of tasks) total += task.totalCases;
1627
+ return total;
1628
+ }
1629
1629
  //#endregion
1630
1630
  //#region src/cli/reporters/index.ts
1631
1631
  /**
@@ -1676,14 +1676,14 @@ const SYNC_END = `${ESC}?2026l`;
1676
1676
  * -> {@link WindowRenderer.renderWindow}
1677
1677
  */
1678
1678
  var WindowRenderer = class {
1679
+ bufferedOutput = "";
1680
+ finished = false;
1679
1681
  options;
1680
1682
  renderInterval;
1681
1683
  renderScheduled = false;
1682
1684
  renderScheduleVersion = 0;
1683
- windowHeight = 0;
1684
1685
  started = false;
1685
- finished = false;
1686
- bufferedOutput = "";
1686
+ windowHeight = 0;
1687
1687
  constructor(options) {
1688
1688
  if (options.createInterval && options.clearInterval) {
1689
1689
  this.options = {
@@ -1714,26 +1714,41 @@ var WindowRenderer = class {
1714
1714
  };
1715
1715
  }
1716
1716
  /**
1717
- * Starts the periodic refresh loop.
1717
+ * Stops the renderer and clears any visible window state.
1718
1718
  *
1719
1719
  * Use when:
1720
- * - the live reporter is about to emit in-place updates
1720
+ * - cleanup needs to happen from a `finally` block or interrupted run
1721
1721
  *
1722
1722
  * Expects:
1723
- * - repeated calls are harmless and keep the existing timer
1723
+ * - callers may invoke it more than once
1724
1724
  *
1725
1725
  * Returns:
1726
1726
  * - no direct value
1727
1727
  */
1728
- start() {
1729
- if (this.started && !this.finished) return;
1730
- this.started = true;
1731
- this.finished = false;
1728
+ dispose() {
1729
+ this.finish();
1730
+ }
1731
+ /**
1732
+ * Clears the rendered window and stops the refresh loop.
1733
+ *
1734
+ * Use when:
1735
+ * - the live reporter is transitioning to final static output
1736
+ *
1737
+ * Expects:
1738
+ * - repeated calls are safe
1739
+ *
1740
+ * Returns:
1741
+ * - no direct value
1742
+ */
1743
+ finish() {
1744
+ if (this.finished) return;
1745
+ this.finished = true;
1746
+ this.started = false;
1732
1747
  this.renderScheduleVersion += 1;
1733
- if (!this.renderInterval) {
1734
- this.renderInterval = this.options.createInterval(() => this.schedule(), this.options.intervalMs);
1735
- this.renderInterval.unref?.();
1736
- }
1748
+ this.renderScheduled = false;
1749
+ this.stopInterval();
1750
+ this.clearWindow();
1751
+ this.flushBufferedOutput();
1737
1752
  }
1738
1753
  /**
1739
1754
  * Queues a render if one is not already in flight.
@@ -1758,41 +1773,26 @@ var WindowRenderer = class {
1758
1773
  });
1759
1774
  }
1760
1775
  /**
1761
- * Clears the rendered window and stops the refresh loop.
1776
+ * Starts the periodic refresh loop.
1762
1777
  *
1763
1778
  * Use when:
1764
- * - the live reporter is transitioning to final static output
1779
+ * - the live reporter is about to emit in-place updates
1765
1780
  *
1766
1781
  * Expects:
1767
- * - repeated calls are safe
1782
+ * - repeated calls are harmless and keep the existing timer
1768
1783
  *
1769
1784
  * Returns:
1770
1785
  * - no direct value
1771
1786
  */
1772
- finish() {
1773
- if (this.finished) return;
1774
- this.finished = true;
1775
- this.started = false;
1787
+ start() {
1788
+ if (this.started && !this.finished) return;
1789
+ this.started = true;
1790
+ this.finished = false;
1776
1791
  this.renderScheduleVersion += 1;
1777
- this.renderScheduled = false;
1778
- this.stopInterval();
1779
- this.clearWindow();
1780
- this.flushBufferedOutput();
1781
- }
1782
- /**
1783
- * Stops the renderer and clears any visible window state.
1784
- *
1785
- * Use when:
1786
- * - cleanup needs to happen from a `finally` block or interrupted run
1787
- *
1788
- * Expects:
1789
- * - callers may invoke it more than once
1790
- *
1791
- * Returns:
1792
- * - no direct value
1793
- */
1794
- dispose() {
1795
- this.finish();
1792
+ if (!this.renderInterval) {
1793
+ this.renderInterval = this.options.createInterval(() => this.schedule(), this.options.intervalMs);
1794
+ this.renderInterval.unref?.();
1795
+ }
1796
1796
  }
1797
1797
  /**
1798
1798
  * Alias for disposal to match Vitest's renderer lifecycle naming.
@@ -1830,6 +1830,20 @@ var WindowRenderer = class {
1830
1830
  }
1831
1831
  this.bufferedOutput += message;
1832
1832
  }
1833
+ clearWindow() {
1834
+ if (!this.options.supportsAnsiWindowing || this.windowHeight === 0) return;
1835
+ this.writeOutput(`${CARRIAGE_RETURN}${CLEAR_LINE}`);
1836
+ for (let rowIndex = 1; rowIndex < this.windowHeight; rowIndex += 1) this.writeOutput(`${CARRIAGE_RETURN}${MOVE_CURSOR_ONE_ROW_UP}${CLEAR_LINE}`);
1837
+ this.windowHeight = 0;
1838
+ }
1839
+ flushBufferedOutput() {
1840
+ if (this.bufferedOutput.length === 0) return;
1841
+ this.writeOutput(this.bufferedOutput);
1842
+ this.bufferedOutput = "";
1843
+ }
1844
+ isActiveWindowMode() {
1845
+ return this.started && !this.finished && this.options.supportsAnsiWindowing;
1846
+ }
1833
1847
  renderWindow() {
1834
1848
  const windowContent = this.options.getWindow();
1835
1849
  const rowCount = getRenderedRowCount(windowContent, this.options.getColumns());
@@ -1847,12 +1861,6 @@ var WindowRenderer = class {
1847
1861
  this.writeOutput("\n");
1848
1862
  this.windowHeight = 0;
1849
1863
  }
1850
- clearWindow() {
1851
- if (!this.options.supportsAnsiWindowing || this.windowHeight === 0) return;
1852
- this.writeOutput(`${CARRIAGE_RETURN}${CLEAR_LINE}`);
1853
- for (let rowIndex = 1; rowIndex < this.windowHeight; rowIndex += 1) this.writeOutput(`${CARRIAGE_RETURN}${MOVE_CURSOR_ONE_ROW_UP}${CLEAR_LINE}`);
1854
- this.windowHeight = 0;
1855
- }
1856
1864
  stopInterval() {
1857
1865
  if (!this.renderInterval) return;
1858
1866
  this.renderInterval.clear();
@@ -1861,14 +1869,6 @@ var WindowRenderer = class {
1861
1869
  writeOutput(message) {
1862
1870
  this.options.writeOutput(message);
1863
1871
  }
1864
- flushBufferedOutput() {
1865
- if (this.bufferedOutput.length === 0) return;
1866
- this.writeOutput(this.bufferedOutput);
1867
- this.bufferedOutput = "";
1868
- }
1869
- isActiveWindowMode() {
1870
- return this.started && !this.finished && this.options.supportsAnsiWindowing;
1871
- }
1872
1872
  };
1873
1873
  function defaultCreateInterval(callback, intervalMs) {
1874
1874
  const timer = globalThis.setInterval(callback, intervalMs);
@@ -1895,40 +1895,6 @@ function getTextDisplayWidth(text) {
1895
1895
  }
1896
1896
  //#endregion
1897
1897
  //#region src/cli/reporters/vitest-compat-reporter.ts
1898
- function isReporterReferenceTuple(reference) {
1899
- return Array.isArray(reference);
1900
- }
1901
- function isAbsoluteLikePath(value) {
1902
- return value.startsWith("/") || value.startsWith("./") || value.startsWith("../") || /^[A-Z]:[\\/]/i.test(value);
1903
- }
1904
- async function loadReporterModule(path) {
1905
- if (isAbsoluteLikePath(path)) return import(pathToFileURL(path).href);
1906
- return import(path);
1907
- }
1908
- function normalizeReporterReference(reference) {
1909
- if (isReporterReferenceTuple(reference)) return {
1910
- options: reference[1],
1911
- value: reference[0]
1912
- };
1913
- return {
1914
- options: void 0,
1915
- value: reference
1916
- };
1917
- }
1918
- function createReporterInstance(moduleValue, options) {
1919
- const value = moduleValue.default ?? moduleValue;
1920
- if (value == null) return null;
1921
- if (typeof value === "function") return new value(options);
1922
- if (typeof value === "object") return value;
1923
- return null;
1924
- }
1925
- async function emitToReporters(reporters, callback) {
1926
- await Promise.all(reporters.map(async (reporter) => {
1927
- try {
1928
- await callback(reporter);
1929
- } catch {}
1930
- }));
1931
- }
1932
1898
  /**
1933
1899
  * Creates a project-level vitest-compatible reporter bridge.
1934
1900
  *
@@ -2020,82 +1986,337 @@ async function createVievalVitestCompatReporterBridge(options) {
2020
1986
  }
2021
1987
  };
2022
1988
  }
2023
- //#endregion
2024
- //#region src/cli/run.ts
2025
- /**
2026
- * Returns true when output contains at least one failing project/task/case outcome.
2027
- */
2028
- function hasRunFailures(output) {
2029
- return output.projects.some((project) => {
2030
- if (project.errorMessage != null) return true;
2031
- if (project.caseSummary != null && (project.caseSummary.failed > 0 || project.caseSummary.timeout > 0)) return true;
2032
- return (project.caseFailures?.length ?? 0) > 0;
2033
- });
2034
- }
2035
- function resolveCappedConcurrency(defaultConcurrency, cliConcurrency, fallback) {
2036
- const effectiveDefault = defaultConcurrency ?? fallback;
2037
- if (cliConcurrency == null) return effectiveDefault;
2038
- return Math.min(effectiveDefault, cliConcurrency);
2039
- }
2040
- function resolveOptionalRuntimeTaskConcurrency(defaultConcurrency, cliConcurrency) {
2041
- return cliConcurrency ?? defaultConcurrency;
1989
+ function createReporterInstance(moduleValue, options) {
1990
+ const value = moduleValue.default ?? moduleValue;
1991
+ if (value == null) return null;
1992
+ if (typeof value === "function") return new value(options);
1993
+ if (typeof value === "object") return value;
1994
+ return null;
2042
1995
  }
2043
- function resolveWorkspaceConcurrency(loadedConfig, options) {
2044
- return resolveCappedConcurrency(loadedConfig.concurrency?.workspace, options.workspaceConcurrency, 1);
1996
+ async function emitToReporters(reporters, callback) {
1997
+ await Promise.all(reporters.map(async (reporter) => {
1998
+ try {
1999
+ await callback(reporter);
2000
+ } catch {}
2001
+ }));
2045
2002
  }
2046
- function resolveProjectConcurrency(project, options) {
2047
- return resolveCappedConcurrency(project.concurrency?.project, options.projectConcurrency, Number.POSITIVE_INFINITY);
2003
+ function isAbsoluteLikePath(value) {
2004
+ return value.startsWith("/") || value.startsWith("./") || value.startsWith("../") || /^[A-Z]:[\\/]/i.test(value);
2048
2005
  }
2049
- function resolveTaskConcurrency(project, options) {
2050
- return resolveCappedConcurrency(project.concurrency?.task, options.taskConcurrency, 1);
2006
+ function isReporterReferenceTuple(reference) {
2007
+ return Array.isArray(reference);
2051
2008
  }
2052
- function resolveScheduledTaskConcurrency(project, options) {
2053
- return Math.min(resolveProjectConcurrency(project, options), resolveTaskConcurrency(project, options));
2009
+ async function loadReporterModule(path) {
2010
+ if (isAbsoluteLikePath(path)) return import(pathToFileURL(path).href);
2011
+ return import(path);
2054
2012
  }
2055
- function resolveRuntimeTaskConcurrency(taskConcurrency, project, options) {
2056
- const attempt = resolveOptionalRuntimeTaskConcurrency(taskConcurrency?.attempt ?? project.concurrency?.attempt, options.attemptConcurrency);
2057
- const caseConcurrency = resolveOptionalRuntimeTaskConcurrency(taskConcurrency?.case ?? project.concurrency?.case, options.caseConcurrency);
2058
- if (attempt == null && caseConcurrency == null) return;
2059
- return {
2060
- attempt,
2061
- case: caseConcurrency
2013
+ function normalizeReporterReference(reference) {
2014
+ if (isReporterReferenceTuple(reference)) return {
2015
+ options: reference[1],
2016
+ value: reference[0]
2062
2017
  };
2063
- }
2064
- function createScheduledTaskWithRuntimeConcurrency(task, project, options) {
2065
- const taskDefinition = task.entry.task;
2066
- if (taskDefinition == null) return task;
2067
- const concurrency = resolveRuntimeTaskConcurrency(taskDefinition.concurrency, project, options);
2068
2018
  return {
2069
- ...task,
2070
- entry: {
2071
- ...task.entry,
2072
- task: {
2073
- ...taskDefinition,
2074
- concurrency
2019
+ options: void 0,
2020
+ value: reference
2021
+ };
2022
+ }
2023
+ //#endregion
2024
+ //#region src/cli/run.ts
2025
+ /**
2026
+ * Formats CLI run output as human-readable lines.
2027
+ */
2028
+ function formatVievalCliRunOutput(output) {
2029
+ const colorEnabled = shouldUseColor();
2030
+ const colors = createColorPalette(colorEnabled);
2031
+ const lines = [];
2032
+ lines.push(` ${colors.dim("RUN")} ${colors.yellow("vieval")}`);
2033
+ lines.push(` ${colors.dim("Config")} ${output.configFilePath ?? "(not found, using defaults)"}`);
2034
+ lines.push("");
2035
+ let passedProjects = 0;
2036
+ let skippedProjects = 0;
2037
+ let failedProjects = 0;
2038
+ let totalTasks = 0;
2039
+ let executedTasks = 0;
2040
+ function formatMatrixSummary(summary) {
2041
+ if (summary == null) return null;
2042
+ const runAxesLabel = summary.runAxes.length === 0 ? "-" : summary.runAxes.join("|");
2043
+ const evalAxesLabel = summary.evalAxes.length === 0 ? "-" : summary.evalAxes.join("|");
2044
+ return `matrix run ${summary.runRows} [${runAxesLabel}] / eval ${summary.evalRows} [${evalAxesLabel}]`;
2045
+ }
2046
+ function formatScheduleBreakdown(project) {
2047
+ const summary = project.matrixSummary;
2048
+ if (summary == null) return null;
2049
+ if (project.taskCount <= 0 || project.entryCount <= 0 || summary.runRows <= 0 || summary.evalRows <= 0) return null;
2050
+ const denominator = project.entryCount * summary.runRows * summary.evalRows;
2051
+ if (denominator <= 0 || project.taskCount % denominator !== 0) return null;
2052
+ const providerCount = project.taskCount / denominator;
2053
+ return [
2054
+ colors.dim("schedule "),
2055
+ colors.yellow(String(project.entryCount)),
2056
+ colors.dim(" entries × "),
2057
+ colors.yellow(String(providerCount)),
2058
+ colors.dim(" inferenceExecutors × "),
2059
+ colors.yellow(String(summary.runRows)),
2060
+ colors.dim(" run rows × "),
2061
+ colors.yellow(String(summary.evalRows)),
2062
+ colors.dim(" eval rows = "),
2063
+ colors.green(String(project.taskCount)),
2064
+ colors.dim(" tasks")
2065
+ ].join("");
2066
+ }
2067
+ for (const project of output.projects) {
2068
+ totalTasks += project.taskCount;
2069
+ executedTasks += project.result?.overall.runCount ?? 0;
2070
+ const badge = createProjectBadge(project.name, colors, colorEnabled);
2071
+ const isFailed = project.errorMessage != null;
2072
+ const hasFailedCases = (project.caseSummary?.failed ?? 0) > 0 || (project.caseSummary?.timeout ?? 0) > 0 || (project.caseFailures?.length ?? 0) > 0;
2073
+ if (isFailed) {
2074
+ failedProjects += 1;
2075
+ lines.push(` ${colors.red("❯")} ${badge}${formatDuration$1(project.durationMs, colors)}`);
2076
+ lines.push(` ${project.errorMessage}`);
2077
+ continue;
2078
+ }
2079
+ if (!project.executed) {
2080
+ skippedProjects += 1;
2081
+ const countLabel = colors.dim(`(${project.taskCount} tasks)`);
2082
+ const detailsLabel = colors.dim(` ${project.discoveredEvalFileCount} files, ${project.entryCount} entries, 0 runs, hybrid n/a`);
2083
+ const matrixSummary = formatMatrixSummary(project.matrixSummary);
2084
+ lines.push(` ${colors.dim("○")} ${badge}${countLabel}${detailsLabel}${formatDuration$1(project.durationMs, colors)}`);
2085
+ if (matrixSummary != null) lines.push(` ${colors.dim(matrixSummary)}`);
2086
+ const scheduleBreakdown = formatScheduleBreakdown(project);
2087
+ if (scheduleBreakdown != null) lines.push(` ${scheduleBreakdown}`);
2088
+ continue;
2089
+ }
2090
+ if (hasFailedCases) failedProjects += 1;
2091
+ else passedProjects += 1;
2092
+ const hybridAverageLabel = formatHybridAverage(project.result?.overall.hybridAverage);
2093
+ const runCount = project.result?.overall.runCount ?? 0;
2094
+ const countLabel = colors.dim(`(${project.taskCount} tasks)`);
2095
+ const caseSummaryLabel = project.caseSummary == null ? "" : `, cases ${project.caseSummary.passed} passed | ${project.caseSummary.failed} failed | ${project.caseSummary.timeout} timeout`;
2096
+ const detailsLabel = colors.dim(` ${project.discoveredEvalFileCount} files, ${project.entryCount} entries, ${runCount} runs${caseSummaryLabel}, hybrid ${hybridAverageLabel}`);
2097
+ const matrixSummary = formatMatrixSummary(project.matrixSummary);
2098
+ lines.push(` ${hasFailedCases ? colors.red("❯") : colors.green("✓")} ${badge}${countLabel}${detailsLabel}${formatDuration$1(project.durationMs, colors)}`);
2099
+ if (matrixSummary != null) lines.push(` ${colors.dim(matrixSummary)}`);
2100
+ const scheduleBreakdown = formatScheduleBreakdown(project);
2101
+ if (scheduleBreakdown != null) lines.push(` ${scheduleBreakdown}`);
2102
+ if ((project.caseFailures?.length ?? 0) > 0) {
2103
+ lines.push(` ${colors.red("Failed cases:")}`);
2104
+ for (const failure of project.caseFailures.slice(0, 5)) {
2105
+ lines.push(` ${colors.red(`- ${failure.caseName} (${failure.taskId})`)}`);
2106
+ for (const line of failure.errorMessage.split("\n")) lines.push(` ${colors.red(line)}`);
2075
2107
  }
2108
+ if (project.caseFailures.length > 5) lines.push(` ${colors.dim(`... ${project.caseFailures.length - 5} more failed cases`)}`);
2109
+ }
2110
+ }
2111
+ lines.push("");
2112
+ if (failedProjects > 0 || skippedProjects > 0) {
2113
+ const summarySegments = [`${colors.green(String(passedProjects))} passed`];
2114
+ if (skippedProjects > 0) summarySegments.push(`${colors.dim(String(skippedProjects))} skipped`);
2115
+ if (failedProjects > 0) summarySegments.push(`${colors.red(String(failedProjects))} failed`);
2116
+ lines.push(` ${colors.dim("Projects")} ${summarySegments.join(" | ")} (${output.projects.length})`);
2117
+ } else lines.push(` ${colors.dim("Projects")} ${colors.green(String(passedProjects))} passed (${output.projects.length})`);
2118
+ lines.push(` ${colors.dim("Tasks")} ${executedTasks} executed / ${totalTasks} scheduled`);
2119
+ return lines.join("\n");
2120
+ }
2121
+ /**
2122
+ * Returns true when output contains at least one failing project/task/case outcome.
2123
+ */
2124
+ function hasRunFailures(output) {
2125
+ return output.projects.some((project) => {
2126
+ if (project.errorMessage != null) return true;
2127
+ if (project.caseSummary != null && (project.caseSummary.failed > 0 || project.caseSummary.timeout > 0)) return true;
2128
+ return (project.caseFailures?.length ?? 0) > 0;
2129
+ });
2130
+ }
2131
+ /**
2132
+ * Runs vieval orchestration from config and returns project-level summaries.
2133
+ *
2134
+ * Call stack:
2135
+ *
2136
+ * {@link runVievalCli}
2137
+ * -> {@link loadVievalCliConfig}
2138
+ * -> {@link discoverEvalFiles}
2139
+ * -> {@link collectEvalEntries}
2140
+ * -> {@link createRunnerSchedule}
2141
+ * -> {@link runScheduledTasks} (optional)
2142
+ *
2143
+ * Use when:
2144
+ * - running eval collection and scheduling from a single command
2145
+ * - keeping business-agent eval files near their implementation packages
2146
+ */
2147
+ async function runVievalCli(options = {}) {
2148
+ const loadedConfig = await loadVievalCliConfig({
2149
+ configFilePath: options.configFilePath,
2150
+ cwd: options.cwd
2151
+ });
2152
+ const telemetry = loadedConfig.reporting?.openTelemetry?.enabled === true ? createOpenTelemetryRuntime() : createNoopTelemetryRuntime();
2153
+ const onOpenTelemetryRunEnd = loadedConfig.reporting?.openTelemetry?.enabled === true ? loadedConfig.reporting.openTelemetry.onRunEnd : void 0;
2154
+ const restoreEnvironment = applyRunEnvironment(loadedConfig.env);
2155
+ let runError;
2156
+ let runEndError;
2157
+ let output;
2158
+ let reporter;
2159
+ try {
2160
+ const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
2161
+ const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
2162
+ const identity = createRunIdentity(options, preparedProjects);
2163
+ const eventRecorder = createEventRecorder(identity);
2164
+ const runReporter = createReporterWithEventCapture(createRunReporter(options.reporter), eventRecorder.record);
2165
+ reporter = runReporter;
2166
+ output = await telemetry.withSpan("vieval.run", {
2167
+ "vieval.attempt.id": identity.attemptId,
2168
+ "vieval.experiment.id": identity.experimentId,
2169
+ "vieval.run.id": identity.runId,
2170
+ "vieval.workspace.id": identity.workspaceId
2171
+ }, async () => {
2172
+ const workspaceScheduler = createSchedulerRuntime({ concurrency: { workspace: resolveWorkspaceConcurrency(loadedConfig, options) } });
2173
+ const executableProjects = preparedProjects.filter((project) => project.kind === "prepared").map((project) => project.prepared);
2174
+ const totalTasks = preparedProjects.reduce((sum, project) => {
2175
+ if (project.kind === "prepared") return sum + project.prepared.tasks.length;
2176
+ return sum + project.summary.taskCount;
2177
+ }, 0);
2178
+ const skippedSummaryTasks = preparedProjects.reduce((sum, project) => {
2179
+ if (project.kind === "summary") return sum + project.summary.taskCount;
2180
+ return sum;
2181
+ }, 0);
2182
+ const reporterCounters = {
2183
+ failedTasks: 0,
2184
+ passedTasks: 0,
2185
+ skippedTasks: 0
2186
+ };
2187
+ runReporter.onRunStart({ totalTasks });
2188
+ for (const project of executableProjects) for (const task of project.tasks) runReporter.onTaskQueued(createTaskQueuePayload(task, project.name));
2189
+ const projectSummaries = (await Promise.all(preparedProjects.map(async (preparedProject, index) => {
2190
+ if (preparedProject.kind === "summary") return {
2191
+ index,
2192
+ summary: preparedProject.summary
2193
+ };
2194
+ return {
2195
+ index,
2196
+ summary: await telemetry.withSpan("vieval.project", {
2197
+ "vieval.project.name": preparedProject.prepared.name,
2198
+ "vieval.run.id": identity.runId
2199
+ }, async () => await workspaceScheduler.runCase({
2200
+ experimentId: identity.experimentId,
2201
+ projectName: preparedProject.prepared.name,
2202
+ scope: "workspace",
2203
+ workspaceId: identity.workspaceId
2204
+ }, async () => executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, telemetry, runReporter, reporterCounters, eventRecorder.record, options)))
2205
+ };
2206
+ }))).sort((left, right) => left.index - right.index).map((item) => item.summary);
2207
+ runReporter.onRunEnd({
2208
+ failedTasks: reporterCounters.failedTasks,
2209
+ passedTasks: reporterCounters.passedTasks,
2210
+ skippedTasks: reporterCounters.skippedTasks + skippedSummaryTasks,
2211
+ totalTasks
2212
+ });
2213
+ const output = {
2214
+ attemptId: identity.attemptId,
2215
+ configFilePath: loadedConfig.configFilePath,
2216
+ experimentId: identity.experimentId,
2217
+ projects: projectSummaries,
2218
+ reportDirectory: null,
2219
+ runId: identity.runId,
2220
+ workspaceId: identity.workspaceId
2221
+ };
2222
+ if (options.reportOut != null) output.reportDirectory = await writeRunReportArtifacts(output, eventRecorder.events, identity, options.reportOut);
2223
+ return output;
2224
+ });
2225
+ } catch (error) {
2226
+ runError = error;
2227
+ } finally {
2228
+ if (onOpenTelemetryRunEnd != null) try {
2229
+ await onOpenTelemetryRunEnd();
2230
+ } catch (error) {
2231
+ if (runError == null) runEndError = error;
2232
+ }
2233
+ reporter?.dispose();
2234
+ restoreEnvironment();
2235
+ }
2236
+ if (runError != null) throw runError;
2237
+ if (runEndError != null) throw runEndError;
2238
+ if (output == null) throw new Error("Vieval run finished without output.");
2239
+ return output;
2240
+ }
2241
+ function applyRunEnvironment(env) {
2242
+ const envEntries = Object.entries(env);
2243
+ if (envEntries.length === 0) return () => {};
2244
+ const snapshot = /* @__PURE__ */ new Map();
2245
+ for (const [key, value] of envEntries) {
2246
+ snapshot.set(key, {
2247
+ existed: Object.hasOwn(process.env, key),
2248
+ value: process.env[key]
2249
+ });
2250
+ if (value == null) {
2251
+ delete process.env[key];
2252
+ continue;
2253
+ }
2254
+ process.env[key] = value;
2255
+ }
2256
+ return () => {
2257
+ for (const [key, previous] of snapshot.entries()) {
2258
+ if (previous.existed) {
2259
+ if (previous.value == null) {
2260
+ delete process.env[key];
2261
+ continue;
2262
+ }
2263
+ process.env[key] = previous.value;
2264
+ continue;
2265
+ }
2266
+ delete process.env[key];
2076
2267
  }
2077
2268
  };
2078
2269
  }
2079
- function resolveCliRuntimeConcurrency(options) {
2080
- if (options.attemptConcurrency == null && options.caseConcurrency == null) return;
2270
+ function cloneScheduledTaskMatrix(task) {
2081
2271
  return {
2082
- attempt: options.attemptConcurrency,
2083
- case: options.caseConcurrency
2272
+ eval: { ...task.matrix.eval },
2273
+ meta: { ...task.matrix.meta },
2274
+ run: { ...task.matrix.run }
2084
2275
  };
2085
2276
  }
2086
- function shouldUseColor() {
2087
- if (process.env.NO_COLOR != null) return false;
2088
- const forceColor = process.env.FORCE_COLOR;
2089
- if (forceColor != null) return forceColor !== "0";
2090
- return process.stdout.isTTY === true;
2277
+ function createAutoTaskExecutor(reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
2278
+ return async (task, context) => {
2279
+ const taskDefinition = task.entry.task;
2280
+ if (taskDefinition == null) throw new Error(`Missing eval task definition for entry "${task.entry.id}".`);
2281
+ const output = await taskDefinition.run({
2282
+ cache: context.cache,
2283
+ models: context.models,
2284
+ reporterHooks: resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
2285
+ task,
2286
+ telemetry: context.telemetry
2287
+ });
2288
+ return {
2289
+ entryId: task.entry.id,
2290
+ id: task.id,
2291
+ inferenceExecutorId: task.inferenceExecutor.id,
2292
+ matrix: task.matrix,
2293
+ scores: [...output.scores]
2294
+ };
2295
+ };
2296
+ }
2297
+ function createCliTaskExecutionContext(task, models, cacheRootDirectory, cacheProjectName, workspaceId, telemetry, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, runtimeConcurrency, vitestCompatReporter) {
2298
+ return {
2299
+ ...createTaskExecutionContext({
2300
+ cache: createFilesystemTaskCacheRuntime({
2301
+ cacheRootDirectory,
2302
+ projectName: cacheProjectName,
2303
+ workspaceId
2304
+ }),
2305
+ models,
2306
+ task
2307
+ }),
2308
+ reporterHooks: createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
2309
+ runtimeConcurrency,
2310
+ telemetry
2311
+ };
2091
2312
  }
2092
2313
  function createColorPalette(enabled) {
2093
2314
  if (!enabled) return {
2094
- black: (value) => value,
2095
2315
  bgCyan: (value) => value,
2096
2316
  bgGreen: (value) => value,
2097
2317
  bgMagenta: (value) => value,
2098
2318
  bgYellow: (value) => value,
2319
+ black: (value) => value,
2099
2320
  dim: (value) => value,
2100
2321
  gray: (value) => value,
2101
2322
  green: (value) => value,
@@ -2103,11 +2324,11 @@ function createColorPalette(enabled) {
2103
2324
  yellow: (value) => value
2104
2325
  };
2105
2326
  return {
2106
- black: (value) => c.black(value),
2107
2327
  bgCyan: (value) => c.bgCyan(value),
2108
2328
  bgGreen: (value) => c.bgGreen(value),
2109
2329
  bgMagenta: (value) => c.bgMagenta(value),
2110
2330
  bgYellow: (value) => c.bgYellow(value),
2331
+ black: (value) => c.black(value),
2111
2332
  dim: (value) => c.dim(value),
2112
2333
  gray: (value) => c.gray(value),
2113
2334
  green: (value) => c.green(value),
@@ -2115,67 +2336,6 @@ function createColorPalette(enabled) {
2115
2336
  yellow: (value) => c.yellow(value)
2116
2337
  };
2117
2338
  }
2118
- function createProjectBadge(name, colors, colorEnabled) {
2119
- if (!colorEnabled || !c.isColorSupported) return `|${name}| `;
2120
- const labelColorPool = [
2121
- colors.bgYellow,
2122
- colors.bgCyan,
2123
- colors.bgGreen,
2124
- colors.bgMagenta
2125
- ];
2126
- const background = labelColorPool[name.split("").reduce((accumulator, char, index) => accumulator + char.charCodeAt(0) + index, 0) % labelColorPool.length];
2127
- return `${colors.black(background(` ${name} `))} `;
2128
- }
2129
- function formatDuration$1(durationMs, colors) {
2130
- if (durationMs == null) return "";
2131
- const rounded = Math.round(durationMs);
2132
- return (rounded > 1e3 ? colors.yellow : colors.green)(` ${rounded}${colors.dim("ms")}`);
2133
- }
2134
- function formatHybridAverage(hybridAverage) {
2135
- if (hybridAverage == null) return "n/a";
2136
- return hybridAverage.toFixed(3).replace(/\.?0+$/, "");
2137
- }
2138
- function filterProjectsByName(projects, names) {
2139
- if (names.length === 0) return [...projects];
2140
- const nameSet = new Set(names);
2141
- return projects.filter((project) => nameSet.has(project.name));
2142
- }
2143
- function sanitizeIdentitySegment(value) {
2144
- const normalized = value.trim();
2145
- if (normalized.length === 0) return "default";
2146
- return normalized.replace(/[^\w.-]+/g, "-");
2147
- }
2148
- function createExperimentMatrixRows(tasks) {
2149
- const rows = /* @__PURE__ */ new Set();
2150
- for (const task of tasks) {
2151
- const runRowId = task.matrix.meta.runRowId;
2152
- const evalRowId = task.matrix.meta.evalRowId;
2153
- if (runRowId !== "default" && evalRowId !== "default") {
2154
- rows.add(`run:${runRowId}+eval:${evalRowId}`);
2155
- continue;
2156
- }
2157
- if (runRowId !== "default") rows.add(`run:${runRowId}`);
2158
- if (evalRowId !== "default") rows.add(`eval:${evalRowId}`);
2159
- }
2160
- return [...rows].sort((left, right) => left.localeCompare(right));
2161
- }
2162
- function resolveExperimentId(options, preparedProjects) {
2163
- if (options.experiment != null) return sanitizeIdentitySegment(options.experiment);
2164
- const matrixRows = /* @__PURE__ */ new Set();
2165
- for (const project of preparedProjects) project.experimentMatrixRows.forEach((row) => matrixRows.add(row));
2166
- if (matrixRows.size === 0) return "default-experiment";
2167
- return sanitizeIdentitySegment(`matrix-${[...matrixRows].sort().join("--")}`);
2168
- }
2169
- function createRunIdentity(options, preparedProjects) {
2170
- const workspaceId = sanitizeIdentitySegment(options.workspace ?? "default-workspace");
2171
- const experimentId = resolveExperimentId(options, preparedProjects);
2172
- return {
2173
- attemptId: sanitizeIdentitySegment(options.attempt ?? `attempt-${(/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-")}`),
2174
- experimentId,
2175
- runId: `run-${Date.now()}-${randomUUID().slice(0, 8)}`,
2176
- workspaceId
2177
- };
2178
- }
2179
2339
  function createEventRecorder(identity) {
2180
2340
  const events = [];
2181
2341
  const taskProjectMap = /* @__PURE__ */ new Map();
@@ -2204,6 +2364,50 @@ function createEventRecorder(identity) {
2204
2364
  }
2205
2365
  };
2206
2366
  }
2367
+ function createExperimentMatrixRows(tasks) {
2368
+ const rows = /* @__PURE__ */ new Set();
2369
+ for (const task of tasks) {
2370
+ const runRowId = task.matrix.meta.runRowId;
2371
+ const evalRowId = task.matrix.meta.evalRowId;
2372
+ if (runRowId !== "default" && evalRowId !== "default") {
2373
+ rows.add(`run:${runRowId}+eval:${evalRowId}`);
2374
+ continue;
2375
+ }
2376
+ if (runRowId !== "default") rows.add(`run:${runRowId}`);
2377
+ if (evalRowId !== "default") rows.add(`eval:${evalRowId}`);
2378
+ }
2379
+ return [...rows].sort((left, right) => left.localeCompare(right));
2380
+ }
2381
+ function createProjectBadge(name, colors, colorEnabled) {
2382
+ if (!colorEnabled || !c.isColorSupported) return `|${name}| `;
2383
+ const labelColorPool = [
2384
+ colors.bgYellow,
2385
+ colors.bgCyan,
2386
+ colors.bgGreen,
2387
+ colors.bgMagenta
2388
+ ];
2389
+ const background = labelColorPool[name.split("").reduce((accumulator, char, index) => accumulator + char.charCodeAt(0) + index, 0) % labelColorPool.length];
2390
+ return `${colors.black(background(` ${name} `))} `;
2391
+ }
2392
+ function createProjectMatrixSummary(tasks) {
2393
+ if (tasks.length === 0) return null;
2394
+ const runAxes = /* @__PURE__ */ new Set();
2395
+ const evalAxes = /* @__PURE__ */ new Set();
2396
+ const runRows = /* @__PURE__ */ new Set();
2397
+ const evalRows = /* @__PURE__ */ new Set();
2398
+ for (const task of tasks) {
2399
+ Object.keys(task.matrix.run).forEach((axis) => runAxes.add(axis));
2400
+ Object.keys(task.matrix.eval).forEach((axis) => evalAxes.add(axis));
2401
+ runRows.add(task.matrix.meta.runRowId);
2402
+ evalRows.add(task.matrix.meta.evalRowId);
2403
+ }
2404
+ return {
2405
+ evalAxes: [...evalAxes].sort(),
2406
+ evalRows: evalRows.size,
2407
+ runAxes: [...runAxes].sort(),
2408
+ runRows: runRows.size
2409
+ };
2410
+ }
2207
2411
  function createReporterWithEventCapture(reporter, recordEvent) {
2208
2412
  return {
2209
2413
  dispose() {
@@ -2239,38 +2443,16 @@ function createReporterWithEventCapture(reporter, recordEvent) {
2239
2443
  }
2240
2444
  };
2241
2445
  }
2242
- function applyRunEnvironment(env) {
2243
- const envEntries = Object.entries(env);
2244
- if (envEntries.length === 0) return () => {};
2245
- const snapshot = /* @__PURE__ */ new Map();
2246
- for (const [key, value] of envEntries) {
2247
- snapshot.set(key, {
2248
- existed: Object.hasOwn(process.env, key),
2249
- value: process.env[key]
2250
- });
2251
- if (value == null) {
2252
- delete process.env[key];
2253
- continue;
2254
- }
2255
- process.env[key] = value;
2256
- }
2257
- return () => {
2258
- for (const [key, previous] of snapshot.entries()) {
2259
- if (previous.existed) {
2260
- if (previous.value == null) {
2261
- delete process.env[key];
2262
- continue;
2263
- }
2264
- process.env[key] = previous.value;
2265
- continue;
2266
- }
2267
- delete process.env[key];
2268
- }
2446
/**
 * Derives the identifiers attached to one CLI run.
 *
 * - `workspaceId`: sanitized `options.workspace`, or "default-workspace".
 * - `experimentId`: delegated to `resolveExperimentId`.
 * - `attemptId`: sanitized `options.attempt`, or `attempt-<ISO timestamp>`
 *   with `:` and `.` replaced by `-`.
 * - `runId`: `run-<epoch ms>-<first 8 chars of a random UUID>`.
 *
 * @param options CLI run options (`workspace`, `attempt`, plus whatever `resolveExperimentId` reads).
 * @param preparedProjects Prepared project descriptors forwarded to `resolveExperimentId`.
 */
function createRunIdentity(options, preparedProjects) {
  const workspaceId = sanitizeIdentitySegment(options.workspace ?? "default-workspace");
  const experimentId = resolveExperimentId(options, preparedProjects);
  // Only build the timestamp-based fallback when no attempt name was given.
  const timestampedAttempt = () => `attempt-${(/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-")}`;
  const attemptId = sanitizeIdentitySegment(options.attempt ?? timestampedAttempt());
  const runId = `run-${Date.now()}-${randomUUID().slice(0, 8)}`;
  return {
    attemptId,
    experimentId,
    runId,
    workspaceId
  };
}
2271
- function isSummaryReporter(reporter) {
2272
- return "getWindowRows" in reporter;
2273
- }
2274
2456
  function createRunReporter(options) {
2275
2457
  const getRows = options?.getRows ?? (() => process.stdout.rows);
2276
2458
  const reporter = createCliReporter({
@@ -2342,21 +2524,23 @@ function createRunReporter(options) {
2342
2524
  }
2343
2525
  };
2344
2526
  }
2345
- /**
2346
- * Normalizes terminal row count into the live reporter window height.
2347
- *
2348
- * Before:
2349
- * - undefined
2350
- * - 4
2351
- * - 40
2352
- *
2353
- * After:
2354
- * - 23
2355
- * - 6
2356
- * - 39
2357
- */
2358
- function normalizeLiveReporterMaxRows(rows) {
2359
- return Math.max(6, (rows == null || !Number.isFinite(rows) || rows <= 0 ? 24 : Math.floor(rows)) - 1);
2527
/**
 * Returns a copy of a scheduled task whose task definition carries the
 * concurrency resolved for this run.
 *
 * Tasks without a task definition (`task.entry.task` nullish) are returned
 * unchanged. The input objects are never mutated; `task`, `task.entry` and
 * the task definition are shallow-copied.
 *
 * @param task Scheduled task.
 * @param project Project config forwarded to `resolveRuntimeTaskConcurrency`.
 * @param options CLI options forwarded to `resolveRuntimeTaskConcurrency`.
 */
function createScheduledTaskWithRuntimeConcurrency(task, project, options) {
  const definition = task.entry.task;
  if (definition == null) return task;
  const concurrency = resolveRuntimeTaskConcurrency(definition.concurrency, project, options);
  const runtimeDefinition = { ...definition, concurrency };
  const runtimeEntry = { ...task.entry, task: runtimeDefinition };
  return { ...task, entry: runtimeEntry };
}
2542
/**
 * Builds the reporter id of a case as `<index>:<URI-encoded name>`.
 *
 * @param payload Object exposing `index` and `name`.
 * @returns {string}
 */
function createTaskCaseReporterId(payload) {
  const { index, name } = payload;
  const encodedName = encodeURIComponent(name);
  return `${index}:${encodedName}`;
}
2361
2545
  function createTaskQueuePayload(task, projectName) {
2362
2546
  return {
@@ -2365,9 +2549,6 @@ function createTaskQueuePayload(task, projectName) {
2365
2549
  taskId: task.id
2366
2550
  };
2367
2551
  }
2368
- function createTaskCaseReporterId(payload) {
2369
- return `${payload.index}:${encodeURIComponent(payload.name)}`;
2370
- }
2371
2552
  function createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
2372
2553
  function syncCaseTotal(total) {
2373
2554
  reporter.onTaskQueued({
@@ -2415,8 +2596,8 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
2415
2596
  reporter.onCaseStart({
2416
2597
  autoRetry: payload.autoRetry,
2417
2598
  caseId,
2418
- input: payload.input,
2419
2599
  caseName: payload.name,
2600
+ input: payload.input,
2420
2601
  retryIndex: payload.retryIndex,
2421
2602
  taskId: task.id
2422
2603
  });
@@ -2434,144 +2615,6 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
2434
2615
  }
2435
2616
  };
2436
2617
  }
2437
- function createCliTaskExecutionContext(task, models, cacheRootDirectory, cacheProjectName, workspaceId, telemetry, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, runtimeConcurrency, vitestCompatReporter) {
2438
- return {
2439
- ...createTaskExecutionContext({
2440
- cache: createFilesystemTaskCacheRuntime({
2441
- cacheRootDirectory,
2442
- projectName: cacheProjectName,
2443
- workspaceId
2444
- }),
2445
- models,
2446
- task
2447
- }),
2448
- reporterHooks: createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
2449
- runtimeConcurrency,
2450
- telemetry
2451
- };
2452
- }
2453
- function resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
2454
- return context.reporterHooks ?? createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
2455
- }
2456
- function getFailedTaskId(error) {
2457
- if (error instanceof RunnerExecutionError) return error.taskId;
2458
- return null;
2459
- }
2460
- function createAutoTaskExecutor(reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
2461
- return async (task, context) => {
2462
- const taskDefinition = task.entry.task;
2463
- if (taskDefinition == null) throw new Error(`Missing eval task definition for entry "${task.entry.id}".`);
2464
- const output = await taskDefinition.run({
2465
- cache: context.cache,
2466
- models: context.models,
2467
- reporterHooks: resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
2468
- task,
2469
- telemetry: context.telemetry
2470
- });
2471
- return {
2472
- entryId: task.entry.id,
2473
- id: task.id,
2474
- matrix: task.matrix,
2475
- inferenceExecutorId: task.inferenceExecutor.id,
2476
- scores: [...output.scores]
2477
- };
2478
- };
2479
- }
2480
- function cloneScheduledTaskMatrix(task) {
2481
- return {
2482
- eval: { ...task.matrix.eval },
2483
- meta: { ...task.matrix.meta },
2484
- run: { ...task.matrix.run }
2485
- };
2486
- }
2487
- function createProjectMatrixSummary(tasks) {
2488
- if (tasks.length === 0) return null;
2489
- const runAxes = /* @__PURE__ */ new Set();
2490
- const evalAxes = /* @__PURE__ */ new Set();
2491
- const runRows = /* @__PURE__ */ new Set();
2492
- const evalRows = /* @__PURE__ */ new Set();
2493
- for (const task of tasks) {
2494
- Object.keys(task.matrix.run).forEach((axis) => runAxes.add(axis));
2495
- Object.keys(task.matrix.eval).forEach((axis) => evalAxes.add(axis));
2496
- runRows.add(task.matrix.meta.runRowId);
2497
- evalRows.add(task.matrix.meta.evalRowId);
2498
- }
2499
- return {
2500
- evalAxes: [...evalAxes].sort(),
2501
- evalRows: evalRows.size,
2502
- runAxes: [...runAxes].sort(),
2503
- runRows: runRows.size
2504
- };
2505
- }
2506
- async function prepareProject(project) {
2507
- const startedAt = Date.now();
2508
- try {
2509
- const runtimeContext = await createRunnerRuntimeContext({
2510
- cwd: project.root,
2511
- fallbackProjectRootDirectory: project.root
2512
- });
2513
- const evalFilePaths = await discoverEvalFiles({
2514
- exclude: project.exclude,
2515
- include: project.include,
2516
- root: project.root
2517
- });
2518
- const entries = collectEvalEntries(await loadEvalModulesWithVitestRuntime(evalFilePaths, project.root), runtimeContext);
2519
- const tasks = createRunnerSchedule({
2520
- evalMatrix: project.evalMatrix,
2521
- entries,
2522
- inferenceExecutors: project.inferenceExecutors,
2523
- runMatrix: project.runMatrix
2524
- });
2525
- const canAutoExecuteEntryTasks = entries.some((entry) => entry.task != null) && project.models.length > 0;
2526
- if (project.executor == null && !canAutoExecuteEntryTasks) return {
2527
- experimentMatrixRows: createExperimentMatrixRows(tasks),
2528
- kind: "summary",
2529
- summary: {
2530
- caseSummary: null,
2531
- caseFailures: [],
2532
- discoveredEvalFileCount: evalFilePaths.length,
2533
- durationMs: Date.now() - startedAt,
2534
- entryCount: entries.length,
2535
- errorMessage: null,
2536
- executed: false,
2537
- matrixSummary: createProjectMatrixSummary(tasks),
2538
- name: project.name,
2539
- result: null,
2540
- taskCount: tasks.length
2541
- }
2542
- };
2543
- return {
2544
- experimentMatrixRows: createExperimentMatrixRows(tasks),
2545
- kind: "prepared",
2546
- prepared: {
2547
- discoveredEvalFileCount: evalFilePaths.length,
2548
- entryCount: entries.length,
2549
- name: project.name,
2550
- project,
2551
- startedAt,
2552
- tasks
2553
- }
2554
- };
2555
- } catch (error) {
2556
- return {
2557
- experimentMatrixRows: [],
2558
- kind: "summary",
2559
- summary: {
2560
- caseSummary: null,
2561
- caseFailures: [],
2562
- discoveredEvalFileCount: 0,
2563
- durationMs: Date.now() - startedAt,
2564
- entryCount: 0,
2565
- errorMessage: errorMessageFrom(error) ?? "Unknown project execution error.",
2566
- executed: false,
2567
- matrixSummary: null,
2568
- name: project.name,
2569
- result: null,
2570
- taskCount: 0
2571
- }
2572
- };
2573
- }
2574
- }
2575
2618
  async function executePreparedProject(prepared, identity, cacheProjectName, telemetry, reporter, counters, recordEvent, options) {
2576
2619
  const settledTaskIds = /* @__PURE__ */ new Set();
2577
2620
  const projectCaseCounters = {
@@ -2607,6 +2650,7 @@ async function executePreparedProject(prepared, identity, cacheProjectName, tele
2607
2650
  createExecutionContext(task) {
2608
2651
  return createCliTaskExecutionContext(task, prepared.project.models, resolve(prepared.project.root, ".vieval", "cache"), cacheProjectName ?? prepared.name, identity.workspaceId, telemetry, reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, resolveCliRuntimeConcurrency(options), vitestCompatReporter);
2609
2652
  },
2653
+ maxConcurrency: resolveScheduledTaskConcurrency(prepared.project, options),
2610
2654
  onTaskEnd(task, state) {
2611
2655
  settledTaskIds.add(task.id);
2612
2656
  reporter.onTaskEnd({
@@ -2626,11 +2670,11 @@ async function executePreparedProject(prepared, identity, cacheProjectName, tele
2626
2670
  onTaskStart(task) {
2627
2671
  reporter.onTaskStart({ taskId: task.id });
2628
2672
  vitestCompatReporter?.onTaskStart({ taskId: task.id });
2629
- },
2630
- maxConcurrency: resolveScheduledTaskConcurrency(prepared.project, options)
2673
+ }
2631
2674
  });
2632
2675
  await vitestCompatReporter?.onRunEnd({ failed: false });
2633
2676
  return {
2677
+ caseFailures: projectCaseFailures,
2634
2678
  caseSummary: {
2635
2679
  failed: projectCaseCounters.failed,
2636
2680
  passed: projectCaseCounters.passed,
@@ -2638,7 +2682,6 @@ async function executePreparedProject(prepared, identity, cacheProjectName, tele
2638
2682
  timeout: projectCaseCounters.timeout,
2639
2683
  total: projectCaseCounters.seenCaseIds.size
2640
2684
  },
2641
- caseFailures: projectCaseFailures,
2642
2685
  discoveredEvalFileCount: prepared.discoveredEvalFileCount,
2643
2686
  durationMs: Date.now() - prepared.startedAt,
2644
2687
  entryCount: prepared.entryCount,
@@ -2678,6 +2721,7 @@ async function executePreparedProject(prepared, identity, cacheProjectName, tele
2678
2721
  }
2679
2722
  await vitestCompatReporter?.onRunEnd({ failed: true });
2680
2723
  return {
2724
+ caseFailures: projectCaseFailures,
2681
2725
  caseSummary: {
2682
2726
  failed: projectCaseCounters.failed,
2683
2727
  passed: projectCaseCounters.passed,
@@ -2685,7 +2729,6 @@ async function executePreparedProject(prepared, identity, cacheProjectName, tele
2685
2729
  timeout: projectCaseCounters.timeout,
2686
2730
  total: projectCaseCounters.seenCaseIds.size
2687
2731
  },
2688
- caseFailures: projectCaseFailures,
2689
2732
  discoveredEvalFileCount: prepared.discoveredEvalFileCount,
2690
2733
  durationMs: Date.now() - prepared.startedAt,
2691
2734
  entryCount: prepared.entryCount,
@@ -2698,211 +2741,168 @@ async function executePreparedProject(prepared, identity, cacheProjectName, tele
2698
2741
  };
2699
2742
  }
2700
2743
  }
2744
/**
 * Selects the projects whose `name` appears in `names`.
 *
 * An empty `names` list means "no filter": a shallow copy of every project is
 * returned. The result is always a new array in the original project order.
 *
 * @param projects Project list.
 * @param names Requested project names.
 */
function filterProjectsByName(projects, names) {
  if (names.length < 1) return Array.from(projects);
  const wanted = new Set(names);
  return projects.filter(({ name }) => wanted.has(name));
}
2749
/**
 * Formats a duration as a colored ` <n>ms` suffix.
 *
 * The value is rounded to whole milliseconds. Durations above 1000 ms are
 * painted with `colors.yellow`, all others with `colors.green`; the unit is
 * passed through `colors.dim`.
 *
 * @param durationMs Duration in milliseconds, or nullish when unknown.
 * @param colors Palette exposing `yellow`, `green` and `dim`.
 * @returns Empty string when `durationMs` is nullish.
 */
function formatDuration$1(durationMs, colors) {
  if (durationMs == null) return "";
  const wholeMs = Math.round(durationMs);
  const paint = wholeMs > 1000 ? colors.yellow : colors.green;
  const unit = colors.dim("ms");
  return paint(` ${wholeMs}${unit}`);
}
2754
/**
 * Formats a hybrid average score with at most three decimals and no trailing
 * zeros.
 *
 * @param {number | null | undefined} hybridAverage Score to format.
 * @returns {string} "n/a" when the score is nullish, otherwise the trimmed
 *   fixed-point representation (e.g. 0.5 -> "0.5", 1 -> "1").
 */
function formatHybridAverage(hybridAverage) {
  if (hybridAverage == null) return "n/a";
  const fixed = hybridAverage.toFixed(3);
  // For magnitudes >= 1e21 `toFixed` falls back to exponent notation; trimming
  // "trailing zeros" there would eat exponent digits ("1e+30" -> "1e+3").
  if (fixed.includes("e")) return fixed;
  const trimmed = fixed.replace(/\.?0+$/, "");
  // Tiny negatives round to "-0.000"; report them as plain zero.
  return trimmed === "-0" ? "0" : trimmed;
}
2758
/**
 * Extracts the task id from a runner failure.
 *
 * @param error Any thrown value.
 * @returns `error.taskId` when `error` is a `RunnerExecutionError`, otherwise `null`.
 */
function getFailedTaskId(error) {
  return error instanceof RunnerExecutionError ? error.taskId : null;
}
2762
/**
 * Tells whether a reporter exposes a `getWindowRows` member (own or inherited).
 *
 * @param reporter Reporter object; non-objects raise a TypeError.
 * @returns {boolean}
 */
function isSummaryReporter(reporter) {
  return Reflect.has(reporter, "getWindowRows");
}
2701
2765
  /**
2702
- * Runs vieval orchestration from config and returns project-level summaries.
2703
- *
2704
- * Call stack:
2766
+ * Normalizes terminal row count into the live reporter window height.
2705
2767
  *
2706
- * {@link runVievalCli}
2707
- * -> {@link loadVievalCliConfig}
2708
- * -> {@link discoverEvalFiles}
2709
- * -> {@link collectEvalEntries}
2710
- * -> {@link createRunnerSchedule}
2711
- * -> {@link runScheduledTasks} (optional)
2768
+ * Before:
2769
+ * - undefined
2770
+ * - 4
2771
+ * - 40
2712
2772
  *
2713
- * Use when:
2714
- * - running eval collection and scheduling from a single command
2715
- * - keeping business-agent eval files near their implementation packages
2773
+ * After:
2774
+ * - 23
2775
+ * - 6
2776
+ * - 39
2716
2777
  */
2717
- async function runVievalCli(options = {}) {
2718
- const loadedConfig = await loadVievalCliConfig({
2719
- configFilePath: options.configFilePath,
2720
- cwd: options.cwd
2721
- });
2722
- const telemetry = loadedConfig.reporting?.openTelemetry?.enabled === true ? createOpenTelemetryRuntime() : createNoopTelemetryRuntime();
2723
- const onOpenTelemetryRunEnd = loadedConfig.reporting?.openTelemetry?.enabled === true ? loadedConfig.reporting.openTelemetry.onRunEnd : void 0;
2724
- const restoreEnvironment = applyRunEnvironment(loadedConfig.env);
2725
- let runError;
2726
- let runEndError;
2727
- let output;
2728
- let reporter;
2778
function normalizeLiveReporterMaxRows(rows) {
  // Missing, non-finite or non-positive row counts fall back to 24 rows.
  const hasUsableRows = rows != null && Number.isFinite(rows) && rows > 0;
  const terminalRows = hasUsableRows ? Math.floor(rows) : 24;
  // One row is subtracted from the terminal height; never go below 6 rows.
  return Math.max(6, terminalRows - 1);
}
2781
+ async function prepareProject(project) {
2782
+ const startedAt = Date.now();
2729
2783
  try {
2730
- const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
2731
- const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
2732
- const identity = createRunIdentity(options, preparedProjects);
2733
- const eventRecorder = createEventRecorder(identity);
2734
- const runReporter = createReporterWithEventCapture(createRunReporter(options.reporter), eventRecorder.record);
2735
- reporter = runReporter;
2736
- output = await telemetry.withSpan("vieval.run", {
2737
- "vieval.attempt.id": identity.attemptId,
2738
- "vieval.experiment.id": identity.experimentId,
2739
- "vieval.run.id": identity.runId,
2740
- "vieval.workspace.id": identity.workspaceId
2741
- }, async () => {
2742
- const workspaceScheduler = createSchedulerRuntime({ concurrency: { workspace: resolveWorkspaceConcurrency(loadedConfig, options) } });
2743
- const executableProjects = preparedProjects.filter((project) => project.kind === "prepared").map((project) => project.prepared);
2744
- const totalTasks = preparedProjects.reduce((sum, project) => {
2745
- if (project.kind === "prepared") return sum + project.prepared.tasks.length;
2746
- return sum + project.summary.taskCount;
2747
- }, 0);
2748
- const skippedSummaryTasks = preparedProjects.reduce((sum, project) => {
2749
- if (project.kind === "summary") return sum + project.summary.taskCount;
2750
- return sum;
2751
- }, 0);
2752
- const reporterCounters = {
2753
- failedTasks: 0,
2754
- passedTasks: 0,
2755
- skippedTasks: 0
2756
- };
2757
- runReporter.onRunStart({ totalTasks });
2758
- for (const project of executableProjects) for (const task of project.tasks) runReporter.onTaskQueued(createTaskQueuePayload(task, project.name));
2759
- const projectSummaries = (await Promise.all(preparedProjects.map(async (preparedProject, index) => {
2760
- if (preparedProject.kind === "summary") return {
2761
- index,
2762
- summary: preparedProject.summary
2763
- };
2764
- return {
2765
- index,
2766
- summary: await telemetry.withSpan("vieval.project", {
2767
- "vieval.project.name": preparedProject.prepared.name,
2768
- "vieval.run.id": identity.runId
2769
- }, async () => await workspaceScheduler.runCase({
2770
- experimentId: identity.experimentId,
2771
- projectName: preparedProject.prepared.name,
2772
- scope: "workspace",
2773
- workspaceId: identity.workspaceId
2774
- }, async () => executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, telemetry, runReporter, reporterCounters, eventRecorder.record, options)))
2775
- };
2776
- }))).sort((left, right) => left.index - right.index).map((item) => item.summary);
2777
- runReporter.onRunEnd({
2778
- failedTasks: reporterCounters.failedTasks,
2779
- passedTasks: reporterCounters.passedTasks,
2780
- skippedTasks: reporterCounters.skippedTasks + skippedSummaryTasks,
2781
- totalTasks
2782
- });
2783
- const output = {
2784
- attemptId: identity.attemptId,
2785
- configFilePath: loadedConfig.configFilePath,
2786
- experimentId: identity.experimentId,
2787
- projects: projectSummaries,
2788
- reportDirectory: null,
2789
- runId: identity.runId,
2790
- workspaceId: identity.workspaceId
2791
- };
2792
- if (options.reportOut != null) output.reportDirectory = await writeRunReportArtifacts(output, eventRecorder.events, identity, options.reportOut);
2793
- return output;
2784
+ const runtimeContext = await createRunnerRuntimeContext({
2785
+ cwd: project.root,
2786
+ fallbackProjectRootDirectory: project.root
2787
+ });
2788
+ const evalFilePaths = await discoverEvalFiles({
2789
+ exclude: project.exclude,
2790
+ include: project.include,
2791
+ root: project.root
2792
+ });
2793
+ const entries = collectEvalEntries(await loadEvalModulesWithVitestRuntime(evalFilePaths, project.root), runtimeContext);
2794
+ const tasks = createRunnerSchedule({
2795
+ entries,
2796
+ evalMatrix: project.evalMatrix,
2797
+ inferenceExecutors: project.inferenceExecutors,
2798
+ runMatrix: project.runMatrix
2794
2799
  });
2800
+ const canAutoExecuteEntryTasks = entries.some((entry) => entry.task != null) && project.models.length > 0;
2801
+ if (project.executor == null && !canAutoExecuteEntryTasks) return {
2802
+ experimentMatrixRows: createExperimentMatrixRows(tasks),
2803
+ kind: "summary",
2804
+ summary: {
2805
+ caseFailures: [],
2806
+ caseSummary: null,
2807
+ discoveredEvalFileCount: evalFilePaths.length,
2808
+ durationMs: Date.now() - startedAt,
2809
+ entryCount: entries.length,
2810
+ errorMessage: null,
2811
+ executed: false,
2812
+ matrixSummary: createProjectMatrixSummary(tasks),
2813
+ name: project.name,
2814
+ result: null,
2815
+ taskCount: tasks.length
2816
+ }
2817
+ };
2818
+ return {
2819
+ experimentMatrixRows: createExperimentMatrixRows(tasks),
2820
+ kind: "prepared",
2821
+ prepared: {
2822
+ discoveredEvalFileCount: evalFilePaths.length,
2823
+ entryCount: entries.length,
2824
+ name: project.name,
2825
+ project,
2826
+ startedAt,
2827
+ tasks
2828
+ }
2829
+ };
2795
2830
  } catch (error) {
2796
- runError = error;
2797
- } finally {
2798
- if (onOpenTelemetryRunEnd != null) try {
2799
- await onOpenTelemetryRunEnd();
2800
- } catch (error) {
2801
- if (runError == null) runEndError = error;
2802
- }
2803
- reporter?.dispose();
2804
- restoreEnvironment();
2805
- }
2806
- if (runError != null) throw runError;
2807
- if (runEndError != null) throw runEndError;
2808
- if (output == null) throw new Error("Vieval run finished without output.");
2809
- return output;
2810
- }
2811
- /**
2812
- * Formats CLI run output as human-readable lines.
2813
- */
2814
- function formatVievalCliRunOutput(output) {
2815
- const colorEnabled = shouldUseColor();
2816
- const colors = createColorPalette(colorEnabled);
2817
- const lines = [];
2818
- lines.push(` ${colors.dim("RUN")} ${colors.yellow("vieval")}`);
2819
- lines.push(` ${colors.dim("Config")} ${output.configFilePath ?? "(not found, using defaults)"}`);
2820
- lines.push("");
2821
- let passedProjects = 0;
2822
- let skippedProjects = 0;
2823
- let failedProjects = 0;
2824
- let totalTasks = 0;
2825
- let executedTasks = 0;
2826
- function formatMatrixSummary(summary) {
2827
- if (summary == null) return null;
2828
- const runAxesLabel = summary.runAxes.length === 0 ? "-" : summary.runAxes.join("|");
2829
- const evalAxesLabel = summary.evalAxes.length === 0 ? "-" : summary.evalAxes.join("|");
2830
- return `matrix run ${summary.runRows} [${runAxesLabel}] / eval ${summary.evalRows} [${evalAxesLabel}]`;
2831
- }
2832
- function formatScheduleBreakdown(project) {
2833
- const summary = project.matrixSummary;
2834
- if (summary == null) return null;
2835
- if (project.taskCount <= 0 || project.entryCount <= 0 || summary.runRows <= 0 || summary.evalRows <= 0) return null;
2836
- const denominator = project.entryCount * summary.runRows * summary.evalRows;
2837
- if (denominator <= 0 || project.taskCount % denominator !== 0) return null;
2838
- const providerCount = project.taskCount / denominator;
2839
- return [
2840
- colors.dim("schedule "),
2841
- colors.yellow(String(project.entryCount)),
2842
- colors.dim(" entries × "),
2843
- colors.yellow(String(providerCount)),
2844
- colors.dim(" inferenceExecutors × "),
2845
- colors.yellow(String(summary.runRows)),
2846
- colors.dim(" run rows × "),
2847
- colors.yellow(String(summary.evalRows)),
2848
- colors.dim(" eval rows = "),
2849
- colors.green(String(project.taskCount)),
2850
- colors.dim(" tasks")
2851
- ].join("");
2852
- }
2853
- for (const project of output.projects) {
2854
- totalTasks += project.taskCount;
2855
- executedTasks += project.result?.overall.runCount ?? 0;
2856
- const badge = createProjectBadge(project.name, colors, colorEnabled);
2857
- const isFailed = project.errorMessage != null;
2858
- const hasFailedCases = (project.caseSummary?.failed ?? 0) > 0 || (project.caseSummary?.timeout ?? 0) > 0 || (project.caseFailures?.length ?? 0) > 0;
2859
- if (isFailed) {
2860
- failedProjects += 1;
2861
- lines.push(` ${colors.red("❯")} ${badge}${formatDuration$1(project.durationMs, colors)}`);
2862
- lines.push(` ${project.errorMessage}`);
2863
- continue;
2864
- }
2865
- if (!project.executed) {
2866
- skippedProjects += 1;
2867
- const countLabel = colors.dim(`(${project.taskCount} tasks)`);
2868
- const detailsLabel = colors.dim(` ${project.discoveredEvalFileCount} files, ${project.entryCount} entries, 0 runs, hybrid n/a`);
2869
- const matrixSummary = formatMatrixSummary(project.matrixSummary);
2870
- lines.push(` ${colors.dim("○")} ${badge}${countLabel}${detailsLabel}${formatDuration$1(project.durationMs, colors)}`);
2871
- if (matrixSummary != null) lines.push(` ${colors.dim(matrixSummary)}`);
2872
- const scheduleBreakdown = formatScheduleBreakdown(project);
2873
- if (scheduleBreakdown != null) lines.push(` ${scheduleBreakdown}`);
2874
- continue;
2875
- }
2876
- if (hasFailedCases) failedProjects += 1;
2877
- else passedProjects += 1;
2878
- const hybridAverageLabel = formatHybridAverage(project.result?.overall.hybridAverage);
2879
- const runCount = project.result?.overall.runCount ?? 0;
2880
- const countLabel = colors.dim(`(${project.taskCount} tasks)`);
2881
- const caseSummaryLabel = project.caseSummary == null ? "" : `, cases ${project.caseSummary.passed} passed | ${project.caseSummary.failed} failed | ${project.caseSummary.timeout} timeout`;
2882
- const detailsLabel = colors.dim(` ${project.discoveredEvalFileCount} files, ${project.entryCount} entries, ${runCount} runs${caseSummaryLabel}, hybrid ${hybridAverageLabel}`);
2883
- const matrixSummary = formatMatrixSummary(project.matrixSummary);
2884
- lines.push(` ${hasFailedCases ? colors.red("❯") : colors.green("✓")} ${badge}${countLabel}${detailsLabel}${formatDuration$1(project.durationMs, colors)}`);
2885
- if (matrixSummary != null) lines.push(` ${colors.dim(matrixSummary)}`);
2886
- const scheduleBreakdown = formatScheduleBreakdown(project);
2887
- if (scheduleBreakdown != null) lines.push(` ${scheduleBreakdown}`);
2888
- if ((project.caseFailures?.length ?? 0) > 0) {
2889
- lines.push(` ${colors.red("Failed cases:")}`);
2890
- for (const failure of project.caseFailures.slice(0, 5)) {
2891
- lines.push(` ${colors.red(`- ${failure.caseName} (${failure.taskId})`)}`);
2892
- for (const line of failure.errorMessage.split("\n")) lines.push(` ${colors.red(line)}`);
2831
+ return {
2832
+ experimentMatrixRows: [],
2833
+ kind: "summary",
2834
+ summary: {
2835
+ caseFailures: [],
2836
+ caseSummary: null,
2837
+ discoveredEvalFileCount: 0,
2838
+ durationMs: Date.now() - startedAt,
2839
+ entryCount: 0,
2840
+ errorMessage: errorMessageFrom(error) ?? "Unknown project execution error.",
2841
+ executed: false,
2842
+ matrixSummary: null,
2843
+ name: project.name,
2844
+ result: null,
2845
+ taskCount: 0
2893
2846
  }
2894
- if (project.caseFailures.length > 5) lines.push(` ${colors.dim(`... ${project.caseFailures.length - 5} more failed cases`)}`);
2895
- }
2847
+ };
2896
2848
  }
2897
- lines.push("");
2898
- if (failedProjects > 0 || skippedProjects > 0) {
2899
- const summarySegments = [`${colors.green(String(passedProjects))} passed`];
2900
- if (skippedProjects > 0) summarySegments.push(`${colors.dim(String(skippedProjects))} skipped`);
2901
- if (failedProjects > 0) summarySegments.push(`${colors.red(String(failedProjects))} failed`);
2902
- lines.push(` ${colors.dim("Projects")} ${summarySegments.join(" | ")} (${output.projects.length})`);
2903
- } else lines.push(` ${colors.dim("Projects")} ${colors.green(String(passedProjects))} passed (${output.projects.length})`);
2904
- lines.push(` ${colors.dim("Tasks")} ${executedTasks} executed / ${totalTasks} scheduled`);
2905
- return lines.join("\n");
2849
+ }
2850
/**
 * Resolves a concurrency limit where the CLI value can only lower the
 * configured one.
 *
 * @param defaultConcurrency Configured limit; nullish means "use `fallback`".
 * @param cliConcurrency Limit requested on the command line, if any.
 * @param fallback Limit used when nothing is configured.
 * @returns The configured (or fallback) limit, capped by `cliConcurrency` when given.
 */
function resolveCappedConcurrency(defaultConcurrency, cliConcurrency, fallback) {
  const baseline = defaultConcurrency ?? fallback;
  return cliConcurrency == null ? baseline : Math.min(baseline, cliConcurrency);
}
2855
/**
 * Packs the CLI attempt/case concurrency flags into one object.
 *
 * @param options CLI options exposing `attemptConcurrency` and `caseConcurrency`.
 * @returns `{ attempt, case }` when at least one flag is set, otherwise `undefined`.
 */
function resolveCliRuntimeConcurrency(options) {
  const { attemptConcurrency, caseConcurrency } = options;
  const anyProvided = attemptConcurrency != null || caseConcurrency != null;
  if (!anyProvided) return void 0;
  return {
    attempt: attemptConcurrency,
    case: caseConcurrency
  };
}
2862
/**
 * Picks the experiment id for a run.
 *
 * An explicit `options.experiment` wins (sanitized). Otherwise the id is
 * derived from the union of every project's `experimentMatrixRows`, sorted and
 * joined as `matrix-<row>--<row>...`; with no rows at all the id is
 * "default-experiment".
 *
 * @param options CLI options exposing `experiment`.
 * @param preparedProjects Iterable of objects exposing `experimentMatrixRows`.
 * @returns {string}
 */
function resolveExperimentId(options, preparedProjects) {
  const { experiment } = options;
  if (experiment != null) return sanitizeIdentitySegment(experiment);
  const uniqueRows = /* @__PURE__ */ new Set();
  for (const { experimentMatrixRows } of preparedProjects) {
    for (const row of experimentMatrixRows) uniqueRows.add(row);
  }
  if (uniqueRows.size < 1) return "default-experiment";
  const joinedRows = Array.from(uniqueRows).sort().join("--");
  return sanitizeIdentitySegment(`matrix-${joinedRows}`);
}
2869
/**
 * Resolves an optional concurrency limit where the CLI value, when present,
 * overrides the configured one.
 *
 * @param defaultConcurrency Configured limit (may be nullish).
 * @param cliConcurrency Limit requested on the command line (may be nullish).
 * @returns `cliConcurrency` unless it is nullish, in which case `defaultConcurrency`.
 */
function resolveOptionalRuntimeTaskConcurrency(defaultConcurrency, cliConcurrency) {
  if (cliConcurrency == null) return defaultConcurrency;
  return cliConcurrency;
}
2872
/**
 * Resolves the project-level concurrency limit.
 *
 * Uses `project.concurrency.project` when configured and is unbounded
 * otherwise; `options.projectConcurrency` can only lower the result.
 *
 * @param project Project config.
 * @param options CLI options.
 */
function resolveProjectConcurrency(project, options) {
  const configuredLimit = project.concurrency?.project;
  const requestedLimit = options.projectConcurrency;
  return resolveCappedConcurrency(configuredLimit, requestedLimit, Number.POSITIVE_INFINITY);
}
2875
/**
 * Resolves the attempt/case concurrency a task should run with.
 *
 * For each dimension the task-level value takes precedence over the project
 * value, and the matching CLI flag overrides both.
 *
 * @param taskConcurrency Task-level `{ attempt, case }` settings (may be nullish).
 * @param project Project config exposing an optional `concurrency` object.
 * @param options CLI options exposing `attemptConcurrency` and `caseConcurrency`.
 * @returns `{ attempt, case }`, or `undefined` when neither dimension resolves to a value.
 */
function resolveRuntimeTaskConcurrency(taskConcurrency, project, options) {
  const configuredAttempt = taskConcurrency?.attempt ?? project.concurrency?.attempt;
  const attempt = resolveOptionalRuntimeTaskConcurrency(configuredAttempt, options.attemptConcurrency);
  const configuredCase = taskConcurrency?.case ?? project.concurrency?.case;
  const perCase = resolveOptionalRuntimeTaskConcurrency(configuredCase, options.caseConcurrency);
  const nothingResolved = attempt == null && perCase == null;
  if (nothingResolved) return void 0;
  return {
    attempt,
    case: perCase
  };
}
2884
/**
 * Resolves how many scheduled tasks of a project may run at once: the smaller
 * of the project-level and task-level limits.
 *
 * @param project Project config.
 * @param options CLI options.
 */
function resolveScheduledTaskConcurrency(project, options) {
  const projectLimit = resolveProjectConcurrency(project, options);
  const taskLimit = resolveTaskConcurrency(project, options);
  return Math.min(projectLimit, taskLimit);
}
2887
/**
 * Resolves the task-level concurrency limit.
 *
 * Uses `project.concurrency.task` when configured and 1 otherwise;
 * `options.taskConcurrency` can only lower the result.
 *
 * @param project Project config.
 * @param options CLI options.
 */
function resolveTaskConcurrency(project, options) {
  const configuredLimit = project.concurrency?.task;
  const requestedLimit = options.taskConcurrency;
  return resolveCappedConcurrency(configuredLimit, requestedLimit, 1);
}
2890
/**
 * Returns the reporter hooks for a task: the ones already attached to the
 * execution context when present, otherwise freshly created ones.
 *
 * @param task Scheduled task.
 * @param context Execution context with an optional `reporterHooks` member.
 * @returns `context.reporterHooks`, or the result of `createTaskReporterHooks`
 *   called with the remaining arguments.
 */
function resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
  const { reporterHooks } = context;
  if (reporterHooks != null) return reporterHooks;
  return createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
}
2893
/**
 * Resolves the workspace-level concurrency limit.
 *
 * Uses `loadedConfig.concurrency.workspace` when configured and 1 otherwise;
 * `options.workspaceConcurrency` can only lower the result.
 *
 * @param loadedConfig Loaded CLI config.
 * @param options CLI options.
 */
function resolveWorkspaceConcurrency(loadedConfig, options) {
  const configuredLimit = loadedConfig.concurrency?.workspace;
  const requestedLimit = options.workspaceConcurrency;
  return resolveCappedConcurrency(configuredLimit, requestedLimit, 1);
}
2896
/**
 * Normalizes a user-supplied identity value into a single id segment.
 *
 * Surrounding whitespace is trimmed, and every run of characters other than
 * word characters, `.` and `-` collapses into one `-`.
 *
 * @param {string} value Raw value.
 * @returns {string} "default" when the trimmed value is empty.
 */
function sanitizeIdentitySegment(value) {
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed.replace(/[^\w.-]+/g, "-") : "default";
}
2901
/**
 * Decides whether CLI output should be colored.
 *
 * Precedence:
 * 1. `NO_COLOR` set to a non-empty value disables color. Per the NO_COLOR
 *    convention an empty value is treated as "not set".
 * 2. `FORCE_COLOR`, when set, enables color unless its value is "0".
 * 3. Otherwise color is used only when stdout is a TTY.
 *
 * @returns {boolean}
 */
function shouldUseColor() {
  const { NO_COLOR: noColor, FORCE_COLOR: forceColor } = process.env;
  if (noColor != null && noColor !== "") return false;
  if (forceColor != null) return forceColor !== "0";
  return process.stdout.isTTY === true;
}
2907
2907
  //#endregion
2908
2908
  //#region src/cli/compare.ts
@@ -2918,17 +2918,12 @@ const compareHelpText = `
2918
2918
  --output Optional output artifact path
2919
2919
  --format Console output format: table | json (default: table)
2920
2920
  `;
2921
- function normalizeCliArgv$5(argv) {
2922
- const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
2923
- if (normalizedArgv[0] === "compare") return normalizedArgv.slice(1);
2924
- return normalizedArgv;
2925
- }
2926
2921
  function parseCompareCliArguments(argv) {
2927
2922
  const cli = meow(compareHelpText, {
2928
2923
  argv: normalizeCliArgv$5(argv),
2929
2924
  flags: {
2930
- config: { type: "string" },
2931
2925
  comparison: { type: "string" },
2926
+ config: { type: "string" },
2932
2927
  format: {
2933
2928
  default: "table",
2934
2929
  type: "string"
@@ -3009,6 +3004,11 @@ async function runCompareCliOrExit(argv) {
3009
3004
  process.exitCode = 1;
3010
3005
  }
3011
3006
  }
3007
/**
 * Strips the optional leading `--` separator and the `compare` subcommand
 * token from an argv list.
 *
 * @param argv Raw argument list; it is not mutated.
 * @returns A new array holding the remaining arguments.
 */
function normalizeCliArgv$5(argv) {
  const remaining = [...argv];
  if (remaining[0] === "--") remaining.shift();
  if (remaining[0] === "compare") remaining.shift();
  return remaining;
}
3012
3012
  //#endregion
3013
3013
  //#region package.json
3014
3014
  var name = "vieval";
@@ -3034,14 +3034,6 @@ const evalRunHelpText = `
3034
3034
  --report-out Report output root directory
3035
3035
  --json Print machine-readable JSON output
3036
3036
  `;
3037
- function normalizeCliArgv$4(argv) {
3038
- const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
3039
- return normalizedArgv[0] === "run" ? normalizedArgv.slice(1) : normalizedArgv;
3040
- }
3041
- function normalizeProjectNames(projectNames) {
3042
- if (typeof projectNames === "string") return [projectNames];
3043
- return projectNames ?? [];
3044
- }
3045
3037
  /**
3046
3038
  * Parses `vieval run` CLI arguments into one normalized execution payload.
3047
3039
  *
@@ -3058,9 +3050,12 @@ function normalizeProjectNames(projectNames) {
3058
3050
  function parseCliArguments(argv) {
3059
3051
  const cli = meow(evalRunHelpText, {
3060
3052
  argv: normalizeCliArgv$4(argv),
3061
- importMeta: import.meta,
3062
3053
  flags: {
3054
+ attempt: { type: "string" },
3055
+ attemptConcurrency: { type: "number" },
3056
+ caseConcurrency: { type: "number" },
3063
3057
  config: { type: "string" },
3058
+ experiment: { type: "string" },
3064
3059
  json: {
3065
3060
  default: false,
3066
3061
  type: "boolean"
@@ -3069,16 +3064,13 @@ function parseCliArguments(argv) {
3069
3064
  isMultiple: true,
3070
3065
  type: "string"
3071
3066
  },
3072
- workspace: { type: "string" },
3073
- experiment: { type: "string" },
3074
- attempt: { type: "string" },
3075
- workspaceConcurrency: { type: "number" },
3076
3067
  projectConcurrency: { type: "number" },
3068
+ reportOut: { type: "string" },
3077
3069
  taskConcurrency: { type: "number" },
3078
- attemptConcurrency: { type: "number" },
3079
- caseConcurrency: { type: "number" },
3080
- reportOut: { type: "string" }
3081
- }
3070
+ workspace: { type: "string" },
3071
+ workspaceConcurrency: { type: "number" }
3072
+ },
3073
+ importMeta: import.meta
3082
3074
  });
3083
3075
  return {
3084
3076
  attempt: cli.flags.attempt,
@@ -3149,180 +3141,39 @@ async function runEvalRunCli(argv) {
3149
3141
  } catch (error) {
3150
3142
  const errorMessage = errorMessageFrom(error) ?? "Unknown CLI failure.";
3151
3143
  process.stderr.write(`[${name}] ${errorMessage}\n`);
3152
- process.exitCode = 1;
3153
- }
3154
- }
3155
- //#endregion
3156
- //#region src/cli/report-analyze.ts
3157
- const reportAnalyzeHelpText = `
3158
- Analyze generated vieval report artifacts.
3159
-
3160
- Usage
3161
- $ vieval report analyze <reportPath> [options]
3162
-
3163
- Options
3164
- --format Output format: table | json | jsonl | csv (default: table)
3165
- --workspace Workspace id filter
3166
- --project Project name filter (exact)
3167
- --experiment Experiment id filter
3168
- --attempt Attempt id filter
3169
- --run Run id filter
3170
- --task-state Keep runs containing at least one task in this state
3171
- --case-state Keep runs containing at least one case in this state
3172
- --contains Keep runs containing this text in event name or payload
3173
- --error-contains Keep runs containing this text in project errors or event payload
3174
- --run-matrix Keep runs matching run-matrix selector "key=value[,key=value]"
3175
- --eval-matrix Keep runs matching eval-matrix selector "key=value[,key=value]"
3176
- `;
3177
- function normalizeCliArgv$3(argv) {
3178
- const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
3179
- if (normalizedArgv[0] === "report" && normalizedArgv[1] === "analyze") return normalizedArgv.slice(2);
3180
- if (normalizedArgv[0] === "analyze") return normalizedArgv.slice(1);
3181
- return normalizedArgv;
3182
- }
3183
- function parseReportAnalyzeCliArguments(argv) {
3184
- const cli = meow(reportAnalyzeHelpText, {
3185
- argv: normalizeCliArgv$3(argv),
3186
- flags: {
3187
- attempt: { type: "string" },
3188
- caseState: { type: "string" },
3189
- contains: { type: "string" },
3190
- evalMatrix: { type: "string" },
3191
- errorContains: { type: "string" },
3192
- experiment: { type: "string" },
3193
- format: {
3194
- default: "table",
3195
- type: "string"
3196
- },
3197
- project: { type: "string" },
3198
- runMatrix: { type: "string" },
3199
- run: { type: "string" },
3200
- taskState: { type: "string" },
3201
- workspace: { type: "string" }
3202
- },
3203
- importMeta: import.meta
3204
- });
3205
- const reportPath = cli.input[0];
3206
- if (reportPath == null || reportPath.length === 0) throw new Error("Missing required <reportPath> argument.");
3207
- const normalizedFormat = cli.flags.format.toLowerCase();
3208
- const format = normalizedFormat === "json" ? "json" : normalizedFormat === "jsonl" ? "jsonl" : normalizedFormat === "csv" ? "csv" : "table";
3209
- return {
3210
- attempt: cli.flags.attempt,
3211
- caseState: normalizeStateFilter(cli.flags.caseState),
3212
- contains: cli.flags.contains,
3213
- evalMatrix: parseMatrixSelector(cli.flags.evalMatrix),
3214
- errorContains: cli.flags.errorContains,
3215
- experiment: cli.flags.experiment,
3216
- format,
3217
- project: cli.flags.project,
3218
- reportPath,
3219
- runMatrix: parseMatrixSelector(cli.flags.runMatrix),
3220
- run: cli.flags.run,
3221
- taskState: normalizeStateFilter(cli.flags.taskState),
3222
- workspace: cli.flags.workspace
3223
- };
3224
- }
3225
- function normalizeStateFilter(value) {
3226
- if (value == null) return;
3227
- const normalized = value.trim().toLowerCase();
3228
- if (normalized === "passed" || normalized === "failed" || normalized === "skipped") return normalized;
3229
- throw new Error(`Unsupported state filter "${value}". Expected "passed", "failed", or "skipped".`);
3230
- }
3231
- function parseMatrixSelector(value) {
3232
- if (value == null) return;
3233
- const selector = {};
3234
- const segments = value.split(",").map((segment) => segment.trim()).filter((segment) => segment.length > 0);
3235
- for (const segment of segments) {
3236
- const separatorIndex = segment.indexOf("=");
3237
- if (separatorIndex <= 0 || separatorIndex === segment.length - 1) throw new Error(`Invalid matrix selector segment "${segment}". Expected "key=value".`);
3238
- const key = segment.slice(0, separatorIndex).trim();
3239
- const parsedValue = segment.slice(separatorIndex + 1).trim();
3240
- if (key.length === 0 || parsedValue.length === 0) throw new Error(`Invalid matrix selector segment "${segment}". Expected "key=value".`);
3241
- selector[key] = parsedValue;
3242
- }
3243
- return selector;
3244
- }
3245
- function filterAnalyzeRows(rows, parsed) {
3246
- return rows.filter((row) => {
3247
- if (parsed.workspace != null && row.workspaceId !== parsed.workspace) return false;
3248
- if (parsed.experiment != null && row.experimentId !== parsed.experiment) return false;
3249
- if (parsed.attempt != null && row.attemptId !== parsed.attempt) return false;
3250
- if (parsed.run != null && row.runId !== parsed.run) return false;
3251
- if (parsed.project != null && !row.projectNames.includes(parsed.project)) return false;
3252
- return true;
3253
- });
3254
- }
3255
- function includesNeedle(value, needle) {
3256
- const normalizedNeedle = needle.trim().toLowerCase();
3257
- if (normalizedNeedle.length === 0) return true;
3258
- return JSON.stringify(value).toLowerCase().includes(normalizedNeedle);
3259
- }
3260
- function hasTaskState(artifact, targetState) {
3261
- return artifact.events.some((event) => {
3262
- if (event.event !== "TaskEnded") return false;
3263
- return event.data?.state === targetState;
3264
- });
3265
- }
3266
- function hasCaseState(artifact, targetState) {
3267
- return artifact.events.some((event) => {
3268
- if (event.event !== "CaseEnded") return false;
3269
- return event.data?.state === targetState;
3270
- });
3271
- }
3272
- function matchesMatrixSelector(matrix, selector) {
3273
- return Object.entries(selector).every(([key, expectedValue]) => String(matrix[key]) === expectedValue);
3274
- }
3275
- function hasRunMatrixMatch(artifact, selector) {
3276
- return artifact.summary.projects.some((project) => project.result?.runs.some((run) => matchesMatrixSelector(run.matrix.run, selector)) === true);
3277
- }
3278
- function hasEvalMatrixMatch(artifact, selector) {
3279
- return artifact.summary.projects.some((project) => project.result?.runs.some((run) => matchesMatrixSelector(run.matrix.eval, selector)) === true);
3280
- }
3281
- function matchesOutcomeFilters(artifact, parsed) {
3282
- if (parsed.runMatrix != null && !hasRunMatrixMatch(artifact, parsed.runMatrix)) return false;
3283
- if (parsed.evalMatrix != null && !hasEvalMatrixMatch(artifact, parsed.evalMatrix)) return false;
3284
- if (parsed.taskState != null && !hasTaskState(artifact, parsed.taskState)) return false;
3285
- if (parsed.caseState != null && !hasCaseState(artifact, parsed.caseState)) return false;
3286
- if (parsed.contains != null) {
3287
- if (!artifact.events.some((event) => includesNeedle({
3288
- data: event.data,
3289
- event: event.event
3290
- }, parsed.contains))) return false;
3291
- }
3292
- if (parsed.errorContains != null) {
3293
- if (!(artifact.summary.projects.map((project) => project.errorMessage).filter((errorMessage) => errorMessage != null).some((errorMessage) => includesNeedle(errorMessage, parsed.errorContains)) || artifact.events.some((event) => includesNeedle(event.data, parsed.errorContains)))) return false;
3144
+ process.exitCode = 1;
3294
3145
  }
3295
- return true;
3296
- }
3297
- async function readReportAnalyzeOutput(parsed) {
3298
- const artifacts = await readReportArtifacts(parsed.reportPath);
3299
- const rows = artifacts.map((artifact) => summarizeReportRunArtifact(artifact));
3300
- const identityFilteredRows = filterAnalyzeRows(rows, parsed);
3301
- const rowByDirectory = new Map(identityFilteredRows.map((row) => [row.reportDirectory, row]));
3302
- const filteredRows = artifacts.filter((artifact) => rowByDirectory.has(artifact.reportDirectory)).filter((artifact) => matchesOutcomeFilters(artifact, parsed)).map((artifact) => rowByDirectory.get(artifact.reportDirectory)).filter((row) => row != null);
3303
- return {
3304
- experimentSummaries: buildExperimentSummaries(filteredRows),
3305
- filteredRunCount: filteredRows.length,
3306
- runs: filteredRows,
3307
- totalRunCount: rows.length
3308
- };
3309
- }
3310
- function roundMetric(value) {
3311
- return Number(value.toFixed(6));
3312
- }
3313
- function computeAverage(values) {
3314
- if (values.length === 0) return 0;
3315
- return values.reduce((sum, value) => sum + value, 0) / values.length;
3316
3146
  }
3317
- function computeStandardDeviation(values) {
3318
- if (values.length === 0) return 0;
3319
- const average = computeAverage(values);
3320
- const variance = computeAverage(values.map((value) => (value - average) ** 2));
3321
- return Math.sqrt(variance);
3147
+ function normalizeCliArgv$4(argv) {
3148
+ const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
3149
+ return normalizedArgv[0] === "run" ? normalizedArgv.slice(1) : normalizedArgv;
3322
3150
  }
3323
- function createExperimentGroupKey(row) {
3324
- return `${row.workspaceId ?? "unknown-workspace"}::${row.experimentId ?? "unknown-experiment"}`;
3151
+ function normalizeProjectNames(projectNames) {
3152
+ if (typeof projectNames === "string") return [projectNames];
3153
+ return projectNames ?? [];
3325
3154
  }
3155
+ //#endregion
3156
+ //#region src/cli/report-analyze.ts
3157
+ const reportAnalyzeHelpText = `
3158
+ Analyze generated vieval report artifacts.
3159
+
3160
+ Usage
3161
+ $ vieval report analyze <reportPath> [options]
3162
+
3163
+ Options
3164
+ --format Output format: table | json | jsonl | csv (default: table)
3165
+ --workspace Workspace id filter
3166
+ --project Project name filter (exact)
3167
+ --experiment Experiment id filter
3168
+ --attempt Attempt id filter
3169
+ --run Run id filter
3170
+ --task-state Keep runs containing at least one task in this state
3171
+ --case-state Keep runs containing at least one case in this state
3172
+ --contains Keep runs containing this text in event name or payload
3173
+ --error-contains Keep runs containing this text in project errors or event payload
3174
+ --run-matrix Keep runs matching run-matrix selector "key=value[,key=value]"
3175
+ --eval-matrix Keep runs matching eval-matrix selector "key=value[,key=value]"
3176
+ `;
3326
3177
  /**
3327
3178
  * Builds experiment-level rollups from filtered run rows.
3328
3179
  *
@@ -3383,13 +3234,13 @@ function buildExperimentSummaries(rows) {
3383
3234
  const stdevAttemptSuccessRate = computeStandardDeviation(attemptSuccessRates);
3384
3235
  return {
3385
3236
  attemptCount: attemptToRuns.size,
3386
- attemptSummaries,
3387
3237
  attemptSuccessRateStats: {
3388
3238
  avg: roundMetric(avgAttemptSuccessRate),
3389
3239
  max: roundMetric(maxAttemptSuccessRate),
3390
3240
  min: roundMetric(minAttemptSuccessRate),
3391
3241
  stdev: roundMetric(stdevAttemptSuccessRate)
3392
3242
  },
3243
+ attemptSummaries,
3393
3244
  experimentId,
3394
3245
  failedProjects,
3395
3246
  runCount: groupRows.length,
@@ -3404,16 +3255,94 @@ function buildExperimentSummaries(rows) {
3404
3255
  return left.experimentId.localeCompare(right.experimentId);
3405
3256
  });
3406
3257
  }
3407
- function formatTableOutput$1(output) {
3408
- const header = "Run ID | Workspace | Experiment | Attempt | Projects(executed/total) | FailedProjects | Tasks | Events";
3409
- const lines = output.runs.map((row) => {
3410
- return `${row.runId ?? "n/a"} | ${row.workspaceId ?? "n/a"} | ${row.experimentId ?? "n/a"} | ${row.attemptId ?? "n/a"} | ${`${row.executedProjects}/${row.totalProjects}`} | ${row.failedProjects} | ${row.totalTasks} | ${row.eventsCount}`;
3258
+ function parseReportAnalyzeCliArguments(argv) {
3259
+ const cli = meow(reportAnalyzeHelpText, {
3260
+ argv: normalizeCliArgv$3(argv),
3261
+ flags: {
3262
+ attempt: { type: "string" },
3263
+ caseState: { type: "string" },
3264
+ contains: { type: "string" },
3265
+ errorContains: { type: "string" },
3266
+ evalMatrix: { type: "string" },
3267
+ experiment: { type: "string" },
3268
+ format: {
3269
+ default: "table",
3270
+ type: "string"
3271
+ },
3272
+ project: { type: "string" },
3273
+ run: { type: "string" },
3274
+ runMatrix: { type: "string" },
3275
+ taskState: { type: "string" },
3276
+ workspace: { type: "string" }
3277
+ },
3278
+ importMeta: import.meta
3279
+ });
3280
+ const reportPath = cli.input[0];
3281
+ if (reportPath == null || reportPath.length === 0) throw new Error("Missing required <reportPath> argument.");
3282
+ const normalizedFormat = cli.flags.format.toLowerCase();
3283
+ const format = normalizedFormat === "json" ? "json" : normalizedFormat === "jsonl" ? "jsonl" : normalizedFormat === "csv" ? "csv" : "table";
3284
+ return {
3285
+ attempt: cli.flags.attempt,
3286
+ caseState: normalizeStateFilter(cli.flags.caseState),
3287
+ contains: cli.flags.contains,
3288
+ errorContains: cli.flags.errorContains,
3289
+ evalMatrix: parseMatrixSelector(cli.flags.evalMatrix),
3290
+ experiment: cli.flags.experiment,
3291
+ format,
3292
+ project: cli.flags.project,
3293
+ reportPath,
3294
+ run: cli.flags.run,
3295
+ runMatrix: parseMatrixSelector(cli.flags.runMatrix),
3296
+ taskState: normalizeStateFilter(cli.flags.taskState),
3297
+ workspace: cli.flags.workspace
3298
+ };
3299
+ }
3300
+ async function runReportAnalyzeCli(argv) {
3301
+ try {
3302
+ const parsed = parseReportAnalyzeCliArguments(argv);
3303
+ const output = await readReportAnalyzeOutput(parsed);
3304
+ if (parsed.format === "json") {
3305
+ process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
3306
+ return;
3307
+ }
3308
+ if (parsed.format === "jsonl") {
3309
+ const jsonl = output.runs.map((run) => JSON.stringify(run)).join("\n");
3310
+ process.stdout.write(`${jsonl}${jsonl.length > 0 ? "\n" : ""}`);
3311
+ return;
3312
+ }
3313
+ if (parsed.format === "csv") {
3314
+ process.stdout.write(`${formatCsvOutput(output)}\n`);
3315
+ return;
3316
+ }
3317
+ process.stdout.write(`${formatTableOutput$1(output)}\n`);
3318
+ } catch (error) {
3319
+ const errorMessage = errorMessageFrom(error) ?? "Unknown report analyze failure.";
3320
+ process.stderr.write(`[vieval report analyze] ${errorMessage}\n`);
3321
+ process.exitCode = 1;
3322
+ }
3323
+ }
3324
+ function computeAverage(values) {
3325
+ if (values.length === 0) return 0;
3326
+ return values.reduce((sum, value) => sum + value, 0) / values.length;
3327
+ }
3328
+ function computeStandardDeviation(values) {
3329
+ if (values.length === 0) return 0;
3330
+ const average = computeAverage(values);
3331
+ const variance = computeAverage(values.map((value) => (value - average) ** 2));
3332
+ return Math.sqrt(variance);
3333
+ }
3334
+ function createExperimentGroupKey(row) {
3335
+ return `${row.workspaceId ?? "unknown-workspace"}::${row.experimentId ?? "unknown-experiment"}`;
3336
+ }
3337
+ function filterAnalyzeRows(rows, parsed) {
3338
+ return rows.filter((row) => {
3339
+ if (parsed.workspace != null && row.workspaceId !== parsed.workspace) return false;
3340
+ if (parsed.experiment != null && row.experimentId !== parsed.experiment) return false;
3341
+ if (parsed.attempt != null && row.attemptId !== parsed.attempt) return false;
3342
+ if (parsed.run != null && row.runId !== parsed.run) return false;
3343
+ if (parsed.project != null && !row.projectNames.includes(parsed.project)) return false;
3344
+ return true;
3411
3345
  });
3412
- return [
3413
- `ANALYZE vieval report: ${output.filteredRunCount}/${output.totalRunCount} runs (${output.experimentSummaries.length} experiment groups)`,
3414
- header,
3415
- ...lines
3416
- ].join("\n");
3417
3346
  }
3418
3347
  function formatCsvOutput(output) {
3419
3348
  return [[
@@ -3446,29 +3375,100 @@ function formatCsvOutput(output) {
3446
3375
  ].join(",");
3447
3376
  })].join("\n");
3448
3377
  }
3449
- async function runReportAnalyzeCli(argv) {
3450
- try {
3451
- const parsed = parseReportAnalyzeCliArguments(argv);
3452
- const output = await readReportAnalyzeOutput(parsed);
3453
- if (parsed.format === "json") {
3454
- process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
3455
- return;
3456
- }
3457
- if (parsed.format === "jsonl") {
3458
- const jsonl = output.runs.map((run) => JSON.stringify(run)).join("\n");
3459
- process.stdout.write(`${jsonl}${jsonl.length > 0 ? "\n" : ""}`);
3460
- return;
3461
- }
3462
- if (parsed.format === "csv") {
3463
- process.stdout.write(`${formatCsvOutput(output)}\n`);
3464
- return;
3465
- }
3466
- process.stdout.write(`${formatTableOutput$1(output)}\n`);
3467
- } catch (error) {
3468
- const errorMessage = errorMessageFrom(error) ?? "Unknown report analyze failure.";
3469
- process.stderr.write(`[vieval report analyze] ${errorMessage}\n`);
3470
- process.exitCode = 1;
3378
+ function formatTableOutput$1(output) {
3379
+ const header = "Run ID | Workspace | Experiment | Attempt | Projects(executed/total) | FailedProjects | Tasks | Events";
3380
+ const lines = output.runs.map((row) => {
3381
+ return `${row.runId ?? "n/a"} | ${row.workspaceId ?? "n/a"} | ${row.experimentId ?? "n/a"} | ${row.attemptId ?? "n/a"} | ${`${row.executedProjects}/${row.totalProjects}`} | ${row.failedProjects} | ${row.totalTasks} | ${row.eventsCount}`;
3382
+ });
3383
+ return [
3384
+ `ANALYZE vieval report: ${output.filteredRunCount}/${output.totalRunCount} runs (${output.experimentSummaries.length} experiment groups)`,
3385
+ header,
3386
+ ...lines
3387
+ ].join("\n");
3388
+ }
3389
+ function hasCaseState(artifact, targetState) {
3390
+ return artifact.events.some((event) => {
3391
+ if (event.event !== "CaseEnded") return false;
3392
+ return event.data?.state === targetState;
3393
+ });
3394
+ }
3395
+ function hasEvalMatrixMatch(artifact, selector) {
3396
+ return artifact.summary.projects.some((project) => project.result?.runs.some((run) => matchesMatrixSelector(run.matrix.eval, selector)) === true);
3397
+ }
3398
+ function hasRunMatrixMatch(artifact, selector) {
3399
+ return artifact.summary.projects.some((project) => project.result?.runs.some((run) => matchesMatrixSelector(run.matrix.run, selector)) === true);
3400
+ }
3401
+ function hasTaskState(artifact, targetState) {
3402
+ return artifact.events.some((event) => {
3403
+ if (event.event !== "TaskEnded") return false;
3404
+ return event.data?.state === targetState;
3405
+ });
3406
+ }
3407
+ function includesNeedle(value, needle) {
3408
+ const normalizedNeedle = needle.trim().toLowerCase();
3409
+ if (normalizedNeedle.length === 0) return true;
3410
+ return JSON.stringify(value).toLowerCase().includes(normalizedNeedle);
3411
+ }
3412
+ function matchesMatrixSelector(matrix, selector) {
3413
+ return Object.entries(selector).every(([key, expectedValue]) => String(matrix[key]) === expectedValue);
3414
+ }
3415
+ function matchesOutcomeFilters(artifact, parsed) {
3416
+ if (parsed.runMatrix != null && !hasRunMatrixMatch(artifact, parsed.runMatrix)) return false;
3417
+ if (parsed.evalMatrix != null && !hasEvalMatrixMatch(artifact, parsed.evalMatrix)) return false;
3418
+ if (parsed.taskState != null && !hasTaskState(artifact, parsed.taskState)) return false;
3419
+ if (parsed.caseState != null && !hasCaseState(artifact, parsed.caseState)) return false;
3420
+ if (parsed.contains != null) {
3421
+ if (!artifact.events.some((event) => includesNeedle({
3422
+ data: event.data,
3423
+ event: event.event
3424
+ }, parsed.contains))) return false;
3471
3425
  }
3426
+ if (parsed.errorContains != null) {
3427
+ if (!(artifact.summary.projects.map((project) => project.errorMessage).filter((errorMessage) => errorMessage != null).some((errorMessage) => includesNeedle(errorMessage, parsed.errorContains)) || artifact.events.some((event) => includesNeedle(event.data, parsed.errorContains)))) return false;
3428
+ }
3429
+ return true;
3430
+ }
3431
+ function normalizeCliArgv$3(argv) {
3432
+ const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
3433
+ if (normalizedArgv[0] === "report" && normalizedArgv[1] === "analyze") return normalizedArgv.slice(2);
3434
+ if (normalizedArgv[0] === "analyze") return normalizedArgv.slice(1);
3435
+ return normalizedArgv;
3436
+ }
3437
+ function normalizeStateFilter(value) {
3438
+ if (value == null) return;
3439
+ const normalized = value.trim().toLowerCase();
3440
+ if (normalized === "passed" || normalized === "failed" || normalized === "skipped") return normalized;
3441
+ throw new Error(`Unsupported state filter "${value}". Expected "passed", "failed", or "skipped".`);
3442
+ }
3443
+ function parseMatrixSelector(value) {
3444
+ if (value == null) return;
3445
+ const selector = {};
3446
+ const segments = value.split(",").map((segment) => segment.trim()).filter((segment) => segment.length > 0);
3447
+ for (const segment of segments) {
3448
+ const separatorIndex = segment.indexOf("=");
3449
+ if (separatorIndex <= 0 || separatorIndex === segment.length - 1) throw new Error(`Invalid matrix selector segment "${segment}". Expected "key=value".`);
3450
+ const key = segment.slice(0, separatorIndex).trim();
3451
+ const parsedValue = segment.slice(separatorIndex + 1).trim();
3452
+ if (key.length === 0 || parsedValue.length === 0) throw new Error(`Invalid matrix selector segment "${segment}". Expected "key=value".`);
3453
+ selector[key] = parsedValue;
3454
+ }
3455
+ return selector;
3456
+ }
3457
+ async function readReportAnalyzeOutput(parsed) {
3458
+ const artifacts = await readReportArtifacts(parsed.reportPath);
3459
+ const rows = artifacts.map((artifact) => summarizeReportRunArtifact(artifact));
3460
+ const identityFilteredRows = filterAnalyzeRows(rows, parsed);
3461
+ const rowByDirectory = new Map(identityFilteredRows.map((row) => [row.reportDirectory, row]));
3462
+ const filteredRows = artifacts.filter((artifact) => rowByDirectory.has(artifact.reportDirectory)).filter((artifact) => matchesOutcomeFilters(artifact, parsed)).map((artifact) => rowByDirectory.get(artifact.reportDirectory)).filter((row) => row != null);
3463
+ return {
3464
+ experimentSummaries: buildExperimentSummaries(filteredRows),
3465
+ filteredRunCount: filteredRows.length,
3466
+ runs: filteredRows,
3467
+ totalRunCount: rows.length
3468
+ };
3469
+ }
3470
+ function roundMetric(value) {
3471
+ return Number(value.toFixed(6));
3472
3472
  }
3473
3473
  //#endregion
3474
3474
  //#region src/cli/report-case-compare.ts
@@ -3545,6 +3545,50 @@ function buildCaseComparison(args) {
3545
3545
  };
3546
3546
  }
3547
3547
  /**
3548
+ * Formats a case comparison as a compact human-readable table.
3549
+ *
3550
+ * Use when:
3551
+ * - `vieval report compare` should expose the same information as JSON output
3552
+ * - users need a terminal-first overview of group and per-case deltas
3553
+ *
3554
+ * Expects:
3555
+ * - comparison output was produced by {@link buildCaseComparison}
3556
+ *
3557
+ * Returns:
3558
+ * - multi-line text containing aggregate, group, top-change, case, and unmatched summaries
3559
+ */
3560
+ function formatCaseComparisonTable(output) {
3561
+ const lines = [
3562
+ "COMPARE vieval report cases",
3563
+ `Matched ${output.cases.length}`,
3564
+ `Added ${output.added.length}`,
3565
+ `Removed ${output.removed.length}`,
3566
+ `Scores left=${output.overall.leftAverage.toFixed(3)} right=${output.overall.rightAverage.toFixed(3)} delta=${output.overall.delta.toFixed(3)}`
3567
+ ];
3568
+ if (output.groups != null && Object.keys(output.groups).length > 0) {
3569
+ lines.push("Groups");
3570
+ for (const [groupKey, group] of Object.entries(output.groups)) lines.push(`${groupKey} count=${group.count} left=${group.leftAverage.toFixed(3)} right=${group.rightAverage.toFixed(3)} delta=${group.delta.toFixed(3)}`);
3571
+ }
3572
+ if (output.topImprovements.length > 0) {
3573
+ lines.push("Top improvements");
3574
+ for (const row of output.topImprovements) lines.push(`${row.caseKey} delta=${row.delta.score.toFixed(3)} left=${row.delta.left.toFixed(3)} right=${row.delta.right.toFixed(3)}`);
3575
+ }
3576
+ if (output.topRegressions.length > 0) {
3577
+ lines.push("Top regressions");
3578
+ for (const row of output.topRegressions) lines.push(`${row.caseKey} delta=${row.delta.score.toFixed(3)} left=${row.delta.left.toFixed(3)} right=${row.delta.right.toFixed(3)}`);
3579
+ }
3580
+ if (output.cases.length > 0) {
3581
+ lines.push("Cases");
3582
+ for (const row of output.cases) {
3583
+ const changedMetricNames = Object.keys(row.metricsChanged);
3584
+ lines.push(`${row.caseKey} delta=${row.delta.score.toFixed(3)} changedMetrics=${changedMetricNames.length === 0 ? "none" : changedMetricNames.join(",")}`);
3585
+ }
3586
+ }
3587
+ if (output.added.length > 0) lines.push(`Added cases ${output.added.map((record) => record.caseId).join(",")}`);
3588
+ if (output.removed.length > 0) lines.push(`Removed cases ${output.removed.map((record) => record.caseId).join(",")}`);
3589
+ return lines.join("\n");
3590
+ }
3591
+ /**
3548
3592
  * Runs the `vieval report compare` command.
3549
3593
  *
3550
3594
  * Call stack:
@@ -3586,6 +3630,55 @@ async function runReportCompareCli(argv) {
3586
3630
  process.exitCode = 1;
3587
3631
  }
3588
3632
  }
3633
+ function averageScore(records, scoreKind) {
3634
+ const values = records.map((record) => record.scores[scoreKind]).filter((value) => typeof value === "number");
3635
+ if (values.length === 0) return 0;
3636
+ return values.reduce((sum, value) => sum + value, 0) / values.length;
3637
+ }
3638
+ function buildComparisonGroups(cases, groupBy) {
3639
+ const groupedRows = {};
3640
+ for (const row of cases) {
3641
+ const resolved = getCaseSelectorValue(row.right, groupBy);
3642
+ if (!resolved.exists) continue;
3643
+ const groupKey = `${groupBy}=${String(resolved.value)}`;
3644
+ groupedRows[groupKey] ??= [];
3645
+ groupedRows[groupKey].push(row);
3646
+ }
3647
+ return Object.fromEntries(Object.entries(groupedRows).sort(([left], [right]) => left.localeCompare(right)).map(([groupKey, rows]) => {
3648
+ const leftAverage = rows.reduce((sum, row) => sum + row.delta.left, 0) / rows.length;
3649
+ const rightAverage = rows.reduce((sum, row) => sum + row.delta.right, 0) / rows.length;
3650
+ return [groupKey, {
3651
+ count: rows.length,
3652
+ delta: rightAverage - leftAverage,
3653
+ leftAverage,
3654
+ rightAverage
3655
+ }];
3656
+ }));
3657
+ }
3658
+ function compareCaseRecords(left, right) {
3659
+ return left.caseId.localeCompare(right.caseId);
3660
+ }
3661
+ function diffMetrics(left, right) {
3662
+ const changed = {};
3663
+ const metricKeys = [.../* @__PURE__ */ new Set([...Object.keys(left), ...Object.keys(right)])].sort((leftKey, rightKey) => leftKey.localeCompare(rightKey));
3664
+ for (const metricKey of metricKeys) if (stableStringify(left[metricKey]) !== stableStringify(right[metricKey])) changed[metricKey] = {
3665
+ left: left[metricKey],
3666
+ right: right[metricKey]
3667
+ };
3668
+ return changed;
3669
+ }
3670
+ function getScore(record, scoreKind) {
3671
+ return record.scores[scoreKind] ?? 0;
3672
+ }
3673
+ function indexRecordsByCaseKey(records, caseKey, side) {
3674
+ const indexed = /* @__PURE__ */ new Map();
3675
+ for (const record of records) {
3676
+ const resolved = resolveCaseKey(record, caseKey);
3677
+ if (indexed.has(resolved)) throw new Error(`Duplicate case key "${resolved}" in ${side} report.`);
3678
+ indexed.set(resolved, record);
3679
+ }
3680
+ return indexed;
3681
+ }
3589
3682
  function normalizeCliArgv$2(argv) {
3590
3683
  const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
3591
3684
  if (normalizedArgv[0] === "report" && normalizedArgv[1] === "compare") return normalizedArgv.slice(2);
@@ -3621,15 +3714,6 @@ function parseReportCompareCliArguments(argv) {
3621
3714
  scoreKind: cli.flags.scoreKind
3622
3715
  };
3623
3716
  }
3624
- function indexRecordsByCaseKey(records, caseKey, side) {
3625
- const indexed = /* @__PURE__ */ new Map();
3626
- for (const record of records) {
3627
- const resolved = resolveCaseKey(record, caseKey);
3628
- if (indexed.has(resolved)) throw new Error(`Duplicate case key "${resolved}" in ${side} report.`);
3629
- indexed.set(resolved, record);
3630
- }
3631
- return indexed;
3632
- }
3633
3717
  function resolveCaseKey(record, caseKey) {
3634
3718
  if (caseKey != null) {
3635
3719
  const resolved = getCaseSelectorValue(record, caseKey);
@@ -3641,90 +3725,6 @@ function resolveCaseKey(record, caseKey) {
3641
3725
  const vievalCaseId = getCaseSelectorValue(record, "vieval.case.id");
3642
3726
  return vievalCaseId.exists ? String(vievalCaseId.value) : record.caseId;
3643
3727
  }
3644
- function getScore(record, scoreKind) {
3645
- return record.scores[scoreKind] ?? 0;
3646
- }
3647
- function averageScore(records, scoreKind) {
3648
- const values = records.map((record) => record.scores[scoreKind]).filter((value) => typeof value === "number");
3649
- if (values.length === 0) return 0;
3650
- return values.reduce((sum, value) => sum + value, 0) / values.length;
3651
- }
3652
- function diffMetrics(left, right) {
3653
- const changed = {};
3654
- const metricKeys = [...new Set([...Object.keys(left), ...Object.keys(right)])].sort((leftKey, rightKey) => leftKey.localeCompare(rightKey));
3655
- for (const metricKey of metricKeys) if (stableStringify(left[metricKey]) !== stableStringify(right[metricKey])) changed[metricKey] = {
3656
- left: left[metricKey],
3657
- right: right[metricKey]
3658
- };
3659
- return changed;
3660
- }
3661
- function buildComparisonGroups(cases, groupBy) {
3662
- const groupedRows = {};
3663
- for (const row of cases) {
3664
- const resolved = getCaseSelectorValue(row.right, groupBy);
3665
- if (!resolved.exists) continue;
3666
- const groupKey = `${groupBy}=${String(resolved.value)}`;
3667
- groupedRows[groupKey] ??= [];
3668
- groupedRows[groupKey].push(row);
3669
- }
3670
- return Object.fromEntries(Object.entries(groupedRows).sort(([left], [right]) => left.localeCompare(right)).map(([groupKey, rows]) => {
3671
- const leftAverage = rows.reduce((sum, row) => sum + row.delta.left, 0) / rows.length;
3672
- const rightAverage = rows.reduce((sum, row) => sum + row.delta.right, 0) / rows.length;
3673
- return [groupKey, {
3674
- count: rows.length,
3675
- delta: rightAverage - leftAverage,
3676
- leftAverage,
3677
- rightAverage
3678
- }];
3679
- }));
3680
- }
3681
- function compareCaseRecords(left, right) {
3682
- return left.caseId.localeCompare(right.caseId);
3683
- }
3684
- /**
3685
- * Formats a case comparison as a compact human-readable table.
3686
- *
3687
- * Use when:
3688
- * - `vieval report compare` should expose the same information as JSON output
3689
- * - users need a terminal-first overview of group and per-case deltas
3690
- *
3691
- * Expects:
3692
- * - comparison output was produced by {@link buildCaseComparison}
3693
- *
3694
- * Returns:
3695
- * - multi-line text containing aggregate, group, top-change, case, and unmatched summaries
3696
- */
3697
- function formatCaseComparisonTable(output) {
3698
- const lines = [
3699
- "COMPARE vieval report cases",
3700
- `Matched ${output.cases.length}`,
3701
- `Added ${output.added.length}`,
3702
- `Removed ${output.removed.length}`,
3703
- `Scores left=${output.overall.leftAverage.toFixed(3)} right=${output.overall.rightAverage.toFixed(3)} delta=${output.overall.delta.toFixed(3)}`
3704
- ];
3705
- if (output.groups != null && Object.keys(output.groups).length > 0) {
3706
- lines.push("Groups");
3707
- for (const [groupKey, group] of Object.entries(output.groups)) lines.push(`${groupKey} count=${group.count} left=${group.leftAverage.toFixed(3)} right=${group.rightAverage.toFixed(3)} delta=${group.delta.toFixed(3)}`);
3708
- }
3709
- if (output.topImprovements.length > 0) {
3710
- lines.push("Top improvements");
3711
- for (const row of output.topImprovements) lines.push(`${row.caseKey} delta=${row.delta.score.toFixed(3)} left=${row.delta.left.toFixed(3)} right=${row.delta.right.toFixed(3)}`);
3712
- }
3713
- if (output.topRegressions.length > 0) {
3714
- lines.push("Top regressions");
3715
- for (const row of output.topRegressions) lines.push(`${row.caseKey} delta=${row.delta.score.toFixed(3)} left=${row.delta.left.toFixed(3)} right=${row.delta.right.toFixed(3)}`);
3716
- }
3717
- if (output.cases.length > 0) {
3718
- lines.push("Cases");
3719
- for (const row of output.cases) {
3720
- const changedMetricNames = Object.keys(row.metricsChanged);
3721
- lines.push(`${row.caseKey} delta=${row.delta.score.toFixed(3)} changedMetrics=${changedMetricNames.length === 0 ? "none" : changedMetricNames.join(",")}`);
3722
- }
3723
- }
3724
- if (output.added.length > 0) lines.push(`Added cases ${output.added.map((record) => record.caseId).join(",")}`);
3725
- if (output.removed.length > 0) lines.push(`Removed cases ${output.removed.map((record) => record.caseId).join(",")}`);
3726
- return lines.join("\n");
3727
- }
3728
3728
  //#endregion
3729
3729
  //#region src/cli/report-index.ts
3730
3730
  const reportIndexHelpText = `
@@ -3737,12 +3737,6 @@ const reportIndexHelpText = `
3737
3737
  --output Output file path (default: <reportPath>/index/runs.jsonl)
3738
3738
  --format Console output format: table | json | jsonl (default: table)
3739
3739
  `;
3740
- function normalizeCliArgv$1(argv) {
3741
- const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
3742
- if (normalizedArgv[0] === "report" && normalizedArgv[1] === "index") return normalizedArgv.slice(2);
3743
- if (normalizedArgv[0] === "index") return normalizedArgv.slice(1);
3744
- return normalizedArgv;
3745
- }
3746
3740
  function parseReportIndexCliArguments(argv) {
3747
3741
  const cli = meow(reportIndexHelpText, {
3748
3742
  argv: normalizeCliArgv$1(argv),
@@ -3764,25 +3758,6 @@ function parseReportIndexCliArguments(argv) {
3764
3758
  reportPath
3765
3759
  };
3766
3760
  }
3767
- async function writeIndexFile(parsed) {
3768
- const rows = (await readReportArtifacts(parsed.reportPath)).map((artifact) => summarizeReportRunArtifact(artifact));
3769
- const indexFilePath = resolve(parsed.output ?? resolve(parsed.reportPath, "index", "runs.jsonl"));
3770
- await mkdir(dirname(indexFilePath), { recursive: true });
3771
- const indexContents = rows.map((row) => JSON.stringify(row)).join("\n");
3772
- await writeFile(indexFilePath, `${indexContents}${indexContents.length > 0 ? "\n" : ""}`, "utf-8");
3773
- return {
3774
- indexFilePath,
3775
- indexedRunCount: rows.length,
3776
- rows
3777
- };
3778
- }
3779
- function formatTableOutput(output) {
3780
- return [
3781
- "INDEX vieval report",
3782
- `Path ${output.indexFilePath}`,
3783
- `Run count ${output.indexedRunCount}`
3784
- ].join("\n");
3785
- }
3786
3761
  async function runReportIndexCli(argv) {
3787
3762
  try {
3788
3763
  const parsed = parseReportIndexCliArguments(argv);
@@ -3803,6 +3778,31 @@ async function runReportIndexCli(argv) {
3803
3778
  process.exitCode = 1;
3804
3779
  }
3805
3780
  }
3781
+ function formatTableOutput(output) {
3782
+ return [
3783
+ "INDEX vieval report",
3784
+ `Path ${output.indexFilePath}`,
3785
+ `Run count ${output.indexedRunCount}`
3786
+ ].join("\n");
3787
+ }
3788
+ function normalizeCliArgv$1(argv) {
3789
+ const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
3790
+ if (normalizedArgv[0] === "report" && normalizedArgv[1] === "index") return normalizedArgv.slice(2);
3791
+ if (normalizedArgv[0] === "index") return normalizedArgv.slice(1);
3792
+ return normalizedArgv;
3793
+ }
3794
+ async function writeIndexFile(parsed) {
3795
+ const rows = (await readReportArtifacts(parsed.reportPath)).map((artifact) => summarizeReportRunArtifact(artifact));
3796
+ const indexFilePath = resolve(parsed.output ?? resolve(parsed.reportPath, "index", "runs.jsonl"));
3797
+ await mkdir(dirname(indexFilePath), { recursive: true });
3798
+ const indexContents = rows.map((row) => JSON.stringify(row)).join("\n");
3799
+ await writeFile(indexFilePath, `${indexContents}${indexContents.length > 0 ? "\n" : ""}`, "utf-8");
3800
+ return {
3801
+ indexedRunCount: rows.length,
3802
+ indexFilePath,
3803
+ rows
3804
+ };
3805
+ }
3806
3806
  //#endregion
3807
3807
  //#region src/cli/index.ts
3808
3808
  const topLevelHelpText = `
@@ -3823,9 +3823,6 @@ const topLevelHelpText = `
3823
3823
  $ vieval report analyze .vieval/reports/my-run
3824
3824
  $ vieval report index .vieval/reports --output .vieval/reports/index/runs.jsonl
3825
3825
  `;
3826
- function normalizeCliArgv(argv) {
3827
- return argv[0] === "--" ? argv.slice(1) : [...argv];
3828
- }
3829
3826
  /**
3830
3827
  * Parses top-level `vieval` CLI arguments into one command dispatch payload.
3831
3828
  *
@@ -3843,9 +3840,9 @@ function parseTopLevelCliArguments(argv) {
3843
3840
  const normalizedArgv = normalizeCliArgv(argv);
3844
3841
  const command = normalizedArgv[0];
3845
3842
  meow(topLevelHelpText, {
3843
+ argv: normalizedArgv,
3846
3844
  autoHelp: false,
3847
3845
  autoVersion: false,
3848
- argv: normalizedArgv,
3849
3846
  importMeta: import.meta
3850
3847
  });
3851
3848
  if (command == null || command === "help" || command === "--help" || command === "-h") return {
@@ -3909,7 +3906,10 @@ async function runTopLevelCli(argv) {
3909
3906
  }
3910
3907
  await runEvalRunCli(parsed.commandArgv);
3911
3908
  }
3909
+ function normalizeCliArgv(argv) {
3910
+ return argv[0] === "--" ? argv.slice(1) : [...argv];
3911
+ }
3912
3912
  //#endregion
3913
3913
  export { runTopLevelCli as n, parseTopLevelCliArguments as t };
3914
3914
 
3915
- //# sourceMappingURL=cli-DTDgaqeI.mjs.map
3915
+ //# sourceMappingURL=cli-uzS81IPd.mjs.map