vieval 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +219 -109
- package/dist/bin/vieval.mjs +1 -1
- package/dist/cli/index.mjs +1 -1
- package/dist/{cli-DayPXzHX.mjs → cli-ImxGpoYQ.mjs} +1447 -195
- package/dist/cli-ImxGpoYQ.mjs.map +1 -0
- package/dist/config.d.mts +2 -2
- package/dist/config.mjs +1 -1
- package/dist/core/assertions/index.d.mts +1 -1
- package/dist/core/inference-executors/index.d.mts +1 -1
- package/dist/core/inference-executors/index.mjs +1 -1
- package/dist/core/processors/results/index.d.mts +1 -1
- package/dist/core/runner/index.d.mts +3 -2
- package/dist/core/runner/index.mjs +3 -2
- package/dist/core/runner/index.mjs.map +1 -1
- package/dist/core/scheduler/index.d.mts +2 -0
- package/dist/core/scheduler/index.mjs +188 -0
- package/dist/core/scheduler/index.mjs.map +1 -0
- package/dist/{env-BFSjny07.mjs → env--94B0UtW.mjs} +1 -1
- package/dist/{env-BFSjny07.mjs.map → env--94B0UtW.mjs.map} +1 -1
- package/dist/{env-BTq3dV7C.d.mts → env-BeHv_5mo.d.mts} +1 -1
- package/dist/{expect-extensions-QLXESWjn.mjs → expect-extensions-DCSqlneN.mjs} +1 -1
- package/dist/{expect-extensions-QLXESWjn.mjs.map → expect-extensions-DCSqlneN.mjs.map} +1 -1
- package/dist/expect.mjs +1 -1
- package/dist/{index-OEdqjQSe.d.mts → index-5R1_k2nv.d.mts} +195 -3
- package/dist/index-fakXoZEe.d.mts +147 -0
- package/dist/index.d.mts +120 -13
- package/dist/index.mjs +286 -54
- package/dist/index.mjs.map +1 -1
- package/dist/{models-D_MsBtYw.mjs → models-DIGdOUpJ.mjs} +1 -1
- package/dist/models-DIGdOUpJ.mjs.map +1 -0
- package/dist/plugins/chat-models/index.d.mts +27 -1
- package/dist/plugins/chat-models/index.mjs +29 -1
- package/dist/plugins/chat-models/index.mjs.map +1 -1
- package/dist/queue-DsZQkZO_.mjs +21 -0
- package/dist/queue-DsZQkZO_.mjs.map +1 -0
- package/dist/{registry-CwcMMjnZ.mjs → registry-BHGMxjpA.mjs} +164 -6
- package/dist/registry-BHGMxjpA.mjs.map +1 -0
- package/dist/testing/expect-extensions.mjs +1 -1
- package/package.json +8 -1
- package/dist/cli-DayPXzHX.mjs.map +0 -1
- package/dist/models-D_MsBtYw.mjs.map +0 -1
- package/dist/registry-CwcMMjnZ.mjs.map +0 -1
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { c as
|
|
1
|
+
import { a as createOpenTelemetryRuntime, c as detectCliConfigMode, d as loadVievalCliConfig, n as consumeModuleRegistrations, o as createNoopTelemetryRuntime, r as endModuleRegistration, t as beginModuleRegistration, u as loadRawVievalConfig } from "./registry-BHGMxjpA.mjs";
|
|
2
|
+
import { createSchedulerRuntime } from "./core/scheduler/index.mjs";
|
|
2
3
|
import { RunnerExecutionError, collectEvalEntries, createFilesystemTaskCacheRuntime, createRunnerRuntimeContext, createRunnerSchedule, createTaskExecutionContext, runScheduledTasks } from "./core/runner/index.mjs";
|
|
3
4
|
import process from "node:process";
|
|
4
5
|
import { errorMessageFrom } from "@moeru/std";
|
|
@@ -12,6 +13,7 @@ import c from "tinyrainbow";
|
|
|
12
13
|
import { existsSync, readFileSync } from "node:fs";
|
|
13
14
|
import { uniq } from "es-toolkit";
|
|
14
15
|
import { createVitest } from "vitest/node";
|
|
16
|
+
import { formatDuration, intervalToDuration } from "date-fns";
|
|
15
17
|
import { stripVTControlCharacters } from "node:util";
|
|
16
18
|
import stringWidth from "fast-string-width";
|
|
17
19
|
//#region src/cli/comparison-config.ts
|
|
@@ -262,6 +264,612 @@ async function loadEvalModulesWithVitestRuntime(evalFilePaths, projectRoot) {
|
|
|
262
264
|
return loadedModules;
|
|
263
265
|
}
|
|
264
266
|
//#endregion
|
|
267
|
+
//#region src/cli/report-selectors.ts
|
|
268
|
+
/**
|
|
269
|
+
* Resolves a generic case selector from metrics, scores, then direct fields.
|
|
270
|
+
*
|
|
271
|
+
* Use when:
|
|
272
|
+
* - report commands accept benchmark-neutral selectors such as `benchmark.case.id`
|
|
273
|
+
* - comparisons need the same lookup semantics as filtering and grouping
|
|
274
|
+
*
|
|
275
|
+
* Expects:
|
|
276
|
+
* - `key` is a direct `CaseRecord` field, score key, `scores.<key>`, or metric key
|
|
277
|
+
*
|
|
278
|
+
* Returns:
|
|
279
|
+
* - existence flag plus matched value when present
|
|
280
|
+
*/
|
|
281
|
+
function getCaseSelectorValue(record, key) {
  const found = (value) => ({ exists: true, value });
  const { metrics, scores } = record;

  // Metrics take precedence over every other lookup location.
  if (Object.hasOwn(metrics, key)) {
    return found(metrics[key]);
  }

  // `scores.<kind>` addresses a score explicitly; 7 === "scores.".length.
  if (key.startsWith("scores.")) {
    const scoreKind = key.slice(7);
    if (Object.hasOwn(scores, scoreKind)) {
      return found(scores[scoreKind]);
    }
  }

  // A bare score kind is tried before own fields of the record itself.
  if (Object.hasOwn(scores, key)) {
    return found(scores[key]);
  }
  if (Object.hasOwn(record, key)) {
    return found(record[key]);
  }
  return { exists: false };
}
|
|
300
|
+
/**
|
|
301
|
+
* Stable-stringifies JSON-like values for report comparisons.
|
|
302
|
+
*
|
|
303
|
+
* Before:
|
|
304
|
+
* - `{ b: 1, a: true }`
|
|
305
|
+
*
|
|
306
|
+
* After:
|
|
307
|
+
* - `{"a":true,"b":1}`
|
|
308
|
+
*/
|
|
309
|
+
function stableStringify(value) {
  // Primitives (and null/undefined) defer to JSON.stringify; note that it
  // yields `undefined` (not a string) for undefined, functions, and symbols.
  if (value == null || typeof value !== "object") return JSON.stringify(value);

  if (Array.isArray(value)) {
    // Array.from visits holes too; unserializable items become `null`,
    // matching JSON.stringify, so the output is always valid JSON.
    const items = Array.from(value, (item) => stableStringify(item) ?? "null");
    return `[${items.join(",")}]`;
  }

  const members = [];
  const sortedKeys = Object.keys(value).sort((left, right) => left.localeCompare(right));
  for (const key of sortedKeys) {
    const serialized = stableStringify(value[key]);
    // Omit members that have no JSON representation, as JSON.stringify does,
    // instead of emitting the literal text `undefined`.
    if (serialized !== undefined) members.push(`${JSON.stringify(key)}:${serialized}`);
  }
  return `{${members.join(",")}}`;
}
|
|
315
|
+
//#endregion
|
|
316
|
+
//#region src/cli/report-otlp.ts
|
|
317
|
+
/**
|
|
318
|
+
* Builds local OTLP-shaped JSON projections from normalized case records.
|
|
319
|
+
*
|
|
320
|
+
* Use when:
|
|
321
|
+
* - writing deterministic report artifacts without requiring an OpenTelemetry Collector
|
|
322
|
+
* - future tools need trace/log/metric-shaped JSON files
|
|
323
|
+
*
|
|
324
|
+
* Expects:
|
|
325
|
+
* - records belong to one Vieval run
|
|
326
|
+
*
|
|
327
|
+
* Returns:
|
|
328
|
+
* - trace, log, and metric containers shaped after OTLP JSON concepts
|
|
329
|
+
*/
|
|
330
|
+
function buildLocalOtlpProjection(args) {
  const { records, runId } = args;

  // One span per distinct project, in sorted project order.
  const projectSpans = [];
  for (const projectName of collectProjectNames(records)) {
    projectSpans.push({
      attributes: toAttributes({
        "vieval.project.name": projectName,
        "vieval.run.id": runId
      }),
      name: "vieval.project"
    });
  }

  // One span per distinct (project, task) pair.
  const taskSpans = [];
  for (const task of collectTasks(records)) {
    taskSpans.push({
      attributes: toAttributes({
        "vieval.project.name": task.projectName,
        "vieval.run.id": runId,
        "vieval.task.id": task.taskId
      }),
      name: "vieval.task"
    });
  }

  // One span per case record; the vieval.* keys are listed after the metrics
  // spread so they win when a metric uses the same name.
  const caseSpans = [];
  for (const record of records) {
    caseSpans.push({
      attributes: toAttributes({
        ...record.metrics,
        "vieval.case.duration_ms": record.durationMs,
        "vieval.case.id": record.caseId,
        "vieval.case.name": record.caseName,
        "vieval.case.retry_count": record.retryCount,
        "vieval.case.state": record.state,
        "vieval.project.name": record.projectName,
        "vieval.task.id": record.taskId
      }),
      endTimeUnixNano: isoToUnixNano(record.endedAt),
      name: "vieval.case",
      startTimeUnixNano: isoToUnixNano(record.startedAt)
    });
  }

  // One log record per case, carrying id/scores/state as a JSON string body.
  const logRecords = [];
  for (const record of records) {
    const body = JSON.stringify({
      caseId: record.caseId,
      scores: record.scores,
      state: record.state
    });
    logRecords.push({
      attributes: toAttributes(record.metrics),
      body: { stringValue: body },
      eventName: "vieval.case",
      timeUnixNano: isoToUnixNano(record.endedAt)
    });
  }

  // One gauge per score kind, with a data point for each record that has a
  // numeric score of that kind.
  const scoreMetrics = [];
  for (const kind of collectScoreKinds(records)) {
    const dataPoints = [];
    for (const record of records) {
      if (typeof record.scores[kind] !== "number") continue;
      dataPoints.push({
        asDouble: record.scores[kind],
        attributes: toAttributes({
          ...record.metrics,
          "vieval.case.id": record.caseId,
          "vieval.task.id": record.taskId
        }),
        timeUnixNano: isoToUnixNano(record.endedAt)
      });
    }
    scoreMetrics.push({
      gauge: { dataPoints },
      name: `vieval.score.${kind}`
    });
  }

  const runSpan = {
    attributes: toAttributes({ "vieval.run.id": runId }),
    name: "vieval.run"
  };

  return {
    logs: { resourceLogs: [{ scopeLogs: [{
      logRecords,
      scope: { name: "vieval" }
    }] }] },
    metrics: { resourceMetrics: [{ scopeMetrics: [{
      metrics: scoreMetrics,
      scope: { name: "vieval" }
    }] }] },
    traces: { resourceSpans: [{ scopeSpans: [{
      scope: { name: "vieval" },
      spans: [runSpan, ...projectSpans, ...taskSpans, ...caseSpans]
    }] }] }
  };
}
|
|
404
|
+
/**
 * Converts a plain object into a key-sorted list of `{ key, value }`
 * attribute entries, dropping entries whose value is `undefined`.
 */
function toAttributes(attributes) {
  const definedEntries = Object.entries(attributes).filter((entry) => entry[1] !== void 0);
  // `filter` returned a fresh array, so sorting it in place is safe.
  definedEntries.sort((left, right) => left[0].localeCompare(right[0]));
  return definedEntries.map(([key, value]) => ({
    key,
    value: toAnyValue(value)
  }));
}
|
|
410
|
+
/**
 * Wraps a value in a tagged container (`arrayValue`, `boolValue`,
 * `doubleValue`, or `stringValue`).
 */
function toAnyValue(value) {
  if (Array.isArray(value)) {
    const values = value.map((item) => toAnyValue(item));
    return { arrayValue: { values } };
  }
  // Anything that is neither an array nor a scalar is stored as stable JSON text.
  if (!isAttributeScalar(value)) {
    return { stringValue: stableStringify(value) };
  }
  switch (typeof value) {
    case "boolean":
      return { boolValue: value };
    case "number":
      // NaN and +/-Infinity are kept as text rather than as a double.
      return Number.isFinite(value) ? { doubleValue: value } : { stringValue: String(value) };
    case "string":
      return { stringValue: value };
    default:
      // Only null/undefined remain among the scalar inputs.
      return { stringValue: "null" };
  }
}
|
|
420
|
+
/**
 * Reports whether a value is null/undefined, a boolean, a number, or a string.
 */
function isAttributeScalar(value) {
  if (value == null) return true;
  return ["boolean", "number", "string"].includes(typeof value);
}
|
|
423
|
+
/**
 * Converts a timestamp string to Unix nanoseconds, returned as a decimal string.
 *
 * Timestamps shaped like `YYYY-MM-DDTHH:MM:SS[.fraction](Z|+HH:MM)` keep up to
 * nine fractional digits; anything else is parsed with `Date.parse` at
 * millisecond precision. Unparseable input yields `"0"`.
 */
function isoToUnixNano(value) {
  const match = /^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.(\d{1,9}))?(Z|[+-]\d{2}:\d{2})$/.exec(value);

  if (match == null) {
    const parsedMs = Date.parse(value);
    return Number.isFinite(parsedMs) ? String(BigInt(parsedMs) * 1000000n) : "0";
  }

  const secondsPart = match[1];
  const fraction = match[2] ?? "";
  const zone = match[3];

  // Parse at whole-second precision, then add the fraction back as
  // nanoseconds so digits beyond milliseconds are not lost.
  const wholeSecondMs = Date.parse(`${secondsPart}.000${zone}`);
  if (!Number.isFinite(wholeSecondMs)) return "0";

  const subSecondNanos = BigInt(fraction.padEnd(9, "0").slice(0, 9));
  return String(BigInt(wholeSecondMs) * 1000000n + subSecondNanos);
}
|
|
435
|
+
/**
 * Returns the distinct score kinds present across all records, sorted.
 */
function collectScoreKinds(records) {
  const kinds = new Set();
  for (const record of records) {
    for (const kind of Object.keys(record.scores)) {
      kinds.add(kind);
    }
  }
  return Array.from(kinds).sort((left, right) => left.localeCompare(right));
}
|
|
438
|
+
/**
 * Returns the distinct project names referenced by the records, sorted.
 */
function collectProjectNames(records) {
  const names = new Set();
  for (const record of records) {
    names.add(record.projectName);
  }
  return Array.from(names).sort((left, right) => left.localeCompare(right));
}
|
|
441
|
+
/**
 * Returns the distinct `{ projectName, taskId }` pairs found in the records,
 * ordered by project name and then by task id.
 */
function collectTasks(records) {
  const tasksByKey = new Map();
  for (const { projectName, taskId } of records) {
    // NUL separates the two parts of the composite dedupe key.
    tasksByKey.set(`${projectName}\0${taskId}`, { projectName, taskId });
  }
  const byProjectThenTask = (left, right) =>
    left.projectName.localeCompare(right.projectName) || left.taskId.localeCompare(right.taskId);
  return Array.from(tasksByKey.values()).sort(byProjectThenTask);
}
|
|
452
|
+
//#endregion
|
|
453
|
+
//#region src/cli/report-records.ts
|
|
454
|
+
/**
|
|
455
|
+
* Builds normalized case records from lifecycle, metric, and score events.
|
|
456
|
+
*
|
|
457
|
+
* Use when:
|
|
458
|
+
* - `events.jsonl` should be projected into `cases.jsonl`
|
|
459
|
+
* - report commands need one final record per observed case outcome
|
|
460
|
+
*
|
|
461
|
+
* Expects:
|
|
462
|
+
* - events are ordered by occurrence where possible
|
|
463
|
+
* - lifecycle events use either `task.case.start`/`task.case.end` or current CLI `CaseStarted`/`CaseEnded` names
|
|
464
|
+
*
|
|
465
|
+
* Returns:
|
|
466
|
+
* - records for cases that emitted an end lifecycle event
|
|
467
|
+
*/
|
|
468
|
+
function buildCaseRecords(args) {
  const drafts = /* @__PURE__ */ new Map();
  // A Set keeps first-completion order like the previous array did, but makes
  // the "already completed?" check O(1) instead of a scan per end event.
  const completedKeys = /* @__PURE__ */ new Set();

  for (const event of args.events) {
    const normalizedEvent = normalizeCaseEventName(event.event);
    if (normalizedEvent == null) continue;

    const ids = extractEventIds(event, args);
    // Events that cannot be attributed to a task and case are ignored.
    if (ids.caseId.length === 0 || ids.taskId.length === 0) continue;

    const draft = getOrCreateDraft(drafts, ids, event, args);
    applyIdentity(draft, ids, event, args);

    switch (normalizedEvent) {
      case "start":
        applyCaseStart(draft, event);
        break;
      case "metric":
        applyCaseMetric(draft, event);
        break;
      case "score":
        applyCaseScore(draft, event);
        break;
      default:
        // "end" is the only remaining normalized event name.
        applyCaseEnd(draft, event);
        completedKeys.add(createCaseKey(ids.taskId, ids.caseId));
    }
  }

  const records = [];
  for (const key of completedKeys) {
    const draft = drafts.get(key);
    // A start event after the last end clears `endedAt`; such drafts are
    // still in flight and are not emitted.
    if (draft != null && draft.endedAt != null) records.push(toCaseRecord(draft));
  }
  return records;
}
|
|
489
|
+
/**
|
|
490
|
+
* Builds generic score summaries overall and grouped by arbitrary keys.
|
|
491
|
+
*
|
|
492
|
+
* Use when:
|
|
493
|
+
* - report artifacts need benchmark-neutral aggregate score views
|
|
494
|
+
* - callers want to group by metrics such as `benchmark.category` or direct record fields such as `taskId`
|
|
495
|
+
*
|
|
496
|
+
* Expects:
|
|
497
|
+
* - `groupByKeys` are stable metric names or direct `CaseRecord` field names
|
|
498
|
+
* - record score values are normalized numeric scores
|
|
499
|
+
*
|
|
500
|
+
* Returns:
|
|
501
|
+
* - overall score buckets and group buckets keyed by `<key>=<value>`
|
|
502
|
+
*/
|
|
503
|
+
function buildMetricsSummary(records, groupByKeys) {
  const overallBuckets = {};
  const groupBuckets = {};

  for (const record of records) {
    addRecordScores(overallBuckets, record);

    for (const groupByKey of groupByKeys) {
      const lookup = getGroupValue(record, groupByKey);
      if (lookup.exists) {
        // Bucket name has the form `<key>=<value>`.
        const bucketName = `${groupByKey}=${String(lookup.value)}`;
        if (groupBuckets[bucketName] == null) {
          groupBuckets[bucketName] = {};
        }
        addRecordScores(groupBuckets[bucketName], record);
      }
    }
  }

  return {
    groups: finalizeSummaryGroups(groupBuckets),
    overall: finalizeScoreSummary(overallBuckets)
  };
}
|
|
521
|
+
/**
|
|
522
|
+
* Encodes records as newline-delimited JSON.
|
|
523
|
+
*
|
|
524
|
+
* Use when:
|
|
525
|
+
* - writing `cases.jsonl` for command-line tools, dataframes, or streaming parsers
|
|
526
|
+
* - each record should occupy exactly one JSON line
|
|
527
|
+
*
|
|
528
|
+
* Expects:
|
|
529
|
+
* - records are JSON-serializable case records
|
|
530
|
+
*
|
|
531
|
+
* Returns:
|
|
532
|
+
* - one JSON object per line with a trailing newline for non-empty input
|
|
533
|
+
*/
|
|
534
|
+
function encodeJsonl(records) {
  const lines = records.map((record) => JSON.stringify(record));
  // Empty input produces an empty file rather than a lone newline.
  return lines.length === 0 ? "" : `${lines.join("\n")}\n`;
}
|
|
538
|
+
/**
 * Maps a raw event name onto "start", "metric", "score", or "end".
 * Returns `undefined` for any other event name.
 */
function normalizeCaseEventName(eventName) {
  switch (eventName) {
    case "task.case.start":
    case "CaseStarted":
      return "start";
    case "task.case.metric":
      return "metric";
    case "task.case.score":
      return "score";
    case "task.case.end":
    case "CaseEnded":
      return "end";
    default:
      return undefined;
  }
}
|
|
544
|
+
/**
 * Resolves the identity fields for an event. Each field prefers a string in
 * `event.data`, then the envelope field, then the supplied fallback
 * (`args.*`, or an empty string for `caseId`/`taskId`).
 */
function extractEventIds(event, args) {
  const data = asRecord(event.data);
  const pick = (field, fallback) => stringFrom(data?.[field]) ?? event[field] ?? fallback;
  return {
    attemptId: pick("attemptId", args.attemptId),
    caseId: pick("caseId", ""),
    experimentId: pick("experimentId", args.experimentId),
    // The project name additionally falls back to the envelope's projectId.
    projectName: stringFrom(data?.projectName) ?? event.projectName ?? event.projectId ?? args.projectName,
    runId: pick("runId", args.runId),
    taskId: pick("taskId", ""),
    workspaceId: pick("workspaceId", args.workspaceId)
  };
}
|
|
556
|
+
/**
 * Returns the draft stored for the event's task/case key, creating and
 * registering an empty one the first time the key is seen.
 */
function getOrCreateDraft(drafts, ids, event, args) {
  const key = createCaseKey(ids.taskId, ids.caseId);
  let draft = drafts.get(key);
  if (draft == null) {
    const { attemptId, caseId, experimentId, runId, taskId, workspaceId } = ids;
    draft = {
      attemptId,
      caseId,
      // Falls back to the case id when the event carries no name.
      caseName: extractCaseName(event) ?? caseId,
      experimentId,
      metrics: {},
      projectName: ids.projectName || args.projectName,
      retryCount: 0,
      runId,
      scores: {},
      startCount: 0,
      taskId,
      workspaceId
    };
    drafts.set(key, draft);
  }
  return draft;
}
|
|
577
|
+
/**
 * Refreshes the draft's identity fields from the latest event, using the
 * `args` value when the resolved id is empty. The project name only changes
 * when the event names a project explicitly.
 */
function applyIdentity(draft, ids, event, args) {
  Object.assign(draft, {
    attemptId: ids.attemptId || args.attemptId,
    experimentId: ids.experimentId || args.experimentId,
    projectName: extractExplicitProjectName(event) ?? draft.projectName,
    runId: ids.runId || args.runId,
    workspaceId: ids.workspaceId || args.workspaceId
  });
}
|
|
584
|
+
/**
 * Applies a case start event: counts the start, resets per-attempt state
 * (end time, input, metrics, output, scores, state), and updates the retry count.
 */
function applyCaseStart(draft, event) {
  const data = asRecord(event.data);
  draft.startCount += 1;
  draft.caseName = extractCaseName(event) ?? draft.caseName;

  // The first observed start time is kept across later starts.
  if (draft.startedAt == null) {
    draft.startedAt = stringFrom(data?.startedAt) ?? event.timestamp;
  }

  // Everything recorded for a previous attempt is discarded.
  draft.endedAt = void 0;
  draft.input = data != null && "input" in data ? data.input : void 0;
  draft.metrics = {};
  draft.output = void 0;
  draft.scores = {};
  draft.state = void 0;

  // Prefer the event's explicit retry index; otherwise infer it from the
  // number of starts seen so far.
  const observedRetries = numberFrom(data?.retryIndex) ?? draft.startCount - 1;
  draft.retryCount = Math.max(draft.retryCount, observedRetries);
}
|
|
603
|
+
/**
 * Stores a metric from the event on the draft when the event has a string
 * `name` and a value accepted by `isCaseMetricValue`.
 */
function applyCaseMetric(draft, event) {
  const data = asRecord(event.data);
  const metricName = stringFrom(data?.name);
  if (metricName == null) return;

  const metricValue = data?.value;
  if (!isCaseMetricValue(metricValue)) return;
  draft.metrics[metricName] = metricValue;
}
|
|
610
|
+
/**
 * Stores a score from the event on the draft. The kind is read from `kind`,
 * `name`, or `vieval.score.kind`; the value from `score`, `value`, or
 * `vieval.score.value` (first usable one wins). Incomplete events are ignored.
 */
function applyCaseScore(draft, event) {
  const data = asRecord(event.data);
  const firstUsable = (convert, fields) => {
    for (const field of fields) {
      const converted = convert(data?.[field]);
      if (converted != null) return converted;
    }
    return undefined;
  };

  const kind = firstUsable(stringFrom, ["kind", "name", "vieval.score.kind"]);
  const score = firstUsable(numberFrom, ["score", "value", "vieval.score.value"]);
  if (kind != null && score != null) {
    draft.scores[kind] = score;
  }
}
|
|
617
|
+
/**
 * Applies a case end event: records end time, output, and final state, and
 * fills in a default `exact` score when none was reported.
 */
function applyCaseEnd(draft, event) {
  const data = asRecord(event.data);

  const reportedName = extractCaseName(event);
  if (reportedName != null) {
    draft.caseName = reportedName;
  }

  draft.endedAt = stringFrom(data?.endedAt) ?? event.timestamp ?? draft.endedAt;

  if (data != null && "output" in data) {
    draft.output = data.output;
  }

  // Missing or unrecognized states are recorded as failures.
  draft.state = normalizeState(stringFrom(data?.state)) ?? "failed";

  if (draft.scores.exact == null) {
    draft.scores.exact = draft.state === "passed" ? 1 : 0;
  }
}
|
|
625
|
+
/**
 * Converts a draft into the final case record. `input` and `output` are only
 * present when defined; properties are inserted in a fixed order.
 */
function toCaseRecord(draft) {
  const startedAt = draft.startedAt ?? draft.endedAt ?? "";
  const endedAt = draft.endedAt ?? startedAt;

  const record = {
    attemptId: draft.attemptId,
    caseId: draft.caseId,
    caseName: draft.caseName,
    durationMs: calculateDurationMs(startedAt, endedAt),
    endedAt,
    experimentId: draft.experimentId
  };
  if (draft.input !== void 0) {
    record.input = draft.input;
  }
  record.metrics = draft.metrics;
  if (draft.output !== void 0) {
    record.output = draft.output;
  }
  return Object.assign(record, {
    projectName: draft.projectName,
    retryCount: draft.retryCount,
    runId: draft.runId,
    schemaVersion: 1,
    scores: draft.scores,
    startedAt,
    state: draft.state ?? "failed",
    taskId: draft.taskId,
    workspaceId: draft.workspaceId
  });
}
|
|
649
|
+
/**
 * Accumulates a record's finite scores into `summary`, one bucket per kind.
 *
 * @param summary - mutable map of score kind to `{ average, count, sum }`;
 *   `average` is left at 0 here
 * @param record - object whose `scores` maps kind to a numeric score
 */
function addRecordScores(summary, record) {
  for (const [kind, score] of Object.entries(record.scores)) {
    if (!Number.isFinite(score)) continue;

    // Only own properties count as existing buckets. A plain `summary[kind]`
    // lookup would see inherited members such as `constructor` or `toString`
    // and the counters would then be written onto those built-ins.
    if (!Object.hasOwn(summary, kind)) {
      // defineProperty creates an ordinary own property even for `__proto__`,
      // where plain assignment would replace the prototype instead.
      Object.defineProperty(summary, kind, {
        configurable: true,
        enumerable: true,
        value: {
          average: 0,
          count: 0,
          sum: 0
        },
        writable: true
      });
    }

    const bucket = summary[kind];
    bucket.count += 1;
    bucket.sum += score;
  }
}
|
|
661
|
+
/**
 * Finalizes every group's score summary, keeping the group keys unchanged.
 */
function finalizeSummaryGroups(groups) {
  const finalizedPairs = [];
  for (const [groupKey, summary] of Object.entries(groups)) {
    finalizedPairs.push([groupKey, finalizeScoreSummary(summary)]);
  }
  return Object.fromEntries(finalizedPairs);
}
|
|
664
|
+
/**
 * Returns a copy of the summary whose buckets have `average` computed as
 * `sum / count` (0 for an empty bucket).
 */
function finalizeScoreSummary(summary) {
  const finalizedPairs = [];
  for (const [kind, { count, sum }] of Object.entries(summary)) {
    const average = count === 0 ? 0 : sum / count;
    finalizedPairs.push([kind, { average, count, sum }]);
  }
  return Object.fromEntries(finalizedPairs);
}
|
|
671
|
+
/**
 * Resolves the value to group a record by: a metric of that name first,
 * otherwise an own field of the record holding a metric-compatible value.
 *
 * @returns `{ exists: true, value }` when found, otherwise `{ exists: false }`
 */
function getGroupValue(record, key) {
  if (Object.hasOwn(record.metrics, key)) {
    return {
      exists: true,
      value: record.metrics[key]
    };
  }

  // A key the record does not carry reads as `undefined`, which
  // isCaseMetricValue accepts as nullish. Require a defined own field so
  // absent keys are reported as missing instead of grouping under "undefined".
  if (!Object.hasOwn(record, key)) return { exists: false };
  const directValue = record[key];
  if (directValue === undefined || !isCaseMetricValue(directValue)) return { exists: false };

  return {
    exists: true,
    value: directValue
  };
}
|
|
682
|
+
/**
 * Reads a case name from the event data: `caseName` if it is a string,
 * otherwise `name` if it is a string, otherwise `undefined`.
 */
function extractCaseName(event) {
  const data = asRecord(event.data);
  const explicitName = stringFrom(data?.caseName);
  if (explicitName != null) return explicitName;
  return stringFrom(data?.name);
}
|
|
686
|
+
/**
 * Returns the project name the event itself names: a string
 * `data.projectName`, else the envelope's `projectName`, else its `projectId`.
 */
function extractExplicitProjectName(event) {
  const nameFromData = stringFrom(asRecord(event.data)?.projectName);
  if (nameFromData != null) return nameFromData;
  return event.projectName ?? event.projectId;
}
|
|
689
|
+
/**
 * Builds the composite map key for a case: task id and case id joined by NUL.
 */
function createCaseKey(taskId, caseId) {
  return "".concat(taskId, "\u0000", caseId);
}
|
|
692
|
+
/**
|
|
693
|
+
* Normalizes duration timestamps.
|
|
694
|
+
*
|
|
695
|
+
* Before:
|
|
696
|
+
* - `startedAt="2026-05-08T00:00:00.000Z"`, `endedAt="2026-05-08T00:00:01.250Z"`
|
|
697
|
+
* - `startedAt="bad"`, `endedAt="2026-05-08T00:00:01.250Z"`
|
|
698
|
+
*
|
|
699
|
+
* After:
|
|
700
|
+
* - `1250`
|
|
701
|
+
* - `0`
|
|
702
|
+
*/
|
|
703
|
+
function calculateDurationMs(startedAt, endedAt) {
  const startMs = Date.parse(startedAt);
  const endMs = Date.parse(endedAt);
  // The difference is NaN as soon as either timestamp failed to parse.
  const elapsedMs = endMs - startMs;
  if (!Number.isFinite(elapsedMs)) return 0;
  // Negative spans (end before start) are clamped to zero.
  return elapsedMs > 0 ? elapsedMs : 0;
}
|
|
709
|
+
/**
 * Returns the value when it is one of the known case states
 * ("failed", "passed", "skipped", "timeout"); otherwise `undefined`.
 */
function normalizeState(value) {
  const knownStates = ["failed", "passed", "skipped", "timeout"];
  return knownStates.includes(value) ? value : undefined;
}
|
|
712
|
+
/**
 * Reports whether a value may be stored as a case metric: null/undefined,
 * a boolean, a number, a string, or any array.
 */
function isCaseMetricValue(value) {
  if (value == null || Array.isArray(value)) return true;
  const kind = typeof value;
  return kind === "boolean" || kind === "number" || kind === "string";
}
|
|
716
|
+
/**
 * Returns the value when it is a non-null, non-array object; otherwise `undefined`.
 */
function asRecord(value) {
  const isObjectRecord = typeof value === "object" && value !== null && !Array.isArray(value);
  return isObjectRecord ? value : undefined;
}
|
|
720
|
+
/**
 * Returns the value when it is a string (including the empty string);
 * otherwise `undefined`.
 */
function stringFrom(value) {
  if (typeof value !== "string") return undefined;
  return value;
}
|
|
723
|
+
/**
 * Returns the value when it is a finite number; otherwise `undefined`.
 */
function numberFrom(value) {
  if (typeof value !== "number" || !Number.isFinite(value)) return undefined;
  return value;
}
|
|
726
|
+
//#endregion
|
|
727
|
+
//#region src/cli/report-artifacts.ts
|
|
728
|
+
/**
|
|
729
|
+
* Resolves one or more `run-summary.json` paths from a report location.
|
|
730
|
+
*
|
|
731
|
+
* Use when:
|
|
732
|
+
* - callers may pass a run directory, summary file path, or a report root
|
|
733
|
+
*
|
|
734
|
+
* Returns:
|
|
735
|
+
* - sorted absolute summary file paths
|
|
736
|
+
*/
|
|
737
|
+
async function resolveRunSummaryPaths(reportPath) {
  const absoluteReportPath = resolve(reportPath);

  // Case 1: the caller pointed directly at an existing `.json` file.
  if (absoluteReportPath.endsWith(".json") && existsSync(absoluteReportPath)) {
    return [absoluteReportPath];
  }

  // Case 2: the caller pointed at a directory that holds a summary itself.
  const directSummaryPath = resolve(absoluteReportPath, "run-summary.json");
  if (existsSync(directSummaryPath)) {
    return [directSummaryPath];
  }

  // Case 3: treat the path as a root and collect every summary beneath it.
  const discoveredPaths = await glob("**/run-summary.json", {
    absolute: true,
    cwd: absoluteReportPath
  });
  return discoveredPaths.sort((left, right) => left.localeCompare(right));
}
|
|
747
|
+
/**
|
|
748
|
+
* Reads one run report artifact set from `run-summary.json` and sibling `events.jsonl`.
|
|
749
|
+
*
|
|
750
|
+
* Use when:
|
|
751
|
+
* - report analysis needs both run aggregate output and event count metadata
|
|
752
|
+
*/
|
|
753
|
+
function readReportRunArtifact(summaryFilePath) {
  const reportDirectory = resolve(summaryFilePath, "..");
  const summary = JSON.parse(readFileSync(summaryFilePath, "utf-8"));
  const eventsFilePath = resolve(reportDirectory, "events.jsonl");

  // A missing events file yields an empty event list.
  const events = [];
  if (existsSync(eventsFilePath)) {
    for (const line of readFileSync(eventsFilePath, "utf-8").split("\n")) {
      if (line.trim().length === 0) continue;
      // Keep only the known envelope fields, in a fixed property order.
      const {
        attemptId,
        caseId,
        data,
        event,
        experimentId,
        projectId,
        projectName,
        runId,
        taskId,
        timestamp,
        workspaceId
      } = JSON.parse(line);
      events.push({
        attemptId,
        caseId,
        data,
        event,
        experimentId,
        projectId,
        projectName,
        runId,
        taskId,
        timestamp,
        workspaceId
      });
    }
  }

  return {
    events,
    eventsCount: events.length,
    reportDirectory,
    summary,
    summaryFilePath
  };
}
|
|
781
|
+
/**
|
|
782
|
+
* Reads all run artifacts found under `reportPath`.
|
|
783
|
+
*
|
|
784
|
+
* Use when:
|
|
785
|
+
* - callers need multi-run analysis from a directory root
|
|
786
|
+
*/
|
|
787
|
+
async function readReportArtifacts(reportPath) {
  const summaryFilePaths = await resolveRunSummaryPaths(reportPath);
  const artifacts = [];
  for (const summaryFilePath of summaryFilePaths) {
    artifacts.push(readReportRunArtifact(summaryFilePath));
  }
  return artifacts;
}
|
|
790
|
+
/**
|
|
791
|
+
* Creates a compact summary row for one run artifact.
|
|
792
|
+
*
|
|
793
|
+
* Use when:
|
|
794
|
+
* - table/csv/jsonl exports should stay stable and cheap to parse
|
|
795
|
+
*/
|
|
796
|
+
function summarizeReportRunArtifact(artifact) {
  const { summary } = artifact;
  const { projects } = summary;

  // Single pass over the projects instead of one traversal per statistic.
  let failedProjects = 0;
  let executedProjects = 0;
  let totalTasks = 0;
  const projectNames = [];
  for (const project of projects) {
    if (project.errorMessage != null) failedProjects += 1;
    if (project.executed) executedProjects += 1;
    totalTasks += project.taskCount;
    projectNames.push(project.name);
  }

  // Absent identity fields are reported as null.
  return {
    attemptId: summary.attemptId ?? null,
    eventsCount: artifact.eventsCount,
    executedProjects,
    experimentId: summary.experimentId ?? null,
    failedProjects,
    projectNames,
    reportDirectory: artifact.reportDirectory,
    runId: summary.runId ?? null,
    totalProjects: projects.length,
    totalTasks,
    workspaceId: summary.workspaceId ?? null
  };
}
|
|
816
|
+
/**
|
|
817
|
+
* Writes one complete local run report artifact set.
|
|
818
|
+
*
|
|
819
|
+
* Use when:
|
|
820
|
+
* - CLI runs need deterministic local artifacts under workspace/project/experiment/attempt/run
|
|
821
|
+
* - report commands need normalized case, metrics, and OTLP-shaped files
|
|
822
|
+
*
|
|
823
|
+
* Expects:
|
|
824
|
+
* - `events` are the same envelopes written to `events.jsonl`
|
|
825
|
+
* - `output` already contains run identity fields
|
|
826
|
+
*
|
|
827
|
+
* Returns:
|
|
828
|
+
* - absolute report directory path containing the written artifacts
|
|
829
|
+
*/
|
|
830
|
+
async function writeRunReportArtifacts(output, events, identity, reportOut) {
  const { attemptId, experimentId, runId, workspaceId } = identity;
  const projectId = deriveReportProjectId(output);
  const reportDirectory = resolve(reportOut, workspaceId, projectId, experimentId, attemptId, runId);

  // Small local helpers so every artifact is written the same way.
  const pathInReport = (...segments) => resolve(reportDirectory, ...segments);
  const writeText = (filePath, text) => writeFile(filePath, text, "utf-8");
  const toPrettyJson = (value) => `${JSON.stringify(value, null, 2)}\n`;

  await mkdir(reportDirectory, { recursive: true });

  // The persisted summary records where it was written.
  await writeText(pathInReport("run-summary.json"), toPrettyJson({
    ...output,
    reportDirectory
  }));

  // One event per line; the trailing newline is only added for non-empty input.
  const eventLines = events.map((event) => JSON.stringify(event)).join("\n");
  await writeText(pathInReport("events.jsonl"), events.length > 0 ? `${eventLines}\n` : eventLines);

  const caseRecords = buildCaseRecords({
    attemptId,
    events,
    experimentId,
    projectName: projectId,
    runId,
    workspaceId
  });
  const metricsSummary = buildMetricsSummary(caseRecords, []);
  const otlp = buildLocalOtlpProjection({
    records: caseRecords,
    runId
  });

  await writeText(pathInReport("cases.jsonl"), encodeJsonl(caseRecords));
  await writeText(pathInReport("metrics-summary.json"), toPrettyJson(metricsSummary));
  await mkdir(pathInReport("otlp"), { recursive: true });
  // The benchmark directory is created here but nothing is written into it.
  await mkdir(pathInReport("benchmark"), { recursive: true });
  await writeText(pathInReport("otlp", "traces.json"), toPrettyJson(otlp.traces));
  await writeText(pathInReport("otlp", "logs.json"), toPrettyJson(otlp.logs));
  await writeText(pathInReport("otlp", "metrics.json"), toPrettyJson(otlp.metrics));

  return reportDirectory;
}
|
|
862
|
+
/**
 * Derives the project path segment for a run: the sanitized project name when
 * the run covers exactly one distinct project, otherwise "multi-project".
 */
function deriveReportProjectId(output) {
  const distinctNames = new Set();
  for (const project of output.projects) {
    distinctNames.add(project.name);
  }
  if (distinctNames.size !== 1) return "multi-project";

  const [onlyName] = distinctNames;
  return sanitizeIdentitySegment$1(onlyName ?? "default-project");
}
|
|
867
|
+
/**
 * Normalizes a free-form identity value into a single safe path segment.
 *
 * Expects:
 * - `value` is a string
 *
 * Returns:
 * - `"default"` when the trimmed value is empty
 * - the trimmed value with every run of characters outside `[A-Za-z0-9_.-]` collapsed to `-`
 * - a dash-only segment when the result would consist solely of dots
 */
function sanitizeIdentitySegment$1(value) {
  const normalized = value.trim();
  if (normalized.length === 0) return "default";
  const sanitized = normalized.replace(/[^\w.-]+/g, "-");
  // Dots are allowed inside names ("v1.2"), but a dot-only segment such as "." or ".."
  // is path navigation once joined into a directory path, so neutralize it.
  if (/^\.+$/.test(sanitized)) return sanitized.replace(/\./g, "-");
  return sanitized;
}
|
|
872
|
+
//#endregion
|
|
265
873
|
//#region src/cli/reporters/noop-reporter.ts
|
|
266
874
|
/**
|
|
267
875
|
* Creates a reporter that intentionally does nothing.
|
|
@@ -378,12 +986,21 @@ var SummaryReporterStateMachine = class {
|
|
|
378
986
|
if (task.state === "finished") return;
|
|
379
987
|
task.state = "running";
|
|
380
988
|
task.startedAt ??= this.options.getNow();
|
|
381
|
-
if (task.settledCaseIds.has(payload.caseId)
|
|
989
|
+
if (task.settledCaseIds.has(payload.caseId)) return;
|
|
990
|
+
const existingCase = task.runningCases.get(payload.caseId);
|
|
991
|
+
if (existingCase != null) {
|
|
992
|
+
existingCase.autoRetry = payload.autoRetry;
|
|
993
|
+
existingCase.caseName = payload.caseName ?? payload.caseId;
|
|
994
|
+
existingCase.retryIndex = payload.retryIndex;
|
|
995
|
+
return;
|
|
996
|
+
}
|
|
382
997
|
task.caseOrderCounter += 1;
|
|
383
998
|
task.runningCases.set(payload.caseId, {
|
|
999
|
+
autoRetry: payload.autoRetry,
|
|
384
1000
|
caseId: payload.caseId,
|
|
385
1001
|
caseName: payload.caseName ?? payload.caseId,
|
|
386
1002
|
order: task.caseOrderCounter,
|
|
1003
|
+
retryIndex: payload.retryIndex,
|
|
387
1004
|
startedAt: this.options.getNow()
|
|
388
1005
|
});
|
|
389
1006
|
this.syncTaskTotalCases(task);
|
|
@@ -420,6 +1037,10 @@ var SummaryReporterStateMachine = class {
|
|
|
420
1037
|
this.caseCounters.failed += 1;
|
|
421
1038
|
return;
|
|
422
1039
|
}
|
|
1040
|
+
if (payload.state === "timeout") {
|
|
1041
|
+
this.caseCounters.timeout += 1;
|
|
1042
|
+
return;
|
|
1043
|
+
}
|
|
423
1044
|
this.caseCounters.skipped += 1;
|
|
424
1045
|
}
|
|
425
1046
|
/**
|
|
@@ -500,39 +1121,52 @@ var SummaryReporterStateMachine = class {
|
|
|
500
1121
|
const activeRows = this.createActiveRows();
|
|
501
1122
|
const footerRows = this.createFooterRows();
|
|
502
1123
|
const maxRows = options?.maxRows;
|
|
503
|
-
const
|
|
1124
|
+
const footerBlock = [...footerRows, ""];
|
|
1125
|
+
if (maxRows == null || maxRows <= 0) return [...[
|
|
504
1126
|
"",
|
|
505
1127
|
...activeRows,
|
|
506
1128
|
...activeRows.length > 0 ? [""] : []
|
|
507
|
-
];
|
|
508
|
-
const footerBlock = [...footerRows, ""];
|
|
509
|
-
if (maxRows == null || maxRows <= 0) return [...activeBlock, ...footerBlock];
|
|
1129
|
+
], ...footerBlock];
|
|
510
1130
|
if (maxRows <= footerBlock.length) return footerBlock.slice(-maxRows);
|
|
511
|
-
|
|
512
|
-
return [...activeBlock.slice(0, availableActiveRows), ...footerBlock];
|
|
1131
|
+
return [...createBoundedActiveBlock(activeRows, Math.max(0, maxRows - footerBlock.length)), ...footerBlock];
|
|
513
1132
|
}
|
|
514
1133
|
createActiveRows() {
|
|
515
1134
|
const activeTasks = Array.from(this.tasks.values()).filter((task) => task.state !== "finished").sort(compareActiveTasks);
|
|
516
1135
|
const rows = [];
|
|
517
1136
|
for (const task of activeTasks) {
|
|
518
|
-
const
|
|
1137
|
+
const now = this.options.getNow();
|
|
1138
|
+
const suffix = task.state === "queued" ? c.dim(" [queued]") : formatTaskProgressSuffix(task, now);
|
|
519
1139
|
const badge = formatProjectBadge(task.projectName, this.options.isTTY);
|
|
520
1140
|
rows.push(c.bold(c.yellow(` ${POINTER} `)) + badge + task.displayName + c.dim(suffix));
|
|
521
|
-
const slowCases = Array.from(task.runningCases.values()).filter((activeCase) =>
|
|
1141
|
+
const slowCases = Array.from(task.runningCases.values()).filter((activeCase) => now - activeCase.startedAt >= this.options.slowThresholdMs).sort((left, right) => left.order - right.order);
|
|
522
1142
|
for (const [index, activeCase] of slowCases.entries()) {
|
|
523
1143
|
const icon = index === slowCases.length - 1 ? TREE_NODE_END : TREE_NODE_MIDDLE;
|
|
524
|
-
const elapsed = Math.max(0,
|
|
525
|
-
rows.push(c.bold(c.yellow(` ${icon} `)) + activeCase.caseName + c.bold(c.yellow(` ${formatDuration$
|
|
1144
|
+
const elapsed = Math.max(0, now - activeCase.startedAt);
|
|
1145
|
+
rows.push(c.bold(c.yellow(` ${icon} `)) + activeCase.caseName + formatRetrySuffix(activeCase) + c.bold(c.yellow(` ${formatDuration$2(elapsed)}`)));
|
|
526
1146
|
}
|
|
527
1147
|
}
|
|
528
1148
|
return rows;
|
|
529
1149
|
}
|
|
530
1150
|
createFooterRows() {
|
|
1151
|
+
const now = this.options.getNow();
|
|
1152
|
+
const runElapsedDurationMs = Math.max(0, now - this.startedAtMs);
|
|
1153
|
+
const taskRunningCount = countRunningTasks(this.tasks.values());
|
|
1154
|
+
const caseRunningCount = countRunningCases(this.tasks.values());
|
|
531
1155
|
return [
|
|
532
|
-
padSummaryTitle("Tasks") + formatCounterState(this.taskCounters
|
|
533
|
-
|
|
1156
|
+
padSummaryTitle("Tasks") + formatCounterState(this.taskCounters, taskRunningCount, {
|
|
1157
|
+
elapsedDurationMs: runElapsedDurationMs,
|
|
1158
|
+
estimatedDurationMs: estimateTotalDurationMs(this.taskCounters.completed, this.taskCounters.total, runElapsedDurationMs)
|
|
1159
|
+
}),
|
|
1160
|
+
padSummaryTitle("Cases") + formatCounterState(this.caseCounters, caseRunningCount, {
|
|
1161
|
+
elapsedDurationMs: runElapsedDurationMs,
|
|
1162
|
+
estimatedDurationMs: estimateTotalDurationMs(this.caseCounters.completed, this.caseCounters.total, runElapsedDurationMs)
|
|
1163
|
+
}),
|
|
1164
|
+
padSummaryTitle("Concurrency") + formatActiveConcurrencyState({
|
|
1165
|
+
caseRunningCount,
|
|
1166
|
+
taskRunningCount
|
|
1167
|
+
}),
|
|
534
1168
|
padSummaryTitle("Start at") + this.startTime,
|
|
535
|
-
padSummaryTitle("Duration") +
|
|
1169
|
+
padSummaryTitle("Duration") + formatHumanDuration(runElapsedDurationMs)
|
|
536
1170
|
];
|
|
537
1171
|
}
|
|
538
1172
|
getOrCreateTaskState(taskId) {
|
|
@@ -563,6 +1197,39 @@ var SummaryReporterStateMachine = class {
|
|
|
563
1197
|
}
|
|
564
1198
|
};
|
|
565
1199
|
/**
 * Builds the block of active task rows so it never exceeds a row budget.
 *
 * Use when:
 * - more task/case rows are running than the live window can show
 * - omitted rows should be announced by a marker row rather than dropped silently
 *
 * Expects:
 * - `activeRows` holds already-formatted rows
 * - `maxRows` is the budget for the whole block, spacer rows and marker included
 *
 * Returns:
 * - an empty array when the budget is zero or negative
 * - a single spacer row when there are no active rows or the budget is exactly one
 * - spacer, all rows, spacer when everything fits
 * - otherwise spacer, the leading rows that fit, and a final "hidden rows" marker
 */
function createBoundedActiveBlock(activeRows, maxRows) {
  if (maxRows <= 0) return [];
  if (activeRows.length === 0) return [""];
  // Untruncated layout: leading spacer + rows + trailing spacer.
  const untruncatedLength = activeRows.length + 2;
  if (untruncatedLength <= maxRows) return ["", ...activeRows, ""];
  if (maxRows === 1) return [""];
  // Two rows of the budget are reserved for the leading spacer and the marker.
  const shownCount = Math.max(0, maxRows - 2);
  const omittedCount = Math.max(0, activeRows.length - shownCount);
  const shownRows = activeRows.slice(0, shownCount);
  const truncationMarker = c.dim(` ${TREE_NODE_END} ... ${omittedCount} more running rows hidden`);
  return ["", ...shownRows, truncationMarker];
}
|
|
1232
|
+
/**
|
|
566
1233
|
* Creates the live summary reporter state machine for `vieval` CLI runs.
|
|
567
1234
|
*
|
|
568
1235
|
* Use when:
|
|
@@ -593,6 +1260,7 @@ function createCounterState() {
|
|
|
593
1260
|
failed: 0,
|
|
594
1261
|
passed: 0,
|
|
595
1262
|
skipped: 0,
|
|
1263
|
+
timeout: 0,
|
|
596
1264
|
total: 0
|
|
597
1265
|
};
|
|
598
1266
|
}
|
|
@@ -601,6 +1269,7 @@ function resetCounterState(counter, total) {
|
|
|
601
1269
|
counter.failed = 0;
|
|
602
1270
|
counter.passed = 0;
|
|
603
1271
|
counter.skipped = 0;
|
|
1272
|
+
counter.timeout = 0;
|
|
604
1273
|
counter.total = total;
|
|
605
1274
|
}
|
|
606
1275
|
function sumTaskCaseTotals(tasks) {
|
|
@@ -619,19 +1288,48 @@ function compareActiveTasks(left, right) {
|
|
|
619
1288
|
/**
 * Formats the label column of a summary footer row.
 *
 * Returns the label padded to at least 8 characters, dimmed, followed by one space.
 * Labels longer than 8 characters are not truncated.
 */
function padSummaryTitle(label) {
  const paddedLabel = label.padEnd(8);
  const dimmedLabel = c.dim(paddedLabel);
  return `${dimmedLabel} `;
}
|
|
622
|
-
function formatCounterState(counter) {
|
|
1291
|
+
/**
 * Formats one counter row of the summary footer.
 *
 * Expects:
 * - `counter` exposes `total`, `completed`, `passed`, `failed`, `timeout` and `skipped`
 * - `runningCount` is the number of items currently running
 * - `timing` is forwarded unchanged to `formatTimingSuffix`
 *
 * Returns:
 * - the segments "planned | running | passed | failed | timeout | skipped", the total in
 *   parentheses, and the timing suffix
 */
function formatCounterState(counter, runningCount, timing) {
  // Zero counts are dimmed; positive counts use the segment's highlight color.
  const highlightIfPositive = (count, label, highlight) => {
    const text = `${count} ${label}`;
    return count > 0 ? highlight(text) : c.dim(text);
  };
  // Planned = everything neither finished nor currently running, never negative.
  const plannedCount = Math.max(0, counter.total - counter.completed - runningCount);
  const segments = [
    highlightIfPositive(plannedCount, "planned", (text) => c.bold(c.blue(text))),
    highlightIfPositive(runningCount, "running", (text) => c.bold(c.yellow(text))),
    c.bold(c.green(`${counter.passed} passed`)),
    highlightIfPositive(counter.failed, "failed", (text) => c.bold(c.red(text))),
    highlightIfPositive(counter.timeout, "timeout", (text) => c.bold(c.yellow(text))),
    highlightIfPositive(counter.skipped, "skipped", (text) => c.yellow(text))
  ];
  const separator = c.dim(" | ");
  const totalSuffix = c.gray(` (${counter.total})`);
  return segments.join(separator) + totalSuffix + formatTimingSuffix(timing);
}
|
|
1302
|
+
/**
 * Formats the "Concurrency" footer row from the running task and case counts.
 *
 * Positive counts are highlighted and pluralized; zero counts are dimmed.
 */
function formatActiveConcurrencyState(options) {
  let taskSegment;
  if (options.taskRunningCount > 0) {
    taskSegment = c.bold(c.yellow(`${options.taskRunningCount} ${pluralize("task", options.taskRunningCount)} running`));
  } else {
    taskSegment = c.dim("0 tasks running");
  }
  let caseSegment;
  if (options.caseRunningCount > 0) {
    caseSegment = c.bold(c.yellow(`${options.caseRunningCount} ${pluralize("case", options.caseRunningCount)} running`));
  } else {
    caseSegment = c.dim("0 cases running");
  }
  return [taskSegment, caseSegment].join(c.dim(" | "));
}
|
|
1305
|
+
/**
 * Returns `noun` unchanged for a count of exactly 1, otherwise `noun` with an "s" appended.
 */
function pluralize(noun, count) {
  if (count === 1) return noun;
  return `${noun}s`;
}
|
|
1308
|
+
/**
 * Formats the dimmed " retry <index>/<budget>" suffix for a running case row.
 *
 * Returns an empty string unless both `retryIndex` and `autoRetry` are present
 * and not less than or equal to zero.
 */
function formatRetrySuffix(activeCase) {
  const { retryIndex } = activeCase;
  // Index 0 (or a missing index) is the first attempt, not a retry.
  if (retryIndex == null || retryIndex <= 0) return "";
  const { autoRetry } = activeCase;
  if (autoRetry == null || autoRetry <= 0) return "";
  return c.dim(` retry ${retryIndex}/${autoRetry}`);
}
|
|
629
1312
|
/**
 * Returns the first space-separated token of `date.toTimeString()`
 * (the clock part), or an empty string when there is none.
 */
function formatTimeString(date) {
  const [clockPart] = date.toTimeString().split(" ");
  return clockPart ?? "";
}
|
|
632
|
-
function formatDuration$
|
|
633
|
-
|
|
634
|
-
|
|
1315
|
+
/**
 * Formats a millisecond duration for the live summary rows.
 *
 * Thin alias that delegates to `formatHumanDuration`.
 */
function formatDuration$2(durationMs) {
  const humanReadable = formatHumanDuration(durationMs);
  return humanReadable;
}
|
|
1318
|
+
/**
 * Formats a millisecond duration as short human-readable text.
 *
 * Expects:
 * - `durationMs` is a number of milliseconds
 *
 * Returns:
 * - `"<n>ms"` (rounded) for durations below one second
 * - otherwise the non-zero units among hours, minutes and seconds joined by spaces,
 *   e.g. `"1 hour 2 minutes 3 seconds"`; sub-second remainders are truncated
 * - `"0 seconds"` for non-finite input that is not below one second
 */
function formatHumanDuration(durationMs) {
  if (durationMs < 1e3) return `${Math.round(durationMs)}ms`;
  if (!Number.isFinite(durationMs)) return "0 seconds";
  // Units are derived arithmetically so the hour count keeps growing past 24
  // instead of rolling over into a day unit that is never printed, and so the
  // result does not depend on the local time zone.
  const totalSeconds = Math.floor(durationMs / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds % 3600 / 60);
  const seconds = totalSeconds % 60;
  const parts = [];
  if (hours > 0) parts.push(`${hours} ${hours === 1 ? "hour" : "hours"}`);
  if (minutes > 0) parts.push(`${minutes} ${minutes === 1 ? "minute" : "minutes"}`);
  if (seconds > 0) parts.push(`${seconds} ${seconds === 1 ? "second" : "seconds"}`);
  return parts.length > 0 ? parts.join(" ") : "0 seconds";
}
|
|
636
1334
|
function formatProjectBadge(projectName, isTTY) {
|
|
637
1335
|
if (projectName == null || projectName.length === 0) return "";
|
|
@@ -645,6 +1343,37 @@ function formatProjectBadge(projectName, isTTY) {
|
|
|
645
1343
|
const background = backgroundPool[projectName.split("").reduce((accumulator, character, index) => accumulator + character.charCodeAt(0) + index, 0) % backgroundPool.length];
|
|
646
1344
|
return `${c.black(background(` ${projectName} `))} `;
|
|
647
1345
|
}
|
|
1346
|
+
/**
 * Sums `runningCases.size` over every task in the given iterable.
 */
function countRunningCases(tasks) {
  return Array.from(tasks).reduce((runningTotal, task) => runningTotal + task.runningCases.size, 0);
}
|
|
1351
|
+
/**
 * Counts the tasks in the given iterable whose `state` is exactly `"running"`.
 */
function countRunningTasks(tasks) {
  const runningTasks = Array.from(tasks).filter((task) => task.state === "running");
  return runningTasks.length;
}
|
|
1356
|
+
/**
 * Estimates the total duration of one task from its progress so far.
 *
 * Returns `undefined` when the task has no `startedAt`; otherwise the result of
 * `estimateTotalDurationMs` for the task's completed/total case counts and the
 * non-negative time elapsed since it started.
 */
function estimateTaskDurationMs(task, now) {
  const { startedAt } = task;
  if (startedAt == null) return void 0;
  const elapsedSinceStart = Math.max(0, now - startedAt);
  return estimateTotalDurationMs(task.completedCases, task.totalCases, elapsedSinceStart);
}
|
|
1360
|
+
/**
 * Extrapolates a total duration from the average time per completed item.
 *
 * Returns:
 * - `undefined` when nothing has completed yet or nothing is planned
 * - otherwise `elapsedDurationMs / completedCount * totalCount`, rounded
 */
function estimateTotalDurationMs(completedCount, totalCount, elapsedDurationMs) {
  const nothingCompleted = completedCount === 0;
  const nothingPlanned = totalCount === 0;
  if (nothingCompleted || nothingPlanned) return void 0;
  const perItemDurationMs = elapsedDurationMs / completedCount;
  return Math.round(perItemDurationMs * totalCount);
}
|
|
1365
|
+
/**
 * Formats the progress suffix shown after a running task's name:
 * " <completed>/<total>, <n> case(s) running (elapsed ..., estimated ...)".
 *
 * Elapsed time is 0 when the task has no `startedAt`, and is never negative.
 */
function formatTaskProgressSuffix(task, now) {
  let elapsedDurationMs = 0;
  if (task.startedAt != null) elapsedDurationMs = Math.max(0, now - task.startedAt);
  const runningCaseCount = task.runningCases.size;
  const progressPart = `${task.completedCases}/${task.totalCases}`;
  const runningPart = `${runningCaseCount} ${pluralize("case", runningCaseCount)} running`;
  const timingPart = formatTimingSuffix({
    elapsedDurationMs,
    estimatedDurationMs: estimateTaskDurationMs(task, now)
  });
  return ` ${progressPart}, ${runningPart}${timingPart}`;
}
|
|
1372
|
+
/**
 * Formats the parenthesized timing suffix " (elapsed X[, estimated Y])".
 *
 * The estimate is omitted when `timing.estimatedDurationMs` is null or undefined.
 */
function formatTimingSuffix(timing) {
  const { elapsedDurationMs, estimatedDurationMs } = timing;
  const elapsedPart = `elapsed ${formatHumanDuration(elapsedDurationMs)}`;
  const segments = estimatedDurationMs == null
    ? [elapsedPart]
    : [elapsedPart, `estimated ${formatHumanDuration(estimatedDurationMs)}`];
  return ` (${segments.join(", ")})`;
}
|
|
648
1377
|
//#endregion
|
|
649
1378
|
//#region src/cli/reporters/index.ts
|
|
650
1379
|
/**
|
|
@@ -1000,7 +1729,7 @@ async function createVievalVitestCompatReporterBridge(options) {
|
|
|
1000
1729
|
return {
|
|
1001
1730
|
async onCaseEnd(payload) {
|
|
1002
1731
|
const taskCase = getOrCreateCase(payload.taskId, payload.caseId);
|
|
1003
|
-
taskCase.state = payload.state;
|
|
1732
|
+
taskCase.state = payload.state === "timeout" ? "failed" : payload.state;
|
|
1004
1733
|
await emitToReporters(loadedReporters, (reporter) => reporter.onTestCaseResult?.(taskCase));
|
|
1005
1734
|
},
|
|
1006
1735
|
async onCaseStart(payload) {
|
|
@@ -1047,17 +1776,68 @@ async function createVievalVitestCompatReporterBridge(options) {
|
|
|
1047
1776
|
/**
 * Reports whether any project in the run output counts as failed.
 *
 * A project fails when it has an error message, when its case summary reports
 * failed or timed-out cases, or when it lists at least one case failure.
 */
function hasRunFailures(output) {
  const isFailedProject = (project) => {
    if (project.errorMessage != null) return true;
    const summary = project.caseSummary;
    const summaryReportsFailure = summary != null && (summary.failed > 0 || summary.timeout > 0);
    if (summaryReportsFailure) return true;
    const recordedFailureCount = project.caseFailures?.length ?? 0;
    return recordedFailureCount > 0;
  };
  return output.projects.some(isFailedProject);
}
|
|
1054
|
-
function
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
return process.stdout.isTTY === true;
|
|
1783
|
+
/**
 * Resolves a concurrency limit where the CLI value can only lower the configured one.
 *
 * Returns:
 * - the configured value (or `fallback` when it is null/undefined) if no CLI value is given
 * - otherwise the smaller of that value and the CLI value
 */
function resolveCappedConcurrency(defaultConcurrency, cliConcurrency, fallback) {
  const baseline = defaultConcurrency ?? fallback;
  return cliConcurrency == null ? baseline : Math.min(baseline, cliConcurrency);
}
|
|
1060
|
-
function
|
|
1788
|
+
/**
 * Resolves an optional concurrency limit where the CLI value, when given,
 * replaces the configured one. May return null/undefined when neither is set.
 */
function resolveOptionalRuntimeTaskConcurrency(defaultConcurrency, cliConcurrency) {
  if (cliConcurrency != null) return cliConcurrency;
  return defaultConcurrency;
}
|
|
1791
|
+
/**
 * Resolves workspace-level concurrency: the configured `concurrency.workspace`
 * (default 1), capped by the CLI `workspaceConcurrency` option when given.
 */
function resolveWorkspaceConcurrency(loadedConfig, options) {
  const configuredLimit = loadedConfig.concurrency?.workspace;
  const cliLimit = options.workspaceConcurrency;
  return resolveCappedConcurrency(configuredLimit, cliLimit, 1);
}
|
|
1794
|
+
/**
 * Resolves project-level concurrency: the configured `concurrency.project`
 * (unbounded by default), capped by the CLI `projectConcurrency` option when given.
 */
function resolveProjectConcurrency(project, options) {
  const configuredLimit = project.concurrency?.project;
  const cliLimit = options.projectConcurrency;
  return resolveCappedConcurrency(configuredLimit, cliLimit, Number.POSITIVE_INFINITY);
}
|
|
1797
|
+
/**
 * Resolves task-level concurrency: the configured `concurrency.task`
 * (default 1), capped by the CLI `taskConcurrency` option when given.
 */
function resolveTaskConcurrency(project, options) {
  const configuredLimit = project.concurrency?.task;
  const cliLimit = options.taskConcurrency;
  return resolveCappedConcurrency(configuredLimit, cliLimit, 1);
}
|
|
1800
|
+
/**
 * Resolves how many tasks of a project may be scheduled at once:
 * the smaller of the project-level and task-level limits.
 */
function resolveScheduledTaskConcurrency(project, options) {
  const projectLimit = resolveProjectConcurrency(project, options);
  const taskLimit = resolveTaskConcurrency(project, options);
  return Math.min(projectLimit, taskLimit);
}
|
|
1803
|
+
/**
 * Resolves the attempt/case concurrency limits for one task.
 *
 * Each limit comes from the task's own setting, falling back to the project's,
 * and is replaced by the matching CLI option when that option is given.
 *
 * Returns `undefined` when neither limit resolves to a value, otherwise
 * `{ attempt, case }`.
 */
function resolveRuntimeTaskConcurrency(taskConcurrency, project, options) {
  const configuredAttempt = taskConcurrency?.attempt ?? project.concurrency?.attempt;
  const attempt = resolveOptionalRuntimeTaskConcurrency(configuredAttempt, options.attemptConcurrency);
  const configuredCase = taskConcurrency?.case ?? project.concurrency?.case;
  const caseConcurrency = resolveOptionalRuntimeTaskConcurrency(configuredCase, options.caseConcurrency);
  const hasAnyLimit = attempt != null || caseConcurrency != null;
  if (!hasAnyLimit) return void 0;
  return { attempt, case: caseConcurrency };
}
|
|
1812
|
+
/**
 * Returns a copy of a scheduled task whose task definition carries the resolved
 * runtime concurrency.
 *
 * The input task is returned as-is when its entry has no task definition.
 * Otherwise the task, its entry and its task definition are shallow-copied and
 * `concurrency` is overwritten with the resolved value (which may be `undefined`).
 */
function createScheduledTaskWithRuntimeConcurrency(task, project, options) {
  const { entry } = task;
  const taskDefinition = entry.task;
  if (taskDefinition == null) return task;
  const concurrency = resolveRuntimeTaskConcurrency(taskDefinition.concurrency, project, options);
  const runtimeTaskDefinition = { ...taskDefinition, concurrency };
  const runtimeEntry = { ...entry, task: runtimeTaskDefinition };
  return { ...task, entry: runtimeEntry };
}
|
|
1827
|
+
/**
 * Collects the CLI attempt/case concurrency options into one object.
 *
 * Returns `undefined` when neither option is set, otherwise `{ attempt, case }`.
 */
function resolveCliRuntimeConcurrency(options) {
  const { attemptConcurrency: attempt, caseConcurrency } = options;
  const hasCliLimit = attempt != null || caseConcurrency != null;
  return hasCliLimit ? { attempt, case: caseConcurrency } : void 0;
}
|
|
1834
|
+
/**
 * Decides whether CLI output should contain ANSI colors.
 *
 * Precedence:
 * - a non-empty `NO_COLOR` environment variable disables color
 * - otherwise `FORCE_COLOR`, when set, decides: `"0"` disables, any other value enables
 * - otherwise color is used only when stdout is a TTY
 */
function shouldUseColor() {
  const noColor = process.env.NO_COLOR;
  // The NO_COLOR convention only applies when the variable is present AND non-empty,
  // so `NO_COLOR=` (empty) must not switch colors off.
  if (noColor != null && noColor !== "") return false;
  const forceColor = process.env.FORCE_COLOR;
  if (forceColor != null) return forceColor !== "0";
  return process.stdout.isTTY === true;
}
|
|
1840
|
+
function createColorPalette(enabled) {
|
|
1061
1841
|
if (!enabled) return {
|
|
1062
1842
|
black: (value) => value,
|
|
1063
1843
|
bgCyan: (value) => value,
|
|
@@ -1094,11 +1874,15 @@ function createProjectBadge(name, colors, colorEnabled) {
|
|
|
1094
1874
|
const background = labelColorPool[name.split("").reduce((accumulator, char, index) => accumulator + char.charCodeAt(0) + index, 0) % labelColorPool.length];
|
|
1095
1875
|
return `${colors.black(background(` ${name} `))} `;
|
|
1096
1876
|
}
|
|
1097
|
-
function formatDuration(durationMs, colors) {
|
|
1877
|
+
/**
 * Formats a millisecond duration as a colored " <n>ms" suffix.
 *
 * Returns an empty string when `durationMs` is null/undefined. The rounded value
 * is painted yellow above 1000 ms and green otherwise; the unit is dimmed.
 */
function formatDuration$1(durationMs, colors) {
  if (durationMs == null) return "";
  const roundedMs = Math.round(durationMs);
  const isSlow = roundedMs > 1e3;
  const paint = isSlow ? colors.yellow : colors.green;
  return paint(` ${roundedMs}${colors.dim("ms")}`);
}
|
|
1882
|
+
/**
 * Formats a hybrid average with at most three decimals and no trailing zeros.
 *
 * Returns `"n/a"` when the value is null or undefined.
 */
function formatHybridAverage(hybridAverage) {
  if (hybridAverage == null) return "n/a";
  const fixedToThreeDecimals = hybridAverage.toFixed(3);
  // Strip trailing zeros, and the decimal point too when no decimals remain.
  return fixedToThreeDecimals.replace(/\.?0+$/, "");
}
|
|
1102
1886
|
function filterProjectsByName(projects, names) {
|
|
1103
1887
|
if (names.length === 0) return [...projects];
|
|
1104
1888
|
const nameSet = new Set(names);
|
|
@@ -1119,11 +1903,6 @@ function createRunIdentity(options) {
|
|
|
1119
1903
|
workspaceId
|
|
1120
1904
|
};
|
|
1121
1905
|
}
|
|
1122
|
-
function deriveReportProjectId(output) {
|
|
1123
|
-
const uniqueProjectNames = [...new Set(output.projects.map((project) => project.name))];
|
|
1124
|
-
if (uniqueProjectNames.length === 1) return sanitizeIdentitySegment(uniqueProjectNames[0] ?? "default-project");
|
|
1125
|
-
return "multi-project";
|
|
1126
|
-
}
|
|
1127
1906
|
function createEventRecorder(identity) {
|
|
1128
1907
|
const events = [];
|
|
1129
1908
|
const taskProjectMap = /* @__PURE__ */ new Map();
|
|
@@ -1220,6 +1999,7 @@ function isSummaryReporter(reporter) {
|
|
|
1220
1999
|
return "getWindowRows" in reporter;
|
|
1221
2000
|
}
|
|
1222
2001
|
function createRunReporter(options) {
|
|
2002
|
+
const getRows = options?.getRows ?? (() => process.stdout.rows);
|
|
1223
2003
|
const reporter = createCliReporter({
|
|
1224
2004
|
getColumns: options?.getColumns ?? (() => process.stdout.columns ?? 80),
|
|
1225
2005
|
getNow: options?.getNow ?? (() => Date.now()),
|
|
@@ -1240,7 +2020,7 @@ function createRunReporter(options) {
|
|
|
1240
2020
|
};
|
|
1241
2021
|
const rendererBaseOptions = {
|
|
1242
2022
|
getColumns: options?.getColumns ?? (() => process.stdout.columns ?? 80),
|
|
1243
|
-
getWindow: () => reporter.getWindowRows(),
|
|
2023
|
+
getWindow: () => reporter.getWindowRows({ maxRows: normalizeLiveReporterMaxRows(getRows()) }),
|
|
1244
2024
|
queueRenderReset: options?.queueRenderReset,
|
|
1245
2025
|
supportsAnsiWindowing: options?.supportsAnsiWindowing,
|
|
1246
2026
|
writeOutput: options?.writeOutput ?? ((value) => process.stdout.write(value))
|
|
@@ -1289,6 +2069,22 @@ function createRunReporter(options) {
|
|
|
1289
2069
|
}
|
|
1290
2070
|
};
|
|
1291
2071
|
}
|
|
2072
|
+
/**
 * Converts a terminal row count into the row budget of the live reporter window.
 *
 * One row is subtracted from the terminal height, and the result is never below 6.
 * A missing, non-finite or non-positive row count is treated as 24 rows.
 *
 * Before:
 * - undefined
 * - 4
 * - 40
 *
 * After:
 * - 23
 * - 6
 * - 39
 */
function normalizeLiveReporterMaxRows(rows) {
  const isUsableRowCount = rows != null && Number.isFinite(rows) && rows > 0;
  const terminalRows = isUsableRowCount ? Math.floor(rows) : 24;
  return Math.max(6, terminalRows - 1);
}
|
|
1292
2088
|
function createTaskQueuePayload(task, projectName) {
|
|
1293
2089
|
return {
|
|
1294
2090
|
displayName: task.entry.name,
|
|
@@ -1315,11 +2111,12 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
|
|
|
1315
2111
|
projectCaseCounters.seenCaseIds.add(projectCaseId);
|
|
1316
2112
|
if (payload.state === "passed") projectCaseCounters.passed += 1;
|
|
1317
2113
|
else if (payload.state === "failed") projectCaseCounters.failed += 1;
|
|
2114
|
+
else if (payload.state === "timeout") projectCaseCounters.timeout += 1;
|
|
1318
2115
|
else projectCaseCounters.skipped += 1;
|
|
1319
2116
|
}
|
|
1320
2117
|
}
|
|
1321
2118
|
syncCaseTotal(payload.total);
|
|
1322
|
-
if (payload.state === "failed" && payload.errorMessage != null && projectCaseFailures != null) projectCaseFailures.push({
|
|
2119
|
+
if ((payload.state === "failed" || payload.state === "timeout") && payload.errorMessage != null && projectCaseFailures != null) projectCaseFailures.push({
|
|
1323
2120
|
caseId,
|
|
1324
2121
|
caseName: payload.name,
|
|
1325
2122
|
errorMessage: payload.errorMessage,
|
|
@@ -1328,6 +2125,7 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
|
|
|
1328
2125
|
reporter.onCaseEnd({
|
|
1329
2126
|
caseId,
|
|
1330
2127
|
errorMessage: payload.errorMessage,
|
|
2128
|
+
output: payload.output,
|
|
1331
2129
|
state: payload.state,
|
|
1332
2130
|
taskId: task.id
|
|
1333
2131
|
});
|
|
@@ -1342,8 +2140,11 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
|
|
|
1342
2140
|
const caseId = createTaskCaseReporterId(payload);
|
|
1343
2141
|
syncCaseTotal(payload.total);
|
|
1344
2142
|
reporter.onCaseStart({
|
|
2143
|
+
autoRetry: payload.autoRetry,
|
|
1345
2144
|
caseId,
|
|
2145
|
+
input: payload.input,
|
|
1346
2146
|
caseName: payload.name,
|
|
2147
|
+
retryIndex: payload.retryIndex,
|
|
1347
2148
|
taskId: task.id
|
|
1348
2149
|
});
|
|
1349
2150
|
vitestCompatReporter?.onCaseStart({
|
|
@@ -1360,7 +2161,7 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
|
|
|
1360
2161
|
}
|
|
1361
2162
|
};
|
|
1362
2163
|
}
|
|
1363
|
-
function createCliTaskExecutionContext(task, models, cacheRootDirectory, cacheProjectName, workspaceId, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
|
|
2164
|
+
function createCliTaskExecutionContext(task, models, cacheRootDirectory, cacheProjectName, workspaceId, telemetry, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, runtimeConcurrency, vitestCompatReporter) {
|
|
1364
2165
|
return {
|
|
1365
2166
|
...createTaskExecutionContext({
|
|
1366
2167
|
cache: createFilesystemTaskCacheRuntime({
|
|
@@ -1371,7 +2172,9 @@ function createCliTaskExecutionContext(task, models, cacheRootDirectory, cachePr
|
|
|
1371
2172
|
models,
|
|
1372
2173
|
task
|
|
1373
2174
|
}),
|
|
1374
|
-
reporterHooks: createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter)
|
|
2175
|
+
reporterHooks: createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
|
|
2176
|
+
runtimeConcurrency,
|
|
2177
|
+
telemetry
|
|
1375
2178
|
};
|
|
1376
2179
|
}
|
|
1377
2180
|
function resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
|
|
@@ -1389,7 +2192,8 @@ function createAutoTaskExecutor(reporter, projectName, recordEvent, projectCaseC
|
|
|
1389
2192
|
cache: context.cache,
|
|
1390
2193
|
model: context.model,
|
|
1391
2194
|
reporterHooks: resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
|
|
1392
|
-
task
|
|
2195
|
+
task,
|
|
2196
|
+
telemetry: context.telemetry
|
|
1393
2197
|
});
|
|
1394
2198
|
return {
|
|
1395
2199
|
entryId: task.entry.id,
|
|
@@ -1492,13 +2296,14 @@ async function prepareProject(project) {
|
|
|
1492
2296
|
};
|
|
1493
2297
|
}
|
|
1494
2298
|
}
|
|
1495
|
-
async function executePreparedProject(prepared, identity, cacheProjectName, reporter, counters, recordEvent) {
|
|
2299
|
+
async function executePreparedProject(prepared, identity, cacheProjectName, telemetry, reporter, counters, recordEvent, options) {
|
|
1496
2300
|
const settledTaskIds = /* @__PURE__ */ new Set();
|
|
1497
2301
|
const projectCaseCounters = {
|
|
1498
2302
|
failed: 0,
|
|
1499
2303
|
passed: 0,
|
|
1500
2304
|
seenCaseIds: /* @__PURE__ */ new Set(),
|
|
1501
|
-
skipped: 0
|
|
2305
|
+
skipped: 0,
|
|
2306
|
+
timeout: 0
|
|
1502
2307
|
};
|
|
1503
2308
|
const projectCaseFailures = [];
|
|
1504
2309
|
const vitestCompatReporter = await createVievalVitestCompatReporterBridge({
|
|
@@ -1507,9 +2312,16 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
|
|
|
1507
2312
|
});
|
|
1508
2313
|
const rawTaskExecutor = prepared.project.executor ?? createAutoTaskExecutor(reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
|
|
1509
2314
|
const taskExecutor = async (task, context) => {
|
|
2315
|
+
const runtimeTask = createScheduledTaskWithRuntimeConcurrency(task, prepared.project, options);
|
|
1510
2316
|
return {
|
|
1511
|
-
...await
|
|
1512
|
-
|
|
2317
|
+
...await telemetry.withSpan("vieval.task", {
|
|
2318
|
+
"vieval.project.name": prepared.name,
|
|
2319
|
+
"vieval.run.id": identity.runId,
|
|
2320
|
+
"vieval.task.entry.id": runtimeTask.entry.id,
|
|
2321
|
+
"vieval.task.id": runtimeTask.id,
|
|
2322
|
+
"vieval.task.name": runtimeTask.entry.name
|
|
2323
|
+
}, async () => await rawTaskExecutor(runtimeTask, context)),
|
|
2324
|
+
matrix: cloneScheduledTaskMatrix(runtimeTask)
|
|
1513
2325
|
};
|
|
1514
2326
|
};
|
|
1515
2327
|
for (const task of prepared.tasks) await vitestCompatReporter?.onTaskQueued({ taskId: task.id });
|
|
@@ -1517,7 +2329,7 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
|
|
|
1517
2329
|
try {
|
|
1518
2330
|
const aggregated = await runScheduledTasks(prepared.tasks, taskExecutor, {
|
|
1519
2331
|
createExecutionContext(task) {
|
|
1520
|
-
return createCliTaskExecutionContext(task, prepared.project.models, resolve(prepared.project.root, ".vieval", "cache"), cacheProjectName ?? prepared.name, identity.workspaceId, reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
|
|
2332
|
+
return createCliTaskExecutionContext(task, prepared.project.models, resolve(prepared.project.root, ".vieval", "cache"), cacheProjectName ?? prepared.name, identity.workspaceId, telemetry, reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, resolveCliRuntimeConcurrency(options), vitestCompatReporter);
|
|
1521
2333
|
},
|
|
1522
2334
|
onTaskEnd(task, state) {
|
|
1523
2335
|
settledTaskIds.add(task.id);
|
|
@@ -1538,7 +2350,8 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
|
|
|
1538
2350
|
onTaskStart(task) {
|
|
1539
2351
|
reporter.onTaskStart({ taskId: task.id });
|
|
1540
2352
|
vitestCompatReporter?.onTaskStart({ taskId: task.id });
|
|
1541
|
-
}
|
|
2353
|
+
},
|
|
2354
|
+
maxConcurrency: resolveScheduledTaskConcurrency(prepared.project, options)
|
|
1542
2355
|
});
|
|
1543
2356
|
await vitestCompatReporter?.onRunEnd({ failed: false });
|
|
1544
2357
|
return {
|
|
@@ -1546,6 +2359,7 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
|
|
|
1546
2359
|
failed: projectCaseCounters.failed,
|
|
1547
2360
|
passed: projectCaseCounters.passed,
|
|
1548
2361
|
skipped: projectCaseCounters.skipped,
|
|
2362
|
+
timeout: projectCaseCounters.timeout,
|
|
1549
2363
|
total: projectCaseCounters.seenCaseIds.size
|
|
1550
2364
|
},
|
|
1551
2365
|
caseFailures: projectCaseFailures,
|
|
@@ -1592,6 +2406,7 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
|
|
|
1592
2406
|
failed: projectCaseCounters.failed,
|
|
1593
2407
|
passed: projectCaseCounters.passed,
|
|
1594
2408
|
skipped: projectCaseCounters.skipped,
|
|
2409
|
+
timeout: projectCaseCounters.timeout,
|
|
1595
2410
|
total: projectCaseCounters.seenCaseIds.size
|
|
1596
2411
|
},
|
|
1597
2412
|
caseFailures: projectCaseFailures,
|
|
@@ -1607,14 +2422,6 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
|
|
|
1607
2422
|
};
|
|
1608
2423
|
}
|
|
1609
2424
|
}
|
|
1610
|
-
async function writeRunReportArtifacts(output, events, identity, reportOut) {
|
|
1611
|
-
const projectId = deriveReportProjectId(output);
|
|
1612
|
-
const reportDirectory = resolve(reportOut, identity.workspaceId, projectId, identity.experimentId, identity.attemptId, identity.runId);
|
|
1613
|
-
await mkdir(reportDirectory, { recursive: true });
|
|
1614
|
-
await writeFile(resolve(reportDirectory, "run-summary.json"), `${JSON.stringify(output, null, 2)}\n`, "utf-8");
|
|
1615
|
-
await writeFile(resolve(reportDirectory, "events.jsonl"), events.map((event) => JSON.stringify(event)).join("\n").concat(events.length > 0 ? "\n" : ""), "utf-8");
|
|
1616
|
-
return reportDirectory;
|
|
1617
|
-
}
|
|
1618
2425
|
/**
|
|
1619
2426
|
* Runs vieval orchestration from config and returns project-level summaries.
|
|
1620
2427
|
*
|
|
@@ -1637,57 +2444,91 @@ async function runVievalCli(options = {}) {
|
|
|
1637
2444
|
configFilePath: options.configFilePath,
|
|
1638
2445
|
cwd: options.cwd
|
|
1639
2446
|
});
|
|
2447
|
+
const telemetry = loadedConfig.reporting?.openTelemetry?.enabled === true ? createOpenTelemetryRuntime() : createNoopTelemetryRuntime();
|
|
2448
|
+
const onOpenTelemetryRunEnd = loadedConfig.reporting?.openTelemetry?.enabled === true ? loadedConfig.reporting.openTelemetry.onRunEnd : void 0;
|
|
1640
2449
|
const restoreEnvironment = applyRunEnvironment(loadedConfig.env);
|
|
1641
2450
|
const eventRecorder = createEventRecorder(identity);
|
|
1642
2451
|
const reporter = createReporterWithEventCapture(createRunReporter(options.reporter), eventRecorder.record);
|
|
2452
|
+
let runError;
|
|
2453
|
+
let runEndError;
|
|
2454
|
+
let output;
|
|
1643
2455
|
try {
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
1653
|
-
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
}
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
2456
|
+
output = await telemetry.withSpan("vieval.run", {
|
|
2457
|
+
"vieval.attempt.id": identity.attemptId,
|
|
2458
|
+
"vieval.experiment.id": identity.experimentId,
|
|
2459
|
+
"vieval.run.id": identity.runId,
|
|
2460
|
+
"vieval.workspace.id": identity.workspaceId
|
|
2461
|
+
}, async () => {
|
|
2462
|
+
const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
|
|
2463
|
+
const workspaceScheduler = createSchedulerRuntime({ concurrency: { workspace: resolveWorkspaceConcurrency(loadedConfig, options) } });
|
|
2464
|
+
const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
|
|
2465
|
+
const executableProjects = preparedProjects.filter((project) => project.kind === "prepared").map((project) => project.prepared);
|
|
2466
|
+
const totalTasks = preparedProjects.reduce((sum, project) => {
|
|
2467
|
+
if (project.kind === "prepared") return sum + project.prepared.tasks.length;
|
|
2468
|
+
return sum + project.summary.taskCount;
|
|
2469
|
+
}, 0);
|
|
2470
|
+
const skippedSummaryTasks = preparedProjects.reduce((sum, project) => {
|
|
2471
|
+
if (project.kind === "summary") return sum + project.summary.taskCount;
|
|
2472
|
+
return sum;
|
|
2473
|
+
}, 0);
|
|
2474
|
+
const reporterCounters = {
|
|
2475
|
+
failedTasks: 0,
|
|
2476
|
+
passedTasks: 0,
|
|
2477
|
+
skippedTasks: 0
|
|
2478
|
+
};
|
|
2479
|
+
reporter.onRunStart({ totalTasks });
|
|
2480
|
+
for (const project of executableProjects) for (const task of project.tasks) reporter.onTaskQueued(createTaskQueuePayload(task, project.name));
|
|
2481
|
+
const projectSummaries = (await Promise.all(preparedProjects.map(async (preparedProject, index) => {
|
|
2482
|
+
if (preparedProject.kind === "summary") return {
|
|
2483
|
+
index,
|
|
2484
|
+
summary: preparedProject.summary
|
|
2485
|
+
};
|
|
2486
|
+
return {
|
|
2487
|
+
index,
|
|
2488
|
+
summary: await telemetry.withSpan("vieval.project", {
|
|
2489
|
+
"vieval.project.name": preparedProject.prepared.name,
|
|
2490
|
+
"vieval.run.id": identity.runId
|
|
2491
|
+
}, async () => await workspaceScheduler.runCase({
|
|
2492
|
+
experimentId: identity.experimentId,
|
|
2493
|
+
projectName: preparedProject.prepared.name,
|
|
2494
|
+
scope: "workspace",
|
|
2495
|
+
workspaceId: identity.workspaceId
|
|
2496
|
+
}, async () => executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, telemetry, reporter, reporterCounters, eventRecorder.record, options)))
|
|
2497
|
+
};
|
|
2498
|
+
}))).sort((left, right) => left.index - right.index).map((item) => item.summary);
|
|
2499
|
+
reporter.onRunEnd({
|
|
2500
|
+
failedTasks: reporterCounters.failedTasks,
|
|
2501
|
+
passedTasks: reporterCounters.passedTasks,
|
|
2502
|
+
skippedTasks: reporterCounters.skippedTasks + skippedSummaryTasks,
|
|
2503
|
+
totalTasks
|
|
2504
|
+
});
|
|
2505
|
+
const output = {
|
|
2506
|
+
attemptId: identity.attemptId,
|
|
2507
|
+
configFilePath: loadedConfig.configFilePath,
|
|
2508
|
+
experimentId: identity.experimentId,
|
|
2509
|
+
projects: projectSummaries,
|
|
2510
|
+
reportDirectory: null,
|
|
2511
|
+
runId: identity.runId,
|
|
2512
|
+
workspaceId: identity.workspaceId
|
|
2513
|
+
};
|
|
2514
|
+
if (options.reportOut != null) output.reportDirectory = await writeRunReportArtifacts(output, eventRecorder.events, identity, options.reportOut);
|
|
2515
|
+
return output;
|
|
1675
2516
|
});
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
configFilePath: loadedConfig.configFilePath,
|
|
1679
|
-
experimentId: identity.experimentId,
|
|
1680
|
-
projects: projectSummaries,
|
|
1681
|
-
reportDirectory: null,
|
|
1682
|
-
runId: identity.runId,
|
|
1683
|
-
workspaceId: identity.workspaceId
|
|
1684
|
-
};
|
|
1685
|
-
if (options.reportOut != null) output.reportDirectory = await writeRunReportArtifacts(output, eventRecorder.events, identity, options.reportOut);
|
|
1686
|
-
return output;
|
|
2517
|
+
} catch (error) {
|
|
2518
|
+
runError = error;
|
|
1687
2519
|
} finally {
|
|
2520
|
+
if (onOpenTelemetryRunEnd != null) try {
|
|
2521
|
+
await onOpenTelemetryRunEnd();
|
|
2522
|
+
} catch (error) {
|
|
2523
|
+
if (runError == null) runEndError = error;
|
|
2524
|
+
}
|
|
1688
2525
|
reporter.dispose();
|
|
1689
2526
|
restoreEnvironment();
|
|
1690
2527
|
}
|
|
2528
|
+
if (runError != null) throw runError;
|
|
2529
|
+
if (runEndError != null) throw runEndError;
|
|
2530
|
+
if (output == null) throw new Error("Vieval run finished without output.");
|
|
2531
|
+
return output;
|
|
1691
2532
|
}
|
|
1692
2533
|
/**
|
|
1693
2534
|
* Formats CLI run output as human-readable lines.
|
|
@@ -1736,10 +2577,10 @@ function formatVievalCliRunOutput(output) {
|
|
|
1736
2577
|
executedTasks += project.result?.overall.runCount ?? 0;
|
|
1737
2578
|
const badge = createProjectBadge(project.name, colors, colorEnabled);
|
|
1738
2579
|
const isFailed = project.errorMessage != null;
|
|
1739
|
-
const hasFailedCases = (project.caseSummary?.failed ?? 0) > 0 || (project.caseFailures?.length ?? 0) > 0;
|
|
2580
|
+
const hasFailedCases = (project.caseSummary?.failed ?? 0) > 0 || (project.caseSummary?.timeout ?? 0) > 0 || (project.caseFailures?.length ?? 0) > 0;
|
|
1740
2581
|
if (isFailed) {
|
|
1741
2582
|
failedProjects += 1;
|
|
1742
|
-
lines.push(` ${colors.red("❯")} ${badge}${formatDuration(project.durationMs, colors)}`);
|
|
2583
|
+
lines.push(` ${colors.red("❯")} ${badge}${formatDuration$1(project.durationMs, colors)}`);
|
|
1743
2584
|
lines.push(` ${project.errorMessage}`);
|
|
1744
2585
|
continue;
|
|
1745
2586
|
}
|
|
@@ -1748,7 +2589,7 @@ function formatVievalCliRunOutput(output) {
|
|
|
1748
2589
|
const countLabel = colors.dim(`(${project.taskCount} tasks)`);
|
|
1749
2590
|
const detailsLabel = colors.dim(` ${project.discoveredEvalFileCount} files, ${project.entryCount} entries, 0 runs, hybrid n/a`);
|
|
1750
2591
|
const matrixSummary = formatMatrixSummary(project.matrixSummary);
|
|
1751
|
-
lines.push(` ${colors.dim("○")} ${badge}${countLabel}${detailsLabel}${formatDuration(project.durationMs, colors)}`);
|
|
2592
|
+
lines.push(` ${colors.dim("○")} ${badge}${countLabel}${detailsLabel}${formatDuration$1(project.durationMs, colors)}`);
|
|
1752
2593
|
if (matrixSummary != null) lines.push(` ${colors.dim(matrixSummary)}`);
|
|
1753
2594
|
const scheduleBreakdown = formatScheduleBreakdown(project);
|
|
1754
2595
|
if (scheduleBreakdown != null) lines.push(` ${scheduleBreakdown}`);
|
|
@@ -1756,14 +2597,13 @@ function formatVievalCliRunOutput(output) {
|
|
|
1756
2597
|
}
|
|
1757
2598
|
if (hasFailedCases) failedProjects += 1;
|
|
1758
2599
|
else passedProjects += 1;
|
|
1759
|
-
const
|
|
1760
|
-
const hybridAverageLabel = hybridAverage == null ? "n/a" : String(hybridAverage);
|
|
2600
|
+
const hybridAverageLabel = formatHybridAverage(project.result?.overall.hybridAverage);
|
|
1761
2601
|
const runCount = project.result?.overall.runCount ?? 0;
|
|
1762
2602
|
const countLabel = colors.dim(`(${project.taskCount} tasks)`);
|
|
1763
|
-
const caseSummaryLabel = project.caseSummary == null ? "" : `, cases ${project.caseSummary.passed} passed | ${project.caseSummary.failed} failed`;
|
|
2603
|
+
const caseSummaryLabel = project.caseSummary == null ? "" : `, cases ${project.caseSummary.passed} passed | ${project.caseSummary.failed} failed | ${project.caseSummary.timeout} timeout`;
|
|
1764
2604
|
const detailsLabel = colors.dim(` ${project.discoveredEvalFileCount} files, ${project.entryCount} entries, ${runCount} runs${caseSummaryLabel}, hybrid ${hybridAverageLabel}`);
|
|
1765
2605
|
const matrixSummary = formatMatrixSummary(project.matrixSummary);
|
|
1766
|
-
lines.push(` ${hasFailedCases ? colors.red("❯") : colors.green("✓")} ${badge}${countLabel}${detailsLabel}${formatDuration(project.durationMs, colors)}`);
|
|
2606
|
+
lines.push(` ${hasFailedCases ? colors.red("❯") : colors.green("✓")} ${badge}${countLabel}${detailsLabel}${formatDuration$1(project.durationMs, colors)}`);
|
|
1767
2607
|
if (matrixSummary != null) lines.push(` ${colors.dim(matrixSummary)}`);
|
|
1768
2608
|
const scheduleBreakdown = formatScheduleBreakdown(project);
|
|
1769
2609
|
if (scheduleBreakdown != null) lines.push(` ${scheduleBreakdown}`);
|
|
@@ -1800,14 +2640,14 @@ const compareHelpText = `
|
|
|
1800
2640
|
--output Optional output artifact path
|
|
1801
2641
|
--format Console output format: table | json (default: table)
|
|
1802
2642
|
`;
|
|
1803
|
-
function normalizeCliArgv$
|
|
2643
|
+
function normalizeCliArgv$6(argv) {
|
|
1804
2644
|
const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
|
|
1805
2645
|
if (normalizedArgv[0] === "compare") return normalizedArgv.slice(1);
|
|
1806
2646
|
return normalizedArgv;
|
|
1807
2647
|
}
|
|
1808
2648
|
function parseCompareCliArguments(argv) {
|
|
1809
2649
|
const cli = meow(compareHelpText, {
|
|
1810
|
-
argv: normalizeCliArgv$
|
|
2650
|
+
argv: normalizeCliArgv$6(argv),
|
|
1811
2651
|
flags: {
|
|
1812
2652
|
config: { type: "string" },
|
|
1813
2653
|
comparison: { type: "string" },
|
|
@@ -1904,10 +2744,15 @@ const evalRunHelpText = `
|
|
|
1904
2744
|
--workspace Workspace id used in report artifacts
|
|
1905
2745
|
--experiment Experiment id used in report artifacts
|
|
1906
2746
|
--attempt Attempt id used in report artifacts
|
|
2747
|
+
--workspace-concurrency Workspace scheduling cap
|
|
2748
|
+
--project-concurrency Project scheduling cap
|
|
2749
|
+
--task-concurrency Task scheduling cap
|
|
2750
|
+
--attempt-concurrency Attempt scheduling cap
|
|
2751
|
+
--case-concurrency Case scheduling cap
|
|
1907
2752
|
--report-out Report output root directory
|
|
1908
2753
|
--json Print machine-readable JSON output
|
|
1909
2754
|
`;
|
|
1910
|
-
function normalizeCliArgv$
|
|
2755
|
+
function normalizeCliArgv$5(argv) {
|
|
1911
2756
|
const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
|
|
1912
2757
|
return normalizedArgv[0] === "run" ? normalizedArgv.slice(1) : normalizedArgv;
|
|
1913
2758
|
}
|
|
@@ -1930,7 +2775,7 @@ function normalizeProjectNames(projectNames) {
|
|
|
1930
2775
|
*/
|
|
1931
2776
|
function parseCliArguments(argv) {
|
|
1932
2777
|
const cli = meow(evalRunHelpText, {
|
|
1933
|
-
argv: normalizeCliArgv$
|
|
2778
|
+
argv: normalizeCliArgv$5(argv),
|
|
1934
2779
|
importMeta: import.meta,
|
|
1935
2780
|
flags: {
|
|
1936
2781
|
config: { type: "string" },
|
|
@@ -1945,17 +2790,27 @@ function parseCliArguments(argv) {
|
|
|
1945
2790
|
workspace: { type: "string" },
|
|
1946
2791
|
experiment: { type: "string" },
|
|
1947
2792
|
attempt: { type: "string" },
|
|
2793
|
+
workspaceConcurrency: { type: "number" },
|
|
2794
|
+
projectConcurrency: { type: "number" },
|
|
2795
|
+
taskConcurrency: { type: "number" },
|
|
2796
|
+
attemptConcurrency: { type: "number" },
|
|
2797
|
+
caseConcurrency: { type: "number" },
|
|
1948
2798
|
reportOut: { type: "string" }
|
|
1949
2799
|
}
|
|
1950
2800
|
});
|
|
1951
2801
|
return {
|
|
1952
2802
|
attempt: cli.flags.attempt,
|
|
2803
|
+
attemptConcurrency: cli.flags.attemptConcurrency,
|
|
2804
|
+
caseConcurrency: cli.flags.caseConcurrency,
|
|
1953
2805
|
configFilePath: cli.flags.config,
|
|
1954
2806
|
experiment: cli.flags.experiment,
|
|
1955
2807
|
json: cli.flags.json === true,
|
|
1956
2808
|
project: normalizeProjectNames(cli.flags.project),
|
|
2809
|
+
projectConcurrency: cli.flags.projectConcurrency,
|
|
1957
2810
|
reportOut: cli.flags.reportOut,
|
|
1958
|
-
|
|
2811
|
+
taskConcurrency: cli.flags.taskConcurrency,
|
|
2812
|
+
workspace: cli.flags.workspace,
|
|
2813
|
+
workspaceConcurrency: cli.flags.workspaceConcurrency
|
|
1959
2814
|
};
|
|
1960
2815
|
}
|
|
1961
2816
|
/**
|
|
@@ -1991,11 +2846,16 @@ async function runEvalRunCli(argv) {
|
|
|
1991
2846
|
try {
|
|
1992
2847
|
const output = await runVievalCli({
|
|
1993
2848
|
attempt: parsed.attempt,
|
|
2849
|
+
attemptConcurrency: parsed.attemptConcurrency,
|
|
2850
|
+
caseConcurrency: parsed.caseConcurrency,
|
|
1994
2851
|
configFilePath: parsed.configFilePath,
|
|
1995
2852
|
experiment: parsed.experiment,
|
|
1996
2853
|
project: parsed.project,
|
|
2854
|
+
projectConcurrency: parsed.projectConcurrency,
|
|
1997
2855
|
reportOut: parsed.reportOut,
|
|
1998
|
-
|
|
2856
|
+
taskConcurrency: parsed.taskConcurrency,
|
|
2857
|
+
workspace: parsed.workspace,
|
|
2858
|
+
workspaceConcurrency: parsed.workspaceConcurrency
|
|
1999
2859
|
});
|
|
2000
2860
|
if (parsed.json) {
|
|
2001
2861
|
process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
|
|
@@ -2011,89 +2871,6 @@ async function runEvalRunCli(argv) {
|
|
|
2011
2871
|
}
|
|
2012
2872
|
}
|
|
2013
2873
|
//#endregion
|
|
2014
|
-
//#region src/cli/report-artifacts.ts
|
|
2015
|
-
/**
|
|
2016
|
-
* Resolves one or more `run-summary.json` paths from a report location.
|
|
2017
|
-
*
|
|
2018
|
-
* Use when:
|
|
2019
|
-
* - callers may pass a run directory, summary file path, or a report root
|
|
2020
|
-
*
|
|
2021
|
-
* Returns:
|
|
2022
|
-
* - sorted absolute summary file paths
|
|
2023
|
-
*/
|
|
2024
|
-
async function resolveRunSummaryPaths(reportPath) {
|
|
2025
|
-
const absoluteReportPath = resolve(reportPath);
|
|
2026
|
-
const directSummaryPath = resolve(absoluteReportPath, "run-summary.json");
|
|
2027
|
-
if (existsSync(absoluteReportPath) && absoluteReportPath.endsWith(".json")) return [absoluteReportPath];
|
|
2028
|
-
if (existsSync(directSummaryPath)) return [directSummaryPath];
|
|
2029
|
-
return (await glob("**/run-summary.json", {
|
|
2030
|
-
absolute: true,
|
|
2031
|
-
cwd: absoluteReportPath
|
|
2032
|
-
})).sort((left, right) => left.localeCompare(right));
|
|
2033
|
-
}
|
|
2034
|
-
/**
|
|
2035
|
-
* Reads one run report artifact set from `run-summary.json` and sibling `events.jsonl`.
|
|
2036
|
-
*
|
|
2037
|
-
* Use when:
|
|
2038
|
-
* - report analysis needs both run aggregate output and event count metadata
|
|
2039
|
-
*/
|
|
2040
|
-
function readReportRunArtifact(summaryFilePath) {
|
|
2041
|
-
const reportDirectory = resolve(summaryFilePath, "..");
|
|
2042
|
-
const summary = JSON.parse(readFileSync(summaryFilePath, "utf-8"));
|
|
2043
|
-
const eventsFilePath = resolve(reportDirectory, "events.jsonl");
|
|
2044
|
-
const events = existsSync(eventsFilePath) ? readFileSync(eventsFilePath, "utf-8").split("\n").filter((line) => line.trim().length > 0).map((line) => {
|
|
2045
|
-
const event = JSON.parse(line);
|
|
2046
|
-
return {
|
|
2047
|
-
caseId: event.caseId,
|
|
2048
|
-
data: event.data,
|
|
2049
|
-
event: event.event,
|
|
2050
|
-
taskId: event.taskId
|
|
2051
|
-
};
|
|
2052
|
-
}) : [];
|
|
2053
|
-
return {
|
|
2054
|
-
events,
|
|
2055
|
-
eventsCount: events.length,
|
|
2056
|
-
reportDirectory,
|
|
2057
|
-
summary,
|
|
2058
|
-
summaryFilePath
|
|
2059
|
-
};
|
|
2060
|
-
}
|
|
2061
|
-
/**
|
|
2062
|
-
* Reads all run artifacts found under `reportPath`.
|
|
2063
|
-
*
|
|
2064
|
-
* Use when:
|
|
2065
|
-
* - callers need multi-run analysis from a directory root
|
|
2066
|
-
*/
|
|
2067
|
-
async function readReportArtifacts(reportPath) {
|
|
2068
|
-
return (await resolveRunSummaryPaths(reportPath)).map((summaryFilePath) => readReportRunArtifact(summaryFilePath));
|
|
2069
|
-
}
|
|
2070
|
-
/**
|
|
2071
|
-
* Creates a compact summary row for one run artifact.
|
|
2072
|
-
*
|
|
2073
|
-
* Use when:
|
|
2074
|
-
* - table/csv/jsonl exports should stay stable and cheap to parse
|
|
2075
|
-
*/
|
|
2076
|
-
function summarizeReportRunArtifact(artifact) {
|
|
2077
|
-
const totalProjects = artifact.summary.projects.length;
|
|
2078
|
-
const failedProjects = artifact.summary.projects.filter((project) => project.errorMessage != null).length;
|
|
2079
|
-
const executedProjects = artifact.summary.projects.filter((project) => project.executed).length;
|
|
2080
|
-
const totalTasks = artifact.summary.projects.reduce((sum, project) => sum + project.taskCount, 0);
|
|
2081
|
-
const projectNames = artifact.summary.projects.map((project) => project.name);
|
|
2082
|
-
return {
|
|
2083
|
-
attemptId: artifact.summary.attemptId ?? null,
|
|
2084
|
-
eventsCount: artifact.eventsCount,
|
|
2085
|
-
executedProjects,
|
|
2086
|
-
experimentId: artifact.summary.experimentId ?? null,
|
|
2087
|
-
failedProjects,
|
|
2088
|
-
projectNames,
|
|
2089
|
-
reportDirectory: artifact.reportDirectory,
|
|
2090
|
-
runId: artifact.summary.runId ?? null,
|
|
2091
|
-
totalProjects,
|
|
2092
|
-
totalTasks,
|
|
2093
|
-
workspaceId: artifact.summary.workspaceId ?? null
|
|
2094
|
-
};
|
|
2095
|
-
}
|
|
2096
|
-
//#endregion
|
|
2097
2874
|
//#region src/cli/report-analyze.ts
|
|
2098
2875
|
const reportAnalyzeHelpText = `
|
|
2099
2876
|
Analyze generated vieval report artifacts.
|
|
@@ -2115,7 +2892,7 @@ const reportAnalyzeHelpText = `
|
|
|
2115
2892
|
--run-matrix Keep runs matching run-matrix selector "key=value[,key=value]"
|
|
2116
2893
|
--eval-matrix Keep runs matching eval-matrix selector "key=value[,key=value]"
|
|
2117
2894
|
`;
|
|
2118
|
-
function normalizeCliArgv$
|
|
2895
|
+
function normalizeCliArgv$4(argv) {
|
|
2119
2896
|
const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
|
|
2120
2897
|
if (normalizedArgv[0] === "report" && normalizedArgv[1] === "analyze") return normalizedArgv.slice(2);
|
|
2121
2898
|
if (normalizedArgv[0] === "analyze") return normalizedArgv.slice(1);
|
|
@@ -2123,7 +2900,7 @@ function normalizeCliArgv$2(argv) {
|
|
|
2123
2900
|
}
|
|
2124
2901
|
function parseReportAnalyzeCliArguments(argv) {
|
|
2125
2902
|
const cli = meow(reportAnalyzeHelpText, {
|
|
2126
|
-
argv: normalizeCliArgv$
|
|
2903
|
+
argv: normalizeCliArgv$4(argv),
|
|
2127
2904
|
flags: {
|
|
2128
2905
|
attempt: { type: "string" },
|
|
2129
2906
|
caseState: { type: "string" },
|
|
@@ -2412,6 +3189,473 @@ async function runReportAnalyzeCli(argv) {
|
|
|
2412
3189
|
}
|
|
2413
3190
|
}
|
|
2414
3191
|
//#endregion
|
|
3192
|
+
//#region src/cli/report-cases.ts
|
|
3193
|
+
const reportCasesHelpText = `
|
|
3194
|
+
Inspect normalized case records from generated vieval report artifacts.
|
|
3195
|
+
|
|
3196
|
+
Usage
|
|
3197
|
+
$ vieval report cases <reportPath> [options]
|
|
3198
|
+
|
|
3199
|
+
Options
|
|
3200
|
+
--format Output format: table | json | jsonl (default: table)
|
|
3201
|
+
--where Equality filter "key=value"; repeatable
|
|
3202
|
+
--group-by Case field, score name, or metric name used for grouped score summaries
|
|
3203
|
+
`;
|
|
3204
|
+
/**
|
|
3205
|
+
* Reads normalized case records from one report run directory or report root.
|
|
3206
|
+
*
|
|
3207
|
+
* Use when:
|
|
3208
|
+
* - CLI tools need case-level inspection from local report artifacts
|
|
3209
|
+
* - callers may pass a run directory, a `cases.jsonl` file, or a report root
|
|
3210
|
+
*
|
|
3211
|
+
* Expects:
|
|
3212
|
+
* - discovered `cases.jsonl` files contain one `CaseRecord` JSON object per line
|
|
3213
|
+
*
|
|
3214
|
+
* Returns:
|
|
3215
|
+
* - all parsed case records sorted by discovered file path order
|
|
3216
|
+
*/
|
|
3217
|
+
async function readCaseRecordsFromReport(reportPath) {
|
|
3218
|
+
const caseFilePaths = await resolveCaseRecordPaths(reportPath);
|
|
3219
|
+
if (caseFilePaths.length === 0) throw new Error(`No cases.jsonl files found under "${resolve(reportPath)}".`);
|
|
3220
|
+
const records = [];
|
|
3221
|
+
for (const caseFilePath of caseFilePaths) {
|
|
3222
|
+
const lines = readFileSync(caseFilePath, "utf-8").split("\n");
|
|
3223
|
+
for (const [index, line] of lines.entries()) {
|
|
3224
|
+
const trimmed = line.trim();
|
|
3225
|
+
if (trimmed.length === 0) continue;
|
|
3226
|
+
try {
|
|
3227
|
+
records.push(JSON.parse(trimmed));
|
|
3228
|
+
} catch (error) {
|
|
3229
|
+
throw new Error(`Invalid cases.jsonl line ${index + 1} in "${caseFilePath}": ${errorMessageFrom(error) ?? "Unknown JSON parse failure."}`);
|
|
3230
|
+
}
|
|
3231
|
+
}
|
|
3232
|
+
}
|
|
3233
|
+
return records;
|
|
3234
|
+
}
|
|
3235
|
+
/**
|
|
3236
|
+
* Builds filtered case inspection output.
|
|
3237
|
+
*
|
|
3238
|
+
* Use when:
|
|
3239
|
+
* - `vieval report cases` needs deterministic JSON/table output
|
|
3240
|
+
* - tests need pure filtering and grouping behavior without process I/O
|
|
3241
|
+
*
|
|
3242
|
+
* Expects:
|
|
3243
|
+
* - `where` filters use `key=value`
|
|
3244
|
+
* - lookup keys may target direct case fields, score names, or metric names
|
|
3245
|
+
*
|
|
3246
|
+
* Returns:
|
|
3247
|
+
* - filtered records plus grouped score summaries when `groupBy` is present
|
|
3248
|
+
*/
|
|
3249
|
+
function buildReportCasesOutput(records, options) {
|
|
3250
|
+
const whereFilters = (options.where ?? []).map(parseSelector);
|
|
3251
|
+
const filteredRecords = records.filter((record) => matchesWhereFilters(record, whereFilters));
|
|
3252
|
+
return {
|
|
3253
|
+
groups: options.groupBy == null ? void 0 : buildCaseGroups(filteredRecords, options.groupBy),
|
|
3254
|
+
records: [...filteredRecords]
|
|
3255
|
+
};
|
|
3256
|
+
}
|
|
3257
|
+
/**
|
|
3258
|
+
* Runs the `vieval report cases` command.
|
|
3259
|
+
*
|
|
3260
|
+
* Call stack:
|
|
3261
|
+
*
|
|
3262
|
+
* published executable (`../bin/vieval`)
|
|
3263
|
+
* -> {@link import('./index').runTopLevelCli}
|
|
3264
|
+
* -> {@link runReportCasesCli}
|
|
3265
|
+
* -> {@link readCaseRecordsFromReport}
|
|
3266
|
+
*
|
|
3267
|
+
* Use when:
|
|
3268
|
+
* - the top-level CLI dispatches local case artifact inspection
|
|
3269
|
+
*
|
|
3270
|
+
* Expects:
|
|
3271
|
+
* - argv is either `cases <reportPath> ...` or `<reportPath> ...`
|
|
3272
|
+
*
|
|
3273
|
+
* Returns:
|
|
3274
|
+
* - resolves after writing the requested output to stdout
|
|
3275
|
+
*/
|
|
3276
|
+
async function runReportCasesCli(argv) {
|
|
3277
|
+
try {
|
|
3278
|
+
const parsed = parseReportCasesCliArguments(argv);
|
|
3279
|
+
const output = buildReportCasesOutput(await readCaseRecordsFromReport(parsed.reportPath), parsed);
|
|
3280
|
+
if (parsed.format === "json") {
|
|
3281
|
+
process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
|
|
3282
|
+
return;
|
|
3283
|
+
}
|
|
3284
|
+
if (parsed.format === "jsonl") {
|
|
3285
|
+
process.stdout.write(encodeJsonl(output.records));
|
|
3286
|
+
return;
|
|
3287
|
+
}
|
|
3288
|
+
process.stdout.write(`${formatCasesTable(output)}\n`);
|
|
3289
|
+
} catch (error) {
|
|
3290
|
+
const errorMessage = errorMessageFrom(error) ?? "Unknown report cases failure.";
|
|
3291
|
+
process.stderr.write(`[vieval report cases] ${errorMessage}\n`);
|
|
3292
|
+
process.exitCode = 1;
|
|
3293
|
+
}
|
|
3294
|
+
}
|
|
3295
|
+
function normalizeCliArgv$3(argv) {
|
|
3296
|
+
const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
|
|
3297
|
+
if (normalizedArgv[0] === "report" && normalizedArgv[1] === "cases") return normalizedArgv.slice(2);
|
|
3298
|
+
if (normalizedArgv[0] === "cases") return normalizedArgv.slice(1);
|
|
3299
|
+
return normalizedArgv;
|
|
3300
|
+
}
|
|
3301
|
+
function parseReportCasesCliArguments(argv) {
|
|
3302
|
+
const cli = meow(reportCasesHelpText, {
|
|
3303
|
+
argv: normalizeCliArgv$3(argv),
|
|
3304
|
+
flags: {
|
|
3305
|
+
format: {
|
|
3306
|
+
default: "table",
|
|
3307
|
+
type: "string"
|
|
3308
|
+
},
|
|
3309
|
+
groupBy: { type: "string" },
|
|
3310
|
+
where: {
|
|
3311
|
+
isMultiple: true,
|
|
3312
|
+
type: "string"
|
|
3313
|
+
}
|
|
3314
|
+
},
|
|
3315
|
+
importMeta: import.meta
|
|
3316
|
+
});
|
|
3317
|
+
const reportPath = cli.input[0];
|
|
3318
|
+
if (reportPath == null || reportPath.length === 0) throw new Error("Missing required <reportPath> argument.");
|
|
3319
|
+
return {
|
|
3320
|
+
format: normalizeReportCasesFormat(cli.flags.format),
|
|
3321
|
+
groupBy: cli.flags.groupBy,
|
|
3322
|
+
reportPath,
|
|
3323
|
+
where: cli.flags.where
|
|
3324
|
+
};
|
|
3325
|
+
}
|
|
3326
|
+
function normalizeReportCasesFormat(value) {
|
|
3327
|
+
const normalized = value.toLowerCase();
|
|
3328
|
+
if (normalized === "json") return "json";
|
|
3329
|
+
if (normalized === "jsonl") return "jsonl";
|
|
3330
|
+
return "table";
|
|
3331
|
+
}
|
|
3332
|
+
async function resolveCaseRecordPaths(reportPath) {
|
|
3333
|
+
const absoluteReportPath = resolve(reportPath);
|
|
3334
|
+
const directCaseFilePath = resolve(absoluteReportPath, "cases.jsonl");
|
|
3335
|
+
if (existsSync(absoluteReportPath) && absoluteReportPath.endsWith(".jsonl")) return [absoluteReportPath];
|
|
3336
|
+
if (existsSync(directCaseFilePath)) return [directCaseFilePath];
|
|
3337
|
+
return (await glob("**/cases.jsonl", {
|
|
3338
|
+
absolute: true,
|
|
3339
|
+
cwd: absoluteReportPath
|
|
3340
|
+
})).sort((left, right) => left.localeCompare(right));
|
|
3341
|
+
}
|
|
3342
|
+
function matchesWhereFilters(record, whereFilters) {
|
|
3343
|
+
return whereFilters.every((parsed) => {
|
|
3344
|
+
const resolved = getCaseSelectorValue(record, parsed.key);
|
|
3345
|
+
return resolved.exists && String(resolved.value) === parsed.value;
|
|
3346
|
+
});
|
|
3347
|
+
}
|
|
3348
|
+
function parseSelector(selector) {
|
|
3349
|
+
const separatorIndex = selector.indexOf("=");
|
|
3350
|
+
if (separatorIndex <= 0 || separatorIndex === selector.length - 1) throw new Error(`Invalid selector "${selector}". Expected "key=value".`);
|
|
3351
|
+
return {
|
|
3352
|
+
key: selector.slice(0, separatorIndex).trim(),
|
|
3353
|
+
value: selector.slice(separatorIndex + 1).trim()
|
|
3354
|
+
};
|
|
3355
|
+
}
|
|
3356
|
+
function buildCaseGroups(records, groupBy) {
|
|
3357
|
+
const groups = {};
|
|
3358
|
+
for (const record of records) {
|
|
3359
|
+
const resolved = getCaseSelectorValue(record, groupBy);
|
|
3360
|
+
if (!resolved.exists) continue;
|
|
3361
|
+
const groupKey = `${groupBy}=${String(resolved.value)}`;
|
|
3362
|
+
groups[groupKey] ??= {
|
|
3363
|
+
count: 0,
|
|
3364
|
+
scores: {}
|
|
3365
|
+
};
|
|
3366
|
+
groups[groupKey].count += 1;
|
|
3367
|
+
addScores(groups[groupKey].scores, record.scores);
|
|
3368
|
+
}
|
|
3369
|
+
return Object.fromEntries(Object.entries(groups).sort(([left], [right]) => left.localeCompare(right)).map(([groupKey, group]) => [groupKey, {
|
|
3370
|
+
count: group.count,
|
|
3371
|
+
scores: finalizeScores(group.scores)
|
|
3372
|
+
}]));
|
|
3373
|
+
}
|
|
3374
|
+
function addScores(summary, scores) {
|
|
3375
|
+
for (const [scoreName, value] of Object.entries(scores)) {
|
|
3376
|
+
summary[scoreName] ??= {
|
|
3377
|
+
average: 0,
|
|
3378
|
+
count: 0,
|
|
3379
|
+
sum: 0
|
|
3380
|
+
};
|
|
3381
|
+
summary[scoreName].count += 1;
|
|
3382
|
+
summary[scoreName].sum += value;
|
|
3383
|
+
}
|
|
3384
|
+
}
|
|
3385
|
+
function finalizeScores(summary) {
|
|
3386
|
+
return Object.fromEntries(Object.entries(summary).sort(([left], [right]) => left.localeCompare(right)).map(([scoreName, bucket]) => [scoreName, {
|
|
3387
|
+
average: bucket.count === 0 ? 0 : bucket.sum / bucket.count,
|
|
3388
|
+
count: bucket.count,
|
|
3389
|
+
sum: bucket.sum
|
|
3390
|
+
}]));
|
|
3391
|
+
}
|
|
3392
|
+
function formatCasesTable(output) {
|
|
3393
|
+
const lines = ["CASES vieval report", `Case count ${output.records.length}`];
|
|
3394
|
+
if (output.groups != null) {
|
|
3395
|
+
lines.push("Groups");
|
|
3396
|
+
for (const [groupKey, group] of Object.entries(output.groups)) {
|
|
3397
|
+
const scoreText = Object.entries(group.scores).map(([scoreName, bucket]) => `${scoreName}=${bucket.average.toFixed(3)}`).join(" ");
|
|
3398
|
+
lines.push(`${groupKey} count=${group.count}${scoreText.length > 0 ? ` ${scoreText}` : ""}`);
|
|
3399
|
+
}
|
|
3400
|
+
}
|
|
3401
|
+
return lines.join("\n");
|
|
3402
|
+
}
|
|
3403
|
+
//#endregion
|
|
3404
|
+
//#region src/cli/report-case-compare.ts
|
|
3405
|
+
const reportCompareHelpText = `
|
|
3406
|
+
Compare normalized case records from two generated vieval reports.
|
|
3407
|
+
|
|
3408
|
+
Usage
|
|
3409
|
+
$ vieval report compare <leftReportPath> <rightReportPath> [options]
|
|
3410
|
+
|
|
3411
|
+
Options
|
|
3412
|
+
--format Output format: table | json (default: table)
|
|
3413
|
+
--case-key Case field, score name, or metric name used to match records
|
|
3414
|
+
--score-kind Score kind used for deltas (default: exact)
|
|
3415
|
+
--group-by Case field, score name, or metric name used for grouped deltas
|
|
3416
|
+
`;
|
|
3417
|
+
/**
|
|
3418
|
+
* Builds a generic case-level comparison between two report runs.
|
|
3419
|
+
*
|
|
3420
|
+
* Use when:
|
|
3421
|
+
* - local report analysis needs per-case improvements/regressions
|
|
3422
|
+
* - benchmark-specific facets should stay as generic metric keys
|
|
3423
|
+
*
|
|
3424
|
+
* Expects:
|
|
3425
|
+
* - left and right records are normalized `cases.jsonl` rows
|
|
3426
|
+
* - score values are numeric and comparable by `scoreKind`
|
|
3427
|
+
*
|
|
3428
|
+
* Returns:
|
|
3429
|
+
* - matched case deltas, added/removed cases, top changes, and optional group summaries
|
|
3430
|
+
*/
|
|
3431
|
+
function buildCaseComparison(args) {
|
|
3432
|
+
const scoreKind = args.scoreKind ?? "exact";
|
|
3433
|
+
const leftByKey = indexRecordsByCaseKey(args.left, args.caseKey, "left");
|
|
3434
|
+
const rightByKey = indexRecordsByCaseKey(args.right, args.caseKey, "right");
|
|
3435
|
+
const cases = [];
|
|
3436
|
+
const added = [];
|
|
3437
|
+
const removed = [];
|
|
3438
|
+
for (const [caseKey, leftRecord] of leftByKey) {
|
|
3439
|
+
const rightRecord = rightByKey.get(caseKey);
|
|
3440
|
+
if (rightRecord == null) {
|
|
3441
|
+
removed.push(leftRecord);
|
|
3442
|
+
continue;
|
|
3443
|
+
}
|
|
3444
|
+
const leftScore = getScore(leftRecord, scoreKind);
|
|
3445
|
+
const rightScore = getScore(rightRecord, scoreKind);
|
|
3446
|
+
cases.push({
|
|
3447
|
+
caseKey,
|
|
3448
|
+
delta: {
|
|
3449
|
+
left: leftScore,
|
|
3450
|
+
right: rightScore,
|
|
3451
|
+
score: rightScore - leftScore
|
|
3452
|
+
},
|
|
3453
|
+
left: leftRecord,
|
|
3454
|
+
metricsChanged: diffMetrics(leftRecord.metrics, rightRecord.metrics),
|
|
3455
|
+
right: rightRecord
|
|
3456
|
+
});
|
|
3457
|
+
}
|
|
3458
|
+
for (const [caseKey, rightRecord] of rightByKey) if (!leftByKey.has(caseKey)) added.push(rightRecord);
|
|
3459
|
+
const sortedCases = [...cases].sort((left, right) => {
|
|
3460
|
+
const deltaOrder = right.delta.score - left.delta.score;
|
|
3461
|
+
return deltaOrder === 0 ? left.caseKey.localeCompare(right.caseKey) : deltaOrder;
|
|
3462
|
+
});
|
|
3463
|
+
return {
|
|
3464
|
+
added: added.sort(compareCaseRecords),
|
|
3465
|
+
cases: cases.sort((left, right) => left.caseKey.localeCompare(right.caseKey)),
|
|
3466
|
+
groups: args.groupBy == null ? void 0 : buildComparisonGroups(cases, args.groupBy),
|
|
3467
|
+
overall: {
|
|
3468
|
+
delta: averageScore(args.right, scoreKind) - averageScore(args.left, scoreKind),
|
|
3469
|
+
leftAverage: averageScore(args.left, scoreKind),
|
|
3470
|
+
rightAverage: averageScore(args.right, scoreKind)
|
|
3471
|
+
},
|
|
3472
|
+
removed: removed.sort(compareCaseRecords),
|
|
3473
|
+
topImprovements: sortedCases.filter((row) => row.delta.score > 0).slice(0, 10),
|
|
3474
|
+
topRegressions: [...sortedCases].reverse().filter((row) => row.delta.score < 0).slice(0, 10)
|
|
3475
|
+
};
|
|
3476
|
+
}
|
|
3477
|
+
/**
 * Entry point for the `vieval report compare` command.
 *
 * Call stack:
 *
 * published executable (`../bin/vieval`)
 *   -> {@link import('./index').runTopLevelCli}
 *     -> {@link runReportCompareCli}
 *       -> {@link readCaseRecordsFromReport}
 *       -> {@link buildCaseComparison}
 *
 * Use when:
 * - two local report artifact directories should be compared case-by-case
 *
 * Expects:
 * - argv is either `compare <left> <right> ...` or `<left> <right> ...`
 *
 * Returns:
 * - resolves after the comparison was written to stdout (JSON when the parsed
 *   format is `json`, the text table otherwise)
 * - never rejects for failures raised inside the command: the message is
 *   written to stderr with a `[vieval report compare]` prefix and
 *   `process.exitCode` is set to 1
 */
async function runReportCompareCli(argv) {
	try {
		const options = parseReportCompareCliArguments(argv);
		// Both reports are independent, so they are loaded concurrently.
		const reportPaths = [options.leftReportPath, options.rightReportPath];
		const [left, right] = await Promise.all(reportPaths.map((reportPath) => readCaseRecordsFromReport(reportPath)));
		const comparison = buildCaseComparison({
			caseKey: options.caseKey,
			groupBy: options.groupBy,
			left,
			right,
			scoreKind: options.scoreKind
		});
		const rendered = options.format === "json" ? JSON.stringify(comparison, null, 2) : formatCaseComparisonTable(comparison);
		process.stdout.write(`${rendered}\n`);
	} catch (error) {
		const errorMessage = errorMessageFrom(error) ?? "Unknown report compare failure.";
		process.stderr.write(`[vieval report compare] ${errorMessage}\n`);
		process.exitCode = 1;
	}
}
|
|
3519
|
+
/**
 * Strips routing tokens so only the compare command's own arguments remain.
 *
 * Removes, in order: one leading `--` separator, then either a leading
 * `report compare` pair or a leading `compare` token.
 *
 * Returns:
 * - a new array; the caller's `argv` is never mutated or returned as-is
 */
function normalizeCliArgv$2(argv) {
	const hasLeadingSeparator = argv[0] === "--";
	const rest = hasLeadingSeparator ? argv.slice(1) : Array.from(argv);
	const [first, second] = rest;
	let routingTokenCount = 0;
	if (first === "report" && second === "compare") routingTokenCount = 2;
	else if (first === "compare") routingTokenCount = 1;
	return routingTokenCount === 0 ? rest : rest.slice(routingTokenCount);
}
|
|
3525
|
+
/**
 * Parses `vieval report compare` arguments into a plain options object.
 *
 * Expects:
 * - two positional inputs: the left and the right report path
 *
 * Returns:
 * - `{ caseKey, format, groupBy, leftReportPath, rightReportPath, scoreKind }`
 * - `format` is `"json"` only when the flag is exactly `json`; every other
 *   value resolves to `"table"`
 * - `scoreKind` defaults to `"exact"`
 *
 * Throws:
 * - when either positional report path is absent or empty
 */
function parseReportCompareCliArguments(argv) {
	const { flags, input } = meow(reportCompareHelpText, {
		argv: normalizeCliArgv$2(argv),
		flags: {
			caseKey: { type: "string" },
			format: {
				default: "table",
				type: "string"
			},
			groupBy: { type: "string" },
			scoreKind: {
				default: "exact",
				type: "string"
			}
		},
		importMeta: import.meta
	});
	const leftReportPath = input[0];
	const rightReportPath = input[1];
	const isMissingPath = (reportPath) => reportPath == null || reportPath.length === 0;
	if ([leftReportPath, rightReportPath].some(isMissingPath)) {
		throw new Error("Missing required <leftReportPath> and <rightReportPath> arguments.");
	}
	const format = flags.format === "json" ? "json" : "table";
	return {
		caseKey: flags.caseKey,
		format,
		groupBy: flags.groupBy,
		leftReportPath,
		rightReportPath,
		scoreKind: flags.scoreKind
	};
}
|
|
3554
|
+
/**
 * Builds a lookup from resolved case key to case record.
 *
 * Expects:
 * - `side` is a label used only in the duplicate-key error message
 *
 * Returns:
 * - a `Map` whose insertion order follows the order of `records`
 *
 * Throws:
 * - when two records resolve to the same case key
 */
function indexRecordsByCaseKey(records, caseKey, side) {
	const recordsByKey = new Map();
	for (const record of records) {
		const key = resolveCaseKey(record, caseKey);
		const isDuplicate = recordsByKey.has(key);
		if (isDuplicate) {
			throw new Error(`Duplicate case key "${key}" in ${side} report.`);
		}
		recordsByKey.set(key, record);
	}
	return recordsByKey;
}
|
|
3563
|
+
/**
 * Resolves the key used to match one record across two reports.
 *
 * With an explicit `caseKey` selector the selected value is required.
 * Without one, the first existing value of `benchmark.case.id` then
 * `vieval.case.id` is used, falling back to `record.caseId`.
 *
 * Returns:
 * - the selected value converted with `String(...)`, or `record.caseId` as-is
 *
 * Throws:
 * - when an explicit `caseKey` selector does not exist on the record
 */
function resolveCaseKey(record, caseKey) {
	if (caseKey == null) {
		const fallbackSelectors = ["benchmark.case.id", "vieval.case.id"];
		for (const selector of fallbackSelectors) {
			const candidate = getCaseSelectorValue(record, selector);
			if (candidate.exists) return String(candidate.value);
		}
		return record.caseId;
	}
	const explicit = getCaseSelectorValue(record, caseKey);
	if (!explicit.exists) {
		throw new Error(`Missing explicit case key "${caseKey}" for case "${record.caseId}".`);
	}
	return String(explicit.value);
}
|
|
3574
|
+
/**
 * Reads one score from a case record.
 *
 * Returns:
 * - `record.scores[scoreKind]`, or 0 when that entry is `null` or `undefined`
 */
function getScore(record, scoreKind) {
	const { scores } = record;
	const score = scores[scoreKind];
	return score ?? 0;
}
|
|
3577
|
+
/**
 * Averages one score kind over a list of case records.
 *
 * Only entries whose score is of type `number` take part; records without a
 * numeric score are skipped rather than counted as zero.
 *
 * Returns:
 * - the arithmetic mean of the numeric scores, or 0 when there are none
 */
function averageScore(records, scoreKind) {
	let total = 0;
	let count = 0;
	for (const record of records) {
		const score = record.scores[scoreKind];
		if (typeof score !== "number") continue;
		total += score;
		count += 1;
	}
	return count === 0 ? 0 : total / count;
}
|
|
3582
|
+
/**
 * Collects the metrics whose values differ between two metric objects.
 *
 * Values are compared through `stableStringify`, and keys from both sides are
 * visited in `localeCompare` order.
 *
 * Returns:
 * - an object mapping each differing metric key to `{ left, right }`
 */
function diffMetrics(left, right) {
	const metricKeys = Array.from(new Set([...Object.keys(left), ...Object.keys(right)]));
	metricKeys.sort((firstKey, secondKey) => firstKey.localeCompare(secondKey));
	const changed = {};
	for (const metricKey of metricKeys) {
		const leftValue = left[metricKey];
		const rightValue = right[metricKey];
		if (stableStringify(leftValue) === stableStringify(rightValue)) continue;
		changed[metricKey] = {
			left: leftValue,
			right: rightValue
		};
	}
	return changed;
}
|
|
3591
|
+
/**
 * Aggregates matched comparison rows into groups.
 *
 * Each row is grouped by the `groupBy` selector value read from its right-hand
 * record; rows without that value are left out. Group keys have the form
 * `<groupBy>=<value>` and are emitted in `localeCompare` order.
 *
 * Returns:
 * - an object mapping group key to `{ count, delta, leftAverage, rightAverage }`
 */
function buildComparisonGroups(cases, groupBy) {
	const rowsByGroup = {};
	for (const row of cases) {
		const selected = getCaseSelectorValue(row.right, groupBy);
		if (selected.exists) {
			const groupKey = `${groupBy}=${String(selected.value)}`;
			rowsByGroup[groupKey] ??= [];
			rowsByGroup[groupKey].push(row);
		}
	}
	// Mean of one side's per-row score; every group holds at least one row.
	const meanOf = (rows, side) => rows.reduce((sum, row) => sum + row.delta[side], 0) / rows.length;
	const sortedGroups = Object.entries(rowsByGroup).sort(([firstKey], [secondKey]) => firstKey.localeCompare(secondKey));
	const summaries = sortedGroups.map(([groupKey, rows]) => {
		const leftAverage = meanOf(rows, "left");
		const rightAverage = meanOf(rows, "right");
		const summary = {
			count: rows.length,
			delta: rightAverage - leftAverage,
			leftAverage,
			rightAverage
		};
		return [groupKey, summary];
	});
	return Object.fromEntries(summaries);
}
|
|
3611
|
+
/**
 * Sort comparator that orders case records by `caseId` using `localeCompare`.
 */
function compareCaseRecords(left, right) {
	const { caseId: leftCaseId } = left;
	const { caseId: rightCaseId } = right;
	return leftCaseId.localeCompare(rightCaseId);
}
|
|
3614
|
+
/**
 * Renders a case comparison as compact, terminal-friendly text.
 *
 * Use when:
 * - `vieval report compare` should expose the same information as JSON output
 * - users need a terminal-first overview of group and per-case deltas
 *
 * Expects:
 * - comparison output was produced by {@link buildCaseComparison}
 *
 * Returns:
 * - newline-joined text: a fixed five-line summary header, followed by the
 *   groups, top improvements, top regressions, cases, added and removed
 *   sections, each emitted only when it has content
 */
function formatCaseComparisonTable(output) {
	// All scores are printed with three decimals.
	const fixed = (value) => value.toFixed(3);
	const describeChange = (row) => `${row.caseKey} delta=${fixed(row.delta.score)} left=${fixed(row.delta.left)} right=${fixed(row.delta.right)}`;
	const listCaseIds = (records) => records.map((record) => record.caseId).join(",");
	const { added, cases, groups, overall, removed, topImprovements, topRegressions } = output;
	const lines = [];
	lines.push("COMPARE vieval report cases");
	lines.push(`Matched ${cases.length}`);
	lines.push(`Added ${added.length}`);
	lines.push(`Removed ${removed.length}`);
	lines.push(`Scores left=${fixed(overall.leftAverage)} right=${fixed(overall.rightAverage)} delta=${fixed(overall.delta)}`);
	const groupEntries = groups == null ? [] : Object.entries(groups);
	if (groupEntries.length > 0) {
		lines.push("Groups");
		groupEntries.forEach(([groupKey, group]) => {
			lines.push(`${groupKey} count=${group.count} left=${fixed(group.leftAverage)} right=${fixed(group.rightAverage)} delta=${fixed(group.delta)}`);
		});
	}
	if (topImprovements.length > 0) {
		lines.push("Top improvements", ...topImprovements.map(describeChange));
	}
	if (topRegressions.length > 0) {
		lines.push("Top regressions", ...topRegressions.map(describeChange));
	}
	if (cases.length > 0) {
		lines.push("Cases");
		cases.forEach((row) => {
			const changedMetricNames = Object.keys(row.metricsChanged);
			const changedMetrics = changedMetricNames.length === 0 ? "none" : changedMetricNames.join(",");
			lines.push(`${row.caseKey} delta=${fixed(row.delta.score)} changedMetrics=${changedMetrics}`);
		});
	}
	if (added.length > 0) lines.push(`Added cases ${listCaseIds(added)}`);
	if (removed.length > 0) lines.push(`Removed cases ${listCaseIds(removed)}`);
	return lines.join("\n");
}
|
|
3658
|
+
//#endregion
|
|
2415
3659
|
//#region src/cli/report-index.ts
|
|
2416
3660
|
const reportIndexHelpText = `
|
|
2417
3661
|
Build report indexes from generated vieval artifacts.
|
|
@@ -2579,7 +3823,15 @@ async function runTopLevelCli(argv) {
|
|
|
2579
3823
|
await runReportIndexCli(parsed.commandArgv);
|
|
2580
3824
|
return;
|
|
2581
3825
|
}
|
|
2582
|
-
|
|
3826
|
+
if (reportSubcommand === "cases") {
|
|
3827
|
+
await runReportCasesCli(parsed.commandArgv);
|
|
3828
|
+
return;
|
|
3829
|
+
}
|
|
3830
|
+
if (reportSubcommand === "compare") {
|
|
3831
|
+
await runReportCompareCli(parsed.commandArgv);
|
|
3832
|
+
return;
|
|
3833
|
+
}
|
|
3834
|
+
throw new Error(`Unsupported vieval report command "${reportSubcommand ?? "(none)"}". Expected "analyze", "index", "cases", or "compare".`);
|
|
2583
3835
|
}
|
|
2584
3836
|
if (parsed.command === "compare") {
|
|
2585
3837
|
await runCompareCliOrExit(parsed.commandArgv);
|
|
@@ -2590,4 +3842,4 @@ async function runTopLevelCli(argv) {
|
|
|
2590
3842
|
//#endregion
|
|
2591
3843
|
export { runTopLevelCli as n, parseTopLevelCliArguments as t };
|
|
2592
3844
|
|
|
2593
|
-
//# sourceMappingURL=cli-
|
|
3845
|
+
//# sourceMappingURL=cli-ImxGpoYQ.mjs.map
|