vieval 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/README.md +219 -109
  2. package/dist/bin/vieval.mjs +1 -1
  3. package/dist/cli/index.mjs +1 -1
  4. package/dist/{cli-DayPXzHX.mjs → cli-ImxGpoYQ.mjs} +1447 -195
  5. package/dist/cli-ImxGpoYQ.mjs.map +1 -0
  6. package/dist/config.d.mts +2 -2
  7. package/dist/config.mjs +1 -1
  8. package/dist/core/assertions/index.d.mts +1 -1
  9. package/dist/core/inference-executors/index.d.mts +1 -1
  10. package/dist/core/inference-executors/index.mjs +1 -1
  11. package/dist/core/processors/results/index.d.mts +1 -1
  12. package/dist/core/runner/index.d.mts +3 -2
  13. package/dist/core/runner/index.mjs +3 -2
  14. package/dist/core/runner/index.mjs.map +1 -1
  15. package/dist/core/scheduler/index.d.mts +2 -0
  16. package/dist/core/scheduler/index.mjs +188 -0
  17. package/dist/core/scheduler/index.mjs.map +1 -0
  18. package/dist/{env-BFSjny07.mjs → env--94B0UtW.mjs} +1 -1
  19. package/dist/{env-BFSjny07.mjs.map → env--94B0UtW.mjs.map} +1 -1
  20. package/dist/{env-BTq3dV7C.d.mts → env-BeHv_5mo.d.mts} +1 -1
  21. package/dist/{expect-extensions-QLXESWjn.mjs → expect-extensions-DCSqlneN.mjs} +1 -1
  22. package/dist/{expect-extensions-QLXESWjn.mjs.map → expect-extensions-DCSqlneN.mjs.map} +1 -1
  23. package/dist/expect.mjs +1 -1
  24. package/dist/{index-OEdqjQSe.d.mts → index-5R1_k2nv.d.mts} +195 -3
  25. package/dist/index-fakXoZEe.d.mts +147 -0
  26. package/dist/index.d.mts +120 -13
  27. package/dist/index.mjs +286 -54
  28. package/dist/index.mjs.map +1 -1
  29. package/dist/{models-D_MsBtYw.mjs → models-DIGdOUpJ.mjs} +1 -1
  30. package/dist/models-DIGdOUpJ.mjs.map +1 -0
  31. package/dist/plugins/chat-models/index.d.mts +27 -1
  32. package/dist/plugins/chat-models/index.mjs +29 -1
  33. package/dist/plugins/chat-models/index.mjs.map +1 -1
  34. package/dist/queue-DsZQkZO_.mjs +21 -0
  35. package/dist/queue-DsZQkZO_.mjs.map +1 -0
  36. package/dist/{registry-CwcMMjnZ.mjs → registry-BHGMxjpA.mjs} +164 -6
  37. package/dist/registry-BHGMxjpA.mjs.map +1 -0
  38. package/dist/testing/expect-extensions.mjs +1 -1
  39. package/package.json +8 -1
  40. package/dist/cli-DayPXzHX.mjs.map +0 -1
  41. package/dist/models-D_MsBtYw.mjs.map +0 -1
  42. package/dist/registry-CwcMMjnZ.mjs.map +0 -1
@@ -1,4 +1,5 @@
1
- import { c as loadRawVievalConfig, l as loadVievalCliConfig, n as consumeModuleRegistrations, o as detectCliConfigMode, r as endModuleRegistration, t as beginModuleRegistration } from "./registry-CwcMMjnZ.mjs";
1
+ import { a as createOpenTelemetryRuntime, c as detectCliConfigMode, d as loadVievalCliConfig, n as consumeModuleRegistrations, o as createNoopTelemetryRuntime, r as endModuleRegistration, t as beginModuleRegistration, u as loadRawVievalConfig } from "./registry-BHGMxjpA.mjs";
2
+ import { createSchedulerRuntime } from "./core/scheduler/index.mjs";
2
3
  import { RunnerExecutionError, collectEvalEntries, createFilesystemTaskCacheRuntime, createRunnerRuntimeContext, createRunnerSchedule, createTaskExecutionContext, runScheduledTasks } from "./core/runner/index.mjs";
3
4
  import process from "node:process";
4
5
  import { errorMessageFrom } from "@moeru/std";
@@ -12,6 +13,7 @@ import c from "tinyrainbow";
12
13
  import { existsSync, readFileSync } from "node:fs";
13
14
  import { uniq } from "es-toolkit";
14
15
  import { createVitest } from "vitest/node";
16
+ import { formatDuration, intervalToDuration } from "date-fns";
15
17
  import { stripVTControlCharacters } from "node:util";
16
18
  import stringWidth from "fast-string-width";
17
19
  //#region src/cli/comparison-config.ts
@@ -262,6 +264,612 @@ async function loadEvalModulesWithVitestRuntime(evalFilePaths, projectRoot) {
262
264
  return loadedModules;
263
265
  }
264
266
  //#endregion
267
+ //#region src/cli/report-selectors.ts
/**
 * Looks up the value a report selector refers to on one case record.
 *
 * Use when:
 * - filtering, grouping, and comparing must share one lookup rule
 *
 * Expects:
 * - `record.metrics` and `record.scores` are objects
 * - `key` is a string
 *
 * Lookup order (first own-property match wins):
 * 1. `record.metrics[key]`
 * 2. `record.scores[<name>]` when `key` is written as `scores.<name>`
 * 3. `record.scores[key]`
 * 4. `record[key]`
 *
 * Returns:
 * - `{ exists: true, value }` on a match, otherwise `{ exists: false }`
 */
function getCaseSelectorValue(record, key) {
  const found = (value) => ({ exists: true, value });
  const { metrics } = record;
  if (Object.hasOwn(metrics, key)) return found(metrics[key]);

  const scorePrefix = "scores.";
  if (key.startsWith(scorePrefix)) {
    const scoreName = key.slice(scorePrefix.length);
    if (Object.hasOwn(record.scores, scoreName)) return found(record.scores[scoreName]);
  }

  // A bare score key is tried after the prefixed form, then direct record fields.
  if (Object.hasOwn(record.scores, key)) return found(record.scores[key]);
  return Object.hasOwn(record, key) ? found(record[key]) : { exists: false };
}
/**
 * Stable-stringifies JSON-like values for report comparisons.
 *
 * Object keys are emitted in `localeCompare` order so two structurally equal
 * values produce the same text regardless of key insertion order.
 *
 * Before:
 * - `{ b: 1, a: true }`
 * - `[1, undefined]`
 * - `{ a: undefined, b: 1 }`
 *
 * After:
 * - `{"a":true,"b":1}`
 * - `[1,null]`
 * - `{"b":1}`
 *
 * Returns:
 * - JSON text; for a top-level value `JSON.stringify` cannot encode
 *   (for example `undefined`) the result is `undefined`, as with `JSON.stringify`
 */
function stableStringify(value) {
  if (value == null || typeof value !== "object") return JSON.stringify(value);
  // Mirror JSON.stringify: unencodable array items become `null` instead of
  // leaking the literal text "undefined" into the output.
  if (Array.isArray(value)) return `[${value.map((item) => stableStringify(item) ?? "null").join(",")}]`;
  const members = [];
  for (const key of Object.keys(value).sort((left, right) => left.localeCompare(right))) {
    const encoded = stableStringify(value[key]);
    // Mirror JSON.stringify: properties with unencodable values are omitted.
    if (encoded !== undefined) members.push(`${JSON.stringify(key)}:${encoded}`);
  }
  return `{${members.join(",")}}`;
}
315
+ //#endregion
316
+ //#region src/cli/report-otlp.ts
/**
 * Projects normalized case records into OTLP-shaped trace, log, and metric containers.
 *
 * Use when:
 * - report artifacts should be written locally without an OpenTelemetry Collector
 *
 * Expects:
 * - `args.records` is an array of case records (with `metrics`, `scores`, ids, and ISO timestamps)
 * - `args.runId` identifies the run stamped onto run, project, and task spans
 *
 * Returns:
 * - `{ logs, metrics, traces }`, where traces hold one run span followed by
 *   project, task, and case spans; logs hold one record per case; metrics hold
 *   one gauge per score kind with a data point for each record whose score is a number
 */
function buildLocalOtlpProjection(args) {
  const { records, runId } = args;

  const toProjectSpan = (projectName) => ({
    attributes: toAttributes({
      "vieval.project.name": projectName,
      "vieval.run.id": runId
    }),
    name: "vieval.project"
  });
  const toTaskSpan = (task) => ({
    attributes: toAttributes({
      "vieval.project.name": task.projectName,
      "vieval.run.id": runId,
      "vieval.task.id": task.taskId
    }),
    name: "vieval.task"
  });
  // Case metrics are spread first so the fixed `vieval.*` keys win on collision.
  const toCaseSpan = (record) => ({
    attributes: toAttributes({
      ...record.metrics,
      "vieval.case.duration_ms": record.durationMs,
      "vieval.case.id": record.caseId,
      "vieval.case.name": record.caseName,
      "vieval.case.retry_count": record.retryCount,
      "vieval.case.state": record.state,
      "vieval.project.name": record.projectName,
      "vieval.task.id": record.taskId
    }),
    endTimeUnixNano: isoToUnixNano(record.endedAt),
    name: "vieval.case",
    startTimeUnixNano: isoToUnixNano(record.startedAt)
  });
  const toLogRecord = (record) => ({
    attributes: toAttributes(record.metrics),
    body: { stringValue: JSON.stringify({
      caseId: record.caseId,
      scores: record.scores,
      state: record.state
    }) },
    eventName: "vieval.case",
    timeUnixNano: isoToUnixNano(record.endedAt)
  });
  const toScoreGauge = (kind) => {
    const scoredRecords = records.filter((record) => typeof record.scores[kind] === "number");
    return {
      gauge: { dataPoints: scoredRecords.map((record) => ({
        asDouble: record.scores[kind],
        attributes: toAttributes({
          ...record.metrics,
          "vieval.case.id": record.caseId,
          "vieval.task.id": record.taskId
        }),
        timeUnixNano: isoToUnixNano(record.endedAt)
      })) },
      name: `vieval.score.${kind}`
    };
  };

  const projectSpans = collectProjectNames(records).map(toProjectSpan);
  const taskSpans = collectTasks(records).map(toTaskSpan);
  const caseSpans = records.map(toCaseSpan);

  const logs = { resourceLogs: [{ scopeLogs: [{
    logRecords: records.map(toLogRecord),
    scope: { name: "vieval" }
  }] }] };
  const metrics = { resourceMetrics: [{ scopeMetrics: [{
    metrics: collectScoreKinds(records).map(toScoreGauge),
    scope: { name: "vieval" }
  }] }] };
  const traces = { resourceSpans: [{ scopeSpans: [{
    scope: { name: "vieval" },
    spans: [
      {
        attributes: toAttributes({ "vieval.run.id": runId }),
        name: "vieval.run"
      },
      ...projectSpans,
      ...taskSpans,
      ...caseSpans
    ]
  }] }] };

  return { logs, metrics, traces };
}
/**
 * Converts a plain attribute object into a key-sorted list of `{ key, value }` pairs.
 *
 * Entries whose value is `undefined` are dropped; remaining values are wrapped
 * by `toAnyValue`. Keys are ordered with `localeCompare`.
 */
function toAttributes(attributes) {
  const definedEntries = Object.entries(attributes).filter((entry) => entry[1] !== void 0);
  definedEntries.sort((left, right) => left[0].localeCompare(right[0]));
  return definedEntries.map(([key, rawValue]) => ({ key, value: toAnyValue(rawValue) }));
}
/**
 * Wraps one attribute value in a tagged value object.
 *
 * Returns:
 * - `{ arrayValue }` for arrays (items wrapped recursively)
 * - `{ boolValue }` for booleans
 * - `{ doubleValue }` for finite numbers, `{ stringValue }` with the number's text otherwise
 * - `{ stringValue: "null" }` for `null`/`undefined`
 * - `{ stringValue }` for strings, and for any other value its `stableStringify` text
 */
function toAnyValue(value) {
  if (Array.isArray(value)) return { arrayValue: { values: value.map((item) => toAnyValue(item)) } };
  if (!isAttributeScalar(value)) return { stringValue: stableStringify(value) };
  switch (typeof value) {
    case "boolean":
      return { boolValue: value };
    case "number":
      return Number.isFinite(value) ? { doubleValue: value } : { stringValue: String(value) };
    case "string":
      return { stringValue: value };
    default:
      // Only null/undefined scalars reach this branch.
      return { stringValue: "null" };
  }
}
/**
 * Reports whether a value is `null`, `undefined`, a boolean, a number, or a string.
 */
function isAttributeScalar(value) {
  if (value == null) return true;
  const valueType = typeof value;
  return valueType === "boolean" || valueType === "number" || valueType === "string";
}
/**
 * Converts an ISO-8601 timestamp string into Unix nanoseconds, as a decimal string.
 *
 * Timestamps shaped `YYYY-MM-DDTHH:MM:SS[.fraction](Z|±HH:MM)` with up to nine
 * fractional digits keep their full fractional precision. Any other input falls
 * back to `Date.parse` (millisecond precision).
 *
 * Returns:
 * - nanoseconds since the Unix epoch as a string, or `"0"` when the input cannot be parsed
 */
function isoToUnixNano(value) {
  const nanosPerMilli = 1000000n;
  const parts = /^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.(\d{1,9}))?(Z|[+-]\d{2}:\d{2})$/.exec(value);
  if (parts == null) {
    const fallbackMs = Date.parse(value);
    return Number.isFinite(fallbackMs) ? String(BigInt(fallbackMs) * nanosPerMilli) : "0";
  }
  const wholeSeconds = parts[1];
  const fractionDigits = parts[2] ?? "";
  const zone = parts[3];
  // Parse at whole-second precision, then add the fraction back as nanoseconds.
  const wholeSecondMs = Date.parse(`${wholeSeconds}.000${zone}`);
  if (!Number.isFinite(wholeSecondMs)) return "0";
  const fractionNanos = BigInt(fractionDigits.padEnd(9, "0").slice(0, 9));
  return String(BigInt(wholeSecondMs) * nanosPerMilli + fractionNanos);
}
/**
 * Lists every distinct score key found across the records, sorted with `localeCompare`.
 */
function collectScoreKinds(records) {
  const everyKind = records.flatMap((record) => Object.keys(record.scores));
  const distinctKinds = Array.from(new Set(everyKind));
  distinctKinds.sort((left, right) => left.localeCompare(right));
  return distinctKinds;
}
/**
 * Lists every distinct `projectName` found across the records, sorted with `localeCompare`.
 */
function collectProjectNames(records) {
  const everyName = records.map((record) => record.projectName);
  const distinctNames = Array.from(new Set(everyName));
  distinctNames.sort((left, right) => left.localeCompare(right));
  return distinctNames;
}
/**
 * Lists every distinct `(projectName, taskId)` pair found across the records.
 *
 * Returns:
 * - `{ projectName, taskId }` objects ordered by project name, then task id
 */
function collectTasks(records) {
  const taskByIdentity = /* @__PURE__ */ new Map();
  for (const { projectName, taskId } of records) {
    taskByIdentity.set(`${projectName}\0${taskId}`, { projectName, taskId });
  }
  const tasks = Array.from(taskByIdentity.values());
  tasks.sort((left, right) => left.projectName.localeCompare(right.projectName) || left.taskId.localeCompare(right.taskId));
  return tasks;
}
452
+ //#endregion
453
+ //#region src/cli/report-records.ts
/**
 * Builds normalized case records from lifecycle, metric, and score events.
 *
 * Use when:
 * - `events.jsonl` should be projected into `cases.jsonl`
 * - report commands need one final record per observed case outcome
 *
 * Expects:
 * - events are ordered by occurrence where possible
 * - lifecycle events use either `task.case.start`/`task.case.end` or current CLI `CaseStarted`/`CaseEnded` names
 * - `args` also carries the fallback identity (`attemptId`, `experimentId`, `projectName`, `runId`, `workspaceId`)
 *
 * Returns:
 * - records for cases that emitted an end lifecycle event, in the order each
 *   case first ended; a case whose draft has no `endedAt` after the last event
 *   (for example restarted and never ended again) is omitted
 */
function buildCaseRecords(args) {
  const drafts = /* @__PURE__ */ new Map();
  // A Set iterates in insertion order, so it keeps first-completion order while
  // giving constant-time de-duplication (the array `includes` scan was quadratic).
  const completedKeys = /* @__PURE__ */ new Set();
  for (const event of args.events) {
    const normalizedEvent = normalizeCaseEventName(event.event);
    if (normalizedEvent == null) continue;
    const ids = extractEventIds(event, args);
    // Events that cannot be attributed to a task and case are ignored.
    if (ids.caseId.length === 0 || ids.taskId.length === 0) continue;
    const draft = getOrCreateDraft(drafts, ids, event, args);
    applyIdentity(draft, ids, event, args);
    if (normalizedEvent === "start") applyCaseStart(draft, event);
    else if (normalizedEvent === "metric") applyCaseMetric(draft, event);
    else if (normalizedEvent === "score") applyCaseScore(draft, event);
    else {
      applyCaseEnd(draft, event);
      completedKeys.add(createCaseKey(ids.taskId, ids.caseId));
    }
  }
  const records = [];
  for (const key of completedKeys) {
    const draft = drafts.get(key);
    if (draft != null && draft.endedAt != null) records.push(toCaseRecord(draft));
  }
  return records;
}
/**
 * Aggregates record scores overall and per requested grouping key.
 *
 * Use when:
 * - report artifacts need benchmark-neutral aggregate score views
 * - callers want to group by metric names or direct record fields
 *
 * Expects:
 * - `records` and `groupByKeys` are iterable
 * - each record exposes `scores` and `metrics` objects
 *
 * Returns:
 * - `{ groups, overall }`, where `groups` is keyed by `<key>=<value>` (value
 *   rendered with `String`) and each bucket holds `average`, `count`, and `sum` per score kind
 */
function buildMetricsSummary(records, groupByKeys) {
  const overallBuckets = {};
  const bucketsByGroup = {};
  for (const record of records) {
    addRecordScores(overallBuckets, record);
    for (const groupByKey of groupByKeys) {
      const resolved = getGroupValue(record, groupByKey);
      if (resolved.exists) {
        const groupName = `${groupByKey}=${String(resolved.value)}`;
        if (bucketsByGroup[groupName] == null) bucketsByGroup[groupName] = {};
        addRecordScores(bucketsByGroup[groupName], record);
      }
    }
  }
  const groups = finalizeSummaryGroups(bucketsByGroup);
  const overall = finalizeScoreSummary(overallBuckets);
  return { groups, overall };
}
/**
 * Encodes records as newline-delimited JSON.
 *
 * Use when:
 * - writing `cases.jsonl` for line-oriented consumers
 *
 * Expects:
 * - `records` is an array of JSON-serializable values
 *
 * Returns:
 * - an empty string for an empty array, otherwise one `JSON.stringify` line per
 *   record, each terminated by `\n`
 */
function encodeJsonl(records) {
  if (records.length === 0) return "";
  const lines = records.map((record) => JSON.stringify(record));
  return lines.join("\n").concat("\n");
}
/**
 * Maps a raw event name onto the case lifecycle step it represents.
 *
 * Returns:
 * - `"start"`, `"metric"`, `"score"`, or `"end"`; `undefined` for any other name
 */
function normalizeCaseEventName(eventName) {
  switch (eventName) {
    case "task.case.start":
    case "CaseStarted":
      return "start";
    case "task.case.metric":
      return "metric";
    case "task.case.score":
      return "score";
    case "task.case.end":
    case "CaseEnded":
      return "end";
    default:
      return void 0;
  }
}
/**
 * Resolves the identity fields of one event.
 *
 * For each field the first available source wins: a string on `event.data`,
 * then the envelope field on `event`, then the fallback from `args`
 * (`caseId`/`taskId` fall back to `""`; `projectName` also tries `event.projectId`).
 */
function extractEventIds(event, args) {
  const payload = asRecord(event.data);
  const fromPayload = (field) => stringFrom(payload?.[field]);
  return {
    attemptId: fromPayload("attemptId") ?? event.attemptId ?? args.attemptId,
    caseId: fromPayload("caseId") ?? event.caseId ?? "",
    experimentId: fromPayload("experimentId") ?? event.experimentId ?? args.experimentId,
    projectName: fromPayload("projectName") ?? event.projectName ?? event.projectId ?? args.projectName,
    runId: fromPayload("runId") ?? event.runId ?? args.runId,
    taskId: fromPayload("taskId") ?? event.taskId ?? "",
    workspaceId: fromPayload("workspaceId") ?? event.workspaceId ?? args.workspaceId
  };
}
/**
 * Returns the draft stored for the event's task/case pair, creating and
 * registering an empty one on first sight.
 *
 * A new draft starts with no metrics or scores, `retryCount` and `startCount`
 * at 0, and a case name taken from the event (falling back to the case id).
 */
function getOrCreateDraft(drafts, ids, event, args) {
  const draftKey = createCaseKey(ids.taskId, ids.caseId);
  const knownDraft = drafts.get(draftKey);
  if (knownDraft != null) return knownDraft;

  const { attemptId, caseId, experimentId, runId, taskId, workspaceId } = ids;
  const createdDraft = {
    attemptId,
    caseId,
    caseName: extractCaseName(event) ?? caseId,
    experimentId,
    metrics: {},
    projectName: ids.projectName || args.projectName,
    retryCount: 0,
    runId,
    scores: {},
    startCount: 0,
    taskId,
    workspaceId
  };
  drafts.set(draftKey, createdDraft);
  return createdDraft;
}
/**
 * Refreshes a draft's run identity from the latest event.
 *
 * Empty ids fall back to the values in `args`; the project name changes only
 * when the event names a project explicitly.
 */
function applyIdentity(draft, ids, event, args) {
  Object.assign(draft, {
    attemptId: ids.attemptId || args.attemptId,
    experimentId: ids.experimentId || args.experimentId,
    projectName: extractExplicitProjectName(event) ?? draft.projectName,
    runId: ids.runId || args.runId,
    workspaceId: ids.workspaceId || args.workspaceId
  });
}
/**
 * Applies a case start event to a draft.
 *
 * The first start time is kept across restarts, while per-attempt state
 * (end time, input, metrics, output, scores, state) is reset. The retry count
 * becomes the larger of its current value and the event's numeric `retryIndex`,
 * or the number of earlier starts when no `retryIndex` is given.
 */
function applyCaseStart(draft, event) {
  const payload = asRecord(event.data);
  draft.startCount += 1;
  draft.caseName = extractCaseName(event) ?? draft.caseName;
  if (draft.startedAt == null) draft.startedAt = stringFrom(payload?.startedAt) ?? event.timestamp;

  // Reset everything that belongs to a single attempt.
  const carriesInput = payload != null && "input" in payload;
  draft.endedAt = void 0;
  draft.input = carriesInput ? payload.input : void 0;
  draft.metrics = {};
  draft.output = void 0;
  draft.scores = {};
  draft.state = void 0;

  const observedRetries = numberFrom(payload?.retryIndex) ?? draft.startCount - 1;
  draft.retryCount = Math.max(draft.retryCount, observedRetries);
}
/**
 * Stores one metric from a metric event onto the draft.
 *
 * The event is ignored unless `data.name` is a string and `data.value` passes
 * `isCaseMetricValue`.
 */
function applyCaseMetric(draft, event) {
  const payload = asRecord(event.data);
  const metricName = stringFrom(payload?.name);
  if (metricName == null) return;
  const metricValue = payload?.value;
  if (!isCaseMetricValue(metricValue)) return;
  draft.metrics[metricName] = metricValue;
}
/**
 * Stores one score from a score event onto the draft.
 *
 * The score kind is the first string among `data.kind`, `data.name`, and
 * `data["vieval.score.kind"]`; the score is the first finite number among
 * `data.score`, `data.value`, and `data["vieval.score.value"]`. The event is
 * ignored when either is missing.
 */
function applyCaseScore(draft, event) {
  const payload = asRecord(event.data);
  const firstMatch = (fields, read) => {
    for (const field of fields) {
      const candidate = read(payload?.[field]);
      if (candidate != null) return candidate;
    }
    return void 0;
  };
  const kind = firstMatch(["kind", "name", "vieval.score.kind"], stringFrom);
  const score = firstMatch(["score", "value", "vieval.score.value"], numberFrom);
  if (kind == null || score == null) return;
  draft.scores[kind] = score;
}
/**
 * Applies a case end event to a draft.
 *
 * Sets the end time (payload `endedAt`, else the event timestamp), the output
 * when the payload has one, and the state (`"failed"` when the payload state is
 * not recognized). If no `exact` score was recorded, one is derived from the
 * state: 1 for `"passed"`, otherwise 0.
 */
function applyCaseEnd(draft, event) {
  const payload = asRecord(event.data);
  const carriesOutput = payload != null && "output" in payload;
  draft.caseName = extractCaseName(event) ?? draft.caseName;
  draft.endedAt = stringFrom(payload?.endedAt) ?? event.timestamp ?? draft.endedAt;
  draft.output = carriesOutput ? payload.output : draft.output;
  draft.state = normalizeState(stringFrom(payload?.state)) ?? "failed";
  if (draft.scores.exact == null) draft.scores.exact = draft.state === "passed" ? 1 : 0;
}
/**
 * Freezes a draft into the serialized case record shape (`schemaVersion: 1`).
 *
 * A missing start time falls back to the end time (then `""`), and a missing
 * end time falls back to the start time. `input` and `output` are included only
 * when defined, and a missing state becomes `"failed"`. Property order is fixed
 * so serialized records stay stable.
 */
function toCaseRecord(draft) {
  const startedAt = draft.startedAt ?? draft.endedAt ?? "";
  const endedAt = draft.endedAt ?? startedAt;
  const record = {
    attemptId: draft.attemptId,
    caseId: draft.caseId,
    caseName: draft.caseName,
    durationMs: calculateDurationMs(startedAt, endedAt),
    endedAt,
    experimentId: draft.experimentId
  };
  if (draft.input !== void 0) record.input = draft.input;
  record.metrics = draft.metrics;
  if (draft.output !== void 0) record.output = draft.output;
  record.projectName = draft.projectName;
  record.retryCount = draft.retryCount;
  record.runId = draft.runId;
  record.schemaVersion = 1;
  record.scores = draft.scores;
  record.startedAt = startedAt;
  record.state = draft.state ?? "failed";
  record.taskId = draft.taskId;
  record.workspaceId = draft.workspaceId;
  return record;
}
/**
 * Accumulates one record's finite scores into running `{ average, count, sum }` buckets.
 *
 * Expects:
 * - `summary` is a mutable object keyed by score kind
 * - `record.scores` maps score kinds to values; non-finite values are skipped
 *
 * Buckets are always created as own properties of `summary`. A score kind that
 * collides with an inherited member (`constructor`, `toString`, `__proto__`, ...)
 * therefore gets its own bucket instead of mutating the inherited object and
 * vanishing from the summary.
 */
function addRecordScores(summary, record) {
  for (const [kind, score] of Object.entries(record.scores)) {
    if (!Number.isFinite(score)) continue;
    if (!Object.hasOwn(summary, kind) || summary[kind] == null) {
      // defineProperty creates an own data property even for `__proto__`.
      Object.defineProperty(summary, kind, {
        configurable: true,
        enumerable: true,
        value: {
          average: 0,
          count: 0,
          sum: 0
        },
        writable: true
      });
    }
    const bucket = summary[kind];
    bucket.count += 1;
    bucket.sum += score;
  }
}
/**
 * Finalizes every group's score buckets, keeping the group keys unchanged.
 */
function finalizeSummaryGroups(groups) {
  const finalizedPairs = [];
  for (const [groupKey, buckets] of Object.entries(groups)) {
    finalizedPairs.push([groupKey, finalizeScoreSummary(buckets)]);
  }
  return Object.fromEntries(finalizedPairs);
}
/**
 * Produces final score buckets with `average` computed as `sum / count`.
 *
 * Returns:
 * - a new object with one `{ average, count, sum }` entry per score kind;
 *   `average` is 0 for a bucket whose `count` is 0
 */
function finalizeScoreSummary(summary) {
  const finalizedPairs = [];
  for (const [kind, { count, sum }] of Object.entries(summary)) {
    const average = count === 0 ? 0 : sum / count;
    finalizedPairs.push([kind, { average, count, sum }]);
  }
  return Object.fromEntries(finalizedPairs);
}
/**
 * Resolves the value a record contributes for one grouping key.
 *
 * Lookup order:
 * 1. own key of `record.metrics`
 * 2. own field of `record` whose value passes `isCaseMetricValue`
 *
 * A field the record does not own is reported as missing. Without that check a
 * missing field reads as `undefined`, which `isCaseMetricValue` accepts, so
 * every record would land in a `<key>=undefined` group.
 *
 * Returns:
 * - `{ exists: true, value }` on a match, otherwise `{ exists: false }`
 */
function getGroupValue(record, key) {
  if (Object.hasOwn(record.metrics, key)) return {
    exists: true,
    value: record.metrics[key]
  };
  if (!Object.hasOwn(record, key)) return { exists: false };
  const directValue = record[key];
  return isCaseMetricValue(directValue) ? {
    exists: true,
    value: directValue
  } : { exists: false };
}
/**
 * Reads a case name from an event payload: `data.caseName` if it is a string,
 * else `data.name` if it is a string, else `undefined`.
 */
function extractCaseName(event) {
  const payload = asRecord(event.data);
  if (payload == null) return void 0;
  return stringFrom(payload.caseName) ?? stringFrom(payload.name);
}
/**
 * Reads the project name an event states explicitly: a string `data.projectName`,
 * else `event.projectName`, else `event.projectId`.
 */
function extractExplicitProjectName(event) {
  const payloadProjectName = stringFrom(asRecord(event.data)?.projectName);
  if (payloadProjectName != null) return payloadProjectName;
  return event.projectName ?? event.projectId;
}
/**
 * Builds the map key for one case: the task id and case id joined by a NUL character.
 */
function createCaseKey(taskId, caseId) {
  return "".concat(taskId, "\u0000", caseId);
}
/**
 * Computes the elapsed milliseconds between two timestamps, never negative.
 *
 * Before:
 * - `startedAt="2026-05-08T00:00:00.000Z"`, `endedAt="2026-05-08T00:00:01.250Z"`
 * - `startedAt="bad"`, `endedAt="2026-05-08T00:00:01.250Z"`
 *
 * After:
 * - `1250`
 * - `0`
 *
 * Returns:
 * - `endedAt - startedAt` in milliseconds, or `0` when either timestamp fails
 *   `Date.parse` or the end precedes the start
 */
function calculateDurationMs(startedAt, endedAt) {
  const startMs = Date.parse(startedAt);
  const endMs = Date.parse(endedAt);
  const bothParsed = Number.isFinite(startMs) && Number.isFinite(endMs);
  if (!bothParsed) return 0;
  const elapsedMs = endMs - startMs;
  return elapsedMs > 0 ? elapsedMs : 0;
}
/**
 * Passes through a recognized case state (`failed`, `passed`, `skipped`,
 * `timeout`) and yields `undefined` for anything else.
 */
function normalizeState(value) {
  switch (value) {
    case "failed":
    case "passed":
    case "skipped":
    case "timeout":
      return value;
    default:
      return void 0;
  }
}
/**
 * Reports whether a value may be stored as a case metric: `null`, `undefined`,
 * a boolean, a number, a string, or an array.
 */
function isCaseMetricValue(value) {
  if (value == null || Array.isArray(value)) return true;
  const valueType = typeof value;
  return valueType === "boolean" || valueType === "number" || valueType === "string";
}
/**
 * Returns the value when it is a non-null, non-array object; otherwise `undefined`.
 */
function asRecord(value) {
  const isKeyedObject = typeof value === "object" && value !== null && !Array.isArray(value);
  return isKeyedObject ? value : void 0;
}
/**
 * Returns the value when it is a string (including `""`); otherwise `undefined`.
 */
function stringFrom(value) {
  if (typeof value !== "string") return void 0;
  return value;
}
/**
 * Returns the value when it is a finite number; otherwise `undefined`.
 */
function numberFrom(value) {
  if (typeof value !== "number") return void 0;
  return Number.isFinite(value) ? value : void 0;
}
726
+ //#endregion
727
+ //#region src/cli/report-artifacts.ts
/**
 * Resolves one or more `run-summary.json` paths from a report location.
 *
 * Use when:
 * - callers may pass a summary file path, a run directory, or a report root
 *
 * Resolution order:
 * 1. an existing path ending in `.json` is returned as the only summary
 * 2. a `run-summary.json` directly inside the path is returned as the only summary
 * 3. otherwise every `run-summary.json` found by globbing below the path
 *
 * Returns:
 * - absolute summary file paths; glob results are sorted with `localeCompare`
 */
async function resolveRunSummaryPaths(reportPath) {
  const reportRoot = resolve(reportPath);
  if (reportRoot.endsWith(".json") && existsSync(reportRoot)) return [reportRoot];

  const nestedSummaryPath = resolve(reportRoot, "run-summary.json");
  if (existsSync(nestedSummaryPath)) return [nestedSummaryPath];

  const discoveredPaths = await glob("**/run-summary.json", {
    absolute: true,
    cwd: reportRoot
  });
  return discoveredPaths.sort((left, right) => left.localeCompare(right));
}
/**
 * Reads one run report artifact set from `run-summary.json` and sibling `events.jsonl`.
 *
 * Use when:
 * - report analysis needs both run aggregate output and event count metadata
 *
 * Expects:
 * - `summaryFilePath` points at a readable JSON summary file
 * - `events.jsonl`, when present next to it, holds one JSON value per non-blank line
 *
 * Returns:
 * - `{ events, eventsCount, reportDirectory, summary, summaryFilePath }`; `events`
 *   is empty when no `events.jsonl` exists, and each event keeps only the known envelope fields
 *
 * Throws:
 * - `SyntaxError` naming the events file and 1-based line number when a line is
 *   not valid JSON (the parser error is attached as `cause`)
 */
function readReportRunArtifact(summaryFilePath) {
  const reportDirectory = resolve(summaryFilePath, "..");
  const summary = JSON.parse(readFileSync(summaryFilePath, "utf-8"));
  const eventsFilePath = resolve(reportDirectory, "events.jsonl");
  const events = [];
  if (existsSync(eventsFilePath)) {
    const lines = readFileSync(eventsFilePath, "utf-8").split("\n");
    lines.forEach((line, lineIndex) => {
      if (line.trim().length === 0) return;
      let event;
      try {
        event = JSON.parse(line);
      } catch (error) {
        // Point at the offending line instead of surfacing a bare parser message.
        const reason = error instanceof Error ? error.message : String(error);
        throw new SyntaxError(`Invalid JSON in ${eventsFilePath} at line ${lineIndex + 1}: ${reason}`, { cause: error });
      }
      events.push({
        attemptId: event.attemptId,
        caseId: event.caseId,
        data: event.data,
        event: event.event,
        experimentId: event.experimentId,
        projectId: event.projectId,
        projectName: event.projectName,
        runId: event.runId,
        taskId: event.taskId,
        timestamp: event.timestamp,
        workspaceId: event.workspaceId
      });
    });
  }
  return {
    events,
    eventsCount: events.length,
    reportDirectory,
    summary,
    summaryFilePath
  };
}
/**
 * Reads all run artifacts found under `reportPath`.
 *
 * Use when:
 * - callers need multi-run analysis from a directory root
 *
 * Returns:
 * - one artifact per summary path, in the order `resolveRunSummaryPaths` yields them
 */
async function readReportArtifacts(reportPath) {
  const summaryFilePaths = await resolveRunSummaryPaths(reportPath);
  return summaryFilePaths.map((summaryFilePath) => readReportRunArtifact(summaryFilePath));
}
/**
 * Creates a compact summary row for one run artifact.
 *
 * Use when:
 * - table/csv/jsonl exports should stay stable and cheap to parse
 *
 * Expects:
 * - `artifact.summary.projects` is an array of objects with `name`, `executed`,
 *   `taskCount`, and an optional `errorMessage`
 *
 * Returns:
 * - project counts (total, executed, failed = has an `errorMessage`), summed task
 *   count, project names, event count, report directory, and run identity fields
 *   (`null` when absent from the summary)
 */
function summarizeReportRunArtifact(artifact) {
  const { summary } = artifact;
  const projectNames = [];
  let failedProjects = 0;
  let executedProjects = 0;
  let totalTasks = 0;
  for (const project of summary.projects) {
    if (project.errorMessage != null) failedProjects += 1;
    if (project.executed) executedProjects += 1;
    totalTasks += project.taskCount;
    projectNames.push(project.name);
  }
  return {
    attemptId: summary.attemptId ?? null,
    eventsCount: artifact.eventsCount,
    executedProjects,
    experimentId: summary.experimentId ?? null,
    failedProjects,
    projectNames,
    reportDirectory: artifact.reportDirectory,
    runId: summary.runId ?? null,
    totalProjects: summary.projects.length,
    totalTasks,
    workspaceId: summary.workspaceId ?? null
  };
}
/**
 * Writes one complete local run report artifact set.
 *
 * Use when:
 * - CLI runs need local artifacts under workspace/project/experiment/attempt/run
 * - report commands need normalized case, metrics, and OTLP-shaped files
 *
 * Expects:
 * - `events` are the same envelopes written to `events.jsonl`
 * - `output` has a `projects` array and `identity` provides `workspaceId`,
 *   `experimentId`, `attemptId`, and `runId`
 *
 * Writes, in order: `run-summary.json` (the output plus `reportDirectory`),
 * `events.jsonl`, `cases.jsonl`, `metrics-summary.json`, an empty `benchmark/`
 * directory, and `otlp/{traces,logs,metrics}.json`.
 *
 * Returns:
 * - absolute report directory path containing the written artifacts
 */
async function writeRunReportArtifacts(output, events, identity, reportOut) {
  const { attemptId, experimentId, runId, workspaceId } = identity;
  const projectId = deriveReportProjectId(output);
  const reportDirectory = resolve(reportOut, workspaceId, projectId, experimentId, attemptId, runId);
  const inReport = (...segments) => resolve(reportDirectory, ...segments);
  const writePrettyJson = (filePath, payload) => writeFile(filePath, `${JSON.stringify(payload, null, 2)}\n`, "utf-8");

  await mkdir(reportDirectory, { recursive: true });
  await writePrettyJson(inReport("run-summary.json"), {
    ...output,
    reportDirectory
  });
  const eventLines = events.map((event) => JSON.stringify(event)).join("\n");
  await writeFile(inReport("events.jsonl"), events.length > 0 ? `${eventLines}\n` : eventLines, "utf-8");

  // Derive every projection before writing any of them.
  const caseRecords = buildCaseRecords({
    attemptId,
    events,
    experimentId,
    projectName: projectId,
    runId,
    workspaceId
  });
  const metricsSummary = buildMetricsSummary(caseRecords, []);
  const otlp = buildLocalOtlpProjection({
    records: caseRecords,
    runId
  });

  await writeFile(inReport("cases.jsonl"), encodeJsonl(caseRecords), "utf-8");
  await writePrettyJson(inReport("metrics-summary.json"), metricsSummary);
  await mkdir(inReport("otlp"), { recursive: true });
  await mkdir(inReport("benchmark"), { recursive: true });
  await writePrettyJson(inReport("otlp", "traces.json"), otlp.traces);
  await writePrettyJson(inReport("otlp", "logs.json"), otlp.logs);
  await writePrettyJson(inReport("otlp", "metrics.json"), otlp.metrics);
  return reportDirectory;
}
/**
 * Derives the project path segment for a run report.
 *
 * Returns:
 * - the sanitized project name when all projects share exactly one distinct
 *   name (`"default-project"` is sanitized instead when that name is nullish)
 * - `"multi-project"` for any other number of distinct names, including zero
 */
function deriveReportProjectId(output) {
  const distinctNames = new Set(output.projects.map((project) => project.name));
  if (distinctNames.size !== 1) return "multi-project";
  const [onlyName] = distinctNames;
  return sanitizeIdentitySegment$1(onlyName ?? "default-project");
}
/**
 * Normalizes a name into a single safe path segment.
 *
 * Before:
 * - `"  my project/a "`
 * - `""`
 * - `".."`
 *
 * After:
 * - `"my-project-a"`
 * - `"default"`
 * - `"--"`
 *
 * Returns:
 * - the trimmed value with every run of characters outside `[A-Za-z0-9_.-]`
 *   replaced by `-`; `"default"` for blank input; `.` and `..` have their dots
 *   replaced so the result can never act as a relative directory reference
 */
function sanitizeIdentitySegment$1(value) {
  const normalized = value.trim();
  if (normalized.length === 0) return "default";
  // "." and ".." pass the character filter but would redirect the report path.
  if (normalized === "." || normalized === "..") return normalized.replaceAll(".", "-");
  return normalized.replace(/[^\w.-]+/g, "-");
}
872
+ //#endregion
265
873
  //#region src/cli/reporters/noop-reporter.ts
266
874
  /**
267
875
  * Creates a reporter that intentionally does nothing.
@@ -378,12 +986,21 @@ var SummaryReporterStateMachine = class {
378
986
  if (task.state === "finished") return;
379
987
  task.state = "running";
380
988
  task.startedAt ??= this.options.getNow();
381
- if (task.settledCaseIds.has(payload.caseId) || task.runningCases.has(payload.caseId)) return;
989
+ if (task.settledCaseIds.has(payload.caseId)) return;
990
+ const existingCase = task.runningCases.get(payload.caseId);
991
+ if (existingCase != null) {
992
+ existingCase.autoRetry = payload.autoRetry;
993
+ existingCase.caseName = payload.caseName ?? payload.caseId;
994
+ existingCase.retryIndex = payload.retryIndex;
995
+ return;
996
+ }
382
997
  task.caseOrderCounter += 1;
383
998
  task.runningCases.set(payload.caseId, {
999
+ autoRetry: payload.autoRetry,
384
1000
  caseId: payload.caseId,
385
1001
  caseName: payload.caseName ?? payload.caseId,
386
1002
  order: task.caseOrderCounter,
1003
+ retryIndex: payload.retryIndex,
387
1004
  startedAt: this.options.getNow()
388
1005
  });
389
1006
  this.syncTaskTotalCases(task);
@@ -420,6 +1037,10 @@ var SummaryReporterStateMachine = class {
420
1037
  this.caseCounters.failed += 1;
421
1038
  return;
422
1039
  }
1040
+ if (payload.state === "timeout") {
1041
+ this.caseCounters.timeout += 1;
1042
+ return;
1043
+ }
423
1044
  this.caseCounters.skipped += 1;
424
1045
  }
425
1046
  /**
@@ -500,39 +1121,52 @@ var SummaryReporterStateMachine = class {
500
1121
  const activeRows = this.createActiveRows();
501
1122
  const footerRows = this.createFooterRows();
502
1123
  const maxRows = options?.maxRows;
503
- const activeBlock = [
1124
+ const footerBlock = [...footerRows, ""];
1125
+ if (maxRows == null || maxRows <= 0) return [...[
504
1126
  "",
505
1127
  ...activeRows,
506
1128
  ...activeRows.length > 0 ? [""] : []
507
- ];
508
- const footerBlock = [...footerRows, ""];
509
- if (maxRows == null || maxRows <= 0) return [...activeBlock, ...footerBlock];
1129
+ ], ...footerBlock];
510
1130
  if (maxRows <= footerBlock.length) return footerBlock.slice(-maxRows);
511
- const availableActiveRows = Math.max(0, maxRows - footerBlock.length);
512
- return [...activeBlock.slice(0, availableActiveRows), ...footerBlock];
1131
+ return [...createBoundedActiveBlock(activeRows, Math.max(0, maxRows - footerBlock.length)), ...footerBlock];
513
1132
  }
514
1133
  /**
  * Builds the formatted rows for every task that is not yet finished.
  *
  * Tasks whose state is not "finished" are sorted with `compareActiveTasks`.
  * Each task yields a header row (pointer, project badge, display name and
  * either a dim " [queued]" marker or the suffix from `formatTaskProgressSuffix`).
  * Under each task, running cases whose elapsed time is at least
  * `options.slowThresholdMs` are listed by ascending `order`, each with its
  * retry suffix and elapsed duration.
  *
  * In the added (`+`) lines, `getNow()` is read once per task and reused for
  * the threshold filter and the elapsed values.
  */
  createActiveRows() {
515
1134
  const activeTasks = Array.from(this.tasks.values()).filter((task) => task.state !== "finished").sort(compareActiveTasks);
516
1135
  const rows = [];
517
1136
  for (const task of activeTasks) {
518
- const suffix = task.state === "queued" ? c.dim(" [queued]") : ` ${task.completedCases}/${task.totalCases}`;
1137
+ const now = this.options.getNow();
1138
+ const suffix = task.state === "queued" ? c.dim(" [queued]") : formatTaskProgressSuffix(task, now);
519
1139
  const badge = formatProjectBadge(task.projectName, this.options.isTTY);
520
1140
  rows.push(c.bold(c.yellow(` ${POINTER} `)) + badge + task.displayName + c.dim(suffix));
521
- const slowCases = Array.from(task.runningCases.values()).filter((activeCase) => this.options.getNow() - activeCase.startedAt >= this.options.slowThresholdMs).sort((left, right) => left.order - right.order);
1141
+ const slowCases = Array.from(task.runningCases.values()).filter((activeCase) => now - activeCase.startedAt >= this.options.slowThresholdMs).sort((left, right) => left.order - right.order);
522
1142
  for (const [index, activeCase] of slowCases.entries()) {
523
1143
  const icon = index === slowCases.length - 1 ? TREE_NODE_END : TREE_NODE_MIDDLE;
524
- const elapsed = Math.max(0, this.options.getNow() - activeCase.startedAt);
525
- rows.push(c.bold(c.yellow(` ${icon} `)) + activeCase.caseName + c.bold(c.yellow(` ${formatDuration$1(elapsed)}`)));
1144
+ const elapsed = Math.max(0, now - activeCase.startedAt);
1145
+ rows.push(c.bold(c.yellow(` ${icon} `)) + activeCase.caseName + formatRetrySuffix(activeCase) + c.bold(c.yellow(` ${formatDuration$2(elapsed)}`)));
526
1146
  }
527
1147
  }
528
1148
  return rows;
529
1149
  }
530
1150
  /**
  * Builds the summary footer rows: "Tasks", "Cases", "Concurrency",
  * "Start at" and "Duration".
  *
  * Run elapsed time is `getNow() - this.startedAtMs`, clamped at 0 and read
  * once so every row reports the same value. The task and case rows each get
  * their running count plus a timing object whose estimate comes from
  * `estimateTotalDurationMs(completed, total, elapsed)`.
  */
  createFooterRows() {
1151
+ const now = this.options.getNow();
1152
+ const runElapsedDurationMs = Math.max(0, now - this.startedAtMs);
1153
+ const taskRunningCount = countRunningTasks(this.tasks.values());
1154
+ const caseRunningCount = countRunningCases(this.tasks.values());
531
1155
  return [
532
- padSummaryTitle("Tasks") + formatCounterState(this.taskCounters),
533
- padSummaryTitle("Cases") + formatCounterState(this.caseCounters),
1156
+ padSummaryTitle("Tasks") + formatCounterState(this.taskCounters, taskRunningCount, {
1157
+ elapsedDurationMs: runElapsedDurationMs,
1158
+ estimatedDurationMs: estimateTotalDurationMs(this.taskCounters.completed, this.taskCounters.total, runElapsedDurationMs)
1159
+ }),
1160
+ padSummaryTitle("Cases") + formatCounterState(this.caseCounters, caseRunningCount, {
1161
+ elapsedDurationMs: runElapsedDurationMs,
1162
+ estimatedDurationMs: estimateTotalDurationMs(this.caseCounters.completed, this.caseCounters.total, runElapsedDurationMs)
1163
+ }),
1164
+ padSummaryTitle("Concurrency") + formatActiveConcurrencyState({
1165
+ caseRunningCount,
1166
+ taskRunningCount
1167
+ }),
534
1168
  padSummaryTitle("Start at") + this.startTime,
535
- padSummaryTitle("Duration") + formatDuration$1(Math.max(0, this.options.getNow() - this.startedAtMs))
1169
+ padSummaryTitle("Duration") + formatHumanDuration(runElapsedDurationMs)
536
1170
  ];
537
1171
  }
538
1172
  getOrCreateTaskState(taskId) {
@@ -563,6 +1197,39 @@ var SummaryReporterStateMachine = class {
563
1197
  }
564
1198
  };
565
1199
  /**
1200
+ * Creates the active task block while keeping room for summary footer rows.
1201
+ *
1202
+ * Use when:
1203
+ * - the live TTY window is smaller than the number of running task/case rows
1204
+ * - active rows need a visible truncation marker instead of silently disappearing
1205
+ *
1206
+ * Expects:
1207
+ * - `activeRows` contains already-formatted task and slow-case rows
1208
+ * - `maxRows` counts the leading spacer and truncation marker
1209
+ *
1210
+ * Returns:
1211
+ * - rows that fit inside `maxRows`
1212
+ * - a final hidden-row marker when active rows were omitted
1213
+ */
1214
+ function createBoundedActiveBlock(activeRows, maxRows) {
1215
// No room at all: emit nothing.
+ if (maxRows <= 0) return [];
1216
// Nothing is active: keep only the leading spacer row.
+ if (activeRows.length === 0) return [""];
1217
+ const fullBlock = [
1218
+ "",
1219
+ ...activeRows,
1220
+ ""
1221
+ ];
1222
// Everything fits (leading spacer + rows + trailing spacer): no truncation needed.
+ if (fullBlock.length <= maxRows) return fullBlock;
1223
// A single row cannot hold both a spacer and a marker, so only the spacer is kept.
+ if (maxRows === 1) return [""];
1224
// Two rows are reserved: the leading spacer and the hidden-row marker.
+ const visibleActiveRows = Math.max(0, maxRows - 2);
1225
+ const hiddenRows = Math.max(0, activeRows.length - visibleActiveRows);
1226
+ return [
1227
+ "",
1228
+ ...activeRows.slice(0, visibleActiveRows),
1229
+ c.dim(` ${TREE_NODE_END} ... ${hiddenRows} more running rows hidden`)
1230
+ ];
1231
+ }
1232
+ /**
566
1233
  * Creates the live summary reporter state machine for `vieval` CLI runs.
567
1234
  *
568
1235
  * Use when:
@@ -593,6 +1260,7 @@ function createCounterState() {
593
1260
  failed: 0,
594
1261
  passed: 0,
595
1262
  skipped: 0,
1263
+ timeout: 0,
596
1264
  total: 0
597
1265
  };
598
1266
  }
@@ -601,6 +1269,7 @@ function resetCounterState(counter, total) {
601
1269
  counter.failed = 0;
602
1270
  counter.passed = 0;
603
1271
  counter.skipped = 0;
1272
+ counter.timeout = 0;
604
1273
  counter.total = total;
605
1274
  }
606
1275
  function sumTaskCaseTotals(tasks) {
@@ -619,19 +1288,48 @@ function compareActiveTasks(left, right) {
619
1288
  function padSummaryTitle(label) {
620
1289
  return `${c.dim(label.padEnd(8))} `;
621
1290
  }
622
- function formatCounterState(counter) {
1291
/**
 * Formats one footer counter line.
 *
 * Emits "planned", "running", "passed", "failed", "timeout" and "skipped"
 * counts joined by a dim " | ", followed by the total in parentheses and the
 * suffix from `formatTimingSuffix(timing)`.
 *
 * `planned` is `counter.total - counter.completed - runningCount`, clamped at 0.
 * "passed" is always bold green; the other counts are highlighted when
 * non-zero and dimmed when zero.
 */
+ function formatCounterState(counter, runningCount, timing) {
1292
+ const plannedCount = Math.max(0, counter.total - counter.completed - runningCount);
623
1293
  return [
1294
+ plannedCount > 0 ? c.bold(c.blue(`${plannedCount} planned`)) : c.dim(`${plannedCount} planned`),
1295
+ runningCount > 0 ? c.bold(c.yellow(`${runningCount} running`)) : c.dim(`${runningCount} running`),
624
1296
  c.bold(c.green(`${counter.passed} passed`)),
625
1297
  counter.failed > 0 ? c.bold(c.red(`${counter.failed} failed`)) : c.dim(`${counter.failed} failed`),
1298
+ counter.timeout > 0 ? c.bold(c.yellow(`${counter.timeout} timeout`)) : c.dim(`${counter.timeout} timeout`),
626
1299
  counter.skipped > 0 ? c.yellow(`${counter.skipped} skipped`) : c.dim(`${counter.skipped} skipped`)
627
- ].join(c.dim(" | ")) + c.gray(` (${counter.total})`);
1300
+ ].join(c.dim(" | ")) + c.gray(` (${counter.total})`) + formatTimingSuffix(timing);
1301
+ }
1302
/**
 * Formats the "Concurrency" footer value from `options.taskRunningCount` and
 * `options.caseRunningCount`.
 *
 * Each part reads "<n> task(s) running" / "<n> case(s) running" in bold
 * yellow when the count is positive, or a dim "0 tasks running" /
 * "0 cases running" otherwise. The two parts are joined by a dim " | ".
 */
+ function formatActiveConcurrencyState(options) {
1303
+ return [options.taskRunningCount > 0 ? c.bold(c.yellow(`${options.taskRunningCount} ${pluralize("task", options.taskRunningCount)} running`)) : c.dim("0 tasks running"), options.caseRunningCount > 0 ? c.bold(c.yellow(`${options.caseRunningCount} ${pluralize("case", options.caseRunningCount)} running`)) : c.dim("0 cases running")].join(c.dim(" | "));
1304
+ }
1305
/**
 * Returns `noun` unchanged when `count` is exactly 1, otherwise `noun` with
 * an "s" appended (so 0 also gets the plural form).
 */
+ function pluralize(noun, count) {
1306
+ return count === 1 ? noun : `${noun}s`;
1307
+ }
1308
/**
 * Formats a dim " retry <retryIndex>/<autoRetry>" suffix for a running case.
 *
 * Returns an empty string unless both `activeCase.retryIndex` and
 * `activeCase.autoRetry` are present and greater than 0.
 */
+ function formatRetrySuffix(activeCase) {
1309
+ if (activeCase.retryIndex == null || activeCase.retryIndex <= 0 || activeCase.autoRetry == null || activeCase.autoRetry <= 0) return "";
1310
+ return c.dim(` retry ${activeCase.retryIndex}/${activeCase.autoRetry}`);
628
1311
  }
629
1312
  function formatTimeString(date) {
630
1313
  return date.toTimeString().split(" ")[0] ?? "";
631
1314
  }
632
- function formatDuration$1(durationMs) {
633
- if (durationMs >= 1e3) return `${(durationMs / 1e3).toFixed(2)}s`;
634
- return `${Math.round(durationMs)}ms`;
1315
/**
 * Thin wrapper that delegates to `formatHumanDuration(durationMs)` and
 * returns its result unchanged.
 */
+ function formatDuration$2(durationMs) {
1316
+ return formatHumanDuration(durationMs);
1317
+ }
1318
/**
 * Formats a millisecond duration as human-readable text.
 *
 * - Below one second the rounded millisecond value is returned, e.g. "250ms".
 * - Otherwise the duration is expressed in whole hours, minutes and seconds,
 *   omitting zero-valued units, e.g. "1 hour 2 minutes 3 seconds".
 *
 * The units are computed arithmetically instead of via a calendar interval.
 * A calendar split moves anything from 24 hours upward into a "days" unit,
 * which the hours/minutes/seconds format then drops: exactly 24 hours was
 * rendered as "0 seconds" and 24h 5s as "5 seconds". Hours are therefore
 * allowed to exceed 23 here.
 *
 * @param {number} durationMs Duration in milliseconds.
 * @returns {string} Formatted duration text.
 */
function formatHumanDuration(durationMs) {
  if (durationMs < 1e3) return `${Math.round(durationMs)}ms`;
  // Sub-second remainders are truncated, matching the previous output.
  const totalSeconds = Math.floor(durationMs / 1e3);
  const units = [
    ["hour", Math.floor(totalSeconds / 3600)],
    ["minute", Math.floor(totalSeconds % 3600 / 60)],
    ["second", totalSeconds % 60]
  ];
  const formatted = units
    .filter(([, count]) => count > 0)
    .map(([unit, count]) => `${count} ${count === 1 ? unit : `${unit}s`}`)
    .join(" ");
  // Only reachable for non-numeric input (e.g. NaN); kept as a safe fallback.
  return formatted.length > 0 ? formatted : "0 seconds";
}
636
1334
  function formatProjectBadge(projectName, isTTY) {
637
1335
  if (projectName == null || projectName.length === 0) return "";
@@ -645,6 +1343,37 @@ function formatProjectBadge(projectName, isTTY) {
645
1343
  const background = backgroundPool[projectName.split("").reduce((accumulator, character, index) => accumulator + character.charCodeAt(0) + index, 0) % backgroundPool.length];
646
1344
  return `${c.black(background(` ${projectName} `))} `;
647
1345
  }
1346
/**
 * Sums `runningCases.size` over every task in the `tasks` iterable.
 */
+ function countRunningCases(tasks) {
1347
+ let runningCount = 0;
1348
+ for (const task of tasks) runningCount += task.runningCases.size;
1349
+ return runningCount;
1350
+ }
1351
/**
 * Counts the tasks in the `tasks` iterable whose `state` is exactly "running".
 */
+ function countRunningTasks(tasks) {
1352
+ let runningCount = 0;
1353
+ for (const task of tasks) if (task.state === "running") runningCount += 1;
1354
+ return runningCount;
1355
+ }
1356
/**
 * Estimates a task's total duration from its progress so far.
 *
 * Returns `undefined` when the task has no `startedAt`. Otherwise delegates
 * to `estimateTotalDurationMs` with the task's completed/total case counts
 * and the elapsed time `now - task.startedAt`, clamped at 0.
 */
+ function estimateTaskDurationMs(task, now) {
1357
+ if (task.startedAt == null) return;
1358
+ return estimateTotalDurationMs(task.completedCases, task.totalCases, Math.max(0, now - task.startedAt));
1359
+ }
1360
/**
 * Extrapolates a total duration from the average time per completed item.
 *
 * Returns `undefined` when `completedCount` or `totalCount` is 0, since no
 * average can be formed or there is nothing to estimate. Otherwise returns
 * `elapsedDurationMs / completedCount * totalCount`, rounded to an integer.
 */
+ function estimateTotalDurationMs(completedCount, totalCount, elapsedDurationMs) {
1361
+ if (completedCount === 0 || totalCount === 0) return;
1362
+ const averageDurationMs = elapsedDurationMs / completedCount;
1363
+ return Math.round(averageDurationMs * totalCount);
1364
+ }
1365
/**
 * Formats the progress suffix shown after a running task's name:
 * " <completed>/<total>, <n> case(s) running" followed by the timing suffix.
 *
 * Elapsed time is 0 when the task has no `startedAt`, otherwise
 * `now - task.startedAt` clamped at 0. The estimate comes from
 * `estimateTaskDurationMs` and may be `undefined`.
 */
+ function formatTaskProgressSuffix(task, now) {
1366
+ const elapsedDurationMs = task.startedAt == null ? 0 : Math.max(0, now - task.startedAt);
1367
+ return ` ${task.completedCases}/${task.totalCases}, ${task.runningCases.size} ${pluralize("case", task.runningCases.size)} running${formatTimingSuffix({
1368
+ elapsedDurationMs,
1369
+ estimatedDurationMs: estimateTaskDurationMs(task, now)
1370
+ })}`;
1371
+ }
1372
/**
 * Formats a parenthesized timing suffix with a leading space:
 * " (elapsed <duration>)" or, when `timing.estimatedDurationMs` is present,
 * " (elapsed <duration>, estimated <duration>)".
 */
+ function formatTimingSuffix(timing) {
1373
+ const parts = [`elapsed ${formatHumanDuration(timing.elapsedDurationMs)}`];
1374
+ if (timing.estimatedDurationMs != null) parts.push(`estimated ${formatHumanDuration(timing.estimatedDurationMs)}`);
1375
+ return ` (${parts.join(", ")})`;
1376
+ }
648
1377
  //#endregion
649
1378
  //#region src/cli/reporters/index.ts
650
1379
  /**
@@ -1000,7 +1729,7 @@ async function createVievalVitestCompatReporterBridge(options) {
1000
1729
  return {
1001
1730
  async onCaseEnd(payload) {
1002
1731
  const taskCase = getOrCreateCase(payload.taskId, payload.caseId);
1003
- taskCase.state = payload.state;
1732
+ taskCase.state = payload.state === "timeout" ? "failed" : payload.state;
1004
1733
  await emitToReporters(loadedReporters, (reporter) => reporter.onTestCaseResult?.(taskCase));
1005
1734
  },
1006
1735
  async onCaseStart(payload) {
@@ -1047,17 +1776,68 @@ async function createVievalVitestCompatReporterBridge(options) {
1047
1776
  /**
  * Reports whether any project in `output.projects` counts as failed.
  *
  * A project counts as failed when it has an `errorMessage`, when its
  * `caseSummary` has a positive `failed` or `timeout` count (timeouts are
  * included by the added `+` line), or when `caseFailures` is non-empty.
  */
  function hasRunFailures(output) {
1048
1777
  return output.projects.some((project) => {
1049
1778
  if (project.errorMessage != null) return true;
1050
- if (project.caseSummary != null && project.caseSummary.failed > 0) return true;
1779
+ if (project.caseSummary != null && (project.caseSummary.failed > 0 || project.caseSummary.timeout > 0)) return true;
1051
1780
  return (project.caseFailures?.length ?? 0) > 0;
1052
1781
  });
1053
1782
  }
1054
- function shouldUseColor() {
1055
- if (process.env.NO_COLOR != null) return false;
1056
- const forceColor = process.env.FORCE_COLOR;
1057
- if (forceColor != null) return forceColor !== "0";
1058
- return process.stdout.isTTY === true;
1783
/**
 * Resolves a concurrency limit where the CLI value can only lower the
 * configured one, never raise it.
 *
 * The effective default is `defaultConcurrency`, or `fallback` when that is
 * nullish. Without a CLI value the effective default is returned; otherwise
 * the smaller of the two is returned.
 */
+ function resolveCappedConcurrency(defaultConcurrency, cliConcurrency, fallback) {
1784
+ const effectiveDefault = defaultConcurrency ?? fallback;
1785
+ if (cliConcurrency == null) return effectiveDefault;
1786
+ return Math.min(effectiveDefault, cliConcurrency);
1059
1787
  }
1060
- function createColorPalette(enabled) {
1788
/**
 * Returns `cliConcurrency` when it is not nullish, otherwise
 * `defaultConcurrency`. Unlike `resolveCappedConcurrency`, the CLI value
 * replaces the default outright and the result may be `undefined`.
 */
+ function resolveOptionalRuntimeTaskConcurrency(defaultConcurrency, cliConcurrency) {
1789
+ return cliConcurrency ?? defaultConcurrency;
1790
+ }
1791
/**
 * Resolves workspace-level concurrency from `loadedConfig.concurrency?.workspace`,
 * capped by `options.workspaceConcurrency`, with a fallback of 1.
 */
+ function resolveWorkspaceConcurrency(loadedConfig, options) {
1792
+ return resolveCappedConcurrency(loadedConfig.concurrency?.workspace, options.workspaceConcurrency, 1);
1793
+ }
1794
/**
 * Resolves project-level concurrency from `project.concurrency?.project`,
 * capped by `options.projectConcurrency`. The fallback is positive infinity,
 * i.e. no limit unless the config or the CLI supplies one.
 */
+ function resolveProjectConcurrency(project, options) {
1795
+ return resolveCappedConcurrency(project.concurrency?.project, options.projectConcurrency, Number.POSITIVE_INFINITY);
1796
+ }
1797
/**
 * Resolves task-level concurrency from `project.concurrency?.task`,
 * capped by `options.taskConcurrency`, with a fallback of 1.
 */
+ function resolveTaskConcurrency(project, options) {
1798
+ return resolveCappedConcurrency(project.concurrency?.task, options.taskConcurrency, 1);
1799
+ }
1800
/**
 * Returns the smaller of the resolved project concurrency and the resolved
 * task concurrency for `project` under the given CLI `options`.
 */
+ function resolveScheduledTaskConcurrency(project, options) {
1801
+ return Math.min(resolveProjectConcurrency(project, options), resolveTaskConcurrency(project, options));
1802
+ }
1803
/**
 * Resolves the `attempt` and `case` concurrency for one task.
 *
 * For each of the two values the precedence is: CLI option
 * (`options.attemptConcurrency` / `options.caseConcurrency`), then the
 * task-level `taskConcurrency` value, then `project.concurrency`.
 *
 * Returns `undefined` when neither value resolves; otherwise returns
 * `{ attempt, case }`, where either field may still be `undefined`.
 */
+ function resolveRuntimeTaskConcurrency(taskConcurrency, project, options) {
1804
+ const attempt = resolveOptionalRuntimeTaskConcurrency(taskConcurrency?.attempt ?? project.concurrency?.attempt, options.attemptConcurrency);
1805
+ const caseConcurrency = resolveOptionalRuntimeTaskConcurrency(taskConcurrency?.case ?? project.concurrency?.case, options.caseConcurrency);
1806
+ if (attempt == null && caseConcurrency == null) return;
1807
+ return {
1808
+ attempt,
1809
+ case: caseConcurrency
1810
+ };
1811
+ }
1812
/**
 * Returns a copy of a scheduled task whose task definition carries the
 * resolved runtime concurrency.
 *
 * When `task.entry.task` is nullish the input task is returned as-is.
 * Otherwise `task`, `task.entry` and the task definition are shallow-copied
 * (the input objects are not mutated) and `concurrency` is replaced with the
 * result of `resolveRuntimeTaskConcurrency`, which may be `undefined`.
 */
+ function createScheduledTaskWithRuntimeConcurrency(task, project, options) {
1813
+ const taskDefinition = task.entry.task;
1814
+ if (taskDefinition == null) return task;
1815
+ const concurrency = resolveRuntimeTaskConcurrency(taskDefinition.concurrency, project, options);
1816
+ return {
1817
+ ...task,
1818
+ entry: {
1819
+ ...task.entry,
1820
+ task: {
1821
+ ...taskDefinition,
1822
+ concurrency
1823
+ }
1824
+ }
1825
+ };
1826
+ }
1827
/**
 * Packs the CLI-provided attempt/case concurrency into one object.
 *
 * Returns `undefined` when neither `options.attemptConcurrency` nor
 * `options.caseConcurrency` is set; otherwise returns `{ attempt, case }`
 * with the raw option values.
 */
+ function resolveCliRuntimeConcurrency(options) {
1828
+ if (options.attemptConcurrency == null && options.caseConcurrency == null) return;
1829
+ return {
1830
+ attempt: options.attemptConcurrency,
1831
+ case: options.caseConcurrency
1832
+ };
1833
+ }
1834
/**
 * Decides whether colored output should be used.
 *
 * Precedence: a defined `NO_COLOR` (any value, including an empty string)
 * disables color; otherwise a defined `FORCE_COLOR` enables it unless its
 * value is "0"; otherwise color is used only when stdout is a TTY.
 */
+ function shouldUseColor() {
1835
+ if (process.env.NO_COLOR != null) return false;
1836
+ const forceColor = process.env.FORCE_COLOR;
1837
+ if (forceColor != null) return forceColor !== "0";
1838
+ return process.stdout.isTTY === true;
1839
+ }
1840
+ function createColorPalette(enabled) {
1061
1841
  if (!enabled) return {
1062
1842
  black: (value) => value,
1063
1843
  bgCyan: (value) => value,
@@ -1094,11 +1874,15 @@ function createProjectBadge(name, colors, colorEnabled) {
1094
1874
  const background = labelColorPool[name.split("").reduce((accumulator, char, index) => accumulator + char.charCodeAt(0) + index, 0) % labelColorPool.length];
1095
1875
  return `${colors.black(background(` ${name} `))} `;
1096
1876
  }
1097
- function formatDuration(durationMs, colors) {
1877
/**
 * Formats a project duration as " <rounded>ms" with a leading space.
 *
 * Returns "" when `durationMs` is nullish. The text is colored with
 * `colors.yellow` when the rounded value exceeds 1000 and with `colors.green`
 * otherwise; the "ms" unit is dimmed.
 */
+ function formatDuration$1(durationMs, colors) {
1098
1878
  if (durationMs == null) return "";
1099
1879
  const rounded = Math.round(durationMs);
1100
1880
  return (rounded > 1e3 ? colors.yellow : colors.green)(` ${rounded}${colors.dim("ms")}`);
1101
1881
  }
1882
/**
 * Formats a hybrid average for display.
 *
 * Returns "n/a" when the value is nullish. Otherwise the number is rendered
 * with three decimals and trailing zeros are stripped, together with the
 * decimal point when no fractional digits remain (0.5 -> "0.5", 1 -> "1").
 */
+ function formatHybridAverage(hybridAverage) {
1883
+ if (hybridAverage == null) return "n/a";
1884
+ return hybridAverage.toFixed(3).replace(/\.?0+$/, "");
1885
+ }
1102
1886
  function filterProjectsByName(projects, names) {
1103
1887
  if (names.length === 0) return [...projects];
1104
1888
  const nameSet = new Set(names);
@@ -1119,11 +1903,6 @@ function createRunIdentity(options) {
1119
1903
  workspaceId
1120
1904
  };
1121
1905
  }
1122
- function deriveReportProjectId(output) {
1123
- const uniqueProjectNames = [...new Set(output.projects.map((project) => project.name))];
1124
- if (uniqueProjectNames.length === 1) return sanitizeIdentitySegment(uniqueProjectNames[0] ?? "default-project");
1125
- return "multi-project";
1126
- }
1127
1906
  function createEventRecorder(identity) {
1128
1907
  const events = [];
1129
1908
  const taskProjectMap = /* @__PURE__ */ new Map();
@@ -1220,6 +1999,7 @@ function isSummaryReporter(reporter) {
1220
1999
  return "getWindowRows" in reporter;
1221
2000
  }
1222
2001
  function createRunReporter(options) {
2002
+ const getRows = options?.getRows ?? (() => process.stdout.rows);
1223
2003
  const reporter = createCliReporter({
1224
2004
  getColumns: options?.getColumns ?? (() => process.stdout.columns ?? 80),
1225
2005
  getNow: options?.getNow ?? (() => Date.now()),
@@ -1240,7 +2020,7 @@ function createRunReporter(options) {
1240
2020
  };
1241
2021
  const rendererBaseOptions = {
1242
2022
  getColumns: options?.getColumns ?? (() => process.stdout.columns ?? 80),
1243
- getWindow: () => reporter.getWindowRows(),
2023
+ getWindow: () => reporter.getWindowRows({ maxRows: normalizeLiveReporterMaxRows(getRows()) }),
1244
2024
  queueRenderReset: options?.queueRenderReset,
1245
2025
  supportsAnsiWindowing: options?.supportsAnsiWindowing,
1246
2026
  writeOutput: options?.writeOutput ?? ((value) => process.stdout.write(value))
@@ -1289,6 +2069,22 @@ function createRunReporter(options) {
1289
2069
  }
1290
2070
  };
1291
2071
  }
2072
+ /**
2073
+ * Normalizes terminal row count into the live reporter window height.
2074
+ *
2075
+ * Before:
2076
+ * - undefined
2077
+ * - 4
2078
+ * - 40
2079
+ *
2080
+ * After:
2081
+ * - 23
2082
+ * - 6
2083
+ * - 39
2084
+ */
2085
+ function normalizeLiveReporterMaxRows(rows) {
2086
// Missing, non-finite or non-positive row counts are treated as 24 rows.
// One row is then subtracted, and the result never drops below 6.
+ return Math.max(6, (rows == null || !Number.isFinite(rows) || rows <= 0 ? 24 : Math.floor(rows)) - 1);
2087
+ }
1292
2088
  function createTaskQueuePayload(task, projectName) {
1293
2089
  return {
1294
2090
  displayName: task.entry.name,
@@ -1315,11 +2111,12 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
1315
2111
  projectCaseCounters.seenCaseIds.add(projectCaseId);
1316
2112
  if (payload.state === "passed") projectCaseCounters.passed += 1;
1317
2113
  else if (payload.state === "failed") projectCaseCounters.failed += 1;
2114
+ else if (payload.state === "timeout") projectCaseCounters.timeout += 1;
1318
2115
  else projectCaseCounters.skipped += 1;
1319
2116
  }
1320
2117
  }
1321
2118
  syncCaseTotal(payload.total);
1322
- if (payload.state === "failed" && payload.errorMessage != null && projectCaseFailures != null) projectCaseFailures.push({
2119
+ if ((payload.state === "failed" || payload.state === "timeout") && payload.errorMessage != null && projectCaseFailures != null) projectCaseFailures.push({
1323
2120
  caseId,
1324
2121
  caseName: payload.name,
1325
2122
  errorMessage: payload.errorMessage,
@@ -1328,6 +2125,7 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
1328
2125
  reporter.onCaseEnd({
1329
2126
  caseId,
1330
2127
  errorMessage: payload.errorMessage,
2128
+ output: payload.output,
1331
2129
  state: payload.state,
1332
2130
  taskId: task.id
1333
2131
  });
@@ -1342,8 +2140,11 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
1342
2140
  const caseId = createTaskCaseReporterId(payload);
1343
2141
  syncCaseTotal(payload.total);
1344
2142
  reporter.onCaseStart({
2143
+ autoRetry: payload.autoRetry,
1345
2144
  caseId,
2145
+ input: payload.input,
1346
2146
  caseName: payload.name,
2147
+ retryIndex: payload.retryIndex,
1347
2148
  taskId: task.id
1348
2149
  });
1349
2150
  vitestCompatReporter?.onCaseStart({
@@ -1360,7 +2161,7 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
1360
2161
  }
1361
2162
  };
1362
2163
  }
1363
- function createCliTaskExecutionContext(task, models, cacheRootDirectory, cacheProjectName, workspaceId, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
2164
+ function createCliTaskExecutionContext(task, models, cacheRootDirectory, cacheProjectName, workspaceId, telemetry, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, runtimeConcurrency, vitestCompatReporter) {
1364
2165
  return {
1365
2166
  ...createTaskExecutionContext({
1366
2167
  cache: createFilesystemTaskCacheRuntime({
@@ -1371,7 +2172,9 @@ function createCliTaskExecutionContext(task, models, cacheRootDirectory, cachePr
1371
2172
  models,
1372
2173
  task
1373
2174
  }),
1374
- reporterHooks: createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter)
2175
+ reporterHooks: createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
2176
+ runtimeConcurrency,
2177
+ telemetry
1375
2178
  };
1376
2179
  }
1377
2180
  function resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
@@ -1389,7 +2192,8 @@ function createAutoTaskExecutor(reporter, projectName, recordEvent, projectCaseC
1389
2192
  cache: context.cache,
1390
2193
  model: context.model,
1391
2194
  reporterHooks: resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
1392
- task
2195
+ task,
2196
+ telemetry: context.telemetry
1393
2197
  });
1394
2198
  return {
1395
2199
  entryId: task.entry.id,
@@ -1492,13 +2296,14 @@ async function prepareProject(project) {
1492
2296
  };
1493
2297
  }
1494
2298
  }
1495
- async function executePreparedProject(prepared, identity, cacheProjectName, reporter, counters, recordEvent) {
2299
+ async function executePreparedProject(prepared, identity, cacheProjectName, telemetry, reporter, counters, recordEvent, options) {
1496
2300
  const settledTaskIds = /* @__PURE__ */ new Set();
1497
2301
  const projectCaseCounters = {
1498
2302
  failed: 0,
1499
2303
  passed: 0,
1500
2304
  seenCaseIds: /* @__PURE__ */ new Set(),
1501
- skipped: 0
2305
+ skipped: 0,
2306
+ timeout: 0
1502
2307
  };
1503
2308
  const projectCaseFailures = [];
1504
2309
  const vitestCompatReporter = await createVievalVitestCompatReporterBridge({
@@ -1507,9 +2312,16 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
1507
2312
  });
1508
2313
  const rawTaskExecutor = prepared.project.executor ?? createAutoTaskExecutor(reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
1509
2314
  const taskExecutor = async (task, context) => {
2315
+ const runtimeTask = createScheduledTaskWithRuntimeConcurrency(task, prepared.project, options);
1510
2316
  return {
1511
- ...await rawTaskExecutor(task, context),
1512
- matrix: cloneScheduledTaskMatrix(task)
2317
+ ...await telemetry.withSpan("vieval.task", {
2318
+ "vieval.project.name": prepared.name,
2319
+ "vieval.run.id": identity.runId,
2320
+ "vieval.task.entry.id": runtimeTask.entry.id,
2321
+ "vieval.task.id": runtimeTask.id,
2322
+ "vieval.task.name": runtimeTask.entry.name
2323
+ }, async () => await rawTaskExecutor(runtimeTask, context)),
2324
+ matrix: cloneScheduledTaskMatrix(runtimeTask)
1513
2325
  };
1514
2326
  };
1515
2327
  for (const task of prepared.tasks) await vitestCompatReporter?.onTaskQueued({ taskId: task.id });
@@ -1517,7 +2329,7 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
1517
2329
  try {
1518
2330
  const aggregated = await runScheduledTasks(prepared.tasks, taskExecutor, {
1519
2331
  createExecutionContext(task) {
1520
- return createCliTaskExecutionContext(task, prepared.project.models, resolve(prepared.project.root, ".vieval", "cache"), cacheProjectName ?? prepared.name, identity.workspaceId, reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter);
2332
+ return createCliTaskExecutionContext(task, prepared.project.models, resolve(prepared.project.root, ".vieval", "cache"), cacheProjectName ?? prepared.name, identity.workspaceId, telemetry, reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, resolveCliRuntimeConcurrency(options), vitestCompatReporter);
1521
2333
  },
1522
2334
  onTaskEnd(task, state) {
1523
2335
  settledTaskIds.add(task.id);
@@ -1538,7 +2350,8 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
1538
2350
  onTaskStart(task) {
1539
2351
  reporter.onTaskStart({ taskId: task.id });
1540
2352
  vitestCompatReporter?.onTaskStart({ taskId: task.id });
1541
- }
2353
+ },
2354
+ maxConcurrency: resolveScheduledTaskConcurrency(prepared.project, options)
1542
2355
  });
1543
2356
  await vitestCompatReporter?.onRunEnd({ failed: false });
1544
2357
  return {
@@ -1546,6 +2359,7 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
1546
2359
  failed: projectCaseCounters.failed,
1547
2360
  passed: projectCaseCounters.passed,
1548
2361
  skipped: projectCaseCounters.skipped,
2362
+ timeout: projectCaseCounters.timeout,
1549
2363
  total: projectCaseCounters.seenCaseIds.size
1550
2364
  },
1551
2365
  caseFailures: projectCaseFailures,
@@ -1592,6 +2406,7 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
1592
2406
  failed: projectCaseCounters.failed,
1593
2407
  passed: projectCaseCounters.passed,
1594
2408
  skipped: projectCaseCounters.skipped,
2409
+ timeout: projectCaseCounters.timeout,
1595
2410
  total: projectCaseCounters.seenCaseIds.size
1596
2411
  },
1597
2412
  caseFailures: projectCaseFailures,
@@ -1607,14 +2422,6 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
1607
2422
  };
1608
2423
  }
1609
2424
  }
1610
- async function writeRunReportArtifacts(output, events, identity, reportOut) {
1611
- const projectId = deriveReportProjectId(output);
1612
- const reportDirectory = resolve(reportOut, identity.workspaceId, projectId, identity.experimentId, identity.attemptId, identity.runId);
1613
- await mkdir(reportDirectory, { recursive: true });
1614
- await writeFile(resolve(reportDirectory, "run-summary.json"), `${JSON.stringify(output, null, 2)}\n`, "utf-8");
1615
- await writeFile(resolve(reportDirectory, "events.jsonl"), events.map((event) => JSON.stringify(event)).join("\n").concat(events.length > 0 ? "\n" : ""), "utf-8");
1616
- return reportDirectory;
1617
- }
1618
2425
  /**
1619
2426
  * Runs vieval orchestration from config and returns project-level summaries.
1620
2427
  *
@@ -1637,57 +2444,91 @@ async function runVievalCli(options = {}) {
1637
2444
  configFilePath: options.configFilePath,
1638
2445
  cwd: options.cwd
1639
2446
  });
2447
+ const telemetry = loadedConfig.reporting?.openTelemetry?.enabled === true ? createOpenTelemetryRuntime() : createNoopTelemetryRuntime();
2448
+ const onOpenTelemetryRunEnd = loadedConfig.reporting?.openTelemetry?.enabled === true ? loadedConfig.reporting.openTelemetry.onRunEnd : void 0;
1640
2449
  const restoreEnvironment = applyRunEnvironment(loadedConfig.env);
1641
2450
  const eventRecorder = createEventRecorder(identity);
1642
2451
  const reporter = createReporterWithEventCapture(createRunReporter(options.reporter), eventRecorder.record);
2452
+ let runError;
2453
+ let runEndError;
2454
+ let output;
1643
2455
  try {
1644
- const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
1645
- const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
1646
- const executableProjects = preparedProjects.filter((project) => project.kind === "prepared").map((project) => project.prepared);
1647
- const totalTasks = preparedProjects.reduce((sum, project) => {
1648
- if (project.kind === "prepared") return sum + project.prepared.tasks.length;
1649
- return sum + project.summary.taskCount;
1650
- }, 0);
1651
- const skippedSummaryTasks = preparedProjects.reduce((sum, project) => {
1652
- if (project.kind === "summary") return sum + project.summary.taskCount;
1653
- return sum;
1654
- }, 0);
1655
- const reporterCounters = {
1656
- failedTasks: 0,
1657
- passedTasks: 0,
1658
- skippedTasks: 0
1659
- };
1660
- reporter.onRunStart({ totalTasks });
1661
- for (const project of executableProjects) for (const task of project.tasks) reporter.onTaskQueued(createTaskQueuePayload(task, project.name));
1662
- const projectSummaries = [];
1663
- for (const preparedProject of preparedProjects) {
1664
- if (preparedProject.kind === "summary") {
1665
- projectSummaries.push(preparedProject.summary);
1666
- continue;
1667
- }
1668
- projectSummaries.push(await executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, reporter, reporterCounters, eventRecorder.record));
1669
- }
1670
- reporter.onRunEnd({
1671
- failedTasks: reporterCounters.failedTasks,
1672
- passedTasks: reporterCounters.passedTasks,
1673
- skippedTasks: reporterCounters.skippedTasks + skippedSummaryTasks,
1674
- totalTasks
2456
+ output = await telemetry.withSpan("vieval.run", {
2457
+ "vieval.attempt.id": identity.attemptId,
2458
+ "vieval.experiment.id": identity.experimentId,
2459
+ "vieval.run.id": identity.runId,
2460
+ "vieval.workspace.id": identity.workspaceId
2461
+ }, async () => {
2462
+ const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
2463
+ const workspaceScheduler = createSchedulerRuntime({ concurrency: { workspace: resolveWorkspaceConcurrency(loadedConfig, options) } });
2464
+ const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
2465
+ const executableProjects = preparedProjects.filter((project) => project.kind === "prepared").map((project) => project.prepared);
2466
+ const totalTasks = preparedProjects.reduce((sum, project) => {
2467
+ if (project.kind === "prepared") return sum + project.prepared.tasks.length;
2468
+ return sum + project.summary.taskCount;
2469
+ }, 0);
2470
+ const skippedSummaryTasks = preparedProjects.reduce((sum, project) => {
2471
+ if (project.kind === "summary") return sum + project.summary.taskCount;
2472
+ return sum;
2473
+ }, 0);
2474
+ const reporterCounters = {
2475
+ failedTasks: 0,
2476
+ passedTasks: 0,
2477
+ skippedTasks: 0
2478
+ };
2479
+ reporter.onRunStart({ totalTasks });
2480
+ for (const project of executableProjects) for (const task of project.tasks) reporter.onTaskQueued(createTaskQueuePayload(task, project.name));
2481
+ const projectSummaries = (await Promise.all(preparedProjects.map(async (preparedProject, index) => {
2482
+ if (preparedProject.kind === "summary") return {
2483
+ index,
2484
+ summary: preparedProject.summary
2485
+ };
2486
+ return {
2487
+ index,
2488
+ summary: await telemetry.withSpan("vieval.project", {
2489
+ "vieval.project.name": preparedProject.prepared.name,
2490
+ "vieval.run.id": identity.runId
2491
+ }, async () => await workspaceScheduler.runCase({
2492
+ experimentId: identity.experimentId,
2493
+ projectName: preparedProject.prepared.name,
2494
+ scope: "workspace",
2495
+ workspaceId: identity.workspaceId
2496
+ }, async () => executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, telemetry, reporter, reporterCounters, eventRecorder.record, options)))
2497
+ };
2498
+ }))).sort((left, right) => left.index - right.index).map((item) => item.summary);
2499
+ reporter.onRunEnd({
2500
+ failedTasks: reporterCounters.failedTasks,
2501
+ passedTasks: reporterCounters.passedTasks,
2502
+ skippedTasks: reporterCounters.skippedTasks + skippedSummaryTasks,
2503
+ totalTasks
2504
+ });
2505
+ const output = {
2506
+ attemptId: identity.attemptId,
2507
+ configFilePath: loadedConfig.configFilePath,
2508
+ experimentId: identity.experimentId,
2509
+ projects: projectSummaries,
2510
+ reportDirectory: null,
2511
+ runId: identity.runId,
2512
+ workspaceId: identity.workspaceId
2513
+ };
2514
+ if (options.reportOut != null) output.reportDirectory = await writeRunReportArtifacts(output, eventRecorder.events, identity, options.reportOut);
2515
+ return output;
1675
2516
  });
1676
- const output = {
1677
- attemptId: identity.attemptId,
1678
- configFilePath: loadedConfig.configFilePath,
1679
- experimentId: identity.experimentId,
1680
- projects: projectSummaries,
1681
- reportDirectory: null,
1682
- runId: identity.runId,
1683
- workspaceId: identity.workspaceId
1684
- };
1685
- if (options.reportOut != null) output.reportDirectory = await writeRunReportArtifacts(output, eventRecorder.events, identity, options.reportOut);
1686
- return output;
2517
+ } catch (error) {
2518
+ runError = error;
1687
2519
  } finally {
2520
+ if (onOpenTelemetryRunEnd != null) try {
2521
+ await onOpenTelemetryRunEnd();
2522
+ } catch (error) {
2523
+ if (runError == null) runEndError = error;
2524
+ }
1688
2525
  reporter.dispose();
1689
2526
  restoreEnvironment();
1690
2527
  }
2528
+ if (runError != null) throw runError;
2529
+ if (runEndError != null) throw runEndError;
2530
+ if (output == null) throw new Error("Vieval run finished without output.");
2531
+ return output;
1691
2532
  }
1692
2533
  /**
1693
2534
  * Formats CLI run output as human-readable lines.
@@ -1736,10 +2577,10 @@ function formatVievalCliRunOutput(output) {
1736
2577
  executedTasks += project.result?.overall.runCount ?? 0;
1737
2578
  const badge = createProjectBadge(project.name, colors, colorEnabled);
1738
2579
  const isFailed = project.errorMessage != null;
1739
- const hasFailedCases = (project.caseSummary?.failed ?? 0) > 0 || (project.caseFailures?.length ?? 0) > 0;
2580
+ const hasFailedCases = (project.caseSummary?.failed ?? 0) > 0 || (project.caseSummary?.timeout ?? 0) > 0 || (project.caseFailures?.length ?? 0) > 0;
1740
2581
  if (isFailed) {
1741
2582
  failedProjects += 1;
1742
- lines.push(` ${colors.red("❯")} ${badge}${formatDuration(project.durationMs, colors)}`);
2583
+ lines.push(` ${colors.red("❯")} ${badge}${formatDuration$1(project.durationMs, colors)}`);
1743
2584
  lines.push(` ${project.errorMessage}`);
1744
2585
  continue;
1745
2586
  }
@@ -1748,7 +2589,7 @@ function formatVievalCliRunOutput(output) {
1748
2589
  const countLabel = colors.dim(`(${project.taskCount} tasks)`);
1749
2590
  const detailsLabel = colors.dim(` ${project.discoveredEvalFileCount} files, ${project.entryCount} entries, 0 runs, hybrid n/a`);
1750
2591
  const matrixSummary = formatMatrixSummary(project.matrixSummary);
1751
- lines.push(` ${colors.dim("○")} ${badge}${countLabel}${detailsLabel}${formatDuration(project.durationMs, colors)}`);
2592
+ lines.push(` ${colors.dim("○")} ${badge}${countLabel}${detailsLabel}${formatDuration$1(project.durationMs, colors)}`);
1752
2593
  if (matrixSummary != null) lines.push(` ${colors.dim(matrixSummary)}`);
1753
2594
  const scheduleBreakdown = formatScheduleBreakdown(project);
1754
2595
  if (scheduleBreakdown != null) lines.push(` ${scheduleBreakdown}`);
@@ -1756,14 +2597,13 @@ function formatVievalCliRunOutput(output) {
1756
2597
  }
1757
2598
  if (hasFailedCases) failedProjects += 1;
1758
2599
  else passedProjects += 1;
1759
- const hybridAverage = project.result?.overall.hybridAverage;
1760
- const hybridAverageLabel = hybridAverage == null ? "n/a" : String(hybridAverage);
2600
+ const hybridAverageLabel = formatHybridAverage(project.result?.overall.hybridAverage);
1761
2601
  const runCount = project.result?.overall.runCount ?? 0;
1762
2602
  const countLabel = colors.dim(`(${project.taskCount} tasks)`);
1763
- const caseSummaryLabel = project.caseSummary == null ? "" : `, cases ${project.caseSummary.passed} passed | ${project.caseSummary.failed} failed`;
2603
+ const caseSummaryLabel = project.caseSummary == null ? "" : `, cases ${project.caseSummary.passed} passed | ${project.caseSummary.failed} failed | ${project.caseSummary.timeout} timeout`;
1764
2604
  const detailsLabel = colors.dim(` ${project.discoveredEvalFileCount} files, ${project.entryCount} entries, ${runCount} runs${caseSummaryLabel}, hybrid ${hybridAverageLabel}`);
1765
2605
  const matrixSummary = formatMatrixSummary(project.matrixSummary);
1766
- lines.push(` ${hasFailedCases ? colors.red("❯") : colors.green("✓")} ${badge}${countLabel}${detailsLabel}${formatDuration(project.durationMs, colors)}`);
2606
+ lines.push(` ${hasFailedCases ? colors.red("❯") : colors.green("✓")} ${badge}${countLabel}${detailsLabel}${formatDuration$1(project.durationMs, colors)}`);
1767
2607
  if (matrixSummary != null) lines.push(` ${colors.dim(matrixSummary)}`);
1768
2608
  const scheduleBreakdown = formatScheduleBreakdown(project);
1769
2609
  if (scheduleBreakdown != null) lines.push(` ${scheduleBreakdown}`);
@@ -1800,14 +2640,14 @@ const compareHelpText = `
1800
2640
  --output Optional output artifact path
1801
2641
  --format Console output format: table | json (default: table)
1802
2642
  `;
1803
- function normalizeCliArgv$4(argv) {
2643
+ function normalizeCliArgv$6(argv) {
1804
2644
  const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
1805
2645
  if (normalizedArgv[0] === "compare") return normalizedArgv.slice(1);
1806
2646
  return normalizedArgv;
1807
2647
  }
1808
2648
  function parseCompareCliArguments(argv) {
1809
2649
  const cli = meow(compareHelpText, {
1810
- argv: normalizeCliArgv$4(argv),
2650
+ argv: normalizeCliArgv$6(argv),
1811
2651
  flags: {
1812
2652
  config: { type: "string" },
1813
2653
  comparison: { type: "string" },
@@ -1904,10 +2744,15 @@ const evalRunHelpText = `
1904
2744
  --workspace Workspace id used in report artifacts
1905
2745
  --experiment Experiment id used in report artifacts
1906
2746
  --attempt Attempt id used in report artifacts
2747
+ --workspace-concurrency Workspace scheduling cap
2748
+ --project-concurrency Project scheduling cap
2749
+ --task-concurrency Task scheduling cap
2750
+ --attempt-concurrency Attempt scheduling cap
2751
+ --case-concurrency Case scheduling cap
1907
2752
  --report-out Report output root directory
1908
2753
  --json Print machine-readable JSON output
1909
2754
  `;
1910
- function normalizeCliArgv$3(argv) {
2755
+ function normalizeCliArgv$5(argv) {
1911
2756
  const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
1912
2757
  return normalizedArgv[0] === "run" ? normalizedArgv.slice(1) : normalizedArgv;
1913
2758
  }
@@ -1930,7 +2775,7 @@ function normalizeProjectNames(projectNames) {
1930
2775
  */
1931
2776
  function parseCliArguments(argv) {
1932
2777
  const cli = meow(evalRunHelpText, {
1933
- argv: normalizeCliArgv$3(argv),
2778
+ argv: normalizeCliArgv$5(argv),
1934
2779
  importMeta: import.meta,
1935
2780
  flags: {
1936
2781
  config: { type: "string" },
@@ -1945,17 +2790,27 @@ function parseCliArguments(argv) {
1945
2790
  workspace: { type: "string" },
1946
2791
  experiment: { type: "string" },
1947
2792
  attempt: { type: "string" },
2793
+ workspaceConcurrency: { type: "number" },
2794
+ projectConcurrency: { type: "number" },
2795
+ taskConcurrency: { type: "number" },
2796
+ attemptConcurrency: { type: "number" },
2797
+ caseConcurrency: { type: "number" },
1948
2798
  reportOut: { type: "string" }
1949
2799
  }
1950
2800
  });
1951
2801
  return {
1952
2802
  attempt: cli.flags.attempt,
2803
+ attemptConcurrency: cli.flags.attemptConcurrency,
2804
+ caseConcurrency: cli.flags.caseConcurrency,
1953
2805
  configFilePath: cli.flags.config,
1954
2806
  experiment: cli.flags.experiment,
1955
2807
  json: cli.flags.json === true,
1956
2808
  project: normalizeProjectNames(cli.flags.project),
2809
+ projectConcurrency: cli.flags.projectConcurrency,
1957
2810
  reportOut: cli.flags.reportOut,
1958
- workspace: cli.flags.workspace
2811
+ taskConcurrency: cli.flags.taskConcurrency,
2812
+ workspace: cli.flags.workspace,
2813
+ workspaceConcurrency: cli.flags.workspaceConcurrency
1959
2814
  };
1960
2815
  }
1961
2816
  /**
@@ -1991,11 +2846,16 @@ async function runEvalRunCli(argv) {
1991
2846
  try {
1992
2847
  const output = await runVievalCli({
1993
2848
  attempt: parsed.attempt,
2849
+ attemptConcurrency: parsed.attemptConcurrency,
2850
+ caseConcurrency: parsed.caseConcurrency,
1994
2851
  configFilePath: parsed.configFilePath,
1995
2852
  experiment: parsed.experiment,
1996
2853
  project: parsed.project,
2854
+ projectConcurrency: parsed.projectConcurrency,
1997
2855
  reportOut: parsed.reportOut,
1998
- workspace: parsed.workspace
2856
+ taskConcurrency: parsed.taskConcurrency,
2857
+ workspace: parsed.workspace,
2858
+ workspaceConcurrency: parsed.workspaceConcurrency
1999
2859
  });
2000
2860
  if (parsed.json) {
2001
2861
  process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
@@ -2011,89 +2871,6 @@ async function runEvalRunCli(argv) {
2011
2871
  }
2012
2872
  }
2013
2873
  //#endregion
2014
- //#region src/cli/report-artifacts.ts
2015
- /**
2016
- * Resolves one or more `run-summary.json` paths from a report location.
2017
- *
2018
- * Use when:
2019
- * - callers may pass a run directory, summary file path, or a report root
2020
- *
2021
- * Returns:
2022
- * - sorted absolute summary file paths
2023
- */
2024
- async function resolveRunSummaryPaths(reportPath) {
2025
- const absoluteReportPath = resolve(reportPath);
2026
- const directSummaryPath = resolve(absoluteReportPath, "run-summary.json");
2027
- if (existsSync(absoluteReportPath) && absoluteReportPath.endsWith(".json")) return [absoluteReportPath];
2028
- if (existsSync(directSummaryPath)) return [directSummaryPath];
2029
- return (await glob("**/run-summary.json", {
2030
- absolute: true,
2031
- cwd: absoluteReportPath
2032
- })).sort((left, right) => left.localeCompare(right));
2033
- }
2034
- /**
2035
- * Reads one run report artifact set from `run-summary.json` and sibling `events.jsonl`.
2036
- *
2037
- * Use when:
2038
- * - report analysis needs both run aggregate output and event count metadata
2039
- */
2040
- function readReportRunArtifact(summaryFilePath) {
2041
- const reportDirectory = resolve(summaryFilePath, "..");
2042
- const summary = JSON.parse(readFileSync(summaryFilePath, "utf-8"));
2043
- const eventsFilePath = resolve(reportDirectory, "events.jsonl");
2044
- const events = existsSync(eventsFilePath) ? readFileSync(eventsFilePath, "utf-8").split("\n").filter((line) => line.trim().length > 0).map((line) => {
2045
- const event = JSON.parse(line);
2046
- return {
2047
- caseId: event.caseId,
2048
- data: event.data,
2049
- event: event.event,
2050
- taskId: event.taskId
2051
- };
2052
- }) : [];
2053
- return {
2054
- events,
2055
- eventsCount: events.length,
2056
- reportDirectory,
2057
- summary,
2058
- summaryFilePath
2059
- };
2060
- }
2061
- /**
2062
- * Reads all run artifacts found under `reportPath`.
2063
- *
2064
- * Use when:
2065
- * - callers need multi-run analysis from a directory root
2066
- */
2067
- async function readReportArtifacts(reportPath) {
2068
- return (await resolveRunSummaryPaths(reportPath)).map((summaryFilePath) => readReportRunArtifact(summaryFilePath));
2069
- }
2070
- /**
2071
- * Creates a compact summary row for one run artifact.
2072
- *
2073
- * Use when:
2074
- * - table/csv/jsonl exports should stay stable and cheap to parse
2075
- */
2076
- function summarizeReportRunArtifact(artifact) {
2077
- const totalProjects = artifact.summary.projects.length;
2078
- const failedProjects = artifact.summary.projects.filter((project) => project.errorMessage != null).length;
2079
- const executedProjects = artifact.summary.projects.filter((project) => project.executed).length;
2080
- const totalTasks = artifact.summary.projects.reduce((sum, project) => sum + project.taskCount, 0);
2081
- const projectNames = artifact.summary.projects.map((project) => project.name);
2082
- return {
2083
- attemptId: artifact.summary.attemptId ?? null,
2084
- eventsCount: artifact.eventsCount,
2085
- executedProjects,
2086
- experimentId: artifact.summary.experimentId ?? null,
2087
- failedProjects,
2088
- projectNames,
2089
- reportDirectory: artifact.reportDirectory,
2090
- runId: artifact.summary.runId ?? null,
2091
- totalProjects,
2092
- totalTasks,
2093
- workspaceId: artifact.summary.workspaceId ?? null
2094
- };
2095
- }
2096
- //#endregion
2097
2874
  //#region src/cli/report-analyze.ts
2098
2875
  const reportAnalyzeHelpText = `
2099
2876
  Analyze generated vieval report artifacts.
@@ -2115,7 +2892,7 @@ const reportAnalyzeHelpText = `
2115
2892
  --run-matrix Keep runs matching run-matrix selector "key=value[,key=value]"
2116
2893
  --eval-matrix Keep runs matching eval-matrix selector "key=value[,key=value]"
2117
2894
  `;
2118
- function normalizeCliArgv$2(argv) {
2895
+ function normalizeCliArgv$4(argv) {
2119
2896
  const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
2120
2897
  if (normalizedArgv[0] === "report" && normalizedArgv[1] === "analyze") return normalizedArgv.slice(2);
2121
2898
  if (normalizedArgv[0] === "analyze") return normalizedArgv.slice(1);
@@ -2123,7 +2900,7 @@ function normalizeCliArgv$2(argv) {
2123
2900
  }
2124
2901
  function parseReportAnalyzeCliArguments(argv) {
2125
2902
  const cli = meow(reportAnalyzeHelpText, {
2126
- argv: normalizeCliArgv$2(argv),
2903
+ argv: normalizeCliArgv$4(argv),
2127
2904
  flags: {
2128
2905
  attempt: { type: "string" },
2129
2906
  caseState: { type: "string" },
@@ -2412,6 +3189,473 @@ async function runReportAnalyzeCli(argv) {
2412
3189
  }
2413
3190
  }
2414
3191
  //#endregion
3192
+ //#region src/cli/report-cases.ts
3193
// Help text handed to meow by parseReportCasesCliArguments for `vieval report cases`.
+ const reportCasesHelpText = `
3194
+ Inspect normalized case records from generated vieval report artifacts.
3195
+
3196
+ Usage
3197
+ $ vieval report cases <reportPath> [options]
3198
+
3199
+ Options
3200
+ --format Output format: table | json | jsonl (default: table)
3201
+ --where Equality filter "key=value"; repeatable
3202
+ --group-by Case field, score name, or metric name used for grouped score summaries
3203
+ `;
3204
/**
 * Reads normalized case records from one report run directory or report root.
 *
 * Use when:
 * - CLI tools need case-level inspection from local report artifacts
 * - callers may pass a run directory, a `cases.jsonl` file, or a report root
 *
 * Expects:
 * - discovered `cases.jsonl` files contain one `CaseRecord` JSON object per line
 *
 * Returns:
 * - all parsed case records, in discovered file path order
 *
 * Throws:
 * - when no `cases.jsonl` file is found for `reportPath`
 * - when a non-blank line is not valid JSON; the original parse error is kept as `cause`
 */
async function readCaseRecordsFromReport(reportPath) {
  const caseFilePaths = await resolveCaseRecordPaths(reportPath);
  if (caseFilePaths.length === 0) throw new Error(`No cases.jsonl files found under "${resolve(reportPath)}".`);
  const records = [];
  for (const caseFilePath of caseFilePaths) {
    const lines = readFileSync(caseFilePath, "utf-8").split("\n");
    for (const [index, line] of lines.entries()) {
      const trimmed = line.trim();
      // Blank lines (for example the one after a trailing newline) are not records.
      if (trimmed.length === 0) continue;
      try {
        records.push(JSON.parse(trimmed));
      } catch (error) {
        // Keep the underlying SyntaxError reachable for debugging instead of discarding it.
        throw new Error(`Invalid cases.jsonl line ${index + 1} in "${caseFilePath}": ${errorMessageFrom(error) ?? "Unknown JSON parse failure."}`, { cause: error });
      }
    }
  }
  return records;
}
3235
/**
 * Filters case records and optionally summarizes them by group.
 *
 * Use when:
 * - `vieval report cases` needs deterministic JSON/table output
 * - tests need pure filtering and grouping behavior without process I/O
 *
 * Expects:
 * - every entry of `options.where` is a `key=value` selector string
 * - `options.groupBy`, when present, is a lookup key understood by the case selector helper
 *
 * Returns:
 * - `records`: a fresh array with the records that satisfy every selector
 * - `groups`: grouped score summaries, or `undefined` when `groupBy` is absent
 */
function buildReportCasesOutput(records, options) {
  const selectors = (options.where ?? []).map((selector) => parseSelector(selector));
  const kept = records.filter((record) => matchesWhereFilters(record, selectors));
  let groups;
  if (options.groupBy != null) groups = buildCaseGroups(kept, options.groupBy);
  return {
    groups,
    records: [...kept]
  };
}
3257
/**
 * Entry point for the `vieval report cases` command.
 *
 * Call stack:
 *
 * published executable (`../bin/vieval`)
 *   -> {@link import('./index').runTopLevelCli}
 *     -> {@link runReportCasesCli}
 *       -> {@link readCaseRecordsFromReport}
 *
 * Use when:
 * - the top-level CLI dispatches local case artifact inspection
 *
 * Expects:
 * - argv is either `cases <reportPath> ...` or `<reportPath> ...`
 *
 * Returns:
 * - resolves after writing the requested output to stdout; on failure the
 *   message goes to stderr and `process.exitCode` is set to 1
 */
async function runReportCasesCli(argv) {
  try {
    const parsed = parseReportCasesCliArguments(argv);
    const records = await readCaseRecordsFromReport(parsed.reportPath);
    const output = buildReportCasesOutput(records, parsed);
    switch (parsed.format) {
      case "json":
        process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
        break;
      case "jsonl":
        // JSONL carries the filtered records only, not the group summaries.
        process.stdout.write(encodeJsonl(output.records));
        break;
      default:
        process.stdout.write(`${formatCasesTable(output)}\n`);
    }
  } catch (error) {
    const errorMessage = errorMessageFrom(error) ?? "Unknown report cases failure.";
    process.stderr.write(`[vieval report cases] ${errorMessage}\n`);
    process.exitCode = 1;
  }
}
3295
/**
 * Strips a leading `--` and the `report cases` / `cases` command words from argv.
 *
 * Returns:
 * - a new array holding only the arguments that follow the command words
 */
function normalizeCliArgv$3(argv) {
  const rest = argv[0] === "--" ? argv.slice(1) : [...argv];
  let commandWords = 0;
  if (rest[0] === "report" && rest[1] === "cases") commandWords = 2;
  else if (rest[0] === "cases") commandWords = 1;
  return commandWords === 0 ? rest : rest.slice(commandWords);
}
3301
/**
 * Parses argv for `vieval report cases` with meow.
 *
 * Returns:
 * - `{ format, groupBy, reportPath, where }`, with `format` normalized by
 *   {@link normalizeReportCasesFormat}
 *
 * Throws:
 * - when the positional `<reportPath>` argument is missing or empty
 */
function parseReportCasesCliArguments(argv) {
  const flagDefinitions = {
    format: {
      default: "table",
      type: "string"
    },
    groupBy: { type: "string" },
    where: {
      isMultiple: true,
      type: "string"
    }
  };
  const { flags, input } = meow(reportCasesHelpText, {
    argv: normalizeCliArgv$3(argv),
    flags: flagDefinitions,
    importMeta: import.meta
  });
  const reportPath = input[0];
  const hasReportPath = reportPath != null && reportPath.length !== 0;
  if (!hasReportPath) throw new Error("Missing required <reportPath> argument.");
  return {
    format: normalizeReportCasesFormat(flags.format),
    groupBy: flags.groupBy,
    reportPath,
    where: flags.where
  };
}
3326
/**
 * Maps a `--format` value onto one of the supported output formats.
 *
 * Returns:
 * - `"json"` or `"jsonl"` when the value matches case-insensitively, otherwise `"table"`
 */
function normalizeReportCasesFormat(value) {
  switch (value.toLowerCase()) {
    case "json":
      return "json";
    case "jsonl":
      return "jsonl";
    default:
      return "table";
  }
}
3332
/**
 * Resolves the `cases.jsonl` files addressed by a report location.
 *
 * Returns:
 * - the path itself when it exists and ends in `.jsonl`
 * - otherwise `<reportPath>/cases.jsonl` when that exists
 * - otherwise every `cases.jsonl` found by globbing below the path, sorted with `localeCompare`
 */
async function resolveCaseRecordPaths(reportPath) {
  const root = resolve(reportPath);
  if (existsSync(root) && root.endsWith(".jsonl")) return [root];
  const nestedCaseFile = resolve(root, "cases.jsonl");
  if (existsSync(nestedCaseFile)) return [nestedCaseFile];
  const matches = await glob("**/cases.jsonl", {
    absolute: true,
    cwd: root
  });
  return matches.sort((left, right) => left.localeCompare(right));
}
3342
/**
 * Tells whether a case record satisfies every parsed `key=value` selector.
 *
 * A selector matches when the key resolves on the record and the stringified
 * value equals the selector value. An empty selector list matches everything.
 */
function matchesWhereFilters(record, whereFilters) {
  const hasMismatch = whereFilters.some(({ key, value }) => {
    const resolved = getCaseSelectorValue(record, key);
    return !resolved.exists || String(resolved.value) !== value;
  });
  return !hasMismatch;
}
3348
/**
 * Parses one `key=value` selector string.
 *
 * The split happens at the first `=`, so the value itself may contain `=`.
 * Key and value are trimmed before validation, so whitespace-only parts are
 * rejected rather than yielding an empty key or value.
 *
 * Returns:
 * - `{ key, value }` with surrounding whitespace removed
 *
 * Throws:
 * - when there is no `=`, or when the key or the value is empty after trimming
 */
function parseSelector(selector) {
  const separatorIndex = selector.indexOf("=");
  const key = separatorIndex < 0 ? "" : selector.slice(0, separatorIndex).trim();
  const value = separatorIndex < 0 ? "" : selector.slice(separatorIndex + 1).trim();
  if (key.length === 0 || value.length === 0) throw new Error(`Invalid selector "${selector}". Expected "key=value".`);
  return {
    key,
    value
  };
}
3356
/**
 * Buckets records by the value resolved for `groupBy` and summarizes their scores.
 *
 * Records for which `groupBy` does not resolve are left out. Group keys have
 * the form `<groupBy>=<value>` and are returned sorted with `localeCompare`.
 *
 * Returns:
 * - an object mapping each group key to `{ count, scores }`
 */
function buildCaseGroups(records, groupBy) {
  const groups = {};
  for (const record of records) {
    const resolved = getCaseSelectorValue(record, groupBy);
    if (!resolved.exists) continue;
    const groupKey = `${groupBy}=${String(resolved.value)}`;
    let bucket = groups[groupKey];
    if (bucket == null) {
      bucket = {
        count: 0,
        scores: {}
      };
      groups[groupKey] = bucket;
    }
    bucket.count += 1;
    addScores(bucket.scores, record.scores);
  }
  const orderedEntries = Object.entries(groups).sort(([left], [right]) => left.localeCompare(right));
  return Object.fromEntries(orderedEntries.map(([groupKey, { count, scores }]) => [groupKey, {
    count,
    scores: finalizeScores(scores)
  }]));
}
3374
/**
 * Accumulates one record's scores into a running per-score summary.
 *
 * `summary` is mutated in place: each numeric score adds one to `count` and
 * its value to `sum` under the score's name. Non-numeric values are skipped,
 * as `averageScore` does, and a nullish `scores` map is a no-op.
 *
 * @param summary Object mapping score name to `{ average, count, sum }`.
 * @param scores Object mapping score name to a numeric value.
 */
function addScores(summary, scores) {
  if (scores == null) return;
  for (const [scoreName, value] of Object.entries(scores)) {
    if (typeof value !== "number") continue;
    // Only own properties count as existing buckets; otherwise a score named
    // "constructor" or "toString" would resolve to an inherited member.
    let bucket = Object.prototype.hasOwnProperty.call(summary, scoreName) ? summary[scoreName] : void 0;
    if (bucket == null) {
      bucket = {
        average: 0,
        count: 0,
        sum: 0
      };
      // defineProperty creates an own data property even for "__proto__".
      Object.defineProperty(summary, scoreName, {
        configurable: true,
        enumerable: true,
        value: bucket,
        writable: true
      });
    }
    bucket.count += 1;
    bucket.sum += value;
  }
}
3385
/**
 * Turns accumulated score buckets into their final form.
 *
 * Returns:
 * - a new object, keyed by score name in `localeCompare` order, where each
 *   entry carries `count`, `sum`, and `average` (`sum / count`, or 0 when
 *   `count` is 0)
 */
function finalizeScores(summary) {
  const scoreNames = Object.keys(summary).sort((left, right) => left.localeCompare(right));
  return Object.fromEntries(scoreNames.map((scoreName) => {
    const { count, sum } = summary[scoreName];
    const average = count === 0 ? 0 : sum / count;
    return [scoreName, {
      average,
      count,
      sum
    }];
  }));
}
3392
/**
 * Renders the `vieval report cases` output as plain text lines.
 *
 * The result always has a header and the case count. When `output.groups` is
 * present, a "Groups" section follows with one line per group showing its
 * count and each score average to three decimals.
 *
 * Returns:
 * - the lines joined with `\n`, without a trailing newline
 */
function formatCasesTable(output) {
  const lines = [];
  lines.push("CASES vieval report");
  lines.push(`Case count ${output.records.length}`);
  if (output.groups == null) return lines.join("\n");
  lines.push("Groups");
  for (const [groupKey, { count, scores }] of Object.entries(output.groups)) {
    const scoreParts = [];
    for (const [scoreName, { average }] of Object.entries(scores)) scoreParts.push(`${scoreName}=${average.toFixed(3)}`);
    const scoreSuffix = scoreParts.length === 0 ? "" : ` ${scoreParts.join(" ")}`;
    lines.push(`${groupKey} count=${count}${scoreSuffix}`);
  }
  return lines.join("\n");
}
3403
+ //#endregion
3404
+ //#region src/cli/report-case-compare.ts
3405
// Help text handed to meow by parseReportCompareCliArguments for `vieval report compare`.
+ const reportCompareHelpText = `
3406
+ Compare normalized case records from two generated vieval reports.
3407
+
3408
+ Usage
3409
+ $ vieval report compare <leftReportPath> <rightReportPath> [options]
3410
+
3411
+ Options
3412
+ --format Output format: table | json (default: table)
3413
+ --case-key Case field, score name, or metric name used to match records
3414
+ --score-kind Score kind used for deltas (default: exact)
3415
+ --group-by Case field, score name, or metric name used for grouped deltas
3416
+ `;
3417
/**
 * Compares two sets of case records case by case.
 *
 * Use when:
 * - local report analysis needs per-case improvements/regressions
 * - benchmark-specific facets should stay as generic metric keys
 *
 * Expects:
 * - `args.left` and `args.right` are normalized `cases.jsonl` rows
 * - score values are numeric and comparable by `args.scoreKind` (default `"exact"`)
 *
 * Returns:
 * - `cases`: matched rows sorted by case key, each with its score delta and changed metrics
 * - `added` / `removed`: records present on only one side, sorted by `caseId`
 * - `overall`: left and right average scores and their difference
 * - `topImprovements` / `topRegressions`: up to ten rows with the largest positive / negative delta
 * - `groups`: per-group averages, or `undefined` when `args.groupBy` is absent
 */
function buildCaseComparison(args) {
  const scoreKind = args.scoreKind ?? "exact";
  const leftByKey = indexRecordsByCaseKey(args.left, args.caseKey, "left");
  const rightByKey = indexRecordsByCaseKey(args.right, args.caseKey, "right");
  const cases = [];
  const removed = [];
  for (const [caseKey, leftRecord] of leftByKey) {
    const rightRecord = rightByKey.get(caseKey);
    if (rightRecord != null) {
      const leftScore = getScore(leftRecord, scoreKind);
      const rightScore = getScore(rightRecord, scoreKind);
      cases.push({
        caseKey,
        delta: {
          left: leftScore,
          right: rightScore,
          score: rightScore - leftScore
        },
        left: leftRecord,
        metricsChanged: diffMetrics(leftRecord.metrics, rightRecord.metrics),
        right: rightRecord
      });
    } else {
      removed.push(leftRecord);
    }
  }
  const added = [];
  for (const [caseKey, rightRecord] of rightByKey) {
    if (leftByKey.has(caseKey)) continue;
    added.push(rightRecord);
  }
  // Largest delta first; ties are broken by case key.
  const byDeltaDescending = [...cases].sort((left, right) => {
    const deltaOrder = right.delta.score - left.delta.score;
    if (deltaOrder !== 0) return deltaOrder;
    return left.caseKey.localeCompare(right.caseKey);
  });
  added.sort(compareCaseRecords);
  // Sorted in place so the group summaries below see rows in case-key order.
  cases.sort((left, right) => left.caseKey.localeCompare(right.caseKey));
  const groups = args.groupBy == null ? void 0 : buildComparisonGroups(cases, args.groupBy);
  const leftAverage = averageScore(args.left, scoreKind);
  const rightAverage = averageScore(args.right, scoreKind);
  removed.sort(compareCaseRecords);
  const topImprovements = byDeltaDescending.filter((row) => row.delta.score > 0).slice(0, 10);
  const topRegressions = [...byDeltaDescending].reverse().filter((row) => row.delta.score < 0).slice(0, 10);
  return {
    added,
    cases,
    groups,
    overall: {
      delta: rightAverage - leftAverage,
      leftAverage,
      rightAverage
    },
    removed,
    topImprovements,
    topRegressions
  };
}
3477
/**
 * Entry point for the `vieval report compare` command.
 *
 * Call stack:
 *
 * published executable (`../bin/vieval`)
 *   -> {@link import('./index').runTopLevelCli}
 *     -> {@link runReportCompareCli}
 *       -> {@link readCaseRecordsFromReport}
 *       -> {@link buildCaseComparison}
 *
 * Use when:
 * - two local report artifact directories should be compared case-by-case
 *
 * Expects:
 * - argv is either `compare <left> <right> ...` or `<left> <right> ...`
 *
 * Returns:
 * - resolves after writing the requested output to stdout; on failure the
 *   message goes to stderr and `process.exitCode` is set to 1
 */
async function runReportCompareCli(argv) {
  try {
    const { caseKey, format, groupBy, leftReportPath, rightReportPath, scoreKind } = parseReportCompareCliArguments(argv);
    // Both reports are read concurrently.
    const [left, right] = await Promise.all([readCaseRecordsFromReport(leftReportPath), readCaseRecordsFromReport(rightReportPath)]);
    const output = buildCaseComparison({
      caseKey,
      groupBy,
      left,
      right,
      scoreKind
    });
    const rendered = format === "json" ? JSON.stringify(output, null, 2) : formatCaseComparisonTable(output);
    process.stdout.write(`${rendered}\n`);
  } catch (error) {
    const errorMessage = errorMessageFrom(error) ?? "Unknown report compare failure.";
    process.stderr.write(`[vieval report compare] ${errorMessage}\n`);
    process.exitCode = 1;
  }
}
3519
/**
 * Strips a leading `--` and the `report compare` / `compare` command words from argv.
 *
 * Returns:
 * - a new array holding only the arguments that follow the command words
 */
function normalizeCliArgv$2(argv) {
  const rest = argv[0] === "--" ? argv.slice(1) : [...argv];
  let commandWords = 0;
  if (rest[0] === "report" && rest[1] === "compare") commandWords = 2;
  else if (rest[0] === "compare") commandWords = 1;
  return commandWords === 0 ? rest : rest.slice(commandWords);
}
3525
/**
 * Parses argv for `vieval report compare` with meow.
 *
 * Returns:
 * - `{ caseKey, format, groupBy, leftReportPath, rightReportPath, scoreKind }`;
 *   `format` is `"json"` when the flag equals `json` in any letter case,
 *   otherwise `"table"`
 *
 * Throws:
 * - when either positional report path is missing or empty
 */
function parseReportCompareCliArguments(argv) {
  const cli = meow(reportCompareHelpText, {
    argv: normalizeCliArgv$2(argv),
    flags: {
      caseKey: { type: "string" },
      format: {
        default: "table",
        type: "string"
      },
      groupBy: { type: "string" },
      scoreKind: {
        default: "exact",
        type: "string"
      }
    },
    importMeta: import.meta
  });
  const leftReportPath = cli.input[0];
  const rightReportPath = cli.input[1];
  if (leftReportPath == null || leftReportPath.length === 0 || rightReportPath == null || rightReportPath.length === 0) throw new Error("Missing required <leftReportPath> and <rightReportPath> arguments.");
  // Case-insensitive, matching `vieval report cases`, so `--format JSON`
  // does not silently fall back to the table.
  const format = String(cli.flags.format).toLowerCase() === "json" ? "json" : "table";
  return {
    caseKey: cli.flags.caseKey,
    format,
    groupBy: cli.flags.groupBy,
    leftReportPath,
    rightReportPath,
    scoreKind: cli.flags.scoreKind
  };
}
3554
/**
 * Builds a Map from resolved case key to record for one side of a comparison.
 *
 * @param side Label (`"left"` / `"right"`) used only in the duplicate-key error message.
 *
 * Throws:
 * - when two records on the same side resolve to the same case key
 */
function indexRecordsByCaseKey(records, caseKey, side) {
  const byKey = /* @__PURE__ */ new Map();
  for (const record of records) {
    const key = resolveCaseKey(record, caseKey);
    const isDuplicate = byKey.has(key);
    if (isDuplicate) throw new Error(`Duplicate case key "${key}" in ${side} report.`);
    byKey.set(key, record);
  }
  return byKey;
}
3563
/**
 * Determines the key used to match a record across two reports.
 *
 * With an explicit `caseKey`, that selector must resolve on the record.
 * Without one, the first of `benchmark.case.id` and `vieval.case.id` that
 * resolves is used, falling back to `record.caseId`.
 *
 * Throws:
 * - when an explicit `caseKey` does not resolve on the record
 */
function resolveCaseKey(record, caseKey) {
  if (caseKey == null) {
    for (const fallbackKey of ["benchmark.case.id", "vieval.case.id"]) {
      const candidate = getCaseSelectorValue(record, fallbackKey);
      if (candidate.exists) return String(candidate.value);
    }
    return record.caseId;
  }
  const explicit = getCaseSelectorValue(record, caseKey);
  if (!explicit.exists) throw new Error(`Missing explicit case key "${caseKey}" for case "${record.caseId}".`);
  return String(explicit.value);
}
3574
/**
 * Reads one score of the given kind from a case record.
 *
 * Returns:
 * - the score when it is a number, otherwise 0. Non-numeric values count as
 *   absent, as `averageScore` treats them, so delta arithmetic and `toFixed`
 *   formatting always receive a number.
 */
function getScore(record, scoreKind) {
  const value = record.scores[scoreKind];
  return typeof value === "number" ? value : 0;
}
3577
/**
 * Averages one score kind over the records that carry it as a number.
 *
 * Returns:
 * - the mean of the numeric values, or 0 when no record has one
 */
function averageScore(records, scoreKind) {
  let total = 0;
  let counted = 0;
  records.forEach((record) => {
    const value = record.scores[scoreKind];
    if (typeof value !== "number") return;
    total += value;
    counted += 1;
  });
  return counted === 0 ? 0 : total / counted;
}
3582
/**
 * Lists the metrics whose values differ between two metric objects.
 *
 * Keys from both sides are visited in `localeCompare` order and values are
 * compared through their `stableStringify` output.
 *
 * Returns:
 * - an object mapping each differing metric name to `{ left, right }`
 */
function diffMetrics(left, right) {
  const metricNames = new Set(Object.keys(left));
  for (const metricName of Object.keys(right)) metricNames.add(metricName);
  const orderedNames = Array.from(metricNames).sort((leftKey, rightKey) => leftKey.localeCompare(rightKey));
  const changed = {};
  for (const metricName of orderedNames) {
    const leftValue = left[metricName];
    const rightValue = right[metricName];
    if (stableStringify(leftValue) === stableStringify(rightValue)) continue;
    changed[metricName] = {
      left: leftValue,
      right: rightValue
    };
  }
  return changed;
}
3591
/**
 * Summarizes matched comparison rows per group.
 *
 * Rows are grouped by the value `groupBy` resolves to on the row's right-hand
 * record; rows where it does not resolve are left out. Group keys have the
 * form `<groupBy>=<value>` and are returned in `localeCompare` order.
 *
 * Returns:
 * - an object mapping each group key to `{ count, delta, leftAverage, rightAverage }`
 */
function buildComparisonGroups(cases, groupBy) {
  const rowsByGroup = {};
  for (const row of cases) {
    const resolved = getCaseSelectorValue(row.right, groupBy);
    if (!resolved.exists) continue;
    const groupKey = `${groupBy}=${String(resolved.value)}`;
    rowsByGroup[groupKey] ??= [];
    rowsByGroup[groupKey].push(row);
  }
  // Mean of one side's score over the rows of a group.
  const meanOf = (rows, side) => rows.reduce((sum, row) => sum + row.delta[side], 0) / rows.length;
  const orderedEntries = Object.entries(rowsByGroup).sort(([left], [right]) => left.localeCompare(right));
  return Object.fromEntries(orderedEntries.map(([groupKey, rows]) => {
    const leftAverage = meanOf(rows, "left");
    const rightAverage = meanOf(rows, "right");
    return [groupKey, {
      count: rows.length,
      delta: rightAverage - leftAverage,
      leftAverage,
      rightAverage
    }];
  }));
}
3611
/**
 * Sort comparator that orders case records by `caseId` using `localeCompare`.
 */
function compareCaseRecords(left, right) {
  const leftCaseId = left.caseId;
  const rightCaseId = right.caseId;
  return leftCaseId.localeCompare(rightCaseId);
}
3614
/**
 * Renders a case comparison as compact plain text.
 *
 * Use when:
 * - `vieval report compare` should expose the same information as JSON output
 * - users need a terminal-first overview of group and per-case deltas
 *
 * Expects:
 * - comparison output was produced by {@link buildCaseComparison}
 *
 * Returns:
 * - lines joined with `\n`: the aggregate header first, then the group,
 *   top-improvement, top-regression, case, added and removed sections, each
 *   emitted only when it has content; numbers are printed to three decimals
 */
function formatCaseComparisonTable(output) {
  const fixed = (value) => value.toFixed(3);
  const { overall } = output;
  const lines = [
    "COMPARE vieval report cases",
    `Matched ${output.cases.length}`,
    `Added ${output.added.length}`,
    `Removed ${output.removed.length}`,
    `Scores left=${fixed(overall.leftAverage)} right=${fixed(overall.rightAverage)} delta=${fixed(overall.delta)}`
  ];
  // Shared layout of the two "top changes" sections.
  const pushTopChanges = (title, rows) => {
    if (rows.length === 0) return;
    lines.push(title);
    for (const { caseKey, delta } of rows) lines.push(`${caseKey} delta=${fixed(delta.score)} left=${fixed(delta.left)} right=${fixed(delta.right)}`);
  };
  const groupEntries = output.groups == null ? [] : Object.entries(output.groups);
  if (groupEntries.length > 0) {
    lines.push("Groups");
    for (const [groupKey, group] of groupEntries) lines.push(`${groupKey} count=${group.count} left=${fixed(group.leftAverage)} right=${fixed(group.rightAverage)} delta=${fixed(group.delta)}`);
  }
  pushTopChanges("Top improvements", output.topImprovements);
  pushTopChanges("Top regressions", output.topRegressions);
  if (output.cases.length > 0) {
    lines.push("Cases");
    for (const row of output.cases) {
      const changedMetricNames = Object.keys(row.metricsChanged);
      const changedLabel = changedMetricNames.length === 0 ? "none" : changedMetricNames.join(",");
      lines.push(`${row.caseKey} delta=${fixed(row.delta.score)} changedMetrics=${changedLabel}`);
    }
  }
  const caseIdsOf = (records) => records.map((record) => record.caseId).join(",");
  if (output.added.length > 0) lines.push(`Added cases ${caseIdsOf(output.added)}`);
  if (output.removed.length > 0) lines.push(`Removed cases ${caseIdsOf(output.removed)}`);
  return lines.join("\n");
}
3658
+ //#endregion
2415
3659
  //#region src/cli/report-index.ts
2416
3660
  const reportIndexHelpText = `
2417
3661
  Build report indexes from generated vieval artifacts.
@@ -2579,7 +3823,15 @@ async function runTopLevelCli(argv) {
2579
3823
  await runReportIndexCli(parsed.commandArgv);
2580
3824
  return;
2581
3825
  }
2582
- throw new Error(`Unsupported vieval report command "${reportSubcommand ?? "(none)"}". Expected "analyze" or "index".`);
3826
+ if (reportSubcommand === "cases") {
3827
+ await runReportCasesCli(parsed.commandArgv);
3828
+ return;
3829
+ }
3830
+ if (reportSubcommand === "compare") {
3831
+ await runReportCompareCli(parsed.commandArgv);
3832
+ return;
3833
+ }
3834
+ throw new Error(`Unsupported vieval report command "${reportSubcommand ?? "(none)"}". Expected "analyze", "index", "cases", or "compare".`);
2583
3835
  }
2584
3836
  if (parsed.command === "compare") {
2585
3837
  await runCompareCliOrExit(parsed.commandArgv);
@@ -2590,4 +3842,4 @@ async function runTopLevelCli(argv) {
2590
3842
  //#endregion
2591
3843
  export { runTopLevelCli as n, parseTopLevelCliArguments as t };
2592
3844
 
2593
- //# sourceMappingURL=cli-DayPXzHX.mjs.map
3845
+ //# sourceMappingURL=cli-ImxGpoYQ.mjs.map