vieval 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { c as loadRawVievalConfig, l as loadVievalCliConfig, n as consumeModuleRegistrations, o as detectCliConfigMode, r as endModuleRegistration, t as beginModuleRegistration } from "./registry-CcKZqDJY.mjs";
1
+ import { a as createOpenTelemetryRuntime, c as detectCliConfigMode, d as loadVievalCliConfig, n as consumeModuleRegistrations, o as createNoopTelemetryRuntime, r as endModuleRegistration, t as beginModuleRegistration, u as loadRawVievalConfig } from "./registry-BHGMxjpA.mjs";
2
2
  import { createSchedulerRuntime } from "./core/scheduler/index.mjs";
3
3
  import { RunnerExecutionError, collectEvalEntries, createFilesystemTaskCacheRuntime, createRunnerRuntimeContext, createRunnerSchedule, createTaskExecutionContext, runScheduledTasks } from "./core/runner/index.mjs";
4
4
  import process from "node:process";
@@ -264,6 +264,612 @@ async function loadEvalModulesWithVitestRuntime(evalFilePaths, projectRoot) {
264
264
  return loadedModules;
265
265
  }
266
266
  //#endregion
267
+ //#region src/cli/report-selectors.ts
268
/**
 * Looks up a generic case selector on a case record.
 *
 * Lookup order:
 * - own key of `record.metrics`
 * - `scores.<key>` resolved against `record.scores`
 * - own key of `record.scores`
 * - own key of the record itself
 *
 * Returns:
 * - `{ exists: true, value }` for the first match, otherwise `{ exists: false }`
 */
function getCaseSelectorValue(record, key) {
  const found = (value) => ({ exists: true, value });
  const scoresPrefix = "scores.";
  if (Object.hasOwn(record.metrics, key)) return found(record.metrics[key]);
  if (key.startsWith(scoresPrefix)) {
    const scoreKey = key.slice(scoresPrefix.length);
    if (Object.hasOwn(record.scores, scoreKey)) return found(record.scores[scoreKey]);
  }
  if (Object.hasOwn(record.scores, key)) return found(record.scores[key]);
  return Object.hasOwn(record, key) ? found(record[key]) : { exists: false };
}
300
/**
 * Stable-stringifies JSON-like values for report comparisons.
 *
 * Object keys are emitted in sorted order so that two structurally equal
 * values always produce the same text.
 *
 * Before:
 * - `{ b: 1, a: true }`
 *
 * After:
 * - `{"a":true,"b":1}`
 *
 * Members that JSON cannot represent (`undefined`, functions, symbols) follow
 * `JSON.stringify` semantics: they are dropped from objects and become `null`
 * inside arrays, so the result is always valid JSON text.
 *
 * Returns:
 * - the JSON text, or `undefined` when the top-level value itself is not
 *   representable (same as `JSON.stringify`)
 */
function stableStringify(value) {
  if (value == null || typeof value !== "object") return JSON.stringify(value);
  // Array.from visits holes of sparse arrays as `undefined`, which then map to `null`.
  if (Array.isArray(value)) return `[${Array.from(value, (item) => stableStringify(item) ?? "null").join(",")}]`;
  const record = value;
  const members = [];
  for (const key of Object.keys(record).sort((left, right) => left.localeCompare(right))) {
    const encoded = stableStringify(record[key]);
    // Skip members without a JSON representation instead of emitting `"key":undefined`.
    if (encoded !== void 0) members.push(`${JSON.stringify(key)}:${encoded}`);
  }
  return `{${members.join(",")}}`;
}
315
+ //#endregion
316
+ //#region src/cli/report-otlp.ts
317
/**
 * Projects normalized case records into local OTLP-shaped JSON containers.
 *
 * Use when:
 * - report artifacts should be written without an OpenTelemetry Collector
 * - other tools want trace/log/metric-shaped JSON files
 *
 * Expects:
 * - `args.records` are case records and `args.runId` identifies their run
 *
 * Returns:
 * - `{ logs, metrics, traces }`, where traces hold one run span followed by
 *   project, task, and case spans
 */
function buildLocalOtlpProjection(args) {
  const vievalScope = () => ({ name: "vieval" });

  const projectSpans = collectProjectNames(args.records).map((projectName) => {
    return {
      attributes: toAttributes({
        "vieval.project.name": projectName,
        "vieval.run.id": args.runId
      }),
      name: "vieval.project"
    };
  });

  const taskSpans = collectTasks(args.records).map(({ projectName, taskId }) => {
    return {
      attributes: toAttributes({
        "vieval.project.name": projectName,
        "vieval.run.id": args.runId,
        "vieval.task.id": taskId
      }),
      name: "vieval.task"
    };
  });

  // Explicit `vieval.*` keys come after the spread so they win over same-named metrics.
  const caseSpans = args.records.map((record) => {
    return {
      attributes: toAttributes({
        ...record.metrics,
        "vieval.case.duration_ms": record.durationMs,
        "vieval.case.id": record.caseId,
        "vieval.case.name": record.caseName,
        "vieval.case.retry_count": record.retryCount,
        "vieval.case.state": record.state,
        "vieval.project.name": record.projectName,
        "vieval.task.id": record.taskId
      }),
      endTimeUnixNano: isoToUnixNano(record.endedAt),
      name: "vieval.case",
      startTimeUnixNano: isoToUnixNano(record.startedAt)
    };
  });

  const logRecords = args.records.map((record) => {
    const body = JSON.stringify({
      caseId: record.caseId,
      scores: record.scores,
      state: record.state
    });
    return {
      attributes: toAttributes(record.metrics),
      body: { stringValue: body },
      eventName: "vieval.case",
      timeUnixNano: isoToUnixNano(record.endedAt)
    };
  });

  // One gauge per score kind; only records with a numeric score contribute a data point.
  const scoreMetrics = collectScoreKinds(args.records).map((kind) => {
    const dataPoints = args.records
      .filter((record) => typeof record.scores[kind] === "number")
      .map((record) => ({
        asDouble: record.scores[kind],
        attributes: toAttributes({
          ...record.metrics,
          "vieval.case.id": record.caseId,
          "vieval.task.id": record.taskId
        }),
        timeUnixNano: isoToUnixNano(record.endedAt)
      }));
    return {
      gauge: { dataPoints },
      name: `vieval.score.${kind}`
    };
  });

  const runSpan = {
    attributes: toAttributes({ "vieval.run.id": args.runId }),
    name: "vieval.run"
  };

  return {
    logs: { resourceLogs: [{ scopeLogs: [{
      logRecords,
      scope: vievalScope()
    }] }] },
    metrics: { resourceMetrics: [{ scopeMetrics: [{
      metrics: scoreMetrics,
      scope: vievalScope()
    }] }] },
    traces: { resourceSpans: [{ scopeSpans: [{
      scope: vievalScope(),
      spans: [runSpan, ...projectSpans, ...taskSpans, ...caseSpans]
    }] }] }
  };
}
404
/**
 * Converts a plain attribute object into a key-sorted list of
 * `{ key, value }` pairs, skipping entries whose value is `undefined`.
 */
function toAttributes(attributes) {
  const definedEntries = Object.entries(attributes).filter((entry) => entry[1] !== void 0);
  definedEntries.sort((left, right) => left[0].localeCompare(right[0]));
  return definedEntries.map(([key, value]) => {
    return {
      key,
      value: toAnyValue(value)
    };
  });
}
410
/**
 * Wraps a value in an OTLP-style any-value container.
 *
 * Returns:
 * - `arrayValue` for arrays (items converted recursively)
 * - `boolValue` for booleans, `doubleValue` for finite numbers
 * - `stringValue` for strings, non-finite numbers, nullish values (`"null"`),
 *   and the stable JSON text of any other value
 */
function toAnyValue(value) {
  if (Array.isArray(value)) {
    const values = value.map((item) => toAnyValue(item));
    return { arrayValue: { values } };
  }
  if (!isAttributeScalar(value)) return { stringValue: stableStringify(value) };
  switch (typeof value) {
    case "boolean":
      return { boolValue: value };
    case "number":
      return Number.isFinite(value) ? { doubleValue: value } : { stringValue: String(value) };
    case "string":
      return { stringValue: value };
    default:
      // Only null and undefined reach this branch.
      return { stringValue: "null" };
  }
}
420
/**
 * Tells whether a value is nullish, a boolean, a number, or a string.
 */
function isAttributeScalar(value) {
  if (value == null) return true;
  return ["boolean", "number", "string"].includes(typeof value);
}
423
/**
 * Converts an ISO-like timestamp to a Unix-epoch nanosecond count, as a decimal string.
 *
 * Timestamps shaped `YYYY-MM-DDTHH:MM:SS[.fraction](Z|±HH:MM)` keep up to nine
 * fractional digits; anything else is parsed with `Date.parse` at millisecond
 * precision.
 *
 * Returns:
 * - the nanosecond count, or `"0"` when the timestamp cannot be parsed
 */
function isoToUnixNano(value) {
  const nanosPerMillisecond = 1000000n;
  const parts = /^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.(\d{1,9}))?(Z|[+-]\d{2}:\d{2})$/.exec(value);
  // Parse whole seconds only; the fraction is added back as exact nanoseconds below.
  const parseable = parts == null ? value : `${parts[1]}.000${parts[3]}`;
  const wholeMilliseconds = Date.parse(parseable);
  if (!Number.isFinite(wholeMilliseconds)) return "0";
  const fractionNanos = parts == null ? 0n : BigInt((parts[2] ?? "").padEnd(9, "0").slice(0, 9));
  return String(BigInt(wholeMilliseconds) * nanosPerMillisecond + fractionNanos);
}
435
/**
 * Lists every distinct score kind present on the records, sorted by name.
 */
function collectScoreKinds(records) {
  const allKinds = records.flatMap((record) => Object.keys(record.scores));
  const distinctKinds = Array.from(new Set(allKinds));
  distinctKinds.sort((left, right) => left.localeCompare(right));
  return distinctKinds;
}
438
/**
 * Lists the distinct project names found on the records, sorted by name.
 */
function collectProjectNames(records) {
  const distinctNames = new Set(records.map((record) => record.projectName));
  const ordered = Array.from(distinctNames);
  ordered.sort((left, right) => left.localeCompare(right));
  return ordered;
}
441
/**
 * Lists the distinct `(projectName, taskId)` pairs found on the records,
 * ordered by project name and then by task id.
 */
function collectTasks(records) {
  const uniqueTasks = new Map();
  for (const { projectName, taskId } of records) {
    uniqueTasks.set(`${projectName}\0${taskId}`, { projectName, taskId });
  }
  const ordered = Array.from(uniqueTasks.values());
  ordered.sort((left, right) => {
    // A zero project comparison falls through to the task id comparison.
    return left.projectName.localeCompare(right.projectName) || left.taskId.localeCompare(right.taskId);
  });
  return ordered;
}
452
+ //#endregion
453
+ //#region src/cli/report-records.ts
454
/**
 * Builds normalized case records from lifecycle, metric, and score events.
 *
 * Use when:
 * - `events.jsonl` should be projected into `cases.jsonl`
 * - report commands need one final record per observed case outcome
 *
 * Expects:
 * - events are ordered by occurrence where possible
 * - lifecycle events use either `task.case.start`/`task.case.end` or current CLI `CaseStarted`/`CaseEnded` names
 *
 * Returns:
 * - records for cases that emitted an end lifecycle event, in the order each
 *   case first ended
 */
function buildCaseRecords(args) {
  const drafts = new Map();
  // A Set keeps first-insertion order and makes the "already completed" check
  // O(1) instead of scanning an array once per end event.
  const completedKeys = new Set();
  for (const event of args.events) {
    const normalizedEvent = normalizeCaseEventName(event.event);
    if (normalizedEvent == null) continue;
    const ids = extractEventIds(event, args);
    // Events that cannot be attributed to a task and case are ignored.
    if (ids.caseId.length === 0 || ids.taskId.length === 0) continue;
    const draft = getOrCreateDraft(drafts, ids, event, args);
    applyIdentity(draft, ids, event, args);
    if (normalizedEvent === "start") applyCaseStart(draft, event);
    else if (normalizedEvent === "metric") applyCaseMetric(draft, event);
    else if (normalizedEvent === "score") applyCaseScore(draft, event);
    else {
      applyCaseEnd(draft, event);
      completedKeys.add(createCaseKey(ids.taskId, ids.caseId));
    }
  }
  // A later start event clears `endedAt`, so restarted-but-unfinished cases are dropped here.
  return [...completedKeys].map((key) => drafts.get(key)).filter((draft) => draft != null && draft.endedAt != null).map(toCaseRecord);
}
489
/**
 * Aggregates record scores overall and per arbitrary grouping key.
 *
 * Use when:
 * - report artifacts need benchmark-neutral aggregate score views
 * - callers group by metrics such as `benchmark.category` or by direct record fields such as `taskId`
 *
 * Expects:
 * - `groupByKeys` name metrics or direct record fields
 * - record scores are numeric
 *
 * Returns:
 * - `overall` score buckets plus `groups` keyed by `<key>=<value>`
 */
function buildMetricsSummary(records, groupByKeys) {
  const overallBuckets = {};
  const bucketsByGroup = {};
  for (const record of records) {
    addRecordScores(overallBuckets, record);
    for (const groupByKey of groupByKeys) {
      const { exists, value } = getGroupValue(record, groupByKey);
      if (exists) {
        const groupName = `${groupByKey}=${String(value)}`;
        if (bucketsByGroup[groupName] == null) bucketsByGroup[groupName] = {};
        addRecordScores(bucketsByGroup[groupName], record);
      }
    }
  }
  const groups = finalizeSummaryGroups(bucketsByGroup);
  const overall = finalizeScoreSummary(overallBuckets);
  return { groups, overall };
}
521
/**
 * Serializes records as newline-delimited JSON.
 *
 * Use when:
 * - writing `cases.jsonl` for command-line tools, dataframes, or streaming parsers
 *
 * Expects:
 * - records are JSON-serializable
 *
 * Returns:
 * - one JSON document per line, newline-terminated; the empty string for no records
 */
function encodeJsonl(records) {
  const lines = records.map((record) => JSON.stringify(record));
  return lines.length === 0 ? "" : `${lines.join("\n")}\n`;
}
538
/**
 * Maps a raw event name onto `"start"`, `"metric"`, `"score"`, or `"end"`.
 *
 * Returns:
 * - the normalized name, or `undefined` for any other event
 */
function normalizeCaseEventName(eventName) {
  switch (eventName) {
    case "task.case.start":
    case "CaseStarted":
      return "start";
    case "task.case.metric":
      return "metric";
    case "task.case.score":
      return "score";
    case "task.case.end":
    case "CaseEnded":
      return "end";
    default:
      return void 0;
  }
}
544
/**
 * Resolves the identity fields of one event.
 *
 * Each field prefers a string in `event.data`, then the envelope field, then
 * a fallback: the matching `args` value, or `""` for `caseId` and `taskId`.
 * `projectName` additionally tries `event.projectId` before the `args` fallback.
 */
function extractEventIds(event, args) {
  const payload = asRecord(event.data);
  const pick = (field, fallback) => stringFrom(payload?.[field]) ?? event[field] ?? fallback;
  return {
    attemptId: pick("attemptId", args.attemptId),
    caseId: pick("caseId", ""),
    experimentId: pick("experimentId", args.experimentId),
    projectName: stringFrom(payload?.projectName) ?? event.projectName ?? event.projectId ?? args.projectName,
    runId: pick("runId", args.runId),
    taskId: pick("taskId", ""),
    workspaceId: pick("workspaceId", args.workspaceId)
  };
}
556
/**
 * Returns the draft stored for the event's task/case pair, creating and
 * registering an empty one on first sight.
 */
function getOrCreateDraft(drafts, ids, event, args) {
  const draftKey = createCaseKey(ids.taskId, ids.caseId);
  let draft = drafts.get(draftKey);
  if (draft == null) {
    draft = {
      attemptId: ids.attemptId,
      caseId: ids.caseId,
      // The case id doubles as the name until an event reports a real one.
      caseName: extractCaseName(event) ?? ids.caseId,
      experimentId: ids.experimentId,
      metrics: {},
      projectName: ids.projectName || args.projectName,
      retryCount: 0,
      runId: ids.runId,
      scores: {},
      startCount: 0,
      taskId: ids.taskId,
      workspaceId: ids.workspaceId
    };
    drafts.set(draftKey, draft);
  }
  return draft;
}
577
/**
 * Refreshes the draft's run identity from the latest event.
 *
 * Falsy ids fall back to the `args` values; the project name only changes
 * when the event names a project explicitly.
 */
function applyIdentity(draft, ids, event, args) {
  const explicitProjectName = extractExplicitProjectName(event);
  Object.assign(draft, {
    attemptId: ids.attemptId || args.attemptId,
    experimentId: ids.experimentId || args.experimentId,
    projectName: explicitProjectName ?? draft.projectName,
    runId: ids.runId || args.runId,
    workspaceId: ids.workspaceId || args.workspaceId
  });
}
584
/**
 * Applies a case start event to the draft.
 *
 * Counts the start, keeps the first known start time, and clears end time,
 * input, metrics, output, scores, and state so the newest attempt begins
 * clean. The retry count becomes the largest of its current value and either
 * the reported `retryIndex` or the number of earlier starts.
 */
function applyCaseStart(draft, event) {
  const payload = asRecord(event.data);
  draft.startCount += 1;
  draft.caseName = extractCaseName(event) ?? draft.caseName;
  draft.startedAt ??= stringFrom(payload?.startedAt) ?? event.timestamp;

  // Reset everything that belongs to a previous attempt.
  draft.endedAt = void 0;
  draft.metrics = {};
  draft.output = void 0;
  draft.scores = {};
  draft.state = void 0;
  draft.input = payload != null && "input" in payload ? payload.input : void 0;

  const reportedRetryIndex = numberFrom(payload?.retryIndex);
  const observedRetries = reportedRetryIndex ?? draft.startCount - 1;
  draft.retryCount = Math.max(draft.retryCount, observedRetries);
}
603
/**
 * Stores a metric event's value on the draft under the metric's name.
 *
 * Events without a string `name`, or with a value that is not a case metric
 * value, are ignored.
 */
function applyCaseMetric(draft, event) {
  const payload = asRecord(event.data);
  const metricName = stringFrom(payload?.name);
  if (metricName == null) return;
  const metricValue = payload?.value;
  if (!isCaseMetricValue(metricValue)) return;
  draft.metrics[metricName] = metricValue;
}
610
/**
 * Stores a score event's value on the draft under its kind.
 *
 * The kind is read from `kind`, `name`, or `vieval.score.kind`, and the score
 * from `score`, `value`, or `vieval.score.value`, first match winning. Events
 * missing either part are ignored.
 */
function applyCaseScore(draft, event) {
  const payload = asRecord(event.data);
  const firstMatch = (convert, fields) => {
    for (const field of fields) {
      const candidate = convert(payload?.[field]);
      if (candidate != null) return candidate;
    }
    return void 0;
  };
  const kind = firstMatch(stringFrom, ["kind", "name", "vieval.score.kind"]);
  const score = firstMatch(numberFrom, ["score", "value", "vieval.score.value"]);
  if (kind == null || score == null) return;
  draft.scores[kind] = score;
}
617
/**
 * Applies a case end event to the draft.
 *
 * Updates name, end time, and output when the event carries them, sets the
 * state (unrecognized or missing states become `"failed"`), and fills in an
 * `exact` score of 1 for passed cases or 0 otherwise when none was recorded.
 */
function applyCaseEnd(draft, event) {
  const payload = asRecord(event.data);
  const reportedName = extractCaseName(event);
  if (reportedName != null) draft.caseName = reportedName;
  draft.endedAt = stringFrom(payload?.endedAt) ?? event.timestamp ?? draft.endedAt;
  if (payload != null && "output" in payload) draft.output = payload.output;
  const finalState = normalizeState(stringFrom(payload?.state)) ?? "failed";
  draft.state = finalState;
  if (draft.scores.exact == null) draft.scores.exact = finalState === "passed" ? 1 : 0;
}
625
/**
 * Turns a draft into its final case record.
 *
 * Missing timestamps fall back to each other (or `""`), a missing state
 * becomes `"failed"`, and `input`/`output` are only present when defined.
 */
function toCaseRecord(draft) {
  const startedAt = draft.startedAt ?? draft.endedAt ?? "";
  const endedAt = draft.endedAt ?? startedAt;
  const inputPart = draft.input === void 0 ? {} : { input: draft.input };
  const outputPart = draft.output === void 0 ? {} : { output: draft.output };
  const { attemptId, caseId, caseName, experimentId, metrics, projectName, retryCount, runId, scores, taskId, workspaceId } = draft;
  // Key order is kept stable because records are serialized as-is.
  return {
    attemptId,
    caseId,
    caseName,
    durationMs: calculateDurationMs(startedAt, endedAt),
    endedAt,
    experimentId,
    ...inputPart,
    metrics,
    ...outputPart,
    projectName,
    retryCount,
    runId,
    schemaVersion: 1,
    scores,
    startedAt,
    state: draft.state ?? "failed",
    taskId,
    workspaceId
  };
}
649
/**
 * Adds one record's finite scores into a running summary.
 *
 * Expects:
 * - `summary` maps score kind to a `{ average, count, sum }` bucket and is mutated in place
 *
 * Buckets are looked up and created as OWN properties only, so score kinds
 * that collide with inherited members (`constructor`, `toString`,
 * `__proto__`, ...) get a real bucket instead of touching `Object` or
 * `Object.prototype`.
 */
function addRecordScores(summary, record) {
  for (const [kind, score] of Object.entries(record.scores)) {
    if (!Number.isFinite(score)) continue;
    let bucket = Object.hasOwn(summary, kind) ? summary[kind] : void 0;
    if (bucket == null) {
      bucket = {
        average: 0,
        count: 0,
        sum: 0
      };
      // defineProperty creates a plain data property even for the key `__proto__`.
      Object.defineProperty(summary, kind, {
        configurable: true,
        enumerable: true,
        value: bucket,
        writable: true
      });
    }
    bucket.count += 1;
    bucket.sum += score;
  }
}
661
/**
 * Finalizes the score summary of every group, keeping the group keys.
 */
function finalizeSummaryGroups(groups) {
  const finalizedEntries = [];
  for (const [groupKey, buckets] of Object.entries(groups)) {
    finalizedEntries.push([groupKey, finalizeScoreSummary(buckets)]);
  }
  return Object.fromEntries(finalizedEntries);
}
664
/**
 * Produces a new summary whose buckets carry their computed average.
 *
 * The average is `sum / count`, or 0 for an empty bucket. The input summary
 * is left untouched.
 */
function finalizeScoreSummary(summary) {
  const finalizedEntries = [];
  for (const [kind, { count, sum }] of Object.entries(summary)) {
    const average = count === 0 ? 0 : sum / count;
    finalizedEntries.push([kind, { average, count, sum }]);
  }
  return Object.fromEntries(finalizedEntries);
}
671
/**
 * Resolves the value a record contributes for one grouping key.
 *
 * Lookup order:
 * - own key of `record.metrics`
 * - own field of the record itself, when it holds a case metric value
 *
 * Returns:
 * - `{ exists: true, value }` on a match, otherwise `{ exists: false }`
 */
function getGroupValue(record, key) {
  if (Object.hasOwn(record.metrics, key)) return {
    exists: true,
    value: record.metrics[key]
  };
  // A field the record does not have would read as `undefined`, which counts
  // as a metric value; without this guard such records would be grouped under
  // `<key>=undefined` instead of being skipped.
  if (!Object.hasOwn(record, key)) return { exists: false };
  const directValue = record[key];
  return isCaseMetricValue(directValue) ? {
    exists: true,
    value: directValue
  } : { exists: false };
}
682
/**
 * Reads the case name from an event payload: `caseName` first, then `name`.
 *
 * Returns:
 * - the name, or `undefined` when neither field is a string
 */
function extractCaseName(event) {
  const payload = asRecord(event.data);
  const explicitName = stringFrom(payload?.caseName);
  return explicitName ?? stringFrom(payload?.name);
}
686
/**
 * Reads the project name an event states itself: a string
 * `data.projectName`, then `event.projectName`, then `event.projectId`.
 */
function extractExplicitProjectName(event) {
  const payloadProjectName = stringFrom(asRecord(event.data)?.projectName);
  if (payloadProjectName != null) return payloadProjectName;
  return event.projectName ?? event.projectId;
}
689
/**
 * Builds the map key for a task/case pair by joining both ids with a NUL character.
 */
function createCaseKey(taskId, caseId) {
  return "".concat(taskId, "\u0000", caseId);
}
692
/**
 * Computes the non-negative duration between two timestamps, in milliseconds.
 *
 * Before:
 * - `startedAt="2026-05-08T00:00:00.000Z"`, `endedAt="2026-05-08T00:00:01.250Z"`
 * - `startedAt="bad"`, `endedAt="2026-05-08T00:00:01.250Z"`
 *
 * After:
 * - `1250`
 * - `0`
 */
function calculateDurationMs(startedAt, endedAt) {
  const [startMs, endMs] = [startedAt, endedAt].map((timestamp) => Date.parse(timestamp));
  const bothParsed = Number.isFinite(startMs) && Number.isFinite(endMs);
  // An end before the start clamps to zero rather than going negative.
  return bothParsed ? Math.max(0, endMs - startMs) : 0;
}
709
/**
 * Passes through a recognized case state (`failed`, `passed`, `skipped`,
 * `timeout`) and yields `undefined` for anything else.
 */
function normalizeState(value) {
  const knownStates = ["failed", "passed", "skipped", "timeout"];
  return knownStates.includes(value) ? value : void 0;
}
712
/**
 * Tells whether a value may be stored as a case metric: nullish, boolean,
 * number, string, or an array.
 */
function isCaseMetricValue(value) {
  if (value == null || Array.isArray(value)) return true;
  return ["boolean", "number", "string"].includes(typeof value);
}
716
/**
 * Returns the value when it is a non-null, non-array object; otherwise `undefined`.
 */
function asRecord(value) {
  const isObjectLike = value != null && typeof value === "object" && !Array.isArray(value);
  return isObjectLike ? value : void 0;
}
720
/**
 * Returns the value when it is a string; otherwise `undefined`.
 */
function stringFrom(value) {
  if (typeof value !== "string") return void 0;
  return value;
}
723
/**
 * Returns the value when it is a finite number; otherwise `undefined`.
 */
function numberFrom(value) {
  const isFiniteNumber = typeof value === "number" && Number.isFinite(value);
  if (!isFiniteNumber) return void 0;
  return value;
}
726
+ //#endregion
727
+ //#region src/cli/report-artifacts.ts
728
/**
 * Finds the `run-summary.json` files a report location refers to.
 *
 * Use when:
 * - callers may pass a summary file path, a run directory, or a report root
 *
 * Returns:
 * - the path itself when it exists and ends in `.json`
 * - else the directory's own `run-summary.json` when present
 * - else every `run-summary.json` found below the directory, sorted
 */
async function resolveRunSummaryPaths(reportPath) {
  const reportRoot = resolve(reportPath);
  if (existsSync(reportRoot) && reportRoot.endsWith(".json")) return [reportRoot];
  const ownSummaryPath = resolve(reportRoot, "run-summary.json");
  if (existsSync(ownSummaryPath)) return [ownSummaryPath];
  const discovered = await glob("**/run-summary.json", {
    absolute: true,
    cwd: reportRoot
  });
  return discovered.sort((left, right) => left.localeCompare(right));
}
747
/**
 * Loads one run's report artifacts: the parsed `run-summary.json` plus the
 * events of the sibling `events.jsonl`, when that file exists.
 *
 * Use when:
 * - report analysis needs both the run summary and its recorded events
 *
 * Returns:
 * - `{ events, eventsCount, reportDirectory, summary, summaryFilePath }`
 */
function readReportRunArtifact(summaryFilePath) {
  const reportDirectory = resolve(summaryFilePath, "..");
  const summary = JSON.parse(readFileSync(summaryFilePath, "utf-8"));
  const eventsFilePath = resolve(reportDirectory, "events.jsonl");
  const events = [];
  if (existsSync(eventsFilePath)) {
    for (const line of readFileSync(eventsFilePath, "utf-8").split("\n")) {
      // Blank lines (including the trailing one) carry no event.
      if (line.trim().length === 0) continue;
      const { attemptId, caseId, data, event, experimentId, projectId, projectName, runId, taskId, timestamp, workspaceId } = JSON.parse(line);
      // Only the known envelope fields are kept.
      events.push({
        attemptId,
        caseId,
        data,
        event,
        experimentId,
        projectId,
        projectName,
        runId,
        taskId,
        timestamp,
        workspaceId
      });
    }
  }
  return {
    events,
    eventsCount: events.length,
    reportDirectory,
    summary,
    summaryFilePath
  };
}
781
/**
 * Loads the artifacts of every run found under `reportPath`.
 *
 * Use when:
 * - callers need multi-run analysis from a directory root
 */
async function readReportArtifacts(reportPath) {
  const summaryFilePaths = await resolveRunSummaryPaths(reportPath);
  const artifacts = [];
  for (const summaryFilePath of summaryFilePaths) {
    artifacts.push(readReportRunArtifact(summaryFilePath));
  }
  return artifacts;
}
790
/**
 * Condenses one run artifact into a flat summary row.
 *
 * Use when:
 * - table/csv/jsonl exports need one compact row per run
 *
 * Returns:
 * - identity fields (`null` when absent from the summary), project and task
 *   counts, project names, event count, and the report directory
 */
function summarizeReportRunArtifact(artifact) {
  const { eventsCount, reportDirectory, summary } = artifact;
  const { projects } = summary;
  const projectNames = [];
  let executedProjects = 0;
  let failedProjects = 0;
  let totalTasks = 0;
  for (const project of projects) {
    projectNames.push(project.name);
    if (project.errorMessage != null) failedProjects += 1;
    if (project.executed) executedProjects += 1;
    totalTasks = totalTasks + project.taskCount;
  }
  return {
    attemptId: summary.attemptId ?? null,
    eventsCount,
    executedProjects,
    experimentId: summary.experimentId ?? null,
    failedProjects,
    projectNames,
    reportDirectory,
    runId: summary.runId ?? null,
    totalProjects: projects.length,
    totalTasks,
    workspaceId: summary.workspaceId ?? null
  };
}
816
/**
 * Writes the full local report artifact set for one run.
 *
 * Use when:
 * - CLI runs need local artifacts under workspace/project/experiment/attempt/run
 * - report commands need normalized case, metrics, and OTLP-shaped files
 *
 * Expects:
 * - `events` are the envelopes to persist as `events.jsonl`
 * - `identity` carries workspace, experiment, attempt, and run ids
 *
 * Writes, in order: `run-summary.json`, `events.jsonl`, `cases.jsonl`,
 * `metrics-summary.json`, then `otlp/traces.json`, `otlp/logs.json`, and
 * `otlp/metrics.json`; an empty `benchmark` directory is created as well.
 *
 * Returns:
 * - the resolved report directory path
 */
async function writeRunReportArtifacts(output, events, identity, reportOut) {
  const { attemptId, experimentId, runId, workspaceId } = identity;
  const projectId = deriveReportProjectId(output);
  const reportDirectory = resolve(reportOut, workspaceId, projectId, experimentId, attemptId, runId);
  const artifactPath = (...segments) => resolve(reportDirectory, ...segments);
  const toPrettyJson = (value) => `${JSON.stringify(value, null, 2)}\n`;

  await mkdir(reportDirectory, { recursive: true });
  // The persisted summary also records where it was written.
  await writeFile(artifactPath("run-summary.json"), toPrettyJson({
    ...output,
    reportDirectory
  }), "utf-8");
  await writeFile(artifactPath("events.jsonl"), encodeJsonl(events), "utf-8");

  // All derived views are computed before any of them is written.
  const caseRecords = buildCaseRecords({
    attemptId,
    events,
    experimentId,
    projectName: projectId,
    runId,
    workspaceId
  });
  const metricsSummary = buildMetricsSummary(caseRecords, []);
  const otlp = buildLocalOtlpProjection({
    records: caseRecords,
    runId
  });

  await writeFile(artifactPath("cases.jsonl"), encodeJsonl(caseRecords), "utf-8");
  await writeFile(artifactPath("metrics-summary.json"), toPrettyJson(metricsSummary), "utf-8");
  await mkdir(artifactPath("otlp"), { recursive: true });
  await mkdir(artifactPath("benchmark"), { recursive: true });
  await writeFile(artifactPath("otlp", "traces.json"), toPrettyJson(otlp.traces), "utf-8");
  await writeFile(artifactPath("otlp", "logs.json"), toPrettyJson(otlp.logs), "utf-8");
  await writeFile(artifactPath("otlp", "metrics.json"), toPrettyJson(otlp.metrics), "utf-8");
  return reportDirectory;
}
862
/**
 * Derives the project path segment for a run's report directory: the
 * sanitized project name when the run covers exactly one distinct project
 * name, otherwise `"multi-project"`.
 */
function deriveReportProjectId(output) {
  const distinctNames = new Set(output.projects.map((project) => project.name));
  if (distinctNames.size !== 1) return "multi-project";
  const [onlyName] = distinctNames;
  return sanitizeIdentitySegment$1(onlyName ?? "default-project");
}
867
/**
 * Normalizes an identity value into a path segment.
 *
 * Surrounding whitespace is trimmed, a blank value becomes `"default"`, and
 * every run of characters other than word characters, `.`, or `-` collapses
 * into a single `-`.
 */
function sanitizeIdentitySegment$1(value) {
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed.replace(/[^\w.-]+/g, "-") : "default";
}
872
+ //#endregion
267
873
  //#region src/cli/reporters/noop-reporter.ts
268
874
  /**
269
875
  * Creates a reporter that intentionally does nothing.
@@ -1273,6 +1879,10 @@ function formatDuration$1(durationMs, colors) {
1273
1879
  const rounded = Math.round(durationMs);
1274
1880
  return (rounded > 1e3 ? colors.yellow : colors.green)(` ${rounded}${colors.dim("ms")}`);
1275
1881
  }
1882
/**
 * Formats a hybrid average with at most three decimals and no trailing
 * zeros; a nullish average is shown as `"n/a"`.
 */
function formatHybridAverage(hybridAverage) {
  if (hybridAverage == null) return "n/a";
  const threeDecimals = hybridAverage.toFixed(3);
  // Strips trailing zeros and, when nothing else remains, the decimal point too.
  return threeDecimals.replace(/\.?0+$/, "");
}
1276
1886
  function filterProjectsByName(projects, names) {
1277
1887
  if (names.length === 0) return [...projects];
1278
1888
  const nameSet = new Set(names);
@@ -1293,11 +1903,6 @@ function createRunIdentity(options) {
1293
1903
  workspaceId
1294
1904
  };
1295
1905
  }
1296
- function deriveReportProjectId(output) {
1297
- const uniqueProjectNames = [...new Set(output.projects.map((project) => project.name))];
1298
- if (uniqueProjectNames.length === 1) return sanitizeIdentitySegment(uniqueProjectNames[0] ?? "default-project");
1299
- return "multi-project";
1300
- }
1301
1906
  function createEventRecorder(identity) {
1302
1907
  const events = [];
1303
1908
  const taskProjectMap = /* @__PURE__ */ new Map();
@@ -1520,6 +2125,7 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
1520
2125
  reporter.onCaseEnd({
1521
2126
  caseId,
1522
2127
  errorMessage: payload.errorMessage,
2128
+ output: payload.output,
1523
2129
  state: payload.state,
1524
2130
  taskId: task.id
1525
2131
  });
@@ -1536,6 +2142,7 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
1536
2142
  reporter.onCaseStart({
1537
2143
  autoRetry: payload.autoRetry,
1538
2144
  caseId,
2145
+ input: payload.input,
1539
2146
  caseName: payload.name,
1540
2147
  retryIndex: payload.retryIndex,
1541
2148
  taskId: task.id
@@ -1554,7 +2161,7 @@ function createTaskReporterHooks(task, reporter, projectName, recordEvent, proje
1554
2161
  }
1555
2162
  };
1556
2163
  }
1557
- function createCliTaskExecutionContext(task, models, cacheRootDirectory, cacheProjectName, workspaceId, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, runtimeConcurrency, vitestCompatReporter) {
2164
+ function createCliTaskExecutionContext(task, models, cacheRootDirectory, cacheProjectName, workspaceId, telemetry, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, runtimeConcurrency, vitestCompatReporter) {
1558
2165
  return {
1559
2166
  ...createTaskExecutionContext({
1560
2167
  cache: createFilesystemTaskCacheRuntime({
@@ -1566,7 +2173,8 @@ function createCliTaskExecutionContext(task, models, cacheRootDirectory, cachePr
1566
2173
  task
1567
2174
  }),
1568
2175
  reporterHooks: createTaskReporterHooks(task, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
1569
- runtimeConcurrency
2176
+ runtimeConcurrency,
2177
+ telemetry
1570
2178
  };
1571
2179
  }
1572
2180
  function resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter) {
@@ -1584,7 +2192,8 @@ function createAutoTaskExecutor(reporter, projectName, recordEvent, projectCaseC
1584
2192
  cache: context.cache,
1585
2193
  model: context.model,
1586
2194
  reporterHooks: resolveTaskReporterHooks(task, context, reporter, projectName, recordEvent, projectCaseCounters, projectCaseFailures, vitestCompatReporter),
1587
- task
2195
+ task,
2196
+ telemetry: context.telemetry
1588
2197
  });
1589
2198
  return {
1590
2199
  entryId: task.entry.id,
@@ -1687,7 +2296,7 @@ async function prepareProject(project) {
1687
2296
  };
1688
2297
  }
1689
2298
  }
1690
- async function executePreparedProject(prepared, identity, cacheProjectName, reporter, counters, recordEvent, options) {
2299
+ async function executePreparedProject(prepared, identity, cacheProjectName, telemetry, reporter, counters, recordEvent, options) {
1691
2300
  const settledTaskIds = /* @__PURE__ */ new Set();
1692
2301
  const projectCaseCounters = {
1693
2302
  failed: 0,
@@ -1705,7 +2314,13 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
1705
2314
  const taskExecutor = async (task, context) => {
1706
2315
  const runtimeTask = createScheduledTaskWithRuntimeConcurrency(task, prepared.project, options);
1707
2316
  return {
1708
- ...await rawTaskExecutor(runtimeTask, context),
2317
+ ...await telemetry.withSpan("vieval.task", {
2318
+ "vieval.project.name": prepared.name,
2319
+ "vieval.run.id": identity.runId,
2320
+ "vieval.task.entry.id": runtimeTask.entry.id,
2321
+ "vieval.task.id": runtimeTask.id,
2322
+ "vieval.task.name": runtimeTask.entry.name
2323
+ }, async () => await rawTaskExecutor(runtimeTask, context)),
1709
2324
  matrix: cloneScheduledTaskMatrix(runtimeTask)
1710
2325
  };
1711
2326
  };
@@ -1714,7 +2329,7 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
1714
2329
  try {
1715
2330
  const aggregated = await runScheduledTasks(prepared.tasks, taskExecutor, {
1716
2331
  createExecutionContext(task) {
1717
- return createCliTaskExecutionContext(task, prepared.project.models, resolve(prepared.project.root, ".vieval", "cache"), cacheProjectName ?? prepared.name, identity.workspaceId, reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, resolveCliRuntimeConcurrency(options), vitestCompatReporter);
2332
+ return createCliTaskExecutionContext(task, prepared.project.models, resolve(prepared.project.root, ".vieval", "cache"), cacheProjectName ?? prepared.name, identity.workspaceId, telemetry, reporter, prepared.name, recordEvent, projectCaseCounters, projectCaseFailures, resolveCliRuntimeConcurrency(options), vitestCompatReporter);
1718
2333
  },
1719
2334
  onTaskEnd(task, state) {
1720
2335
  settledTaskIds.add(task.id);
@@ -1807,14 +2422,6 @@ async function executePreparedProject(prepared, identity, cacheProjectName, repo
1807
2422
  };
1808
2423
  }
1809
2424
  }
1810
- async function writeRunReportArtifacts(output, events, identity, reportOut) {
1811
- const projectId = deriveReportProjectId(output);
1812
- const reportDirectory = resolve(reportOut, identity.workspaceId, projectId, identity.experimentId, identity.attemptId, identity.runId);
1813
- await mkdir(reportDirectory, { recursive: true });
1814
- await writeFile(resolve(reportDirectory, "run-summary.json"), `${JSON.stringify(output, null, 2)}\n`, "utf-8");
1815
- await writeFile(resolve(reportDirectory, "events.jsonl"), events.map((event) => JSON.stringify(event)).join("\n").concat(events.length > 0 ? "\n" : ""), "utf-8");
1816
- return reportDirectory;
1817
- }
1818
2425
  /**
1819
2426
  * Runs vieval orchestration from config and returns project-level summaries.
1820
2427
  *
@@ -1837,65 +2444,91 @@ async function runVievalCli(options = {}) {
1837
2444
  configFilePath: options.configFilePath,
1838
2445
  cwd: options.cwd
1839
2446
  });
2447
+ const telemetry = loadedConfig.reporting?.openTelemetry?.enabled === true ? createOpenTelemetryRuntime() : createNoopTelemetryRuntime();
2448
+ const onOpenTelemetryRunEnd = loadedConfig.reporting?.openTelemetry?.enabled === true ? loadedConfig.reporting.openTelemetry.onRunEnd : void 0;
1840
2449
  const restoreEnvironment = applyRunEnvironment(loadedConfig.env);
1841
2450
  const eventRecorder = createEventRecorder(identity);
1842
2451
  const reporter = createReporterWithEventCapture(createRunReporter(options.reporter), eventRecorder.record);
2452
+ let runError;
2453
+ let runEndError;
2454
+ let output;
1843
2455
  try {
1844
- const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
1845
- const workspaceScheduler = createSchedulerRuntime({ concurrency: { workspace: resolveWorkspaceConcurrency(loadedConfig, options) } });
1846
- const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
1847
- const executableProjects = preparedProjects.filter((project) => project.kind === "prepared").map((project) => project.prepared);
1848
- const totalTasks = preparedProjects.reduce((sum, project) => {
1849
- if (project.kind === "prepared") return sum + project.prepared.tasks.length;
1850
- return sum + project.summary.taskCount;
1851
- }, 0);
1852
- const skippedSummaryTasks = preparedProjects.reduce((sum, project) => {
1853
- if (project.kind === "summary") return sum + project.summary.taskCount;
1854
- return sum;
1855
- }, 0);
1856
- const reporterCounters = {
1857
- failedTasks: 0,
1858
- passedTasks: 0,
1859
- skippedTasks: 0
1860
- };
1861
- reporter.onRunStart({ totalTasks });
1862
- for (const project of executableProjects) for (const task of project.tasks) reporter.onTaskQueued(createTaskQueuePayload(task, project.name));
1863
- const projectSummaries = (await Promise.all(preparedProjects.map(async (preparedProject, index) => {
1864
- if (preparedProject.kind === "summary") return {
1865
- index,
1866
- summary: preparedProject.summary
2456
+ output = await telemetry.withSpan("vieval.run", {
2457
+ "vieval.attempt.id": identity.attemptId,
2458
+ "vieval.experiment.id": identity.experimentId,
2459
+ "vieval.run.id": identity.runId,
2460
+ "vieval.workspace.id": identity.workspaceId
2461
+ }, async () => {
2462
+ const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
2463
+ const workspaceScheduler = createSchedulerRuntime({ concurrency: { workspace: resolveWorkspaceConcurrency(loadedConfig, options) } });
2464
+ const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
2465
+ const executableProjects = preparedProjects.filter((project) => project.kind === "prepared").map((project) => project.prepared);
2466
+ const totalTasks = preparedProjects.reduce((sum, project) => {
2467
+ if (project.kind === "prepared") return sum + project.prepared.tasks.length;
2468
+ return sum + project.summary.taskCount;
2469
+ }, 0);
2470
+ const skippedSummaryTasks = preparedProjects.reduce((sum, project) => {
2471
+ if (project.kind === "summary") return sum + project.summary.taskCount;
2472
+ return sum;
2473
+ }, 0);
2474
+ const reporterCounters = {
2475
+ failedTasks: 0,
2476
+ passedTasks: 0,
2477
+ skippedTasks: 0
1867
2478
  };
1868
- return {
1869
- index,
1870
- summary: await workspaceScheduler.runCase({
1871
- experimentId: identity.experimentId,
1872
- projectName: preparedProject.prepared.name,
1873
- scope: "workspace",
1874
- workspaceId: identity.workspaceId
1875
- }, async () => executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, reporter, reporterCounters, eventRecorder.record, options))
2479
+ reporter.onRunStart({ totalTasks });
2480
+ for (const project of executableProjects) for (const task of project.tasks) reporter.onTaskQueued(createTaskQueuePayload(task, project.name));
2481
+ const projectSummaries = (await Promise.all(preparedProjects.map(async (preparedProject, index) => {
2482
+ if (preparedProject.kind === "summary") return {
2483
+ index,
2484
+ summary: preparedProject.summary
2485
+ };
2486
+ return {
2487
+ index,
2488
+ summary: await telemetry.withSpan("vieval.project", {
2489
+ "vieval.project.name": preparedProject.prepared.name,
2490
+ "vieval.run.id": identity.runId
2491
+ }, async () => await workspaceScheduler.runCase({
2492
+ experimentId: identity.experimentId,
2493
+ projectName: preparedProject.prepared.name,
2494
+ scope: "workspace",
2495
+ workspaceId: identity.workspaceId
2496
+ }, async () => executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, telemetry, reporter, reporterCounters, eventRecorder.record, options)))
2497
+ };
2498
+ }))).sort((left, right) => left.index - right.index).map((item) => item.summary);
2499
+ reporter.onRunEnd({
2500
+ failedTasks: reporterCounters.failedTasks,
2501
+ passedTasks: reporterCounters.passedTasks,
2502
+ skippedTasks: reporterCounters.skippedTasks + skippedSummaryTasks,
2503
+ totalTasks
2504
+ });
2505
+ const output = {
2506
+ attemptId: identity.attemptId,
2507
+ configFilePath: loadedConfig.configFilePath,
2508
+ experimentId: identity.experimentId,
2509
+ projects: projectSummaries,
2510
+ reportDirectory: null,
2511
+ runId: identity.runId,
2512
+ workspaceId: identity.workspaceId
1876
2513
  };
1877
- }))).sort((left, right) => left.index - right.index).map((item) => item.summary);
1878
- reporter.onRunEnd({
1879
- failedTasks: reporterCounters.failedTasks,
1880
- passedTasks: reporterCounters.passedTasks,
1881
- skippedTasks: reporterCounters.skippedTasks + skippedSummaryTasks,
1882
- totalTasks
2514
+ if (options.reportOut != null) output.reportDirectory = await writeRunReportArtifacts(output, eventRecorder.events, identity, options.reportOut);
2515
+ return output;
1883
2516
  });
1884
- const output = {
1885
- attemptId: identity.attemptId,
1886
- configFilePath: loadedConfig.configFilePath,
1887
- experimentId: identity.experimentId,
1888
- projects: projectSummaries,
1889
- reportDirectory: null,
1890
- runId: identity.runId,
1891
- workspaceId: identity.workspaceId
1892
- };
1893
- if (options.reportOut != null) output.reportDirectory = await writeRunReportArtifacts(output, eventRecorder.events, identity, options.reportOut);
1894
- return output;
2517
+ } catch (error) {
2518
+ runError = error;
1895
2519
  } finally {
2520
+ if (onOpenTelemetryRunEnd != null) try {
2521
+ await onOpenTelemetryRunEnd();
2522
+ } catch (error) {
2523
+ if (runError == null) runEndError = error;
2524
+ }
1896
2525
  reporter.dispose();
1897
2526
  restoreEnvironment();
1898
2527
  }
2528
+ if (runError != null) throw runError;
2529
+ if (runEndError != null) throw runEndError;
2530
+ if (output == null) throw new Error("Vieval run finished without output.");
2531
+ return output;
1899
2532
  }
1900
2533
  /**
1901
2534
  * Formats CLI run output as human-readable lines.
@@ -1964,8 +2597,7 @@ function formatVievalCliRunOutput(output) {
1964
2597
  }
1965
2598
  if (hasFailedCases) failedProjects += 1;
1966
2599
  else passedProjects += 1;
1967
- const hybridAverage = project.result?.overall.hybridAverage;
1968
- const hybridAverageLabel = hybridAverage == null ? "n/a" : String(hybridAverage);
2600
+ const hybridAverageLabel = formatHybridAverage(project.result?.overall.hybridAverage);
1969
2601
  const runCount = project.result?.overall.runCount ?? 0;
1970
2602
  const countLabel = colors.dim(`(${project.taskCount} tasks)`);
1971
2603
  const caseSummaryLabel = project.caseSummary == null ? "" : `, cases ${project.caseSummary.passed} passed | ${project.caseSummary.failed} failed | ${project.caseSummary.timeout} timeout`;
@@ -2008,14 +2640,14 @@ const compareHelpText = `
2008
2640
  --output Optional output artifact path
2009
2641
  --format Console output format: table | json (default: table)
2010
2642
  `;
2011
- function normalizeCliArgv$4(argv) {
2643
+ function normalizeCliArgv$6(argv) {
2012
2644
  const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
2013
2645
  if (normalizedArgv[0] === "compare") return normalizedArgv.slice(1);
2014
2646
  return normalizedArgv;
2015
2647
  }
2016
2648
  function parseCompareCliArguments(argv) {
2017
2649
  const cli = meow(compareHelpText, {
2018
- argv: normalizeCliArgv$4(argv),
2650
+ argv: normalizeCliArgv$6(argv),
2019
2651
  flags: {
2020
2652
  config: { type: "string" },
2021
2653
  comparison: { type: "string" },
@@ -2120,7 +2752,7 @@ const evalRunHelpText = `
2120
2752
  --report-out Report output root directory
2121
2753
  --json Print machine-readable JSON output
2122
2754
  `;
2123
- function normalizeCliArgv$3(argv) {
2755
+ function normalizeCliArgv$5(argv) {
2124
2756
  const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
2125
2757
  return normalizedArgv[0] === "run" ? normalizedArgv.slice(1) : normalizedArgv;
2126
2758
  }
@@ -2143,7 +2775,7 @@ function normalizeProjectNames(projectNames) {
2143
2775
  */
2144
2776
  function parseCliArguments(argv) {
2145
2777
  const cli = meow(evalRunHelpText, {
2146
- argv: normalizeCliArgv$3(argv),
2778
+ argv: normalizeCliArgv$5(argv),
2147
2779
  importMeta: import.meta,
2148
2780
  flags: {
2149
2781
  config: { type: "string" },
@@ -2239,89 +2871,6 @@ async function runEvalRunCli(argv) {
2239
2871
  }
2240
2872
  }
2241
2873
  //#endregion
2242
- //#region src/cli/report-artifacts.ts
2243
- /**
2244
- * Resolves one or more `run-summary.json` paths from a report location.
2245
- *
2246
- * Use when:
2247
- * - callers may pass a run directory, summary file path, or a report root
2248
- *
2249
- * Returns:
2250
- * - sorted absolute summary file paths
2251
- */
2252
- async function resolveRunSummaryPaths(reportPath) {
2253
- const absoluteReportPath = resolve(reportPath);
2254
- const directSummaryPath = resolve(absoluteReportPath, "run-summary.json");
2255
- if (existsSync(absoluteReportPath) && absoluteReportPath.endsWith(".json")) return [absoluteReportPath];
2256
- if (existsSync(directSummaryPath)) return [directSummaryPath];
2257
- return (await glob("**/run-summary.json", {
2258
- absolute: true,
2259
- cwd: absoluteReportPath
2260
- })).sort((left, right) => left.localeCompare(right));
2261
- }
2262
- /**
2263
- * Reads one run report artifact set from `run-summary.json` and sibling `events.jsonl`.
2264
- *
2265
- * Use when:
2266
- * - report analysis needs both run aggregate output and event count metadata
2267
- */
2268
- function readReportRunArtifact(summaryFilePath) {
2269
- const reportDirectory = resolve(summaryFilePath, "..");
2270
- const summary = JSON.parse(readFileSync(summaryFilePath, "utf-8"));
2271
- const eventsFilePath = resolve(reportDirectory, "events.jsonl");
2272
- const events = existsSync(eventsFilePath) ? readFileSync(eventsFilePath, "utf-8").split("\n").filter((line) => line.trim().length > 0).map((line) => {
2273
- const event = JSON.parse(line);
2274
- return {
2275
- caseId: event.caseId,
2276
- data: event.data,
2277
- event: event.event,
2278
- taskId: event.taskId
2279
- };
2280
- }) : [];
2281
- return {
2282
- events,
2283
- eventsCount: events.length,
2284
- reportDirectory,
2285
- summary,
2286
- summaryFilePath
2287
- };
2288
- }
2289
- /**
2290
- * Reads all run artifacts found under `reportPath`.
2291
- *
2292
- * Use when:
2293
- * - callers need multi-run analysis from a directory root
2294
- */
2295
- async function readReportArtifacts(reportPath) {
2296
- return (await resolveRunSummaryPaths(reportPath)).map((summaryFilePath) => readReportRunArtifact(summaryFilePath));
2297
- }
2298
- /**
2299
- * Creates a compact summary row for one run artifact.
2300
- *
2301
- * Use when:
2302
- * - table/csv/jsonl exports should stay stable and cheap to parse
2303
- */
2304
- function summarizeReportRunArtifact(artifact) {
2305
- const totalProjects = artifact.summary.projects.length;
2306
- const failedProjects = artifact.summary.projects.filter((project) => project.errorMessage != null).length;
2307
- const executedProjects = artifact.summary.projects.filter((project) => project.executed).length;
2308
- const totalTasks = artifact.summary.projects.reduce((sum, project) => sum + project.taskCount, 0);
2309
- const projectNames = artifact.summary.projects.map((project) => project.name);
2310
- return {
2311
- attemptId: artifact.summary.attemptId ?? null,
2312
- eventsCount: artifact.eventsCount,
2313
- executedProjects,
2314
- experimentId: artifact.summary.experimentId ?? null,
2315
- failedProjects,
2316
- projectNames,
2317
- reportDirectory: artifact.reportDirectory,
2318
- runId: artifact.summary.runId ?? null,
2319
- totalProjects,
2320
- totalTasks,
2321
- workspaceId: artifact.summary.workspaceId ?? null
2322
- };
2323
- }
2324
- //#endregion
2325
2874
  //#region src/cli/report-analyze.ts
2326
2875
  const reportAnalyzeHelpText = `
2327
2876
  Analyze generated vieval report artifacts.
@@ -2343,7 +2892,7 @@ const reportAnalyzeHelpText = `
2343
2892
  --run-matrix Keep runs matching run-matrix selector "key=value[,key=value]"
2344
2893
  --eval-matrix Keep runs matching eval-matrix selector "key=value[,key=value]"
2345
2894
  `;
2346
- function normalizeCliArgv$2(argv) {
2895
+ function normalizeCliArgv$4(argv) {
2347
2896
  const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
2348
2897
  if (normalizedArgv[0] === "report" && normalizedArgv[1] === "analyze") return normalizedArgv.slice(2);
2349
2898
  if (normalizedArgv[0] === "analyze") return normalizedArgv.slice(1);
@@ -2351,7 +2900,7 @@ function normalizeCliArgv$2(argv) {
2351
2900
  }
2352
2901
  function parseReportAnalyzeCliArguments(argv) {
2353
2902
  const cli = meow(reportAnalyzeHelpText, {
2354
- argv: normalizeCliArgv$2(argv),
2903
+ argv: normalizeCliArgv$4(argv),
2355
2904
  flags: {
2356
2905
  attempt: { type: "string" },
2357
2906
  caseState: { type: "string" },
@@ -2640,6 +3189,473 @@ async function runReportAnalyzeCli(argv) {
2640
3189
  }
2641
3190
  }
2642
3191
  //#endregion
3192
+ //#region src/cli/report-cases.ts
3193
+ const reportCasesHelpText = `
3194
+ Inspect normalized case records from generated vieval report artifacts.
3195
+
3196
+ Usage
3197
+ $ vieval report cases <reportPath> [options]
3198
+
3199
+ Options
3200
+ --format Output format: table | json | jsonl (default: table)
3201
+ --where Equality filter "key=value"; repeatable
3202
+ --group-by Case field, score name, or metric name used for grouped score summaries
3203
+ `;
/**
 * Loads every case record stored in the `cases.jsonl` files of a report location.
 *
 * Use when:
 * - CLI tools need case-level inspection from local report artifacts
 * - callers may pass a run directory, a `cases.jsonl` file, or a report root
 *
 * Expects:
 * - every non-blank line of each discovered file is one JSON document
 *
 * Returns:
 * - parsed records, file by file in the order `resolveCaseRecordPaths` yields
 *   them, and line by line within each file
 *
 * Throws:
 * - when no `cases.jsonl` file is found
 * - when a line is not valid JSON; the parser error is attached as `cause`
 */
async function readCaseRecordsFromReport(reportPath) {
	const caseFilePaths = await resolveCaseRecordPaths(reportPath);
	if (caseFilePaths.length === 0) throw new Error(`No cases.jsonl files found under "${resolve(reportPath)}".`);
	const records = [];
	for (const caseFilePath of caseFilePaths) {
		const lines = readFileSync(caseFilePath, "utf-8").split("\n");
		for (const [index, line] of lines.entries()) {
			// Trimming also strips a trailing "\r", so CRLF files parse cleanly.
			const trimmed = line.trim();
			if (trimmed.length === 0) continue;
			try {
				records.push(JSON.parse(trimmed));
			} catch (error) {
				// Keep the parser error reachable for debugging instead of flattening it into text only.
				throw new Error(`Invalid cases.jsonl line ${index + 1} in "${caseFilePath}": ${errorMessageFrom(error) ?? "Unknown JSON parse failure."}`, { cause: error });
			}
		}
	}
	return records;
}
/**
 * Produces the data behind `vieval report cases`: the records that survive
 * the `where` filters and, optionally, per-group score summaries.
 *
 * Use when:
 * - the CLI needs deterministic JSON/table output
 * - tests want filtering and grouping without touching process I/O
 *
 * Expects:
 * - each `options.where` entry is a `key=value` selector
 * - `options.groupBy`, when set, is a selector key understood by `getCaseSelectorValue`
 *
 * Returns:
 * - `{ groups, records }`; `groups` is `undefined` when no `groupBy` was given
 */
function buildReportCasesOutput(records, options) {
	const selectors = options.where ?? [];
	const filters = selectors.map((selector) => parseSelector(selector));
	const kept = records.filter((record) => matchesWhereFilters(record, filters));
	let groups;
	if (options.groupBy != null) groups = buildCaseGroups(kept, options.groupBy);
	return {
		groups,
		records: kept.slice()
	};
}
/**
 * Entry point for the `vieval report cases` command.
 *
 * Call stack:
 *
 * published executable (`../bin/vieval`)
 *   -> {@link import('./index').runTopLevelCli}
 *     -> {@link runReportCasesCli}
 *       -> {@link readCaseRecordsFromReport}
 *
 * Use when:
 * - the top-level CLI dispatches local case artifact inspection
 *
 * Expects:
 * - argv is either `cases <reportPath> ...` or `<reportPath> ...`
 *
 * Returns:
 * - resolves once the output has been written to stdout; on failure the
 *   message goes to stderr and `process.exitCode` is set to 1
 */
async function runReportCasesCli(argv) {
	try {
		const parsed = parseReportCasesCliArguments(argv);
		const records = await readCaseRecordsFromReport(parsed.reportPath);
		const output = buildReportCasesOutput(records, parsed);
		let rendered;
		switch (parsed.format) {
			case "json":
				rendered = `${JSON.stringify(output, null, 2)}\n`;
				break;
			case "jsonl":
				rendered = encodeJsonl(output.records);
				break;
			default:
				rendered = `${formatCasesTable(output)}\n`;
		}
		process.stdout.write(rendered);
	} catch (error) {
		const reason = errorMessageFrom(error) ?? "Unknown report cases failure.";
		process.stderr.write(`[vieval report cases] ${reason}\n`);
		process.exitCode = 1;
	}
}
/**
 * Strips a leading `--` and the `report cases` / `cases` command words so
 * only the command's own arguments remain. Always returns a new array.
 */
function normalizeCliArgv$3(argv) {
	const args = argv[0] === "--" ? argv.slice(1) : [...argv];
	const [first, second] = args;
	if (first === "report" && second === "cases") return args.slice(2);
	return first === "cases" ? args.slice(1) : args;
}
/**
 * Parses `vieval report cases` arguments with meow.
 *
 * Returns:
 * - `{ format, groupBy, reportPath, where }` with `format` normalized by
 *   `normalizeReportCasesFormat`
 *
 * Throws:
 * - when the positional `<reportPath>` is missing or empty
 */
function parseReportCasesCliArguments(argv) {
	const flagDefinitions = {
		format: {
			default: "table",
			type: "string"
		},
		groupBy: { type: "string" },
		where: {
			isMultiple: true,
			type: "string"
		}
	};
	const { flags, input } = meow(reportCasesHelpText, {
		argv: normalizeCliArgv$3(argv),
		flags: flagDefinitions,
		importMeta: import.meta
	});
	const reportPath = input[0];
	const hasReportPath = reportPath != null && reportPath.length !== 0;
	if (!hasReportPath) throw new Error("Missing required <reportPath> argument.");
	return {
		format: normalizeReportCasesFormat(flags.format),
		groupBy: flags.groupBy,
		reportPath,
		where: flags.where
	};
}
/**
 * Maps a `--format` value to one of `json`, `jsonl`, or `table`.
 * Matching ignores case; any unrecognized value falls back to `table`.
 */
function normalizeReportCasesFormat(value) {
	const lowered = value.toLowerCase();
	return lowered === "json" || lowered === "jsonl" ? lowered : "table";
}
/**
 * Locates the `cases.jsonl` file(s) for a report location.
 *
 * Resolution order:
 * 1. the path itself when it exists and ends with `.jsonl`
 * 2. `<path>/cases.jsonl` when that file exists
 * 3. every `cases.jsonl` below the path, sorted with `localeCompare`
 */
async function resolveCaseRecordPaths(reportPath) {
	const reportRoot = resolve(reportPath);
	if (existsSync(reportRoot) && reportRoot.endsWith(".jsonl")) return [reportRoot];
	const nestedCaseFile = resolve(reportRoot, "cases.jsonl");
	if (existsSync(nestedCaseFile)) return [nestedCaseFile];
	const discovered = await glob("**/cases.jsonl", {
		absolute: true,
		cwd: reportRoot
	});
	return discovered.sort((left, right) => left.localeCompare(right));
}
/**
 * Tells whether a record satisfies every parsed `key=value` filter.
 * A filter matches when the selector resolves on the record and the
 * stringified value equals the filter value exactly.
 */
function matchesWhereFilters(record, whereFilters) {
	for (const { key, value } of whereFilters) {
		const resolved = getCaseSelectorValue(record, key);
		if (!resolved.exists) return false;
		if (String(resolved.value) !== value) return false;
	}
	return true;
}
/**
 * Splits a `key=value` selector at its first `=`.
 *
 * Expects:
 * - a non-blank key before the first `=` and a non-blank value after it;
 *   the value itself may contain further `=` characters
 *
 * Returns:
 * - `{ key, value }`, both trimmed
 *
 * Throws:
 * - when there is no `=`, or when key or value is empty after trimming
 */
function parseSelector(selector) {
	const separatorIndex = selector.indexOf("=");
	const key = separatorIndex < 0 ? "" : selector.slice(0, separatorIndex).trim();
	const value = separatorIndex < 0 ? "" : selector.slice(separatorIndex + 1).trim();
	// Validate after trimming so whitespace-only parts (" =x", "k= ") are rejected just like "=x" and "k=".
	if (key.length === 0 || value.length === 0) throw new Error(`Invalid selector "${selector}". Expected "key=value".`);
	return {
		key,
		value
	};
}
/**
 * Buckets records by the value that `groupBy` resolves to and summarizes
 * each bucket's scores.
 *
 * Records on which the selector does not resolve are left out. Group keys
 * have the form `<groupBy>=<value>` and come back sorted with `localeCompare`.
 */
function buildCaseGroups(records, groupBy) {
	const buckets = {};
	for (const record of records) {
		const resolved = getCaseSelectorValue(record, groupBy);
		if (!resolved.exists) continue;
		const label = `${groupBy}=${String(resolved.value)}`;
		const bucket = buckets[label] ??= {
			count: 0,
			scores: {}
		};
		bucket.count += 1;
		addScores(bucket.scores, record.scores);
	}
	const orderedLabels = Object.keys(buckets).sort((left, right) => left.localeCompare(right));
	return Object.fromEntries(orderedLabels.map((label) => [label, {
		count: buckets[label].count,
		scores: finalizeScores(buckets[label].scores)
	}]));
}
/**
 * Accumulates one record's scores into a running per-score summary.
 *
 * Expects:
 * - `summary` maps score names to `{ average, count, sum }` buckets and is
 *   mutated in place; `average` is left at 0 here
 * - `scores` maps score names to values; only values of type `number` are
 *   counted, matching the `typeof` filter in `averageScore`
 *
 * Notes:
 * - buckets are created as own properties, so a score literally named
 *   `__proto__` (possible in parsed JSON) never touches `Object.prototype`
 */
function addScores(summary, scores) {
	for (const [scoreName, value] of Object.entries(scores)) {
		// A string or null score would turn `sum` into a string or skew the count.
		if (typeof value !== "number") continue;
		if (!Object.hasOwn(summary, scoreName) || summary[scoreName] == null) Object.defineProperty(summary, scoreName, {
			configurable: true,
			enumerable: true,
			value: {
				average: 0,
				count: 0,
				sum: 0
			},
			writable: true
		});
		const bucket = summary[scoreName];
		bucket.count += 1;
		bucket.sum += value;
	}
}
/**
 * Turns accumulated score buckets into their final form: score names sorted
 * with `localeCompare`, and `average` computed as `sum / count` (0 for an
 * empty bucket).
 */
function finalizeScores(summary) {
	const orderedNames = Object.keys(summary).sort((left, right) => left.localeCompare(right));
	return Object.fromEntries(orderedNames.map((scoreName) => {
		const { count, sum } = summary[scoreName];
		return [scoreName, {
			average: count === 0 ? 0 : sum / count,
			count,
			sum
		}];
	}));
}
/**
 * Renders `vieval report cases` output as plain text: a header, the record
 * count, and, when groups are present, one line per group with its count
 * and score averages to three decimals.
 */
function formatCasesTable(output) {
	const lines = ["CASES vieval report", `Case count ${output.records.length}`];
	if (output.groups == null) return lines.join("\n");
	lines.push("Groups");
	for (const [groupKey, group] of Object.entries(output.groups)) {
		const parts = [`${groupKey} count=${group.count}`];
		for (const [scoreName, bucket] of Object.entries(group.scores)) parts.push(`${scoreName}=${bucket.average.toFixed(3)}`);
		lines.push(parts.join(" "));
	}
	return lines.join("\n");
}
3403
+ //#endregion
3404
+ //#region src/cli/report-case-compare.ts
3405
+ const reportCompareHelpText = `
3406
+ Compare normalized case records from two generated vieval reports.
3407
+
3408
+ Usage
3409
+ $ vieval report compare <leftReportPath> <rightReportPath> [options]
3410
+
3411
+ Options
3412
+ --format Output format: table | json (default: table)
3413
+ --case-key Case field, score name, or metric name used to match records
3414
+ --score-kind Score kind used for deltas (default: exact)
3415
+ --group-by Case field, score name, or metric name used for grouped deltas
3416
+ `;
/**
 * Builds a generic case-level comparison between two report runs.
 *
 * Use when:
 * - local report analysis needs per-case improvements/regressions
 * - benchmark-specific facets should stay as generic metric keys
 *
 * Expects:
 * - `args.left` / `args.right` are arrays of case records with `scores` and `metrics`
 * - `args.scoreKind` names the score to compare (default `"exact"`); a record
 *   without that score contributes 0 to its per-case delta
 *
 * Returns:
 * - `cases`: matched pairs sorted by case key, each with score delta and changed metrics
 * - `added` / `removed`: records present on only one side, sorted by `caseId`
 * - `overall`: average score on each side over all records, plus the difference
 * - `topImprovements` / `topRegressions`: up to 10 rows each, largest change
 *   first, ties broken by ascending case key
 * - `groups`: per-group averages when `args.groupBy` is set, otherwise `undefined`
 *
 * Throws:
 * - whatever `indexRecordsByCaseKey` throws (duplicate or missing case keys)
 */
function buildCaseComparison(args) {
	const scoreKind = args.scoreKind ?? "exact";
	const leftByKey = indexRecordsByCaseKey(args.left, args.caseKey, "left");
	const rightByKey = indexRecordsByCaseKey(args.right, args.caseKey, "right");
	const cases = [];
	const added = [];
	const removed = [];
	for (const [caseKey, leftRecord] of leftByKey) {
		const rightRecord = rightByKey.get(caseKey);
		if (rightRecord == null) {
			removed.push(leftRecord);
			continue;
		}
		const leftScore = getScore(leftRecord, scoreKind);
		const rightScore = getScore(rightRecord, scoreKind);
		cases.push({
			caseKey,
			delta: {
				left: leftScore,
				right: rightScore,
				score: rightScore - leftScore
			},
			left: leftRecord,
			metricsChanged: diffMetrics(leftRecord.metrics, rightRecord.metrics),
			right: rightRecord
		});
	}
	for (const [caseKey, rightRecord] of rightByKey) if (!leftByKey.has(caseKey)) added.push(rightRecord);
	const byCaseKey = (left, right) => left.caseKey.localeCompare(right.caseKey);
	// Sort once up front; grouping below then sees rows in case-key order.
	cases.sort(byCaseKey);
	// Each ranking gets its own comparator so ties always resolve by ascending case key.
	const improvements = cases.filter((row) => row.delta.score > 0).sort((left, right) => right.delta.score - left.delta.score || byCaseKey(left, right));
	const regressions = cases.filter((row) => row.delta.score < 0).sort((left, right) => left.delta.score - right.delta.score || byCaseKey(left, right));
	const leftAverage = averageScore(args.left, scoreKind);
	const rightAverage = averageScore(args.right, scoreKind);
	return {
		added: added.sort(compareCaseRecords),
		cases,
		groups: args.groupBy == null ? void 0 : buildComparisonGroups(cases, args.groupBy),
		overall: {
			delta: rightAverage - leftAverage,
			leftAverage,
			rightAverage
		},
		removed: removed.sort(compareCaseRecords),
		topImprovements: improvements.slice(0, 10),
		topRegressions: regressions.slice(0, 10)
	};
}
/**
 * Entry point for the `vieval report compare` command.
 *
 * Call stack:
 *
 * published executable (`../bin/vieval`)
 *   -> {@link import('./index').runTopLevelCli}
 *     -> {@link runReportCompareCli}
 *       -> {@link readCaseRecordsFromReport}
 *       -> {@link buildCaseComparison}
 *
 * Use when:
 * - two local report artifact directories should be compared case-by-case
 *
 * Expects:
 * - argv is either `compare <left> <right> ...` or `<left> <right> ...`
 *
 * Returns:
 * - resolves once the comparison has been written to stdout; on failure the
 *   message goes to stderr and `process.exitCode` is set to 1
 */
async function runReportCompareCli(argv) {
	try {
		const { caseKey, format, groupBy, leftReportPath, rightReportPath, scoreKind } = parseReportCompareCliArguments(argv);
		// Both reports are loaded concurrently; either failure aborts the command.
		const [left, right] = await Promise.all([readCaseRecordsFromReport(leftReportPath), readCaseRecordsFromReport(rightReportPath)]);
		const output = buildCaseComparison({
			caseKey,
			groupBy,
			left,
			right,
			scoreKind
		});
		const rendered = format === "json" ? JSON.stringify(output, null, 2) : formatCaseComparisonTable(output);
		process.stdout.write(`${rendered}\n`);
	} catch (error) {
		const reason = errorMessageFrom(error) ?? "Unknown report compare failure.";
		process.stderr.write(`[vieval report compare] ${reason}\n`);
		process.exitCode = 1;
	}
}
/**
 * Strips a leading `--` and the `report compare` / `compare` command words
 * so only the command's own arguments remain. Always returns a new array.
 */
function normalizeCliArgv$2(argv) {
	const args = argv[0] === "--" ? argv.slice(1) : [...argv];
	const [first, second] = args;
	if (first === "report" && second === "compare") return args.slice(2);
	return first === "compare" ? args.slice(1) : args;
}
/**
 * Parses `vieval report compare` arguments with meow.
 *
 * Returns:
 * - `{ caseKey, format, groupBy, leftReportPath, rightReportPath, scoreKind }`;
 *   `format` is `"json"` when the flag equals `json` in any letter case,
 *   otherwise `"table"`
 *
 * Throws:
 * - when either positional report path is missing or empty
 */
function parseReportCompareCliArguments(argv) {
	const cli = meow(reportCompareHelpText, {
		argv: normalizeCliArgv$2(argv),
		flags: {
			caseKey: { type: "string" },
			format: {
				default: "table",
				type: "string"
			},
			groupBy: { type: "string" },
			scoreKind: {
				default: "exact",
				type: "string"
			}
		},
		importMeta: import.meta
	});
	const leftReportPath = cli.input[0];
	const rightReportPath = cli.input[1];
	if (leftReportPath == null || leftReportPath.length === 0 || rightReportPath == null || rightReportPath.length === 0) throw new Error("Missing required <leftReportPath> and <rightReportPath> arguments.");
	return {
		caseKey: cli.flags.caseKey,
		// Compare case-insensitively, as `report cases` does, so `--format JSON` is honored.
		format: cli.flags.format.toLowerCase() === "json" ? "json" : "table",
		groupBy: cli.flags.groupBy,
		leftReportPath,
		rightReportPath,
		scoreKind: cli.flags.scoreKind
	};
}
/**
 * Builds a `Map` from resolved case key to record, preserving input order.
 *
 * Throws:
 * - when two records on the same side resolve to the same key; `side` is
 *   only used to label that error
 */
function indexRecordsByCaseKey(records, caseKey, side) {
	return records.reduce((indexed, record) => {
		const key = resolveCaseKey(record, caseKey);
		if (indexed.has(key)) throw new Error(`Duplicate case key "${key}" in ${side} report.`);
		return indexed.set(key, record);
	}, /* @__PURE__ */ new Map());
}
/**
 * Determines the key used to match a record across two reports.
 *
 * With an explicit `caseKey` the selector must resolve on the record,
 * otherwise an error is thrown. Without one, the first selector that
 * resolves among `benchmark.case.id` and `vieval.case.id` is used, falling
 * back to `record.caseId`.
 */
function resolveCaseKey(record, caseKey) {
	if (caseKey == null) {
		for (const fallbackKey of ["benchmark.case.id", "vieval.case.id"]) {
			const candidate = getCaseSelectorValue(record, fallbackKey);
			if (candidate.exists) return String(candidate.value);
		}
		return record.caseId;
	}
	const explicit = getCaseSelectorValue(record, caseKey);
	if (!explicit.exists) throw new Error(`Missing explicit case key "${caseKey}" for case "${record.caseId}".`);
	return String(explicit.value);
}
/**
 * Reads one score from a record, substituting 0 when the score is missing
 * (`null` or `undefined`).
 */
function getScore(record, scoreKind) {
	const score = record.scores[scoreKind];
	return score == null ? 0 : score;
}
/**
 * Averages one score kind over the records that carry it as a number.
 * Records without a numeric value for `scoreKind` are ignored; the result
 * is 0 when no record qualifies.
 */
function averageScore(records, scoreKind) {
	let total = 0;
	let counted = 0;
	for (const record of records) {
		const value = record.scores[scoreKind];
		if (typeof value !== "number") continue;
		total += value;
		counted += 1;
	}
	return counted === 0 ? 0 : total / counted;
}
/**
 * Lists the metrics whose values differ between two records.
 *
 * Expects:
 * - `left` and `right` are plain metric maps; values are compared through
 *   `stableStringify`
 *
 * Returns:
 * - an object keyed by metric name (sorted with `localeCompare`) holding
 *   `{ left, right }` for every metric that changed; a metric missing on one
 *   side is reported with `undefined` for that side
 *
 * Notes:
 * - only own properties are read, so keys such as `constructor` or
 *   `__proto__` present on one side are not compared against inherited members
 * - the result is assembled with `Object.fromEntries`, so a metric named
 *   `__proto__` becomes a regular entry instead of replacing the prototype
 */
function diffMetrics(left, right) {
	const ownValue = (metrics, metricKey) => Object.hasOwn(metrics, metricKey) ? metrics[metricKey] : void 0;
	const metricKeys = [...new Set([...Object.keys(left), ...Object.keys(right)])].sort((leftKey, rightKey) => leftKey.localeCompare(rightKey));
	const changedEntries = [];
	for (const metricKey of metricKeys) {
		const leftValue = ownValue(left, metricKey);
		const rightValue = ownValue(right, metricKey);
		if (stableStringify(leftValue) !== stableStringify(rightValue)) changedEntries.push([metricKey, {
			left: leftValue,
			right: rightValue
		}]);
	}
	return Object.fromEntries(changedEntries);
}
/**
 * Summarizes matched comparison rows per group.
 *
 * The group of a row is taken from its right-hand record; rows on which
 * `groupBy` does not resolve are skipped. Each group reports its row count,
 * the average left and right score, and their difference. Group keys have
 * the form `<groupBy>=<value>` and are sorted with `localeCompare`.
 */
function buildComparisonGroups(cases, groupBy) {
	const rowsByGroup = {};
	for (const row of cases) {
		const resolved = getCaseSelectorValue(row.right, groupBy);
		if (resolved.exists) {
			const label = `${groupBy}=${String(resolved.value)}`;
			(rowsByGroup[label] ??= []).push(row);
		}
	}
	const averageOf = (rows, side) => rows.reduce((sum, row) => sum + row.delta[side], 0) / rows.length;
	const orderedLabels = Object.keys(rowsByGroup).sort((left, right) => left.localeCompare(right));
	return Object.fromEntries(orderedLabels.map((label) => {
		const rows = rowsByGroup[label];
		const leftAverage = averageOf(rows, "left");
		const rightAverage = averageOf(rows, "right");
		return [label, {
			count: rows.length,
			delta: rightAverage - leftAverage,
			leftAverage,
			rightAverage
		}];
	}));
}
/**
 * Sort comparator ordering case records by `caseId` via `localeCompare`.
 */
function compareCaseRecords(left, right) {
	const { caseId: leftId } = left;
	const { caseId: rightId } = right;
	return leftId.localeCompare(rightId);
}
/**
 * Renders a case comparison as compact terminal text.
 *
 * Use when:
 * - `vieval report compare` should expose the same information as JSON output
 * - users need a terminal-first overview of group and per-case deltas
 *
 * Expects:
 * - `output` has the shape produced by {@link buildCaseComparison}
 *
 * Returns:
 * - newline-joined text: counts and overall scores first, then the optional
 *   sections (groups, top improvements, top regressions, cases, added and
 *   removed case ids), each emitted only when it has content
 */
function formatCaseComparisonTable(output) {
	const fixed = (value) => value.toFixed(3);
	const describeChange = (row) => `${row.caseKey} delta=${fixed(row.delta.score)} left=${fixed(row.delta.left)} right=${fixed(row.delta.right)}`;
	const { added, cases, groups, overall, removed, topImprovements, topRegressions } = output;
	const lines = [];
	lines.push("COMPARE vieval report cases");
	lines.push(`Matched ${cases.length}`);
	lines.push(`Added ${added.length}`);
	lines.push(`Removed ${removed.length}`);
	lines.push(`Scores left=${fixed(overall.leftAverage)} right=${fixed(overall.rightAverage)} delta=${fixed(overall.delta)}`);
	if (groups != null && Object.keys(groups).length > 0) {
		lines.push("Groups");
		for (const [groupKey, group] of Object.entries(groups)) lines.push(`${groupKey} count=${group.count} left=${fixed(group.leftAverage)} right=${fixed(group.rightAverage)} delta=${fixed(group.delta)}`);
	}
	if (topImprovements.length > 0) lines.push("Top improvements", ...topImprovements.map(describeChange));
	if (topRegressions.length > 0) lines.push("Top regressions", ...topRegressions.map(describeChange));
	if (cases.length > 0) {
		lines.push("Cases");
		for (const row of cases) {
			const changedNames = Object.keys(row.metricsChanged);
			const changedLabel = changedNames.length === 0 ? "none" : changedNames.join(",");
			lines.push(`${row.caseKey} delta=${fixed(row.delta.score)} changedMetrics=${changedLabel}`);
		}
	}
	if (added.length > 0) lines.push(`Added cases ${added.map((record) => record.caseId).join(",")}`);
	if (removed.length > 0) lines.push(`Removed cases ${removed.map((record) => record.caseId).join(",")}`);
	return lines.join("\n");
}
3658
+ //#endregion
2643
3659
  //#region src/cli/report-index.ts
2644
3660
  const reportIndexHelpText = `
2645
3661
  Build report indexes from generated vieval artifacts.
@@ -2807,7 +3823,15 @@ async function runTopLevelCli(argv) {
2807
3823
  await runReportIndexCli(parsed.commandArgv);
2808
3824
  return;
2809
3825
  }
2810
- throw new Error(`Unsupported vieval report command "${reportSubcommand ?? "(none)"}". Expected "analyze" or "index".`);
3826
+ if (reportSubcommand === "cases") {
3827
+ await runReportCasesCli(parsed.commandArgv);
3828
+ return;
3829
+ }
3830
+ if (reportSubcommand === "compare") {
3831
+ await runReportCompareCli(parsed.commandArgv);
3832
+ return;
3833
+ }
3834
+ throw new Error(`Unsupported vieval report command "${reportSubcommand ?? "(none)"}". Expected "analyze", "index", "cases", or "compare".`);
2811
3835
  }
2812
3836
  if (parsed.command === "compare") {
2813
3837
  await runCompareCliOrExit(parsed.commandArgv);
@@ -2818,4 +3842,4 @@ async function runTopLevelCli(argv) {
2818
3842
  //#endregion
2819
3843
  export { runTopLevelCli as n, parseTopLevelCliArguments as t };
2820
3844
 
2821
- //# sourceMappingURL=cli-sanbKtQq.mjs.map
3845
+ //# sourceMappingURL=cli-ImxGpoYQ.mjs.map