@m4trix/evals 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -56,7 +56,8 @@ var defaultRunnerConfig = {
56
56
  ],
57
57
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
58
58
  },
59
- artifactDirectory: ".eval-results"
59
+ artifactDirectory: ".eval-results",
60
+ maxConcurrency: 1
60
61
  };
61
62
  function toRunnerConfigOverrides(config) {
62
63
  if (!config) {
@@ -89,6 +90,9 @@ function toRunnerConfigOverrides(config) {
89
90
  if (config.artifactDirectory !== void 0) {
90
91
  overrides.artifactDirectory = config.artifactDirectory;
91
92
  }
93
+ if (config.maxConcurrency !== void 0) {
94
+ overrides.maxConcurrency = config.maxConcurrency;
95
+ }
92
96
  if (Object.keys(discovery).length > 0) {
93
97
  overrides.discovery = discovery;
94
98
  }
@@ -313,6 +317,7 @@ var Metric = {
313
317
  const def = {
314
318
  id: config.id,
315
319
  name: config.name,
320
+ aggregate: config.aggregate,
316
321
  format: config.format,
317
322
  make: (data) => ({ id: config.id, data })
318
323
  };
@@ -332,6 +337,7 @@ var Score = {
332
337
  id: config.id,
333
338
  name: config.name,
334
339
  displayStrategy: config.displayStrategy,
340
+ aggregate: config.aggregate,
335
341
  format: config.format,
336
342
  make: (data, options) => {
337
343
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
@@ -350,23 +356,62 @@ function getScoreById(id) {
350
356
  return registry2.get(id);
351
357
  }
352
358
 
359
+ // src/evals/aggregators.ts
360
/**
 * Averages the numeric `value` field across a list of score payloads.
 *
 * @param {Array<{ value: number }>} values - Score payloads to combine.
 * @returns {{ value: number }} The arithmetic mean of the `value` fields, or
 *   `{ value: 0 }` when the list is empty.
 */
function aggregateAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0 };
  }
  let total = 0;
  for (const entry of values) {
    total += entry.value;
  }
  return { value: total / count };
}
367
/**
 * Combines pass/fail payloads into a single verdict.
 *
 * @param {Array<{ passed: boolean }>} values - Pass/fail payloads to combine.
 * @returns {{ passed: boolean }} `passed` is true only when the list is
 *   non-empty and every entry has a truthy `passed`.
 */
function aggregateAll(values) {
  // An empty list is reported as not passed.
  if (values.length === 0) {
    return { passed: false };
  }
  const anyFailed = values.some((entry) => !entry.passed);
  return { passed: !anyFailed };
}
370
/**
 * Sums token-usage payloads field by field.
 *
 * @param {Array<{ input?: number, output?: number, inputCached?: number, outputCached?: number }>} values
 *   Token-usage payloads; a missing (null/undefined) field counts as 0.
 * @returns {{ input: number, output: number, inputCached: number, outputCached: number }}
 *   The per-field totals (all zeros for an empty list).
 */
function aggregateTokenCountSum(values) {
  const totals = {
    input: 0,
    output: 0,
    inputCached: 0,
    outputCached: 0
  };
  for (const usage of values) {
    totals.input += usage.input ?? 0;
    totals.output += usage.output ?? 0;
    totals.inputCached += usage.inputCached ?? 0;
    totals.outputCached += usage.outputCached ?? 0;
  }
  return totals;
}
387
/**
 * Averages the `ms` field across a list of latency payloads.
 *
 * @param {Array<{ ms: number }>} values - Latency payloads to combine.
 * @returns {{ ms: number }} The arithmetic mean of the `ms` fields, or
 *   `{ ms: 0 }` when the list is empty.
 */
function aggregateLatencyAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { ms: 0 };
  }
  let totalMs = 0;
  for (const sample of values) {
    totalMs += sample.ms;
  }
  return { ms: totalMs / count };
}
394
+
353
395
  // src/evals/metrics/standard.ts
354
396
  Metric.of({
355
397
  id: "token-count",
356
398
  name: "Tokens",
357
- format: (data) => {
399
+ aggregate: aggregateTokenCountSum,
400
+ format: (data, options) => {
358
401
  const input = data.input ?? 0;
359
402
  const output = data.output ?? 0;
360
403
  const inputCached = data.inputCached ?? 0;
361
404
  const outputCached = data.outputCached ?? 0;
362
405
  const cached = inputCached + outputCached;
363
- return `in:${input} out:${output} cached:${cached}`;
406
+ const base = `in:${input} out:${output} cached:${cached}`;
407
+ return options?.isAggregated ? `Total: ${base}` : base;
364
408
  }
365
409
  });
366
410
  Metric.of({
367
411
  id: "latency",
368
412
  name: "Latency",
369
- format: (data) => `${data.ms}ms`
413
+ aggregate: aggregateLatencyAverage,
414
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
370
415
  });
371
416
 
372
417
  // src/evals/scores/standard.ts
@@ -374,16 +419,36 @@ Score.of({
374
419
  id: "percent",
375
420
  name: "Score",
376
421
  displayStrategy: "bar",
377
- format: (data) => data.value.toFixed(2)
422
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
423
+ aggregate: aggregateAverage
378
424
  });
379
425
  Score.of({
380
426
  id: "binary",
381
427
  name: "Result",
382
428
  displayStrategy: "passFail",
383
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
429
+ format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
430
+ aggregate: aggregateAll
384
431
  });
385
432
 
386
433
  // src/runner/score-utils.ts
434
/**
 * Collapses several score items that share one score id into a single item.
 *
 * The score definition is looked up from the first item's id. When the
 * definition is missing or has no `aggregate` function, the last item is
 * returned unchanged; otherwise the first item is copied with its `data`
 * replaced by the aggregate of every item's `data`.
 *
 * @param {Array<{ id: string, data: unknown }>} items - Score items to merge.
 * @returns {object | undefined} The merged item, or `undefined` for an empty list.
 */
function aggregateScoreItems(items) {
  if (items.length === 0) {
    return void 0;
  }
  const first = items[0];
  const definition = getScoreById(first.id);
  if (!definition?.aggregate) {
    return items[items.length - 1];
  }
  const payloads = items.map((item) => item.data);
  return { ...first, data: definition.aggregate(payloads) };
}
443
/**
 * Collapses several metric items that share one metric id into a single item.
 *
 * The metric definition is looked up from the first item's id. When the
 * definition is missing or has no `aggregate` function, the last item is
 * returned unchanged; otherwise the first item is copied with its `data`
 * replaced by the aggregate of every item's `data`.
 *
 * @param {Array<{ id: string, data: unknown }>} items - Metric items to merge.
 * @returns {object | undefined} The merged item, or `undefined` for an empty list.
 */
function aggregateMetricItems(items) {
  if (items.length === 0) {
    return void 0;
  }
  const first = items[0];
  const definition = getMetricById(first.id);
  if (!definition?.aggregate) {
    return items[items.length - 1];
  }
  const payloads = items.map((item) => item.data);
  return { ...first, data: definition.aggregate(payloads) };
}
387
452
  function toNumericScoreFromScores(scores) {
388
453
  for (const item of scores) {
389
454
  const def = getScoreById(item.id);
@@ -462,6 +527,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
462
527
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
463
528
  );
464
529
  }
530
/**
 * Builds an Effect that evaluates one test case: every evaluator of the task
 * is run once per rerun, a `TestCaseProgress` event is published and queued
 * for persistence after each rerun, and the shared counters are updated.
 *
 * @param task - Run task; `runId`, `evaluators` and `snapshot.artifactPath` are read.
 * @param testCaseItem - `{ id, testCase }` entry being evaluated.
 * @param totalEvaluations - Value reported as `totalTestCases` in progress events.
 * @param publishEvent - Function returning an Effect that publishes an event.
 * @param persistenceQueue - Effect queue receiving `{ runId, artifactPath, payload }`.
 * @param updateSnapshot - `(runId, updater)` callback used to patch the run snapshot.
 * @param completedRef - Ref counter incremented once per finished rerun.
 * @param passedRef - Ref counter incremented once when every rerun passed.
 * @param failedRef - Ref counter incremented once when any rerun failed.
 * @returns An Effect that completes when all reruns of the test case are done.
 */
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
  return effect.Effect.gen(function* () {
    const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
    const rerunPassed = [];
    for (let r = 0; r < reruns; r++) {
      const started = Date.now();
      const evaluatorScores = [];
      let testCaseError;
      const output = readOutput(testCaseItem.testCase);
      for (const { id: evaluatorId, evaluator } of task.evaluators) {
        const evaluateFn = evaluator.getEvaluateFn();
        if (!evaluateFn) {
          continue;
        }
        const logs = [];
        const logDiff = (expected, actual, options) => {
          logs.push(createDiffLogEntry(expected, actual, options));
        };
        // The try/catch has to live INSIDE the promise thunk: a generator-level
        // try/catch around `yield*` does not observe Effect failures, and a
        // rejected `Effect.promise` is a defect, so a throwing or rejecting
        // evaluator would abort the whole run instead of being recorded as a
        // failed evaluator.
        const outcome = yield* effect.Effect.promise(async () => {
          try {
            const ctx = await evaluator.resolveContext();
            const result = await evaluateFn({
              input: testCaseItem.testCase.getInput(),
              ctx,
              output,
              logDiff
            });
            const { scores, metrics } = normalizeResult(result);
            return {
              ok: true,
              scores,
              metrics,
              passed: computeEvaluatorPassed(evaluator, result, scores)
            };
          } catch (error) {
            return { ok: false, error };
          }
        });
        if (outcome.ok) {
          evaluatorScores.push({
            evaluatorId,
            scores: outcome.scores,
            passed: outcome.passed,
            metrics: outcome.metrics,
            logs: logs.length > 0 ? logs : void 0
          });
        } else {
          testCaseError = outcome.error instanceof Error ? outcome.error.message : "Evaluator execution failed";
          evaluatorScores.push({
            evaluatorId,
            scores: [],
            passed: false
          });
        }
      }
      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
      rerunPassed.push(rerunPassedThis);
      // Increment and read the shared counter in one step so concurrently
      // processed test cases each get a distinct value.
      const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
        n + 1,
        n + 1
      ]);
      const progressEvent = {
        type: "TestCaseProgress",
        runId: task.runId,
        testCaseId: testCaseItem.id,
        testCaseName: testCaseItem.testCase.getName(),
        completedTestCases: completedEvaluations,
        totalTestCases: totalEvaluations,
        rerunIndex: r + 1,
        rerunTotal: reruns,
        passed: rerunPassedThis,
        durationMs: Date.now() - started,
        evaluatorScores,
        output,
        errorMessage: testCaseError
      };
      updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        completedTestCases: completedEvaluations
      }));
      yield* publishEvent(progressEvent);
      yield* effect.Queue.offer(persistenceQueue, {
        runId: task.runId,
        artifactPath: task.snapshot.artifactPath,
        payload: progressEvent
      });
    }
    // A test case counts as passed only when every rerun passed.
    const testCasePassed = rerunPassed.every(Boolean);
    if (testCasePassed) {
      yield* effect.Ref.update(passedRef, (n) => n + 1);
    } else {
      yield* effect.Ref.update(failedRef, (n) => n + 1);
    }
    const [passed, failed] = yield* effect.Effect.all([
      effect.Ref.get(passedRef),
      effect.Ref.get(failedRef)
    ]);
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
465
629
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
466
630
  const startedAt = Date.now();
467
631
  updateSnapshot(task.runId, (snapshot) => ({
@@ -474,104 +638,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
474
638
  runId: task.runId,
475
639
  startedAt
476
640
  });
477
- let completedTestCases = 0;
478
- let passedTestCases = 0;
479
- let failedTestCases = 0;
480
- for (const testCaseItem of task.testCases) {
481
- const started = Date.now();
482
- const evaluatorScores = [];
483
- let testCaseError;
484
- const output = readOutput(testCaseItem.testCase);
485
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
486
- const evaluateFn = evaluator.getEvaluateFn();
487
- if (!evaluateFn) {
488
- continue;
489
- }
490
- try {
491
- const logs = [];
492
- const logDiff = (expected, actual, options) => {
493
- logs.push(createDiffLogEntry(expected, actual, options));
494
- };
495
- const ctx = yield* effect.Effect.promise(
496
- () => Promise.resolve(evaluator.resolveContext())
497
- );
498
- const result = yield* effect.Effect.promise(
499
- () => Promise.resolve(
500
- evaluateFn({
501
- input: testCaseItem.testCase.getInput(),
502
- ctx,
503
- output,
504
- logDiff
505
- })
506
- )
507
- );
508
- const { scores, metrics } = normalizeResult(result);
509
- const passed = computeEvaluatorPassed(evaluator, result, scores);
510
- evaluatorScores.push({
511
- evaluatorId,
512
- scores,
513
- passed,
514
- metrics,
515
- logs: logs.length > 0 ? logs : void 0
516
- });
517
- } catch (error) {
518
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
519
- evaluatorScores.push({
520
- evaluatorId,
521
- scores: [],
522
- passed: false
523
- });
524
- }
525
- }
526
- const testCasePassed = evaluatorScores.every((s) => s.passed);
527
- completedTestCases += 1;
528
- if (testCasePassed) {
529
- passedTestCases += 1;
530
- } else {
531
- failedTestCases += 1;
532
- }
533
- const progressEvent = {
534
- type: "TestCaseProgress",
535
- runId: task.runId,
536
- testCaseId: testCaseItem.id,
537
- testCaseName: testCaseItem.testCase.getName(),
538
- completedTestCases,
539
- totalTestCases: task.testCases.length,
540
- passed: testCasePassed,
541
- durationMs: Date.now() - started,
542
- evaluatorScores,
543
- output,
544
- errorMessage: testCaseError
545
- };
546
- updateSnapshot(task.runId, (snapshot) => ({
547
- ...snapshot,
548
- completedTestCases,
549
- passedTestCases,
550
- failedTestCases
551
- }));
552
- yield* publishEvent(progressEvent);
553
- yield* effect.Queue.offer(persistenceQueue, {
554
- runId: task.runId,
555
- artifactPath: task.snapshot.artifactPath,
556
- payload: progressEvent
557
- });
558
- }
641
+ const totalEvaluations = task.testCases.reduce(
642
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
643
+ 0
644
+ );
645
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
646
+ const completedRef = yield* effect.Ref.make(0);
647
+ const passedRef = yield* effect.Ref.make(0);
648
+ const failedRef = yield* effect.Ref.make(0);
649
+ const processTestCase = (testCaseItem) => processOneTestCase(
650
+ task,
651
+ testCaseItem,
652
+ totalEvaluations,
653
+ publishEvent,
654
+ persistenceQueue,
655
+ updateSnapshot,
656
+ completedRef,
657
+ passedRef,
658
+ failedRef
659
+ );
660
+ yield* effect.Effect.forEach(
661
+ task.testCases,
662
+ processTestCase,
663
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
664
+ );
665
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
666
+ effect.Ref.get(completedRef),
667
+ effect.Ref.get(passedRef),
668
+ effect.Ref.get(failedRef)
669
+ ]);
559
670
  const finishedAt = Date.now();
560
671
  const completedEvent = {
561
672
  type: "RunCompleted",
562
673
  runId: task.runId,
563
674
  finishedAt,
564
- passedTestCases,
565
- failedTestCases,
675
+ passedTestCases: passedUniqueTestCases,
676
+ failedTestCases: failedUniqueTestCases,
566
677
  totalTestCases: task.testCases.length,
567
678
  artifactPath: task.snapshot.artifactPath
568
679
  };
569
680
  updateSnapshot(task.runId, (snapshot) => ({
570
681
  ...snapshot,
571
682
  status: "completed",
572
- completedTestCases,
573
- passedTestCases,
574
- failedTestCases,
683
+ completedTestCases: completedEvaluations,
684
+ passedTestCases: passedUniqueTestCases,
685
+ failedTestCases: failedUniqueTestCases,
575
686
  finishedAt
576
687
  }));
577
688
  yield* publishEvent(completedEvent);
@@ -586,6 +697,126 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
586
697
  artifactPath: task.snapshot.artifactPath
587
698
  });
588
699
  });
700
/**
 * Rebuilds run snapshots from the `.jsonl` artifacts in the configured
 * artifact directory.
 *
 * @param config - Runner config; only `artifactDirectory` is read here (the
 *   whole object is forwarded to `parseArtifactToSnapshot`).
 * @returns {Promise<Array<object>>} Snapshots sorted by `queuedAt`, newest
 *   first. An unreadable directory yields an empty list, and artifacts that
 *   fail to parse are skipped.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const baseDir = path.resolve(config.artifactDirectory);
  let entries;
  try {
    entries = await promises.readdir(baseDir);
  } catch {
    // Directory missing or unreadable: report no stored runs.
    return [];
  }
  const snapshots = [];
  for (const fileName of entries) {
    if (!fileName.endsWith(".jsonl")) {
      continue;
    }
    const filePath = path.join(baseDir, fileName);
    try {
      const snapshot = await parseArtifactToSnapshot(filePath, config);
      if (snapshot) {
        snapshots.push(snapshot);
      }
    } catch {
      // Best effort: one unreadable artifact must not hide the others.
    }
  }
  snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
  return snapshots;
}
722
/**
 * Reads one JSONL artifact and reconstructs the run snapshot it describes.
 *
 * Each non-blank line is parsed as a JSON event. The last `RunQueued`,
 * `RunStarted`, `RunCompleted` and `RunFailed` events seen are used; lines
 * that are not valid JSON are ignored.
 *
 * @param {string} filePath - Path of the artifact file; also reported as the
 *   snapshot's `artifactPath`.
 * @param _config - Unused.
 * @returns {Promise<object | null>} The snapshot, or `null` when the file has
 *   no non-blank lines or contains no `RunQueued` event.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const raw = await promises.readFile(filePath, "utf8");
  const lines = raw.split("\n").filter((text) => text.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }
  let runQueued = null;
  let runStarted = null;
  let runCompleted = null;
  let runFailed = null;
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      switch (event.type) {
        case "RunQueued":
          runQueued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            artifactPath: event.artifactPath ?? filePath,
            ts: event.ts
          };
          break;
        case "RunStarted":
          runStarted = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          runCompleted = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            totalTestCases: event.totalTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          runFailed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Skip lines that are not valid JSON events.
    }
  }
  if (!runQueued) {
    return null;
  }
  // Failure takes precedence over completion, which takes precedence over a
  // start marker; with none of them the run is still queued.
  let status;
  if (runFailed) {
    status = "failed";
  } else if (runCompleted) {
    status = "completed";
  } else if (runStarted) {
    status = "running";
  } else {
    status = "queued";
  }
  const progress = aggregateTestCaseProgress(lines);
  // A completed run reports the queued total; otherwise fall back to the
  // progress events found in the file.
  const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
  return {
    runId: runQueued.runId,
    datasetId: runQueued.datasetId,
    datasetName: runQueued.datasetName,
    evaluatorIds: runQueued.evaluatorIds,
    queuedAt: runQueued.ts ?? 0,
    startedAt: runStarted?.startedAt,
    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
    totalTestCases: runQueued.totalTestCases,
    completedTestCases,
    passedTestCases,
    failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: runFailed?.errorMessage
  };
}
793
/**
 * Derives progress counters from the raw JSONL lines of a run artifact.
 *
 * Only `TestCaseProgress` events are considered; lines that are not valid
 * JSON objects are ignored.
 *
 * @param {string[]} lines - Raw artifact lines, one JSON event per line.
 * @returns {{ completedTestCases: number, passedTestCases: number, failedTestCases: number }}
 *   `completedTestCases` is the highest counter reported by any progress
 *   event; `passedTestCases` / `failedTestCases` count distinct `testCaseId`s,
 *   where an id passes only if every one of its progress events passed.
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const passedByTestCaseId = /* @__PURE__ */ new Map();
  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Tolerate truncated or otherwise malformed lines.
      continue;
    }
    if (event === null || typeof event !== "object" || event.type !== "TestCaseProgress") {
      continue;
    }
    // Progress events of concurrently processed test cases may be persisted
    // out of counter order, so keep the maximum rather than the last value.
    const reported = event.completedTestCases;
    if (typeof reported === "number" && Number.isFinite(reported) && reported > completedTestCases) {
      completedTestCases = reported;
    }
    const previous = passedByTestCaseId.get(event.testCaseId);
    const passedThisTime = Boolean(event.passed);
    passedByTestCaseId.set(
      event.testCaseId,
      previous === void 0 ? passedThisTime : previous && passedThisTime
    );
  }
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const passed of passedByTestCaseId.values()) {
    if (passed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
  }
  return { completedTestCases, passedTestCases, failedTestCases };
}
589
820
  async function appendJsonLine(artifactPath, payload) {
590
821
  await promises.mkdir(path.dirname(artifactPath), { recursive: true });
591
822
  await promises.appendFile(artifactPath, `${JSON.stringify(payload)}
@@ -778,6 +1009,10 @@ var EffectRunner = class {
778
1009
  throw new Error("No evaluators selected for run");
779
1010
  }
780
1011
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1012
+ const totalEvaluations = selectedTestCases.reduce(
1013
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1014
+ 0
1015
+ );
781
1016
  const runId = `run-${crypto.randomUUID()}`;
782
1017
  const artifactPath = createArtifactPath(
783
1018
  this.config.artifactDirectory,
@@ -790,7 +1025,7 @@ var EffectRunner = class {
790
1025
  datasetName: dataset.dataset.getName(),
791
1026
  evaluatorIds: selectedEvaluators.map((item) => item.id),
792
1027
  queuedAt: Date.now(),
793
- totalTestCases: selectedTestCases.length,
1028
+ totalTestCases: totalEvaluations,
794
1029
  completedTestCases: 0,
795
1030
  passedTestCases: 0,
796
1031
  failedTestCases: 0,
@@ -804,7 +1039,7 @@ var EffectRunner = class {
804
1039
  datasetId: request.datasetId,
805
1040
  datasetName: dataset.dataset.getName(),
806
1041
  evaluatorIds: selectedEvaluators.map((item) => item.id),
807
- totalTestCases: selectedTestCases.length,
1042
+ totalTestCases: totalEvaluations,
808
1043
  artifactPath
809
1044
  };
810
1045
  await effect.Effect.runPromise(this.publishEvent(queuedEvent));
@@ -815,6 +1050,7 @@ var EffectRunner = class {
815
1050
  payload: queuedEvent
816
1051
  })
817
1052
  );
1053
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
818
1054
  await effect.Effect.runPromise(
819
1055
  effect.Queue.offer(this.runQueue, {
820
1056
  runId,
@@ -822,7 +1058,8 @@ var EffectRunner = class {
822
1058
  dataset: dataset.dataset,
823
1059
  evaluators: selectedEvaluators,
824
1060
  testCases: selectedTestCases,
825
- snapshot
1061
+ snapshot,
1062
+ maxConcurrency
826
1063
  })
827
1064
  );
828
1065
  return snapshot;
@@ -842,6 +1079,9 @@ var EffectRunner = class {
842
1079
  (a, b) => b.queuedAt - a.queuedAt
843
1080
  );
844
1081
  }
1082
+ async loadRunSnapshotsFromArtifacts() {
1083
+ return loadRunSnapshotsFromArtifacts(this.config);
1084
+ }
845
1085
  async shutdown() {
846
1086
  await effect.Effect.runPromise(effect.Fiber.interrupt(this.schedulerFiber));
847
1087
  await effect.Effect.runPromise(effect.Fiber.interrupt(this.persistenceFiber));
@@ -973,7 +1213,7 @@ function GenerateView({
973
1213
  return;
974
1214
  }
975
1215
  const { writeFile: writeFile2 } = await import('fs/promises');
976
- const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
1216
+ const { join: join4, parse: parse2, resolve: resolve5 } = await import('path');
977
1217
  const testCases = await runner.collectDatasetTestCases(dataset.id);
978
1218
  const payload = testCases.map((item) => {
979
1219
  const tc = item.testCase;
@@ -983,9 +1223,9 @@ function GenerateView({
983
1223
  output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
984
1224
  };
985
1225
  });
986
- const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
1226
+ const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
987
1227
  const parsed = parse2(absoluteDatasetPath);
988
- const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
1228
+ const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
989
1229
  await writeFile2(
990
1230
  outputPath,
991
1231
  `${JSON.stringify(payload, null, 2)}
@@ -1060,7 +1300,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1060
1300
  console.log(`Wrote ${outputPath}`);
1061
1301
  }
1062
1302
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1063
- return new Promise((resolve4, reject) => {
1303
+ return new Promise((resolve5, reject) => {
1064
1304
  const app = ink.render(
1065
1305
  React2__default.default.createElement(GenerateView, {
1066
1306
  runner,
@@ -1070,7 +1310,7 @@ async function generateDatasetJsonCommandInk(runner, datasetName) {
1070
1310
  if (err) {
1071
1311
  reject(err);
1072
1312
  } else {
1073
- resolve4();
1313
+ resolve5();
1074
1314
  }
1075
1315
  }
1076
1316
  })
@@ -1137,13 +1377,62 @@ function createBar(value, max = 100, width = 20) {
1137
1377
  const filled = Math.round(safe / max * width);
1138
1378
  return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
1139
1379
  }
1140
- function formatScorePart(item, scoreToColor2) {
1380
/**
 * Merges the per-rerun evaluator results of one test case into a single
 * entry per evaluator.
 *
 * For every evaluator id that appears in any event, scores and metrics are
 * grouped by their own id and collapsed with `aggregateScoreItems` /
 * `aggregateMetricItems`. An evaluator passes only if it has a passing entry
 * in every event. Logs are taken from the last event.
 *
 * @param {Array<{ evaluatorScores: Array<object> }>} events - One entry per rerun.
 * @param {Map<string, string>} nameById - Evaluator display names keyed by id;
 *   the id itself is used when no name is registered.
 * @returns {Array<object>} One aggregated entry per evaluator (empty for no events).
 */
function aggregateEvaluatorScores(events, nameById) {
  if (events.length === 0) {
    return [];
  }
  const entryFor = (ev, evaluatorId) => ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
  const addToGroup = (groups, item) => {
    const group = groups.get(item.id) ?? [];
    group.push(item);
    groups.set(item.id, group);
  };
  const evaluatorIds = /* @__PURE__ */ new Set();
  for (const ev of events) {
    for (const entry of ev.evaluatorScores) {
      evaluatorIds.add(entry.evaluatorId);
    }
  }
  const lastEvent = events[events.length - 1];
  const result = [];
  for (const evaluatorId of evaluatorIds) {
    const scoreIdToItems = /* @__PURE__ */ new Map();
    const metricIdToItems = /* @__PURE__ */ new Map();
    let passed = true;
    for (const ev of events) {
      const entry = entryFor(ev, evaluatorId);
      for (const score of entry?.scores ?? []) {
        addToGroup(scoreIdToItems, score);
      }
      for (const metric of entry?.metrics ?? []) {
        addToGroup(metricIdToItems, metric);
      }
      // A missing entry or a falsy `passed` in any event fails the evaluator.
      if (!(entry?.passed ?? false)) {
        passed = false;
      }
    }
    const aggregatedScores = [];
    for (const items of scoreIdToItems.values()) {
      const merged = aggregateScoreItems(items);
      if (merged) {
        aggregatedScores.push(merged);
      }
    }
    const aggregatedMetrics = [];
    for (const items of metricIdToItems.values()) {
      const merged = aggregateMetricItems(items);
      if (merged !== void 0) {
        aggregatedMetrics.push(merged);
      }
    }
    const lastEntry = lastEvent?.evaluatorScores.find(
      (x) => x.evaluatorId === evaluatorId
    );
    result.push({
      evaluatorId,
      evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
      scores: aggregatedScores,
      passed,
      metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0,
      logs: lastEntry?.logs
    });
  }
  return result;
}
1429
+ function formatScorePart(item, scoreToColor2, options) {
1141
1430
  const def = getScoreById(item.id);
1142
1431
  if (!def) {
1143
1432
  const numeric = toNumericScore(item.data);
1144
1433
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
1145
1434
  }
1146
- const formatted = def.format(item.data);
1435
+ const formatted = def.format(item.data, options);
1147
1436
  if (def.displayStrategy === "bar") {
1148
1437
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1149
1438
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -1163,6 +1452,7 @@ function RunView({
1163
1452
  );
1164
1453
  const [runInfo, setRunInfo] = React2.useState(null);
1165
1454
  const [testCases, setTestCases] = React2.useState([]);
1455
+ const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
1166
1456
  const [summary, setSummary] = React2.useState(null);
1167
1457
  const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
1168
1458
  const runEval = React2.useCallback(async () => {
@@ -1189,20 +1479,17 @@ function RunView({
1189
1479
  return;
1190
1480
  }
1191
1481
  const nameById = new Map(
1192
- evaluators.map((item) => [
1193
- item.id,
1194
- item.evaluator.getName() ?? item.id
1195
- ])
1482
+ evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
1196
1483
  );
1197
1484
  setEvaluatorNameById(nameById);
1198
1485
  const aggregates = /* @__PURE__ */ new Map();
1199
1486
  let overallScoreTotal = 0;
1200
1487
  let overallScoreCount = 0;
1201
- const done = new Promise((resolve4) => {
1488
+ const done = new Promise((resolve5) => {
1202
1489
  const unsubscribe = runner.subscribeRunEvents((event) => {
1203
1490
  if (event.type === "TestCaseProgress") {
1204
1491
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1205
- const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1492
+ numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1206
1493
  for (const item of event.evaluatorScores) {
1207
1494
  const numeric = toNumericScoreFromScores(item.scores);
1208
1495
  if (numeric !== void 0) {
@@ -1222,15 +1509,10 @@ function RunView({
1222
1509
  overallScoreCount += 1;
1223
1510
  }
1224
1511
  }
1225
- setTestCases((prev) => [
1226
- ...prev,
1227
- {
1228
- name: event.testCaseName,
1229
- completedTestCases: event.completedTestCases,
1230
- totalTestCases: event.totalTestCases,
1231
- durationMs: event.durationMs,
1232
- passed: event.passed,
1233
- averageScore,
1512
+ setTestCases((prev) => {
1513
+ const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
1514
+ const existing = byId.get(event.testCaseId);
1515
+ const newEvent = {
1234
1516
  evaluatorScores: event.evaluatorScores.map((item) => ({
1235
1517
  evaluatorId: item.evaluatorId,
1236
1518
  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
@@ -1238,13 +1520,37 @@ function RunView({
1238
1520
  passed: item.passed,
1239
1521
  metrics: item.metrics,
1240
1522
  logs: item.logs
1241
- }))
1242
- }
1243
- ]);
1523
+ })),
1524
+ passed: event.passed,
1525
+ durationMs: event.durationMs
1526
+ };
1527
+ const events = existing ? [...existing.events, newEvent] : [newEvent];
1528
+ const isAggregated = events.length > 1;
1529
+ const aggregatedEvaluatorScores = aggregateEvaluatorScores(
1530
+ events,
1531
+ nameById
1532
+ );
1533
+ const merged = {
1534
+ name: event.testCaseName,
1535
+ testCaseId: event.testCaseId,
1536
+ completedTestCases: event.completedTestCases,
1537
+ totalTestCases: event.totalTestCases,
1538
+ rerunIndex: event.rerunIndex,
1539
+ rerunTotal: event.rerunTotal,
1540
+ durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1541
+ passed: events.every((e) => e.passed),
1542
+ events,
1543
+ aggregatedEvaluatorScores,
1544
+ isAggregated
1545
+ };
1546
+ byId.set(event.testCaseId, merged);
1547
+ setCompletedEvaluations(event.completedTestCases);
1548
+ return Array.from(byId.values());
1549
+ });
1244
1550
  }
1245
1551
  if (event.type === "RunCompleted" || event.type === "RunFailed") {
1246
1552
  unsubscribe();
1247
- resolve4(event);
1553
+ resolve5(event);
1248
1554
  }
1249
1555
  });
1250
1556
  });
@@ -1255,9 +1561,7 @@ function RunView({
1255
1561
  setRunInfo({
1256
1562
  runId: snapshot.runId,
1257
1563
  datasetName: snapshot.datasetName,
1258
- evaluatorNames: evaluators.map(
1259
- (e) => e.evaluator.getName() ?? e.id
1260
- ),
1564
+ evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
1261
1565
  totalTestCases: snapshot.totalTestCases
1262
1566
  });
1263
1567
  setPhase("running");
@@ -1285,29 +1589,41 @@ function RunView({
1285
1589
  /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
1286
1590
  runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
1287
1591
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1288
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run " }),
1592
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1593
+ "Run",
1594
+ " "
1595
+ ] }),
1289
1596
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
1290
1597
  ] }),
1291
1598
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1292
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Dataset " }),
1599
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1600
+ "Dataset",
1601
+ " "
1602
+ ] }),
1293
1603
  runInfo.datasetName
1294
1604
  ] }),
1295
1605
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1296
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Evaluators " }),
1606
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1607
+ "Evaluators",
1608
+ " "
1609
+ ] }),
1297
1610
  runInfo.evaluatorNames.join(", ")
1298
1611
  ] }),
1299
1612
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1300
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Test cases " }),
1613
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1614
+ "Test cases",
1615
+ " "
1616
+ ] }),
1301
1617
  runInfo.totalTestCases
1302
1618
  ] })
1303
1619
  ] }),
1304
1620
  phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
1305
1621
  Spinner,
1306
1622
  {
1307
- label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
1623
+ label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
1308
1624
  }
1309
1625
  ) }),
1310
- testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
1626
+ testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
1311
1627
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1312
1628
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
1313
1629
  "[",
@@ -1318,49 +1634,78 @@ function RunView({
1318
1634
  ] }),
1319
1635
  " ",
1320
1636
  tc.name,
1637
+ " ",
1638
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
1639
+ "(",
1640
+ tc.rerunIndex,
1641
+ "/",
1642
+ tc.rerunTotal,
1643
+ ")"
1644
+ ] }),
1321
1645
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1322
1646
  " (",
1323
1647
  tc.durationMs,
1324
1648
  "ms)"
1325
1649
  ] })
1326
1650
  ] }),
1327
- tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
1328
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1329
- item.evaluatorName,
1330
- ":",
1331
- " ",
1332
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1333
- " ",
1334
- item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
1335
- formatScorePart(s),
1336
- " "
1337
- ] }, s.id)),
1338
- item.metrics?.map((m) => {
1339
- const def = getMetricById(m.id);
1340
- if (!def)
1341
- return null;
1342
- const formatted = def.format(m.data);
1343
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1344
- "[",
1345
- def.name ? `${def.name}: ` : "",
1346
- formatted,
1347
- "]",
1348
- " "
1349
- ] }, m.id);
1350
- })
1351
- ] }),
1352
- !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1353
- (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
1354
- ink.Text,
1355
- {
1356
- color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1357
- children: line
1358
- },
1359
- lineIdx
1360
- )) }, logIdx) : null
1361
- ) })
1362
- ] }, item.evaluatorId))
1363
- ] }, i)) }),
1651
+ tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
1652
+ ink.Box,
1653
+ {
1654
+ flexDirection: "column",
1655
+ marginLeft: 2,
1656
+ children: [
1657
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1658
+ item.evaluatorName,
1659
+ ":",
1660
+ " ",
1661
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1662
+ " ",
1663
+ item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(
1664
+ ink.Text,
1665
+ {
1666
+ color: scoreColor(toNumericScore(s.data) ?? 0),
1667
+ children: [
1668
+ formatScorePart(s, scoreColor, {
1669
+ isAggregated: tc.isAggregated
1670
+ }),
1671
+ " "
1672
+ ]
1673
+ },
1674
+ s.id
1675
+ )),
1676
+ item.metrics?.map((m) => {
1677
+ const def = getMetricById(m.id);
1678
+ if (!def)
1679
+ return null;
1680
+ const formatted = def.format(m.data, {
1681
+ isAggregated: tc.isAggregated
1682
+ });
1683
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1684
+ "[",
1685
+ def.name ? `${def.name}: ` : "",
1686
+ formatted,
1687
+ "]",
1688
+ " "
1689
+ ] }, m.id);
1690
+ })
1691
+ ] }),
1692
+ !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1693
+ (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
1694
+ ({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
1695
+ ink.Text,
1696
+ {
1697
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1698
+ children: line
1699
+ },
1700
+ lineIdx
1701
+ )
1702
+ ) }, logIdx) : null
1703
+ ) })
1704
+ ]
1705
+ },
1706
+ item.evaluatorId
1707
+ ))
1708
+ ] }, tc.testCaseId)) }),
1364
1709
  phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
1365
1710
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
1366
1711
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, children: [
@@ -1407,7 +1752,8 @@ function RunView({
1407
1752
  name.padEnd(28),
1408
1753
  " avg=",
1409
1754
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
1410
- " passed=",
1755
+ " ",
1756
+ "passed=",
1411
1757
  agg.passed,
1412
1758
  " failed=",
1413
1759
  agg.failed
@@ -1416,28 +1762,38 @@ function RunView({
1416
1762
  ] }),
1417
1763
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
1418
1764
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
1419
- testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
1420
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1421
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1422
- " ",
1423
- tc.name.padEnd(24)
1424
- ] }),
1425
- tc.averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
1426
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(tc.averageScore), children: [
1427
- "score=",
1428
- tc.averageScore.toFixed(2)
1765
+ testCases.map((tc) => {
1766
+ const numericScores = tc.aggregatedEvaluatorScores.flatMap(
1767
+ (item) => item.scores.map((s) => toNumericScoreFromScores([s])).filter((n) => n !== void 0)
1768
+ );
1769
+ const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
1770
+ const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
1771
+ const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
1772
+ isAggregated: true
1773
+ }) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
1774
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
1775
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1776
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1777
+ " ",
1778
+ tc.name.padEnd(24)
1429
1779
  ] }),
1780
+ averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
1781
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(averageScore), children: [
1782
+ "score=",
1783
+ scoreLabel
1784
+ ] }),
1785
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1786
+ " ",
1787
+ createBar(averageScore, 100, 14)
1788
+ ] })
1789
+ ] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
1430
1790
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1431
- " ",
1432
- createBar(tc.averageScore, 100, 14)
1791
+ " (",
1792
+ tc.durationMs,
1793
+ "ms)"
1433
1794
  ] })
1434
- ] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
1435
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1436
- " (",
1437
- tc.durationMs,
1438
- "ms)"
1439
- ] })
1440
- ] }, i))
1795
+ ] }, tc.testCaseId);
1796
+ })
1441
1797
  ] }),
1442
1798
  /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1443
1799
  "artifact: ",
@@ -1448,6 +1804,51 @@ function RunView({
1448
1804
  }
1449
1805
 
1450
1806
  // src/cli-simple/run.ts
1807
/**
 * Collapse the recorded runs of every test case into one summary row each.
 *
 * Every value of `byId` carries the test case `name` and the `events`
 * captured for it; more than one event means the test case was run again,
 * and the row is then flagged as aggregated.
 *
 * Scores are grouped per evaluator and per score id across ALL events of a
 * test case. The evaluator ids are the union over every event (first-seen
 * order), matching aggregateEvaluatorScoresFromEvents, so an evaluator that
 * only reports in a later run still contributes to the summary.
 *
 * @param {Map<*, {name: string, events: Array<{passed: *, durationMs: number, evaluatorScores: Array<{evaluatorId: *, scores?: Array<{id: *}>}>}>}>} byId
 *   Recorded events keyed by test case id.
 * @returns {Array<{name: string, averageScore: (number|undefined), aggregatedScoreItem: *, isAggregated: boolean, durationMs: number, passed: boolean}>}
 *   One row per entry of `byId`, in the map's iteration order.
 */
function buildTestCaseSummaries(byId) {
  const summaries = [];
  for (const { name, events } of byId.values()) {
    const passed = events.every((e) => e.passed);
    const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
    const isAggregated = events.length > 1;
    // Union of evaluator ids over every run, not just the first one.
    const evaluatorIds = new Set(
      events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
    );
    const numericScores = [];
    let firstAggregatedScore;
    for (const evaluatorId of evaluatorIds) {
      // score id -> every item reported under that id by this evaluator
      const scoreIdToItems = /* @__PURE__ */ new Map();
      for (const ev of events) {
        const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
        for (const s of es?.scores ?? []) {
          const list = scoreIdToItems.get(s.id) ?? [];
          list.push(s);
          scoreIdToItems.set(s.id, list);
        }
      }
      for (const items of scoreIdToItems.values()) {
        const agg = aggregateScoreItems(items);
        if (!agg) {
          continue;
        }
        const n = toNumericScoreFromScores([agg]);
        if (n === void 0) {
          // Scores without a numeric reading do not take part in the average.
          continue;
        }
        numericScores.push(n);
        if (firstAggregatedScore === void 0) {
          firstAggregatedScore = agg;
        }
      }
    }
    const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
    summaries.push({
      name,
      averageScore,
      aggregatedScoreItem: firstAggregatedScore,
      isAggregated,
      durationMs,
      passed
    });
  }
  return summaries;
}
1451
1852
  var ansi2 = {
1452
1853
  reset: "\x1B[0m",
1453
1854
  bold: "\x1B[1m",
@@ -1482,7 +1883,50 @@ function createBar2(value, max = 100, width = 20) {
1482
1883
  const filled = Math.round(safe / max * width);
1483
1884
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
1484
1885
  }
1485
- function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1886
/**
 * Combine the evaluator results of several runs of one test case into a
 * single entry per evaluator.
 *
 * For every evaluator id found in any event, the score items and metric
 * items are grouped by their `id` across the events and each group is handed
 * to aggregateScoreItems / aggregateMetricItems. An evaluator counts as
 * passed only when every event has an entry for it whose `passed` is truthy;
 * an event without an entry for the evaluator makes it fail.
 *
 * @param {Array<{evaluatorScores: Array<{evaluatorId: *, passed?: *, scores?: Array<{id: *}>, metrics?: Array<{id: *}>}>}>} events
 *   Events recorded for one test case.
 * @param {*} evaluatorNameById Accepted for callers but not read here.
 * @returns {Array<{evaluatorId: *, scores: Array, passed: boolean, metrics: (Array|undefined)}>}
 *   One entry per evaluator id in first-seen order; `metrics` is undefined
 *   when no metric group produced a value. Empty when `events` is empty.
 */
function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
  if (events.length === 0) {
    return [];
  }

  // Append `entry` to the bucket stored under its own id.
  const groupById = (groups, entry) => {
    const bucket = groups.get(entry.id) ?? [];
    bucket.push(entry);
    groups.set(entry.id, bucket);
  };

  const evaluatorIds = /* @__PURE__ */ new Set();
  for (const event of events) {
    for (const entry of event.evaluatorScores) {
      evaluatorIds.add(entry.evaluatorId);
    }
  }

  return Array.from(evaluatorIds, (evaluatorId) => {
    // The first entry of this evaluator in each event (undefined when absent).
    const perRun = events.map(
      (event) => event.evaluatorScores.find((entry) => entry.evaluatorId === evaluatorId)
    );

    const scoreGroups = /* @__PURE__ */ new Map();
    const metricGroups = /* @__PURE__ */ new Map();
    for (const run of perRun) {
      for (const score of run?.scores ?? []) {
        groupById(scoreGroups, score);
      }
      for (const metric of run?.metrics ?? []) {
        groupById(metricGroups, metric);
      }
    }

    const scores = [];
    for (const group of scoreGroups.values()) {
      const merged = aggregateScoreItems(group);
      if (merged) {
        scores.push(merged);
      }
    }

    const metrics = [];
    for (const group of metricGroups.values()) {
      const merged = aggregateMetricItems(group);
      if (merged !== void 0) {
        metrics.push(merged);
      }
    }

    const passed = perRun.every((run) => run?.passed ?? false);

    return {
      evaluatorId,
      scores,
      passed,
      metrics: metrics.length > 0 ? metrics : void 0
    };
  });
}
1929
+ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
1486
1930
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
1487
1931
  const scoreParts = [];
1488
1932
  for (const item of scores) {
@@ -1494,7 +1938,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1494
1938
  );
1495
1939
  continue;
1496
1940
  }
1497
- const formatted = def.format(item.data);
1941
+ const formatted = def.format(item.data, options);
1498
1942
  switch (def.displayStrategy) {
1499
1943
  case "bar": {
1500
1944
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
@@ -1527,7 +1971,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1527
1971
  for (const { id, data } of metrics) {
1528
1972
  const def = getMetricById(id);
1529
1973
  if (def) {
1530
- const formatted = def.format(data);
1974
+ const formatted = def.format(data, options);
1531
1975
  metricParts.push(
1532
1976
  def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
1533
1977
  );
@@ -1560,7 +2004,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1560
2004
  evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
1561
2005
  );
1562
2006
  const aggregates = /* @__PURE__ */ new Map();
1563
- const testCaseSummaries = [];
2007
+ const testCaseByTestId = /* @__PURE__ */ new Map();
1564
2008
  let overallScoreTotal = 0;
1565
2009
  let overallScoreCount = 0;
1566
2010
  let completedCount = 0;
@@ -1574,6 +2018,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1574
2018
  }
1575
2019
  process.stdout.write("\r\x1B[2K");
1576
2020
  }
2021
+ function cursorUp(n) {
2022
+ if (!process.stdout.isTTY || n <= 0)
2023
+ return;
2024
+ process.stdout.write(`\x1B[${n}A`);
2025
+ }
1577
2026
  function drawSpinner() {
1578
2027
  if (!process.stdout.isTTY || runFinished) {
1579
2028
  return;
@@ -1587,38 +2036,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1587
2036
  )} ${colorize("(live)", ansi2.dim)}`
1588
2037
  );
1589
2038
  }
2039
+ let lastPrintedTestCaseId = null;
2040
+ let lastPrintedLineCount = 0;
1590
2041
  let spinnerTimer;
1591
- const done = new Promise((resolve4) => {
2042
+ const done = new Promise((resolve5) => {
1592
2043
  const unsubscribe = runner.subscribeRunEvents((event) => {
1593
2044
  if (event.type === "TestCaseProgress") {
1594
2045
  completedCount = event.completedTestCases;
1595
2046
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1596
2047
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
1597
- clearLine();
1598
- console.log(
1599
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi2.dim)}`
1600
- );
2048
+ const testCaseId = event.testCaseId;
2049
+ const existing = testCaseByTestId.get(testCaseId) ?? {
2050
+ name: event.testCaseName,
2051
+ events: []
2052
+ };
2053
+ existing.events.push({
2054
+ averageScore,
2055
+ passed: event.passed,
2056
+ durationMs: event.durationMs,
2057
+ evaluatorScores: event.evaluatorScores
2058
+ });
2059
+ testCaseByTestId.set(testCaseId, existing);
1601
2060
  for (const item of event.evaluatorScores) {
1602
- const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
1603
- console.log(
1604
- formatEvaluatorScoreLine(
1605
- name,
1606
- item.scores,
1607
- item.passed,
1608
- item.metrics
1609
- )
1610
- );
1611
- if (!item.passed && item.logs && item.logs.length > 0) {
1612
- for (const log of item.logs) {
1613
- if (log.type === "diff") {
1614
- const useColor = process.stdout.isTTY;
1615
- for (const { type, line } of getDiffLines(log)) {
1616
- const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
1617
- console.log(colored);
1618
- }
1619
- }
1620
- }
1621
- }
1622
2061
  const numeric = toNumericScoreFromScores(item.scores);
1623
2062
  if (numeric !== void 0) {
1624
2063
  const current = aggregates.get(item.evaluatorId) ?? {
@@ -1637,19 +2076,67 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1637
2076
  overallScoreCount += 1;
1638
2077
  }
1639
2078
  }
1640
- testCaseSummaries.push({
1641
- name: event.testCaseName,
1642
- averageScore,
1643
- durationMs: event.durationMs,
1644
- passed: event.passed
1645
- });
2079
+ const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2080
+ const isLastRerun = event.rerunIndex >= event.rerunTotal;
2081
+ const isNonTty = !process.stdout.isTTY;
2082
+ const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
2083
+ if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
2084
+ cursorUp(lastPrintedLineCount);
2085
+ }
2086
+ const aggregatedScores = aggregateEvaluatorScoresFromEvents(
2087
+ existing.events);
2088
+ const isAggregated = existing.events.length > 1;
2089
+ const durationMs = existing.events.reduce(
2090
+ (s, e) => s + e.durationMs,
2091
+ 0
2092
+ );
2093
+ existing.events.every((e) => e.passed);
2094
+ const lines = [];
2095
+ lines.push(
2096
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
2097
+ );
2098
+ for (const item of aggregatedScores) {
2099
+ const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2100
+ lines.push(
2101
+ formatEvaluatorScoreLine(
2102
+ name,
2103
+ item.scores,
2104
+ item.passed,
2105
+ item.metrics,
2106
+ { isAggregated }
2107
+ )
2108
+ );
2109
+ const lastEvent = existing.events[existing.events.length - 1];
2110
+ const lastEs = lastEvent?.evaluatorScores.find(
2111
+ (x) => x.evaluatorId === item.evaluatorId
2112
+ );
2113
+ if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
2114
+ for (const log of lastEs.logs) {
2115
+ if (log.type === "diff") {
2116
+ const useColor = process.stdout.isTTY;
2117
+ for (const { type, line } of getDiffLines(log)) {
2118
+ const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
2119
+ lines.push(colored);
2120
+ }
2121
+ }
2122
+ }
2123
+ }
2124
+ }
2125
+ if (!skipPrintNonTty) {
2126
+ for (let i = 0; i < lines.length; i++) {
2127
+ process.stdout.write(`\r\x1B[2K${lines[i]}
2128
+ `);
2129
+ }
2130
+ lastPrintedTestCaseId = testCaseId;
2131
+ lastPrintedLineCount = lines.length;
2132
+ }
1646
2133
  drawSpinner();
1647
2134
  }
1648
2135
  if (event.type === "RunCompleted" || event.type === "RunFailed") {
1649
2136
  runFinished = true;
1650
2137
  clearLine();
1651
2138
  unsubscribe();
1652
- resolve4(event);
2139
+ resolve5(event);
1653
2140
  }
1654
2141
  });
1655
2142
  });
@@ -1706,6 +2193,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1706
2193
  getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
1707
2194
  );
1708
2195
  }
2196
+ const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
1709
2197
  if (testCaseSummaries.length > 0) {
1710
2198
  console.log(colorize("- test case scores:", ansi2.magenta));
1711
2199
  for (const summary of testCaseSummaries) {
@@ -1716,9 +2204,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1716
2204
  );
1717
2205
  continue;
1718
2206
  }
2207
+ const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2208
+ summary.aggregatedScoreItem.data,
2209
+ { isAggregated: true }
2210
+ ) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
1719
2211
  console.log(
1720
2212
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
1721
- summary.averageScore.toFixed(2),
2213
+ scoreLabel,
1722
2214
  scoreToColor(summary.averageScore)
1723
2215
  )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1724
2216
  );
@@ -1727,7 +2219,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1727
2219
  console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
1728
2220
  }
1729
2221
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
1730
- return new Promise((resolve4, reject) => {
2222
+ return new Promise((resolve5, reject) => {
1731
2223
  const app = ink.render(
1732
2224
  React2__default.default.createElement(RunView, {
1733
2225
  runner,
@@ -1738,7 +2230,7 @@ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
1738
2230
  if (err) {
1739
2231
  reject(err);
1740
2232
  } else {
1741
- resolve4();
2233
+ resolve5();
1742
2234
  }
1743
2235
  }
1744
2236
  })