@m4trix/evals 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -56,7 +56,8 @@ var defaultRunnerConfig = {
56
56
  ],
57
57
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
58
58
  },
59
- artifactDirectory: ".eval-results"
59
+ artifactDirectory: ".eval-results",
60
+ maxConcurrency: 1
60
61
  };
61
62
  function toRunnerConfigOverrides(config) {
62
63
  if (!config) {
@@ -89,6 +90,9 @@ function toRunnerConfigOverrides(config) {
89
90
  if (config.artifactDirectory !== void 0) {
90
91
  overrides.artifactDirectory = config.artifactDirectory;
91
92
  }
93
+ if (config.maxConcurrency !== void 0) {
94
+ overrides.maxConcurrency = config.maxConcurrency;
95
+ }
92
96
  if (Object.keys(discovery).length > 0) {
93
97
  overrides.discovery = discovery;
94
98
  }
@@ -313,6 +317,7 @@ var Metric = {
313
317
  const def = {
314
318
  id: config.id,
315
319
  name: config.name,
320
+ aggregate: config.aggregate,
316
321
  format: config.format,
317
322
  make: (data) => ({ id: config.id, data })
318
323
  };
@@ -332,6 +337,7 @@ var Score = {
332
337
  id: config.id,
333
338
  name: config.name,
334
339
  displayStrategy: config.displayStrategy,
340
+ aggregate: config.aggregate,
335
341
  format: config.format,
336
342
  make: (data, options) => {
337
343
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
@@ -350,23 +356,62 @@ function getScoreById(id) {
350
356
  return registry2.get(id);
351
357
  }
352
358
 
359
+ // src/evals/aggregators.ts
360
/**
 * Averages the numeric `value` field of a list of score payloads.
 *
 * Entries whose `value` is not a finite number are left out of the mean so a
 * single malformed payload cannot turn the whole aggregate into NaN.
 *
 * @param {Array<{ value: number }>} values - Score payloads to average.
 * @returns {{ value: number }} Mean of the usable values, or `{ value: 0 }`
 *   when there is nothing usable to average.
 */
function aggregateAverage(values) {
  const usable = values
    .map((entry) => entry?.value)
    .filter((value) => typeof value === "number" && Number.isFinite(value));
  if (usable.length === 0) {
    return { value: 0 };
  }
  const sum = usable.reduce((total, value) => total + value, 0);
  return { value: sum / usable.length };
}
367
/**
 * Folds pass/fail payloads into one verdict.
 *
 * The aggregate passes only when at least one payload is present and none of
 * them has a falsy `passed`.
 *
 * @param {Array<{ passed: boolean }>} values - Pass/fail payloads.
 * @returns {{ passed: boolean }} Combined verdict.
 */
function aggregateAll(values) {
  if (values.length === 0) {
    return { passed: false };
  }
  const hasFailure = values.some((entry) => !entry.passed);
  return { passed: !hasFailure };
}
370
/**
 * Sums token-count payloads field by field.
 *
 * A field that is null or undefined on a payload contributes 0.
 *
 * @param {Array<{ input?: number, output?: number, inputCached?: number, outputCached?: number }>} values
 *   Token-count payloads to add up.
 * @returns {{ input: number, output: number, inputCached: number, outputCached: number }}
 *   Totals per field; all zeros for an empty list.
 */
function aggregateTokenCountSum(values) {
  let input = 0;
  let output = 0;
  let inputCached = 0;
  let outputCached = 0;
  for (const entry of values) {
    input += entry.input ?? 0;
    output += entry.output ?? 0;
    inputCached += entry.inputCached ?? 0;
    outputCached += entry.outputCached ?? 0;
  }
  return { input, output, inputCached, outputCached };
}
387
/**
 * Averages the `ms` field of a list of latency payloads.
 *
 * Entries whose `ms` is not a finite number are left out of the mean so a
 * single malformed payload cannot turn the whole aggregate into NaN.
 *
 * @param {Array<{ ms: number }>} values - Latency payloads to average.
 * @returns {{ ms: number }} Mean of the usable latencies, or `{ ms: 0 }` when
 *   there is nothing usable to average.
 */
function aggregateLatencyAverage(values) {
  const usable = values
    .map((entry) => entry?.ms)
    .filter((ms) => typeof ms === "number" && Number.isFinite(ms));
  if (usable.length === 0) {
    return { ms: 0 };
  }
  const sum = usable.reduce((total, ms) => total + ms, 0);
  return { ms: sum / usable.length };
}
394
+
353
395
  // src/evals/metrics/standard.ts
354
396
  Metric.of({
355
397
  id: "token-count",
356
398
  name: "Tokens",
357
- format: (data) => {
399
+ aggregate: aggregateTokenCountSum,
400
+ format: (data, options) => {
358
401
  const input = data.input ?? 0;
359
402
  const output = data.output ?? 0;
360
403
  const inputCached = data.inputCached ?? 0;
361
404
  const outputCached = data.outputCached ?? 0;
362
405
  const cached = inputCached + outputCached;
363
- return `in:${input} out:${output} cached:${cached}`;
406
+ const base = `in:${input} out:${output} cached:${cached}`;
407
+ return options?.isAggregated ? `Total: ${base}` : base;
364
408
  }
365
409
  });
366
410
  Metric.of({
367
411
  id: "latency",
368
412
  name: "Latency",
369
- format: (data) => `${data.ms}ms`
413
+ aggregate: aggregateLatencyAverage,
414
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
370
415
  });
371
416
 
372
417
  // src/evals/scores/standard.ts
@@ -374,16 +419,36 @@ Score.of({
374
419
  id: "percent",
375
420
  name: "Score",
376
421
  displayStrategy: "bar",
377
- format: (data) => data.value.toFixed(2)
422
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
423
+ aggregate: aggregateAverage
378
424
  });
379
425
  Score.of({
380
426
  id: "binary",
381
427
  name: "Result",
382
428
  displayStrategy: "passFail",
383
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
429
+ format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
430
+ aggregate: aggregateAll
384
431
  });
385
432
 
386
433
  // src/runner/score-utils.ts
434
/**
 * Collapses several score items that share an id into a single item.
 *
 * The score definition is looked up from the first item's id. If it provides
 * an `aggregate` function, the result is the first item with its `data`
 * replaced by the aggregate of every item's `data` (all other fields are
 * inherited from the first item). Otherwise the last item is returned as-is.
 *
 * @param {Array<{ id: string, data: unknown }>} items - Score items to merge.
 * @returns {object | undefined} The merged item, or `undefined` for an empty list.
 */
function aggregateScoreItems(items) {
  if (items.length === 0) {
    return void 0;
  }
  const first = items[0];
  const def = getScoreById(first.id);
  if (def?.aggregate) {
    const payloads = items.map((item) => item.data);
    return { ...first, data: def.aggregate(payloads) };
  }
  return items[items.length - 1];
}
443
/**
 * Collapses several metric items that share an id into a single item.
 *
 * The metric definition is looked up from the first item's id. If it provides
 * an `aggregate` function, the result is the first item with its `data`
 * replaced by the aggregate of every item's `data` (all other fields are
 * inherited from the first item). Otherwise the last item is returned as-is.
 *
 * @param {Array<{ id: string, data: unknown }>} items - Metric items to merge.
 * @returns {object | undefined} The merged item, or `undefined` for an empty list.
 */
function aggregateMetricItems(items) {
  if (items.length === 0) {
    return void 0;
  }
  const first = items[0];
  const def = getMetricById(first.id);
  if (def?.aggregate) {
    const payloads = items.map((item) => item.data);
    return { ...first, data: def.aggregate(payloads) };
  }
  return items[items.length - 1];
}
387
452
  function toNumericScoreFromScores(scores) {
388
453
  for (const item of scores) {
389
454
  const def = getScoreById(item.id);
@@ -462,6 +527,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
462
527
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
463
528
  );
464
529
  }
530
/**
 * Builds the Effect that evaluates one test case, once per configured rerun.
 *
 * For every rerun each evaluator in `task.evaluators` that exposes an evaluate
 * function is run against the test case. After the rerun, the shared
 * `completedRef` counter is incremented, a "TestCaseProgress" event is
 * published and offered to the persistence queue, and the run snapshot's
 * `completedTestCases` is updated. After all reruns, `passedRef` or
 * `failedRef` is incremented (a test case passes only if every rerun passed)
 * and the snapshot's pass/fail counts are refreshed.
 *
 * @param {object} task - Run task; `runId`, `evaluators` and `snapshot.artifactPath` are read.
 * @param {object} testCaseItem - `{ id, testCase }` entry being evaluated.
 * @param {number} totalEvaluations - Reported as `totalTestCases` on progress events.
 * @param {Function} publishEvent - Returns an Effect that publishes a run event.
 * @param {object} persistenceQueue - Effect queue that receives progress events for persistence.
 * @param {Function} updateSnapshot - `(runId, updater)` snapshot mutator.
 * @param {object} completedRef - Effect Ref counting completed evaluations across test cases.
 * @param {object} passedRef - Effect Ref counting passed test cases.
 * @param {object} failedRef - Effect Ref counting failed test cases.
 * @returns {object} Effect that performs the evaluation and yields no value.
 */
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
  return effect.Effect.gen(function* () {
    const { testCase } = testCaseItem;
    // Test cases without a getReruns method are evaluated exactly once.
    const reruns = typeof testCase.getReruns === "function" ? testCase.getReruns() : 1;
    const rerunPassed = [];
    for (let r = 0; r < reruns; r++) {
      const started = Date.now();
      const evaluatorScores = [];
      let testCaseError;
      const output = readOutput(testCase);
      for (const { id: evaluatorId, evaluator } of task.evaluators) {
        const evaluateFn = evaluator.getEvaluateFn();
        if (!evaluateFn) {
          continue;
        }
        const logs = [];
        const logDiff = (expected, actual, options) => {
          logs.push(createDiffLogEntry(expected, actual, options));
        };
        // Errors are caught INSIDE the promise thunk: a rejection surfacing from
        // Effect.promise is treated as a defect and is not delivered to a
        // try/catch in this generator, which would abort the whole run instead
        // of marking this evaluator as failed.
        const outcome = yield* effect.Effect.promise(async () => {
          try {
            const ctx = await evaluator.resolveContext();
            const result = await evaluateFn({
              input: testCase.getInput(),
              ctx,
              output,
              logDiff
            });
            const { scores, metrics } = normalizeResult(result);
            return {
              ok: true,
              scores,
              metrics,
              passed: computeEvaluatorPassed(evaluator, result, scores)
            };
          } catch (error) {
            return { ok: false, error };
          }
        });
        if (outcome.ok) {
          evaluatorScores.push({
            evaluatorId,
            scores: outcome.scores,
            passed: outcome.passed,
            metrics: outcome.metrics,
            logs: logs.length > 0 ? logs : void 0
          });
        } else {
          // Only the most recent evaluator error message is reported on the event.
          testCaseError = outcome.error instanceof Error ? outcome.error.message : "Evaluator execution failed";
          evaluatorScores.push({
            evaluatorId,
            scores: [],
            passed: false
          });
        }
      }
      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
      rerunPassed.push(rerunPassedThis);
      // Increment the shared counter and read the new value in one step.
      const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
        n + 1,
        n + 1
      ]);
      const progressEvent = {
        type: "TestCaseProgress",
        runId: task.runId,
        testCaseId: testCaseItem.id,
        testCaseName: testCase.getName(),
        completedTestCases: completedEvaluations,
        totalTestCases: totalEvaluations,
        rerunIndex: r + 1,
        rerunTotal: reruns,
        passed: rerunPassedThis,
        durationMs: Date.now() - started,
        evaluatorScores,
        output,
        errorMessage: testCaseError
      };
      updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        completedTestCases: completedEvaluations
      }));
      yield* publishEvent(progressEvent);
      yield* effect.Queue.offer(persistenceQueue, {
        runId: task.runId,
        artifactPath: task.snapshot.artifactPath,
        payload: progressEvent
      });
    }
    // A test case counts as passed only when every one of its reruns passed.
    const testCasePassed = rerunPassed.every(Boolean);
    if (testCasePassed) {
      yield* effect.Ref.update(passedRef, (n) => n + 1);
    } else {
      yield* effect.Ref.update(failedRef, (n) => n + 1);
    }
    const [passed, failed] = yield* effect.Effect.all([
      effect.Ref.get(passedRef),
      effect.Ref.get(failedRef)
    ]);
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
465
629
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
466
630
  const startedAt = Date.now();
467
631
  updateSnapshot(task.runId, (snapshot) => ({
@@ -474,104 +638,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
474
638
  runId: task.runId,
475
639
  startedAt
476
640
  });
477
- let completedTestCases = 0;
478
- let passedTestCases = 0;
479
- let failedTestCases = 0;
480
- for (const testCaseItem of task.testCases) {
481
- const started = Date.now();
482
- const evaluatorScores = [];
483
- let testCaseError;
484
- const output = readOutput(testCaseItem.testCase);
485
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
486
- const evaluateFn = evaluator.getEvaluateFn();
487
- if (!evaluateFn) {
488
- continue;
489
- }
490
- try {
491
- const logs = [];
492
- const logDiff = (expected, actual, options) => {
493
- logs.push(createDiffLogEntry(expected, actual, options));
494
- };
495
- const ctx = yield* effect.Effect.promise(
496
- () => Promise.resolve(evaluator.resolveContext())
497
- );
498
- const result = yield* effect.Effect.promise(
499
- () => Promise.resolve(
500
- evaluateFn({
501
- input: testCaseItem.testCase.getInput(),
502
- ctx,
503
- output,
504
- logDiff
505
- })
506
- )
507
- );
508
- const { scores, metrics } = normalizeResult(result);
509
- const passed = computeEvaluatorPassed(evaluator, result, scores);
510
- evaluatorScores.push({
511
- evaluatorId,
512
- scores,
513
- passed,
514
- metrics,
515
- logs: logs.length > 0 ? logs : void 0
516
- });
517
- } catch (error) {
518
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
519
- evaluatorScores.push({
520
- evaluatorId,
521
- scores: [],
522
- passed: false
523
- });
524
- }
525
- }
526
- const testCasePassed = evaluatorScores.every((s) => s.passed);
527
- completedTestCases += 1;
528
- if (testCasePassed) {
529
- passedTestCases += 1;
530
- } else {
531
- failedTestCases += 1;
532
- }
533
- const progressEvent = {
534
- type: "TestCaseProgress",
535
- runId: task.runId,
536
- testCaseId: testCaseItem.id,
537
- testCaseName: testCaseItem.testCase.getName(),
538
- completedTestCases,
539
- totalTestCases: task.testCases.length,
540
- passed: testCasePassed,
541
- durationMs: Date.now() - started,
542
- evaluatorScores,
543
- output,
544
- errorMessage: testCaseError
545
- };
546
- updateSnapshot(task.runId, (snapshot) => ({
547
- ...snapshot,
548
- completedTestCases,
549
- passedTestCases,
550
- failedTestCases
551
- }));
552
- yield* publishEvent(progressEvent);
553
- yield* effect.Queue.offer(persistenceQueue, {
554
- runId: task.runId,
555
- artifactPath: task.snapshot.artifactPath,
556
- payload: progressEvent
557
- });
558
- }
641
+ const totalEvaluations = task.testCases.reduce(
642
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
643
+ 0
644
+ );
645
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
646
+ const completedRef = yield* effect.Ref.make(0);
647
+ const passedRef = yield* effect.Ref.make(0);
648
+ const failedRef = yield* effect.Ref.make(0);
649
+ const processTestCase = (testCaseItem) => processOneTestCase(
650
+ task,
651
+ testCaseItem,
652
+ totalEvaluations,
653
+ publishEvent,
654
+ persistenceQueue,
655
+ updateSnapshot,
656
+ completedRef,
657
+ passedRef,
658
+ failedRef
659
+ );
660
+ yield* effect.Effect.forEach(
661
+ task.testCases,
662
+ processTestCase,
663
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
664
+ );
665
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
666
+ effect.Ref.get(completedRef),
667
+ effect.Ref.get(passedRef),
668
+ effect.Ref.get(failedRef)
669
+ ]);
559
670
  const finishedAt = Date.now();
560
671
  const completedEvent = {
561
672
  type: "RunCompleted",
562
673
  runId: task.runId,
563
674
  finishedAt,
564
- passedTestCases,
565
- failedTestCases,
675
+ passedTestCases: passedUniqueTestCases,
676
+ failedTestCases: failedUniqueTestCases,
566
677
  totalTestCases: task.testCases.length,
567
678
  artifactPath: task.snapshot.artifactPath
568
679
  };
569
680
  updateSnapshot(task.runId, (snapshot) => ({
570
681
  ...snapshot,
571
682
  status: "completed",
572
- completedTestCases,
573
- passedTestCases,
574
- failedTestCases,
683
+ completedTestCases: completedEvaluations,
684
+ passedTestCases: passedUniqueTestCases,
685
+ failedTestCases: failedUniqueTestCases,
575
686
  finishedAt
576
687
  }));
577
688
  yield* publishEvent(completedEvent);
@@ -659,7 +770,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
659
770
  const artifactPath = filePath;
660
771
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
661
772
  const progress = aggregateTestCaseProgress(lines);
662
- const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
773
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
663
774
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
664
775
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
665
776
  return {
@@ -681,23 +792,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
681
792
  }
682
793
  function aggregateTestCaseProgress(lines) {
683
794
  let completedTestCases = 0;
684
- let passedTestCases = 0;
685
- let failedTestCases = 0;
795
+ const testCasePassedBy = /* @__PURE__ */ new Map();
686
796
  for (const line of lines) {
687
797
  try {
688
798
  const event = JSON.parse(line);
689
799
  if (event.type === "TestCaseProgress") {
690
800
  const ev = event;
691
801
  completedTestCases = ev.completedTestCases ?? completedTestCases;
692
- if (ev.passed) {
693
- passedTestCases += 1;
694
- } else {
695
- failedTestCases += 1;
696
- }
802
+ const id = ev.testCaseId;
803
+ const current = testCasePassedBy.get(id);
804
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
697
805
  }
698
806
  } catch {
699
807
  }
700
808
  }
809
+ let passedTestCases = 0;
810
+ let failedTestCases = 0;
811
+ for (const passed of testCasePassedBy.values()) {
812
+ if (passed) {
813
+ passedTestCases += 1;
814
+ } else {
815
+ failedTestCases += 1;
816
+ }
817
+ }
701
818
  return { completedTestCases, passedTestCases, failedTestCases };
702
819
  }
703
820
  async function appendJsonLine(artifactPath, payload) {
@@ -892,6 +1009,10 @@ var EffectRunner = class {
892
1009
  throw new Error("No evaluators selected for run");
893
1010
  }
894
1011
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1012
+ const totalEvaluations = selectedTestCases.reduce(
1013
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1014
+ 0
1015
+ );
895
1016
  const runId = `run-${crypto.randomUUID()}`;
896
1017
  const artifactPath = createArtifactPath(
897
1018
  this.config.artifactDirectory,
@@ -904,7 +1025,7 @@ var EffectRunner = class {
904
1025
  datasetName: dataset.dataset.getName(),
905
1026
  evaluatorIds: selectedEvaluators.map((item) => item.id),
906
1027
  queuedAt: Date.now(),
907
- totalTestCases: selectedTestCases.length,
1028
+ totalTestCases: totalEvaluations,
908
1029
  completedTestCases: 0,
909
1030
  passedTestCases: 0,
910
1031
  failedTestCases: 0,
@@ -918,7 +1039,7 @@ var EffectRunner = class {
918
1039
  datasetId: request.datasetId,
919
1040
  datasetName: dataset.dataset.getName(),
920
1041
  evaluatorIds: selectedEvaluators.map((item) => item.id),
921
- totalTestCases: selectedTestCases.length,
1042
+ totalTestCases: totalEvaluations,
922
1043
  artifactPath
923
1044
  };
924
1045
  await effect.Effect.runPromise(this.publishEvent(queuedEvent));
@@ -929,6 +1050,7 @@ var EffectRunner = class {
929
1050
  payload: queuedEvent
930
1051
  })
931
1052
  );
1053
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
932
1054
  await effect.Effect.runPromise(
933
1055
  effect.Queue.offer(this.runQueue, {
934
1056
  runId,
@@ -936,7 +1058,8 @@ var EffectRunner = class {
936
1058
  dataset: dataset.dataset,
937
1059
  evaluators: selectedEvaluators,
938
1060
  testCases: selectedTestCases,
939
- snapshot
1061
+ snapshot,
1062
+ maxConcurrency
940
1063
  })
941
1064
  );
942
1065
  return snapshot;
@@ -1254,13 +1377,62 @@ function createBar(value, max = 100, width = 20) {
1254
1377
  const filled = Math.round(safe / max * width);
1255
1378
  return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
1256
1379
  }
1257
- function formatScorePart(item, scoreToColor2) {
1380
/**
 * Merges the per-rerun events of one test case into one entry per evaluator.
 *
 * For each evaluator id seen in any event, scores and metrics are grouped by
 * id across all events and collapsed with aggregateScoreItems /
 * aggregateMetricItems. The evaluator passes only if it passed in every event
 * (an event that lacks the evaluator counts as a failure). Logs are taken from
 * the last event only.
 *
 * @param {Array<{ evaluatorScores: Array<object> }>} events - Events of one test case, in arrival order.
 * @param {Map<string, string>} nameById - Evaluator id to display name; the id is used when absent.
 * @returns {Array<object>} One aggregated entry per evaluator, in first-seen order.
 */
function aggregateEvaluatorScores(events, nameById) {
  if (events.length === 0) {
    return [];
  }
  // Groups items by their `id`, preserving first-seen order of ids.
  const groupById = (lists) => {
    const grouped = /* @__PURE__ */ new Map();
    for (const list of lists) {
      for (const item of list ?? []) {
        const bucket = grouped.get(item.id) ?? [];
        bucket.push(item);
        grouped.set(item.id, bucket);
      }
    }
    return grouped;
  };
  const evaluatorIds = new Set(
    events.flatMap((event) => event.evaluatorScores.map((entry) => entry.evaluatorId))
  );
  return Array.from(evaluatorIds, (evaluatorId) => {
    // This evaluator's entry in each event (undefined where it is absent).
    const perEvent = events.map(
      (event) => event.evaluatorScores.find((entry) => entry.evaluatorId === evaluatorId)
    );
    const scoreGroups = groupById(perEvent.map((entry) => entry?.scores));
    const metricGroups = groupById(perEvent.map((entry) => entry?.metrics));
    const scores = Array.from(scoreGroups.values(), (items) => aggregateScoreItems(items)).filter(Boolean);
    const metrics = Array.from(metricGroups.values(), (items) => aggregateMetricItems(items)).filter(
      (metric) => metric !== void 0
    );
    const lastEntry = perEvent[perEvent.length - 1];
    return {
      evaluatorId,
      evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
      scores,
      passed: perEvent.every((entry) => entry?.passed ?? false),
      metrics: metrics.length > 0 ? metrics : void 0,
      logs: lastEntry?.logs
    };
  });
}
1429
+ function formatScorePart(item, scoreToColor2, options) {
1258
1430
  const def = getScoreById(item.id);
1259
1431
  if (!def) {
1260
1432
  const numeric = toNumericScore(item.data);
1261
1433
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
1262
1434
  }
1263
- const formatted = def.format(item.data);
1435
+ const formatted = def.format(item.data, options);
1264
1436
  if (def.displayStrategy === "bar") {
1265
1437
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1266
1438
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -1280,6 +1452,7 @@ function RunView({
1280
1452
  );
1281
1453
  const [runInfo, setRunInfo] = React2.useState(null);
1282
1454
  const [testCases, setTestCases] = React2.useState([]);
1455
+ const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
1283
1456
  const [summary, setSummary] = React2.useState(null);
1284
1457
  const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
1285
1458
  const runEval = React2.useCallback(async () => {
@@ -1306,10 +1479,7 @@ function RunView({
1306
1479
  return;
1307
1480
  }
1308
1481
  const nameById = new Map(
1309
- evaluators.map((item) => [
1310
- item.id,
1311
- item.evaluator.getName() ?? item.id
1312
- ])
1482
+ evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
1313
1483
  );
1314
1484
  setEvaluatorNameById(nameById);
1315
1485
  const aggregates = /* @__PURE__ */ new Map();
@@ -1319,7 +1489,7 @@ function RunView({
1319
1489
  const unsubscribe = runner.subscribeRunEvents((event) => {
1320
1490
  if (event.type === "TestCaseProgress") {
1321
1491
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1322
- const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1492
+ numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1323
1493
  for (const item of event.evaluatorScores) {
1324
1494
  const numeric = toNumericScoreFromScores(item.scores);
1325
1495
  if (numeric !== void 0) {
@@ -1339,15 +1509,10 @@ function RunView({
1339
1509
  overallScoreCount += 1;
1340
1510
  }
1341
1511
  }
1342
- setTestCases((prev) => [
1343
- ...prev,
1344
- {
1345
- name: event.testCaseName,
1346
- completedTestCases: event.completedTestCases,
1347
- totalTestCases: event.totalTestCases,
1348
- durationMs: event.durationMs,
1349
- passed: event.passed,
1350
- averageScore,
1512
+ setTestCases((prev) => {
1513
+ const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
1514
+ const existing = byId.get(event.testCaseId);
1515
+ const newEvent = {
1351
1516
  evaluatorScores: event.evaluatorScores.map((item) => ({
1352
1517
  evaluatorId: item.evaluatorId,
1353
1518
  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
@@ -1355,9 +1520,33 @@ function RunView({
1355
1520
  passed: item.passed,
1356
1521
  metrics: item.metrics,
1357
1522
  logs: item.logs
1358
- }))
1359
- }
1360
- ]);
1523
+ })),
1524
+ passed: event.passed,
1525
+ durationMs: event.durationMs
1526
+ };
1527
+ const events = existing ? [...existing.events, newEvent] : [newEvent];
1528
+ const isAggregated = events.length > 1;
1529
+ const aggregatedEvaluatorScores = aggregateEvaluatorScores(
1530
+ events,
1531
+ nameById
1532
+ );
1533
+ const merged = {
1534
+ name: event.testCaseName,
1535
+ testCaseId: event.testCaseId,
1536
+ completedTestCases: event.completedTestCases,
1537
+ totalTestCases: event.totalTestCases,
1538
+ rerunIndex: event.rerunIndex,
1539
+ rerunTotal: event.rerunTotal,
1540
+ durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1541
+ passed: events.every((e) => e.passed),
1542
+ events,
1543
+ aggregatedEvaluatorScores,
1544
+ isAggregated
1545
+ };
1546
+ byId.set(event.testCaseId, merged);
1547
+ setCompletedEvaluations(event.completedTestCases);
1548
+ return Array.from(byId.values());
1549
+ });
1361
1550
  }
1362
1551
  if (event.type === "RunCompleted" || event.type === "RunFailed") {
1363
1552
  unsubscribe();
@@ -1372,9 +1561,7 @@ function RunView({
1372
1561
  setRunInfo({
1373
1562
  runId: snapshot.runId,
1374
1563
  datasetName: snapshot.datasetName,
1375
- evaluatorNames: evaluators.map(
1376
- (e) => e.evaluator.getName() ?? e.id
1377
- ),
1564
+ evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
1378
1565
  totalTestCases: snapshot.totalTestCases
1379
1566
  });
1380
1567
  setPhase("running");
@@ -1402,29 +1589,41 @@ function RunView({
1402
1589
  /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
1403
1590
  runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
1404
1591
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1405
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run " }),
1592
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1593
+ "Run",
1594
+ " "
1595
+ ] }),
1406
1596
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
1407
1597
  ] }),
1408
1598
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1409
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Dataset " }),
1599
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1600
+ "Dataset",
1601
+ " "
1602
+ ] }),
1410
1603
  runInfo.datasetName
1411
1604
  ] }),
1412
1605
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1413
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Evaluators " }),
1606
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1607
+ "Evaluators",
1608
+ " "
1609
+ ] }),
1414
1610
  runInfo.evaluatorNames.join(", ")
1415
1611
  ] }),
1416
1612
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1417
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Test cases " }),
1613
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1614
+ "Test cases",
1615
+ " "
1616
+ ] }),
1418
1617
  runInfo.totalTestCases
1419
1618
  ] })
1420
1619
  ] }),
1421
1620
  phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
1422
1621
  Spinner,
1423
1622
  {
1424
- label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
1623
+ label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
1425
1624
  }
1426
1625
  ) }),
1427
- testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
1626
+ testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
1428
1627
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1429
1628
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
1430
1629
  "[",
@@ -1435,49 +1634,78 @@ function RunView({
1435
1634
  ] }),
1436
1635
  " ",
1437
1636
  tc.name,
1637
+ " ",
1638
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
1639
+ "(",
1640
+ tc.rerunIndex,
1641
+ "/",
1642
+ tc.rerunTotal,
1643
+ ")"
1644
+ ] }),
1438
1645
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1439
1646
  " (",
1440
1647
  tc.durationMs,
1441
1648
  "ms)"
1442
1649
  ] })
1443
1650
  ] }),
1444
- tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
1445
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1446
- item.evaluatorName,
1447
- ":",
1448
- " ",
1449
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1450
- " ",
1451
- item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
1452
- formatScorePart(s),
1453
- " "
1454
- ] }, s.id)),
1455
- item.metrics?.map((m) => {
1456
- const def = getMetricById(m.id);
1457
- if (!def)
1458
- return null;
1459
- const formatted = def.format(m.data);
1460
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1461
- "[",
1462
- def.name ? `${def.name}: ` : "",
1463
- formatted,
1464
- "]",
1465
- " "
1466
- ] }, m.id);
1467
- })
1468
- ] }),
1469
- !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1470
- (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
1471
- ink.Text,
1472
- {
1473
- color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1474
- children: line
1475
- },
1476
- lineIdx
1477
- )) }, logIdx) : null
1478
- ) })
1479
- ] }, item.evaluatorId))
1480
- ] }, i)) }),
1651
+ tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
1652
+ ink.Box,
1653
+ {
1654
+ flexDirection: "column",
1655
+ marginLeft: 2,
1656
+ children: [
1657
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1658
+ item.evaluatorName,
1659
+ ":",
1660
+ " ",
1661
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1662
+ " ",
1663
+ item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(
1664
+ ink.Text,
1665
+ {
1666
+ color: scoreColor(toNumericScore(s.data) ?? 0),
1667
+ children: [
1668
+ formatScorePart(s, scoreColor, {
1669
+ isAggregated: tc.isAggregated
1670
+ }),
1671
+ " "
1672
+ ]
1673
+ },
1674
+ s.id
1675
+ )),
1676
+ item.metrics?.map((m) => {
1677
+ const def = getMetricById(m.id);
1678
+ if (!def)
1679
+ return null;
1680
+ const formatted = def.format(m.data, {
1681
+ isAggregated: tc.isAggregated
1682
+ });
1683
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1684
+ "[",
1685
+ def.name ? `${def.name}: ` : "",
1686
+ formatted,
1687
+ "]",
1688
+ " "
1689
+ ] }, m.id);
1690
+ })
1691
+ ] }),
1692
+ !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1693
+ (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
1694
+ ({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
1695
+ ink.Text,
1696
+ {
1697
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1698
+ children: line
1699
+ },
1700
+ lineIdx
1701
+ )
1702
+ ) }, logIdx) : null
1703
+ ) })
1704
+ ]
1705
+ },
1706
+ item.evaluatorId
1707
+ ))
1708
+ ] }, tc.testCaseId)) }),
1481
1709
  phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
1482
1710
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
1483
1711
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, children: [
@@ -1524,7 +1752,8 @@ function RunView({
1524
1752
  name.padEnd(28),
1525
1753
  " avg=",
1526
1754
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
1527
- " passed=",
1755
+ " ",
1756
+ "passed=",
1528
1757
  agg.passed,
1529
1758
  " failed=",
1530
1759
  agg.failed
@@ -1533,28 +1762,38 @@ function RunView({
1533
1762
  ] }),
1534
1763
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
1535
1764
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
1536
- testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
1537
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1538
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1539
- " ",
1540
- tc.name.padEnd(24)
1541
- ] }),
1542
- tc.averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
1543
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(tc.averageScore), children: [
1544
- "score=",
1545
- tc.averageScore.toFixed(2)
1765
+ testCases.map((tc) => {
1766
+ const numericScores = tc.aggregatedEvaluatorScores.flatMap(
1767
+ (item) => item.scores.map((s) => toNumericScoreFromScores([s])).filter((n) => n !== void 0)
1768
+ );
1769
+ const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
1770
+ const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
1771
+ const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
1772
+ isAggregated: true
1773
+ }) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
1774
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
1775
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1776
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1777
+ " ",
1778
+ tc.name.padEnd(24)
1546
1779
  ] }),
1780
+ averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
1781
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(averageScore), children: [
1782
+ "score=",
1783
+ scoreLabel
1784
+ ] }),
1785
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1786
+ " ",
1787
+ createBar(averageScore, 100, 14)
1788
+ ] })
1789
+ ] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
1547
1790
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1548
- " ",
1549
- createBar(tc.averageScore, 100, 14)
1791
+ " (",
1792
+ tc.durationMs,
1793
+ "ms)"
1550
1794
  ] })
1551
- ] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
1552
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1553
- " (",
1554
- tc.durationMs,
1555
- "ms)"
1556
- ] })
1557
- ] }, i))
1795
+ ] }, tc.testCaseId);
1796
+ })
1558
1797
  ] }),
1559
1798
  /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1560
1799
  "artifact: ",
@@ -1565,6 +1804,51 @@ function RunView({
1565
1804
  }
1566
1805
 
1567
1806
  // src/cli-simple/run.ts
1807
/**
 * Collapse the recorded runs of every test case into one summary row.
 *
 * Each map value is `{ name, events }`, where every event carries `passed`,
 * `durationMs` and `evaluatorScores` (entries with `evaluatorId` and an
 * optional `scores` list of `{ id, ... }` items).
 *
 * Evaluators are collected from ALL events of a test case, not only the first
 * one, so a score that first shows up in a later rerun still contributes to
 * the summary. This matches how `aggregateEvaluatorScoresFromEvents` gathers
 * evaluator ids. Evaluators seen in the first event keep their position, so
 * the chosen `aggregatedScoreItem` is unchanged whenever every run reports
 * the same evaluators.
 *
 * @param {Map<unknown, {name: string, events: Array<object>}>} byId
 *   Recorded events grouped per test case.
 * @returns {Array<{name: string, averageScore: (number|undefined),
 *   aggregatedScoreItem: (object|undefined), isAggregated: boolean,
 *   durationMs: number, passed: boolean}>} One summary per map entry, in map
 *   iteration order.
 */
function buildTestCaseSummaries(byId) {
  const summaries = [];
  for (const { name, events } of byId.values()) {
    const passed = events.every((e) => e.passed);
    const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
    const isAggregated = events.length > 1;

    // Union of evaluator ids across every run; Set iteration follows
    // insertion order, so ids from the first run come first.
    const evaluatorIds = new Set(
      events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
    );

    const numericScores = [];
    let firstAggregatedScore;
    for (const evaluatorId of evaluatorIds) {
      // Group this evaluator's score items by score id across all runs.
      const scoreIdToItems = new Map();
      for (const ev of events) {
        const es = ev.evaluatorScores.find(
          (x) => x.evaluatorId === evaluatorId
        );
        for (const s of es?.scores ?? []) {
          const list = scoreIdToItems.get(s.id) ?? [];
          list.push(s);
          scoreIdToItems.set(s.id, list);
        }
      }
      for (const items of scoreIdToItems.values()) {
        const agg = aggregateScoreItems(items);
        if (!agg) {
          continue;
        }
        const n = toNumericScoreFromScores([agg]);
        if (n === undefined) {
          continue;
        }
        numericScores.push(n);
        // Remember the first aggregate that yields a numeric value; it is
        // exposed as `aggregatedScoreItem` on the summary.
        if (firstAggregatedScore === undefined) {
          firstAggregatedScore = agg;
        }
      }
    }

    const averageScore =
      numericScores.length > 0
        ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length
        : undefined;
    summaries.push({
      name,
      averageScore,
      aggregatedScoreItem: firstAggregatedScore,
      isAggregated,
      durationMs,
      passed
    });
  }
  return summaries;
}
1568
1852
  var ansi2 = {
1569
1853
  reset: "\x1B[0m",
1570
1854
  bold: "\x1B[1m",
@@ -1599,7 +1883,50 @@ function createBar2(value, max = 100, width = 20) {
1599
1883
  const filled = Math.round(safe / max * width);
1600
1884
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
1601
1885
  }
1602
- function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1886
/**
 * Merge the per-run evaluator results of one test case into a single entry
 * per evaluator.
 *
 * For every evaluator id seen in any event, score items and metric items are
 * grouped by their `id` and handed to `aggregateScoreItems` /
 * `aggregateMetricItems`. An evaluator counts as passed only when it is
 * present and `passed` in every event.
 *
 * @param {Array<{evaluatorScores: Array<object>}>} events Recorded runs.
 * @param {*} [evaluatorNameById] Accepted for signature compatibility; not
 *   read by this function.
 * @returns {Array<{evaluatorId: *, scores: Array<object>, passed: boolean,
 *   metrics: (Array<object>|undefined)}>} One entry per evaluator, in
 *   first-seen order; `metrics` is `undefined` when nothing was aggregated.
 */
function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
  if (events.length === 0) {
    return [];
  }

  // First-seen order of evaluator ids across all events.
  const seenIds = new Set();
  for (const run of events) {
    for (const entry of run.evaluatorScores) {
      seenIds.add(entry.evaluatorId);
    }
  }

  // Append `value` to the bucket stored under `key`, creating it on demand.
  const appendTo = (buckets, key, value) => {
    const bucket = buckets.get(key);
    if (bucket === undefined) {
      buckets.set(key, [value]);
    } else {
      bucket.push(value);
    }
  };

  return [...seenIds].map((evaluatorId) => {
    const scoresById = new Map();
    const metricsById = new Map();
    let passed = true;

    for (const run of events) {
      const match = run.evaluatorScores.find(
        (entry) => entry.evaluatorId === evaluatorId
      );
      for (const scoreItem of match?.scores ?? []) {
        appendTo(scoresById, scoreItem.id, scoreItem);
      }
      for (const metricItem of match?.metrics ?? []) {
        appendTo(metricsById, metricItem.id, metricItem);
      }
      // A run without this evaluator counts as a failure for it.
      if (!(match?.passed ?? false)) {
        passed = false;
      }
    }

    const scores = [];
    for (const items of scoresById.values()) {
      const combined = aggregateScoreItems(items);
      if (combined) {
        scores.push(combined);
      }
    }

    const metrics = [];
    for (const items of metricsById.values()) {
      const combined = aggregateMetricItems(items);
      if (combined !== undefined) {
        metrics.push(combined);
      }
    }

    return {
      evaluatorId,
      scores,
      passed,
      metrics: metrics.length > 0 ? metrics : undefined
    };
  });
}
1929
+ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
1603
1930
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
1604
1931
  const scoreParts = [];
1605
1932
  for (const item of scores) {
@@ -1611,7 +1938,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1611
1938
  );
1612
1939
  continue;
1613
1940
  }
1614
- const formatted = def.format(item.data);
1941
+ const formatted = def.format(item.data, options);
1615
1942
  switch (def.displayStrategy) {
1616
1943
  case "bar": {
1617
1944
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
@@ -1644,7 +1971,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1644
1971
  for (const { id, data } of metrics) {
1645
1972
  const def = getMetricById(id);
1646
1973
  if (def) {
1647
- const formatted = def.format(data);
1974
+ const formatted = def.format(data, options);
1648
1975
  metricParts.push(
1649
1976
  def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
1650
1977
  );
@@ -1677,7 +2004,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1677
2004
  evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
1678
2005
  );
1679
2006
  const aggregates = /* @__PURE__ */ new Map();
1680
- const testCaseSummaries = [];
2007
+ const testCaseByTestId = /* @__PURE__ */ new Map();
1681
2008
  let overallScoreTotal = 0;
1682
2009
  let overallScoreCount = 0;
1683
2010
  let completedCount = 0;
@@ -1691,6 +2018,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1691
2018
  }
1692
2019
  process.stdout.write("\r\x1B[2K");
1693
2020
  }
2021
/**
 * Write the ANSI "cursor up" sequence (`ESC [ n A`) to stdout.
 *
 * Nothing is written when stdout is not a TTY or when `n <= 0`.
 *
 * @param {number} n Number of lines to move up.
 */
function cursorUp(n) {
  // `!(n <= 0)` rather than `n > 0`: keeps the exact comparison semantics of
  // the guard for non-numeric input.
  if (process.stdout.isTTY && !(n <= 0)) {
    process.stdout.write(`\x1B[${n}A`);
  }
}
1694
2026
  function drawSpinner() {
1695
2027
  if (!process.stdout.isTTY || runFinished) {
1696
2028
  return;
@@ -1704,6 +2036,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1704
2036
  )} ${colorize("(live)", ansi2.dim)}`
1705
2037
  );
1706
2038
  }
2039
+ let lastPrintedTestCaseId = null;
2040
+ let lastPrintedLineCount = 0;
1707
2041
  let spinnerTimer;
1708
2042
  const done = new Promise((resolve5) => {
1709
2043
  const unsubscribe = runner.subscribeRunEvents((event) => {
@@ -1711,31 +2045,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1711
2045
  completedCount = event.completedTestCases;
1712
2046
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1713
2047
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
1714
- clearLine();
1715
- console.log(
1716
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi2.dim)}`
1717
- );
2048
+ const testCaseId = event.testCaseId;
2049
+ const existing = testCaseByTestId.get(testCaseId) ?? {
2050
+ name: event.testCaseName,
2051
+ events: []
2052
+ };
2053
+ existing.events.push({
2054
+ averageScore,
2055
+ passed: event.passed,
2056
+ durationMs: event.durationMs,
2057
+ evaluatorScores: event.evaluatorScores
2058
+ });
2059
+ testCaseByTestId.set(testCaseId, existing);
1718
2060
  for (const item of event.evaluatorScores) {
1719
- const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
1720
- console.log(
1721
- formatEvaluatorScoreLine(
1722
- name,
1723
- item.scores,
1724
- item.passed,
1725
- item.metrics
1726
- )
1727
- );
1728
- if (!item.passed && item.logs && item.logs.length > 0) {
1729
- for (const log of item.logs) {
1730
- if (log.type === "diff") {
1731
- const useColor = process.stdout.isTTY;
1732
- for (const { type, line } of getDiffLines(log)) {
1733
- const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
1734
- console.log(colored);
1735
- }
1736
- }
1737
- }
1738
- }
1739
2061
  const numeric = toNumericScoreFromScores(item.scores);
1740
2062
  if (numeric !== void 0) {
1741
2063
  const current = aggregates.get(item.evaluatorId) ?? {
@@ -1754,12 +2076,60 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1754
2076
  overallScoreCount += 1;
1755
2077
  }
1756
2078
  }
1757
- testCaseSummaries.push({
1758
- name: event.testCaseName,
1759
- averageScore,
1760
- durationMs: event.durationMs,
1761
- passed: event.passed
1762
- });
2079
+ const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2080
+ const isLastRerun = event.rerunIndex >= event.rerunTotal;
2081
+ const isNonTty = !process.stdout.isTTY;
2082
+ const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
2083
+ if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
2084
+ cursorUp(lastPrintedLineCount);
2085
+ }
2086
+ const aggregatedScores = aggregateEvaluatorScoresFromEvents(
2087
+ existing.events);
2088
+ const isAggregated = existing.events.length > 1;
2089
+ const durationMs = existing.events.reduce(
2090
+ (s, e) => s + e.durationMs,
2091
+ 0
2092
+ );
2093
+ existing.events.every((e) => e.passed);
2094
+ const lines = [];
2095
+ lines.push(
2096
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
2097
+ );
2098
+ for (const item of aggregatedScores) {
2099
+ const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2100
+ lines.push(
2101
+ formatEvaluatorScoreLine(
2102
+ name,
2103
+ item.scores,
2104
+ item.passed,
2105
+ item.metrics,
2106
+ { isAggregated }
2107
+ )
2108
+ );
2109
+ const lastEvent = existing.events[existing.events.length - 1];
2110
+ const lastEs = lastEvent?.evaluatorScores.find(
2111
+ (x) => x.evaluatorId === item.evaluatorId
2112
+ );
2113
+ if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
2114
+ for (const log of lastEs.logs) {
2115
+ if (log.type === "diff") {
2116
+ const useColor = process.stdout.isTTY;
2117
+ for (const { type, line } of getDiffLines(log)) {
2118
+ const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
2119
+ lines.push(colored);
2120
+ }
2121
+ }
2122
+ }
2123
+ }
2124
+ }
2125
+ if (!skipPrintNonTty) {
2126
+ for (let i = 0; i < lines.length; i++) {
2127
+ process.stdout.write(`\r\x1B[2K${lines[i]}
2128
+ `);
2129
+ }
2130
+ lastPrintedTestCaseId = testCaseId;
2131
+ lastPrintedLineCount = lines.length;
2132
+ }
1763
2133
  drawSpinner();
1764
2134
  }
1765
2135
  if (event.type === "RunCompleted" || event.type === "RunFailed") {
@@ -1823,6 +2193,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1823
2193
  getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
1824
2194
  );
1825
2195
  }
2196
+ const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
1826
2197
  if (testCaseSummaries.length > 0) {
1827
2198
  console.log(colorize("- test case scores:", ansi2.magenta));
1828
2199
  for (const summary of testCaseSummaries) {
@@ -1833,9 +2204,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1833
2204
  );
1834
2205
  continue;
1835
2206
  }
2207
+ const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2208
+ summary.aggregatedScoreItem.data,
2209
+ { isAggregated: true }
2210
+ ) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
1836
2211
  console.log(
1837
2212
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
1838
- summary.averageScore.toFixed(2),
2213
+ scoreLabel,
1839
2214
  scoreToColor(summary.averageScore)
1840
2215
  )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1841
2216
  );