@m4trix/evals 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ var path = require('path');
8
8
  var jitiModule = require('jiti');
9
9
  var promises = require('fs/promises');
10
10
  var url = require('url');
11
- var jsonDiff = require('json-diff');
11
+ var diff = require('diff');
12
12
  var React2 = require('react');
13
13
  var ink = require('ink');
14
14
  var jsxRuntime = require('react/jsx-runtime');
@@ -56,7 +56,8 @@ var defaultRunnerConfig = {
56
56
  ],
57
57
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
58
58
  },
59
- artifactDirectory: ".eval-results"
59
+ artifactDirectory: ".eval-results",
60
+ maxConcurrency: 1
60
61
  };
61
62
  function toRunnerConfigOverrides(config) {
62
63
  if (!config) {
@@ -89,6 +90,9 @@ function toRunnerConfigOverrides(config) {
89
90
  if (config.artifactDirectory !== void 0) {
90
91
  overrides.artifactDirectory = config.artifactDirectory;
91
92
  }
93
+ if (config.maxConcurrency !== void 0) {
94
+ overrides.maxConcurrency = config.maxConcurrency;
95
+ }
92
96
  if (Object.keys(discovery).length > 0) {
93
97
  overrides.discovery = discovery;
94
98
  }
@@ -282,8 +286,35 @@ async function collectTestCasesFromFiles(config) {
282
286
  );
283
287
  return found.flat();
284
288
  }
289
/**
 * Serialises a value to pretty-printed (2-space) JSON text so it can be
 * compared line by line.
 *
 * Always returns a string: when the value cannot be represented as JSON
 * (serialisation throws, or JSON.stringify yields `undefined`), the result of
 * `String(value)` is returned instead.
 *
 * @param {unknown} value - Value to serialise.
 * @returns {string} JSON text, or the plain string form as a fallback.
 */
function toJsonLines(value) {
  try {
    const json = JSON.stringify(value, null, 2);
    // JSON.stringify returns undefined (not a string) for undefined,
    // functions and symbols; callers split the result into lines.
    return json === undefined ? String(value) : json;
  } catch {
    // Circular structures and BigInt make JSON.stringify throw.
    return String(value);
  }
}
296
/**
 * Renders a list of line-diff parts as unified-style text.
 *
 * Each line of a part is emitted as "<marker> <line>", where the marker is
 * "+" for added parts, "-" for removed parts and a space otherwise. A single
 * trailing empty segment (produced by a value ending in a newline) is dropped.
 *
 * @param {Iterable<{value: string, added?: boolean, removed?: boolean}>} changes
 * @returns {string} The rendered lines joined with "\n".
 */
function formatDiffString(changes) {
  const rendered = [];
  for (const change of changes) {
    let marker = " ";
    if (change.added) {
      marker = "+";
    } else if (change.removed) {
      marker = "-";
    }
    const rows = change.value.split("\n");
    const lastIndex = rows.length - 1;
    if (rows[lastIndex] === "") {
      rows.length = lastIndex;
    }
    rows.forEach((row) => {
      rendered.push(`${marker} ${row}`);
    });
  }
  return rendered.join("\n");
}
310
/**
 * Produces a line-based textual diff between two values.
 *
 * Both values are serialised with toJsonLines and compared with
 * diff.diffLines; the parts are rendered by formatDiffString.
 *
 * Returns an empty string when both values serialise to the same text, so
 * callers can substitute their own "no differences" placeholder with `||`.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value to compare against the reference.
 * @returns {string} Rendered diff, or "" when there is nothing to report.
 */
function createDiffString(expected, actual) {
  const expectedStr = toJsonLines(expected);
  const actualStr = toJsonLines(actual);
  // Without this guard an all-context diff would be rendered, which is a
  // non-empty string even though nothing changed.
  if (expectedStr === actualStr) {
    return "";
  }
  const changes = diff.diffLines(expectedStr, actualStr);
  return formatDiffString(changes);
}
285
316
  function createDiffLogEntry(expected, actual, options) {
286
- const diff = jsonDiff.diffString(expected, actual, { color: false });
317
+ const diff = createDiffString(expected, actual);
287
318
  return {
288
319
  type: "diff",
289
320
  label: options?.label,
@@ -293,7 +324,7 @@ function createDiffLogEntry(expected, actual, options) {
293
324
  };
294
325
  }
295
326
  function getDiffLines(entry) {
296
- const raw = jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
327
+ const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
297
328
  return raw.split("\n").map((line) => {
298
329
  const trimmed = line.trimStart();
299
330
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -313,6 +344,7 @@ var Metric = {
313
344
  const def = {
314
345
  id: config.id,
315
346
  name: config.name,
347
+ aggregate: config.aggregate,
316
348
  format: config.format,
317
349
  make: (data) => ({ id: config.id, data })
318
350
  };
@@ -332,6 +364,7 @@ var Score = {
332
364
  id: config.id,
333
365
  name: config.name,
334
366
  displayStrategy: config.displayStrategy,
367
+ aggregate: config.aggregate,
335
368
  format: config.format,
336
369
  make: (data, options) => {
337
370
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
@@ -350,23 +383,75 @@ function getScoreById(id) {
350
383
  return registry2.get(id);
351
384
  }
352
385
 
386
+ // src/evals/aggregators.ts
387
/**
 * Aggregates numeric score payloads into their mean and sample standard
 * deviation.
 *
 * The variance is computed in two passes (sum of squared deviations from the
 * mean) rather than as `sumSq - n * mean^2`; the latter loses precision
 * through cancellation when values are large relative to their spread.
 *
 * @param {Array<{value: number}>} values - Score payloads to aggregate.
 * @returns {{value: number, stdDev?: number, count: number}} For an empty
 *   input `{ value: 0, count: 0 }`; otherwise the mean, the sample standard
 *   deviation (undefined for fewer than two values) and the number of values.
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  const mean = values.reduce((s, v) => s + v.value, 0) / count;
  let stdDev;
  if (count >= 2) {
    const squaredDeviations = values.reduce((s, v) => {
      const delta = v.value - mean;
      return s + delta * delta;
    }, 0);
    // Bessel's correction (n - 1): this is a sample, not a population.
    const variance = squaredDeviations / (count - 1);
    // A NaN variance falls through to 0, as it did before.
    stdDev = variance > 0 ? Math.sqrt(variance) : 0;
  }
  return { value: mean, stdDev, count };
}
401
/**
 * Aggregates pass/fail payloads: the aggregate passes only when there is at
 * least one payload and every payload passed.
 *
 * @param {Array<{passed: boolean}>} values - Pass/fail payloads.
 * @returns {{passed: boolean, passedCount: number, totalCount: number}}
 */
function aggregateAll(values) {
  const totalCount = values.length;
  let passedCount = 0;
  values.forEach((entry) => {
    if (entry.passed) {
      passedCount += 1;
    }
  });
  const passed = totalCount === 0 ? false : values.every((entry) => entry.passed);
  return { passed, passedCount, totalCount };
}
410
/**
 * Sums token-count payloads field by field. A missing (null/undefined) field
 * counts as 0.
 *
 * @param {Array<{input?: number, output?: number, inputCached?: number, outputCached?: number}>} values
 * @returns {{input: number, output: number, inputCached: number, outputCached: number}}
 */
function aggregateTokenCountSum(values) {
  let input = 0;
  let output = 0;
  let inputCached = 0;
  let outputCached = 0;
  values.forEach((usage) => {
    input += usage.input ?? 0;
    output += usage.output ?? 0;
    inputCached += usage.inputCached ?? 0;
    outputCached += usage.outputCached ?? 0;
  });
  return { input, output, inputCached, outputCached };
}
427
/**
 * Averages latency payloads.
 *
 * @param {Array<{ms: number}>} values - Latency payloads.
 * @returns {{ms: number}} The arithmetic mean, or `{ ms: 0 }` for no input.
 */
function aggregateLatencyAverage(values) {
  const sampleCount = values.length;
  if (sampleCount === 0) {
    return { ms: 0 };
  }
  let totalMs = 0;
  values.forEach((sample) => {
    totalMs += sample.ms;
  });
  return { ms: totalMs / sampleCount };
}
434
+
353
435
  // src/evals/metrics/standard.ts
354
436
  Metric.of({
355
437
  id: "token-count",
356
438
  name: "Tokens",
357
- format: (data) => {
439
+ aggregate: aggregateTokenCountSum,
440
+ format: (data, options) => {
358
441
  const input = data.input ?? 0;
359
442
  const output = data.output ?? 0;
360
443
  const inputCached = data.inputCached ?? 0;
361
444
  const outputCached = data.outputCached ?? 0;
362
445
  const cached = inputCached + outputCached;
363
- return `in:${input} out:${output} cached:${cached}`;
446
+ const base = `in:${input} out:${output} cached:${cached}`;
447
+ return options?.isAggregated ? `Total: ${base}` : base;
364
448
  }
365
449
  });
366
450
  Metric.of({
367
451
  id: "latency",
368
452
  name: "Latency",
369
- format: (data) => `${data.ms}ms`
453
+ aggregate: aggregateLatencyAverage,
454
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
370
455
  });
371
456
 
372
457
  // src/evals/scores/standard.ts
@@ -374,16 +459,50 @@ Score.of({
374
459
  id: "percent",
375
460
  name: "Score",
376
461
  displayStrategy: "bar",
377
- format: (data) => data.value.toFixed(2)
462
+ format: (data, options) => {
463
+ if (options?.isAggregated) {
464
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
465
+ }
466
+ return data.value.toFixed(2);
467
+ },
468
+ aggregate: aggregateAverageWithVariance
378
469
  });
379
470
  Score.of({
380
471
  id: "binary",
381
472
  name: "Result",
382
473
  displayStrategy: "passFail",
383
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
474
+ format: (data, options) => {
475
+ if (options?.isAggregated) {
476
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
477
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
478
+ return `${base} (${data.passedCount}/${data.totalCount})`;
479
+ }
480
+ return base;
481
+ }
482
+ return data.passed ? "PASSED" : "NOT PASSED";
483
+ },
484
+ aggregate: aggregateAll
384
485
  });
385
486
 
386
487
  // src/runner/score-utils.ts
488
/**
 * Collapses several score items that share one score id into a single item.
 *
 * The definition is looked up via getScoreById using the first item's id.
 * If it provides `aggregate`, that function receives every item's `data` and
 * the result replaces `data` on a copy of the first item. Without an
 * aggregator the last item is returned unchanged.
 *
 * @param {Array<{id: string, data: unknown}>} items - Items for one score id.
 * @returns {object|undefined} The combined item, or undefined for no items.
 */
function aggregateScoreItems(items) {
  if (items.length === 0) {
    return void 0;
  }
  const first = items[0];
  const definition = getScoreById(first.id);
  if (definition?.aggregate) {
    const payloads = items.map((item) => item.data);
    const combined = definition.aggregate(payloads);
    return { ...first, data: combined };
  }
  return items[items.length - 1];
}
497
/**
 * Collapses several metric items that share one metric id into a single item.
 *
 * The definition is looked up via getMetricById using the first item's id.
 * If it provides `aggregate`, that function receives every item's `data` and
 * the result replaces `data` on a copy of the first item. Without an
 * aggregator the last item is returned unchanged.
 *
 * @param {Array<{id: string, data: unknown}>} items - Items for one metric id.
 * @returns {object|undefined} The combined item, or undefined for no items.
 */
function aggregateMetricItems(items) {
  if (items.length === 0) {
    return void 0;
  }
  const first = items[0];
  const definition = getMetricById(first.id);
  if (definition?.aggregate) {
    const payloads = items.map((item) => item.data);
    const combined = definition.aggregate(payloads);
    return { ...first, data: combined };
  }
  return items[items.length - 1];
}
387
506
  function toNumericScoreFromScores(scores) {
388
507
  for (const item of scores) {
389
508
  const def = getScoreById(item.id);
@@ -462,6 +581,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
462
581
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
463
582
  );
464
583
  }
584
/**
 * Builds an Effect that evaluates a single test case (once per rerun) against
 * every evaluator of the task.
 *
 * For each rerun it publishes a "TestCaseProgress" event, offers the same
 * event to the persistence queue, and bumps the shared completed-evaluations
 * counter. After all reruns it bumps either the passed or the failed counter
 * (a test case passes only if every rerun passed) and mirrors both counters
 * into the run snapshot.
 *
 * @param task - Run task; reads runId, evaluators and snapshot.artifactPath.
 * @param testCaseItem - Item with `id` and `testCase`.
 * @param totalEvaluations - Total evaluations of the run, reported in events.
 * @param publishEvent - Function returning an Effect that publishes an event.
 * @param persistenceQueue - Effect Queue receiving events to persist.
 * @param updateSnapshot - `(runId, updater)` snapshot mutator.
 * @param completedRef - Effect Ref counting completed evaluations.
 * @param passedRef - Effect Ref counting passed test cases.
 * @param failedRef - Effect Ref counting failed test cases.
 * @returns An Effect performing the evaluation.
 */
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
  return effect.Effect.gen(function* () {
    const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
    const rerunPassed = [];
    for (let r = 0; r < reruns; r++) {
      const started = Date.now();
      const evaluatorScores = [];
      let testCaseError;
      const output = readOutput(testCaseItem.testCase);
      for (const { id: evaluatorId, evaluator } of task.evaluators) {
        const evaluateFn = evaluator.getEvaluateFn();
        if (!evaluateFn) {
          continue;
        }
        const logs = [];
        const logDiff = (expected, actual, options) => {
          logs.push(createDiffLogEntry(expected, actual, options));
        };
        // A try/catch around `yield* Effect.promise(...)` does not see promise
        // rejections: a failed effect short-circuits the generator instead of
        // throwing into it. The error handling therefore lives inside the
        // async thunk, so a throwing or rejecting evaluator is recorded as a
        // failed evaluator instead of aborting the whole run.
        const outcome = yield* effect.Effect.promise(async () => {
          try {
            const ctx = await evaluator.resolveContext();
            const result = await evaluateFn({
              input: testCaseItem.testCase.getInput(),
              ctx,
              output,
              logDiff
            });
            const { scores, metrics } = normalizeResult(result);
            const passed2 = computeEvaluatorPassed(evaluator, result, scores);
            return { ok: true, scores, metrics, passed: passed2 };
          } catch (error) {
            return { ok: false, error };
          }
        });
        if (outcome.ok) {
          evaluatorScores.push({
            evaluatorId,
            scores: outcome.scores,
            passed: outcome.passed,
            metrics: outcome.metrics,
            logs: logs.length > 0 ? logs : void 0
          });
        } else {
          testCaseError = outcome.error instanceof Error ? outcome.error.message : "Evaluator execution failed";
          evaluatorScores.push({
            evaluatorId,
            scores: [],
            passed: false
          });
        }
      }
      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
      rerunPassed.push(rerunPassedThis);
      // Increment and read in one step so concurrent test cases each observe
      // a distinct counter value.
      const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
        n + 1,
        n + 1
      ]);
      const progressEvent = {
        type: "TestCaseProgress",
        runId: task.runId,
        testCaseId: testCaseItem.id,
        testCaseName: testCaseItem.testCase.getName(),
        completedTestCases: completedEvaluations,
        totalTestCases: totalEvaluations,
        rerunIndex: r + 1,
        rerunTotal: reruns,
        passed: rerunPassedThis,
        durationMs: Date.now() - started,
        evaluatorScores,
        output,
        errorMessage: testCaseError
      };
      updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        completedTestCases: completedEvaluations
      }));
      yield* publishEvent(progressEvent);
      yield* effect.Queue.offer(persistenceQueue, {
        runId: task.runId,
        artifactPath: task.snapshot.artifactPath,
        payload: progressEvent
      });
    }
    const testCasePassed = rerunPassed.every(Boolean);
    if (testCasePassed) {
      yield* effect.Ref.update(passedRef, (n) => n + 1);
    } else {
      yield* effect.Ref.update(failedRef, (n) => n + 1);
    }
    const [passed, failed] = yield* effect.Effect.all([
      effect.Ref.get(passedRef),
      effect.Ref.get(failedRef)
    ]);
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
465
683
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
466
684
  const startedAt = Date.now();
467
685
  updateSnapshot(task.runId, (snapshot) => ({
@@ -474,104 +692,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
474
692
  runId: task.runId,
475
693
  startedAt
476
694
  });
477
- let completedTestCases = 0;
478
- let passedTestCases = 0;
479
- let failedTestCases = 0;
480
- for (const testCaseItem of task.testCases) {
481
- const started = Date.now();
482
- const evaluatorScores = [];
483
- let testCaseError;
484
- const output = readOutput(testCaseItem.testCase);
485
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
486
- const evaluateFn = evaluator.getEvaluateFn();
487
- if (!evaluateFn) {
488
- continue;
489
- }
490
- try {
491
- const logs = [];
492
- const logDiff = (expected, actual, options) => {
493
- logs.push(createDiffLogEntry(expected, actual, options));
494
- };
495
- const ctx = yield* effect.Effect.promise(
496
- () => Promise.resolve(evaluator.resolveContext())
497
- );
498
- const result = yield* effect.Effect.promise(
499
- () => Promise.resolve(
500
- evaluateFn({
501
- input: testCaseItem.testCase.getInput(),
502
- ctx,
503
- output,
504
- logDiff
505
- })
506
- )
507
- );
508
- const { scores, metrics } = normalizeResult(result);
509
- const passed = computeEvaluatorPassed(evaluator, result, scores);
510
- evaluatorScores.push({
511
- evaluatorId,
512
- scores,
513
- passed,
514
- metrics,
515
- logs: logs.length > 0 ? logs : void 0
516
- });
517
- } catch (error) {
518
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
519
- evaluatorScores.push({
520
- evaluatorId,
521
- scores: [],
522
- passed: false
523
- });
524
- }
525
- }
526
- const testCasePassed = evaluatorScores.every((s) => s.passed);
527
- completedTestCases += 1;
528
- if (testCasePassed) {
529
- passedTestCases += 1;
530
- } else {
531
- failedTestCases += 1;
532
- }
533
- const progressEvent = {
534
- type: "TestCaseProgress",
535
- runId: task.runId,
536
- testCaseId: testCaseItem.id,
537
- testCaseName: testCaseItem.testCase.getName(),
538
- completedTestCases,
539
- totalTestCases: task.testCases.length,
540
- passed: testCasePassed,
541
- durationMs: Date.now() - started,
542
- evaluatorScores,
543
- output,
544
- errorMessage: testCaseError
545
- };
546
- updateSnapshot(task.runId, (snapshot) => ({
547
- ...snapshot,
548
- completedTestCases,
549
- passedTestCases,
550
- failedTestCases
551
- }));
552
- yield* publishEvent(progressEvent);
553
- yield* effect.Queue.offer(persistenceQueue, {
554
- runId: task.runId,
555
- artifactPath: task.snapshot.artifactPath,
556
- payload: progressEvent
557
- });
558
- }
695
+ const totalEvaluations = task.testCases.reduce(
696
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
697
+ 0
698
+ );
699
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
700
+ const completedRef = yield* effect.Ref.make(0);
701
+ const passedRef = yield* effect.Ref.make(0);
702
+ const failedRef = yield* effect.Ref.make(0);
703
+ const processTestCase = (testCaseItem) => processOneTestCase(
704
+ task,
705
+ testCaseItem,
706
+ totalEvaluations,
707
+ publishEvent,
708
+ persistenceQueue,
709
+ updateSnapshot,
710
+ completedRef,
711
+ passedRef,
712
+ failedRef
713
+ );
714
+ yield* effect.Effect.forEach(
715
+ task.testCases,
716
+ processTestCase,
717
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
718
+ );
719
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
720
+ effect.Ref.get(completedRef),
721
+ effect.Ref.get(passedRef),
722
+ effect.Ref.get(failedRef)
723
+ ]);
559
724
  const finishedAt = Date.now();
560
725
  const completedEvent = {
561
726
  type: "RunCompleted",
562
727
  runId: task.runId,
563
728
  finishedAt,
564
- passedTestCases,
565
- failedTestCases,
729
+ passedTestCases: passedUniqueTestCases,
730
+ failedTestCases: failedUniqueTestCases,
566
731
  totalTestCases: task.testCases.length,
567
732
  artifactPath: task.snapshot.artifactPath
568
733
  };
569
734
  updateSnapshot(task.runId, (snapshot) => ({
570
735
  ...snapshot,
571
736
  status: "completed",
572
- completedTestCases,
573
- passedTestCases,
574
- failedTestCases,
737
+ completedTestCases: completedEvaluations,
738
+ passedTestCases: passedUniqueTestCases,
739
+ failedTestCases: failedUniqueTestCases,
575
740
  finishedAt
576
741
  }));
577
742
  yield* publishEvent(completedEvent);
@@ -659,7 +824,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
659
824
  const artifactPath = filePath;
660
825
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
661
826
  const progress = aggregateTestCaseProgress(lines);
662
- const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
827
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
663
828
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
664
829
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
665
830
  return {
@@ -681,23 +846,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
681
846
  }
682
847
  function aggregateTestCaseProgress(lines) {
683
848
  let completedTestCases = 0;
684
- let passedTestCases = 0;
685
- let failedTestCases = 0;
849
+ const testCasePassedBy = /* @__PURE__ */ new Map();
686
850
  for (const line of lines) {
687
851
  try {
688
852
  const event = JSON.parse(line);
689
853
  if (event.type === "TestCaseProgress") {
690
854
  const ev = event;
691
855
  completedTestCases = ev.completedTestCases ?? completedTestCases;
692
- if (ev.passed) {
693
- passedTestCases += 1;
694
- } else {
695
- failedTestCases += 1;
696
- }
856
+ const id = ev.testCaseId;
857
+ const current = testCasePassedBy.get(id);
858
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
697
859
  }
698
860
  } catch {
699
861
  }
700
862
  }
863
+ let passedTestCases = 0;
864
+ let failedTestCases = 0;
865
+ for (const passed of testCasePassedBy.values()) {
866
+ if (passed) {
867
+ passedTestCases += 1;
868
+ } else {
869
+ failedTestCases += 1;
870
+ }
871
+ }
701
872
  return { completedTestCases, passedTestCases, failedTestCases };
702
873
  }
703
874
  async function appendJsonLine(artifactPath, payload) {
@@ -892,6 +1063,10 @@ var EffectRunner = class {
892
1063
  throw new Error("No evaluators selected for run");
893
1064
  }
894
1065
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1066
+ const totalEvaluations = selectedTestCases.reduce(
1067
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1068
+ 0
1069
+ );
895
1070
  const runId = `run-${crypto.randomUUID()}`;
896
1071
  const artifactPath = createArtifactPath(
897
1072
  this.config.artifactDirectory,
@@ -904,7 +1079,7 @@ var EffectRunner = class {
904
1079
  datasetName: dataset.dataset.getName(),
905
1080
  evaluatorIds: selectedEvaluators.map((item) => item.id),
906
1081
  queuedAt: Date.now(),
907
- totalTestCases: selectedTestCases.length,
1082
+ totalTestCases: totalEvaluations,
908
1083
  completedTestCases: 0,
909
1084
  passedTestCases: 0,
910
1085
  failedTestCases: 0,
@@ -918,7 +1093,7 @@ var EffectRunner = class {
918
1093
  datasetId: request.datasetId,
919
1094
  datasetName: dataset.dataset.getName(),
920
1095
  evaluatorIds: selectedEvaluators.map((item) => item.id),
921
- totalTestCases: selectedTestCases.length,
1096
+ totalTestCases: totalEvaluations,
922
1097
  artifactPath
923
1098
  };
924
1099
  await effect.Effect.runPromise(this.publishEvent(queuedEvent));
@@ -929,6 +1104,7 @@ var EffectRunner = class {
929
1104
  payload: queuedEvent
930
1105
  })
931
1106
  );
1107
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
932
1108
  await effect.Effect.runPromise(
933
1109
  effect.Queue.offer(this.runQueue, {
934
1110
  runId,
@@ -936,7 +1112,8 @@ var EffectRunner = class {
936
1112
  dataset: dataset.dataset,
937
1113
  evaluators: selectedEvaluators,
938
1114
  testCases: selectedTestCases,
939
- snapshot
1115
+ snapshot,
1116
+ maxConcurrency
940
1117
  })
941
1118
  );
942
1119
  return snapshot;
@@ -1242,6 +1419,13 @@ function Spinner({ label = "Running" }) {
1242
1419
  label
1243
1420
  ] });
1244
1421
  }
1422
/**
 * Sample standard deviation derived from running totals.
 *
 * @param {number} sum - Sum of the observations.
 * @param {number} sumSq - Sum of the squared observations.
 * @param {number} n - Number of observations.
 * @returns {number|undefined} The standard deviation; undefined when fewer
 *   than two observations exist; 0 when the computed variance is not positive.
 */
function sampleStdDev(sum, sumSq, n) {
  if (n < 2) {
    return void 0;
  }
  const mean = sum / n;
  const variance = (sumSq - n * mean * mean) / (n - 1);
  if (variance > 0) {
    return Math.sqrt(variance);
  }
  return 0;
}
1245
1429
  function scoreColor(score) {
1246
1430
  if (score >= 80)
1247
1431
  return "green";
@@ -1254,13 +1438,62 @@ function createBar(value, max = 100, width = 20) {
1254
1438
  const filled = Math.round(safe / max * width);
1255
1439
  return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
1256
1440
  }
1257
- function formatScorePart(item, scoreToColor2) {
1441
/**
 * Merges the evaluator results of several events (reruns of one test case)
 * into one entry per evaluator.
 *
 * For every evaluator id seen in any event, score and metric items are
 * grouped by their id and reduced with aggregateScoreItems /
 * aggregateMetricItems. An evaluator passes only if it is present and passed
 * in every event. Logs are taken from the last event.
 *
 * @param {Array<{evaluatorScores: Array<object>}>} events - Per-rerun results.
 * @param {Map<string, string>} nameById - Evaluator id to display name.
 * @returns {Array<object>} One aggregated entry per evaluator, in first-seen order.
 */
function aggregateEvaluatorScores(events, nameById) {
  if (events.length === 0) {
    return [];
  }
  const entryFor = (ev, evaluatorId) => ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
  const addToGroup = (groups, item) => {
    const bucket = groups.get(item.id) ?? [];
    bucket.push(item);
    groups.set(item.id, bucket);
  };
  const evaluatorIds = new Set();
  for (const ev of events) {
    for (const entry of ev.evaluatorScores) {
      evaluatorIds.add(entry.evaluatorId);
    }
  }
  const lastEvent = events[events.length - 1];
  return Array.from(evaluatorIds, (evaluatorId) => {
    const scoresById = /* @__PURE__ */ new Map();
    const metricsById = /* @__PURE__ */ new Map();
    for (const ev of events) {
      const entry = entryFor(ev, evaluatorId);
      (entry?.scores ?? []).forEach((score) => addToGroup(scoresById, score));
      (entry?.metrics ?? []).forEach((metric) => addToGroup(metricsById, metric));
    }
    const scores = [];
    for (const group of scoresById.values()) {
      const combined = aggregateScoreItems(group);
      // Truthiness check: any falsy aggregate is dropped.
      if (combined) {
        scores.push(combined);
      }
    }
    const metrics = [];
    for (const group of metricsById.values()) {
      const combined = aggregateMetricItems(group);
      // Only undefined is dropped for metrics.
      if (combined !== void 0) {
        metrics.push(combined);
      }
    }
    const passed = events.every((ev) => entryFor(ev, evaluatorId)?.passed ?? false);
    const lastEntry = lastEvent?.evaluatorScores.find(
      (x) => x.evaluatorId === evaluatorId
    );
    return {
      evaluatorId,
      evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
      scores,
      passed,
      metrics: metrics.length > 0 ? metrics : void 0,
      logs: lastEntry?.logs
    };
  });
}
1490
+ function formatScorePart(item, scoreToColor2, options) {
1258
1491
  const def = getScoreById(item.id);
1259
1492
  if (!def) {
1260
1493
  const numeric = toNumericScore(item.data);
1261
1494
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
1262
1495
  }
1263
- const formatted = def.format(item.data);
1496
+ const formatted = def.format(item.data, options);
1264
1497
  if (def.displayStrategy === "bar") {
1265
1498
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1266
1499
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -1280,6 +1513,7 @@ function RunView({
1280
1513
  );
1281
1514
  const [runInfo, setRunInfo] = React2.useState(null);
1282
1515
  const [testCases, setTestCases] = React2.useState([]);
1516
+ const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
1283
1517
  const [summary, setSummary] = React2.useState(null);
1284
1518
  const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
1285
1519
  const runEval = React2.useCallback(async () => {
@@ -1306,48 +1540,44 @@ function RunView({
1306
1540
  return;
1307
1541
  }
1308
1542
  const nameById = new Map(
1309
- evaluators.map((item) => [
1310
- item.id,
1311
- item.evaluator.getName() ?? item.id
1312
- ])
1543
+ evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
1313
1544
  );
1314
1545
  setEvaluatorNameById(nameById);
1315
1546
  const aggregates = /* @__PURE__ */ new Map();
1316
1547
  let overallScoreTotal = 0;
1548
+ let overallScoreSumSq = 0;
1317
1549
  let overallScoreCount = 0;
1318
1550
  const done = new Promise((resolve5) => {
1319
1551
  const unsubscribe = runner.subscribeRunEvents((event) => {
1320
1552
  if (event.type === "TestCaseProgress") {
1321
1553
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1322
- const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1554
+ numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1323
1555
  for (const item of event.evaluatorScores) {
1324
1556
  const numeric = toNumericScoreFromScores(item.scores);
1325
1557
  if (numeric !== void 0) {
1326
1558
  const current = aggregates.get(item.evaluatorId) ?? {
1327
1559
  total: 0,
1560
+ sumSq: 0,
1328
1561
  count: 0,
1329
1562
  passed: 0,
1330
1563
  failed: 0
1331
1564
  };
1332
1565
  aggregates.set(item.evaluatorId, {
1333
1566
  total: current.total + numeric,
1567
+ sumSq: current.sumSq + numeric * numeric,
1334
1568
  count: current.count + 1,
1335
1569
  passed: current.passed + (item.passed ? 1 : 0),
1336
1570
  failed: current.failed + (item.passed ? 0 : 1)
1337
1571
  });
1338
1572
  overallScoreTotal += numeric;
1573
+ overallScoreSumSq += numeric * numeric;
1339
1574
  overallScoreCount += 1;
1340
1575
  }
1341
1576
  }
1342
- setTestCases((prev) => [
1343
- ...prev,
1344
- {
1345
- name: event.testCaseName,
1346
- completedTestCases: event.completedTestCases,
1347
- totalTestCases: event.totalTestCases,
1348
- durationMs: event.durationMs,
1349
- passed: event.passed,
1350
- averageScore,
1577
+ setTestCases((prev) => {
1578
+ const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
1579
+ const existing = byId.get(event.testCaseId);
1580
+ const newEvent = {
1351
1581
  evaluatorScores: event.evaluatorScores.map((item) => ({
1352
1582
  evaluatorId: item.evaluatorId,
1353
1583
  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
@@ -1355,9 +1585,33 @@ function RunView({
1355
1585
  passed: item.passed,
1356
1586
  metrics: item.metrics,
1357
1587
  logs: item.logs
1358
- }))
1359
- }
1360
- ]);
1588
+ })),
1589
+ passed: event.passed,
1590
+ durationMs: event.durationMs
1591
+ };
1592
+ const events = existing ? [...existing.events, newEvent] : [newEvent];
1593
+ const isAggregated = events.length > 1;
1594
+ const aggregatedEvaluatorScores = aggregateEvaluatorScores(
1595
+ events,
1596
+ nameById
1597
+ );
1598
+ const merged = {
1599
+ name: event.testCaseName,
1600
+ testCaseId: event.testCaseId,
1601
+ completedTestCases: event.completedTestCases,
1602
+ totalTestCases: event.totalTestCases,
1603
+ rerunIndex: event.rerunIndex,
1604
+ rerunTotal: event.rerunTotal,
1605
+ durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1606
+ passed: events.every((e) => e.passed),
1607
+ events,
1608
+ aggregatedEvaluatorScores,
1609
+ isAggregated
1610
+ };
1611
+ byId.set(event.testCaseId, merged);
1612
+ setCompletedEvaluations(event.completedTestCases);
1613
+ return Array.from(byId.values());
1614
+ });
1361
1615
  }
1362
1616
  if (event.type === "RunCompleted" || event.type === "RunFailed") {
1363
1617
  unsubscribe();
@@ -1372,9 +1626,7 @@ function RunView({
1372
1626
  setRunInfo({
1373
1627
  runId: snapshot.runId,
1374
1628
  datasetName: snapshot.datasetName,
1375
- evaluatorNames: evaluators.map(
1376
- (e) => e.evaluator.getName() ?? e.id
1377
- ),
1629
+ evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
1378
1630
  totalTestCases: snapshot.totalTestCases
1379
1631
  });
1380
1632
  setPhase("running");
@@ -1388,6 +1640,7 @@ function RunView({
1388
1640
  failedTestCases: finalEvent.failedTestCases,
1389
1641
  totalTestCases: finalEvent.totalTestCases,
1390
1642
  overallScoreTotal,
1643
+ overallScoreSumSq,
1391
1644
  overallScoreCount,
1392
1645
  aggregates: new Map(aggregates),
1393
1646
  artifactPath: finalEvent.artifactPath
@@ -1402,29 +1655,41 @@ function RunView({
1402
1655
  /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
1403
1656
  runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
1404
1657
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1405
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run " }),
1658
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1659
+ "Run",
1660
+ " "
1661
+ ] }),
1406
1662
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
1407
1663
  ] }),
1408
1664
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1409
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Dataset " }),
1665
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1666
+ "Dataset",
1667
+ " "
1668
+ ] }),
1410
1669
  runInfo.datasetName
1411
1670
  ] }),
1412
1671
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1413
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Evaluators " }),
1672
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1673
+ "Evaluators",
1674
+ " "
1675
+ ] }),
1414
1676
  runInfo.evaluatorNames.join(", ")
1415
1677
  ] }),
1416
1678
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1417
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Test cases " }),
1679
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1680
+ "Test cases",
1681
+ " "
1682
+ ] }),
1418
1683
  runInfo.totalTestCases
1419
1684
  ] })
1420
1685
  ] }),
1421
1686
  phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
1422
1687
  Spinner,
1423
1688
  {
1424
- label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
1689
+ label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
1425
1690
  }
1426
1691
  ) }),
1427
- testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
1692
+ testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
1428
1693
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1429
1694
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
1430
1695
  "[",
@@ -1435,49 +1700,78 @@ function RunView({
1435
1700
  ] }),
1436
1701
  " ",
1437
1702
  tc.name,
1703
+ " ",
1704
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
1705
+ "(",
1706
+ tc.rerunIndex,
1707
+ "/",
1708
+ tc.rerunTotal,
1709
+ ")"
1710
+ ] }),
1438
1711
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1439
1712
  " (",
1440
1713
  tc.durationMs,
1441
1714
  "ms)"
1442
1715
  ] })
1443
1716
  ] }),
1444
- tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
1445
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1446
- item.evaluatorName,
1447
- ":",
1448
- " ",
1449
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1450
- " ",
1451
- item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
1452
- formatScorePart(s),
1453
- " "
1454
- ] }, s.id)),
1455
- item.metrics?.map((m) => {
1456
- const def = getMetricById(m.id);
1457
- if (!def)
1458
- return null;
1459
- const formatted = def.format(m.data);
1460
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1461
- "[",
1462
- def.name ? `${def.name}: ` : "",
1463
- formatted,
1464
- "]",
1465
- " "
1466
- ] }, m.id);
1467
- })
1468
- ] }),
1469
- !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1470
- (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
1471
- ink.Text,
1472
- {
1473
- color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1474
- children: line
1475
- },
1476
- lineIdx
1477
- )) }, logIdx) : null
1478
- ) })
1479
- ] }, item.evaluatorId))
1480
- ] }, i)) }),
1717
+ tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
1718
+ ink.Box,
1719
+ {
1720
+ flexDirection: "column",
1721
+ marginLeft: 2,
1722
+ children: [
1723
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1724
+ item.evaluatorName,
1725
+ ":",
1726
+ " ",
1727
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1728
+ " ",
1729
+ item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(
1730
+ ink.Text,
1731
+ {
1732
+ color: scoreColor(toNumericScore(s.data) ?? 0),
1733
+ children: [
1734
+ formatScorePart(s, scoreColor, {
1735
+ isAggregated: tc.isAggregated
1736
+ }),
1737
+ " "
1738
+ ]
1739
+ },
1740
+ s.id
1741
+ )),
1742
+ item.metrics?.map((m) => {
1743
+ const def = getMetricById(m.id);
1744
+ if (!def)
1745
+ return null;
1746
+ const formatted = def.format(m.data, {
1747
+ isAggregated: tc.isAggregated
1748
+ });
1749
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1750
+ "[",
1751
+ def.name ? `${def.name}: ` : "",
1752
+ formatted,
1753
+ "]",
1754
+ " "
1755
+ ] }, m.id);
1756
+ })
1757
+ ] }),
1758
+ !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1759
+ (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
1760
+ ({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
1761
+ ink.Text,
1762
+ {
1763
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1764
+ children: line
1765
+ },
1766
+ lineIdx
1767
+ )
1768
+ ) }, logIdx) : null
1769
+ ) })
1770
+ ]
1771
+ },
1772
+ item.evaluatorId
1773
+ ))
1774
+ ] }, tc.testCaseId)) }),
1481
1775
  phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
1482
1776
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
1483
1777
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, children: [
@@ -1504,7 +1798,14 @@ function RunView({
1504
1798
  label: "overall avg",
1505
1799
  value: summary.overallScoreTotal / summary.overallScoreCount,
1506
1800
  barWidth: 20,
1507
- format: (v) => v.toFixed(2)
1801
+ format: (v) => {
1802
+ const sd = sampleStdDev(
1803
+ summary.overallScoreTotal,
1804
+ summary.overallScoreSumSq,
1805
+ summary.overallScoreCount
1806
+ );
1807
+ return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
1808
+ }
1508
1809
  }
1509
1810
  ) }),
1510
1811
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
@@ -1519,12 +1820,15 @@ function RunView({
1519
1820
  ] }, id);
1520
1821
  }
1521
1822
  const mean = agg.total / agg.count;
1823
+ const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
1824
+ const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1522
1825
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1523
1826
  "- ",
1524
1827
  name.padEnd(28),
1525
1828
  " avg=",
1526
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
1527
- " passed=",
1829
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: meanStr }),
1830
+ " ",
1831
+ "passed=",
1528
1832
  agg.passed,
1529
1833
  " failed=",
1530
1834
  agg.failed
@@ -1533,28 +1837,41 @@ function RunView({
1533
1837
  ] }),
1534
1838
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
1535
1839
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
1536
- testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
1537
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1538
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1539
- " ",
1540
- tc.name.padEnd(24)
1541
- ] }),
1542
- tc.averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
1543
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(tc.averageScore), children: [
1544
- "score=",
1545
- tc.averageScore.toFixed(2)
1840
+ testCases.map((tc) => {
1841
+ const allScores = tc.events.flatMap(
1842
+ (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
1843
+ );
1844
+ const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
1845
+ const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
1846
+ const total = allScores.reduce((a, b) => a + b, 0);
1847
+ const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
1848
+ const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
1849
+ const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
1850
+ isAggregated: true
1851
+ }) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
1852
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
1853
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1854
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1855
+ " ",
1856
+ tc.name.padEnd(24)
1546
1857
  ] }),
1858
+ averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
1859
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(averageScore), children: [
1860
+ "score=",
1861
+ scoreLabel
1862
+ ] }),
1863
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1864
+ " ",
1865
+ createBar(averageScore, 100, 14)
1866
+ ] })
1867
+ ] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
1547
1868
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1548
- " ",
1549
- createBar(tc.averageScore, 100, 14)
1869
+ " (",
1870
+ tc.durationMs,
1871
+ "ms)"
1550
1872
  ] })
1551
- ] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
1552
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1553
- " (",
1554
- tc.durationMs,
1555
- "ms)"
1556
- ] })
1557
- ] }, i))
1873
+ ] }, tc.testCaseId);
1874
+ })
1558
1875
  ] }),
1559
1876
  /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1560
1877
  "artifact: ",
@@ -1565,6 +1882,61 @@ function RunView({
1565
1882
  }
1566
1883
 
1567
1884
  // src/cli-simple/run.ts
1885
/**
 * Sample (n - 1 denominator) standard deviation computed from running totals.
 *
 * @param {number} sum - Sum of the observed values.
 * @param {number} sumSq - Sum of the squares of the observed values.
 * @param {number} n - Number of observed values.
 * @returns {number | undefined} The standard deviation, or `undefined` when
 *   fewer than two values were observed.
 */
function sampleStdDev2(sum, sumSq, n) {
  if (n < 2) {
    return undefined;
  }
  const average = sum / n;
  const unbiasedVariance = (sumSq - n * average * average) / (n - 1);
  // Rounding error can push the variance of near-constant data slightly below
  // zero; anything that is not strictly positive is reported as 0.
  if (unbiasedVariance > 0) {
    return Math.sqrt(unbiasedVariance);
  }
  return 0;
}
1892
/**
 * Collapses the events recorded for each test case into one summary row.
 *
 * @param {Map<unknown, {name: string, events: Array<object>}>} byId - Entries
 *   holding a test case name and its recorded events. Each event is read for
 *   `passed`, `durationMs` and `evaluatorScores`.
 * @returns {Array<object>} One summary per entry, in the map's iteration
 *   order, with `name`, `averageScore`, `stdDev`, `aggregatedScoreItem`,
 *   `isAggregated`, `durationMs` and `passed`.
 */
function buildTestCaseSummaries(byId) {
  // Walks the evaluators of the first event in order and returns the first
  // truthy result of aggregateScoreItems, or undefined when there is none.
  const pickFirstAggregatedScore = (events) => {
    for (const reference of events[0]?.evaluatorScores ?? []) {
      const itemsByScoreId = /* @__PURE__ */ new Map();
      for (const event of events) {
        const match = event.evaluatorScores.find(
          (candidate) => candidate.evaluatorId === reference.evaluatorId
        );
        for (const score of match?.scores ?? []) {
          const bucket = itemsByScoreId.get(score.id) ?? [];
          bucket.push(score);
          itemsByScoreId.set(score.id, bucket);
        }
      }
      for (const items of itemsByScoreId.values()) {
        const aggregated = aggregateScoreItems(items);
        if (aggregated) {
          return aggregated;
        }
      }
    }
    return undefined;
  };

  return Array.from(byId.values(), ({ name, events }) => {
    const passed = events.every((event) => event.passed);

    let durationMs = 0;
    for (const event of events) {
      durationMs = durationMs + event.durationMs;
    }

    // Every numeric score produced by any evaluator in any event of this case.
    const numericScores = [];
    for (const event of events) {
      for (const evaluatorScore of event.evaluatorScores) {
        const numeric = toNumericScoreFromScores(evaluatorScore.scores);
        if (numeric !== undefined) {
          numericScores.push(numeric);
        }
      }
    }

    let total = 0;
    let sumSq = 0;
    for (const score of numericScores) {
      total = total + score;
      sumSq = sumSq + score * score;
    }
    const averageScore =
      numericScores.length > 0 ? total / numericScores.length : undefined;
    const stdDev = sampleStdDev2(total, sumSq, numericScores.length);
    const aggregatedScoreItem = pickFirstAggregatedScore(events);

    return {
      name,
      averageScore,
      stdDev: stdDev ?? undefined,
      aggregatedScoreItem,
      // More than one event for the same test case means it was re-run.
      isAggregated: events.length > 1,
      durationMs,
      passed
    };
  });
}
1568
1940
  var ansi2 = {
1569
1941
  reset: "\x1B[0m",
1570
1942
  bold: "\x1B[1m",
@@ -1592,14 +1964,59 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
1592
1964
  return `- ${evaluatorName.padEnd(28)} no numeric scores`;
1593
1965
  }
1594
1966
  const mean = aggregate.total / aggregate.count;
1595
- return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
1967
+ const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
1968
+ const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1969
+ return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
1596
1970
  }
1597
1971
/**
 * Renders a fixed-width text progress bar made of filled and empty block
 * characters.
 *
 * @param {number} value - Value to display; clamped to the range [0, max].
 * @param {number} [max=100] - Value that corresponds to a completely filled bar.
 * @param {number} [width=20] - Total number of cells in the bar.
 * @returns {string} A string of `width` cells, filled in proportion to value / max.
 */
function createBar2(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  const ratio = safe / max;
  // A NaN value or a zero max gives a non-finite ratio. Treat it as "nothing
  // filled" so the bar keeps its full width instead of collapsing to "".
  const filled = Number.isFinite(ratio) ? Math.round(ratio * width) : 0;
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
}
1602
- function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1976
/**
 * Merges the per-event evaluator results of one test case into a single entry
 * per evaluator.
 *
 * @param {Array<object>} events - Recorded events; each is read for its
 *   `evaluatorScores` array (`evaluatorId`, `scores`, `metrics`, `passed`).
 * @param {unknown} evaluatorNameById - Not used by this function.
 * @returns {Array<object>} One `{ evaluatorId, scores, passed, metrics }`
 *   entry per distinct evaluator id, in order of first appearance. `passed`
 *   is true only when the evaluator passed in every event; `metrics` is
 *   `undefined` when no aggregated metric was produced.
 */
function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
  if (events.length === 0) {
    return [];
  }

  // Distinct evaluator ids, in the order they first appear across the events.
  const evaluatorIds = /* @__PURE__ */ new Set();
  for (const event of events) {
    for (const entry of event.evaluatorScores) {
      evaluatorIds.add(entry.evaluatorId);
    }
  }

  // Appends `item` to the list stored under `key`, creating it when missing.
  const appendTo = (groups, key, item) => {
    const bucket = groups.get(key) ?? [];
    bucket.push(item);
    groups.set(key, bucket);
  };

  return Array.from(evaluatorIds, (evaluatorId) => {
    const itemsByScoreId = /* @__PURE__ */ new Map();
    const itemsByMetricId = /* @__PURE__ */ new Map();
    let passed = true;

    for (const event of events) {
      const entry = event.evaluatorScores.find(
        (candidate) => candidate.evaluatorId === evaluatorId
      );
      for (const score of entry?.scores ?? []) {
        appendTo(itemsByScoreId, score.id, score);
      }
      for (const metric of entry?.metrics ?? []) {
        appendTo(itemsByMetricId, metric.id, metric);
      }
      // An event without a result for this evaluator counts as a failure.
      if (!(entry?.passed ?? false)) {
        passed = false;
      }
    }

    const scores = [];
    for (const items of itemsByScoreId.values()) {
      const aggregated = aggregateScoreItems(items);
      if (aggregated) {
        scores.push(aggregated);
      }
    }

    const metrics = [];
    for (const items of itemsByMetricId.values()) {
      const aggregated = aggregateMetricItems(items);
      if (aggregated !== undefined) {
        metrics.push(aggregated);
      }
    }

    return {
      evaluatorId,
      scores,
      passed,
      metrics: metrics.length > 0 ? metrics : undefined
    };
  });
}
2019
+ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
1603
2020
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
1604
2021
  const scoreParts = [];
1605
2022
  for (const item of scores) {
@@ -1611,7 +2028,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1611
2028
  );
1612
2029
  continue;
1613
2030
  }
1614
- const formatted = def.format(item.data);
2031
+ const formatted = def.format(item.data, options);
1615
2032
  switch (def.displayStrategy) {
1616
2033
  case "bar": {
1617
2034
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
@@ -1644,7 +2061,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1644
2061
  for (const { id, data } of metrics) {
1645
2062
  const def = getMetricById(id);
1646
2063
  if (def) {
1647
- const formatted = def.format(data);
2064
+ const formatted = def.format(data, options);
1648
2065
  metricParts.push(
1649
2066
  def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
1650
2067
  );
@@ -1677,8 +2094,9 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1677
2094
  evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
1678
2095
  );
1679
2096
  const aggregates = /* @__PURE__ */ new Map();
1680
- const testCaseSummaries = [];
2097
+ const testCaseByTestId = /* @__PURE__ */ new Map();
1681
2098
  let overallScoreTotal = 0;
2099
+ let overallScoreSumSq = 0;
1682
2100
  let overallScoreCount = 0;
1683
2101
  let completedCount = 0;
1684
2102
  let totalCount = 0;
@@ -1691,6 +2109,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1691
2109
  }
1692
2110
  process.stdout.write("\r\x1B[2K");
1693
2111
  }
2112
/**
 * Moves the terminal cursor up by `n` lines by writing an ANSI escape
 * sequence to stdout. Does nothing when stdout is not a TTY or `n <= 0`.
 *
 * @param {number} n - Number of lines to move up.
 */
function cursorUp(n) {
  const shouldMove = process.stdout.isTTY && !(n <= 0);
  if (shouldMove) {
    process.stdout.write(`\x1B[${n}A`);
  }
}
1694
2117
  function drawSpinner() {
1695
2118
  if (!process.stdout.isTTY || runFinished) {
1696
2119
  return;
@@ -1704,6 +2127,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1704
2127
  )} ${colorize("(live)", ansi2.dim)}`
1705
2128
  );
1706
2129
  }
2130
+ let lastPrintedTestCaseId = null;
2131
+ let lastPrintedLineCount = 0;
1707
2132
  let spinnerTimer;
1708
2133
  const done = new Promise((resolve5) => {
1709
2134
  const unsubscribe = runner.subscribeRunEvents((event) => {
@@ -1711,55 +2136,94 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1711
2136
  completedCount = event.completedTestCases;
1712
2137
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1713
2138
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
1714
- clearLine();
1715
- console.log(
1716
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi2.dim)}`
1717
- );
2139
+ const testCaseId = event.testCaseId;
2140
+ const existing = testCaseByTestId.get(testCaseId) ?? {
2141
+ name: event.testCaseName,
2142
+ events: []
2143
+ };
2144
+ existing.events.push({
2145
+ averageScore,
2146
+ passed: event.passed,
2147
+ durationMs: event.durationMs,
2148
+ evaluatorScores: event.evaluatorScores
2149
+ });
2150
+ testCaseByTestId.set(testCaseId, existing);
1718
2151
  for (const item of event.evaluatorScores) {
1719
- const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
1720
- console.log(
1721
- formatEvaluatorScoreLine(
1722
- name,
1723
- item.scores,
1724
- item.passed,
1725
- item.metrics
1726
- )
1727
- );
1728
- if (!item.passed && item.logs && item.logs.length > 0) {
1729
- for (const log of item.logs) {
1730
- if (log.type === "diff") {
1731
- const useColor = process.stdout.isTTY;
1732
- for (const { type, line } of getDiffLines(log)) {
1733
- const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
1734
- console.log(colored);
1735
- }
1736
- }
1737
- }
1738
- }
1739
2152
  const numeric = toNumericScoreFromScores(item.scores);
1740
2153
  if (numeric !== void 0) {
1741
2154
  const current = aggregates.get(item.evaluatorId) ?? {
1742
2155
  total: 0,
2156
+ sumSq: 0,
1743
2157
  count: 0,
1744
2158
  passed: 0,
1745
2159
  failed: 0
1746
2160
  };
1747
2161
  aggregates.set(item.evaluatorId, {
1748
2162
  total: current.total + numeric,
2163
+ sumSq: current.sumSq + numeric * numeric,
1749
2164
  count: current.count + 1,
1750
2165
  passed: current.passed + (item.passed ? 1 : 0),
1751
2166
  failed: current.failed + (item.passed ? 0 : 1)
1752
2167
  });
1753
2168
  overallScoreTotal += numeric;
2169
+ overallScoreSumSq += numeric * numeric;
1754
2170
  overallScoreCount += 1;
1755
2171
  }
1756
2172
  }
1757
- testCaseSummaries.push({
1758
- name: event.testCaseName,
1759
- averageScore,
1760
- durationMs: event.durationMs,
1761
- passed: event.passed
1762
- });
2173
+ const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2174
+ const isLastRerun = event.rerunIndex >= event.rerunTotal;
2175
+ const isNonTty = !process.stdout.isTTY;
2176
+ const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
2177
+ if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
2178
+ cursorUp(lastPrintedLineCount);
2179
+ }
2180
+ const aggregatedScores = aggregateEvaluatorScoresFromEvents(
2181
+ existing.events);
2182
+ const isAggregated = existing.events.length > 1;
2183
+ const durationMs = existing.events.reduce(
2184
+ (s, e) => s + e.durationMs,
2185
+ 0
2186
+ );
2187
+ existing.events.every((e) => e.passed);
2188
+ const lines = [];
2189
+ lines.push(
2190
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
2191
+ );
2192
+ for (const item of aggregatedScores) {
2193
+ const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2194
+ lines.push(
2195
+ formatEvaluatorScoreLine(
2196
+ name,
2197
+ item.scores,
2198
+ item.passed,
2199
+ item.metrics,
2200
+ { isAggregated }
2201
+ )
2202
+ );
2203
+ const lastEvent = existing.events[existing.events.length - 1];
2204
+ const lastEs = lastEvent?.evaluatorScores.find(
2205
+ (x) => x.evaluatorId === item.evaluatorId
2206
+ );
2207
+ if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
2208
+ for (const log of lastEs.logs) {
2209
+ if (log.type === "diff") {
2210
+ const useColor = process.stdout.isTTY;
2211
+ for (const { type, line } of getDiffLines(log)) {
2212
+ const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
2213
+ lines.push(colored);
2214
+ }
2215
+ }
2216
+ }
2217
+ }
2218
+ }
2219
+ if (!skipPrintNonTty) {
2220
+ for (let i = 0; i < lines.length; i++) {
2221
+ process.stdout.write(`\r\x1B[2K${lines[i]}
2222
+ `);
2223
+ }
2224
+ lastPrintedTestCaseId = testCaseId;
2225
+ lastPrintedLineCount = lines.length;
2226
+ }
1763
2227
  drawSpinner();
1764
2228
  }
1765
2229
  if (event.type === "RunCompleted" || event.type === "RunFailed") {
@@ -1810,9 +2274,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1810
2274
  );
1811
2275
  if (overallScoreCount > 0) {
1812
2276
  const overallAverage = overallScoreTotal / overallScoreCount;
2277
+ const overallSd = sampleStdDev2(
2278
+ overallScoreTotal,
2279
+ overallScoreSumSq,
2280
+ overallScoreCount
2281
+ );
2282
+ const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
1813
2283
  console.log(
1814
2284
  `- overall avg score: ${colorize(
1815
- overallAverage.toFixed(2),
2285
+ avgStr,
1816
2286
  scoreToColor(overallAverage)
1817
2287
  )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
1818
2288
  );
@@ -1823,6 +2293,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1823
2293
  getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
1824
2294
  );
1825
2295
  }
2296
+ const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
1826
2297
  if (testCaseSummaries.length > 0) {
1827
2298
  console.log(colorize("- test case scores:", ansi2.magenta));
1828
2299
  for (const summary of testCaseSummaries) {
@@ -1833,9 +2304,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1833
2304
  );
1834
2305
  continue;
1835
2306
  }
2307
+ const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2308
+ summary.aggregatedScoreItem.data,
2309
+ { isAggregated: true }
2310
+ ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
1836
2311
  console.log(
1837
2312
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
1838
- summary.averageScore.toFixed(2),
2313
+ scoreLabel,
1839
2314
  scoreToColor(summary.averageScore)
1840
2315
  )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1841
2316
  );