@m4trix/evals 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env node
2
2
  import { randomUUID } from 'crypto';
3
- import { Effect, PubSub, Queue, Fiber } from 'effect';
3
+ import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
4
4
  import { existsSync } from 'fs';
5
5
  import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
@@ -30,7 +30,8 @@ var defaultRunnerConfig = {
30
30
  ],
31
31
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
32
32
  },
33
- artifactDirectory: ".eval-results"
33
+ artifactDirectory: ".eval-results",
34
+ maxConcurrency: 1
34
35
  };
35
36
  function toRunnerConfigOverrides(config) {
36
37
  if (!config) {
@@ -63,6 +64,9 @@ function toRunnerConfigOverrides(config) {
63
64
  if (config.artifactDirectory !== void 0) {
64
65
  overrides.artifactDirectory = config.artifactDirectory;
65
66
  }
67
+ if (config.maxConcurrency !== void 0) {
68
+ overrides.maxConcurrency = config.maxConcurrency;
69
+ }
66
70
  if (Object.keys(discovery).length > 0) {
67
71
  overrides.discovery = discovery;
68
72
  }
@@ -287,6 +291,7 @@ var Metric = {
287
291
  const def = {
288
292
  id: config.id,
289
293
  name: config.name,
294
+ aggregate: config.aggregate,
290
295
  format: config.format,
291
296
  make: (data) => ({ id: config.id, data })
292
297
  };
@@ -306,6 +311,7 @@ var Score = {
306
311
  id: config.id,
307
312
  name: config.name,
308
313
  displayStrategy: config.displayStrategy,
314
+ aggregate: config.aggregate,
309
315
  format: config.format,
310
316
  make: (data, options) => {
311
317
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
@@ -324,23 +330,62 @@ function getScoreById(id) {
324
330
  return registry2.get(id);
325
331
  }
326
332
 
333
+ // src/evals/aggregators.ts
334
+ function aggregateAverage(values) {
335
+ if (values.length === 0) {
336
+ return { value: 0 };
337
+ }
338
+ const sum = values.reduce((s, v) => s + v.value, 0);
339
+ return { value: sum / values.length };
340
+ }
341
+ function aggregateAll(values) {
342
+ return { passed: values.length > 0 && values.every((v) => v.passed) };
343
+ }
344
+ function aggregateTokenCountSum(values) {
345
+ const initial = {
346
+ input: 0,
347
+ output: 0,
348
+ inputCached: 0,
349
+ outputCached: 0
350
+ };
351
+ return values.reduce(
352
+ (acc, v) => ({
353
+ input: acc.input + (v.input ?? 0),
354
+ output: acc.output + (v.output ?? 0),
355
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
356
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
357
+ }),
358
+ initial
359
+ );
360
+ }
361
+ function aggregateLatencyAverage(values) {
362
+ if (values.length === 0) {
363
+ return { ms: 0 };
364
+ }
365
+ const sum = values.reduce((s, v) => s + v.ms, 0);
366
+ return { ms: sum / values.length };
367
+ }
368
+
327
369
  // src/evals/metrics/standard.ts
328
370
  Metric.of({
329
371
  id: "token-count",
330
372
  name: "Tokens",
331
- format: (data) => {
373
+ aggregate: aggregateTokenCountSum,
374
+ format: (data, options) => {
332
375
  const input = data.input ?? 0;
333
376
  const output = data.output ?? 0;
334
377
  const inputCached = data.inputCached ?? 0;
335
378
  const outputCached = data.outputCached ?? 0;
336
379
  const cached = inputCached + outputCached;
337
- return `in:${input} out:${output} cached:${cached}`;
380
+ const base = `in:${input} out:${output} cached:${cached}`;
381
+ return options?.isAggregated ? `Total: ${base}` : base;
338
382
  }
339
383
  });
340
384
  Metric.of({
341
385
  id: "latency",
342
386
  name: "Latency",
343
- format: (data) => `${data.ms}ms`
387
+ aggregate: aggregateLatencyAverage,
388
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
344
389
  });
345
390
 
346
391
  // src/evals/scores/standard.ts
@@ -348,16 +393,36 @@ Score.of({
348
393
  id: "percent",
349
394
  name: "Score",
350
395
  displayStrategy: "bar",
351
- format: (data) => data.value.toFixed(2)
396
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
397
+ aggregate: aggregateAverage
352
398
  });
353
399
  Score.of({
354
400
  id: "binary",
355
401
  name: "Result",
356
402
  displayStrategy: "passFail",
357
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
403
+ format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
404
+ aggregate: aggregateAll
358
405
  });
359
406
 
360
407
  // src/runner/score-utils.ts
408
+ function aggregateScoreItems(items) {
409
+ if (items.length === 0)
410
+ return void 0;
411
+ const def = getScoreById(items[0].id);
412
+ if (!def?.aggregate)
413
+ return items[items.length - 1];
414
+ const aggregated = def.aggregate(items.map((i) => i.data));
415
+ return { ...items[0], data: aggregated };
416
+ }
417
+ function aggregateMetricItems(items) {
418
+ if (items.length === 0)
419
+ return void 0;
420
+ const def = getMetricById(items[0].id);
421
+ if (!def?.aggregate)
422
+ return items[items.length - 1];
423
+ const aggregated = def.aggregate(items.map((i) => i.data));
424
+ return { ...items[0], data: aggregated };
425
+ }
361
426
  function toNumericScoreFromScores(scores) {
362
427
  for (const item of scores) {
363
428
  const def = getScoreById(item.id);
@@ -436,6 +501,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
436
501
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
437
502
  );
438
503
  }
504
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
505
+ return Effect.gen(function* () {
506
+ const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
507
+ const rerunPassed = [];
508
+ for (let r = 0; r < reruns; r++) {
509
+ const started = Date.now();
510
+ const evaluatorScores = [];
511
+ let testCaseError;
512
+ const output = readOutput(testCaseItem.testCase);
513
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
514
+ const evaluateFn = evaluator.getEvaluateFn();
515
+ if (!evaluateFn) {
516
+ continue;
517
+ }
518
+ try {
519
+ const logs = [];
520
+ const logDiff = (expected, actual, options) => {
521
+ logs.push(createDiffLogEntry(expected, actual, options));
522
+ };
523
+ const ctx = yield* Effect.promise(
524
+ () => Promise.resolve(evaluator.resolveContext())
525
+ );
526
+ const result = yield* Effect.promise(
527
+ () => Promise.resolve(
528
+ evaluateFn({
529
+ input: testCaseItem.testCase.getInput(),
530
+ ctx,
531
+ output,
532
+ logDiff
533
+ })
534
+ )
535
+ );
536
+ const { scores, metrics } = normalizeResult(result);
537
+ const passed2 = computeEvaluatorPassed(evaluator, result, scores);
538
+ evaluatorScores.push({
539
+ evaluatorId,
540
+ scores,
541
+ passed: passed2,
542
+ metrics,
543
+ logs: logs.length > 0 ? logs : void 0
544
+ });
545
+ } catch (error) {
546
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
547
+ evaluatorScores.push({
548
+ evaluatorId,
549
+ scores: [],
550
+ passed: false
551
+ });
552
+ }
553
+ }
554
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
555
+ rerunPassed.push(rerunPassedThis);
556
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
557
+ n + 1,
558
+ n + 1
559
+ ]);
560
+ const progressEvent = {
561
+ type: "TestCaseProgress",
562
+ runId: task.runId,
563
+ testCaseId: testCaseItem.id,
564
+ testCaseName: testCaseItem.testCase.getName(),
565
+ completedTestCases: completedEvaluations,
566
+ totalTestCases: totalEvaluations,
567
+ rerunIndex: r + 1,
568
+ rerunTotal: reruns,
569
+ passed: rerunPassedThis,
570
+ durationMs: Date.now() - started,
571
+ evaluatorScores,
572
+ output,
573
+ errorMessage: testCaseError
574
+ };
575
+ updateSnapshot(task.runId, (snapshot) => ({
576
+ ...snapshot,
577
+ completedTestCases: completedEvaluations
578
+ }));
579
+ yield* publishEvent(progressEvent);
580
+ yield* Queue.offer(persistenceQueue, {
581
+ runId: task.runId,
582
+ artifactPath: task.snapshot.artifactPath,
583
+ payload: progressEvent
584
+ });
585
+ }
586
+ const testCasePassed = rerunPassed.every(Boolean);
587
+ if (testCasePassed) {
588
+ yield* Ref.update(passedRef, (n) => n + 1);
589
+ } else {
590
+ yield* Ref.update(failedRef, (n) => n + 1);
591
+ }
592
+ const [passed, failed] = yield* Effect.all([
593
+ Ref.get(passedRef),
594
+ Ref.get(failedRef)
595
+ ]);
596
+ updateSnapshot(task.runId, (snapshot) => ({
597
+ ...snapshot,
598
+ passedTestCases: passed,
599
+ failedTestCases: failed
600
+ }));
601
+ });
602
+ }
439
603
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
440
604
  const startedAt = Date.now();
441
605
  updateSnapshot(task.runId, (snapshot) => ({
@@ -448,104 +612,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
448
612
  runId: task.runId,
449
613
  startedAt
450
614
  });
451
- let completedTestCases = 0;
452
- let passedTestCases = 0;
453
- let failedTestCases = 0;
454
- for (const testCaseItem of task.testCases) {
455
- const started = Date.now();
456
- const evaluatorScores = [];
457
- let testCaseError;
458
- const output = readOutput(testCaseItem.testCase);
459
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
460
- const evaluateFn = evaluator.getEvaluateFn();
461
- if (!evaluateFn) {
462
- continue;
463
- }
464
- try {
465
- const logs = [];
466
- const logDiff = (expected, actual, options) => {
467
- logs.push(createDiffLogEntry(expected, actual, options));
468
- };
469
- const ctx = yield* Effect.promise(
470
- () => Promise.resolve(evaluator.resolveContext())
471
- );
472
- const result = yield* Effect.promise(
473
- () => Promise.resolve(
474
- evaluateFn({
475
- input: testCaseItem.testCase.getInput(),
476
- ctx,
477
- output,
478
- logDiff
479
- })
480
- )
481
- );
482
- const { scores, metrics } = normalizeResult(result);
483
- const passed = computeEvaluatorPassed(evaluator, result, scores);
484
- evaluatorScores.push({
485
- evaluatorId,
486
- scores,
487
- passed,
488
- metrics,
489
- logs: logs.length > 0 ? logs : void 0
490
- });
491
- } catch (error) {
492
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
493
- evaluatorScores.push({
494
- evaluatorId,
495
- scores: [],
496
- passed: false
497
- });
498
- }
499
- }
500
- const testCasePassed = evaluatorScores.every((s) => s.passed);
501
- completedTestCases += 1;
502
- if (testCasePassed) {
503
- passedTestCases += 1;
504
- } else {
505
- failedTestCases += 1;
506
- }
507
- const progressEvent = {
508
- type: "TestCaseProgress",
509
- runId: task.runId,
510
- testCaseId: testCaseItem.id,
511
- testCaseName: testCaseItem.testCase.getName(),
512
- completedTestCases,
513
- totalTestCases: task.testCases.length,
514
- passed: testCasePassed,
515
- durationMs: Date.now() - started,
516
- evaluatorScores,
517
- output,
518
- errorMessage: testCaseError
519
- };
520
- updateSnapshot(task.runId, (snapshot) => ({
521
- ...snapshot,
522
- completedTestCases,
523
- passedTestCases,
524
- failedTestCases
525
- }));
526
- yield* publishEvent(progressEvent);
527
- yield* Queue.offer(persistenceQueue, {
528
- runId: task.runId,
529
- artifactPath: task.snapshot.artifactPath,
530
- payload: progressEvent
531
- });
532
- }
615
+ const totalEvaluations = task.testCases.reduce(
616
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
617
+ 0
618
+ );
619
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
620
+ const completedRef = yield* Ref.make(0);
621
+ const passedRef = yield* Ref.make(0);
622
+ const failedRef = yield* Ref.make(0);
623
+ const processTestCase = (testCaseItem) => processOneTestCase(
624
+ task,
625
+ testCaseItem,
626
+ totalEvaluations,
627
+ publishEvent,
628
+ persistenceQueue,
629
+ updateSnapshot,
630
+ completedRef,
631
+ passedRef,
632
+ failedRef
633
+ );
634
+ yield* Effect.forEach(
635
+ task.testCases,
636
+ processTestCase,
637
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
638
+ );
639
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
640
+ Ref.get(completedRef),
641
+ Ref.get(passedRef),
642
+ Ref.get(failedRef)
643
+ ]);
533
644
  const finishedAt = Date.now();
534
645
  const completedEvent = {
535
646
  type: "RunCompleted",
536
647
  runId: task.runId,
537
648
  finishedAt,
538
- passedTestCases,
539
- failedTestCases,
649
+ passedTestCases: passedUniqueTestCases,
650
+ failedTestCases: failedUniqueTestCases,
540
651
  totalTestCases: task.testCases.length,
541
652
  artifactPath: task.snapshot.artifactPath
542
653
  };
543
654
  updateSnapshot(task.runId, (snapshot) => ({
544
655
  ...snapshot,
545
656
  status: "completed",
546
- completedTestCases,
547
- passedTestCases,
548
- failedTestCases,
657
+ completedTestCases: completedEvaluations,
658
+ passedTestCases: passedUniqueTestCases,
659
+ failedTestCases: failedUniqueTestCases,
549
660
  finishedAt
550
661
  }));
551
662
  yield* publishEvent(completedEvent);
@@ -633,7 +744,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
633
744
  const artifactPath = filePath;
634
745
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
635
746
  const progress = aggregateTestCaseProgress(lines);
636
- const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
747
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
637
748
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
638
749
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
639
750
  return {
@@ -655,23 +766,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
655
766
  }
656
767
  function aggregateTestCaseProgress(lines) {
657
768
  let completedTestCases = 0;
658
- let passedTestCases = 0;
659
- let failedTestCases = 0;
769
+ const testCasePassedBy = /* @__PURE__ */ new Map();
660
770
  for (const line of lines) {
661
771
  try {
662
772
  const event = JSON.parse(line);
663
773
  if (event.type === "TestCaseProgress") {
664
774
  const ev = event;
665
775
  completedTestCases = ev.completedTestCases ?? completedTestCases;
666
- if (ev.passed) {
667
- passedTestCases += 1;
668
- } else {
669
- failedTestCases += 1;
670
- }
776
+ const id = ev.testCaseId;
777
+ const current = testCasePassedBy.get(id);
778
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
671
779
  }
672
780
  } catch {
673
781
  }
674
782
  }
783
+ let passedTestCases = 0;
784
+ let failedTestCases = 0;
785
+ for (const passed of testCasePassedBy.values()) {
786
+ if (passed) {
787
+ passedTestCases += 1;
788
+ } else {
789
+ failedTestCases += 1;
790
+ }
791
+ }
675
792
  return { completedTestCases, passedTestCases, failedTestCases };
676
793
  }
677
794
  async function appendJsonLine(artifactPath, payload) {
@@ -866,6 +983,10 @@ var EffectRunner = class {
866
983
  throw new Error("No evaluators selected for run");
867
984
  }
868
985
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
986
+ const totalEvaluations = selectedTestCases.reduce(
987
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
988
+ 0
989
+ );
869
990
  const runId = `run-${randomUUID()}`;
870
991
  const artifactPath = createArtifactPath(
871
992
  this.config.artifactDirectory,
@@ -878,7 +999,7 @@ var EffectRunner = class {
878
999
  datasetName: dataset.dataset.getName(),
879
1000
  evaluatorIds: selectedEvaluators.map((item) => item.id),
880
1001
  queuedAt: Date.now(),
881
- totalTestCases: selectedTestCases.length,
1002
+ totalTestCases: totalEvaluations,
882
1003
  completedTestCases: 0,
883
1004
  passedTestCases: 0,
884
1005
  failedTestCases: 0,
@@ -892,7 +1013,7 @@ var EffectRunner = class {
892
1013
  datasetId: request.datasetId,
893
1014
  datasetName: dataset.dataset.getName(),
894
1015
  evaluatorIds: selectedEvaluators.map((item) => item.id),
895
- totalTestCases: selectedTestCases.length,
1016
+ totalTestCases: totalEvaluations,
896
1017
  artifactPath
897
1018
  };
898
1019
  await Effect.runPromise(this.publishEvent(queuedEvent));
@@ -903,6 +1024,7 @@ var EffectRunner = class {
903
1024
  payload: queuedEvent
904
1025
  })
905
1026
  );
1027
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
906
1028
  await Effect.runPromise(
907
1029
  Queue.offer(this.runQueue, {
908
1030
  runId,
@@ -910,7 +1032,8 @@ var EffectRunner = class {
910
1032
  dataset: dataset.dataset,
911
1033
  evaluators: selectedEvaluators,
912
1034
  testCases: selectedTestCases,
913
- snapshot
1035
+ snapshot,
1036
+ maxConcurrency
914
1037
  })
915
1038
  );
916
1039
  return snapshot;
@@ -1228,13 +1351,62 @@ function createBar(value, max = 100, width = 20) {
1228
1351
  const filled = Math.round(safe / max * width);
1229
1352
  return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
1230
1353
  }
1231
- function formatScorePart(item, scoreToColor2) {
1354
+ function aggregateEvaluatorScores(events, nameById) {
1355
+ if (events.length === 0)
1356
+ return [];
1357
+ const evaluatorIds = new Set(
1358
+ events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
1359
+ );
1360
+ const result = [];
1361
+ for (const evaluatorId of evaluatorIds) {
1362
+ const scoreIdToItems = /* @__PURE__ */ new Map();
1363
+ const metricIdToItems = /* @__PURE__ */ new Map();
1364
+ for (const ev of events) {
1365
+ const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1366
+ for (const s of es?.scores ?? []) {
1367
+ const list = scoreIdToItems.get(s.id) ?? [];
1368
+ list.push(s);
1369
+ scoreIdToItems.set(s.id, list);
1370
+ }
1371
+ for (const m of es?.metrics ?? []) {
1372
+ const list = metricIdToItems.get(m.id) ?? [];
1373
+ list.push(m);
1374
+ metricIdToItems.set(m.id, list);
1375
+ }
1376
+ }
1377
+ const aggregatedScores = [];
1378
+ for (const items of scoreIdToItems.values()) {
1379
+ const agg = aggregateScoreItems(items);
1380
+ if (agg)
1381
+ aggregatedScores.push(agg);
1382
+ }
1383
+ const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([id, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1384
+ const passed = events.every((ev) => {
1385
+ const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1386
+ return es?.passed ?? false;
1387
+ });
1388
+ const lastEvent = events[events.length - 1];
1389
+ const lastEs = lastEvent?.evaluatorScores.find(
1390
+ (x) => x.evaluatorId === evaluatorId
1391
+ );
1392
+ result.push({
1393
+ evaluatorId,
1394
+ evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
1395
+ scores: aggregatedScores,
1396
+ passed,
1397
+ metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0,
1398
+ logs: lastEs?.logs
1399
+ });
1400
+ }
1401
+ return result;
1402
+ }
1403
+ function formatScorePart(item, scoreToColor2, options) {
1232
1404
  const def = getScoreById(item.id);
1233
1405
  if (!def) {
1234
1406
  const numeric = toNumericScore(item.data);
1235
1407
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
1236
1408
  }
1237
- const formatted = def.format(item.data);
1409
+ const formatted = def.format(item.data, options);
1238
1410
  if (def.displayStrategy === "bar") {
1239
1411
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1240
1412
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -1254,6 +1426,7 @@ function RunView({
1254
1426
  );
1255
1427
  const [runInfo, setRunInfo] = useState(null);
1256
1428
  const [testCases, setTestCases] = useState([]);
1429
+ const [completedEvaluations, setCompletedEvaluations] = useState(0);
1257
1430
  const [summary, setSummary] = useState(null);
1258
1431
  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
1259
1432
  const runEval = useCallback(async () => {
@@ -1280,10 +1453,7 @@ function RunView({
1280
1453
  return;
1281
1454
  }
1282
1455
  const nameById = new Map(
1283
- evaluators.map((item) => [
1284
- item.id,
1285
- item.evaluator.getName() ?? item.id
1286
- ])
1456
+ evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
1287
1457
  );
1288
1458
  setEvaluatorNameById(nameById);
1289
1459
  const aggregates = /* @__PURE__ */ new Map();
@@ -1293,7 +1463,7 @@ function RunView({
1293
1463
  const unsubscribe = runner.subscribeRunEvents((event) => {
1294
1464
  if (event.type === "TestCaseProgress") {
1295
1465
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1296
- const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1466
+ numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1297
1467
  for (const item of event.evaluatorScores) {
1298
1468
  const numeric = toNumericScoreFromScores(item.scores);
1299
1469
  if (numeric !== void 0) {
@@ -1313,15 +1483,10 @@ function RunView({
1313
1483
  overallScoreCount += 1;
1314
1484
  }
1315
1485
  }
1316
- setTestCases((prev) => [
1317
- ...prev,
1318
- {
1319
- name: event.testCaseName,
1320
- completedTestCases: event.completedTestCases,
1321
- totalTestCases: event.totalTestCases,
1322
- durationMs: event.durationMs,
1323
- passed: event.passed,
1324
- averageScore,
1486
+ setTestCases((prev) => {
1487
+ const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
1488
+ const existing = byId.get(event.testCaseId);
1489
+ const newEvent = {
1325
1490
  evaluatorScores: event.evaluatorScores.map((item) => ({
1326
1491
  evaluatorId: item.evaluatorId,
1327
1492
  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
@@ -1329,9 +1494,33 @@ function RunView({
1329
1494
  passed: item.passed,
1330
1495
  metrics: item.metrics,
1331
1496
  logs: item.logs
1332
- }))
1333
- }
1334
- ]);
1497
+ })),
1498
+ passed: event.passed,
1499
+ durationMs: event.durationMs
1500
+ };
1501
+ const events = existing ? [...existing.events, newEvent] : [newEvent];
1502
+ const isAggregated = events.length > 1;
1503
+ const aggregatedEvaluatorScores = aggregateEvaluatorScores(
1504
+ events,
1505
+ nameById
1506
+ );
1507
+ const merged = {
1508
+ name: event.testCaseName,
1509
+ testCaseId: event.testCaseId,
1510
+ completedTestCases: event.completedTestCases,
1511
+ totalTestCases: event.totalTestCases,
1512
+ rerunIndex: event.rerunIndex,
1513
+ rerunTotal: event.rerunTotal,
1514
+ durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1515
+ passed: events.every((e) => e.passed),
1516
+ events,
1517
+ aggregatedEvaluatorScores,
1518
+ isAggregated
1519
+ };
1520
+ byId.set(event.testCaseId, merged);
1521
+ setCompletedEvaluations(event.completedTestCases);
1522
+ return Array.from(byId.values());
1523
+ });
1335
1524
  }
1336
1525
  if (event.type === "RunCompleted" || event.type === "RunFailed") {
1337
1526
  unsubscribe();
@@ -1346,9 +1535,7 @@ function RunView({
1346
1535
  setRunInfo({
1347
1536
  runId: snapshot.runId,
1348
1537
  datasetName: snapshot.datasetName,
1349
- evaluatorNames: evaluators.map(
1350
- (e) => e.evaluator.getName() ?? e.id
1351
- ),
1538
+ evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
1352
1539
  totalTestCases: snapshot.totalTestCases
1353
1540
  });
1354
1541
  setPhase("running");
@@ -1376,29 +1563,41 @@ function RunView({
1376
1563
  /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
1377
1564
  runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
1378
1565
  /* @__PURE__ */ jsxs(Text, { children: [
1379
- /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run " }),
1566
+ /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
1567
+ "Run",
1568
+ " "
1569
+ ] }),
1380
1570
  /* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
1381
1571
  ] }),
1382
1572
  /* @__PURE__ */ jsxs(Text, { children: [
1383
- /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Dataset " }),
1573
+ /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
1574
+ "Dataset",
1575
+ " "
1576
+ ] }),
1384
1577
  runInfo.datasetName
1385
1578
  ] }),
1386
1579
  /* @__PURE__ */ jsxs(Text, { children: [
1387
- /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Evaluators " }),
1580
+ /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
1581
+ "Evaluators",
1582
+ " "
1583
+ ] }),
1388
1584
  runInfo.evaluatorNames.join(", ")
1389
1585
  ] }),
1390
1586
  /* @__PURE__ */ jsxs(Text, { children: [
1391
- /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Test cases " }),
1587
+ /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
1588
+ "Test cases",
1589
+ " "
1590
+ ] }),
1392
1591
  runInfo.totalTestCases
1393
1592
  ] })
1394
1593
  ] }),
1395
1594
  phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
1396
1595
  Spinner,
1397
1596
  {
1398
- label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
1597
+ label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
1399
1598
  }
1400
1599
  ) }),
1401
- testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
1600
+ testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
1402
1601
  /* @__PURE__ */ jsxs(Text, { children: [
1403
1602
  /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
1404
1603
  "[",
@@ -1409,49 +1608,78 @@ function RunView({
1409
1608
  ] }),
1410
1609
  " ",
1411
1610
  tc.name,
1611
+ " ",
1612
+ /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
1613
+ "(",
1614
+ tc.rerunIndex,
1615
+ "/",
1616
+ tc.rerunTotal,
1617
+ ")"
1618
+ ] }),
1412
1619
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1413
1620
  " (",
1414
1621
  tc.durationMs,
1415
1622
  "ms)"
1416
1623
  ] })
1417
1624
  ] }),
1418
- tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [
1419
- /* @__PURE__ */ jsxs(Text, { children: [
1420
- item.evaluatorName,
1421
- ":",
1422
- " ",
1423
- /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1424
- " ",
1425
- item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
1426
- formatScorePart(s),
1427
- " "
1428
- ] }, s.id)),
1429
- item.metrics?.map((m) => {
1430
- const def = getMetricById(m.id);
1431
- if (!def)
1432
- return null;
1433
- const formatted = def.format(m.data);
1434
- return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1435
- "[",
1436
- def.name ? `${def.name}: ` : "",
1437
- formatted,
1438
- "]",
1439
- " "
1440
- ] }, m.id);
1441
- })
1442
- ] }),
1443
- !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1444
- (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
1445
- Text,
1446
- {
1447
- color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1448
- children: line
1449
- },
1450
- lineIdx
1451
- )) }, logIdx) : null
1452
- ) })
1453
- ] }, item.evaluatorId))
1454
- ] }, i)) }),
1625
+ tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
1626
+ Box,
1627
+ {
1628
+ flexDirection: "column",
1629
+ marginLeft: 2,
1630
+ children: [
1631
+ /* @__PURE__ */ jsxs(Text, { children: [
1632
+ item.evaluatorName,
1633
+ ":",
1634
+ " ",
1635
+ /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1636
+ " ",
1637
+ item.scores.map((s) => /* @__PURE__ */ jsxs(
1638
+ Text,
1639
+ {
1640
+ color: scoreColor(toNumericScore(s.data) ?? 0),
1641
+ children: [
1642
+ formatScorePart(s, scoreColor, {
1643
+ isAggregated: tc.isAggregated
1644
+ }),
1645
+ " "
1646
+ ]
1647
+ },
1648
+ s.id
1649
+ )),
1650
+ item.metrics?.map((m) => {
1651
+ const def = getMetricById(m.id);
1652
+ if (!def)
1653
+ return null;
1654
+ const formatted = def.format(m.data, {
1655
+ isAggregated: tc.isAggregated
1656
+ });
1657
+ return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1658
+ "[",
1659
+ def.name ? `${def.name}: ` : "",
1660
+ formatted,
1661
+ "]",
1662
+ " "
1663
+ ] }, m.id);
1664
+ })
1665
+ ] }),
1666
+ !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1667
+ (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
1668
+ ({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
1669
+ Text,
1670
+ {
1671
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1672
+ children: line
1673
+ },
1674
+ lineIdx
1675
+ )
1676
+ ) }, logIdx) : null
1677
+ ) })
1678
+ ]
1679
+ },
1680
+ item.evaluatorId
1681
+ ))
1682
+ ] }, tc.testCaseId)) }),
1455
1683
  phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
1456
1684
  /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
1457
1685
  /* @__PURE__ */ jsxs(Box, { marginTop: 1, children: [
@@ -1498,7 +1726,8 @@ function RunView({
1498
1726
  name.padEnd(28),
1499
1727
  " avg=",
1500
1728
  /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
1501
- " passed=",
1729
+ " ",
1730
+ "passed=",
1502
1731
  agg.passed,
1503
1732
  " failed=",
1504
1733
  agg.failed
@@ -1507,28 +1736,38 @@ function RunView({
1507
1736
  ] }),
1508
1737
  /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
1509
1738
  /* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
1510
- testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { children: [
1511
- /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1512
- /* @__PURE__ */ jsxs(Text, { children: [
1513
- " ",
1514
- tc.name.padEnd(24)
1515
- ] }),
1516
- tc.averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
1517
- /* @__PURE__ */ jsxs(Text, { color: scoreColor(tc.averageScore), children: [
1518
- "score=",
1519
- tc.averageScore.toFixed(2)
1739
+ testCases.map((tc) => {
1740
+ const numericScores = tc.aggregatedEvaluatorScores.flatMap(
1741
+ (item) => item.scores.map((s) => toNumericScoreFromScores([s])).filter((n) => n !== void 0)
1742
+ );
1743
+ const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
1744
+ const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
1745
+ const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
1746
+ isAggregated: true
1747
+ }) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
1748
+ return /* @__PURE__ */ jsxs(Box, { children: [
1749
+ /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1750
+ /* @__PURE__ */ jsxs(Text, { children: [
1751
+ " ",
1752
+ tc.name.padEnd(24)
1520
1753
  ] }),
1754
+ averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
1755
+ /* @__PURE__ */ jsxs(Text, { color: scoreColor(averageScore), children: [
1756
+ "score=",
1757
+ scoreLabel
1758
+ ] }),
1759
+ /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1760
+ " ",
1761
+ createBar(averageScore, 100, 14)
1762
+ ] })
1763
+ ] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
1521
1764
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1522
- " ",
1523
- createBar(tc.averageScore, 100, 14)
1765
+ " (",
1766
+ tc.durationMs,
1767
+ "ms)"
1524
1768
  ] })
1525
- ] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
1526
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1527
- " (",
1528
- tc.durationMs,
1529
- "ms)"
1530
- ] })
1531
- ] }, i))
1769
+ ] }, tc.testCaseId);
1770
+ })
1532
1771
  ] }),
1533
1772
  /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1534
1773
  "artifact: ",
@@ -1539,6 +1778,51 @@ function RunView({
1539
1778
  }
1540
1779
 
1541
1780
  // src/cli-simple/run.ts
1781
+ function buildTestCaseSummaries(byId) {
1782
+ const summaries = [];
1783
+ for (const { name, events } of byId.values()) {
1784
+ const passed = events.every((e) => e.passed);
1785
+ const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
1786
+ const isAggregated = events.length > 1;
1787
+ const numericScores = [];
1788
+ let firstAggregatedScore;
1789
+ for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
1790
+ const scoreIdToItems = /* @__PURE__ */ new Map();
1791
+ for (const ev of events) {
1792
+ const es = ev.evaluatorScores.find(
1793
+ (x) => x.evaluatorId === evaluatorScores.evaluatorId
1794
+ );
1795
+ for (const s of es?.scores ?? []) {
1796
+ const list = scoreIdToItems.get(s.id) ?? [];
1797
+ list.push(s);
1798
+ scoreIdToItems.set(s.id, list);
1799
+ }
1800
+ }
1801
+ for (const items of scoreIdToItems.values()) {
1802
+ const agg = aggregateScoreItems(items);
1803
+ if (agg) {
1804
+ const n = toNumericScoreFromScores([agg]);
1805
+ if (n !== void 0) {
1806
+ numericScores.push(n);
1807
+ if (firstAggregatedScore === void 0) {
1808
+ firstAggregatedScore = agg;
1809
+ }
1810
+ }
1811
+ }
1812
+ }
1813
+ }
1814
+ const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
1815
+ summaries.push({
1816
+ name,
1817
+ averageScore,
1818
+ aggregatedScoreItem: firstAggregatedScore,
1819
+ isAggregated,
1820
+ durationMs,
1821
+ passed
1822
+ });
1823
+ }
1824
+ return summaries;
1825
+ }
1542
1826
  var ansi2 = {
1543
1827
  reset: "\x1B[0m",
1544
1828
  bold: "\x1B[1m",
@@ -1573,7 +1857,50 @@ function createBar2(value, max = 100, width = 20) {
1573
1857
  const filled = Math.round(safe / max * width);
1574
1858
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
1575
1859
  }
1576
- function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1860
+ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
1861
+ if (events.length === 0)
1862
+ return [];
1863
+ const evaluatorIds = new Set(
1864
+ events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
1865
+ );
1866
+ const result = [];
1867
+ for (const evaluatorId of evaluatorIds) {
1868
+ const scoreIdToItems = /* @__PURE__ */ new Map();
1869
+ const metricIdToItems = /* @__PURE__ */ new Map();
1870
+ for (const ev of events) {
1871
+ const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1872
+ for (const s of es?.scores ?? []) {
1873
+ const list = scoreIdToItems.get(s.id) ?? [];
1874
+ list.push(s);
1875
+ scoreIdToItems.set(s.id, list);
1876
+ }
1877
+ for (const m of es?.metrics ?? []) {
1878
+ const list = metricIdToItems.get(m.id) ?? [];
1879
+ list.push(m);
1880
+ metricIdToItems.set(m.id, list);
1881
+ }
1882
+ }
1883
+ const aggregatedScores = [];
1884
+ for (const items of scoreIdToItems.values()) {
1885
+ const agg = aggregateScoreItems(items);
1886
+ if (agg)
1887
+ aggregatedScores.push(agg);
1888
+ }
1889
+ const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1890
+ const passed = events.every((ev) => {
1891
+ const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1892
+ return es?.passed ?? false;
1893
+ });
1894
+ result.push({
1895
+ evaluatorId,
1896
+ scores: aggregatedScores,
1897
+ passed,
1898
+ metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0
1899
+ });
1900
+ }
1901
+ return result;
1902
+ }
1903
+ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
1577
1904
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
1578
1905
  const scoreParts = [];
1579
1906
  for (const item of scores) {
@@ -1585,7 +1912,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1585
1912
  );
1586
1913
  continue;
1587
1914
  }
1588
- const formatted = def.format(item.data);
1915
+ const formatted = def.format(item.data, options);
1589
1916
  switch (def.displayStrategy) {
1590
1917
  case "bar": {
1591
1918
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
@@ -1618,7 +1945,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1618
1945
  for (const { id, data } of metrics) {
1619
1946
  const def = getMetricById(id);
1620
1947
  if (def) {
1621
- const formatted = def.format(data);
1948
+ const formatted = def.format(data, options);
1622
1949
  metricParts.push(
1623
1950
  def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
1624
1951
  );
@@ -1651,7 +1978,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1651
1978
  evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
1652
1979
  );
1653
1980
  const aggregates = /* @__PURE__ */ new Map();
1654
- const testCaseSummaries = [];
1981
+ const testCaseByTestId = /* @__PURE__ */ new Map();
1655
1982
  let overallScoreTotal = 0;
1656
1983
  let overallScoreCount = 0;
1657
1984
  let completedCount = 0;
@@ -1665,6 +1992,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1665
1992
  }
1666
1993
  process.stdout.write("\r\x1B[2K");
1667
1994
  }
1995
+ function cursorUp(n) {
1996
+ if (!process.stdout.isTTY || n <= 0)
1997
+ return;
1998
+ process.stdout.write(`\x1B[${n}A`);
1999
+ }
1668
2000
  function drawSpinner() {
1669
2001
  if (!process.stdout.isTTY || runFinished) {
1670
2002
  return;
@@ -1678,6 +2010,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1678
2010
  )} ${colorize("(live)", ansi2.dim)}`
1679
2011
  );
1680
2012
  }
2013
+ let lastPrintedTestCaseId = null;
2014
+ let lastPrintedLineCount = 0;
1681
2015
  let spinnerTimer;
1682
2016
  const done = new Promise((resolve5) => {
1683
2017
  const unsubscribe = runner.subscribeRunEvents((event) => {
@@ -1685,31 +2019,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1685
2019
  completedCount = event.completedTestCases;
1686
2020
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1687
2021
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
1688
- clearLine();
1689
- console.log(
1690
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi2.dim)}`
1691
- );
2022
+ const testCaseId = event.testCaseId;
2023
+ const existing = testCaseByTestId.get(testCaseId) ?? {
2024
+ name: event.testCaseName,
2025
+ events: []
2026
+ };
2027
+ existing.events.push({
2028
+ averageScore,
2029
+ passed: event.passed,
2030
+ durationMs: event.durationMs,
2031
+ evaluatorScores: event.evaluatorScores
2032
+ });
2033
+ testCaseByTestId.set(testCaseId, existing);
1692
2034
  for (const item of event.evaluatorScores) {
1693
- const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
1694
- console.log(
1695
- formatEvaluatorScoreLine(
1696
- name,
1697
- item.scores,
1698
- item.passed,
1699
- item.metrics
1700
- )
1701
- );
1702
- if (!item.passed && item.logs && item.logs.length > 0) {
1703
- for (const log of item.logs) {
1704
- if (log.type === "diff") {
1705
- const useColor = process.stdout.isTTY;
1706
- for (const { type, line } of getDiffLines(log)) {
1707
- const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
1708
- console.log(colored);
1709
- }
1710
- }
1711
- }
1712
- }
1713
2035
  const numeric = toNumericScoreFromScores(item.scores);
1714
2036
  if (numeric !== void 0) {
1715
2037
  const current = aggregates.get(item.evaluatorId) ?? {
@@ -1728,12 +2050,60 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1728
2050
  overallScoreCount += 1;
1729
2051
  }
1730
2052
  }
1731
- testCaseSummaries.push({
1732
- name: event.testCaseName,
1733
- averageScore,
1734
- durationMs: event.durationMs,
1735
- passed: event.passed
1736
- });
2053
+ const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2054
+ const isLastRerun = event.rerunIndex >= event.rerunTotal;
2055
+ const isNonTty = !process.stdout.isTTY;
2056
+ const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
2057
+ if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
2058
+ cursorUp(lastPrintedLineCount);
2059
+ }
2060
+ const aggregatedScores = aggregateEvaluatorScoresFromEvents(
2061
+ existing.events);
2062
+ const isAggregated = existing.events.length > 1;
2063
+ const durationMs = existing.events.reduce(
2064
+ (s, e) => s + e.durationMs,
2065
+ 0
2066
+ );
2067
+ existing.events.every((e) => e.passed);
2068
+ const lines = [];
2069
+ lines.push(
2070
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
2071
+ );
2072
+ for (const item of aggregatedScores) {
2073
+ const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2074
+ lines.push(
2075
+ formatEvaluatorScoreLine(
2076
+ name,
2077
+ item.scores,
2078
+ item.passed,
2079
+ item.metrics,
2080
+ { isAggregated }
2081
+ )
2082
+ );
2083
+ const lastEvent = existing.events[existing.events.length - 1];
2084
+ const lastEs = lastEvent?.evaluatorScores.find(
2085
+ (x) => x.evaluatorId === item.evaluatorId
2086
+ );
2087
+ if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
2088
+ for (const log of lastEs.logs) {
2089
+ if (log.type === "diff") {
2090
+ const useColor = process.stdout.isTTY;
2091
+ for (const { type, line } of getDiffLines(log)) {
2092
+ const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
2093
+ lines.push(colored);
2094
+ }
2095
+ }
2096
+ }
2097
+ }
2098
+ }
2099
+ if (!skipPrintNonTty) {
2100
+ for (let i = 0; i < lines.length; i++) {
2101
+ process.stdout.write(`\r\x1B[2K${lines[i]}
2102
+ `);
2103
+ }
2104
+ lastPrintedTestCaseId = testCaseId;
2105
+ lastPrintedLineCount = lines.length;
2106
+ }
1737
2107
  drawSpinner();
1738
2108
  }
1739
2109
  if (event.type === "RunCompleted" || event.type === "RunFailed") {
@@ -1797,6 +2167,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1797
2167
  getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
1798
2168
  );
1799
2169
  }
2170
+ const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
1800
2171
  if (testCaseSummaries.length > 0) {
1801
2172
  console.log(colorize("- test case scores:", ansi2.magenta));
1802
2173
  for (const summary of testCaseSummaries) {
@@ -1807,9 +2178,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1807
2178
  );
1808
2179
  continue;
1809
2180
  }
2181
+ const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2182
+ summary.aggregatedScoreItem.data,
2183
+ { isAggregated: true }
2184
+ ) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
1810
2185
  console.log(
1811
2186
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
1812
- summary.averageScore.toFixed(2),
2187
+ scoreLabel,
1813
2188
  scoreToColor(summary.averageScore)
1814
2189
  )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1815
2190
  );