@m4trix/evals 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,12 @@
1
1
  #!/usr/bin/env node
2
2
  import { randomUUID } from 'crypto';
3
- import { Effect, PubSub, Queue, Fiber } from 'effect';
3
+ import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
4
4
  import { existsSync } from 'fs';
5
5
  import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
7
7
  import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
- import { diffString } from 'json-diff';
9
+ import { diffLines } from 'diff';
10
10
  import React2, { useState, useEffect, useCallback } from 'react';
11
11
  import { render, Box, Text } from 'ink';
12
12
  import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
@@ -30,7 +30,8 @@ var defaultRunnerConfig = {
30
30
  ],
31
31
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
32
32
  },
33
- artifactDirectory: ".eval-results"
33
+ artifactDirectory: ".eval-results",
34
+ maxConcurrency: 1
34
35
  };
35
36
  function toRunnerConfigOverrides(config) {
36
37
  if (!config) {
@@ -63,6 +64,9 @@ function toRunnerConfigOverrides(config) {
63
64
  if (config.artifactDirectory !== void 0) {
64
65
  overrides.artifactDirectory = config.artifactDirectory;
65
66
  }
67
+ if (config.maxConcurrency !== void 0) {
68
+ overrides.maxConcurrency = config.maxConcurrency;
69
+ }
66
70
  if (Object.keys(discovery).length > 0) {
67
71
  overrides.discovery = discovery;
68
72
  }
@@ -256,8 +260,35 @@ async function collectTestCasesFromFiles(config) {
256
260
  );
257
261
  return found.flat();
258
262
  }
263
+ function toJsonLines(value) {
264
+ try {
265
+ return JSON.stringify(value, null, 2);
266
+ } catch {
267
+ return String(value);
268
+ }
269
+ }
270
+ function formatDiffString(changes) {
271
+ const lines = [];
272
+ for (const part of changes) {
273
+ const prefix = part.added ? "+" : part.removed ? "-" : " ";
274
+ const partLines = part.value.split("\n");
275
+ if (partLines[partLines.length - 1] === "") {
276
+ partLines.pop();
277
+ }
278
+ for (const line of partLines) {
279
+ lines.push(`${prefix} ${line}`);
280
+ }
281
+ }
282
+ return lines.join("\n");
283
+ }
284
+ function createDiffString(expected, actual) {
285
+ const expectedStr = toJsonLines(expected);
286
+ const actualStr = toJsonLines(actual);
287
+ const changes = diffLines(expectedStr, actualStr);
288
+ return formatDiffString(changes);
289
+ }
259
290
  function createDiffLogEntry(expected, actual, options) {
260
- const diff = diffString(expected, actual, { color: false });
291
+ const diff = createDiffString(expected, actual);
261
292
  return {
262
293
  type: "diff",
263
294
  label: options?.label,
@@ -267,7 +298,7 @@ function createDiffLogEntry(expected, actual, options) {
267
298
  };
268
299
  }
269
300
  function getDiffLines(entry) {
270
- const raw = diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
301
+ const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
271
302
  return raw.split("\n").map((line) => {
272
303
  const trimmed = line.trimStart();
273
304
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -287,6 +318,7 @@ var Metric = {
287
318
  const def = {
288
319
  id: config.id,
289
320
  name: config.name,
321
+ aggregate: config.aggregate,
290
322
  format: config.format,
291
323
  make: (data) => ({ id: config.id, data })
292
324
  };
@@ -306,6 +338,7 @@ var Score = {
306
338
  id: config.id,
307
339
  name: config.name,
308
340
  displayStrategy: config.displayStrategy,
341
+ aggregate: config.aggregate,
309
342
  format: config.format,
310
343
  make: (data, options) => {
311
344
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
@@ -324,23 +357,75 @@ function getScoreById(id) {
324
357
  return registry2.get(id);
325
358
  }
326
359
 
360
+ // src/evals/aggregators.ts
361
+ function aggregateAverageWithVariance(values) {
362
+ if (values.length === 0) {
363
+ return { value: 0, count: 0 };
364
+ }
365
+ const sum = values.reduce((s, v) => s + v.value, 0);
366
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
367
+ const mean = sum / values.length;
368
+ let stdDev;
369
+ if (values.length >= 2) {
370
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
371
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
372
+ }
373
+ return { value: mean, stdDev, count: values.length };
374
+ }
375
+ function aggregateAll(values) {
376
+ const total = values.length;
377
+ const passedCount = values.filter((v) => v.passed).length;
378
+ return {
379
+ passed: total > 0 && values.every((v) => v.passed),
380
+ passedCount,
381
+ totalCount: total
382
+ };
383
+ }
384
+ function aggregateTokenCountSum(values) {
385
+ const initial = {
386
+ input: 0,
387
+ output: 0,
388
+ inputCached: 0,
389
+ outputCached: 0
390
+ };
391
+ return values.reduce(
392
+ (acc, v) => ({
393
+ input: acc.input + (v.input ?? 0),
394
+ output: acc.output + (v.output ?? 0),
395
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
396
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
397
+ }),
398
+ initial
399
+ );
400
+ }
401
+ function aggregateLatencyAverage(values) {
402
+ if (values.length === 0) {
403
+ return { ms: 0 };
404
+ }
405
+ const sum = values.reduce((s, v) => s + v.ms, 0);
406
+ return { ms: sum / values.length };
407
+ }
408
+
327
409
  // src/evals/metrics/standard.ts
328
410
  Metric.of({
329
411
  id: "token-count",
330
412
  name: "Tokens",
331
- format: (data) => {
413
+ aggregate: aggregateTokenCountSum,
414
+ format: (data, options) => {
332
415
  const input = data.input ?? 0;
333
416
  const output = data.output ?? 0;
334
417
  const inputCached = data.inputCached ?? 0;
335
418
  const outputCached = data.outputCached ?? 0;
336
419
  const cached = inputCached + outputCached;
337
- return `in:${input} out:${output} cached:${cached}`;
420
+ const base = `in:${input} out:${output} cached:${cached}`;
421
+ return options?.isAggregated ? `Total: ${base}` : base;
338
422
  }
339
423
  });
340
424
  Metric.of({
341
425
  id: "latency",
342
426
  name: "Latency",
343
- format: (data) => `${data.ms}ms`
427
+ aggregate: aggregateLatencyAverage,
428
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
344
429
  });
345
430
 
346
431
  // src/evals/scores/standard.ts
@@ -348,16 +433,50 @@ Score.of({
348
433
  id: "percent",
349
434
  name: "Score",
350
435
  displayStrategy: "bar",
351
- format: (data) => data.value.toFixed(2)
436
+ format: (data, options) => {
437
+ if (options?.isAggregated) {
438
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
439
+ }
440
+ return data.value.toFixed(2);
441
+ },
442
+ aggregate: aggregateAverageWithVariance
352
443
  });
353
444
  Score.of({
354
445
  id: "binary",
355
446
  name: "Result",
356
447
  displayStrategy: "passFail",
357
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
448
+ format: (data, options) => {
449
+ if (options?.isAggregated) {
450
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
451
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
452
+ return `${base} (${data.passedCount}/${data.totalCount})`;
453
+ }
454
+ return base;
455
+ }
456
+ return data.passed ? "PASSED" : "NOT PASSED";
457
+ },
458
+ aggregate: aggregateAll
358
459
  });
359
460
 
360
461
  // src/runner/score-utils.ts
462
+ function aggregateScoreItems(items) {
463
+ if (items.length === 0)
464
+ return void 0;
465
+ const def = getScoreById(items[0].id);
466
+ if (!def?.aggregate)
467
+ return items[items.length - 1];
468
+ const aggregated = def.aggregate(items.map((i) => i.data));
469
+ return { ...items[0], data: aggregated };
470
+ }
471
+ function aggregateMetricItems(items) {
472
+ if (items.length === 0)
473
+ return void 0;
474
+ const def = getMetricById(items[0].id);
475
+ if (!def?.aggregate)
476
+ return items[items.length - 1];
477
+ const aggregated = def.aggregate(items.map((i) => i.data));
478
+ return { ...items[0], data: aggregated };
479
+ }
361
480
  function toNumericScoreFromScores(scores) {
362
481
  for (const item of scores) {
363
482
  const def = getScoreById(item.id);
@@ -436,6 +555,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
436
555
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
437
556
  );
438
557
  }
558
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
559
+ return Effect.gen(function* () {
560
+ const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
561
+ const rerunPassed = [];
562
+ for (let r = 0; r < reruns; r++) {
563
+ const started = Date.now();
564
+ const evaluatorScores = [];
565
+ let testCaseError;
566
+ const output = readOutput(testCaseItem.testCase);
567
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
568
+ const evaluateFn = evaluator.getEvaluateFn();
569
+ if (!evaluateFn) {
570
+ continue;
571
+ }
572
+ try {
573
+ const logs = [];
574
+ const logDiff = (expected, actual, options) => {
575
+ logs.push(createDiffLogEntry(expected, actual, options));
576
+ };
577
+ const ctx = yield* Effect.promise(
578
+ () => Promise.resolve(evaluator.resolveContext())
579
+ );
580
+ const result = yield* Effect.promise(
581
+ () => Promise.resolve(
582
+ evaluateFn({
583
+ input: testCaseItem.testCase.getInput(),
584
+ ctx,
585
+ output,
586
+ logDiff
587
+ })
588
+ )
589
+ );
590
+ const { scores, metrics } = normalizeResult(result);
591
+ const passed2 = computeEvaluatorPassed(evaluator, result, scores);
592
+ evaluatorScores.push({
593
+ evaluatorId,
594
+ scores,
595
+ passed: passed2,
596
+ metrics,
597
+ logs: logs.length > 0 ? logs : void 0
598
+ });
599
+ } catch (error) {
600
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
601
+ evaluatorScores.push({
602
+ evaluatorId,
603
+ scores: [],
604
+ passed: false
605
+ });
606
+ }
607
+ }
608
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
609
+ rerunPassed.push(rerunPassedThis);
610
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
611
+ n + 1,
612
+ n + 1
613
+ ]);
614
+ const progressEvent = {
615
+ type: "TestCaseProgress",
616
+ runId: task.runId,
617
+ testCaseId: testCaseItem.id,
618
+ testCaseName: testCaseItem.testCase.getName(),
619
+ completedTestCases: completedEvaluations,
620
+ totalTestCases: totalEvaluations,
621
+ rerunIndex: r + 1,
622
+ rerunTotal: reruns,
623
+ passed: rerunPassedThis,
624
+ durationMs: Date.now() - started,
625
+ evaluatorScores,
626
+ output,
627
+ errorMessage: testCaseError
628
+ };
629
+ updateSnapshot(task.runId, (snapshot) => ({
630
+ ...snapshot,
631
+ completedTestCases: completedEvaluations
632
+ }));
633
+ yield* publishEvent(progressEvent);
634
+ yield* Queue.offer(persistenceQueue, {
635
+ runId: task.runId,
636
+ artifactPath: task.snapshot.artifactPath,
637
+ payload: progressEvent
638
+ });
639
+ }
640
+ const testCasePassed = rerunPassed.every(Boolean);
641
+ if (testCasePassed) {
642
+ yield* Ref.update(passedRef, (n) => n + 1);
643
+ } else {
644
+ yield* Ref.update(failedRef, (n) => n + 1);
645
+ }
646
+ const [passed, failed] = yield* Effect.all([
647
+ Ref.get(passedRef),
648
+ Ref.get(failedRef)
649
+ ]);
650
+ updateSnapshot(task.runId, (snapshot) => ({
651
+ ...snapshot,
652
+ passedTestCases: passed,
653
+ failedTestCases: failed
654
+ }));
655
+ });
656
+ }
439
657
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
440
658
  const startedAt = Date.now();
441
659
  updateSnapshot(task.runId, (snapshot) => ({
@@ -448,104 +666,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
448
666
  runId: task.runId,
449
667
  startedAt
450
668
  });
451
- let completedTestCases = 0;
452
- let passedTestCases = 0;
453
- let failedTestCases = 0;
454
- for (const testCaseItem of task.testCases) {
455
- const started = Date.now();
456
- const evaluatorScores = [];
457
- let testCaseError;
458
- const output = readOutput(testCaseItem.testCase);
459
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
460
- const evaluateFn = evaluator.getEvaluateFn();
461
- if (!evaluateFn) {
462
- continue;
463
- }
464
- try {
465
- const logs = [];
466
- const logDiff = (expected, actual, options) => {
467
- logs.push(createDiffLogEntry(expected, actual, options));
468
- };
469
- const ctx = yield* Effect.promise(
470
- () => Promise.resolve(evaluator.resolveContext())
471
- );
472
- const result = yield* Effect.promise(
473
- () => Promise.resolve(
474
- evaluateFn({
475
- input: testCaseItem.testCase.getInput(),
476
- ctx,
477
- output,
478
- logDiff
479
- })
480
- )
481
- );
482
- const { scores, metrics } = normalizeResult(result);
483
- const passed = computeEvaluatorPassed(evaluator, result, scores);
484
- evaluatorScores.push({
485
- evaluatorId,
486
- scores,
487
- passed,
488
- metrics,
489
- logs: logs.length > 0 ? logs : void 0
490
- });
491
- } catch (error) {
492
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
493
- evaluatorScores.push({
494
- evaluatorId,
495
- scores: [],
496
- passed: false
497
- });
498
- }
499
- }
500
- const testCasePassed = evaluatorScores.every((s) => s.passed);
501
- completedTestCases += 1;
502
- if (testCasePassed) {
503
- passedTestCases += 1;
504
- } else {
505
- failedTestCases += 1;
506
- }
507
- const progressEvent = {
508
- type: "TestCaseProgress",
509
- runId: task.runId,
510
- testCaseId: testCaseItem.id,
511
- testCaseName: testCaseItem.testCase.getName(),
512
- completedTestCases,
513
- totalTestCases: task.testCases.length,
514
- passed: testCasePassed,
515
- durationMs: Date.now() - started,
516
- evaluatorScores,
517
- output,
518
- errorMessage: testCaseError
519
- };
520
- updateSnapshot(task.runId, (snapshot) => ({
521
- ...snapshot,
522
- completedTestCases,
523
- passedTestCases,
524
- failedTestCases
525
- }));
526
- yield* publishEvent(progressEvent);
527
- yield* Queue.offer(persistenceQueue, {
528
- runId: task.runId,
529
- artifactPath: task.snapshot.artifactPath,
530
- payload: progressEvent
531
- });
532
- }
669
+ const totalEvaluations = task.testCases.reduce(
670
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
671
+ 0
672
+ );
673
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
674
+ const completedRef = yield* Ref.make(0);
675
+ const passedRef = yield* Ref.make(0);
676
+ const failedRef = yield* Ref.make(0);
677
+ const processTestCase = (testCaseItem) => processOneTestCase(
678
+ task,
679
+ testCaseItem,
680
+ totalEvaluations,
681
+ publishEvent,
682
+ persistenceQueue,
683
+ updateSnapshot,
684
+ completedRef,
685
+ passedRef,
686
+ failedRef
687
+ );
688
+ yield* Effect.forEach(
689
+ task.testCases,
690
+ processTestCase,
691
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
692
+ );
693
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
694
+ Ref.get(completedRef),
695
+ Ref.get(passedRef),
696
+ Ref.get(failedRef)
697
+ ]);
533
698
  const finishedAt = Date.now();
534
699
  const completedEvent = {
535
700
  type: "RunCompleted",
536
701
  runId: task.runId,
537
702
  finishedAt,
538
- passedTestCases,
539
- failedTestCases,
703
+ passedTestCases: passedUniqueTestCases,
704
+ failedTestCases: failedUniqueTestCases,
540
705
  totalTestCases: task.testCases.length,
541
706
  artifactPath: task.snapshot.artifactPath
542
707
  };
543
708
  updateSnapshot(task.runId, (snapshot) => ({
544
709
  ...snapshot,
545
710
  status: "completed",
546
- completedTestCases,
547
- passedTestCases,
548
- failedTestCases,
711
+ completedTestCases: completedEvaluations,
712
+ passedTestCases: passedUniqueTestCases,
713
+ failedTestCases: failedUniqueTestCases,
549
714
  finishedAt
550
715
  }));
551
716
  yield* publishEvent(completedEvent);
@@ -633,7 +798,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
633
798
  const artifactPath = filePath;
634
799
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
635
800
  const progress = aggregateTestCaseProgress(lines);
636
- const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
801
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
637
802
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
638
803
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
639
804
  return {
@@ -655,23 +820,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
655
820
  }
656
821
  function aggregateTestCaseProgress(lines) {
657
822
  let completedTestCases = 0;
658
- let passedTestCases = 0;
659
- let failedTestCases = 0;
823
+ const testCasePassedBy = /* @__PURE__ */ new Map();
660
824
  for (const line of lines) {
661
825
  try {
662
826
  const event = JSON.parse(line);
663
827
  if (event.type === "TestCaseProgress") {
664
828
  const ev = event;
665
829
  completedTestCases = ev.completedTestCases ?? completedTestCases;
666
- if (ev.passed) {
667
- passedTestCases += 1;
668
- } else {
669
- failedTestCases += 1;
670
- }
830
+ const id = ev.testCaseId;
831
+ const current = testCasePassedBy.get(id);
832
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
671
833
  }
672
834
  } catch {
673
835
  }
674
836
  }
837
+ let passedTestCases = 0;
838
+ let failedTestCases = 0;
839
+ for (const passed of testCasePassedBy.values()) {
840
+ if (passed) {
841
+ passedTestCases += 1;
842
+ } else {
843
+ failedTestCases += 1;
844
+ }
845
+ }
675
846
  return { completedTestCases, passedTestCases, failedTestCases };
676
847
  }
677
848
  async function appendJsonLine(artifactPath, payload) {
@@ -866,6 +1037,10 @@ var EffectRunner = class {
866
1037
  throw new Error("No evaluators selected for run");
867
1038
  }
868
1039
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1040
+ const totalEvaluations = selectedTestCases.reduce(
1041
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1042
+ 0
1043
+ );
869
1044
  const runId = `run-${randomUUID()}`;
870
1045
  const artifactPath = createArtifactPath(
871
1046
  this.config.artifactDirectory,
@@ -878,7 +1053,7 @@ var EffectRunner = class {
878
1053
  datasetName: dataset.dataset.getName(),
879
1054
  evaluatorIds: selectedEvaluators.map((item) => item.id),
880
1055
  queuedAt: Date.now(),
881
- totalTestCases: selectedTestCases.length,
1056
+ totalTestCases: totalEvaluations,
882
1057
  completedTestCases: 0,
883
1058
  passedTestCases: 0,
884
1059
  failedTestCases: 0,
@@ -892,7 +1067,7 @@ var EffectRunner = class {
892
1067
  datasetId: request.datasetId,
893
1068
  datasetName: dataset.dataset.getName(),
894
1069
  evaluatorIds: selectedEvaluators.map((item) => item.id),
895
- totalTestCases: selectedTestCases.length,
1070
+ totalTestCases: totalEvaluations,
896
1071
  artifactPath
897
1072
  };
898
1073
  await Effect.runPromise(this.publishEvent(queuedEvent));
@@ -903,6 +1078,7 @@ var EffectRunner = class {
903
1078
  payload: queuedEvent
904
1079
  })
905
1080
  );
1081
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
906
1082
  await Effect.runPromise(
907
1083
  Queue.offer(this.runQueue, {
908
1084
  runId,
@@ -910,7 +1086,8 @@ var EffectRunner = class {
910
1086
  dataset: dataset.dataset,
911
1087
  evaluators: selectedEvaluators,
912
1088
  testCases: selectedTestCases,
913
- snapshot
1089
+ snapshot,
1090
+ maxConcurrency
914
1091
  })
915
1092
  );
916
1093
  return snapshot;
@@ -1216,6 +1393,13 @@ function Spinner({ label = "Running" }) {
1216
1393
  label
1217
1394
  ] });
1218
1395
  }
1396
+ function sampleStdDev(sum, sumSq, n) {
1397
+ if (n < 2)
1398
+ return void 0;
1399
+ const mean = sum / n;
1400
+ const variance = (sumSq - n * mean * mean) / (n - 1);
1401
+ return variance > 0 ? Math.sqrt(variance) : 0;
1402
+ }
1219
1403
  function scoreColor(score) {
1220
1404
  if (score >= 80)
1221
1405
  return "green";
@@ -1228,13 +1412,62 @@ function createBar(value, max = 100, width = 20) {
1228
1412
  const filled = Math.round(safe / max * width);
1229
1413
  return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
1230
1414
  }
1231
- function formatScorePart(item, scoreToColor2) {
1415
+ function aggregateEvaluatorScores(events, nameById) {
1416
+ if (events.length === 0)
1417
+ return [];
1418
+ const evaluatorIds = new Set(
1419
+ events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
1420
+ );
1421
+ const result = [];
1422
+ for (const evaluatorId of evaluatorIds) {
1423
+ const scoreIdToItems = /* @__PURE__ */ new Map();
1424
+ const metricIdToItems = /* @__PURE__ */ new Map();
1425
+ for (const ev of events) {
1426
+ const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1427
+ for (const s of es?.scores ?? []) {
1428
+ const list = scoreIdToItems.get(s.id) ?? [];
1429
+ list.push(s);
1430
+ scoreIdToItems.set(s.id, list);
1431
+ }
1432
+ for (const m of es?.metrics ?? []) {
1433
+ const list = metricIdToItems.get(m.id) ?? [];
1434
+ list.push(m);
1435
+ metricIdToItems.set(m.id, list);
1436
+ }
1437
+ }
1438
+ const aggregatedScores = [];
1439
+ for (const items of scoreIdToItems.values()) {
1440
+ const agg = aggregateScoreItems(items);
1441
+ if (agg)
1442
+ aggregatedScores.push(agg);
1443
+ }
1444
+ const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([id, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1445
+ const passed = events.every((ev) => {
1446
+ const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1447
+ return es?.passed ?? false;
1448
+ });
1449
+ const lastEvent = events[events.length - 1];
1450
+ const lastEs = lastEvent?.evaluatorScores.find(
1451
+ (x) => x.evaluatorId === evaluatorId
1452
+ );
1453
+ result.push({
1454
+ evaluatorId,
1455
+ evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
1456
+ scores: aggregatedScores,
1457
+ passed,
1458
+ metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0,
1459
+ logs: lastEs?.logs
1460
+ });
1461
+ }
1462
+ return result;
1463
+ }
1464
+ function formatScorePart(item, scoreToColor2, options) {
1232
1465
  const def = getScoreById(item.id);
1233
1466
  if (!def) {
1234
1467
  const numeric = toNumericScore(item.data);
1235
1468
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
1236
1469
  }
1237
- const formatted = def.format(item.data);
1470
+ const formatted = def.format(item.data, options);
1238
1471
  if (def.displayStrategy === "bar") {
1239
1472
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1240
1473
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -1254,6 +1487,7 @@ function RunView({
1254
1487
  );
1255
1488
  const [runInfo, setRunInfo] = useState(null);
1256
1489
  const [testCases, setTestCases] = useState([]);
1490
+ const [completedEvaluations, setCompletedEvaluations] = useState(0);
1257
1491
  const [summary, setSummary] = useState(null);
1258
1492
  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
1259
1493
  const runEval = useCallback(async () => {
@@ -1280,48 +1514,44 @@ function RunView({
1280
1514
  return;
1281
1515
  }
1282
1516
  const nameById = new Map(
1283
- evaluators.map((item) => [
1284
- item.id,
1285
- item.evaluator.getName() ?? item.id
1286
- ])
1517
+ evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
1287
1518
  );
1288
1519
  setEvaluatorNameById(nameById);
1289
1520
  const aggregates = /* @__PURE__ */ new Map();
1290
1521
  let overallScoreTotal = 0;
1522
+ let overallScoreSumSq = 0;
1291
1523
  let overallScoreCount = 0;
1292
1524
  const done = new Promise((resolve5) => {
1293
1525
  const unsubscribe = runner.subscribeRunEvents((event) => {
1294
1526
  if (event.type === "TestCaseProgress") {
1295
1527
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1296
- const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1528
+ numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1297
1529
  for (const item of event.evaluatorScores) {
1298
1530
  const numeric = toNumericScoreFromScores(item.scores);
1299
1531
  if (numeric !== void 0) {
1300
1532
  const current = aggregates.get(item.evaluatorId) ?? {
1301
1533
  total: 0,
1534
+ sumSq: 0,
1302
1535
  count: 0,
1303
1536
  passed: 0,
1304
1537
  failed: 0
1305
1538
  };
1306
1539
  aggregates.set(item.evaluatorId, {
1307
1540
  total: current.total + numeric,
1541
+ sumSq: current.sumSq + numeric * numeric,
1308
1542
  count: current.count + 1,
1309
1543
  passed: current.passed + (item.passed ? 1 : 0),
1310
1544
  failed: current.failed + (item.passed ? 0 : 1)
1311
1545
  });
1312
1546
  overallScoreTotal += numeric;
1547
+ overallScoreSumSq += numeric * numeric;
1313
1548
  overallScoreCount += 1;
1314
1549
  }
1315
1550
  }
1316
- setTestCases((prev) => [
1317
- ...prev,
1318
- {
1319
- name: event.testCaseName,
1320
- completedTestCases: event.completedTestCases,
1321
- totalTestCases: event.totalTestCases,
1322
- durationMs: event.durationMs,
1323
- passed: event.passed,
1324
- averageScore,
1551
+ setTestCases((prev) => {
1552
+ const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
1553
+ const existing = byId.get(event.testCaseId);
1554
+ const newEvent = {
1325
1555
  evaluatorScores: event.evaluatorScores.map((item) => ({
1326
1556
  evaluatorId: item.evaluatorId,
1327
1557
  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
@@ -1329,9 +1559,33 @@ function RunView({
1329
1559
  passed: item.passed,
1330
1560
  metrics: item.metrics,
1331
1561
  logs: item.logs
1332
- }))
1333
- }
1334
- ]);
1562
+ })),
1563
+ passed: event.passed,
1564
+ durationMs: event.durationMs
1565
+ };
1566
+ const events = existing ? [...existing.events, newEvent] : [newEvent];
1567
+ const isAggregated = events.length > 1;
1568
+ const aggregatedEvaluatorScores = aggregateEvaluatorScores(
1569
+ events,
1570
+ nameById
1571
+ );
1572
+ const merged = {
1573
+ name: event.testCaseName,
1574
+ testCaseId: event.testCaseId,
1575
+ completedTestCases: event.completedTestCases,
1576
+ totalTestCases: event.totalTestCases,
1577
+ rerunIndex: event.rerunIndex,
1578
+ rerunTotal: event.rerunTotal,
1579
+ durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1580
+ passed: events.every((e) => e.passed),
1581
+ events,
1582
+ aggregatedEvaluatorScores,
1583
+ isAggregated
1584
+ };
1585
+ byId.set(event.testCaseId, merged);
1586
+ setCompletedEvaluations(event.completedTestCases);
1587
+ return Array.from(byId.values());
1588
+ });
1335
1589
  }
1336
1590
  if (event.type === "RunCompleted" || event.type === "RunFailed") {
1337
1591
  unsubscribe();
@@ -1346,9 +1600,7 @@ function RunView({
1346
1600
  setRunInfo({
1347
1601
  runId: snapshot.runId,
1348
1602
  datasetName: snapshot.datasetName,
1349
- evaluatorNames: evaluators.map(
1350
- (e) => e.evaluator.getName() ?? e.id
1351
- ),
1603
+ evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
1352
1604
  totalTestCases: snapshot.totalTestCases
1353
1605
  });
1354
1606
  setPhase("running");
@@ -1362,6 +1614,7 @@ function RunView({
1362
1614
  failedTestCases: finalEvent.failedTestCases,
1363
1615
  totalTestCases: finalEvent.totalTestCases,
1364
1616
  overallScoreTotal,
1617
+ overallScoreSumSq,
1365
1618
  overallScoreCount,
1366
1619
  aggregates: new Map(aggregates),
1367
1620
  artifactPath: finalEvent.artifactPath
@@ -1376,29 +1629,41 @@ function RunView({
1376
1629
  /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
1377
1630
  runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
1378
1631
  /* @__PURE__ */ jsxs(Text, { children: [
1379
- /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run " }),
1632
+ /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
1633
+ "Run",
1634
+ " "
1635
+ ] }),
1380
1636
  /* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
1381
1637
  ] }),
1382
1638
  /* @__PURE__ */ jsxs(Text, { children: [
1383
- /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Dataset " }),
1639
+ /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
1640
+ "Dataset",
1641
+ " "
1642
+ ] }),
1384
1643
  runInfo.datasetName
1385
1644
  ] }),
1386
1645
  /* @__PURE__ */ jsxs(Text, { children: [
1387
- /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Evaluators " }),
1646
+ /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
1647
+ "Evaluators",
1648
+ " "
1649
+ ] }),
1388
1650
  runInfo.evaluatorNames.join(", ")
1389
1651
  ] }),
1390
1652
  /* @__PURE__ */ jsxs(Text, { children: [
1391
- /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Test cases " }),
1653
+ /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
1654
+ "Test cases",
1655
+ " "
1656
+ ] }),
1392
1657
  runInfo.totalTestCases
1393
1658
  ] })
1394
1659
  ] }),
1395
1660
  phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
1396
1661
  Spinner,
1397
1662
  {
1398
- label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
1663
+ label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
1399
1664
  }
1400
1665
  ) }),
1401
- testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
1666
+ testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
1402
1667
  /* @__PURE__ */ jsxs(Text, { children: [
1403
1668
  /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
1404
1669
  "[",
@@ -1409,49 +1674,78 @@ function RunView({
1409
1674
  ] }),
1410
1675
  " ",
1411
1676
  tc.name,
1677
+ " ",
1678
+ /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
1679
+ "(",
1680
+ tc.rerunIndex,
1681
+ "/",
1682
+ tc.rerunTotal,
1683
+ ")"
1684
+ ] }),
1412
1685
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1413
1686
  " (",
1414
1687
  tc.durationMs,
1415
1688
  "ms)"
1416
1689
  ] })
1417
1690
  ] }),
1418
- tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [
1419
- /* @__PURE__ */ jsxs(Text, { children: [
1420
- item.evaluatorName,
1421
- ":",
1422
- " ",
1423
- /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1424
- " ",
1425
- item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
1426
- formatScorePart(s),
1427
- " "
1428
- ] }, s.id)),
1429
- item.metrics?.map((m) => {
1430
- const def = getMetricById(m.id);
1431
- if (!def)
1432
- return null;
1433
- const formatted = def.format(m.data);
1434
- return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1435
- "[",
1436
- def.name ? `${def.name}: ` : "",
1437
- formatted,
1438
- "]",
1439
- " "
1440
- ] }, m.id);
1441
- })
1442
- ] }),
1443
- !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1444
- (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
1445
- Text,
1446
- {
1447
- color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1448
- children: line
1449
- },
1450
- lineIdx
1451
- )) }, logIdx) : null
1452
- ) })
1453
- ] }, item.evaluatorId))
1454
- ] }, i)) }),
1691
+ tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
1692
+ Box,
1693
+ {
1694
+ flexDirection: "column",
1695
+ marginLeft: 2,
1696
+ children: [
1697
+ /* @__PURE__ */ jsxs(Text, { children: [
1698
+ item.evaluatorName,
1699
+ ":",
1700
+ " ",
1701
+ /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1702
+ " ",
1703
+ item.scores.map((s) => /* @__PURE__ */ jsxs(
1704
+ Text,
1705
+ {
1706
+ color: scoreColor(toNumericScore(s.data) ?? 0),
1707
+ children: [
1708
+ formatScorePart(s, scoreColor, {
1709
+ isAggregated: tc.isAggregated
1710
+ }),
1711
+ " "
1712
+ ]
1713
+ },
1714
+ s.id
1715
+ )),
1716
+ item.metrics?.map((m) => {
1717
+ const def = getMetricById(m.id);
1718
+ if (!def)
1719
+ return null;
1720
+ const formatted = def.format(m.data, {
1721
+ isAggregated: tc.isAggregated
1722
+ });
1723
+ return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1724
+ "[",
1725
+ def.name ? `${def.name}: ` : "",
1726
+ formatted,
1727
+ "]",
1728
+ " "
1729
+ ] }, m.id);
1730
+ })
1731
+ ] }),
1732
+ !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1733
+ (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
1734
+ ({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
1735
+ Text,
1736
+ {
1737
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1738
+ children: line
1739
+ },
1740
+ lineIdx
1741
+ )
1742
+ ) }, logIdx) : null
1743
+ ) })
1744
+ ]
1745
+ },
1746
+ item.evaluatorId
1747
+ ))
1748
+ ] }, tc.testCaseId)) }),
1455
1749
  phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
1456
1750
  /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
1457
1751
  /* @__PURE__ */ jsxs(Box, { marginTop: 1, children: [
@@ -1478,7 +1772,14 @@ function RunView({
1478
1772
  label: "overall avg",
1479
1773
  value: summary.overallScoreTotal / summary.overallScoreCount,
1480
1774
  barWidth: 20,
1481
- format: (v) => v.toFixed(2)
1775
+ format: (v) => {
1776
+ const sd = sampleStdDev(
1777
+ summary.overallScoreTotal,
1778
+ summary.overallScoreSumSq,
1779
+ summary.overallScoreCount
1780
+ );
1781
+ return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
1782
+ }
1482
1783
  }
1483
1784
  ) }),
1484
1785
  /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
@@ -1493,12 +1794,15 @@ function RunView({
1493
1794
  ] }, id);
1494
1795
  }
1495
1796
  const mean = agg.total / agg.count;
1797
+ const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
1798
+ const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1496
1799
  return /* @__PURE__ */ jsxs(Text, { children: [
1497
1800
  "- ",
1498
1801
  name.padEnd(28),
1499
1802
  " avg=",
1500
- /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
1501
- " passed=",
1803
+ /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: meanStr }),
1804
+ " ",
1805
+ "passed=",
1502
1806
  agg.passed,
1503
1807
  " failed=",
1504
1808
  agg.failed
@@ -1507,28 +1811,41 @@ function RunView({
1507
1811
  ] }),
1508
1812
  /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
1509
1813
  /* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
1510
- testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { children: [
1511
- /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1512
- /* @__PURE__ */ jsxs(Text, { children: [
1513
- " ",
1514
- tc.name.padEnd(24)
1515
- ] }),
1516
- tc.averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
1517
- /* @__PURE__ */ jsxs(Text, { color: scoreColor(tc.averageScore), children: [
1518
- "score=",
1519
- tc.averageScore.toFixed(2)
1814
+ testCases.map((tc) => {
1815
+ const allScores = tc.events.flatMap(
1816
+ (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
1817
+ );
1818
+ const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
1819
+ const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
1820
+ const total = allScores.reduce((a, b) => a + b, 0);
1821
+ const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
1822
+ const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
1823
+ const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
1824
+ isAggregated: true
1825
+ }) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
1826
+ return /* @__PURE__ */ jsxs(Box, { children: [
1827
+ /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1828
+ /* @__PURE__ */ jsxs(Text, { children: [
1829
+ " ",
1830
+ tc.name.padEnd(24)
1520
1831
  ] }),
1832
+ averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
1833
+ /* @__PURE__ */ jsxs(Text, { color: scoreColor(averageScore), children: [
1834
+ "score=",
1835
+ scoreLabel
1836
+ ] }),
1837
+ /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1838
+ " ",
1839
+ createBar(averageScore, 100, 14)
1840
+ ] })
1841
+ ] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
1521
1842
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1522
- " ",
1523
- createBar(tc.averageScore, 100, 14)
1843
+ " (",
1844
+ tc.durationMs,
1845
+ "ms)"
1524
1846
  ] })
1525
- ] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
1526
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1527
- " (",
1528
- tc.durationMs,
1529
- "ms)"
1530
- ] })
1531
- ] }, i))
1847
+ ] }, tc.testCaseId);
1848
+ })
1532
1849
  ] }),
1533
1850
  /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1534
1851
  "artifact: ",
@@ -1539,6 +1856,61 @@ function RunView({
1539
1856
  }
1540
1857
 
1541
1858
  // src/cli-simple/run.ts
1859
+ function sampleStdDev2(sum, sumSq, n) {
1860
+ if (n < 2)
1861
+ return void 0;
1862
+ const mean = sum / n;
1863
+ const variance = (sumSq - n * mean * mean) / (n - 1);
1864
+ return variance > 0 ? Math.sqrt(variance) : 0;
1865
+ }
1866
+ function buildTestCaseSummaries(byId) {
1867
+ const summaries = [];
1868
+ for (const { name, events } of byId.values()) {
1869
+ const passed = events.every((e) => e.passed);
1870
+ const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
1871
+ const isAggregated = events.length > 1;
1872
+ const allScores = events.flatMap(
1873
+ (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
1874
+ );
1875
+ const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
1876
+ const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
1877
+ const total = allScores.reduce((a, b) => a + b, 0);
1878
+ const stdDev = sampleStdDev2(total, sumSq, allScores.length);
1879
+ let firstAggregatedScore;
1880
+ for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
1881
+ const scoreIdToItems = /* @__PURE__ */ new Map();
1882
+ for (const ev of events) {
1883
+ const es = ev.evaluatorScores.find(
1884
+ (x) => x.evaluatorId === evaluatorScores.evaluatorId
1885
+ );
1886
+ for (const s of es?.scores ?? []) {
1887
+ const list = scoreIdToItems.get(s.id) ?? [];
1888
+ list.push(s);
1889
+ scoreIdToItems.set(s.id, list);
1890
+ }
1891
+ }
1892
+ for (const items of scoreIdToItems.values()) {
1893
+ const agg = aggregateScoreItems(items);
1894
+ if (agg && firstAggregatedScore === void 0) {
1895
+ firstAggregatedScore = agg;
1896
+ break;
1897
+ }
1898
+ }
1899
+ if (firstAggregatedScore !== void 0)
1900
+ break;
1901
+ }
1902
+ summaries.push({
1903
+ name,
1904
+ averageScore,
1905
+ stdDev: stdDev ?? void 0,
1906
+ aggregatedScoreItem: firstAggregatedScore,
1907
+ isAggregated,
1908
+ durationMs,
1909
+ passed
1910
+ });
1911
+ }
1912
+ return summaries;
1913
+ }
1542
1914
  var ansi2 = {
1543
1915
  reset: "\x1B[0m",
1544
1916
  bold: "\x1B[1m",
@@ -1566,14 +1938,59 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
1566
1938
  return `- ${evaluatorName.padEnd(28)} no numeric scores`;
1567
1939
  }
1568
1940
  const mean = aggregate.total / aggregate.count;
1569
- return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
1941
+ const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
1942
+ const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1943
+ return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
1570
1944
  }
1571
1945
  function createBar2(value, max = 100, width = 20) {
1572
1946
  const safe = Math.max(0, Math.min(max, value));
1573
1947
  const filled = Math.round(safe / max * width);
1574
1948
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
1575
1949
  }
1576
- function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1950
+ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
1951
+ if (events.length === 0)
1952
+ return [];
1953
+ const evaluatorIds = new Set(
1954
+ events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
1955
+ );
1956
+ const result = [];
1957
+ for (const evaluatorId of evaluatorIds) {
1958
+ const scoreIdToItems = /* @__PURE__ */ new Map();
1959
+ const metricIdToItems = /* @__PURE__ */ new Map();
1960
+ for (const ev of events) {
1961
+ const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1962
+ for (const s of es?.scores ?? []) {
1963
+ const list = scoreIdToItems.get(s.id) ?? [];
1964
+ list.push(s);
1965
+ scoreIdToItems.set(s.id, list);
1966
+ }
1967
+ for (const m of es?.metrics ?? []) {
1968
+ const list = metricIdToItems.get(m.id) ?? [];
1969
+ list.push(m);
1970
+ metricIdToItems.set(m.id, list);
1971
+ }
1972
+ }
1973
+ const aggregatedScores = [];
1974
+ for (const items of scoreIdToItems.values()) {
1975
+ const agg = aggregateScoreItems(items);
1976
+ if (agg)
1977
+ aggregatedScores.push(agg);
1978
+ }
1979
+ const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1980
+ const passed = events.every((ev) => {
1981
+ const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1982
+ return es?.passed ?? false;
1983
+ });
1984
+ result.push({
1985
+ evaluatorId,
1986
+ scores: aggregatedScores,
1987
+ passed,
1988
+ metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0
1989
+ });
1990
+ }
1991
+ return result;
1992
+ }
1993
+ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
1577
1994
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
1578
1995
  const scoreParts = [];
1579
1996
  for (const item of scores) {
@@ -1585,7 +2002,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1585
2002
  );
1586
2003
  continue;
1587
2004
  }
1588
- const formatted = def.format(item.data);
2005
+ const formatted = def.format(item.data, options);
1589
2006
  switch (def.displayStrategy) {
1590
2007
  case "bar": {
1591
2008
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
@@ -1618,7 +2035,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1618
2035
  for (const { id, data } of metrics) {
1619
2036
  const def = getMetricById(id);
1620
2037
  if (def) {
1621
- const formatted = def.format(data);
2038
+ const formatted = def.format(data, options);
1622
2039
  metricParts.push(
1623
2040
  def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
1624
2041
  );
@@ -1651,8 +2068,9 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1651
2068
  evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
1652
2069
  );
1653
2070
  const aggregates = /* @__PURE__ */ new Map();
1654
- const testCaseSummaries = [];
2071
+ const testCaseByTestId = /* @__PURE__ */ new Map();
1655
2072
  let overallScoreTotal = 0;
2073
+ let overallScoreSumSq = 0;
1656
2074
  let overallScoreCount = 0;
1657
2075
  let completedCount = 0;
1658
2076
  let totalCount = 0;
@@ -1665,6 +2083,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1665
2083
  }
1666
2084
  process.stdout.write("\r\x1B[2K");
1667
2085
  }
2086
+ function cursorUp(n) {
2087
+ if (!process.stdout.isTTY || n <= 0)
2088
+ return;
2089
+ process.stdout.write(`\x1B[${n}A`);
2090
+ }
1668
2091
  function drawSpinner() {
1669
2092
  if (!process.stdout.isTTY || runFinished) {
1670
2093
  return;
@@ -1678,6 +2101,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1678
2101
  )} ${colorize("(live)", ansi2.dim)}`
1679
2102
  );
1680
2103
  }
2104
+ let lastPrintedTestCaseId = null;
2105
+ let lastPrintedLineCount = 0;
1681
2106
  let spinnerTimer;
1682
2107
  const done = new Promise((resolve5) => {
1683
2108
  const unsubscribe = runner.subscribeRunEvents((event) => {
@@ -1685,55 +2110,94 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1685
2110
  completedCount = event.completedTestCases;
1686
2111
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1687
2112
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
1688
- clearLine();
1689
- console.log(
1690
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi2.dim)}`
1691
- );
2113
+ const testCaseId = event.testCaseId;
2114
+ const existing = testCaseByTestId.get(testCaseId) ?? {
2115
+ name: event.testCaseName,
2116
+ events: []
2117
+ };
2118
+ existing.events.push({
2119
+ averageScore,
2120
+ passed: event.passed,
2121
+ durationMs: event.durationMs,
2122
+ evaluatorScores: event.evaluatorScores
2123
+ });
2124
+ testCaseByTestId.set(testCaseId, existing);
1692
2125
  for (const item of event.evaluatorScores) {
1693
- const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
1694
- console.log(
1695
- formatEvaluatorScoreLine(
1696
- name,
1697
- item.scores,
1698
- item.passed,
1699
- item.metrics
1700
- )
1701
- );
1702
- if (!item.passed && item.logs && item.logs.length > 0) {
1703
- for (const log of item.logs) {
1704
- if (log.type === "diff") {
1705
- const useColor = process.stdout.isTTY;
1706
- for (const { type, line } of getDiffLines(log)) {
1707
- const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
1708
- console.log(colored);
1709
- }
1710
- }
1711
- }
1712
- }
1713
2126
  const numeric = toNumericScoreFromScores(item.scores);
1714
2127
  if (numeric !== void 0) {
1715
2128
  const current = aggregates.get(item.evaluatorId) ?? {
1716
2129
  total: 0,
2130
+ sumSq: 0,
1717
2131
  count: 0,
1718
2132
  passed: 0,
1719
2133
  failed: 0
1720
2134
  };
1721
2135
  aggregates.set(item.evaluatorId, {
1722
2136
  total: current.total + numeric,
2137
+ sumSq: current.sumSq + numeric * numeric,
1723
2138
  count: current.count + 1,
1724
2139
  passed: current.passed + (item.passed ? 1 : 0),
1725
2140
  failed: current.failed + (item.passed ? 0 : 1)
1726
2141
  });
1727
2142
  overallScoreTotal += numeric;
2143
+ overallScoreSumSq += numeric * numeric;
1728
2144
  overallScoreCount += 1;
1729
2145
  }
1730
2146
  }
1731
- testCaseSummaries.push({
1732
- name: event.testCaseName,
1733
- averageScore,
1734
- durationMs: event.durationMs,
1735
- passed: event.passed
1736
- });
2147
+ const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2148
+ const isLastRerun = event.rerunIndex >= event.rerunTotal;
2149
+ const isNonTty = !process.stdout.isTTY;
2150
+ const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
2151
+ if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
2152
+ cursorUp(lastPrintedLineCount);
2153
+ }
2154
+ const aggregatedScores = aggregateEvaluatorScoresFromEvents(
2155
+ existing.events);
2156
+ const isAggregated = existing.events.length > 1;
2157
+ const durationMs = existing.events.reduce(
2158
+ (s, e) => s + e.durationMs,
2159
+ 0
2160
+ );
2161
+ existing.events.every((e) => e.passed);
2162
+ const lines = [];
2163
+ lines.push(
2164
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
2165
+ );
2166
+ for (const item of aggregatedScores) {
2167
+ const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2168
+ lines.push(
2169
+ formatEvaluatorScoreLine(
2170
+ name,
2171
+ item.scores,
2172
+ item.passed,
2173
+ item.metrics,
2174
+ { isAggregated }
2175
+ )
2176
+ );
2177
+ const lastEvent = existing.events[existing.events.length - 1];
2178
+ const lastEs = lastEvent?.evaluatorScores.find(
2179
+ (x) => x.evaluatorId === item.evaluatorId
2180
+ );
2181
+ if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
2182
+ for (const log of lastEs.logs) {
2183
+ if (log.type === "diff") {
2184
+ const useColor = process.stdout.isTTY;
2185
+ for (const { type, line } of getDiffLines(log)) {
2186
+ const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
2187
+ lines.push(colored);
2188
+ }
2189
+ }
2190
+ }
2191
+ }
2192
+ }
2193
+ if (!skipPrintNonTty) {
2194
+ for (let i = 0; i < lines.length; i++) {
2195
+ process.stdout.write(`\r\x1B[2K${lines[i]}
2196
+ `);
2197
+ }
2198
+ lastPrintedTestCaseId = testCaseId;
2199
+ lastPrintedLineCount = lines.length;
2200
+ }
1737
2201
  drawSpinner();
1738
2202
  }
1739
2203
  if (event.type === "RunCompleted" || event.type === "RunFailed") {
@@ -1784,9 +2248,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1784
2248
  );
1785
2249
  if (overallScoreCount > 0) {
1786
2250
  const overallAverage = overallScoreTotal / overallScoreCount;
2251
+ const overallSd = sampleStdDev2(
2252
+ overallScoreTotal,
2253
+ overallScoreSumSq,
2254
+ overallScoreCount
2255
+ );
2256
+ const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
1787
2257
  console.log(
1788
2258
  `- overall avg score: ${colorize(
1789
- overallAverage.toFixed(2),
2259
+ avgStr,
1790
2260
  scoreToColor(overallAverage)
1791
2261
  )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
1792
2262
  );
@@ -1797,6 +2267,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1797
2267
  getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
1798
2268
  );
1799
2269
  }
2270
+ const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
1800
2271
  if (testCaseSummaries.length > 0) {
1801
2272
  console.log(colorize("- test case scores:", ansi2.magenta));
1802
2273
  for (const summary of testCaseSummaries) {
@@ -1807,9 +2278,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1807
2278
  );
1808
2279
  continue;
1809
2280
  }
2281
+ const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2282
+ summary.aggregatedScoreItem.data,
2283
+ { isAggregated: true }
2284
+ ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
1810
2285
  console.log(
1811
2286
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
1812
- summary.averageScore.toFixed(2),
2287
+ scoreLabel,
1813
2288
  scoreToColor(summary.averageScore)
1814
2289
  )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1815
2290
  );