npm - @m4trix/evals - Versions diffs - 0.12.0 → 0.14.0 - Mend

@m4trix/evals 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/dist/cli-simple.js CHANGED Viewed

@@ -1,12 +1,12 @@
 #!/usr/bin/env node
 import { randomUUID } from 'crypto';
-import { Effect, PubSub, Queue, Fiber } from 'effect';
+import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
 import { existsSync } from 'fs';
 import { resolve, relative, join, parse, dirname } from 'path';
 import * as jitiModule from 'jiti';
 import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
 import { pathToFileURL } from 'url';
-import { diffString } from 'json-diff';
+import { diffLines } from 'diff';
 import React2, { useState, useEffect, useCallback } from 'react';
 import { render, Box, Text } from 'ink';
 import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
@@ -30,7 +30,8 @@ var defaultRunnerConfig = {
     ],
     excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
   },
-  artifactDirectory: ".eval-results"
+  artifactDirectory: ".eval-results",
+  maxConcurrency: 1
 };
 function toRunnerConfigOverrides(config) {
   if (!config) {
@@ -63,6 +64,9 @@ function toRunnerConfigOverrides(config) {
   if (config.artifactDirectory !== void 0) {
     overrides.artifactDirectory = config.artifactDirectory;
   }
+  if (config.maxConcurrency !== void 0) {
+    overrides.maxConcurrency = config.maxConcurrency;
+  }
   if (Object.keys(discovery).length > 0) {
     overrides.discovery = discovery;
   }
@@ -256,8 +260,35 @@ async function collectTestCasesFromFiles(config) {
   );
   return found.flat();
 }
+function toJsonLines(value) {
+  try {
+    return JSON.stringify(value, null, 2);
+  } catch {
+    return String(value);
+  }
+}
+function formatDiffString(changes) {
+  const lines = [];
+  for (const part of changes) {
+    const prefix = part.added ? "+" : part.removed ? "-" : " ";
+    const partLines = part.value.split("\n");
+    if (partLines[partLines.length - 1] === "") {
+      partLines.pop();
+    }
+    for (const line of partLines) {
+      lines.push(`${prefix} ${line}`);
+    }
+  }
+  return lines.join("\n");
+}
+function createDiffString(expected, actual) {
+  const expectedStr = toJsonLines(expected);
+  const actualStr = toJsonLines(actual);
+  const changes = diffLines(expectedStr, actualStr);
+  return formatDiffString(changes);
+}
 function createDiffLogEntry(expected, actual, options) {
-  const diff = diffString(expected, actual, { color: false });
+  const diff = createDiffString(expected, actual);
   return {
     type: "diff",
     label: options?.label,
@@ -267,7 +298,7 @@ function createDiffLogEntry(expected, actual, options) {
   };
 }
 function getDiffLines(entry) {
-  const raw = diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
+  const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
   return raw.split("\n").map((line) => {
     const trimmed = line.trimStart();
     if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -287,6 +318,7 @@ var Metric = {
     const def = {
       id: config.id,
       name: config.name,
+      aggregate: config.aggregate,
       format: config.format,
       make: (data) => ({ id: config.id, data })
     };
@@ -306,6 +338,7 @@ var Score = {
       id: config.id,
       name: config.name,
       displayStrategy: config.displayStrategy,
+      aggregate: config.aggregate,
       format: config.format,
       make: (data, options) => {
         const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
@@ -324,23 +357,75 @@ function getScoreById(id) {
   return registry2.get(id);
 }
+// src/evals/aggregators.ts
+function aggregateAverageWithVariance(values) {
+  if (values.length === 0) {
+    return { value: 0, count: 0 };
+  }
+  const sum = values.reduce((s, v) => s + v.value, 0);
+  const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
+  const mean = sum / values.length;
+  let stdDev;
+  if (values.length >= 2) {
+    const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
+    stdDev = variance > 0 ? Math.sqrt(variance) : 0;
+  }
+  return { value: mean, stdDev, count: values.length };
+}
+function aggregateAll(values) {
+  const total = values.length;
+  const passedCount = values.filter((v) => v.passed).length;
+  return {
+    passed: total > 0 && values.every((v) => v.passed),
+    passedCount,
+    totalCount: total
+  };
+}
+function aggregateTokenCountSum(values) {
+  const initial = {
+    input: 0,
+    output: 0,
+    inputCached: 0,
+    outputCached: 0
+  };
+  return values.reduce(
+    (acc, v) => ({
+      input: acc.input + (v.input ?? 0),
+      output: acc.output + (v.output ?? 0),
+      inputCached: acc.inputCached + (v.inputCached ?? 0),
+      outputCached: acc.outputCached + (v.outputCached ?? 0)
+    }),
+    initial
+  );
+}
+function aggregateLatencyAverage(values) {
+  if (values.length === 0) {
+    return { ms: 0 };
+  }
+  const sum = values.reduce((s, v) => s + v.ms, 0);
+  return { ms: sum / values.length };
+}
 // src/evals/metrics/standard.ts
 Metric.of({
   id: "token-count",
   name: "Tokens",
-  format: (data) => {
+  aggregate: aggregateTokenCountSum,
+  format: (data, options) => {
     const input = data.input ?? 0;
     const output = data.output ?? 0;
     const inputCached = data.inputCached ?? 0;
     const outputCached = data.outputCached ?? 0;
     const cached = inputCached + outputCached;
-    return `in:${input} out:${output} cached:${cached}`;
+    const base = `in:${input} out:${output} cached:${cached}`;
+    return options?.isAggregated ? `Total: ${base}` : base;
   }
 });
 Metric.of({
   id: "latency",
   name: "Latency",
-  format: (data) => `${data.ms}ms`
+  aggregate: aggregateLatencyAverage,
+  format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
 });
 // src/evals/scores/standard.ts
@@ -348,16 +433,50 @@ Score.of({
   id: "percent",
   name: "Score",
   displayStrategy: "bar",
-  format: (data) => data.value.toFixed(2)
+  format: (data, options) => {
+    if (options?.isAggregated) {
+      return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
+    }
+    return data.value.toFixed(2);
+  },
+  aggregate: aggregateAverageWithVariance
 });
 Score.of({
   id: "binary",
   name: "Result",
   displayStrategy: "passFail",
-  format: (data) => data.passed ? "PASSED" : "NOT PASSED"
+  format: (data, options) => {
+    if (options?.isAggregated) {
+      const base = data.passed ? "All: PASSED" : "Some: FAILED";
+      if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
+        return `${base} (${data.passedCount}/${data.totalCount})`;
+      }
+      return base;
+    }
+    return data.passed ? "PASSED" : "NOT PASSED";
+  },
+  aggregate: aggregateAll
 });
 // src/runner/score-utils.ts
+function aggregateScoreItems(items) {
+  if (items.length === 0)
+    return void 0;
+  const def = getScoreById(items[0].id);
+  if (!def?.aggregate)
+    return items[items.length - 1];
+  const aggregated = def.aggregate(items.map((i) => i.data));
+  return { ...items[0], data: aggregated };
+}
+function aggregateMetricItems(items) {
+  if (items.length === 0)
+    return void 0;
+  const def = getMetricById(items[0].id);
+  if (!def?.aggregate)
+    return items[items.length - 1];
+  const aggregated = def.aggregate(items.map((i) => i.data));
+  return { ...items[0], data: aggregated };
+}
 function toNumericScoreFromScores(scores) {
   for (const item of scores) {
     const def = getScoreById(item.id);
@@ -436,6 +555,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
     `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
   );
 }
+function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
+  return Effect.gen(function* () {
+    const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
+    const rerunPassed = [];
+    for (let r = 0; r < reruns; r++) {
+      const started = Date.now();
+      const evaluatorScores = [];
+      let testCaseError;
+      const output = readOutput(testCaseItem.testCase);
+      for (const { id: evaluatorId, evaluator } of task.evaluators) {
+        const evaluateFn = evaluator.getEvaluateFn();
+        if (!evaluateFn) {
+          continue;
+        }
+        try {
+          const logs = [];
+          const logDiff = (expected, actual, options) => {
+            logs.push(createDiffLogEntry(expected, actual, options));
+          };
+          const ctx = yield* Effect.promise(
+            () => Promise.resolve(evaluator.resolveContext())
+          );
+          const result = yield* Effect.promise(
+            () => Promise.resolve(
+              evaluateFn({
+                input: testCaseItem.testCase.getInput(),
+                ctx,
+                output,
+                logDiff
+              })
+            )
+          );
+          const { scores, metrics } = normalizeResult(result);
+          const passed2 = computeEvaluatorPassed(evaluator, result, scores);
+          evaluatorScores.push({
+            evaluatorId,
+            scores,
+            passed: passed2,
+            metrics,
+            logs: logs.length > 0 ? logs : void 0
+          });
+        } catch (error) {
+          testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
+          evaluatorScores.push({
+            evaluatorId,
+            scores: [],
+            passed: false
+          });
+        }
+      }
+      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
+      rerunPassed.push(rerunPassedThis);
+      const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
+        n + 1,
+        n + 1
+      ]);
+      const progressEvent = {
+        type: "TestCaseProgress",
+        runId: task.runId,
+        testCaseId: testCaseItem.id,
+        testCaseName: testCaseItem.testCase.getName(),
+        completedTestCases: completedEvaluations,
+        totalTestCases: totalEvaluations,
+        rerunIndex: r + 1,
+        rerunTotal: reruns,
+        passed: rerunPassedThis,
+        durationMs: Date.now() - started,
+        evaluatorScores,
+        output,
+        errorMessage: testCaseError
+      };
+      updateSnapshot(task.runId, (snapshot) => ({
+        ...snapshot,
+        completedTestCases: completedEvaluations
+      }));
+      yield* publishEvent(progressEvent);
+      yield* Queue.offer(persistenceQueue, {
+        runId: task.runId,
+        artifactPath: task.snapshot.artifactPath,
+        payload: progressEvent
+      });
+    }
+    const testCasePassed = rerunPassed.every(Boolean);
+    if (testCasePassed) {
+      yield* Ref.update(passedRef, (n) => n + 1);
+    } else {
+      yield* Ref.update(failedRef, (n) => n + 1);
+    }
+    const [passed, failed] = yield* Effect.all([
+      Ref.get(passedRef),
+      Ref.get(failedRef)
+    ]);
+    updateSnapshot(task.runId, (snapshot) => ({
+      ...snapshot,
+      passedTestCases: passed,
+      failedTestCases: failed
+    }));
+  });
+}
 var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
   const startedAt = Date.now();
   updateSnapshot(task.runId, (snapshot) => ({
@@ -448,104 +666,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
     runId: task.runId,
     startedAt
   });
-  let completedTestCases = 0;
-  let passedTestCases = 0;
-  let failedTestCases = 0;
-  for (const testCaseItem of task.testCases) {
-    const started = Date.now();
-    const evaluatorScores = [];
-    let testCaseError;
-    const output = readOutput(testCaseItem.testCase);
-    for (const { id: evaluatorId, evaluator } of task.evaluators) {
-      const evaluateFn = evaluator.getEvaluateFn();
-      if (!evaluateFn) {
-        continue;
-      }
-      try {
-        const logs = [];
-        const logDiff = (expected, actual, options) => {
-          logs.push(createDiffLogEntry(expected, actual, options));
-        };
-        const ctx = yield* Effect.promise(
-          () => Promise.resolve(evaluator.resolveContext())
-        );
-        const result = yield* Effect.promise(
-          () => Promise.resolve(
-            evaluateFn({
-              input: testCaseItem.testCase.getInput(),
-              ctx,
-              output,
-              logDiff
-            })
-          )
-        );
-        const { scores, metrics } = normalizeResult(result);
-        const passed = computeEvaluatorPassed(evaluator, result, scores);
-        evaluatorScores.push({
-          evaluatorId,
-          scores,
-          passed,
-          metrics,
-          logs: logs.length > 0 ? logs : void 0
-        });
-      } catch (error) {
-        testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
-        evaluatorScores.push({
-          evaluatorId,
-          scores: [],
-          passed: false
-        });
-      }
-    }
-    const testCasePassed = evaluatorScores.every((s) => s.passed);
-    completedTestCases += 1;
-    if (testCasePassed) {
-      passedTestCases += 1;
-    } else {
-      failedTestCases += 1;
-    }
-    const progressEvent = {
-      type: "TestCaseProgress",
-      runId: task.runId,
-      testCaseId: testCaseItem.id,
-      testCaseName: testCaseItem.testCase.getName(),
-      completedTestCases,
-      totalTestCases: task.testCases.length,
-      passed: testCasePassed,
-      durationMs: Date.now() - started,
-      evaluatorScores,
-      output,
-      errorMessage: testCaseError
-    };
-    updateSnapshot(task.runId, (snapshot) => ({
-      ...snapshot,
-      completedTestCases,
-      passedTestCases,
-      failedTestCases
-    }));
-    yield* publishEvent(progressEvent);
-    yield* Queue.offer(persistenceQueue, {
-      runId: task.runId,
-      artifactPath: task.snapshot.artifactPath,
-      payload: progressEvent
-    });
-  }
+  const totalEvaluations = task.testCases.reduce(
+    (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
+    0
+  );
+  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
+  const completedRef = yield* Ref.make(0);
+  const passedRef = yield* Ref.make(0);
+  const failedRef = yield* Ref.make(0);
+  const processTestCase = (testCaseItem) => processOneTestCase(
+    task,
+    testCaseItem,
+    totalEvaluations,
+    publishEvent,
+    persistenceQueue,
+    updateSnapshot,
+    completedRef,
+    passedRef,
+    failedRef
+  );
+  yield* Effect.forEach(
+    task.testCases,
+    processTestCase,
+    maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
+  );
+  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
+    Ref.get(completedRef),
+    Ref.get(passedRef),
+    Ref.get(failedRef)
+  ]);
   const finishedAt = Date.now();
   const completedEvent = {
     type: "RunCompleted",
     runId: task.runId,
     finishedAt,
-    passedTestCases,
-    failedTestCases,
+    passedTestCases: passedUniqueTestCases,
+    failedTestCases: failedUniqueTestCases,
     totalTestCases: task.testCases.length,
     artifactPath: task.snapshot.artifactPath
   };
   updateSnapshot(task.runId, (snapshot) => ({
     ...snapshot,
     status: "completed",
-    completedTestCases,
-    passedTestCases,
-    failedTestCases,
+    completedTestCases: completedEvaluations,
+    passedTestCases: passedUniqueTestCases,
+    failedTestCases: failedUniqueTestCases,
     finishedAt
   }));
   yield* publishEvent(completedEvent);
@@ -633,7 +798,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
   const artifactPath = filePath;
   const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
   const progress = aggregateTestCaseProgress(lines);
-  const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
+  const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
   const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
   const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
   return {
@@ -655,23 +820,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
 }
 function aggregateTestCaseProgress(lines) {
   let completedTestCases = 0;
-  let passedTestCases = 0;
-  let failedTestCases = 0;
+  const testCasePassedBy = /* @__PURE__ */ new Map();
   for (const line of lines) {
     try {
       const event = JSON.parse(line);
       if (event.type === "TestCaseProgress") {
         const ev = event;
         completedTestCases = ev.completedTestCases ?? completedTestCases;
-        if (ev.passed) {
-          passedTestCases += 1;
-        } else {
-          failedTestCases += 1;
-        }
+        const id = ev.testCaseId;
+        const current = testCasePassedBy.get(id);
+        testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
       }
     } catch {
     }
   }
+  let passedTestCases = 0;
+  let failedTestCases = 0;
+  for (const passed of testCasePassedBy.values()) {
+    if (passed) {
+      passedTestCases += 1;
+    } else {
+      failedTestCases += 1;
+    }
+  }
   return { completedTestCases, passedTestCases, failedTestCases };
 }
 async function appendJsonLine(artifactPath, payload) {
@@ -866,6 +1037,10 @@ var EffectRunner = class {
       throw new Error("No evaluators selected for run");
     }
     const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
+    const totalEvaluations = selectedTestCases.reduce(
+      (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
+      0
+    );
     const runId = `run-${randomUUID()}`;
     const artifactPath = createArtifactPath(
       this.config.artifactDirectory,
@@ -878,7 +1053,7 @@ var EffectRunner = class {
       datasetName: dataset.dataset.getName(),
       evaluatorIds: selectedEvaluators.map((item) => item.id),
       queuedAt: Date.now(),
-      totalTestCases: selectedTestCases.length,
+      totalTestCases: totalEvaluations,
       completedTestCases: 0,
       passedTestCases: 0,
       failedTestCases: 0,
@@ -892,7 +1067,7 @@ var EffectRunner = class {
       datasetId: request.datasetId,
       datasetName: dataset.dataset.getName(),
       evaluatorIds: selectedEvaluators.map((item) => item.id),
-      totalTestCases: selectedTestCases.length,
+      totalTestCases: totalEvaluations,
       artifactPath
     };
     await Effect.runPromise(this.publishEvent(queuedEvent));
@@ -903,6 +1078,7 @@ var EffectRunner = class {
         payload: queuedEvent
       })
     );
+    const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
     await Effect.runPromise(
       Queue.offer(this.runQueue, {
         runId,
@@ -910,7 +1086,8 @@ var EffectRunner = class {
         dataset: dataset.dataset,
         evaluators: selectedEvaluators,
         testCases: selectedTestCases,
-        snapshot
+        snapshot,
+        maxConcurrency
       })
     );
     return snapshot;
@@ -1216,6 +1393,13 @@ function Spinner({ label = "Running" }) {
     label
   ] });
 }
+function sampleStdDev(sum, sumSq, n) {
+  if (n < 2)
+    return void 0;
+  const mean = sum / n;
+  const variance = (sumSq - n * mean * mean) / (n - 1);
+  return variance > 0 ? Math.sqrt(variance) : 0;
+}
 function scoreColor(score) {
   if (score >= 80)
     return "green";
@@ -1228,13 +1412,62 @@ function createBar(value, max = 100, width = 20) {
   const filled = Math.round(safe / max * width);
   return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
 }
-function formatScorePart(item, scoreToColor2) {
+function aggregateEvaluatorScores(events, nameById) {
+  if (events.length === 0)
+    return [];
+  const evaluatorIds = new Set(
+    events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
+  );
+  const result = [];
+  for (const evaluatorId of evaluatorIds) {
+    const scoreIdToItems = /* @__PURE__ */ new Map();
+    const metricIdToItems = /* @__PURE__ */ new Map();
+    for (const ev of events) {
+      const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
+      for (const s of es?.scores ?? []) {
+        const list = scoreIdToItems.get(s.id) ?? [];
+        list.push(s);
+        scoreIdToItems.set(s.id, list);
+      }
+      for (const m of es?.metrics ?? []) {
+        const list = metricIdToItems.get(m.id) ?? [];
+        list.push(m);
+        metricIdToItems.set(m.id, list);
+      }
+    }
+    const aggregatedScores = [];
+    for (const items of scoreIdToItems.values()) {
+      const agg = aggregateScoreItems(items);
+      if (agg)
+        aggregatedScores.push(agg);
+    }
+    const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([id, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
+    const passed = events.every((ev) => {
+      const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
+      return es?.passed ?? false;
+    });
+    const lastEvent = events[events.length - 1];
+    const lastEs = lastEvent?.evaluatorScores.find(
+      (x) => x.evaluatorId === evaluatorId
+    );
+    result.push({
+      evaluatorId,
+      evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
+      scores: aggregatedScores,
+      passed,
+      metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0,
+      logs: lastEs?.logs
+    });
+  }
+  return result;
+}
+function formatScorePart(item, scoreToColor2, options) {
   const def = getScoreById(item.id);
   if (!def) {
     const numeric = toNumericScore(item.data);
     return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
   }
-  const formatted = def.format(item.data);
+  const formatted = def.format(item.data, options);
   if (def.displayStrategy === "bar") {
     const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
     if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -1254,6 +1487,7 @@ function RunView({
   );
   const [runInfo, setRunInfo] = useState(null);
   const [testCases, setTestCases] = useState([]);
+  const [completedEvaluations, setCompletedEvaluations] = useState(0);
   const [summary, setSummary] = useState(null);
   const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
   const runEval = useCallback(async () => {
@@ -1280,48 +1514,44 @@ function RunView({
       return;
     }
     const nameById = new Map(
-      evaluators.map((item) => [
-        item.id,
-        item.evaluator.getName() ?? item.id
-      ])
+      evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
     );
     setEvaluatorNameById(nameById);
     const aggregates = /* @__PURE__ */ new Map();
     let overallScoreTotal = 0;
+    let overallScoreSumSq = 0;
     let overallScoreCount = 0;
     const done = new Promise((resolve5) => {
       const unsubscribe = runner.subscribeRunEvents((event) => {
         if (event.type === "TestCaseProgress") {
           const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
-          const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
+          numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
           for (const item of event.evaluatorScores) {
             const numeric = toNumericScoreFromScores(item.scores);
             if (numeric !== void 0) {
               const current = aggregates.get(item.evaluatorId) ?? {
                 total: 0,
+                sumSq: 0,
                 count: 0,
                 passed: 0,
                 failed: 0
               };
               aggregates.set(item.evaluatorId, {
                 total: current.total + numeric,
+                sumSq: current.sumSq + numeric * numeric,
                 count: current.count + 1,
                 passed: current.passed + (item.passed ? 1 : 0),
                 failed: current.failed + (item.passed ? 0 : 1)
               });
               overallScoreTotal += numeric;
+              overallScoreSumSq += numeric * numeric;
               overallScoreCount += 1;
             }
           }
-          setTestCases((prev) => [
-            ...prev,
-            {
-              name: event.testCaseName,
-              completedTestCases: event.completedTestCases,
-              totalTestCases: event.totalTestCases,
-              durationMs: event.durationMs,
-              passed: event.passed,
-              averageScore,
+          setTestCases((prev) => {
+            const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
+            const existing = byId.get(event.testCaseId);
+            const newEvent = {
               evaluatorScores: event.evaluatorScores.map((item) => ({
                 evaluatorId: item.evaluatorId,
                 evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
@@ -1329,9 +1559,33 @@ function RunView({
                 passed: item.passed,
                 metrics: item.metrics,
                 logs: item.logs
-              }))
-            }
-          ]);
+              })),
+              passed: event.passed,
+              durationMs: event.durationMs
+            };
+            const events = existing ? [...existing.events, newEvent] : [newEvent];
+            const isAggregated = events.length > 1;
+            const aggregatedEvaluatorScores = aggregateEvaluatorScores(
+              events,
+              nameById
+            );
+            const merged = {
+              name: event.testCaseName,
+              testCaseId: event.testCaseId,
+              completedTestCases: event.completedTestCases,
+              totalTestCases: event.totalTestCases,
+              rerunIndex: event.rerunIndex,
+              rerunTotal: event.rerunTotal,
+              durationMs: events.reduce((s, e) => s + e.durationMs, 0),
+              passed: events.every((e) => e.passed),
+              events,
+              aggregatedEvaluatorScores,
+              isAggregated
+            };
+            byId.set(event.testCaseId, merged);
+            setCompletedEvaluations(event.completedTestCases);
+            return Array.from(byId.values());
+          });
         }
         if (event.type === "RunCompleted" || event.type === "RunFailed") {
           unsubscribe();
@@ -1346,9 +1600,7 @@ function RunView({
     setRunInfo({
       runId: snapshot.runId,
       datasetName: snapshot.datasetName,
-      evaluatorNames: evaluators.map(
-        (e) => e.evaluator.getName() ?? e.id
-      ),
+      evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
       totalTestCases: snapshot.totalTestCases
     });
     setPhase("running");
@@ -1362,6 +1614,7 @@ function RunView({
       failedTestCases: finalEvent.failedTestCases,
       totalTestCases: finalEvent.totalTestCases,
       overallScoreTotal,
+      overallScoreSumSq,
       overallScoreCount,
       aggregates: new Map(aggregates),
       artifactPath: finalEvent.artifactPath
@@ -1376,29 +1629,41 @@ function RunView({
     /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
     runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
       /* @__PURE__ */ jsxs(Text, { children: [
-        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run " }),
+        /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
+          "Run",
+          " "
+        ] }),
         /* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
       ] }),
       /* @__PURE__ */ jsxs(Text, { children: [
-        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Dataset " }),
+        /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
+          "Dataset",
+          " "
+        ] }),
         runInfo.datasetName
       ] }),
       /* @__PURE__ */ jsxs(Text, { children: [
-        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Evaluators " }),
+        /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
+          "Evaluators",
+          " "
+        ] }),
         runInfo.evaluatorNames.join(", ")
       ] }),
       /* @__PURE__ */ jsxs(Text, { children: [
-        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Test cases " }),
+        /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
+          "Test cases",
+          " "
+        ] }),
         runInfo.totalTestCases
       ] })
     ] }),
     phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
       Spinner,
       {
-        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
+        label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
       }
     ) }),
-    testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
+    testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
       /* @__PURE__ */ jsxs(Text, { children: [
         /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
           "[",
@@ -1409,49 +1674,78 @@ function RunView({
         ] }),
         " ",
         tc.name,
+        " ",
+        /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
+          "(",
+          tc.rerunIndex,
+          "/",
+          tc.rerunTotal,
+          ")"
+        ] }),
         /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
           " (",
           tc.durationMs,
           "ms)"
         ] })
       ] }),
-      tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [
-        /* @__PURE__ */ jsxs(Text, { children: [
-          item.evaluatorName,
-          ":",
-          " ",
-          /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
-          " ",
-          item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
-            formatScorePart(s),
-            " "
-          ] }, s.id)),
-          item.metrics?.map((m) => {
-            const def = getMetricById(m.id);
-            if (!def)
-              return null;
-            const formatted = def.format(m.data);
-            return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
-              "[",
-              def.name ? `${def.name}: ` : "",
-              formatted,
-              "]",
-              " "
-            ] }, m.id);
-          })
-        ] }),
-        !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
-          (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
-            Text,
-            {
-              color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
-              children: line
-            },
-            lineIdx
-          )) }, logIdx) : null
-        ) })
-      ] }, item.evaluatorId))
-    ] }, i)) }),
+      tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
+        Box,
+        {
+          flexDirection: "column",
+          marginLeft: 2,
+          children: [
+            /* @__PURE__ */ jsxs(Text, { children: [
+              item.evaluatorName,
+              ":",
+              " ",
+              /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
+              " ",
+              item.scores.map((s) => /* @__PURE__ */ jsxs(
+                Text,
+                {
+                  color: scoreColor(toNumericScore(s.data) ?? 0),
+                  children: [
+                    formatScorePart(s, scoreColor, {
+                      isAggregated: tc.isAggregated
+                    }),
+                    " "
+                  ]
+                },
+                s.id
+              )),
+              item.metrics?.map((m) => {
+                const def = getMetricById(m.id);
+                if (!def)
+                  return null;
+                const formatted = def.format(m.data, {
+                  isAggregated: tc.isAggregated
+                });
+                return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+                  "[",
+                  def.name ? `${def.name}: ` : "",
+                  formatted,
+                  "]",
+                  " "
+                ] }, m.id);
+              })
+            ] }),
+            !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
+              (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
+                ({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
+                  Text,
+                  {
+                    color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
+                    children: line
+                  },
+                  lineIdx
+                )
+              ) }, logIdx) : null
+            ) })
+          ]
+        },
+        item.evaluatorId
+      ))
+    ] }, tc.testCaseId)) }),
     phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
       /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
       /* @__PURE__ */ jsxs(Box, { marginTop: 1, children: [
@@ -1478,7 +1772,14 @@ function RunView({
           label: "overall avg",
           value: summary.overallScoreTotal / summary.overallScoreCount,
           barWidth: 20,
-          format: (v) => v.toFixed(2)
+          format: (v) => {
+            const sd = sampleStdDev(
+              summary.overallScoreTotal,
+              summary.overallScoreSumSq,
+              summary.overallScoreCount
+            );
+            return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
+          }
         }
       ) }),
       /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
@@ -1493,12 +1794,15 @@ function RunView({
             ] }, id);
           }
           const mean = agg.total / agg.count;
+          const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
+          const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
           return /* @__PURE__ */ jsxs(Text, { children: [
             "- ",
             name.padEnd(28),
             " avg=",
-            /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
-            " passed=",
+            /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: meanStr }),
+            " ",
+            "passed=",
             agg.passed,
             " failed=",
             agg.failed
@@ -1507,28 +1811,41 @@ function RunView({
       ] }),
       /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
         /* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
-        testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { children: [
-          /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
-          /* @__PURE__ */ jsxs(Text, { children: [
-            " ",
-            tc.name.padEnd(24)
-          ] }),
-          tc.averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
-            /* @__PURE__ */ jsxs(Text, { color: scoreColor(tc.averageScore), children: [
-              "score=",
-              tc.averageScore.toFixed(2)
+        testCases.map((tc) => {
+          const allScores = tc.events.flatMap(
+            (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
+          );
+          const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
+          const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
+          const total = allScores.reduce((a, b) => a + b, 0);
+          const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
+          const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
+          const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
+            isAggregated: true
+          }) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
+          return /* @__PURE__ */ jsxs(Box, { children: [
+            /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
+            /* @__PURE__ */ jsxs(Text, { children: [
+              " ",
+              tc.name.padEnd(24)
             ] }),
+            averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
+              /* @__PURE__ */ jsxs(Text, { color: scoreColor(averageScore), children: [
+                "score=",
+                scoreLabel
+              ] }),
+              /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+                " ",
+                createBar(averageScore, 100, 14)
+              ] })
+            ] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
             /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
-              " ",
-              createBar(tc.averageScore, 100, 14)
+              " (",
+              tc.durationMs,
+              "ms)"
             ] })
-          ] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
-          /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
-            " (",
-            tc.durationMs,
-            "ms)"
-          ] })
-        ] }, i))
+          ] }, tc.testCaseId);
+        })
       ] }),
       /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
         "artifact: ",
@@ -1539,6 +1856,61 @@ function RunView({
 }
 // src/cli-simple/run.ts
+/**
+ * Sample (n - 1 denominator) standard deviation computed from running totals.
+ *
+ * @param {number} sum   Sum of the observed values.
+ * @param {number} sumSq Sum of the squared observed values.
+ * @param {number} n     Number of observations.
+ * @returns {number|undefined} `undefined` when fewer than two observations
+ *   exist; otherwise the standard deviation, with 0 returned whenever the
+ *   computed variance is not strictly positive.
+ */
+function sampleStdDev2(sum, sumSq, n) {
+  if (n < 2) {
+    return void 0;
+  }
+  const avg = sum / n;
+  const unbiasedVariance = (sumSq - n * avg * avg) / (n - 1);
+  if (unbiasedVariance > 0) {
+    return Math.sqrt(unbiasedVariance);
+  }
+  // A non-positive result (e.g. from floating-point cancellation) is reported as 0.
+  return 0;
+}
+/**
+ * Collapse the recorded events of every test case into one summary row.
+ *
+ * @param byId Collection whose `values()` yield `{ name, events }`; each event
+ *   is read for `passed`, `durationMs` and `evaluatorScores`.
+ * @returns {Array} One object per entry with `name`, `averageScore`, `stdDev`,
+ *   `aggregatedScoreItem`, `isAggregated`, `durationMs` and `passed`.
+ */
+function buildTestCaseSummaries(byId) {
+  // Walks the evaluators of the first event in order and returns the first
+  // truthy result of aggregateScoreItems over that evaluator's score groups.
+  const firstAggregatedScoreOf = (events) => {
+    for (const reference of events[0]?.evaluatorScores ?? []) {
+      const itemsByScoreId = /* @__PURE__ */ new Map();
+      for (const ev of events) {
+        const match = ev.evaluatorScores.find(
+          (x) => x.evaluatorId === reference.evaluatorId
+        );
+        for (const s of match?.scores ?? []) {
+          const bucket = itemsByScoreId.get(s.id);
+          if (bucket) {
+            bucket.push(s);
+          } else {
+            itemsByScoreId.set(s.id, [s]);
+          }
+        }
+      }
+      for (const items of itemsByScoreId.values()) {
+        const agg = aggregateScoreItems(items);
+        if (agg) {
+          return agg;
+        }
+      }
+    }
+    return void 0;
+  };
+  return Array.from(byId.values(), ({ name, events }) => {
+    const passed = !events.some((e) => !e.passed);
+    let durationMs = 0;
+    for (const e of events) {
+      durationMs = durationMs + e.durationMs;
+    }
+    // Gather every numeric score across all events, then fold sum and sum of squares.
+    const numericScores = [];
+    for (const ev of events) {
+      for (const es of ev.evaluatorScores) {
+        const numeric = toNumericScoreFromScores(es.scores);
+        if (numeric !== void 0) {
+          numericScores.push(numeric);
+        }
+      }
+    }
+    let total = 0;
+    let sumSq = 0;
+    for (const v of numericScores) {
+      total = total + v;
+      sumSq = sumSq + v * v;
+    }
+    const count = numericScores.length;
+    const stdDev = sampleStdDev2(total, sumSq, count);
+    return {
+      name,
+      averageScore: count > 0 ? total / count : void 0,
+      stdDev: stdDev ?? void 0,
+      aggregatedScoreItem: firstAggregatedScoreOf(events),
+      isAggregated: events.length > 1,
+      durationMs,
+      passed
+    };
+  });
+}
 var ansi2 = {
   reset: "\x1B[0m",
   bold: "\x1B[1m",
@@ -1566,14 +1938,59 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
     return `- ${evaluatorName.padEnd(28)} no numeric scores`;
   }
   const mean = aggregate.total / aggregate.count;
-  return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
+  const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
+  const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
+  return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
 }
 /**
  * Render a fixed-width text bar: filled cells (U+2588) followed by empty
  * cells (U+2591).
  *
  * @param {number} value Amount to display; clamped to the range [0, max].
  * @param {number} [max=100] Value that corresponds to a completely filled bar.
  * @param {number} [width=20] Total number of cells in the bar.
  * @returns {string} A string of exactly `width` cells.
  */
 function createBar2(value, max = 100, width = 20) {
   const safe = Math.max(0, Math.min(max, value));
   const rawFilled = Math.round(safe / max * width);
   // A NaN value or a zero max makes the ratio NaN; String#repeat would then
   // emit "" for both halves and collapse the bar, so fall back to no fill.
   const filled = Number.isFinite(rawFilled) ? rawFilled : 0;
   return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
 }
-function formatEvaluatorScoreLine(name, scores, passed, metrics) {
+/**
+ * Merge the evaluator results of several events into one entry per evaluator.
+ *
+ * Scores and metrics are grouped by their `id` across all events and each
+ * group is reduced with aggregateScoreItems / aggregateMetricItems. An
+ * evaluator counts as passed only if it is present and passed in every event.
+ *
+ * @param {Array} events Events, each carrying an `evaluatorScores` array.
+ * @param evaluatorNameById Not read by this function.
+ * @returns {Array} Objects with `evaluatorId`, `scores`, `passed` and
+ *   `metrics` (`undefined` when no metric aggregate was produced), in order
+ *   of first appearance of each evaluator id.
+ */
+function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
+  if (events.length === 0) {
+    return [];
+  }
+  const appendTo = (groups, key, item) => {
+    const bucket = groups.get(key);
+    if (bucket) {
+      bucket.push(item);
+    } else {
+      groups.set(key, [item]);
+    }
+  };
+  const evaluatorIds = /* @__PURE__ */ new Set();
+  for (const ev of events) {
+    for (const entry of ev.evaluatorScores) {
+      evaluatorIds.add(entry.evaluatorId);
+    }
+  }
+  return Array.from(evaluatorIds, (evaluatorId) => {
+    // This evaluator's entry in each event (undefined where it is absent).
+    const perEvent = events.map(
+      (ev) => ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId)
+    );
+    const scoresById = /* @__PURE__ */ new Map();
+    const metricsById = /* @__PURE__ */ new Map();
+    for (const entry of perEvent) {
+      for (const s of entry?.scores ?? []) {
+        appendTo(scoresById, s.id, s);
+      }
+      for (const m of entry?.metrics ?? []) {
+        appendTo(metricsById, m.id, m);
+      }
+    }
+    const scores = [];
+    for (const items of scoresById.values()) {
+      const agg = aggregateScoreItems(items);
+      if (agg) {
+        scores.push(agg);
+      }
+    }
+    const metrics = [];
+    for (const items of metricsById.values()) {
+      const agg = aggregateMetricItems(items);
+      if (agg !== void 0) {
+        metrics.push(agg);
+      }
+    }
+    return {
+      evaluatorId,
+      scores,
+      passed: perEvent.every((entry) => entry?.passed ?? false),
+      metrics: metrics.length > 0 ? metrics : void 0
+    };
+  });
+}
+function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
   const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
   const scoreParts = [];
   for (const item of scores) {
@@ -1585,7 +2002,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
       );
       continue;
     }
-    const formatted = def.format(item.data);
+    const formatted = def.format(item.data, options);
     switch (def.displayStrategy) {
       case "bar": {
         const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
@@ -1618,7 +2035,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
     for (const { id, data } of metrics) {
       const def = getMetricById(id);
       if (def) {
-        const formatted = def.format(data);
+        const formatted = def.format(data, options);
         metricParts.push(
           def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
         );
@@ -1651,8 +2068,9 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
     evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
   );
   const aggregates = /* @__PURE__ */ new Map();
-  const testCaseSummaries = [];
+  const testCaseByTestId = /* @__PURE__ */ new Map();
   let overallScoreTotal = 0;
+  let overallScoreSumSq = 0;
   let overallScoreCount = 0;
   let completedCount = 0;
   let totalCount = 0;
@@ -1665,6 +2083,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
     }
     process.stdout.write("\r\x1B[2K");
   }
+  // Writes the ANSI "cursor up n rows" sequence to stdout; does nothing when
+  // stdout is not a TTY or when n <= 0.
+  function cursorUp(n) {
+    if (process.stdout.isTTY && !(n <= 0)) {
+      process.stdout.write(`\x1B[${n}A`);
+    }
+  }
   function drawSpinner() {
     if (!process.stdout.isTTY || runFinished) {
       return;
@@ -1678,6 +2101,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
       )} ${colorize("(live)", ansi2.dim)}`
     );
   }
+  let lastPrintedTestCaseId = null;
+  let lastPrintedLineCount = 0;
   let spinnerTimer;
   const done = new Promise((resolve5) => {
     const unsubscribe = runner.subscribeRunEvents((event) => {
@@ -1685,55 +2110,94 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
         completedCount = event.completedTestCases;
         const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
         const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
-        clearLine();
-        console.log(
-          `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi2.dim)}`
-        );
+        const testCaseId = event.testCaseId;
+        const existing = testCaseByTestId.get(testCaseId) ?? {
+          name: event.testCaseName,
+          events: []
+        };
+        existing.events.push({
+          averageScore,
+          passed: event.passed,
+          durationMs: event.durationMs,
+          evaluatorScores: event.evaluatorScores
+        });
+        testCaseByTestId.set(testCaseId, existing);
         for (const item of event.evaluatorScores) {
-          const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
-          console.log(
-            formatEvaluatorScoreLine(
-              name,
-              item.scores,
-              item.passed,
-              item.metrics
-            )
-          );
-          if (!item.passed && item.logs && item.logs.length > 0) {
-            for (const log of item.logs) {
-              if (log.type === "diff") {
-                const useColor = process.stdout.isTTY;
-                for (const { type, line } of getDiffLines(log)) {
-                  const colored = useColor && type === "remove" ? colorize(`      ${line}`, ansi2.red) : useColor && type === "add" ? colorize(`      ${line}`, ansi2.green) : `      ${line}`;
-                  console.log(colored);
-                }
-              }
-            }
-          }
           const numeric = toNumericScoreFromScores(item.scores);
           if (numeric !== void 0) {
             const current = aggregates.get(item.evaluatorId) ?? {
               total: 0,
+              sumSq: 0,
               count: 0,
               passed: 0,
               failed: 0
             };
             aggregates.set(item.evaluatorId, {
               total: current.total + numeric,
+              sumSq: current.sumSq + numeric * numeric,
               count: current.count + 1,
               passed: current.passed + (item.passed ? 1 : 0),
               failed: current.failed + (item.passed ? 0 : 1)
             });
             overallScoreTotal += numeric;
+            overallScoreSumSq += numeric * numeric;
             overallScoreCount += 1;
           }
         }
-        testCaseSummaries.push({
-          name: event.testCaseName,
-          averageScore,
-          durationMs: event.durationMs,
-          passed: event.passed
-        });
+        const isSameTestCase = lastPrintedTestCaseId === testCaseId;
+        const isLastRerun = event.rerunIndex >= event.rerunTotal;
+        const isNonTty = !process.stdout.isTTY;
+        const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
+        if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
+          cursorUp(lastPrintedLineCount);
+        }
+        const aggregatedScores = aggregateEvaluatorScoresFromEvents(
+          existing.events);
+        const isAggregated = existing.events.length > 1;
+        const durationMs = existing.events.reduce(
+          (s, e) => s + e.durationMs,
+          0
+        );
+        existing.events.every((e) => e.passed);
+        const lines = [];
+        lines.push(
+          `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
+        );
+        for (const item of aggregatedScores) {
+          const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
+          lines.push(
+            formatEvaluatorScoreLine(
+              name,
+              item.scores,
+              item.passed,
+              item.metrics,
+              { isAggregated }
+            )
+          );
+          const lastEvent = existing.events[existing.events.length - 1];
+          const lastEs = lastEvent?.evaluatorScores.find(
+            (x) => x.evaluatorId === item.evaluatorId
+          );
+          if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
+            for (const log of lastEs.logs) {
+              if (log.type === "diff") {
+                const useColor = process.stdout.isTTY;
+                for (const { type, line } of getDiffLines(log)) {
+                  const colored = useColor && type === "remove" ? colorize(`      ${line}`, ansi2.red) : useColor && type === "add" ? colorize(`      ${line}`, ansi2.green) : `      ${line}`;
+                  lines.push(colored);
+                }
+              }
+            }
+          }
+        }
+        if (!skipPrintNonTty) {
+          for (let i = 0; i < lines.length; i++) {
+            process.stdout.write(`\r\x1B[2K${lines[i]}
+`);
+          }
+          lastPrintedTestCaseId = testCaseId;
+          lastPrintedLineCount = lines.length;
+        }
         drawSpinner();
       }
       if (event.type === "RunCompleted" || event.type === "RunFailed") {
@@ -1784,9 +2248,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
   );
   if (overallScoreCount > 0) {
     const overallAverage = overallScoreTotal / overallScoreCount;
+    const overallSd = sampleStdDev2(
+      overallScoreTotal,
+      overallScoreSumSq,
+      overallScoreCount
+    );
+    const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
     console.log(
       `- overall avg score: ${colorize(
-        overallAverage.toFixed(2),
+        avgStr,
         scoreToColor(overallAverage)
       )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
     );
@@ -1797,6 +2267,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
       getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
     );
   }
+  const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
   if (testCaseSummaries.length > 0) {
     console.log(colorize("- test case scores:", ansi2.magenta));
     for (const summary of testCaseSummaries) {
@@ -1807,9 +2278,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
         );
         continue;
       }
+      const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
+        summary.aggregatedScoreItem.data,
+        { isAggregated: true }
+      ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
       console.log(
         `  ${status} ${summary.name.padEnd(24)} score=${colorize(
-          summary.averageScore.toFixed(2),
+          scoreLabel,
           scoreToColor(summary.averageScore)
         )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
       );