@m4trix/evals 0.13.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
7
7
  import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
- import { diffString } from 'json-diff';
9
+ import { diffLines } from 'diff';
10
10
  import React2, { useState, useEffect, useCallback } from 'react';
11
11
  import { render, Box, Text } from 'ink';
12
12
  import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
@@ -260,8 +260,35 @@ async function collectTestCasesFromFiles(config) {
260
260
  );
261
261
  return found.flat();
262
262
  }
263
/**
 * Serializes a value to pretty-printed (2-space) JSON text for line-based diffing.
 *
 * Always returns a string. `JSON.stringify` yields `undefined` for values it
 * cannot represent (`undefined`, functions, symbols) and throws for others
 * (circular structures, BigInt); both cases fall back to `String(value)`.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} JSON text, or the `String(value)` fallback.
 */
function toJsonLines(value) {
  try {
    // `??` covers the "not representable" case where stringify returns undefined.
    return JSON.stringify(value, null, 2) ?? String(value);
  } catch {
    return String(value);
  }
}
270
/**
 * Renders a list of change parts as a line-prefixed text diff.
 *
 * Each part contributes its lines prefixed with "+ " (added), "- " (removed)
 * or "  " (unchanged). A single trailing empty line produced by a part's final
 * newline is dropped.
 *
 * Returns an empty string when no part is marked added or removed, so callers
 * can substitute their own "no differences" placeholder via `||`.
 *
 * @param {Array<{ value: string, added?: boolean, removed?: boolean }>} changes
 * @returns {string} The formatted diff, or "" when nothing changed.
 */
function formatDiffString(changes) {
  const hasDifference = changes.some((part) => part.added || part.removed);
  if (!hasDifference) {
    return "";
  }
  const lines = [];
  for (const part of changes) {
    const prefix = part.added ? "+" : part.removed ? "-" : " ";
    const partLines = part.value.split("\n");
    // A part ending in "\n" splits into a trailing "" that is not a real line.
    if (partLines[partLines.length - 1] === "") {
      partLines.pop();
    }
    for (const line of partLines) {
      lines.push(`${prefix} ${line}`);
    }
  }
  return lines.join("\n");
}
284
/**
 * Builds a text diff between two values by serializing both with
 * `toJsonLines`, diffing the results line by line, and formatting the
 * resulting change list with `formatDiffString`.
 *
 * @param {*} expected - Reference value.
 * @param {*} actual - Value to compare against the reference.
 * @returns {string} The formatted diff text.
 */
function createDiffString(expected, actual) {
  return formatDiffString(
    diffLines(toJsonLines(expected), toJsonLines(actual))
  );
}
263
290
  function createDiffLogEntry(expected, actual, options) {
264
- const diff = diffString(expected, actual, { color: false });
291
+ const diff = createDiffString(expected, actual);
265
292
  return {
266
293
  type: "diff",
267
294
  label: options?.label,
@@ -271,7 +298,7 @@ function createDiffLogEntry(expected, actual, options) {
271
298
  };
272
299
  }
273
300
  function getDiffLines(entry) {
274
- const raw = diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
301
+ const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
275
302
  return raw.split("\n").map((line) => {
276
303
  const trimmed = line.trimStart();
277
304
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -331,15 +358,28 @@ function getScoreById(id) {
331
358
  }
332
359
 
333
360
  // src/evals/aggregators.ts
334
- function aggregateAverage(values) {
361
/**
 * Aggregates score items into their mean, sample standard deviation and count.
 *
 * The standard deviation uses the (n - 1) sample denominator and is only
 * defined for two or more values; otherwise `stdDev` is `undefined`.
 * An empty input yields `{ value: 0, count: 0 }`.
 *
 * @param {Array<{ value: number }>} values - Items whose `value` is averaged.
 * @returns {{ value: number, stdDev?: number, count: number }}
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  const sum = values.reduce((s, v) => s + v.value, 0);
  const mean = sum / count;
  let stdDev;
  if (count >= 2) {
    // Two-pass form: summing squared deviations from the mean avoids the
    // catastrophic cancellation of `sumSq - n * mean^2` on large or close values.
    const squaredDeviations = values.reduce((acc, v) => {
      const delta = v.value - mean;
      return acc + delta * delta;
    }, 0);
    const variance = squaredDeviations / (count - 1);
    stdDev = variance > 0 ? Math.sqrt(variance) : 0;
  }
  return { value: mean, stdDev, count };
}
341
375
/**
 * Aggregates pass/fail items into an overall verdict plus counts.
 *
 * `passed` is true only when there is at least one item and every item has a
 * truthy `passed` flag.
 *
 * @param {Array<{ passed: boolean }>} values - Items to aggregate.
 * @returns {{ passed: boolean, passedCount: number, totalCount: number }}
 */
function aggregateAll(values) {
  let passedCount = 0;
  for (const entry of values) {
    if (entry.passed) {
      passedCount += 1;
    }
  }
  const totalCount = values.length;
  return {
    passed: totalCount > 0 && passedCount === totalCount,
    passedCount,
    totalCount
  };
}
344
384
  function aggregateTokenCountSum(values) {
345
385
  const initial = {
@@ -393,14 +433,28 @@ Score.of({
393
433
  id: "percent",
394
434
  name: "Score",
395
435
  displayStrategy: "bar",
396
- format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
397
- aggregate: aggregateAverage
436
+ format: (data, options) => {
437
+ if (options?.isAggregated) {
438
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
439
+ }
440
+ return data.value.toFixed(2);
441
+ },
442
+ aggregate: aggregateAverageWithVariance
398
443
  });
399
444
// Registers the built-in pass/fail ("binary") score definition.
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  // Single results render as PASSED / NOT PASSED; aggregated results render an
  // overall verdict, followed by "(passed/total)" when counts are present and
  // more than one item was aggregated.
  format: (data, options) => {
    if (!options?.isAggregated) {
      return data.passed ? "PASSED" : "NOT PASSED";
    }
    const verdict = data.passed ? "All: PASSED" : "Some: FAILED";
    const hasCounts = data.passedCount != null && data.totalCount != null && data.totalCount > 1;
    return hasCounts ? `${verdict} (${data.passedCount}/${data.totalCount})` : verdict;
  },
  aggregate: aggregateAll
});
406
460
 
@@ -1339,6 +1393,13 @@ function Spinner({ label = "Running" }) {
1339
1393
  label
1340
1394
  ] });
1341
1395
  }
1396
/**
 * Computes the sample standard deviation from running totals.
 *
 * @param {number} sum - Sum of the observations.
 * @param {number} sumSq - Sum of the squared observations.
 * @param {number} n - Number of observations.
 * @returns {number | undefined} `undefined` when `n < 2`; otherwise the
 *   standard deviation, with a non-positive computed variance reported as 0.
 */
function sampleStdDev(sum, sumSq, n) {
  if (n < 2) {
    return void 0;
  }
  const average = sum / n;
  const sampleVariance = (sumSq - n * average * average) / (n - 1);
  if (sampleVariance > 0) {
    return Math.sqrt(sampleVariance);
  }
  return 0;
}
1342
1403
  function scoreColor(score) {
1343
1404
  if (score >= 80)
1344
1405
  return "green";
@@ -1457,7 +1518,9 @@ function RunView({
1457
1518
  );
1458
1519
  setEvaluatorNameById(nameById);
1459
1520
  const aggregates = /* @__PURE__ */ new Map();
1521
+ const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
1460
1522
  let overallScoreTotal = 0;
1523
+ let overallScoreSumSq = 0;
1461
1524
  let overallScoreCount = 0;
1462
1525
  const done = new Promise((resolve5) => {
1463
1526
  const unsubscribe = runner.subscribeRunEvents((event) => {
@@ -1469,19 +1532,28 @@ function RunView({
1469
1532
  if (numeric !== void 0) {
1470
1533
  const current = aggregates.get(item.evaluatorId) ?? {
1471
1534
  total: 0,
1535
+ sumSq: 0,
1472
1536
  count: 0,
1473
1537
  passed: 0,
1474
1538
  failed: 0
1475
1539
  };
1476
1540
  aggregates.set(item.evaluatorId, {
1477
1541
  total: current.total + numeric,
1542
+ sumSq: current.sumSq + numeric * numeric,
1478
1543
  count: current.count + 1,
1479
1544
  passed: current.passed + (item.passed ? 1 : 0),
1480
1545
  failed: current.failed + (item.passed ? 0 : 1)
1481
1546
  });
1482
1547
  overallScoreTotal += numeric;
1548
+ overallScoreSumSq += numeric * numeric;
1483
1549
  overallScoreCount += 1;
1484
1550
  }
1551
+ for (const s of item.scores) {
1552
+ const key = `${item.evaluatorId}:${s.id}`;
1553
+ const list = scoreItemsByEvaluatorScore.get(key) ?? [];
1554
+ list.push(s);
1555
+ scoreItemsByEvaluatorScore.set(key, list);
1556
+ }
1485
1557
  }
1486
1558
  setTestCases((prev) => {
1487
1559
  const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
@@ -1549,8 +1621,10 @@ function RunView({
1549
1621
  failedTestCases: finalEvent.failedTestCases,
1550
1622
  totalTestCases: finalEvent.totalTestCases,
1551
1623
  overallScoreTotal,
1624
+ overallScoreSumSq,
1552
1625
  overallScoreCount,
1553
1626
  aggregates: new Map(aggregates),
1627
+ scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1554
1628
  artifactPath: finalEvent.artifactPath
1555
1629
  });
1556
1630
  setPhase("completed");
@@ -1633,36 +1707,45 @@ function RunView({
1633
1707
  ":",
1634
1708
  " ",
1635
1709
  /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1636
- " ",
1637
- item.scores.map((s) => /* @__PURE__ */ jsxs(
1710
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
1711
+ " ",
1712
+ item.metrics.map((m) => {
1713
+ const def = getMetricById(m.id);
1714
+ if (!def)
1715
+ return null;
1716
+ const formatted = def.format(m.data, {
1717
+ isAggregated: tc.isAggregated
1718
+ });
1719
+ return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1720
+ "[",
1721
+ def.name ? `${def.name}: ` : "",
1722
+ formatted,
1723
+ "]",
1724
+ " "
1725
+ ] }, m.id);
1726
+ })
1727
+ ] }) : null
1728
+ ] }),
1729
+ item.scores.length > 0 ? item.scores.map((s, idx) => {
1730
+ const def = getScoreById(s.id);
1731
+ const scoreLabel = def ? def.name ?? def.id : s.id;
1732
+ return /* @__PURE__ */ jsxs(
1638
1733
  Text,
1639
1734
  {
1640
1735
  color: scoreColor(toNumericScore(s.data) ?? 0),
1641
1736
  children: [
1737
+ " ",
1738
+ scoreLabel,
1739
+ ":",
1740
+ " ",
1642
1741
  formatScorePart(s, scoreColor, {
1643
1742
  isAggregated: tc.isAggregated
1644
- }),
1645
- " "
1743
+ })
1646
1744
  ]
1647
1745
  },
1648
- s.id
1649
- )),
1650
- item.metrics?.map((m) => {
1651
- const def = getMetricById(m.id);
1652
- if (!def)
1653
- return null;
1654
- const formatted = def.format(m.data, {
1655
- isAggregated: tc.isAggregated
1656
- });
1657
- return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1658
- "[",
1659
- def.name ? `${def.name}: ` : "",
1660
- formatted,
1661
- "]",
1662
- " "
1663
- ] }, m.id);
1664
- })
1665
- ] }),
1746
+ `${item.evaluatorId}-${s.id}-${idx}`
1747
+ );
1748
+ }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
1666
1749
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1667
1750
  (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
1668
1751
  ({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
@@ -1706,45 +1789,85 @@ function RunView({
1706
1789
  label: "overall avg",
1707
1790
  value: summary.overallScoreTotal / summary.overallScoreCount,
1708
1791
  barWidth: 20,
1709
- format: (v) => v.toFixed(2)
1792
+ format: (v) => {
1793
+ const sd = sampleStdDev(
1794
+ summary.overallScoreTotal,
1795
+ summary.overallScoreSumSq,
1796
+ summary.overallScoreCount
1797
+ );
1798
+ return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
1799
+ }
1710
1800
  }
1711
1801
  ) }),
1712
1802
  /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
1713
1803
  /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
1714
1804
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
1715
1805
  const agg = summary.aggregates.get(id);
1716
- if (!agg || agg.count === 0) {
1806
+ const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
1807
+ (k) => k.startsWith(`${id}:`)
1808
+ );
1809
+ if (scoreKeys.length === 0) {
1717
1810
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1718
1811
  "- ",
1719
1812
  name.padEnd(28),
1720
- " no numeric scores"
1813
+ " no scores"
1721
1814
  ] }, id);
1722
1815
  }
1723
- const mean = agg.total / agg.count;
1724
- return /* @__PURE__ */ jsxs(Text, { children: [
1725
- "- ",
1726
- name.padEnd(28),
1727
- " avg=",
1728
- /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
1816
+ const passedFailed = agg != null ? /* @__PURE__ */ jsxs(Text, { children: [
1729
1817
  " ",
1730
1818
  "passed=",
1731
1819
  agg.passed,
1732
1820
  " failed=",
1733
1821
  agg.failed
1822
+ ] }) : null;
1823
+ return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
1824
+ /* @__PURE__ */ jsxs(Text, { children: [
1825
+ "- ",
1826
+ name.padEnd(28),
1827
+ passedFailed
1828
+ ] }),
1829
+ scoreKeys.map((key) => {
1830
+ const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
1831
+ const aggregated = aggregateScoreItems(items);
1832
+ if (!aggregated)
1833
+ return null;
1834
+ const def = getScoreById(aggregated.id);
1835
+ const label = def ? def.name ?? def.id : aggregated.id;
1836
+ const formatted = def?.format(aggregated.data, {
1837
+ isAggregated: true
1838
+ }) ?? "n/a";
1839
+ const numeric = toNumericScore(aggregated.data);
1840
+ return /* @__PURE__ */ jsxs(
1841
+ Text,
1842
+ {
1843
+ color: numeric !== void 0 ? scoreColor(numeric) : "gray",
1844
+ children: [
1845
+ " ",
1846
+ label,
1847
+ ": ",
1848
+ formatted
1849
+ ]
1850
+ },
1851
+ key
1852
+ );
1853
+ })
1734
1854
  ] }, id);
1735
1855
  })
1736
1856
  ] }),
1737
1857
  /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
1738
1858
  /* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
1739
1859
  testCases.map((tc) => {
1740
- const numericScores = tc.aggregatedEvaluatorScores.flatMap(
1741
- (item) => item.scores.map((s) => toNumericScoreFromScores([s])).filter((n) => n !== void 0)
1860
+ const allScores = tc.events.flatMap(
1861
+ (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
1742
1862
  );
1743
- const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
1863
+ const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
1864
+ const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
1865
+ const total = allScores.reduce((a, b) => a + b, 0);
1866
+ const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
1744
1867
  const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
1745
1868
  const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
1746
1869
  isAggregated: true
1747
- }) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
1870
+ }) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
1748
1871
  return /* @__PURE__ */ jsxs(Box, { children: [
1749
1872
  /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1750
1873
  /* @__PURE__ */ jsxs(Text, { children: [
@@ -1778,13 +1901,26 @@ function RunView({
1778
1901
  }
1779
1902
 
1780
1903
  // src/cli-simple/run.ts
1904
/**
 * Sample standard deviation derived from a running sum and sum of squares.
 *
 * @param {number} sum - Sum of the observations.
 * @param {number} sumSq - Sum of the squared observations.
 * @param {number} n - Number of observations.
 * @returns {number | undefined} `undefined` for fewer than two observations;
 *   0 when the computed variance is not positive; otherwise its square root.
 */
function sampleStdDev2(sum, sumSq, n) {
  if (n < 2) {
    return void 0;
  }
  const mean = sum / n;
  const numerator = sumSq - n * mean * mean;
  const variance = numerator / (n - 1);
  if (!(variance > 0)) {
    return 0;
  }
  return Math.sqrt(variance);
}
1781
1911
  function buildTestCaseSummaries(byId) {
1782
1912
  const summaries = [];
1783
1913
  for (const { name, events } of byId.values()) {
1784
1914
  const passed = events.every((e) => e.passed);
1785
1915
  const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
1786
1916
  const isAggregated = events.length > 1;
1787
- const numericScores = [];
1917
+ const allScores = events.flatMap(
1918
+ (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
1919
+ );
1920
+ const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
1921
+ const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
1922
+ const total = allScores.reduce((a, b) => a + b, 0);
1923
+ const stdDev = sampleStdDev2(total, sumSq, allScores.length);
1788
1924
  let firstAggregatedScore;
1789
1925
  for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
1790
1926
  const scoreIdToItems = /* @__PURE__ */ new Map();
@@ -1800,21 +1936,18 @@ function buildTestCaseSummaries(byId) {
1800
1936
  }
1801
1937
  for (const items of scoreIdToItems.values()) {
1802
1938
  const agg = aggregateScoreItems(items);
1803
- if (agg) {
1804
- const n = toNumericScoreFromScores([agg]);
1805
- if (n !== void 0) {
1806
- numericScores.push(n);
1807
- if (firstAggregatedScore === void 0) {
1808
- firstAggregatedScore = agg;
1809
- }
1810
- }
1939
+ if (agg && firstAggregatedScore === void 0) {
1940
+ firstAggregatedScore = agg;
1941
+ break;
1811
1942
  }
1812
1943
  }
1944
+ if (firstAggregatedScore !== void 0)
1945
+ break;
1813
1946
  }
1814
- const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
1815
1947
  summaries.push({
1816
1948
  name,
1817
1949
  averageScore,
1950
+ stdDev: stdDev ?? void 0,
1818
1951
  aggregatedScoreItem: firstAggregatedScore,
1819
1952
  isAggregated,
1820
1953
  durationMs,
@@ -1845,12 +1978,36 @@ function scoreToColor(score) {
1845
1978
  }
1846
1979
  return ansi2.red;
1847
1980
  }
1848
- function getEvaluatorSummaryLine(evaluatorName, aggregate) {
1849
- if (!aggregate || aggregate.count === 0) {
1850
- return `- ${evaluatorName.padEnd(28)} no numeric scores`;
1981
/**
 * Builds the plain-text summary lines for one evaluator.
 *
 * Score items are looked up in `scoreItemsByKey` under keys that start with
 * `"<evaluatorId>:"`. Each key's items are aggregated and formatted on their
 * own line beneath a header line carrying the evaluator name and, when
 * `aggregate` is given, its passed/failed counts.
 *
 * @param {string} evaluatorId - Evaluator id used as the key prefix.
 * @param {string} evaluatorName - Display name, padded to 28 characters.
 * @param {{ passed: number, failed: number } | undefined} aggregate - Pass/fail totals, if any.
 * @param {Map<string, Array>} scoreItemsByKey - Score items grouped by "<evaluatorId>:<scoreId>".
 * @returns {string[]} One or more lines to print.
 */
function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
  const keyPrefix = `${evaluatorId}:`;
  const scoreKeys = Array.from(scoreItemsByKey.keys()).filter((key) => key.startsWith(keyPrefix));
  const paddedName = evaluatorName.padEnd(28);
  if (scoreKeys.length === 0) {
    return [`- ${paddedName} no scores`];
  }
  const passedFailed = aggregate != null ? ` passed=${aggregate.passed} failed=${aggregate.failed}` : "";
  const scoreLines = scoreKeys.flatMap((key) => {
    const agg = aggregateScoreItems(scoreItemsByKey.get(key) ?? []);
    if (!agg) {
      return [];
    }
    const def = getScoreById(agg.id);
    const label = def ? def.name ?? def.id : agg.id;
    const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
    const numeric = toNumericScore(agg.data);
    // Only color the value when it maps to a numeric score.
    const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
    return [` ${label}: ${colored}`];
  });
  if (scoreLines.length === 0) {
    return [`- ${paddedName} no numeric scores${passedFailed}`];
  }
  return [`- ${paddedName}${passedFailed}`, ...scoreLines];
}
1855
2012
  function createBar2(value, max = 100, width = 20) {
1856
2013
  const safe = Math.max(0, Math.min(max, value));
@@ -1902,46 +2059,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
1902
2059
  }
1903
2060
  function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
1904
2061
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
1905
- const scoreParts = [];
1906
- for (const item of scores) {
1907
- const def = getScoreById(item.id);
1908
- if (!def) {
1909
- const numeric = toNumericScore(item.data);
1910
- scoreParts.push(
1911
- numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
1912
- );
1913
- continue;
1914
- }
1915
- const formatted = def.format(item.data, options);
1916
- switch (def.displayStrategy) {
1917
- case "bar": {
1918
- const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1919
- if (typeof numeric === "number" && Number.isFinite(numeric)) {
1920
- scoreParts.push(
1921
- `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
1922
- );
1923
- } else {
1924
- scoreParts.push(formatted);
1925
- }
1926
- break;
1927
- }
1928
- case "number":
1929
- scoreParts.push(formatted);
1930
- break;
1931
- case "passFail":
1932
- scoreParts.push(
1933
- colorize(
1934
- formatted,
1935
- item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
1936
- )
1937
- );
1938
- break;
1939
- }
1940
- }
1941
- const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
1942
- let line = ` ${name}: ${passLabel} ${scoreStr}`;
2062
+ const metricParts = [];
1943
2063
  if (metrics && metrics.length > 0) {
1944
- const metricParts = [];
1945
2064
  for (const { id, data } of metrics) {
1946
2065
  const def = getMetricById(id);
1947
2066
  if (def) {
@@ -1951,11 +2070,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
1951
2070
  );
1952
2071
  }
1953
2072
  }
1954
- if (metricParts.length > 0) {
1955
- line += ` ${metricParts.join(" ")}`;
2073
+ }
2074
+ const scoreLines = [];
2075
+ for (const item of scores) {
2076
+ const def = getScoreById(item.id);
2077
+ const scoreLabel = def ? def.name ?? def.id : item.id;
2078
+ let formatted;
2079
+ if (!def) {
2080
+ const numeric = toNumericScore(item.data);
2081
+ formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
2082
+ } else {
2083
+ const raw = def.format(item.data, options);
2084
+ switch (def.displayStrategy) {
2085
+ case "bar": {
2086
+ const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2087
+ if (typeof numeric === "number" && Number.isFinite(numeric)) {
2088
+ formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
2089
+ } else {
2090
+ formatted = raw;
2091
+ }
2092
+ break;
2093
+ }
2094
+ case "number":
2095
+ formatted = raw;
2096
+ break;
2097
+ case "passFail":
2098
+ formatted = colorize(
2099
+ raw,
2100
+ item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
2101
+ );
2102
+ break;
2103
+ }
1956
2104
  }
2105
+ scoreLines.push(` ${scoreLabel}: ${formatted}`);
1957
2106
  }
1958
- return line;
2107
+ const lines = [];
2108
+ const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
2109
+ lines.push(` ${name}: ${passLabel}${metricStr}`);
2110
+ if (scoreLines.length > 0) {
2111
+ lines.push(...scoreLines);
2112
+ } else {
2113
+ lines.push(` n/a`);
2114
+ }
2115
+ return lines;
1959
2116
  }
1960
2117
  async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
1961
2118
  const dataset = await runner.resolveDatasetByName(datasetName);
@@ -1978,8 +2135,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1978
2135
  evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
1979
2136
  );
1980
2137
  const aggregates = /* @__PURE__ */ new Map();
2138
+ const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
1981
2139
  const testCaseByTestId = /* @__PURE__ */ new Map();
1982
2140
  let overallScoreTotal = 0;
2141
+ let overallScoreSumSq = 0;
1983
2142
  let overallScoreCount = 0;
1984
2143
  let completedCount = 0;
1985
2144
  let totalCount = 0;
@@ -2036,19 +2195,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2036
2195
  if (numeric !== void 0) {
2037
2196
  const current = aggregates.get(item.evaluatorId) ?? {
2038
2197
  total: 0,
2198
+ sumSq: 0,
2039
2199
  count: 0,
2040
2200
  passed: 0,
2041
2201
  failed: 0
2042
2202
  };
2043
2203
  aggregates.set(item.evaluatorId, {
2044
2204
  total: current.total + numeric,
2205
+ sumSq: current.sumSq + numeric * numeric,
2045
2206
  count: current.count + 1,
2046
2207
  passed: current.passed + (item.passed ? 1 : 0),
2047
2208
  failed: current.failed + (item.passed ? 0 : 1)
2048
2209
  });
2049
2210
  overallScoreTotal += numeric;
2211
+ overallScoreSumSq += numeric * numeric;
2050
2212
  overallScoreCount += 1;
2051
2213
  }
2214
+ for (const s of item.scores) {
2215
+ const key = `${item.evaluatorId}:${s.id}`;
2216
+ const list = scoreItemsByEvaluatorScore.get(key) ?? [];
2217
+ list.push(s);
2218
+ scoreItemsByEvaluatorScore.set(key, list);
2219
+ }
2052
2220
  }
2053
2221
  const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2054
2222
  const isLastRerun = event.rerunIndex >= event.rerunTotal;
@@ -2072,7 +2240,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2072
2240
  for (const item of aggregatedScores) {
2073
2241
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2074
2242
  lines.push(
2075
- formatEvaluatorScoreLine(
2243
+ ...formatEvaluatorScoreLine(
2076
2244
  name,
2077
2245
  item.scores,
2078
2246
  item.passed,
@@ -2154,18 +2322,30 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2154
2322
  );
2155
2323
  if (overallScoreCount > 0) {
2156
2324
  const overallAverage = overallScoreTotal / overallScoreCount;
2325
+ const overallSd = sampleStdDev2(
2326
+ overallScoreTotal,
2327
+ overallScoreSumSq,
2328
+ overallScoreCount
2329
+ );
2330
+ const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
2157
2331
  console.log(
2158
2332
  `- overall avg score: ${colorize(
2159
- overallAverage.toFixed(2),
2333
+ avgStr,
2160
2334
  scoreToColor(overallAverage)
2161
2335
  )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
2162
2336
  );
2163
2337
  }
2164
2338
  console.log(colorize("- evaluator averages:", ansi2.magenta));
2165
2339
  for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
2166
- console.log(
2167
- getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
2340
+ const evaluatorLines = getEvaluatorSummaryLines(
2341
+ evaluatorId,
2342
+ evaluatorName,
2343
+ aggregates.get(evaluatorId),
2344
+ scoreItemsByEvaluatorScore
2168
2345
  );
2346
+ for (const line of evaluatorLines) {
2347
+ console.log(line);
2348
+ }
2169
2349
  }
2170
2350
  const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
2171
2351
  if (testCaseSummaries.length > 0) {
@@ -2181,7 +2361,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2181
2361
  const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2182
2362
  summary.aggregatedScoreItem.data,
2183
2363
  { isAggregated: true }
2184
- ) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
2364
+ ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2185
2365
  console.log(
2186
2366
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
2187
2367
  scoreLabel,