@m4trix/evals 0.13.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ var path = require('path');
8
8
  var jitiModule = require('jiti');
9
9
  var promises = require('fs/promises');
10
10
  var url = require('url');
11
- var jsonDiff = require('json-diff');
11
+ var diff = require('diff');
12
12
  var React2 = require('react');
13
13
  var ink = require('ink');
14
14
  var jsxRuntime = require('react/jsx-runtime');
@@ -286,8 +286,35 @@ async function collectTestCasesFromFiles(config) {
286
286
  );
287
287
  return found.flat();
288
288
  }
289
+ function toJsonLines(value) {
290
+ try {
291
+ return JSON.stringify(value, null, 2);
292
+ } catch {
293
+ return String(value);
294
+ }
295
+ }
296
+ function formatDiffString(changes) {
297
+ const lines = [];
298
+ for (const part of changes) {
299
+ const prefix = part.added ? "+" : part.removed ? "-" : " ";
300
+ const partLines = part.value.split("\n");
301
+ if (partLines[partLines.length - 1] === "") {
302
+ partLines.pop();
303
+ }
304
+ for (const line of partLines) {
305
+ lines.push(`${prefix} ${line}`);
306
+ }
307
+ }
308
+ return lines.join("\n");
309
+ }
310
+ function createDiffString(expected, actual) {
311
+ const expectedStr = toJsonLines(expected);
312
+ const actualStr = toJsonLines(actual);
313
+ const changes = diff.diffLines(expectedStr, actualStr);
314
+ return formatDiffString(changes);
315
+ }
289
316
  function createDiffLogEntry(expected, actual, options) {
290
- const diff = jsonDiff.diffString(expected, actual, { color: false });
317
+ const diff = createDiffString(expected, actual);
291
318
  return {
292
319
  type: "diff",
293
320
  label: options?.label,
@@ -297,7 +324,7 @@ function createDiffLogEntry(expected, actual, options) {
297
324
  };
298
325
  }
299
326
  function getDiffLines(entry) {
300
- const raw = jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
327
+ const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
301
328
  return raw.split("\n").map((line) => {
302
329
  const trimmed = line.trimStart();
303
330
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -357,15 +384,28 @@ function getScoreById(id) {
357
384
  }
358
385
 
359
386
  // src/evals/aggregators.ts
360
- function aggregateAverage(values) {
387
+ function aggregateAverageWithVariance(values) {
361
388
  if (values.length === 0) {
362
- return { value: 0 };
389
+ return { value: 0, count: 0 };
363
390
  }
364
391
  const sum = values.reduce((s, v) => s + v.value, 0);
365
- return { value: sum / values.length };
392
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
393
+ const mean = sum / values.length;
394
+ let stdDev;
395
+ if (values.length >= 2) {
396
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
397
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
398
+ }
399
+ return { value: mean, stdDev, count: values.length };
366
400
  }
367
401
  function aggregateAll(values) {
368
- return { passed: values.length > 0 && values.every((v) => v.passed) };
402
+ const total = values.length;
403
+ const passedCount = values.filter((v) => v.passed).length;
404
+ return {
405
+ passed: total > 0 && values.every((v) => v.passed),
406
+ passedCount,
407
+ totalCount: total
408
+ };
369
409
  }
370
410
  function aggregateTokenCountSum(values) {
371
411
  const initial = {
@@ -419,14 +459,28 @@ Score.of({
419
459
  id: "percent",
420
460
  name: "Score",
421
461
  displayStrategy: "bar",
422
- format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
423
- aggregate: aggregateAverage
462
+ format: (data, options) => {
463
+ if (options?.isAggregated) {
464
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
465
+ }
466
+ return data.value.toFixed(2);
467
+ },
468
+ aggregate: aggregateAverageWithVariance
424
469
  });
425
470
  Score.of({
426
471
  id: "binary",
427
472
  name: "Result",
428
473
  displayStrategy: "passFail",
429
- format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
474
+ format: (data, options) => {
475
+ if (options?.isAggregated) {
476
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
477
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
478
+ return `${base} (${data.passedCount}/${data.totalCount})`;
479
+ }
480
+ return base;
481
+ }
482
+ return data.passed ? "PASSED" : "NOT PASSED";
483
+ },
430
484
  aggregate: aggregateAll
431
485
  });
432
486
 
@@ -1365,6 +1419,13 @@ function Spinner({ label = "Running" }) {
1365
1419
  label
1366
1420
  ] });
1367
1421
  }
1422
+ function sampleStdDev(sum, sumSq, n) {
1423
+ if (n < 2)
1424
+ return void 0;
1425
+ const mean = sum / n;
1426
+ const variance = (sumSq - n * mean * mean) / (n - 1);
1427
+ return variance > 0 ? Math.sqrt(variance) : 0;
1428
+ }
1368
1429
  function scoreColor(score) {
1369
1430
  if (score >= 80)
1370
1431
  return "green";
@@ -1483,7 +1544,9 @@ function RunView({
1483
1544
  );
1484
1545
  setEvaluatorNameById(nameById);
1485
1546
  const aggregates = /* @__PURE__ */ new Map();
1547
+ const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
1486
1548
  let overallScoreTotal = 0;
1549
+ let overallScoreSumSq = 0;
1487
1550
  let overallScoreCount = 0;
1488
1551
  const done = new Promise((resolve5) => {
1489
1552
  const unsubscribe = runner.subscribeRunEvents((event) => {
@@ -1495,19 +1558,28 @@ function RunView({
1495
1558
  if (numeric !== void 0) {
1496
1559
  const current = aggregates.get(item.evaluatorId) ?? {
1497
1560
  total: 0,
1561
+ sumSq: 0,
1498
1562
  count: 0,
1499
1563
  passed: 0,
1500
1564
  failed: 0
1501
1565
  };
1502
1566
  aggregates.set(item.evaluatorId, {
1503
1567
  total: current.total + numeric,
1568
+ sumSq: current.sumSq + numeric * numeric,
1504
1569
  count: current.count + 1,
1505
1570
  passed: current.passed + (item.passed ? 1 : 0),
1506
1571
  failed: current.failed + (item.passed ? 0 : 1)
1507
1572
  });
1508
1573
  overallScoreTotal += numeric;
1574
+ overallScoreSumSq += numeric * numeric;
1509
1575
  overallScoreCount += 1;
1510
1576
  }
1577
+ for (const s of item.scores) {
1578
+ const key = `${item.evaluatorId}:${s.id}`;
1579
+ const list = scoreItemsByEvaluatorScore.get(key) ?? [];
1580
+ list.push(s);
1581
+ scoreItemsByEvaluatorScore.set(key, list);
1582
+ }
1511
1583
  }
1512
1584
  setTestCases((prev) => {
1513
1585
  const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
@@ -1575,8 +1647,10 @@ function RunView({
1575
1647
  failedTestCases: finalEvent.failedTestCases,
1576
1648
  totalTestCases: finalEvent.totalTestCases,
1577
1649
  overallScoreTotal,
1650
+ overallScoreSumSq,
1578
1651
  overallScoreCount,
1579
1652
  aggregates: new Map(aggregates),
1653
+ scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1580
1654
  artifactPath: finalEvent.artifactPath
1581
1655
  });
1582
1656
  setPhase("completed");
@@ -1659,36 +1733,45 @@ function RunView({
1659
1733
  ":",
1660
1734
  " ",
1661
1735
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1662
- " ",
1663
- item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(
1736
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
1737
+ " ",
1738
+ item.metrics.map((m) => {
1739
+ const def = getMetricById(m.id);
1740
+ if (!def)
1741
+ return null;
1742
+ const formatted = def.format(m.data, {
1743
+ isAggregated: tc.isAggregated
1744
+ });
1745
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1746
+ "[",
1747
+ def.name ? `${def.name}: ` : "",
1748
+ formatted,
1749
+ "]",
1750
+ " "
1751
+ ] }, m.id);
1752
+ })
1753
+ ] }) : null
1754
+ ] }),
1755
+ item.scores.length > 0 ? item.scores.map((s, idx) => {
1756
+ const def = getScoreById(s.id);
1757
+ const scoreLabel = def ? def.name ?? def.id : s.id;
1758
+ return /* @__PURE__ */ jsxRuntime.jsxs(
1664
1759
  ink.Text,
1665
1760
  {
1666
1761
  color: scoreColor(toNumericScore(s.data) ?? 0),
1667
1762
  children: [
1763
+ " ",
1764
+ scoreLabel,
1765
+ ":",
1766
+ " ",
1668
1767
  formatScorePart(s, scoreColor, {
1669
1768
  isAggregated: tc.isAggregated
1670
- }),
1671
- " "
1769
+ })
1672
1770
  ]
1673
1771
  },
1674
- s.id
1675
- )),
1676
- item.metrics?.map((m) => {
1677
- const def = getMetricById(m.id);
1678
- if (!def)
1679
- return null;
1680
- const formatted = def.format(m.data, {
1681
- isAggregated: tc.isAggregated
1682
- });
1683
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1684
- "[",
1685
- def.name ? `${def.name}: ` : "",
1686
- formatted,
1687
- "]",
1688
- " "
1689
- ] }, m.id);
1690
- })
1691
- ] }),
1772
+ `${item.evaluatorId}-${s.id}-${idx}`
1773
+ );
1774
+ }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
1692
1775
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1693
1776
  (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
1694
1777
  ({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
@@ -1732,45 +1815,85 @@ function RunView({
1732
1815
  label: "overall avg",
1733
1816
  value: summary.overallScoreTotal / summary.overallScoreCount,
1734
1817
  barWidth: 20,
1735
- format: (v) => v.toFixed(2)
1818
+ format: (v) => {
1819
+ const sd = sampleStdDev(
1820
+ summary.overallScoreTotal,
1821
+ summary.overallScoreSumSq,
1822
+ summary.overallScoreCount
1823
+ );
1824
+ return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
1825
+ }
1736
1826
  }
1737
1827
  ) }),
1738
1828
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
1739
1829
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
1740
1830
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
1741
1831
  const agg = summary.aggregates.get(id);
1742
- if (!agg || agg.count === 0) {
1832
+ const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
1833
+ (k) => k.startsWith(`${id}:`)
1834
+ );
1835
+ if (scoreKeys.length === 0) {
1743
1836
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1744
1837
  "- ",
1745
1838
  name.padEnd(28),
1746
- " no numeric scores"
1839
+ " no scores"
1747
1840
  ] }, id);
1748
1841
  }
1749
- const mean = agg.total / agg.count;
1750
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1751
- "- ",
1752
- name.padEnd(28),
1753
- " avg=",
1754
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
1842
+ const passedFailed = agg != null ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1755
1843
  " ",
1756
1844
  "passed=",
1757
1845
  agg.passed,
1758
1846
  " failed=",
1759
1847
  agg.failed
1848
+ ] }) : null;
1849
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
1850
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1851
+ "- ",
1852
+ name.padEnd(28),
1853
+ passedFailed
1854
+ ] }),
1855
+ scoreKeys.map((key) => {
1856
+ const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
1857
+ const aggregated = aggregateScoreItems(items);
1858
+ if (!aggregated)
1859
+ return null;
1860
+ const def = getScoreById(aggregated.id);
1861
+ const label = def ? def.name ?? def.id : aggregated.id;
1862
+ const formatted = def?.format(aggregated.data, {
1863
+ isAggregated: true
1864
+ }) ?? "n/a";
1865
+ const numeric = toNumericScore(aggregated.data);
1866
+ return /* @__PURE__ */ jsxRuntime.jsxs(
1867
+ ink.Text,
1868
+ {
1869
+ color: numeric !== void 0 ? scoreColor(numeric) : "gray",
1870
+ children: [
1871
+ " ",
1872
+ label,
1873
+ ": ",
1874
+ formatted
1875
+ ]
1876
+ },
1877
+ key
1878
+ );
1879
+ })
1760
1880
  ] }, id);
1761
1881
  })
1762
1882
  ] }),
1763
1883
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
1764
1884
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
1765
1885
  testCases.map((tc) => {
1766
- const numericScores = tc.aggregatedEvaluatorScores.flatMap(
1767
- (item) => item.scores.map((s) => toNumericScoreFromScores([s])).filter((n) => n !== void 0)
1886
+ const allScores = tc.events.flatMap(
1887
+ (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
1768
1888
  );
1769
- const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
1889
+ const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
1890
+ const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
1891
+ const total = allScores.reduce((a, b) => a + b, 0);
1892
+ const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
1770
1893
  const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
1771
1894
  const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
1772
1895
  isAggregated: true
1773
- }) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
1896
+ }) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
1774
1897
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
1775
1898
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1776
1899
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
@@ -1804,13 +1927,26 @@ function RunView({
1804
1927
  }
1805
1928
 
1806
1929
  // src/cli-simple/run.ts
1930
+ function sampleStdDev2(sum, sumSq, n) {
1931
+ if (n < 2)
1932
+ return void 0;
1933
+ const mean = sum / n;
1934
+ const variance = (sumSq - n * mean * mean) / (n - 1);
1935
+ return variance > 0 ? Math.sqrt(variance) : 0;
1936
+ }
1807
1937
  function buildTestCaseSummaries(byId) {
1808
1938
  const summaries = [];
1809
1939
  for (const { name, events } of byId.values()) {
1810
1940
  const passed = events.every((e) => e.passed);
1811
1941
  const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
1812
1942
  const isAggregated = events.length > 1;
1813
- const numericScores = [];
1943
+ const allScores = events.flatMap(
1944
+ (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
1945
+ );
1946
+ const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
1947
+ const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
1948
+ const total = allScores.reduce((a, b) => a + b, 0);
1949
+ const stdDev = sampleStdDev2(total, sumSq, allScores.length);
1814
1950
  let firstAggregatedScore;
1815
1951
  for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
1816
1952
  const scoreIdToItems = /* @__PURE__ */ new Map();
@@ -1826,21 +1962,18 @@ function buildTestCaseSummaries(byId) {
1826
1962
  }
1827
1963
  for (const items of scoreIdToItems.values()) {
1828
1964
  const agg = aggregateScoreItems(items);
1829
- if (agg) {
1830
- const n = toNumericScoreFromScores([agg]);
1831
- if (n !== void 0) {
1832
- numericScores.push(n);
1833
- if (firstAggregatedScore === void 0) {
1834
- firstAggregatedScore = agg;
1835
- }
1836
- }
1965
+ if (agg && firstAggregatedScore === void 0) {
1966
+ firstAggregatedScore = agg;
1967
+ break;
1837
1968
  }
1838
1969
  }
1970
+ if (firstAggregatedScore !== void 0)
1971
+ break;
1839
1972
  }
1840
- const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
1841
1973
  summaries.push({
1842
1974
  name,
1843
1975
  averageScore,
1976
+ stdDev: stdDev ?? void 0,
1844
1977
  aggregatedScoreItem: firstAggregatedScore,
1845
1978
  isAggregated,
1846
1979
  durationMs,
@@ -1871,12 +2004,36 @@ function scoreToColor(score) {
1871
2004
  }
1872
2005
  return ansi2.red;
1873
2006
  }
1874
- function getEvaluatorSummaryLine(evaluatorName, aggregate) {
1875
- if (!aggregate || aggregate.count === 0) {
1876
- return `- ${evaluatorName.padEnd(28)} no numeric scores`;
2007
+ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
2008
+ const lines = [];
2009
+ const scoreKeys = [...scoreItemsByKey.keys()].filter(
2010
+ (k) => k.startsWith(`${evaluatorId}:`)
2011
+ );
2012
+ if (scoreKeys.length === 0) {
2013
+ lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
2014
+ return lines;
2015
+ }
2016
+ const passedFailed = aggregate != null ? ` passed=${aggregate.passed} failed=${aggregate.failed}` : "";
2017
+ const scoreLines = [];
2018
+ for (const key of scoreKeys) {
2019
+ const items = scoreItemsByKey.get(key) ?? [];
2020
+ const agg = aggregateScoreItems(items);
2021
+ if (!agg)
2022
+ continue;
2023
+ const def = getScoreById(agg.id);
2024
+ const label = def ? def.name ?? def.id : agg.id;
2025
+ const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
2026
+ const numeric = toNumericScore(agg.data);
2027
+ const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
2028
+ scoreLines.push(` ${label}: ${colored}`);
2029
+ }
2030
+ if (scoreLines.length > 0) {
2031
+ lines.push(`- ${evaluatorName.padEnd(28)}${passedFailed}`);
2032
+ lines.push(...scoreLines);
2033
+ } else {
2034
+ lines.push(`- ${evaluatorName.padEnd(28)} no numeric scores${passedFailed}`);
1877
2035
  }
1878
- const mean = aggregate.total / aggregate.count;
1879
- return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
2036
+ return lines;
1880
2037
  }
1881
2038
  function createBar2(value, max = 100, width = 20) {
1882
2039
  const safe = Math.max(0, Math.min(max, value));
@@ -1928,46 +2085,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
1928
2085
  }
1929
2086
  function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
1930
2087
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
1931
- const scoreParts = [];
1932
- for (const item of scores) {
1933
- const def = getScoreById(item.id);
1934
- if (!def) {
1935
- const numeric = toNumericScore(item.data);
1936
- scoreParts.push(
1937
- numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
1938
- );
1939
- continue;
1940
- }
1941
- const formatted = def.format(item.data, options);
1942
- switch (def.displayStrategy) {
1943
- case "bar": {
1944
- const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1945
- if (typeof numeric === "number" && Number.isFinite(numeric)) {
1946
- scoreParts.push(
1947
- `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
1948
- );
1949
- } else {
1950
- scoreParts.push(formatted);
1951
- }
1952
- break;
1953
- }
1954
- case "number":
1955
- scoreParts.push(formatted);
1956
- break;
1957
- case "passFail":
1958
- scoreParts.push(
1959
- colorize(
1960
- formatted,
1961
- item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
1962
- )
1963
- );
1964
- break;
1965
- }
1966
- }
1967
- const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
1968
- let line = ` ${name}: ${passLabel} ${scoreStr}`;
2088
+ const metricParts = [];
1969
2089
  if (metrics && metrics.length > 0) {
1970
- const metricParts = [];
1971
2090
  for (const { id, data } of metrics) {
1972
2091
  const def = getMetricById(id);
1973
2092
  if (def) {
@@ -1977,11 +2096,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
1977
2096
  );
1978
2097
  }
1979
2098
  }
1980
- if (metricParts.length > 0) {
1981
- line += ` ${metricParts.join(" ")}`;
2099
+ }
2100
+ const scoreLines = [];
2101
+ for (const item of scores) {
2102
+ const def = getScoreById(item.id);
2103
+ const scoreLabel = def ? def.name ?? def.id : item.id;
2104
+ let formatted;
2105
+ if (!def) {
2106
+ const numeric = toNumericScore(item.data);
2107
+ formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
2108
+ } else {
2109
+ const raw = def.format(item.data, options);
2110
+ switch (def.displayStrategy) {
2111
+ case "bar": {
2112
+ const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2113
+ if (typeof numeric === "number" && Number.isFinite(numeric)) {
2114
+ formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
2115
+ } else {
2116
+ formatted = raw;
2117
+ }
2118
+ break;
2119
+ }
2120
+ case "number":
2121
+ formatted = raw;
2122
+ break;
2123
+ case "passFail":
2124
+ formatted = colorize(
2125
+ raw,
2126
+ item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
2127
+ );
2128
+ break;
2129
+ }
1982
2130
  }
2131
+ scoreLines.push(` ${scoreLabel}: ${formatted}`);
1983
2132
  }
1984
- return line;
2133
+ const lines = [];
2134
+ const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
2135
+ lines.push(` ${name}: ${passLabel}${metricStr}`);
2136
+ if (scoreLines.length > 0) {
2137
+ lines.push(...scoreLines);
2138
+ } else {
2139
+ lines.push(` n/a`);
2140
+ }
2141
+ return lines;
1985
2142
  }
1986
2143
  async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
1987
2144
  const dataset = await runner.resolveDatasetByName(datasetName);
@@ -2004,8 +2161,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2004
2161
  evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
2005
2162
  );
2006
2163
  const aggregates = /* @__PURE__ */ new Map();
2164
+ const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
2007
2165
  const testCaseByTestId = /* @__PURE__ */ new Map();
2008
2166
  let overallScoreTotal = 0;
2167
+ let overallScoreSumSq = 0;
2009
2168
  let overallScoreCount = 0;
2010
2169
  let completedCount = 0;
2011
2170
  let totalCount = 0;
@@ -2062,19 +2221,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2062
2221
  if (numeric !== void 0) {
2063
2222
  const current = aggregates.get(item.evaluatorId) ?? {
2064
2223
  total: 0,
2224
+ sumSq: 0,
2065
2225
  count: 0,
2066
2226
  passed: 0,
2067
2227
  failed: 0
2068
2228
  };
2069
2229
  aggregates.set(item.evaluatorId, {
2070
2230
  total: current.total + numeric,
2231
+ sumSq: current.sumSq + numeric * numeric,
2071
2232
  count: current.count + 1,
2072
2233
  passed: current.passed + (item.passed ? 1 : 0),
2073
2234
  failed: current.failed + (item.passed ? 0 : 1)
2074
2235
  });
2075
2236
  overallScoreTotal += numeric;
2237
+ overallScoreSumSq += numeric * numeric;
2076
2238
  overallScoreCount += 1;
2077
2239
  }
2240
+ for (const s of item.scores) {
2241
+ const key = `${item.evaluatorId}:${s.id}`;
2242
+ const list = scoreItemsByEvaluatorScore.get(key) ?? [];
2243
+ list.push(s);
2244
+ scoreItemsByEvaluatorScore.set(key, list);
2245
+ }
2078
2246
  }
2079
2247
  const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2080
2248
  const isLastRerun = event.rerunIndex >= event.rerunTotal;
@@ -2098,7 +2266,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2098
2266
  for (const item of aggregatedScores) {
2099
2267
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2100
2268
  lines.push(
2101
- formatEvaluatorScoreLine(
2269
+ ...formatEvaluatorScoreLine(
2102
2270
  name,
2103
2271
  item.scores,
2104
2272
  item.passed,
@@ -2180,18 +2348,30 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2180
2348
  );
2181
2349
  if (overallScoreCount > 0) {
2182
2350
  const overallAverage = overallScoreTotal / overallScoreCount;
2351
+ const overallSd = sampleStdDev2(
2352
+ overallScoreTotal,
2353
+ overallScoreSumSq,
2354
+ overallScoreCount
2355
+ );
2356
+ const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
2183
2357
  console.log(
2184
2358
  `- overall avg score: ${colorize(
2185
- overallAverage.toFixed(2),
2359
+ avgStr,
2186
2360
  scoreToColor(overallAverage)
2187
2361
  )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
2188
2362
  );
2189
2363
  }
2190
2364
  console.log(colorize("- evaluator averages:", ansi2.magenta));
2191
2365
  for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
2192
- console.log(
2193
- getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
2366
+ const evaluatorLines = getEvaluatorSummaryLines(
2367
+ evaluatorId,
2368
+ evaluatorName,
2369
+ aggregates.get(evaluatorId),
2370
+ scoreItemsByEvaluatorScore
2194
2371
  );
2372
+ for (const line of evaluatorLines) {
2373
+ console.log(line);
2374
+ }
2195
2375
  }
2196
2376
  const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
2197
2377
  if (testCaseSummaries.length > 0) {
@@ -2207,7 +2387,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2207
2387
  const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2208
2388
  summary.aggregatedScoreItem.data,
2209
2389
  { isAggregated: true }
2210
- ) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
2390
+ ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2211
2391
  console.log(
2212
2392
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
2213
2393
  scoreLabel,