@m4trix/evals 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ var path = require('path');
8
8
  var jitiModule = require('jiti');
9
9
  var promises = require('fs/promises');
10
10
  var url = require('url');
11
- var jsonDiff = require('json-diff');
11
+ var diff = require('diff');
12
12
  var React2 = require('react');
13
13
  var ink = require('ink');
14
14
  var jsxRuntime = require('react/jsx-runtime');
@@ -286,8 +286,35 @@ async function collectTestCasesFromFiles(config) {
286
286
  );
287
287
  return found.flat();
288
288
  }
289
+ function toJsonLines(value) {
290
+ try {
291
+ return JSON.stringify(value, null, 2);
292
+ } catch {
293
+ return String(value);
294
+ }
295
+ }
296
+ function formatDiffString(changes) {
297
+ const lines = [];
298
+ for (const part of changes) {
299
+ const prefix = part.added ? "+" : part.removed ? "-" : " ";
300
+ const partLines = part.value.split("\n");
301
+ if (partLines[partLines.length - 1] === "") {
302
+ partLines.pop();
303
+ }
304
+ for (const line of partLines) {
305
+ lines.push(`${prefix} ${line}`);
306
+ }
307
+ }
308
+ return lines.join("\n");
309
+ }
310
+ function createDiffString(expected, actual) {
311
+ const expectedStr = toJsonLines(expected);
312
+ const actualStr = toJsonLines(actual);
313
+ const changes = diff.diffLines(expectedStr, actualStr);
314
+ return formatDiffString(changes);
315
+ }
289
316
  function createDiffLogEntry(expected, actual, options) {
290
- const diff = jsonDiff.diffString(expected, actual, { color: false });
317
+ const diff = createDiffString(expected, actual);
291
318
  return {
292
319
  type: "diff",
293
320
  label: options?.label,
@@ -297,7 +324,7 @@ function createDiffLogEntry(expected, actual, options) {
297
324
  };
298
325
  }
299
326
  function getDiffLines(entry) {
300
- const raw = jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
327
+ const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
301
328
  return raw.split("\n").map((line) => {
302
329
  const trimmed = line.trimStart();
303
330
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -357,15 +384,28 @@ function getScoreById(id) {
357
384
  }
358
385
 
359
386
  // src/evals/aggregators.ts
360
- function aggregateAverage(values) {
387
+ function aggregateAverageWithVariance(values) {
361
388
  if (values.length === 0) {
362
- return { value: 0 };
389
+ return { value: 0, count: 0 };
363
390
  }
364
391
  const sum = values.reduce((s, v) => s + v.value, 0);
365
- return { value: sum / values.length };
392
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
393
+ const mean = sum / values.length;
394
+ let stdDev;
395
+ if (values.length >= 2) {
396
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
397
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
398
+ }
399
+ return { value: mean, stdDev, count: values.length };
366
400
  }
367
401
  function aggregateAll(values) {
368
- return { passed: values.length > 0 && values.every((v) => v.passed) };
402
+ const total = values.length;
403
+ const passedCount = values.filter((v) => v.passed).length;
404
+ return {
405
+ passed: total > 0 && values.every((v) => v.passed),
406
+ passedCount,
407
+ totalCount: total
408
+ };
369
409
  }
370
410
  function aggregateTokenCountSum(values) {
371
411
  const initial = {
@@ -419,14 +459,28 @@ Score.of({
419
459
  id: "percent",
420
460
  name: "Score",
421
461
  displayStrategy: "bar",
422
- format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
423
- aggregate: aggregateAverage
462
+ format: (data, options) => {
463
+ if (options?.isAggregated) {
464
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
465
+ }
466
+ return data.value.toFixed(2);
467
+ },
468
+ aggregate: aggregateAverageWithVariance
424
469
  });
425
470
  Score.of({
426
471
  id: "binary",
427
472
  name: "Result",
428
473
  displayStrategy: "passFail",
429
- format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
474
+ format: (data, options) => {
475
+ if (options?.isAggregated) {
476
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
477
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
478
+ return `${base} (${data.passedCount}/${data.totalCount})`;
479
+ }
480
+ return base;
481
+ }
482
+ return data.passed ? "PASSED" : "NOT PASSED";
483
+ },
430
484
  aggregate: aggregateAll
431
485
  });
432
486
 
@@ -1365,6 +1419,13 @@ function Spinner({ label = "Running" }) {
1365
1419
  label
1366
1420
  ] });
1367
1421
  }
1422
+ function sampleStdDev(sum, sumSq, n) {
1423
+ if (n < 2)
1424
+ return void 0;
1425
+ const mean = sum / n;
1426
+ const variance = (sumSq - n * mean * mean) / (n - 1);
1427
+ return variance > 0 ? Math.sqrt(variance) : 0;
1428
+ }
1368
1429
  function scoreColor(score) {
1369
1430
  if (score >= 80)
1370
1431
  return "green";
@@ -1484,6 +1545,7 @@ function RunView({
1484
1545
  setEvaluatorNameById(nameById);
1485
1546
  const aggregates = /* @__PURE__ */ new Map();
1486
1547
  let overallScoreTotal = 0;
1548
+ let overallScoreSumSq = 0;
1487
1549
  let overallScoreCount = 0;
1488
1550
  const done = new Promise((resolve5) => {
1489
1551
  const unsubscribe = runner.subscribeRunEvents((event) => {
@@ -1495,17 +1557,20 @@ function RunView({
1495
1557
  if (numeric !== void 0) {
1496
1558
  const current = aggregates.get(item.evaluatorId) ?? {
1497
1559
  total: 0,
1560
+ sumSq: 0,
1498
1561
  count: 0,
1499
1562
  passed: 0,
1500
1563
  failed: 0
1501
1564
  };
1502
1565
  aggregates.set(item.evaluatorId, {
1503
1566
  total: current.total + numeric,
1567
+ sumSq: current.sumSq + numeric * numeric,
1504
1568
  count: current.count + 1,
1505
1569
  passed: current.passed + (item.passed ? 1 : 0),
1506
1570
  failed: current.failed + (item.passed ? 0 : 1)
1507
1571
  });
1508
1572
  overallScoreTotal += numeric;
1573
+ overallScoreSumSq += numeric * numeric;
1509
1574
  overallScoreCount += 1;
1510
1575
  }
1511
1576
  }
@@ -1575,6 +1640,7 @@ function RunView({
1575
1640
  failedTestCases: finalEvent.failedTestCases,
1576
1641
  totalTestCases: finalEvent.totalTestCases,
1577
1642
  overallScoreTotal,
1643
+ overallScoreSumSq,
1578
1644
  overallScoreCount,
1579
1645
  aggregates: new Map(aggregates),
1580
1646
  artifactPath: finalEvent.artifactPath
@@ -1732,7 +1798,14 @@ function RunView({
1732
1798
  label: "overall avg",
1733
1799
  value: summary.overallScoreTotal / summary.overallScoreCount,
1734
1800
  barWidth: 20,
1735
- format: (v) => v.toFixed(2)
1801
+ format: (v) => {
1802
+ const sd = sampleStdDev(
1803
+ summary.overallScoreTotal,
1804
+ summary.overallScoreSumSq,
1805
+ summary.overallScoreCount
1806
+ );
1807
+ return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
1808
+ }
1736
1809
  }
1737
1810
  ) }),
1738
1811
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
@@ -1747,11 +1820,13 @@ function RunView({
1747
1820
  ] }, id);
1748
1821
  }
1749
1822
  const mean = agg.total / agg.count;
1823
+ const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
1824
+ const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1750
1825
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1751
1826
  "- ",
1752
1827
  name.padEnd(28),
1753
1828
  " avg=",
1754
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
1829
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: meanStr }),
1755
1830
  " ",
1756
1831
  "passed=",
1757
1832
  agg.passed,
@@ -1763,14 +1838,17 @@ function RunView({
1763
1838
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
1764
1839
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
1765
1840
  testCases.map((tc) => {
1766
- const numericScores = tc.aggregatedEvaluatorScores.flatMap(
1767
- (item) => item.scores.map((s) => toNumericScoreFromScores([s])).filter((n) => n !== void 0)
1841
+ const allScores = tc.events.flatMap(
1842
+ (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
1768
1843
  );
1769
- const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
1844
+ const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
1845
+ const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
1846
+ const total = allScores.reduce((a, b) => a + b, 0);
1847
+ const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
1770
1848
  const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
1771
1849
  const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
1772
1850
  isAggregated: true
1773
- }) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
1851
+ }) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
1774
1852
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
1775
1853
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1776
1854
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
@@ -1804,13 +1882,26 @@ function RunView({
1804
1882
  }
1805
1883
 
1806
1884
  // src/cli-simple/run.ts
1885
+ function sampleStdDev2(sum, sumSq, n) {
1886
+ if (n < 2)
1887
+ return void 0;
1888
+ const mean = sum / n;
1889
+ const variance = (sumSq - n * mean * mean) / (n - 1);
1890
+ return variance > 0 ? Math.sqrt(variance) : 0;
1891
+ }
1807
1892
  function buildTestCaseSummaries(byId) {
1808
1893
  const summaries = [];
1809
1894
  for (const { name, events } of byId.values()) {
1810
1895
  const passed = events.every((e) => e.passed);
1811
1896
  const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
1812
1897
  const isAggregated = events.length > 1;
1813
- const numericScores = [];
1898
+ const allScores = events.flatMap(
1899
+ (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
1900
+ );
1901
+ const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
1902
+ const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
1903
+ const total = allScores.reduce((a, b) => a + b, 0);
1904
+ const stdDev = sampleStdDev2(total, sumSq, allScores.length);
1814
1905
  let firstAggregatedScore;
1815
1906
  for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
1816
1907
  const scoreIdToItems = /* @__PURE__ */ new Map();
@@ -1826,21 +1917,18 @@ function buildTestCaseSummaries(byId) {
1826
1917
  }
1827
1918
  for (const items of scoreIdToItems.values()) {
1828
1919
  const agg = aggregateScoreItems(items);
1829
- if (agg) {
1830
- const n = toNumericScoreFromScores([agg]);
1831
- if (n !== void 0) {
1832
- numericScores.push(n);
1833
- if (firstAggregatedScore === void 0) {
1834
- firstAggregatedScore = agg;
1835
- }
1836
- }
1920
+ if (agg && firstAggregatedScore === void 0) {
1921
+ firstAggregatedScore = agg;
1922
+ break;
1837
1923
  }
1838
1924
  }
1925
+ if (firstAggregatedScore !== void 0)
1926
+ break;
1839
1927
  }
1840
- const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
1841
1928
  summaries.push({
1842
1929
  name,
1843
1930
  averageScore,
1931
+ stdDev: stdDev ?? void 0,
1844
1932
  aggregatedScoreItem: firstAggregatedScore,
1845
1933
  isAggregated,
1846
1934
  durationMs,
@@ -1876,7 +1964,9 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
1876
1964
  return `- ${evaluatorName.padEnd(28)} no numeric scores`;
1877
1965
  }
1878
1966
  const mean = aggregate.total / aggregate.count;
1879
- return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
1967
+ const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
1968
+ const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1969
+ return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
1880
1970
  }
1881
1971
  function createBar2(value, max = 100, width = 20) {
1882
1972
  const safe = Math.max(0, Math.min(max, value));
@@ -2006,6 +2096,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2006
2096
  const aggregates = /* @__PURE__ */ new Map();
2007
2097
  const testCaseByTestId = /* @__PURE__ */ new Map();
2008
2098
  let overallScoreTotal = 0;
2099
+ let overallScoreSumSq = 0;
2009
2100
  let overallScoreCount = 0;
2010
2101
  let completedCount = 0;
2011
2102
  let totalCount = 0;
@@ -2062,17 +2153,20 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2062
2153
  if (numeric !== void 0) {
2063
2154
  const current = aggregates.get(item.evaluatorId) ?? {
2064
2155
  total: 0,
2156
+ sumSq: 0,
2065
2157
  count: 0,
2066
2158
  passed: 0,
2067
2159
  failed: 0
2068
2160
  };
2069
2161
  aggregates.set(item.evaluatorId, {
2070
2162
  total: current.total + numeric,
2163
+ sumSq: current.sumSq + numeric * numeric,
2071
2164
  count: current.count + 1,
2072
2165
  passed: current.passed + (item.passed ? 1 : 0),
2073
2166
  failed: current.failed + (item.passed ? 0 : 1)
2074
2167
  });
2075
2168
  overallScoreTotal += numeric;
2169
+ overallScoreSumSq += numeric * numeric;
2076
2170
  overallScoreCount += 1;
2077
2171
  }
2078
2172
  }
@@ -2180,9 +2274,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2180
2274
  );
2181
2275
  if (overallScoreCount > 0) {
2182
2276
  const overallAverage = overallScoreTotal / overallScoreCount;
2277
+ const overallSd = sampleStdDev2(
2278
+ overallScoreTotal,
2279
+ overallScoreSumSq,
2280
+ overallScoreCount
2281
+ );
2282
+ const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
2183
2283
  console.log(
2184
2284
  `- overall avg score: ${colorize(
2185
- overallAverage.toFixed(2),
2285
+ avgStr,
2186
2286
  scoreToColor(overallAverage)
2187
2287
  )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
2188
2288
  );
@@ -2207,7 +2307,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2207
2307
  const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2208
2308
  summary.aggregatedScoreItem.data,
2209
2309
  { isAggregated: true }
2210
- ) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
2310
+ ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2211
2311
  console.log(
2212
2312
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
2213
2313
  scoreLabel,