@m4trix/evals 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
7
7
  import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
- import { diffString } from 'json-diff';
9
+ import { diffLines } from 'diff';
10
10
  import React2, { useState, useEffect, useCallback } from 'react';
11
11
  import { render, Box, Text } from 'ink';
12
12
  import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
@@ -260,8 +260,35 @@ async function collectTestCasesFromFiles(config) {
260
260
  );
261
261
  return found.flat();
262
262
  }
263
/**
 * Renders a value as pretty-printed (2-space indented) JSON text.
 *
 * Always returns a string: when `JSON.stringify` throws (e.g. circular
 * structures, BigInt) or yields `undefined` (e.g. `undefined`, functions,
 * symbols), the result falls back to `String(value)`.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} Multi-line JSON text, or the plain string form of `value`.
 */
function toJsonLines(value) {
  try {
    // JSON.stringify does not throw for undefined/functions/symbols; it
    // returns undefined, which must not leak to callers expecting a string.
    return JSON.stringify(value, null, 2) ?? String(value);
  } catch {
    return String(value);
  }
}
270
/**
 * Turns a list of line-diff parts into a unified-style text block.
 *
 * Each line of a part is prefixed with "+" (added), "-" (removed) or a
 * space (unchanged), followed by one space. A single trailing empty
 * segment produced by a terminating newline is dropped per part.
 *
 * @param {Iterable<{value: string, added?: boolean, removed?: boolean}>} changes
 * @returns {string} The prefixed lines joined with "\n".
 */
function formatDiffString(changes) {
  const rendered = [];
  for (const { added, removed, value } of changes) {
    let marker = " ";
    if (added) {
      marker = "+";
    } else if (removed) {
      marker = "-";
    }
    const segments = value.split("\n");
    // A value ending in "\n" yields a final empty segment; discard it.
    const keep = segments[segments.length - 1] === "" ? segments.length - 1 : segments.length;
    for (let i = 0; i < keep; i += 1) {
      rendered.push(`${marker} ${segments[i]}`);
    }
  }
  return rendered.join("\n");
}
284
/**
 * Builds a line-based textual diff between the JSON renderings of two values.
 *
 * Returns an empty string when the two renderings have no added or removed
 * lines, so callers can use `|| "(no differences)"` style fallbacks.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value to compare against `expected`.
 * @returns {string} Prefixed diff lines, or "" when nothing differs.
 */
function createDiffString(expected, actual) {
  const changes = diffLines(toJsonLines(expected), toJsonLines(actual));
  // Without this guard identical inputs would be rendered as pure context
  // lines, producing a non-empty "diff" that hides the no-difference case.
  const hasDifference = changes.some((part) => part.added || part.removed);
  return hasDifference ? formatDiffString(changes) : "";
}
263
290
  function createDiffLogEntry(expected, actual, options) {
264
- const diff = diffString(expected, actual, { color: false });
291
+ const diff = createDiffString(expected, actual);
265
292
  return {
266
293
  type: "diff",
267
294
  label: options?.label,
@@ -271,7 +298,7 @@ function createDiffLogEntry(expected, actual, options) {
271
298
  };
272
299
  }
273
300
  function getDiffLines(entry) {
274
- const raw = diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
301
+ const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
275
302
  return raw.split("\n").map((line) => {
276
303
  const trimmed = line.trimStart();
277
304
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -331,15 +358,28 @@ function getScoreById(id) {
331
358
  }
332
359
 
333
360
  // src/evals/aggregators.ts
334
/**
 * Averages the `value` field of the given items and reports their spread.
 *
 * @param {Array<{value: number}>} values - Items to aggregate.
 * @returns {{value: number, stdDev?: number, count: number}} The mean, the
 *   sample standard deviation (n - 1 denominator; `undefined` when fewer
 *   than two items are given) and the item count. An empty input yields
 *   `{ value: 0, count: 0 }`.
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  const mean = values.reduce((s, v) => s + v.value, 0) / count;
  let stdDev;
  if (count >= 2) {
    // Two-pass form: summing squared deviations avoids the catastrophic
    // cancellation of (sumSq - n * mean^2) for large values with small spread.
    const squaredDeviations = values.reduce((s, v) => s + (v.value - mean) ** 2, 0);
    const variance = squaredDeviations / (count - 1);
    stdDev = variance > 0 ? Math.sqrt(variance) : 0;
  }
  return { value: mean, stdDev, count };
}
341
375
  function aggregateAll(values) {
342
- return { passed: values.length > 0 && values.every((v) => v.passed) };
376
+ const total = values.length;
377
+ const passedCount = values.filter((v) => v.passed).length;
378
+ return {
379
+ passed: total > 0 && values.every((v) => v.passed),
380
+ passedCount,
381
+ totalCount: total
382
+ };
343
383
  }
344
384
  function aggregateTokenCountSum(values) {
345
385
  const initial = {
@@ -393,14 +433,28 @@ Score.of({
393
433
  id: "percent",
394
434
  name: "Score",
395
435
  displayStrategy: "bar",
396
- format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
397
- aggregate: aggregateAverage
436
+ format: (data, options) => {
437
+ if (options?.isAggregated) {
438
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
439
+ }
440
+ return data.value.toFixed(2);
441
+ },
442
+ aggregate: aggregateAverageWithVariance
398
443
  });
399
444
  Score.of({
400
445
  id: "binary",
401
446
  name: "Result",
402
447
  displayStrategy: "passFail",
403
- format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
448
+ format: (data, options) => {
449
+ if (options?.isAggregated) {
450
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
451
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
452
+ return `${base} (${data.passedCount}/${data.totalCount})`;
453
+ }
454
+ return base;
455
+ }
456
+ return data.passed ? "PASSED" : "NOT PASSED";
457
+ },
404
458
  aggregate: aggregateAll
405
459
  });
406
460
 
@@ -1339,6 +1393,13 @@ function Spinner({ label = "Running" }) {
1339
1393
  label
1340
1394
  ] });
1341
1395
  }
1396
/**
 * Sample standard deviation (n - 1 denominator) from running totals.
 *
 * @param {number} sum - Sum of the observations.
 * @param {number} sumSq - Sum of the squared observations.
 * @param {number} n - Number of observations.
 * @returns {number|undefined} `undefined` when `n < 2`; otherwise the
 *   standard deviation, or 0 when the computed variance is not positive.
 */
function sampleStdDev(sum, sumSq, n) {
  if (n < 2) {
    return undefined;
  }
  const average = sum / n;
  const spread = (sumSq - n * average * average) / (n - 1);
  // Rounding can push the variance to zero or slightly below; report 0 then.
  if (spread > 0) {
    return Math.sqrt(spread);
  }
  return 0;
}
1342
1403
  function scoreColor(score) {
1343
1404
  if (score >= 80)
1344
1405
  return "green";
@@ -1458,6 +1519,7 @@ function RunView({
1458
1519
  setEvaluatorNameById(nameById);
1459
1520
  const aggregates = /* @__PURE__ */ new Map();
1460
1521
  let overallScoreTotal = 0;
1522
+ let overallScoreSumSq = 0;
1461
1523
  let overallScoreCount = 0;
1462
1524
  const done = new Promise((resolve5) => {
1463
1525
  const unsubscribe = runner.subscribeRunEvents((event) => {
@@ -1469,17 +1531,20 @@ function RunView({
1469
1531
  if (numeric !== void 0) {
1470
1532
  const current = aggregates.get(item.evaluatorId) ?? {
1471
1533
  total: 0,
1534
+ sumSq: 0,
1472
1535
  count: 0,
1473
1536
  passed: 0,
1474
1537
  failed: 0
1475
1538
  };
1476
1539
  aggregates.set(item.evaluatorId, {
1477
1540
  total: current.total + numeric,
1541
+ sumSq: current.sumSq + numeric * numeric,
1478
1542
  count: current.count + 1,
1479
1543
  passed: current.passed + (item.passed ? 1 : 0),
1480
1544
  failed: current.failed + (item.passed ? 0 : 1)
1481
1545
  });
1482
1546
  overallScoreTotal += numeric;
1547
+ overallScoreSumSq += numeric * numeric;
1483
1548
  overallScoreCount += 1;
1484
1549
  }
1485
1550
  }
@@ -1549,6 +1614,7 @@ function RunView({
1549
1614
  failedTestCases: finalEvent.failedTestCases,
1550
1615
  totalTestCases: finalEvent.totalTestCases,
1551
1616
  overallScoreTotal,
1617
+ overallScoreSumSq,
1552
1618
  overallScoreCount,
1553
1619
  aggregates: new Map(aggregates),
1554
1620
  artifactPath: finalEvent.artifactPath
@@ -1706,7 +1772,14 @@ function RunView({
1706
1772
  label: "overall avg",
1707
1773
  value: summary.overallScoreTotal / summary.overallScoreCount,
1708
1774
  barWidth: 20,
1709
- format: (v) => v.toFixed(2)
1775
+ format: (v) => {
1776
+ const sd = sampleStdDev(
1777
+ summary.overallScoreTotal,
1778
+ summary.overallScoreSumSq,
1779
+ summary.overallScoreCount
1780
+ );
1781
+ return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
1782
+ }
1710
1783
  }
1711
1784
  ) }),
1712
1785
  /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
@@ -1721,11 +1794,13 @@ function RunView({
1721
1794
  ] }, id);
1722
1795
  }
1723
1796
  const mean = agg.total / agg.count;
1797
+ const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
1798
+ const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1724
1799
  return /* @__PURE__ */ jsxs(Text, { children: [
1725
1800
  "- ",
1726
1801
  name.padEnd(28),
1727
1802
  " avg=",
1728
- /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
1803
+ /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: meanStr }),
1729
1804
  " ",
1730
1805
  "passed=",
1731
1806
  agg.passed,
@@ -1737,14 +1812,17 @@ function RunView({
1737
1812
  /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
1738
1813
  /* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
1739
1814
  testCases.map((tc) => {
1740
- const numericScores = tc.aggregatedEvaluatorScores.flatMap(
1741
- (item) => item.scores.map((s) => toNumericScoreFromScores([s])).filter((n) => n !== void 0)
1815
+ const allScores = tc.events.flatMap(
1816
+ (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
1742
1817
  );
1743
- const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
1818
+ const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
1819
+ const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
1820
+ const total = allScores.reduce((a, b) => a + b, 0);
1821
+ const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
1744
1822
  const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
1745
1823
  const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
1746
1824
  isAggregated: true
1747
- }) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
1825
+ }) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
1748
1826
  return /* @__PURE__ */ jsxs(Box, { children: [
1749
1827
  /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
1750
1828
  /* @__PURE__ */ jsxs(Text, { children: [
@@ -1778,13 +1856,26 @@ function RunView({
1778
1856
  }
1779
1857
 
1780
1858
  // src/cli-simple/run.ts
1859
/**
 * Sample standard deviation (n - 1 denominator) from running totals.
 *
 * @param {number} sum - Sum of the observations.
 * @param {number} sumSq - Sum of the squared observations.
 * @param {number} n - Number of observations.
 * @returns {number|undefined} `undefined` when `n < 2`; otherwise the
 *   standard deviation, or 0 when the computed variance is not positive.
 */
function sampleStdDev2(sum, sumSq, n) {
  if (n < 2) {
    return undefined;
  }
  const average = sum / n;
  const spread = (sumSq - n * average * average) / (n - 1);
  // Rounding can push the variance to zero or slightly below; report 0 then.
  if (spread > 0) {
    return Math.sqrt(spread);
  }
  return 0;
}
1781
1866
  function buildTestCaseSummaries(byId) {
1782
1867
  const summaries = [];
1783
1868
  for (const { name, events } of byId.values()) {
1784
1869
  const passed = events.every((e) => e.passed);
1785
1870
  const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
1786
1871
  const isAggregated = events.length > 1;
1787
- const numericScores = [];
1872
+ const allScores = events.flatMap(
1873
+ (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
1874
+ );
1875
+ const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
1876
+ const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
1877
+ const total = allScores.reduce((a, b) => a + b, 0);
1878
+ const stdDev = sampleStdDev2(total, sumSq, allScores.length);
1788
1879
  let firstAggregatedScore;
1789
1880
  for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
1790
1881
  const scoreIdToItems = /* @__PURE__ */ new Map();
@@ -1800,21 +1891,18 @@ function buildTestCaseSummaries(byId) {
1800
1891
  }
1801
1892
  for (const items of scoreIdToItems.values()) {
1802
1893
  const agg = aggregateScoreItems(items);
1803
- if (agg) {
1804
- const n = toNumericScoreFromScores([agg]);
1805
- if (n !== void 0) {
1806
- numericScores.push(n);
1807
- if (firstAggregatedScore === void 0) {
1808
- firstAggregatedScore = agg;
1809
- }
1810
- }
1894
+ if (agg && firstAggregatedScore === void 0) {
1895
+ firstAggregatedScore = agg;
1896
+ break;
1811
1897
  }
1812
1898
  }
1899
+ if (firstAggregatedScore !== void 0)
1900
+ break;
1813
1901
  }
1814
- const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
1815
1902
  summaries.push({
1816
1903
  name,
1817
1904
  averageScore,
1905
+ stdDev: stdDev ?? void 0,
1818
1906
  aggregatedScoreItem: firstAggregatedScore,
1819
1907
  isAggregated,
1820
1908
  durationMs,
@@ -1850,7 +1938,9 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
1850
1938
  return `- ${evaluatorName.padEnd(28)} no numeric scores`;
1851
1939
  }
1852
1940
  const mean = aggregate.total / aggregate.count;
1853
- return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
1941
+ const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
1942
+ const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1943
+ return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
1854
1944
  }
1855
1945
  function createBar2(value, max = 100, width = 20) {
1856
1946
  const safe = Math.max(0, Math.min(max, value));
@@ -1980,6 +2070,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1980
2070
  const aggregates = /* @__PURE__ */ new Map();
1981
2071
  const testCaseByTestId = /* @__PURE__ */ new Map();
1982
2072
  let overallScoreTotal = 0;
2073
+ let overallScoreSumSq = 0;
1983
2074
  let overallScoreCount = 0;
1984
2075
  let completedCount = 0;
1985
2076
  let totalCount = 0;
@@ -2036,17 +2127,20 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2036
2127
  if (numeric !== void 0) {
2037
2128
  const current = aggregates.get(item.evaluatorId) ?? {
2038
2129
  total: 0,
2130
+ sumSq: 0,
2039
2131
  count: 0,
2040
2132
  passed: 0,
2041
2133
  failed: 0
2042
2134
  };
2043
2135
  aggregates.set(item.evaluatorId, {
2044
2136
  total: current.total + numeric,
2137
+ sumSq: current.sumSq + numeric * numeric,
2045
2138
  count: current.count + 1,
2046
2139
  passed: current.passed + (item.passed ? 1 : 0),
2047
2140
  failed: current.failed + (item.passed ? 0 : 1)
2048
2141
  });
2049
2142
  overallScoreTotal += numeric;
2143
+ overallScoreSumSq += numeric * numeric;
2050
2144
  overallScoreCount += 1;
2051
2145
  }
2052
2146
  }
@@ -2154,9 +2248,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2154
2248
  );
2155
2249
  if (overallScoreCount > 0) {
2156
2250
  const overallAverage = overallScoreTotal / overallScoreCount;
2251
+ const overallSd = sampleStdDev2(
2252
+ overallScoreTotal,
2253
+ overallScoreSumSq,
2254
+ overallScoreCount
2255
+ );
2256
+ const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
2157
2257
  console.log(
2158
2258
  `- overall avg score: ${colorize(
2159
- overallAverage.toFixed(2),
2259
+ avgStr,
2160
2260
  scoreToColor(overallAverage)
2161
2261
  )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
2162
2262
  );
@@ -2181,7 +2281,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2181
2281
  const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2182
2282
  summary.aggregatedScoreItem.data,
2183
2283
  { isAggregated: true }
2184
- ) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
2284
+ ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2185
2285
  console.log(
2186
2286
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
2187
2287
  scoreLabel,