@m4trix/evals 0.14.0 → 0.16.0

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ var path = require('path');
8
8
  var jitiModule = require('jiti');
9
9
  var promises = require('fs/promises');
10
10
  var url = require('url');
11
- var diff = require('diff');
11
+ var jsonDiff = require('json-diff');
12
12
  var React2 = require('react');
13
13
  var ink = require('ink');
14
14
  var jsxRuntime = require('react/jsx-runtime');
@@ -286,45 +286,46 @@ async function collectTestCasesFromFiles(config) {
286
286
  );
287
287
  return found.flat();
288
288
  }
289
- function toJsonLines(value) {
289
+ function createDiffString(expected, actual, diffOptions) {
290
+ const opts = { ...diffOptions, color: false };
291
+ const result = jsonDiff.diffString(expected, actual, opts);
292
+ return typeof result === "string" ? result : "";
293
+ }
294
+ function formatLogMessage(msg) {
295
+ if (typeof msg === "string")
296
+ return msg;
290
297
  try {
291
- return JSON.stringify(value, null, 2);
298
+ if (msg !== null && typeof msg === "object") {
299
+ return JSON.stringify(msg, null, 2);
300
+ }
301
+ return String(msg);
292
302
  } catch {
293
- return String(value);
303
+ return String(msg);
294
304
  }
295
305
  }
296
- function formatDiffString(changes) {
297
- const lines = [];
298
- for (const part of changes) {
299
- const prefix = part.added ? "+" : part.removed ? "-" : " ";
300
- const partLines = part.value.split("\n");
301
- if (partLines[partLines.length - 1] === "") {
302
- partLines.pop();
303
- }
304
- for (const line of partLines) {
305
- lines.push(`${prefix} ${line}`);
306
- }
307
- }
308
- return lines.join("\n");
306
+ function createLogEntry(message, options) {
307
+ return {
308
+ type: "log",
309
+ label: options?.label,
310
+ message: formatLogMessage(message)
311
+ };
309
312
  }
310
- function createDiffString(expected, actual) {
311
- const expectedStr = toJsonLines(expected);
312
- const actualStr = toJsonLines(actual);
313
- const changes = diff.diffLines(expectedStr, actualStr);
314
- return formatDiffString(changes);
313
+ function getLogLines(entry) {
314
+ return entry.message.split("\n");
315
315
  }
316
316
  function createDiffLogEntry(expected, actual, options) {
317
- const diff = createDiffString(expected, actual);
317
+ const { label, ...diffOpts } = options ?? {};
318
+ const diff = createDiffString(expected, actual, diffOpts);
318
319
  return {
319
320
  type: "diff",
320
- label: options?.label,
321
+ label,
321
322
  expected,
322
323
  actual,
323
324
  diff: diff || "(no differences)"
324
325
  };
325
326
  }
326
327
  function getDiffLines(entry) {
327
- const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
328
+ const raw = entry.diff || "(no differences)";
328
329
  return raw.split("\n").map((line) => {
329
330
  const trimmed = line.trimStart();
330
331
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -600,6 +601,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
600
601
  const logDiff = (expected, actual, options) => {
601
602
  logs.push(createDiffLogEntry(expected, actual, options));
602
603
  };
604
+ const log = (message, options) => {
605
+ logs.push(createLogEntry(message, options));
606
+ };
603
607
  const ctx = yield* effect.Effect.promise(
604
608
  () => Promise.resolve(evaluator.resolveContext())
605
609
  );
@@ -609,7 +613,8 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
609
613
  input: testCaseItem.testCase.getInput(),
610
614
  ctx,
611
615
  output,
612
- logDiff
616
+ logDiff,
617
+ log
613
618
  })
614
619
  )
615
620
  );
@@ -1544,6 +1549,7 @@ function RunView({
1544
1549
  );
1545
1550
  setEvaluatorNameById(nameById);
1546
1551
  const aggregates = /* @__PURE__ */ new Map();
1552
+ const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
1547
1553
  let overallScoreTotal = 0;
1548
1554
  let overallScoreSumSq = 0;
1549
1555
  let overallScoreCount = 0;
@@ -1573,6 +1579,12 @@ function RunView({
1573
1579
  overallScoreSumSq += numeric * numeric;
1574
1580
  overallScoreCount += 1;
1575
1581
  }
1582
+ for (const s of item.scores) {
1583
+ const key = `${item.evaluatorId}:${s.id}`;
1584
+ const list = scoreItemsByEvaluatorScore.get(key) ?? [];
1585
+ list.push(s);
1586
+ scoreItemsByEvaluatorScore.set(key, list);
1587
+ }
1576
1588
  }
1577
1589
  setTestCases((prev) => {
1578
1590
  const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
@@ -1643,6 +1655,7 @@ function RunView({
1643
1655
  overallScoreSumSq,
1644
1656
  overallScoreCount,
1645
1657
  aggregates: new Map(aggregates),
1658
+ scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1646
1659
  artifactPath: finalEvent.artifactPath
1647
1660
  });
1648
1661
  setPhase("completed");
@@ -1725,36 +1738,45 @@ function RunView({
1725
1738
  ":",
1726
1739
  " ",
1727
1740
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1728
- " ",
1729
- item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(
1741
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
1742
+ " ",
1743
+ item.metrics.map((m) => {
1744
+ const def = getMetricById(m.id);
1745
+ if (!def)
1746
+ return null;
1747
+ const formatted = def.format(m.data, {
1748
+ isAggregated: tc.isAggregated
1749
+ });
1750
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1751
+ "[",
1752
+ def.name ? `${def.name}: ` : "",
1753
+ formatted,
1754
+ "]",
1755
+ " "
1756
+ ] }, m.id);
1757
+ })
1758
+ ] }) : null
1759
+ ] }),
1760
+ item.scores.length > 0 ? item.scores.map((s, idx) => {
1761
+ const def = getScoreById(s.id);
1762
+ const scoreLabel = def ? def.name ?? def.id : s.id;
1763
+ return /* @__PURE__ */ jsxRuntime.jsxs(
1730
1764
  ink.Text,
1731
1765
  {
1732
1766
  color: scoreColor(toNumericScore(s.data) ?? 0),
1733
1767
  children: [
1768
+ " ",
1769
+ scoreLabel,
1770
+ ":",
1771
+ " ",
1734
1772
  formatScorePart(s, scoreColor, {
1735
1773
  isAggregated: tc.isAggregated
1736
- }),
1737
- " "
1774
+ })
1738
1775
  ]
1739
1776
  },
1740
- s.id
1741
- )),
1742
- item.metrics?.map((m) => {
1743
- const def = getMetricById(m.id);
1744
- if (!def)
1745
- return null;
1746
- const formatted = def.format(m.data, {
1747
- isAggregated: tc.isAggregated
1748
- });
1749
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1750
- "[",
1751
- def.name ? `${def.name}: ` : "",
1752
- formatted,
1753
- "]",
1754
- " "
1755
- ] }, m.id);
1756
- })
1757
- ] }),
1777
+ `${item.evaluatorId}-${s.id}-${idx}`
1778
+ );
1779
+ }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
1758
1780
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1759
1781
  (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
1760
1782
  ({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
@@ -1765,7 +1787,7 @@ function RunView({
1765
1787
  },
1766
1788
  lineIdx
1767
1789
  )
1768
- ) }, logIdx) : null
1790
+ ) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
1769
1791
  ) })
1770
1792
  ]
1771
1793
  },
@@ -1812,26 +1834,54 @@ function RunView({
1812
1834
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
1813
1835
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
1814
1836
  const agg = summary.aggregates.get(id);
1815
- if (!agg || agg.count === 0) {
1837
+ const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
1838
+ (k) => k.startsWith(`${id}:`)
1839
+ );
1840
+ if (scoreKeys.length === 0) {
1816
1841
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1817
1842
  "- ",
1818
1843
  name.padEnd(28),
1819
- " no numeric scores"
1844
+ " no scores"
1820
1845
  ] }, id);
1821
1846
  }
1822
- const mean = agg.total / agg.count;
1823
- const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
1824
- const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1825
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1826
- "- ",
1827
- name.padEnd(28),
1828
- " avg=",
1829
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: meanStr }),
1847
+ const passedFailed = agg != null ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1830
1848
  " ",
1831
1849
  "passed=",
1832
1850
  agg.passed,
1833
1851
  " failed=",
1834
1852
  agg.failed
1853
+ ] }) : null;
1854
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
1855
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1856
+ "- ",
1857
+ name.padEnd(28),
1858
+ passedFailed
1859
+ ] }),
1860
+ scoreKeys.map((key) => {
1861
+ const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
1862
+ const aggregated = aggregateScoreItems(items);
1863
+ if (!aggregated)
1864
+ return null;
1865
+ const def = getScoreById(aggregated.id);
1866
+ const label = def ? def.name ?? def.id : aggregated.id;
1867
+ const formatted = def?.format(aggregated.data, {
1868
+ isAggregated: true
1869
+ }) ?? "n/a";
1870
+ const numeric = toNumericScore(aggregated.data);
1871
+ return /* @__PURE__ */ jsxRuntime.jsxs(
1872
+ ink.Text,
1873
+ {
1874
+ color: numeric !== void 0 ? scoreColor(numeric) : "gray",
1875
+ children: [
1876
+ " ",
1877
+ label,
1878
+ ": ",
1879
+ formatted
1880
+ ]
1881
+ },
1882
+ key
1883
+ );
1884
+ })
1835
1885
  ] }, id);
1836
1886
  })
1837
1887
  ] }),
@@ -1959,14 +2009,36 @@ function scoreToColor(score) {
1959
2009
  }
1960
2010
  return ansi2.red;
1961
2011
  }
1962
- function getEvaluatorSummaryLine(evaluatorName, aggregate) {
1963
- if (!aggregate || aggregate.count === 0) {
1964
- return `- ${evaluatorName.padEnd(28)} no numeric scores`;
2012
+ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
2013
+ const lines = [];
2014
+ const scoreKeys = [...scoreItemsByKey.keys()].filter(
2015
+ (k) => k.startsWith(`${evaluatorId}:`)
2016
+ );
2017
+ if (scoreKeys.length === 0) {
2018
+ lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
2019
+ return lines;
2020
+ }
2021
+ const passedFailed = aggregate != null ? ` passed=${aggregate.passed} failed=${aggregate.failed}` : "";
2022
+ const scoreLines = [];
2023
+ for (const key of scoreKeys) {
2024
+ const items = scoreItemsByKey.get(key) ?? [];
2025
+ const agg = aggregateScoreItems(items);
2026
+ if (!agg)
2027
+ continue;
2028
+ const def = getScoreById(agg.id);
2029
+ const label = def ? def.name ?? def.id : agg.id;
2030
+ const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
2031
+ const numeric = toNumericScore(agg.data);
2032
+ const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
2033
+ scoreLines.push(` ${label}: ${colored}`);
2034
+ }
2035
+ if (scoreLines.length > 0) {
2036
+ lines.push(`- ${evaluatorName.padEnd(28)}${passedFailed}`);
2037
+ lines.push(...scoreLines);
2038
+ } else {
2039
+ lines.push(`- ${evaluatorName.padEnd(28)} no numeric scores${passedFailed}`);
1965
2040
  }
1966
- const mean = aggregate.total / aggregate.count;
1967
- const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
1968
- const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1969
- return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
2041
+ return lines;
1970
2042
  }
1971
2043
  function createBar2(value, max = 100, width = 20) {
1972
2044
  const safe = Math.max(0, Math.min(max, value));
@@ -2018,46 +2090,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
2018
2090
  }
2019
2091
  function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2020
2092
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
2021
- const scoreParts = [];
2022
- for (const item of scores) {
2023
- const def = getScoreById(item.id);
2024
- if (!def) {
2025
- const numeric = toNumericScore(item.data);
2026
- scoreParts.push(
2027
- numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
2028
- );
2029
- continue;
2030
- }
2031
- const formatted = def.format(item.data, options);
2032
- switch (def.displayStrategy) {
2033
- case "bar": {
2034
- const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2035
- if (typeof numeric === "number" && Number.isFinite(numeric)) {
2036
- scoreParts.push(
2037
- `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
2038
- );
2039
- } else {
2040
- scoreParts.push(formatted);
2041
- }
2042
- break;
2043
- }
2044
- case "number":
2045
- scoreParts.push(formatted);
2046
- break;
2047
- case "passFail":
2048
- scoreParts.push(
2049
- colorize(
2050
- formatted,
2051
- item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
2052
- )
2053
- );
2054
- break;
2055
- }
2056
- }
2057
- const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
2058
- let line = ` ${name}: ${passLabel} ${scoreStr}`;
2093
+ const metricParts = [];
2059
2094
  if (metrics && metrics.length > 0) {
2060
- const metricParts = [];
2061
2095
  for (const { id, data } of metrics) {
2062
2096
  const def = getMetricById(id);
2063
2097
  if (def) {
@@ -2067,11 +2101,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2067
2101
  );
2068
2102
  }
2069
2103
  }
2070
- if (metricParts.length > 0) {
2071
- line += ` ${metricParts.join(" ")}`;
2104
+ }
2105
+ const scoreLines = [];
2106
+ for (const item of scores) {
2107
+ const def = getScoreById(item.id);
2108
+ const scoreLabel = def ? def.name ?? def.id : item.id;
2109
+ let formatted;
2110
+ if (!def) {
2111
+ const numeric = toNumericScore(item.data);
2112
+ formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
2113
+ } else {
2114
+ const raw = def.format(item.data, options);
2115
+ switch (def.displayStrategy) {
2116
+ case "bar": {
2117
+ const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2118
+ if (typeof numeric === "number" && Number.isFinite(numeric)) {
2119
+ formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
2120
+ } else {
2121
+ formatted = raw;
2122
+ }
2123
+ break;
2124
+ }
2125
+ case "number":
2126
+ formatted = raw;
2127
+ break;
2128
+ case "passFail":
2129
+ formatted = colorize(
2130
+ raw,
2131
+ item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
2132
+ );
2133
+ break;
2134
+ }
2072
2135
  }
2136
+ scoreLines.push(` ${scoreLabel}: ${formatted}`);
2073
2137
  }
2074
- return line;
2138
+ const lines = [];
2139
+ const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
2140
+ lines.push(` ${name}: ${passLabel}${metricStr}`);
2141
+ if (scoreLines.length > 0) {
2142
+ lines.push(...scoreLines);
2143
+ } else {
2144
+ lines.push(` n/a`);
2145
+ }
2146
+ return lines;
2075
2147
  }
2076
2148
  async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
2077
2149
  const dataset = await runner.resolveDatasetByName(datasetName);
@@ -2094,6 +2166,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2094
2166
  evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
2095
2167
  );
2096
2168
  const aggregates = /* @__PURE__ */ new Map();
2169
+ const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
2097
2170
  const testCaseByTestId = /* @__PURE__ */ new Map();
2098
2171
  let overallScoreTotal = 0;
2099
2172
  let overallScoreSumSq = 0;
@@ -2169,6 +2242,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2169
2242
  overallScoreSumSq += numeric * numeric;
2170
2243
  overallScoreCount += 1;
2171
2244
  }
2245
+ for (const s of item.scores) {
2246
+ const key = `${item.evaluatorId}:${s.id}`;
2247
+ const list = scoreItemsByEvaluatorScore.get(key) ?? [];
2248
+ list.push(s);
2249
+ scoreItemsByEvaluatorScore.set(key, list);
2250
+ }
2172
2251
  }
2173
2252
  const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2174
2253
  const isLastRerun = event.rerunIndex >= event.rerunTotal;
@@ -2192,7 +2271,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2192
2271
  for (const item of aggregatedScores) {
2193
2272
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2194
2273
  lines.push(
2195
- formatEvaluatorScoreLine(
2274
+ ...formatEvaluatorScoreLine(
2196
2275
  name,
2197
2276
  item.scores,
2198
2277
  item.passed,
@@ -2212,6 +2291,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2212
2291
  const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
2213
2292
  lines.push(colored);
2214
2293
  }
2294
+ } else if (log.type === "log") {
2295
+ for (const line of getLogLines(log)) {
2296
+ lines.push(` ${line}`);
2297
+ }
2215
2298
  }
2216
2299
  }
2217
2300
  }
@@ -2289,9 +2372,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2289
2372
  }
2290
2373
  console.log(colorize("- evaluator averages:", ansi2.magenta));
2291
2374
  for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
2292
- console.log(
2293
- getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
2375
+ const evaluatorLines = getEvaluatorSummaryLines(
2376
+ evaluatorId,
2377
+ evaluatorName,
2378
+ aggregates.get(evaluatorId),
2379
+ scoreItemsByEvaluatorScore
2294
2380
  );
2381
+ for (const line of evaluatorLines) {
2382
+ console.log(line);
2383
+ }
2295
2384
  }
2296
2385
  const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
2297
2386
  if (testCaseSummaries.length > 0) {