@m4trix/evals 0.14.0 → 0.16.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
7
7
  import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
- import { diffLines } from 'diff';
9
+ import { diffString } from 'json-diff';
10
10
  import React2, { useState, useEffect, useCallback } from 'react';
11
11
  import { render, Box, Text } from 'ink';
12
12
  import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
@@ -260,45 +260,46 @@ async function collectTestCasesFromFiles(config) {
260
260
  );
261
261
  return found.flat();
262
262
  }
263
- function toJsonLines(value) {
263
+ function createDiffString(expected, actual, diffOptions) {
264
+ const opts = { ...diffOptions, color: false };
265
+ const result = diffString(expected, actual, opts);
266
+ return typeof result === "string" ? result : "";
267
+ }
268
+ function formatLogMessage(msg) {
269
+ if (typeof msg === "string")
270
+ return msg;
264
271
  try {
265
- return JSON.stringify(value, null, 2);
272
+ if (msg !== null && typeof msg === "object") {
273
+ return JSON.stringify(msg, null, 2);
274
+ }
275
+ return String(msg);
266
276
  } catch {
267
- return String(value);
277
+ return String(msg);
268
278
  }
269
279
  }
270
- function formatDiffString(changes) {
271
- const lines = [];
272
- for (const part of changes) {
273
- const prefix = part.added ? "+" : part.removed ? "-" : " ";
274
- const partLines = part.value.split("\n");
275
- if (partLines[partLines.length - 1] === "") {
276
- partLines.pop();
277
- }
278
- for (const line of partLines) {
279
- lines.push(`${prefix} ${line}`);
280
- }
281
- }
282
- return lines.join("\n");
280
+ function createLogEntry(message, options) {
281
+ return {
282
+ type: "log",
283
+ label: options?.label,
284
+ message: formatLogMessage(message)
285
+ };
283
286
  }
284
- function createDiffString(expected, actual) {
285
- const expectedStr = toJsonLines(expected);
286
- const actualStr = toJsonLines(actual);
287
- const changes = diffLines(expectedStr, actualStr);
288
- return formatDiffString(changes);
287
+ function getLogLines(entry) {
288
+ return entry.message.split("\n");
289
289
  }
290
290
  function createDiffLogEntry(expected, actual, options) {
291
- const diff = createDiffString(expected, actual);
291
+ const { label, ...diffOpts } = options ?? {};
292
+ const diff = createDiffString(expected, actual, diffOpts);
292
293
  return {
293
294
  type: "diff",
294
- label: options?.label,
295
+ label,
295
296
  expected,
296
297
  actual,
297
298
  diff: diff || "(no differences)"
298
299
  };
299
300
  }
300
301
  function getDiffLines(entry) {
301
- const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
302
+ const raw = entry.diff || "(no differences)";
302
303
  return raw.split("\n").map((line) => {
303
304
  const trimmed = line.trimStart();
304
305
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -574,6 +575,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
574
575
  const logDiff = (expected, actual, options) => {
575
576
  logs.push(createDiffLogEntry(expected, actual, options));
576
577
  };
578
+ const log = (message, options) => {
579
+ logs.push(createLogEntry(message, options));
580
+ };
577
581
  const ctx = yield* Effect.promise(
578
582
  () => Promise.resolve(evaluator.resolveContext())
579
583
  );
@@ -583,7 +587,8 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
583
587
  input: testCaseItem.testCase.getInput(),
584
588
  ctx,
585
589
  output,
586
- logDiff
590
+ logDiff,
591
+ log
587
592
  })
588
593
  )
589
594
  );
@@ -1518,6 +1523,7 @@ function RunView({
1518
1523
  );
1519
1524
  setEvaluatorNameById(nameById);
1520
1525
  const aggregates = /* @__PURE__ */ new Map();
1526
+ const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
1521
1527
  let overallScoreTotal = 0;
1522
1528
  let overallScoreSumSq = 0;
1523
1529
  let overallScoreCount = 0;
@@ -1547,6 +1553,12 @@ function RunView({
1547
1553
  overallScoreSumSq += numeric * numeric;
1548
1554
  overallScoreCount += 1;
1549
1555
  }
1556
+ for (const s of item.scores) {
1557
+ const key = `${item.evaluatorId}:${s.id}`;
1558
+ const list = scoreItemsByEvaluatorScore.get(key) ?? [];
1559
+ list.push(s);
1560
+ scoreItemsByEvaluatorScore.set(key, list);
1561
+ }
1550
1562
  }
1551
1563
  setTestCases((prev) => {
1552
1564
  const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
@@ -1617,6 +1629,7 @@ function RunView({
1617
1629
  overallScoreSumSq,
1618
1630
  overallScoreCount,
1619
1631
  aggregates: new Map(aggregates),
1632
+ scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1620
1633
  artifactPath: finalEvent.artifactPath
1621
1634
  });
1622
1635
  setPhase("completed");
@@ -1699,36 +1712,45 @@ function RunView({
1699
1712
  ":",
1700
1713
  " ",
1701
1714
  /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1702
- " ",
1703
- item.scores.map((s) => /* @__PURE__ */ jsxs(
1715
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
1716
+ " ",
1717
+ item.metrics.map((m) => {
1718
+ const def = getMetricById(m.id);
1719
+ if (!def)
1720
+ return null;
1721
+ const formatted = def.format(m.data, {
1722
+ isAggregated: tc.isAggregated
1723
+ });
1724
+ return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1725
+ "[",
1726
+ def.name ? `${def.name}: ` : "",
1727
+ formatted,
1728
+ "]",
1729
+ " "
1730
+ ] }, m.id);
1731
+ })
1732
+ ] }) : null
1733
+ ] }),
1734
+ item.scores.length > 0 ? item.scores.map((s, idx) => {
1735
+ const def = getScoreById(s.id);
1736
+ const scoreLabel = def ? def.name ?? def.id : s.id;
1737
+ return /* @__PURE__ */ jsxs(
1704
1738
  Text,
1705
1739
  {
1706
1740
  color: scoreColor(toNumericScore(s.data) ?? 0),
1707
1741
  children: [
1742
+ " ",
1743
+ scoreLabel,
1744
+ ":",
1745
+ " ",
1708
1746
  formatScorePart(s, scoreColor, {
1709
1747
  isAggregated: tc.isAggregated
1710
- }),
1711
- " "
1748
+ })
1712
1749
  ]
1713
1750
  },
1714
- s.id
1715
- )),
1716
- item.metrics?.map((m) => {
1717
- const def = getMetricById(m.id);
1718
- if (!def)
1719
- return null;
1720
- const formatted = def.format(m.data, {
1721
- isAggregated: tc.isAggregated
1722
- });
1723
- return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1724
- "[",
1725
- def.name ? `${def.name}: ` : "",
1726
- formatted,
1727
- "]",
1728
- " "
1729
- ] }, m.id);
1730
- })
1731
- ] }),
1751
+ `${item.evaluatorId}-${s.id}-${idx}`
1752
+ );
1753
+ }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
1732
1754
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1733
1755
  (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
1734
1756
  ({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
@@ -1739,7 +1761,7 @@ function RunView({
1739
1761
  },
1740
1762
  lineIdx
1741
1763
  )
1742
- ) }, logIdx) : null
1764
+ ) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
1743
1765
  ) })
1744
1766
  ]
1745
1767
  },
@@ -1786,26 +1808,54 @@ function RunView({
1786
1808
  /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
1787
1809
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
1788
1810
  const agg = summary.aggregates.get(id);
1789
- if (!agg || agg.count === 0) {
1811
+ const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
1812
+ (k) => k.startsWith(`${id}:`)
1813
+ );
1814
+ if (scoreKeys.length === 0) {
1790
1815
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1791
1816
  "- ",
1792
1817
  name.padEnd(28),
1793
- " no numeric scores"
1818
+ " no scores"
1794
1819
  ] }, id);
1795
1820
  }
1796
- const mean = agg.total / agg.count;
1797
- const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
1798
- const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1799
- return /* @__PURE__ */ jsxs(Text, { children: [
1800
- "- ",
1801
- name.padEnd(28),
1802
- " avg=",
1803
- /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: meanStr }),
1821
+ const passedFailed = agg != null ? /* @__PURE__ */ jsxs(Text, { children: [
1804
1822
  " ",
1805
1823
  "passed=",
1806
1824
  agg.passed,
1807
1825
  " failed=",
1808
1826
  agg.failed
1827
+ ] }) : null;
1828
+ return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
1829
+ /* @__PURE__ */ jsxs(Text, { children: [
1830
+ "- ",
1831
+ name.padEnd(28),
1832
+ passedFailed
1833
+ ] }),
1834
+ scoreKeys.map((key) => {
1835
+ const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
1836
+ const aggregated = aggregateScoreItems(items);
1837
+ if (!aggregated)
1838
+ return null;
1839
+ const def = getScoreById(aggregated.id);
1840
+ const label = def ? def.name ?? def.id : aggregated.id;
1841
+ const formatted = def?.format(aggregated.data, {
1842
+ isAggregated: true
1843
+ }) ?? "n/a";
1844
+ const numeric = toNumericScore(aggregated.data);
1845
+ return /* @__PURE__ */ jsxs(
1846
+ Text,
1847
+ {
1848
+ color: numeric !== void 0 ? scoreColor(numeric) : "gray",
1849
+ children: [
1850
+ " ",
1851
+ label,
1852
+ ": ",
1853
+ formatted
1854
+ ]
1855
+ },
1856
+ key
1857
+ );
1858
+ })
1809
1859
  ] }, id);
1810
1860
  })
1811
1861
  ] }),
@@ -1933,14 +1983,36 @@ function scoreToColor(score) {
1933
1983
  }
1934
1984
  return ansi2.red;
1935
1985
  }
1936
- function getEvaluatorSummaryLine(evaluatorName, aggregate) {
1937
- if (!aggregate || aggregate.count === 0) {
1938
- return `- ${evaluatorName.padEnd(28)} no numeric scores`;
1986
+ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
1987
+ const lines = [];
1988
+ const scoreKeys = [...scoreItemsByKey.keys()].filter(
1989
+ (k) => k.startsWith(`${evaluatorId}:`)
1990
+ );
1991
+ if (scoreKeys.length === 0) {
1992
+ lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
1993
+ return lines;
1994
+ }
1995
+ const passedFailed = aggregate != null ? ` passed=${aggregate.passed} failed=${aggregate.failed}` : "";
1996
+ const scoreLines = [];
1997
+ for (const key of scoreKeys) {
1998
+ const items = scoreItemsByKey.get(key) ?? [];
1999
+ const agg = aggregateScoreItems(items);
2000
+ if (!agg)
2001
+ continue;
2002
+ const def = getScoreById(agg.id);
2003
+ const label = def ? def.name ?? def.id : agg.id;
2004
+ const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
2005
+ const numeric = toNumericScore(agg.data);
2006
+ const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
2007
+ scoreLines.push(` ${label}: ${colored}`);
2008
+ }
2009
+ if (scoreLines.length > 0) {
2010
+ lines.push(`- ${evaluatorName.padEnd(28)}${passedFailed}`);
2011
+ lines.push(...scoreLines);
2012
+ } else {
2013
+ lines.push(`- ${evaluatorName.padEnd(28)} no numeric scores${passedFailed}`);
1939
2014
  }
1940
- const mean = aggregate.total / aggregate.count;
1941
- const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
1942
- const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1943
- return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
2015
+ return lines;
1944
2016
  }
1945
2017
  function createBar2(value, max = 100, width = 20) {
1946
2018
  const safe = Math.max(0, Math.min(max, value));
@@ -1992,46 +2064,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
1992
2064
  }
1993
2065
  function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
1994
2066
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
1995
- const scoreParts = [];
1996
- for (const item of scores) {
1997
- const def = getScoreById(item.id);
1998
- if (!def) {
1999
- const numeric = toNumericScore(item.data);
2000
- scoreParts.push(
2001
- numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
2002
- );
2003
- continue;
2004
- }
2005
- const formatted = def.format(item.data, options);
2006
- switch (def.displayStrategy) {
2007
- case "bar": {
2008
- const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2009
- if (typeof numeric === "number" && Number.isFinite(numeric)) {
2010
- scoreParts.push(
2011
- `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
2012
- );
2013
- } else {
2014
- scoreParts.push(formatted);
2015
- }
2016
- break;
2017
- }
2018
- case "number":
2019
- scoreParts.push(formatted);
2020
- break;
2021
- case "passFail":
2022
- scoreParts.push(
2023
- colorize(
2024
- formatted,
2025
- item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
2026
- )
2027
- );
2028
- break;
2029
- }
2030
- }
2031
- const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
2032
- let line = ` ${name}: ${passLabel} ${scoreStr}`;
2067
+ const metricParts = [];
2033
2068
  if (metrics && metrics.length > 0) {
2034
- const metricParts = [];
2035
2069
  for (const { id, data } of metrics) {
2036
2070
  const def = getMetricById(id);
2037
2071
  if (def) {
@@ -2041,11 +2075,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2041
2075
  );
2042
2076
  }
2043
2077
  }
2044
- if (metricParts.length > 0) {
2045
- line += ` ${metricParts.join(" ")}`;
2078
+ }
2079
+ const scoreLines = [];
2080
+ for (const item of scores) {
2081
+ const def = getScoreById(item.id);
2082
+ const scoreLabel = def ? def.name ?? def.id : item.id;
2083
+ let formatted;
2084
+ if (!def) {
2085
+ const numeric = toNumericScore(item.data);
2086
+ formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
2087
+ } else {
2088
+ const raw = def.format(item.data, options);
2089
+ switch (def.displayStrategy) {
2090
+ case "bar": {
2091
+ const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2092
+ if (typeof numeric === "number" && Number.isFinite(numeric)) {
2093
+ formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
2094
+ } else {
2095
+ formatted = raw;
2096
+ }
2097
+ break;
2098
+ }
2099
+ case "number":
2100
+ formatted = raw;
2101
+ break;
2102
+ case "passFail":
2103
+ formatted = colorize(
2104
+ raw,
2105
+ item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
2106
+ );
2107
+ break;
2108
+ }
2046
2109
  }
2110
+ scoreLines.push(` ${scoreLabel}: ${formatted}`);
2047
2111
  }
2048
- return line;
2112
+ const lines = [];
2113
+ const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
2114
+ lines.push(` ${name}: ${passLabel}${metricStr}`);
2115
+ if (scoreLines.length > 0) {
2116
+ lines.push(...scoreLines);
2117
+ } else {
2118
+ lines.push(` n/a`);
2119
+ }
2120
+ return lines;
2049
2121
  }
2050
2122
  async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
2051
2123
  const dataset = await runner.resolveDatasetByName(datasetName);
@@ -2068,6 +2140,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2068
2140
  evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
2069
2141
  );
2070
2142
  const aggregates = /* @__PURE__ */ new Map();
2143
+ const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
2071
2144
  const testCaseByTestId = /* @__PURE__ */ new Map();
2072
2145
  let overallScoreTotal = 0;
2073
2146
  let overallScoreSumSq = 0;
@@ -2143,6 +2216,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2143
2216
  overallScoreSumSq += numeric * numeric;
2144
2217
  overallScoreCount += 1;
2145
2218
  }
2219
+ for (const s of item.scores) {
2220
+ const key = `${item.evaluatorId}:${s.id}`;
2221
+ const list = scoreItemsByEvaluatorScore.get(key) ?? [];
2222
+ list.push(s);
2223
+ scoreItemsByEvaluatorScore.set(key, list);
2224
+ }
2146
2225
  }
2147
2226
  const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2148
2227
  const isLastRerun = event.rerunIndex >= event.rerunTotal;
@@ -2166,7 +2245,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2166
2245
  for (const item of aggregatedScores) {
2167
2246
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2168
2247
  lines.push(
2169
- formatEvaluatorScoreLine(
2248
+ ...formatEvaluatorScoreLine(
2170
2249
  name,
2171
2250
  item.scores,
2172
2251
  item.passed,
@@ -2186,6 +2265,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2186
2265
  const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
2187
2266
  lines.push(colored);
2188
2267
  }
2268
+ } else if (log.type === "log") {
2269
+ for (const line of getLogLines(log)) {
2270
+ lines.push(` ${line}`);
2271
+ }
2189
2272
  }
2190
2273
  }
2191
2274
  }
@@ -2263,9 +2346,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2263
2346
  }
2264
2347
  console.log(colorize("- evaluator averages:", ansi2.magenta));
2265
2348
  for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
2266
- console.log(
2267
- getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
2349
+ const evaluatorLines = getEvaluatorSummaryLines(
2350
+ evaluatorId,
2351
+ evaluatorName,
2352
+ aggregates.get(evaluatorId),
2353
+ scoreItemsByEvaluatorScore
2268
2354
  );
2355
+ for (const line of evaluatorLines) {
2356
+ console.log(line);
2357
+ }
2269
2358
  }
2270
2359
  const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
2271
2360
  if (testCaseSummaries.length > 0) {