@m4trix/evals 0.14.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1544,6 +1544,7 @@ function RunView({
1544
1544
  );
1545
1545
  setEvaluatorNameById(nameById);
1546
1546
  const aggregates = /* @__PURE__ */ new Map();
1547
+ const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
1547
1548
  let overallScoreTotal = 0;
1548
1549
  let overallScoreSumSq = 0;
1549
1550
  let overallScoreCount = 0;
@@ -1573,6 +1574,12 @@ function RunView({
1573
1574
  overallScoreSumSq += numeric * numeric;
1574
1575
  overallScoreCount += 1;
1575
1576
  }
1577
+ for (const s of item.scores) {
1578
+ const key = `${item.evaluatorId}:${s.id}`;
1579
+ const list = scoreItemsByEvaluatorScore.get(key) ?? [];
1580
+ list.push(s);
1581
+ scoreItemsByEvaluatorScore.set(key, list);
1582
+ }
1576
1583
  }
1577
1584
  setTestCases((prev) => {
1578
1585
  const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
@@ -1643,6 +1650,7 @@ function RunView({
1643
1650
  overallScoreSumSq,
1644
1651
  overallScoreCount,
1645
1652
  aggregates: new Map(aggregates),
1653
+ scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1646
1654
  artifactPath: finalEvent.artifactPath
1647
1655
  });
1648
1656
  setPhase("completed");
@@ -1725,36 +1733,45 @@ function RunView({
1725
1733
  ":",
1726
1734
  " ",
1727
1735
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1728
- " ",
1729
- item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(
1736
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
1737
+ " ",
1738
+ item.metrics.map((m) => {
1739
+ const def = getMetricById(m.id);
1740
+ if (!def)
1741
+ return null;
1742
+ const formatted = def.format(m.data, {
1743
+ isAggregated: tc.isAggregated
1744
+ });
1745
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1746
+ "[",
1747
+ def.name ? `${def.name}: ` : "",
1748
+ formatted,
1749
+ "]",
1750
+ " "
1751
+ ] }, m.id);
1752
+ })
1753
+ ] }) : null
1754
+ ] }),
1755
+ item.scores.length > 0 ? item.scores.map((s, idx) => {
1756
+ const def = getScoreById(s.id);
1757
+ const scoreLabel = def ? def.name ?? def.id : s.id;
1758
+ return /* @__PURE__ */ jsxRuntime.jsxs(
1730
1759
  ink.Text,
1731
1760
  {
1732
1761
  color: scoreColor(toNumericScore(s.data) ?? 0),
1733
1762
  children: [
1763
+ " ",
1764
+ scoreLabel,
1765
+ ":",
1766
+ " ",
1734
1767
  formatScorePart(s, scoreColor, {
1735
1768
  isAggregated: tc.isAggregated
1736
- }),
1737
- " "
1769
+ })
1738
1770
  ]
1739
1771
  },
1740
- s.id
1741
- )),
1742
- item.metrics?.map((m) => {
1743
- const def = getMetricById(m.id);
1744
- if (!def)
1745
- return null;
1746
- const formatted = def.format(m.data, {
1747
- isAggregated: tc.isAggregated
1748
- });
1749
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1750
- "[",
1751
- def.name ? `${def.name}: ` : "",
1752
- formatted,
1753
- "]",
1754
- " "
1755
- ] }, m.id);
1756
- })
1757
- ] }),
1772
+ `${item.evaluatorId}-${s.id}-${idx}`
1773
+ );
1774
+ }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
1758
1775
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1759
1776
  (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
1760
1777
  ({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
@@ -1812,26 +1829,54 @@ function RunView({
1812
1829
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
1813
1830
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
1814
1831
  const agg = summary.aggregates.get(id);
1815
- if (!agg || agg.count === 0) {
1832
+ const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
1833
+ (k) => k.startsWith(`${id}:`)
1834
+ );
1835
+ if (scoreKeys.length === 0) {
1816
1836
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1817
1837
  "- ",
1818
1838
  name.padEnd(28),
1819
- " no numeric scores"
1839
+ " no scores"
1820
1840
  ] }, id);
1821
1841
  }
1822
- const mean = agg.total / agg.count;
1823
- const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
1824
- const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1825
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1826
- "- ",
1827
- name.padEnd(28),
1828
- " avg=",
1829
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: meanStr }),
1842
+ const passedFailed = agg != null ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1830
1843
  " ",
1831
1844
  "passed=",
1832
1845
  agg.passed,
1833
1846
  " failed=",
1834
1847
  agg.failed
1848
+ ] }) : null;
1849
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
1850
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1851
+ "- ",
1852
+ name.padEnd(28),
1853
+ passedFailed
1854
+ ] }),
1855
+ scoreKeys.map((key) => {
1856
+ const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
1857
+ const aggregated = aggregateScoreItems(items);
1858
+ if (!aggregated)
1859
+ return null;
1860
+ const def = getScoreById(aggregated.id);
1861
+ const label = def ? def.name ?? def.id : aggregated.id;
1862
+ const formatted = def?.format(aggregated.data, {
1863
+ isAggregated: true
1864
+ }) ?? "n/a";
1865
+ const numeric = toNumericScore(aggregated.data);
1866
+ return /* @__PURE__ */ jsxRuntime.jsxs(
1867
+ ink.Text,
1868
+ {
1869
+ color: numeric !== void 0 ? scoreColor(numeric) : "gray",
1870
+ children: [
1871
+ " ",
1872
+ label,
1873
+ ": ",
1874
+ formatted
1875
+ ]
1876
+ },
1877
+ key
1878
+ );
1879
+ })
1835
1880
  ] }, id);
1836
1881
  })
1837
1882
  ] }),
@@ -1959,14 +2004,36 @@ function scoreToColor(score) {
1959
2004
  }
1960
2005
  return ansi2.red;
1961
2006
  }
1962
- function getEvaluatorSummaryLine(evaluatorName, aggregate) {
1963
- if (!aggregate || aggregate.count === 0) {
1964
- return `- ${evaluatorName.padEnd(28)} no numeric scores`;
2007
+ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
2008
+ const lines = [];
2009
+ const scoreKeys = [...scoreItemsByKey.keys()].filter(
2010
+ (k) => k.startsWith(`${evaluatorId}:`)
2011
+ );
2012
+ if (scoreKeys.length === 0) {
2013
+ lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
2014
+ return lines;
2015
+ }
2016
+ const passedFailed = aggregate != null ? ` passed=${aggregate.passed} failed=${aggregate.failed}` : "";
2017
+ const scoreLines = [];
2018
+ for (const key of scoreKeys) {
2019
+ const items = scoreItemsByKey.get(key) ?? [];
2020
+ const agg = aggregateScoreItems(items);
2021
+ if (!agg)
2022
+ continue;
2023
+ const def = getScoreById(agg.id);
2024
+ const label = def ? def.name ?? def.id : agg.id;
2025
+ const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
2026
+ const numeric = toNumericScore(agg.data);
2027
+ const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
2028
+ scoreLines.push(` ${label}: ${colored}`);
2029
+ }
2030
+ if (scoreLines.length > 0) {
2031
+ lines.push(`- ${evaluatorName.padEnd(28)}${passedFailed}`);
2032
+ lines.push(...scoreLines);
2033
+ } else {
2034
+ lines.push(`- ${evaluatorName.padEnd(28)} no numeric scores${passedFailed}`);
1965
2035
  }
1966
- const mean = aggregate.total / aggregate.count;
1967
- const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
1968
- const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1969
- return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
2036
+ return lines;
1970
2037
  }
1971
2038
  function createBar2(value, max = 100, width = 20) {
1972
2039
  const safe = Math.max(0, Math.min(max, value));
@@ -2018,46 +2085,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
2018
2085
  }
2019
2086
  function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2020
2087
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
2021
- const scoreParts = [];
2022
- for (const item of scores) {
2023
- const def = getScoreById(item.id);
2024
- if (!def) {
2025
- const numeric = toNumericScore(item.data);
2026
- scoreParts.push(
2027
- numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
2028
- );
2029
- continue;
2030
- }
2031
- const formatted = def.format(item.data, options);
2032
- switch (def.displayStrategy) {
2033
- case "bar": {
2034
- const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2035
- if (typeof numeric === "number" && Number.isFinite(numeric)) {
2036
- scoreParts.push(
2037
- `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
2038
- );
2039
- } else {
2040
- scoreParts.push(formatted);
2041
- }
2042
- break;
2043
- }
2044
- case "number":
2045
- scoreParts.push(formatted);
2046
- break;
2047
- case "passFail":
2048
- scoreParts.push(
2049
- colorize(
2050
- formatted,
2051
- item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
2052
- )
2053
- );
2054
- break;
2055
- }
2056
- }
2057
- const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
2058
- let line = ` ${name}: ${passLabel} ${scoreStr}`;
2088
+ const metricParts = [];
2059
2089
  if (metrics && metrics.length > 0) {
2060
- const metricParts = [];
2061
2090
  for (const { id, data } of metrics) {
2062
2091
  const def = getMetricById(id);
2063
2092
  if (def) {
@@ -2067,11 +2096,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2067
2096
  );
2068
2097
  }
2069
2098
  }
2070
- if (metricParts.length > 0) {
2071
- line += ` ${metricParts.join(" ")}`;
2099
+ }
2100
+ const scoreLines = [];
2101
+ for (const item of scores) {
2102
+ const def = getScoreById(item.id);
2103
+ const scoreLabel = def ? def.name ?? def.id : item.id;
2104
+ let formatted;
2105
+ if (!def) {
2106
+ const numeric = toNumericScore(item.data);
2107
+ formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
2108
+ } else {
2109
+ const raw = def.format(item.data, options);
2110
+ switch (def.displayStrategy) {
2111
+ case "bar": {
2112
+ const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2113
+ if (typeof numeric === "number" && Number.isFinite(numeric)) {
2114
+ formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
2115
+ } else {
2116
+ formatted = raw;
2117
+ }
2118
+ break;
2119
+ }
2120
+ case "number":
2121
+ formatted = raw;
2122
+ break;
2123
+ case "passFail":
2124
+ formatted = colorize(
2125
+ raw,
2126
+ item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
2127
+ );
2128
+ break;
2129
+ }
2072
2130
  }
2131
+ scoreLines.push(` ${scoreLabel}: ${formatted}`);
2132
+ }
2133
+ const lines = [];
2134
+ const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
2135
+ lines.push(` ${name}: ${passLabel}${metricStr}`);
2136
+ if (scoreLines.length > 0) {
2137
+ lines.push(...scoreLines);
2138
+ } else {
2139
+ lines.push(` n/a`);
2073
2140
  }
2074
- return line;
2141
+ return lines;
2075
2142
  }
2076
2143
  async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
2077
2144
  const dataset = await runner.resolveDatasetByName(datasetName);
@@ -2094,6 +2161,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2094
2161
  evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
2095
2162
  );
2096
2163
  const aggregates = /* @__PURE__ */ new Map();
2164
+ const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
2097
2165
  const testCaseByTestId = /* @__PURE__ */ new Map();
2098
2166
  let overallScoreTotal = 0;
2099
2167
  let overallScoreSumSq = 0;
@@ -2169,6 +2237,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2169
2237
  overallScoreSumSq += numeric * numeric;
2170
2238
  overallScoreCount += 1;
2171
2239
  }
2240
+ for (const s of item.scores) {
2241
+ const key = `${item.evaluatorId}:${s.id}`;
2242
+ const list = scoreItemsByEvaluatorScore.get(key) ?? [];
2243
+ list.push(s);
2244
+ scoreItemsByEvaluatorScore.set(key, list);
2245
+ }
2172
2246
  }
2173
2247
  const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2174
2248
  const isLastRerun = event.rerunIndex >= event.rerunTotal;
@@ -2192,7 +2266,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2192
2266
  for (const item of aggregatedScores) {
2193
2267
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2194
2268
  lines.push(
2195
- formatEvaluatorScoreLine(
2269
+ ...formatEvaluatorScoreLine(
2196
2270
  name,
2197
2271
  item.scores,
2198
2272
  item.passed,
@@ -2289,9 +2363,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2289
2363
  }
2290
2364
  console.log(colorize("- evaluator averages:", ansi2.magenta));
2291
2365
  for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
2292
- console.log(
2293
- getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
2366
+ const evaluatorLines = getEvaluatorSummaryLines(
2367
+ evaluatorId,
2368
+ evaluatorName,
2369
+ aggregates.get(evaluatorId),
2370
+ scoreItemsByEvaluatorScore
2294
2371
  );
2372
+ for (const line of evaluatorLines) {
2373
+ console.log(line);
2374
+ }
2295
2375
  }
2296
2376
  const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
2297
2377
  if (testCaseSummaries.length > 0) {