@m4trix/evals 0.14.0 → 0.15.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -1518,6 +1518,7 @@ function RunView({
1518
1518
  );
1519
1519
  setEvaluatorNameById(nameById);
1520
1520
  const aggregates = /* @__PURE__ */ new Map();
1521
+ const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
1521
1522
  let overallScoreTotal = 0;
1522
1523
  let overallScoreSumSq = 0;
1523
1524
  let overallScoreCount = 0;
@@ -1547,6 +1548,12 @@ function RunView({
1547
1548
  overallScoreSumSq += numeric * numeric;
1548
1549
  overallScoreCount += 1;
1549
1550
  }
1551
+ for (const s of item.scores) {
1552
+ const key = `${item.evaluatorId}:${s.id}`;
1553
+ const list = scoreItemsByEvaluatorScore.get(key) ?? [];
1554
+ list.push(s);
1555
+ scoreItemsByEvaluatorScore.set(key, list);
1556
+ }
1550
1557
  }
1551
1558
  setTestCases((prev) => {
1552
1559
  const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
@@ -1617,6 +1624,7 @@ function RunView({
1617
1624
  overallScoreSumSq,
1618
1625
  overallScoreCount,
1619
1626
  aggregates: new Map(aggregates),
1627
+ scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1620
1628
  artifactPath: finalEvent.artifactPath
1621
1629
  });
1622
1630
  setPhase("completed");
@@ -1699,36 +1707,45 @@ function RunView({
1699
1707
  ":",
1700
1708
  " ",
1701
1709
  /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1702
- " ",
1703
- item.scores.map((s) => /* @__PURE__ */ jsxs(
1710
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
1711
+ " ",
1712
+ item.metrics.map((m) => {
1713
+ const def = getMetricById(m.id);
1714
+ if (!def)
1715
+ return null;
1716
+ const formatted = def.format(m.data, {
1717
+ isAggregated: tc.isAggregated
1718
+ });
1719
+ return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1720
+ "[",
1721
+ def.name ? `${def.name}: ` : "",
1722
+ formatted,
1723
+ "]",
1724
+ " "
1725
+ ] }, m.id);
1726
+ })
1727
+ ] }) : null
1728
+ ] }),
1729
+ item.scores.length > 0 ? item.scores.map((s, idx) => {
1730
+ const def = getScoreById(s.id);
1731
+ const scoreLabel = def ? def.name ?? def.id : s.id;
1732
+ return /* @__PURE__ */ jsxs(
1704
1733
  Text,
1705
1734
  {
1706
1735
  color: scoreColor(toNumericScore(s.data) ?? 0),
1707
1736
  children: [
1737
+ " ",
1738
+ scoreLabel,
1739
+ ":",
1740
+ " ",
1708
1741
  formatScorePart(s, scoreColor, {
1709
1742
  isAggregated: tc.isAggregated
1710
- }),
1711
- " "
1743
+ })
1712
1744
  ]
1713
1745
  },
1714
- s.id
1715
- )),
1716
- item.metrics?.map((m) => {
1717
- const def = getMetricById(m.id);
1718
- if (!def)
1719
- return null;
1720
- const formatted = def.format(m.data, {
1721
- isAggregated: tc.isAggregated
1722
- });
1723
- return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1724
- "[",
1725
- def.name ? `${def.name}: ` : "",
1726
- formatted,
1727
- "]",
1728
- " "
1729
- ] }, m.id);
1730
- })
1731
- ] }),
1746
+ `${item.evaluatorId}-${s.id}-${idx}`
1747
+ );
1748
+ }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
1732
1749
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1733
1750
  (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
1734
1751
  ({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
@@ -1786,26 +1803,54 @@ function RunView({
1786
1803
  /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
1787
1804
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
1788
1805
  const agg = summary.aggregates.get(id);
1789
- if (!agg || agg.count === 0) {
1806
+ const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
1807
+ (k) => k.startsWith(`${id}:`)
1808
+ );
1809
+ if (scoreKeys.length === 0) {
1790
1810
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1791
1811
  "- ",
1792
1812
  name.padEnd(28),
1793
- " no numeric scores"
1813
+ " no scores"
1794
1814
  ] }, id);
1795
1815
  }
1796
- const mean = agg.total / agg.count;
1797
- const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
1798
- const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1799
- return /* @__PURE__ */ jsxs(Text, { children: [
1800
- "- ",
1801
- name.padEnd(28),
1802
- " avg=",
1803
- /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: meanStr }),
1816
+ const passedFailed = agg != null ? /* @__PURE__ */ jsxs(Text, { children: [
1804
1817
  " ",
1805
1818
  "passed=",
1806
1819
  agg.passed,
1807
1820
  " failed=",
1808
1821
  agg.failed
1822
+ ] }) : null;
1823
+ return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
1824
+ /* @__PURE__ */ jsxs(Text, { children: [
1825
+ "- ",
1826
+ name.padEnd(28),
1827
+ passedFailed
1828
+ ] }),
1829
+ scoreKeys.map((key) => {
1830
+ const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
1831
+ const aggregated = aggregateScoreItems(items);
1832
+ if (!aggregated)
1833
+ return null;
1834
+ const def = getScoreById(aggregated.id);
1835
+ const label = def ? def.name ?? def.id : aggregated.id;
1836
+ const formatted = def?.format(aggregated.data, {
1837
+ isAggregated: true
1838
+ }) ?? "n/a";
1839
+ const numeric = toNumericScore(aggregated.data);
1840
+ return /* @__PURE__ */ jsxs(
1841
+ Text,
1842
+ {
1843
+ color: numeric !== void 0 ? scoreColor(numeric) : "gray",
1844
+ children: [
1845
+ " ",
1846
+ label,
1847
+ ": ",
1848
+ formatted
1849
+ ]
1850
+ },
1851
+ key
1852
+ );
1853
+ })
1809
1854
  ] }, id);
1810
1855
  })
1811
1856
  ] }),
@@ -1933,14 +1978,36 @@ function scoreToColor(score) {
1933
1978
  }
1934
1979
  return ansi2.red;
1935
1980
  }
1936
- function getEvaluatorSummaryLine(evaluatorName, aggregate) {
1937
- if (!aggregate || aggregate.count === 0) {
1938
- return `- ${evaluatorName.padEnd(28)} no numeric scores`;
1981
+ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
1982
+ const lines = [];
1983
+ const scoreKeys = [...scoreItemsByKey.keys()].filter(
1984
+ (k) => k.startsWith(`${evaluatorId}:`)
1985
+ );
1986
+ if (scoreKeys.length === 0) {
1987
+ lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
1988
+ return lines;
1989
+ }
1990
+ const passedFailed = aggregate != null ? ` passed=${aggregate.passed} failed=${aggregate.failed}` : "";
1991
+ const scoreLines = [];
1992
+ for (const key of scoreKeys) {
1993
+ const items = scoreItemsByKey.get(key) ?? [];
1994
+ const agg = aggregateScoreItems(items);
1995
+ if (!agg)
1996
+ continue;
1997
+ const def = getScoreById(agg.id);
1998
+ const label = def ? def.name ?? def.id : agg.id;
1999
+ const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
2000
+ const numeric = toNumericScore(agg.data);
2001
+ const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
2002
+ scoreLines.push(` ${label}: ${colored}`);
2003
+ }
2004
+ if (scoreLines.length > 0) {
2005
+ lines.push(`- ${evaluatorName.padEnd(28)}${passedFailed}`);
2006
+ lines.push(...scoreLines);
2007
+ } else {
2008
+ lines.push(`- ${evaluatorName.padEnd(28)} no numeric scores${passedFailed}`);
1939
2009
  }
1940
- const mean = aggregate.total / aggregate.count;
1941
- const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
1942
- const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
1943
- return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
2010
+ return lines;
1944
2011
  }
1945
2012
  function createBar2(value, max = 100, width = 20) {
1946
2013
  const safe = Math.max(0, Math.min(max, value));
@@ -1992,46 +2059,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
1992
2059
  }
1993
2060
  function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
1994
2061
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
1995
- const scoreParts = [];
1996
- for (const item of scores) {
1997
- const def = getScoreById(item.id);
1998
- if (!def) {
1999
- const numeric = toNumericScore(item.data);
2000
- scoreParts.push(
2001
- numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
2002
- );
2003
- continue;
2004
- }
2005
- const formatted = def.format(item.data, options);
2006
- switch (def.displayStrategy) {
2007
- case "bar": {
2008
- const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2009
- if (typeof numeric === "number" && Number.isFinite(numeric)) {
2010
- scoreParts.push(
2011
- `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
2012
- );
2013
- } else {
2014
- scoreParts.push(formatted);
2015
- }
2016
- break;
2017
- }
2018
- case "number":
2019
- scoreParts.push(formatted);
2020
- break;
2021
- case "passFail":
2022
- scoreParts.push(
2023
- colorize(
2024
- formatted,
2025
- item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
2026
- )
2027
- );
2028
- break;
2029
- }
2030
- }
2031
- const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
2032
- let line = ` ${name}: ${passLabel} ${scoreStr}`;
2062
+ const metricParts = [];
2033
2063
  if (metrics && metrics.length > 0) {
2034
- const metricParts = [];
2035
2064
  for (const { id, data } of metrics) {
2036
2065
  const def = getMetricById(id);
2037
2066
  if (def) {
@@ -2041,11 +2070,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2041
2070
  );
2042
2071
  }
2043
2072
  }
2044
- if (metricParts.length > 0) {
2045
- line += ` ${metricParts.join(" ")}`;
2073
+ }
2074
+ const scoreLines = [];
2075
+ for (const item of scores) {
2076
+ const def = getScoreById(item.id);
2077
+ const scoreLabel = def ? def.name ?? def.id : item.id;
2078
+ let formatted;
2079
+ if (!def) {
2080
+ const numeric = toNumericScore(item.data);
2081
+ formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
2082
+ } else {
2083
+ const raw = def.format(item.data, options);
2084
+ switch (def.displayStrategy) {
2085
+ case "bar": {
2086
+ const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2087
+ if (typeof numeric === "number" && Number.isFinite(numeric)) {
2088
+ formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
2089
+ } else {
2090
+ formatted = raw;
2091
+ }
2092
+ break;
2093
+ }
2094
+ case "number":
2095
+ formatted = raw;
2096
+ break;
2097
+ case "passFail":
2098
+ formatted = colorize(
2099
+ raw,
2100
+ item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
2101
+ );
2102
+ break;
2103
+ }
2046
2104
  }
2105
+ scoreLines.push(` ${scoreLabel}: ${formatted}`);
2106
+ }
2107
+ const lines = [];
2108
+ const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
2109
+ lines.push(` ${name}: ${passLabel}${metricStr}`);
2110
+ if (scoreLines.length > 0) {
2111
+ lines.push(...scoreLines);
2112
+ } else {
2113
+ lines.push(` n/a`);
2047
2114
  }
2048
- return line;
2115
+ return lines;
2049
2116
  }
2050
2117
  async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
2051
2118
  const dataset = await runner.resolveDatasetByName(datasetName);
@@ -2068,6 +2135,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2068
2135
  evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
2069
2136
  );
2070
2137
  const aggregates = /* @__PURE__ */ new Map();
2138
+ const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
2071
2139
  const testCaseByTestId = /* @__PURE__ */ new Map();
2072
2140
  let overallScoreTotal = 0;
2073
2141
  let overallScoreSumSq = 0;
@@ -2143,6 +2211,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2143
2211
  overallScoreSumSq += numeric * numeric;
2144
2212
  overallScoreCount += 1;
2145
2213
  }
2214
+ for (const s of item.scores) {
2215
+ const key = `${item.evaluatorId}:${s.id}`;
2216
+ const list = scoreItemsByEvaluatorScore.get(key) ?? [];
2217
+ list.push(s);
2218
+ scoreItemsByEvaluatorScore.set(key, list);
2219
+ }
2146
2220
  }
2147
2221
  const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2148
2222
  const isLastRerun = event.rerunIndex >= event.rerunTotal;
@@ -2166,7 +2240,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2166
2240
  for (const item of aggregatedScores) {
2167
2241
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2168
2242
  lines.push(
2169
- formatEvaluatorScoreLine(
2243
+ ...formatEvaluatorScoreLine(
2170
2244
  name,
2171
2245
  item.scores,
2172
2246
  item.passed,
@@ -2263,9 +2337,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2263
2337
  }
2264
2338
  console.log(colorize("- evaluator averages:", ansi2.magenta));
2265
2339
  for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
2266
- console.log(
2267
- getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
2340
+ const evaluatorLines = getEvaluatorSummaryLines(
2341
+ evaluatorId,
2342
+ evaluatorName,
2343
+ aggregates.get(evaluatorId),
2344
+ scoreItemsByEvaluatorScore
2268
2345
  );
2346
+ for (const line of evaluatorLines) {
2347
+ console.log(line);
2348
+ }
2269
2349
  }
2270
2350
  const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
2271
2351
  if (testCaseSummaries.length > 0) {