@m4trix/evals 0.14.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +164 -84
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +164 -84
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +52 -18
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +52 -18
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -1544,6 +1544,7 @@ function RunView({
|
|
|
1544
1544
|
);
|
|
1545
1545
|
setEvaluatorNameById(nameById);
|
|
1546
1546
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1547
|
+
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
1547
1548
|
let overallScoreTotal = 0;
|
|
1548
1549
|
let overallScoreSumSq = 0;
|
|
1549
1550
|
let overallScoreCount = 0;
|
|
@@ -1573,6 +1574,12 @@ function RunView({
|
|
|
1573
1574
|
overallScoreSumSq += numeric * numeric;
|
|
1574
1575
|
overallScoreCount += 1;
|
|
1575
1576
|
}
|
|
1577
|
+
for (const s of item.scores) {
|
|
1578
|
+
const key = `${item.evaluatorId}:${s.id}`;
|
|
1579
|
+
const list = scoreItemsByEvaluatorScore.get(key) ?? [];
|
|
1580
|
+
list.push(s);
|
|
1581
|
+
scoreItemsByEvaluatorScore.set(key, list);
|
|
1582
|
+
}
|
|
1576
1583
|
}
|
|
1577
1584
|
setTestCases((prev) => {
|
|
1578
1585
|
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
@@ -1643,6 +1650,7 @@ function RunView({
|
|
|
1643
1650
|
overallScoreSumSq,
|
|
1644
1651
|
overallScoreCount,
|
|
1645
1652
|
aggregates: new Map(aggregates),
|
|
1653
|
+
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1646
1654
|
artifactPath: finalEvent.artifactPath
|
|
1647
1655
|
});
|
|
1648
1656
|
setPhase("completed");
|
|
@@ -1725,36 +1733,45 @@ function RunView({
|
|
|
1725
1733
|
":",
|
|
1726
1734
|
" ",
|
|
1727
1735
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1728
|
-
|
|
1729
|
-
|
|
1736
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
1737
|
+
" ",
|
|
1738
|
+
item.metrics.map((m) => {
|
|
1739
|
+
const def = getMetricById(m.id);
|
|
1740
|
+
if (!def)
|
|
1741
|
+
return null;
|
|
1742
|
+
const formatted = def.format(m.data, {
|
|
1743
|
+
isAggregated: tc.isAggregated
|
|
1744
|
+
});
|
|
1745
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1746
|
+
"[",
|
|
1747
|
+
def.name ? `${def.name}: ` : "",
|
|
1748
|
+
formatted,
|
|
1749
|
+
"]",
|
|
1750
|
+
" "
|
|
1751
|
+
] }, m.id);
|
|
1752
|
+
})
|
|
1753
|
+
] }) : null
|
|
1754
|
+
] }),
|
|
1755
|
+
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
1756
|
+
const def = getScoreById(s.id);
|
|
1757
|
+
const scoreLabel = def ? def.name ?? def.id : s.id;
|
|
1758
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1730
1759
|
ink.Text,
|
|
1731
1760
|
{
|
|
1732
1761
|
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
1733
1762
|
children: [
|
|
1763
|
+
" ",
|
|
1764
|
+
scoreLabel,
|
|
1765
|
+
":",
|
|
1766
|
+
" ",
|
|
1734
1767
|
formatScorePart(s, scoreColor, {
|
|
1735
1768
|
isAggregated: tc.isAggregated
|
|
1736
|
-
})
|
|
1737
|
-
" "
|
|
1769
|
+
})
|
|
1738
1770
|
]
|
|
1739
1771
|
},
|
|
1740
|
-
s.id
|
|
1741
|
-
)
|
|
1742
|
-
|
|
1743
|
-
const def = getMetricById(m.id);
|
|
1744
|
-
if (!def)
|
|
1745
|
-
return null;
|
|
1746
|
-
const formatted = def.format(m.data, {
|
|
1747
|
-
isAggregated: tc.isAggregated
|
|
1748
|
-
});
|
|
1749
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1750
|
-
"[",
|
|
1751
|
-
def.name ? `${def.name}: ` : "",
|
|
1752
|
-
formatted,
|
|
1753
|
-
"]",
|
|
1754
|
-
" "
|
|
1755
|
-
] }, m.id);
|
|
1756
|
-
})
|
|
1757
|
-
] }),
|
|
1772
|
+
`${item.evaluatorId}-${s.id}-${idx}`
|
|
1773
|
+
);
|
|
1774
|
+
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
|
|
1758
1775
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1759
1776
|
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1760
1777
|
({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -1812,26 +1829,54 @@ function RunView({
|
|
|
1812
1829
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
|
|
1813
1830
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
1814
1831
|
const agg = summary.aggregates.get(id);
|
|
1815
|
-
|
|
1832
|
+
const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
|
|
1833
|
+
(k) => k.startsWith(`${id}:`)
|
|
1834
|
+
);
|
|
1835
|
+
if (scoreKeys.length === 0) {
|
|
1816
1836
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1817
1837
|
"- ",
|
|
1818
1838
|
name.padEnd(28),
|
|
1819
|
-
" no
|
|
1839
|
+
" no scores"
|
|
1820
1840
|
] }, id);
|
|
1821
1841
|
}
|
|
1822
|
-
const
|
|
1823
|
-
const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
|
|
1824
|
-
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1825
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1826
|
-
"- ",
|
|
1827
|
-
name.padEnd(28),
|
|
1828
|
-
" avg=",
|
|
1829
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: meanStr }),
|
|
1842
|
+
const passedFailed = agg != null ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1830
1843
|
" ",
|
|
1831
1844
|
"passed=",
|
|
1832
1845
|
agg.passed,
|
|
1833
1846
|
" failed=",
|
|
1834
1847
|
agg.failed
|
|
1848
|
+
] }) : null;
|
|
1849
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
1850
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1851
|
+
"- ",
|
|
1852
|
+
name.padEnd(28),
|
|
1853
|
+
passedFailed
|
|
1854
|
+
] }),
|
|
1855
|
+
scoreKeys.map((key) => {
|
|
1856
|
+
const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
|
|
1857
|
+
const aggregated = aggregateScoreItems(items);
|
|
1858
|
+
if (!aggregated)
|
|
1859
|
+
return null;
|
|
1860
|
+
const def = getScoreById(aggregated.id);
|
|
1861
|
+
const label = def ? def.name ?? def.id : aggregated.id;
|
|
1862
|
+
const formatted = def?.format(aggregated.data, {
|
|
1863
|
+
isAggregated: true
|
|
1864
|
+
}) ?? "n/a";
|
|
1865
|
+
const numeric = toNumericScore(aggregated.data);
|
|
1866
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1867
|
+
ink.Text,
|
|
1868
|
+
{
|
|
1869
|
+
color: numeric !== void 0 ? scoreColor(numeric) : "gray",
|
|
1870
|
+
children: [
|
|
1871
|
+
" ",
|
|
1872
|
+
label,
|
|
1873
|
+
": ",
|
|
1874
|
+
formatted
|
|
1875
|
+
]
|
|
1876
|
+
},
|
|
1877
|
+
key
|
|
1878
|
+
);
|
|
1879
|
+
})
|
|
1835
1880
|
] }, id);
|
|
1836
1881
|
})
|
|
1837
1882
|
] }),
|
|
@@ -1959,14 +2004,36 @@ function scoreToColor(score) {
|
|
|
1959
2004
|
}
|
|
1960
2005
|
return ansi2.red;
|
|
1961
2006
|
}
|
|
1962
|
-
function
|
|
1963
|
-
|
|
1964
|
-
|
|
2007
|
+
function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
|
|
2008
|
+
const lines = [];
|
|
2009
|
+
const scoreKeys = [...scoreItemsByKey.keys()].filter(
|
|
2010
|
+
(k) => k.startsWith(`${evaluatorId}:`)
|
|
2011
|
+
);
|
|
2012
|
+
if (scoreKeys.length === 0) {
|
|
2013
|
+
lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
|
|
2014
|
+
return lines;
|
|
2015
|
+
}
|
|
2016
|
+
const passedFailed = aggregate != null ? ` passed=${aggregate.passed} failed=${aggregate.failed}` : "";
|
|
2017
|
+
const scoreLines = [];
|
|
2018
|
+
for (const key of scoreKeys) {
|
|
2019
|
+
const items = scoreItemsByKey.get(key) ?? [];
|
|
2020
|
+
const agg = aggregateScoreItems(items);
|
|
2021
|
+
if (!agg)
|
|
2022
|
+
continue;
|
|
2023
|
+
const def = getScoreById(agg.id);
|
|
2024
|
+
const label = def ? def.name ?? def.id : agg.id;
|
|
2025
|
+
const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
|
|
2026
|
+
const numeric = toNumericScore(agg.data);
|
|
2027
|
+
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
2028
|
+
scoreLines.push(` ${label}: ${colored}`);
|
|
2029
|
+
}
|
|
2030
|
+
if (scoreLines.length > 0) {
|
|
2031
|
+
lines.push(`- ${evaluatorName.padEnd(28)}${passedFailed}`);
|
|
2032
|
+
lines.push(...scoreLines);
|
|
2033
|
+
} else {
|
|
2034
|
+
lines.push(`- ${evaluatorName.padEnd(28)} no numeric scores${passedFailed}`);
|
|
1965
2035
|
}
|
|
1966
|
-
|
|
1967
|
-
const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
|
|
1968
|
-
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1969
|
-
return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
2036
|
+
return lines;
|
|
1970
2037
|
}
|
|
1971
2038
|
function createBar2(value, max = 100, width = 20) {
|
|
1972
2039
|
const safe = Math.max(0, Math.min(max, value));
|
|
@@ -2018,46 +2085,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
|
|
|
2018
2085
|
}
|
|
2019
2086
|
function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
2020
2087
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
2021
|
-
const
|
|
2022
|
-
for (const item of scores) {
|
|
2023
|
-
const def = getScoreById(item.id);
|
|
2024
|
-
if (!def) {
|
|
2025
|
-
const numeric = toNumericScore(item.data);
|
|
2026
|
-
scoreParts.push(
|
|
2027
|
-
numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
|
|
2028
|
-
);
|
|
2029
|
-
continue;
|
|
2030
|
-
}
|
|
2031
|
-
const formatted = def.format(item.data, options);
|
|
2032
|
-
switch (def.displayStrategy) {
|
|
2033
|
-
case "bar": {
|
|
2034
|
-
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2035
|
-
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
2036
|
-
scoreParts.push(
|
|
2037
|
-
`${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
|
|
2038
|
-
);
|
|
2039
|
-
} else {
|
|
2040
|
-
scoreParts.push(formatted);
|
|
2041
|
-
}
|
|
2042
|
-
break;
|
|
2043
|
-
}
|
|
2044
|
-
case "number":
|
|
2045
|
-
scoreParts.push(formatted);
|
|
2046
|
-
break;
|
|
2047
|
-
case "passFail":
|
|
2048
|
-
scoreParts.push(
|
|
2049
|
-
colorize(
|
|
2050
|
-
formatted,
|
|
2051
|
-
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
2052
|
-
)
|
|
2053
|
-
);
|
|
2054
|
-
break;
|
|
2055
|
-
}
|
|
2056
|
-
}
|
|
2057
|
-
const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
|
|
2058
|
-
let line = ` ${name}: ${passLabel} ${scoreStr}`;
|
|
2088
|
+
const metricParts = [];
|
|
2059
2089
|
if (metrics && metrics.length > 0) {
|
|
2060
|
-
const metricParts = [];
|
|
2061
2090
|
for (const { id, data } of metrics) {
|
|
2062
2091
|
const def = getMetricById(id);
|
|
2063
2092
|
if (def) {
|
|
@@ -2067,11 +2096,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2067
2096
|
);
|
|
2068
2097
|
}
|
|
2069
2098
|
}
|
|
2070
|
-
|
|
2071
|
-
|
|
2099
|
+
}
|
|
2100
|
+
const scoreLines = [];
|
|
2101
|
+
for (const item of scores) {
|
|
2102
|
+
const def = getScoreById(item.id);
|
|
2103
|
+
const scoreLabel = def ? def.name ?? def.id : item.id;
|
|
2104
|
+
let formatted;
|
|
2105
|
+
if (!def) {
|
|
2106
|
+
const numeric = toNumericScore(item.data);
|
|
2107
|
+
formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
|
|
2108
|
+
} else {
|
|
2109
|
+
const raw = def.format(item.data, options);
|
|
2110
|
+
switch (def.displayStrategy) {
|
|
2111
|
+
case "bar": {
|
|
2112
|
+
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2113
|
+
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
2114
|
+
formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
|
|
2115
|
+
} else {
|
|
2116
|
+
formatted = raw;
|
|
2117
|
+
}
|
|
2118
|
+
break;
|
|
2119
|
+
}
|
|
2120
|
+
case "number":
|
|
2121
|
+
formatted = raw;
|
|
2122
|
+
break;
|
|
2123
|
+
case "passFail":
|
|
2124
|
+
formatted = colorize(
|
|
2125
|
+
raw,
|
|
2126
|
+
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
2127
|
+
);
|
|
2128
|
+
break;
|
|
2129
|
+
}
|
|
2072
2130
|
}
|
|
2131
|
+
scoreLines.push(` ${scoreLabel}: ${formatted}`);
|
|
2132
|
+
}
|
|
2133
|
+
const lines = [];
|
|
2134
|
+
const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
|
|
2135
|
+
lines.push(` ${name}: ${passLabel}${metricStr}`);
|
|
2136
|
+
if (scoreLines.length > 0) {
|
|
2137
|
+
lines.push(...scoreLines);
|
|
2138
|
+
} else {
|
|
2139
|
+
lines.push(` n/a`);
|
|
2073
2140
|
}
|
|
2074
|
-
return
|
|
2141
|
+
return lines;
|
|
2075
2142
|
}
|
|
2076
2143
|
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
2077
2144
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
@@ -2094,6 +2161,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2094
2161
|
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
2095
2162
|
);
|
|
2096
2163
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2164
|
+
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
2097
2165
|
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
2098
2166
|
let overallScoreTotal = 0;
|
|
2099
2167
|
let overallScoreSumSq = 0;
|
|
@@ -2169,6 +2237,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2169
2237
|
overallScoreSumSq += numeric * numeric;
|
|
2170
2238
|
overallScoreCount += 1;
|
|
2171
2239
|
}
|
|
2240
|
+
for (const s of item.scores) {
|
|
2241
|
+
const key = `${item.evaluatorId}:${s.id}`;
|
|
2242
|
+
const list = scoreItemsByEvaluatorScore.get(key) ?? [];
|
|
2243
|
+
list.push(s);
|
|
2244
|
+
scoreItemsByEvaluatorScore.set(key, list);
|
|
2245
|
+
}
|
|
2172
2246
|
}
|
|
2173
2247
|
const isSameTestCase = lastPrintedTestCaseId === testCaseId;
|
|
2174
2248
|
const isLastRerun = event.rerunIndex >= event.rerunTotal;
|
|
@@ -2192,7 +2266,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2192
2266
|
for (const item of aggregatedScores) {
|
|
2193
2267
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2194
2268
|
lines.push(
|
|
2195
|
-
formatEvaluatorScoreLine(
|
|
2269
|
+
...formatEvaluatorScoreLine(
|
|
2196
2270
|
name,
|
|
2197
2271
|
item.scores,
|
|
2198
2272
|
item.passed,
|
|
@@ -2289,9 +2363,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2289
2363
|
}
|
|
2290
2364
|
console.log(colorize("- evaluator averages:", ansi2.magenta));
|
|
2291
2365
|
for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
|
|
2292
|
-
|
|
2293
|
-
|
|
2366
|
+
const evaluatorLines = getEvaluatorSummaryLines(
|
|
2367
|
+
evaluatorId,
|
|
2368
|
+
evaluatorName,
|
|
2369
|
+
aggregates.get(evaluatorId),
|
|
2370
|
+
scoreItemsByEvaluatorScore
|
|
2294
2371
|
);
|
|
2372
|
+
for (const line of evaluatorLines) {
|
|
2373
|
+
console.log(line);
|
|
2374
|
+
}
|
|
2295
2375
|
}
|
|
2296
2376
|
const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
|
|
2297
2377
|
if (testCaseSummaries.length > 0) {
|