@m4trix/evals 0.14.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +164 -84
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +164 -84
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +52 -18
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +52 -18
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -1518,6 +1518,7 @@ function RunView({
|
|
|
1518
1518
|
);
|
|
1519
1519
|
setEvaluatorNameById(nameById);
|
|
1520
1520
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1521
|
+
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
1521
1522
|
let overallScoreTotal = 0;
|
|
1522
1523
|
let overallScoreSumSq = 0;
|
|
1523
1524
|
let overallScoreCount = 0;
|
|
@@ -1547,6 +1548,12 @@ function RunView({
|
|
|
1547
1548
|
overallScoreSumSq += numeric * numeric;
|
|
1548
1549
|
overallScoreCount += 1;
|
|
1549
1550
|
}
|
|
1551
|
+
for (const s of item.scores) {
|
|
1552
|
+
const key = `${item.evaluatorId}:${s.id}`;
|
|
1553
|
+
const list = scoreItemsByEvaluatorScore.get(key) ?? [];
|
|
1554
|
+
list.push(s);
|
|
1555
|
+
scoreItemsByEvaluatorScore.set(key, list);
|
|
1556
|
+
}
|
|
1550
1557
|
}
|
|
1551
1558
|
setTestCases((prev) => {
|
|
1552
1559
|
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
@@ -1617,6 +1624,7 @@ function RunView({
|
|
|
1617
1624
|
overallScoreSumSq,
|
|
1618
1625
|
overallScoreCount,
|
|
1619
1626
|
aggregates: new Map(aggregates),
|
|
1627
|
+
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1620
1628
|
artifactPath: finalEvent.artifactPath
|
|
1621
1629
|
});
|
|
1622
1630
|
setPhase("completed");
|
|
@@ -1699,36 +1707,45 @@ function RunView({
|
|
|
1699
1707
|
":",
|
|
1700
1708
|
" ",
|
|
1701
1709
|
/* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1702
|
-
|
|
1703
|
-
|
|
1710
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
1711
|
+
" ",
|
|
1712
|
+
item.metrics.map((m) => {
|
|
1713
|
+
const def = getMetricById(m.id);
|
|
1714
|
+
if (!def)
|
|
1715
|
+
return null;
|
|
1716
|
+
const formatted = def.format(m.data, {
|
|
1717
|
+
isAggregated: tc.isAggregated
|
|
1718
|
+
});
|
|
1719
|
+
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1720
|
+
"[",
|
|
1721
|
+
def.name ? `${def.name}: ` : "",
|
|
1722
|
+
formatted,
|
|
1723
|
+
"]",
|
|
1724
|
+
" "
|
|
1725
|
+
] }, m.id);
|
|
1726
|
+
})
|
|
1727
|
+
] }) : null
|
|
1728
|
+
] }),
|
|
1729
|
+
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
1730
|
+
const def = getScoreById(s.id);
|
|
1731
|
+
const scoreLabel = def ? def.name ?? def.id : s.id;
|
|
1732
|
+
return /* @__PURE__ */ jsxs(
|
|
1704
1733
|
Text,
|
|
1705
1734
|
{
|
|
1706
1735
|
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
1707
1736
|
children: [
|
|
1737
|
+
" ",
|
|
1738
|
+
scoreLabel,
|
|
1739
|
+
":",
|
|
1740
|
+
" ",
|
|
1708
1741
|
formatScorePart(s, scoreColor, {
|
|
1709
1742
|
isAggregated: tc.isAggregated
|
|
1710
|
-
})
|
|
1711
|
-
" "
|
|
1743
|
+
})
|
|
1712
1744
|
]
|
|
1713
1745
|
},
|
|
1714
|
-
s.id
|
|
1715
|
-
)
|
|
1716
|
-
|
|
1717
|
-
const def = getMetricById(m.id);
|
|
1718
|
-
if (!def)
|
|
1719
|
-
return null;
|
|
1720
|
-
const formatted = def.format(m.data, {
|
|
1721
|
-
isAggregated: tc.isAggregated
|
|
1722
|
-
});
|
|
1723
|
-
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1724
|
-
"[",
|
|
1725
|
-
def.name ? `${def.name}: ` : "",
|
|
1726
|
-
formatted,
|
|
1727
|
-
"]",
|
|
1728
|
-
" "
|
|
1729
|
-
] }, m.id);
|
|
1730
|
-
})
|
|
1731
|
-
] }),
|
|
1746
|
+
`${item.evaluatorId}-${s.id}-${idx}`
|
|
1747
|
+
);
|
|
1748
|
+
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
|
|
1732
1749
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1733
1750
|
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1734
1751
|
({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
|
|
@@ -1786,26 +1803,54 @@ function RunView({
|
|
|
1786
1803
|
/* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
|
|
1787
1804
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
1788
1805
|
const agg = summary.aggregates.get(id);
|
|
1789
|
-
|
|
1806
|
+
const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
|
|
1807
|
+
(k) => k.startsWith(`${id}:`)
|
|
1808
|
+
);
|
|
1809
|
+
if (scoreKeys.length === 0) {
|
|
1790
1810
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1791
1811
|
"- ",
|
|
1792
1812
|
name.padEnd(28),
|
|
1793
|
-
" no
|
|
1813
|
+
" no scores"
|
|
1794
1814
|
] }, id);
|
|
1795
1815
|
}
|
|
1796
|
-
const
|
|
1797
|
-
const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
|
|
1798
|
-
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1799
|
-
return /* @__PURE__ */ jsxs(Text, { children: [
|
|
1800
|
-
"- ",
|
|
1801
|
-
name.padEnd(28),
|
|
1802
|
-
" avg=",
|
|
1803
|
-
/* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: meanStr }),
|
|
1816
|
+
const passedFailed = agg != null ? /* @__PURE__ */ jsxs(Text, { children: [
|
|
1804
1817
|
" ",
|
|
1805
1818
|
"passed=",
|
|
1806
1819
|
agg.passed,
|
|
1807
1820
|
" failed=",
|
|
1808
1821
|
agg.failed
|
|
1822
|
+
] }) : null;
|
|
1823
|
+
return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
|
|
1824
|
+
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1825
|
+
"- ",
|
|
1826
|
+
name.padEnd(28),
|
|
1827
|
+
passedFailed
|
|
1828
|
+
] }),
|
|
1829
|
+
scoreKeys.map((key) => {
|
|
1830
|
+
const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
|
|
1831
|
+
const aggregated = aggregateScoreItems(items);
|
|
1832
|
+
if (!aggregated)
|
|
1833
|
+
return null;
|
|
1834
|
+
const def = getScoreById(aggregated.id);
|
|
1835
|
+
const label = def ? def.name ?? def.id : aggregated.id;
|
|
1836
|
+
const formatted = def?.format(aggregated.data, {
|
|
1837
|
+
isAggregated: true
|
|
1838
|
+
}) ?? "n/a";
|
|
1839
|
+
const numeric = toNumericScore(aggregated.data);
|
|
1840
|
+
return /* @__PURE__ */ jsxs(
|
|
1841
|
+
Text,
|
|
1842
|
+
{
|
|
1843
|
+
color: numeric !== void 0 ? scoreColor(numeric) : "gray",
|
|
1844
|
+
children: [
|
|
1845
|
+
" ",
|
|
1846
|
+
label,
|
|
1847
|
+
": ",
|
|
1848
|
+
formatted
|
|
1849
|
+
]
|
|
1850
|
+
},
|
|
1851
|
+
key
|
|
1852
|
+
);
|
|
1853
|
+
})
|
|
1809
1854
|
] }, id);
|
|
1810
1855
|
})
|
|
1811
1856
|
] }),
|
|
@@ -1933,14 +1978,36 @@ function scoreToColor(score) {
|
|
|
1933
1978
|
}
|
|
1934
1979
|
return ansi2.red;
|
|
1935
1980
|
}
|
|
1936
|
-
function
|
|
1937
|
-
|
|
1938
|
-
|
|
1981
|
+
function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
|
|
1982
|
+
const lines = [];
|
|
1983
|
+
const scoreKeys = [...scoreItemsByKey.keys()].filter(
|
|
1984
|
+
(k) => k.startsWith(`${evaluatorId}:`)
|
|
1985
|
+
);
|
|
1986
|
+
if (scoreKeys.length === 0) {
|
|
1987
|
+
lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
|
|
1988
|
+
return lines;
|
|
1989
|
+
}
|
|
1990
|
+
const passedFailed = aggregate != null ? ` passed=${aggregate.passed} failed=${aggregate.failed}` : "";
|
|
1991
|
+
const scoreLines = [];
|
|
1992
|
+
for (const key of scoreKeys) {
|
|
1993
|
+
const items = scoreItemsByKey.get(key) ?? [];
|
|
1994
|
+
const agg = aggregateScoreItems(items);
|
|
1995
|
+
if (!agg)
|
|
1996
|
+
continue;
|
|
1997
|
+
const def = getScoreById(agg.id);
|
|
1998
|
+
const label = def ? def.name ?? def.id : agg.id;
|
|
1999
|
+
const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
|
|
2000
|
+
const numeric = toNumericScore(agg.data);
|
|
2001
|
+
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
2002
|
+
scoreLines.push(` ${label}: ${colored}`);
|
|
2003
|
+
}
|
|
2004
|
+
if (scoreLines.length > 0) {
|
|
2005
|
+
lines.push(`- ${evaluatorName.padEnd(28)}${passedFailed}`);
|
|
2006
|
+
lines.push(...scoreLines);
|
|
2007
|
+
} else {
|
|
2008
|
+
lines.push(`- ${evaluatorName.padEnd(28)} no numeric scores${passedFailed}`);
|
|
1939
2009
|
}
|
|
1940
|
-
|
|
1941
|
-
const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
|
|
1942
|
-
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1943
|
-
return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
2010
|
+
return lines;
|
|
1944
2011
|
}
|
|
1945
2012
|
function createBar2(value, max = 100, width = 20) {
|
|
1946
2013
|
const safe = Math.max(0, Math.min(max, value));
|
|
@@ -1992,46 +2059,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
|
|
|
1992
2059
|
}
|
|
1993
2060
|
function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
1994
2061
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
1995
|
-
const
|
|
1996
|
-
for (const item of scores) {
|
|
1997
|
-
const def = getScoreById(item.id);
|
|
1998
|
-
if (!def) {
|
|
1999
|
-
const numeric = toNumericScore(item.data);
|
|
2000
|
-
scoreParts.push(
|
|
2001
|
-
numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
|
|
2002
|
-
);
|
|
2003
|
-
continue;
|
|
2004
|
-
}
|
|
2005
|
-
const formatted = def.format(item.data, options);
|
|
2006
|
-
switch (def.displayStrategy) {
|
|
2007
|
-
case "bar": {
|
|
2008
|
-
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2009
|
-
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
2010
|
-
scoreParts.push(
|
|
2011
|
-
`${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
|
|
2012
|
-
);
|
|
2013
|
-
} else {
|
|
2014
|
-
scoreParts.push(formatted);
|
|
2015
|
-
}
|
|
2016
|
-
break;
|
|
2017
|
-
}
|
|
2018
|
-
case "number":
|
|
2019
|
-
scoreParts.push(formatted);
|
|
2020
|
-
break;
|
|
2021
|
-
case "passFail":
|
|
2022
|
-
scoreParts.push(
|
|
2023
|
-
colorize(
|
|
2024
|
-
formatted,
|
|
2025
|
-
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
2026
|
-
)
|
|
2027
|
-
);
|
|
2028
|
-
break;
|
|
2029
|
-
}
|
|
2030
|
-
}
|
|
2031
|
-
const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
|
|
2032
|
-
let line = ` ${name}: ${passLabel} ${scoreStr}`;
|
|
2062
|
+
const metricParts = [];
|
|
2033
2063
|
if (metrics && metrics.length > 0) {
|
|
2034
|
-
const metricParts = [];
|
|
2035
2064
|
for (const { id, data } of metrics) {
|
|
2036
2065
|
const def = getMetricById(id);
|
|
2037
2066
|
if (def) {
|
|
@@ -2041,11 +2070,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2041
2070
|
);
|
|
2042
2071
|
}
|
|
2043
2072
|
}
|
|
2044
|
-
|
|
2045
|
-
|
|
2073
|
+
}
|
|
2074
|
+
const scoreLines = [];
|
|
2075
|
+
for (const item of scores) {
|
|
2076
|
+
const def = getScoreById(item.id);
|
|
2077
|
+
const scoreLabel = def ? def.name ?? def.id : item.id;
|
|
2078
|
+
let formatted;
|
|
2079
|
+
if (!def) {
|
|
2080
|
+
const numeric = toNumericScore(item.data);
|
|
2081
|
+
formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
|
|
2082
|
+
} else {
|
|
2083
|
+
const raw = def.format(item.data, options);
|
|
2084
|
+
switch (def.displayStrategy) {
|
|
2085
|
+
case "bar": {
|
|
2086
|
+
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2087
|
+
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
2088
|
+
formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
|
|
2089
|
+
} else {
|
|
2090
|
+
formatted = raw;
|
|
2091
|
+
}
|
|
2092
|
+
break;
|
|
2093
|
+
}
|
|
2094
|
+
case "number":
|
|
2095
|
+
formatted = raw;
|
|
2096
|
+
break;
|
|
2097
|
+
case "passFail":
|
|
2098
|
+
formatted = colorize(
|
|
2099
|
+
raw,
|
|
2100
|
+
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
2101
|
+
);
|
|
2102
|
+
break;
|
|
2103
|
+
}
|
|
2046
2104
|
}
|
|
2105
|
+
scoreLines.push(` ${scoreLabel}: ${formatted}`);
|
|
2106
|
+
}
|
|
2107
|
+
const lines = [];
|
|
2108
|
+
const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
|
|
2109
|
+
lines.push(` ${name}: ${passLabel}${metricStr}`);
|
|
2110
|
+
if (scoreLines.length > 0) {
|
|
2111
|
+
lines.push(...scoreLines);
|
|
2112
|
+
} else {
|
|
2113
|
+
lines.push(` n/a`);
|
|
2047
2114
|
}
|
|
2048
|
-
return
|
|
2115
|
+
return lines;
|
|
2049
2116
|
}
|
|
2050
2117
|
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
2051
2118
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
@@ -2068,6 +2135,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2068
2135
|
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
2069
2136
|
);
|
|
2070
2137
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2138
|
+
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
2071
2139
|
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
2072
2140
|
let overallScoreTotal = 0;
|
|
2073
2141
|
let overallScoreSumSq = 0;
|
|
@@ -2143,6 +2211,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2143
2211
|
overallScoreSumSq += numeric * numeric;
|
|
2144
2212
|
overallScoreCount += 1;
|
|
2145
2213
|
}
|
|
2214
|
+
for (const s of item.scores) {
|
|
2215
|
+
const key = `${item.evaluatorId}:${s.id}`;
|
|
2216
|
+
const list = scoreItemsByEvaluatorScore.get(key) ?? [];
|
|
2217
|
+
list.push(s);
|
|
2218
|
+
scoreItemsByEvaluatorScore.set(key, list);
|
|
2219
|
+
}
|
|
2146
2220
|
}
|
|
2147
2221
|
const isSameTestCase = lastPrintedTestCaseId === testCaseId;
|
|
2148
2222
|
const isLastRerun = event.rerunIndex >= event.rerunTotal;
|
|
@@ -2166,7 +2240,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2166
2240
|
for (const item of aggregatedScores) {
|
|
2167
2241
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2168
2242
|
lines.push(
|
|
2169
|
-
formatEvaluatorScoreLine(
|
|
2243
|
+
...formatEvaluatorScoreLine(
|
|
2170
2244
|
name,
|
|
2171
2245
|
item.scores,
|
|
2172
2246
|
item.passed,
|
|
@@ -2263,9 +2337,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2263
2337
|
}
|
|
2264
2338
|
console.log(colorize("- evaluator averages:", ansi2.magenta));
|
|
2265
2339
|
for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
|
|
2266
|
-
|
|
2267
|
-
|
|
2340
|
+
const evaluatorLines = getEvaluatorSummaryLines(
|
|
2341
|
+
evaluatorId,
|
|
2342
|
+
evaluatorName,
|
|
2343
|
+
aggregates.get(evaluatorId),
|
|
2344
|
+
scoreItemsByEvaluatorScore
|
|
2268
2345
|
);
|
|
2346
|
+
for (const line of evaluatorLines) {
|
|
2347
|
+
console.log(line);
|
|
2348
|
+
}
|
|
2269
2349
|
}
|
|
2270
2350
|
const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
|
|
2271
2351
|
if (testCaseSummaries.length > 0) {
|