@m4trix/evals 0.14.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +200 -111
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +200 -111
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +100 -44
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +100 -44
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +35 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +49 -4
- package/dist/index.js +34 -28
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli-simple.js
CHANGED
|
@@ -6,7 +6,7 @@ import { resolve, relative, join, parse, dirname } from 'path';
|
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
7
7
|
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
|
-
import {
|
|
9
|
+
import { diffString } from 'json-diff';
|
|
10
10
|
import React2, { useState, useEffect, useCallback } from 'react';
|
|
11
11
|
import { render, Box, Text } from 'ink';
|
|
12
12
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
@@ -260,45 +260,46 @@ async function collectTestCasesFromFiles(config) {
|
|
|
260
260
|
);
|
|
261
261
|
return found.flat();
|
|
262
262
|
}
|
|
263
|
-
function
|
|
263
|
+
function createDiffString(expected, actual, diffOptions) {
|
|
264
|
+
const opts = { ...diffOptions, color: false };
|
|
265
|
+
const result = diffString(expected, actual, opts);
|
|
266
|
+
return typeof result === "string" ? result : "";
|
|
267
|
+
}
|
|
268
|
+
function formatLogMessage(msg) {
|
|
269
|
+
if (typeof msg === "string")
|
|
270
|
+
return msg;
|
|
264
271
|
try {
|
|
265
|
-
|
|
272
|
+
if (msg !== null && typeof msg === "object") {
|
|
273
|
+
return JSON.stringify(msg, null, 2);
|
|
274
|
+
}
|
|
275
|
+
return String(msg);
|
|
266
276
|
} catch {
|
|
267
|
-
return String(
|
|
277
|
+
return String(msg);
|
|
268
278
|
}
|
|
269
279
|
}
|
|
270
|
-
function
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
partLines.pop();
|
|
277
|
-
}
|
|
278
|
-
for (const line of partLines) {
|
|
279
|
-
lines.push(`${prefix} ${line}`);
|
|
280
|
-
}
|
|
281
|
-
}
|
|
282
|
-
return lines.join("\n");
|
|
280
|
+
function createLogEntry(message, options) {
|
|
281
|
+
return {
|
|
282
|
+
type: "log",
|
|
283
|
+
label: options?.label,
|
|
284
|
+
message: formatLogMessage(message)
|
|
285
|
+
};
|
|
283
286
|
}
|
|
284
|
-
function
|
|
285
|
-
|
|
286
|
-
const actualStr = toJsonLines(actual);
|
|
287
|
-
const changes = diffLines(expectedStr, actualStr);
|
|
288
|
-
return formatDiffString(changes);
|
|
287
|
+
function getLogLines(entry) {
|
|
288
|
+
return entry.message.split("\n");
|
|
289
289
|
}
|
|
290
290
|
function createDiffLogEntry(expected, actual, options) {
|
|
291
|
-
const
|
|
291
|
+
const { label, ...diffOpts } = options ?? {};
|
|
292
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
292
293
|
return {
|
|
293
294
|
type: "diff",
|
|
294
|
-
label
|
|
295
|
+
label,
|
|
295
296
|
expected,
|
|
296
297
|
actual,
|
|
297
298
|
diff: diff || "(no differences)"
|
|
298
299
|
};
|
|
299
300
|
}
|
|
300
301
|
function getDiffLines(entry) {
|
|
301
|
-
const raw =
|
|
302
|
+
const raw = entry.diff || "(no differences)";
|
|
302
303
|
return raw.split("\n").map((line) => {
|
|
303
304
|
const trimmed = line.trimStart();
|
|
304
305
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -574,6 +575,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
574
575
|
const logDiff = (expected, actual, options) => {
|
|
575
576
|
logs.push(createDiffLogEntry(expected, actual, options));
|
|
576
577
|
};
|
|
578
|
+
const log = (message, options) => {
|
|
579
|
+
logs.push(createLogEntry(message, options));
|
|
580
|
+
};
|
|
577
581
|
const ctx = yield* Effect.promise(
|
|
578
582
|
() => Promise.resolve(evaluator.resolveContext())
|
|
579
583
|
);
|
|
@@ -583,7 +587,8 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
583
587
|
input: testCaseItem.testCase.getInput(),
|
|
584
588
|
ctx,
|
|
585
589
|
output,
|
|
586
|
-
logDiff
|
|
590
|
+
logDiff,
|
|
591
|
+
log
|
|
587
592
|
})
|
|
588
593
|
)
|
|
589
594
|
);
|
|
@@ -1518,6 +1523,7 @@ function RunView({
|
|
|
1518
1523
|
);
|
|
1519
1524
|
setEvaluatorNameById(nameById);
|
|
1520
1525
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1526
|
+
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
1521
1527
|
let overallScoreTotal = 0;
|
|
1522
1528
|
let overallScoreSumSq = 0;
|
|
1523
1529
|
let overallScoreCount = 0;
|
|
@@ -1547,6 +1553,12 @@ function RunView({
|
|
|
1547
1553
|
overallScoreSumSq += numeric * numeric;
|
|
1548
1554
|
overallScoreCount += 1;
|
|
1549
1555
|
}
|
|
1556
|
+
for (const s of item.scores) {
|
|
1557
|
+
const key = `${item.evaluatorId}:${s.id}`;
|
|
1558
|
+
const list = scoreItemsByEvaluatorScore.get(key) ?? [];
|
|
1559
|
+
list.push(s);
|
|
1560
|
+
scoreItemsByEvaluatorScore.set(key, list);
|
|
1561
|
+
}
|
|
1550
1562
|
}
|
|
1551
1563
|
setTestCases((prev) => {
|
|
1552
1564
|
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
@@ -1617,6 +1629,7 @@ function RunView({
|
|
|
1617
1629
|
overallScoreSumSq,
|
|
1618
1630
|
overallScoreCount,
|
|
1619
1631
|
aggregates: new Map(aggregates),
|
|
1632
|
+
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1620
1633
|
artifactPath: finalEvent.artifactPath
|
|
1621
1634
|
});
|
|
1622
1635
|
setPhase("completed");
|
|
@@ -1699,36 +1712,45 @@ function RunView({
|
|
|
1699
1712
|
":",
|
|
1700
1713
|
" ",
|
|
1701
1714
|
/* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1702
|
-
|
|
1703
|
-
|
|
1715
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
1716
|
+
" ",
|
|
1717
|
+
item.metrics.map((m) => {
|
|
1718
|
+
const def = getMetricById(m.id);
|
|
1719
|
+
if (!def)
|
|
1720
|
+
return null;
|
|
1721
|
+
const formatted = def.format(m.data, {
|
|
1722
|
+
isAggregated: tc.isAggregated
|
|
1723
|
+
});
|
|
1724
|
+
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1725
|
+
"[",
|
|
1726
|
+
def.name ? `${def.name}: ` : "",
|
|
1727
|
+
formatted,
|
|
1728
|
+
"]",
|
|
1729
|
+
" "
|
|
1730
|
+
] }, m.id);
|
|
1731
|
+
})
|
|
1732
|
+
] }) : null
|
|
1733
|
+
] }),
|
|
1734
|
+
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
1735
|
+
const def = getScoreById(s.id);
|
|
1736
|
+
const scoreLabel = def ? def.name ?? def.id : s.id;
|
|
1737
|
+
return /* @__PURE__ */ jsxs(
|
|
1704
1738
|
Text,
|
|
1705
1739
|
{
|
|
1706
1740
|
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
1707
1741
|
children: [
|
|
1742
|
+
" ",
|
|
1743
|
+
scoreLabel,
|
|
1744
|
+
":",
|
|
1745
|
+
" ",
|
|
1708
1746
|
formatScorePart(s, scoreColor, {
|
|
1709
1747
|
isAggregated: tc.isAggregated
|
|
1710
|
-
})
|
|
1711
|
-
" "
|
|
1748
|
+
})
|
|
1712
1749
|
]
|
|
1713
1750
|
},
|
|
1714
|
-
s.id
|
|
1715
|
-
)
|
|
1716
|
-
|
|
1717
|
-
const def = getMetricById(m.id);
|
|
1718
|
-
if (!def)
|
|
1719
|
-
return null;
|
|
1720
|
-
const formatted = def.format(m.data, {
|
|
1721
|
-
isAggregated: tc.isAggregated
|
|
1722
|
-
});
|
|
1723
|
-
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1724
|
-
"[",
|
|
1725
|
-
def.name ? `${def.name}: ` : "",
|
|
1726
|
-
formatted,
|
|
1727
|
-
"]",
|
|
1728
|
-
" "
|
|
1729
|
-
] }, m.id);
|
|
1730
|
-
})
|
|
1731
|
-
] }),
|
|
1751
|
+
`${item.evaluatorId}-${s.id}-${idx}`
|
|
1752
|
+
);
|
|
1753
|
+
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
|
|
1732
1754
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1733
1755
|
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1734
1756
|
({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
|
|
@@ -1739,7 +1761,7 @@ function RunView({
|
|
|
1739
1761
|
},
|
|
1740
1762
|
lineIdx
|
|
1741
1763
|
)
|
|
1742
|
-
) }, logIdx) : null
|
|
1764
|
+
) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
|
|
1743
1765
|
) })
|
|
1744
1766
|
]
|
|
1745
1767
|
},
|
|
@@ -1786,26 +1808,54 @@ function RunView({
|
|
|
1786
1808
|
/* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
|
|
1787
1809
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
1788
1810
|
const agg = summary.aggregates.get(id);
|
|
1789
|
-
|
|
1811
|
+
const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
|
|
1812
|
+
(k) => k.startsWith(`${id}:`)
|
|
1813
|
+
);
|
|
1814
|
+
if (scoreKeys.length === 0) {
|
|
1790
1815
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1791
1816
|
"- ",
|
|
1792
1817
|
name.padEnd(28),
|
|
1793
|
-
" no
|
|
1818
|
+
" no scores"
|
|
1794
1819
|
] }, id);
|
|
1795
1820
|
}
|
|
1796
|
-
const
|
|
1797
|
-
const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
|
|
1798
|
-
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1799
|
-
return /* @__PURE__ */ jsxs(Text, { children: [
|
|
1800
|
-
"- ",
|
|
1801
|
-
name.padEnd(28),
|
|
1802
|
-
" avg=",
|
|
1803
|
-
/* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: meanStr }),
|
|
1821
|
+
const passedFailed = agg != null ? /* @__PURE__ */ jsxs(Text, { children: [
|
|
1804
1822
|
" ",
|
|
1805
1823
|
"passed=",
|
|
1806
1824
|
agg.passed,
|
|
1807
1825
|
" failed=",
|
|
1808
1826
|
agg.failed
|
|
1827
|
+
] }) : null;
|
|
1828
|
+
return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
|
|
1829
|
+
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1830
|
+
"- ",
|
|
1831
|
+
name.padEnd(28),
|
|
1832
|
+
passedFailed
|
|
1833
|
+
] }),
|
|
1834
|
+
scoreKeys.map((key) => {
|
|
1835
|
+
const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
|
|
1836
|
+
const aggregated = aggregateScoreItems(items);
|
|
1837
|
+
if (!aggregated)
|
|
1838
|
+
return null;
|
|
1839
|
+
const def = getScoreById(aggregated.id);
|
|
1840
|
+
const label = def ? def.name ?? def.id : aggregated.id;
|
|
1841
|
+
const formatted = def?.format(aggregated.data, {
|
|
1842
|
+
isAggregated: true
|
|
1843
|
+
}) ?? "n/a";
|
|
1844
|
+
const numeric = toNumericScore(aggregated.data);
|
|
1845
|
+
return /* @__PURE__ */ jsxs(
|
|
1846
|
+
Text,
|
|
1847
|
+
{
|
|
1848
|
+
color: numeric !== void 0 ? scoreColor(numeric) : "gray",
|
|
1849
|
+
children: [
|
|
1850
|
+
" ",
|
|
1851
|
+
label,
|
|
1852
|
+
": ",
|
|
1853
|
+
formatted
|
|
1854
|
+
]
|
|
1855
|
+
},
|
|
1856
|
+
key
|
|
1857
|
+
);
|
|
1858
|
+
})
|
|
1809
1859
|
] }, id);
|
|
1810
1860
|
})
|
|
1811
1861
|
] }),
|
|
@@ -1933,14 +1983,36 @@ function scoreToColor(score) {
|
|
|
1933
1983
|
}
|
|
1934
1984
|
return ansi2.red;
|
|
1935
1985
|
}
|
|
1936
|
-
function
|
|
1937
|
-
|
|
1938
|
-
|
|
1986
|
+
function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
|
|
1987
|
+
const lines = [];
|
|
1988
|
+
const scoreKeys = [...scoreItemsByKey.keys()].filter(
|
|
1989
|
+
(k) => k.startsWith(`${evaluatorId}:`)
|
|
1990
|
+
);
|
|
1991
|
+
if (scoreKeys.length === 0) {
|
|
1992
|
+
lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
|
|
1993
|
+
return lines;
|
|
1994
|
+
}
|
|
1995
|
+
const passedFailed = aggregate != null ? ` passed=${aggregate.passed} failed=${aggregate.failed}` : "";
|
|
1996
|
+
const scoreLines = [];
|
|
1997
|
+
for (const key of scoreKeys) {
|
|
1998
|
+
const items = scoreItemsByKey.get(key) ?? [];
|
|
1999
|
+
const agg = aggregateScoreItems(items);
|
|
2000
|
+
if (!agg)
|
|
2001
|
+
continue;
|
|
2002
|
+
const def = getScoreById(agg.id);
|
|
2003
|
+
const label = def ? def.name ?? def.id : agg.id;
|
|
2004
|
+
const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
|
|
2005
|
+
const numeric = toNumericScore(agg.data);
|
|
2006
|
+
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
2007
|
+
scoreLines.push(` ${label}: ${colored}`);
|
|
2008
|
+
}
|
|
2009
|
+
if (scoreLines.length > 0) {
|
|
2010
|
+
lines.push(`- ${evaluatorName.padEnd(28)}${passedFailed}`);
|
|
2011
|
+
lines.push(...scoreLines);
|
|
2012
|
+
} else {
|
|
2013
|
+
lines.push(`- ${evaluatorName.padEnd(28)} no numeric scores${passedFailed}`);
|
|
1939
2014
|
}
|
|
1940
|
-
|
|
1941
|
-
const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
|
|
1942
|
-
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1943
|
-
return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
2015
|
+
return lines;
|
|
1944
2016
|
}
|
|
1945
2017
|
function createBar2(value, max = 100, width = 20) {
|
|
1946
2018
|
const safe = Math.max(0, Math.min(max, value));
|
|
@@ -1992,46 +2064,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
|
|
|
1992
2064
|
}
|
|
1993
2065
|
function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
1994
2066
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
1995
|
-
const
|
|
1996
|
-
for (const item of scores) {
|
|
1997
|
-
const def = getScoreById(item.id);
|
|
1998
|
-
if (!def) {
|
|
1999
|
-
const numeric = toNumericScore(item.data);
|
|
2000
|
-
scoreParts.push(
|
|
2001
|
-
numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
|
|
2002
|
-
);
|
|
2003
|
-
continue;
|
|
2004
|
-
}
|
|
2005
|
-
const formatted = def.format(item.data, options);
|
|
2006
|
-
switch (def.displayStrategy) {
|
|
2007
|
-
case "bar": {
|
|
2008
|
-
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2009
|
-
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
2010
|
-
scoreParts.push(
|
|
2011
|
-
`${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
|
|
2012
|
-
);
|
|
2013
|
-
} else {
|
|
2014
|
-
scoreParts.push(formatted);
|
|
2015
|
-
}
|
|
2016
|
-
break;
|
|
2017
|
-
}
|
|
2018
|
-
case "number":
|
|
2019
|
-
scoreParts.push(formatted);
|
|
2020
|
-
break;
|
|
2021
|
-
case "passFail":
|
|
2022
|
-
scoreParts.push(
|
|
2023
|
-
colorize(
|
|
2024
|
-
formatted,
|
|
2025
|
-
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
2026
|
-
)
|
|
2027
|
-
);
|
|
2028
|
-
break;
|
|
2029
|
-
}
|
|
2030
|
-
}
|
|
2031
|
-
const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
|
|
2032
|
-
let line = ` ${name}: ${passLabel} ${scoreStr}`;
|
|
2067
|
+
const metricParts = [];
|
|
2033
2068
|
if (metrics && metrics.length > 0) {
|
|
2034
|
-
const metricParts = [];
|
|
2035
2069
|
for (const { id, data } of metrics) {
|
|
2036
2070
|
const def = getMetricById(id);
|
|
2037
2071
|
if (def) {
|
|
@@ -2041,11 +2075,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2041
2075
|
);
|
|
2042
2076
|
}
|
|
2043
2077
|
}
|
|
2044
|
-
|
|
2045
|
-
|
|
2078
|
+
}
|
|
2079
|
+
const scoreLines = [];
|
|
2080
|
+
for (const item of scores) {
|
|
2081
|
+
const def = getScoreById(item.id);
|
|
2082
|
+
const scoreLabel = def ? def.name ?? def.id : item.id;
|
|
2083
|
+
let formatted;
|
|
2084
|
+
if (!def) {
|
|
2085
|
+
const numeric = toNumericScore(item.data);
|
|
2086
|
+
formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
|
|
2087
|
+
} else {
|
|
2088
|
+
const raw = def.format(item.data, options);
|
|
2089
|
+
switch (def.displayStrategy) {
|
|
2090
|
+
case "bar": {
|
|
2091
|
+
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2092
|
+
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
2093
|
+
formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
|
|
2094
|
+
} else {
|
|
2095
|
+
formatted = raw;
|
|
2096
|
+
}
|
|
2097
|
+
break;
|
|
2098
|
+
}
|
|
2099
|
+
case "number":
|
|
2100
|
+
formatted = raw;
|
|
2101
|
+
break;
|
|
2102
|
+
case "passFail":
|
|
2103
|
+
formatted = colorize(
|
|
2104
|
+
raw,
|
|
2105
|
+
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
2106
|
+
);
|
|
2107
|
+
break;
|
|
2108
|
+
}
|
|
2046
2109
|
}
|
|
2110
|
+
scoreLines.push(` ${scoreLabel}: ${formatted}`);
|
|
2047
2111
|
}
|
|
2048
|
-
|
|
2112
|
+
const lines = [];
|
|
2113
|
+
const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
|
|
2114
|
+
lines.push(` ${name}: ${passLabel}${metricStr}`);
|
|
2115
|
+
if (scoreLines.length > 0) {
|
|
2116
|
+
lines.push(...scoreLines);
|
|
2117
|
+
} else {
|
|
2118
|
+
lines.push(` n/a`);
|
|
2119
|
+
}
|
|
2120
|
+
return lines;
|
|
2049
2121
|
}
|
|
2050
2122
|
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
2051
2123
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
@@ -2068,6 +2140,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2068
2140
|
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
2069
2141
|
);
|
|
2070
2142
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2143
|
+
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
2071
2144
|
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
2072
2145
|
let overallScoreTotal = 0;
|
|
2073
2146
|
let overallScoreSumSq = 0;
|
|
@@ -2143,6 +2216,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2143
2216
|
overallScoreSumSq += numeric * numeric;
|
|
2144
2217
|
overallScoreCount += 1;
|
|
2145
2218
|
}
|
|
2219
|
+
for (const s of item.scores) {
|
|
2220
|
+
const key = `${item.evaluatorId}:${s.id}`;
|
|
2221
|
+
const list = scoreItemsByEvaluatorScore.get(key) ?? [];
|
|
2222
|
+
list.push(s);
|
|
2223
|
+
scoreItemsByEvaluatorScore.set(key, list);
|
|
2224
|
+
}
|
|
2146
2225
|
}
|
|
2147
2226
|
const isSameTestCase = lastPrintedTestCaseId === testCaseId;
|
|
2148
2227
|
const isLastRerun = event.rerunIndex >= event.rerunTotal;
|
|
@@ -2166,7 +2245,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2166
2245
|
for (const item of aggregatedScores) {
|
|
2167
2246
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2168
2247
|
lines.push(
|
|
2169
|
-
formatEvaluatorScoreLine(
|
|
2248
|
+
...formatEvaluatorScoreLine(
|
|
2170
2249
|
name,
|
|
2171
2250
|
item.scores,
|
|
2172
2251
|
item.passed,
|
|
@@ -2186,6 +2265,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2186
2265
|
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
2187
2266
|
lines.push(colored);
|
|
2188
2267
|
}
|
|
2268
|
+
} else if (log.type === "log") {
|
|
2269
|
+
for (const line of getLogLines(log)) {
|
|
2270
|
+
lines.push(` ${line}`);
|
|
2271
|
+
}
|
|
2189
2272
|
}
|
|
2190
2273
|
}
|
|
2191
2274
|
}
|
|
@@ -2263,9 +2346,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2263
2346
|
}
|
|
2264
2347
|
console.log(colorize("- evaluator averages:", ansi2.magenta));
|
|
2265
2348
|
for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
|
|
2266
|
-
|
|
2267
|
-
|
|
2349
|
+
const evaluatorLines = getEvaluatorSummaryLines(
|
|
2350
|
+
evaluatorId,
|
|
2351
|
+
evaluatorName,
|
|
2352
|
+
aggregates.get(evaluatorId),
|
|
2353
|
+
scoreItemsByEvaluatorScore
|
|
2268
2354
|
);
|
|
2355
|
+
for (const line of evaluatorLines) {
|
|
2356
|
+
console.log(line);
|
|
2357
|
+
}
|
|
2269
2358
|
}
|
|
2270
2359
|
const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
|
|
2271
2360
|
if (testCaseSummaries.length > 0) {
|