@m4trix/evals 0.14.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +200 -111
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +200 -111
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +100 -44
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +100 -44
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +35 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +49 -4
- package/dist/index.js +34 -28
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli-simple.cjs
CHANGED
|
@@ -8,7 +8,7 @@ var path = require('path');
|
|
|
8
8
|
var jitiModule = require('jiti');
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
|
-
var
|
|
11
|
+
var jsonDiff = require('json-diff');
|
|
12
12
|
var React2 = require('react');
|
|
13
13
|
var ink = require('ink');
|
|
14
14
|
var jsxRuntime = require('react/jsx-runtime');
|
|
@@ -286,45 +286,46 @@ async function collectTestCasesFromFiles(config) {
|
|
|
286
286
|
);
|
|
287
287
|
return found.flat();
|
|
288
288
|
}
|
|
289
|
-
function
|
|
289
|
+
function createDiffString(expected, actual, diffOptions) {
|
|
290
|
+
const opts = { ...diffOptions, color: false };
|
|
291
|
+
const result = jsonDiff.diffString(expected, actual, opts);
|
|
292
|
+
return typeof result === "string" ? result : "";
|
|
293
|
+
}
|
|
294
|
+
function formatLogMessage(msg) {
|
|
295
|
+
if (typeof msg === "string")
|
|
296
|
+
return msg;
|
|
290
297
|
try {
|
|
291
|
-
|
|
298
|
+
if (msg !== null && typeof msg === "object") {
|
|
299
|
+
return JSON.stringify(msg, null, 2);
|
|
300
|
+
}
|
|
301
|
+
return String(msg);
|
|
292
302
|
} catch {
|
|
293
|
-
return String(
|
|
303
|
+
return String(msg);
|
|
294
304
|
}
|
|
295
305
|
}
|
|
296
|
-
function
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
partLines.pop();
|
|
303
|
-
}
|
|
304
|
-
for (const line of partLines) {
|
|
305
|
-
lines.push(`${prefix} ${line}`);
|
|
306
|
-
}
|
|
307
|
-
}
|
|
308
|
-
return lines.join("\n");
|
|
306
|
+
function createLogEntry(message, options) {
|
|
307
|
+
return {
|
|
308
|
+
type: "log",
|
|
309
|
+
label: options?.label,
|
|
310
|
+
message: formatLogMessage(message)
|
|
311
|
+
};
|
|
309
312
|
}
|
|
310
|
-
function
|
|
311
|
-
|
|
312
|
-
const actualStr = toJsonLines(actual);
|
|
313
|
-
const changes = diff.diffLines(expectedStr, actualStr);
|
|
314
|
-
return formatDiffString(changes);
|
|
313
|
+
function getLogLines(entry) {
|
|
314
|
+
return entry.message.split("\n");
|
|
315
315
|
}
|
|
316
316
|
function createDiffLogEntry(expected, actual, options) {
|
|
317
|
-
const
|
|
317
|
+
const { label, ...diffOpts } = options ?? {};
|
|
318
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
318
319
|
return {
|
|
319
320
|
type: "diff",
|
|
320
|
-
label
|
|
321
|
+
label,
|
|
321
322
|
expected,
|
|
322
323
|
actual,
|
|
323
324
|
diff: diff || "(no differences)"
|
|
324
325
|
};
|
|
325
326
|
}
|
|
326
327
|
function getDiffLines(entry) {
|
|
327
|
-
const raw =
|
|
328
|
+
const raw = entry.diff || "(no differences)";
|
|
328
329
|
return raw.split("\n").map((line) => {
|
|
329
330
|
const trimmed = line.trimStart();
|
|
330
331
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -600,6 +601,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
600
601
|
const logDiff = (expected, actual, options) => {
|
|
601
602
|
logs.push(createDiffLogEntry(expected, actual, options));
|
|
602
603
|
};
|
|
604
|
+
const log = (message, options) => {
|
|
605
|
+
logs.push(createLogEntry(message, options));
|
|
606
|
+
};
|
|
603
607
|
const ctx = yield* effect.Effect.promise(
|
|
604
608
|
() => Promise.resolve(evaluator.resolveContext())
|
|
605
609
|
);
|
|
@@ -609,7 +613,8 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
609
613
|
input: testCaseItem.testCase.getInput(),
|
|
610
614
|
ctx,
|
|
611
615
|
output,
|
|
612
|
-
logDiff
|
|
616
|
+
logDiff,
|
|
617
|
+
log
|
|
613
618
|
})
|
|
614
619
|
)
|
|
615
620
|
);
|
|
@@ -1544,6 +1549,7 @@ function RunView({
|
|
|
1544
1549
|
);
|
|
1545
1550
|
setEvaluatorNameById(nameById);
|
|
1546
1551
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1552
|
+
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
1547
1553
|
let overallScoreTotal = 0;
|
|
1548
1554
|
let overallScoreSumSq = 0;
|
|
1549
1555
|
let overallScoreCount = 0;
|
|
@@ -1573,6 +1579,12 @@ function RunView({
|
|
|
1573
1579
|
overallScoreSumSq += numeric * numeric;
|
|
1574
1580
|
overallScoreCount += 1;
|
|
1575
1581
|
}
|
|
1582
|
+
for (const s of item.scores) {
|
|
1583
|
+
const key = `${item.evaluatorId}:${s.id}`;
|
|
1584
|
+
const list = scoreItemsByEvaluatorScore.get(key) ?? [];
|
|
1585
|
+
list.push(s);
|
|
1586
|
+
scoreItemsByEvaluatorScore.set(key, list);
|
|
1587
|
+
}
|
|
1576
1588
|
}
|
|
1577
1589
|
setTestCases((prev) => {
|
|
1578
1590
|
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
@@ -1643,6 +1655,7 @@ function RunView({
|
|
|
1643
1655
|
overallScoreSumSq,
|
|
1644
1656
|
overallScoreCount,
|
|
1645
1657
|
aggregates: new Map(aggregates),
|
|
1658
|
+
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1646
1659
|
artifactPath: finalEvent.artifactPath
|
|
1647
1660
|
});
|
|
1648
1661
|
setPhase("completed");
|
|
@@ -1725,36 +1738,45 @@ function RunView({
|
|
|
1725
1738
|
":",
|
|
1726
1739
|
" ",
|
|
1727
1740
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1728
|
-
|
|
1729
|
-
|
|
1741
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
1742
|
+
" ",
|
|
1743
|
+
item.metrics.map((m) => {
|
|
1744
|
+
const def = getMetricById(m.id);
|
|
1745
|
+
if (!def)
|
|
1746
|
+
return null;
|
|
1747
|
+
const formatted = def.format(m.data, {
|
|
1748
|
+
isAggregated: tc.isAggregated
|
|
1749
|
+
});
|
|
1750
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1751
|
+
"[",
|
|
1752
|
+
def.name ? `${def.name}: ` : "",
|
|
1753
|
+
formatted,
|
|
1754
|
+
"]",
|
|
1755
|
+
" "
|
|
1756
|
+
] }, m.id);
|
|
1757
|
+
})
|
|
1758
|
+
] }) : null
|
|
1759
|
+
] }),
|
|
1760
|
+
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
1761
|
+
const def = getScoreById(s.id);
|
|
1762
|
+
const scoreLabel = def ? def.name ?? def.id : s.id;
|
|
1763
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1730
1764
|
ink.Text,
|
|
1731
1765
|
{
|
|
1732
1766
|
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
1733
1767
|
children: [
|
|
1768
|
+
" ",
|
|
1769
|
+
scoreLabel,
|
|
1770
|
+
":",
|
|
1771
|
+
" ",
|
|
1734
1772
|
formatScorePart(s, scoreColor, {
|
|
1735
1773
|
isAggregated: tc.isAggregated
|
|
1736
|
-
})
|
|
1737
|
-
" "
|
|
1774
|
+
})
|
|
1738
1775
|
]
|
|
1739
1776
|
},
|
|
1740
|
-
s.id
|
|
1741
|
-
)
|
|
1742
|
-
|
|
1743
|
-
const def = getMetricById(m.id);
|
|
1744
|
-
if (!def)
|
|
1745
|
-
return null;
|
|
1746
|
-
const formatted = def.format(m.data, {
|
|
1747
|
-
isAggregated: tc.isAggregated
|
|
1748
|
-
});
|
|
1749
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1750
|
-
"[",
|
|
1751
|
-
def.name ? `${def.name}: ` : "",
|
|
1752
|
-
formatted,
|
|
1753
|
-
"]",
|
|
1754
|
-
" "
|
|
1755
|
-
] }, m.id);
|
|
1756
|
-
})
|
|
1757
|
-
] }),
|
|
1777
|
+
`${item.evaluatorId}-${s.id}-${idx}`
|
|
1778
|
+
);
|
|
1779
|
+
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
|
|
1758
1780
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1759
1781
|
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1760
1782
|
({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -1765,7 +1787,7 @@ function RunView({
|
|
|
1765
1787
|
},
|
|
1766
1788
|
lineIdx
|
|
1767
1789
|
)
|
|
1768
|
-
) }, logIdx) : null
|
|
1790
|
+
) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
|
|
1769
1791
|
) })
|
|
1770
1792
|
]
|
|
1771
1793
|
},
|
|
@@ -1812,26 +1834,54 @@ function RunView({
|
|
|
1812
1834
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
|
|
1813
1835
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
1814
1836
|
const agg = summary.aggregates.get(id);
|
|
1815
|
-
|
|
1837
|
+
const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
|
|
1838
|
+
(k) => k.startsWith(`${id}:`)
|
|
1839
|
+
);
|
|
1840
|
+
if (scoreKeys.length === 0) {
|
|
1816
1841
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1817
1842
|
"- ",
|
|
1818
1843
|
name.padEnd(28),
|
|
1819
|
-
" no
|
|
1844
|
+
" no scores"
|
|
1820
1845
|
] }, id);
|
|
1821
1846
|
}
|
|
1822
|
-
const
|
|
1823
|
-
const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
|
|
1824
|
-
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1825
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1826
|
-
"- ",
|
|
1827
|
-
name.padEnd(28),
|
|
1828
|
-
" avg=",
|
|
1829
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: meanStr }),
|
|
1847
|
+
const passedFailed = agg != null ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1830
1848
|
" ",
|
|
1831
1849
|
"passed=",
|
|
1832
1850
|
agg.passed,
|
|
1833
1851
|
" failed=",
|
|
1834
1852
|
agg.failed
|
|
1853
|
+
] }) : null;
|
|
1854
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
1855
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1856
|
+
"- ",
|
|
1857
|
+
name.padEnd(28),
|
|
1858
|
+
passedFailed
|
|
1859
|
+
] }),
|
|
1860
|
+
scoreKeys.map((key) => {
|
|
1861
|
+
const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
|
|
1862
|
+
const aggregated = aggregateScoreItems(items);
|
|
1863
|
+
if (!aggregated)
|
|
1864
|
+
return null;
|
|
1865
|
+
const def = getScoreById(aggregated.id);
|
|
1866
|
+
const label = def ? def.name ?? def.id : aggregated.id;
|
|
1867
|
+
const formatted = def?.format(aggregated.data, {
|
|
1868
|
+
isAggregated: true
|
|
1869
|
+
}) ?? "n/a";
|
|
1870
|
+
const numeric = toNumericScore(aggregated.data);
|
|
1871
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1872
|
+
ink.Text,
|
|
1873
|
+
{
|
|
1874
|
+
color: numeric !== void 0 ? scoreColor(numeric) : "gray",
|
|
1875
|
+
children: [
|
|
1876
|
+
" ",
|
|
1877
|
+
label,
|
|
1878
|
+
": ",
|
|
1879
|
+
formatted
|
|
1880
|
+
]
|
|
1881
|
+
},
|
|
1882
|
+
key
|
|
1883
|
+
);
|
|
1884
|
+
})
|
|
1835
1885
|
] }, id);
|
|
1836
1886
|
})
|
|
1837
1887
|
] }),
|
|
@@ -1959,14 +2009,36 @@ function scoreToColor(score) {
|
|
|
1959
2009
|
}
|
|
1960
2010
|
return ansi2.red;
|
|
1961
2011
|
}
|
|
1962
|
-
function
|
|
1963
|
-
|
|
1964
|
-
|
|
2012
|
+
function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
|
|
2013
|
+
const lines = [];
|
|
2014
|
+
const scoreKeys = [...scoreItemsByKey.keys()].filter(
|
|
2015
|
+
(k) => k.startsWith(`${evaluatorId}:`)
|
|
2016
|
+
);
|
|
2017
|
+
if (scoreKeys.length === 0) {
|
|
2018
|
+
lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
|
|
2019
|
+
return lines;
|
|
2020
|
+
}
|
|
2021
|
+
const passedFailed = aggregate != null ? ` passed=${aggregate.passed} failed=${aggregate.failed}` : "";
|
|
2022
|
+
const scoreLines = [];
|
|
2023
|
+
for (const key of scoreKeys) {
|
|
2024
|
+
const items = scoreItemsByKey.get(key) ?? [];
|
|
2025
|
+
const agg = aggregateScoreItems(items);
|
|
2026
|
+
if (!agg)
|
|
2027
|
+
continue;
|
|
2028
|
+
const def = getScoreById(agg.id);
|
|
2029
|
+
const label = def ? def.name ?? def.id : agg.id;
|
|
2030
|
+
const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
|
|
2031
|
+
const numeric = toNumericScore(agg.data);
|
|
2032
|
+
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
2033
|
+
scoreLines.push(` ${label}: ${colored}`);
|
|
2034
|
+
}
|
|
2035
|
+
if (scoreLines.length > 0) {
|
|
2036
|
+
lines.push(`- ${evaluatorName.padEnd(28)}${passedFailed}`);
|
|
2037
|
+
lines.push(...scoreLines);
|
|
2038
|
+
} else {
|
|
2039
|
+
lines.push(`- ${evaluatorName.padEnd(28)} no numeric scores${passedFailed}`);
|
|
1965
2040
|
}
|
|
1966
|
-
|
|
1967
|
-
const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
|
|
1968
|
-
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1969
|
-
return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
2041
|
+
return lines;
|
|
1970
2042
|
}
|
|
1971
2043
|
function createBar2(value, max = 100, width = 20) {
|
|
1972
2044
|
const safe = Math.max(0, Math.min(max, value));
|
|
@@ -2018,46 +2090,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
|
|
|
2018
2090
|
}
|
|
2019
2091
|
function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
2020
2092
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
2021
|
-
const
|
|
2022
|
-
for (const item of scores) {
|
|
2023
|
-
const def = getScoreById(item.id);
|
|
2024
|
-
if (!def) {
|
|
2025
|
-
const numeric = toNumericScore(item.data);
|
|
2026
|
-
scoreParts.push(
|
|
2027
|
-
numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
|
|
2028
|
-
);
|
|
2029
|
-
continue;
|
|
2030
|
-
}
|
|
2031
|
-
const formatted = def.format(item.data, options);
|
|
2032
|
-
switch (def.displayStrategy) {
|
|
2033
|
-
case "bar": {
|
|
2034
|
-
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2035
|
-
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
2036
|
-
scoreParts.push(
|
|
2037
|
-
`${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
|
|
2038
|
-
);
|
|
2039
|
-
} else {
|
|
2040
|
-
scoreParts.push(formatted);
|
|
2041
|
-
}
|
|
2042
|
-
break;
|
|
2043
|
-
}
|
|
2044
|
-
case "number":
|
|
2045
|
-
scoreParts.push(formatted);
|
|
2046
|
-
break;
|
|
2047
|
-
case "passFail":
|
|
2048
|
-
scoreParts.push(
|
|
2049
|
-
colorize(
|
|
2050
|
-
formatted,
|
|
2051
|
-
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
2052
|
-
)
|
|
2053
|
-
);
|
|
2054
|
-
break;
|
|
2055
|
-
}
|
|
2056
|
-
}
|
|
2057
|
-
const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
|
|
2058
|
-
let line = ` ${name}: ${passLabel} ${scoreStr}`;
|
|
2093
|
+
const metricParts = [];
|
|
2059
2094
|
if (metrics && metrics.length > 0) {
|
|
2060
|
-
const metricParts = [];
|
|
2061
2095
|
for (const { id, data } of metrics) {
|
|
2062
2096
|
const def = getMetricById(id);
|
|
2063
2097
|
if (def) {
|
|
@@ -2067,11 +2101,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2067
2101
|
);
|
|
2068
2102
|
}
|
|
2069
2103
|
}
|
|
2070
|
-
|
|
2071
|
-
|
|
2104
|
+
}
|
|
2105
|
+
const scoreLines = [];
|
|
2106
|
+
for (const item of scores) {
|
|
2107
|
+
const def = getScoreById(item.id);
|
|
2108
|
+
const scoreLabel = def ? def.name ?? def.id : item.id;
|
|
2109
|
+
let formatted;
|
|
2110
|
+
if (!def) {
|
|
2111
|
+
const numeric = toNumericScore(item.data);
|
|
2112
|
+
formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
|
|
2113
|
+
} else {
|
|
2114
|
+
const raw = def.format(item.data, options);
|
|
2115
|
+
switch (def.displayStrategy) {
|
|
2116
|
+
case "bar": {
|
|
2117
|
+
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2118
|
+
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
2119
|
+
formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
|
|
2120
|
+
} else {
|
|
2121
|
+
formatted = raw;
|
|
2122
|
+
}
|
|
2123
|
+
break;
|
|
2124
|
+
}
|
|
2125
|
+
case "number":
|
|
2126
|
+
formatted = raw;
|
|
2127
|
+
break;
|
|
2128
|
+
case "passFail":
|
|
2129
|
+
formatted = colorize(
|
|
2130
|
+
raw,
|
|
2131
|
+
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
2132
|
+
);
|
|
2133
|
+
break;
|
|
2134
|
+
}
|
|
2072
2135
|
}
|
|
2136
|
+
scoreLines.push(` ${scoreLabel}: ${formatted}`);
|
|
2073
2137
|
}
|
|
2074
|
-
|
|
2138
|
+
const lines = [];
|
|
2139
|
+
const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
|
|
2140
|
+
lines.push(` ${name}: ${passLabel}${metricStr}`);
|
|
2141
|
+
if (scoreLines.length > 0) {
|
|
2142
|
+
lines.push(...scoreLines);
|
|
2143
|
+
} else {
|
|
2144
|
+
lines.push(` n/a`);
|
|
2145
|
+
}
|
|
2146
|
+
return lines;
|
|
2075
2147
|
}
|
|
2076
2148
|
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
2077
2149
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
@@ -2094,6 +2166,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2094
2166
|
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
2095
2167
|
);
|
|
2096
2168
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2169
|
+
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
2097
2170
|
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
2098
2171
|
let overallScoreTotal = 0;
|
|
2099
2172
|
let overallScoreSumSq = 0;
|
|
@@ -2169,6 +2242,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2169
2242
|
overallScoreSumSq += numeric * numeric;
|
|
2170
2243
|
overallScoreCount += 1;
|
|
2171
2244
|
}
|
|
2245
|
+
for (const s of item.scores) {
|
|
2246
|
+
const key = `${item.evaluatorId}:${s.id}`;
|
|
2247
|
+
const list = scoreItemsByEvaluatorScore.get(key) ?? [];
|
|
2248
|
+
list.push(s);
|
|
2249
|
+
scoreItemsByEvaluatorScore.set(key, list);
|
|
2250
|
+
}
|
|
2172
2251
|
}
|
|
2173
2252
|
const isSameTestCase = lastPrintedTestCaseId === testCaseId;
|
|
2174
2253
|
const isLastRerun = event.rerunIndex >= event.rerunTotal;
|
|
@@ -2192,7 +2271,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2192
2271
|
for (const item of aggregatedScores) {
|
|
2193
2272
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2194
2273
|
lines.push(
|
|
2195
|
-
formatEvaluatorScoreLine(
|
|
2274
|
+
...formatEvaluatorScoreLine(
|
|
2196
2275
|
name,
|
|
2197
2276
|
item.scores,
|
|
2198
2277
|
item.passed,
|
|
@@ -2212,6 +2291,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2212
2291
|
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
2213
2292
|
lines.push(colored);
|
|
2214
2293
|
}
|
|
2294
|
+
} else if (log.type === "log") {
|
|
2295
|
+
for (const line of getLogLines(log)) {
|
|
2296
|
+
lines.push(` ${line}`);
|
|
2297
|
+
}
|
|
2215
2298
|
}
|
|
2216
2299
|
}
|
|
2217
2300
|
}
|
|
@@ -2289,9 +2372,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2289
2372
|
}
|
|
2290
2373
|
console.log(colorize("- evaluator averages:", ansi2.magenta));
|
|
2291
2374
|
for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
|
|
2292
|
-
|
|
2293
|
-
|
|
2375
|
+
const evaluatorLines = getEvaluatorSummaryLines(
|
|
2376
|
+
evaluatorId,
|
|
2377
|
+
evaluatorName,
|
|
2378
|
+
aggregates.get(evaluatorId),
|
|
2379
|
+
scoreItemsByEvaluatorScore
|
|
2294
2380
|
);
|
|
2381
|
+
for (const line of evaluatorLines) {
|
|
2382
|
+
console.log(line);
|
|
2383
|
+
}
|
|
2295
2384
|
}
|
|
2296
2385
|
const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
|
|
2297
2386
|
if (testCaseSummaries.length > 0) {
|