@m4trix/evals 0.13.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +287 -107
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +287 -107
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +643 -398
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +634 -389
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +79 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +5 -0
- package/dist/index.js +79 -11
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli-simple.cjs
CHANGED
|
@@ -8,7 +8,7 @@ var path = require('path');
|
|
|
8
8
|
var jitiModule = require('jiti');
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
|
-
var
|
|
11
|
+
var diff = require('diff');
|
|
12
12
|
var React2 = require('react');
|
|
13
13
|
var ink = require('ink');
|
|
14
14
|
var jsxRuntime = require('react/jsx-runtime');
|
|
@@ -286,8 +286,35 @@ async function collectTestCasesFromFiles(config) {
|
|
|
286
286
|
);
|
|
287
287
|
return found.flat();
|
|
288
288
|
}
|
|
289
|
+
/**
 * Serialises a value to pretty-printed (2-space indented) JSON text.
 *
 * Always returns a string. `JSON.stringify` has two failure modes:
 *  - it throws (e.g. circular structures, BigInt), and
 *  - it silently returns `undefined` (for `undefined`, functions and symbols).
 * In both cases the result falls back to `String(value)`.
 *
 * @param {unknown} value - Value to serialise.
 * @returns {string} Indented JSON, or `String(value)` when JSON is unavailable.
 */
function toJsonLines(value) {
  let json;
  try {
    json = JSON.stringify(value, null, 2);
  } catch {
    // Not serialisable (cycle, BigInt, throwing toJSON, ...): use the fallback below.
    json = undefined;
  }
  // A non-string here means stringify either threw or produced `undefined`.
  return typeof json === "string" ? json : String(value);
}
|
|
296
|
+
/**
 * Renders a list of diff parts as prefixed text lines.
 *
 * Each part's `value` is split on "\n"; a single trailing empty segment
 * (produced by a terminating newline) is ignored. Every remaining line is
 * emitted as "<marker> <line>", where the marker is "+" for added parts,
 * "-" for removed parts and a space otherwise.
 *
 * @param {Iterable<{value: string, added?: boolean, removed?: boolean}>} changes
 * @returns {string} The rendered lines joined with "\n".
 */
function formatDiffString(changes) {
  const output = [];
  for (const change of changes) {
    let marker = " ";
    if (change.added) {
      marker = "+";
    } else if (change.removed) {
      marker = "-";
    }
    const segments = change.value.split("\n");
    // Skip the trailing empty segment left behind by a final newline.
    const endsWithEmpty = segments[segments.length - 1] === "";
    const usable = endsWithEmpty ? segments.length - 1 : segments.length;
    for (let i = 0; i < usable; i += 1) {
      output.push(`${marker} ${segments[i]}`);
    }
  }
  return output.join("\n");
}
|
|
310
|
+
/**
 * Builds a line-based textual diff between two values.
 *
 * Both values are serialised with `toJsonLines`, compared with
 * `diff.diffLines`, and the resulting parts rendered by `formatDiffString`.
 *
 * @param {unknown} expected - Reference value (first argument to the line diff).
 * @param {unknown} actual - Value compared against the reference.
 * @returns {string} Rendered diff text.
 */
function createDiffString(expected, actual) {
  return formatDiffString(
    diff.diffLines(toJsonLines(expected), toJsonLines(actual))
  );
}
|
|
289
316
|
function createDiffLogEntry(expected, actual, options) {
|
|
290
|
-
const diff =
|
|
317
|
+
const diff = createDiffString(expected, actual);
|
|
291
318
|
return {
|
|
292
319
|
type: "diff",
|
|
293
320
|
label: options?.label,
|
|
@@ -297,7 +324,7 @@ function createDiffLogEntry(expected, actual, options) {
|
|
|
297
324
|
};
|
|
298
325
|
}
|
|
299
326
|
function getDiffLines(entry) {
|
|
300
|
-
const raw =
|
|
327
|
+
const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
|
|
301
328
|
return raw.split("\n").map((line) => {
|
|
302
329
|
const trimmed = line.trimStart();
|
|
303
330
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -357,15 +384,28 @@ function getScoreById(id) {
|
|
|
357
384
|
}
|
|
358
385
|
|
|
359
386
|
// src/evals/aggregators.ts
|
|
360
|
-
function
|
|
387
|
+
/**
 * Aggregates score items into their mean and sample standard deviation.
 *
 * The deviation is computed in a second pass over the data (sum of squared
 * deviations from the mean) rather than from `sumSq - n * mean^2`; the latter
 * suffers catastrophic cancellation when values are large relative to their
 * spread and can yield a wildly wrong (or zero) result.
 *
 * @param {Array<{value: number}>} values - Items whose `value` is averaged.
 * @returns {{value: number, stdDev?: number, count: number}}
 *   `{ value: 0, count: 0 }` for empty input; otherwise the mean, the item
 *   count and the sample (n - 1) standard deviation, which is `undefined`
 *   when fewer than two items are supplied.
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  const sum = values.reduce((acc, item) => acc + item.value, 0);
  const mean = sum / count;
  let stdDev;
  if (count >= 2) {
    const sumSquaredDeviations = values.reduce((acc, item) => {
      const deviation = item.value - mean;
      return acc + deviation * deviation;
    }, 0);
    const variance = sumSquaredDeviations / (count - 1);
    // Non-positive or NaN variance is reported as 0, matching the prior contract.
    stdDev = variance > 0 ? Math.sqrt(variance) : 0;
  }
  return { value: mean, stdDev, count };
}
|
|
367
401
|
/**
 * Aggregates pass/fail items into an overall verdict plus counts.
 *
 * @param {Array<{passed?: boolean}>} values - Items carrying a `passed` flag.
 * @returns {{passed: boolean, passedCount: number, totalCount: number}}
 *   `passed` is true only when there is at least one item and none of them
 *   has a falsy `passed`; `passedCount` is the number of truthy `passed`
 *   flags and `totalCount` is the number of items.
 */
function aggregateAll(values) {
  const totalCount = values.length;
  const passedCount = values.reduce(
    (tally, item) => (item.passed ? tally + 1 : tally),
    0
  );
  const anyFailed = values.some((item) => !item.passed);
  return {
    passed: totalCount > 0 && !anyFailed,
    passedCount,
    totalCount
  };
}
|
|
370
410
|
function aggregateTokenCountSum(values) {
|
|
371
411
|
const initial = {
|
|
@@ -419,14 +459,28 @@ Score.of({
|
|
|
419
459
|
id: "percent",
|
|
420
460
|
name: "Score",
|
|
421
461
|
displayStrategy: "bar",
|
|
422
|
-
format: (data, options) =>
|
|
423
|
-
|
|
462
|
+
format: (data, options) => {
|
|
463
|
+
if (options?.isAggregated) {
|
|
464
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
465
|
+
}
|
|
466
|
+
return data.value.toFixed(2);
|
|
467
|
+
},
|
|
468
|
+
aggregate: aggregateAverageWithVariance
|
|
424
469
|
});
|
|
425
470
|
// Declares the "binary" (pass/fail) score through Score.of.
// Formatting:
//  - single result:  "PASSED" / "NOT PASSED"
//  - aggregated:     "All: PASSED" / "Some: FAILED", followed by
//                    "(passedCount/totalCount)" when both counts are present
//                    and more than one item was aggregated.
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data, options) => {
    if (!options?.isAggregated) {
      return data.passed ? "PASSED" : "NOT PASSED";
    }
    const verdict = data.passed ? "All: PASSED" : "Some: FAILED";
    const hasCounts = data.passedCount != null && data.totalCount != null && data.totalCount > 1;
    return hasCounts ? `${verdict} (${data.passedCount}/${data.totalCount})` : verdict;
  },
  aggregate: aggregateAll
});
|
|
432
486
|
|
|
@@ -1365,6 +1419,13 @@ function Spinner({ label = "Running" }) {
|
|
|
1365
1419
|
label
|
|
1366
1420
|
] });
|
|
1367
1421
|
}
|
|
1422
|
+
/**
 * Sample (n - 1) standard deviation from running totals.
 *
 * @param {number} sum - Sum of the observations.
 * @param {number} sumSq - Sum of the squared observations.
 * @param {number} n - Number of observations.
 * @returns {number|undefined} `undefined` when `n < 2`; otherwise the square
 *   root of the sample variance, or 0 when that variance is not positive.
 */
function sampleStdDev(sum, sumSq, n) {
  if (n < 2) {
    return undefined;
  }
  const average = sum / n;
  const sampleVariance = (sumSq - n * average * average) / (n - 1);
  if (sampleVariance > 0) {
    return Math.sqrt(sampleVariance);
  }
  // Rounding can push the computed variance to zero or below; report 0.
  return 0;
}
|
|
1368
1429
|
function scoreColor(score) {
|
|
1369
1430
|
if (score >= 80)
|
|
1370
1431
|
return "green";
|
|
@@ -1483,7 +1544,9 @@ function RunView({
|
|
|
1483
1544
|
);
|
|
1484
1545
|
setEvaluatorNameById(nameById);
|
|
1485
1546
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1547
|
+
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
1486
1548
|
let overallScoreTotal = 0;
|
|
1549
|
+
let overallScoreSumSq = 0;
|
|
1487
1550
|
let overallScoreCount = 0;
|
|
1488
1551
|
const done = new Promise((resolve5) => {
|
|
1489
1552
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
@@ -1495,19 +1558,28 @@ function RunView({
|
|
|
1495
1558
|
if (numeric !== void 0) {
|
|
1496
1559
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
1497
1560
|
total: 0,
|
|
1561
|
+
sumSq: 0,
|
|
1498
1562
|
count: 0,
|
|
1499
1563
|
passed: 0,
|
|
1500
1564
|
failed: 0
|
|
1501
1565
|
};
|
|
1502
1566
|
aggregates.set(item.evaluatorId, {
|
|
1503
1567
|
total: current.total + numeric,
|
|
1568
|
+
sumSq: current.sumSq + numeric * numeric,
|
|
1504
1569
|
count: current.count + 1,
|
|
1505
1570
|
passed: current.passed + (item.passed ? 1 : 0),
|
|
1506
1571
|
failed: current.failed + (item.passed ? 0 : 1)
|
|
1507
1572
|
});
|
|
1508
1573
|
overallScoreTotal += numeric;
|
|
1574
|
+
overallScoreSumSq += numeric * numeric;
|
|
1509
1575
|
overallScoreCount += 1;
|
|
1510
1576
|
}
|
|
1577
|
+
for (const s of item.scores) {
|
|
1578
|
+
const key = `${item.evaluatorId}:${s.id}`;
|
|
1579
|
+
const list = scoreItemsByEvaluatorScore.get(key) ?? [];
|
|
1580
|
+
list.push(s);
|
|
1581
|
+
scoreItemsByEvaluatorScore.set(key, list);
|
|
1582
|
+
}
|
|
1511
1583
|
}
|
|
1512
1584
|
setTestCases((prev) => {
|
|
1513
1585
|
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
@@ -1575,8 +1647,10 @@ function RunView({
|
|
|
1575
1647
|
failedTestCases: finalEvent.failedTestCases,
|
|
1576
1648
|
totalTestCases: finalEvent.totalTestCases,
|
|
1577
1649
|
overallScoreTotal,
|
|
1650
|
+
overallScoreSumSq,
|
|
1578
1651
|
overallScoreCount,
|
|
1579
1652
|
aggregates: new Map(aggregates),
|
|
1653
|
+
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1580
1654
|
artifactPath: finalEvent.artifactPath
|
|
1581
1655
|
});
|
|
1582
1656
|
setPhase("completed");
|
|
@@ -1659,36 +1733,45 @@ function RunView({
|
|
|
1659
1733
|
":",
|
|
1660
1734
|
" ",
|
|
1661
1735
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1662
|
-
|
|
1663
|
-
|
|
1736
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
1737
|
+
" ",
|
|
1738
|
+
item.metrics.map((m) => {
|
|
1739
|
+
const def = getMetricById(m.id);
|
|
1740
|
+
if (!def)
|
|
1741
|
+
return null;
|
|
1742
|
+
const formatted = def.format(m.data, {
|
|
1743
|
+
isAggregated: tc.isAggregated
|
|
1744
|
+
});
|
|
1745
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1746
|
+
"[",
|
|
1747
|
+
def.name ? `${def.name}: ` : "",
|
|
1748
|
+
formatted,
|
|
1749
|
+
"]",
|
|
1750
|
+
" "
|
|
1751
|
+
] }, m.id);
|
|
1752
|
+
})
|
|
1753
|
+
] }) : null
|
|
1754
|
+
] }),
|
|
1755
|
+
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
1756
|
+
const def = getScoreById(s.id);
|
|
1757
|
+
const scoreLabel = def ? def.name ?? def.id : s.id;
|
|
1758
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1664
1759
|
ink.Text,
|
|
1665
1760
|
{
|
|
1666
1761
|
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
1667
1762
|
children: [
|
|
1763
|
+
" ",
|
|
1764
|
+
scoreLabel,
|
|
1765
|
+
":",
|
|
1766
|
+
" ",
|
|
1668
1767
|
formatScorePart(s, scoreColor, {
|
|
1669
1768
|
isAggregated: tc.isAggregated
|
|
1670
|
-
})
|
|
1671
|
-
" "
|
|
1769
|
+
})
|
|
1672
1770
|
]
|
|
1673
1771
|
},
|
|
1674
|
-
s.id
|
|
1675
|
-
)
|
|
1676
|
-
|
|
1677
|
-
const def = getMetricById(m.id);
|
|
1678
|
-
if (!def)
|
|
1679
|
-
return null;
|
|
1680
|
-
const formatted = def.format(m.data, {
|
|
1681
|
-
isAggregated: tc.isAggregated
|
|
1682
|
-
});
|
|
1683
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1684
|
-
"[",
|
|
1685
|
-
def.name ? `${def.name}: ` : "",
|
|
1686
|
-
formatted,
|
|
1687
|
-
"]",
|
|
1688
|
-
" "
|
|
1689
|
-
] }, m.id);
|
|
1690
|
-
})
|
|
1691
|
-
] }),
|
|
1772
|
+
`${item.evaluatorId}-${s.id}-${idx}`
|
|
1773
|
+
);
|
|
1774
|
+
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
|
|
1692
1775
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1693
1776
|
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1694
1777
|
({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -1732,45 +1815,85 @@ function RunView({
|
|
|
1732
1815
|
label: "overall avg",
|
|
1733
1816
|
value: summary.overallScoreTotal / summary.overallScoreCount,
|
|
1734
1817
|
barWidth: 20,
|
|
1735
|
-
format: (v) =>
|
|
1818
|
+
format: (v) => {
|
|
1819
|
+
const sd = sampleStdDev(
|
|
1820
|
+
summary.overallScoreTotal,
|
|
1821
|
+
summary.overallScoreSumSq,
|
|
1822
|
+
summary.overallScoreCount
|
|
1823
|
+
);
|
|
1824
|
+
return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
|
|
1825
|
+
}
|
|
1736
1826
|
}
|
|
1737
1827
|
) }),
|
|
1738
1828
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
|
|
1739
1829
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
|
|
1740
1830
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
1741
1831
|
const agg = summary.aggregates.get(id);
|
|
1742
|
-
|
|
1832
|
+
const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
|
|
1833
|
+
(k) => k.startsWith(`${id}:`)
|
|
1834
|
+
);
|
|
1835
|
+
if (scoreKeys.length === 0) {
|
|
1743
1836
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1744
1837
|
"- ",
|
|
1745
1838
|
name.padEnd(28),
|
|
1746
|
-
" no
|
|
1839
|
+
" no scores"
|
|
1747
1840
|
] }, id);
|
|
1748
1841
|
}
|
|
1749
|
-
const
|
|
1750
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1751
|
-
"- ",
|
|
1752
|
-
name.padEnd(28),
|
|
1753
|
-
" avg=",
|
|
1754
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
|
|
1842
|
+
const passedFailed = agg != null ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1755
1843
|
" ",
|
|
1756
1844
|
"passed=",
|
|
1757
1845
|
agg.passed,
|
|
1758
1846
|
" failed=",
|
|
1759
1847
|
agg.failed
|
|
1848
|
+
] }) : null;
|
|
1849
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
1850
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1851
|
+
"- ",
|
|
1852
|
+
name.padEnd(28),
|
|
1853
|
+
passedFailed
|
|
1854
|
+
] }),
|
|
1855
|
+
scoreKeys.map((key) => {
|
|
1856
|
+
const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
|
|
1857
|
+
const aggregated = aggregateScoreItems(items);
|
|
1858
|
+
if (!aggregated)
|
|
1859
|
+
return null;
|
|
1860
|
+
const def = getScoreById(aggregated.id);
|
|
1861
|
+
const label = def ? def.name ?? def.id : aggregated.id;
|
|
1862
|
+
const formatted = def?.format(aggregated.data, {
|
|
1863
|
+
isAggregated: true
|
|
1864
|
+
}) ?? "n/a";
|
|
1865
|
+
const numeric = toNumericScore(aggregated.data);
|
|
1866
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1867
|
+
ink.Text,
|
|
1868
|
+
{
|
|
1869
|
+
color: numeric !== void 0 ? scoreColor(numeric) : "gray",
|
|
1870
|
+
children: [
|
|
1871
|
+
" ",
|
|
1872
|
+
label,
|
|
1873
|
+
": ",
|
|
1874
|
+
formatted
|
|
1875
|
+
]
|
|
1876
|
+
},
|
|
1877
|
+
key
|
|
1878
|
+
);
|
|
1879
|
+
})
|
|
1760
1880
|
] }, id);
|
|
1761
1881
|
})
|
|
1762
1882
|
] }),
|
|
1763
1883
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
|
|
1764
1884
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
|
|
1765
1885
|
testCases.map((tc) => {
|
|
1766
|
-
const
|
|
1767
|
-
(
|
|
1886
|
+
const allScores = tc.events.flatMap(
|
|
1887
|
+
(ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
|
|
1768
1888
|
);
|
|
1769
|
-
const averageScore =
|
|
1889
|
+
const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
|
|
1890
|
+
const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
|
|
1891
|
+
const total = allScores.reduce((a, b) => a + b, 0);
|
|
1892
|
+
const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
|
|
1770
1893
|
const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
|
|
1771
1894
|
const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
|
|
1772
1895
|
isAggregated: true
|
|
1773
|
-
}) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
|
|
1896
|
+
}) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
|
|
1774
1897
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
|
|
1775
1898
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
|
|
1776
1899
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
@@ -1804,13 +1927,26 @@ function RunView({
|
|
|
1804
1927
|
}
|
|
1805
1928
|
|
|
1806
1929
|
// src/cli-simple/run.ts
|
|
1930
|
+
/**
 * Computes the sample standard deviation (n - 1 denominator) from a running
 * sum and running sum of squares.
 *
 * @param {number} sum - Total of all samples.
 * @param {number} sumSq - Total of all squared samples.
 * @param {number} n - Sample count.
 * @returns {number|undefined} `undefined` if `n < 2`; 0 if the computed
 *   variance is not greater than zero; otherwise its square root.
 */
function sampleStdDev2(sum, sumSq, n) {
  const tooFewSamples = n < 2;
  if (tooFewSamples) {
    return undefined;
  }
  const centre = sum / n;
  const numerator = sumSq - n * centre * centre;
  const spread = numerator / (n - 1);
  return spread > 0 ? Math.sqrt(spread) : 0;
}
|
|
1807
1937
|
function buildTestCaseSummaries(byId) {
|
|
1808
1938
|
const summaries = [];
|
|
1809
1939
|
for (const { name, events } of byId.values()) {
|
|
1810
1940
|
const passed = events.every((e) => e.passed);
|
|
1811
1941
|
const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
|
|
1812
1942
|
const isAggregated = events.length > 1;
|
|
1813
|
-
const
|
|
1943
|
+
const allScores = events.flatMap(
|
|
1944
|
+
(ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
|
|
1945
|
+
);
|
|
1946
|
+
const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
|
|
1947
|
+
const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
|
|
1948
|
+
const total = allScores.reduce((a, b) => a + b, 0);
|
|
1949
|
+
const stdDev = sampleStdDev2(total, sumSq, allScores.length);
|
|
1814
1950
|
let firstAggregatedScore;
|
|
1815
1951
|
for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
|
|
1816
1952
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
@@ -1826,21 +1962,18 @@ function buildTestCaseSummaries(byId) {
|
|
|
1826
1962
|
}
|
|
1827
1963
|
for (const items of scoreIdToItems.values()) {
|
|
1828
1964
|
const agg = aggregateScoreItems(items);
|
|
1829
|
-
if (agg) {
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
numericScores.push(n);
|
|
1833
|
-
if (firstAggregatedScore === void 0) {
|
|
1834
|
-
firstAggregatedScore = agg;
|
|
1835
|
-
}
|
|
1836
|
-
}
|
|
1965
|
+
if (agg && firstAggregatedScore === void 0) {
|
|
1966
|
+
firstAggregatedScore = agg;
|
|
1967
|
+
break;
|
|
1837
1968
|
}
|
|
1838
1969
|
}
|
|
1970
|
+
if (firstAggregatedScore !== void 0)
|
|
1971
|
+
break;
|
|
1839
1972
|
}
|
|
1840
|
-
const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
|
|
1841
1973
|
summaries.push({
|
|
1842
1974
|
name,
|
|
1843
1975
|
averageScore,
|
|
1976
|
+
stdDev: stdDev ?? void 0,
|
|
1844
1977
|
aggregatedScoreItem: firstAggregatedScore,
|
|
1845
1978
|
isAggregated,
|
|
1846
1979
|
durationMs,
|
|
@@ -1871,12 +2004,36 @@ function scoreToColor(score) {
|
|
|
1871
2004
|
}
|
|
1872
2005
|
return ansi2.red;
|
|
1873
2006
|
}
|
|
1874
|
-
function
|
|
1875
|
-
|
|
1876
|
-
|
|
2007
|
+
/**
 * Builds the plain-text summary lines for one evaluator.
 *
 * Score items are looked up in `scoreItemsByKey` under keys that start with
 * "<evaluatorId>:". Each matching key's items are combined with
 * `aggregateScoreItems` and formatted with the score definition returned by
 * `getScoreById` (falling back to "n/a" when no formatted text is available).
 *
 * @param {string} evaluatorId - Id used as the key prefix.
 * @param {string} evaluatorName - Display name, padded to 28 characters.
 * @param {{passed: number, failed: number}|null|undefined} aggregate -
 *   Optional pass/fail tallies appended to the heading line.
 * @param {Map<string, Array>} scoreItemsByKey - Score items keyed by
 *   "<evaluatorId>:<scoreId>".
 * @returns {string[]} A heading line, followed by one line per aggregated
 *   score; or a single "no scores" / "no numeric scores" line.
 */
function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
  const heading = `- ${evaluatorName.padEnd(28)}`;
  const keyPrefix = `${evaluatorId}:`;
  const ownKeys = [...scoreItemsByKey.keys()].filter((key) => key.startsWith(keyPrefix));
  if (ownKeys.length === 0) {
    return [`${heading} no scores`];
  }
  const counts = aggregate != null ? ` passed=${aggregate.passed} failed=${aggregate.failed}` : "";
  const detailLines = ownKeys.flatMap((key) => {
    const combined = aggregateScoreItems(scoreItemsByKey.get(key) ?? []);
    if (!combined) {
      // Nothing could be aggregated for this score id; contribute no line.
      return [];
    }
    const scoreDef = getScoreById(combined.id);
    const title = scoreDef ? scoreDef.name ?? scoreDef.id : combined.id;
    const text = scoreDef?.format(combined.data, { isAggregated: true }) ?? "n/a";
    const asNumber = toNumericScore(combined.data);
    // Colour is applied only when the aggregated data maps to a number.
    const shown = asNumber !== undefined ? colorize(text, scoreToColor(asNumber)) : text;
    return [` ${title}: ${shown}`];
  });
  if (detailLines.length === 0) {
    return [`${heading} no numeric scores${counts}`];
  }
  return [`${heading}${counts}`, ...detailLines];
}
|
|
1881
2038
|
function createBar2(value, max = 100, width = 20) {
|
|
1882
2039
|
const safe = Math.max(0, Math.min(max, value));
|
|
@@ -1928,46 +2085,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
|
|
|
1928
2085
|
}
|
|
1929
2086
|
function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
1930
2087
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
1931
|
-
const
|
|
1932
|
-
for (const item of scores) {
|
|
1933
|
-
const def = getScoreById(item.id);
|
|
1934
|
-
if (!def) {
|
|
1935
|
-
const numeric = toNumericScore(item.data);
|
|
1936
|
-
scoreParts.push(
|
|
1937
|
-
numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
|
|
1938
|
-
);
|
|
1939
|
-
continue;
|
|
1940
|
-
}
|
|
1941
|
-
const formatted = def.format(item.data, options);
|
|
1942
|
-
switch (def.displayStrategy) {
|
|
1943
|
-
case "bar": {
|
|
1944
|
-
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1945
|
-
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
1946
|
-
scoreParts.push(
|
|
1947
|
-
`${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
|
|
1948
|
-
);
|
|
1949
|
-
} else {
|
|
1950
|
-
scoreParts.push(formatted);
|
|
1951
|
-
}
|
|
1952
|
-
break;
|
|
1953
|
-
}
|
|
1954
|
-
case "number":
|
|
1955
|
-
scoreParts.push(formatted);
|
|
1956
|
-
break;
|
|
1957
|
-
case "passFail":
|
|
1958
|
-
scoreParts.push(
|
|
1959
|
-
colorize(
|
|
1960
|
-
formatted,
|
|
1961
|
-
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
1962
|
-
)
|
|
1963
|
-
);
|
|
1964
|
-
break;
|
|
1965
|
-
}
|
|
1966
|
-
}
|
|
1967
|
-
const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
|
|
1968
|
-
let line = ` ${name}: ${passLabel} ${scoreStr}`;
|
|
2088
|
+
const metricParts = [];
|
|
1969
2089
|
if (metrics && metrics.length > 0) {
|
|
1970
|
-
const metricParts = [];
|
|
1971
2090
|
for (const { id, data } of metrics) {
|
|
1972
2091
|
const def = getMetricById(id);
|
|
1973
2092
|
if (def) {
|
|
@@ -1977,11 +2096,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
1977
2096
|
);
|
|
1978
2097
|
}
|
|
1979
2098
|
}
|
|
1980
|
-
|
|
1981
|
-
|
|
2099
|
+
}
|
|
2100
|
+
const scoreLines = [];
|
|
2101
|
+
for (const item of scores) {
|
|
2102
|
+
const def = getScoreById(item.id);
|
|
2103
|
+
const scoreLabel = def ? def.name ?? def.id : item.id;
|
|
2104
|
+
let formatted;
|
|
2105
|
+
if (!def) {
|
|
2106
|
+
const numeric = toNumericScore(item.data);
|
|
2107
|
+
formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
|
|
2108
|
+
} else {
|
|
2109
|
+
const raw = def.format(item.data, options);
|
|
2110
|
+
switch (def.displayStrategy) {
|
|
2111
|
+
case "bar": {
|
|
2112
|
+
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2113
|
+
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
2114
|
+
formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
|
|
2115
|
+
} else {
|
|
2116
|
+
formatted = raw;
|
|
2117
|
+
}
|
|
2118
|
+
break;
|
|
2119
|
+
}
|
|
2120
|
+
case "number":
|
|
2121
|
+
formatted = raw;
|
|
2122
|
+
break;
|
|
2123
|
+
case "passFail":
|
|
2124
|
+
formatted = colorize(
|
|
2125
|
+
raw,
|
|
2126
|
+
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
2127
|
+
);
|
|
2128
|
+
break;
|
|
2129
|
+
}
|
|
1982
2130
|
}
|
|
2131
|
+
scoreLines.push(` ${scoreLabel}: ${formatted}`);
|
|
1983
2132
|
}
|
|
1984
|
-
|
|
2133
|
+
const lines = [];
|
|
2134
|
+
const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
|
|
2135
|
+
lines.push(` ${name}: ${passLabel}${metricStr}`);
|
|
2136
|
+
if (scoreLines.length > 0) {
|
|
2137
|
+
lines.push(...scoreLines);
|
|
2138
|
+
} else {
|
|
2139
|
+
lines.push(` n/a`);
|
|
2140
|
+
}
|
|
2141
|
+
return lines;
|
|
1985
2142
|
}
|
|
1986
2143
|
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
1987
2144
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
@@ -2004,8 +2161,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2004
2161
|
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
2005
2162
|
);
|
|
2006
2163
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2164
|
+
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
2007
2165
|
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
2008
2166
|
let overallScoreTotal = 0;
|
|
2167
|
+
let overallScoreSumSq = 0;
|
|
2009
2168
|
let overallScoreCount = 0;
|
|
2010
2169
|
let completedCount = 0;
|
|
2011
2170
|
let totalCount = 0;
|
|
@@ -2062,19 +2221,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2062
2221
|
if (numeric !== void 0) {
|
|
2063
2222
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
2064
2223
|
total: 0,
|
|
2224
|
+
sumSq: 0,
|
|
2065
2225
|
count: 0,
|
|
2066
2226
|
passed: 0,
|
|
2067
2227
|
failed: 0
|
|
2068
2228
|
};
|
|
2069
2229
|
aggregates.set(item.evaluatorId, {
|
|
2070
2230
|
total: current.total + numeric,
|
|
2231
|
+
sumSq: current.sumSq + numeric * numeric,
|
|
2071
2232
|
count: current.count + 1,
|
|
2072
2233
|
passed: current.passed + (item.passed ? 1 : 0),
|
|
2073
2234
|
failed: current.failed + (item.passed ? 0 : 1)
|
|
2074
2235
|
});
|
|
2075
2236
|
overallScoreTotal += numeric;
|
|
2237
|
+
overallScoreSumSq += numeric * numeric;
|
|
2076
2238
|
overallScoreCount += 1;
|
|
2077
2239
|
}
|
|
2240
|
+
for (const s of item.scores) {
|
|
2241
|
+
const key = `${item.evaluatorId}:${s.id}`;
|
|
2242
|
+
const list = scoreItemsByEvaluatorScore.get(key) ?? [];
|
|
2243
|
+
list.push(s);
|
|
2244
|
+
scoreItemsByEvaluatorScore.set(key, list);
|
|
2245
|
+
}
|
|
2078
2246
|
}
|
|
2079
2247
|
const isSameTestCase = lastPrintedTestCaseId === testCaseId;
|
|
2080
2248
|
const isLastRerun = event.rerunIndex >= event.rerunTotal;
|
|
@@ -2098,7 +2266,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2098
2266
|
for (const item of aggregatedScores) {
|
|
2099
2267
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2100
2268
|
lines.push(
|
|
2101
|
-
formatEvaluatorScoreLine(
|
|
2269
|
+
...formatEvaluatorScoreLine(
|
|
2102
2270
|
name,
|
|
2103
2271
|
item.scores,
|
|
2104
2272
|
item.passed,
|
|
@@ -2180,18 +2348,30 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2180
2348
|
);
|
|
2181
2349
|
if (overallScoreCount > 0) {
|
|
2182
2350
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2351
|
+
const overallSd = sampleStdDev2(
|
|
2352
|
+
overallScoreTotal,
|
|
2353
|
+
overallScoreSumSq,
|
|
2354
|
+
overallScoreCount
|
|
2355
|
+
);
|
|
2356
|
+
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
2183
2357
|
console.log(
|
|
2184
2358
|
`- overall avg score: ${colorize(
|
|
2185
|
-
|
|
2359
|
+
avgStr,
|
|
2186
2360
|
scoreToColor(overallAverage)
|
|
2187
2361
|
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
2188
2362
|
);
|
|
2189
2363
|
}
|
|
2190
2364
|
console.log(colorize("- evaluator averages:", ansi2.magenta));
|
|
2191
2365
|
for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
|
|
2192
|
-
|
|
2193
|
-
|
|
2366
|
+
const evaluatorLines = getEvaluatorSummaryLines(
|
|
2367
|
+
evaluatorId,
|
|
2368
|
+
evaluatorName,
|
|
2369
|
+
aggregates.get(evaluatorId),
|
|
2370
|
+
scoreItemsByEvaluatorScore
|
|
2194
2371
|
);
|
|
2372
|
+
for (const line of evaluatorLines) {
|
|
2373
|
+
console.log(line);
|
|
2374
|
+
}
|
|
2195
2375
|
}
|
|
2196
2376
|
const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
|
|
2197
2377
|
if (testCaseSummaries.length > 0) {
|
|
@@ -2207,7 +2387,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2207
2387
|
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
|
|
2208
2388
|
summary.aggregatedScoreItem.data,
|
|
2209
2389
|
{ isAggregated: true }
|
|
2210
|
-
) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
|
|
2390
|
+
) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
|
|
2211
2391
|
console.log(
|
|
2212
2392
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
2213
2393
|
scoreLabel,
|