@m4trix/evals 0.13.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +287 -107
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +287 -107
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +643 -398
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +634 -389
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +79 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +5 -0
- package/dist/index.js +79 -11
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli-simple.js
CHANGED
|
@@ -6,7 +6,7 @@ import { resolve, relative, join, parse, dirname } from 'path';
|
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
7
7
|
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
|
-
import {
|
|
9
|
+
import { diffLines } from 'diff';
|
|
10
10
|
import React2, { useState, useEffect, useCallback } from 'react';
|
|
11
11
|
import { render, Box, Text } from 'ink';
|
|
12
12
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
@@ -260,8 +260,35 @@ async function collectTestCasesFromFiles(config) {
|
|
|
260
260
|
);
|
|
261
261
|
return found.flat();
|
|
262
262
|
}
|
|
263
|
+
/**
 * Serializes a value to pretty-printed JSON text (2-space indent) so it can be
 * diffed line by line.
 *
 * Always returns a string. `JSON.stringify` returns `undefined` (not a string)
 * for `undefined`, functions and symbols, and throws for circular structures
 * and BigInt values; every one of those cases falls back to `String(value)`.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} Indented JSON text, or the `String(value)` fallback.
 */
function toJsonLines(value) {
  try {
    // `??` covers the non-throwing "not serializable" case (undefined result).
    return JSON.stringify(value, null, 2) ?? String(value);
  } catch {
    // Deliberate best-effort: unserializable input still yields diffable text.
    return String(value);
  }
}
|
|
270
|
+
/**
 * Renders a list of diff parts as unified-style text.
 *
 * Each line of a part's `value` is emitted as `<marker> <line>`, where the
 * marker is "+" for parts flagged `added`, "-" for parts flagged `removed`,
 * and a space otherwise. The empty trailing segment produced by a final
 * newline in `value` is not emitted.
 *
 * @param {Iterable<{value: string, added?: boolean, removed?: boolean}>} changes
 * @returns {string} The rendered lines joined with "\n".
 */
function formatDiffString(changes) {
  const markerOf = (part) => {
    if (part.added) {
      return "+";
    }
    if (part.removed) {
      return "-";
    }
    return " ";
  };
  const rendered = Array.from(changes).flatMap((part) => {
    const marker = markerOf(part);
    const segments = part.value.split("\n");
    // Drop the empty segment that follows a terminating newline.
    const kept = segments[segments.length - 1] === "" ? segments.slice(0, -1) : segments;
    return kept.map((text) => `${marker} ${text}`);
  });
  return rendered.join("\n");
}
|
|
284
|
+
/**
 * Builds a line-based textual diff between two values.
 *
 * Both values are serialized with `toJsonLines`, compared with `diffLines`,
 * and the resulting change list is rendered by `formatDiffString`.
 *
 * @param {unknown} expected - Reference value (left side of the diff).
 * @param {unknown} actual - Observed value (right side of the diff).
 * @returns {string} The rendered diff text.
 */
function createDiffString(expected, actual) {
  return formatDiffString(
    diffLines(toJsonLines(expected), toJsonLines(actual))
  );
}
|
|
263
290
|
function createDiffLogEntry(expected, actual, options) {
|
|
264
|
-
const diff =
|
|
291
|
+
const diff = createDiffString(expected, actual);
|
|
265
292
|
return {
|
|
266
293
|
type: "diff",
|
|
267
294
|
label: options?.label,
|
|
@@ -271,7 +298,7 @@ function createDiffLogEntry(expected, actual, options) {
|
|
|
271
298
|
};
|
|
272
299
|
}
|
|
273
300
|
function getDiffLines(entry) {
|
|
274
|
-
const raw =
|
|
301
|
+
const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
|
|
275
302
|
return raw.split("\n").map((line) => {
|
|
276
303
|
const trimmed = line.trimStart();
|
|
277
304
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -331,15 +358,28 @@ function getScoreById(id) {
|
|
|
331
358
|
}
|
|
332
359
|
|
|
333
360
|
// src/evals/aggregators.ts
|
|
334
|
-
function
|
|
361
|
+
/**
 * Aggregates score items into their mean, sample standard deviation and count.
 *
 * - Empty input returns `{ value: 0, count: 0 }`.
 * - With fewer than two items `stdDev` is `undefined` (a sample deviation
 *   needs at least two observations).
 *
 * The deviation is computed from squared deviations around the mean (two
 * passes) instead of `sumSq - n * mean^2`; the latter subtracts two nearly
 * equal large numbers and loses precision when values are large relative to
 * their spread.
 *
 * @param {Array<{value: number}>} values - Items whose `value` is averaged.
 * @returns {{value: number, stdDev?: number, count: number}}
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  const mean = values.reduce((acc, item) => acc + item.value, 0) / count;
  let stdDev;
  if (count >= 2) {
    const squaredDeviations = values.reduce((acc, item) => {
      const delta = item.value - mean;
      return acc + delta * delta;
    }, 0);
    const variance = squaredDeviations / (count - 1);
    // Same guard as before: anything that is not a positive variance reports 0.
    stdDev = variance > 0 ? Math.sqrt(variance) : 0;
  }
  return { value: mean, stdDev, count };
}
|
|
341
375
|
/**
 * Aggregates pass/fail items into a single verdict plus counts.
 *
 * `passed` is true only when there is at least one item and no item has a
 * falsy `passed` flag; `passedCount` counts items with a truthy flag and
 * `totalCount` is the number of items.
 *
 * @param {Array<{passed?: boolean}>} values - Items to aggregate.
 * @returns {{passed: boolean, passedCount: number, totalCount: number}}
 */
function aggregateAll(values) {
  const totalCount = values.length;
  const passedCount = values.reduce(
    (tally, entry) => (entry.passed ? tally + 1 : tally),
    0
  );
  const anyFailed = values.some((entry) => !entry.passed);
  return {
    passed: totalCount > 0 && !anyFailed,
    passedCount,
    totalCount,
  };
}
|
|
344
384
|
function aggregateTokenCountSum(values) {
|
|
345
385
|
const initial = {
|
|
@@ -393,14 +433,28 @@ Score.of({
|
|
|
393
433
|
id: "percent",
|
|
394
434
|
name: "Score",
|
|
395
435
|
displayStrategy: "bar",
|
|
396
|
-
format: (data, options) =>
|
|
397
|
-
|
|
436
|
+
format: (data, options) => {
|
|
437
|
+
if (options?.isAggregated) {
|
|
438
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
439
|
+
}
|
|
440
|
+
return data.value.toFixed(2);
|
|
441
|
+
},
|
|
442
|
+
aggregate: aggregateAverageWithVariance
|
|
398
443
|
});
|
|
399
444
|
Score.of({
|
|
400
445
|
id: "binary",
|
|
401
446
|
name: "Result",
|
|
402
447
|
displayStrategy: "passFail",
|
|
403
|
-
format: (data, options) =>
|
|
448
|
+
format: (data, options) => {
|
|
449
|
+
if (options?.isAggregated) {
|
|
450
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
451
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
452
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
453
|
+
}
|
|
454
|
+
return base;
|
|
455
|
+
}
|
|
456
|
+
return data.passed ? "PASSED" : "NOT PASSED";
|
|
457
|
+
},
|
|
404
458
|
aggregate: aggregateAll
|
|
405
459
|
});
|
|
406
460
|
|
|
@@ -1339,6 +1393,13 @@ function Spinner({ label = "Running" }) {
|
|
|
1339
1393
|
label
|
|
1340
1394
|
] });
|
|
1341
1395
|
}
|
|
1396
|
+
/**
 * Sample standard deviation from running totals.
 *
 * @param {number} sum - Sum of the observations.
 * @param {number} sumSq - Sum of the squared observations.
 * @param {number} n - Number of observations.
 * @returns {number|undefined} `undefined` when `n < 2`; otherwise the square
 *   root of `(sumSq - n * mean^2) / (n - 1)`, or 0 when that quantity is not
 *   greater than zero.
 */
function sampleStdDev(sum, sumSq, n) {
  if (n < 2) {
    return undefined;
  }
  const average = sum / n;
  const sampleVariance = (sumSq - n * average * average) / (n - 1);
  if (sampleVariance > 0) {
    return Math.sqrt(sampleVariance);
  }
  return 0;
}
|
|
1342
1403
|
function scoreColor(score) {
|
|
1343
1404
|
if (score >= 80)
|
|
1344
1405
|
return "green";
|
|
@@ -1457,7 +1518,9 @@ function RunView({
|
|
|
1457
1518
|
);
|
|
1458
1519
|
setEvaluatorNameById(nameById);
|
|
1459
1520
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1521
|
+
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
1460
1522
|
let overallScoreTotal = 0;
|
|
1523
|
+
let overallScoreSumSq = 0;
|
|
1461
1524
|
let overallScoreCount = 0;
|
|
1462
1525
|
const done = new Promise((resolve5) => {
|
|
1463
1526
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
@@ -1469,19 +1532,28 @@ function RunView({
|
|
|
1469
1532
|
if (numeric !== void 0) {
|
|
1470
1533
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
1471
1534
|
total: 0,
|
|
1535
|
+
sumSq: 0,
|
|
1472
1536
|
count: 0,
|
|
1473
1537
|
passed: 0,
|
|
1474
1538
|
failed: 0
|
|
1475
1539
|
};
|
|
1476
1540
|
aggregates.set(item.evaluatorId, {
|
|
1477
1541
|
total: current.total + numeric,
|
|
1542
|
+
sumSq: current.sumSq + numeric * numeric,
|
|
1478
1543
|
count: current.count + 1,
|
|
1479
1544
|
passed: current.passed + (item.passed ? 1 : 0),
|
|
1480
1545
|
failed: current.failed + (item.passed ? 0 : 1)
|
|
1481
1546
|
});
|
|
1482
1547
|
overallScoreTotal += numeric;
|
|
1548
|
+
overallScoreSumSq += numeric * numeric;
|
|
1483
1549
|
overallScoreCount += 1;
|
|
1484
1550
|
}
|
|
1551
|
+
for (const s of item.scores) {
|
|
1552
|
+
const key = `${item.evaluatorId}:${s.id}`;
|
|
1553
|
+
const list = scoreItemsByEvaluatorScore.get(key) ?? [];
|
|
1554
|
+
list.push(s);
|
|
1555
|
+
scoreItemsByEvaluatorScore.set(key, list);
|
|
1556
|
+
}
|
|
1485
1557
|
}
|
|
1486
1558
|
setTestCases((prev) => {
|
|
1487
1559
|
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
@@ -1549,8 +1621,10 @@ function RunView({
|
|
|
1549
1621
|
failedTestCases: finalEvent.failedTestCases,
|
|
1550
1622
|
totalTestCases: finalEvent.totalTestCases,
|
|
1551
1623
|
overallScoreTotal,
|
|
1624
|
+
overallScoreSumSq,
|
|
1552
1625
|
overallScoreCount,
|
|
1553
1626
|
aggregates: new Map(aggregates),
|
|
1627
|
+
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1554
1628
|
artifactPath: finalEvent.artifactPath
|
|
1555
1629
|
});
|
|
1556
1630
|
setPhase("completed");
|
|
@@ -1633,36 +1707,45 @@ function RunView({
|
|
|
1633
1707
|
":",
|
|
1634
1708
|
" ",
|
|
1635
1709
|
/* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1636
|
-
|
|
1637
|
-
|
|
1710
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
1711
|
+
" ",
|
|
1712
|
+
item.metrics.map((m) => {
|
|
1713
|
+
const def = getMetricById(m.id);
|
|
1714
|
+
if (!def)
|
|
1715
|
+
return null;
|
|
1716
|
+
const formatted = def.format(m.data, {
|
|
1717
|
+
isAggregated: tc.isAggregated
|
|
1718
|
+
});
|
|
1719
|
+
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1720
|
+
"[",
|
|
1721
|
+
def.name ? `${def.name}: ` : "",
|
|
1722
|
+
formatted,
|
|
1723
|
+
"]",
|
|
1724
|
+
" "
|
|
1725
|
+
] }, m.id);
|
|
1726
|
+
})
|
|
1727
|
+
] }) : null
|
|
1728
|
+
] }),
|
|
1729
|
+
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
1730
|
+
const def = getScoreById(s.id);
|
|
1731
|
+
const scoreLabel = def ? def.name ?? def.id : s.id;
|
|
1732
|
+
return /* @__PURE__ */ jsxs(
|
|
1638
1733
|
Text,
|
|
1639
1734
|
{
|
|
1640
1735
|
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
1641
1736
|
children: [
|
|
1737
|
+
" ",
|
|
1738
|
+
scoreLabel,
|
|
1739
|
+
":",
|
|
1740
|
+
" ",
|
|
1642
1741
|
formatScorePart(s, scoreColor, {
|
|
1643
1742
|
isAggregated: tc.isAggregated
|
|
1644
|
-
})
|
|
1645
|
-
" "
|
|
1743
|
+
})
|
|
1646
1744
|
]
|
|
1647
1745
|
},
|
|
1648
|
-
s.id
|
|
1649
|
-
)
|
|
1650
|
-
|
|
1651
|
-
const def = getMetricById(m.id);
|
|
1652
|
-
if (!def)
|
|
1653
|
-
return null;
|
|
1654
|
-
const formatted = def.format(m.data, {
|
|
1655
|
-
isAggregated: tc.isAggregated
|
|
1656
|
-
});
|
|
1657
|
-
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1658
|
-
"[",
|
|
1659
|
-
def.name ? `${def.name}: ` : "",
|
|
1660
|
-
formatted,
|
|
1661
|
-
"]",
|
|
1662
|
-
" "
|
|
1663
|
-
] }, m.id);
|
|
1664
|
-
})
|
|
1665
|
-
] }),
|
|
1746
|
+
`${item.evaluatorId}-${s.id}-${idx}`
|
|
1747
|
+
);
|
|
1748
|
+
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
|
|
1666
1749
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1667
1750
|
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1668
1751
|
({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
|
|
@@ -1706,45 +1789,85 @@ function RunView({
|
|
|
1706
1789
|
label: "overall avg",
|
|
1707
1790
|
value: summary.overallScoreTotal / summary.overallScoreCount,
|
|
1708
1791
|
barWidth: 20,
|
|
1709
|
-
format: (v) =>
|
|
1792
|
+
format: (v) => {
|
|
1793
|
+
const sd = sampleStdDev(
|
|
1794
|
+
summary.overallScoreTotal,
|
|
1795
|
+
summary.overallScoreSumSq,
|
|
1796
|
+
summary.overallScoreCount
|
|
1797
|
+
);
|
|
1798
|
+
return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
|
|
1799
|
+
}
|
|
1710
1800
|
}
|
|
1711
1801
|
) }),
|
|
1712
1802
|
/* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
|
|
1713
1803
|
/* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
|
|
1714
1804
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
1715
1805
|
const agg = summary.aggregates.get(id);
|
|
1716
|
-
|
|
1806
|
+
const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
|
|
1807
|
+
(k) => k.startsWith(`${id}:`)
|
|
1808
|
+
);
|
|
1809
|
+
if (scoreKeys.length === 0) {
|
|
1717
1810
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1718
1811
|
"- ",
|
|
1719
1812
|
name.padEnd(28),
|
|
1720
|
-
" no
|
|
1813
|
+
" no scores"
|
|
1721
1814
|
] }, id);
|
|
1722
1815
|
}
|
|
1723
|
-
const
|
|
1724
|
-
return /* @__PURE__ */ jsxs(Text, { children: [
|
|
1725
|
-
"- ",
|
|
1726
|
-
name.padEnd(28),
|
|
1727
|
-
" avg=",
|
|
1728
|
-
/* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
|
|
1816
|
+
const passedFailed = agg != null ? /* @__PURE__ */ jsxs(Text, { children: [
|
|
1729
1817
|
" ",
|
|
1730
1818
|
"passed=",
|
|
1731
1819
|
agg.passed,
|
|
1732
1820
|
" failed=",
|
|
1733
1821
|
agg.failed
|
|
1822
|
+
] }) : null;
|
|
1823
|
+
return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
|
|
1824
|
+
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1825
|
+
"- ",
|
|
1826
|
+
name.padEnd(28),
|
|
1827
|
+
passedFailed
|
|
1828
|
+
] }),
|
|
1829
|
+
scoreKeys.map((key) => {
|
|
1830
|
+
const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
|
|
1831
|
+
const aggregated = aggregateScoreItems(items);
|
|
1832
|
+
if (!aggregated)
|
|
1833
|
+
return null;
|
|
1834
|
+
const def = getScoreById(aggregated.id);
|
|
1835
|
+
const label = def ? def.name ?? def.id : aggregated.id;
|
|
1836
|
+
const formatted = def?.format(aggregated.data, {
|
|
1837
|
+
isAggregated: true
|
|
1838
|
+
}) ?? "n/a";
|
|
1839
|
+
const numeric = toNumericScore(aggregated.data);
|
|
1840
|
+
return /* @__PURE__ */ jsxs(
|
|
1841
|
+
Text,
|
|
1842
|
+
{
|
|
1843
|
+
color: numeric !== void 0 ? scoreColor(numeric) : "gray",
|
|
1844
|
+
children: [
|
|
1845
|
+
" ",
|
|
1846
|
+
label,
|
|
1847
|
+
": ",
|
|
1848
|
+
formatted
|
|
1849
|
+
]
|
|
1850
|
+
},
|
|
1851
|
+
key
|
|
1852
|
+
);
|
|
1853
|
+
})
|
|
1734
1854
|
] }, id);
|
|
1735
1855
|
})
|
|
1736
1856
|
] }),
|
|
1737
1857
|
/* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
|
|
1738
1858
|
/* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
|
|
1739
1859
|
testCases.map((tc) => {
|
|
1740
|
-
const
|
|
1741
|
-
(
|
|
1860
|
+
const allScores = tc.events.flatMap(
|
|
1861
|
+
(ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
|
|
1742
1862
|
);
|
|
1743
|
-
const averageScore =
|
|
1863
|
+
const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
|
|
1864
|
+
const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
|
|
1865
|
+
const total = allScores.reduce((a, b) => a + b, 0);
|
|
1866
|
+
const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
|
|
1744
1867
|
const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
|
|
1745
1868
|
const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
|
|
1746
1869
|
isAggregated: true
|
|
1747
|
-
}) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
|
|
1870
|
+
}) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
|
|
1748
1871
|
return /* @__PURE__ */ jsxs(Box, { children: [
|
|
1749
1872
|
/* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
|
|
1750
1873
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
@@ -1778,13 +1901,26 @@ function RunView({
|
|
|
1778
1901
|
}
|
|
1779
1902
|
|
|
1780
1903
|
// src/cli-simple/run.ts
|
|
1904
|
+
/**
 * Sample standard deviation derived from a running sum and sum of squares.
 *
 * @param {number} sum - Sum of the observations.
 * @param {number} sumSq - Sum of the squared observations.
 * @param {number} n - Number of observations.
 * @returns {number|undefined} `undefined` for fewer than two observations;
 *   otherwise sqrt of `(sumSq - n * mean^2) / (n - 1)`, reported as 0 whenever
 *   that variance is not strictly positive.
 */
function sampleStdDev2(sum, sumSq, n) {
  if (n < 2) {
    return void 0;
  }
  const mean = sum / n;
  const numerator = sumSq - n * mean * mean;
  const variance = numerator / (n - 1);
  return !(variance > 0) ? 0 : Math.sqrt(variance);
}
|
|
1781
1911
|
function buildTestCaseSummaries(byId) {
|
|
1782
1912
|
const summaries = [];
|
|
1783
1913
|
for (const { name, events } of byId.values()) {
|
|
1784
1914
|
const passed = events.every((e) => e.passed);
|
|
1785
1915
|
const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
|
|
1786
1916
|
const isAggregated = events.length > 1;
|
|
1787
|
-
const
|
|
1917
|
+
const allScores = events.flatMap(
|
|
1918
|
+
(ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
|
|
1919
|
+
);
|
|
1920
|
+
const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
|
|
1921
|
+
const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
|
|
1922
|
+
const total = allScores.reduce((a, b) => a + b, 0);
|
|
1923
|
+
const stdDev = sampleStdDev2(total, sumSq, allScores.length);
|
|
1788
1924
|
let firstAggregatedScore;
|
|
1789
1925
|
for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
|
|
1790
1926
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
@@ -1800,21 +1936,18 @@ function buildTestCaseSummaries(byId) {
|
|
|
1800
1936
|
}
|
|
1801
1937
|
for (const items of scoreIdToItems.values()) {
|
|
1802
1938
|
const agg = aggregateScoreItems(items);
|
|
1803
|
-
if (agg) {
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
numericScores.push(n);
|
|
1807
|
-
if (firstAggregatedScore === void 0) {
|
|
1808
|
-
firstAggregatedScore = agg;
|
|
1809
|
-
}
|
|
1810
|
-
}
|
|
1939
|
+
if (agg && firstAggregatedScore === void 0) {
|
|
1940
|
+
firstAggregatedScore = agg;
|
|
1941
|
+
break;
|
|
1811
1942
|
}
|
|
1812
1943
|
}
|
|
1944
|
+
if (firstAggregatedScore !== void 0)
|
|
1945
|
+
break;
|
|
1813
1946
|
}
|
|
1814
|
-
const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
|
|
1815
1947
|
summaries.push({
|
|
1816
1948
|
name,
|
|
1817
1949
|
averageScore,
|
|
1950
|
+
stdDev: stdDev ?? void 0,
|
|
1818
1951
|
aggregatedScoreItem: firstAggregatedScore,
|
|
1819
1952
|
isAggregated,
|
|
1820
1953
|
durationMs,
|
|
@@ -1845,12 +1978,36 @@ function scoreToColor(score) {
|
|
|
1845
1978
|
}
|
|
1846
1979
|
return ansi2.red;
|
|
1847
1980
|
}
|
|
1848
|
-
function
|
|
1849
|
-
|
|
1850
|
-
|
|
1981
|
+
/**
 * Builds the plain-text summary lines for one evaluator.
 *
 * Score items are looked up in `scoreItemsByKey` under keys that start with
 * `"<evaluatorId>:"`. Each matching key is aggregated with
 * `aggregateScoreItems`, formatted through its score definition (when one is
 * registered) and colored by its numeric value (when one can be derived).
 *
 * @param {string} evaluatorId - Evaluator id used as the key prefix.
 * @param {string} evaluatorName - Display name, padded to 28 characters.
 * @param {{passed: number, failed: number}|null|undefined} aggregate - Pass/fail
 *   tallies for the evaluator, appended to the heading when present.
 * @param {Map<string, Array>} scoreItemsByKey - Score items keyed by
 *   `"<evaluatorId>:<scoreId>"`.
 * @returns {string[]} A heading followed by one line per aggregated score, or
 *   a single "no scores" / "no numeric scores" line.
 */
function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
  const heading = `- ${evaluatorName.padEnd(28)}`;
  const keyPrefix = `${evaluatorId}:`;
  const ownKeys = Array.from(scoreItemsByKey.keys()).filter(
    (key) => key.startsWith(keyPrefix)
  );
  if (ownKeys.length === 0) {
    return [`${heading} no scores`];
  }

  let passedFailed = "";
  if (aggregate != null) {
    passedFailed = ` passed=${aggregate.passed} failed=${aggregate.failed}`;
  }

  // One rendered line per key; keys whose items cannot be aggregated are skipped.
  const scoreLines = ownKeys.flatMap((key) => {
    const aggregated = aggregateScoreItems(scoreItemsByKey.get(key) ?? []);
    if (!aggregated) {
      return [];
    }
    const definition = getScoreById(aggregated.id);
    const label = definition ? definition.name ?? definition.id : aggregated.id;
    const text = definition?.format(aggregated.data, { isAggregated: true }) ?? "n/a";
    const numericValue = toNumericScore(aggregated.data);
    const display = numericValue !== void 0 ? colorize(text, scoreToColor(numericValue)) : text;
    return [` ${label}: ${display}`];
  });

  if (scoreLines.length === 0) {
    return [`${heading} no numeric scores${passedFailed}`];
  }
  return [`${heading}${passedFailed}`, ...scoreLines];
}
|
|
1855
2012
|
function createBar2(value, max = 100, width = 20) {
|
|
1856
2013
|
const safe = Math.max(0, Math.min(max, value));
|
|
@@ -1902,46 +2059,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
|
|
|
1902
2059
|
}
|
|
1903
2060
|
function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
1904
2061
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
1905
|
-
const
|
|
1906
|
-
for (const item of scores) {
|
|
1907
|
-
const def = getScoreById(item.id);
|
|
1908
|
-
if (!def) {
|
|
1909
|
-
const numeric = toNumericScore(item.data);
|
|
1910
|
-
scoreParts.push(
|
|
1911
|
-
numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
|
|
1912
|
-
);
|
|
1913
|
-
continue;
|
|
1914
|
-
}
|
|
1915
|
-
const formatted = def.format(item.data, options);
|
|
1916
|
-
switch (def.displayStrategy) {
|
|
1917
|
-
case "bar": {
|
|
1918
|
-
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1919
|
-
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
1920
|
-
scoreParts.push(
|
|
1921
|
-
`${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
|
|
1922
|
-
);
|
|
1923
|
-
} else {
|
|
1924
|
-
scoreParts.push(formatted);
|
|
1925
|
-
}
|
|
1926
|
-
break;
|
|
1927
|
-
}
|
|
1928
|
-
case "number":
|
|
1929
|
-
scoreParts.push(formatted);
|
|
1930
|
-
break;
|
|
1931
|
-
case "passFail":
|
|
1932
|
-
scoreParts.push(
|
|
1933
|
-
colorize(
|
|
1934
|
-
formatted,
|
|
1935
|
-
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
1936
|
-
)
|
|
1937
|
-
);
|
|
1938
|
-
break;
|
|
1939
|
-
}
|
|
1940
|
-
}
|
|
1941
|
-
const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
|
|
1942
|
-
let line = ` ${name}: ${passLabel} ${scoreStr}`;
|
|
2062
|
+
const metricParts = [];
|
|
1943
2063
|
if (metrics && metrics.length > 0) {
|
|
1944
|
-
const metricParts = [];
|
|
1945
2064
|
for (const { id, data } of metrics) {
|
|
1946
2065
|
const def = getMetricById(id);
|
|
1947
2066
|
if (def) {
|
|
@@ -1951,11 +2070,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
1951
2070
|
);
|
|
1952
2071
|
}
|
|
1953
2072
|
}
|
|
1954
|
-
|
|
1955
|
-
|
|
2073
|
+
}
|
|
2074
|
+
const scoreLines = [];
|
|
2075
|
+
for (const item of scores) {
|
|
2076
|
+
const def = getScoreById(item.id);
|
|
2077
|
+
const scoreLabel = def ? def.name ?? def.id : item.id;
|
|
2078
|
+
let formatted;
|
|
2079
|
+
if (!def) {
|
|
2080
|
+
const numeric = toNumericScore(item.data);
|
|
2081
|
+
formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
|
|
2082
|
+
} else {
|
|
2083
|
+
const raw = def.format(item.data, options);
|
|
2084
|
+
switch (def.displayStrategy) {
|
|
2085
|
+
case "bar": {
|
|
2086
|
+
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2087
|
+
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
2088
|
+
formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
|
|
2089
|
+
} else {
|
|
2090
|
+
formatted = raw;
|
|
2091
|
+
}
|
|
2092
|
+
break;
|
|
2093
|
+
}
|
|
2094
|
+
case "number":
|
|
2095
|
+
formatted = raw;
|
|
2096
|
+
break;
|
|
2097
|
+
case "passFail":
|
|
2098
|
+
formatted = colorize(
|
|
2099
|
+
raw,
|
|
2100
|
+
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
2101
|
+
);
|
|
2102
|
+
break;
|
|
2103
|
+
}
|
|
1956
2104
|
}
|
|
2105
|
+
scoreLines.push(` ${scoreLabel}: ${formatted}`);
|
|
1957
2106
|
}
|
|
1958
|
-
|
|
2107
|
+
const lines = [];
|
|
2108
|
+
const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
|
|
2109
|
+
lines.push(` ${name}: ${passLabel}${metricStr}`);
|
|
2110
|
+
if (scoreLines.length > 0) {
|
|
2111
|
+
lines.push(...scoreLines);
|
|
2112
|
+
} else {
|
|
2113
|
+
lines.push(` n/a`);
|
|
2114
|
+
}
|
|
2115
|
+
return lines;
|
|
1959
2116
|
}
|
|
1960
2117
|
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
1961
2118
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
@@ -1978,8 +2135,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1978
2135
|
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
1979
2136
|
);
|
|
1980
2137
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2138
|
+
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
1981
2139
|
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
1982
2140
|
let overallScoreTotal = 0;
|
|
2141
|
+
let overallScoreSumSq = 0;
|
|
1983
2142
|
let overallScoreCount = 0;
|
|
1984
2143
|
let completedCount = 0;
|
|
1985
2144
|
let totalCount = 0;
|
|
@@ -2036,19 +2195,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2036
2195
|
if (numeric !== void 0) {
|
|
2037
2196
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
2038
2197
|
total: 0,
|
|
2198
|
+
sumSq: 0,
|
|
2039
2199
|
count: 0,
|
|
2040
2200
|
passed: 0,
|
|
2041
2201
|
failed: 0
|
|
2042
2202
|
};
|
|
2043
2203
|
aggregates.set(item.evaluatorId, {
|
|
2044
2204
|
total: current.total + numeric,
|
|
2205
|
+
sumSq: current.sumSq + numeric * numeric,
|
|
2045
2206
|
count: current.count + 1,
|
|
2046
2207
|
passed: current.passed + (item.passed ? 1 : 0),
|
|
2047
2208
|
failed: current.failed + (item.passed ? 0 : 1)
|
|
2048
2209
|
});
|
|
2049
2210
|
overallScoreTotal += numeric;
|
|
2211
|
+
overallScoreSumSq += numeric * numeric;
|
|
2050
2212
|
overallScoreCount += 1;
|
|
2051
2213
|
}
|
|
2214
|
+
for (const s of item.scores) {
|
|
2215
|
+
const key = `${item.evaluatorId}:${s.id}`;
|
|
2216
|
+
const list = scoreItemsByEvaluatorScore.get(key) ?? [];
|
|
2217
|
+
list.push(s);
|
|
2218
|
+
scoreItemsByEvaluatorScore.set(key, list);
|
|
2219
|
+
}
|
|
2052
2220
|
}
|
|
2053
2221
|
const isSameTestCase = lastPrintedTestCaseId === testCaseId;
|
|
2054
2222
|
const isLastRerun = event.rerunIndex >= event.rerunTotal;
|
|
@@ -2072,7 +2240,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2072
2240
|
for (const item of aggregatedScores) {
|
|
2073
2241
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2074
2242
|
lines.push(
|
|
2075
|
-
formatEvaluatorScoreLine(
|
|
2243
|
+
...formatEvaluatorScoreLine(
|
|
2076
2244
|
name,
|
|
2077
2245
|
item.scores,
|
|
2078
2246
|
item.passed,
|
|
@@ -2154,18 +2322,30 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2154
2322
|
);
|
|
2155
2323
|
if (overallScoreCount > 0) {
|
|
2156
2324
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2325
|
+
const overallSd = sampleStdDev2(
|
|
2326
|
+
overallScoreTotal,
|
|
2327
|
+
overallScoreSumSq,
|
|
2328
|
+
overallScoreCount
|
|
2329
|
+
);
|
|
2330
|
+
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
2157
2331
|
console.log(
|
|
2158
2332
|
`- overall avg score: ${colorize(
|
|
2159
|
-
|
|
2333
|
+
avgStr,
|
|
2160
2334
|
scoreToColor(overallAverage)
|
|
2161
2335
|
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
2162
2336
|
);
|
|
2163
2337
|
}
|
|
2164
2338
|
console.log(colorize("- evaluator averages:", ansi2.magenta));
|
|
2165
2339
|
for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
|
|
2166
|
-
|
|
2167
|
-
|
|
2340
|
+
const evaluatorLines = getEvaluatorSummaryLines(
|
|
2341
|
+
evaluatorId,
|
|
2342
|
+
evaluatorName,
|
|
2343
|
+
aggregates.get(evaluatorId),
|
|
2344
|
+
scoreItemsByEvaluatorScore
|
|
2168
2345
|
);
|
|
2346
|
+
for (const line of evaluatorLines) {
|
|
2347
|
+
console.log(line);
|
|
2348
|
+
}
|
|
2169
2349
|
}
|
|
2170
2350
|
const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
|
|
2171
2351
|
if (testCaseSummaries.length > 0) {
|
|
@@ -2181,7 +2361,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2181
2361
|
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
|
|
2182
2362
|
summary.aggregatedScoreItem.data,
|
|
2183
2363
|
{ isAggregated: true }
|
|
2184
|
-
) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
|
|
2364
|
+
) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
|
|
2185
2365
|
console.log(
|
|
2186
2366
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
2187
2367
|
scoreLabel,
|