@m4trix/evals 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +129 -29
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +129 -29
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +591 -380
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +582 -371
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +79 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +5 -0
- package/dist/index.js +79 -11
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli-simple.cjs
CHANGED
|
@@ -8,7 +8,7 @@ var path = require('path');
|
|
|
8
8
|
var jitiModule = require('jiti');
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
|
-
var
|
|
11
|
+
var diff = require('diff');
|
|
12
12
|
var React2 = require('react');
|
|
13
13
|
var ink = require('ink');
|
|
14
14
|
var jsxRuntime = require('react/jsx-runtime');
|
|
@@ -286,8 +286,35 @@ async function collectTestCasesFromFiles(config) {
|
|
|
286
286
|
);
|
|
287
287
|
return found.flat();
|
|
288
288
|
}
|
|
289
|
+
/**
 * Serializes a value to pretty-printed JSON text so it can be diffed line by line.
 *
 * Falls back to `String(value)` whenever JSON cannot represent the value:
 * - `JSON.stringify` throws (for example circular references or BigInt), or
 * - `JSON.stringify` returns `undefined` instead of a string, which it does
 *   for `undefined`, functions and symbols.
 *
 * @param {unknown} value - Any value to serialize.
 * @returns {string} Two-space-indented JSON, or the `String()` form of the value.
 */
function toJsonLines(value) {
  try {
    // The result must always be a string: it is split and diffed as text.
    return JSON.stringify(value, null, 2) ?? String(value);
  } catch {
    return String(value);
  }
}
|
|
296
|
+
/**
 * Renders diff change records as marker-prefixed text.
 *
 * Each line of a record's `value` is emitted as `<marker> <line>`, where the
 * marker is "+" for `added` records, "-" for `removed` records and a space
 * otherwise (`added` wins when both flags are set).
 *
 * @param {Iterable<{ value: string, added?: boolean, removed?: boolean }>} changes
 *   Change records to render.
 * @returns {string} The rendered lines joined with "\n"; empty when nothing is emitted.
 */
function formatDiffString(changes) {
  const rendered = [];
  for (const change of changes) {
    let marker = " ";
    if (change.added) {
      marker = "+";
    } else if (change.removed) {
      marker = "-";
    }
    const segments = change.value.split("\n");
    // A value ending in "\n" yields one trailing empty segment; leave it out.
    const endsWithBlank = segments[segments.length - 1] === "";
    const limit = endsWithBlank ? segments.length - 1 : segments.length;
    for (let index = 0; index < limit; index += 1) {
      rendered.push(`${marker} ${segments[index]}`);
    }
  }
  return rendered.join("\n");
}
|
|
310
|
+
/**
 * Builds a line-based textual diff between two values.
 *
 * Both values are serialized with `toJsonLines`, compared with
 * `diff.diffLines`, and the resulting changes rendered by `formatDiffString`.
 *
 * @param {unknown} expected - Reference value (left-hand side of the diff).
 * @param {unknown} actual - Observed value (right-hand side of the diff).
 * @returns {string} The formatted diff text.
 */
function createDiffString(expected, actual) {
  const [before, after] = [expected, actual].map((side) => toJsonLines(side));
  return formatDiffString(diff.diffLines(before, after));
}
|
|
289
316
|
function createDiffLogEntry(expected, actual, options) {
|
|
290
|
-
const diff =
|
|
317
|
+
const diff = createDiffString(expected, actual);
|
|
291
318
|
return {
|
|
292
319
|
type: "diff",
|
|
293
320
|
label: options?.label,
|
|
@@ -297,7 +324,7 @@ function createDiffLogEntry(expected, actual, options) {
|
|
|
297
324
|
};
|
|
298
325
|
}
|
|
299
326
|
function getDiffLines(entry) {
|
|
300
|
-
const raw =
|
|
327
|
+
const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
|
|
301
328
|
return raw.split("\n").map((line) => {
|
|
302
329
|
const trimmed = line.trimStart();
|
|
303
330
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -357,15 +384,28 @@ function getScoreById(id) {
|
|
|
357
384
|
}
|
|
358
385
|
|
|
359
386
|
// src/evals/aggregators.ts
|
|
360
|
-
function
|
|
387
|
+
/**
 * Aggregates score items into their mean and sample standard deviation.
 *
 * @param {Array<{ value: number }>} values - Score items; only `value` is read.
 * @returns {{ value: number, stdDev?: number, count: number }}
 *   `value` is the arithmetic mean (0 for an empty input), `count` is the
 *   number of items, and `stdDev` is the sample (n - 1) standard deviation.
 *   `stdDev` is `undefined` with fewer than two items and absent for an
 *   empty input.
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  const sum = values.reduce((acc, item) => acc + item.value, 0);
  const mean = sum / count;
  let stdDev;
  if (count >= 2) {
    // Sum squared deviations from the mean in a second pass. The shortcut
    // `sumSq - n * mean^2` subtracts two nearly equal large numbers and
    // loses precision when values are large relative to their spread.
    const squaredDeviations = values.reduce((acc, item) => {
      const delta = item.value - mean;
      return acc + delta * delta;
    }, 0);
    const variance = squaredDeviations / (count - 1);
    // Non-positive (and NaN) variance is reported as 0.
    stdDev = variance > 0 ? Math.sqrt(variance) : 0;
  }
  return { value: mean, stdDev, count };
}
|
|
367
401
|
/**
 * Aggregates pass/fail items into an overall verdict plus counts.
 *
 * @param {Array<{ passed?: boolean }>} values - Items; only `passed` is read.
 * @returns {{ passed: boolean, passedCount: number, totalCount: number }}
 *   `passed` is true only when there is at least one item and every item
 *   has a truthy `passed`; `passedCount` counts the truthy ones and
 *   `totalCount` is the number of items.
 */
function aggregateAll(values) {
  const totalCount = values.length;
  const passedCount = values.reduce(
    (tally, item) => (item.passed ? tally + 1 : tally),
    0
  );
  // An empty input never counts as passed.
  const passed = totalCount > 0 && passedCount === totalCount;
  return { passed, passedCount, totalCount };
}
|
|
370
410
|
function aggregateTokenCountSum(values) {
|
|
371
411
|
const initial = {
|
|
@@ -419,14 +459,28 @@ Score.of({
|
|
|
419
459
|
id: "percent",
|
|
420
460
|
name: "Score",
|
|
421
461
|
displayStrategy: "bar",
|
|
422
|
-
format: (data, options) =>
|
|
423
|
-
|
|
462
|
+
format: (data, options) => {
|
|
463
|
+
if (options?.isAggregated) {
|
|
464
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
465
|
+
}
|
|
466
|
+
return data.value.toFixed(2);
|
|
467
|
+
},
|
|
468
|
+
aggregate: aggregateAverageWithVariance
|
|
424
469
|
});
|
|
425
470
|
// Score definition with id "binary", shown with the "passFail" display strategy.
// `format` turns the score data into a label:
//   - not aggregated: "PASSED" / "NOT PASSED"
//   - aggregated:     "All: PASSED" / "Some: FAILED", followed by
//                     "(passedCount/totalCount)" when both counts are present
//                     and more than one item was aggregated.
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data, options) => {
    if (!options?.isAggregated) {
      return data.passed ? "PASSED" : "NOT PASSED";
    }
    const verdict = data.passed ? "All: PASSED" : "Some: FAILED";
    const hasCounts = data.passedCount != null && data.totalCount != null;
    return hasCounts && data.totalCount > 1
      ? `${verdict} (${data.passedCount}/${data.totalCount})`
      : verdict;
  },
  aggregate: aggregateAll
});
|
|
432
486
|
|
|
@@ -1365,6 +1419,13 @@ function Spinner({ label = "Running" }) {
|
|
|
1365
1419
|
label
|
|
1366
1420
|
] });
|
|
1367
1421
|
}
|
|
1422
|
+
/**
 * Computes the sample (n - 1) standard deviation from running totals.
 *
 * @param {number} sum - Sum of the samples.
 * @param {number} sumSq - Sum of the squared samples.
 * @param {number} n - Number of samples.
 * @returns {number | undefined} `undefined` when `n` is below 2; otherwise the
 *   standard deviation, reported as 0 when the computed variance is not positive.
 */
function sampleStdDev(sum, sumSq, n) {
  if (n < 2) {
    return undefined;
  }
  const average = sum / n;
  const sampleVariance = (sumSq - n * average * average) / (n - 1);
  if (sampleVariance > 0) {
    return Math.sqrt(sampleVariance);
  }
  // The subtraction above can round to zero or slightly below it.
  return 0;
}
|
|
1368
1429
|
function scoreColor(score) {
|
|
1369
1430
|
if (score >= 80)
|
|
1370
1431
|
return "green";
|
|
@@ -1484,6 +1545,7 @@ function RunView({
|
|
|
1484
1545
|
setEvaluatorNameById(nameById);
|
|
1485
1546
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1486
1547
|
let overallScoreTotal = 0;
|
|
1548
|
+
let overallScoreSumSq = 0;
|
|
1487
1549
|
let overallScoreCount = 0;
|
|
1488
1550
|
const done = new Promise((resolve5) => {
|
|
1489
1551
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
@@ -1495,17 +1557,20 @@ function RunView({
|
|
|
1495
1557
|
if (numeric !== void 0) {
|
|
1496
1558
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
1497
1559
|
total: 0,
|
|
1560
|
+
sumSq: 0,
|
|
1498
1561
|
count: 0,
|
|
1499
1562
|
passed: 0,
|
|
1500
1563
|
failed: 0
|
|
1501
1564
|
};
|
|
1502
1565
|
aggregates.set(item.evaluatorId, {
|
|
1503
1566
|
total: current.total + numeric,
|
|
1567
|
+
sumSq: current.sumSq + numeric * numeric,
|
|
1504
1568
|
count: current.count + 1,
|
|
1505
1569
|
passed: current.passed + (item.passed ? 1 : 0),
|
|
1506
1570
|
failed: current.failed + (item.passed ? 0 : 1)
|
|
1507
1571
|
});
|
|
1508
1572
|
overallScoreTotal += numeric;
|
|
1573
|
+
overallScoreSumSq += numeric * numeric;
|
|
1509
1574
|
overallScoreCount += 1;
|
|
1510
1575
|
}
|
|
1511
1576
|
}
|
|
@@ -1575,6 +1640,7 @@ function RunView({
|
|
|
1575
1640
|
failedTestCases: finalEvent.failedTestCases,
|
|
1576
1641
|
totalTestCases: finalEvent.totalTestCases,
|
|
1577
1642
|
overallScoreTotal,
|
|
1643
|
+
overallScoreSumSq,
|
|
1578
1644
|
overallScoreCount,
|
|
1579
1645
|
aggregates: new Map(aggregates),
|
|
1580
1646
|
artifactPath: finalEvent.artifactPath
|
|
@@ -1732,7 +1798,14 @@ function RunView({
|
|
|
1732
1798
|
label: "overall avg",
|
|
1733
1799
|
value: summary.overallScoreTotal / summary.overallScoreCount,
|
|
1734
1800
|
barWidth: 20,
|
|
1735
|
-
format: (v) =>
|
|
1801
|
+
format: (v) => {
|
|
1802
|
+
const sd = sampleStdDev(
|
|
1803
|
+
summary.overallScoreTotal,
|
|
1804
|
+
summary.overallScoreSumSq,
|
|
1805
|
+
summary.overallScoreCount
|
|
1806
|
+
);
|
|
1807
|
+
return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
|
|
1808
|
+
}
|
|
1736
1809
|
}
|
|
1737
1810
|
) }),
|
|
1738
1811
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
|
|
@@ -1747,11 +1820,13 @@ function RunView({
|
|
|
1747
1820
|
] }, id);
|
|
1748
1821
|
}
|
|
1749
1822
|
const mean = agg.total / agg.count;
|
|
1823
|
+
const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
|
|
1824
|
+
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1750
1825
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1751
1826
|
"- ",
|
|
1752
1827
|
name.padEnd(28),
|
|
1753
1828
|
" avg=",
|
|
1754
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children:
|
|
1829
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: meanStr }),
|
|
1755
1830
|
" ",
|
|
1756
1831
|
"passed=",
|
|
1757
1832
|
agg.passed,
|
|
@@ -1763,14 +1838,17 @@ function RunView({
|
|
|
1763
1838
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
|
|
1764
1839
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
|
|
1765
1840
|
testCases.map((tc) => {
|
|
1766
|
-
const
|
|
1767
|
-
(
|
|
1841
|
+
const allScores = tc.events.flatMap(
|
|
1842
|
+
(ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
|
|
1768
1843
|
);
|
|
1769
|
-
const averageScore =
|
|
1844
|
+
const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
|
|
1845
|
+
const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
|
|
1846
|
+
const total = allScores.reduce((a, b) => a + b, 0);
|
|
1847
|
+
const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
|
|
1770
1848
|
const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
|
|
1771
1849
|
const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
|
|
1772
1850
|
isAggregated: true
|
|
1773
|
-
}) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
|
|
1851
|
+
}) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
|
|
1774
1852
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
|
|
1775
1853
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
|
|
1776
1854
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
@@ -1804,13 +1882,26 @@ function RunView({
|
|
|
1804
1882
|
}
|
|
1805
1883
|
|
|
1806
1884
|
// src/cli-simple/run.ts
|
|
1885
|
+
/**
 * Derives the sample (n - 1) standard deviation from accumulated totals.
 *
 * @param {number} sum - Sum of the samples.
 * @param {number} sumSq - Sum of the squared samples.
 * @param {number} n - Number of samples.
 * @returns {number | undefined} `undefined` when `n` is below 2; otherwise the
 *   standard deviation, with a non-positive computed variance reported as 0.
 */
function sampleStdDev2(sum, sumSq, n) {
  if (n < 2) {
    return undefined;
  }
  const average = sum / n;
  const numerator = sumSq - n * average * average;
  const sampleVariance = numerator / (n - 1);
  // Rounding can push the numerator to zero or below; treat that as no spread.
  return sampleVariance > 0 ? Math.sqrt(sampleVariance) : 0;
}
|
|
1807
1892
|
function buildTestCaseSummaries(byId) {
|
|
1808
1893
|
const summaries = [];
|
|
1809
1894
|
for (const { name, events } of byId.values()) {
|
|
1810
1895
|
const passed = events.every((e) => e.passed);
|
|
1811
1896
|
const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
|
|
1812
1897
|
const isAggregated = events.length > 1;
|
|
1813
|
-
const
|
|
1898
|
+
const allScores = events.flatMap(
|
|
1899
|
+
(ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
|
|
1900
|
+
);
|
|
1901
|
+
const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
|
|
1902
|
+
const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
|
|
1903
|
+
const total = allScores.reduce((a, b) => a + b, 0);
|
|
1904
|
+
const stdDev = sampleStdDev2(total, sumSq, allScores.length);
|
|
1814
1905
|
let firstAggregatedScore;
|
|
1815
1906
|
for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
|
|
1816
1907
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
@@ -1826,21 +1917,18 @@ function buildTestCaseSummaries(byId) {
|
|
|
1826
1917
|
}
|
|
1827
1918
|
for (const items of scoreIdToItems.values()) {
|
|
1828
1919
|
const agg = aggregateScoreItems(items);
|
|
1829
|
-
if (agg) {
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
numericScores.push(n);
|
|
1833
|
-
if (firstAggregatedScore === void 0) {
|
|
1834
|
-
firstAggregatedScore = agg;
|
|
1835
|
-
}
|
|
1836
|
-
}
|
|
1920
|
+
if (agg && firstAggregatedScore === void 0) {
|
|
1921
|
+
firstAggregatedScore = agg;
|
|
1922
|
+
break;
|
|
1837
1923
|
}
|
|
1838
1924
|
}
|
|
1925
|
+
if (firstAggregatedScore !== void 0)
|
|
1926
|
+
break;
|
|
1839
1927
|
}
|
|
1840
|
-
const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
|
|
1841
1928
|
summaries.push({
|
|
1842
1929
|
name,
|
|
1843
1930
|
averageScore,
|
|
1931
|
+
stdDev: stdDev ?? void 0,
|
|
1844
1932
|
aggregatedScoreItem: firstAggregatedScore,
|
|
1845
1933
|
isAggregated,
|
|
1846
1934
|
durationMs,
|
|
@@ -1876,7 +1964,9 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
|
|
|
1876
1964
|
return `- ${evaluatorName.padEnd(28)} no numeric scores`;
|
|
1877
1965
|
}
|
|
1878
1966
|
const mean = aggregate.total / aggregate.count;
|
|
1879
|
-
|
|
1967
|
+
const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
|
|
1968
|
+
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1969
|
+
return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
1880
1970
|
}
|
|
1881
1971
|
function createBar2(value, max = 100, width = 20) {
|
|
1882
1972
|
const safe = Math.max(0, Math.min(max, value));
|
|
@@ -2006,6 +2096,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2006
2096
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2007
2097
|
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
2008
2098
|
let overallScoreTotal = 0;
|
|
2099
|
+
let overallScoreSumSq = 0;
|
|
2009
2100
|
let overallScoreCount = 0;
|
|
2010
2101
|
let completedCount = 0;
|
|
2011
2102
|
let totalCount = 0;
|
|
@@ -2062,17 +2153,20 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2062
2153
|
if (numeric !== void 0) {
|
|
2063
2154
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
2064
2155
|
total: 0,
|
|
2156
|
+
sumSq: 0,
|
|
2065
2157
|
count: 0,
|
|
2066
2158
|
passed: 0,
|
|
2067
2159
|
failed: 0
|
|
2068
2160
|
};
|
|
2069
2161
|
aggregates.set(item.evaluatorId, {
|
|
2070
2162
|
total: current.total + numeric,
|
|
2163
|
+
sumSq: current.sumSq + numeric * numeric,
|
|
2071
2164
|
count: current.count + 1,
|
|
2072
2165
|
passed: current.passed + (item.passed ? 1 : 0),
|
|
2073
2166
|
failed: current.failed + (item.passed ? 0 : 1)
|
|
2074
2167
|
});
|
|
2075
2168
|
overallScoreTotal += numeric;
|
|
2169
|
+
overallScoreSumSq += numeric * numeric;
|
|
2076
2170
|
overallScoreCount += 1;
|
|
2077
2171
|
}
|
|
2078
2172
|
}
|
|
@@ -2180,9 +2274,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2180
2274
|
);
|
|
2181
2275
|
if (overallScoreCount > 0) {
|
|
2182
2276
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2277
|
+
const overallSd = sampleStdDev2(
|
|
2278
|
+
overallScoreTotal,
|
|
2279
|
+
overallScoreSumSq,
|
|
2280
|
+
overallScoreCount
|
|
2281
|
+
);
|
|
2282
|
+
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
2183
2283
|
console.log(
|
|
2184
2284
|
`- overall avg score: ${colorize(
|
|
2185
|
-
|
|
2285
|
+
avgStr,
|
|
2186
2286
|
scoreToColor(overallAverage)
|
|
2187
2287
|
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
2188
2288
|
);
|
|
@@ -2207,7 +2307,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2207
2307
|
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
|
|
2208
2308
|
summary.aggregatedScoreItem.data,
|
|
2209
2309
|
{ isAggregated: true }
|
|
2210
|
-
) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
|
|
2310
|
+
) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
|
|
2211
2311
|
console.log(
|
|
2212
2312
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
2213
2313
|
scoreLabel,
|