@m4trix/evals 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +129 -29
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +129 -29
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +591 -380
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +582 -371
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +79 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +5 -0
- package/dist/index.js +79 -11
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli-simple.js
CHANGED
|
@@ -6,7 +6,7 @@ import { resolve, relative, join, parse, dirname } from 'path';
|
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
7
7
|
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
|
-
import {
|
|
9
|
+
import { diffLines } from 'diff';
|
|
10
10
|
import React2, { useState, useEffect, useCallback } from 'react';
|
|
11
11
|
import { render, Box, Text } from 'ink';
|
|
12
12
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
@@ -260,8 +260,35 @@ async function collectTestCasesFromFiles(config) {
|
|
|
260
260
|
);
|
|
261
261
|
return found.flat();
|
|
262
262
|
}
|
|
263
|
+
/**
 * Serializes a value into pretty-printed JSON text suitable for line-based diffing.
 *
 * Always returns a string. `JSON.stringify` yields `undefined` (without throwing)
 * for `undefined`, functions and symbols, and throws for circular structures and
 * BigInt values; in every one of those cases the value is rendered with
 * `String(value)` instead.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} JSON indented by two spaces, or the `String(value)` fallback.
 */
function toJsonLines(value) {
  try {
    // `??` covers the non-throwing case where stringify has nothing to emit.
    return JSON.stringify(value, null, 2) ?? String(value);
  } catch {
    return String(value);
  }
}
|
|
270
|
+
/**
 * Renders a list of diff parts as plain text, one output line per input line.
 *
 * Each line is prefixed with "+" (part flagged `added`), "-" (part flagged
 * `removed`) or " " (neither), followed by a single space and the line text.
 * A trailing empty segment produced by a final "\n" in a part is dropped.
 *
 * @param {Array<{ value: string, added?: boolean, removed?: boolean }>} changes
 * @returns {string} The prefixed lines joined with "\n".
 */
function formatDiffString(changes) {
  const rendered = changes.flatMap((change) => {
    let marker = " ";
    if (change.added) {
      marker = "+";
    } else if (change.removed) {
      marker = "-";
    }
    const segments = change.value.split("\n");
    // A part ending in "\n" splits into a final "", which is not a real line.
    const endsWithBlank = segments[segments.length - 1] === "";
    const body = endsWithBlank ? segments.slice(0, -1) : segments;
    return body.map((text) => `${marker} ${text}`);
  });
  return rendered.join("\n");
}
|
|
284
|
+
/**
 * Builds a textual line diff between two values.
 *
 * Both sides are serialized with `toJsonLines`, compared with `diffLines`
 * (expected first, actual second), and the resulting parts are rendered by
 * `formatDiffString`.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value to compare against the reference.
 * @returns {string} The rendered diff text.
 */
function createDiffString(expected, actual) {
  const [before, after] = [expected, actual].map((side) => toJsonLines(side));
  return formatDiffString(diffLines(before, after));
}
|
|
263
290
|
function createDiffLogEntry(expected, actual, options) {
|
|
264
|
-
const diff =
|
|
291
|
+
const diff = createDiffString(expected, actual);
|
|
265
292
|
return {
|
|
266
293
|
type: "diff",
|
|
267
294
|
label: options?.label,
|
|
@@ -271,7 +298,7 @@ function createDiffLogEntry(expected, actual, options) {
|
|
|
271
298
|
};
|
|
272
299
|
}
|
|
273
300
|
function getDiffLines(entry) {
|
|
274
|
-
const raw =
|
|
301
|
+
const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
|
|
275
302
|
return raw.split("\n").map((line) => {
|
|
276
303
|
const trimmed = line.trimStart();
|
|
277
304
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -331,15 +358,28 @@ function getScoreById(id) {
|
|
|
331
358
|
}
|
|
332
359
|
|
|
333
360
|
// src/evals/aggregators.ts
|
|
334
|
-
function
|
|
361
|
+
/**
 * Aggregates score items into their mean, sample standard deviation and count.
 *
 * @param {Array<{ value: number }>} values - Items whose `value` is averaged.
 * @returns {{ value: number, stdDev?: number, count: number }}
 *   For an empty input: `{ value: 0, count: 0 }`. Otherwise the mean as `value`,
 *   the item count, and `stdDev` (n - 1 denominator), which stays `undefined`
 *   when fewer than two items are given.
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  const sum = values.reduce((acc, item) => acc + item.value, 0);
  const mean = sum / count;
  let stdDev;
  if (count >= 2) {
    // Two-pass form: summing squared deviations from the mean avoids the
    // catastrophic cancellation of `sumSq - n * mean * mean` when the spread
    // is small relative to the magnitude of the values.
    const squaredDeviations = values.reduce((acc, item) => {
      const delta = item.value - mean;
      return acc + delta * delta;
    }, 0);
    const variance = squaredDeviations / (count - 1);
    // Anything that is not a positive number (including NaN) reports as 0.
    stdDev = variance > 0 ? Math.sqrt(variance) : 0;
  }
  return { value: mean, stdDev, count };
}
|
|
341
375
|
/**
 * Aggregates pass/fail items into an overall verdict plus counts.
 *
 * @param {Array<{ passed: boolean }>} values - Items carrying a `passed` flag.
 * @returns {{ passed: boolean, passedCount: number, totalCount: number }}
 *   `passed` is true only when there is at least one item and every item
 *   passed; `passedCount` is the number of items with a truthy `passed`;
 *   `totalCount` is the input length.
 */
function aggregateAll(values) {
  const totalCount = values.length;
  const passedCount = values.reduce(
    (tally, entry) => (entry.passed ? tally + 1 : tally),
    0
  );
  // An empty input never counts as passed.
  const passed = totalCount === 0 ? false : values.every((entry) => entry.passed);
  return { passed, passedCount, totalCount };
}
|
|
344
384
|
function aggregateTokenCountSum(values) {
|
|
345
385
|
const initial = {
|
|
@@ -393,14 +433,28 @@ Score.of({
|
|
|
393
433
|
id: "percent",
|
|
394
434
|
name: "Score",
|
|
395
435
|
displayStrategy: "bar",
|
|
396
|
-
format: (data, options) =>
|
|
397
|
-
|
|
436
|
+
format: (data, options) => {
|
|
437
|
+
if (options?.isAggregated) {
|
|
438
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
439
|
+
}
|
|
440
|
+
return data.value.toFixed(2);
|
|
441
|
+
},
|
|
442
|
+
aggregate: aggregateAverageWithVariance
|
|
398
443
|
});
|
|
399
444
|
// Registers the "binary" pass/fail score.
// Single results render as "PASSED" / "NOT PASSED". Aggregated results render
// as "All: PASSED" / "Some: FAILED", with a "(passed/total)" suffix appended
// when both counts are present and more than one item was aggregated.
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data, options) => {
    if (!options?.isAggregated) {
      return data.passed ? "PASSED" : "NOT PASSED";
    }
    const verdict = data.passed ? "All: PASSED" : "Some: FAILED";
    const { passedCount, totalCount } = data;
    const hasCounts = passedCount != null && totalCount != null && totalCount > 1;
    return hasCounts ? `${verdict} (${passedCount}/${totalCount})` : verdict;
  },
  aggregate: aggregateAll
});
|
|
406
460
|
|
|
@@ -1339,6 +1393,13 @@ function Spinner({ label = "Running" }) {
|
|
|
1339
1393
|
label
|
|
1340
1394
|
] });
|
|
1341
1395
|
}
|
|
1396
|
+
/**
 * Computes the sample standard deviation (n - 1 denominator) from running totals.
 *
 * @param {number} sum - Sum of the samples.
 * @param {number} sumSq - Sum of the squared samples.
 * @param {number} n - Number of samples.
 * @returns {number | undefined} `undefined` when `n < 2`; otherwise the square
 *   root of the variance, or 0 when the computed variance is not positive.
 */
function sampleStdDev(sum, sumSq, n) {
  if (n < 2) {
    return undefined;
  }
  const average = sum / n;
  const sampleVariance = (sumSq - n * average * average) / (n - 1);
  // Rounding can push a zero variance slightly negative; report that as 0.
  if (sampleVariance > 0) {
    return Math.sqrt(sampleVariance);
  }
  return 0;
}
|
|
1342
1403
|
function scoreColor(score) {
|
|
1343
1404
|
if (score >= 80)
|
|
1344
1405
|
return "green";
|
|
@@ -1458,6 +1519,7 @@ function RunView({
|
|
|
1458
1519
|
setEvaluatorNameById(nameById);
|
|
1459
1520
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1460
1521
|
let overallScoreTotal = 0;
|
|
1522
|
+
let overallScoreSumSq = 0;
|
|
1461
1523
|
let overallScoreCount = 0;
|
|
1462
1524
|
const done = new Promise((resolve5) => {
|
|
1463
1525
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
@@ -1469,17 +1531,20 @@ function RunView({
|
|
|
1469
1531
|
if (numeric !== void 0) {
|
|
1470
1532
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
1471
1533
|
total: 0,
|
|
1534
|
+
sumSq: 0,
|
|
1472
1535
|
count: 0,
|
|
1473
1536
|
passed: 0,
|
|
1474
1537
|
failed: 0
|
|
1475
1538
|
};
|
|
1476
1539
|
aggregates.set(item.evaluatorId, {
|
|
1477
1540
|
total: current.total + numeric,
|
|
1541
|
+
sumSq: current.sumSq + numeric * numeric,
|
|
1478
1542
|
count: current.count + 1,
|
|
1479
1543
|
passed: current.passed + (item.passed ? 1 : 0),
|
|
1480
1544
|
failed: current.failed + (item.passed ? 0 : 1)
|
|
1481
1545
|
});
|
|
1482
1546
|
overallScoreTotal += numeric;
|
|
1547
|
+
overallScoreSumSq += numeric * numeric;
|
|
1483
1548
|
overallScoreCount += 1;
|
|
1484
1549
|
}
|
|
1485
1550
|
}
|
|
@@ -1549,6 +1614,7 @@ function RunView({
|
|
|
1549
1614
|
failedTestCases: finalEvent.failedTestCases,
|
|
1550
1615
|
totalTestCases: finalEvent.totalTestCases,
|
|
1551
1616
|
overallScoreTotal,
|
|
1617
|
+
overallScoreSumSq,
|
|
1552
1618
|
overallScoreCount,
|
|
1553
1619
|
aggregates: new Map(aggregates),
|
|
1554
1620
|
artifactPath: finalEvent.artifactPath
|
|
@@ -1706,7 +1772,14 @@ function RunView({
|
|
|
1706
1772
|
label: "overall avg",
|
|
1707
1773
|
value: summary.overallScoreTotal / summary.overallScoreCount,
|
|
1708
1774
|
barWidth: 20,
|
|
1709
|
-
format: (v) =>
|
|
1775
|
+
format: (v) => {
|
|
1776
|
+
const sd = sampleStdDev(
|
|
1777
|
+
summary.overallScoreTotal,
|
|
1778
|
+
summary.overallScoreSumSq,
|
|
1779
|
+
summary.overallScoreCount
|
|
1780
|
+
);
|
|
1781
|
+
return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
|
|
1782
|
+
}
|
|
1710
1783
|
}
|
|
1711
1784
|
) }),
|
|
1712
1785
|
/* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
|
|
@@ -1721,11 +1794,13 @@ function RunView({
|
|
|
1721
1794
|
] }, id);
|
|
1722
1795
|
}
|
|
1723
1796
|
const mean = agg.total / agg.count;
|
|
1797
|
+
const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
|
|
1798
|
+
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1724
1799
|
return /* @__PURE__ */ jsxs(Text, { children: [
|
|
1725
1800
|
"- ",
|
|
1726
1801
|
name.padEnd(28),
|
|
1727
1802
|
" avg=",
|
|
1728
|
-
/* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children:
|
|
1803
|
+
/* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: meanStr }),
|
|
1729
1804
|
" ",
|
|
1730
1805
|
"passed=",
|
|
1731
1806
|
agg.passed,
|
|
@@ -1737,14 +1812,17 @@ function RunView({
|
|
|
1737
1812
|
/* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
|
|
1738
1813
|
/* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
|
|
1739
1814
|
testCases.map((tc) => {
|
|
1740
|
-
const
|
|
1741
|
-
(
|
|
1815
|
+
const allScores = tc.events.flatMap(
|
|
1816
|
+
(ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
|
|
1742
1817
|
);
|
|
1743
|
-
const averageScore =
|
|
1818
|
+
const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
|
|
1819
|
+
const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
|
|
1820
|
+
const total = allScores.reduce((a, b) => a + b, 0);
|
|
1821
|
+
const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
|
|
1744
1822
|
const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
|
|
1745
1823
|
const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
|
|
1746
1824
|
isAggregated: true
|
|
1747
|
-
}) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
|
|
1825
|
+
}) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
|
|
1748
1826
|
return /* @__PURE__ */ jsxs(Box, { children: [
|
|
1749
1827
|
/* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
|
|
1750
1828
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
@@ -1778,13 +1856,26 @@ function RunView({
|
|
|
1778
1856
|
}
|
|
1779
1857
|
|
|
1780
1858
|
// src/cli-simple/run.ts
|
|
1859
|
+
/**
 * Sample standard deviation derived from a sum, a sum of squares and a count.
 *
 * @param {number} sum - Total of all observations.
 * @param {number} sumSq - Total of all squared observations.
 * @param {number} n - How many observations were accumulated.
 * @returns {number | undefined} `undefined` for fewer than two observations;
 *   otherwise sqrt of the (n - 1)-normalized variance, with any non-positive
 *   variance reported as 0.
 */
function sampleStdDev2(sum, sumSq, n) {
  if (n < 2) {
    return undefined;
  }
  const centre = sum / n;
  const spread = (sumSq - n * centre * centre) / (n - 1);
  const isPositive = spread > 0;
  return isPositive ? Math.sqrt(spread) : 0;
}
|
|
1781
1866
|
function buildTestCaseSummaries(byId) {
|
|
1782
1867
|
const summaries = [];
|
|
1783
1868
|
for (const { name, events } of byId.values()) {
|
|
1784
1869
|
const passed = events.every((e) => e.passed);
|
|
1785
1870
|
const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
|
|
1786
1871
|
const isAggregated = events.length > 1;
|
|
1787
|
-
const
|
|
1872
|
+
const allScores = events.flatMap(
|
|
1873
|
+
(ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
|
|
1874
|
+
);
|
|
1875
|
+
const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
|
|
1876
|
+
const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
|
|
1877
|
+
const total = allScores.reduce((a, b) => a + b, 0);
|
|
1878
|
+
const stdDev = sampleStdDev2(total, sumSq, allScores.length);
|
|
1788
1879
|
let firstAggregatedScore;
|
|
1789
1880
|
for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
|
|
1790
1881
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
@@ -1800,21 +1891,18 @@ function buildTestCaseSummaries(byId) {
|
|
|
1800
1891
|
}
|
|
1801
1892
|
for (const items of scoreIdToItems.values()) {
|
|
1802
1893
|
const agg = aggregateScoreItems(items);
|
|
1803
|
-
if (agg) {
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
numericScores.push(n);
|
|
1807
|
-
if (firstAggregatedScore === void 0) {
|
|
1808
|
-
firstAggregatedScore = agg;
|
|
1809
|
-
}
|
|
1810
|
-
}
|
|
1894
|
+
if (agg && firstAggregatedScore === void 0) {
|
|
1895
|
+
firstAggregatedScore = agg;
|
|
1896
|
+
break;
|
|
1811
1897
|
}
|
|
1812
1898
|
}
|
|
1899
|
+
if (firstAggregatedScore !== void 0)
|
|
1900
|
+
break;
|
|
1813
1901
|
}
|
|
1814
|
-
const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
|
|
1815
1902
|
summaries.push({
|
|
1816
1903
|
name,
|
|
1817
1904
|
averageScore,
|
|
1905
|
+
stdDev: stdDev ?? void 0,
|
|
1818
1906
|
aggregatedScoreItem: firstAggregatedScore,
|
|
1819
1907
|
isAggregated,
|
|
1820
1908
|
durationMs,
|
|
@@ -1850,7 +1938,9 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
|
|
|
1850
1938
|
return `- ${evaluatorName.padEnd(28)} no numeric scores`;
|
|
1851
1939
|
}
|
|
1852
1940
|
const mean = aggregate.total / aggregate.count;
|
|
1853
|
-
|
|
1941
|
+
const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
|
|
1942
|
+
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1943
|
+
return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
1854
1944
|
}
|
|
1855
1945
|
function createBar2(value, max = 100, width = 20) {
|
|
1856
1946
|
const safe = Math.max(0, Math.min(max, value));
|
|
@@ -1980,6 +2070,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1980
2070
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1981
2071
|
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
1982
2072
|
let overallScoreTotal = 0;
|
|
2073
|
+
let overallScoreSumSq = 0;
|
|
1983
2074
|
let overallScoreCount = 0;
|
|
1984
2075
|
let completedCount = 0;
|
|
1985
2076
|
let totalCount = 0;
|
|
@@ -2036,17 +2127,20 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2036
2127
|
if (numeric !== void 0) {
|
|
2037
2128
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
2038
2129
|
total: 0,
|
|
2130
|
+
sumSq: 0,
|
|
2039
2131
|
count: 0,
|
|
2040
2132
|
passed: 0,
|
|
2041
2133
|
failed: 0
|
|
2042
2134
|
};
|
|
2043
2135
|
aggregates.set(item.evaluatorId, {
|
|
2044
2136
|
total: current.total + numeric,
|
|
2137
|
+
sumSq: current.sumSq + numeric * numeric,
|
|
2045
2138
|
count: current.count + 1,
|
|
2046
2139
|
passed: current.passed + (item.passed ? 1 : 0),
|
|
2047
2140
|
failed: current.failed + (item.passed ? 0 : 1)
|
|
2048
2141
|
});
|
|
2049
2142
|
overallScoreTotal += numeric;
|
|
2143
|
+
overallScoreSumSq += numeric * numeric;
|
|
2050
2144
|
overallScoreCount += 1;
|
|
2051
2145
|
}
|
|
2052
2146
|
}
|
|
@@ -2154,9 +2248,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2154
2248
|
);
|
|
2155
2249
|
if (overallScoreCount > 0) {
|
|
2156
2250
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2251
|
+
const overallSd = sampleStdDev2(
|
|
2252
|
+
overallScoreTotal,
|
|
2253
|
+
overallScoreSumSq,
|
|
2254
|
+
overallScoreCount
|
|
2255
|
+
);
|
|
2256
|
+
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
2157
2257
|
console.log(
|
|
2158
2258
|
`- overall avg score: ${colorize(
|
|
2159
|
-
|
|
2259
|
+
avgStr,
|
|
2160
2260
|
scoreToColor(overallAverage)
|
|
2161
2261
|
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
2162
2262
|
);
|
|
@@ -2181,7 +2281,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2181
2281
|
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
|
|
2182
2282
|
summary.aggregatedScoreItem.data,
|
|
2183
2283
|
{ isAggregated: true }
|
|
2184
|
-
) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
|
|
2284
|
+
) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
|
|
2185
2285
|
console.log(
|
|
2186
2286
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
2187
2287
|
scoreLabel,
|