@m4trix/evals 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +105 -76
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +105 -76
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +79 -47
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +79 -47
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +78 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +36 -5
- package/dist/index.js +77 -45
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -333,20 +333,70 @@ function getMetricById(id) {
|
|
|
333
333
|
|
|
334
334
|
// src/evals/score.ts
|
|
335
335
|
var registry2 = /* @__PURE__ */ new Map();
|
|
336
|
+
function formatScoreData(def, data, options) {
|
|
337
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
338
|
+
}
|
|
339
|
+
var ScoreAggregate = {
|
|
340
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
341
|
+
averageFields(fields) {
|
|
342
|
+
return (values) => {
|
|
343
|
+
const count = values.length || 1;
|
|
344
|
+
const result = {};
|
|
345
|
+
for (const field of fields) {
|
|
346
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
347
|
+
}
|
|
348
|
+
return result;
|
|
349
|
+
};
|
|
350
|
+
},
|
|
351
|
+
/** Average `value` with sample std dev. Use for percent-style scores. */
|
|
352
|
+
averageWithVariance(values) {
|
|
353
|
+
if (values.length === 0) {
|
|
354
|
+
return { value: 0, stdDev: void 0, count: 0 };
|
|
355
|
+
}
|
|
356
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
357
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
358
|
+
const mean = sum / values.length;
|
|
359
|
+
let stdDev;
|
|
360
|
+
if (values.length >= 2) {
|
|
361
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
362
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
363
|
+
}
|
|
364
|
+
return { ...values[0], value: mean, stdDev, count: values.length };
|
|
365
|
+
},
|
|
366
|
+
/** All runs must pass. Use for binary scores. */
|
|
367
|
+
all(values) {
|
|
368
|
+
const total = values.length;
|
|
369
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
370
|
+
return {
|
|
371
|
+
...values[0],
|
|
372
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
373
|
+
passedCount,
|
|
374
|
+
totalCount: total
|
|
375
|
+
};
|
|
376
|
+
},
|
|
377
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
378
|
+
last(values) {
|
|
379
|
+
return values[values.length - 1] ?? {};
|
|
380
|
+
}
|
|
381
|
+
};
|
|
336
382
|
var Score = {
|
|
383
|
+
aggregate: ScoreAggregate,
|
|
337
384
|
of(config) {
|
|
338
385
|
const def = {
|
|
339
386
|
id: config.id,
|
|
340
387
|
name: config.name,
|
|
341
388
|
displayStrategy: config.displayStrategy,
|
|
342
|
-
|
|
343
|
-
|
|
389
|
+
formatValue: config.formatValue,
|
|
390
|
+
formatAggregate: config.formatAggregate,
|
|
391
|
+
aggregateValues: config.aggregateValues,
|
|
344
392
|
make: (data, options) => {
|
|
345
393
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
346
394
|
return {
|
|
347
395
|
id: config.id,
|
|
348
396
|
data,
|
|
349
|
-
...passed !== void 0 && { passed }
|
|
397
|
+
...passed !== void 0 && { passed },
|
|
398
|
+
def
|
|
399
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
350
400
|
};
|
|
351
401
|
}
|
|
352
402
|
};
|
|
@@ -359,29 +409,6 @@ function getScoreById(id) {
|
|
|
359
409
|
}
|
|
360
410
|
|
|
361
411
|
// src/evals/aggregators.ts
|
|
362
|
-
function aggregateAverageWithVariance(values) {
|
|
363
|
-
if (values.length === 0) {
|
|
364
|
-
return { value: 0, count: 0 };
|
|
365
|
-
}
|
|
366
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
367
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
368
|
-
const mean = sum / values.length;
|
|
369
|
-
let stdDev;
|
|
370
|
-
if (values.length >= 2) {
|
|
371
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
372
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
373
|
-
}
|
|
374
|
-
return { value: mean, stdDev, count: values.length };
|
|
375
|
-
}
|
|
376
|
-
function aggregateAll(values) {
|
|
377
|
-
const total = values.length;
|
|
378
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
379
|
-
return {
|
|
380
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
381
|
-
passedCount,
|
|
382
|
-
totalCount: total
|
|
383
|
-
};
|
|
384
|
-
}
|
|
385
412
|
function aggregateTokenCountSum(values) {
|
|
386
413
|
const initial = {
|
|
387
414
|
input: 0,
|
|
@@ -434,40 +461,45 @@ Score.of({
|
|
|
434
461
|
id: "percent",
|
|
435
462
|
name: "Score",
|
|
436
463
|
displayStrategy: "bar",
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
464
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
465
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
466
|
+
aggregateValues: Score.aggregate.averageWithVariance
|
|
467
|
+
});
|
|
468
|
+
Score.of({
|
|
469
|
+
id: "delta",
|
|
470
|
+
name: "Delta",
|
|
471
|
+
displayStrategy: "number",
|
|
472
|
+
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
473
|
+
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
474
|
+
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
444
475
|
});
|
|
445
476
|
Score.of({
|
|
446
477
|
id: "binary",
|
|
447
478
|
name: "Result",
|
|
448
479
|
displayStrategy: "passFail",
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
}
|
|
455
|
-
return base;
|
|
480
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
481
|
+
formatAggregate: (data) => {
|
|
482
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
483
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
484
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
456
485
|
}
|
|
457
|
-
return
|
|
486
|
+
return base;
|
|
458
487
|
},
|
|
459
|
-
|
|
488
|
+
aggregateValues: Score.aggregate.all
|
|
460
489
|
});
|
|
461
490
|
|
|
462
491
|
// src/runner/score-utils.ts
|
|
492
|
+
function getScoreDef(item) {
|
|
493
|
+
return item.def ?? getScoreById(item.id);
|
|
494
|
+
}
|
|
463
495
|
function aggregateScoreItems(items) {
|
|
464
496
|
if (items.length === 0)
|
|
465
497
|
return void 0;
|
|
466
|
-
const def =
|
|
467
|
-
if (!def?.
|
|
498
|
+
const def = getScoreDef(items[0]);
|
|
499
|
+
if (!def?.aggregateValues)
|
|
468
500
|
return items[items.length - 1];
|
|
469
|
-
const aggregated = def.
|
|
470
|
-
return { ...items[0], data: aggregated };
|
|
501
|
+
const aggregated = def.aggregateValues(items.map((i) => i.data));
|
|
502
|
+
return { ...items[0], data: aggregated, def };
|
|
471
503
|
}
|
|
472
504
|
function aggregateMetricItems(items) {
|
|
473
505
|
if (items.length === 0)
|
|
@@ -480,7 +512,7 @@ function aggregateMetricItems(items) {
|
|
|
480
512
|
}
|
|
481
513
|
function toNumericScoreFromScores(scores) {
|
|
482
514
|
for (const item of scores) {
|
|
483
|
-
const def =
|
|
515
|
+
const def = getScoreDef(item);
|
|
484
516
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
485
517
|
const value = item.data.value;
|
|
486
518
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -868,7 +900,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
|
|
|
868
900
|
() => appendJsonLine(message.artifactPath, {
|
|
869
901
|
runId: message.runId,
|
|
870
902
|
ts: Date.now(),
|
|
871
|
-
...message.payload
|
|
903
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
872
904
|
})
|
|
873
905
|
);
|
|
874
906
|
})
|
|
@@ -1454,7 +1486,7 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1454
1486
|
if (agg)
|
|
1455
1487
|
aggregatedScores.push(agg);
|
|
1456
1488
|
}
|
|
1457
|
-
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([
|
|
1489
|
+
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
|
|
1458
1490
|
const passed = events.every((ev) => {
|
|
1459
1491
|
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1460
1492
|
return es?.passed ?? false;
|
|
@@ -1474,13 +1506,13 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1474
1506
|
}
|
|
1475
1507
|
return result;
|
|
1476
1508
|
}
|
|
1477
|
-
function formatScorePart(item,
|
|
1478
|
-
const def = getScoreById(item.id);
|
|
1509
|
+
function formatScorePart(item, _scoreToColor, options) {
|
|
1510
|
+
const def = item.def ?? getScoreById(item.id);
|
|
1479
1511
|
if (!def) {
|
|
1480
1512
|
const numeric = toNumericScore(item.data);
|
|
1481
1513
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
1482
1514
|
}
|
|
1483
|
-
const formatted = def
|
|
1515
|
+
const formatted = formatScoreData(def, item.data, options);
|
|
1484
1516
|
if (def.displayStrategy === "bar") {
|
|
1485
1517
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1486
1518
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -1538,8 +1570,6 @@ function RunView({
|
|
|
1538
1570
|
const done = new Promise((resolve5) => {
|
|
1539
1571
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1540
1572
|
if (event.type === "TestCaseProgress") {
|
|
1541
|
-
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1542
|
-
numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
|
|
1543
1573
|
for (const item of event.evaluatorScores) {
|
|
1544
1574
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1545
1575
|
if (numeric !== void 0) {
|
|
@@ -1629,16 +1659,17 @@ function RunView({
|
|
|
1629
1659
|
onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
|
|
1630
1660
|
return;
|
|
1631
1661
|
}
|
|
1662
|
+
const completed = finalEvent;
|
|
1632
1663
|
setSummary({
|
|
1633
|
-
passedTestCases:
|
|
1634
|
-
failedTestCases:
|
|
1635
|
-
totalTestCases:
|
|
1664
|
+
passedTestCases: completed.passedTestCases,
|
|
1665
|
+
failedTestCases: completed.failedTestCases,
|
|
1666
|
+
totalTestCases: completed.totalTestCases,
|
|
1636
1667
|
overallScoreTotal,
|
|
1637
1668
|
overallScoreSumSq,
|
|
1638
1669
|
overallScoreCount,
|
|
1639
1670
|
aggregates: new Map(aggregates),
|
|
1640
1671
|
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1641
|
-
artifactPath:
|
|
1672
|
+
artifactPath: completed.artifactPath
|
|
1642
1673
|
});
|
|
1643
1674
|
setPhase("completed");
|
|
1644
1675
|
setTimeout(() => onComplete(), 200);
|
|
@@ -1844,11 +1875,9 @@ function RunView({
|
|
|
1844
1875
|
const aggregated = aggregateScoreItems(items);
|
|
1845
1876
|
if (!aggregated)
|
|
1846
1877
|
return null;
|
|
1847
|
-
const def = getScoreById(aggregated.id);
|
|
1878
|
+
const def = aggregated.def ?? getScoreById(aggregated.id);
|
|
1848
1879
|
const label = def ? def.name ?? def.id : aggregated.id;
|
|
1849
|
-
const formatted = def
|
|
1850
|
-
isAggregated: true
|
|
1851
|
-
}) ?? "n/a";
|
|
1880
|
+
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
1852
1881
|
const numeric = toNumericScore(aggregated.data);
|
|
1853
1882
|
return /* @__PURE__ */ jsxs(
|
|
1854
1883
|
Text,
|
|
@@ -2007,9 +2036,9 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
|
|
|
2007
2036
|
const agg = aggregateScoreItems(items);
|
|
2008
2037
|
if (!agg)
|
|
2009
2038
|
continue;
|
|
2010
|
-
const def = getScoreById(agg.id);
|
|
2039
|
+
const def = agg.def ?? getScoreById(agg.id);
|
|
2011
2040
|
const label = def ? def.name ?? def.id : agg.id;
|
|
2012
|
-
const formatted = def
|
|
2041
|
+
const formatted = def ? def.formatAggregate(agg.data) : "n/a";
|
|
2013
2042
|
const numeric = toNumericScore(agg.data);
|
|
2014
2043
|
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
2015
2044
|
scoreLines.push(` ${label}: ${colored}`);
|
|
@@ -2027,7 +2056,7 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
2027
2056
|
const filled = Math.round(safe / max * width);
|
|
2028
2057
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
2029
2058
|
}
|
|
2030
|
-
function aggregateEvaluatorScoresFromEvents(events,
|
|
2059
|
+
function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
|
|
2031
2060
|
if (events.length === 0)
|
|
2032
2061
|
return [];
|
|
2033
2062
|
const evaluatorIds = new Set(
|
|
@@ -2086,14 +2115,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2086
2115
|
}
|
|
2087
2116
|
const scoreLines = [];
|
|
2088
2117
|
for (const item of scores) {
|
|
2089
|
-
const def = getScoreById(item.id);
|
|
2118
|
+
const def = item.def ?? getScoreById(item.id);
|
|
2090
2119
|
const scoreLabel = def ? def.name ?? def.id : item.id;
|
|
2091
2120
|
let formatted;
|
|
2092
2121
|
if (!def) {
|
|
2093
2122
|
const numeric = toNumericScore(item.data);
|
|
2094
2123
|
formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
|
|
2095
2124
|
} else {
|
|
2096
|
-
const raw = def
|
|
2125
|
+
const raw = formatScoreData(def, item.data, options);
|
|
2097
2126
|
switch (def.displayStrategy) {
|
|
2098
2127
|
case "bar": {
|
|
2099
2128
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
@@ -2245,7 +2274,6 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2245
2274
|
(s, e) => s + e.durationMs,
|
|
2246
2275
|
0
|
|
2247
2276
|
);
|
|
2248
|
-
existing.events.every((e) => e.passed);
|
|
2249
2277
|
const lines = [];
|
|
2250
2278
|
lines.push(
|
|
2251
2279
|
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
@@ -2323,18 +2351,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2323
2351
|
if (finalEvent.type === "RunFailed") {
|
|
2324
2352
|
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
2325
2353
|
}
|
|
2354
|
+
const completed = finalEvent;
|
|
2326
2355
|
console.log("");
|
|
2327
2356
|
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2328
2357
|
console.log(
|
|
2329
2358
|
`- passed: ${colorize(
|
|
2330
|
-
`${
|
|
2359
|
+
`${completed.passedTestCases}/${completed.totalTestCases}`,
|
|
2331
2360
|
ansi2.green
|
|
2332
2361
|
)}`
|
|
2333
2362
|
);
|
|
2334
2363
|
console.log(
|
|
2335
2364
|
`- failed: ${colorize(
|
|
2336
|
-
`${
|
|
2337
|
-
|
|
2365
|
+
`${completed.failedTestCases}/${completed.totalTestCases}`,
|
|
2366
|
+
completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
2338
2367
|
)}`
|
|
2339
2368
|
);
|
|
2340
2369
|
if (overallScoreCount > 0) {
|
|
@@ -2375,10 +2404,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2375
2404
|
);
|
|
2376
2405
|
continue;
|
|
2377
2406
|
}
|
|
2378
|
-
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ?
|
|
2379
|
-
summary.aggregatedScoreItem.
|
|
2380
|
-
|
|
2381
|
-
)
|
|
2407
|
+
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? (() => {
|
|
2408
|
+
const def = summary.aggregatedScoreItem.def ?? getScoreById(summary.aggregatedScoreItem.id);
|
|
2409
|
+
return def ? def.formatAggregate(summary.aggregatedScoreItem.data) : summary.averageScore.toFixed(2);
|
|
2410
|
+
})() : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
|
|
2382
2411
|
console.log(
|
|
2383
2412
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
2384
2413
|
scoreLabel,
|
|
@@ -2387,7 +2416,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2387
2416
|
);
|
|
2388
2417
|
}
|
|
2389
2418
|
}
|
|
2390
|
-
console.log(`- artifact: ${colorize(
|
|
2419
|
+
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2391
2420
|
}
|
|
2392
2421
|
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2393
2422
|
return new Promise((resolve5, reject) => {
|