@m4trix/evals 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +105 -76
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +105 -76
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +79 -47
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +79 -47
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +78 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +36 -5
- package/dist/index.js +77 -45
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -359,20 +359,70 @@ function getMetricById(id) {
|
|
|
359
359
|
|
|
360
360
|
// src/evals/score.ts
|
|
361
361
|
var registry2 = /* @__PURE__ */ new Map();
|
|
362
|
+
function formatScoreData(def, data, options) {
|
|
363
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
364
|
+
}
|
|
365
|
+
var ScoreAggregate = {
|
|
366
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
367
|
+
averageFields(fields) {
|
|
368
|
+
return (values) => {
|
|
369
|
+
const count = values.length || 1;
|
|
370
|
+
const result = {};
|
|
371
|
+
for (const field of fields) {
|
|
372
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
373
|
+
}
|
|
374
|
+
return result;
|
|
375
|
+
};
|
|
376
|
+
},
|
|
377
|
+
/** Average `value` with sample std dev. Use for percent-style scores. */
|
|
378
|
+
averageWithVariance(values) {
|
|
379
|
+
if (values.length === 0) {
|
|
380
|
+
return { value: 0, stdDev: void 0, count: 0 };
|
|
381
|
+
}
|
|
382
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
383
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
384
|
+
const mean = sum / values.length;
|
|
385
|
+
let stdDev;
|
|
386
|
+
if (values.length >= 2) {
|
|
387
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
388
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
389
|
+
}
|
|
390
|
+
return { ...values[0], value: mean, stdDev, count: values.length };
|
|
391
|
+
},
|
|
392
|
+
/** All runs must pass. Use for binary scores. */
|
|
393
|
+
all(values) {
|
|
394
|
+
const total = values.length;
|
|
395
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
396
|
+
return {
|
|
397
|
+
...values[0],
|
|
398
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
399
|
+
passedCount,
|
|
400
|
+
totalCount: total
|
|
401
|
+
};
|
|
402
|
+
},
|
|
403
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
404
|
+
last(values) {
|
|
405
|
+
return values[values.length - 1] ?? {};
|
|
406
|
+
}
|
|
407
|
+
};
|
|
362
408
|
var Score = {
|
|
409
|
+
aggregate: ScoreAggregate,
|
|
363
410
|
of(config) {
|
|
364
411
|
const def = {
|
|
365
412
|
id: config.id,
|
|
366
413
|
name: config.name,
|
|
367
414
|
displayStrategy: config.displayStrategy,
|
|
368
|
-
|
|
369
|
-
|
|
415
|
+
formatValue: config.formatValue,
|
|
416
|
+
formatAggregate: config.formatAggregate,
|
|
417
|
+
aggregateValues: config.aggregateValues,
|
|
370
418
|
make: (data, options) => {
|
|
371
419
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
372
420
|
return {
|
|
373
421
|
id: config.id,
|
|
374
422
|
data,
|
|
375
|
-
...passed !== void 0 && { passed }
|
|
423
|
+
...passed !== void 0 && { passed },
|
|
424
|
+
def
|
|
425
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
376
426
|
};
|
|
377
427
|
}
|
|
378
428
|
};
|
|
@@ -385,29 +435,6 @@ function getScoreById(id) {
|
|
|
385
435
|
}
|
|
386
436
|
|
|
387
437
|
// src/evals/aggregators.ts
|
|
388
|
-
function aggregateAverageWithVariance(values) {
|
|
389
|
-
if (values.length === 0) {
|
|
390
|
-
return { value: 0, count: 0 };
|
|
391
|
-
}
|
|
392
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
393
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
394
|
-
const mean = sum / values.length;
|
|
395
|
-
let stdDev;
|
|
396
|
-
if (values.length >= 2) {
|
|
397
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
398
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
399
|
-
}
|
|
400
|
-
return { value: mean, stdDev, count: values.length };
|
|
401
|
-
}
|
|
402
|
-
function aggregateAll(values) {
|
|
403
|
-
const total = values.length;
|
|
404
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
405
|
-
return {
|
|
406
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
407
|
-
passedCount,
|
|
408
|
-
totalCount: total
|
|
409
|
-
};
|
|
410
|
-
}
|
|
411
438
|
function aggregateTokenCountSum(values) {
|
|
412
439
|
const initial = {
|
|
413
440
|
input: 0,
|
|
@@ -460,40 +487,45 @@ Score.of({
|
|
|
460
487
|
id: "percent",
|
|
461
488
|
name: "Score",
|
|
462
489
|
displayStrategy: "bar",
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
490
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
491
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
492
|
+
aggregateValues: Score.aggregate.averageWithVariance
|
|
493
|
+
});
|
|
494
|
+
Score.of({
|
|
495
|
+
id: "delta",
|
|
496
|
+
name: "Delta",
|
|
497
|
+
displayStrategy: "number",
|
|
498
|
+
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
499
|
+
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
500
|
+
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
470
501
|
});
|
|
471
502
|
Score.of({
|
|
472
503
|
id: "binary",
|
|
473
504
|
name: "Result",
|
|
474
505
|
displayStrategy: "passFail",
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
}
|
|
481
|
-
return base;
|
|
506
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
507
|
+
formatAggregate: (data) => {
|
|
508
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
509
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
510
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
482
511
|
}
|
|
483
|
-
return
|
|
512
|
+
return base;
|
|
484
513
|
},
|
|
485
|
-
|
|
514
|
+
aggregateValues: Score.aggregate.all
|
|
486
515
|
});
|
|
487
516
|
|
|
488
517
|
// src/runner/score-utils.ts
|
|
518
|
+
function getScoreDef(item) {
|
|
519
|
+
return item.def ?? getScoreById(item.id);
|
|
520
|
+
}
|
|
489
521
|
function aggregateScoreItems(items) {
|
|
490
522
|
if (items.length === 0)
|
|
491
523
|
return void 0;
|
|
492
|
-
const def =
|
|
493
|
-
if (!def?.
|
|
524
|
+
const def = getScoreDef(items[0]);
|
|
525
|
+
if (!def?.aggregateValues)
|
|
494
526
|
return items[items.length - 1];
|
|
495
|
-
const aggregated = def.
|
|
496
|
-
return { ...items[0], data: aggregated };
|
|
527
|
+
const aggregated = def.aggregateValues(items.map((i) => i.data));
|
|
528
|
+
return { ...items[0], data: aggregated, def };
|
|
497
529
|
}
|
|
498
530
|
function aggregateMetricItems(items) {
|
|
499
531
|
if (items.length === 0)
|
|
@@ -506,7 +538,7 @@ function aggregateMetricItems(items) {
|
|
|
506
538
|
}
|
|
507
539
|
function toNumericScoreFromScores(scores) {
|
|
508
540
|
for (const item of scores) {
|
|
509
|
-
const def =
|
|
541
|
+
const def = getScoreDef(item);
|
|
510
542
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
511
543
|
const value = item.data.value;
|
|
512
544
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -894,7 +926,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
|
|
|
894
926
|
() => appendJsonLine(message.artifactPath, {
|
|
895
927
|
runId: message.runId,
|
|
896
928
|
ts: Date.now(),
|
|
897
|
-
...message.payload
|
|
929
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
898
930
|
})
|
|
899
931
|
);
|
|
900
932
|
})
|
|
@@ -1480,7 +1512,7 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1480
1512
|
if (agg)
|
|
1481
1513
|
aggregatedScores.push(agg);
|
|
1482
1514
|
}
|
|
1483
|
-
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([
|
|
1515
|
+
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
|
|
1484
1516
|
const passed = events.every((ev) => {
|
|
1485
1517
|
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1486
1518
|
return es?.passed ?? false;
|
|
@@ -1500,13 +1532,13 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1500
1532
|
}
|
|
1501
1533
|
return result;
|
|
1502
1534
|
}
|
|
1503
|
-
function formatScorePart(item,
|
|
1504
|
-
const def = getScoreById(item.id);
|
|
1535
|
+
function formatScorePart(item, _scoreToColor, options) {
|
|
1536
|
+
const def = item.def ?? getScoreById(item.id);
|
|
1505
1537
|
if (!def) {
|
|
1506
1538
|
const numeric = toNumericScore(item.data);
|
|
1507
1539
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
1508
1540
|
}
|
|
1509
|
-
const formatted = def
|
|
1541
|
+
const formatted = formatScoreData(def, item.data, options);
|
|
1510
1542
|
if (def.displayStrategy === "bar") {
|
|
1511
1543
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1512
1544
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -1564,8 +1596,6 @@ function RunView({
|
|
|
1564
1596
|
const done = new Promise((resolve5) => {
|
|
1565
1597
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1566
1598
|
if (event.type === "TestCaseProgress") {
|
|
1567
|
-
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1568
|
-
numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
|
|
1569
1599
|
for (const item of event.evaluatorScores) {
|
|
1570
1600
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1571
1601
|
if (numeric !== void 0) {
|
|
@@ -1655,16 +1685,17 @@ function RunView({
|
|
|
1655
1685
|
onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
|
|
1656
1686
|
return;
|
|
1657
1687
|
}
|
|
1688
|
+
const completed = finalEvent;
|
|
1658
1689
|
setSummary({
|
|
1659
|
-
passedTestCases:
|
|
1660
|
-
failedTestCases:
|
|
1661
|
-
totalTestCases:
|
|
1690
|
+
passedTestCases: completed.passedTestCases,
|
|
1691
|
+
failedTestCases: completed.failedTestCases,
|
|
1692
|
+
totalTestCases: completed.totalTestCases,
|
|
1662
1693
|
overallScoreTotal,
|
|
1663
1694
|
overallScoreSumSq,
|
|
1664
1695
|
overallScoreCount,
|
|
1665
1696
|
aggregates: new Map(aggregates),
|
|
1666
1697
|
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1667
|
-
artifactPath:
|
|
1698
|
+
artifactPath: completed.artifactPath
|
|
1668
1699
|
});
|
|
1669
1700
|
setPhase("completed");
|
|
1670
1701
|
setTimeout(() => onComplete(), 200);
|
|
@@ -1870,11 +1901,9 @@ function RunView({
|
|
|
1870
1901
|
const aggregated = aggregateScoreItems(items);
|
|
1871
1902
|
if (!aggregated)
|
|
1872
1903
|
return null;
|
|
1873
|
-
const def = getScoreById(aggregated.id);
|
|
1904
|
+
const def = aggregated.def ?? getScoreById(aggregated.id);
|
|
1874
1905
|
const label = def ? def.name ?? def.id : aggregated.id;
|
|
1875
|
-
const formatted = def
|
|
1876
|
-
isAggregated: true
|
|
1877
|
-
}) ?? "n/a";
|
|
1906
|
+
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
1878
1907
|
const numeric = toNumericScore(aggregated.data);
|
|
1879
1908
|
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1880
1909
|
ink.Text,
|
|
@@ -2033,9 +2062,9 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
|
|
|
2033
2062
|
const agg = aggregateScoreItems(items);
|
|
2034
2063
|
if (!agg)
|
|
2035
2064
|
continue;
|
|
2036
|
-
const def = getScoreById(agg.id);
|
|
2065
|
+
const def = agg.def ?? getScoreById(agg.id);
|
|
2037
2066
|
const label = def ? def.name ?? def.id : agg.id;
|
|
2038
|
-
const formatted = def
|
|
2067
|
+
const formatted = def ? def.formatAggregate(agg.data) : "n/a";
|
|
2039
2068
|
const numeric = toNumericScore(agg.data);
|
|
2040
2069
|
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
2041
2070
|
scoreLines.push(` ${label}: ${colored}`);
|
|
@@ -2053,7 +2082,7 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
2053
2082
|
const filled = Math.round(safe / max * width);
|
|
2054
2083
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
2055
2084
|
}
|
|
2056
|
-
function aggregateEvaluatorScoresFromEvents(events,
|
|
2085
|
+
function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
|
|
2057
2086
|
if (events.length === 0)
|
|
2058
2087
|
return [];
|
|
2059
2088
|
const evaluatorIds = new Set(
|
|
@@ -2112,14 +2141,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2112
2141
|
}
|
|
2113
2142
|
const scoreLines = [];
|
|
2114
2143
|
for (const item of scores) {
|
|
2115
|
-
const def = getScoreById(item.id);
|
|
2144
|
+
const def = item.def ?? getScoreById(item.id);
|
|
2116
2145
|
const scoreLabel = def ? def.name ?? def.id : item.id;
|
|
2117
2146
|
let formatted;
|
|
2118
2147
|
if (!def) {
|
|
2119
2148
|
const numeric = toNumericScore(item.data);
|
|
2120
2149
|
formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
|
|
2121
2150
|
} else {
|
|
2122
|
-
const raw = def
|
|
2151
|
+
const raw = formatScoreData(def, item.data, options);
|
|
2123
2152
|
switch (def.displayStrategy) {
|
|
2124
2153
|
case "bar": {
|
|
2125
2154
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
@@ -2271,7 +2300,6 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2271
2300
|
(s, e) => s + e.durationMs,
|
|
2272
2301
|
0
|
|
2273
2302
|
);
|
|
2274
|
-
existing.events.every((e) => e.passed);
|
|
2275
2303
|
const lines = [];
|
|
2276
2304
|
lines.push(
|
|
2277
2305
|
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
@@ -2349,18 +2377,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2349
2377
|
if (finalEvent.type === "RunFailed") {
|
|
2350
2378
|
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
2351
2379
|
}
|
|
2380
|
+
const completed = finalEvent;
|
|
2352
2381
|
console.log("");
|
|
2353
2382
|
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2354
2383
|
console.log(
|
|
2355
2384
|
`- passed: ${colorize(
|
|
2356
|
-
`${
|
|
2385
|
+
`${completed.passedTestCases}/${completed.totalTestCases}`,
|
|
2357
2386
|
ansi2.green
|
|
2358
2387
|
)}`
|
|
2359
2388
|
);
|
|
2360
2389
|
console.log(
|
|
2361
2390
|
`- failed: ${colorize(
|
|
2362
|
-
`${
|
|
2363
|
-
|
|
2391
|
+
`${completed.failedTestCases}/${completed.totalTestCases}`,
|
|
2392
|
+
completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
2364
2393
|
)}`
|
|
2365
2394
|
);
|
|
2366
2395
|
if (overallScoreCount > 0) {
|
|
@@ -2401,10 +2430,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2401
2430
|
);
|
|
2402
2431
|
continue;
|
|
2403
2432
|
}
|
|
2404
|
-
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ?
|
|
2405
|
-
summary.aggregatedScoreItem.
|
|
2406
|
-
|
|
2407
|
-
)
|
|
2433
|
+
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? (() => {
|
|
2434
|
+
const def = summary.aggregatedScoreItem.def ?? getScoreById(summary.aggregatedScoreItem.id);
|
|
2435
|
+
return def ? def.formatAggregate(summary.aggregatedScoreItem.data) : summary.averageScore.toFixed(2);
|
|
2436
|
+
})() : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
|
|
2408
2437
|
console.log(
|
|
2409
2438
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
2410
2439
|
scoreLabel,
|
|
@@ -2413,7 +2442,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2413
2442
|
);
|
|
2414
2443
|
}
|
|
2415
2444
|
}
|
|
2416
|
-
console.log(`- artifact: ${colorize(
|
|
2445
|
+
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2417
2446
|
}
|
|
2418
2447
|
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2419
2448
|
return new Promise((resolve5, reject) => {
|