@m4trix/evals 0.18.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +142 -42
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +142 -42
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +100 -30
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +100 -30
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +97 -28
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +21 -9
- package/dist/index.js +97 -28
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -268,6 +268,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
268
268
|
function formatLogMessage(msg) {
|
|
269
269
|
if (typeof msg === "string")
|
|
270
270
|
return msg;
|
|
271
|
+
if (msg instanceof Error)
|
|
272
|
+
return msg.stack ?? msg.message;
|
|
271
273
|
try {
|
|
272
274
|
if (msg !== null && typeof msg === "object") {
|
|
273
275
|
return JSON.stringify(msg, null, 2);
|
|
@@ -321,7 +323,11 @@ var Metric = {
|
|
|
321
323
|
name: config.name,
|
|
322
324
|
aggregate: config.aggregate,
|
|
323
325
|
format: config.format,
|
|
324
|
-
make: (data) => ({
|
|
326
|
+
make: (data, options) => ({
|
|
327
|
+
id: config.id,
|
|
328
|
+
data,
|
|
329
|
+
...options?.name !== void 0 && { name: options.name }
|
|
330
|
+
})
|
|
325
331
|
};
|
|
326
332
|
registry.set(config.id, def);
|
|
327
333
|
return def;
|
|
@@ -343,25 +349,61 @@ var ScoreAggregate = {
|
|
|
343
349
|
const count = values.length || 1;
|
|
344
350
|
const result = {};
|
|
345
351
|
for (const field of fields) {
|
|
346
|
-
result[field] = values.reduce(
|
|
352
|
+
result[field] = values.reduce(
|
|
353
|
+
(s, v) => s + (v[field] ?? 0),
|
|
354
|
+
0
|
|
355
|
+
) / count;
|
|
347
356
|
}
|
|
348
357
|
return result;
|
|
349
358
|
};
|
|
350
359
|
},
|
|
351
|
-
/** Average
|
|
352
|
-
averageWithVariance(
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
360
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
361
|
+
averageWithVariance(fields) {
|
|
362
|
+
return (values) => {
|
|
363
|
+
const count = values.length;
|
|
364
|
+
const result = {};
|
|
365
|
+
for (const field of fields) {
|
|
366
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
367
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
368
|
+
0
|
|
369
|
+
) / count;
|
|
370
|
+
}
|
|
371
|
+
const valueField = "value";
|
|
372
|
+
const hasValueField = fields.includes(valueField);
|
|
373
|
+
if (count === 0) {
|
|
374
|
+
if (hasValueField) {
|
|
375
|
+
result[valueField] = 0;
|
|
376
|
+
}
|
|
377
|
+
return {
|
|
378
|
+
...result,
|
|
379
|
+
stdDev: void 0,
|
|
380
|
+
count: 0
|
|
381
|
+
};
|
|
382
|
+
}
|
|
383
|
+
let stdDev;
|
|
384
|
+
if (hasValueField && count >= 2) {
|
|
385
|
+
const sum = values.reduce(
|
|
386
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
387
|
+
0
|
|
388
|
+
);
|
|
389
|
+
const sumSq = values.reduce(
|
|
390
|
+
(s, v) => {
|
|
391
|
+
const value = v[valueField] ?? 0;
|
|
392
|
+
return s + value * value;
|
|
393
|
+
},
|
|
394
|
+
0
|
|
395
|
+
);
|
|
396
|
+
const mean = sum / count;
|
|
397
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
398
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
399
|
+
}
|
|
400
|
+
return {
|
|
401
|
+
...values[0],
|
|
402
|
+
...result,
|
|
403
|
+
stdDev,
|
|
404
|
+
count
|
|
405
|
+
};
|
|
406
|
+
};
|
|
365
407
|
},
|
|
366
408
|
/** All runs must pass. Use for binary scores. */
|
|
367
409
|
all(values) {
|
|
@@ -395,6 +437,7 @@ var Score = {
|
|
|
395
437
|
id: config.id,
|
|
396
438
|
data,
|
|
397
439
|
...passed !== void 0 && { passed },
|
|
440
|
+
...options?.name !== void 0 && { name: options.name },
|
|
398
441
|
def
|
|
399
442
|
// Attach def so rendering/aggregation works without registry lookup
|
|
400
443
|
};
|
|
@@ -463,7 +506,7 @@ Score.of({
|
|
|
463
506
|
displayStrategy: "bar",
|
|
464
507
|
formatValue: (data) => data.value.toFixed(2),
|
|
465
508
|
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
466
|
-
aggregateValues: Score.aggregate.averageWithVariance
|
|
509
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
467
510
|
});
|
|
468
511
|
Score.of({
|
|
469
512
|
id: "delta",
|
|
@@ -492,6 +535,14 @@ Score.of({
|
|
|
492
535
|
function getScoreDef(item) {
|
|
493
536
|
return item.def ?? getScoreById(item.id);
|
|
494
537
|
}
|
|
538
|
+
function lastNonEmptyName(items) {
|
|
539
|
+
for (let i = items.length - 1; i >= 0; i--) {
|
|
540
|
+
const n = items[i].name;
|
|
541
|
+
if (n != null && n.trim().length > 0)
|
|
542
|
+
return n;
|
|
543
|
+
}
|
|
544
|
+
return void 0;
|
|
545
|
+
}
|
|
495
546
|
function aggregateScoreItems(items) {
|
|
496
547
|
if (items.length === 0)
|
|
497
548
|
return void 0;
|
|
@@ -499,7 +550,13 @@ function aggregateScoreItems(items) {
|
|
|
499
550
|
if (!def?.aggregateValues)
|
|
500
551
|
return items[items.length - 1];
|
|
501
552
|
const aggregated = def.aggregateValues(items.map((i) => i.data));
|
|
502
|
-
|
|
553
|
+
const nameOverride = lastNonEmptyName(items);
|
|
554
|
+
return {
|
|
555
|
+
...items[0],
|
|
556
|
+
data: aggregated,
|
|
557
|
+
def,
|
|
558
|
+
...nameOverride !== void 0 && { name: nameOverride }
|
|
559
|
+
};
|
|
503
560
|
}
|
|
504
561
|
function aggregateMetricItems(items) {
|
|
505
562
|
if (items.length === 0)
|
|
@@ -508,7 +565,12 @@ function aggregateMetricItems(items) {
|
|
|
508
565
|
if (!def?.aggregate)
|
|
509
566
|
return items[items.length - 1];
|
|
510
567
|
const aggregated = def.aggregate(items.map((i) => i.data));
|
|
511
|
-
|
|
568
|
+
const nameOverride = lastNonEmptyName(items);
|
|
569
|
+
return {
|
|
570
|
+
...items[0],
|
|
571
|
+
data: aggregated,
|
|
572
|
+
...nameOverride !== void 0 && { name: nameOverride }
|
|
573
|
+
};
|
|
512
574
|
}
|
|
513
575
|
function toNumericScoreFromScores(scores) {
|
|
514
576
|
for (const item of scores) {
|
|
@@ -547,6 +609,7 @@ function toNumericScore(value) {
|
|
|
547
609
|
}
|
|
548
610
|
|
|
549
611
|
// src/runner/execution.ts
|
|
612
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
550
613
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
551
614
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
552
615
|
if (scoresWithPassed.length > 0) {
|
|
@@ -603,20 +666,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
603
666
|
if (!evaluateFn) {
|
|
604
667
|
continue;
|
|
605
668
|
}
|
|
669
|
+
const logs = [];
|
|
670
|
+
const logDiff = (expected, actual, options) => {
|
|
671
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
672
|
+
};
|
|
673
|
+
const log = (message, options) => {
|
|
674
|
+
logs.push(createLogEntry(message, options));
|
|
675
|
+
};
|
|
676
|
+
const createError = (message, options) => {
|
|
677
|
+
const entry = createLogEntry(message, options);
|
|
678
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
679
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
680
|
+
return error;
|
|
681
|
+
};
|
|
606
682
|
try {
|
|
607
|
-
const logs = [];
|
|
608
|
-
const logDiff = (expected, actual, options) => {
|
|
609
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
610
|
-
};
|
|
611
|
-
const log = (message, options) => {
|
|
612
|
-
logs.push(createLogEntry(message, options));
|
|
613
|
-
};
|
|
614
683
|
const ctx = yield* Effect.promise(
|
|
615
684
|
() => Promise.resolve(evaluator.resolveContext())
|
|
616
685
|
);
|
|
617
686
|
const result = yield* Effect.promise(
|
|
618
|
-
() => Promise.resolve(
|
|
619
|
-
evaluateFn({
|
|
687
|
+
() => Promise.resolve().then(
|
|
688
|
+
() => evaluateFn({
|
|
620
689
|
input: testCaseItem.testCase.getInput(),
|
|
621
690
|
ctx,
|
|
622
691
|
output,
|
|
@@ -626,10 +695,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
626
695
|
datasetId: task.datasetId
|
|
627
696
|
},
|
|
628
697
|
logDiff,
|
|
629
|
-
log
|
|
698
|
+
log,
|
|
699
|
+
createError
|
|
630
700
|
})
|
|
631
701
|
)
|
|
632
702
|
);
|
|
703
|
+
if (result instanceof Error) {
|
|
704
|
+
const evaluatorError = result;
|
|
705
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
706
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
707
|
+
testCaseError = result.message;
|
|
708
|
+
evaluatorScores.push({
|
|
709
|
+
evaluatorId,
|
|
710
|
+
scores: [],
|
|
711
|
+
passed: false,
|
|
712
|
+
logs: logs.length > 0 ? logs : void 0
|
|
713
|
+
});
|
|
714
|
+
continue;
|
|
715
|
+
}
|
|
633
716
|
const { scores, metrics } = normalizeResult(result);
|
|
634
717
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
635
718
|
evaluatorScores.push({
|
|
@@ -640,11 +723,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
640
723
|
logs: logs.length > 0 ? logs : void 0
|
|
641
724
|
});
|
|
642
725
|
} catch (error) {
|
|
726
|
+
if (error instanceof Error) {
|
|
727
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
728
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
729
|
+
}
|
|
643
730
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
644
731
|
evaluatorScores.push({
|
|
645
732
|
evaluatorId,
|
|
646
733
|
scores: [],
|
|
647
|
-
passed: false
|
|
734
|
+
passed: false,
|
|
735
|
+
logs: logs.length > 0 ? logs : void 0
|
|
648
736
|
});
|
|
649
737
|
}
|
|
650
738
|
}
|
|
@@ -1628,6 +1716,7 @@ function RunView({
|
|
|
1628
1716
|
rerunTotal: event.rerunTotal,
|
|
1629
1717
|
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1630
1718
|
passed: events.every((e) => e.passed),
|
|
1719
|
+
errorMessage: event.errorMessage,
|
|
1631
1720
|
events,
|
|
1632
1721
|
aggregatedEvaluatorScores,
|
|
1633
1722
|
isAggregated
|
|
@@ -1738,8 +1827,13 @@ function RunView({
|
|
|
1738
1827
|
" (",
|
|
1739
1828
|
tc.durationMs,
|
|
1740
1829
|
"ms)"
|
|
1741
|
-
] })
|
|
1830
|
+
] }),
|
|
1831
|
+
tc.errorMessage ? /* @__PURE__ */ jsxs(Text, { color: "red", bold: true, children: [
|
|
1832
|
+
" ",
|
|
1833
|
+
"ERROR"
|
|
1834
|
+
] }) : null
|
|
1742
1835
|
] }),
|
|
1836
|
+
tc.errorMessage ? /* @__PURE__ */ jsx(Text, { color: "red", children: tc.errorMessage }) : null,
|
|
1743
1837
|
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
|
|
1744
1838
|
Box,
|
|
1745
1839
|
{
|
|
@@ -1760,9 +1854,10 @@ function RunView({
|
|
|
1760
1854
|
const formatted = def.format(m.data, {
|
|
1761
1855
|
isAggregated: tc.isAggregated
|
|
1762
1856
|
});
|
|
1857
|
+
const label = m.name ?? def.name;
|
|
1763
1858
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1764
1859
|
"[",
|
|
1765
|
-
|
|
1860
|
+
label ? `${label}: ` : "",
|
|
1766
1861
|
formatted,
|
|
1767
1862
|
"]",
|
|
1768
1863
|
" "
|
|
@@ -1771,8 +1866,8 @@ function RunView({
|
|
|
1771
1866
|
] }) : null
|
|
1772
1867
|
] }),
|
|
1773
1868
|
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
1774
|
-
const def = getScoreById(s.id);
|
|
1775
|
-
const scoreLabel =
|
|
1869
|
+
const def = s.def ?? getScoreById(s.id);
|
|
1870
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
1776
1871
|
return /* @__PURE__ */ jsxs(
|
|
1777
1872
|
Text,
|
|
1778
1873
|
{
|
|
@@ -1876,7 +1971,7 @@ function RunView({
|
|
|
1876
1971
|
if (!aggregated)
|
|
1877
1972
|
return null;
|
|
1878
1973
|
const def = aggregated.def ?? getScoreById(aggregated.id);
|
|
1879
|
-
const label =
|
|
1974
|
+
const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
|
|
1880
1975
|
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
1881
1976
|
const numeric = toNumericScore(aggregated.data);
|
|
1882
1977
|
return /* @__PURE__ */ jsxs(
|
|
@@ -2037,7 +2132,7 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
|
|
|
2037
2132
|
if (!agg)
|
|
2038
2133
|
continue;
|
|
2039
2134
|
const def = agg.def ?? getScoreById(agg.id);
|
|
2040
|
-
const label =
|
|
2135
|
+
const label = agg.name ?? def?.name ?? def?.id ?? agg.id;
|
|
2041
2136
|
const formatted = def ? def.formatAggregate(agg.data) : "n/a";
|
|
2042
2137
|
const numeric = toNumericScore(agg.data);
|
|
2043
2138
|
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
@@ -2103,12 +2198,13 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2103
2198
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
2104
2199
|
const metricParts = [];
|
|
2105
2200
|
if (metrics && metrics.length > 0) {
|
|
2106
|
-
for (const { id, data } of metrics) {
|
|
2107
|
-
const def = getMetricById(id);
|
|
2201
|
+
for (const m of metrics) {
|
|
2202
|
+
const def = getMetricById(m.id);
|
|
2108
2203
|
if (def) {
|
|
2109
|
-
const formatted = def.format(data, options);
|
|
2204
|
+
const formatted = def.format(m.data, options);
|
|
2205
|
+
const label = m.name ?? def.name;
|
|
2110
2206
|
metricParts.push(
|
|
2111
|
-
|
|
2207
|
+
label ? `[${label}: ${formatted}]` : `[${formatted}]`
|
|
2112
2208
|
);
|
|
2113
2209
|
}
|
|
2114
2210
|
}
|
|
@@ -2116,7 +2212,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2116
2212
|
const scoreLines = [];
|
|
2117
2213
|
for (const item of scores) {
|
|
2118
2214
|
const def = item.def ?? getScoreById(item.id);
|
|
2119
|
-
const scoreLabel =
|
|
2215
|
+
const scoreLabel = item.name ?? def?.name ?? def?.id ?? item.id;
|
|
2120
2216
|
let formatted;
|
|
2121
2217
|
if (!def) {
|
|
2122
2218
|
const numeric = toNumericScore(item.data);
|
|
@@ -2275,9 +2371,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2275
2371
|
0
|
|
2276
2372
|
);
|
|
2277
2373
|
const lines = [];
|
|
2374
|
+
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2278
2375
|
lines.push(
|
|
2279
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
2376
|
+
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
|
|
2280
2377
|
);
|
|
2378
|
+
if (event.errorMessage) {
|
|
2379
|
+
lines.push(colorize(event.errorMessage, ansi2.red));
|
|
2380
|
+
}
|
|
2281
2381
|
for (const item of aggregatedScores) {
|
|
2282
2382
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2283
2383
|
lines.push(
|