@m4trix/evals 0.18.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +142 -42
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +142 -42
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +100 -30
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +100 -30
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +97 -28
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +21 -9
- package/dist/index.js +97 -28
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -77,8 +77,15 @@ export const myEvaluator = Evaluator.define({
|
|
|
77
77
|
inputSchema,
|
|
78
78
|
outputSchema: S.Unknown,
|
|
79
79
|
scoreSchema: S.Struct({ scores: S.Array(S.Unknown) }),
|
|
80
|
-
}).evaluate(async ({ input, ctx: _ctx, output }) => {
|
|
80
|
+
}).evaluate(async ({ input, ctx: _ctx, output, createError }) => {
|
|
81
81
|
const start = Date.now();
|
|
82
|
+
const value = 85;
|
|
83
|
+
if (value < 50) {
|
|
84
|
+
return createError(
|
|
85
|
+
{ reason: 'score below minimum', value, prompt: input.prompt, output },
|
|
86
|
+
{ label: 'quality-check' },
|
|
87
|
+
);
|
|
88
|
+
}
|
|
82
89
|
const latencyMs = Date.now() - start;
|
|
83
90
|
const minScore =
|
|
84
91
|
typeof output === 'object' &&
|
|
@@ -90,7 +97,7 @@ export const myEvaluator = Evaluator.define({
|
|
|
90
97
|
return {
|
|
91
98
|
scores: [
|
|
92
99
|
percentScore.make(
|
|
93
|
-
{ value
|
|
100
|
+
{ value },
|
|
94
101
|
{ definePassed: (d) => d.value >= (minScore ?? 50) },
|
|
95
102
|
),
|
|
96
103
|
],
|
package/dist/cli-simple.cjs
CHANGED
|
@@ -294,6 +294,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
294
294
|
function formatLogMessage(msg) {
|
|
295
295
|
if (typeof msg === "string")
|
|
296
296
|
return msg;
|
|
297
|
+
if (msg instanceof Error)
|
|
298
|
+
return msg.stack ?? msg.message;
|
|
297
299
|
try {
|
|
298
300
|
if (msg !== null && typeof msg === "object") {
|
|
299
301
|
return JSON.stringify(msg, null, 2);
|
|
@@ -347,7 +349,11 @@ var Metric = {
|
|
|
347
349
|
name: config.name,
|
|
348
350
|
aggregate: config.aggregate,
|
|
349
351
|
format: config.format,
|
|
350
|
-
make: (data) => ({
|
|
352
|
+
make: (data, options) => ({
|
|
353
|
+
id: config.id,
|
|
354
|
+
data,
|
|
355
|
+
...options?.name !== void 0 && { name: options.name }
|
|
356
|
+
})
|
|
351
357
|
};
|
|
352
358
|
registry.set(config.id, def);
|
|
353
359
|
return def;
|
|
@@ -369,25 +375,61 @@ var ScoreAggregate = {
|
|
|
369
375
|
const count = values.length || 1;
|
|
370
376
|
const result = {};
|
|
371
377
|
for (const field of fields) {
|
|
372
|
-
result[field] = values.reduce(
|
|
378
|
+
result[field] = values.reduce(
|
|
379
|
+
(s, v) => s + (v[field] ?? 0),
|
|
380
|
+
0
|
|
381
|
+
) / count;
|
|
373
382
|
}
|
|
374
383
|
return result;
|
|
375
384
|
};
|
|
376
385
|
},
|
|
377
|
-
/** Average
|
|
378
|
-
averageWithVariance(
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
386
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
387
|
+
averageWithVariance(fields) {
|
|
388
|
+
return (values) => {
|
|
389
|
+
const count = values.length;
|
|
390
|
+
const result = {};
|
|
391
|
+
for (const field of fields) {
|
|
392
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
393
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
394
|
+
0
|
|
395
|
+
) / count;
|
|
396
|
+
}
|
|
397
|
+
const valueField = "value";
|
|
398
|
+
const hasValueField = fields.includes(valueField);
|
|
399
|
+
if (count === 0) {
|
|
400
|
+
if (hasValueField) {
|
|
401
|
+
result[valueField] = 0;
|
|
402
|
+
}
|
|
403
|
+
return {
|
|
404
|
+
...result,
|
|
405
|
+
stdDev: void 0,
|
|
406
|
+
count: 0
|
|
407
|
+
};
|
|
408
|
+
}
|
|
409
|
+
let stdDev;
|
|
410
|
+
if (hasValueField && count >= 2) {
|
|
411
|
+
const sum = values.reduce(
|
|
412
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
413
|
+
0
|
|
414
|
+
);
|
|
415
|
+
const sumSq = values.reduce(
|
|
416
|
+
(s, v) => {
|
|
417
|
+
const value = v[valueField] ?? 0;
|
|
418
|
+
return s + value * value;
|
|
419
|
+
},
|
|
420
|
+
0
|
|
421
|
+
);
|
|
422
|
+
const mean = sum / count;
|
|
423
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
424
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
425
|
+
}
|
|
426
|
+
return {
|
|
427
|
+
...values[0],
|
|
428
|
+
...result,
|
|
429
|
+
stdDev,
|
|
430
|
+
count
|
|
431
|
+
};
|
|
432
|
+
};
|
|
391
433
|
},
|
|
392
434
|
/** All runs must pass. Use for binary scores. */
|
|
393
435
|
all(values) {
|
|
@@ -421,6 +463,7 @@ var Score = {
|
|
|
421
463
|
id: config.id,
|
|
422
464
|
data,
|
|
423
465
|
...passed !== void 0 && { passed },
|
|
466
|
+
...options?.name !== void 0 && { name: options.name },
|
|
424
467
|
def
|
|
425
468
|
// Attach def so rendering/aggregation works without registry lookup
|
|
426
469
|
};
|
|
@@ -489,7 +532,7 @@ Score.of({
|
|
|
489
532
|
displayStrategy: "bar",
|
|
490
533
|
formatValue: (data) => data.value.toFixed(2),
|
|
491
534
|
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
492
|
-
aggregateValues: Score.aggregate.averageWithVariance
|
|
535
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
493
536
|
});
|
|
494
537
|
Score.of({
|
|
495
538
|
id: "delta",
|
|
@@ -518,6 +561,14 @@ Score.of({
|
|
|
518
561
|
function getScoreDef(item) {
|
|
519
562
|
return item.def ?? getScoreById(item.id);
|
|
520
563
|
}
|
|
564
|
+
function lastNonEmptyName(items) {
|
|
565
|
+
for (let i = items.length - 1; i >= 0; i--) {
|
|
566
|
+
const n = items[i].name;
|
|
567
|
+
if (n != null && n.trim().length > 0)
|
|
568
|
+
return n;
|
|
569
|
+
}
|
|
570
|
+
return void 0;
|
|
571
|
+
}
|
|
521
572
|
function aggregateScoreItems(items) {
|
|
522
573
|
if (items.length === 0)
|
|
523
574
|
return void 0;
|
|
@@ -525,7 +576,13 @@ function aggregateScoreItems(items) {
|
|
|
525
576
|
if (!def?.aggregateValues)
|
|
526
577
|
return items[items.length - 1];
|
|
527
578
|
const aggregated = def.aggregateValues(items.map((i) => i.data));
|
|
528
|
-
|
|
579
|
+
const nameOverride = lastNonEmptyName(items);
|
|
580
|
+
return {
|
|
581
|
+
...items[0],
|
|
582
|
+
data: aggregated,
|
|
583
|
+
def,
|
|
584
|
+
...nameOverride !== void 0 && { name: nameOverride }
|
|
585
|
+
};
|
|
529
586
|
}
|
|
530
587
|
function aggregateMetricItems(items) {
|
|
531
588
|
if (items.length === 0)
|
|
@@ -534,7 +591,12 @@ function aggregateMetricItems(items) {
|
|
|
534
591
|
if (!def?.aggregate)
|
|
535
592
|
return items[items.length - 1];
|
|
536
593
|
const aggregated = def.aggregate(items.map((i) => i.data));
|
|
537
|
-
|
|
594
|
+
const nameOverride = lastNonEmptyName(items);
|
|
595
|
+
return {
|
|
596
|
+
...items[0],
|
|
597
|
+
data: aggregated,
|
|
598
|
+
...nameOverride !== void 0 && { name: nameOverride }
|
|
599
|
+
};
|
|
538
600
|
}
|
|
539
601
|
function toNumericScoreFromScores(scores) {
|
|
540
602
|
for (const item of scores) {
|
|
@@ -573,6 +635,7 @@ function toNumericScore(value) {
|
|
|
573
635
|
}
|
|
574
636
|
|
|
575
637
|
// src/runner/execution.ts
|
|
638
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
576
639
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
577
640
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
578
641
|
if (scoresWithPassed.length > 0) {
|
|
@@ -629,20 +692,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
629
692
|
if (!evaluateFn) {
|
|
630
693
|
continue;
|
|
631
694
|
}
|
|
695
|
+
const logs = [];
|
|
696
|
+
const logDiff = (expected, actual, options) => {
|
|
697
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
698
|
+
};
|
|
699
|
+
const log = (message, options) => {
|
|
700
|
+
logs.push(createLogEntry(message, options));
|
|
701
|
+
};
|
|
702
|
+
const createError = (message, options) => {
|
|
703
|
+
const entry = createLogEntry(message, options);
|
|
704
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
705
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
706
|
+
return error;
|
|
707
|
+
};
|
|
632
708
|
try {
|
|
633
|
-
const logs = [];
|
|
634
|
-
const logDiff = (expected, actual, options) => {
|
|
635
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
636
|
-
};
|
|
637
|
-
const log = (message, options) => {
|
|
638
|
-
logs.push(createLogEntry(message, options));
|
|
639
|
-
};
|
|
640
709
|
const ctx = yield* effect.Effect.promise(
|
|
641
710
|
() => Promise.resolve(evaluator.resolveContext())
|
|
642
711
|
);
|
|
643
712
|
const result = yield* effect.Effect.promise(
|
|
644
|
-
() => Promise.resolve(
|
|
645
|
-
evaluateFn({
|
|
713
|
+
() => Promise.resolve().then(
|
|
714
|
+
() => evaluateFn({
|
|
646
715
|
input: testCaseItem.testCase.getInput(),
|
|
647
716
|
ctx,
|
|
648
717
|
output,
|
|
@@ -652,10 +721,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
652
721
|
datasetId: task.datasetId
|
|
653
722
|
},
|
|
654
723
|
logDiff,
|
|
655
|
-
log
|
|
724
|
+
log,
|
|
725
|
+
createError
|
|
656
726
|
})
|
|
657
727
|
)
|
|
658
728
|
);
|
|
729
|
+
if (result instanceof Error) {
|
|
730
|
+
const evaluatorError = result;
|
|
731
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
732
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
733
|
+
testCaseError = result.message;
|
|
734
|
+
evaluatorScores.push({
|
|
735
|
+
evaluatorId,
|
|
736
|
+
scores: [],
|
|
737
|
+
passed: false,
|
|
738
|
+
logs: logs.length > 0 ? logs : void 0
|
|
739
|
+
});
|
|
740
|
+
continue;
|
|
741
|
+
}
|
|
659
742
|
const { scores, metrics } = normalizeResult(result);
|
|
660
743
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
661
744
|
evaluatorScores.push({
|
|
@@ -666,11 +749,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
666
749
|
logs: logs.length > 0 ? logs : void 0
|
|
667
750
|
});
|
|
668
751
|
} catch (error) {
|
|
752
|
+
if (error instanceof Error) {
|
|
753
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
754
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
755
|
+
}
|
|
669
756
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
670
757
|
evaluatorScores.push({
|
|
671
758
|
evaluatorId,
|
|
672
759
|
scores: [],
|
|
673
|
-
passed: false
|
|
760
|
+
passed: false,
|
|
761
|
+
logs: logs.length > 0 ? logs : void 0
|
|
674
762
|
});
|
|
675
763
|
}
|
|
676
764
|
}
|
|
@@ -1654,6 +1742,7 @@ function RunView({
|
|
|
1654
1742
|
rerunTotal: event.rerunTotal,
|
|
1655
1743
|
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1656
1744
|
passed: events.every((e) => e.passed),
|
|
1745
|
+
errorMessage: event.errorMessage,
|
|
1657
1746
|
events,
|
|
1658
1747
|
aggregatedEvaluatorScores,
|
|
1659
1748
|
isAggregated
|
|
@@ -1764,8 +1853,13 @@ function RunView({
|
|
|
1764
1853
|
" (",
|
|
1765
1854
|
tc.durationMs,
|
|
1766
1855
|
"ms)"
|
|
1767
|
-
] })
|
|
1856
|
+
] }),
|
|
1857
|
+
tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "red", bold: true, children: [
|
|
1858
|
+
" ",
|
|
1859
|
+
"ERROR"
|
|
1860
|
+
] }) : null
|
|
1768
1861
|
] }),
|
|
1862
|
+
tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: tc.errorMessage }) : null,
|
|
1769
1863
|
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1770
1864
|
ink.Box,
|
|
1771
1865
|
{
|
|
@@ -1786,9 +1880,10 @@ function RunView({
|
|
|
1786
1880
|
const formatted = def.format(m.data, {
|
|
1787
1881
|
isAggregated: tc.isAggregated
|
|
1788
1882
|
});
|
|
1883
|
+
const label = m.name ?? def.name;
|
|
1789
1884
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1790
1885
|
"[",
|
|
1791
|
-
|
|
1886
|
+
label ? `${label}: ` : "",
|
|
1792
1887
|
formatted,
|
|
1793
1888
|
"]",
|
|
1794
1889
|
" "
|
|
@@ -1797,8 +1892,8 @@ function RunView({
|
|
|
1797
1892
|
] }) : null
|
|
1798
1893
|
] }),
|
|
1799
1894
|
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
1800
|
-
const def = getScoreById(s.id);
|
|
1801
|
-
const scoreLabel =
|
|
1895
|
+
const def = s.def ?? getScoreById(s.id);
|
|
1896
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
1802
1897
|
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1803
1898
|
ink.Text,
|
|
1804
1899
|
{
|
|
@@ -1902,7 +1997,7 @@ function RunView({
|
|
|
1902
1997
|
if (!aggregated)
|
|
1903
1998
|
return null;
|
|
1904
1999
|
const def = aggregated.def ?? getScoreById(aggregated.id);
|
|
1905
|
-
const label =
|
|
2000
|
+
const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
|
|
1906
2001
|
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
1907
2002
|
const numeric = toNumericScore(aggregated.data);
|
|
1908
2003
|
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
@@ -2063,7 +2158,7 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
|
|
|
2063
2158
|
if (!agg)
|
|
2064
2159
|
continue;
|
|
2065
2160
|
const def = agg.def ?? getScoreById(agg.id);
|
|
2066
|
-
const label =
|
|
2161
|
+
const label = agg.name ?? def?.name ?? def?.id ?? agg.id;
|
|
2067
2162
|
const formatted = def ? def.formatAggregate(agg.data) : "n/a";
|
|
2068
2163
|
const numeric = toNumericScore(agg.data);
|
|
2069
2164
|
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
@@ -2129,12 +2224,13 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2129
2224
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
2130
2225
|
const metricParts = [];
|
|
2131
2226
|
if (metrics && metrics.length > 0) {
|
|
2132
|
-
for (const
|
|
2133
|
-
const def = getMetricById(id);
|
|
2227
|
+
for (const m of metrics) {
|
|
2228
|
+
const def = getMetricById(m.id);
|
|
2134
2229
|
if (def) {
|
|
2135
|
-
const formatted = def.format(data, options);
|
|
2230
|
+
const formatted = def.format(m.data, options);
|
|
2231
|
+
const label = m.name ?? def.name;
|
|
2136
2232
|
metricParts.push(
|
|
2137
|
-
|
|
2233
|
+
label ? `[${label}: ${formatted}]` : `[${formatted}]`
|
|
2138
2234
|
);
|
|
2139
2235
|
}
|
|
2140
2236
|
}
|
|
@@ -2142,7 +2238,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2142
2238
|
const scoreLines = [];
|
|
2143
2239
|
for (const item of scores) {
|
|
2144
2240
|
const def = item.def ?? getScoreById(item.id);
|
|
2145
|
-
const scoreLabel =
|
|
2241
|
+
const scoreLabel = item.name ?? def?.name ?? def?.id ?? item.id;
|
|
2146
2242
|
let formatted;
|
|
2147
2243
|
if (!def) {
|
|
2148
2244
|
const numeric = toNumericScore(item.data);
|
|
@@ -2301,9 +2397,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2301
2397
|
0
|
|
2302
2398
|
);
|
|
2303
2399
|
const lines = [];
|
|
2400
|
+
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2304
2401
|
lines.push(
|
|
2305
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
2402
|
+
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
|
|
2306
2403
|
);
|
|
2404
|
+
if (event.errorMessage) {
|
|
2405
|
+
lines.push(colorize(event.errorMessage, ansi2.red));
|
|
2406
|
+
}
|
|
2307
2407
|
for (const item of aggregatedScores) {
|
|
2308
2408
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2309
2409
|
lines.push(
|