@m4trix/evals 0.19.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +51 -13
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +51 -13
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +39 -11
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +39 -11
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +39 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +8 -1
- package/dist/index.js +39 -11
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -77,8 +77,15 @@ export const myEvaluator = Evaluator.define({
|
|
|
77
77
|
inputSchema,
|
|
78
78
|
outputSchema: S.Unknown,
|
|
79
79
|
scoreSchema: S.Struct({ scores: S.Array(S.Unknown) }),
|
|
80
|
-
}).evaluate(async ({ input, ctx: _ctx, output }) => {
|
|
80
|
+
}).evaluate(async ({ input, ctx: _ctx, output, createError }) => {
|
|
81
81
|
const start = Date.now();
|
|
82
|
+
const value = 85;
|
|
83
|
+
if (value < 50) {
|
|
84
|
+
return createError(
|
|
85
|
+
{ reason: 'score below minimum', value, prompt: input.prompt, output },
|
|
86
|
+
{ label: 'quality-check' },
|
|
87
|
+
);
|
|
88
|
+
}
|
|
82
89
|
const latencyMs = Date.now() - start;
|
|
83
90
|
const minScore =
|
|
84
91
|
typeof output === 'object' &&
|
|
@@ -90,7 +97,7 @@ export const myEvaluator = Evaluator.define({
|
|
|
90
97
|
return {
|
|
91
98
|
scores: [
|
|
92
99
|
percentScore.make(
|
|
93
|
-
{ value
|
|
100
|
+
{ value },
|
|
94
101
|
{ definePassed: (d) => d.value >= (minScore ?? 50) },
|
|
95
102
|
),
|
|
96
103
|
],
|
package/dist/cli-simple.cjs
CHANGED
|
@@ -294,6 +294,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
294
294
|
function formatLogMessage(msg) {
|
|
295
295
|
if (typeof msg === "string")
|
|
296
296
|
return msg;
|
|
297
|
+
if (msg instanceof Error)
|
|
298
|
+
return msg.stack ?? msg.message;
|
|
297
299
|
try {
|
|
298
300
|
if (msg !== null && typeof msg === "object") {
|
|
299
301
|
return JSON.stringify(msg, null, 2);
|
|
@@ -633,6 +635,7 @@ function toNumericScore(value) {
|
|
|
633
635
|
}
|
|
634
636
|
|
|
635
637
|
// src/runner/execution.ts
|
|
638
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
636
639
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
637
640
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
638
641
|
if (scoresWithPassed.length > 0) {
|
|
@@ -689,20 +692,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
689
692
|
if (!evaluateFn) {
|
|
690
693
|
continue;
|
|
691
694
|
}
|
|
695
|
+
const logs = [];
|
|
696
|
+
const logDiff = (expected, actual, options) => {
|
|
697
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
698
|
+
};
|
|
699
|
+
const log = (message, options) => {
|
|
700
|
+
logs.push(createLogEntry(message, options));
|
|
701
|
+
};
|
|
702
|
+
const createError = (message, options) => {
|
|
703
|
+
const entry = createLogEntry(message, options);
|
|
704
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
705
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
706
|
+
return error;
|
|
707
|
+
};
|
|
692
708
|
try {
|
|
693
|
-
const logs = [];
|
|
694
|
-
const logDiff = (expected, actual, options) => {
|
|
695
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
696
|
-
};
|
|
697
|
-
const log = (message, options) => {
|
|
698
|
-
logs.push(createLogEntry(message, options));
|
|
699
|
-
};
|
|
700
709
|
const ctx = yield* effect.Effect.promise(
|
|
701
710
|
() => Promise.resolve(evaluator.resolveContext())
|
|
702
711
|
);
|
|
703
712
|
const result = yield* effect.Effect.promise(
|
|
704
|
-
() => Promise.resolve(
|
|
705
|
-
evaluateFn({
|
|
713
|
+
() => Promise.resolve().then(
|
|
714
|
+
() => evaluateFn({
|
|
706
715
|
input: testCaseItem.testCase.getInput(),
|
|
707
716
|
ctx,
|
|
708
717
|
output,
|
|
@@ -712,10 +721,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
712
721
|
datasetId: task.datasetId
|
|
713
722
|
},
|
|
714
723
|
logDiff,
|
|
715
|
-
log
|
|
724
|
+
log,
|
|
725
|
+
createError
|
|
716
726
|
})
|
|
717
727
|
)
|
|
718
728
|
);
|
|
729
|
+
if (result instanceof Error) {
|
|
730
|
+
const evaluatorError = result;
|
|
731
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
732
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
733
|
+
testCaseError = result.message;
|
|
734
|
+
evaluatorScores.push({
|
|
735
|
+
evaluatorId,
|
|
736
|
+
scores: [],
|
|
737
|
+
passed: false,
|
|
738
|
+
logs: logs.length > 0 ? logs : void 0
|
|
739
|
+
});
|
|
740
|
+
continue;
|
|
741
|
+
}
|
|
719
742
|
const { scores, metrics } = normalizeResult(result);
|
|
720
743
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
721
744
|
evaluatorScores.push({
|
|
@@ -726,11 +749,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
726
749
|
logs: logs.length > 0 ? logs : void 0
|
|
727
750
|
});
|
|
728
751
|
} catch (error) {
|
|
752
|
+
if (error instanceof Error) {
|
|
753
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
754
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
755
|
+
}
|
|
729
756
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
730
757
|
evaluatorScores.push({
|
|
731
758
|
evaluatorId,
|
|
732
759
|
scores: [],
|
|
733
|
-
passed: false
|
|
760
|
+
passed: false,
|
|
761
|
+
logs: logs.length > 0 ? logs : void 0
|
|
734
762
|
});
|
|
735
763
|
}
|
|
736
764
|
}
|
|
@@ -1714,6 +1742,7 @@ function RunView({
|
|
|
1714
1742
|
rerunTotal: event.rerunTotal,
|
|
1715
1743
|
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1716
1744
|
passed: events.every((e) => e.passed),
|
|
1745
|
+
errorMessage: event.errorMessage,
|
|
1717
1746
|
events,
|
|
1718
1747
|
aggregatedEvaluatorScores,
|
|
1719
1748
|
isAggregated
|
|
@@ -1824,8 +1853,13 @@ function RunView({
|
|
|
1824
1853
|
" (",
|
|
1825
1854
|
tc.durationMs,
|
|
1826
1855
|
"ms)"
|
|
1827
|
-
] })
|
|
1856
|
+
] }),
|
|
1857
|
+
tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "red", bold: true, children: [
|
|
1858
|
+
" ",
|
|
1859
|
+
"ERROR"
|
|
1860
|
+
] }) : null
|
|
1828
1861
|
] }),
|
|
1862
|
+
tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: tc.errorMessage }) : null,
|
|
1829
1863
|
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1830
1864
|
ink.Box,
|
|
1831
1865
|
{
|
|
@@ -2363,9 +2397,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2363
2397
|
0
|
|
2364
2398
|
);
|
|
2365
2399
|
const lines = [];
|
|
2400
|
+
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2366
2401
|
lines.push(
|
|
2367
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
2402
|
+
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
|
|
2368
2403
|
);
|
|
2404
|
+
if (event.errorMessage) {
|
|
2405
|
+
lines.push(colorize(event.errorMessage, ansi2.red));
|
|
2406
|
+
}
|
|
2369
2407
|
for (const item of aggregatedScores) {
|
|
2370
2408
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2371
2409
|
lines.push(
|