@m4trix/evals 0.19.0 → 0.20.0
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +51 -13
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +51 -13
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +39 -11
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +39 -11
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +39 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +8 -1
- package/dist/index.js +39 -11
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -257,8 +257,15 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
|
|
|
257
257
|
log: (message: unknown, options?: {
|
|
258
258
|
label?: string;
|
|
259
259
|
}) => void;
|
|
260
|
+
/**
|
|
261
|
+
* Creates an Error from string/object payloads for `return createError(...)` (or `throw createError(...)`).
|
|
262
|
+
* The payload is also logged and shown by the CLI when the evaluator fails.
|
|
263
|
+
*/
|
|
264
|
+
createError: (message: unknown, options?: {
|
|
265
|
+
label?: string;
|
|
266
|
+
}) => Error;
|
|
260
267
|
}
|
|
261
|
-
type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Promise<TScore>;
|
|
268
|
+
type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Error | Promise<TScore | Error>;
|
|
262
269
|
interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
|
|
263
270
|
name: string;
|
|
264
271
|
inputSchema: TI;
|
package/dist/index.js
CHANGED
|
@@ -716,6 +716,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
716
716
|
function formatLogMessage(msg) {
|
|
717
717
|
if (typeof msg === "string")
|
|
718
718
|
return msg;
|
|
719
|
+
if (msg instanceof Error)
|
|
720
|
+
return msg.stack ?? msg.message;
|
|
719
721
|
try {
|
|
720
722
|
if (msg !== null && typeof msg === "object") {
|
|
721
723
|
return JSON.stringify(msg, null, 2);
|
|
@@ -1062,6 +1064,7 @@ function toNumericScore(value) {
|
|
|
1062
1064
|
}
|
|
1063
1065
|
|
|
1064
1066
|
// src/runner/execution.ts
|
|
1067
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
1065
1068
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1066
1069
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1067
1070
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1118,20 +1121,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1118
1121
|
if (!evaluateFn) {
|
|
1119
1122
|
continue;
|
|
1120
1123
|
}
|
|
1124
|
+
const logs = [];
|
|
1125
|
+
const logDiff = (expected, actual, options) => {
|
|
1126
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1127
|
+
};
|
|
1128
|
+
const log = (message, options) => {
|
|
1129
|
+
logs.push(createLogEntry(message, options));
|
|
1130
|
+
};
|
|
1131
|
+
const createError = (message, options) => {
|
|
1132
|
+
const entry = createLogEntry(message, options);
|
|
1133
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1134
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1135
|
+
return error;
|
|
1136
|
+
};
|
|
1121
1137
|
try {
|
|
1122
|
-
const logs = [];
|
|
1123
|
-
const logDiff = (expected, actual, options) => {
|
|
1124
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1125
|
-
};
|
|
1126
|
-
const log = (message, options) => {
|
|
1127
|
-
logs.push(createLogEntry(message, options));
|
|
1128
|
-
};
|
|
1129
1138
|
const ctx = yield* Effect.promise(
|
|
1130
1139
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1131
1140
|
);
|
|
1132
1141
|
const result = yield* Effect.promise(
|
|
1133
|
-
() => Promise.resolve(
|
|
1134
|
-
evaluateFn({
|
|
1142
|
+
() => Promise.resolve().then(
|
|
1143
|
+
() => evaluateFn({
|
|
1135
1144
|
input: testCaseItem.testCase.getInput(),
|
|
1136
1145
|
ctx,
|
|
1137
1146
|
output,
|
|
@@ -1141,10 +1150,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1141
1150
|
datasetId: task.datasetId
|
|
1142
1151
|
},
|
|
1143
1152
|
logDiff,
|
|
1144
|
-
log
|
|
1153
|
+
log,
|
|
1154
|
+
createError
|
|
1145
1155
|
})
|
|
1146
1156
|
)
|
|
1147
1157
|
);
|
|
1158
|
+
if (result instanceof Error) {
|
|
1159
|
+
const evaluatorError = result;
|
|
1160
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1161
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1162
|
+
testCaseError = result.message;
|
|
1163
|
+
evaluatorScores.push({
|
|
1164
|
+
evaluatorId,
|
|
1165
|
+
scores: [],
|
|
1166
|
+
passed: false,
|
|
1167
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1168
|
+
});
|
|
1169
|
+
continue;
|
|
1170
|
+
}
|
|
1148
1171
|
const { scores, metrics } = normalizeResult(result);
|
|
1149
1172
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1150
1173
|
evaluatorScores.push({
|
|
@@ -1155,11 +1178,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1155
1178
|
logs: logs.length > 0 ? logs : void 0
|
|
1156
1179
|
});
|
|
1157
1180
|
} catch (error) {
|
|
1181
|
+
if (error instanceof Error) {
|
|
1182
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1183
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1184
|
+
}
|
|
1158
1185
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1159
1186
|
evaluatorScores.push({
|
|
1160
1187
|
evaluatorId,
|
|
1161
1188
|
scores: [],
|
|
1162
|
-
passed: false
|
|
1189
|
+
passed: false,
|
|
1190
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1163
1191
|
});
|
|
1164
1192
|
}
|
|
1165
1193
|
}
|