@m4trix/evals 0.19.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +135 -26
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +135 -23
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +56 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +56 -12
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +56 -12
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +17 -1
- package/dist/index.js +56 -12
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -257,8 +257,15 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
|
|
|
257
257
|
log: (message: unknown, options?: {
|
|
258
258
|
label?: string;
|
|
259
259
|
}) => void;
|
|
260
|
+
/**
|
|
261
|
+
* Creates an Error from string/object payloads for `return createError(...)` (or `throw createError(...)`).
|
|
262
|
+
* The payload is also logged and shown by the CLI when the evaluator fails.
|
|
263
|
+
*/
|
|
264
|
+
createError: (message: unknown, options?: {
|
|
265
|
+
label?: string;
|
|
266
|
+
}) => Error;
|
|
260
267
|
}
|
|
261
|
-
type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Promise<TScore>;
|
|
268
|
+
type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Error | Promise<TScore | Error>;
|
|
262
269
|
interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
|
|
263
270
|
name: string;
|
|
264
271
|
inputSchema: TI;
|
|
@@ -430,6 +437,15 @@ type RunnerEvent = {
|
|
|
430
437
|
type: 'RunStarted';
|
|
431
438
|
runId: string;
|
|
432
439
|
startedAt: number;
|
|
440
|
+
} | {
|
|
441
|
+
type: 'TestCaseStarted';
|
|
442
|
+
runId: string;
|
|
443
|
+
testCaseId: string;
|
|
444
|
+
testCaseName: string;
|
|
445
|
+
startedTestCases: number;
|
|
446
|
+
totalTestCases: number;
|
|
447
|
+
rerunIndex: number;
|
|
448
|
+
rerunTotal: number;
|
|
433
449
|
} | {
|
|
434
450
|
type: 'TestCaseProgress';
|
|
435
451
|
runId: string;
|
package/dist/index.js
CHANGED
|
@@ -716,6 +716,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
716
716
|
function formatLogMessage(msg) {
|
|
717
717
|
if (typeof msg === "string")
|
|
718
718
|
return msg;
|
|
719
|
+
if (msg instanceof Error)
|
|
720
|
+
return msg.stack ?? msg.message;
|
|
719
721
|
try {
|
|
720
722
|
if (msg !== null && typeof msg === "object") {
|
|
721
723
|
return JSON.stringify(msg, null, 2);
|
|
@@ -1062,6 +1064,7 @@ function toNumericScore(value) {
|
|
|
1062
1064
|
}
|
|
1063
1065
|
|
|
1064
1066
|
// src/runner/execution.ts
|
|
1067
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
1065
1068
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1066
1069
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1067
1070
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1103,13 +1106,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1103
1106
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1104
1107
|
);
|
|
1105
1108
|
}
|
|
1106
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1109
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
1107
1110
|
return Effect.gen(function* () {
|
|
1108
1111
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1109
1112
|
const rerunPassed = [];
|
|
1110
1113
|
for (let r = 0; r < reruns; r++) {
|
|
1111
1114
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1112
1115
|
const started = Date.now();
|
|
1116
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
1117
|
+
n + 1,
|
|
1118
|
+
n + 1
|
|
1119
|
+
]);
|
|
1120
|
+
yield* publishEvent({
|
|
1121
|
+
type: "TestCaseStarted",
|
|
1122
|
+
runId: task.runId,
|
|
1123
|
+
testCaseId: testCaseItem.id,
|
|
1124
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1125
|
+
startedTestCases: startedEvaluations,
|
|
1126
|
+
totalTestCases: totalEvaluations,
|
|
1127
|
+
rerunIndex: r + 1,
|
|
1128
|
+
rerunTotal: reruns
|
|
1129
|
+
});
|
|
1113
1130
|
const evaluatorScores = [];
|
|
1114
1131
|
let testCaseError;
|
|
1115
1132
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -1118,20 +1135,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1118
1135
|
if (!evaluateFn) {
|
|
1119
1136
|
continue;
|
|
1120
1137
|
}
|
|
1138
|
+
const logs = [];
|
|
1139
|
+
const logDiff = (expected, actual, options) => {
|
|
1140
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1141
|
+
};
|
|
1142
|
+
const log = (message, options) => {
|
|
1143
|
+
logs.push(createLogEntry(message, options));
|
|
1144
|
+
};
|
|
1145
|
+
const createError = (message, options) => {
|
|
1146
|
+
const entry = createLogEntry(message, options);
|
|
1147
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1148
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1149
|
+
return error;
|
|
1150
|
+
};
|
|
1121
1151
|
try {
|
|
1122
|
-
const logs = [];
|
|
1123
|
-
const logDiff = (expected, actual, options) => {
|
|
1124
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1125
|
-
};
|
|
1126
|
-
const log = (message, options) => {
|
|
1127
|
-
logs.push(createLogEntry(message, options));
|
|
1128
|
-
};
|
|
1129
1152
|
const ctx = yield* Effect.promise(
|
|
1130
1153
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1131
1154
|
);
|
|
1132
1155
|
const result = yield* Effect.promise(
|
|
1133
|
-
() => Promise.resolve(
|
|
1134
|
-
evaluateFn({
|
|
1156
|
+
() => Promise.resolve().then(
|
|
1157
|
+
() => evaluateFn({
|
|
1135
1158
|
input: testCaseItem.testCase.getInput(),
|
|
1136
1159
|
ctx,
|
|
1137
1160
|
output,
|
|
@@ -1141,10 +1164,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1141
1164
|
datasetId: task.datasetId
|
|
1142
1165
|
},
|
|
1143
1166
|
logDiff,
|
|
1144
|
-
log
|
|
1167
|
+
log,
|
|
1168
|
+
createError
|
|
1145
1169
|
})
|
|
1146
1170
|
)
|
|
1147
1171
|
);
|
|
1172
|
+
if (result instanceof Error) {
|
|
1173
|
+
const evaluatorError = result;
|
|
1174
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1175
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1176
|
+
testCaseError = result.message;
|
|
1177
|
+
evaluatorScores.push({
|
|
1178
|
+
evaluatorId,
|
|
1179
|
+
scores: [],
|
|
1180
|
+
passed: false,
|
|
1181
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1182
|
+
});
|
|
1183
|
+
continue;
|
|
1184
|
+
}
|
|
1148
1185
|
const { scores, metrics } = normalizeResult(result);
|
|
1149
1186
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1150
1187
|
evaluatorScores.push({
|
|
@@ -1155,11 +1192,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1155
1192
|
logs: logs.length > 0 ? logs : void 0
|
|
1156
1193
|
});
|
|
1157
1194
|
} catch (error) {
|
|
1195
|
+
if (error instanceof Error) {
|
|
1196
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1197
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1198
|
+
}
|
|
1158
1199
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1159
1200
|
evaluatorScores.push({
|
|
1160
1201
|
evaluatorId,
|
|
1161
1202
|
scores: [],
|
|
1162
|
-
passed: false
|
|
1203
|
+
passed: false,
|
|
1204
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1163
1205
|
});
|
|
1164
1206
|
}
|
|
1165
1207
|
}
|
|
@@ -1230,6 +1272,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1230
1272
|
);
|
|
1231
1273
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1232
1274
|
const completedRef = yield* Ref.make(0);
|
|
1275
|
+
const startedRef = yield* Ref.make(0);
|
|
1233
1276
|
const passedRef = yield* Ref.make(0);
|
|
1234
1277
|
const failedRef = yield* Ref.make(0);
|
|
1235
1278
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -1239,6 +1282,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1239
1282
|
publishEvent,
|
|
1240
1283
|
persistenceQueue,
|
|
1241
1284
|
updateSnapshot,
|
|
1285
|
+
startedRef,
|
|
1242
1286
|
completedRef,
|
|
1243
1287
|
passedRef,
|
|
1244
1288
|
failedRef
|