@m4trix/evals 0.19.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +135 -26
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +135 -23
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +56 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +56 -12
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +56 -12
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +17 -1
- package/dist/index.js +56 -12
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -738,6 +738,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
738
738
|
function formatLogMessage(msg) {
|
|
739
739
|
if (typeof msg === "string")
|
|
740
740
|
return msg;
|
|
741
|
+
if (msg instanceof Error)
|
|
742
|
+
return msg.stack ?? msg.message;
|
|
741
743
|
try {
|
|
742
744
|
if (msg !== null && typeof msg === "object") {
|
|
743
745
|
return JSON.stringify(msg, null, 2);
|
|
@@ -1084,6 +1086,7 @@ function toNumericScore(value) {
|
|
|
1084
1086
|
}
|
|
1085
1087
|
|
|
1086
1088
|
// src/runner/execution.ts
|
|
1089
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
1087
1090
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1088
1091
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1089
1092
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1125,13 +1128,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1125
1128
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1126
1129
|
);
|
|
1127
1130
|
}
|
|
1128
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1131
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
1129
1132
|
return effect.Effect.gen(function* () {
|
|
1130
1133
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1131
1134
|
const rerunPassed = [];
|
|
1132
1135
|
for (let r = 0; r < reruns; r++) {
|
|
1133
1136
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1134
1137
|
const started = Date.now();
|
|
1138
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
1139
|
+
n + 1,
|
|
1140
|
+
n + 1
|
|
1141
|
+
]);
|
|
1142
|
+
yield* publishEvent({
|
|
1143
|
+
type: "TestCaseStarted",
|
|
1144
|
+
runId: task.runId,
|
|
1145
|
+
testCaseId: testCaseItem.id,
|
|
1146
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1147
|
+
startedTestCases: startedEvaluations,
|
|
1148
|
+
totalTestCases: totalEvaluations,
|
|
1149
|
+
rerunIndex: r + 1,
|
|
1150
|
+
rerunTotal: reruns
|
|
1151
|
+
});
|
|
1135
1152
|
const evaluatorScores = [];
|
|
1136
1153
|
let testCaseError;
|
|
1137
1154
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -1140,20 +1157,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1140
1157
|
if (!evaluateFn) {
|
|
1141
1158
|
continue;
|
|
1142
1159
|
}
|
|
1160
|
+
const logs = [];
|
|
1161
|
+
const logDiff = (expected, actual, options) => {
|
|
1162
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1163
|
+
};
|
|
1164
|
+
const log = (message, options) => {
|
|
1165
|
+
logs.push(createLogEntry(message, options));
|
|
1166
|
+
};
|
|
1167
|
+
const createError = (message, options) => {
|
|
1168
|
+
const entry = createLogEntry(message, options);
|
|
1169
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1170
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1171
|
+
return error;
|
|
1172
|
+
};
|
|
1143
1173
|
try {
|
|
1144
|
-
const logs = [];
|
|
1145
|
-
const logDiff = (expected, actual, options) => {
|
|
1146
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1147
|
-
};
|
|
1148
|
-
const log = (message, options) => {
|
|
1149
|
-
logs.push(createLogEntry(message, options));
|
|
1150
|
-
};
|
|
1151
1174
|
const ctx = yield* effect.Effect.promise(
|
|
1152
1175
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1153
1176
|
);
|
|
1154
1177
|
const result = yield* effect.Effect.promise(
|
|
1155
|
-
() => Promise.resolve(
|
|
1156
|
-
evaluateFn({
|
|
1178
|
+
() => Promise.resolve().then(
|
|
1179
|
+
() => evaluateFn({
|
|
1157
1180
|
input: testCaseItem.testCase.getInput(),
|
|
1158
1181
|
ctx,
|
|
1159
1182
|
output,
|
|
@@ -1163,10 +1186,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1163
1186
|
datasetId: task.datasetId
|
|
1164
1187
|
},
|
|
1165
1188
|
logDiff,
|
|
1166
|
-
log
|
|
1189
|
+
log,
|
|
1190
|
+
createError
|
|
1167
1191
|
})
|
|
1168
1192
|
)
|
|
1169
1193
|
);
|
|
1194
|
+
if (result instanceof Error) {
|
|
1195
|
+
const evaluatorError = result;
|
|
1196
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1197
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1198
|
+
testCaseError = result.message;
|
|
1199
|
+
evaluatorScores.push({
|
|
1200
|
+
evaluatorId,
|
|
1201
|
+
scores: [],
|
|
1202
|
+
passed: false,
|
|
1203
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1204
|
+
});
|
|
1205
|
+
continue;
|
|
1206
|
+
}
|
|
1170
1207
|
const { scores, metrics } = normalizeResult(result);
|
|
1171
1208
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1172
1209
|
evaluatorScores.push({
|
|
@@ -1177,11 +1214,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1177
1214
|
logs: logs.length > 0 ? logs : void 0
|
|
1178
1215
|
});
|
|
1179
1216
|
} catch (error) {
|
|
1217
|
+
if (error instanceof Error) {
|
|
1218
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1219
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1220
|
+
}
|
|
1180
1221
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1181
1222
|
evaluatorScores.push({
|
|
1182
1223
|
evaluatorId,
|
|
1183
1224
|
scores: [],
|
|
1184
|
-
passed: false
|
|
1225
|
+
passed: false,
|
|
1226
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1185
1227
|
});
|
|
1186
1228
|
}
|
|
1187
1229
|
}
|
|
@@ -1252,6 +1294,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1252
1294
|
);
|
|
1253
1295
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1254
1296
|
const completedRef = yield* effect.Ref.make(0);
|
|
1297
|
+
const startedRef = yield* effect.Ref.make(0);
|
|
1255
1298
|
const passedRef = yield* effect.Ref.make(0);
|
|
1256
1299
|
const failedRef = yield* effect.Ref.make(0);
|
|
1257
1300
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -1261,6 +1304,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1261
1304
|
publishEvent,
|
|
1262
1305
|
persistenceQueue,
|
|
1263
1306
|
updateSnapshot,
|
|
1307
|
+
startedRef,
|
|
1264
1308
|
completedRef,
|
|
1265
1309
|
passedRef,
|
|
1266
1310
|
failedRef
|