@m4trix/evals 0.19.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +51 -13
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +51 -13
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +39 -11
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +39 -11
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +39 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +8 -1
- package/dist/index.js +39 -11
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -738,6 +738,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
738
738
|
function formatLogMessage(msg) {
|
|
739
739
|
if (typeof msg === "string")
|
|
740
740
|
return msg;
|
|
741
|
+
if (msg instanceof Error)
|
|
742
|
+
return msg.stack ?? msg.message;
|
|
741
743
|
try {
|
|
742
744
|
if (msg !== null && typeof msg === "object") {
|
|
743
745
|
return JSON.stringify(msg, null, 2);
|
|
@@ -1084,6 +1086,7 @@ function toNumericScore(value) {
|
|
|
1084
1086
|
}
|
|
1085
1087
|
|
|
1086
1088
|
// src/runner/execution.ts
|
|
1089
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
1087
1090
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1088
1091
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1089
1092
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1140,20 +1143,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1140
1143
|
if (!evaluateFn) {
|
|
1141
1144
|
continue;
|
|
1142
1145
|
}
|
|
1146
|
+
const logs = [];
|
|
1147
|
+
const logDiff = (expected, actual, options) => {
|
|
1148
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1149
|
+
};
|
|
1150
|
+
const log = (message, options) => {
|
|
1151
|
+
logs.push(createLogEntry(message, options));
|
|
1152
|
+
};
|
|
1153
|
+
const createError = (message, options) => {
|
|
1154
|
+
const entry = createLogEntry(message, options);
|
|
1155
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1156
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1157
|
+
return error;
|
|
1158
|
+
};
|
|
1143
1159
|
try {
|
|
1144
|
-
const logs = [];
|
|
1145
|
-
const logDiff = (expected, actual, options) => {
|
|
1146
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1147
|
-
};
|
|
1148
|
-
const log = (message, options) => {
|
|
1149
|
-
logs.push(createLogEntry(message, options));
|
|
1150
|
-
};
|
|
1151
1160
|
const ctx = yield* effect.Effect.promise(
|
|
1152
1161
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1153
1162
|
);
|
|
1154
1163
|
const result = yield* effect.Effect.promise(
|
|
1155
|
-
() => Promise.resolve(
|
|
1156
|
-
evaluateFn({
|
|
1164
|
+
() => Promise.resolve().then(
|
|
1165
|
+
() => evaluateFn({
|
|
1157
1166
|
input: testCaseItem.testCase.getInput(),
|
|
1158
1167
|
ctx,
|
|
1159
1168
|
output,
|
|
@@ -1163,10 +1172,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1163
1172
|
datasetId: task.datasetId
|
|
1164
1173
|
},
|
|
1165
1174
|
logDiff,
|
|
1166
|
-
log
|
|
1175
|
+
log,
|
|
1176
|
+
createError
|
|
1167
1177
|
})
|
|
1168
1178
|
)
|
|
1169
1179
|
);
|
|
1180
|
+
if (result instanceof Error) {
|
|
1181
|
+
const evaluatorError = result;
|
|
1182
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1183
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1184
|
+
testCaseError = result.message;
|
|
1185
|
+
evaluatorScores.push({
|
|
1186
|
+
evaluatorId,
|
|
1187
|
+
scores: [],
|
|
1188
|
+
passed: false,
|
|
1189
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1190
|
+
});
|
|
1191
|
+
continue;
|
|
1192
|
+
}
|
|
1170
1193
|
const { scores, metrics } = normalizeResult(result);
|
|
1171
1194
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1172
1195
|
evaluatorScores.push({
|
|
@@ -1177,11 +1200,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1177
1200
|
logs: logs.length > 0 ? logs : void 0
|
|
1178
1201
|
});
|
|
1179
1202
|
} catch (error) {
|
|
1203
|
+
if (error instanceof Error) {
|
|
1204
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1205
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1206
|
+
}
|
|
1180
1207
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1181
1208
|
evaluatorScores.push({
|
|
1182
1209
|
evaluatorId,
|
|
1183
1210
|
scores: [],
|
|
1184
|
-
passed: false
|
|
1211
|
+
passed: false,
|
|
1212
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1185
1213
|
});
|
|
1186
1214
|
}
|
|
1187
1215
|
}
|