@m4trix/evals 0.18.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +142 -42
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +142 -42
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +100 -30
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +100 -30
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +97 -28
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +21 -9
- package/dist/index.js +97 -28
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -986,6 +986,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
986
986
|
function formatLogMessage(msg) {
|
|
987
987
|
if (typeof msg === "string")
|
|
988
988
|
return msg;
|
|
989
|
+
if (msg instanceof Error)
|
|
990
|
+
return msg.stack ?? msg.message;
|
|
989
991
|
try {
|
|
990
992
|
if (msg !== null && typeof msg === "object") {
|
|
991
993
|
return JSON.stringify(msg, null, 2);
|
|
@@ -1039,7 +1041,11 @@ var Metric = {
|
|
|
1039
1041
|
name: config.name,
|
|
1040
1042
|
aggregate: config.aggregate,
|
|
1041
1043
|
format: config.format,
|
|
1042
|
-
make: (data) => ({
|
|
1044
|
+
make: (data, options) => ({
|
|
1045
|
+
id: config.id,
|
|
1046
|
+
data,
|
|
1047
|
+
...options?.name !== void 0 && { name: options.name }
|
|
1048
|
+
})
|
|
1043
1049
|
};
|
|
1044
1050
|
registry.set(config.id, def);
|
|
1045
1051
|
return def;
|
|
@@ -1061,25 +1067,61 @@ var ScoreAggregate = {
|
|
|
1061
1067
|
const count = values.length || 1;
|
|
1062
1068
|
const result = {};
|
|
1063
1069
|
for (const field of fields) {
|
|
1064
|
-
result[field] = values.reduce(
|
|
1070
|
+
result[field] = values.reduce(
|
|
1071
|
+
(s, v) => s + (v[field] ?? 0),
|
|
1072
|
+
0
|
|
1073
|
+
) / count;
|
|
1065
1074
|
}
|
|
1066
1075
|
return result;
|
|
1067
1076
|
};
|
|
1068
1077
|
},
|
|
1069
|
-
/** Average
|
|
1070
|
-
averageWithVariance(
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1078
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
1079
|
+
averageWithVariance(fields) {
|
|
1080
|
+
return (values) => {
|
|
1081
|
+
const count = values.length;
|
|
1082
|
+
const result = {};
|
|
1083
|
+
for (const field of fields) {
|
|
1084
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
1085
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
1086
|
+
0
|
|
1087
|
+
) / count;
|
|
1088
|
+
}
|
|
1089
|
+
const valueField = "value";
|
|
1090
|
+
const hasValueField = fields.includes(valueField);
|
|
1091
|
+
if (count === 0) {
|
|
1092
|
+
if (hasValueField) {
|
|
1093
|
+
result[valueField] = 0;
|
|
1094
|
+
}
|
|
1095
|
+
return {
|
|
1096
|
+
...result,
|
|
1097
|
+
stdDev: void 0,
|
|
1098
|
+
count: 0
|
|
1099
|
+
};
|
|
1100
|
+
}
|
|
1101
|
+
let stdDev;
|
|
1102
|
+
if (hasValueField && count >= 2) {
|
|
1103
|
+
const sum = values.reduce(
|
|
1104
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
1105
|
+
0
|
|
1106
|
+
);
|
|
1107
|
+
const sumSq = values.reduce(
|
|
1108
|
+
(s, v) => {
|
|
1109
|
+
const value = v[valueField] ?? 0;
|
|
1110
|
+
return s + value * value;
|
|
1111
|
+
},
|
|
1112
|
+
0
|
|
1113
|
+
);
|
|
1114
|
+
const mean = sum / count;
|
|
1115
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
1116
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1117
|
+
}
|
|
1118
|
+
return {
|
|
1119
|
+
...values[0],
|
|
1120
|
+
...result,
|
|
1121
|
+
stdDev,
|
|
1122
|
+
count
|
|
1123
|
+
};
|
|
1124
|
+
};
|
|
1083
1125
|
},
|
|
1084
1126
|
/** All runs must pass. Use for binary scores. */
|
|
1085
1127
|
all(values) {
|
|
@@ -1113,6 +1155,7 @@ var Score = {
|
|
|
1113
1155
|
id: config.id,
|
|
1114
1156
|
data,
|
|
1115
1157
|
...passed !== void 0 && { passed },
|
|
1158
|
+
...options?.name !== void 0 && { name: options.name },
|
|
1116
1159
|
def
|
|
1117
1160
|
// Attach def so rendering/aggregation works without registry lookup
|
|
1118
1161
|
};
|
|
@@ -1181,7 +1224,7 @@ Score.of({
|
|
|
1181
1224
|
displayStrategy: "bar",
|
|
1182
1225
|
formatValue: (data) => data.value.toFixed(2),
|
|
1183
1226
|
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
1184
|
-
aggregateValues: Score.aggregate.averageWithVariance
|
|
1227
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
1185
1228
|
});
|
|
1186
1229
|
Score.of({
|
|
1187
1230
|
id: "delta",
|
|
@@ -1247,6 +1290,7 @@ function toNumericScore(value) {
|
|
|
1247
1290
|
}
|
|
1248
1291
|
|
|
1249
1292
|
// src/runner/execution.ts
|
|
1293
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
1250
1294
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1251
1295
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1252
1296
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1303,20 +1347,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1303
1347
|
if (!evaluateFn) {
|
|
1304
1348
|
continue;
|
|
1305
1349
|
}
|
|
1350
|
+
const logs = [];
|
|
1351
|
+
const logDiff = (expected, actual, options) => {
|
|
1352
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1353
|
+
};
|
|
1354
|
+
const log = (message, options) => {
|
|
1355
|
+
logs.push(createLogEntry(message, options));
|
|
1356
|
+
};
|
|
1357
|
+
const createError = (message, options) => {
|
|
1358
|
+
const entry = createLogEntry(message, options);
|
|
1359
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1360
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1361
|
+
return error;
|
|
1362
|
+
};
|
|
1306
1363
|
try {
|
|
1307
|
-
const logs = [];
|
|
1308
|
-
const logDiff = (expected, actual, options) => {
|
|
1309
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1310
|
-
};
|
|
1311
|
-
const log = (message, options) => {
|
|
1312
|
-
logs.push(createLogEntry(message, options));
|
|
1313
|
-
};
|
|
1314
1364
|
const ctx = yield* Effect.promise(
|
|
1315
1365
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1316
1366
|
);
|
|
1317
1367
|
const result = yield* Effect.promise(
|
|
1318
|
-
() => Promise.resolve(
|
|
1319
|
-
evaluateFn({
|
|
1368
|
+
() => Promise.resolve().then(
|
|
1369
|
+
() => evaluateFn({
|
|
1320
1370
|
input: testCaseItem.testCase.getInput(),
|
|
1321
1371
|
ctx,
|
|
1322
1372
|
output,
|
|
@@ -1326,10 +1376,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1326
1376
|
datasetId: task.datasetId
|
|
1327
1377
|
},
|
|
1328
1378
|
logDiff,
|
|
1329
|
-
log
|
|
1379
|
+
log,
|
|
1380
|
+
createError
|
|
1330
1381
|
})
|
|
1331
1382
|
)
|
|
1332
1383
|
);
|
|
1384
|
+
if (result instanceof Error) {
|
|
1385
|
+
const evaluatorError = result;
|
|
1386
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1387
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1388
|
+
testCaseError = result.message;
|
|
1389
|
+
evaluatorScores.push({
|
|
1390
|
+
evaluatorId,
|
|
1391
|
+
scores: [],
|
|
1392
|
+
passed: false,
|
|
1393
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1394
|
+
});
|
|
1395
|
+
continue;
|
|
1396
|
+
}
|
|
1333
1397
|
const { scores, metrics } = normalizeResult(result);
|
|
1334
1398
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1335
1399
|
evaluatorScores.push({
|
|
@@ -1340,11 +1404,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1340
1404
|
logs: logs.length > 0 ? logs : void 0
|
|
1341
1405
|
});
|
|
1342
1406
|
} catch (error) {
|
|
1407
|
+
if (error instanceof Error) {
|
|
1408
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1409
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1410
|
+
}
|
|
1343
1411
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1344
1412
|
evaluatorScores.push({
|
|
1345
1413
|
evaluatorId,
|
|
1346
1414
|
scores: [],
|
|
1347
|
-
passed: false
|
|
1415
|
+
passed: false,
|
|
1416
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1348
1417
|
});
|
|
1349
1418
|
}
|
|
1350
1419
|
}
|
|
@@ -2337,9 +2406,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2337
2406
|
if (!def)
|
|
2338
2407
|
return null;
|
|
2339
2408
|
const formatted = def.format(m.data);
|
|
2409
|
+
const label = m.name ?? def.name;
|
|
2340
2410
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2341
2411
|
"[",
|
|
2342
|
-
|
|
2412
|
+
label ? `${label}: ` : "",
|
|
2343
2413
|
formatted,
|
|
2344
2414
|
"]",
|
|
2345
2415
|
" "
|
|
@@ -2352,7 +2422,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2352
2422
|
for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
|
|
2353
2423
|
const s = item.scores[sIdx];
|
|
2354
2424
|
const def = s.def ?? getScoreById(s.id);
|
|
2355
|
-
const scoreLabel =
|
|
2425
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2356
2426
|
rows.push(
|
|
2357
2427
|
/* @__PURE__ */ jsxs(
|
|
2358
2428
|
Text,
|