@m4trix/evals 0.18.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +142 -42
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +142 -42
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +100 -30
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +100 -30
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +97 -28
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +21 -9
- package/dist/index.js +97 -28
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.cjs
CHANGED
|
@@ -1012,6 +1012,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
1012
1012
|
function formatLogMessage(msg) {
|
|
1013
1013
|
if (typeof msg === "string")
|
|
1014
1014
|
return msg;
|
|
1015
|
+
if (msg instanceof Error)
|
|
1016
|
+
return msg.stack ?? msg.message;
|
|
1015
1017
|
try {
|
|
1016
1018
|
if (msg !== null && typeof msg === "object") {
|
|
1017
1019
|
return JSON.stringify(msg, null, 2);
|
|
@@ -1065,7 +1067,11 @@ var Metric = {
|
|
|
1065
1067
|
name: config.name,
|
|
1066
1068
|
aggregate: config.aggregate,
|
|
1067
1069
|
format: config.format,
|
|
1068
|
-
make: (data) => ({
|
|
1070
|
+
make: (data, options) => ({
|
|
1071
|
+
id: config.id,
|
|
1072
|
+
data,
|
|
1073
|
+
...options?.name !== void 0 && { name: options.name }
|
|
1074
|
+
})
|
|
1069
1075
|
};
|
|
1070
1076
|
registry.set(config.id, def);
|
|
1071
1077
|
return def;
|
|
@@ -1087,25 +1093,61 @@ var ScoreAggregate = {
|
|
|
1087
1093
|
const count = values.length || 1;
|
|
1088
1094
|
const result = {};
|
|
1089
1095
|
for (const field of fields) {
|
|
1090
|
-
result[field] = values.reduce(
|
|
1096
|
+
result[field] = values.reduce(
|
|
1097
|
+
(s, v) => s + (v[field] ?? 0),
|
|
1098
|
+
0
|
|
1099
|
+
) / count;
|
|
1091
1100
|
}
|
|
1092
1101
|
return result;
|
|
1093
1102
|
};
|
|
1094
1103
|
},
|
|
1095
|
-
/** Average
|
|
1096
|
-
averageWithVariance(
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1104
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
1105
|
+
averageWithVariance(fields) {
|
|
1106
|
+
return (values) => {
|
|
1107
|
+
const count = values.length;
|
|
1108
|
+
const result = {};
|
|
1109
|
+
for (const field of fields) {
|
|
1110
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
1111
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
1112
|
+
0
|
|
1113
|
+
) / count;
|
|
1114
|
+
}
|
|
1115
|
+
const valueField = "value";
|
|
1116
|
+
const hasValueField = fields.includes(valueField);
|
|
1117
|
+
if (count === 0) {
|
|
1118
|
+
if (hasValueField) {
|
|
1119
|
+
result[valueField] = 0;
|
|
1120
|
+
}
|
|
1121
|
+
return {
|
|
1122
|
+
...result,
|
|
1123
|
+
stdDev: void 0,
|
|
1124
|
+
count: 0
|
|
1125
|
+
};
|
|
1126
|
+
}
|
|
1127
|
+
let stdDev;
|
|
1128
|
+
if (hasValueField && count >= 2) {
|
|
1129
|
+
const sum = values.reduce(
|
|
1130
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
1131
|
+
0
|
|
1132
|
+
);
|
|
1133
|
+
const sumSq = values.reduce(
|
|
1134
|
+
(s, v) => {
|
|
1135
|
+
const value = v[valueField] ?? 0;
|
|
1136
|
+
return s + value * value;
|
|
1137
|
+
},
|
|
1138
|
+
0
|
|
1139
|
+
);
|
|
1140
|
+
const mean = sum / count;
|
|
1141
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
1142
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1143
|
+
}
|
|
1144
|
+
return {
|
|
1145
|
+
...values[0],
|
|
1146
|
+
...result,
|
|
1147
|
+
stdDev,
|
|
1148
|
+
count
|
|
1149
|
+
};
|
|
1150
|
+
};
|
|
1109
1151
|
},
|
|
1110
1152
|
/** All runs must pass. Use for binary scores. */
|
|
1111
1153
|
all(values) {
|
|
@@ -1139,6 +1181,7 @@ var Score = {
|
|
|
1139
1181
|
id: config.id,
|
|
1140
1182
|
data,
|
|
1141
1183
|
...passed !== void 0 && { passed },
|
|
1184
|
+
...options?.name !== void 0 && { name: options.name },
|
|
1142
1185
|
def
|
|
1143
1186
|
// Attach def so rendering/aggregation works without registry lookup
|
|
1144
1187
|
};
|
|
@@ -1207,7 +1250,7 @@ Score.of({
|
|
|
1207
1250
|
displayStrategy: "bar",
|
|
1208
1251
|
formatValue: (data) => data.value.toFixed(2),
|
|
1209
1252
|
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
1210
|
-
aggregateValues: Score.aggregate.averageWithVariance
|
|
1253
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
1211
1254
|
});
|
|
1212
1255
|
Score.of({
|
|
1213
1256
|
id: "delta",
|
|
@@ -1273,6 +1316,7 @@ function toNumericScore(value) {
|
|
|
1273
1316
|
}
|
|
1274
1317
|
|
|
1275
1318
|
// src/runner/execution.ts
|
|
1319
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
1276
1320
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1277
1321
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1278
1322
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1329,20 +1373,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1329
1373
|
if (!evaluateFn) {
|
|
1330
1374
|
continue;
|
|
1331
1375
|
}
|
|
1376
|
+
const logs = [];
|
|
1377
|
+
const logDiff = (expected, actual, options) => {
|
|
1378
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1379
|
+
};
|
|
1380
|
+
const log = (message, options) => {
|
|
1381
|
+
logs.push(createLogEntry(message, options));
|
|
1382
|
+
};
|
|
1383
|
+
const createError = (message, options) => {
|
|
1384
|
+
const entry = createLogEntry(message, options);
|
|
1385
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1386
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1387
|
+
return error;
|
|
1388
|
+
};
|
|
1332
1389
|
try {
|
|
1333
|
-
const logs = [];
|
|
1334
|
-
const logDiff = (expected, actual, options) => {
|
|
1335
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1336
|
-
};
|
|
1337
|
-
const log = (message, options) => {
|
|
1338
|
-
logs.push(createLogEntry(message, options));
|
|
1339
|
-
};
|
|
1340
1390
|
const ctx = yield* effect.Effect.promise(
|
|
1341
1391
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1342
1392
|
);
|
|
1343
1393
|
const result = yield* effect.Effect.promise(
|
|
1344
|
-
() => Promise.resolve(
|
|
1345
|
-
evaluateFn({
|
|
1394
|
+
() => Promise.resolve().then(
|
|
1395
|
+
() => evaluateFn({
|
|
1346
1396
|
input: testCaseItem.testCase.getInput(),
|
|
1347
1397
|
ctx,
|
|
1348
1398
|
output,
|
|
@@ -1352,10 +1402,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1352
1402
|
datasetId: task.datasetId
|
|
1353
1403
|
},
|
|
1354
1404
|
logDiff,
|
|
1355
|
-
log
|
|
1405
|
+
log,
|
|
1406
|
+
createError
|
|
1356
1407
|
})
|
|
1357
1408
|
)
|
|
1358
1409
|
);
|
|
1410
|
+
if (result instanceof Error) {
|
|
1411
|
+
const evaluatorError = result;
|
|
1412
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1413
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1414
|
+
testCaseError = result.message;
|
|
1415
|
+
evaluatorScores.push({
|
|
1416
|
+
evaluatorId,
|
|
1417
|
+
scores: [],
|
|
1418
|
+
passed: false,
|
|
1419
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1420
|
+
});
|
|
1421
|
+
continue;
|
|
1422
|
+
}
|
|
1359
1423
|
const { scores, metrics } = normalizeResult(result);
|
|
1360
1424
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1361
1425
|
evaluatorScores.push({
|
|
@@ -1366,11 +1430,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1366
1430
|
logs: logs.length > 0 ? logs : void 0
|
|
1367
1431
|
});
|
|
1368
1432
|
} catch (error) {
|
|
1433
|
+
if (error instanceof Error) {
|
|
1434
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1435
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1436
|
+
}
|
|
1369
1437
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1370
1438
|
evaluatorScores.push({
|
|
1371
1439
|
evaluatorId,
|
|
1372
1440
|
scores: [],
|
|
1373
|
-
passed: false
|
|
1441
|
+
passed: false,
|
|
1442
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1374
1443
|
});
|
|
1375
1444
|
}
|
|
1376
1445
|
}
|
|
@@ -2363,9 +2432,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2363
2432
|
if (!def)
|
|
2364
2433
|
return null;
|
|
2365
2434
|
const formatted = def.format(m.data);
|
|
2435
|
+
const label = m.name ?? def.name;
|
|
2366
2436
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2367
2437
|
"[",
|
|
2368
|
-
|
|
2438
|
+
label ? `${label}: ` : "",
|
|
2369
2439
|
formatted,
|
|
2370
2440
|
"]",
|
|
2371
2441
|
" "
|
|
@@ -2378,7 +2448,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2378
2448
|
for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
|
|
2379
2449
|
const s = item.scores[sIdx];
|
|
2380
2450
|
const def = s.def ?? getScoreById(s.id);
|
|
2381
|
-
const scoreLabel =
|
|
2451
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2382
2452
|
rows.push(
|
|
2383
2453
|
/* @__PURE__ */ jsxRuntime.jsxs(
|
|
2384
2454
|
ink.Text,
|