@m4trix/evals 0.17.0 → 0.19.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +179 -88
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +179 -88
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +124 -50
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +124 -50
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +120 -45
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +42 -6
- package/dist/index.js +119 -46
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.cjs
CHANGED
|
@@ -1065,7 +1065,11 @@ var Metric = {
|
|
|
1065
1065
|
name: config.name,
|
|
1066
1066
|
aggregate: config.aggregate,
|
|
1067
1067
|
format: config.format,
|
|
1068
|
-
make: (data) => ({
|
|
1068
|
+
make: (data, options) => ({
|
|
1069
|
+
id: config.id,
|
|
1070
|
+
data,
|
|
1071
|
+
...options?.name !== void 0 && { name: options.name }
|
|
1072
|
+
})
|
|
1069
1073
|
};
|
|
1070
1074
|
registry.set(config.id, def);
|
|
1071
1075
|
return def;
|
|
@@ -1077,20 +1081,107 @@ function getMetricById(id) {
|
|
|
1077
1081
|
|
|
1078
1082
|
// src/evals/score.ts
|
|
1079
1083
|
var registry2 = /* @__PURE__ */ new Map();
|
|
1084
|
+
function formatScoreData(def, data, options) {
|
|
1085
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
1086
|
+
}
|
|
1087
|
+
var ScoreAggregate = {
|
|
1088
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
1089
|
+
averageFields(fields) {
|
|
1090
|
+
return (values) => {
|
|
1091
|
+
const count = values.length || 1;
|
|
1092
|
+
const result = {};
|
|
1093
|
+
for (const field of fields) {
|
|
1094
|
+
result[field] = values.reduce(
|
|
1095
|
+
(s, v) => s + (v[field] ?? 0),
|
|
1096
|
+
0
|
|
1097
|
+
) / count;
|
|
1098
|
+
}
|
|
1099
|
+
return result;
|
|
1100
|
+
};
|
|
1101
|
+
},
|
|
1102
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
1103
|
+
averageWithVariance(fields) {
|
|
1104
|
+
return (values) => {
|
|
1105
|
+
const count = values.length;
|
|
1106
|
+
const result = {};
|
|
1107
|
+
for (const field of fields) {
|
|
1108
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
1109
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
1110
|
+
0
|
|
1111
|
+
) / count;
|
|
1112
|
+
}
|
|
1113
|
+
const valueField = "value";
|
|
1114
|
+
const hasValueField = fields.includes(valueField);
|
|
1115
|
+
if (count === 0) {
|
|
1116
|
+
if (hasValueField) {
|
|
1117
|
+
result[valueField] = 0;
|
|
1118
|
+
}
|
|
1119
|
+
return {
|
|
1120
|
+
...result,
|
|
1121
|
+
stdDev: void 0,
|
|
1122
|
+
count: 0
|
|
1123
|
+
};
|
|
1124
|
+
}
|
|
1125
|
+
let stdDev;
|
|
1126
|
+
if (hasValueField && count >= 2) {
|
|
1127
|
+
const sum = values.reduce(
|
|
1128
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
1129
|
+
0
|
|
1130
|
+
);
|
|
1131
|
+
const sumSq = values.reduce(
|
|
1132
|
+
(s, v) => {
|
|
1133
|
+
const value = v[valueField] ?? 0;
|
|
1134
|
+
return s + value * value;
|
|
1135
|
+
},
|
|
1136
|
+
0
|
|
1137
|
+
);
|
|
1138
|
+
const mean = sum / count;
|
|
1139
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
1140
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1141
|
+
}
|
|
1142
|
+
return {
|
|
1143
|
+
...values[0],
|
|
1144
|
+
...result,
|
|
1145
|
+
stdDev,
|
|
1146
|
+
count
|
|
1147
|
+
};
|
|
1148
|
+
};
|
|
1149
|
+
},
|
|
1150
|
+
/** All runs must pass. Use for binary scores. */
|
|
1151
|
+
all(values) {
|
|
1152
|
+
const total = values.length;
|
|
1153
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
1154
|
+
return {
|
|
1155
|
+
...values[0],
|
|
1156
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
1157
|
+
passedCount,
|
|
1158
|
+
totalCount: total
|
|
1159
|
+
};
|
|
1160
|
+
},
|
|
1161
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
1162
|
+
last(values) {
|
|
1163
|
+
return values[values.length - 1] ?? {};
|
|
1164
|
+
}
|
|
1165
|
+
};
|
|
1080
1166
|
var Score = {
|
|
1167
|
+
aggregate: ScoreAggregate,
|
|
1081
1168
|
of(config) {
|
|
1082
1169
|
const def = {
|
|
1083
1170
|
id: config.id,
|
|
1084
1171
|
name: config.name,
|
|
1085
1172
|
displayStrategy: config.displayStrategy,
|
|
1086
|
-
|
|
1087
|
-
|
|
1173
|
+
formatValue: config.formatValue,
|
|
1174
|
+
formatAggregate: config.formatAggregate,
|
|
1175
|
+
aggregateValues: config.aggregateValues,
|
|
1088
1176
|
make: (data, options) => {
|
|
1089
1177
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
1090
1178
|
return {
|
|
1091
1179
|
id: config.id,
|
|
1092
1180
|
data,
|
|
1093
|
-
...passed !== void 0 && { passed }
|
|
1181
|
+
...passed !== void 0 && { passed },
|
|
1182
|
+
...options?.name !== void 0 && { name: options.name },
|
|
1183
|
+
def
|
|
1184
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
1094
1185
|
};
|
|
1095
1186
|
}
|
|
1096
1187
|
};
|
|
@@ -1103,29 +1194,6 @@ function getScoreById(id) {
|
|
|
1103
1194
|
}
|
|
1104
1195
|
|
|
1105
1196
|
// src/evals/aggregators.ts
|
|
1106
|
-
function aggregateAverageWithVariance(values) {
|
|
1107
|
-
if (values.length === 0) {
|
|
1108
|
-
return { value: 0, count: 0 };
|
|
1109
|
-
}
|
|
1110
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
1111
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
1112
|
-
const mean = sum / values.length;
|
|
1113
|
-
let stdDev;
|
|
1114
|
-
if (values.length >= 2) {
|
|
1115
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
1116
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1117
|
-
}
|
|
1118
|
-
return { value: mean, stdDev, count: values.length };
|
|
1119
|
-
}
|
|
1120
|
-
function aggregateAll(values) {
|
|
1121
|
-
const total = values.length;
|
|
1122
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
1123
|
-
return {
|
|
1124
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
1125
|
-
passedCount,
|
|
1126
|
-
totalCount: total
|
|
1127
|
-
};
|
|
1128
|
-
}
|
|
1129
1197
|
function aggregateTokenCountSum(values) {
|
|
1130
1198
|
const initial = {
|
|
1131
1199
|
input: 0,
|
|
@@ -1178,35 +1246,40 @@ Score.of({
|
|
|
1178
1246
|
id: "percent",
|
|
1179
1247
|
name: "Score",
|
|
1180
1248
|
displayStrategy: "bar",
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1249
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
1250
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
1251
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
1252
|
+
});
|
|
1253
|
+
Score.of({
|
|
1254
|
+
id: "delta",
|
|
1255
|
+
name: "Delta",
|
|
1256
|
+
displayStrategy: "number",
|
|
1257
|
+
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
1258
|
+
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
1259
|
+
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
1188
1260
|
});
|
|
1189
1261
|
Score.of({
|
|
1190
1262
|
id: "binary",
|
|
1191
1263
|
name: "Result",
|
|
1192
1264
|
displayStrategy: "passFail",
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
}
|
|
1199
|
-
return base;
|
|
1265
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
1266
|
+
formatAggregate: (data) => {
|
|
1267
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
1268
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
1269
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
1200
1270
|
}
|
|
1201
|
-
return
|
|
1271
|
+
return base;
|
|
1202
1272
|
},
|
|
1203
|
-
|
|
1273
|
+
aggregateValues: Score.aggregate.all
|
|
1204
1274
|
});
|
|
1205
1275
|
|
|
1206
1276
|
// src/runner/score-utils.ts
|
|
1277
|
+
function getScoreDef(item) {
|
|
1278
|
+
return item.def ?? getScoreById(item.id);
|
|
1279
|
+
}
|
|
1207
1280
|
function toNumericScoreFromScores(scores) {
|
|
1208
1281
|
for (const item of scores) {
|
|
1209
|
-
const def =
|
|
1282
|
+
const def = getScoreDef(item);
|
|
1210
1283
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
1211
1284
|
const value = item.data.value;
|
|
1212
1285
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -1624,7 +1697,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
|
|
|
1624
1697
|
() => appendJsonLine(message.artifactPath, {
|
|
1625
1698
|
runId: message.runId,
|
|
1626
1699
|
ts: Date.now(),
|
|
1627
|
-
...message.payload
|
|
1700
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
1628
1701
|
})
|
|
1629
1702
|
);
|
|
1630
1703
|
})
|
|
@@ -2166,12 +2239,12 @@ function scoreColor(score) {
|
|
|
2166
2239
|
return "red";
|
|
2167
2240
|
}
|
|
2168
2241
|
function formatScorePart(item) {
|
|
2169
|
-
const def = getScoreById(item.id);
|
|
2242
|
+
const def = item.def ?? getScoreById(item.id);
|
|
2170
2243
|
if (!def) {
|
|
2171
2244
|
const numeric = toNumericScore(item.data);
|
|
2172
2245
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
2173
2246
|
}
|
|
2174
|
-
const formatted = def
|
|
2247
|
+
const formatted = formatScoreData(def, item.data);
|
|
2175
2248
|
if (def.displayStrategy === "bar") {
|
|
2176
2249
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2177
2250
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -2331,9 +2404,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2331
2404
|
if (!def)
|
|
2332
2405
|
return null;
|
|
2333
2406
|
const formatted = def.format(m.data);
|
|
2407
|
+
const label = m.name ?? def.name;
|
|
2334
2408
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2335
2409
|
"[",
|
|
2336
|
-
|
|
2410
|
+
label ? `${label}: ` : "",
|
|
2337
2411
|
formatted,
|
|
2338
2412
|
"]",
|
|
2339
2413
|
" "
|
|
@@ -2345,8 +2419,8 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2345
2419
|
if (item.scores.length > 0) {
|
|
2346
2420
|
for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
|
|
2347
2421
|
const s = item.scores[sIdx];
|
|
2348
|
-
const def = getScoreById(s.id);
|
|
2349
|
-
const scoreLabel =
|
|
2422
|
+
const def = s.def ?? getScoreById(s.id);
|
|
2423
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2350
2424
|
rows.push(
|
|
2351
2425
|
/* @__PURE__ */ jsxRuntime.jsxs(
|
|
2352
2426
|
ink.Text,
|