@m4trix/evals 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +179 -88
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +179 -88
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +124 -50
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +124 -50
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +120 -45
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +42 -6
- package/dist/index.js +119 -46
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -1039,7 +1039,11 @@ var Metric = {
|
|
|
1039
1039
|
name: config.name,
|
|
1040
1040
|
aggregate: config.aggregate,
|
|
1041
1041
|
format: config.format,
|
|
1042
|
-
make: (data) => ({
|
|
1042
|
+
make: (data, options) => ({
|
|
1043
|
+
id: config.id,
|
|
1044
|
+
data,
|
|
1045
|
+
...options?.name !== void 0 && { name: options.name }
|
|
1046
|
+
})
|
|
1043
1047
|
};
|
|
1044
1048
|
registry.set(config.id, def);
|
|
1045
1049
|
return def;
|
|
@@ -1051,20 +1055,107 @@ function getMetricById(id) {
|
|
|
1051
1055
|
|
|
1052
1056
|
// src/evals/score.ts
|
|
1053
1057
|
var registry2 = /* @__PURE__ */ new Map();
|
|
1058
|
+
/**
 * Render a score payload as text using the formatters carried by its definition.
 *
 * @param {object} def - Score definition exposing `formatValue` and `formatAggregate`.
 * @param {*} data - Score payload handed unchanged to the selected formatter.
 * @param {{ isAggregated?: boolean }} [options] - When `isAggregated` is truthy the
 *   aggregate formatter is used; otherwise the single-value formatter is used.
 * @returns {*} Whatever the selected formatter returns.
 */
function formatScoreData(def, data, options) {
  const wantsAggregate = options?.isAggregated;
  if (wantsAggregate) {
    return def.formatAggregate(data);
  }
  return def.formatValue(data);
}
|
|
1061
|
+
/**
 * Ready-made aggregation strategies for combining the score payloads of several runs
 * into a single payload.
 */
var ScoreAggregate = {
  /**
   * Build an aggregator that averages the listed numeric fields.
   * Use for scores like { value, delta }.
   *
   * @param {string[]} fields - Names of the fields to average.
   * @returns {(values: object[]) => object} Aggregator returning an object holding
   *   only the averaged fields. A missing (nullish) field counts as 0, and an empty
   *   input yields 0 for every field.
   */
  averageFields(fields) {
    return (values) => {
      // Divide by 1 for an empty input so the result is 0 rather than NaN.
      const count = values.length || 1;
      const result = {};
      for (const field of fields) {
        result[field] = values.reduce(
          (s, v) => s + (v[field] ?? 0),
          0
        ) / count;
      }
      return result;
    };
  },
  /**
   * Build an aggregator that averages the listed numeric fields and additionally
   * reports the sample standard deviation (n - 1 denominator) of the `value` field.
   *
   * @param {string[]} fields - Names of the fields to average. `stdDev` is only
   *   computed when this list contains "value".
   * @returns {(values: object[]) => object} Aggregator returning the first input
   *   item overlaid with the averaged fields, plus `stdDev` (undefined when fewer
   *   than two items are given or "value" is not averaged) and `count`.
   *   An empty input yields 0 for every field, `stdDev: undefined` and `count: 0`.
   */
  averageWithVariance(fields) {
    return (values) => {
      const count = values.length;
      const result = {};
      for (const field of fields) {
        result[field] = count === 0 ? 0 : values.reduce(
          (sum, item) => sum + (item[field] ?? 0),
          0
        ) / count;
      }
      if (count === 0) {
        // Every requested field was already zeroed by the loop above.
        return {
          ...result,
          stdDev: void 0,
          count: 0
        };
      }
      const valueField = "value";
      let stdDev;
      if (fields.includes(valueField) && count >= 2) {
        const mean = result[valueField];
        // Two-pass variance: summing squared deviations from the mean avoids the
        // catastrophic cancellation of the (sumSq - n * mean^2) shortcut when the
        // values are large compared with their spread.
        const squaredDeviations = values.reduce(
          (s, v) => {
            const deviation = (v[valueField] ?? 0) - mean;
            return s + deviation * deviation;
          },
          0
        );
        const variance = squaredDeviations / (count - 1);
        stdDev = variance > 0 ? Math.sqrt(variance) : 0;
      }
      return {
        ...values[0],
        ...result,
        stdDev,
        count
      };
    };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param {object[]} values - Per-run payloads carrying a `passed` flag.
   * @returns {object} The first payload overlaid with `passed` (true only when
   *   there is at least one payload and every payload passed), `passedCount`
   *   and `totalCount`.
   */
  all(values) {
    const total = values.length;
    const passedCount = values.filter((v) => v.passed).length;
    return {
      ...values[0],
      passed: total > 0 && values.every((v) => v.passed),
      passedCount,
      totalCount: total
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param {object[]} values - Per-run payloads.
   * @returns {object} The final payload, or an empty object when it is nullish
   *   or the list is empty.
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
|
|
1054
1140
|
var Score = {
|
|
1141
|
+
aggregate: ScoreAggregate,
|
|
1055
1142
|
of(config) {
|
|
1056
1143
|
const def = {
|
|
1057
1144
|
id: config.id,
|
|
1058
1145
|
name: config.name,
|
|
1059
1146
|
displayStrategy: config.displayStrategy,
|
|
1060
|
-
|
|
1061
|
-
|
|
1147
|
+
formatValue: config.formatValue,
|
|
1148
|
+
formatAggregate: config.formatAggregate,
|
|
1149
|
+
aggregateValues: config.aggregateValues,
|
|
1062
1150
|
make: (data, options) => {
|
|
1063
1151
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
1064
1152
|
return {
|
|
1065
1153
|
id: config.id,
|
|
1066
1154
|
data,
|
|
1067
|
-
...passed !== void 0 && { passed }
|
|
1155
|
+
...passed !== void 0 && { passed },
|
|
1156
|
+
...options?.name !== void 0 && { name: options.name },
|
|
1157
|
+
def
|
|
1158
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
1068
1159
|
};
|
|
1069
1160
|
}
|
|
1070
1161
|
};
|
|
@@ -1077,29 +1168,6 @@ function getScoreById(id) {
|
|
|
1077
1168
|
}
|
|
1078
1169
|
|
|
1079
1170
|
// src/evals/aggregators.ts
|
|
1080
|
-
function aggregateAverageWithVariance(values) {
|
|
1081
|
-
if (values.length === 0) {
|
|
1082
|
-
return { value: 0, count: 0 };
|
|
1083
|
-
}
|
|
1084
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
1085
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
1086
|
-
const mean = sum / values.length;
|
|
1087
|
-
let stdDev;
|
|
1088
|
-
if (values.length >= 2) {
|
|
1089
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
1090
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1091
|
-
}
|
|
1092
|
-
return { value: mean, stdDev, count: values.length };
|
|
1093
|
-
}
|
|
1094
|
-
function aggregateAll(values) {
|
|
1095
|
-
const total = values.length;
|
|
1096
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
1097
|
-
return {
|
|
1098
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
1099
|
-
passedCount,
|
|
1100
|
-
totalCount: total
|
|
1101
|
-
};
|
|
1102
|
-
}
|
|
1103
1171
|
function aggregateTokenCountSum(values) {
|
|
1104
1172
|
const initial = {
|
|
1105
1173
|
input: 0,
|
|
@@ -1152,35 +1220,40 @@ Score.of({
|
|
|
1152
1220
|
id: "percent",
|
|
1153
1221
|
name: "Score",
|
|
1154
1222
|
displayStrategy: "bar",
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1223
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
1224
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
1225
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
1226
|
+
});
|
|
1227
|
+
// Register the "delta" score: a numeric value shown together with its signed
// difference, averaged field-by-field across runs.
Score.of({
  id: "delta",
  name: "Delta",
  displayStrategy: "number",
  // Single run: value followed by the signed delta.
  formatValue: (data) => {
    const shownValue = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    return `${shownValue} (${sign}${data.delta.toFixed(2)} vs baseline)`;
  },
  // Aggregated runs: averaged value followed by the signed averaged delta.
  formatAggregate: (data) => {
    const shownValue = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    return `Avg: ${shownValue} (Delta: ${sign}${data.delta.toFixed(2)})`;
  },
  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
});
|
|
1163
1235
|
Score.of({
|
|
1164
1236
|
id: "binary",
|
|
1165
1237
|
name: "Result",
|
|
1166
1238
|
displayStrategy: "passFail",
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
}
|
|
1173
|
-
return base;
|
|
1239
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
1240
|
+
formatAggregate: (data) => {
|
|
1241
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
1242
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
1243
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
1174
1244
|
}
|
|
1175
|
-
return
|
|
1245
|
+
return base;
|
|
1176
1246
|
},
|
|
1177
|
-
|
|
1247
|
+
aggregateValues: Score.aggregate.all
|
|
1178
1248
|
});
|
|
1179
1249
|
|
|
1180
1250
|
// src/runner/score-utils.ts
|
|
1251
|
+
/**
 * Resolve the score definition for a score item.
 *
 * @param {{ id: *, def?: object }} item - Score item, optionally carrying its own `def`.
 * @returns {*} The definition attached to the item when it is neither null nor
 *   undefined; otherwise the result of looking the item's id up via `getScoreById`.
 */
function getScoreDef(item) {
  const attached = item.def;
  if (attached !== null && attached !== void 0) {
    return attached;
  }
  return getScoreById(item.id);
}
|
|
1181
1254
|
function toNumericScoreFromScores(scores) {
|
|
1182
1255
|
for (const item of scores) {
|
|
1183
|
-
const def =
|
|
1256
|
+
const def = getScoreDef(item);
|
|
1184
1257
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
1185
1258
|
const value = item.data.value;
|
|
1186
1259
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -1598,7 +1671,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
|
|
|
1598
1671
|
() => appendJsonLine(message.artifactPath, {
|
|
1599
1672
|
runId: message.runId,
|
|
1600
1673
|
ts: Date.now(),
|
|
1601
|
-
...message.payload
|
|
1674
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
1602
1675
|
})
|
|
1603
1676
|
);
|
|
1604
1677
|
})
|
|
@@ -2140,12 +2213,12 @@ function scoreColor(score) {
|
|
|
2140
2213
|
return "red";
|
|
2141
2214
|
}
|
|
2142
2215
|
function formatScorePart(item) {
|
|
2143
|
-
const def = getScoreById(item.id);
|
|
2216
|
+
const def = item.def ?? getScoreById(item.id);
|
|
2144
2217
|
if (!def) {
|
|
2145
2218
|
const numeric = toNumericScore(item.data);
|
|
2146
2219
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
2147
2220
|
}
|
|
2148
|
-
const formatted = def
|
|
2221
|
+
const formatted = formatScoreData(def, item.data);
|
|
2149
2222
|
if (def.displayStrategy === "bar") {
|
|
2150
2223
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2151
2224
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -2305,9 +2378,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2305
2378
|
if (!def)
|
|
2306
2379
|
return null;
|
|
2307
2380
|
const formatted = def.format(m.data);
|
|
2381
|
+
const label = m.name ?? def.name;
|
|
2308
2382
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2309
2383
|
"[",
|
|
2310
|
-
|
|
2384
|
+
label ? `${label}: ` : "",
|
|
2311
2385
|
formatted,
|
|
2312
2386
|
"]",
|
|
2313
2387
|
" "
|
|
@@ -2319,8 +2393,8 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2319
2393
|
if (item.scores.length > 0) {
|
|
2320
2394
|
for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
|
|
2321
2395
|
const s = item.scores[sIdx];
|
|
2322
|
-
const def = getScoreById(s.id);
|
|
2323
|
-
const scoreLabel =
|
|
2396
|
+
const def = s.def ?? getScoreById(s.id);
|
|
2397
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2324
2398
|
rows.push(
|
|
2325
2399
|
/* @__PURE__ */ jsxs(
|
|
2326
2400
|
Text,
|