@m4trix/evals 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +91 -29
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +91 -29
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +61 -19
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +61 -19
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +58 -17
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +13 -8
- package/dist/index.js +58 -17
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.cjs
CHANGED
|
@@ -1065,7 +1065,11 @@ var Metric = {
|
|
|
1065
1065
|
name: config.name,
|
|
1066
1066
|
aggregate: config.aggregate,
|
|
1067
1067
|
format: config.format,
|
|
1068
|
-
make: (data) => ({
|
|
1068
|
+
make: (data, options) => ({
|
|
1069
|
+
id: config.id,
|
|
1070
|
+
data,
|
|
1071
|
+
...options?.name !== void 0 && { name: options.name }
|
|
1072
|
+
})
|
|
1069
1073
|
};
|
|
1070
1074
|
registry.set(config.id, def);
|
|
1071
1075
|
return def;
|
|
@@ -1087,25 +1091,61 @@ var ScoreAggregate = {
|
|
|
1087
1091
|
const count = values.length || 1;
|
|
1088
1092
|
const result = {};
|
|
1089
1093
|
for (const field of fields) {
|
|
1090
|
-
result[field] = values.reduce(
|
|
1094
|
+
result[field] = values.reduce(
|
|
1095
|
+
(s, v) => s + (v[field] ?? 0),
|
|
1096
|
+
0
|
|
1097
|
+
) / count;
|
|
1091
1098
|
}
|
|
1092
1099
|
return result;
|
|
1093
1100
|
};
|
|
1094
1101
|
},
|
|
1095
|
-
/** Average
|
|
1096
|
-
averageWithVariance(
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1102
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
1103
|
+
averageWithVariance(fields) {
|
|
1104
|
+
return (values) => {
|
|
1105
|
+
const count = values.length;
|
|
1106
|
+
const result = {};
|
|
1107
|
+
for (const field of fields) {
|
|
1108
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
1109
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
1110
|
+
0
|
|
1111
|
+
) / count;
|
|
1112
|
+
}
|
|
1113
|
+
const valueField = "value";
|
|
1114
|
+
const hasValueField = fields.includes(valueField);
|
|
1115
|
+
if (count === 0) {
|
|
1116
|
+
if (hasValueField) {
|
|
1117
|
+
result[valueField] = 0;
|
|
1118
|
+
}
|
|
1119
|
+
return {
|
|
1120
|
+
...result,
|
|
1121
|
+
stdDev: void 0,
|
|
1122
|
+
count: 0
|
|
1123
|
+
};
|
|
1124
|
+
}
|
|
1125
|
+
let stdDev;
|
|
1126
|
+
if (hasValueField && count >= 2) {
|
|
1127
|
+
const sum = values.reduce(
|
|
1128
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
1129
|
+
0
|
|
1130
|
+
);
|
|
1131
|
+
const sumSq = values.reduce(
|
|
1132
|
+
(s, v) => {
|
|
1133
|
+
const value = v[valueField] ?? 0;
|
|
1134
|
+
return s + value * value;
|
|
1135
|
+
},
|
|
1136
|
+
0
|
|
1137
|
+
);
|
|
1138
|
+
const mean = sum / count;
|
|
1139
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
1140
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1141
|
+
}
|
|
1142
|
+
return {
|
|
1143
|
+
...values[0],
|
|
1144
|
+
...result,
|
|
1145
|
+
stdDev,
|
|
1146
|
+
count
|
|
1147
|
+
};
|
|
1148
|
+
};
|
|
1109
1149
|
},
|
|
1110
1150
|
/** All runs must pass. Use for binary scores. */
|
|
1111
1151
|
all(values) {
|
|
@@ -1139,6 +1179,7 @@ var Score = {
|
|
|
1139
1179
|
id: config.id,
|
|
1140
1180
|
data,
|
|
1141
1181
|
...passed !== void 0 && { passed },
|
|
1182
|
+
...options?.name !== void 0 && { name: options.name },
|
|
1142
1183
|
def
|
|
1143
1184
|
// Attach def so rendering/aggregation works without registry lookup
|
|
1144
1185
|
};
|
|
@@ -1207,7 +1248,7 @@ Score.of({
|
|
|
1207
1248
|
displayStrategy: "bar",
|
|
1208
1249
|
formatValue: (data) => data.value.toFixed(2),
|
|
1209
1250
|
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
1210
|
-
aggregateValues: Score.aggregate.averageWithVariance
|
|
1251
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
1211
1252
|
});
|
|
1212
1253
|
Score.of({
|
|
1213
1254
|
id: "delta",
|
|
@@ -2363,9 +2404,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2363
2404
|
if (!def)
|
|
2364
2405
|
return null;
|
|
2365
2406
|
const formatted = def.format(m.data);
|
|
2407
|
+
const label = m.name ?? def.name;
|
|
2366
2408
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2367
2409
|
"[",
|
|
2368
|
-
|
|
2410
|
+
label ? `${label}: ` : "",
|
|
2369
2411
|
formatted,
|
|
2370
2412
|
"]",
|
|
2371
2413
|
" "
|
|
@@ -2378,7 +2420,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2378
2420
|
for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
|
|
2379
2421
|
const s = item.scores[sIdx];
|
|
2380
2422
|
const def = s.def ?? getScoreById(s.id);
|
|
2381
|
-
const scoreLabel =
|
|
2423
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2382
2424
|
rows.push(
|
|
2383
2425
|
/* @__PURE__ */ jsxRuntime.jsxs(
|
|
2384
2426
|
ink.Text,
|