@m4trix/evals 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +91 -29
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +91 -29
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +61 -19
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +61 -19
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +58 -17
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +13 -8
- package/dist/index.js +58 -17
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -1039,7 +1039,11 @@ var Metric = {
|
|
|
1039
1039
|
name: config.name,
|
|
1040
1040
|
aggregate: config.aggregate,
|
|
1041
1041
|
format: config.format,
|
|
1042
|
-
make: (data) => ({
|
|
1042
|
+
make: (data, options) => ({
|
|
1043
|
+
id: config.id,
|
|
1044
|
+
data,
|
|
1045
|
+
...options?.name !== void 0 && { name: options.name }
|
|
1046
|
+
})
|
|
1043
1047
|
};
|
|
1044
1048
|
registry.set(config.id, def);
|
|
1045
1049
|
return def;
|
|
@@ -1061,25 +1065,61 @@ var ScoreAggregate = {
|
|
|
1061
1065
|
const count = values.length || 1;
|
|
1062
1066
|
const result = {};
|
|
1063
1067
|
for (const field of fields) {
|
|
1064
|
-
result[field] = values.reduce(
|
|
1068
|
+
result[field] = values.reduce(
|
|
1069
|
+
(s, v) => s + (v[field] ?? 0),
|
|
1070
|
+
0
|
|
1071
|
+
) / count;
|
|
1065
1072
|
}
|
|
1066
1073
|
return result;
|
|
1067
1074
|
};
|
|
1068
1075
|
},
|
|
1069
|
-
/** Average
|
|
1070
|
-
averageWithVariance(
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1076
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
1077
|
+
averageWithVariance(fields) {
|
|
1078
|
+
return (values) => {
|
|
1079
|
+
const count = values.length;
|
|
1080
|
+
const result = {};
|
|
1081
|
+
for (const field of fields) {
|
|
1082
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
1083
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
1084
|
+
0
|
|
1085
|
+
) / count;
|
|
1086
|
+
}
|
|
1087
|
+
const valueField = "value";
|
|
1088
|
+
const hasValueField = fields.includes(valueField);
|
|
1089
|
+
if (count === 0) {
|
|
1090
|
+
if (hasValueField) {
|
|
1091
|
+
result[valueField] = 0;
|
|
1092
|
+
}
|
|
1093
|
+
return {
|
|
1094
|
+
...result,
|
|
1095
|
+
stdDev: void 0,
|
|
1096
|
+
count: 0
|
|
1097
|
+
};
|
|
1098
|
+
}
|
|
1099
|
+
let stdDev;
|
|
1100
|
+
if (hasValueField && count >= 2) {
|
|
1101
|
+
const sum = values.reduce(
|
|
1102
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
1103
|
+
0
|
|
1104
|
+
);
|
|
1105
|
+
const sumSq = values.reduce(
|
|
1106
|
+
(s, v) => {
|
|
1107
|
+
const value = v[valueField] ?? 0;
|
|
1108
|
+
return s + value * value;
|
|
1109
|
+
},
|
|
1110
|
+
0
|
|
1111
|
+
);
|
|
1112
|
+
const mean = sum / count;
|
|
1113
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
1114
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1115
|
+
}
|
|
1116
|
+
return {
|
|
1117
|
+
...values[0],
|
|
1118
|
+
...result,
|
|
1119
|
+
stdDev,
|
|
1120
|
+
count
|
|
1121
|
+
};
|
|
1122
|
+
};
|
|
1083
1123
|
},
|
|
1084
1124
|
/** All runs must pass. Use for binary scores. */
|
|
1085
1125
|
all(values) {
|
|
@@ -1113,6 +1153,7 @@ var Score = {
|
|
|
1113
1153
|
id: config.id,
|
|
1114
1154
|
data,
|
|
1115
1155
|
...passed !== void 0 && { passed },
|
|
1156
|
+
...options?.name !== void 0 && { name: options.name },
|
|
1116
1157
|
def
|
|
1117
1158
|
// Attach def so rendering/aggregation works without registry lookup
|
|
1118
1159
|
};
|
|
@@ -1181,7 +1222,7 @@ Score.of({
|
|
|
1181
1222
|
displayStrategy: "bar",
|
|
1182
1223
|
formatValue: (data) => data.value.toFixed(2),
|
|
1183
1224
|
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
1184
|
-
aggregateValues: Score.aggregate.averageWithVariance
|
|
1225
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
1185
1226
|
});
|
|
1186
1227
|
Score.of({
|
|
1187
1228
|
id: "delta",
|
|
@@ -2337,9 +2378,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2337
2378
|
if (!def)
|
|
2338
2379
|
return null;
|
|
2339
2380
|
const formatted = def.format(m.data);
|
|
2381
|
+
const label = m.name ?? def.name;
|
|
2340
2382
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2341
2383
|
"[",
|
|
2342
|
-
|
|
2384
|
+
label ? `${label}: ` : "",
|
|
2343
2385
|
formatted,
|
|
2344
2386
|
"]",
|
|
2345
2387
|
" "
|
|
@@ -2352,7 +2394,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2352
2394
|
for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
|
|
2353
2395
|
const s = item.scores[sIdx];
|
|
2354
2396
|
const def = s.def ?? getScoreById(s.id);
|
|
2355
|
-
const scoreLabel =
|
|
2397
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2356
2398
|
rows.push(
|
|
2357
2399
|
/* @__PURE__ */ jsxs(
|
|
2358
2400
|
Text,
|