@m4trix/evals 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1039,7 +1039,11 @@ var Metric = {
1039
1039
  name: config.name,
1040
1040
  aggregate: config.aggregate,
1041
1041
  format: config.format,
1042
- make: (data) => ({ id: config.id, data })
1042
+ make: (data, options) => ({
1043
+ id: config.id,
1044
+ data,
1045
+ ...options?.name !== void 0 && { name: options.name }
1046
+ })
1043
1047
  };
1044
1048
  registry.set(config.id, def);
1045
1049
  return def;
@@ -1061,25 +1065,61 @@ var ScoreAggregate = {
1061
1065
  const count = values.length || 1;
1062
1066
  const result = {};
1063
1067
  for (const field of fields) {
1064
- result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
1068
+ result[field] = values.reduce(
1069
+ (s, v) => s + (v[field] ?? 0),
1070
+ 0
1071
+ ) / count;
1065
1072
  }
1066
1073
  return result;
1067
1074
  };
1068
1075
  },
1069
- /** Average `value` with sample std dev. Use for percent-style scores. */
1070
- averageWithVariance(values) {
1071
- if (values.length === 0) {
1072
- return { value: 0, stdDev: void 0, count: 0 };
1073
- }
1074
- const sum = values.reduce((s, v) => s + v.value, 0);
1075
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1076
- const mean = sum / values.length;
1077
- let stdDev;
1078
- if (values.length >= 2) {
1079
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1080
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1081
- }
1082
- return { ...values[0], value: mean, stdDev, count: values.length };
1076
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
1077
+ averageWithVariance(fields) {
1078
+ return (values) => {
1079
+ const count = values.length;
1080
+ const result = {};
1081
+ for (const field of fields) {
1082
+ result[field] = count === 0 ? 0 : values.reduce(
1083
+ (sum, item) => sum + (item[field] ?? 0),
1084
+ 0
1085
+ ) / count;
1086
+ }
1087
+ const valueField = "value";
1088
+ const hasValueField = fields.includes(valueField);
1089
+ if (count === 0) {
1090
+ if (hasValueField) {
1091
+ result[valueField] = 0;
1092
+ }
1093
+ return {
1094
+ ...result,
1095
+ stdDev: void 0,
1096
+ count: 0
1097
+ };
1098
+ }
1099
+ let stdDev;
1100
+ if (hasValueField && count >= 2) {
1101
+ const sum = values.reduce(
1102
+ (s, v) => s + (v[valueField] ?? 0),
1103
+ 0
1104
+ );
1105
+ const sumSq = values.reduce(
1106
+ (s, v) => {
1107
+ const value = v[valueField] ?? 0;
1108
+ return s + value * value;
1109
+ },
1110
+ 0
1111
+ );
1112
+ const mean = sum / count;
1113
+ const variance = (sumSq - count * mean * mean) / (count - 1);
1114
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1115
+ }
1116
+ return {
1117
+ ...values[0],
1118
+ ...result,
1119
+ stdDev,
1120
+ count
1121
+ };
1122
+ };
1083
1123
  },
1084
1124
  /** All runs must pass. Use for binary scores. */
1085
1125
  all(values) {
@@ -1113,6 +1153,7 @@ var Score = {
1113
1153
  id: config.id,
1114
1154
  data,
1115
1155
  ...passed !== void 0 && { passed },
1156
+ ...options?.name !== void 0 && { name: options.name },
1116
1157
  def
1117
1158
  // Attach def so rendering/aggregation works without registry lookup
1118
1159
  };
@@ -1181,7 +1222,7 @@ Score.of({
1181
1222
  displayStrategy: "bar",
1182
1223
  formatValue: (data) => data.value.toFixed(2),
1183
1224
  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
1184
- aggregateValues: Score.aggregate.averageWithVariance
1225
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
1185
1226
  });
1186
1227
  Score.of({
1187
1228
  id: "delta",
@@ -2337,9 +2378,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2337
2378
  if (!def)
2338
2379
  return null;
2339
2380
  const formatted = def.format(m.data);
2381
+ const label = m.name ?? def.name;
2340
2382
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2341
2383
  "[",
2342
- def.name ? `${def.name}: ` : "",
2384
+ label ? `${label}: ` : "",
2343
2385
  formatted,
2344
2386
  "]",
2345
2387
  " "
@@ -2352,7 +2394,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2352
2394
  for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
2353
2395
  const s = item.scores[sIdx];
2354
2396
  const def = s.def ?? getScoreById(s.id);
2355
- const scoreLabel = def ? def.name ?? def.id : s.id;
2397
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2356
2398
  rows.push(
2357
2399
  /* @__PURE__ */ jsxs(
2358
2400
  Text,