@m4trix/evals 0.18.0 → 0.19.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -1065,7 +1065,11 @@ var Metric = {
1065
1065
  name: config.name,
1066
1066
  aggregate: config.aggregate,
1067
1067
  format: config.format,
1068
- make: (data) => ({ id: config.id, data })
1068
+ make: (data, options) => ({
1069
+ id: config.id,
1070
+ data,
1071
+ ...options?.name !== void 0 && { name: options.name }
1072
+ })
1069
1073
  };
1070
1074
  registry.set(config.id, def);
1071
1075
  return def;
@@ -1087,25 +1091,61 @@ var ScoreAggregate = {
1087
1091
  const count = values.length || 1;
1088
1092
  const result = {};
1089
1093
  for (const field of fields) {
1090
- result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
1094
+ result[field] = values.reduce(
1095
+ (s, v) => s + (v[field] ?? 0),
1096
+ 0
1097
+ ) / count;
1091
1098
  }
1092
1099
  return result;
1093
1100
  };
1094
1101
  },
1095
- /** Average `value` with sample std dev. Use for percent-style scores. */
1096
- averageWithVariance(values) {
1097
- if (values.length === 0) {
1098
- return { value: 0, stdDev: void 0, count: 0 };
1099
- }
1100
- const sum = values.reduce((s, v) => s + v.value, 0);
1101
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1102
- const mean = sum / values.length;
1103
- let stdDev;
1104
- if (values.length >= 2) {
1105
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1106
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1107
- }
1108
- return { ...values[0], value: mean, stdDev, count: values.length };
1102
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
1103
+ averageWithVariance(fields) {
1104
+ return (values) => {
1105
+ const count = values.length;
1106
+ const result = {};
1107
+ for (const field of fields) {
1108
+ result[field] = count === 0 ? 0 : values.reduce(
1109
+ (sum, item) => sum + (item[field] ?? 0),
1110
+ 0
1111
+ ) / count;
1112
+ }
1113
+ const valueField = "value";
1114
+ const hasValueField = fields.includes(valueField);
1115
+ if (count === 0) {
1116
+ if (hasValueField) {
1117
+ result[valueField] = 0;
1118
+ }
1119
+ return {
1120
+ ...result,
1121
+ stdDev: void 0,
1122
+ count: 0
1123
+ };
1124
+ }
1125
+ let stdDev;
1126
+ if (hasValueField && count >= 2) {
1127
+ const sum = values.reduce(
1128
+ (s, v) => s + (v[valueField] ?? 0),
1129
+ 0
1130
+ );
1131
+ const sumSq = values.reduce(
1132
+ (s, v) => {
1133
+ const value = v[valueField] ?? 0;
1134
+ return s + value * value;
1135
+ },
1136
+ 0
1137
+ );
1138
+ const mean = sum / count;
1139
+ const variance = (sumSq - count * mean * mean) / (count - 1);
1140
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1141
+ }
1142
+ return {
1143
+ ...values[0],
1144
+ ...result,
1145
+ stdDev,
1146
+ count
1147
+ };
1148
+ };
1109
1149
  },
1110
1150
  /** All runs must pass. Use for binary scores. */
1111
1151
  all(values) {
@@ -1139,6 +1179,7 @@ var Score = {
1139
1179
  id: config.id,
1140
1180
  data,
1141
1181
  ...passed !== void 0 && { passed },
1182
+ ...options?.name !== void 0 && { name: options.name },
1142
1183
  def
1143
1184
  // Attach def so rendering/aggregation works without registry lookup
1144
1185
  };
@@ -1207,7 +1248,7 @@ Score.of({
1207
1248
  displayStrategy: "bar",
1208
1249
  formatValue: (data) => data.value.toFixed(2),
1209
1250
  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
1210
- aggregateValues: Score.aggregate.averageWithVariance
1251
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
1211
1252
  });
1212
1253
  Score.of({
1213
1254
  id: "delta",
@@ -2363,9 +2404,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2363
2404
  if (!def)
2364
2405
  return null;
2365
2406
  const formatted = def.format(m.data);
2407
+ const label = m.name ?? def.name;
2366
2408
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2367
2409
  "[",
2368
- def.name ? `${def.name}: ` : "",
2410
+ label ? `${label}: ` : "",
2369
2411
  formatted,
2370
2412
  "]",
2371
2413
  " "
@@ -2378,7 +2420,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2378
2420
  for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
2379
2421
  const s = item.scores[sIdx];
2380
2422
  const def = s.def ?? getScoreById(s.id);
2381
- const scoreLabel = def ? def.name ?? def.id : s.id;
2423
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2382
2424
  rows.push(
2383
2425
  /* @__PURE__ */ jsxRuntime.jsxs(
2384
2426
  ink.Text,