@m4trix/evals 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1039,7 +1039,11 @@ var Metric = {
1039
1039
  name: config.name,
1040
1040
  aggregate: config.aggregate,
1041
1041
  format: config.format,
1042
- make: (data) => ({ id: config.id, data })
1042
+ make: (data, options) => ({
1043
+ id: config.id,
1044
+ data,
1045
+ ...options?.name !== void 0 && { name: options.name }
1046
+ })
1043
1047
  };
1044
1048
  registry.set(config.id, def);
1045
1049
  return def;
@@ -1051,20 +1055,107 @@ function getMetricById(id) {
1051
1055
 
1052
1056
  // src/evals/score.ts
1053
1057
  var registry2 = /* @__PURE__ */ new Map();
1058
+ function formatScoreData(def, data, options) {
1059
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
1060
+ }
1061
+ var ScoreAggregate = {
1062
+ /** Average numeric fields. Use for scores like { value, delta }. */
1063
+ averageFields(fields) {
1064
+ return (values) => {
1065
+ const count = values.length || 1;
1066
+ const result = {};
1067
+ for (const field of fields) {
1068
+ result[field] = values.reduce(
1069
+ (s, v) => s + (v[field] ?? 0),
1070
+ 0
1071
+ ) / count;
1072
+ }
1073
+ return result;
1074
+ };
1075
+ },
1076
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
1077
+ averageWithVariance(fields) {
1078
+ return (values) => {
1079
+ const count = values.length;
1080
+ const result = {};
1081
+ for (const field of fields) {
1082
+ result[field] = count === 0 ? 0 : values.reduce(
1083
+ (sum, item) => sum + (item[field] ?? 0),
1084
+ 0
1085
+ ) / count;
1086
+ }
1087
+ const valueField = "value";
1088
+ const hasValueField = fields.includes(valueField);
1089
+ if (count === 0) {
1090
+ if (hasValueField) {
1091
+ result[valueField] = 0;
1092
+ }
1093
+ return {
1094
+ ...result,
1095
+ stdDev: void 0,
1096
+ count: 0
1097
+ };
1098
+ }
1099
+ let stdDev;
1100
+ if (hasValueField && count >= 2) {
1101
+ const sum = values.reduce(
1102
+ (s, v) => s + (v[valueField] ?? 0),
1103
+ 0
1104
+ );
1105
+ const sumSq = values.reduce(
1106
+ (s, v) => {
1107
+ const value = v[valueField] ?? 0;
1108
+ return s + value * value;
1109
+ },
1110
+ 0
1111
+ );
1112
+ const mean = sum / count;
1113
+ const variance = (sumSq - count * mean * mean) / (count - 1);
1114
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1115
+ }
1116
+ return {
1117
+ ...values[0],
1118
+ ...result,
1119
+ stdDev,
1120
+ count
1121
+ };
1122
+ };
1123
+ },
1124
+ /** All runs must pass. Use for binary scores. */
1125
+ all(values) {
1126
+ const total = values.length;
1127
+ const passedCount = values.filter((v) => v.passed).length;
1128
+ return {
1129
+ ...values[0],
1130
+ passed: total > 0 && values.every((v) => v.passed),
1131
+ passedCount,
1132
+ totalCount: total
1133
+ };
1134
+ },
1135
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
1136
+ last(values) {
1137
+ return values[values.length - 1] ?? {};
1138
+ }
1139
+ };
1054
1140
  var Score = {
1141
+ aggregate: ScoreAggregate,
1055
1142
  of(config) {
1056
1143
  const def = {
1057
1144
  id: config.id,
1058
1145
  name: config.name,
1059
1146
  displayStrategy: config.displayStrategy,
1060
- aggregate: config.aggregate,
1061
- format: config.format,
1147
+ formatValue: config.formatValue,
1148
+ formatAggregate: config.formatAggregate,
1149
+ aggregateValues: config.aggregateValues,
1062
1150
  make: (data, options) => {
1063
1151
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
1064
1152
  return {
1065
1153
  id: config.id,
1066
1154
  data,
1067
- ...passed !== void 0 && { passed }
1155
+ ...passed !== void 0 && { passed },
1156
+ ...options?.name !== void 0 && { name: options.name },
1157
+ def
1158
+ // Attach def so rendering/aggregation works without registry lookup
1068
1159
  };
1069
1160
  }
1070
1161
  };
@@ -1077,29 +1168,6 @@ function getScoreById(id) {
1077
1168
  }
1078
1169
 
1079
1170
  // src/evals/aggregators.ts
1080
- function aggregateAverageWithVariance(values) {
1081
- if (values.length === 0) {
1082
- return { value: 0, count: 0 };
1083
- }
1084
- const sum = values.reduce((s, v) => s + v.value, 0);
1085
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1086
- const mean = sum / values.length;
1087
- let stdDev;
1088
- if (values.length >= 2) {
1089
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1090
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1091
- }
1092
- return { value: mean, stdDev, count: values.length };
1093
- }
1094
- function aggregateAll(values) {
1095
- const total = values.length;
1096
- const passedCount = values.filter((v) => v.passed).length;
1097
- return {
1098
- passed: total > 0 && values.every((v) => v.passed),
1099
- passedCount,
1100
- totalCount: total
1101
- };
1102
- }
1103
1171
  function aggregateTokenCountSum(values) {
1104
1172
  const initial = {
1105
1173
  input: 0,
@@ -1152,35 +1220,40 @@ Score.of({
1152
1220
  id: "percent",
1153
1221
  name: "Score",
1154
1222
  displayStrategy: "bar",
1155
- format: (data, options) => {
1156
- if (options?.isAggregated) {
1157
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
1158
- }
1159
- return data.value.toFixed(2);
1160
- },
1161
- aggregate: aggregateAverageWithVariance
1223
+ formatValue: (data) => data.value.toFixed(2),
1224
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
1225
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
1226
+ });
1227
+ Score.of({
1228
+ id: "delta",
1229
+ name: "Delta",
1230
+ displayStrategy: "number",
1231
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
1232
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
1233
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
1162
1234
  });
1163
1235
  Score.of({
1164
1236
  id: "binary",
1165
1237
  name: "Result",
1166
1238
  displayStrategy: "passFail",
1167
- format: (data, options) => {
1168
- if (options?.isAggregated) {
1169
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
1170
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1171
- return `${base} (${data.passedCount}/${data.totalCount})`;
1172
- }
1173
- return base;
1239
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
1240
+ formatAggregate: (data) => {
1241
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
1242
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1243
+ return `${base} (${data.passedCount}/${data.totalCount})`;
1174
1244
  }
1175
- return data.passed ? "PASSED" : "NOT PASSED";
1245
+ return base;
1176
1246
  },
1177
- aggregate: aggregateAll
1247
+ aggregateValues: Score.aggregate.all
1178
1248
  });
1179
1249
 
1180
1250
  // src/runner/score-utils.ts
1251
+ function getScoreDef(item) {
1252
+ return item.def ?? getScoreById(item.id);
1253
+ }
1181
1254
  function toNumericScoreFromScores(scores) {
1182
1255
  for (const item of scores) {
1183
- const def = getScoreById(item.id);
1256
+ const def = getScoreDef(item);
1184
1257
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
1185
1258
  const value = item.data.value;
1186
1259
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -1598,7 +1671,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
1598
1671
  () => appendJsonLine(message.artifactPath, {
1599
1672
  runId: message.runId,
1600
1673
  ts: Date.now(),
1601
- ...message.payload
1674
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
1602
1675
  })
1603
1676
  );
1604
1677
  })
@@ -2140,12 +2213,12 @@ function scoreColor(score) {
2140
2213
  return "red";
2141
2214
  }
2142
2215
  function formatScorePart(item) {
2143
- const def = getScoreById(item.id);
2216
+ const def = item.def ?? getScoreById(item.id);
2144
2217
  if (!def) {
2145
2218
  const numeric = toNumericScore(item.data);
2146
2219
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
2147
2220
  }
2148
- const formatted = def.format(item.data);
2221
+ const formatted = formatScoreData(def, item.data);
2149
2222
  if (def.displayStrategy === "bar") {
2150
2223
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2151
2224
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -2305,9 +2378,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2305
2378
  if (!def)
2306
2379
  return null;
2307
2380
  const formatted = def.format(m.data);
2381
+ const label = m.name ?? def.name;
2308
2382
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2309
2383
  "[",
2310
- def.name ? `${def.name}: ` : "",
2384
+ label ? `${label}: ` : "",
2311
2385
  formatted,
2312
2386
  "]",
2313
2387
  " "
@@ -2319,8 +2393,8 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2319
2393
  if (item.scores.length > 0) {
2320
2394
  for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
2321
2395
  const s = item.scores[sIdx];
2322
- const def = getScoreById(s.id);
2323
- const scoreLabel = def ? def.name ?? def.id : s.id;
2396
+ const def = s.def ?? getScoreById(s.id);
2397
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2324
2398
  rows.push(
2325
2399
  /* @__PURE__ */ jsxs(
2326
2400
  Text,