@m4trix/evals 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -1065,7 +1065,11 @@ var Metric = {
1065
1065
  name: config.name,
1066
1066
  aggregate: config.aggregate,
1067
1067
  format: config.format,
1068
- make: (data) => ({ id: config.id, data })
1068
+ make: (data, options) => ({
1069
+ id: config.id,
1070
+ data,
1071
+ ...options?.name !== void 0 && { name: options.name }
1072
+ })
1069
1073
  };
1070
1074
  registry.set(config.id, def);
1071
1075
  return def;
@@ -1077,20 +1081,107 @@ function getMetricById(id) {
1077
1081
 
1078
1082
  // src/evals/score.ts
1079
1083
  var registry2 = /* @__PURE__ */ new Map();
1084
/**
 * Render score data as a display string using the formatters on a score definition.
 *
 * @param {object} def - Score definition exposing `formatValue` and `formatAggregate`.
 * @param {*} data - Score payload handed to the chosen formatter unchanged.
 * @param {{ isAggregated?: boolean }} [options] - When `isAggregated` is truthy the
 *   aggregate formatter is used; otherwise the per-run value formatter is used.
 * @returns {*} Whatever the selected formatter returns.
 */
function formatScoreData(def, data, options) {
  if (options?.isAggregated) {
    return def.formatAggregate(data);
  }
  return def.formatValue(data);
}
1087
/**
 * Built-in strategies for combining the score data of several runs into one value.
 * `averageFields` and `averageWithVariance` are factories that return an aggregator;
 * `all` and `last` are aggregators themselves (they take the array of values directly).
 */
var ScoreAggregate = {
  /**
   * Average numeric fields. Use for scores like { value, delta }.
   *
   * @param {string[]} fields - Names of the fields to average.
   * @returns {(values: object[]) => object} Aggregator producing an object with one
   *   averaged entry per requested field. A field missing (null/undefined) on an
   *   item counts as 0.
   */
  averageFields(fields) {
    return (values) => {
      // Divide by 1 for an empty list so every field averages to 0 instead of NaN.
      const count = values.length || 1;
      const result = {};
      for (const field of fields) {
        result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
      }
      return result;
    };
  },
  /**
   * Average selected numeric fields, with sample std dev tracked for `value`.
   *
   * @param {string[]} fields - Names of the fields to average.
   * @returns {(values: object[]) => object} Aggregator producing the first item's
   *   properties overlaid with the averaged fields, plus `stdDev` and `count`.
   *   `stdDev` is only computed when `"value"` is among `fields` and there are at
   *   least two items; otherwise it is `undefined`.
   */
  averageWithVariance(fields) {
    return (values) => {
      const count = values.length;
      const result = {};
      for (const field of fields) {
        result[field] = count === 0 ? 0 : values.reduce((sum, item) => sum + (item[field] ?? 0), 0) / count;
      }
      if (count === 0) {
        // Every requested field is already 0 here; there is nothing to spread from values[0].
        return {
          ...result,
          stdDev: void 0,
          count: 0
        };
      }
      let stdDev;
      if (count >= 2 && fields.includes("value")) {
        // Two-pass sample variance: summing squared deviations from the mean avoids the
        // catastrophic cancellation of `sumSq - n * mean^2` when the mean is large
        // relative to the spread.
        const mean = result.value;
        const squaredDeviations = values.reduce((acc, item) => {
          const deviation = (item.value ?? 0) - mean;
          return acc + deviation * deviation;
        }, 0);
        const variance = squaredDeviations / (count - 1);
        // A non-positive or NaN variance is reported as 0 rather than NaN.
        stdDev = variance > 0 ? Math.sqrt(variance) : 0;
      }
      return {
        ...values[0],
        ...result,
        stdDev,
        count
      };
    };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param {object[]} values - Items whose `passed` property is tested for truthiness.
   * @returns {object} The first item's properties overlaid with `passed` (true only
   *   when there is at least one item and all of them passed), `passedCount` and
   *   `totalCount`.
   */
  all(values) {
    const total = values.length;
    const passedCount = values.filter((v) => v.passed).length;
    return {
      ...values[0],
      passed: total > 0 && passedCount === total,
      passedCount,
      totalCount: total
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param {object[]} values - Items to pick from.
   * @returns {object} The final item, or an empty object when that item is
   *   null/undefined (including the empty-array case).
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
1080
1166
  var Score = {
1167
+ aggregate: ScoreAggregate,
1081
1168
  of(config) {
1082
1169
  const def = {
1083
1170
  id: config.id,
1084
1171
  name: config.name,
1085
1172
  displayStrategy: config.displayStrategy,
1086
- aggregate: config.aggregate,
1087
- format: config.format,
1173
+ formatValue: config.formatValue,
1174
+ formatAggregate: config.formatAggregate,
1175
+ aggregateValues: config.aggregateValues,
1088
1176
  make: (data, options) => {
1089
1177
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
1090
1178
  return {
1091
1179
  id: config.id,
1092
1180
  data,
1093
- ...passed !== void 0 && { passed }
1181
+ ...passed !== void 0 && { passed },
1182
+ ...options?.name !== void 0 && { name: options.name },
1183
+ def
1184
+ // Attach def so rendering/aggregation works without registry lookup
1094
1185
  };
1095
1186
  }
1096
1187
  };
@@ -1103,29 +1194,6 @@ function getScoreById(id) {
1103
1194
  }
1104
1195
 
1105
1196
  // src/evals/aggregators.ts
1106
- function aggregateAverageWithVariance(values) {
1107
- if (values.length === 0) {
1108
- return { value: 0, count: 0 };
1109
- }
1110
- const sum = values.reduce((s, v) => s + v.value, 0);
1111
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1112
- const mean = sum / values.length;
1113
- let stdDev;
1114
- if (values.length >= 2) {
1115
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1116
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1117
- }
1118
- return { value: mean, stdDev, count: values.length };
1119
- }
1120
- function aggregateAll(values) {
1121
- const total = values.length;
1122
- const passedCount = values.filter((v) => v.passed).length;
1123
- return {
1124
- passed: total > 0 && values.every((v) => v.passed),
1125
- passedCount,
1126
- totalCount: total
1127
- };
1128
- }
1129
1197
  function aggregateTokenCountSum(values) {
1130
1198
  const initial = {
1131
1199
  input: 0,
@@ -1178,35 +1246,40 @@ Score.of({
1178
1246
  id: "percent",
1179
1247
  name: "Score",
1180
1248
  displayStrategy: "bar",
1181
- format: (data, options) => {
1182
- if (options?.isAggregated) {
1183
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
1184
- }
1185
- return data.value.toFixed(2);
1186
- },
1187
- aggregate: aggregateAverageWithVariance
1249
+ formatValue: (data) => data.value.toFixed(2),
1250
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
1251
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
1252
+ });
1253
// Registers the built-in "delta" score. Its data carries a numeric `value` and a numeric
// `delta`; both are printed with two decimals and a non-negative delta gets an explicit "+".
Score.of({
  id: "delta",
  name: "Delta",
  displayStrategy: "number",
  // Single run, e.g. "0.80 (+0.05 vs baseline)".
  formatValue: (data) => {
    const valueText = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    return `${valueText} (${sign}${data.delta.toFixed(2)} vs baseline)`;
  },
  // Aggregated runs, e.g. "Avg: 0.80 (Delta: -0.05)".
  formatAggregate: (data) => {
    const valueText = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    return `Avg: ${valueText} (Delta: ${sign}${data.delta.toFixed(2)})`;
  },
  // Both fields are averaged independently across runs.
  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
});
1189
1261
  Score.of({
1190
1262
  id: "binary",
1191
1263
  name: "Result",
1192
1264
  displayStrategy: "passFail",
1193
- format: (data, options) => {
1194
- if (options?.isAggregated) {
1195
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
1196
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1197
- return `${base} (${data.passedCount}/${data.totalCount})`;
1198
- }
1199
- return base;
1265
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
1266
+ formatAggregate: (data) => {
1267
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
1268
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1269
+ return `${base} (${data.passedCount}/${data.totalCount})`;
1200
1270
  }
1201
- return data.passed ? "PASSED" : "NOT PASSED";
1271
+ return base;
1202
1272
  },
1203
- aggregate: aggregateAll
1273
+ aggregateValues: Score.aggregate.all
1204
1274
  });
1205
1275
 
1206
1276
  // src/runner/score-utils.ts
1277
/**
 * Resolve the score definition for a score item.
 *
 * @param {{ id: string, def?: object }} item - Score item; may carry its own `def`.
 * @returns {object|undefined} The definition attached to the item when it is not
 *   null/undefined, otherwise the result of `getScoreById(item.id)`.
 */
function getScoreDef(item) {
  const attached = item.def;
  if (attached !== null && attached !== void 0) {
    return attached;
  }
  return getScoreById(item.id);
}
1207
1280
  function toNumericScoreFromScores(scores) {
1208
1281
  for (const item of scores) {
1209
- const def = getScoreById(item.id);
1282
+ const def = getScoreDef(item);
1210
1283
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
1211
1284
  const value = item.data.value;
1212
1285
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -1624,7 +1697,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
1624
1697
  () => appendJsonLine(message.artifactPath, {
1625
1698
  runId: message.runId,
1626
1699
  ts: Date.now(),
1627
- ...message.payload
1700
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
1628
1701
  })
1629
1702
  );
1630
1703
  })
@@ -2166,12 +2239,12 @@ function scoreColor(score) {
2166
2239
  return "red";
2167
2240
  }
2168
2241
  function formatScorePart(item) {
2169
- const def = getScoreById(item.id);
2242
+ const def = item.def ?? getScoreById(item.id);
2170
2243
  if (!def) {
2171
2244
  const numeric = toNumericScore(item.data);
2172
2245
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
2173
2246
  }
2174
- const formatted = def.format(item.data);
2247
+ const formatted = formatScoreData(def, item.data);
2175
2248
  if (def.displayStrategy === "bar") {
2176
2249
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2177
2250
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -2331,9 +2404,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2331
2404
  if (!def)
2332
2405
  return null;
2333
2406
  const formatted = def.format(m.data);
2407
+ const label = m.name ?? def.name;
2334
2408
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2335
2409
  "[",
2336
- def.name ? `${def.name}: ` : "",
2410
+ label ? `${label}: ` : "",
2337
2411
  formatted,
2338
2412
  "]",
2339
2413
  " "
@@ -2345,8 +2419,8 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2345
2419
  if (item.scores.length > 0) {
2346
2420
  for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
2347
2421
  const s = item.scores[sIdx];
2348
- const def = getScoreById(s.id);
2349
- const scoreLabel = def ? def.name ?? def.id : s.id;
2422
+ const def = s.def ?? getScoreById(s.id);
2423
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2350
2424
  rows.push(
2351
2425
  /* @__PURE__ */ jsxRuntime.jsxs(
2352
2426
  ink.Text,