@m4trix/evals 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -1077,20 +1077,70 @@ function getMetricById(id) {
1077
1077
 
1078
1078
  // src/evals/score.ts
1079
1079
  var registry2 = /* @__PURE__ */ new Map();
1080
/**
 * Render score data as a display string using the score definition's formatters.
 *
 * @param {object} def - Score definition exposing `formatValue` and `formatAggregate`.
 * @param {*} data - Score payload handed to the selected formatter unchanged.
 * @param {{ isAggregated?: boolean }} [options] - When `isAggregated` is truthy the
 *   aggregate formatter is used; otherwise the single-value formatter is used.
 * @returns {*} Whatever the selected formatter returns.
 */
function formatScoreData(def, data, options) {
  // Call the formatters as methods of `def` so any `this` usage inside them keeps working.
  if (options?.isAggregated) {
    return def.formatAggregate(data);
  }
  return def.formatValue(data);
}
1083
/**
 * Built-in strategies for combining the score data of several runs into one
 * aggregated score payload. Every strategy receives an array of score data objects.
 */
var ScoreAggregate = {
  /**
   * Average numeric fields. Use for scores like { value, delta }.
   *
   * @param {string[]} fields - Names of the fields to average.
   * @returns {(values: object[]) => object} Aggregator producing an object with one
   *   averaged entry per requested field. A field missing (null/undefined) on an
   *   item counts as 0.
   */
  averageFields(fields) {
    return (values) => {
      // Divide by 1 for an empty list so each field averages to 0 instead of NaN.
      const count = values.length || 1;
      const result = {};
      for (const field of fields) {
        result[field] = values.reduce((sum, v) => sum + (v[field] ?? 0), 0) / count;
      }
      return result;
    };
  },
  /**
   * Average `value` with sample std dev. Use for percent-style scores.
   *
   * @param {Array<{ value: number }>} values - Score data of the individual runs.
   * @returns {object} The first item's fields overlaid with `value` (the mean),
   *   `stdDev` (sample standard deviation, `undefined` with fewer than two items)
   *   and `count`. An empty list yields `{ value: 0, stdDev: undefined, count: 0 }`.
   */
  averageWithVariance(values) {
    const count = values.length;
    if (count === 0) {
      return { value: 0, stdDev: void 0, count: 0 };
    }
    const mean = values.reduce((sum, v) => sum + v.value, 0) / count;
    let stdDev;
    if (count >= 2) {
      // Two-pass sum of squared deviations. The one-pass form
      // (sumSq - n * mean^2) subtracts two nearly equal large numbers and loses
      // all precision when the values are large relative to their spread.
      const squaredDeviations = values.reduce((sum, v) => {
        const deviation = v.value - mean;
        return sum + deviation * deviation;
      }, 0);
      const variance = squaredDeviations / (count - 1);
      // The comparison is false for NaN, so non-finite inputs report 0 rather than NaN.
      stdDev = variance > 0 ? Math.sqrt(variance) : 0;
    }
    return { ...values[0], value: mean, stdDev, count };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param {Array<{ passed?: boolean }>} values - Score data of the individual runs.
   * @returns {object} The first item's fields overlaid with `passed` (true only when
   *   the list is non-empty and every item passed), `passedCount` and `totalCount`.
   */
  all(values) {
    const totalCount = values.length;
    const passedCount = values.filter((v) => v.passed).length;
    return {
      ...values[0],
      passed: totalCount > 0 && passedCount === totalCount,
      passedCount,
      totalCount
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param {object[]} values - Score data of the individual runs.
   * @returns {object} The final item, or an empty object when the list is empty
   *   or its final item is null/undefined.
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
1080
1126
  var Score = {
1127
+ aggregate: ScoreAggregate,
1081
1128
  of(config) {
1082
1129
  const def = {
1083
1130
  id: config.id,
1084
1131
  name: config.name,
1085
1132
  displayStrategy: config.displayStrategy,
1086
- aggregate: config.aggregate,
1087
- format: config.format,
1133
+ formatValue: config.formatValue,
1134
+ formatAggregate: config.formatAggregate,
1135
+ aggregateValues: config.aggregateValues,
1088
1136
  make: (data, options) => {
1089
1137
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
1090
1138
  return {
1091
1139
  id: config.id,
1092
1140
  data,
1093
- ...passed !== void 0 && { passed }
1141
+ ...passed !== void 0 && { passed },
1142
+ def
1143
+ // Attach def so rendering/aggregation works without registry lookup
1094
1144
  };
1095
1145
  }
1096
1146
  };
@@ -1103,29 +1153,6 @@ function getScoreById(id) {
1103
1153
  }
1104
1154
 
1105
1155
  // src/evals/aggregators.ts
1106
- function aggregateAverageWithVariance(values) {
1107
- if (values.length === 0) {
1108
- return { value: 0, count: 0 };
1109
- }
1110
- const sum = values.reduce((s, v) => s + v.value, 0);
1111
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1112
- const mean = sum / values.length;
1113
- let stdDev;
1114
- if (values.length >= 2) {
1115
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1116
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1117
- }
1118
- return { value: mean, stdDev, count: values.length };
1119
- }
1120
- function aggregateAll(values) {
1121
- const total = values.length;
1122
- const passedCount = values.filter((v) => v.passed).length;
1123
- return {
1124
- passed: total > 0 && values.every((v) => v.passed),
1125
- passedCount,
1126
- totalCount: total
1127
- };
1128
- }
1129
1156
  function aggregateTokenCountSum(values) {
1130
1157
  const initial = {
1131
1158
  input: 0,
@@ -1178,35 +1205,40 @@ Score.of({
1178
1205
  id: "percent",
1179
1206
  name: "Score",
1180
1207
  displayStrategy: "bar",
1181
- format: (data, options) => {
1182
- if (options?.isAggregated) {
1183
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
1184
- }
1185
- return data.value.toFixed(2);
1186
- },
1187
- aggregate: aggregateAverageWithVariance
1208
+ formatValue: (data) => data.value.toFixed(2),
1209
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
1210
+ aggregateValues: Score.aggregate.averageWithVariance
1211
+ });
1212
+ Score.of({
1213
+ id: "delta",
1214
+ name: "Delta",
1215
+ displayStrategy: "number",
1216
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
1217
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
1218
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
1188
1219
  });
1189
1220
  Score.of({
1190
1221
  id: "binary",
1191
1222
  name: "Result",
1192
1223
  displayStrategy: "passFail",
1193
- format: (data, options) => {
1194
- if (options?.isAggregated) {
1195
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
1196
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1197
- return `${base} (${data.passedCount}/${data.totalCount})`;
1198
- }
1199
- return base;
1224
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
1225
+ formatAggregate: (data) => {
1226
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
1227
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1228
+ return `${base} (${data.passedCount}/${data.totalCount})`;
1200
1229
  }
1201
- return data.passed ? "PASSED" : "NOT PASSED";
1230
+ return base;
1202
1231
  },
1203
- aggregate: aggregateAll
1232
+ aggregateValues: Score.aggregate.all
1204
1233
  });
1205
1234
 
1206
1235
  // src/runner/score-utils.ts
1236
/**
 * Resolve the score definition for a score item.
 *
 * @param {{ id: *, def?: object }} item - Score item; may carry its definition as `def`.
 * @returns {*} `item.def` when it is neither null nor undefined, otherwise the
 *   result of looking the definition up via `getScoreById(item.id)`.
 */
function getScoreDef(item) {
  const attached = item.def;
  if (attached !== null && attached !== void 0) {
    return attached;
  }
  // No definition travels with the item, so fall back to the id-based lookup.
  return getScoreById(item.id);
}
1207
1239
  function toNumericScoreFromScores(scores) {
1208
1240
  for (const item of scores) {
1209
- const def = getScoreById(item.id);
1241
+ const def = getScoreDef(item);
1210
1242
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
1211
1243
  const value = item.data.value;
1212
1244
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -1624,7 +1656,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
1624
1656
  () => appendJsonLine(message.artifactPath, {
1625
1657
  runId: message.runId,
1626
1658
  ts: Date.now(),
1627
- ...message.payload
1659
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
1628
1660
  })
1629
1661
  );
1630
1662
  })
@@ -2166,12 +2198,12 @@ function scoreColor(score) {
2166
2198
  return "red";
2167
2199
  }
2168
2200
  function formatScorePart(item) {
2169
- const def = getScoreById(item.id);
2201
+ const def = item.def ?? getScoreById(item.id);
2170
2202
  if (!def) {
2171
2203
  const numeric = toNumericScore(item.data);
2172
2204
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
2173
2205
  }
2174
- const formatted = def.format(item.data);
2206
+ const formatted = formatScoreData(def, item.data);
2175
2207
  if (def.displayStrategy === "bar") {
2176
2208
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2177
2209
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -2345,7 +2377,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2345
2377
  if (item.scores.length > 0) {
2346
2378
  for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
2347
2379
  const s = item.scores[sIdx];
2348
- const def = getScoreById(s.id);
2380
+ const def = s.def ?? getScoreById(s.id);
2349
2381
  const scoreLabel = def ? def.name ?? def.id : s.id;
2350
2382
  rows.push(
2351
2383
  /* @__PURE__ */ jsxRuntime.jsxs(