@m4trix/evals 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1051,20 +1051,70 @@ function getMetricById(id) {
1051
1051
 
1052
1052
  // src/evals/score.ts
1053
1053
  var registry2 = /* @__PURE__ */ new Map();
1054
+ function formatScoreData(def, data, options) {
1055
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
1056
+ }
1057
+ var ScoreAggregate = {
1058
+ /** Average numeric fields. Use for scores like { value, delta }. */
1059
+ averageFields(fields) {
1060
+ return (values) => {
1061
+ const count = values.length || 1;
1062
+ const result = {};
1063
+ for (const field of fields) {
1064
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
1065
+ }
1066
+ return result;
1067
+ };
1068
+ },
1069
+ /** Average `value` with sample std dev. Use for percent-style scores. */
1070
+ averageWithVariance(values) {
1071
+ if (values.length === 0) {
1072
+ return { value: 0, stdDev: void 0, count: 0 };
1073
+ }
1074
+ const sum = values.reduce((s, v) => s + v.value, 0);
1075
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1076
+ const mean = sum / values.length;
1077
+ let stdDev;
1078
+ if (values.length >= 2) {
1079
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1080
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1081
+ }
1082
+ return { ...values[0], value: mean, stdDev, count: values.length };
1083
+ },
1084
+ /** All runs must pass. Use for binary scores. */
1085
+ all(values) {
1086
+ const total = values.length;
1087
+ const passedCount = values.filter((v) => v.passed).length;
1088
+ return {
1089
+ ...values[0],
1090
+ passed: total > 0 && values.every((v) => v.passed),
1091
+ passedCount,
1092
+ totalCount: total
1093
+ };
1094
+ },
1095
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
1096
+ last(values) {
1097
+ return values[values.length - 1] ?? {};
1098
+ }
1099
+ };
1054
1100
  var Score = {
1101
+ aggregate: ScoreAggregate,
1055
1102
  of(config) {
1056
1103
  const def = {
1057
1104
  id: config.id,
1058
1105
  name: config.name,
1059
1106
  displayStrategy: config.displayStrategy,
1060
- aggregate: config.aggregate,
1061
- format: config.format,
1107
+ formatValue: config.formatValue,
1108
+ formatAggregate: config.formatAggregate,
1109
+ aggregateValues: config.aggregateValues,
1062
1110
  make: (data, options) => {
1063
1111
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
1064
1112
  return {
1065
1113
  id: config.id,
1066
1114
  data,
1067
- ...passed !== void 0 && { passed }
1115
+ ...passed !== void 0 && { passed },
1116
+ def
1117
+ // Attach def so rendering/aggregation works without registry lookup
1068
1118
  };
1069
1119
  }
1070
1120
  };
@@ -1077,29 +1127,6 @@ function getScoreById(id) {
1077
1127
  }
1078
1128
 
1079
1129
  // src/evals/aggregators.ts
1080
- function aggregateAverageWithVariance(values) {
1081
- if (values.length === 0) {
1082
- return { value: 0, count: 0 };
1083
- }
1084
- const sum = values.reduce((s, v) => s + v.value, 0);
1085
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1086
- const mean = sum / values.length;
1087
- let stdDev;
1088
- if (values.length >= 2) {
1089
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1090
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1091
- }
1092
- return { value: mean, stdDev, count: values.length };
1093
- }
1094
- function aggregateAll(values) {
1095
- const total = values.length;
1096
- const passedCount = values.filter((v) => v.passed).length;
1097
- return {
1098
- passed: total > 0 && values.every((v) => v.passed),
1099
- passedCount,
1100
- totalCount: total
1101
- };
1102
- }
1103
1130
  function aggregateTokenCountSum(values) {
1104
1131
  const initial = {
1105
1132
  input: 0,
@@ -1152,35 +1179,40 @@ Score.of({
1152
1179
  id: "percent",
1153
1180
  name: "Score",
1154
1181
  displayStrategy: "bar",
1155
- format: (data, options) => {
1156
- if (options?.isAggregated) {
1157
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
1158
- }
1159
- return data.value.toFixed(2);
1160
- },
1161
- aggregate: aggregateAverageWithVariance
1182
+ formatValue: (data) => data.value.toFixed(2),
1183
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
1184
+ aggregateValues: Score.aggregate.averageWithVariance
1185
+ });
1186
+ Score.of({
1187
+ id: "delta",
1188
+ name: "Delta",
1189
+ displayStrategy: "number",
1190
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
1191
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
1192
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
1162
1193
  });
1163
1194
  Score.of({
1164
1195
  id: "binary",
1165
1196
  name: "Result",
1166
1197
  displayStrategy: "passFail",
1167
- format: (data, options) => {
1168
- if (options?.isAggregated) {
1169
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
1170
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1171
- return `${base} (${data.passedCount}/${data.totalCount})`;
1172
- }
1173
- return base;
1198
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
1199
+ formatAggregate: (data) => {
1200
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
1201
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1202
+ return `${base} (${data.passedCount}/${data.totalCount})`;
1174
1203
  }
1175
- return data.passed ? "PASSED" : "NOT PASSED";
1204
+ return base;
1176
1205
  },
1177
- aggregate: aggregateAll
1206
+ aggregateValues: Score.aggregate.all
1178
1207
  });
1179
1208
 
1180
1209
  // src/runner/score-utils.ts
1210
+ function getScoreDef(item) {
1211
+ return item.def ?? getScoreById(item.id);
1212
+ }
1181
1213
  function toNumericScoreFromScores(scores) {
1182
1214
  for (const item of scores) {
1183
- const def = getScoreById(item.id);
1215
+ const def = getScoreDef(item);
1184
1216
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
1185
1217
  const value = item.data.value;
1186
1218
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -1598,7 +1630,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
1598
1630
  () => appendJsonLine(message.artifactPath, {
1599
1631
  runId: message.runId,
1600
1632
  ts: Date.now(),
1601
- ...message.payload
1633
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
1602
1634
  })
1603
1635
  );
1604
1636
  })
@@ -2140,12 +2172,12 @@ function scoreColor(score) {
2140
2172
  return "red";
2141
2173
  }
2142
2174
  function formatScorePart(item) {
2143
- const def = getScoreById(item.id);
2175
+ const def = item.def ?? getScoreById(item.id);
2144
2176
  if (!def) {
2145
2177
  const numeric = toNumericScore(item.data);
2146
2178
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
2147
2179
  }
2148
- const formatted = def.format(item.data);
2180
+ const formatted = formatScoreData(def, item.data);
2149
2181
  if (def.displayStrategy === "bar") {
2150
2182
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2151
2183
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -2319,7 +2351,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2319
2351
  if (item.scores.length > 0) {
2320
2352
  for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
2321
2353
  const s = item.scores[sIdx];
2322
- const def = getScoreById(s.id);
2354
+ const def = s.def ?? getScoreById(s.id);
2323
2355
  const scoreLabel = def ? def.name ?? def.id : s.id;
2324
2356
  rows.push(
2325
2357
  /* @__PURE__ */ jsxs(