@m4trix/evals 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -1077,20 +1077,70 @@ function getMetricById(id) {
1077
1077
 
1078
1078
  // src/evals/score.ts
1079
1079
  var registry2 = /* @__PURE__ */ new Map();
1080
+ function formatScoreData(def, data, options) {
1081
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
1082
+ }
1083
+ var ScoreAggregate = {
1084
+ /** Average numeric fields. Use for scores like { value, delta }. */
1085
+ averageFields(fields) {
1086
+ return (values) => {
1087
+ const count = values.length || 1;
1088
+ const result = {};
1089
+ for (const field of fields) {
1090
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
1091
+ }
1092
+ return result;
1093
+ };
1094
+ },
1095
+ /** Average `value` with sample std dev. Use for percent-style scores. */
1096
+ averageWithVariance(values) {
1097
+ if (values.length === 0) {
1098
+ return { value: 0, stdDev: void 0, count: 0 };
1099
+ }
1100
+ const sum = values.reduce((s, v) => s + v.value, 0);
1101
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1102
+ const mean = sum / values.length;
1103
+ let stdDev;
1104
+ if (values.length >= 2) {
1105
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1106
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1107
+ }
1108
+ return { ...values[0], value: mean, stdDev, count: values.length };
1109
+ },
1110
+ /** All runs must pass. Use for binary scores. */
1111
+ all(values) {
1112
+ const total = values.length;
1113
+ const passedCount = values.filter((v) => v.passed).length;
1114
+ return {
1115
+ ...values[0],
1116
+ passed: total > 0 && values.every((v) => v.passed),
1117
+ passedCount,
1118
+ totalCount: total
1119
+ };
1120
+ },
1121
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
1122
+ last(values) {
1123
+ return values[values.length - 1] ?? {};
1124
+ }
1125
+ };
1080
1126
  var Score = {
1127
+ aggregate: ScoreAggregate,
1081
1128
  of(config) {
1082
1129
  const def = {
1083
1130
  id: config.id,
1084
1131
  name: config.name,
1085
1132
  displayStrategy: config.displayStrategy,
1086
- aggregate: config.aggregate,
1087
- format: config.format,
1133
+ formatValue: config.formatValue,
1134
+ formatAggregate: config.formatAggregate,
1135
+ aggregateValues: config.aggregateValues,
1088
1136
  make: (data, options) => {
1089
1137
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
1090
1138
  return {
1091
1139
  id: config.id,
1092
1140
  data,
1093
- ...passed !== void 0 && { passed }
1141
+ ...passed !== void 0 && { passed },
1142
+ def
1143
+ // Attach def so rendering/aggregation works without registry lookup
1094
1144
  };
1095
1145
  }
1096
1146
  };
@@ -1103,29 +1153,6 @@ function getScoreById(id) {
1103
1153
  }
1104
1154
 
1105
1155
  // src/evals/aggregators.ts
1106
- function aggregateAverageWithVariance(values) {
1107
- if (values.length === 0) {
1108
- return { value: 0, count: 0 };
1109
- }
1110
- const sum = values.reduce((s, v) => s + v.value, 0);
1111
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1112
- const mean = sum / values.length;
1113
- let stdDev;
1114
- if (values.length >= 2) {
1115
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1116
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1117
- }
1118
- return { value: mean, stdDev, count: values.length };
1119
- }
1120
- function aggregateAll(values) {
1121
- const total = values.length;
1122
- const passedCount = values.filter((v) => v.passed).length;
1123
- return {
1124
- passed: total > 0 && values.every((v) => v.passed),
1125
- passedCount,
1126
- totalCount: total
1127
- };
1128
- }
1129
1156
  function aggregateTokenCountSum(values) {
1130
1157
  const initial = {
1131
1158
  input: 0,
@@ -1178,35 +1205,40 @@ Score.of({
1178
1205
  id: "percent",
1179
1206
  name: "Score",
1180
1207
  displayStrategy: "bar",
1181
- format: (data, options) => {
1182
- if (options?.isAggregated) {
1183
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
1184
- }
1185
- return data.value.toFixed(2);
1186
- },
1187
- aggregate: aggregateAverageWithVariance
1208
+ formatValue: (data) => data.value.toFixed(2),
1209
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
1210
+ aggregateValues: Score.aggregate.averageWithVariance
1211
+ });
1212
+ Score.of({
1213
+ id: "delta",
1214
+ name: "Delta",
1215
+ displayStrategy: "number",
1216
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
1217
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
1218
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
1188
1219
  });
1189
1220
  Score.of({
1190
1221
  id: "binary",
1191
1222
  name: "Result",
1192
1223
  displayStrategy: "passFail",
1193
- format: (data, options) => {
1194
- if (options?.isAggregated) {
1195
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
1196
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1197
- return `${base} (${data.passedCount}/${data.totalCount})`;
1198
- }
1199
- return base;
1224
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
1225
+ formatAggregate: (data) => {
1226
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
1227
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1228
+ return `${base} (${data.passedCount}/${data.totalCount})`;
1200
1229
  }
1201
- return data.passed ? "PASSED" : "NOT PASSED";
1230
+ return base;
1202
1231
  },
1203
- aggregate: aggregateAll
1232
+ aggregateValues: Score.aggregate.all
1204
1233
  });
1205
1234
 
1206
1235
  // src/runner/score-utils.ts
1236
+ function getScoreDef(item) {
1237
+ return item.def ?? getScoreById(item.id);
1238
+ }
1207
1239
  function toNumericScoreFromScores(scores) {
1208
1240
  for (const item of scores) {
1209
- const def = getScoreById(item.id);
1241
+ const def = getScoreDef(item);
1210
1242
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
1211
1243
  const value = item.data.value;
1212
1244
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -1287,6 +1319,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1287
1319
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1288
1320
  const rerunPassed = [];
1289
1321
  for (let r = 0; r < reruns; r++) {
1322
+ const evaluatorRunId = `run-${crypto.randomUUID()}`;
1290
1323
  const started = Date.now();
1291
1324
  const evaluatorScores = [];
1292
1325
  let testCaseError;
@@ -1313,6 +1346,11 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1313
1346
  input: testCaseItem.testCase.getInput(),
1314
1347
  ctx,
1315
1348
  output,
1349
+ meta: {
1350
+ triggerId: task.triggerId,
1351
+ runId: evaluatorRunId,
1352
+ datasetId: task.datasetId
1353
+ },
1316
1354
  logDiff,
1317
1355
  log
1318
1356
  })
@@ -1618,7 +1656,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
1618
1656
  () => appendJsonLine(message.artifactPath, {
1619
1657
  runId: message.runId,
1620
1658
  ts: Date.now(),
1621
- ...message.payload
1659
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
1622
1660
  })
1623
1661
  );
1624
1662
  })
@@ -1802,6 +1840,7 @@ var EffectRunner = class {
1802
1840
  (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1803
1841
  0
1804
1842
  );
1843
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
1805
1844
  const runId = `run-${crypto.randomUUID()}`;
1806
1845
  const artifactPath = createArtifactPath(
1807
1846
  this.config.artifactDirectory,
@@ -1843,6 +1882,7 @@ var EffectRunner = class {
1843
1882
  await effect.Effect.runPromise(
1844
1883
  effect.Queue.offer(this.runQueue, {
1845
1884
  runId,
1885
+ triggerId,
1846
1886
  datasetId: request.datasetId,
1847
1887
  dataset: dataset.dataset,
1848
1888
  evaluators: selectedEvaluators,
@@ -2158,12 +2198,12 @@ function scoreColor(score) {
2158
2198
  return "red";
2159
2199
  }
2160
2200
  function formatScorePart(item) {
2161
- const def = getScoreById(item.id);
2201
+ const def = item.def ?? getScoreById(item.id);
2162
2202
  if (!def) {
2163
2203
  const numeric = toNumericScore(item.data);
2164
2204
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
2165
2205
  }
2166
- const formatted = def.format(item.data);
2206
+ const formatted = formatScoreData(def, item.data);
2167
2207
  if (def.displayStrategy === "bar") {
2168
2208
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
2169
2209
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -2337,7 +2377,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2337
2377
  if (item.scores.length > 0) {
2338
2378
  for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
2339
2379
  const s = item.scores[sIdx];
2340
- const def = getScoreById(s.id);
2380
+ const def = s.def ?? getScoreById(s.id);
2341
2381
  const scoreLabel = def ? def.name ?? def.id : s.id;
2342
2382
  rows.push(
2343
2383
  /* @__PURE__ */ jsxRuntime.jsxs(