@m4trix/evals 0.18.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -986,6 +986,8 @@ function createDiffString(expected, actual, diffOptions) {
986
986
  function formatLogMessage(msg) {
987
987
  if (typeof msg === "string")
988
988
  return msg;
989
+ if (msg instanceof Error)
990
+ return msg.stack ?? msg.message;
989
991
  try {
990
992
  if (msg !== null && typeof msg === "object") {
991
993
  return JSON.stringify(msg, null, 2);
@@ -1039,7 +1041,11 @@ var Metric = {
1039
1041
  name: config.name,
1040
1042
  aggregate: config.aggregate,
1041
1043
  format: config.format,
1042
- make: (data) => ({ id: config.id, data })
1044
+ make: (data, options) => ({
1045
+ id: config.id,
1046
+ data,
1047
+ ...options?.name !== void 0 && { name: options.name }
1048
+ })
1043
1049
  };
1044
1050
  registry.set(config.id, def);
1045
1051
  return def;
@@ -1061,25 +1067,61 @@ var ScoreAggregate = {
1061
1067
  const count = values.length || 1;
1062
1068
  const result = {};
1063
1069
  for (const field of fields) {
1064
- result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
1070
+ result[field] = values.reduce(
1071
+ (s, v) => s + (v[field] ?? 0),
1072
+ 0
1073
+ ) / count;
1065
1074
  }
1066
1075
  return result;
1067
1076
  };
1068
1077
  },
1069
- /** Average `value` with sample std dev. Use for percent-style scores. */
1070
- averageWithVariance(values) {
1071
- if (values.length === 0) {
1072
- return { value: 0, stdDev: void 0, count: 0 };
1073
- }
1074
- const sum = values.reduce((s, v) => s + v.value, 0);
1075
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1076
- const mean = sum / values.length;
1077
- let stdDev;
1078
- if (values.length >= 2) {
1079
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1080
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1081
- }
1082
- return { ...values[0], value: mean, stdDev, count: values.length };
1078
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
1079
+ averageWithVariance(fields) {
1080
+ return (values) => {
1081
+ const count = values.length;
1082
+ const result = {};
1083
+ for (const field of fields) {
1084
+ result[field] = count === 0 ? 0 : values.reduce(
1085
+ (sum, item) => sum + (item[field] ?? 0),
1086
+ 0
1087
+ ) / count;
1088
+ }
1089
+ const valueField = "value";
1090
+ const hasValueField = fields.includes(valueField);
1091
+ if (count === 0) {
1092
+ if (hasValueField) {
1093
+ result[valueField] = 0;
1094
+ }
1095
+ return {
1096
+ ...result,
1097
+ stdDev: void 0,
1098
+ count: 0
1099
+ };
1100
+ }
1101
+ let stdDev;
1102
+ if (hasValueField && count >= 2) {
1103
+ const sum = values.reduce(
1104
+ (s, v) => s + (v[valueField] ?? 0),
1105
+ 0
1106
+ );
1107
+ const sumSq = values.reduce(
1108
+ (s, v) => {
1109
+ const value = v[valueField] ?? 0;
1110
+ return s + value * value;
1111
+ },
1112
+ 0
1113
+ );
1114
+ const mean = sum / count;
1115
+ const variance = (sumSq - count * mean * mean) / (count - 1);
1116
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1117
+ }
1118
+ return {
1119
+ ...values[0],
1120
+ ...result,
1121
+ stdDev,
1122
+ count
1123
+ };
1124
+ };
1083
1125
  },
1084
1126
  /** All runs must pass. Use for binary scores. */
1085
1127
  all(values) {
@@ -1113,6 +1155,7 @@ var Score = {
1113
1155
  id: config.id,
1114
1156
  data,
1115
1157
  ...passed !== void 0 && { passed },
1158
+ ...options?.name !== void 0 && { name: options.name },
1116
1159
  def
1117
1160
  // Attach def so rendering/aggregation works without registry lookup
1118
1161
  };
@@ -1181,7 +1224,7 @@ Score.of({
1181
1224
  displayStrategy: "bar",
1182
1225
  formatValue: (data) => data.value.toFixed(2),
1183
1226
  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
1184
- aggregateValues: Score.aggregate.averageWithVariance
1227
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
1185
1228
  });
1186
1229
  Score.of({
1187
1230
  id: "delta",
@@ -1247,6 +1290,7 @@ function toNumericScore(value) {
1247
1290
  }
1248
1291
 
1249
1292
  // src/runner/execution.ts
1293
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
1250
1294
  function computeEvaluatorPassed(evaluator, result, scores) {
1251
1295
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1252
1296
  if (scoresWithPassed.length > 0) {
@@ -1303,20 +1347,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1303
1347
  if (!evaluateFn) {
1304
1348
  continue;
1305
1349
  }
1350
+ const logs = [];
1351
+ const logDiff = (expected, actual, options) => {
1352
+ logs.push(createDiffLogEntry(expected, actual, options));
1353
+ };
1354
+ const log = (message, options) => {
1355
+ logs.push(createLogEntry(message, options));
1356
+ };
1357
+ const createError = (message, options) => {
1358
+ const entry = createLogEntry(message, options);
1359
+ const error = message instanceof Error ? message : new Error(entry.message);
1360
+ error[evaluatorErrorLogEntryKey] = entry;
1361
+ return error;
1362
+ };
1306
1363
  try {
1307
- const logs = [];
1308
- const logDiff = (expected, actual, options) => {
1309
- logs.push(createDiffLogEntry(expected, actual, options));
1310
- };
1311
- const log = (message, options) => {
1312
- logs.push(createLogEntry(message, options));
1313
- };
1314
1364
  const ctx = yield* Effect.promise(
1315
1365
  () => Promise.resolve(evaluator.resolveContext())
1316
1366
  );
1317
1367
  const result = yield* Effect.promise(
1318
- () => Promise.resolve(
1319
- evaluateFn({
1368
+ () => Promise.resolve().then(
1369
+ () => evaluateFn({
1320
1370
  input: testCaseItem.testCase.getInput(),
1321
1371
  ctx,
1322
1372
  output,
@@ -1326,10 +1376,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1326
1376
  datasetId: task.datasetId
1327
1377
  },
1328
1378
  logDiff,
1329
- log
1379
+ log,
1380
+ createError
1330
1381
  })
1331
1382
  )
1332
1383
  );
1384
+ if (result instanceof Error) {
1385
+ const evaluatorError = result;
1386
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1387
+ logs.push(taggedEntry ?? createLogEntry(result));
1388
+ testCaseError = result.message;
1389
+ evaluatorScores.push({
1390
+ evaluatorId,
1391
+ scores: [],
1392
+ passed: false,
1393
+ logs: logs.length > 0 ? logs : void 0
1394
+ });
1395
+ continue;
1396
+ }
1333
1397
  const { scores, metrics } = normalizeResult(result);
1334
1398
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1335
1399
  evaluatorScores.push({
@@ -1340,11 +1404,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1340
1404
  logs: logs.length > 0 ? logs : void 0
1341
1405
  });
1342
1406
  } catch (error) {
1407
+ if (error instanceof Error) {
1408
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1409
+ logs.push(taggedEntry ?? createLogEntry(error));
1410
+ }
1343
1411
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1344
1412
  evaluatorScores.push({
1345
1413
  evaluatorId,
1346
1414
  scores: [],
1347
- passed: false
1415
+ passed: false,
1416
+ logs: logs.length > 0 ? logs : void 0
1348
1417
  });
1349
1418
  }
1350
1419
  }
@@ -2337,9 +2406,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2337
2406
  if (!def)
2338
2407
  return null;
2339
2408
  const formatted = def.format(m.data);
2409
+ const label = m.name ?? def.name;
2340
2410
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2341
2411
  "[",
2342
- def.name ? `${def.name}: ` : "",
2412
+ label ? `${label}: ` : "",
2343
2413
  formatted,
2344
2414
  "]",
2345
2415
  " "
@@ -2352,7 +2422,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2352
2422
  for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
2353
2423
  const s = item.scores[sIdx];
2354
2424
  const def = s.def ?? getScoreById(s.id);
2355
- const scoreLabel = def ? def.name ?? def.id : s.id;
2425
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2356
2426
  rows.push(
2357
2427
  /* @__PURE__ */ jsxs(
2358
2428
  Text,