@m4trix/evals 0.18.0 → 0.20.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -1012,6 +1012,8 @@ function createDiffString(expected, actual, diffOptions) {
1012
1012
  function formatLogMessage(msg) {
1013
1013
  if (typeof msg === "string")
1014
1014
  return msg;
1015
+ if (msg instanceof Error)
1016
+ return msg.stack ?? msg.message;
1015
1017
  try {
1016
1018
  if (msg !== null && typeof msg === "object") {
1017
1019
  return JSON.stringify(msg, null, 2);
@@ -1065,7 +1067,11 @@ var Metric = {
1065
1067
  name: config.name,
1066
1068
  aggregate: config.aggregate,
1067
1069
  format: config.format,
1068
- make: (data) => ({ id: config.id, data })
1070
+ make: (data, options) => ({
1071
+ id: config.id,
1072
+ data,
1073
+ ...options?.name !== void 0 && { name: options.name }
1074
+ })
1069
1075
  };
1070
1076
  registry.set(config.id, def);
1071
1077
  return def;
@@ -1087,25 +1093,61 @@ var ScoreAggregate = {
1087
1093
  const count = values.length || 1;
1088
1094
  const result = {};
1089
1095
  for (const field of fields) {
1090
- result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
1096
+ result[field] = values.reduce(
1097
+ (s, v) => s + (v[field] ?? 0),
1098
+ 0
1099
+ ) / count;
1091
1100
  }
1092
1101
  return result;
1093
1102
  };
1094
1103
  },
1095
- /** Average `value` with sample std dev. Use for percent-style scores. */
1096
- averageWithVariance(values) {
1097
- if (values.length === 0) {
1098
- return { value: 0, stdDev: void 0, count: 0 };
1099
- }
1100
- const sum = values.reduce((s, v) => s + v.value, 0);
1101
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1102
- const mean = sum / values.length;
1103
- let stdDev;
1104
- if (values.length >= 2) {
1105
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1106
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1107
- }
1108
- return { ...values[0], value: mean, stdDev, count: values.length };
1104
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
1105
+ averageWithVariance(fields) {
1106
+ return (values) => {
1107
+ const count = values.length;
1108
+ const result = {};
1109
+ for (const field of fields) {
1110
+ result[field] = count === 0 ? 0 : values.reduce(
1111
+ (sum, item) => sum + (item[field] ?? 0),
1112
+ 0
1113
+ ) / count;
1114
+ }
1115
+ const valueField = "value";
1116
+ const hasValueField = fields.includes(valueField);
1117
+ if (count === 0) {
1118
+ if (hasValueField) {
1119
+ result[valueField] = 0;
1120
+ }
1121
+ return {
1122
+ ...result,
1123
+ stdDev: void 0,
1124
+ count: 0
1125
+ };
1126
+ }
1127
+ let stdDev;
1128
+ if (hasValueField && count >= 2) {
1129
+ const sum = values.reduce(
1130
+ (s, v) => s + (v[valueField] ?? 0),
1131
+ 0
1132
+ );
1133
+ const sumSq = values.reduce(
1134
+ (s, v) => {
1135
+ const value = v[valueField] ?? 0;
1136
+ return s + value * value;
1137
+ },
1138
+ 0
1139
+ );
1140
+ const mean = sum / count;
1141
+ const variance = (sumSq - count * mean * mean) / (count - 1);
1142
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1143
+ }
1144
+ return {
1145
+ ...values[0],
1146
+ ...result,
1147
+ stdDev,
1148
+ count
1149
+ };
1150
+ };
1109
1151
  },
1110
1152
  /** All runs must pass. Use for binary scores. */
1111
1153
  all(values) {
@@ -1139,6 +1181,7 @@ var Score = {
1139
1181
  id: config.id,
1140
1182
  data,
1141
1183
  ...passed !== void 0 && { passed },
1184
+ ...options?.name !== void 0 && { name: options.name },
1142
1185
  def
1143
1186
  // Attach def so rendering/aggregation works without registry lookup
1144
1187
  };
@@ -1207,7 +1250,7 @@ Score.of({
1207
1250
  displayStrategy: "bar",
1208
1251
  formatValue: (data) => data.value.toFixed(2),
1209
1252
  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
1210
- aggregateValues: Score.aggregate.averageWithVariance
1253
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
1211
1254
  });
1212
1255
  Score.of({
1213
1256
  id: "delta",
@@ -1273,6 +1316,7 @@ function toNumericScore(value) {
1273
1316
  }
1274
1317
 
1275
1318
  // src/runner/execution.ts
1319
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
1276
1320
  function computeEvaluatorPassed(evaluator, result, scores) {
1277
1321
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1278
1322
  if (scoresWithPassed.length > 0) {
@@ -1329,20 +1373,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1329
1373
  if (!evaluateFn) {
1330
1374
  continue;
1331
1375
  }
1376
+ const logs = [];
1377
+ const logDiff = (expected, actual, options) => {
1378
+ logs.push(createDiffLogEntry(expected, actual, options));
1379
+ };
1380
+ const log = (message, options) => {
1381
+ logs.push(createLogEntry(message, options));
1382
+ };
1383
+ const createError = (message, options) => {
1384
+ const entry = createLogEntry(message, options);
1385
+ const error = message instanceof Error ? message : new Error(entry.message);
1386
+ error[evaluatorErrorLogEntryKey] = entry;
1387
+ return error;
1388
+ };
1332
1389
  try {
1333
- const logs = [];
1334
- const logDiff = (expected, actual, options) => {
1335
- logs.push(createDiffLogEntry(expected, actual, options));
1336
- };
1337
- const log = (message, options) => {
1338
- logs.push(createLogEntry(message, options));
1339
- };
1340
1390
  const ctx = yield* effect.Effect.promise(
1341
1391
  () => Promise.resolve(evaluator.resolveContext())
1342
1392
  );
1343
1393
  const result = yield* effect.Effect.promise(
1344
- () => Promise.resolve(
1345
- evaluateFn({
1394
+ () => Promise.resolve().then(
1395
+ () => evaluateFn({
1346
1396
  input: testCaseItem.testCase.getInput(),
1347
1397
  ctx,
1348
1398
  output,
@@ -1352,10 +1402,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1352
1402
  datasetId: task.datasetId
1353
1403
  },
1354
1404
  logDiff,
1355
- log
1405
+ log,
1406
+ createError
1356
1407
  })
1357
1408
  )
1358
1409
  );
1410
+ if (result instanceof Error) {
1411
+ const evaluatorError = result;
1412
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1413
+ logs.push(taggedEntry ?? createLogEntry(result));
1414
+ testCaseError = result.message;
1415
+ evaluatorScores.push({
1416
+ evaluatorId,
1417
+ scores: [],
1418
+ passed: false,
1419
+ logs: logs.length > 0 ? logs : void 0
1420
+ });
1421
+ continue;
1422
+ }
1359
1423
  const { scores, metrics } = normalizeResult(result);
1360
1424
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1361
1425
  evaluatorScores.push({
@@ -1366,11 +1430,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1366
1430
  logs: logs.length > 0 ? logs : void 0
1367
1431
  });
1368
1432
  } catch (error) {
1433
+ if (error instanceof Error) {
1434
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1435
+ logs.push(taggedEntry ?? createLogEntry(error));
1436
+ }
1369
1437
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1370
1438
  evaluatorScores.push({
1371
1439
  evaluatorId,
1372
1440
  scores: [],
1373
- passed: false
1441
+ passed: false,
1442
+ logs: logs.length > 0 ? logs : void 0
1374
1443
  });
1375
1444
  }
1376
1445
  }
@@ -2363,9 +2432,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2363
2432
  if (!def)
2364
2433
  return null;
2365
2434
  const formatted = def.format(m.data);
2435
+ const label = m.name ?? def.name;
2366
2436
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2367
2437
  "[",
2368
- def.name ? `${def.name}: ` : "",
2438
+ label ? `${label}: ` : "",
2369
2439
  formatted,
2370
2440
  "]",
2371
2441
  " "
@@ -2378,7 +2448,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2378
2448
  for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
2379
2449
  const s = item.scores[sIdx];
2380
2450
  const def = s.def ?? getScoreById(s.id);
2381
- const scoreLabel = def ? def.name ?? def.id : s.id;
2451
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2382
2452
  rows.push(
2383
2453
  /* @__PURE__ */ jsxRuntime.jsxs(
2384
2454
  ink.Text,