@m4trix/evals 0.18.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -523,7 +523,11 @@ var Metric = {
523
523
  name: config.name,
524
524
  aggregate: config.aggregate,
525
525
  format: config.format,
526
- make: (data) => ({ id: config.id, data })
526
+ make: (data, options) => ({
527
+ id: config.id,
528
+ data,
529
+ ...options?.name !== void 0 && { name: options.name }
530
+ })
527
531
  };
528
532
  registry.set(config.id, def);
529
533
  return def;
@@ -545,25 +549,61 @@ var ScoreAggregate = {
545
549
  const count = values.length || 1;
546
550
  const result = {};
547
551
  for (const field of fields) {
548
- result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
552
+ result[field] = values.reduce(
553
+ (s, v) => s + (v[field] ?? 0),
554
+ 0
555
+ ) / count;
549
556
  }
550
557
  return result;
551
558
  };
552
559
  },
553
- /** Average `value` with sample std dev. Use for percent-style scores. */
554
- averageWithVariance(values) {
555
- if (values.length === 0) {
556
- return { value: 0, stdDev: void 0, count: 0 };
557
- }
558
- const sum = values.reduce((s, v) => s + v.value, 0);
559
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
560
- const mean = sum / values.length;
561
- let stdDev;
562
- if (values.length >= 2) {
563
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
564
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
565
- }
566
- return { ...values[0], value: mean, stdDev, count: values.length };
560
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
561
+ averageWithVariance(fields) {
562
+ return (values) => {
563
+ const count = values.length;
564
+ const result = {};
565
+ for (const field of fields) {
566
+ result[field] = count === 0 ? 0 : values.reduce(
567
+ (sum, item) => sum + (item[field] ?? 0),
568
+ 0
569
+ ) / count;
570
+ }
571
+ const valueField = "value";
572
+ const hasValueField = fields.includes(valueField);
573
+ if (count === 0) {
574
+ if (hasValueField) {
575
+ result[valueField] = 0;
576
+ }
577
+ return {
578
+ ...result,
579
+ stdDev: void 0,
580
+ count: 0
581
+ };
582
+ }
583
+ let stdDev;
584
+ if (hasValueField && count >= 2) {
585
+ const sum = values.reduce(
586
+ (s, v) => s + (v[valueField] ?? 0),
587
+ 0
588
+ );
589
+ const sumSq = values.reduce(
590
+ (s, v) => {
591
+ const value = v[valueField] ?? 0;
592
+ return s + value * value;
593
+ },
594
+ 0
595
+ );
596
+ const mean = sum / count;
597
+ const variance = (sumSq - count * mean * mean) / (count - 1);
598
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
599
+ }
600
+ return {
601
+ ...values[0],
602
+ ...result,
603
+ stdDev,
604
+ count
605
+ };
606
+ };
567
607
  },
568
608
  /** All runs must pass. Use for binary scores. */
569
609
  all(values) {
@@ -597,6 +637,7 @@ var Score = {
597
637
  id: config.id,
598
638
  data,
599
639
  ...passed !== void 0 && { passed },
640
+ ...options?.name !== void 0 && { name: options.name },
600
641
  def
601
642
  // Attach def so rendering/aggregation works without registry lookup
602
643
  };
@@ -665,7 +706,7 @@ var percentScore = Score.of({
665
706
  displayStrategy: "bar",
666
707
  formatValue: (data) => data.value.toFixed(2),
667
708
  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
668
- aggregateValues: Score.aggregate.averageWithVariance
709
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
669
710
  });
670
711
  var deltaScore = Score.of({
671
712
  id: "delta",
@@ -697,6 +738,8 @@ function createDiffString(expected, actual, diffOptions) {
697
738
  function formatLogMessage(msg) {
698
739
  if (typeof msg === "string")
699
740
  return msg;
741
+ if (msg instanceof Error)
742
+ return msg.stack ?? msg.message;
700
743
  try {
701
744
  if (msg !== null && typeof msg === "object") {
702
745
  return JSON.stringify(msg, null, 2);
@@ -1043,6 +1086,7 @@ function toNumericScore(value) {
1043
1086
  }
1044
1087
 
1045
1088
  // src/runner/execution.ts
1089
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
1046
1090
  function computeEvaluatorPassed(evaluator, result, scores) {
1047
1091
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1048
1092
  if (scoresWithPassed.length > 0) {
@@ -1099,20 +1143,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1099
1143
  if (!evaluateFn) {
1100
1144
  continue;
1101
1145
  }
1146
+ const logs = [];
1147
+ const logDiff = (expected, actual, options) => {
1148
+ logs.push(createDiffLogEntry(expected, actual, options));
1149
+ };
1150
+ const log = (message, options) => {
1151
+ logs.push(createLogEntry(message, options));
1152
+ };
1153
+ const createError = (message, options) => {
1154
+ const entry = createLogEntry(message, options);
1155
+ const error = message instanceof Error ? message : new Error(entry.message);
1156
+ error[evaluatorErrorLogEntryKey] = entry;
1157
+ return error;
1158
+ };
1102
1159
  try {
1103
- const logs = [];
1104
- const logDiff = (expected, actual, options) => {
1105
- logs.push(createDiffLogEntry(expected, actual, options));
1106
- };
1107
- const log = (message, options) => {
1108
- logs.push(createLogEntry(message, options));
1109
- };
1110
1160
  const ctx = yield* effect.Effect.promise(
1111
1161
  () => Promise.resolve(evaluator.resolveContext())
1112
1162
  );
1113
1163
  const result = yield* effect.Effect.promise(
1114
- () => Promise.resolve(
1115
- evaluateFn({
1164
+ () => Promise.resolve().then(
1165
+ () => evaluateFn({
1116
1166
  input: testCaseItem.testCase.getInput(),
1117
1167
  ctx,
1118
1168
  output,
@@ -1122,10 +1172,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1122
1172
  datasetId: task.datasetId
1123
1173
  },
1124
1174
  logDiff,
1125
- log
1175
+ log,
1176
+ createError
1126
1177
  })
1127
1178
  )
1128
1179
  );
1180
+ if (result instanceof Error) {
1181
+ const evaluatorError = result;
1182
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1183
+ logs.push(taggedEntry ?? createLogEntry(result));
1184
+ testCaseError = result.message;
1185
+ evaluatorScores.push({
1186
+ evaluatorId,
1187
+ scores: [],
1188
+ passed: false,
1189
+ logs: logs.length > 0 ? logs : void 0
1190
+ });
1191
+ continue;
1192
+ }
1129
1193
  const { scores, metrics } = normalizeResult(result);
1130
1194
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1131
1195
  evaluatorScores.push({
@@ -1136,11 +1200,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1136
1200
  logs: logs.length > 0 ? logs : void 0
1137
1201
  });
1138
1202
  } catch (error) {
1203
+ if (error instanceof Error) {
1204
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1205
+ logs.push(taggedEntry ?? createLogEntry(error));
1206
+ }
1139
1207
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1140
1208
  evaluatorScores.push({
1141
1209
  evaluatorId,
1142
1210
  scores: [],
1143
- passed: false
1211
+ passed: false,
1212
+ logs: logs.length > 0 ? logs : void 0
1144
1213
  });
1145
1214
  }
1146
1215
  }