@m4trix/evals 0.18.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -268,6 +268,8 @@ function createDiffString(expected, actual, diffOptions) {
268
268
  function formatLogMessage(msg) {
269
269
  if (typeof msg === "string")
270
270
  return msg;
271
+ if (msg instanceof Error)
272
+ return msg.stack ?? msg.message;
271
273
  try {
272
274
  if (msg !== null && typeof msg === "object") {
273
275
  return JSON.stringify(msg, null, 2);
@@ -321,7 +323,11 @@ var Metric = {
321
323
  name: config.name,
322
324
  aggregate: config.aggregate,
323
325
  format: config.format,
324
- make: (data) => ({ id: config.id, data })
326
+ make: (data, options) => ({
327
+ id: config.id,
328
+ data,
329
+ ...options?.name !== void 0 && { name: options.name }
330
+ })
325
331
  };
326
332
  registry.set(config.id, def);
327
333
  return def;
@@ -343,25 +349,61 @@ var ScoreAggregate = {
343
349
  const count = values.length || 1;
344
350
  const result = {};
345
351
  for (const field of fields) {
346
- result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
352
+ result[field] = values.reduce(
353
+ (s, v) => s + (v[field] ?? 0),
354
+ 0
355
+ ) / count;
347
356
  }
348
357
  return result;
349
358
  };
350
359
  },
351
- /** Average `value` with sample std dev. Use for percent-style scores. */
352
- averageWithVariance(values) {
353
- if (values.length === 0) {
354
- return { value: 0, stdDev: void 0, count: 0 };
355
- }
356
- const sum = values.reduce((s, v) => s + v.value, 0);
357
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
358
- const mean = sum / values.length;
359
- let stdDev;
360
- if (values.length >= 2) {
361
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
362
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
363
- }
364
- return { ...values[0], value: mean, stdDev, count: values.length };
360
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
361
+ averageWithVariance(fields) {
362
+ return (values) => {
363
+ const count = values.length;
364
+ const result = {};
365
+ for (const field of fields) {
366
+ result[field] = count === 0 ? 0 : values.reduce(
367
+ (sum, item) => sum + (item[field] ?? 0),
368
+ 0
369
+ ) / count;
370
+ }
371
+ const valueField = "value";
372
+ const hasValueField = fields.includes(valueField);
373
+ if (count === 0) {
374
+ if (hasValueField) {
375
+ result[valueField] = 0;
376
+ }
377
+ return {
378
+ ...result,
379
+ stdDev: void 0,
380
+ count: 0
381
+ };
382
+ }
383
+ let stdDev;
384
+ if (hasValueField && count >= 2) {
385
+ const sum = values.reduce(
386
+ (s, v) => s + (v[valueField] ?? 0),
387
+ 0
388
+ );
389
+ const sumSq = values.reduce(
390
+ (s, v) => {
391
+ const value = v[valueField] ?? 0;
392
+ return s + value * value;
393
+ },
394
+ 0
395
+ );
396
+ const mean = sum / count;
397
+ const variance = (sumSq - count * mean * mean) / (count - 1);
398
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
399
+ }
400
+ return {
401
+ ...values[0],
402
+ ...result,
403
+ stdDev,
404
+ count
405
+ };
406
+ };
365
407
  },
366
408
  /** All runs must pass. Use for binary scores. */
367
409
  all(values) {
@@ -395,6 +437,7 @@ var Score = {
395
437
  id: config.id,
396
438
  data,
397
439
  ...passed !== void 0 && { passed },
440
+ ...options?.name !== void 0 && { name: options.name },
398
441
  def
399
442
  // Attach def so rendering/aggregation works without registry lookup
400
443
  };
@@ -463,7 +506,7 @@ Score.of({
463
506
  displayStrategy: "bar",
464
507
  formatValue: (data) => data.value.toFixed(2),
465
508
  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
466
- aggregateValues: Score.aggregate.averageWithVariance
509
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
467
510
  });
468
511
  Score.of({
469
512
  id: "delta",
@@ -492,6 +535,14 @@ Score.of({
492
535
  function getScoreDef(item) {
493
536
  return item.def ?? getScoreById(item.id);
494
537
  }
538
+ function lastNonEmptyName(items) {
539
+ for (let i = items.length - 1; i >= 0; i--) {
540
+ const n = items[i].name;
541
+ if (n != null && n.trim().length > 0)
542
+ return n;
543
+ }
544
+ return void 0;
545
+ }
495
546
  function aggregateScoreItems(items) {
496
547
  if (items.length === 0)
497
548
  return void 0;
@@ -499,7 +550,13 @@ function aggregateScoreItems(items) {
499
550
  if (!def?.aggregateValues)
500
551
  return items[items.length - 1];
501
552
  const aggregated = def.aggregateValues(items.map((i) => i.data));
502
- return { ...items[0], data: aggregated, def };
553
+ const nameOverride = lastNonEmptyName(items);
554
+ return {
555
+ ...items[0],
556
+ data: aggregated,
557
+ def,
558
+ ...nameOverride !== void 0 && { name: nameOverride }
559
+ };
503
560
  }
504
561
  function aggregateMetricItems(items) {
505
562
  if (items.length === 0)
@@ -508,7 +565,12 @@ function aggregateMetricItems(items) {
508
565
  if (!def?.aggregate)
509
566
  return items[items.length - 1];
510
567
  const aggregated = def.aggregate(items.map((i) => i.data));
511
- return { ...items[0], data: aggregated };
568
+ const nameOverride = lastNonEmptyName(items);
569
+ return {
570
+ ...items[0],
571
+ data: aggregated,
572
+ ...nameOverride !== void 0 && { name: nameOverride }
573
+ };
512
574
  }
513
575
  function toNumericScoreFromScores(scores) {
514
576
  for (const item of scores) {
@@ -547,6 +609,7 @@ function toNumericScore(value) {
547
609
  }
548
610
 
549
611
  // src/runner/execution.ts
612
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
550
613
  function computeEvaluatorPassed(evaluator, result, scores) {
551
614
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
552
615
  if (scoresWithPassed.length > 0) {
@@ -603,20 +666,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
603
666
  if (!evaluateFn) {
604
667
  continue;
605
668
  }
669
+ const logs = [];
670
+ const logDiff = (expected, actual, options) => {
671
+ logs.push(createDiffLogEntry(expected, actual, options));
672
+ };
673
+ const log = (message, options) => {
674
+ logs.push(createLogEntry(message, options));
675
+ };
676
+ const createError = (message, options) => {
677
+ const entry = createLogEntry(message, options);
678
+ const error = message instanceof Error ? message : new Error(entry.message);
679
+ error[evaluatorErrorLogEntryKey] = entry;
680
+ return error;
681
+ };
606
682
  try {
607
- const logs = [];
608
- const logDiff = (expected, actual, options) => {
609
- logs.push(createDiffLogEntry(expected, actual, options));
610
- };
611
- const log = (message, options) => {
612
- logs.push(createLogEntry(message, options));
613
- };
614
683
  const ctx = yield* Effect.promise(
615
684
  () => Promise.resolve(evaluator.resolveContext())
616
685
  );
617
686
  const result = yield* Effect.promise(
618
- () => Promise.resolve(
619
- evaluateFn({
687
+ () => Promise.resolve().then(
688
+ () => evaluateFn({
620
689
  input: testCaseItem.testCase.getInput(),
621
690
  ctx,
622
691
  output,
@@ -626,10 +695,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
626
695
  datasetId: task.datasetId
627
696
  },
628
697
  logDiff,
629
- log
698
+ log,
699
+ createError
630
700
  })
631
701
  )
632
702
  );
703
+ if (result instanceof Error) {
704
+ const evaluatorError = result;
705
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
706
+ logs.push(taggedEntry ?? createLogEntry(result));
707
+ testCaseError = result.message;
708
+ evaluatorScores.push({
709
+ evaluatorId,
710
+ scores: [],
711
+ passed: false,
712
+ logs: logs.length > 0 ? logs : void 0
713
+ });
714
+ continue;
715
+ }
633
716
  const { scores, metrics } = normalizeResult(result);
634
717
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
635
718
  evaluatorScores.push({
@@ -640,11 +723,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
640
723
  logs: logs.length > 0 ? logs : void 0
641
724
  });
642
725
  } catch (error) {
726
+ if (error instanceof Error) {
727
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
728
+ logs.push(taggedEntry ?? createLogEntry(error));
729
+ }
643
730
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
644
731
  evaluatorScores.push({
645
732
  evaluatorId,
646
733
  scores: [],
647
- passed: false
734
+ passed: false,
735
+ logs: logs.length > 0 ? logs : void 0
648
736
  });
649
737
  }
650
738
  }
@@ -1628,6 +1716,7 @@ function RunView({
1628
1716
  rerunTotal: event.rerunTotal,
1629
1717
  durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1630
1718
  passed: events.every((e) => e.passed),
1719
+ errorMessage: event.errorMessage,
1631
1720
  events,
1632
1721
  aggregatedEvaluatorScores,
1633
1722
  isAggregated
@@ -1738,8 +1827,13 @@ function RunView({
1738
1827
  " (",
1739
1828
  tc.durationMs,
1740
1829
  "ms)"
1741
- ] })
1830
+ ] }),
1831
+ tc.errorMessage ? /* @__PURE__ */ jsxs(Text, { color: "red", bold: true, children: [
1832
+ " ",
1833
+ "ERROR"
1834
+ ] }) : null
1742
1835
  ] }),
1836
+ tc.errorMessage ? /* @__PURE__ */ jsx(Text, { color: "red", children: tc.errorMessage }) : null,
1743
1837
  tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
1744
1838
  Box,
1745
1839
  {
@@ -1760,9 +1854,10 @@ function RunView({
1760
1854
  const formatted = def.format(m.data, {
1761
1855
  isAggregated: tc.isAggregated
1762
1856
  });
1857
+ const label = m.name ?? def.name;
1763
1858
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1764
1859
  "[",
1765
- def.name ? `${def.name}: ` : "",
1860
+ label ? `${label}: ` : "",
1766
1861
  formatted,
1767
1862
  "]",
1768
1863
  " "
@@ -1771,8 +1866,8 @@ function RunView({
1771
1866
  ] }) : null
1772
1867
  ] }),
1773
1868
  item.scores.length > 0 ? item.scores.map((s, idx) => {
1774
- const def = getScoreById(s.id);
1775
- const scoreLabel = def ? def.name ?? def.id : s.id;
1869
+ const def = s.def ?? getScoreById(s.id);
1870
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
1776
1871
  return /* @__PURE__ */ jsxs(
1777
1872
  Text,
1778
1873
  {
@@ -1876,7 +1971,7 @@ function RunView({
1876
1971
  if (!aggregated)
1877
1972
  return null;
1878
1973
  const def = aggregated.def ?? getScoreById(aggregated.id);
1879
- const label = def ? def.name ?? def.id : aggregated.id;
1974
+ const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
1880
1975
  const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
1881
1976
  const numeric = toNumericScore(aggregated.data);
1882
1977
  return /* @__PURE__ */ jsxs(
@@ -2037,7 +2132,7 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
2037
2132
  if (!agg)
2038
2133
  continue;
2039
2134
  const def = agg.def ?? getScoreById(agg.id);
2040
- const label = def ? def.name ?? def.id : agg.id;
2135
+ const label = agg.name ?? def?.name ?? def?.id ?? agg.id;
2041
2136
  const formatted = def ? def.formatAggregate(agg.data) : "n/a";
2042
2137
  const numeric = toNumericScore(agg.data);
2043
2138
  const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
@@ -2103,12 +2198,13 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2103
2198
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
2104
2199
  const metricParts = [];
2105
2200
  if (metrics && metrics.length > 0) {
2106
- for (const { id, data } of metrics) {
2107
- const def = getMetricById(id);
2201
+ for (const m of metrics) {
2202
+ const def = getMetricById(m.id);
2108
2203
  if (def) {
2109
- const formatted = def.format(data, options);
2204
+ const formatted = def.format(m.data, options);
2205
+ const label = m.name ?? def.name;
2110
2206
  metricParts.push(
2111
- def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
2207
+ label ? `[${label}: ${formatted}]` : `[${formatted}]`
2112
2208
  );
2113
2209
  }
2114
2210
  }
@@ -2116,7 +2212,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2116
2212
  const scoreLines = [];
2117
2213
  for (const item of scores) {
2118
2214
  const def = item.def ?? getScoreById(item.id);
2119
- const scoreLabel = def ? def.name ?? def.id : item.id;
2215
+ const scoreLabel = item.name ?? def?.name ?? def?.id ?? item.id;
2120
2216
  let formatted;
2121
2217
  if (!def) {
2122
2218
  const numeric = toNumericScore(item.data);
@@ -2275,9 +2371,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2275
2371
  0
2276
2372
  );
2277
2373
  const lines = [];
2374
+ const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
2278
2375
  lines.push(
2279
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
2376
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
2280
2377
  );
2378
+ if (event.errorMessage) {
2379
+ lines.push(colorize(event.errorMessage, ansi2.red));
2380
+ }
2281
2381
  for (const item of aggregatedScores) {
2282
2382
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2283
2383
  lines.push(