@m4trix/evals 0.18.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -77,8 +77,15 @@ export const myEvaluator = Evaluator.define({
77
77
  inputSchema,
78
78
  outputSchema: S.Unknown,
79
79
  scoreSchema: S.Struct({ scores: S.Array(S.Unknown) }),
80
- }).evaluate(async ({ input, ctx: _ctx, output }) => {
80
+ }).evaluate(async ({ input, ctx: _ctx, output, createError }) => {
81
81
  const start = Date.now();
82
+ const value = 85;
83
+ if (value < 50) {
84
+ return createError(
85
+ { reason: 'score below minimum', value, prompt: input.prompt, output },
86
+ { label: 'quality-check' },
87
+ );
88
+ }
82
89
  const latencyMs = Date.now() - start;
83
90
  const minScore =
84
91
  typeof output === 'object' &&
@@ -90,7 +97,7 @@ export const myEvaluator = Evaluator.define({
90
97
  return {
91
98
  scores: [
92
99
  percentScore.make(
93
- { value: 85 },
100
+ { value },
94
101
  { definePassed: (d) => d.value >= (minScore ?? 50) },
95
102
  ),
96
103
  ],
@@ -294,6 +294,8 @@ function createDiffString(expected, actual, diffOptions) {
294
294
  function formatLogMessage(msg) {
295
295
  if (typeof msg === "string")
296
296
  return msg;
297
+ if (msg instanceof Error)
298
+ return msg.stack ?? msg.message;
297
299
  try {
298
300
  if (msg !== null && typeof msg === "object") {
299
301
  return JSON.stringify(msg, null, 2);
@@ -347,7 +349,11 @@ var Metric = {
347
349
  name: config.name,
348
350
  aggregate: config.aggregate,
349
351
  format: config.format,
350
- make: (data) => ({ id: config.id, data })
352
+ make: (data, options) => ({
353
+ id: config.id,
354
+ data,
355
+ ...options?.name !== void 0 && { name: options.name }
356
+ })
351
357
  };
352
358
  registry.set(config.id, def);
353
359
  return def;
@@ -369,25 +375,61 @@ var ScoreAggregate = {
369
375
  const count = values.length || 1;
370
376
  const result = {};
371
377
  for (const field of fields) {
372
- result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
378
+ result[field] = values.reduce(
379
+ (s, v) => s + (v[field] ?? 0),
380
+ 0
381
+ ) / count;
373
382
  }
374
383
  return result;
375
384
  };
376
385
  },
377
- /** Average `value` with sample std dev. Use for percent-style scores. */
378
- averageWithVariance(values) {
379
- if (values.length === 0) {
380
- return { value: 0, stdDev: void 0, count: 0 };
381
- }
382
- const sum = values.reduce((s, v) => s + v.value, 0);
383
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
384
- const mean = sum / values.length;
385
- let stdDev;
386
- if (values.length >= 2) {
387
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
388
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
389
- }
390
- return { ...values[0], value: mean, stdDev, count: values.length };
386
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
387
+ averageWithVariance(fields) {
388
+ return (values) => {
389
+ const count = values.length;
390
+ const result = {};
391
+ for (const field of fields) {
392
+ result[field] = count === 0 ? 0 : values.reduce(
393
+ (sum, item) => sum + (item[field] ?? 0),
394
+ 0
395
+ ) / count;
396
+ }
397
+ const valueField = "value";
398
+ const hasValueField = fields.includes(valueField);
399
+ if (count === 0) {
400
+ if (hasValueField) {
401
+ result[valueField] = 0;
402
+ }
403
+ return {
404
+ ...result,
405
+ stdDev: void 0,
406
+ count: 0
407
+ };
408
+ }
409
+ let stdDev;
410
+ if (hasValueField && count >= 2) {
411
+ const sum = values.reduce(
412
+ (s, v) => s + (v[valueField] ?? 0),
413
+ 0
414
+ );
415
+ const sumSq = values.reduce(
416
+ (s, v) => {
417
+ const value = v[valueField] ?? 0;
418
+ return s + value * value;
419
+ },
420
+ 0
421
+ );
422
+ const mean = sum / count;
423
+ const variance = (sumSq - count * mean * mean) / (count - 1);
424
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
425
+ }
426
+ return {
427
+ ...values[0],
428
+ ...result,
429
+ stdDev,
430
+ count
431
+ };
432
+ };
391
433
  },
392
434
  /** All runs must pass. Use for binary scores. */
393
435
  all(values) {
@@ -421,6 +463,7 @@ var Score = {
421
463
  id: config.id,
422
464
  data,
423
465
  ...passed !== void 0 && { passed },
466
+ ...options?.name !== void 0 && { name: options.name },
424
467
  def
425
468
  // Attach def so rendering/aggregation works without registry lookup
426
469
  };
@@ -489,7 +532,7 @@ Score.of({
489
532
  displayStrategy: "bar",
490
533
  formatValue: (data) => data.value.toFixed(2),
491
534
  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
492
- aggregateValues: Score.aggregate.averageWithVariance
535
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
493
536
  });
494
537
  Score.of({
495
538
  id: "delta",
@@ -518,6 +561,14 @@ Score.of({
518
561
  function getScoreDef(item) {
519
562
  return item.def ?? getScoreById(item.id);
520
563
  }
564
+ function lastNonEmptyName(items) {
565
+ for (let i = items.length - 1; i >= 0; i--) {
566
+ const n = items[i].name;
567
+ if (n != null && n.trim().length > 0)
568
+ return n;
569
+ }
570
+ return void 0;
571
+ }
521
572
  function aggregateScoreItems(items) {
522
573
  if (items.length === 0)
523
574
  return void 0;
@@ -525,7 +576,13 @@ function aggregateScoreItems(items) {
525
576
  if (!def?.aggregateValues)
526
577
  return items[items.length - 1];
527
578
  const aggregated = def.aggregateValues(items.map((i) => i.data));
528
- return { ...items[0], data: aggregated, def };
579
+ const nameOverride = lastNonEmptyName(items);
580
+ return {
581
+ ...items[0],
582
+ data: aggregated,
583
+ def,
584
+ ...nameOverride !== void 0 && { name: nameOverride }
585
+ };
529
586
  }
530
587
  function aggregateMetricItems(items) {
531
588
  if (items.length === 0)
@@ -534,7 +591,12 @@ function aggregateMetricItems(items) {
534
591
  if (!def?.aggregate)
535
592
  return items[items.length - 1];
536
593
  const aggregated = def.aggregate(items.map((i) => i.data));
537
- return { ...items[0], data: aggregated };
594
+ const nameOverride = lastNonEmptyName(items);
595
+ return {
596
+ ...items[0],
597
+ data: aggregated,
598
+ ...nameOverride !== void 0 && { name: nameOverride }
599
+ };
538
600
  }
539
601
  function toNumericScoreFromScores(scores) {
540
602
  for (const item of scores) {
@@ -573,6 +635,7 @@ function toNumericScore(value) {
573
635
  }
574
636
 
575
637
  // src/runner/execution.ts
638
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
576
639
  function computeEvaluatorPassed(evaluator, result, scores) {
577
640
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
578
641
  if (scoresWithPassed.length > 0) {
@@ -629,20 +692,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
629
692
  if (!evaluateFn) {
630
693
  continue;
631
694
  }
695
+ const logs = [];
696
+ const logDiff = (expected, actual, options) => {
697
+ logs.push(createDiffLogEntry(expected, actual, options));
698
+ };
699
+ const log = (message, options) => {
700
+ logs.push(createLogEntry(message, options));
701
+ };
702
+ const createError = (message, options) => {
703
+ const entry = createLogEntry(message, options);
704
+ const error = message instanceof Error ? message : new Error(entry.message);
705
+ error[evaluatorErrorLogEntryKey] = entry;
706
+ return error;
707
+ };
632
708
  try {
633
- const logs = [];
634
- const logDiff = (expected, actual, options) => {
635
- logs.push(createDiffLogEntry(expected, actual, options));
636
- };
637
- const log = (message, options) => {
638
- logs.push(createLogEntry(message, options));
639
- };
640
709
  const ctx = yield* effect.Effect.promise(
641
710
  () => Promise.resolve(evaluator.resolveContext())
642
711
  );
643
712
  const result = yield* effect.Effect.promise(
644
- () => Promise.resolve(
645
- evaluateFn({
713
+ () => Promise.resolve().then(
714
+ () => evaluateFn({
646
715
  input: testCaseItem.testCase.getInput(),
647
716
  ctx,
648
717
  output,
@@ -652,10 +721,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
652
721
  datasetId: task.datasetId
653
722
  },
654
723
  logDiff,
655
- log
724
+ log,
725
+ createError
656
726
  })
657
727
  )
658
728
  );
729
+ if (result instanceof Error) {
730
+ const evaluatorError = result;
731
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
732
+ logs.push(taggedEntry ?? createLogEntry(result));
733
+ testCaseError = result.message;
734
+ evaluatorScores.push({
735
+ evaluatorId,
736
+ scores: [],
737
+ passed: false,
738
+ logs: logs.length > 0 ? logs : void 0
739
+ });
740
+ continue;
741
+ }
659
742
  const { scores, metrics } = normalizeResult(result);
660
743
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
661
744
  evaluatorScores.push({
@@ -666,11 +749,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
666
749
  logs: logs.length > 0 ? logs : void 0
667
750
  });
668
751
  } catch (error) {
752
+ if (error instanceof Error) {
753
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
754
+ logs.push(taggedEntry ?? createLogEntry(error));
755
+ }
669
756
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
670
757
  evaluatorScores.push({
671
758
  evaluatorId,
672
759
  scores: [],
673
- passed: false
760
+ passed: false,
761
+ logs: logs.length > 0 ? logs : void 0
674
762
  });
675
763
  }
676
764
  }
@@ -1654,6 +1742,7 @@ function RunView({
1654
1742
  rerunTotal: event.rerunTotal,
1655
1743
  durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1656
1744
  passed: events.every((e) => e.passed),
1745
+ errorMessage: event.errorMessage,
1657
1746
  events,
1658
1747
  aggregatedEvaluatorScores,
1659
1748
  isAggregated
@@ -1764,8 +1853,13 @@ function RunView({
1764
1853
  " (",
1765
1854
  tc.durationMs,
1766
1855
  "ms)"
1767
- ] })
1856
+ ] }),
1857
+ tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "red", bold: true, children: [
1858
+ " ",
1859
+ "ERROR"
1860
+ ] }) : null
1768
1861
  ] }),
1862
+ tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: tc.errorMessage }) : null,
1769
1863
  tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
1770
1864
  ink.Box,
1771
1865
  {
@@ -1786,9 +1880,10 @@ function RunView({
1786
1880
  const formatted = def.format(m.data, {
1787
1881
  isAggregated: tc.isAggregated
1788
1882
  });
1883
+ const label = m.name ?? def.name;
1789
1884
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1790
1885
  "[",
1791
- def.name ? `${def.name}: ` : "",
1886
+ label ? `${label}: ` : "",
1792
1887
  formatted,
1793
1888
  "]",
1794
1889
  " "
@@ -1797,8 +1892,8 @@ function RunView({
1797
1892
  ] }) : null
1798
1893
  ] }),
1799
1894
  item.scores.length > 0 ? item.scores.map((s, idx) => {
1800
- const def = getScoreById(s.id);
1801
- const scoreLabel = def ? def.name ?? def.id : s.id;
1895
+ const def = s.def ?? getScoreById(s.id);
1896
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
1802
1897
  return /* @__PURE__ */ jsxRuntime.jsxs(
1803
1898
  ink.Text,
1804
1899
  {
@@ -1902,7 +1997,7 @@ function RunView({
1902
1997
  if (!aggregated)
1903
1998
  return null;
1904
1999
  const def = aggregated.def ?? getScoreById(aggregated.id);
1905
- const label = def ? def.name ?? def.id : aggregated.id;
2000
+ const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
1906
2001
  const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
1907
2002
  const numeric = toNumericScore(aggregated.data);
1908
2003
  return /* @__PURE__ */ jsxRuntime.jsxs(
@@ -2063,7 +2158,7 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
2063
2158
  if (!agg)
2064
2159
  continue;
2065
2160
  const def = agg.def ?? getScoreById(agg.id);
2066
- const label = def ? def.name ?? def.id : agg.id;
2161
+ const label = agg.name ?? def?.name ?? def?.id ?? agg.id;
2067
2162
  const formatted = def ? def.formatAggregate(agg.data) : "n/a";
2068
2163
  const numeric = toNumericScore(agg.data);
2069
2164
  const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
@@ -2129,12 +2224,13 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2129
2224
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
2130
2225
  const metricParts = [];
2131
2226
  if (metrics && metrics.length > 0) {
2132
- for (const { id, data } of metrics) {
2133
- const def = getMetricById(id);
2227
+ for (const m of metrics) {
2228
+ const def = getMetricById(m.id);
2134
2229
  if (def) {
2135
- const formatted = def.format(data, options);
2230
+ const formatted = def.format(m.data, options);
2231
+ const label = m.name ?? def.name;
2136
2232
  metricParts.push(
2137
- def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
2233
+ label ? `[${label}: ${formatted}]` : `[${formatted}]`
2138
2234
  );
2139
2235
  }
2140
2236
  }
@@ -2142,7 +2238,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2142
2238
  const scoreLines = [];
2143
2239
  for (const item of scores) {
2144
2240
  const def = item.def ?? getScoreById(item.id);
2145
- const scoreLabel = def ? def.name ?? def.id : item.id;
2241
+ const scoreLabel = item.name ?? def?.name ?? def?.id ?? item.id;
2146
2242
  let formatted;
2147
2243
  if (!def) {
2148
2244
  const numeric = toNumericScore(item.data);
@@ -2301,9 +2397,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2301
2397
  0
2302
2398
  );
2303
2399
  const lines = [];
2400
+ const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
2304
2401
  lines.push(
2305
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
2402
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
2306
2403
  );
2404
+ if (event.errorMessage) {
2405
+ lines.push(colorize(event.errorMessage, ansi2.red));
2406
+ }
2307
2407
  for (const item of aggregatedScores) {
2308
2408
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2309
2409
  lines.push(