@m4trix/evals 0.17.0 → 0.19.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -347,7 +347,11 @@ var Metric = {
347
347
  name: config.name,
348
348
  aggregate: config.aggregate,
349
349
  format: config.format,
350
- make: (data) => ({ id: config.id, data })
350
+ make: (data, options) => ({
351
+ id: config.id,
352
+ data,
353
+ ...options?.name !== void 0 && { name: options.name }
354
+ })
351
355
  };
352
356
  registry.set(config.id, def);
353
357
  return def;
@@ -359,20 +363,107 @@ function getMetricById(id) {
359
363
 
360
364
  // src/evals/score.ts
361
365
  var registry2 = /* @__PURE__ */ new Map();
366
+ function formatScoreData(def, data, options) {
367
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
368
+ }
369
+ var ScoreAggregate = {
370
+ /** Average numeric fields. Use for scores like { value, delta }. */
371
+ averageFields(fields) {
372
+ return (values) => {
373
+ const count = values.length || 1;
374
+ const result = {};
375
+ for (const field of fields) {
376
+ result[field] = values.reduce(
377
+ (s, v) => s + (v[field] ?? 0),
378
+ 0
379
+ ) / count;
380
+ }
381
+ return result;
382
+ };
383
+ },
384
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
385
+ averageWithVariance(fields) {
386
+ return (values) => {
387
+ const count = values.length;
388
+ const result = {};
389
+ for (const field of fields) {
390
+ result[field] = count === 0 ? 0 : values.reduce(
391
+ (sum, item) => sum + (item[field] ?? 0),
392
+ 0
393
+ ) / count;
394
+ }
395
+ const valueField = "value";
396
+ const hasValueField = fields.includes(valueField);
397
+ if (count === 0) {
398
+ if (hasValueField) {
399
+ result[valueField] = 0;
400
+ }
401
+ return {
402
+ ...result,
403
+ stdDev: void 0,
404
+ count: 0
405
+ };
406
+ }
407
+ let stdDev;
408
+ if (hasValueField && count >= 2) {
409
+ const sum = values.reduce(
410
+ (s, v) => s + (v[valueField] ?? 0),
411
+ 0
412
+ );
413
+ const sumSq = values.reduce(
414
+ (s, v) => {
415
+ const value = v[valueField] ?? 0;
416
+ return s + value * value;
417
+ },
418
+ 0
419
+ );
420
+ const mean = sum / count;
421
+ const variance = (sumSq - count * mean * mean) / (count - 1);
422
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
423
+ }
424
+ return {
425
+ ...values[0],
426
+ ...result,
427
+ stdDev,
428
+ count
429
+ };
430
+ };
431
+ },
432
+ /** All runs must pass. Use for binary scores. */
433
+ all(values) {
434
+ const total = values.length;
435
+ const passedCount = values.filter((v) => v.passed).length;
436
+ return {
437
+ ...values[0],
438
+ passed: total > 0 && values.every((v) => v.passed),
439
+ passedCount,
440
+ totalCount: total
441
+ };
442
+ },
443
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
444
+ last(values) {
445
+ return values[values.length - 1] ?? {};
446
+ }
447
+ };
362
448
  var Score = {
449
+ aggregate: ScoreAggregate,
363
450
  of(config) {
364
451
  const def = {
365
452
  id: config.id,
366
453
  name: config.name,
367
454
  displayStrategy: config.displayStrategy,
368
- aggregate: config.aggregate,
369
- format: config.format,
455
+ formatValue: config.formatValue,
456
+ formatAggregate: config.formatAggregate,
457
+ aggregateValues: config.aggregateValues,
370
458
  make: (data, options) => {
371
459
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
372
460
  return {
373
461
  id: config.id,
374
462
  data,
375
- ...passed !== void 0 && { passed }
463
+ ...passed !== void 0 && { passed },
464
+ ...options?.name !== void 0 && { name: options.name },
465
+ def
466
+ // Attach def so rendering/aggregation works without registry lookup
376
467
  };
377
468
  }
378
469
  };
@@ -385,29 +476,6 @@ function getScoreById(id) {
385
476
  }
386
477
 
387
478
  // src/evals/aggregators.ts
388
- function aggregateAverageWithVariance(values) {
389
- if (values.length === 0) {
390
- return { value: 0, count: 0 };
391
- }
392
- const sum = values.reduce((s, v) => s + v.value, 0);
393
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
394
- const mean = sum / values.length;
395
- let stdDev;
396
- if (values.length >= 2) {
397
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
398
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
399
- }
400
- return { value: mean, stdDev, count: values.length };
401
- }
402
- function aggregateAll(values) {
403
- const total = values.length;
404
- const passedCount = values.filter((v) => v.passed).length;
405
- return {
406
- passed: total > 0 && values.every((v) => v.passed),
407
- passedCount,
408
- totalCount: total
409
- };
410
- }
411
479
  function aggregateTokenCountSum(values) {
412
480
  const initial = {
413
481
  input: 0,
@@ -460,40 +528,59 @@ Score.of({
460
528
  id: "percent",
461
529
  name: "Score",
462
530
  displayStrategy: "bar",
463
- format: (data, options) => {
464
- if (options?.isAggregated) {
465
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
466
- }
467
- return data.value.toFixed(2);
468
- },
469
- aggregate: aggregateAverageWithVariance
531
+ formatValue: (data) => data.value.toFixed(2),
532
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
533
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
534
+ });
535
+ Score.of({
536
+ id: "delta",
537
+ name: "Delta",
538
+ displayStrategy: "number",
539
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
540
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
541
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
470
542
  });
471
543
  Score.of({
472
544
  id: "binary",
473
545
  name: "Result",
474
546
  displayStrategy: "passFail",
475
- format: (data, options) => {
476
- if (options?.isAggregated) {
477
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
478
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
479
- return `${base} (${data.passedCount}/${data.totalCount})`;
480
- }
481
- return base;
547
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
548
+ formatAggregate: (data) => {
549
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
550
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
551
+ return `${base} (${data.passedCount}/${data.totalCount})`;
482
552
  }
483
- return data.passed ? "PASSED" : "NOT PASSED";
553
+ return base;
484
554
  },
485
- aggregate: aggregateAll
555
+ aggregateValues: Score.aggregate.all
486
556
  });
487
557
 
488
558
  // src/runner/score-utils.ts
559
+ function getScoreDef(item) {
560
+ return item.def ?? getScoreById(item.id);
561
+ }
562
+ function lastNonEmptyName(items) {
563
+ for (let i = items.length - 1; i >= 0; i--) {
564
+ const n = items[i].name;
565
+ if (n != null && n.trim().length > 0)
566
+ return n;
567
+ }
568
+ return void 0;
569
+ }
489
570
  function aggregateScoreItems(items) {
490
571
  if (items.length === 0)
491
572
  return void 0;
492
- const def = getScoreById(items[0].id);
493
- if (!def?.aggregate)
573
+ const def = getScoreDef(items[0]);
574
+ if (!def?.aggregateValues)
494
575
  return items[items.length - 1];
495
- const aggregated = def.aggregate(items.map((i) => i.data));
496
- return { ...items[0], data: aggregated };
576
+ const aggregated = def.aggregateValues(items.map((i) => i.data));
577
+ const nameOverride = lastNonEmptyName(items);
578
+ return {
579
+ ...items[0],
580
+ data: aggregated,
581
+ def,
582
+ ...nameOverride !== void 0 && { name: nameOverride }
583
+ };
497
584
  }
498
585
  function aggregateMetricItems(items) {
499
586
  if (items.length === 0)
@@ -502,11 +589,16 @@ function aggregateMetricItems(items) {
502
589
  if (!def?.aggregate)
503
590
  return items[items.length - 1];
504
591
  const aggregated = def.aggregate(items.map((i) => i.data));
505
- return { ...items[0], data: aggregated };
592
+ const nameOverride = lastNonEmptyName(items);
593
+ return {
594
+ ...items[0],
595
+ data: aggregated,
596
+ ...nameOverride !== void 0 && { name: nameOverride }
597
+ };
506
598
  }
507
599
  function toNumericScoreFromScores(scores) {
508
600
  for (const item of scores) {
509
- const def = getScoreById(item.id);
601
+ const def = getScoreDef(item);
510
602
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
511
603
  const value = item.data.value;
512
604
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -894,7 +986,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
894
986
  () => appendJsonLine(message.artifactPath, {
895
987
  runId: message.runId,
896
988
  ts: Date.now(),
897
- ...message.payload
989
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
898
990
  })
899
991
  );
900
992
  })
@@ -1480,7 +1572,7 @@ function aggregateEvaluatorScores(events, nameById) {
1480
1572
  if (agg)
1481
1573
  aggregatedScores.push(agg);
1482
1574
  }
1483
- const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([id, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1575
+ const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1484
1576
  const passed = events.every((ev) => {
1485
1577
  const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1486
1578
  return es?.passed ?? false;
@@ -1500,13 +1592,13 @@ function aggregateEvaluatorScores(events, nameById) {
1500
1592
  }
1501
1593
  return result;
1502
1594
  }
1503
- function formatScorePart(item, scoreToColor2, options) {
1504
- const def = getScoreById(item.id);
1595
+ function formatScorePart(item, _scoreToColor, options) {
1596
+ const def = item.def ?? getScoreById(item.id);
1505
1597
  if (!def) {
1506
1598
  const numeric = toNumericScore(item.data);
1507
1599
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
1508
1600
  }
1509
- const formatted = def.format(item.data, options);
1601
+ const formatted = formatScoreData(def, item.data, options);
1510
1602
  if (def.displayStrategy === "bar") {
1511
1603
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1512
1604
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -1564,8 +1656,6 @@ function RunView({
1564
1656
  const done = new Promise((resolve5) => {
1565
1657
  const unsubscribe = runner.subscribeRunEvents((event) => {
1566
1658
  if (event.type === "TestCaseProgress") {
1567
- const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1568
- numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1569
1659
  for (const item of event.evaluatorScores) {
1570
1660
  const numeric = toNumericScoreFromScores(item.scores);
1571
1661
  if (numeric !== void 0) {
@@ -1655,16 +1745,17 @@ function RunView({
1655
1745
  onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
1656
1746
  return;
1657
1747
  }
1748
+ const completed = finalEvent;
1658
1749
  setSummary({
1659
- passedTestCases: finalEvent.passedTestCases,
1660
- failedTestCases: finalEvent.failedTestCases,
1661
- totalTestCases: finalEvent.totalTestCases,
1750
+ passedTestCases: completed.passedTestCases,
1751
+ failedTestCases: completed.failedTestCases,
1752
+ totalTestCases: completed.totalTestCases,
1662
1753
  overallScoreTotal,
1663
1754
  overallScoreSumSq,
1664
1755
  overallScoreCount,
1665
1756
  aggregates: new Map(aggregates),
1666
1757
  scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1667
- artifactPath: finalEvent.artifactPath
1758
+ artifactPath: completed.artifactPath
1668
1759
  });
1669
1760
  setPhase("completed");
1670
1761
  setTimeout(() => onComplete(), 200);
@@ -1755,9 +1846,10 @@ function RunView({
1755
1846
  const formatted = def.format(m.data, {
1756
1847
  isAggregated: tc.isAggregated
1757
1848
  });
1849
+ const label = m.name ?? def.name;
1758
1850
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1759
1851
  "[",
1760
- def.name ? `${def.name}: ` : "",
1852
+ label ? `${label}: ` : "",
1761
1853
  formatted,
1762
1854
  "]",
1763
1855
  " "
@@ -1766,8 +1858,8 @@ function RunView({
1766
1858
  ] }) : null
1767
1859
  ] }),
1768
1860
  item.scores.length > 0 ? item.scores.map((s, idx) => {
1769
- const def = getScoreById(s.id);
1770
- const scoreLabel = def ? def.name ?? def.id : s.id;
1861
+ const def = s.def ?? getScoreById(s.id);
1862
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
1771
1863
  return /* @__PURE__ */ jsxRuntime.jsxs(
1772
1864
  ink.Text,
1773
1865
  {
@@ -1870,11 +1962,9 @@ function RunView({
1870
1962
  const aggregated = aggregateScoreItems(items);
1871
1963
  if (!aggregated)
1872
1964
  return null;
1873
- const def = getScoreById(aggregated.id);
1874
- const label = def ? def.name ?? def.id : aggregated.id;
1875
- const formatted = def?.format(aggregated.data, {
1876
- isAggregated: true
1877
- }) ?? "n/a";
1965
+ const def = aggregated.def ?? getScoreById(aggregated.id);
1966
+ const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
1967
+ const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
1878
1968
  const numeric = toNumericScore(aggregated.data);
1879
1969
  return /* @__PURE__ */ jsxRuntime.jsxs(
1880
1970
  ink.Text,
@@ -2033,9 +2123,9 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
2033
2123
  const agg = aggregateScoreItems(items);
2034
2124
  if (!agg)
2035
2125
  continue;
2036
- const def = getScoreById(agg.id);
2037
- const label = def ? def.name ?? def.id : agg.id;
2038
- const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
2126
+ const def = agg.def ?? getScoreById(agg.id);
2127
+ const label = agg.name ?? def?.name ?? def?.id ?? agg.id;
2128
+ const formatted = def ? def.formatAggregate(agg.data) : "n/a";
2039
2129
  const numeric = toNumericScore(agg.data);
2040
2130
  const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
2041
2131
  scoreLines.push(` ${label}: ${colored}`);
@@ -2053,7 +2143,7 @@ function createBar2(value, max = 100, width = 20) {
2053
2143
  const filled = Math.round(safe / max * width);
2054
2144
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
2055
2145
  }
2056
- function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
2146
+ function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
2057
2147
  if (events.length === 0)
2058
2148
  return [];
2059
2149
  const evaluatorIds = new Set(
@@ -2100,26 +2190,27 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2100
2190
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
2101
2191
  const metricParts = [];
2102
2192
  if (metrics && metrics.length > 0) {
2103
- for (const { id, data } of metrics) {
2104
- const def = getMetricById(id);
2193
+ for (const m of metrics) {
2194
+ const def = getMetricById(m.id);
2105
2195
  if (def) {
2106
- const formatted = def.format(data, options);
2196
+ const formatted = def.format(m.data, options);
2197
+ const label = m.name ?? def.name;
2107
2198
  metricParts.push(
2108
- def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
2199
+ label ? `[${label}: ${formatted}]` : `[${formatted}]`
2109
2200
  );
2110
2201
  }
2111
2202
  }
2112
2203
  }
2113
2204
  const scoreLines = [];
2114
2205
  for (const item of scores) {
2115
- const def = getScoreById(item.id);
2116
- const scoreLabel = def ? def.name ?? def.id : item.id;
2206
+ const def = item.def ?? getScoreById(item.id);
2207
+ const scoreLabel = item.name ?? def?.name ?? def?.id ?? item.id;
2117
2208
  let formatted;
2118
2209
  if (!def) {
2119
2210
  const numeric = toNumericScore(item.data);
2120
2211
  formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
2121
2212
  } else {
2122
- const raw = def.format(item.data, options);
2213
+ const raw = formatScoreData(def, item.data, options);
2123
2214
  switch (def.displayStrategy) {
2124
2215
  case "bar": {
2125
2216
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
@@ -2271,7 +2362,6 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2271
2362
  (s, e) => s + e.durationMs,
2272
2363
  0
2273
2364
  );
2274
- existing.events.every((e) => e.passed);
2275
2365
  const lines = [];
2276
2366
  lines.push(
2277
2367
  `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
@@ -2349,18 +2439,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2349
2439
  if (finalEvent.type === "RunFailed") {
2350
2440
  throw new Error(`Run failed: ${finalEvent.errorMessage}`);
2351
2441
  }
2442
+ const completed = finalEvent;
2352
2443
  console.log("");
2353
2444
  console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
2354
2445
  console.log(
2355
2446
  `- passed: ${colorize(
2356
- `${finalEvent.passedTestCases}/${finalEvent.totalTestCases}`,
2447
+ `${completed.passedTestCases}/${completed.totalTestCases}`,
2357
2448
  ansi2.green
2358
2449
  )}`
2359
2450
  );
2360
2451
  console.log(
2361
2452
  `- failed: ${colorize(
2362
- `${finalEvent.failedTestCases}/${finalEvent.totalTestCases}`,
2363
- finalEvent.failedTestCases > 0 ? ansi2.red : ansi2.dim
2453
+ `${completed.failedTestCases}/${completed.totalTestCases}`,
2454
+ completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
2364
2455
  )}`
2365
2456
  );
2366
2457
  if (overallScoreCount > 0) {
@@ -2401,10 +2492,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2401
2492
  );
2402
2493
  continue;
2403
2494
  }
2404
- const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2405
- summary.aggregatedScoreItem.data,
2406
- { isAggregated: true }
2407
- ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2495
+ const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? (() => {
2496
+ const def = summary.aggregatedScoreItem.def ?? getScoreById(summary.aggregatedScoreItem.id);
2497
+ return def ? def.formatAggregate(summary.aggregatedScoreItem.data) : summary.averageScore.toFixed(2);
2498
+ })() : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2408
2499
  console.log(
2409
2500
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
2410
2501
  scoreLabel,
@@ -2413,7 +2504,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2413
2504
  );
2414
2505
  }
2415
2506
  }
2416
- console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
2507
+ console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2417
2508
  }
2418
2509
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2419
2510
  return new Promise((resolve5, reject) => {