@m4trix/evals 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -321,7 +321,11 @@ var Metric = {
321
321
  name: config.name,
322
322
  aggregate: config.aggregate,
323
323
  format: config.format,
324
- make: (data) => ({ id: config.id, data })
324
+ make: (data, options) => ({
325
+ id: config.id,
326
+ data,
327
+ ...options?.name !== void 0 && { name: options.name }
328
+ })
325
329
  };
326
330
  registry.set(config.id, def);
327
331
  return def;
@@ -333,20 +337,107 @@ function getMetricById(id) {
333
337
 
334
338
  // src/evals/score.ts
335
339
  var registry2 = /* @__PURE__ */ new Map();
340
+ function formatScoreData(def, data, options) {
341
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
342
+ }
343
+ var ScoreAggregate = {
344
+ /** Average numeric fields. Use for scores like { value, delta }. */
345
+ averageFields(fields) {
346
+ return (values) => {
347
+ const count = values.length || 1;
348
+ const result = {};
349
+ for (const field of fields) {
350
+ result[field] = values.reduce(
351
+ (s, v) => s + (v[field] ?? 0),
352
+ 0
353
+ ) / count;
354
+ }
355
+ return result;
356
+ };
357
+ },
358
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
359
+ averageWithVariance(fields) {
360
+ return (values) => {
361
+ const count = values.length;
362
+ const result = {};
363
+ for (const field of fields) {
364
+ result[field] = count === 0 ? 0 : values.reduce(
365
+ (sum, item) => sum + (item[field] ?? 0),
366
+ 0
367
+ ) / count;
368
+ }
369
+ const valueField = "value";
370
+ const hasValueField = fields.includes(valueField);
371
+ if (count === 0) {
372
+ if (hasValueField) {
373
+ result[valueField] = 0;
374
+ }
375
+ return {
376
+ ...result,
377
+ stdDev: void 0,
378
+ count: 0
379
+ };
380
+ }
381
+ let stdDev;
382
+ if (hasValueField && count >= 2) {
383
+ const sum = values.reduce(
384
+ (s, v) => s + (v[valueField] ?? 0),
385
+ 0
386
+ );
387
+ const sumSq = values.reduce(
388
+ (s, v) => {
389
+ const value = v[valueField] ?? 0;
390
+ return s + value * value;
391
+ },
392
+ 0
393
+ );
394
+ const mean = sum / count;
395
+ const variance = (sumSq - count * mean * mean) / (count - 1);
396
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
397
+ }
398
+ return {
399
+ ...values[0],
400
+ ...result,
401
+ stdDev,
402
+ count
403
+ };
404
+ };
405
+ },
406
+ /** All runs must pass. Use for binary scores. */
407
+ all(values) {
408
+ const total = values.length;
409
+ const passedCount = values.filter((v) => v.passed).length;
410
+ return {
411
+ ...values[0],
412
+ passed: total > 0 && values.every((v) => v.passed),
413
+ passedCount,
414
+ totalCount: total
415
+ };
416
+ },
417
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
418
+ last(values) {
419
+ return values[values.length - 1] ?? {};
420
+ }
421
+ };
336
422
  var Score = {
423
+ aggregate: ScoreAggregate,
337
424
  of(config) {
338
425
  const def = {
339
426
  id: config.id,
340
427
  name: config.name,
341
428
  displayStrategy: config.displayStrategy,
342
- aggregate: config.aggregate,
343
- format: config.format,
429
+ formatValue: config.formatValue,
430
+ formatAggregate: config.formatAggregate,
431
+ aggregateValues: config.aggregateValues,
344
432
  make: (data, options) => {
345
433
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
346
434
  return {
347
435
  id: config.id,
348
436
  data,
349
- ...passed !== void 0 && { passed }
437
+ ...passed !== void 0 && { passed },
438
+ ...options?.name !== void 0 && { name: options.name },
439
+ def
440
+ // Attach def so rendering/aggregation works without registry lookup
350
441
  };
351
442
  }
352
443
  };
@@ -359,29 +450,6 @@ function getScoreById(id) {
359
450
  }
360
451
 
361
452
  // src/evals/aggregators.ts
362
- function aggregateAverageWithVariance(values) {
363
- if (values.length === 0) {
364
- return { value: 0, count: 0 };
365
- }
366
- const sum = values.reduce((s, v) => s + v.value, 0);
367
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
368
- const mean = sum / values.length;
369
- let stdDev;
370
- if (values.length >= 2) {
371
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
372
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
373
- }
374
- return { value: mean, stdDev, count: values.length };
375
- }
376
- function aggregateAll(values) {
377
- const total = values.length;
378
- const passedCount = values.filter((v) => v.passed).length;
379
- return {
380
- passed: total > 0 && values.every((v) => v.passed),
381
- passedCount,
382
- totalCount: total
383
- };
384
- }
385
453
  function aggregateTokenCountSum(values) {
386
454
  const initial = {
387
455
  input: 0,
@@ -434,40 +502,59 @@ Score.of({
434
502
  id: "percent",
435
503
  name: "Score",
436
504
  displayStrategy: "bar",
437
- format: (data, options) => {
438
- if (options?.isAggregated) {
439
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
440
- }
441
- return data.value.toFixed(2);
442
- },
443
- aggregate: aggregateAverageWithVariance
505
+ formatValue: (data) => data.value.toFixed(2),
506
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
507
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
508
+ });
509
+ Score.of({
510
+ id: "delta",
511
+ name: "Delta",
512
+ displayStrategy: "number",
513
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
514
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
515
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
444
516
  });
445
517
  Score.of({
446
518
  id: "binary",
447
519
  name: "Result",
448
520
  displayStrategy: "passFail",
449
- format: (data, options) => {
450
- if (options?.isAggregated) {
451
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
452
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
453
- return `${base} (${data.passedCount}/${data.totalCount})`;
454
- }
455
- return base;
521
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
522
+ formatAggregate: (data) => {
523
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
524
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
525
+ return `${base} (${data.passedCount}/${data.totalCount})`;
456
526
  }
457
- return data.passed ? "PASSED" : "NOT PASSED";
527
+ return base;
458
528
  },
459
- aggregate: aggregateAll
529
+ aggregateValues: Score.aggregate.all
460
530
  });
461
531
 
462
532
  // src/runner/score-utils.ts
533
+ function getScoreDef(item) {
534
+ return item.def ?? getScoreById(item.id);
535
+ }
536
+ function lastNonEmptyName(items) {
537
+ for (let i = items.length - 1; i >= 0; i--) {
538
+ const n = items[i].name;
539
+ if (n != null && n.trim().length > 0)
540
+ return n;
541
+ }
542
+ return void 0;
543
+ }
463
544
  function aggregateScoreItems(items) {
464
545
  if (items.length === 0)
465
546
  return void 0;
466
- const def = getScoreById(items[0].id);
467
- if (!def?.aggregate)
547
+ const def = getScoreDef(items[0]);
548
+ if (!def?.aggregateValues)
468
549
  return items[items.length - 1];
469
- const aggregated = def.aggregate(items.map((i) => i.data));
470
- return { ...items[0], data: aggregated };
550
+ const aggregated = def.aggregateValues(items.map((i) => i.data));
551
+ const nameOverride = lastNonEmptyName(items);
552
+ return {
553
+ ...items[0],
554
+ data: aggregated,
555
+ def,
556
+ ...nameOverride !== void 0 && { name: nameOverride }
557
+ };
471
558
  }
472
559
  function aggregateMetricItems(items) {
473
560
  if (items.length === 0)
@@ -476,11 +563,16 @@ function aggregateMetricItems(items) {
476
563
  if (!def?.aggregate)
477
564
  return items[items.length - 1];
478
565
  const aggregated = def.aggregate(items.map((i) => i.data));
479
- return { ...items[0], data: aggregated };
566
+ const nameOverride = lastNonEmptyName(items);
567
+ return {
568
+ ...items[0],
569
+ data: aggregated,
570
+ ...nameOverride !== void 0 && { name: nameOverride }
571
+ };
480
572
  }
481
573
  function toNumericScoreFromScores(scores) {
482
574
  for (const item of scores) {
483
- const def = getScoreById(item.id);
575
+ const def = getScoreDef(item);
484
576
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
485
577
  const value = item.data.value;
486
578
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -868,7 +960,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
868
960
  () => appendJsonLine(message.artifactPath, {
869
961
  runId: message.runId,
870
962
  ts: Date.now(),
871
- ...message.payload
963
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
872
964
  })
873
965
  );
874
966
  })
@@ -1454,7 +1546,7 @@ function aggregateEvaluatorScores(events, nameById) {
1454
1546
  if (agg)
1455
1547
  aggregatedScores.push(agg);
1456
1548
  }
1457
- const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([id, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1549
+ const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1458
1550
  const passed = events.every((ev) => {
1459
1551
  const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1460
1552
  return es?.passed ?? false;
@@ -1474,13 +1566,13 @@ function aggregateEvaluatorScores(events, nameById) {
1474
1566
  }
1475
1567
  return result;
1476
1568
  }
1477
- function formatScorePart(item, scoreToColor2, options) {
1478
- const def = getScoreById(item.id);
1569
+ function formatScorePart(item, _scoreToColor, options) {
1570
+ const def = item.def ?? getScoreById(item.id);
1479
1571
  if (!def) {
1480
1572
  const numeric = toNumericScore(item.data);
1481
1573
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
1482
1574
  }
1483
- const formatted = def.format(item.data, options);
1575
+ const formatted = formatScoreData(def, item.data, options);
1484
1576
  if (def.displayStrategy === "bar") {
1485
1577
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1486
1578
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -1538,8 +1630,6 @@ function RunView({
1538
1630
  const done = new Promise((resolve5) => {
1539
1631
  const unsubscribe = runner.subscribeRunEvents((event) => {
1540
1632
  if (event.type === "TestCaseProgress") {
1541
- const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1542
- numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1543
1633
  for (const item of event.evaluatorScores) {
1544
1634
  const numeric = toNumericScoreFromScores(item.scores);
1545
1635
  if (numeric !== void 0) {
@@ -1629,16 +1719,17 @@ function RunView({
1629
1719
  onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
1630
1720
  return;
1631
1721
  }
1722
+ const completed = finalEvent;
1632
1723
  setSummary({
1633
- passedTestCases: finalEvent.passedTestCases,
1634
- failedTestCases: finalEvent.failedTestCases,
1635
- totalTestCases: finalEvent.totalTestCases,
1724
+ passedTestCases: completed.passedTestCases,
1725
+ failedTestCases: completed.failedTestCases,
1726
+ totalTestCases: completed.totalTestCases,
1636
1727
  overallScoreTotal,
1637
1728
  overallScoreSumSq,
1638
1729
  overallScoreCount,
1639
1730
  aggregates: new Map(aggregates),
1640
1731
  scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1641
- artifactPath: finalEvent.artifactPath
1732
+ artifactPath: completed.artifactPath
1642
1733
  });
1643
1734
  setPhase("completed");
1644
1735
  setTimeout(() => onComplete(), 200);
@@ -1729,9 +1820,10 @@ function RunView({
1729
1820
  const formatted = def.format(m.data, {
1730
1821
  isAggregated: tc.isAggregated
1731
1822
  });
1823
+ const label = m.name ?? def.name;
1732
1824
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1733
1825
  "[",
1734
- def.name ? `${def.name}: ` : "",
1826
+ label ? `${label}: ` : "",
1735
1827
  formatted,
1736
1828
  "]",
1737
1829
  " "
@@ -1740,8 +1832,8 @@ function RunView({
1740
1832
  ] }) : null
1741
1833
  ] }),
1742
1834
  item.scores.length > 0 ? item.scores.map((s, idx) => {
1743
- const def = getScoreById(s.id);
1744
- const scoreLabel = def ? def.name ?? def.id : s.id;
1835
+ const def = s.def ?? getScoreById(s.id);
1836
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
1745
1837
  return /* @__PURE__ */ jsxs(
1746
1838
  Text,
1747
1839
  {
@@ -1844,11 +1936,9 @@ function RunView({
1844
1936
  const aggregated = aggregateScoreItems(items);
1845
1937
  if (!aggregated)
1846
1938
  return null;
1847
- const def = getScoreById(aggregated.id);
1848
- const label = def ? def.name ?? def.id : aggregated.id;
1849
- const formatted = def?.format(aggregated.data, {
1850
- isAggregated: true
1851
- }) ?? "n/a";
1939
+ const def = aggregated.def ?? getScoreById(aggregated.id);
1940
+ const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
1941
+ const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
1852
1942
  const numeric = toNumericScore(aggregated.data);
1853
1943
  return /* @__PURE__ */ jsxs(
1854
1944
  Text,
@@ -2007,9 +2097,9 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
2007
2097
  const agg = aggregateScoreItems(items);
2008
2098
  if (!agg)
2009
2099
  continue;
2010
- const def = getScoreById(agg.id);
2011
- const label = def ? def.name ?? def.id : agg.id;
2012
- const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
2100
+ const def = agg.def ?? getScoreById(agg.id);
2101
+ const label = agg.name ?? def?.name ?? def?.id ?? agg.id;
2102
+ const formatted = def ? def.formatAggregate(agg.data) : "n/a";
2013
2103
  const numeric = toNumericScore(agg.data);
2014
2104
  const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
2015
2105
  scoreLines.push(` ${label}: ${colored}`);
@@ -2027,7 +2117,7 @@ function createBar2(value, max = 100, width = 20) {
2027
2117
  const filled = Math.round(safe / max * width);
2028
2118
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
2029
2119
  }
2030
- function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
2120
+ function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
2031
2121
  if (events.length === 0)
2032
2122
  return [];
2033
2123
  const evaluatorIds = new Set(
@@ -2074,26 +2164,27 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2074
2164
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
2075
2165
  const metricParts = [];
2076
2166
  if (metrics && metrics.length > 0) {
2077
- for (const { id, data } of metrics) {
2078
- const def = getMetricById(id);
2167
+ for (const m of metrics) {
2168
+ const def = getMetricById(m.id);
2079
2169
  if (def) {
2080
- const formatted = def.format(data, options);
2170
+ const formatted = def.format(m.data, options);
2171
+ const label = m.name ?? def.name;
2081
2172
  metricParts.push(
2082
- def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
2173
+ label ? `[${label}: ${formatted}]` : `[${formatted}]`
2083
2174
  );
2084
2175
  }
2085
2176
  }
2086
2177
  }
2087
2178
  const scoreLines = [];
2088
2179
  for (const item of scores) {
2089
- const def = getScoreById(item.id);
2090
- const scoreLabel = def ? def.name ?? def.id : item.id;
2180
+ const def = item.def ?? getScoreById(item.id);
2181
+ const scoreLabel = item.name ?? def?.name ?? def?.id ?? item.id;
2091
2182
  let formatted;
2092
2183
  if (!def) {
2093
2184
  const numeric = toNumericScore(item.data);
2094
2185
  formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
2095
2186
  } else {
2096
- const raw = def.format(item.data, options);
2187
+ const raw = formatScoreData(def, item.data, options);
2097
2188
  switch (def.displayStrategy) {
2098
2189
  case "bar": {
2099
2190
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
@@ -2245,7 +2336,6 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2245
2336
  (s, e) => s + e.durationMs,
2246
2337
  0
2247
2338
  );
2248
- existing.events.every((e) => e.passed);
2249
2339
  const lines = [];
2250
2340
  lines.push(
2251
2341
  `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
@@ -2323,18 +2413,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2323
2413
  if (finalEvent.type === "RunFailed") {
2324
2414
  throw new Error(`Run failed: ${finalEvent.errorMessage}`);
2325
2415
  }
2416
+ const completed = finalEvent;
2326
2417
  console.log("");
2327
2418
  console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
2328
2419
  console.log(
2329
2420
  `- passed: ${colorize(
2330
- `${finalEvent.passedTestCases}/${finalEvent.totalTestCases}`,
2421
+ `${completed.passedTestCases}/${completed.totalTestCases}`,
2331
2422
  ansi2.green
2332
2423
  )}`
2333
2424
  );
2334
2425
  console.log(
2335
2426
  `- failed: ${colorize(
2336
- `${finalEvent.failedTestCases}/${finalEvent.totalTestCases}`,
2337
- finalEvent.failedTestCases > 0 ? ansi2.red : ansi2.dim
2427
+ `${completed.failedTestCases}/${completed.totalTestCases}`,
2428
+ completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
2338
2429
  )}`
2339
2430
  );
2340
2431
  if (overallScoreCount > 0) {
@@ -2375,10 +2466,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2375
2466
  );
2376
2467
  continue;
2377
2468
  }
2378
- const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2379
- summary.aggregatedScoreItem.data,
2380
- { isAggregated: true }
2381
- ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2469
+ const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? (() => {
2470
+ const def = summary.aggregatedScoreItem.def ?? getScoreById(summary.aggregatedScoreItem.id);
2471
+ return def ? def.formatAggregate(summary.aggregatedScoreItem.data) : summary.averageScore.toFixed(2);
2472
+ })() : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2382
2473
  console.log(
2383
2474
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
2384
2475
  scoreLabel,
@@ -2387,7 +2478,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2387
2478
  );
2388
2479
  }
2389
2480
  }
2390
- console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
2481
+ console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2391
2482
  }
2392
2483
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2393
2484
  return new Promise((resolve5, reject) => {