@m4trix/evals 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -333,20 +333,70 @@ function getMetricById(id) {
333
333
 
334
334
  // src/evals/score.ts
335
335
  var registry2 = /* @__PURE__ */ new Map();
336
/**
 * Render score data as display text using the formatters on a score definition.
 *
 * @param {object} def - Score definition exposing `formatValue` and, optionally,
 *   `formatAggregate`.
 * @param {*} data - Score payload handed to the selected formatter unchanged.
 * @param {{ isAggregated?: boolean }} [options] - When `isAggregated` is truthy
 *   the aggregate formatter is preferred.
 * @returns {*} Whatever the selected formatter returns.
 */
function formatScoreData(def, data, options) {
  // A definition may have been created without `formatAggregate`; in that case
  // fall back to the per-value formatter instead of throwing a TypeError.
  if (options?.isAggregated && typeof def.formatAggregate === "function") {
    return def.formatAggregate(data);
  }
  return def.formatValue(data);
}
339
/**
 * Built-in aggregation strategies for combining the data payloads of several
 * runs of the same score into one payload. None of the members rely on `this`,
 * so they can be passed around as plain function references.
 */
var ScoreAggregate = {
  /**
   * Average numeric fields. Use for scores like { value, delta }.
   *
   * @param {string[]} fields - Names of the fields to average.
   * @returns {(values: object[]) => object} Aggregator producing an object with
   *   one averaged entry per requested field. A field missing (null/undefined)
   *   on a value counts as 0; an empty input yields 0 for every field.
   */
  averageFields(fields) {
    return (values) => {
      // Divide by 1 for empty input so the result is 0 rather than NaN.
      const count = values.length || 1;
      const result = {};
      for (const field of fields) {
        result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
      }
      return result;
    };
  },
  /**
   * Average `value` with sample std dev. Use for percent-style scores.
   *
   * @param {Array<{ value: number }>} values - Payloads to aggregate.
   * @returns {object} The first payload's fields overlaid with `value` (mean),
   *   `stdDev` (sample standard deviation, `undefined` for fewer than two
   *   values) and `count`. Empty input yields `{ value: 0, stdDev: undefined, count: 0 }`.
   */
  averageWithVariance(values) {
    if (values.length === 0) {
      return { value: 0, stdDev: void 0, count: 0 };
    }
    const count = values.length;
    const mean = values.reduce((s, v) => s + v.value, 0) / count;
    let stdDev;
    if (count >= 2) {
      // Two-pass variance: summing squared deviations from the mean avoids the
      // catastrophic cancellation of `sumSq - n * mean^2` when the values are
      // large relative to their spread.
      const squaredDeviations = values.reduce((s, v) => {
        const deviation = v.value - mean;
        return s + deviation * deviation;
      }, 0);
      const variance = squaredDeviations / (count - 1);
      // Anything that is not a positive number (including NaN) reports 0.
      stdDev = variance > 0 ? Math.sqrt(variance) : 0;
    }
    return { ...values[0], value: mean, stdDev, count };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param {Array<{ passed?: boolean }>} values - Payloads to aggregate.
   * @returns {object} The first payload's fields overlaid with `passed` (true
   *   only when there is at least one value and every value passed),
   *   `passedCount` and `totalCount`.
   */
  all(values) {
    const total = values.length;
    const passedCount = values.filter((v) => v.passed).length;
    return {
      ...values[0],
      passed: total > 0 && values.every((v) => v.passed),
      passedCount,
      totalCount: total
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param {object[]} values - Payloads to pick from.
   * @returns {object} The last payload, or `{}` when there is none.
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
336
382
  var Score = {
383
+ aggregate: ScoreAggregate,
337
384
  of(config) {
338
385
  const def = {
339
386
  id: config.id,
340
387
  name: config.name,
341
388
  displayStrategy: config.displayStrategy,
342
- aggregate: config.aggregate,
343
- format: config.format,
389
+ formatValue: config.formatValue,
390
+ formatAggregate: config.formatAggregate,
391
+ aggregateValues: config.aggregateValues,
344
392
  make: (data, options) => {
345
393
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
346
394
  return {
347
395
  id: config.id,
348
396
  data,
349
- ...passed !== void 0 && { passed }
397
+ ...passed !== void 0 && { passed },
398
+ def
399
+ // Attach def so rendering/aggregation works without registry lookup
350
400
  };
351
401
  }
352
402
  };
@@ -359,29 +409,6 @@ function getScoreById(id) {
359
409
  }
360
410
 
361
411
  // src/evals/aggregators.ts
362
- function aggregateAverageWithVariance(values) {
363
- if (values.length === 0) {
364
- return { value: 0, count: 0 };
365
- }
366
- const sum = values.reduce((s, v) => s + v.value, 0);
367
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
368
- const mean = sum / values.length;
369
- let stdDev;
370
- if (values.length >= 2) {
371
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
372
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
373
- }
374
- return { value: mean, stdDev, count: values.length };
375
- }
376
- function aggregateAll(values) {
377
- const total = values.length;
378
- const passedCount = values.filter((v) => v.passed).length;
379
- return {
380
- passed: total > 0 && values.every((v) => v.passed),
381
- passedCount,
382
- totalCount: total
383
- };
384
- }
385
412
  function aggregateTokenCountSum(values) {
386
413
  const initial = {
387
414
  input: 0,
@@ -434,40 +461,45 @@ Score.of({
434
461
  id: "percent",
435
462
  name: "Score",
436
463
  displayStrategy: "bar",
437
- format: (data, options) => {
438
- if (options?.isAggregated) {
439
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
440
- }
441
- return data.value.toFixed(2);
442
- },
443
- aggregate: aggregateAverageWithVariance
464
+ formatValue: (data) => data.value.toFixed(2),
465
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
466
+ aggregateValues: Score.aggregate.averageWithVariance
467
+ });
468
// Registers the built-in "delta" score: a numeric `value` shown together with
// its signed `delta`. Aggregation averages both fields across runs.
Score.of({
  id: "delta",
  name: "Delta",
  displayStrategy: "number",
  // Single run: "<value> (<signed delta> vs baseline)".
  formatValue: (data) => {
    const valueText = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    const deltaText = data.delta.toFixed(2);
    return `${valueText} (${sign}${deltaText} vs baseline)`;
  },
  // Aggregated runs: "Avg: <value> (Delta: <signed delta>)".
  formatAggregate: (data) => {
    const valueText = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    const deltaText = data.delta.toFixed(2);
    return `Avg: ${valueText} (Delta: ${sign}${deltaText})`;
  },
  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
});
445
476
  Score.of({
446
477
  id: "binary",
447
478
  name: "Result",
448
479
  displayStrategy: "passFail",
449
- format: (data, options) => {
450
- if (options?.isAggregated) {
451
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
452
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
453
- return `${base} (${data.passedCount}/${data.totalCount})`;
454
- }
455
- return base;
480
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
481
+ formatAggregate: (data) => {
482
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
483
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
484
+ return `${base} (${data.passedCount}/${data.totalCount})`;
456
485
  }
457
- return data.passed ? "PASSED" : "NOT PASSED";
486
+ return base;
458
487
  },
459
- aggregate: aggregateAll
488
+ aggregateValues: Score.aggregate.all
460
489
  });
461
490
 
462
491
  // src/runner/score-utils.ts
492
/**
 * Resolve the score definition for a score item.
 *
 * @param {{ id: string, def?: object }} item - Score item to resolve.
 * @returns {object|undefined} The definition attached to the item when it is
 *   present (not null/undefined); otherwise the result of `getScoreById(item.id)`.
 */
function getScoreDef(item) {
  const attached = item.def;
  if (attached !== null && attached !== void 0) {
    return attached;
  }
  return getScoreById(item.id);
}
463
495
  function aggregateScoreItems(items) {
464
496
  if (items.length === 0)
465
497
  return void 0;
466
- const def = getScoreById(items[0].id);
467
- if (!def?.aggregate)
498
+ const def = getScoreDef(items[0]);
499
+ if (!def?.aggregateValues)
468
500
  return items[items.length - 1];
469
- const aggregated = def.aggregate(items.map((i) => i.data));
470
- return { ...items[0], data: aggregated };
501
+ const aggregated = def.aggregateValues(items.map((i) => i.data));
502
+ return { ...items[0], data: aggregated, def };
471
503
  }
472
504
  function aggregateMetricItems(items) {
473
505
  if (items.length === 0)
@@ -480,7 +512,7 @@ function aggregateMetricItems(items) {
480
512
  }
481
513
  function toNumericScoreFromScores(scores) {
482
514
  for (const item of scores) {
483
- const def = getScoreById(item.id);
515
+ const def = getScoreDef(item);
484
516
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
485
517
  const value = item.data.value;
486
518
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -868,7 +900,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
868
900
  () => appendJsonLine(message.artifactPath, {
869
901
  runId: message.runId,
870
902
  ts: Date.now(),
871
- ...message.payload
903
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
872
904
  })
873
905
  );
874
906
  })
@@ -1454,7 +1486,7 @@ function aggregateEvaluatorScores(events, nameById) {
1454
1486
  if (agg)
1455
1487
  aggregatedScores.push(agg);
1456
1488
  }
1457
- const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([id, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1489
+ const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1458
1490
  const passed = events.every((ev) => {
1459
1491
  const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1460
1492
  return es?.passed ?? false;
@@ -1474,13 +1506,13 @@ function aggregateEvaluatorScores(events, nameById) {
1474
1506
  }
1475
1507
  return result;
1476
1508
  }
1477
- function formatScorePart(item, scoreToColor2, options) {
1478
- const def = getScoreById(item.id);
1509
+ function formatScorePart(item, _scoreToColor, options) {
1510
+ const def = item.def ?? getScoreById(item.id);
1479
1511
  if (!def) {
1480
1512
  const numeric = toNumericScore(item.data);
1481
1513
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
1482
1514
  }
1483
- const formatted = def.format(item.data, options);
1515
+ const formatted = formatScoreData(def, item.data, options);
1484
1516
  if (def.displayStrategy === "bar") {
1485
1517
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1486
1518
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -1538,8 +1570,6 @@ function RunView({
1538
1570
  const done = new Promise((resolve5) => {
1539
1571
  const unsubscribe = runner.subscribeRunEvents((event) => {
1540
1572
  if (event.type === "TestCaseProgress") {
1541
- const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1542
- numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1543
1573
  for (const item of event.evaluatorScores) {
1544
1574
  const numeric = toNumericScoreFromScores(item.scores);
1545
1575
  if (numeric !== void 0) {
@@ -1629,16 +1659,17 @@ function RunView({
1629
1659
  onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
1630
1660
  return;
1631
1661
  }
1662
+ const completed = finalEvent;
1632
1663
  setSummary({
1633
- passedTestCases: finalEvent.passedTestCases,
1634
- failedTestCases: finalEvent.failedTestCases,
1635
- totalTestCases: finalEvent.totalTestCases,
1664
+ passedTestCases: completed.passedTestCases,
1665
+ failedTestCases: completed.failedTestCases,
1666
+ totalTestCases: completed.totalTestCases,
1636
1667
  overallScoreTotal,
1637
1668
  overallScoreSumSq,
1638
1669
  overallScoreCount,
1639
1670
  aggregates: new Map(aggregates),
1640
1671
  scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1641
- artifactPath: finalEvent.artifactPath
1672
+ artifactPath: completed.artifactPath
1642
1673
  });
1643
1674
  setPhase("completed");
1644
1675
  setTimeout(() => onComplete(), 200);
@@ -1844,11 +1875,9 @@ function RunView({
1844
1875
  const aggregated = aggregateScoreItems(items);
1845
1876
  if (!aggregated)
1846
1877
  return null;
1847
- const def = getScoreById(aggregated.id);
1878
+ const def = aggregated.def ?? getScoreById(aggregated.id);
1848
1879
  const label = def ? def.name ?? def.id : aggregated.id;
1849
- const formatted = def?.format(aggregated.data, {
1850
- isAggregated: true
1851
- }) ?? "n/a";
1880
+ const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
1852
1881
  const numeric = toNumericScore(aggregated.data);
1853
1882
  return /* @__PURE__ */ jsxs(
1854
1883
  Text,
@@ -2007,9 +2036,9 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
2007
2036
  const agg = aggregateScoreItems(items);
2008
2037
  if (!agg)
2009
2038
  continue;
2010
- const def = getScoreById(agg.id);
2039
+ const def = agg.def ?? getScoreById(agg.id);
2011
2040
  const label = def ? def.name ?? def.id : agg.id;
2012
- const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
2041
+ const formatted = def ? def.formatAggregate(agg.data) : "n/a";
2013
2042
  const numeric = toNumericScore(agg.data);
2014
2043
  const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
2015
2044
  scoreLines.push(` ${label}: ${colored}`);
@@ -2027,7 +2056,7 @@ function createBar2(value, max = 100, width = 20) {
2027
2056
  const filled = Math.round(safe / max * width);
2028
2057
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
2029
2058
  }
2030
- function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
2059
+ function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
2031
2060
  if (events.length === 0)
2032
2061
  return [];
2033
2062
  const evaluatorIds = new Set(
@@ -2086,14 +2115,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2086
2115
  }
2087
2116
  const scoreLines = [];
2088
2117
  for (const item of scores) {
2089
- const def = getScoreById(item.id);
2118
+ const def = item.def ?? getScoreById(item.id);
2090
2119
  const scoreLabel = def ? def.name ?? def.id : item.id;
2091
2120
  let formatted;
2092
2121
  if (!def) {
2093
2122
  const numeric = toNumericScore(item.data);
2094
2123
  formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
2095
2124
  } else {
2096
- const raw = def.format(item.data, options);
2125
+ const raw = formatScoreData(def, item.data, options);
2097
2126
  switch (def.displayStrategy) {
2098
2127
  case "bar": {
2099
2128
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
@@ -2245,7 +2274,6 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2245
2274
  (s, e) => s + e.durationMs,
2246
2275
  0
2247
2276
  );
2248
- existing.events.every((e) => e.passed);
2249
2277
  const lines = [];
2250
2278
  lines.push(
2251
2279
  `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
@@ -2323,18 +2351,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2323
2351
  if (finalEvent.type === "RunFailed") {
2324
2352
  throw new Error(`Run failed: ${finalEvent.errorMessage}`);
2325
2353
  }
2354
+ const completed = finalEvent;
2326
2355
  console.log("");
2327
2356
  console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
2328
2357
  console.log(
2329
2358
  `- passed: ${colorize(
2330
- `${finalEvent.passedTestCases}/${finalEvent.totalTestCases}`,
2359
+ `${completed.passedTestCases}/${completed.totalTestCases}`,
2331
2360
  ansi2.green
2332
2361
  )}`
2333
2362
  );
2334
2363
  console.log(
2335
2364
  `- failed: ${colorize(
2336
- `${finalEvent.failedTestCases}/${finalEvent.totalTestCases}`,
2337
- finalEvent.failedTestCases > 0 ? ansi2.red : ansi2.dim
2365
+ `${completed.failedTestCases}/${completed.totalTestCases}`,
2366
+ completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
2338
2367
  )}`
2339
2368
  );
2340
2369
  if (overallScoreCount > 0) {
@@ -2375,10 +2404,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2375
2404
  );
2376
2405
  continue;
2377
2406
  }
2378
- const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2379
- summary.aggregatedScoreItem.data,
2380
- { isAggregated: true }
2381
- ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2407
+ const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? (() => {
2408
+ const def = summary.aggregatedScoreItem.def ?? getScoreById(summary.aggregatedScoreItem.id);
2409
+ return def ? def.formatAggregate(summary.aggregatedScoreItem.data) : summary.averageScore.toFixed(2);
2410
+ })() : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2382
2411
  console.log(
2383
2412
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
2384
2413
  scoreLabel,
@@ -2387,7 +2416,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2387
2416
  );
2388
2417
  }
2389
2418
  }
2390
- console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
2419
+ console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2391
2420
  }
2392
2421
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2393
2422
  return new Promise((resolve5, reject) => {