@m4trix/evals 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -359,20 +359,70 @@ function getMetricById(id) {
359
359
 
360
360
  // src/evals/score.ts
361
361
  var registry2 = /* @__PURE__ */ new Map();
362
+ function formatScoreData(def, data, options) {
363
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
364
+ }
365
+ var ScoreAggregate = {
366
+ /** Average numeric fields. Use for scores like { value, delta }. */
367
+ averageFields(fields) {
368
+ return (values) => {
369
+ const count = values.length || 1;
370
+ const result = {};
371
+ for (const field of fields) {
372
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
373
+ }
374
+ return result;
375
+ };
376
+ },
377
+ /** Average `value` with sample std dev. Use for percent-style scores. */
378
+ averageWithVariance(values) {
379
+ if (values.length === 0) {
380
+ return { value: 0, stdDev: void 0, count: 0 };
381
+ }
382
+ const sum = values.reduce((s, v) => s + v.value, 0);
383
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
384
+ const mean = sum / values.length;
385
+ let stdDev;
386
+ if (values.length >= 2) {
387
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
388
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
389
+ }
390
+ return { ...values[0], value: mean, stdDev, count: values.length };
391
+ },
392
+ /** All runs must pass. Use for binary scores. */
393
+ all(values) {
394
+ const total = values.length;
395
+ const passedCount = values.filter((v) => v.passed).length;
396
+ return {
397
+ ...values[0],
398
+ passed: total > 0 && values.every((v) => v.passed),
399
+ passedCount,
400
+ totalCount: total
401
+ };
402
+ },
403
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
404
+ last(values) {
405
+ return values[values.length - 1] ?? {};
406
+ }
407
+ };
362
408
  var Score = {
409
+ aggregate: ScoreAggregate,
363
410
  of(config) {
364
411
  const def = {
365
412
  id: config.id,
366
413
  name: config.name,
367
414
  displayStrategy: config.displayStrategy,
368
- aggregate: config.aggregate,
369
- format: config.format,
415
+ formatValue: config.formatValue,
416
+ formatAggregate: config.formatAggregate,
417
+ aggregateValues: config.aggregateValues,
370
418
  make: (data, options) => {
371
419
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
372
420
  return {
373
421
  id: config.id,
374
422
  data,
375
- ...passed !== void 0 && { passed }
423
+ ...passed !== void 0 && { passed },
424
+ def
425
+ // Attach def so rendering/aggregation works without registry lookup
376
426
  };
377
427
  }
378
428
  };
@@ -385,29 +435,6 @@ function getScoreById(id) {
385
435
  }
386
436
 
387
437
  // src/evals/aggregators.ts
388
- function aggregateAverageWithVariance(values) {
389
- if (values.length === 0) {
390
- return { value: 0, count: 0 };
391
- }
392
- const sum = values.reduce((s, v) => s + v.value, 0);
393
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
394
- const mean = sum / values.length;
395
- let stdDev;
396
- if (values.length >= 2) {
397
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
398
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
399
- }
400
- return { value: mean, stdDev, count: values.length };
401
- }
402
- function aggregateAll(values) {
403
- const total = values.length;
404
- const passedCount = values.filter((v) => v.passed).length;
405
- return {
406
- passed: total > 0 && values.every((v) => v.passed),
407
- passedCount,
408
- totalCount: total
409
- };
410
- }
411
438
  function aggregateTokenCountSum(values) {
412
439
  const initial = {
413
440
  input: 0,
@@ -460,40 +487,45 @@ Score.of({
460
487
  id: "percent",
461
488
  name: "Score",
462
489
  displayStrategy: "bar",
463
- format: (data, options) => {
464
- if (options?.isAggregated) {
465
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
466
- }
467
- return data.value.toFixed(2);
468
- },
469
- aggregate: aggregateAverageWithVariance
490
+ formatValue: (data) => data.value.toFixed(2),
491
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
492
+ aggregateValues: Score.aggregate.averageWithVariance
493
+ });
494
+ Score.of({
495
+ id: "delta",
496
+ name: "Delta",
497
+ displayStrategy: "number",
498
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
499
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
500
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
470
501
  });
471
502
  Score.of({
472
503
  id: "binary",
473
504
  name: "Result",
474
505
  displayStrategy: "passFail",
475
- format: (data, options) => {
476
- if (options?.isAggregated) {
477
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
478
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
479
- return `${base} (${data.passedCount}/${data.totalCount})`;
480
- }
481
- return base;
506
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
507
+ formatAggregate: (data) => {
508
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
509
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
510
+ return `${base} (${data.passedCount}/${data.totalCount})`;
482
511
  }
483
- return data.passed ? "PASSED" : "NOT PASSED";
512
+ return base;
484
513
  },
485
- aggregate: aggregateAll
514
+ aggregateValues: Score.aggregate.all
486
515
  });
487
516
 
488
517
  // src/runner/score-utils.ts
518
+ function getScoreDef(item) {
519
+ return item.def ?? getScoreById(item.id);
520
+ }
489
521
  function aggregateScoreItems(items) {
490
522
  if (items.length === 0)
491
523
  return void 0;
492
- const def = getScoreById(items[0].id);
493
- if (!def?.aggregate)
524
+ const def = getScoreDef(items[0]);
525
+ if (!def?.aggregateValues)
494
526
  return items[items.length - 1];
495
- const aggregated = def.aggregate(items.map((i) => i.data));
496
- return { ...items[0], data: aggregated };
527
+ const aggregated = def.aggregateValues(items.map((i) => i.data));
528
+ return { ...items[0], data: aggregated, def };
497
529
  }
498
530
  function aggregateMetricItems(items) {
499
531
  if (items.length === 0)
@@ -506,7 +538,7 @@ function aggregateMetricItems(items) {
506
538
  }
507
539
  function toNumericScoreFromScores(scores) {
508
540
  for (const item of scores) {
509
- const def = getScoreById(item.id);
541
+ const def = getScoreDef(item);
510
542
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
511
543
  const value = item.data.value;
512
544
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -894,7 +926,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
894
926
  () => appendJsonLine(message.artifactPath, {
895
927
  runId: message.runId,
896
928
  ts: Date.now(),
897
- ...message.payload
929
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
898
930
  })
899
931
  );
900
932
  })
@@ -1480,7 +1512,7 @@ function aggregateEvaluatorScores(events, nameById) {
1480
1512
  if (agg)
1481
1513
  aggregatedScores.push(agg);
1482
1514
  }
1483
- const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([id, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1515
+ const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1484
1516
  const passed = events.every((ev) => {
1485
1517
  const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1486
1518
  return es?.passed ?? false;
@@ -1500,13 +1532,13 @@ function aggregateEvaluatorScores(events, nameById) {
1500
1532
  }
1501
1533
  return result;
1502
1534
  }
1503
- function formatScorePart(item, scoreToColor2, options) {
1504
- const def = getScoreById(item.id);
1535
+ function formatScorePart(item, _scoreToColor, options) {
1536
+ const def = item.def ?? getScoreById(item.id);
1505
1537
  if (!def) {
1506
1538
  const numeric = toNumericScore(item.data);
1507
1539
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
1508
1540
  }
1509
- const formatted = def.format(item.data, options);
1541
+ const formatted = formatScoreData(def, item.data, options);
1510
1542
  if (def.displayStrategy === "bar") {
1511
1543
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1512
1544
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -1564,8 +1596,6 @@ function RunView({
1564
1596
  const done = new Promise((resolve5) => {
1565
1597
  const unsubscribe = runner.subscribeRunEvents((event) => {
1566
1598
  if (event.type === "TestCaseProgress") {
1567
- const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1568
- numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1569
1599
  for (const item of event.evaluatorScores) {
1570
1600
  const numeric = toNumericScoreFromScores(item.scores);
1571
1601
  if (numeric !== void 0) {
@@ -1655,16 +1685,17 @@ function RunView({
1655
1685
  onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
1656
1686
  return;
1657
1687
  }
1688
+ const completed = finalEvent;
1658
1689
  setSummary({
1659
- passedTestCases: finalEvent.passedTestCases,
1660
- failedTestCases: finalEvent.failedTestCases,
1661
- totalTestCases: finalEvent.totalTestCases,
1690
+ passedTestCases: completed.passedTestCases,
1691
+ failedTestCases: completed.failedTestCases,
1692
+ totalTestCases: completed.totalTestCases,
1662
1693
  overallScoreTotal,
1663
1694
  overallScoreSumSq,
1664
1695
  overallScoreCount,
1665
1696
  aggregates: new Map(aggregates),
1666
1697
  scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1667
- artifactPath: finalEvent.artifactPath
1698
+ artifactPath: completed.artifactPath
1668
1699
  });
1669
1700
  setPhase("completed");
1670
1701
  setTimeout(() => onComplete(), 200);
@@ -1870,11 +1901,9 @@ function RunView({
1870
1901
  const aggregated = aggregateScoreItems(items);
1871
1902
  if (!aggregated)
1872
1903
  return null;
1873
- const def = getScoreById(aggregated.id);
1904
+ const def = aggregated.def ?? getScoreById(aggregated.id);
1874
1905
  const label = def ? def.name ?? def.id : aggregated.id;
1875
- const formatted = def?.format(aggregated.data, {
1876
- isAggregated: true
1877
- }) ?? "n/a";
1906
+ const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
1878
1907
  const numeric = toNumericScore(aggregated.data);
1879
1908
  return /* @__PURE__ */ jsxRuntime.jsxs(
1880
1909
  ink.Text,
@@ -2033,9 +2062,9 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
2033
2062
  const agg = aggregateScoreItems(items);
2034
2063
  if (!agg)
2035
2064
  continue;
2036
- const def = getScoreById(agg.id);
2065
+ const def = agg.def ?? getScoreById(agg.id);
2037
2066
  const label = def ? def.name ?? def.id : agg.id;
2038
- const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
2067
+ const formatted = def ? def.formatAggregate(agg.data) : "n/a";
2039
2068
  const numeric = toNumericScore(agg.data);
2040
2069
  const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
2041
2070
  scoreLines.push(` ${label}: ${colored}`);
@@ -2053,7 +2082,7 @@ function createBar2(value, max = 100, width = 20) {
2053
2082
  const filled = Math.round(safe / max * width);
2054
2083
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
2055
2084
  }
2056
- function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
2085
+ function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
2057
2086
  if (events.length === 0)
2058
2087
  return [];
2059
2088
  const evaluatorIds = new Set(
@@ -2112,14 +2141,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2112
2141
  }
2113
2142
  const scoreLines = [];
2114
2143
  for (const item of scores) {
2115
- const def = getScoreById(item.id);
2144
+ const def = item.def ?? getScoreById(item.id);
2116
2145
  const scoreLabel = def ? def.name ?? def.id : item.id;
2117
2146
  let formatted;
2118
2147
  if (!def) {
2119
2148
  const numeric = toNumericScore(item.data);
2120
2149
  formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
2121
2150
  } else {
2122
- const raw = def.format(item.data, options);
2151
+ const raw = formatScoreData(def, item.data, options);
2123
2152
  switch (def.displayStrategy) {
2124
2153
  case "bar": {
2125
2154
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
@@ -2271,7 +2300,6 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2271
2300
  (s, e) => s + e.durationMs,
2272
2301
  0
2273
2302
  );
2274
- existing.events.every((e) => e.passed);
2275
2303
  const lines = [];
2276
2304
  lines.push(
2277
2305
  `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
@@ -2349,18 +2377,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2349
2377
  if (finalEvent.type === "RunFailed") {
2350
2378
  throw new Error(`Run failed: ${finalEvent.errorMessage}`);
2351
2379
  }
2380
+ const completed = finalEvent;
2352
2381
  console.log("");
2353
2382
  console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
2354
2383
  console.log(
2355
2384
  `- passed: ${colorize(
2356
- `${finalEvent.passedTestCases}/${finalEvent.totalTestCases}`,
2385
+ `${completed.passedTestCases}/${completed.totalTestCases}`,
2357
2386
  ansi2.green
2358
2387
  )}`
2359
2388
  );
2360
2389
  console.log(
2361
2390
  `- failed: ${colorize(
2362
- `${finalEvent.failedTestCases}/${finalEvent.totalTestCases}`,
2363
- finalEvent.failedTestCases > 0 ? ansi2.red : ansi2.dim
2391
+ `${completed.failedTestCases}/${completed.totalTestCases}`,
2392
+ completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
2364
2393
  )}`
2365
2394
  );
2366
2395
  if (overallScoreCount > 0) {
@@ -2401,10 +2430,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2401
2430
  );
2402
2431
  continue;
2403
2432
  }
2404
- const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2405
- summary.aggregatedScoreItem.data,
2406
- { isAggregated: true }
2407
- ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2433
+ const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? (() => {
2434
+ const def = summary.aggregatedScoreItem.def ?? getScoreById(summary.aggregatedScoreItem.id);
2435
+ return def ? def.formatAggregate(summary.aggregatedScoreItem.data) : summary.averageScore.toFixed(2);
2436
+ })() : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2408
2437
  console.log(
2409
2438
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
2410
2439
  scoreLabel,
@@ -2413,7 +2442,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2413
2442
  );
2414
2443
  }
2415
2444
  }
2416
- console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
2445
+ console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2417
2446
  }
2418
2447
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2419
2448
  return new Promise((resolve5, reject) => {