@m4trix/evals 0.18.0 → 0.19.0

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -321,7 +321,11 @@ var Metric = {
321
321
  name: config.name,
322
322
  aggregate: config.aggregate,
323
323
  format: config.format,
324
- make: (data) => ({ id: config.id, data })
324
+ make: (data, options) => ({
325
+ id: config.id,
326
+ data,
327
+ ...options?.name !== void 0 && { name: options.name }
328
+ })
325
329
  };
326
330
  registry.set(config.id, def);
327
331
  return def;
@@ -343,25 +347,61 @@ var ScoreAggregate = {
343
347
  const count = values.length || 1;
344
348
  const result = {};
345
349
  for (const field of fields) {
346
- result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
350
+ result[field] = values.reduce(
351
+ (s, v) => s + (v[field] ?? 0),
352
+ 0
353
+ ) / count;
347
354
  }
348
355
  return result;
349
356
  };
350
357
  },
351
- /** Average `value` with sample std dev. Use for percent-style scores. */
352
- averageWithVariance(values) {
353
- if (values.length === 0) {
354
- return { value: 0, stdDev: void 0, count: 0 };
355
- }
356
- const sum = values.reduce((s, v) => s + v.value, 0);
357
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
358
- const mean = sum / values.length;
359
- let stdDev;
360
- if (values.length >= 2) {
361
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
362
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
363
- }
364
- return { ...values[0], value: mean, stdDev, count: values.length };
358
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
359
+ averageWithVariance(fields) {
360
+ return (values) => {
361
+ const count = values.length;
362
+ const result = {};
363
+ for (const field of fields) {
364
+ result[field] = count === 0 ? 0 : values.reduce(
365
+ (sum, item) => sum + (item[field] ?? 0),
366
+ 0
367
+ ) / count;
368
+ }
369
+ const valueField = "value";
370
+ const hasValueField = fields.includes(valueField);
371
+ if (count === 0) {
372
+ if (hasValueField) {
373
+ result[valueField] = 0;
374
+ }
375
+ return {
376
+ ...result,
377
+ stdDev: void 0,
378
+ count: 0
379
+ };
380
+ }
381
+ let stdDev;
382
+ if (hasValueField && count >= 2) {
383
+ const sum = values.reduce(
384
+ (s, v) => s + (v[valueField] ?? 0),
385
+ 0
386
+ );
387
+ const sumSq = values.reduce(
388
+ (s, v) => {
389
+ const value = v[valueField] ?? 0;
390
+ return s + value * value;
391
+ },
392
+ 0
393
+ );
394
+ const mean = sum / count;
395
+ const variance = (sumSq - count * mean * mean) / (count - 1);
396
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
397
+ }
398
+ return {
399
+ ...values[0],
400
+ ...result,
401
+ stdDev,
402
+ count
403
+ };
404
+ };
365
405
  },
366
406
  /** All runs must pass. Use for binary scores. */
367
407
  all(values) {
@@ -395,6 +435,7 @@ var Score = {
395
435
  id: config.id,
396
436
  data,
397
437
  ...passed !== void 0 && { passed },
438
+ ...options?.name !== void 0 && { name: options.name },
398
439
  def
399
440
  // Attach def so rendering/aggregation works without registry lookup
400
441
  };
@@ -463,7 +504,7 @@ Score.of({
463
504
  displayStrategy: "bar",
464
505
  formatValue: (data) => data.value.toFixed(2),
465
506
  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
466
- aggregateValues: Score.aggregate.averageWithVariance
507
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
467
508
  });
468
509
  Score.of({
469
510
  id: "delta",
@@ -492,6 +533,14 @@ Score.of({
492
533
  function getScoreDef(item) {
493
534
  return item.def ?? getScoreById(item.id);
494
535
  }
536
+ function lastNonEmptyName(items) {
537
+ for (let i = items.length - 1; i >= 0; i--) {
538
+ const n = items[i].name;
539
+ if (n != null && n.trim().length > 0)
540
+ return n;
541
+ }
542
+ return void 0;
543
+ }
495
544
  function aggregateScoreItems(items) {
496
545
  if (items.length === 0)
497
546
  return void 0;
@@ -499,7 +548,13 @@ function aggregateScoreItems(items) {
499
548
  if (!def?.aggregateValues)
500
549
  return items[items.length - 1];
501
550
  const aggregated = def.aggregateValues(items.map((i) => i.data));
502
- return { ...items[0], data: aggregated, def };
551
+ const nameOverride = lastNonEmptyName(items);
552
+ return {
553
+ ...items[0],
554
+ data: aggregated,
555
+ def,
556
+ ...nameOverride !== void 0 && { name: nameOverride }
557
+ };
503
558
  }
504
559
  function aggregateMetricItems(items) {
505
560
  if (items.length === 0)
@@ -508,7 +563,12 @@ function aggregateMetricItems(items) {
508
563
  if (!def?.aggregate)
509
564
  return items[items.length - 1];
510
565
  const aggregated = def.aggregate(items.map((i) => i.data));
511
- return { ...items[0], data: aggregated };
566
+ const nameOverride = lastNonEmptyName(items);
567
+ return {
568
+ ...items[0],
569
+ data: aggregated,
570
+ ...nameOverride !== void 0 && { name: nameOverride }
571
+ };
512
572
  }
513
573
  function toNumericScoreFromScores(scores) {
514
574
  for (const item of scores) {
@@ -1760,9 +1820,10 @@ function RunView({
1760
1820
  const formatted = def.format(m.data, {
1761
1821
  isAggregated: tc.isAggregated
1762
1822
  });
1823
+ const label = m.name ?? def.name;
1763
1824
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1764
1825
  "[",
1765
- def.name ? `${def.name}: ` : "",
1826
+ label ? `${label}: ` : "",
1766
1827
  formatted,
1767
1828
  "]",
1768
1829
  " "
@@ -1771,8 +1832,8 @@ function RunView({
1771
1832
  ] }) : null
1772
1833
  ] }),
1773
1834
  item.scores.length > 0 ? item.scores.map((s, idx) => {
1774
- const def = getScoreById(s.id);
1775
- const scoreLabel = def ? def.name ?? def.id : s.id;
1835
+ const def = s.def ?? getScoreById(s.id);
1836
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
1776
1837
  return /* @__PURE__ */ jsxs(
1777
1838
  Text,
1778
1839
  {
@@ -1876,7 +1937,7 @@ function RunView({
1876
1937
  if (!aggregated)
1877
1938
  return null;
1878
1939
  const def = aggregated.def ?? getScoreById(aggregated.id);
1879
- const label = def ? def.name ?? def.id : aggregated.id;
1940
+ const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
1880
1941
  const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
1881
1942
  const numeric = toNumericScore(aggregated.data);
1882
1943
  return /* @__PURE__ */ jsxs(
@@ -2037,7 +2098,7 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
2037
2098
  if (!agg)
2038
2099
  continue;
2039
2100
  const def = agg.def ?? getScoreById(agg.id);
2040
- const label = def ? def.name ?? def.id : agg.id;
2101
+ const label = agg.name ?? def?.name ?? def?.id ?? agg.id;
2041
2102
  const formatted = def ? def.formatAggregate(agg.data) : "n/a";
2042
2103
  const numeric = toNumericScore(agg.data);
2043
2104
  const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
@@ -2103,12 +2164,13 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2103
2164
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
2104
2165
  const metricParts = [];
2105
2166
  if (metrics && metrics.length > 0) {
2106
- for (const { id, data } of metrics) {
2107
- const def = getMetricById(id);
2167
+ for (const m of metrics) {
2168
+ const def = getMetricById(m.id);
2108
2169
  if (def) {
2109
- const formatted = def.format(data, options);
2170
+ const formatted = def.format(m.data, options);
2171
+ const label = m.name ?? def.name;
2110
2172
  metricParts.push(
2111
- def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
2173
+ label ? `[${label}: ${formatted}]` : `[${formatted}]`
2112
2174
  );
2113
2175
  }
2114
2176
  }
@@ -2116,7 +2178,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2116
2178
  const scoreLines = [];
2117
2179
  for (const item of scores) {
2118
2180
  const def = item.def ?? getScoreById(item.id);
2119
- const scoreLabel = def ? def.name ?? def.id : item.id;
2181
+ const scoreLabel = item.name ?? def?.name ?? def?.id ?? item.id;
2120
2182
  let formatted;
2121
2183
  if (!def) {
2122
2184
  const numeric = toNumericScore(item.data);