@m4trix/evals 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -347,7 +347,11 @@ var Metric = {
347
347
  name: config.name,
348
348
  aggregate: config.aggregate,
349
349
  format: config.format,
350
- make: (data) => ({ id: config.id, data })
350
+ make: (data, options) => ({
351
+ id: config.id,
352
+ data,
353
+ ...options?.name !== void 0 && { name: options.name }
354
+ })
351
355
  };
352
356
  registry.set(config.id, def);
353
357
  return def;
@@ -369,25 +373,61 @@ var ScoreAggregate = {
369
373
  const count = values.length || 1;
370
374
  const result = {};
371
375
  for (const field of fields) {
372
- result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
376
+ result[field] = values.reduce(
377
+ (s, v) => s + (v[field] ?? 0),
378
+ 0
379
+ ) / count;
373
380
  }
374
381
  return result;
375
382
  };
376
383
  },
377
- /** Average `value` with sample std dev. Use for percent-style scores. */
378
- averageWithVariance(values) {
379
- if (values.length === 0) {
380
- return { value: 0, stdDev: void 0, count: 0 };
381
- }
382
- const sum = values.reduce((s, v) => s + v.value, 0);
383
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
384
- const mean = sum / values.length;
385
- let stdDev;
386
- if (values.length >= 2) {
387
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
388
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
389
- }
390
- return { ...values[0], value: mean, stdDev, count: values.length };
384
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
385
+ averageWithVariance(fields) {
386
+ return (values) => {
387
+ const count = values.length;
388
+ const result = {};
389
+ for (const field of fields) {
390
+ result[field] = count === 0 ? 0 : values.reduce(
391
+ (sum, item) => sum + (item[field] ?? 0),
392
+ 0
393
+ ) / count;
394
+ }
395
+ const valueField = "value";
396
+ const hasValueField = fields.includes(valueField);
397
+ if (count === 0) {
398
+ if (hasValueField) {
399
+ result[valueField] = 0;
400
+ }
401
+ return {
402
+ ...result,
403
+ stdDev: void 0,
404
+ count: 0
405
+ };
406
+ }
407
+ let stdDev;
408
+ if (hasValueField && count >= 2) {
409
+ const sum = values.reduce(
410
+ (s, v) => s + (v[valueField] ?? 0),
411
+ 0
412
+ );
413
+ const sumSq = values.reduce(
414
+ (s, v) => {
415
+ const value = v[valueField] ?? 0;
416
+ return s + value * value;
417
+ },
418
+ 0
419
+ );
420
+ const mean = sum / count;
421
+ const variance = (sumSq - count * mean * mean) / (count - 1);
422
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
423
+ }
424
+ return {
425
+ ...values[0],
426
+ ...result,
427
+ stdDev,
428
+ count
429
+ };
430
+ };
391
431
  },
392
432
  /** All runs must pass. Use for binary scores. */
393
433
  all(values) {
@@ -421,6 +461,7 @@ var Score = {
421
461
  id: config.id,
422
462
  data,
423
463
  ...passed !== void 0 && { passed },
464
+ ...options?.name !== void 0 && { name: options.name },
424
465
  def
425
466
  // Attach def so rendering/aggregation works without registry lookup
426
467
  };
@@ -489,7 +530,7 @@ Score.of({
489
530
  displayStrategy: "bar",
490
531
  formatValue: (data) => data.value.toFixed(2),
491
532
  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
492
- aggregateValues: Score.aggregate.averageWithVariance
533
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
493
534
  });
494
535
  Score.of({
495
536
  id: "delta",
@@ -518,6 +559,14 @@ Score.of({
518
559
  function getScoreDef(item) {
519
560
  return item.def ?? getScoreById(item.id);
520
561
  }
562
+ function lastNonEmptyName(items) {
563
+ for (let i = items.length - 1; i >= 0; i--) {
564
+ const n = items[i].name;
565
+ if (n != null && n.trim().length > 0)
566
+ return n;
567
+ }
568
+ return void 0;
569
+ }
521
570
  function aggregateScoreItems(items) {
522
571
  if (items.length === 0)
523
572
  return void 0;
@@ -525,7 +574,13 @@ function aggregateScoreItems(items) {
525
574
  if (!def?.aggregateValues)
526
575
  return items[items.length - 1];
527
576
  const aggregated = def.aggregateValues(items.map((i) => i.data));
528
- return { ...items[0], data: aggregated, def };
577
+ const nameOverride = lastNonEmptyName(items);
578
+ return {
579
+ ...items[0],
580
+ data: aggregated,
581
+ def,
582
+ ...nameOverride !== void 0 && { name: nameOverride }
583
+ };
529
584
  }
530
585
  function aggregateMetricItems(items) {
531
586
  if (items.length === 0)
@@ -534,7 +589,12 @@ function aggregateMetricItems(items) {
534
589
  if (!def?.aggregate)
535
590
  return items[items.length - 1];
536
591
  const aggregated = def.aggregate(items.map((i) => i.data));
537
- return { ...items[0], data: aggregated };
592
+ const nameOverride = lastNonEmptyName(items);
593
+ return {
594
+ ...items[0],
595
+ data: aggregated,
596
+ ...nameOverride !== void 0 && { name: nameOverride }
597
+ };
538
598
  }
539
599
  function toNumericScoreFromScores(scores) {
540
600
  for (const item of scores) {
@@ -1786,9 +1846,10 @@ function RunView({
1786
1846
  const formatted = def.format(m.data, {
1787
1847
  isAggregated: tc.isAggregated
1788
1848
  });
1849
+ const label = m.name ?? def.name;
1789
1850
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1790
1851
  "[",
1791
- def.name ? `${def.name}: ` : "",
1852
+ label ? `${label}: ` : "",
1792
1853
  formatted,
1793
1854
  "]",
1794
1855
  " "
@@ -1797,8 +1858,8 @@ function RunView({
1797
1858
  ] }) : null
1798
1859
  ] }),
1799
1860
  item.scores.length > 0 ? item.scores.map((s, idx) => {
1800
- const def = getScoreById(s.id);
1801
- const scoreLabel = def ? def.name ?? def.id : s.id;
1861
+ const def = s.def ?? getScoreById(s.id);
1862
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
1802
1863
  return /* @__PURE__ */ jsxRuntime.jsxs(
1803
1864
  ink.Text,
1804
1865
  {
@@ -1902,7 +1963,7 @@ function RunView({
1902
1963
  if (!aggregated)
1903
1964
  return null;
1904
1965
  const def = aggregated.def ?? getScoreById(aggregated.id);
1905
- const label = def ? def.name ?? def.id : aggregated.id;
1966
+ const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
1906
1967
  const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
1907
1968
  const numeric = toNumericScore(aggregated.data);
1908
1969
  return /* @__PURE__ */ jsxRuntime.jsxs(
@@ -2063,7 +2124,7 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
2063
2124
  if (!agg)
2064
2125
  continue;
2065
2126
  const def = agg.def ?? getScoreById(agg.id);
2066
- const label = def ? def.name ?? def.id : agg.id;
2127
+ const label = agg.name ?? def?.name ?? def?.id ?? agg.id;
2067
2128
  const formatted = def ? def.formatAggregate(agg.data) : "n/a";
2068
2129
  const numeric = toNumericScore(agg.data);
2069
2130
  const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
@@ -2129,12 +2190,13 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2129
2190
  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
2130
2191
  const metricParts = [];
2131
2192
  if (metrics && metrics.length > 0) {
2132
- for (const { id, data } of metrics) {
2133
- const def = getMetricById(id);
2193
+ for (const m of metrics) {
2194
+ const def = getMetricById(m.id);
2134
2195
  if (def) {
2135
- const formatted = def.format(data, options);
2196
+ const formatted = def.format(m.data, options);
2197
+ const label = m.name ?? def.name;
2136
2198
  metricParts.push(
2137
- def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
2199
+ label ? `[${label}: ${formatted}]` : `[${formatted}]`
2138
2200
  );
2139
2201
  }
2140
2202
  }
@@ -2142,7 +2204,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2142
2204
  const scoreLines = [];
2143
2205
  for (const item of scores) {
2144
2206
  const def = item.def ?? getScoreById(item.id);
2145
- const scoreLabel = def ? def.name ?? def.id : item.id;
2207
+ const scoreLabel = item.name ?? def?.name ?? def?.id ?? item.id;
2146
2208
  let formatted;
2147
2209
  if (!def) {
2148
2210
  const numeric = toNumericScore(item.data);