@m4trix/evals 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -523,7 +523,11 @@ var Metric = {
523
523
  name: config.name,
524
524
  aggregate: config.aggregate,
525
525
  format: config.format,
526
- make: (data) => ({ id: config.id, data })
526
+ make: (data, options) => ({
527
+ id: config.id,
528
+ data,
529
+ ...options?.name !== void 0 && { name: options.name }
530
+ })
527
531
  };
528
532
  registry.set(config.id, def);
529
533
  return def;
@@ -545,25 +549,61 @@ var ScoreAggregate = {
545
549
  const count = values.length || 1;
546
550
  const result = {};
547
551
  for (const field of fields) {
548
- result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
552
+ result[field] = values.reduce(
553
+ (s, v) => s + (v[field] ?? 0),
554
+ 0
555
+ ) / count;
549
556
  }
550
557
  return result;
551
558
  };
552
559
  },
553
- /** Average `value` with sample std dev. Use for percent-style scores. */
554
- averageWithVariance(values) {
555
- if (values.length === 0) {
556
- return { value: 0, stdDev: void 0, count: 0 };
557
- }
558
- const sum = values.reduce((s, v) => s + v.value, 0);
559
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
560
- const mean = sum / values.length;
561
- let stdDev;
562
- if (values.length >= 2) {
563
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
564
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
565
- }
566
- return { ...values[0], value: mean, stdDev, count: values.length };
560
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
561
+ averageWithVariance(fields) {
562
+ return (values) => {
563
+ const count = values.length;
564
+ const result = {};
565
+ for (const field of fields) {
566
+ result[field] = count === 0 ? 0 : values.reduce(
567
+ (sum, item) => sum + (item[field] ?? 0),
568
+ 0
569
+ ) / count;
570
+ }
571
+ const valueField = "value";
572
+ const hasValueField = fields.includes(valueField);
573
+ if (count === 0) {
574
+ if (hasValueField) {
575
+ result[valueField] = 0;
576
+ }
577
+ return {
578
+ ...result,
579
+ stdDev: void 0,
580
+ count: 0
581
+ };
582
+ }
583
+ let stdDev;
584
+ if (hasValueField && count >= 2) {
585
+ const sum = values.reduce(
586
+ (s, v) => s + (v[valueField] ?? 0),
587
+ 0
588
+ );
589
+ const sumSq = values.reduce(
590
+ (s, v) => {
591
+ const value = v[valueField] ?? 0;
592
+ return s + value * value;
593
+ },
594
+ 0
595
+ );
596
+ const mean = sum / count;
597
+ const variance = (sumSq - count * mean * mean) / (count - 1);
598
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
599
+ }
600
+ return {
601
+ ...values[0],
602
+ ...result,
603
+ stdDev,
604
+ count
605
+ };
606
+ };
567
607
  },
568
608
  /** All runs must pass. Use for binary scores. */
569
609
  all(values) {
@@ -597,6 +637,7 @@ var Score = {
597
637
  id: config.id,
598
638
  data,
599
639
  ...passed !== void 0 && { passed },
640
+ ...options?.name !== void 0 && { name: options.name },
600
641
  def
601
642
  // Attach def so rendering/aggregation works without registry lookup
602
643
  };
@@ -665,7 +706,7 @@ var percentScore = Score.of({
665
706
  displayStrategy: "bar",
666
707
  formatValue: (data) => data.value.toFixed(2),
667
708
  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
668
- aggregateValues: Score.aggregate.averageWithVariance
709
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
669
710
  });
670
711
  var deltaScore = Score.of({
671
712
  id: "delta",