@m4trix/evals 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -523,7 +523,11 @@ var Metric = {
523
523
  name: config.name,
524
524
  aggregate: config.aggregate,
525
525
  format: config.format,
526
- make: (data) => ({ id: config.id, data })
526
+ make: (data, options) => ({
527
+ id: config.id,
528
+ data,
529
+ ...options?.name !== void 0 && { name: options.name }
530
+ })
527
531
  };
528
532
  registry.set(config.id, def);
529
533
  return def;
@@ -535,20 +539,107 @@ function getMetricById(id) {
535
539
 
536
540
  // src/evals/score.ts
537
541
  var registry2 = /* @__PURE__ */ new Map();
542
+ function formatScoreData(def, data, options) {
543
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
544
+ }
545
+ var ScoreAggregate = {
546
+ /** Average numeric fields. Use for scores like { value, delta }. */
547
+ averageFields(fields) {
548
+ return (values) => {
549
+ const count = values.length || 1;
550
+ const result = {};
551
+ for (const field of fields) {
552
+ result[field] = values.reduce(
553
+ (s, v) => s + (v[field] ?? 0),
554
+ 0
555
+ ) / count;
556
+ }
557
+ return result;
558
+ };
559
+ },
560
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
561
+ averageWithVariance(fields) {
562
+ return (values) => {
563
+ const count = values.length;
564
+ const result = {};
565
+ for (const field of fields) {
566
+ result[field] = count === 0 ? 0 : values.reduce(
567
+ (sum, item) => sum + (item[field] ?? 0),
568
+ 0
569
+ ) / count;
570
+ }
571
+ const valueField = "value";
572
+ const hasValueField = fields.includes(valueField);
573
+ if (count === 0) {
574
+ if (hasValueField) {
575
+ result[valueField] = 0;
576
+ }
577
+ return {
578
+ ...result,
579
+ stdDev: void 0,
580
+ count: 0
581
+ };
582
+ }
583
+ let stdDev;
584
+ if (hasValueField && count >= 2) {
585
+ const sum = values.reduce(
586
+ (s, v) => s + (v[valueField] ?? 0),
587
+ 0
588
+ );
589
+ const sumSq = values.reduce(
590
+ (s, v) => {
591
+ const value = v[valueField] ?? 0;
592
+ return s + value * value;
593
+ },
594
+ 0
595
+ );
596
+ const mean = sum / count;
597
+ const variance = (sumSq - count * mean * mean) / (count - 1);
598
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
599
+ }
600
+ return {
601
+ ...values[0],
602
+ ...result,
603
+ stdDev,
604
+ count
605
+ };
606
+ };
607
+ },
608
+ /** All runs must pass. Use for binary scores. */
609
+ all(values) {
610
+ const total = values.length;
611
+ const passedCount = values.filter((v) => v.passed).length;
612
+ return {
613
+ ...values[0],
614
+ passed: total > 0 && values.every((v) => v.passed),
615
+ passedCount,
616
+ totalCount: total
617
+ };
618
+ },
619
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
620
+ last(values) {
621
+ return values[values.length - 1] ?? {};
622
+ }
623
+ };
538
624
  var Score = {
625
+ aggregate: ScoreAggregate,
539
626
  of(config) {
540
627
  const def = {
541
628
  id: config.id,
542
629
  name: config.name,
543
630
  displayStrategy: config.displayStrategy,
544
- aggregate: config.aggregate,
545
- format: config.format,
631
+ formatValue: config.formatValue,
632
+ formatAggregate: config.formatAggregate,
633
+ aggregateValues: config.aggregateValues,
546
634
  make: (data, options) => {
547
635
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
548
636
  return {
549
637
  id: config.id,
550
638
  data,
551
- ...passed !== void 0 && { passed }
639
+ ...passed !== void 0 && { passed },
640
+ ...options?.name !== void 0 && { name: options.name },
641
+ def
642
+ // Attach def so rendering/aggregation works without registry lookup
552
643
  };
553
644
  }
554
645
  };
@@ -561,29 +652,6 @@ function getScoreById(id) {
561
652
  }
562
653
 
563
654
  // src/evals/aggregators.ts
564
- function aggregateAverageWithVariance(values) {
565
- if (values.length === 0) {
566
- return { value: 0, count: 0 };
567
- }
568
- const sum = values.reduce((s, v) => s + v.value, 0);
569
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
570
- const mean = sum / values.length;
571
- let stdDev;
572
- if (values.length >= 2) {
573
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
574
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
575
- }
576
- return { value: mean, stdDev, count: values.length };
577
- }
578
- function aggregateAll(values) {
579
- const total = values.length;
580
- const passedCount = values.filter((v) => v.passed).length;
581
- return {
582
- passed: total > 0 && values.every((v) => v.passed),
583
- passedCount,
584
- totalCount: total
585
- };
586
- }
587
655
  function aggregateTokenCountSum(values) {
588
656
  const initial = {
589
657
  input: 0,
@@ -636,29 +704,31 @@ var percentScore = Score.of({
636
704
  id: "percent",
637
705
  name: "Score",
638
706
  displayStrategy: "bar",
639
- format: (data, options) => {
640
- if (options?.isAggregated) {
641
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
642
- }
643
- return data.value.toFixed(2);
644
- },
645
- aggregate: aggregateAverageWithVariance
707
+ formatValue: (data) => data.value.toFixed(2),
708
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
709
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
710
+ });
711
+ var deltaScore = Score.of({
712
+ id: "delta",
713
+ name: "Delta",
714
+ displayStrategy: "number",
715
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
716
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
717
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
646
718
  });
647
719
  var binaryScore = Score.of({
648
720
  id: "binary",
649
721
  name: "Result",
650
722
  displayStrategy: "passFail",
651
- format: (data, options) => {
652
- if (options?.isAggregated) {
653
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
654
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
655
- return `${base} (${data.passedCount}/${data.totalCount})`;
656
- }
657
- return base;
723
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
724
+ formatAggregate: (data) => {
725
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
726
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
727
+ return `${base} (${data.passedCount}/${data.totalCount})`;
658
728
  }
659
- return data.passed ? "PASSED" : "NOT PASSED";
729
+ return base;
660
730
  },
661
- aggregate: aggregateAll
731
+ aggregateValues: Score.aggregate.all
662
732
  });
663
733
  function createDiffString(expected, actual, diffOptions) {
664
734
  const opts = { ...diffOptions, color: false };
@@ -974,9 +1044,12 @@ async function collectTestCasesFromFiles(config) {
974
1044
  }
975
1045
 
976
1046
  // src/runner/score-utils.ts
1047
+ function getScoreDef(item) {
1048
+ return item.def ?? getScoreById(item.id);
1049
+ }
977
1050
  function toNumericScoreFromScores(scores) {
978
1051
  for (const item of scores) {
979
- const def = getScoreById(item.id);
1052
+ const def = getScoreDef(item);
980
1053
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
981
1054
  const value = item.data.value;
982
1055
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -1364,7 +1437,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
1364
1437
  () => appendJsonLine(message.artifactPath, {
1365
1438
  runId: message.runId,
1366
1439
  ts: Date.now(),
1367
- ...message.payload
1440
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
1368
1441
  })
1369
1442
  );
1370
1443
  })
@@ -1678,6 +1751,8 @@ exports.createLogEntry = createLogEntry;
1678
1751
  exports.createRunner = createRunner;
1679
1752
  exports.defaultRunnerConfig = defaultRunnerConfig;
1680
1753
  exports.defineConfig = defineConfig;
1754
+ exports.deltaScore = deltaScore;
1755
+ exports.formatScoreData = formatScoreData;
1681
1756
  exports.getLogLines = getLogLines;
1682
1757
  exports.getMetricById = getMetricById;
1683
1758
  exports.getScoreById = getScoreById;