@m4trix/evals 0.17.0 → 0.18.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -535,20 +535,70 @@ function getMetricById(id) {
535
535
 
536
536
  // src/evals/score.ts
537
537
  var registry2 = /* @__PURE__ */ new Map();
538
+ function formatScoreData(def, data, options) {
539
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
540
+ }
541
+ var ScoreAggregate = {
542
+ /** Average numeric fields. Use for scores like { value, delta }. */
543
+ averageFields(fields) {
544
+ return (values) => {
545
+ const count = values.length || 1;
546
+ const result = {};
547
+ for (const field of fields) {
548
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
549
+ }
550
+ return result;
551
+ };
552
+ },
553
+ /** Average `value` with sample std dev. Use for percent-style scores. */
554
+ averageWithVariance(values) {
555
+ if (values.length === 0) {
556
+ return { value: 0, stdDev: void 0, count: 0 };
557
+ }
558
+ const sum = values.reduce((s, v) => s + v.value, 0);
559
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
560
+ const mean = sum / values.length;
561
+ let stdDev;
562
+ if (values.length >= 2) {
563
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
564
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
565
+ }
566
+ return { ...values[0], value: mean, stdDev, count: values.length };
567
+ },
568
+ /** All runs must pass. Use for binary scores. */
569
+ all(values) {
570
+ const total = values.length;
571
+ const passedCount = values.filter((v) => v.passed).length;
572
+ return {
573
+ ...values[0],
574
+ passed: total > 0 && values.every((v) => v.passed),
575
+ passedCount,
576
+ totalCount: total
577
+ };
578
+ },
579
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
580
+ last(values) {
581
+ return values[values.length - 1] ?? {};
582
+ }
583
+ };
538
584
  var Score = {
585
+ aggregate: ScoreAggregate,
539
586
  of(config) {
540
587
  const def = {
541
588
  id: config.id,
542
589
  name: config.name,
543
590
  displayStrategy: config.displayStrategy,
544
- aggregate: config.aggregate,
545
- format: config.format,
591
+ formatValue: config.formatValue,
592
+ formatAggregate: config.formatAggregate,
593
+ aggregateValues: config.aggregateValues,
546
594
  make: (data, options) => {
547
595
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
548
596
  return {
549
597
  id: config.id,
550
598
  data,
551
- ...passed !== void 0 && { passed }
599
+ ...passed !== void 0 && { passed },
600
+ def
601
+ // Attach def so rendering/aggregation works without registry lookup
552
602
  };
553
603
  }
554
604
  };
@@ -561,29 +611,6 @@ function getScoreById(id) {
561
611
  }
562
612
 
563
613
  // src/evals/aggregators.ts
564
- function aggregateAverageWithVariance(values) {
565
- if (values.length === 0) {
566
- return { value: 0, count: 0 };
567
- }
568
- const sum = values.reduce((s, v) => s + v.value, 0);
569
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
570
- const mean = sum / values.length;
571
- let stdDev;
572
- if (values.length >= 2) {
573
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
574
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
575
- }
576
- return { value: mean, stdDev, count: values.length };
577
- }
578
- function aggregateAll(values) {
579
- const total = values.length;
580
- const passedCount = values.filter((v) => v.passed).length;
581
- return {
582
- passed: total > 0 && values.every((v) => v.passed),
583
- passedCount,
584
- totalCount: total
585
- };
586
- }
587
614
  function aggregateTokenCountSum(values) {
588
615
  const initial = {
589
616
  input: 0,
@@ -636,29 +663,31 @@ var percentScore = Score.of({
636
663
  id: "percent",
637
664
  name: "Score",
638
665
  displayStrategy: "bar",
639
- format: (data, options) => {
640
- if (options?.isAggregated) {
641
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
642
- }
643
- return data.value.toFixed(2);
644
- },
645
- aggregate: aggregateAverageWithVariance
666
+ formatValue: (data) => data.value.toFixed(2),
667
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
668
+ aggregateValues: Score.aggregate.averageWithVariance
669
+ });
670
+ var deltaScore = Score.of({
671
+ id: "delta",
672
+ name: "Delta",
673
+ displayStrategy: "number",
674
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
675
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
676
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
646
677
  });
647
678
  var binaryScore = Score.of({
648
679
  id: "binary",
649
680
  name: "Result",
650
681
  displayStrategy: "passFail",
651
- format: (data, options) => {
652
- if (options?.isAggregated) {
653
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
654
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
655
- return `${base} (${data.passedCount}/${data.totalCount})`;
656
- }
657
- return base;
682
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
683
+ formatAggregate: (data) => {
684
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
685
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
686
+ return `${base} (${data.passedCount}/${data.totalCount})`;
658
687
  }
659
- return data.passed ? "PASSED" : "NOT PASSED";
688
+ return base;
660
689
  },
661
- aggregate: aggregateAll
690
+ aggregateValues: Score.aggregate.all
662
691
  });
663
692
  function createDiffString(expected, actual, diffOptions) {
664
693
  const opts = { ...diffOptions, color: false };
@@ -974,9 +1003,12 @@ async function collectTestCasesFromFiles(config) {
974
1003
  }
975
1004
 
976
1005
  // src/runner/score-utils.ts
1006
+ function getScoreDef(item) {
1007
+ return item.def ?? getScoreById(item.id);
1008
+ }
977
1009
  function toNumericScoreFromScores(scores) {
978
1010
  for (const item of scores) {
979
- const def = getScoreById(item.id);
1011
+ const def = getScoreDef(item);
980
1012
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
981
1013
  const value = item.data.value;
982
1014
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -1364,7 +1396,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
1364
1396
  () => appendJsonLine(message.artifactPath, {
1365
1397
  runId: message.runId,
1366
1398
  ts: Date.now(),
1367
- ...message.payload
1399
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
1368
1400
  })
1369
1401
  );
1370
1402
  })
@@ -1678,6 +1710,8 @@ exports.createLogEntry = createLogEntry;
1678
1710
  exports.createRunner = createRunner;
1679
1711
  exports.defaultRunnerConfig = defaultRunnerConfig;
1680
1712
  exports.defineConfig = defineConfig;
1713
+ exports.deltaScore = deltaScore;
1714
+ exports.formatScoreData = formatScoreData;
1681
1715
  exports.getLogLines = getLogLines;
1682
1716
  exports.getMetricById = getMetricById;
1683
1717
  exports.getScoreById = getScoreById;