@m4trix/evals 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1,7 +1,7 @@
1
1
  'use strict';
2
2
 
3
3
  var effect = require('effect');
4
- var jsonDiff = require('json-diff');
4
+ var diff = require('diff');
5
5
  var crypto = require('crypto');
6
6
  var fs = require('fs');
7
7
  var path = require('path');
@@ -561,15 +561,28 @@ function getScoreById(id) {
561
561
  }
562
562
 
563
563
  // src/evals/aggregators.ts
564
- function aggregateAverage(values) {
564
/**
 * Aggregates score entries into their mean, sample standard deviation and count.
 *
 * @param {Array<{ value: number }>} values - Entries whose `value` is averaged.
 * @returns {{ value: number, stdDev?: number, count: number }}
 *   For an empty input: `{ value: 0, count: 0 }`.
 *   Otherwise the mean in `value`, the number of entries in `count`, and the
 *   sample standard deviation (n - 1 denominator) in `stdDev`; `stdDev` is
 *   `undefined` when fewer than two entries are given.
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  const sum = values.reduce((acc, entry) => acc + entry.value, 0);
  const mean = sum / count;
  let stdDev;
  if (count >= 2) {
    // Two-pass formula: summing squared deviations from the mean avoids the
    // catastrophic cancellation of `sumSq - n * mean^2` when the values are
    // large compared to their spread.
    const squaredDeviations = values.reduce((acc, entry) => {
      const deviation = entry.value - mean;
      return acc + deviation * deviation;
    }, 0);
    const variance = squaredDeviations / (count - 1);
    // A non-positive or NaN variance is reported as 0, never as NaN.
    stdDev = variance > 0 ? Math.sqrt(variance) : 0;
  }
  return { value: mean, stdDev, count };
}
571
578
  function aggregateAll(values) {
572
- return { passed: values.length > 0 && values.every((v) => v.passed) };
579
+ const total = values.length;
580
+ const passedCount = values.filter((v) => v.passed).length;
581
+ return {
582
+ passed: total > 0 && values.every((v) => v.passed),
583
+ passedCount,
584
+ totalCount: total
585
+ };
573
586
  }
574
587
  function aggregateTokenCountSum(values) {
575
588
  const initial = {
@@ -623,18 +636,59 @@ var percentScore = Score.of({
623
636
  id: "percent",
624
637
  name: "Score",
625
638
  displayStrategy: "bar",
626
- format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
627
- aggregate: aggregateAverage
639
+ format: (data, options) => {
640
+ if (options?.isAggregated) {
641
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
642
+ }
643
+ return data.value.toFixed(2);
644
+ },
645
+ aggregate: aggregateAverageWithVariance
628
646
  });
629
647
  var binaryScore = Score.of({
630
648
  id: "binary",
631
649
  name: "Result",
632
650
  displayStrategy: "passFail",
633
- format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
651
+ format: (data, options) => {
652
+ if (options?.isAggregated) {
653
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
654
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
655
+ return `${base} (${data.passedCount}/${data.totalCount})`;
656
+ }
657
+ return base;
658
+ }
659
+ return data.passed ? "PASSED" : "NOT PASSED";
660
+ },
634
661
  aggregate: aggregateAll
635
662
  });
663
/**
 * Renders a value as pretty-printed JSON text (2-space indent) for diffing.
 *
 * Always returns a string. `JSON.stringify` yields `undefined` for
 * `undefined`, functions and symbols, and throws on circular structures and
 * BigInt; in all of those cases the result falls back to `String(value)`.
 *
 * @param {unknown} value - Value to serialise.
 * @returns {string} JSON text, or `String(value)` when JSON cannot express it.
 */
function toJsonLines(value) {
  try {
    // `?? String(value)` covers the "not serialisable, but no throw" case.
    return JSON.stringify(value, null, 2) ?? String(value);
  } catch {
    return String(value);
  }
}
670
/**
 * Flattens a list of change parts into marker-prefixed text lines.
 *
 * Each line of a part's `value` is emitted as `<marker> <line>`, where the
 * marker is "+" for `added` parts, "-" for `removed` parts and a space
 * otherwise. A single trailing empty line left by a terminating "\n" is
 * dropped from each part.
 *
 * @param {Iterable<{ value: string, added?: boolean, removed?: boolean }>} changes
 * @returns {string} The prefixed lines joined with "\n".
 */
function formatDiffString(changes) {
  const markerFor = (change) => {
    if (change.added) {
      return "+";
    }
    return change.removed ? "-" : " ";
  };
  return [...changes]
    .flatMap((change) => {
      const marker = markerFor(change);
      const segments = change.value.split("\n");
      const endsWithBlank = segments[segments.length - 1] === "";
      const kept = endsWithBlank ? segments.slice(0, -1) : segments;
      return kept.map((text) => `${marker} ${text}`);
    })
    .join("\n");
}
684
/**
 * Builds a line-based textual diff between two values.
 *
 * Both values are serialised with `toJsonLines`, compared with
 * `diff.diffLines`, and the resulting changes are rendered by
 * `formatDiffString`.
 *
 * @param {unknown} expected - Baseline value.
 * @param {unknown} actual - Value compared against the baseline.
 * @returns {string} The formatted diff text.
 */
function createDiffString(expected, actual) {
  return formatDiffString(
    diff.diffLines(toJsonLines(expected), toJsonLines(actual))
  );
}
636
690
  function createDiffLogEntry(expected, actual, options) {
637
- const diff = jsonDiff.diffString(expected, actual, { color: false });
691
+ const diff = createDiffString(expected, actual);
638
692
  return {
639
693
  type: "diff",
640
694
  label: options?.label,
@@ -644,8 +698,22 @@ function createDiffLogEntry(expected, actual, options) {
644
698
  };
645
699
  }
646
700
  function printJsonDiff(expected, actual, options = {}) {
647
- const { color = true } = options;
648
- const diff = jsonDiff.diffString(expected, actual, { color });
701
+ const diff = createDiffString(expected, actual);
702
+ if (options.color) {
703
+ const lines = diff.split("\n").map((line) => {
704
+ const trimmed = line.trimStart();
705
+ if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
706
+ return `\x1B[31m${line}\x1B[0m`;
707
+ }
708
+ if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
709
+ return `\x1B[32m${line}\x1B[0m`;
710
+ }
711
+ return line;
712
+ });
713
+ const colored = lines.join("\n");
714
+ console.log(colored || "(no differences)");
715
+ return colored;
716
+ }
649
717
  console.log(diff || "(no differences)");
650
718
  return diff;
651
719
  }