@m4trix/evals 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +129 -29
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +129 -29
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +591 -380
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +582 -371
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +79 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +5 -0
- package/dist/index.js +79 -11
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/index.cjs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
var effect = require('effect');
|
|
4
|
-
var
|
|
4
|
+
var diff = require('diff');
|
|
5
5
|
var crypto = require('crypto');
|
|
6
6
|
var fs = require('fs');
|
|
7
7
|
var path = require('path');
|
|
@@ -561,15 +561,28 @@ function getScoreById(id) {
|
|
|
561
561
|
}
|
|
562
562
|
|
|
563
563
|
// src/evals/aggregators.ts
|
|
564
|
-
function
|
|
564
|
+
/**
 * Aggregates score samples into their mean and sample standard deviation.
 *
 * @param {Array<{ value: number }>} values - Samples to aggregate; only the
 *   `value` property of each entry is read.
 * @returns {{ value: number, stdDev?: number, count: number }} `value` is the
 *   arithmetic mean (0 for empty input) and `count` is the number of samples.
 *   `stdDev` is the Bessel-corrected (n - 1) standard deviation when there are
 *   at least two samples; it is `undefined` for a single sample and the key is
 *   absent for empty input.
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  const sum = values.reduce((acc, v) => acc + v.value, 0);
  const mean = sum / count;
  let stdDev;
  if (count >= 2) {
    // Two-pass form: sum the squared deviations from the mean. The single-pass
    // shortcut `sumSq - n * mean^2` subtracts two nearly equal large numbers
    // and loses most of its precision when values are large relative to
    // their spread.
    const squaredDeviations = values.reduce((acc, v) => {
      const delta = v.value - mean;
      return acc + delta * delta;
    }, 0);
    const variance = squaredDeviations / (count - 1);
    // `variance > 0` is false for NaN as well, so a NaN variance reports 0.
    stdDev = variance > 0 ? Math.sqrt(variance) : 0;
  }
  return { value: mean, stdDev, count };
}
|
|
571
578
|
/**
 * Aggregates pass/fail samples into an overall verdict plus counts.
 *
 * @param {Array<{ passed: unknown }>} values - Samples to aggregate; only the
 *   truthiness of each entry's `passed` property is used.
 * @returns {{ passed: boolean, passedCount: number, totalCount: number }}
 *   `passed` is true only when there is at least one sample and every sample
 *   passed; `passedCount` is the number of passing samples and `totalCount`
 *   the number of samples.
 */
function aggregateAll(values) {
  const totalCount = values.length;
  // Count once and derive the verdict from the counts, so `passed` and
  // `passedCount` come from the same traversal.
  const passedCount = values.reduce((n, v) => (v.passed ? n + 1 : n), 0);
  return {
    passed: totalCount > 0 && passedCount === totalCount,
    passedCount,
    totalCount
  };
}
|
|
574
587
|
function aggregateTokenCountSum(values) {
|
|
575
588
|
const initial = {
|
|
@@ -623,18 +636,59 @@ var percentScore = Score.of({
|
|
|
623
636
|
id: "percent",
|
|
624
637
|
name: "Score",
|
|
625
638
|
displayStrategy: "bar",
|
|
626
|
-
format: (data, options) =>
|
|
627
|
-
|
|
639
|
+
format: (data, options) => {
|
|
640
|
+
if (options?.isAggregated) {
|
|
641
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
642
|
+
}
|
|
643
|
+
return data.value.toFixed(2);
|
|
644
|
+
},
|
|
645
|
+
aggregate: aggregateAverageWithVariance
|
|
628
646
|
});
|
|
629
647
|
// Pass/fail score definition, registered through `Score.of` with
// `aggregateAll` as its aggregator.
var binaryScore = Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  /**
   * Renders a single result, or an aggregated result when
   * `options.isAggregated` is set.
   */
  format: (data, options) => {
    if (!options?.isAggregated) {
      return data.passed ? "PASSED" : "NOT PASSED";
    }
    const summary = data.passed ? "All: PASSED" : "Some: FAILED";
    const { passedCount, totalCount } = data;
    // The "(passed/total)" suffix is only appended when both counts are
    // present and more than one sample was aggregated.
    const showCounts = passedCount != null && totalCount != null && totalCount > 1;
    return showCounts ? `${summary} (${passedCount}/${totalCount})` : summary;
  },
  aggregate: aggregateAll
});
|
|
663
|
+
/**
 * Serializes a value to pretty-printed (2-space) JSON text for line diffing.
 *
 * Always returns a string: when `JSON.stringify` throws (e.g. circular
 * structures, BigInt) or yields `undefined` (e.g. `undefined`, functions,
 * symbols), the result falls back to `String(value)`.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} JSON text, or the `String(value)` fallback.
 */
function toJsonLines(value) {
  let json;
  try {
    json = JSON.stringify(value, null, 2);
  } catch {
    return String(value);
  }
  // JSON.stringify signals "not representable" by returning undefined rather
  // than throwing; callers expect text, so fall back here as well.
  return json ?? String(value);
}
|
|
670
|
+
/**
 * Renders change objects as text with one marker-prefixed line per diff line.
 *
 * Each line is emitted as `<marker> <text>`, where the marker is "+" for
 * parts flagged `added`, "-" for parts flagged `removed`, and a space
 * otherwise. A single trailing empty segment (from a terminating newline) is
 * dropped from every part.
 *
 * @param {Iterable<{ value: string, added?: boolean, removed?: boolean }>} changes
 * @returns {string} The rendered lines joined with "\n".
 */
function formatDiffString(changes) {
  const chunks = [];
  for (const { added, removed, value } of changes) {
    let marker = " ";
    if (added) {
      marker = "+";
    } else if (removed) {
      marker = "-";
    }
    const pieces = value.split("\n");
    const visible = pieces[pieces.length - 1] === "" ? pieces.slice(0, -1) : pieces;
    // A part with no visible lines contributes nothing, not an empty line.
    if (visible.length > 0) {
      chunks.push(visible.map((text) => `${marker} ${text}`).join("\n"));
    }
  }
  return chunks.join("\n");
}
|
|
684
|
+
/**
 * Builds a marker-prefixed line diff between two values.
 *
 * Both values are serialized with `toJsonLines`, compared with
 * `diff.diffLines`, and the resulting changes rendered by `formatDiffString`.
 *
 * @param {unknown} expected - Left-hand side of the comparison.
 * @param {unknown} actual - Right-hand side of the comparison.
 * @returns {string} The formatted diff text.
 */
function createDiffString(expected, actual) {
  return formatDiffString(
    diff.diffLines(toJsonLines(expected), toJsonLines(actual))
  );
}
|
|
636
690
|
function createDiffLogEntry(expected, actual, options) {
|
|
637
|
-
const diff =
|
|
691
|
+
const diff = createDiffString(expected, actual);
|
|
638
692
|
return {
|
|
639
693
|
type: "diff",
|
|
640
694
|
label: options?.label,
|
|
@@ -644,8 +698,22 @@ function createDiffLogEntry(expected, actual, options) {
|
|
|
644
698
|
};
|
|
645
699
|
}
|
|
646
700
|
/**
 * Logs a line diff between two values to the console and returns it.
 *
 * @param {unknown} expected - Left-hand side of the comparison.
 * @param {unknown} actual - Right-hand side of the comparison.
 * @param {{ color?: boolean }} [options] - When `color` is truthy, removed
 *   lines are wrapped in ANSI red and added lines in ANSI green.
 * @returns {string} The diff text that was produced (including ANSI codes
 *   when coloured). "(no differences)" is only logged, never returned.
 */
function printJsonDiff(expected, actual, options = {}) {
  const diff = createDiffString(expected, actual);
  // Diff lines carry their marker in column 0 ("+ ", "- " or two spaces), so
  // match on the raw prefix. Trimming first would misread unchanged lines
  // whose content starts with "-" or "+" (e.g. a negative array element).
  const output = options.color
    ? diff
        .split("\n")
        .map((line) => {
          if (line.startsWith("- ")) {
            return `\x1B[31m${line}\x1B[0m`;
          }
          if (line.startsWith("+ ")) {
            return `\x1B[32m${line}\x1B[0m`;
          }
          return line;
        })
        .join("\n")
    : diff;
  console.log(output || "(no differences)");
  return output;
}
|