@m4trix/evals 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +129 -29
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +129 -29
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +591 -380
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +582 -371
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +79 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +5 -0
- package/dist/index.js +79 -11
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/index.d.ts
CHANGED
|
@@ -74,6 +74,7 @@ interface CliState {
|
|
|
74
74
|
datasetMenuIndex: number;
|
|
75
75
|
runMenuIndex: number;
|
|
76
76
|
detailsScrollOffset: number;
|
|
77
|
+
overviewScrollOffset: number;
|
|
77
78
|
selectedEvaluatorIds: string[];
|
|
78
79
|
evaluatorMenuIndex: number;
|
|
79
80
|
searchQuery: string;
|
|
@@ -412,10 +413,14 @@ declare const latencyMetric: MetricDef<LatencyData>;
|
|
|
412
413
|
|
|
413
414
|
interface PercentScoreData {
|
|
414
415
|
value: number;
|
|
416
|
+
stdDev?: number;
|
|
417
|
+
count?: number;
|
|
415
418
|
}
|
|
416
419
|
declare const percentScore: ScoreDef<PercentScoreData>;
|
|
417
420
|
interface BinaryScoreData {
|
|
418
421
|
passed: boolean;
|
|
422
|
+
passedCount?: number;
|
|
423
|
+
totalCount?: number;
|
|
419
424
|
}
|
|
420
425
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
421
426
|
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
|
|
2
2
|
export { Schema as S } from 'effect';
|
|
3
|
-
import {
|
|
3
|
+
import { diffLines } from 'diff';
|
|
4
4
|
import { randomUUID } from 'crypto';
|
|
5
5
|
import { existsSync } from 'fs';
|
|
6
6
|
import { resolve as resolve$1, relative, join, dirname } from 'path';
|
|
@@ -539,15 +539,28 @@ function getScoreById(id) {
|
|
|
539
539
|
}
|
|
540
540
|
|
|
541
541
|
// src/evals/aggregators.ts
|
|
542
|
-
function
|
|
542
|
+
function aggregateAverageWithVariance(values) {
|
|
543
543
|
if (values.length === 0) {
|
|
544
|
-
return { value: 0 };
|
|
544
|
+
return { value: 0, count: 0 };
|
|
545
545
|
}
|
|
546
546
|
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
547
|
-
|
|
547
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
548
|
+
const mean = sum / values.length;
|
|
549
|
+
let stdDev;
|
|
550
|
+
if (values.length >= 2) {
|
|
551
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
552
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
553
|
+
}
|
|
554
|
+
return { value: mean, stdDev, count: values.length };
|
|
548
555
|
}
|
|
549
556
|
function aggregateAll(values) {
|
|
550
|
-
|
|
557
|
+
const total = values.length;
|
|
558
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
559
|
+
return {
|
|
560
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
561
|
+
passedCount,
|
|
562
|
+
totalCount: total
|
|
563
|
+
};
|
|
551
564
|
}
|
|
552
565
|
function aggregateTokenCountSum(values) {
|
|
553
566
|
const initial = {
|
|
@@ -601,18 +614,59 @@ var percentScore = Score.of({
|
|
|
601
614
|
id: "percent",
|
|
602
615
|
name: "Score",
|
|
603
616
|
displayStrategy: "bar",
|
|
604
|
-
format: (data, options) =>
|
|
605
|
-
|
|
617
|
+
format: (data, options) => {
|
|
618
|
+
if (options?.isAggregated) {
|
|
619
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
620
|
+
}
|
|
621
|
+
return data.value.toFixed(2);
|
|
622
|
+
},
|
|
623
|
+
aggregate: aggregateAverageWithVariance
|
|
606
624
|
});
|
|
607
625
|
var binaryScore = Score.of({
|
|
608
626
|
id: "binary",
|
|
609
627
|
name: "Result",
|
|
610
628
|
displayStrategy: "passFail",
|
|
611
|
-
format: (data, options) =>
|
|
629
|
+
format: (data, options) => {
|
|
630
|
+
if (options?.isAggregated) {
|
|
631
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
632
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
633
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
634
|
+
}
|
|
635
|
+
return base;
|
|
636
|
+
}
|
|
637
|
+
return data.passed ? "PASSED" : "NOT PASSED";
|
|
638
|
+
},
|
|
612
639
|
aggregate: aggregateAll
|
|
613
640
|
});
|
|
641
|
+
function toJsonLines(value) {
|
|
642
|
+
try {
|
|
643
|
+
return JSON.stringify(value, null, 2);
|
|
644
|
+
} catch {
|
|
645
|
+
return String(value);
|
|
646
|
+
}
|
|
647
|
+
}
|
|
648
|
+
function formatDiffString(changes) {
|
|
649
|
+
const lines = [];
|
|
650
|
+
for (const part of changes) {
|
|
651
|
+
const prefix = part.added ? "+" : part.removed ? "-" : " ";
|
|
652
|
+
const partLines = part.value.split("\n");
|
|
653
|
+
if (partLines[partLines.length - 1] === "") {
|
|
654
|
+
partLines.pop();
|
|
655
|
+
}
|
|
656
|
+
for (const line of partLines) {
|
|
657
|
+
lines.push(`${prefix} ${line}`);
|
|
658
|
+
}
|
|
659
|
+
}
|
|
660
|
+
return lines.join("\n");
|
|
661
|
+
}
|
|
662
|
+
function createDiffString(expected, actual) {
|
|
663
|
+
const expectedStr = toJsonLines(expected);
|
|
664
|
+
const actualStr = toJsonLines(actual);
|
|
665
|
+
const changes = diffLines(expectedStr, actualStr);
|
|
666
|
+
return formatDiffString(changes);
|
|
667
|
+
}
|
|
614
668
|
function createDiffLogEntry(expected, actual, options) {
|
|
615
|
-
const diff =
|
|
669
|
+
const diff = createDiffString(expected, actual);
|
|
616
670
|
return {
|
|
617
671
|
type: "diff",
|
|
618
672
|
label: options?.label,
|
|
@@ -622,8 +676,22 @@ function createDiffLogEntry(expected, actual, options) {
|
|
|
622
676
|
};
|
|
623
677
|
}
|
|
624
678
|
function printJsonDiff(expected, actual, options = {}) {
|
|
625
|
-
const
|
|
626
|
-
|
|
679
|
+
const diff = createDiffString(expected, actual);
|
|
680
|
+
if (options.color) {
|
|
681
|
+
const lines = diff.split("\n").map((line) => {
|
|
682
|
+
const trimmed = line.trimStart();
|
|
683
|
+
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
684
|
+
return `\x1B[31m${line}\x1B[0m`;
|
|
685
|
+
}
|
|
686
|
+
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
687
|
+
return `\x1B[32m${line}\x1B[0m`;
|
|
688
|
+
}
|
|
689
|
+
return line;
|
|
690
|
+
});
|
|
691
|
+
const colored = lines.join("\n");
|
|
692
|
+
console.log(colored || "(no differences)");
|
|
693
|
+
return colored;
|
|
694
|
+
}
|
|
627
695
|
console.log(diff || "(no differences)");
|
|
628
696
|
return diff;
|
|
629
697
|
}
|