@m4trix/evals 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +105 -76
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +105 -76
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +79 -47
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +79 -47
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +78 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +36 -5
- package/dist/index.js +77 -45
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -535,20 +535,70 @@ function getMetricById(id) {
|
|
|
535
535
|
|
|
536
536
|
// src/evals/score.ts
|
|
537
537
|
var registry2 = /* @__PURE__ */ new Map();
|
|
538
|
+
/**
 * Turn a score payload into display text using the formatters on its definition.
 *
 * @param {object} def - Score definition providing `formatValue` and `formatAggregate`.
 * @param {*} data - Score payload passed unchanged to the selected formatter.
 * @param {{ isAggregated?: boolean }} [options] - When `isAggregated` is truthy the
 *   aggregate formatter is used; otherwise the single-value formatter is used.
 * @returns {*} Whatever the selected formatter returns.
 */
function formatScoreData(def, data, options) {
  if (options?.isAggregated) {
    return def.formatAggregate(data);
  }
  return def.formatValue(data);
}
|
|
541
|
+
/**
 * Built-in reducers that collapse the per-run values of one score into a
 * single aggregated value.
 *
 * None of the members read `this`, so they can be passed around as bare
 * function references (e.g. `aggregateValues: ScoreAggregate.all`).
 */
var ScoreAggregate = {
  /**
   * Average numeric fields. Use for scores like { value, delta }.
   *
   * @param {string[]} fields - Names of the numeric fields to average.
   * @returns {(values: object[]) => object} Reducer returning an object with
   *   one mean per requested field.
   */
  averageFields(fields) {
    return (values) => {
      // Divide by 1 for an empty list so each field comes out as 0 rather than NaN.
      const count = values.length || 1;
      const result = {};
      for (const field of fields) {
        // A null/undefined field contributes 0 to the sum.
        result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
      }
      return result;
    };
  },
  /**
   * Average `value` with sample std dev. Use for percent-style scores.
   *
   * @param {Array<{ value: number }>} values - Per-run score payloads.
   * @returns {object} The first payload's fields with `value` replaced by the
   *   mean, plus `stdDev` (undefined when fewer than two values) and `count`.
   */
  averageWithVariance(values) {
    const count = values.length;
    if (count === 0) {
      return { value: 0, stdDev: void 0, count: 0 };
    }
    const mean = values.reduce((s, v) => s + v.value, 0) / count;
    let stdDev;
    if (count >= 2) {
      // Two-pass form: sum the squared deviations from the mean. The shortcut
      // (sumSq - n * mean^2) subtracts two nearly equal large numbers and loses
      // most of its precision when the values are large relative to their spread.
      const squaredDeviations = values.reduce((s, v) => {
        const deviation = v.value - mean;
        return s + deviation * deviation;
      }, 0);
      // Bessel's correction (n - 1) gives the sample variance.
      const variance = squaredDeviations / (count - 1);
      // The comparison is false for NaN, so non-numeric input reports 0.
      stdDev = variance > 0 ? Math.sqrt(variance) : 0;
    }
    return { ...values[0], value: mean, stdDev, count };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param {Array<{ passed?: boolean }>} values - Per-run score payloads.
   * @returns {object} The first payload's fields with `passed` set to whether
   *   every run passed (false for an empty list), plus `passedCount` and
   *   `totalCount`.
   */
  all(values) {
    const totalCount = values.length;
    const passedCount = values.filter((v) => v.passed).length;
    return {
      ...values[0],
      passed: totalCount > 0 && passedCount === totalCount,
      passedCount,
      totalCount
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param {object[]} values - Per-run score payloads.
   * @returns {object} The final payload, or an empty object when it is
   *   null/undefined (including the empty-list case).
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
|
|
538
584
|
var Score = {
|
|
585
|
+
aggregate: ScoreAggregate,
|
|
539
586
|
of(config) {
|
|
540
587
|
const def = {
|
|
541
588
|
id: config.id,
|
|
542
589
|
name: config.name,
|
|
543
590
|
displayStrategy: config.displayStrategy,
|
|
544
|
-
|
|
545
|
-
|
|
591
|
+
formatValue: config.formatValue,
|
|
592
|
+
formatAggregate: config.formatAggregate,
|
|
593
|
+
aggregateValues: config.aggregateValues,
|
|
546
594
|
make: (data, options) => {
|
|
547
595
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
548
596
|
return {
|
|
549
597
|
id: config.id,
|
|
550
598
|
data,
|
|
551
|
-
...passed !== void 0 && { passed }
|
|
599
|
+
...passed !== void 0 && { passed },
|
|
600
|
+
def
|
|
601
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
552
602
|
};
|
|
553
603
|
}
|
|
554
604
|
};
|
|
@@ -561,29 +611,6 @@ function getScoreById(id) {
|
|
|
561
611
|
}
|
|
562
612
|
|
|
563
613
|
// src/evals/aggregators.ts
|
|
564
|
-
function aggregateAverageWithVariance(values) {
|
|
565
|
-
if (values.length === 0) {
|
|
566
|
-
return { value: 0, count: 0 };
|
|
567
|
-
}
|
|
568
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
569
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
570
|
-
const mean = sum / values.length;
|
|
571
|
-
let stdDev;
|
|
572
|
-
if (values.length >= 2) {
|
|
573
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
574
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
575
|
-
}
|
|
576
|
-
return { value: mean, stdDev, count: values.length };
|
|
577
|
-
}
|
|
578
|
-
function aggregateAll(values) {
|
|
579
|
-
const total = values.length;
|
|
580
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
581
|
-
return {
|
|
582
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
583
|
-
passedCount,
|
|
584
|
-
totalCount: total
|
|
585
|
-
};
|
|
586
|
-
}
|
|
587
614
|
function aggregateTokenCountSum(values) {
|
|
588
615
|
const initial = {
|
|
589
616
|
input: 0,
|
|
@@ -636,29 +663,31 @@ var percentScore = Score.of({
|
|
|
636
663
|
id: "percent",
|
|
637
664
|
name: "Score",
|
|
638
665
|
displayStrategy: "bar",
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
666
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
667
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
668
|
+
aggregateValues: Score.aggregate.averageWithVariance
|
|
669
|
+
});
|
|
670
|
+
/**
 * Score definition for a numeric value reported together with its delta.
 * Single values and aggregates are both rendered with two decimals and an
 * explicit "+" on non-negative deltas; aggregation averages `value` and `delta`.
 */
var deltaScore = Score.of({
  id: "delta",
  name: "Delta",
  displayStrategy: "number",
  formatValue(data) {
    const valueText = data.value.toFixed(2);
    // Negative numbers already carry their own "-" from toFixed.
    const sign = data.delta >= 0 ? "+" : "";
    const deltaText = data.delta.toFixed(2);
    return `${valueText} (${sign}${deltaText} vs baseline)`;
  },
  formatAggregate(data) {
    const valueText = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    const deltaText = data.delta.toFixed(2);
    return `Avg: ${valueText} (Delta: ${sign}${deltaText})`;
  },
  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
});
|
|
647
678
|
/**
 * Score definition for pass/fail results. A single run renders as
 * "PASSED" / "NOT PASSED"; an aggregate renders an overall verdict and, when
 * more than one run was counted, the passed/total tally.
 */
var binaryScore = Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  formatValue(data) {
    if (data.passed) {
      return "PASSED";
    }
    return "NOT PASSED";
  },
  formatAggregate(data) {
    const verdict = data.passed ? "All: PASSED" : "Some: FAILED";
    // Only append the tally when both counts exist and more than one run was counted.
    if (data.passedCount == null || data.totalCount == null || !(data.totalCount > 1)) {
      return verdict;
    }
    return `${verdict} (${data.passedCount}/${data.totalCount})`;
  },
  aggregateValues: Score.aggregate.all
});
|
|
663
692
|
function createDiffString(expected, actual, diffOptions) {
|
|
664
693
|
const opts = { ...diffOptions, color: false };
|
|
@@ -974,9 +1003,12 @@ async function collectTestCasesFromFiles(config) {
|
|
|
974
1003
|
}
|
|
975
1004
|
|
|
976
1005
|
// src/runner/score-utils.ts
|
|
1006
|
+
/**
 * Resolve the score definition for a score item.
 *
 * @param {{ id: *, def?: * }} item - Score item; `def` is used when present.
 * @returns {*} `item.def` unless it is null/undefined, in which case the
 *   result of `getScoreById(item.id)`.
 */
function getScoreDef(item) {
  const attached = item.def;
  if (attached !== null && attached !== undefined) {
    return attached;
  }
  return getScoreById(item.id);
}
|
|
977
1009
|
function toNumericScoreFromScores(scores) {
|
|
978
1010
|
for (const item of scores) {
|
|
979
|
-
const def =
|
|
1011
|
+
const def = getScoreDef(item);
|
|
980
1012
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
981
1013
|
const value = item.data.value;
|
|
982
1014
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -1364,7 +1396,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
|
|
|
1364
1396
|
() => appendJsonLine(message.artifactPath, {
|
|
1365
1397
|
runId: message.runId,
|
|
1366
1398
|
ts: Date.now(),
|
|
1367
|
-
...message.payload
|
|
1399
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
1368
1400
|
})
|
|
1369
1401
|
);
|
|
1370
1402
|
})
|
|
@@ -1678,6 +1710,8 @@ exports.createLogEntry = createLogEntry;
|
|
|
1678
1710
|
exports.createRunner = createRunner;
|
|
1679
1711
|
exports.defaultRunnerConfig = defaultRunnerConfig;
|
|
1680
1712
|
exports.defineConfig = defineConfig;
|
|
1713
|
+
exports.deltaScore = deltaScore;
|
|
1714
|
+
exports.formatScoreData = formatScoreData;
|
|
1681
1715
|
exports.getLogLines = getLogLines;
|
|
1682
1716
|
exports.getMetricById = getMetricById;
|
|
1683
1717
|
exports.getScoreById = getScoreById;
|