@m4trix/evals 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +179 -88
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +179 -88
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +124 -50
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +124 -50
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +120 -45
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +42 -6
- package/dist/index.js +119 -46
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -523,7 +523,11 @@ var Metric = {
|
|
|
523
523
|
name: config.name,
|
|
524
524
|
aggregate: config.aggregate,
|
|
525
525
|
format: config.format,
|
|
526
|
-
make: (data) => ({
|
|
526
|
+
make: (data, options) => ({
|
|
527
|
+
id: config.id,
|
|
528
|
+
data,
|
|
529
|
+
...options?.name !== void 0 && { name: options.name }
|
|
530
|
+
})
|
|
527
531
|
};
|
|
528
532
|
registry.set(config.id, def);
|
|
529
533
|
return def;
|
|
@@ -535,20 +539,107 @@ function getMetricById(id) {
|
|
|
535
539
|
|
|
536
540
|
// src/evals/score.ts
|
|
537
541
|
var registry2 = /* @__PURE__ */ new Map();
|
|
542
|
+
function formatScoreData(def, data, options) {
|
|
543
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
544
|
+
}
|
|
545
|
+
var ScoreAggregate = {
|
|
546
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
547
|
+
averageFields(fields) {
|
|
548
|
+
return (values) => {
|
|
549
|
+
const count = values.length || 1;
|
|
550
|
+
const result = {};
|
|
551
|
+
for (const field of fields) {
|
|
552
|
+
result[field] = values.reduce(
|
|
553
|
+
(s, v) => s + (v[field] ?? 0),
|
|
554
|
+
0
|
|
555
|
+
) / count;
|
|
556
|
+
}
|
|
557
|
+
return result;
|
|
558
|
+
};
|
|
559
|
+
},
|
|
560
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
561
|
+
averageWithVariance(fields) {
|
|
562
|
+
return (values) => {
|
|
563
|
+
const count = values.length;
|
|
564
|
+
const result = {};
|
|
565
|
+
for (const field of fields) {
|
|
566
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
567
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
568
|
+
0
|
|
569
|
+
) / count;
|
|
570
|
+
}
|
|
571
|
+
const valueField = "value";
|
|
572
|
+
const hasValueField = fields.includes(valueField);
|
|
573
|
+
if (count === 0) {
|
|
574
|
+
if (hasValueField) {
|
|
575
|
+
result[valueField] = 0;
|
|
576
|
+
}
|
|
577
|
+
return {
|
|
578
|
+
...result,
|
|
579
|
+
stdDev: void 0,
|
|
580
|
+
count: 0
|
|
581
|
+
};
|
|
582
|
+
}
|
|
583
|
+
let stdDev;
|
|
584
|
+
if (hasValueField && count >= 2) {
|
|
585
|
+
const sum = values.reduce(
|
|
586
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
587
|
+
0
|
|
588
|
+
);
|
|
589
|
+
const sumSq = values.reduce(
|
|
590
|
+
(s, v) => {
|
|
591
|
+
const value = v[valueField] ?? 0;
|
|
592
|
+
return s + value * value;
|
|
593
|
+
},
|
|
594
|
+
0
|
|
595
|
+
);
|
|
596
|
+
const mean = sum / count;
|
|
597
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
598
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
599
|
+
}
|
|
600
|
+
return {
|
|
601
|
+
...values[0],
|
|
602
|
+
...result,
|
|
603
|
+
stdDev,
|
|
604
|
+
count
|
|
605
|
+
};
|
|
606
|
+
};
|
|
607
|
+
},
|
|
608
|
+
/** All runs must pass. Use for binary scores. */
|
|
609
|
+
all(values) {
|
|
610
|
+
const total = values.length;
|
|
611
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
612
|
+
return {
|
|
613
|
+
...values[0],
|
|
614
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
615
|
+
passedCount,
|
|
616
|
+
totalCount: total
|
|
617
|
+
};
|
|
618
|
+
},
|
|
619
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
620
|
+
last(values) {
|
|
621
|
+
return values[values.length - 1] ?? {};
|
|
622
|
+
}
|
|
623
|
+
};
|
|
538
624
|
var Score = {
|
|
625
|
+
aggregate: ScoreAggregate,
|
|
539
626
|
of(config) {
|
|
540
627
|
const def = {
|
|
541
628
|
id: config.id,
|
|
542
629
|
name: config.name,
|
|
543
630
|
displayStrategy: config.displayStrategy,
|
|
544
|
-
|
|
545
|
-
|
|
631
|
+
formatValue: config.formatValue,
|
|
632
|
+
formatAggregate: config.formatAggregate,
|
|
633
|
+
aggregateValues: config.aggregateValues,
|
|
546
634
|
make: (data, options) => {
|
|
547
635
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
548
636
|
return {
|
|
549
637
|
id: config.id,
|
|
550
638
|
data,
|
|
551
|
-
...passed !== void 0 && { passed }
|
|
639
|
+
...passed !== void 0 && { passed },
|
|
640
|
+
...options?.name !== void 0 && { name: options.name },
|
|
641
|
+
def
|
|
642
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
552
643
|
};
|
|
553
644
|
}
|
|
554
645
|
};
|
|
@@ -561,29 +652,6 @@ function getScoreById(id) {
|
|
|
561
652
|
}
|
|
562
653
|
|
|
563
654
|
// src/evals/aggregators.ts
|
|
564
|
-
function aggregateAverageWithVariance(values) {
|
|
565
|
-
if (values.length === 0) {
|
|
566
|
-
return { value: 0, count: 0 };
|
|
567
|
-
}
|
|
568
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
569
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
570
|
-
const mean = sum / values.length;
|
|
571
|
-
let stdDev;
|
|
572
|
-
if (values.length >= 2) {
|
|
573
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
574
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
575
|
-
}
|
|
576
|
-
return { value: mean, stdDev, count: values.length };
|
|
577
|
-
}
|
|
578
|
-
function aggregateAll(values) {
|
|
579
|
-
const total = values.length;
|
|
580
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
581
|
-
return {
|
|
582
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
583
|
-
passedCount,
|
|
584
|
-
totalCount: total
|
|
585
|
-
};
|
|
586
|
-
}
|
|
587
655
|
function aggregateTokenCountSum(values) {
|
|
588
656
|
const initial = {
|
|
589
657
|
input: 0,
|
|
@@ -636,29 +704,31 @@ var percentScore = Score.of({
|
|
|
636
704
|
id: "percent",
|
|
637
705
|
name: "Score",
|
|
638
706
|
displayStrategy: "bar",
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
707
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
708
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
709
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
710
|
+
});
|
|
711
|
+
var deltaScore = Score.of({
|
|
712
|
+
id: "delta",
|
|
713
|
+
name: "Delta",
|
|
714
|
+
displayStrategy: "number",
|
|
715
|
+
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
716
|
+
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
717
|
+
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
646
718
|
});
|
|
647
719
|
var binaryScore = Score.of({
|
|
648
720
|
id: "binary",
|
|
649
721
|
name: "Result",
|
|
650
722
|
displayStrategy: "passFail",
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
}
|
|
657
|
-
return base;
|
|
723
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
724
|
+
formatAggregate: (data) => {
|
|
725
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
726
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
727
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
658
728
|
}
|
|
659
|
-
return
|
|
729
|
+
return base;
|
|
660
730
|
},
|
|
661
|
-
|
|
731
|
+
aggregateValues: Score.aggregate.all
|
|
662
732
|
});
|
|
663
733
|
function createDiffString(expected, actual, diffOptions) {
|
|
664
734
|
const opts = { ...diffOptions, color: false };
|
|
@@ -974,9 +1044,12 @@ async function collectTestCasesFromFiles(config) {
|
|
|
974
1044
|
}
|
|
975
1045
|
|
|
976
1046
|
// src/runner/score-utils.ts
|
|
1047
|
+
function getScoreDef(item) {
|
|
1048
|
+
return item.def ?? getScoreById(item.id);
|
|
1049
|
+
}
|
|
977
1050
|
function toNumericScoreFromScores(scores) {
|
|
978
1051
|
for (const item of scores) {
|
|
979
|
-
const def =
|
|
1052
|
+
const def = getScoreDef(item);
|
|
980
1053
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
981
1054
|
const value = item.data.value;
|
|
982
1055
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -1364,7 +1437,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
|
|
|
1364
1437
|
() => appendJsonLine(message.artifactPath, {
|
|
1365
1438
|
runId: message.runId,
|
|
1366
1439
|
ts: Date.now(),
|
|
1367
|
-
...message.payload
|
|
1440
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
1368
1441
|
})
|
|
1369
1442
|
);
|
|
1370
1443
|
})
|
|
@@ -1678,6 +1751,8 @@ exports.createLogEntry = createLogEntry;
|
|
|
1678
1751
|
exports.createRunner = createRunner;
|
|
1679
1752
|
exports.defaultRunnerConfig = defaultRunnerConfig;
|
|
1680
1753
|
exports.defineConfig = defineConfig;
|
|
1754
|
+
exports.deltaScore = deltaScore;
|
|
1755
|
+
exports.formatScoreData = formatScoreData;
|
|
1681
1756
|
exports.getLogLines = getLogLines;
|
|
1682
1757
|
exports.getMetricById = getMetricById;
|
|
1683
1758
|
exports.getScoreById = getScoreById;
|