@m4trix/evals 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +113 -76
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +113 -76
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +87 -47
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +87 -47
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +86 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +54 -5
- package/dist/index.js +85 -45
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -535,20 +535,70 @@ function getMetricById(id) {
|
|
|
535
535
|
|
|
536
536
|
// src/evals/score.ts
|
|
537
537
|
var registry2 = /* @__PURE__ */ new Map();
|
|
538
|
+
/**
 * Render score data as a display string using the formatter supplied by the
 * score definition.
 *
 * @param {object} def - Score definition exposing `formatValue` and `formatAggregate`.
 * @param {*} data - Score payload handed to the chosen formatter unchanged.
 * @param {{ isAggregated?: boolean }} [options] - When `isAggregated` is truthy the
 *   aggregate formatter is used; otherwise the single-value formatter.
 * @returns {*} Whatever the selected formatter returns.
 */
function formatScoreData(def, data, options) {
  if (options?.isAggregated) {
    return def.formatAggregate(data);
  }
  return def.formatValue(data);
}
|
|
541
|
+
/**
 * Built-in strategies for combining the score data of several runs into one
 * aggregated data object.
 *
 * Every member is written without `this` so it can be handed around detached
 * (e.g. `aggregateValues: Score.aggregate.all`).
 */
var ScoreAggregate = {
  /**
   * Average numeric fields. Use for scores like { value, delta }.
   *
   * @param {string[]} fields - Names of the fields to average.
   * @returns {(values: object[]) => object} Aggregator producing an object with
   *   one averaged entry per requested field. A field that is null/undefined on
   *   a run counts as 0; an empty `values` array yields 0 for every field.
   */
  averageFields(fields) {
    return (values) => {
      // Divide by 1 for an empty list so the result is 0 rather than NaN.
      const divisor = values.length || 1;
      const averaged = {};
      for (const field of fields) {
        const total = values.reduce((acc, entry) => acc + (entry[field] ?? 0), 0);
        averaged[field] = total / divisor;
      }
      return averaged;
    };
  },
  /**
   * Average `value` with sample std dev. Use for percent-style scores.
   *
   * @param {Array<{ value: number }>} values - Per-run score data.
   * @returns {object} The first run's data with `value` replaced by the mean,
   *   plus `stdDev` (undefined when fewer than two runs) and `count`.
   *   An empty input yields `{ value: 0, stdDev: undefined, count: 0 }`.
   */
  averageWithVariance(values) {
    const count = values.length;
    if (count === 0) {
      return { value: 0, stdDev: void 0, count: 0 };
    }
    const mean = values.reduce((acc, entry) => acc + entry.value, 0) / count;
    let stdDev;
    if (count >= 2) {
      // Two-pass sum of squared deviations: the shortcut
      // `sumSq - n * mean^2` cancels catastrophically when the values are
      // large compared to their spread.
      const squaredDeviations = values.reduce((acc, entry) => {
        const deviation = entry.value - mean;
        return acc + deviation * deviation;
      }, 0);
      const variance = squaredDeviations / (count - 1);
      // Comparison is false for NaN, so non-numeric input still reports 0.
      stdDev = variance > 0 ? Math.sqrt(variance) : 0;
    }
    return { ...values[0], value: mean, stdDev, count };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param {Array<{ passed?: boolean }>} values - Per-run score data.
   * @returns {object} The first run's data with `passed` true only when there
   *   is at least one run and every run passed, plus `passedCount` and
   *   `totalCount`.
   */
  all(values) {
    const totalCount = values.length;
    const passedCount = values.filter((entry) => entry.passed).length;
    return {
      ...values[0],
      passed: totalCount > 0 && passedCount === totalCount,
      passedCount,
      totalCount
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param {object[]} values - Per-run score data.
   * @returns {object} The final element, or `{}` when it is missing or nullish.
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
|
|
538
584
|
var Score = {
|
|
585
|
+
aggregate: ScoreAggregate,
|
|
539
586
|
of(config) {
|
|
540
587
|
const def = {
|
|
541
588
|
id: config.id,
|
|
542
589
|
name: config.name,
|
|
543
590
|
displayStrategy: config.displayStrategy,
|
|
544
|
-
|
|
545
|
-
|
|
591
|
+
formatValue: config.formatValue,
|
|
592
|
+
formatAggregate: config.formatAggregate,
|
|
593
|
+
aggregateValues: config.aggregateValues,
|
|
546
594
|
make: (data, options) => {
|
|
547
595
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
548
596
|
return {
|
|
549
597
|
id: config.id,
|
|
550
598
|
data,
|
|
551
|
-
...passed !== void 0 && { passed }
|
|
599
|
+
...passed !== void 0 && { passed },
|
|
600
|
+
def
|
|
601
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
552
602
|
};
|
|
553
603
|
}
|
|
554
604
|
};
|
|
@@ -561,29 +611,6 @@ function getScoreById(id) {
|
|
|
561
611
|
}
|
|
562
612
|
|
|
563
613
|
// src/evals/aggregators.ts
|
|
564
|
-
function aggregateAverageWithVariance(values) {
|
|
565
|
-
if (values.length === 0) {
|
|
566
|
-
return { value: 0, count: 0 };
|
|
567
|
-
}
|
|
568
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
569
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
570
|
-
const mean = sum / values.length;
|
|
571
|
-
let stdDev;
|
|
572
|
-
if (values.length >= 2) {
|
|
573
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
574
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
575
|
-
}
|
|
576
|
-
return { value: mean, stdDev, count: values.length };
|
|
577
|
-
}
|
|
578
|
-
function aggregateAll(values) {
|
|
579
|
-
const total = values.length;
|
|
580
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
581
|
-
return {
|
|
582
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
583
|
-
passedCount,
|
|
584
|
-
totalCount: total
|
|
585
|
-
};
|
|
586
|
-
}
|
|
587
614
|
function aggregateTokenCountSum(values) {
|
|
588
615
|
const initial = {
|
|
589
616
|
input: 0,
|
|
@@ -636,29 +663,31 @@ var percentScore = Score.of({
|
|
|
636
663
|
id: "percent",
|
|
637
664
|
name: "Score",
|
|
638
665
|
displayStrategy: "bar",
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
666
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
667
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
668
|
+
aggregateValues: Score.aggregate.averageWithVariance
|
|
669
|
+
});
|
|
670
|
+
/**
 * Score definition with id "delta": a numeric value shown together with its
 * signed difference from a baseline. Aggregation averages the `value` and
 * `delta` fields across runs.
 */
var deltaScore = Score.of({
  id: "delta",
  name: "Delta",
  displayStrategy: "number",
  // Single run: "<value> (<signed delta> vs baseline)", two decimals each.
  formatValue: (data) => {
    const valueText = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    const deltaText = data.delta.toFixed(2);
    return `${valueText} (${sign}${deltaText} vs baseline)`;
  },
  // Aggregated runs: "Avg: <value> (Delta: <signed delta>)", two decimals each.
  formatAggregate: (data) => {
    const valueText = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    const deltaText = data.delta.toFixed(2);
    return `Avg: ${valueText} (Delta: ${sign}${deltaText})`;
  },
  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
});
|
|
647
678
|
var binaryScore = Score.of({
|
|
648
679
|
id: "binary",
|
|
649
680
|
name: "Result",
|
|
650
681
|
displayStrategy: "passFail",
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
}
|
|
657
|
-
return base;
|
|
682
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
683
|
+
formatAggregate: (data) => {
|
|
684
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
685
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
686
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
658
687
|
}
|
|
659
|
-
return
|
|
688
|
+
return base;
|
|
660
689
|
},
|
|
661
|
-
|
|
690
|
+
aggregateValues: Score.aggregate.all
|
|
662
691
|
});
|
|
663
692
|
function createDiffString(expected, actual, diffOptions) {
|
|
664
693
|
const opts = { ...diffOptions, color: false };
|
|
@@ -974,9 +1003,12 @@ async function collectTestCasesFromFiles(config) {
|
|
|
974
1003
|
}
|
|
975
1004
|
|
|
976
1005
|
// src/runner/score-utils.ts
|
|
1006
|
+
/**
 * Resolve the score definition for a score item.
 *
 * @param {{ id: string, def?: object | null }} item - Score item.
 * @returns {*} `item.def` when it is neither null nor undefined; otherwise the
 *   result of `getScoreById(item.id)`.
 */
function getScoreDef(item) {
  const attached = item.def;
  if (attached !== null && attached !== undefined) {
    return attached;
  }
  return getScoreById(item.id);
}
|
|
977
1009
|
function toNumericScoreFromScores(scores) {
|
|
978
1010
|
for (const item of scores) {
|
|
979
|
-
const def =
|
|
1011
|
+
const def = getScoreDef(item);
|
|
980
1012
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
981
1013
|
const value = item.data.value;
|
|
982
1014
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -1057,6 +1089,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1057
1089
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1058
1090
|
const rerunPassed = [];
|
|
1059
1091
|
for (let r = 0; r < reruns; r++) {
|
|
1092
|
+
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1060
1093
|
const started = Date.now();
|
|
1061
1094
|
const evaluatorScores = [];
|
|
1062
1095
|
let testCaseError;
|
|
@@ -1083,6 +1116,11 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1083
1116
|
input: testCaseItem.testCase.getInput(),
|
|
1084
1117
|
ctx,
|
|
1085
1118
|
output,
|
|
1119
|
+
meta: {
|
|
1120
|
+
triggerId: task.triggerId,
|
|
1121
|
+
runId: evaluatorRunId,
|
|
1122
|
+
datasetId: task.datasetId
|
|
1123
|
+
},
|
|
1086
1124
|
logDiff,
|
|
1087
1125
|
log
|
|
1088
1126
|
})
|
|
@@ -1358,7 +1396,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
|
|
|
1358
1396
|
() => appendJsonLine(message.artifactPath, {
|
|
1359
1397
|
runId: message.runId,
|
|
1360
1398
|
ts: Date.now(),
|
|
1361
|
-
...message.payload
|
|
1399
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
1362
1400
|
})
|
|
1363
1401
|
);
|
|
1364
1402
|
})
|
|
@@ -1542,6 +1580,7 @@ var EffectRunner = class {
|
|
|
1542
1580
|
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1543
1581
|
0
|
|
1544
1582
|
);
|
|
1583
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1545
1584
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1546
1585
|
const artifactPath = createArtifactPath(
|
|
1547
1586
|
this.config.artifactDirectory,
|
|
@@ -1583,6 +1622,7 @@ var EffectRunner = class {
|
|
|
1583
1622
|
await effect.Effect.runPromise(
|
|
1584
1623
|
effect.Queue.offer(this.runQueue, {
|
|
1585
1624
|
runId,
|
|
1625
|
+
triggerId,
|
|
1586
1626
|
datasetId: request.datasetId,
|
|
1587
1627
|
dataset: dataset.dataset,
|
|
1588
1628
|
evaluators: selectedEvaluators,
|
|
@@ -1670,6 +1710,8 @@ exports.createLogEntry = createLogEntry;
|
|
|
1670
1710
|
exports.createRunner = createRunner;
|
|
1671
1711
|
exports.defaultRunnerConfig = defaultRunnerConfig;
|
|
1672
1712
|
exports.defineConfig = defineConfig;
|
|
1713
|
+
exports.deltaScore = deltaScore;
|
|
1714
|
+
exports.formatScoreData = formatScoreData;
|
|
1673
1715
|
exports.getLogLines = getLogLines;
|
|
1674
1716
|
exports.getMetricById = getMetricById;
|
|
1675
1717
|
exports.getScoreById = getScoreById;
|