@m4trix/evals 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +706 -231
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +707 -232
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +710 -390
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +702 -382
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +289 -108
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +28 -5
- package/dist/index.js +290 -109
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/index.cjs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
var effect = require('effect');
|
|
4
|
-
var
|
|
4
|
+
var diff = require('diff');
|
|
5
5
|
var crypto = require('crypto');
|
|
6
6
|
var fs = require('fs');
|
|
7
7
|
var path = require('path');
|
|
@@ -331,15 +331,23 @@ var TestCase = class _TestCase {
|
|
|
331
331
|
this._config = config;
|
|
332
332
|
}
|
|
333
333
|
static describe(config) {
|
|
334
|
+
const reruns = config.reruns ?? 1;
|
|
335
|
+
if (reruns < 1 || !Number.isInteger(reruns)) {
|
|
336
|
+
throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
|
|
337
|
+
}
|
|
334
338
|
return new _TestCase({
|
|
335
339
|
name: config.name,
|
|
336
340
|
tags: config.tags,
|
|
341
|
+
reruns,
|
|
337
342
|
inputSchema: config.inputSchema,
|
|
338
343
|
input: config.input,
|
|
339
344
|
outputSchema: config.outputSchema,
|
|
340
345
|
output: config.output
|
|
341
346
|
});
|
|
342
347
|
}
|
|
348
|
+
getReruns() {
|
|
349
|
+
return this._config.reruns;
|
|
350
|
+
}
|
|
343
351
|
getName() {
|
|
344
352
|
return this._config.name;
|
|
345
353
|
}
|
|
@@ -513,6 +521,7 @@ var Metric = {
|
|
|
513
521
|
const def = {
|
|
514
522
|
id: config.id,
|
|
515
523
|
name: config.name,
|
|
524
|
+
aggregate: config.aggregate,
|
|
516
525
|
format: config.format,
|
|
517
526
|
make: (data) => ({ id: config.id, data })
|
|
518
527
|
};
|
|
@@ -532,6 +541,7 @@ var Score = {
|
|
|
532
541
|
id: config.id,
|
|
533
542
|
name: config.name,
|
|
534
543
|
displayStrategy: config.displayStrategy,
|
|
544
|
+
aggregate: config.aggregate,
|
|
535
545
|
format: config.format,
|
|
536
546
|
make: (data, options) => {
|
|
537
547
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
@@ -550,23 +560,75 @@ function getScoreById(id) {
|
|
|
550
560
|
return registry2.get(id);
|
|
551
561
|
}
|
|
552
562
|
|
|
563
|
+
// src/evals/aggregators.ts
|
|
564
|
+
/**
 * Averages the `value` field of the given entries and, when there are at
 * least two entries, also reports the sample standard deviation (n - 1
 * denominator).
 *
 * The deviation is computed in two passes (mean first, then the sum of
 * squared deviations from that mean). The single-pass form
 * `sumSq - n * mean^2` subtracts two nearly equal large numbers and loses
 * most of its significant digits when the values are large relative to
 * their spread.
 *
 * @param {Array<{ value: number }>} values - Entries to aggregate.
 * @returns {{ value: number, stdDev?: number, count: number }} The mean, the
 *   sample standard deviation (`undefined` for fewer than two entries) and
 *   the number of entries. An empty input yields `{ value: 0, count: 0 }`.
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  let sum = 0;
  for (const entry of values) {
    sum += entry.value;
  }
  const mean = sum / count;
  let stdDev;
  if (count >= 2) {
    let squaredDeviationSum = 0;
    for (const entry of values) {
      const deviation = entry.value - mean;
      squaredDeviationSum += deviation * deviation;
    }
    // A sum of squares is never negative, so no clamping is needed before sqrt.
    stdDev = Math.sqrt(squaredDeviationSum / (count - 1));
  }
  return { value: mean, stdDev, count };
}
|
|
578
|
+
/**
 * Combines pass/fail entries into a single verdict.
 *
 * @param {Array<{ passed: * }>} values - Entries whose `passed` field is
 *   tested for truthiness.
 * @returns {{ passed: boolean, passedCount: number, totalCount: number }}
 *   `passed` is true only when there is at least one entry and every entry
 *   is truthy; the counts report how many passed out of how many.
 */
function aggregateAll(values) {
  const totalCount = values.length;
  let passedCount = 0;
  for (const entry of values) {
    if (entry.passed) {
      passedCount += 1;
    }
  }
  const passed = totalCount > 0 && passedCount === totalCount;
  return { passed, passedCount, totalCount };
}
|
|
587
|
+
/**
 * Sums token counters field by field.
 *
 * @param {Array<{ input?: number, output?: number, inputCached?: number, outputCached?: number }>} values
 *   Entries to add up; a null or undefined field counts as 0.
 * @returns {{ input: number, output: number, inputCached: number, outputCached: number }}
 *   The per-field totals (all zeros for an empty input).
 */
function aggregateTokenCountSum(values) {
  let input = 0;
  let output = 0;
  let inputCached = 0;
  let outputCached = 0;
  for (const entry of values) {
    input += entry.input ?? 0;
    output += entry.output ?? 0;
    inputCached += entry.inputCached ?? 0;
    outputCached += entry.outputCached ?? 0;
  }
  return { input, output, inputCached, outputCached };
}
|
|
604
|
+
/**
 * Arithmetic mean of the `ms` field across the given entries.
 *
 * @param {Array<{ ms: number }>} values - Latency entries.
 * @returns {{ ms: number }} The average; `{ ms: 0 }` for an empty input.
 */
function aggregateLatencyAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { ms: 0 };
  }
  let totalMs = 0;
  for (const entry of values) {
    totalMs += entry.ms;
  }
  return { ms: totalMs / count };
}
|
|
611
|
+
|
|
553
612
|
// src/evals/metrics/standard.ts
|
|
554
613
|
// Token usage metric: aggregated by summing each counter; rendered as
// "in:<n> out:<n> cached:<n>", prefixed with "Total: " for aggregated data.
var tokenCountMetric = Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    // Missing counters are displayed as 0.
    const inTokens = data.input ?? 0;
    const outTokens = data.output ?? 0;
    const cachedTokens = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const summary = `in:${inTokens} out:${outTokens} cached:${cachedTokens}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});
|
|
566
627
|
// Latency metric: aggregated by averaging `ms`; rendered as "<ms>ms",
// prefixed with "Avg: " for aggregated data.
var latencyMetric = Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    if (options?.isAggregated) {
      return `Avg: ${data.ms}ms`;
    }
    return `${data.ms}ms`;
  }
});
|
|
571
633
|
|
|
572
634
|
// src/evals/scores/standard.ts
|
|
@@ -574,16 +636,59 @@ var percentScore = Score.of({
|
|
|
574
636
|
id: "percent",
|
|
575
637
|
name: "Score",
|
|
576
638
|
displayStrategy: "bar",
|
|
577
|
-
format: (data) =>
|
|
639
|
+
format: (data, options) => {
|
|
640
|
+
if (options?.isAggregated) {
|
|
641
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
642
|
+
}
|
|
643
|
+
return data.value.toFixed(2);
|
|
644
|
+
},
|
|
645
|
+
aggregate: aggregateAverageWithVariance
|
|
578
646
|
});
|
|
579
647
|
// Pass/fail score. A single result renders as "PASSED" / "NOT PASSED".
// An aggregated result renders as "All: PASSED" / "Some: FAILED", followed by
// "(passed/total)" when both counts are present and total is greater than 1.
var binaryScore = Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data, options) => {
    if (!options?.isAggregated) {
      return data.passed ? "PASSED" : "NOT PASSED";
    }
    const verdict = data.passed ? "All: PASSED" : "Some: FAILED";
    const hasCounts = data.passedCount != null && data.totalCount != null;
    if (hasCounts && data.totalCount > 1) {
      return `${verdict} (${data.passedCount}/${data.totalCount})`;
    }
    return verdict;
  },
  aggregate: aggregateAll
});
|
|
663
|
+
/**
 * Renders a value as pretty-printed (2-space) JSON text.
 *
 * Always returns a string. `JSON.stringify` returns `undefined` rather than
 * a string for `undefined`, functions and symbols, and throws for values such
 * as cyclic structures or BigInt; in both situations the value is rendered
 * with `String(value)` instead.
 *
 * @param {*} value - Any value.
 * @returns {string} The JSON text, or the `String(value)` fallback.
 */
function toJsonLines(value) {
  let serialized;
  try {
    serialized = JSON.stringify(value, null, 2);
  } catch {
    // Not serializable (e.g. cycle, BigInt): fall back to plain stringification.
    return String(value);
  }
  // Guard against the non-string `undefined` result so callers can rely on
  // string methods being available on the return value.
  return typeof serialized === "string" ? serialized : String(value);
}
|
|
670
|
+
/**
 * Flattens a list of change parts into line-oriented diff text.
 *
 * Every output line is a one-character marker ("+" for added parts, "-" for
 * removed parts, " " otherwise), a space, and then the line's content. A
 * single trailing empty segment produced by a part ending in "\n" is dropped.
 *
 * @param {Array<{ value: string, added?: boolean, removed?: boolean }>} changes
 * @returns {string} The marked lines joined with "\n".
 */
function formatDiffString(changes) {
  const markerFor = (part) => {
    if (part.added) {
      return "+";
    }
    return part.removed ? "-" : " ";
  };
  return changes
    .flatMap((part) => {
      const marker = markerFor(part);
      const segments = part.value.split("\n");
      if (segments[segments.length - 1] === "") {
        segments.pop();
      }
      return segments.map((segment) => `${marker} ${segment}`);
    })
    .join("\n");
}
|
|
684
|
+
/**
 * Produces a line diff between two values.
 *
 * Both values are rendered with `toJsonLines`, compared line by line via
 * `diff.diffLines`, and the resulting changes are formatted with
 * `formatDiffString`.
 *
 * @param {*} expected - The reference value.
 * @param {*} actual - The value compared against the reference.
 * @returns {string} The formatted diff text.
 */
function createDiffString(expected, actual) {
  const lineChanges = diff.diffLines(toJsonLines(expected), toJsonLines(actual));
  return formatDiffString(lineChanges);
}
|
|
585
690
|
function createDiffLogEntry(expected, actual, options) {
|
|
586
|
-
const diff =
|
|
691
|
+
const diff = createDiffString(expected, actual);
|
|
587
692
|
return {
|
|
588
693
|
type: "diff",
|
|
589
694
|
label: options?.label,
|
|
@@ -593,8 +698,22 @@ function createDiffLogEntry(expected, actual, options) {
|
|
|
593
698
|
};
|
|
594
699
|
}
|
|
595
700
|
/**
 * Logs the diff between two values to the console and returns the text that
 * was produced.
 *
 * With `options.color` set, removed lines are wrapped in ANSI red and added
 * lines in ANSI green. The marker is read from the first character of each
 * line, not from the left-trimmed line: unchanged lines begin with a space,
 * so an unchanged line whose content itself starts with "-" or "+" (for
 * example a negative number inside a pretty-printed array) stays uncoloured.
 *
 * @param {*} expected - The reference value.
 * @param {*} actual - The value compared against the reference.
 * @param {{ color?: boolean }} [options] - Display options.
 * @returns {string} The diff text (including colour codes when enabled);
 *   an empty string when there is nothing to show, in which case
 *   "(no differences)" is what gets logged.
 */
function printJsonDiff(expected, actual, options = {}) {
  const plainDiff = createDiffString(expected, actual);
  if (!options.color) {
    console.log(plainDiff || "(no differences)");
    return plainDiff;
  }
  const colorize = (line) => {
    if (line.startsWith("-")) {
      return `\x1B[31m${line}\x1B[0m`;
    }
    if (line.startsWith("+")) {
      return `\x1B[32m${line}\x1B[0m`;
    }
    return line;
  };
  const colored = plainDiff.split("\n").map(colorize).join("\n");
  console.log(colored || "(no differences)");
  return colored;
}
|
|
@@ -621,7 +740,8 @@ var defaultRunnerConfig = {
|
|
|
621
740
|
],
|
|
622
741
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
623
742
|
},
|
|
624
|
-
artifactDirectory: ".eval-results"
|
|
743
|
+
artifactDirectory: ".eval-results",
|
|
744
|
+
maxConcurrency: 1
|
|
625
745
|
};
|
|
626
746
|
function toRunnerConfigOverrides(config) {
|
|
627
747
|
if (!config) {
|
|
@@ -654,6 +774,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
654
774
|
if (config.artifactDirectory !== void 0) {
|
|
655
775
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
656
776
|
}
|
|
777
|
+
if (config.maxConcurrency !== void 0) {
|
|
778
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
779
|
+
}
|
|
657
780
|
if (Object.keys(discovery).length > 0) {
|
|
658
781
|
overrides.discovery = discovery;
|
|
659
782
|
}
|
|
@@ -927,6 +1050,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
927
1050
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
928
1051
|
);
|
|
929
1052
|
}
|
|
1053
|
+
/**
 * Builds the effect that evaluates a single test case, once per configured
 * rerun, against every evaluator of the task.
 *
 * For each rerun it collects one score entry per evaluator, bumps the shared
 * completed counter, updates the run snapshot, publishes a "TestCaseProgress"
 * event and offers the same event to the persistence queue. After all reruns,
 * the test case counts as passed only if every rerun passed; the shared
 * passed/failed counters and the snapshot are updated accordingly.
 *
 * An evaluator that throws or rejects is recorded as a failed entry with an
 * error message instead of aborting the run. The evaluator call runs inside
 * an async function that catches internally and therefore never rejects, so
 * failure handling does not rely on a generator-level try/catch around
 * `yield*` (which does not observe failures raised inside a yielded effect).
 *
 * @param task - Run task; `runId`, `evaluators` and `snapshot.artifactPath` are read.
 * @param testCaseItem - Item providing `id` and `testCase`.
 * @param totalEvaluations - Reported as `totalTestCases` on progress events.
 * @param publishEvent - Function returning an effect that publishes an event.
 * @param persistenceQueue - Queue receiving `{ runId, artifactPath, payload }`.
 * @param updateSnapshot - `(runId, updater)` callback for snapshot updates.
 * @param completedRef - Ref counting completed evaluations (reruns).
 * @param passedRef - Ref counting passed test cases.
 * @param failedRef - Ref counting failed test cases.
 * @returns The effect performing the work described above.
 */
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
  return effect.Effect.gen(function* () {
    // Test cases without a getReruns method are evaluated once.
    const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
    const rerunPassed = [];
    for (let r = 0; r < reruns; r++) {
      const started = Date.now();
      const evaluatorScores = [];
      let testCaseError;
      const output = readOutput(testCaseItem.testCase);
      for (const { id: evaluatorId, evaluator } of task.evaluators) {
        const evaluateFn = evaluator.getEvaluateFn();
        if (!evaluateFn) {
          continue;
        }
        // Resolves to { entry } on success or { error } on failure; never rejects.
        const outcome = yield* effect.Effect.promise(async () => {
          try {
            const logs = [];
            const logDiff = (expected, actual, options) => {
              logs.push(createDiffLogEntry(expected, actual, options));
            };
            const ctx = await evaluator.resolveContext();
            const result = await evaluateFn({
              input: testCaseItem.testCase.getInput(),
              ctx,
              output,
              logDiff
            });
            const { scores, metrics } = normalizeResult(result);
            const evaluatorPassed = computeEvaluatorPassed(evaluator, result, scores);
            return {
              entry: {
                evaluatorId,
                scores,
                passed: evaluatorPassed,
                metrics,
                logs: logs.length > 0 ? logs : void 0
              }
            };
          } catch (error) {
            return { error };
          }
        });
        if ("entry" in outcome) {
          evaluatorScores.push(outcome.entry);
        } else {
          testCaseError = outcome.error instanceof Error ? outcome.error.message : "Evaluator execution failed";
          evaluatorScores.push({
            evaluatorId,
            scores: [],
            passed: false
          });
        }
      }
      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
      rerunPassed.push(rerunPassedThis);
      // Increment the shared counter and read back the new value in one step.
      const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
        n + 1,
        n + 1
      ]);
      const progressEvent = {
        type: "TestCaseProgress",
        runId: task.runId,
        testCaseId: testCaseItem.id,
        testCaseName: testCaseItem.testCase.getName(),
        completedTestCases: completedEvaluations,
        totalTestCases: totalEvaluations,
        rerunIndex: r + 1,
        rerunTotal: reruns,
        passed: rerunPassedThis,
        durationMs: Date.now() - started,
        evaluatorScores,
        output,
        errorMessage: testCaseError
      };
      updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        completedTestCases: completedEvaluations
      }));
      yield* publishEvent(progressEvent);
      yield* effect.Queue.offer(persistenceQueue, {
        runId: task.runId,
        artifactPath: task.snapshot.artifactPath,
        payload: progressEvent
      });
    }
    // A test case passes only when every one of its reruns passed.
    const testCasePassed = rerunPassed.every(Boolean);
    if (testCasePassed) {
      yield* effect.Ref.update(passedRef, (n) => n + 1);
    } else {
      yield* effect.Ref.update(failedRef, (n) => n + 1);
    }
    const [passed, failed] = yield* effect.Effect.all([
      effect.Ref.get(passedRef),
      effect.Ref.get(failedRef)
    ]);
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
|
|
930
1152
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
931
1153
|
const startedAt = Date.now();
|
|
932
1154
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -939,104 +1161,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
939
1161
|
runId: task.runId,
|
|
940
1162
|
startedAt
|
|
941
1163
|
});
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
)
|
|
972
|
-
);
|
|
973
|
-
const { scores, metrics } = normalizeResult(result);
|
|
974
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
975
|
-
evaluatorScores.push({
|
|
976
|
-
evaluatorId,
|
|
977
|
-
scores,
|
|
978
|
-
passed,
|
|
979
|
-
metrics,
|
|
980
|
-
logs: logs.length > 0 ? logs : void 0
|
|
981
|
-
});
|
|
982
|
-
} catch (error) {
|
|
983
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
984
|
-
evaluatorScores.push({
|
|
985
|
-
evaluatorId,
|
|
986
|
-
scores: [],
|
|
987
|
-
passed: false
|
|
988
|
-
});
|
|
989
|
-
}
|
|
990
|
-
}
|
|
991
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
992
|
-
completedTestCases += 1;
|
|
993
|
-
if (testCasePassed) {
|
|
994
|
-
passedTestCases += 1;
|
|
995
|
-
} else {
|
|
996
|
-
failedTestCases += 1;
|
|
997
|
-
}
|
|
998
|
-
const progressEvent = {
|
|
999
|
-
type: "TestCaseProgress",
|
|
1000
|
-
runId: task.runId,
|
|
1001
|
-
testCaseId: testCaseItem.id,
|
|
1002
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
1003
|
-
completedTestCases,
|
|
1004
|
-
totalTestCases: task.testCases.length,
|
|
1005
|
-
passed: testCasePassed,
|
|
1006
|
-
durationMs: Date.now() - started,
|
|
1007
|
-
evaluatorScores,
|
|
1008
|
-
output,
|
|
1009
|
-
errorMessage: testCaseError
|
|
1010
|
-
};
|
|
1011
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1012
|
-
...snapshot,
|
|
1013
|
-
completedTestCases,
|
|
1014
|
-
passedTestCases,
|
|
1015
|
-
failedTestCases
|
|
1016
|
-
}));
|
|
1017
|
-
yield* publishEvent(progressEvent);
|
|
1018
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
1019
|
-
runId: task.runId,
|
|
1020
|
-
artifactPath: task.snapshot.artifactPath,
|
|
1021
|
-
payload: progressEvent
|
|
1022
|
-
});
|
|
1023
|
-
}
|
|
1164
|
+
const totalEvaluations = task.testCases.reduce(
|
|
1165
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1166
|
+
0
|
|
1167
|
+
);
|
|
1168
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1169
|
+
const completedRef = yield* effect.Ref.make(0);
|
|
1170
|
+
const passedRef = yield* effect.Ref.make(0);
|
|
1171
|
+
const failedRef = yield* effect.Ref.make(0);
|
|
1172
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
1173
|
+
task,
|
|
1174
|
+
testCaseItem,
|
|
1175
|
+
totalEvaluations,
|
|
1176
|
+
publishEvent,
|
|
1177
|
+
persistenceQueue,
|
|
1178
|
+
updateSnapshot,
|
|
1179
|
+
completedRef,
|
|
1180
|
+
passedRef,
|
|
1181
|
+
failedRef
|
|
1182
|
+
);
|
|
1183
|
+
yield* effect.Effect.forEach(
|
|
1184
|
+
task.testCases,
|
|
1185
|
+
processTestCase,
|
|
1186
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1187
|
+
);
|
|
1188
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
1189
|
+
effect.Ref.get(completedRef),
|
|
1190
|
+
effect.Ref.get(passedRef),
|
|
1191
|
+
effect.Ref.get(failedRef)
|
|
1192
|
+
]);
|
|
1024
1193
|
const finishedAt = Date.now();
|
|
1025
1194
|
const completedEvent = {
|
|
1026
1195
|
type: "RunCompleted",
|
|
1027
1196
|
runId: task.runId,
|
|
1028
1197
|
finishedAt,
|
|
1029
|
-
passedTestCases,
|
|
1030
|
-
failedTestCases,
|
|
1198
|
+
passedTestCases: passedUniqueTestCases,
|
|
1199
|
+
failedTestCases: failedUniqueTestCases,
|
|
1031
1200
|
totalTestCases: task.testCases.length,
|
|
1032
1201
|
artifactPath: task.snapshot.artifactPath
|
|
1033
1202
|
};
|
|
1034
1203
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
1035
1204
|
...snapshot,
|
|
1036
1205
|
status: "completed",
|
|
1037
|
-
completedTestCases,
|
|
1038
|
-
passedTestCases,
|
|
1039
|
-
failedTestCases,
|
|
1206
|
+
completedTestCases: completedEvaluations,
|
|
1207
|
+
passedTestCases: passedUniqueTestCases,
|
|
1208
|
+
failedTestCases: failedUniqueTestCases,
|
|
1040
1209
|
finishedAt
|
|
1041
1210
|
}));
|
|
1042
1211
|
yield* publishEvent(completedEvent);
|
|
@@ -1124,7 +1293,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1124
1293
|
const artifactPath = filePath;
|
|
1125
1294
|
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1126
1295
|
const progress = aggregateTestCaseProgress(lines);
|
|
1127
|
-
const completedTestCases = runCompleted
|
|
1296
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1128
1297
|
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1129
1298
|
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1130
1299
|
return {
|
|
@@ -1146,23 +1315,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1146
1315
|
}
|
|
1147
1316
|
/**
 * Rebuilds progress counters from the raw lines of a run artifact.
 *
 * Only lines that parse as JSON with `type === "TestCaseProgress"` are
 * considered; anything else (including unparseable lines) is ignored.
 * `completedTestCases` follows the most recent event that carries that field.
 * Events are grouped by `testCaseId`, and an id counts as passed only when
 * the `passed` flags of its events combine to a truthy value.
 *
 * @param {string[]} lines - Raw artifact lines, one JSON document each.
 * @returns {{ completedTestCases: number, passedTestCases: number, failedTestCases: number }}
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const verdictByTestCase = /* @__PURE__ */ new Map();
  for (const rawLine of lines) {
    let parsed;
    try {
      parsed = JSON.parse(rawLine);
    } catch {
      // Unparseable lines are skipped on purpose.
      continue;
    }
    if (parsed?.type !== "TestCaseProgress") {
      continue;
    }
    completedTestCases = parsed.completedTestCases ?? completedTestCases;
    const previous = verdictByTestCase.get(parsed.testCaseId);
    const combined = previous === void 0 ? parsed.passed : previous && parsed.passed;
    verdictByTestCase.set(parsed.testCaseId, combined);
  }
  const passedTestCases = [...verdictByTestCase.values()].filter(Boolean).length;
  const failedTestCases = verdictByTestCase.size - passedTestCases;
  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
1168
1343
|
async function appendJsonLine(artifactPath, payload) {
|
|
@@ -1357,6 +1532,10 @@ var EffectRunner = class {
|
|
|
1357
1532
|
throw new Error("No evaluators selected for run");
|
|
1358
1533
|
}
|
|
1359
1534
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1535
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1536
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1537
|
+
0
|
|
1538
|
+
);
|
|
1360
1539
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1361
1540
|
const artifactPath = createArtifactPath(
|
|
1362
1541
|
this.config.artifactDirectory,
|
|
@@ -1369,7 +1548,7 @@ var EffectRunner = class {
|
|
|
1369
1548
|
datasetName: dataset.dataset.getName(),
|
|
1370
1549
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1371
1550
|
queuedAt: Date.now(),
|
|
1372
|
-
totalTestCases:
|
|
1551
|
+
totalTestCases: totalEvaluations,
|
|
1373
1552
|
completedTestCases: 0,
|
|
1374
1553
|
passedTestCases: 0,
|
|
1375
1554
|
failedTestCases: 0,
|
|
@@ -1383,7 +1562,7 @@ var EffectRunner = class {
|
|
|
1383
1562
|
datasetId: request.datasetId,
|
|
1384
1563
|
datasetName: dataset.dataset.getName(),
|
|
1385
1564
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1386
|
-
totalTestCases:
|
|
1565
|
+
totalTestCases: totalEvaluations,
|
|
1387
1566
|
artifactPath
|
|
1388
1567
|
};
|
|
1389
1568
|
await effect.Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -1394,6 +1573,7 @@ var EffectRunner = class {
|
|
|
1394
1573
|
payload: queuedEvent
|
|
1395
1574
|
})
|
|
1396
1575
|
);
|
|
1576
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1397
1577
|
await effect.Effect.runPromise(
|
|
1398
1578
|
effect.Queue.offer(this.runQueue, {
|
|
1399
1579
|
runId,
|
|
@@ -1401,7 +1581,8 @@ var EffectRunner = class {
|
|
|
1401
1581
|
dataset: dataset.dataset,
|
|
1402
1582
|
evaluators: selectedEvaluators,
|
|
1403
1583
|
testCases: selectedTestCases,
|
|
1404
|
-
snapshot
|
|
1584
|
+
snapshot,
|
|
1585
|
+
maxConcurrency
|
|
1405
1586
|
})
|
|
1406
1587
|
);
|
|
1407
1588
|
return snapshot;
|