@m4trix/evals 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +719 -227
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +721 -229
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +1320 -928
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +1322 -930
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +335 -99
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +24 -5
- package/dist/index.js +337 -101
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -279,11 +279,17 @@ function toEvaluatorOption(item) {
|
|
|
279
279
|
};
|
|
280
280
|
}
|
|
281
281
|
async function loadRunnerData(runner) {
|
|
282
|
-
const [datasets, evaluators] = await Promise.all([
|
|
282
|
+
const [datasets, evaluators, diskSnapshots] = await Promise.all([
|
|
283
283
|
runner.collectDatasets(),
|
|
284
|
-
runner.collectEvaluators()
|
|
284
|
+
runner.collectEvaluators(),
|
|
285
|
+
runner.loadRunSnapshotsFromArtifacts()
|
|
285
286
|
]);
|
|
286
|
-
const
|
|
287
|
+
const memSnapshots = runner.getAllRunSnapshots();
|
|
288
|
+
const seen = new Set(memSnapshots.map((s) => s.runId));
|
|
289
|
+
const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
|
|
290
|
+
const snapshots = [...memSnapshots, ...fromDisk].sort(
|
|
291
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
292
|
+
);
|
|
287
293
|
if (datasets.length === 0 && evaluators.length === 0) {
|
|
288
294
|
return loadMockData();
|
|
289
295
|
}
|
|
@@ -325,15 +331,23 @@ var TestCase = class _TestCase {
|
|
|
325
331
|
this._config = config;
|
|
326
332
|
}
|
|
327
333
|
static describe(config) {
|
|
334
|
+
const reruns = config.reruns ?? 1;
|
|
335
|
+
if (reruns < 1 || !Number.isInteger(reruns)) {
|
|
336
|
+
throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
|
|
337
|
+
}
|
|
328
338
|
return new _TestCase({
|
|
329
339
|
name: config.name,
|
|
330
340
|
tags: config.tags,
|
|
341
|
+
reruns,
|
|
331
342
|
inputSchema: config.inputSchema,
|
|
332
343
|
input: config.input,
|
|
333
344
|
outputSchema: config.outputSchema,
|
|
334
345
|
output: config.output
|
|
335
346
|
});
|
|
336
347
|
}
|
|
348
|
+
getReruns() {
|
|
349
|
+
return this._config.reruns;
|
|
350
|
+
}
|
|
337
351
|
getName() {
|
|
338
352
|
return this._config.name;
|
|
339
353
|
}
|
|
@@ -507,6 +521,7 @@ var Metric = {
|
|
|
507
521
|
const def = {
|
|
508
522
|
id: config.id,
|
|
509
523
|
name: config.name,
|
|
524
|
+
aggregate: config.aggregate,
|
|
510
525
|
format: config.format,
|
|
511
526
|
make: (data) => ({ id: config.id, data })
|
|
512
527
|
};
|
|
@@ -526,6 +541,7 @@ var Score = {
|
|
|
526
541
|
id: config.id,
|
|
527
542
|
name: config.name,
|
|
528
543
|
displayStrategy: config.displayStrategy,
|
|
544
|
+
aggregate: config.aggregate,
|
|
529
545
|
format: config.format,
|
|
530
546
|
make: (data, options) => {
|
|
531
547
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
@@ -544,23 +560,62 @@ function getScoreById(id) {
|
|
|
544
560
|
return registry2.get(id);
|
|
545
561
|
}
|
|
546
562
|
|
|
563
|
+
// src/evals/aggregators.ts
|
|
564
|
+
/**
 * Averages the `value` field across a list of score payloads.
 *
 * @param {Array<{value: number}>} values - Score payloads to combine.
 * @returns {{value: number}} The arithmetic mean, or `{ value: 0 }` for an
 *   empty list.
 */
function aggregateAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0 };
  }
  let total = 0;
  values.forEach((entry) => {
    total += entry.value;
  });
  return { value: total / count };
}
|
|
571
|
+
/**
 * Combines pass/fail payloads: the aggregate passes only when there is at
 * least one payload and every payload has a truthy `passed`.
 *
 * @param {Array<{passed: *}>} values - Payloads to combine.
 * @returns {{passed: boolean}} Aggregated verdict; `false` for an empty list.
 */
function aggregateAll(values) {
  if (values.length === 0) {
    return { passed: false };
  }
  const hasFailure = values.some((entry) => !entry.passed);
  return { passed: !hasFailure };
}
|
|
574
|
+
/**
 * Sums token-usage payloads field by field.
 *
 * A field that is `null` or `undefined` on a payload contributes 0.
 *
 * @param {Array<{input?: number, output?: number, inputCached?: number, outputCached?: number}>} values
 *   Token-usage payloads to add up.
 * @returns {{input: number, output: number, inputCached: number, outputCached: number}}
 *   Per-field totals (all zero for an empty list).
 */
function aggregateTokenCountSum(values) {
  const totals = {
    input: 0,
    output: 0,
    inputCached: 0,
    outputCached: 0
  };
  values.forEach((usage) => {
    totals.input += usage.input ?? 0;
    totals.output += usage.output ?? 0;
    totals.inputCached += usage.inputCached ?? 0;
    totals.outputCached += usage.outputCached ?? 0;
  });
  return totals;
}
|
|
591
|
+
/**
 * Averages the `ms` field across a list of latency payloads.
 *
 * @param {Array<{ms: number}>} values - Latency payloads to combine.
 * @returns {{ms: number}} The arithmetic mean, or `{ ms: 0 }` for an empty
 *   list.
 */
function aggregateLatencyAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { ms: 0 };
  }
  let totalMs = 0;
  values.forEach((sample) => {
    totalMs += sample.ms;
  });
  return { ms: totalMs / count };
}
|
|
598
|
+
|
|
547
599
|
// src/evals/metrics/standard.ts
|
|
548
600
|
// Token usage metric. Aggregation sums each field; the formatted text shows
// input, output and combined cached counts, prefixed with "Total: " when the
// value being formatted is an aggregate.
var tokenCountMetric = Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    // Missing (null/undefined) counters are shown as 0.
    const [input, output, inputCached, outputCached] = [
      "input",
      "output",
      "inputCached",
      "outputCached"
    ].map((field) => data[field] ?? 0);
    const summary = `in:${input} out:${output} cached:${inputCached + outputCached}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});
|
|
560
614
|
// Latency metric. Aggregation averages `ms` across samples, so an aggregated
// value is usually fractional.
var latencyMetric = Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    if (!options?.isAggregated) {
      return `${data.ms}ms`;
    }
    // Round the mean to at most two decimals so the label does not show
    // floating-point noise such as 33.333333333333336; non-finite or
    // non-numeric values are printed unchanged.
    const average = Number.isFinite(data.ms) ? Math.round(data.ms * 100) / 100 : data.ms;
    return `Avg: ${average}ms`;
  }
});
|
|
565
620
|
|
|
566
621
|
// src/evals/scores/standard.ts
|
|
@@ -568,13 +623,15 @@ var percentScore = Score.of({
|
|
|
568
623
|
id: "percent",
|
|
569
624
|
name: "Score",
|
|
570
625
|
displayStrategy: "bar",
|
|
571
|
-
format: (data) => data.value.toFixed(2)
|
|
626
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
|
|
627
|
+
aggregate: aggregateAverage
|
|
572
628
|
});
|
|
573
629
|
// Pass/fail score. An aggregate passes only when every combined result
// passed; aggregated and single values use different labels.
var binaryScore = Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data, options) => {
    if (options?.isAggregated) {
      return data.passed ? "All: PASSED" : "Some: FAILED";
    }
    return data.passed ? "PASSED" : "NOT PASSED";
  },
  aggregate: aggregateAll
});
|
|
579
636
|
function createDiffLogEntry(expected, actual, options) {
|
|
580
637
|
const diff = jsonDiff.diffString(expected, actual, { color: false });
|
|
@@ -615,7 +672,8 @@ var defaultRunnerConfig = {
|
|
|
615
672
|
],
|
|
616
673
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
617
674
|
},
|
|
618
|
-
artifactDirectory: ".eval-results"
|
|
675
|
+
artifactDirectory: ".eval-results",
|
|
676
|
+
maxConcurrency: 1
|
|
619
677
|
};
|
|
620
678
|
function toRunnerConfigOverrides(config) {
|
|
621
679
|
if (!config) {
|
|
@@ -648,6 +706,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
648
706
|
if (config.artifactDirectory !== void 0) {
|
|
649
707
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
650
708
|
}
|
|
709
|
+
if (config.maxConcurrency !== void 0) {
|
|
710
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
711
|
+
}
|
|
651
712
|
if (Object.keys(discovery).length > 0) {
|
|
652
713
|
overrides.discovery = discovery;
|
|
653
714
|
}
|
|
@@ -921,6 +982,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
921
982
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
922
983
|
);
|
|
923
984
|
}
|
|
985
|
+
/**
 * Builds an Effect that runs all of a task's evaluators against one test
 * case, once per configured rerun.
 *
 * For every rerun it publishes a "TestCaseProgress" event, offers the same
 * event to the persistence queue and bumps the shared completed counter.
 * After all reruns, the test case counts as passed only if every rerun
 * passed, and the shared passed/failed counters and the run snapshot are
 * updated accordingly.
 *
 * @param task - Run task; reads `runId`, `evaluators` and `snapshot.artifactPath`.
 * @param testCaseItem - `{ id, testCase }` entry to evaluate.
 * @param totalEvaluations - Value reported as `totalTestCases` in progress events.
 * @param publishEvent - Function returning an Effect that publishes an event.
 * @param persistenceQueue - Effect Queue receiving `{ runId, artifactPath, payload }`.
 * @param updateSnapshot - `(runId, updater)` callback used to patch the run snapshot.
 * @param completedRef - Effect Ref counting completed evaluations (reruns).
 * @param passedRef - Effect Ref counting test cases whose reruns all passed.
 * @param failedRef - Effect Ref counting test cases with at least one failed rerun.
 * @returns An Effect that performs the work described above.
 */
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
  return effect.Effect.gen(function* () {
    const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
    const rerunPassed = [];
    for (let r = 0; r < reruns; r++) {
      const started = Date.now();
      const evaluatorScores = [];
      let testCaseError;
      const output = readOutput(testCaseItem.testCase);
      for (const { id: evaluatorId, evaluator } of task.evaluators) {
        const evaluateFn = evaluator.getEvaluateFn();
        if (!evaluateFn) {
          continue;
        }
        const logs = [];
        const logDiff = (expected, actual, options) => {
          logs.push(createDiffLogEntry(expected, actual, options));
        };
        // Errors must be caught INSIDE the promise thunk. A rejection that
        // escapes Effect.promise becomes a defect of the fiber and is never
        // thrown back into this generator, so a try/catch around `yield*`
        // cannot record the evaluator as failed.
        const outcome = yield* effect.Effect.promise(async () => {
          try {
            const ctx = await evaluator.resolveContext();
            const result = await evaluateFn({
              input: testCaseItem.testCase.getInput(),
              ctx,
              output,
              logDiff
            });
            const { scores, metrics } = normalizeResult(result);
            const passed2 = computeEvaluatorPassed(evaluator, result, scores);
            return { ok: true, scores, metrics, passed: passed2 };
          } catch (error) {
            return { ok: false, error };
          }
        });
        if (outcome.ok) {
          evaluatorScores.push({
            evaluatorId,
            scores: outcome.scores,
            passed: outcome.passed,
            metrics: outcome.metrics,
            logs: logs.length > 0 ? logs : void 0
          });
        } else {
          // A failing evaluator marks this rerun as failed but does not stop
          // the remaining evaluators or the run.
          testCaseError = outcome.error instanceof Error ? outcome.error.message : "Evaluator execution failed";
          evaluatorScores.push({
            evaluatorId,
            scores: [],
            passed: false
          });
        }
      }
      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
      rerunPassed.push(rerunPassedThis);
      // Atomically increment the shared counter and read the new value.
      const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
        n + 1,
        n + 1
      ]);
      const progressEvent = {
        type: "TestCaseProgress",
        runId: task.runId,
        testCaseId: testCaseItem.id,
        testCaseName: testCaseItem.testCase.getName(),
        completedTestCases: completedEvaluations,
        totalTestCases: totalEvaluations,
        rerunIndex: r + 1,
        rerunTotal: reruns,
        passed: rerunPassedThis,
        durationMs: Date.now() - started,
        evaluatorScores,
        output,
        errorMessage: testCaseError
      };
      updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        completedTestCases: completedEvaluations
      }));
      yield* publishEvent(progressEvent);
      yield* effect.Queue.offer(persistenceQueue, {
        runId: task.runId,
        artifactPath: task.snapshot.artifactPath,
        payload: progressEvent
      });
    }
    // Passed/failed are counted once per test case, not once per rerun.
    const testCasePassed = rerunPassed.every(Boolean);
    if (testCasePassed) {
      yield* effect.Ref.update(passedRef, (n) => n + 1);
    } else {
      yield* effect.Ref.update(failedRef, (n) => n + 1);
    }
    const [passed, failed] = yield* effect.Effect.all([
      effect.Ref.get(passedRef),
      effect.Ref.get(failedRef)
    ]);
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
|
|
924
1084
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
925
1085
|
const startedAt = Date.now();
|
|
926
1086
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -933,104 +1093,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
933
1093
|
runId: task.runId,
|
|
934
1094
|
startedAt
|
|
935
1095
|
});
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
)
|
|
966
|
-
);
|
|
967
|
-
const { scores, metrics } = normalizeResult(result);
|
|
968
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
969
|
-
evaluatorScores.push({
|
|
970
|
-
evaluatorId,
|
|
971
|
-
scores,
|
|
972
|
-
passed,
|
|
973
|
-
metrics,
|
|
974
|
-
logs: logs.length > 0 ? logs : void 0
|
|
975
|
-
});
|
|
976
|
-
} catch (error) {
|
|
977
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
978
|
-
evaluatorScores.push({
|
|
979
|
-
evaluatorId,
|
|
980
|
-
scores: [],
|
|
981
|
-
passed: false
|
|
982
|
-
});
|
|
983
|
-
}
|
|
984
|
-
}
|
|
985
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
986
|
-
completedTestCases += 1;
|
|
987
|
-
if (testCasePassed) {
|
|
988
|
-
passedTestCases += 1;
|
|
989
|
-
} else {
|
|
990
|
-
failedTestCases += 1;
|
|
991
|
-
}
|
|
992
|
-
const progressEvent = {
|
|
993
|
-
type: "TestCaseProgress",
|
|
994
|
-
runId: task.runId,
|
|
995
|
-
testCaseId: testCaseItem.id,
|
|
996
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
997
|
-
completedTestCases,
|
|
998
|
-
totalTestCases: task.testCases.length,
|
|
999
|
-
passed: testCasePassed,
|
|
1000
|
-
durationMs: Date.now() - started,
|
|
1001
|
-
evaluatorScores,
|
|
1002
|
-
output,
|
|
1003
|
-
errorMessage: testCaseError
|
|
1004
|
-
};
|
|
1005
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1006
|
-
...snapshot,
|
|
1007
|
-
completedTestCases,
|
|
1008
|
-
passedTestCases,
|
|
1009
|
-
failedTestCases
|
|
1010
|
-
}));
|
|
1011
|
-
yield* publishEvent(progressEvent);
|
|
1012
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
1013
|
-
runId: task.runId,
|
|
1014
|
-
artifactPath: task.snapshot.artifactPath,
|
|
1015
|
-
payload: progressEvent
|
|
1016
|
-
});
|
|
1017
|
-
}
|
|
1096
|
+
const totalEvaluations = task.testCases.reduce(
|
|
1097
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1098
|
+
0
|
|
1099
|
+
);
|
|
1100
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1101
|
+
const completedRef = yield* effect.Ref.make(0);
|
|
1102
|
+
const passedRef = yield* effect.Ref.make(0);
|
|
1103
|
+
const failedRef = yield* effect.Ref.make(0);
|
|
1104
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
1105
|
+
task,
|
|
1106
|
+
testCaseItem,
|
|
1107
|
+
totalEvaluations,
|
|
1108
|
+
publishEvent,
|
|
1109
|
+
persistenceQueue,
|
|
1110
|
+
updateSnapshot,
|
|
1111
|
+
completedRef,
|
|
1112
|
+
passedRef,
|
|
1113
|
+
failedRef
|
|
1114
|
+
);
|
|
1115
|
+
yield* effect.Effect.forEach(
|
|
1116
|
+
task.testCases,
|
|
1117
|
+
processTestCase,
|
|
1118
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1119
|
+
);
|
|
1120
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
1121
|
+
effect.Ref.get(completedRef),
|
|
1122
|
+
effect.Ref.get(passedRef),
|
|
1123
|
+
effect.Ref.get(failedRef)
|
|
1124
|
+
]);
|
|
1018
1125
|
const finishedAt = Date.now();
|
|
1019
1126
|
const completedEvent = {
|
|
1020
1127
|
type: "RunCompleted",
|
|
1021
1128
|
runId: task.runId,
|
|
1022
1129
|
finishedAt,
|
|
1023
|
-
passedTestCases,
|
|
1024
|
-
failedTestCases,
|
|
1130
|
+
passedTestCases: passedUniqueTestCases,
|
|
1131
|
+
failedTestCases: failedUniqueTestCases,
|
|
1025
1132
|
totalTestCases: task.testCases.length,
|
|
1026
1133
|
artifactPath: task.snapshot.artifactPath
|
|
1027
1134
|
};
|
|
1028
1135
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
1029
1136
|
...snapshot,
|
|
1030
1137
|
status: "completed",
|
|
1031
|
-
completedTestCases,
|
|
1032
|
-
passedTestCases,
|
|
1033
|
-
failedTestCases,
|
|
1138
|
+
completedTestCases: completedEvaluations,
|
|
1139
|
+
passedTestCases: passedUniqueTestCases,
|
|
1140
|
+
failedTestCases: failedUniqueTestCases,
|
|
1034
1141
|
finishedAt
|
|
1035
1142
|
}));
|
|
1036
1143
|
yield* publishEvent(completedEvent);
|
|
@@ -1045,6 +1152,126 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1045
1152
|
artifactPath: task.snapshot.artifactPath
|
|
1046
1153
|
});
|
|
1047
1154
|
});
|
|
1155
|
+
/**
 * Rebuilds run snapshots from the `.jsonl` artifact files found directly in
 * `config.artifactDirectory`.
 *
 * Best-effort: if the directory cannot be listed the result is an empty
 * array, and any file whose parsing throws is skipped.
 *
 * @param {{artifactDirectory: string}} config - Runner configuration.
 * @returns {Promise<Array<object>>} Snapshots sorted by `queuedAt`, newest first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = path.resolve(config.artifactDirectory);
  let names;
  try {
    names = await promises.readdir(artifactRoot);
  } catch {
    // Directory missing or unreadable: nothing to load.
    return [];
  }
  const artifactPaths = names.filter((name) => name.endsWith(".jsonl")).map((name) => path.join(artifactRoot, name));
  const loaded = [];
  for (const artifactPath of artifactPaths) {
    try {
      const parsed = await parseArtifactToSnapshot(artifactPath, config);
      if (parsed) {
        loaded.push(parsed);
      }
    } catch {
      // Skip artifacts that cannot be read or parsed.
    }
  }
  loaded.sort((a, b) => b.queuedAt - a.queuedAt);
  return loaded;
}
|
|
1177
|
+
/**
 * Reads one JSONL artifact file and reconstructs a run snapshot from the
 * lifecycle events it contains.
 *
 * Lines that are blank or not valid JSON are ignored. When an event type
 * occurs more than once, the last occurrence wins. The file must contain a
 * "RunQueued" event, otherwise no snapshot is produced.
 *
 * @param {string} filePath - Path of the artifact file; also reported as the
 *   snapshot's `artifactPath`.
 * @param {object} _config - Unused.
 * @returns {Promise<object|null>} The snapshot, or `null` when the file has
 *   no non-blank lines or no "RunQueued" event.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const raw = await promises.readFile(filePath, "utf8");
  const lines = raw.split("\n").filter((line) => line.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }
  const seen = { queued: null, started: null, completed: null, failed: null };
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      switch (event.type) {
        case "RunQueued":
          seen.queued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            artifactPath: event.artifactPath ?? filePath,
            ts: event.ts
          };
          break;
        case "RunStarted":
          seen.started = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          seen.completed = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            totalTestCases: event.totalTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          seen.failed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Ignore lines that are not usable JSON events.
    }
  }
  const { queued, started, completed, failed } = seen;
  if (!queued) {
    return null;
  }
  // Precedence: failed > completed > running > queued.
  let status = "queued";
  if (failed) {
    status = "failed";
  } else if (completed) {
    status = "completed";
  } else if (started) {
    status = "running";
  }
  const progress = aggregateTestCaseProgress(lines);
  return {
    runId: queued.runId,
    datasetId: queued.datasetId,
    datasetName: queued.datasetName,
    evaluatorIds: queued.evaluatorIds,
    queuedAt: queued.ts ?? 0,
    startedAt: started?.startedAt,
    finishedAt: completed?.finishedAt ?? failed?.finishedAt,
    totalTestCases: queued.totalTestCases,
    // A completed run reports the queued total; otherwise fall back to the
    // counters derived from the progress events.
    completedTestCases: completed ? queued.totalTestCases : progress.completedTestCases,
    passedTestCases: completed?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: completed?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: failed?.errorMessage
  };
}
|
|
1248
|
+
/**
 * Derives progress counters from the raw JSONL lines of a run artifact.
 *
 * Only "TestCaseProgress" events are considered; lines that are not valid
 * JSON are ignored.
 *
 * - `completedTestCases` is the highest `completedTestCases` value seen.
 *   Taking the maximum (not the last value) keeps the count from regressing
 *   when progress lines were written out of counter order.
 * - A test case counts as passed only if every one of its progress events
 *   has a truthy `passed`; otherwise it counts as failed.
 *
 * @param {string[]} lines - Non-blank lines of an artifact file.
 * @returns {{completedTestCases: number, passedTestCases: number, failedTestCases: number}}
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const passedById = new Map();
  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Not JSON: skip the line.
      continue;
    }
    if (event?.type !== "TestCaseProgress") {
      continue;
    }
    if (typeof event.completedTestCases === "number" && event.completedTestCases > completedTestCases) {
      completedTestCases = event.completedTestCases;
    }
    const id = event.testCaseId;
    const passedThisTime = Boolean(event.passed);
    // Use `has` rather than comparing with undefined so an earlier event
    // without a verdict is remembered as a failure and cannot be overwritten
    // by a later passing rerun.
    const passedSoFar = passedById.has(id) ? passedById.get(id) : true;
    passedById.set(id, passedSoFar && passedThisTime);
  }
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const passed of passedById.values()) {
    if (passed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
  }
  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
1048
1275
|
async function appendJsonLine(artifactPath, payload) {
|
|
1049
1276
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
1050
1277
|
await promises.appendFile(artifactPath, `${JSON.stringify(payload)}
|
|
@@ -1237,6 +1464,10 @@ var EffectRunner = class {
|
|
|
1237
1464
|
throw new Error("No evaluators selected for run");
|
|
1238
1465
|
}
|
|
1239
1466
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1467
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1468
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1469
|
+
0
|
|
1470
|
+
);
|
|
1240
1471
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1241
1472
|
const artifactPath = createArtifactPath(
|
|
1242
1473
|
this.config.artifactDirectory,
|
|
@@ -1249,7 +1480,7 @@ var EffectRunner = class {
|
|
|
1249
1480
|
datasetName: dataset.dataset.getName(),
|
|
1250
1481
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1251
1482
|
queuedAt: Date.now(),
|
|
1252
|
-
totalTestCases:
|
|
1483
|
+
totalTestCases: totalEvaluations,
|
|
1253
1484
|
completedTestCases: 0,
|
|
1254
1485
|
passedTestCases: 0,
|
|
1255
1486
|
failedTestCases: 0,
|
|
@@ -1263,7 +1494,7 @@ var EffectRunner = class {
|
|
|
1263
1494
|
datasetId: request.datasetId,
|
|
1264
1495
|
datasetName: dataset.dataset.getName(),
|
|
1265
1496
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1266
|
-
totalTestCases:
|
|
1497
|
+
totalTestCases: totalEvaluations,
|
|
1267
1498
|
artifactPath
|
|
1268
1499
|
};
|
|
1269
1500
|
await effect.Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -1274,6 +1505,7 @@ var EffectRunner = class {
|
|
|
1274
1505
|
payload: queuedEvent
|
|
1275
1506
|
})
|
|
1276
1507
|
);
|
|
1508
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1277
1509
|
await effect.Effect.runPromise(
|
|
1278
1510
|
effect.Queue.offer(this.runQueue, {
|
|
1279
1511
|
runId,
|
|
@@ -1281,7 +1513,8 @@ var EffectRunner = class {
|
|
|
1281
1513
|
dataset: dataset.dataset,
|
|
1282
1514
|
evaluators: selectedEvaluators,
|
|
1283
1515
|
testCases: selectedTestCases,
|
|
1284
|
-
snapshot
|
|
1516
|
+
snapshot,
|
|
1517
|
+
maxConcurrency
|
|
1285
1518
|
})
|
|
1286
1519
|
);
|
|
1287
1520
|
return snapshot;
|
|
@@ -1301,6 +1534,9 @@ var EffectRunner = class {
|
|
|
1301
1534
|
(a, b) => b.queuedAt - a.queuedAt
|
|
1302
1535
|
);
|
|
1303
1536
|
}
|
|
1537
|
+
async loadRunSnapshotsFromArtifacts() {
|
|
1538
|
+
return loadRunSnapshotsFromArtifacts(this.config);
|
|
1539
|
+
}
|
|
1304
1540
|
async shutdown() {
|
|
1305
1541
|
await effect.Effect.runPromise(effect.Fiber.interrupt(this.schedulerFiber));
|
|
1306
1542
|
await effect.Effect.runPromise(effect.Fiber.interrupt(this.persistenceFiber));
|