@m4trix/evals 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +599 -224
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +600 -225
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +214 -105
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +215 -106
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +217 -104
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +23 -5
- package/dist/index.js +218 -105
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -331,15 +331,23 @@ var TestCase = class _TestCase {
|
|
|
331
331
|
this._config = config;
|
|
332
332
|
}
|
|
333
333
|
static describe(config) {
|
|
334
|
+
const reruns = config.reruns ?? 1;
|
|
335
|
+
if (reruns < 1 || !Number.isInteger(reruns)) {
|
|
336
|
+
throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
|
|
337
|
+
}
|
|
334
338
|
return new _TestCase({
|
|
335
339
|
name: config.name,
|
|
336
340
|
tags: config.tags,
|
|
341
|
+
reruns,
|
|
337
342
|
inputSchema: config.inputSchema,
|
|
338
343
|
input: config.input,
|
|
339
344
|
outputSchema: config.outputSchema,
|
|
340
345
|
output: config.output
|
|
341
346
|
});
|
|
342
347
|
}
|
|
348
|
+
getReruns() {
|
|
349
|
+
return this._config.reruns;
|
|
350
|
+
}
|
|
343
351
|
getName() {
|
|
344
352
|
return this._config.name;
|
|
345
353
|
}
|
|
@@ -513,6 +521,7 @@ var Metric = {
|
|
|
513
521
|
const def = {
|
|
514
522
|
id: config.id,
|
|
515
523
|
name: config.name,
|
|
524
|
+
aggregate: config.aggregate,
|
|
516
525
|
format: config.format,
|
|
517
526
|
make: (data) => ({ id: config.id, data })
|
|
518
527
|
};
|
|
@@ -532,6 +541,7 @@ var Score = {
|
|
|
532
541
|
id: config.id,
|
|
533
542
|
name: config.name,
|
|
534
543
|
displayStrategy: config.displayStrategy,
|
|
544
|
+
aggregate: config.aggregate,
|
|
535
545
|
format: config.format,
|
|
536
546
|
make: (data, options) => {
|
|
537
547
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
@@ -550,23 +560,62 @@ function getScoreById(id) {
|
|
|
550
560
|
return registry2.get(id);
|
|
551
561
|
}
|
|
552
562
|
|
|
563
|
+
// src/evals/aggregators.ts
|
|
564
|
+
function aggregateAverage(values) {
|
|
565
|
+
if (values.length === 0) {
|
|
566
|
+
return { value: 0 };
|
|
567
|
+
}
|
|
568
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
569
|
+
return { value: sum / values.length };
|
|
570
|
+
}
|
|
571
|
+
function aggregateAll(values) {
|
|
572
|
+
return { passed: values.length > 0 && values.every((v) => v.passed) };
|
|
573
|
+
}
|
|
574
|
+
function aggregateTokenCountSum(values) {
|
|
575
|
+
const initial = {
|
|
576
|
+
input: 0,
|
|
577
|
+
output: 0,
|
|
578
|
+
inputCached: 0,
|
|
579
|
+
outputCached: 0
|
|
580
|
+
};
|
|
581
|
+
return values.reduce(
|
|
582
|
+
(acc, v) => ({
|
|
583
|
+
input: acc.input + (v.input ?? 0),
|
|
584
|
+
output: acc.output + (v.output ?? 0),
|
|
585
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
586
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
587
|
+
}),
|
|
588
|
+
initial
|
|
589
|
+
);
|
|
590
|
+
}
|
|
591
|
+
function aggregateLatencyAverage(values) {
|
|
592
|
+
if (values.length === 0) {
|
|
593
|
+
return { ms: 0 };
|
|
594
|
+
}
|
|
595
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
596
|
+
return { ms: sum / values.length };
|
|
597
|
+
}
|
|
598
|
+
|
|
553
599
|
// src/evals/metrics/standard.ts
|
|
554
600
|
var tokenCountMetric = Metric.of({
|
|
555
601
|
id: "token-count",
|
|
556
602
|
name: "Tokens",
|
|
557
|
-
|
|
603
|
+
aggregate: aggregateTokenCountSum,
|
|
604
|
+
format: (data, options) => {
|
|
558
605
|
const input = data.input ?? 0;
|
|
559
606
|
const output = data.output ?? 0;
|
|
560
607
|
const inputCached = data.inputCached ?? 0;
|
|
561
608
|
const outputCached = data.outputCached ?? 0;
|
|
562
609
|
const cached = inputCached + outputCached;
|
|
563
|
-
|
|
610
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
611
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
564
612
|
}
|
|
565
613
|
});
|
|
566
614
|
var latencyMetric = Metric.of({
|
|
567
615
|
id: "latency",
|
|
568
616
|
name: "Latency",
|
|
569
|
-
|
|
617
|
+
aggregate: aggregateLatencyAverage,
|
|
618
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
570
619
|
});
|
|
571
620
|
|
|
572
621
|
// src/evals/scores/standard.ts
|
|
@@ -574,13 +623,15 @@ var percentScore = Score.of({
|
|
|
574
623
|
id: "percent",
|
|
575
624
|
name: "Score",
|
|
576
625
|
displayStrategy: "bar",
|
|
577
|
-
format: (data) => data.value.toFixed(2)
|
|
626
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
|
|
627
|
+
aggregate: aggregateAverage
|
|
578
628
|
});
|
|
579
629
|
var binaryScore = Score.of({
|
|
580
630
|
id: "binary",
|
|
581
631
|
name: "Result",
|
|
582
632
|
displayStrategy: "passFail",
|
|
583
|
-
format: (data) => data.passed ? "PASSED" : "NOT PASSED"
|
|
633
|
+
format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
|
|
634
|
+
aggregate: aggregateAll
|
|
584
635
|
});
|
|
585
636
|
function createDiffLogEntry(expected, actual, options) {
|
|
586
637
|
const diff = jsonDiff.diffString(expected, actual, { color: false });
|
|
@@ -621,7 +672,8 @@ var defaultRunnerConfig = {
|
|
|
621
672
|
],
|
|
622
673
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
623
674
|
},
|
|
624
|
-
artifactDirectory: ".eval-results"
|
|
675
|
+
artifactDirectory: ".eval-results",
|
|
676
|
+
maxConcurrency: 1
|
|
625
677
|
};
|
|
626
678
|
function toRunnerConfigOverrides(config) {
|
|
627
679
|
if (!config) {
|
|
@@ -654,6 +706,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
654
706
|
if (config.artifactDirectory !== void 0) {
|
|
655
707
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
656
708
|
}
|
|
709
|
+
if (config.maxConcurrency !== void 0) {
|
|
710
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
711
|
+
}
|
|
657
712
|
if (Object.keys(discovery).length > 0) {
|
|
658
713
|
overrides.discovery = discovery;
|
|
659
714
|
}
|
|
@@ -927,6 +982,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
927
982
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
928
983
|
);
|
|
929
984
|
}
|
|
985
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
986
|
+
return effect.Effect.gen(function* () {
|
|
987
|
+
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
988
|
+
const rerunPassed = [];
|
|
989
|
+
for (let r = 0; r < reruns; r++) {
|
|
990
|
+
const started = Date.now();
|
|
991
|
+
const evaluatorScores = [];
|
|
992
|
+
let testCaseError;
|
|
993
|
+
const output = readOutput(testCaseItem.testCase);
|
|
994
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
995
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
996
|
+
if (!evaluateFn) {
|
|
997
|
+
continue;
|
|
998
|
+
}
|
|
999
|
+
try {
|
|
1000
|
+
const logs = [];
|
|
1001
|
+
const logDiff = (expected, actual, options) => {
|
|
1002
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1003
|
+
};
|
|
1004
|
+
const ctx = yield* effect.Effect.promise(
|
|
1005
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
1006
|
+
);
|
|
1007
|
+
const result = yield* effect.Effect.promise(
|
|
1008
|
+
() => Promise.resolve(
|
|
1009
|
+
evaluateFn({
|
|
1010
|
+
input: testCaseItem.testCase.getInput(),
|
|
1011
|
+
ctx,
|
|
1012
|
+
output,
|
|
1013
|
+
logDiff
|
|
1014
|
+
})
|
|
1015
|
+
)
|
|
1016
|
+
);
|
|
1017
|
+
const { scores, metrics } = normalizeResult(result);
|
|
1018
|
+
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1019
|
+
evaluatorScores.push({
|
|
1020
|
+
evaluatorId,
|
|
1021
|
+
scores,
|
|
1022
|
+
passed: passed2,
|
|
1023
|
+
metrics,
|
|
1024
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1025
|
+
});
|
|
1026
|
+
} catch (error) {
|
|
1027
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1028
|
+
evaluatorScores.push({
|
|
1029
|
+
evaluatorId,
|
|
1030
|
+
scores: [],
|
|
1031
|
+
passed: false
|
|
1032
|
+
});
|
|
1033
|
+
}
|
|
1034
|
+
}
|
|
1035
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1036
|
+
rerunPassed.push(rerunPassedThis);
|
|
1037
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
1038
|
+
n + 1,
|
|
1039
|
+
n + 1
|
|
1040
|
+
]);
|
|
1041
|
+
const progressEvent = {
|
|
1042
|
+
type: "TestCaseProgress",
|
|
1043
|
+
runId: task.runId,
|
|
1044
|
+
testCaseId: testCaseItem.id,
|
|
1045
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1046
|
+
completedTestCases: completedEvaluations,
|
|
1047
|
+
totalTestCases: totalEvaluations,
|
|
1048
|
+
rerunIndex: r + 1,
|
|
1049
|
+
rerunTotal: reruns,
|
|
1050
|
+
passed: rerunPassedThis,
|
|
1051
|
+
durationMs: Date.now() - started,
|
|
1052
|
+
evaluatorScores,
|
|
1053
|
+
output,
|
|
1054
|
+
errorMessage: testCaseError
|
|
1055
|
+
};
|
|
1056
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
1057
|
+
...snapshot,
|
|
1058
|
+
completedTestCases: completedEvaluations
|
|
1059
|
+
}));
|
|
1060
|
+
yield* publishEvent(progressEvent);
|
|
1061
|
+
yield* effect.Queue.offer(persistenceQueue, {
|
|
1062
|
+
runId: task.runId,
|
|
1063
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1064
|
+
payload: progressEvent
|
|
1065
|
+
});
|
|
1066
|
+
}
|
|
1067
|
+
const testCasePassed = rerunPassed.every(Boolean);
|
|
1068
|
+
if (testCasePassed) {
|
|
1069
|
+
yield* effect.Ref.update(passedRef, (n) => n + 1);
|
|
1070
|
+
} else {
|
|
1071
|
+
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
1072
|
+
}
|
|
1073
|
+
const [passed, failed] = yield* effect.Effect.all([
|
|
1074
|
+
effect.Ref.get(passedRef),
|
|
1075
|
+
effect.Ref.get(failedRef)
|
|
1076
|
+
]);
|
|
1077
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
1078
|
+
...snapshot,
|
|
1079
|
+
passedTestCases: passed,
|
|
1080
|
+
failedTestCases: failed
|
|
1081
|
+
}));
|
|
1082
|
+
});
|
|
1083
|
+
}
|
|
930
1084
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
931
1085
|
const startedAt = Date.now();
|
|
932
1086
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -939,104 +1093,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
939
1093
|
runId: task.runId,
|
|
940
1094
|
startedAt
|
|
941
1095
|
});
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
)
|
|
972
|
-
);
|
|
973
|
-
const { scores, metrics } = normalizeResult(result);
|
|
974
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
975
|
-
evaluatorScores.push({
|
|
976
|
-
evaluatorId,
|
|
977
|
-
scores,
|
|
978
|
-
passed,
|
|
979
|
-
metrics,
|
|
980
|
-
logs: logs.length > 0 ? logs : void 0
|
|
981
|
-
});
|
|
982
|
-
} catch (error) {
|
|
983
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
984
|
-
evaluatorScores.push({
|
|
985
|
-
evaluatorId,
|
|
986
|
-
scores: [],
|
|
987
|
-
passed: false
|
|
988
|
-
});
|
|
989
|
-
}
|
|
990
|
-
}
|
|
991
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
992
|
-
completedTestCases += 1;
|
|
993
|
-
if (testCasePassed) {
|
|
994
|
-
passedTestCases += 1;
|
|
995
|
-
} else {
|
|
996
|
-
failedTestCases += 1;
|
|
997
|
-
}
|
|
998
|
-
const progressEvent = {
|
|
999
|
-
type: "TestCaseProgress",
|
|
1000
|
-
runId: task.runId,
|
|
1001
|
-
testCaseId: testCaseItem.id,
|
|
1002
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
1003
|
-
completedTestCases,
|
|
1004
|
-
totalTestCases: task.testCases.length,
|
|
1005
|
-
passed: testCasePassed,
|
|
1006
|
-
durationMs: Date.now() - started,
|
|
1007
|
-
evaluatorScores,
|
|
1008
|
-
output,
|
|
1009
|
-
errorMessage: testCaseError
|
|
1010
|
-
};
|
|
1011
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1012
|
-
...snapshot,
|
|
1013
|
-
completedTestCases,
|
|
1014
|
-
passedTestCases,
|
|
1015
|
-
failedTestCases
|
|
1016
|
-
}));
|
|
1017
|
-
yield* publishEvent(progressEvent);
|
|
1018
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
1019
|
-
runId: task.runId,
|
|
1020
|
-
artifactPath: task.snapshot.artifactPath,
|
|
1021
|
-
payload: progressEvent
|
|
1022
|
-
});
|
|
1023
|
-
}
|
|
1096
|
+
const totalEvaluations = task.testCases.reduce(
|
|
1097
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1098
|
+
0
|
|
1099
|
+
);
|
|
1100
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1101
|
+
const completedRef = yield* effect.Ref.make(0);
|
|
1102
|
+
const passedRef = yield* effect.Ref.make(0);
|
|
1103
|
+
const failedRef = yield* effect.Ref.make(0);
|
|
1104
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
1105
|
+
task,
|
|
1106
|
+
testCaseItem,
|
|
1107
|
+
totalEvaluations,
|
|
1108
|
+
publishEvent,
|
|
1109
|
+
persistenceQueue,
|
|
1110
|
+
updateSnapshot,
|
|
1111
|
+
completedRef,
|
|
1112
|
+
passedRef,
|
|
1113
|
+
failedRef
|
|
1114
|
+
);
|
|
1115
|
+
yield* effect.Effect.forEach(
|
|
1116
|
+
task.testCases,
|
|
1117
|
+
processTestCase,
|
|
1118
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1119
|
+
);
|
|
1120
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
1121
|
+
effect.Ref.get(completedRef),
|
|
1122
|
+
effect.Ref.get(passedRef),
|
|
1123
|
+
effect.Ref.get(failedRef)
|
|
1124
|
+
]);
|
|
1024
1125
|
const finishedAt = Date.now();
|
|
1025
1126
|
const completedEvent = {
|
|
1026
1127
|
type: "RunCompleted",
|
|
1027
1128
|
runId: task.runId,
|
|
1028
1129
|
finishedAt,
|
|
1029
|
-
passedTestCases,
|
|
1030
|
-
failedTestCases,
|
|
1130
|
+
passedTestCases: passedUniqueTestCases,
|
|
1131
|
+
failedTestCases: failedUniqueTestCases,
|
|
1031
1132
|
totalTestCases: task.testCases.length,
|
|
1032
1133
|
artifactPath: task.snapshot.artifactPath
|
|
1033
1134
|
};
|
|
1034
1135
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
1035
1136
|
...snapshot,
|
|
1036
1137
|
status: "completed",
|
|
1037
|
-
completedTestCases,
|
|
1038
|
-
passedTestCases,
|
|
1039
|
-
failedTestCases,
|
|
1138
|
+
completedTestCases: completedEvaluations,
|
|
1139
|
+
passedTestCases: passedUniqueTestCases,
|
|
1140
|
+
failedTestCases: failedUniqueTestCases,
|
|
1040
1141
|
finishedAt
|
|
1041
1142
|
}));
|
|
1042
1143
|
yield* publishEvent(completedEvent);
|
|
@@ -1124,7 +1225,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1124
1225
|
const artifactPath = filePath;
|
|
1125
1226
|
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1126
1227
|
const progress = aggregateTestCaseProgress(lines);
|
|
1127
|
-
const completedTestCases = runCompleted
|
|
1228
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1128
1229
|
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1129
1230
|
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1130
1231
|
return {
|
|
@@ -1146,23 +1247,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1146
1247
|
}
|
|
1147
1248
|
function aggregateTestCaseProgress(lines) {
|
|
1148
1249
|
let completedTestCases = 0;
|
|
1149
|
-
|
|
1150
|
-
let failedTestCases = 0;
|
|
1250
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
1151
1251
|
for (const line of lines) {
|
|
1152
1252
|
try {
|
|
1153
1253
|
const event = JSON.parse(line);
|
|
1154
1254
|
if (event.type === "TestCaseProgress") {
|
|
1155
1255
|
const ev = event;
|
|
1156
1256
|
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
failedTestCases += 1;
|
|
1161
|
-
}
|
|
1257
|
+
const id = ev.testCaseId;
|
|
1258
|
+
const current = testCasePassedBy.get(id);
|
|
1259
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1162
1260
|
}
|
|
1163
1261
|
} catch {
|
|
1164
1262
|
}
|
|
1165
1263
|
}
|
|
1264
|
+
let passedTestCases = 0;
|
|
1265
|
+
let failedTestCases = 0;
|
|
1266
|
+
for (const passed of testCasePassedBy.values()) {
|
|
1267
|
+
if (passed) {
|
|
1268
|
+
passedTestCases += 1;
|
|
1269
|
+
} else {
|
|
1270
|
+
failedTestCases += 1;
|
|
1271
|
+
}
|
|
1272
|
+
}
|
|
1166
1273
|
return { completedTestCases, passedTestCases, failedTestCases };
|
|
1167
1274
|
}
|
|
1168
1275
|
async function appendJsonLine(artifactPath, payload) {
|
|
@@ -1357,6 +1464,10 @@ var EffectRunner = class {
|
|
|
1357
1464
|
throw new Error("No evaluators selected for run");
|
|
1358
1465
|
}
|
|
1359
1466
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1467
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1468
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1469
|
+
0
|
|
1470
|
+
);
|
|
1360
1471
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1361
1472
|
const artifactPath = createArtifactPath(
|
|
1362
1473
|
this.config.artifactDirectory,
|
|
@@ -1369,7 +1480,7 @@ var EffectRunner = class {
|
|
|
1369
1480
|
datasetName: dataset.dataset.getName(),
|
|
1370
1481
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1371
1482
|
queuedAt: Date.now(),
|
|
1372
|
-
totalTestCases:
|
|
1483
|
+
totalTestCases: totalEvaluations,
|
|
1373
1484
|
completedTestCases: 0,
|
|
1374
1485
|
passedTestCases: 0,
|
|
1375
1486
|
failedTestCases: 0,
|
|
@@ -1383,7 +1494,7 @@ var EffectRunner = class {
|
|
|
1383
1494
|
datasetId: request.datasetId,
|
|
1384
1495
|
datasetName: dataset.dataset.getName(),
|
|
1385
1496
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1386
|
-
totalTestCases:
|
|
1497
|
+
totalTestCases: totalEvaluations,
|
|
1387
1498
|
artifactPath
|
|
1388
1499
|
};
|
|
1389
1500
|
await effect.Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -1394,6 +1505,7 @@ var EffectRunner = class {
|
|
|
1394
1505
|
payload: queuedEvent
|
|
1395
1506
|
})
|
|
1396
1507
|
);
|
|
1508
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1397
1509
|
await effect.Effect.runPromise(
|
|
1398
1510
|
effect.Queue.offer(this.runQueue, {
|
|
1399
1511
|
runId,
|
|
@@ -1401,7 +1513,8 @@ var EffectRunner = class {
|
|
|
1401
1513
|
dataset: dataset.dataset,
|
|
1402
1514
|
evaluators: selectedEvaluators,
|
|
1403
1515
|
testCases: selectedTestCases,
|
|
1404
|
-
snapshot
|
|
1516
|
+
snapshot,
|
|
1517
|
+
maxConcurrency
|
|
1405
1518
|
})
|
|
1406
1519
|
);
|
|
1407
1520
|
return snapshot;
|