@m4trix/evals 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +599 -224
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +600 -225
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +214 -105
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +215 -106
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +217 -104
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +23 -5
- package/dist/index.js +218 -105
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -6,7 +6,7 @@ import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
|
|
|
6
6
|
import { resolve, relative, join, dirname } from 'path';
|
|
7
7
|
import { diffString } from 'json-diff';
|
|
8
8
|
import { randomUUID } from 'crypto';
|
|
9
|
-
import { Effect, PubSub, Queue, Fiber } from 'effect';
|
|
9
|
+
import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
|
|
10
10
|
import { existsSync } from 'fs';
|
|
11
11
|
import * as jitiModule from 'jiti';
|
|
12
12
|
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
@@ -834,6 +834,7 @@ var Metric = {
|
|
|
834
834
|
const def = {
|
|
835
835
|
id: config.id,
|
|
836
836
|
name: config.name,
|
|
837
|
+
aggregate: config.aggregate,
|
|
837
838
|
format: config.format,
|
|
838
839
|
make: (data) => ({ id: config.id, data })
|
|
839
840
|
};
|
|
@@ -853,6 +854,7 @@ var Score = {
|
|
|
853
854
|
id: config.id,
|
|
854
855
|
name: config.name,
|
|
855
856
|
displayStrategy: config.displayStrategy,
|
|
857
|
+
aggregate: config.aggregate,
|
|
856
858
|
format: config.format,
|
|
857
859
|
make: (data, options) => {
|
|
858
860
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
@@ -871,23 +873,62 @@ function getScoreById(id) {
|
|
|
871
873
|
return registry2.get(id);
|
|
872
874
|
}
|
|
873
875
|
|
|
876
|
+
// src/evals/aggregators.ts
|
|
877
|
+
function aggregateAverage(values) {
|
|
878
|
+
if (values.length === 0) {
|
|
879
|
+
return { value: 0 };
|
|
880
|
+
}
|
|
881
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
882
|
+
return { value: sum / values.length };
|
|
883
|
+
}
|
|
884
|
+
function aggregateAll(values) {
|
|
885
|
+
return { passed: values.length > 0 && values.every((v) => v.passed) };
|
|
886
|
+
}
|
|
887
|
+
function aggregateTokenCountSum(values) {
|
|
888
|
+
const initial = {
|
|
889
|
+
input: 0,
|
|
890
|
+
output: 0,
|
|
891
|
+
inputCached: 0,
|
|
892
|
+
outputCached: 0
|
|
893
|
+
};
|
|
894
|
+
return values.reduce(
|
|
895
|
+
(acc, v) => ({
|
|
896
|
+
input: acc.input + (v.input ?? 0),
|
|
897
|
+
output: acc.output + (v.output ?? 0),
|
|
898
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
899
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
900
|
+
}),
|
|
901
|
+
initial
|
|
902
|
+
);
|
|
903
|
+
}
|
|
904
|
+
function aggregateLatencyAverage(values) {
|
|
905
|
+
if (values.length === 0) {
|
|
906
|
+
return { ms: 0 };
|
|
907
|
+
}
|
|
908
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
909
|
+
return { ms: sum / values.length };
|
|
910
|
+
}
|
|
911
|
+
|
|
874
912
|
// src/evals/metrics/standard.ts
|
|
875
913
|
Metric.of({
|
|
876
914
|
id: "token-count",
|
|
877
915
|
name: "Tokens",
|
|
878
|
-
|
|
916
|
+
aggregate: aggregateTokenCountSum,
|
|
917
|
+
format: (data, options) => {
|
|
879
918
|
const input = data.input ?? 0;
|
|
880
919
|
const output = data.output ?? 0;
|
|
881
920
|
const inputCached = data.inputCached ?? 0;
|
|
882
921
|
const outputCached = data.outputCached ?? 0;
|
|
883
922
|
const cached = inputCached + outputCached;
|
|
884
|
-
|
|
923
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
924
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
885
925
|
}
|
|
886
926
|
});
|
|
887
927
|
Metric.of({
|
|
888
928
|
id: "latency",
|
|
889
929
|
name: "Latency",
|
|
890
|
-
|
|
930
|
+
aggregate: aggregateLatencyAverage,
|
|
931
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
891
932
|
});
|
|
892
933
|
|
|
893
934
|
// src/evals/scores/standard.ts
|
|
@@ -895,13 +936,15 @@ Score.of({
|
|
|
895
936
|
id: "percent",
|
|
896
937
|
name: "Score",
|
|
897
938
|
displayStrategy: "bar",
|
|
898
|
-
format: (data) => data.value.toFixed(2)
|
|
939
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
|
|
940
|
+
aggregate: aggregateAverage
|
|
899
941
|
});
|
|
900
942
|
Score.of({
|
|
901
943
|
id: "binary",
|
|
902
944
|
name: "Result",
|
|
903
945
|
displayStrategy: "passFail",
|
|
904
|
-
format: (data) => data.passed ? "PASSED" : "NOT PASSED"
|
|
946
|
+
format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
|
|
947
|
+
aggregate: aggregateAll
|
|
905
948
|
});
|
|
906
949
|
function createDiffLogEntry(expected, actual, options) {
|
|
907
950
|
const diff = diffString(expected, actual, { color: false });
|
|
@@ -983,7 +1026,8 @@ var defaultRunnerConfig = {
|
|
|
983
1026
|
],
|
|
984
1027
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
985
1028
|
},
|
|
986
|
-
artifactDirectory: ".eval-results"
|
|
1029
|
+
artifactDirectory: ".eval-results",
|
|
1030
|
+
maxConcurrency: 1
|
|
987
1031
|
};
|
|
988
1032
|
function toRunnerConfigOverrides(config) {
|
|
989
1033
|
if (!config) {
|
|
@@ -1016,6 +1060,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
1016
1060
|
if (config.artifactDirectory !== void 0) {
|
|
1017
1061
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
1018
1062
|
}
|
|
1063
|
+
if (config.maxConcurrency !== void 0) {
|
|
1064
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
1065
|
+
}
|
|
1019
1066
|
if (Object.keys(discovery).length > 0) {
|
|
1020
1067
|
overrides.discovery = discovery;
|
|
1021
1068
|
}
|
|
@@ -1250,6 +1297,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1250
1297
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1251
1298
|
);
|
|
1252
1299
|
}
|
|
1300
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1301
|
+
return Effect.gen(function* () {
|
|
1302
|
+
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1303
|
+
const rerunPassed = [];
|
|
1304
|
+
for (let r = 0; r < reruns; r++) {
|
|
1305
|
+
const started = Date.now();
|
|
1306
|
+
const evaluatorScores = [];
|
|
1307
|
+
let testCaseError;
|
|
1308
|
+
const output = readOutput(testCaseItem.testCase);
|
|
1309
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
1310
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
1311
|
+
if (!evaluateFn) {
|
|
1312
|
+
continue;
|
|
1313
|
+
}
|
|
1314
|
+
try {
|
|
1315
|
+
const logs = [];
|
|
1316
|
+
const logDiff = (expected, actual, options) => {
|
|
1317
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1318
|
+
};
|
|
1319
|
+
const ctx = yield* Effect.promise(
|
|
1320
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
1321
|
+
);
|
|
1322
|
+
const result = yield* Effect.promise(
|
|
1323
|
+
() => Promise.resolve(
|
|
1324
|
+
evaluateFn({
|
|
1325
|
+
input: testCaseItem.testCase.getInput(),
|
|
1326
|
+
ctx,
|
|
1327
|
+
output,
|
|
1328
|
+
logDiff
|
|
1329
|
+
})
|
|
1330
|
+
)
|
|
1331
|
+
);
|
|
1332
|
+
const { scores, metrics } = normalizeResult(result);
|
|
1333
|
+
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1334
|
+
evaluatorScores.push({
|
|
1335
|
+
evaluatorId,
|
|
1336
|
+
scores,
|
|
1337
|
+
passed: passed2,
|
|
1338
|
+
metrics,
|
|
1339
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1340
|
+
});
|
|
1341
|
+
} catch (error) {
|
|
1342
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1343
|
+
evaluatorScores.push({
|
|
1344
|
+
evaluatorId,
|
|
1345
|
+
scores: [],
|
|
1346
|
+
passed: false
|
|
1347
|
+
});
|
|
1348
|
+
}
|
|
1349
|
+
}
|
|
1350
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1351
|
+
rerunPassed.push(rerunPassedThis);
|
|
1352
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
1353
|
+
n + 1,
|
|
1354
|
+
n + 1
|
|
1355
|
+
]);
|
|
1356
|
+
const progressEvent = {
|
|
1357
|
+
type: "TestCaseProgress",
|
|
1358
|
+
runId: task.runId,
|
|
1359
|
+
testCaseId: testCaseItem.id,
|
|
1360
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1361
|
+
completedTestCases: completedEvaluations,
|
|
1362
|
+
totalTestCases: totalEvaluations,
|
|
1363
|
+
rerunIndex: r + 1,
|
|
1364
|
+
rerunTotal: reruns,
|
|
1365
|
+
passed: rerunPassedThis,
|
|
1366
|
+
durationMs: Date.now() - started,
|
|
1367
|
+
evaluatorScores,
|
|
1368
|
+
output,
|
|
1369
|
+
errorMessage: testCaseError
|
|
1370
|
+
};
|
|
1371
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
1372
|
+
...snapshot,
|
|
1373
|
+
completedTestCases: completedEvaluations
|
|
1374
|
+
}));
|
|
1375
|
+
yield* publishEvent(progressEvent);
|
|
1376
|
+
yield* Queue.offer(persistenceQueue, {
|
|
1377
|
+
runId: task.runId,
|
|
1378
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1379
|
+
payload: progressEvent
|
|
1380
|
+
});
|
|
1381
|
+
}
|
|
1382
|
+
const testCasePassed = rerunPassed.every(Boolean);
|
|
1383
|
+
if (testCasePassed) {
|
|
1384
|
+
yield* Ref.update(passedRef, (n) => n + 1);
|
|
1385
|
+
} else {
|
|
1386
|
+
yield* Ref.update(failedRef, (n) => n + 1);
|
|
1387
|
+
}
|
|
1388
|
+
const [passed, failed] = yield* Effect.all([
|
|
1389
|
+
Ref.get(passedRef),
|
|
1390
|
+
Ref.get(failedRef)
|
|
1391
|
+
]);
|
|
1392
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
1393
|
+
...snapshot,
|
|
1394
|
+
passedTestCases: passed,
|
|
1395
|
+
failedTestCases: failed
|
|
1396
|
+
}));
|
|
1397
|
+
});
|
|
1398
|
+
}
|
|
1253
1399
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
|
|
1254
1400
|
const startedAt = Date.now();
|
|
1255
1401
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -1262,104 +1408,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1262
1408
|
runId: task.runId,
|
|
1263
1409
|
startedAt
|
|
1264
1410
|
});
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
)
|
|
1295
|
-
);
|
|
1296
|
-
const { scores, metrics } = normalizeResult(result);
|
|
1297
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
1298
|
-
evaluatorScores.push({
|
|
1299
|
-
evaluatorId,
|
|
1300
|
-
scores,
|
|
1301
|
-
passed,
|
|
1302
|
-
metrics,
|
|
1303
|
-
logs: logs.length > 0 ? logs : void 0
|
|
1304
|
-
});
|
|
1305
|
-
} catch (error) {
|
|
1306
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1307
|
-
evaluatorScores.push({
|
|
1308
|
-
evaluatorId,
|
|
1309
|
-
scores: [],
|
|
1310
|
-
passed: false
|
|
1311
|
-
});
|
|
1312
|
-
}
|
|
1313
|
-
}
|
|
1314
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
1315
|
-
completedTestCases += 1;
|
|
1316
|
-
if (testCasePassed) {
|
|
1317
|
-
passedTestCases += 1;
|
|
1318
|
-
} else {
|
|
1319
|
-
failedTestCases += 1;
|
|
1320
|
-
}
|
|
1321
|
-
const progressEvent = {
|
|
1322
|
-
type: "TestCaseProgress",
|
|
1323
|
-
runId: task.runId,
|
|
1324
|
-
testCaseId: testCaseItem.id,
|
|
1325
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
1326
|
-
completedTestCases,
|
|
1327
|
-
totalTestCases: task.testCases.length,
|
|
1328
|
-
passed: testCasePassed,
|
|
1329
|
-
durationMs: Date.now() - started,
|
|
1330
|
-
evaluatorScores,
|
|
1331
|
-
output,
|
|
1332
|
-
errorMessage: testCaseError
|
|
1333
|
-
};
|
|
1334
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1335
|
-
...snapshot,
|
|
1336
|
-
completedTestCases,
|
|
1337
|
-
passedTestCases,
|
|
1338
|
-
failedTestCases
|
|
1339
|
-
}));
|
|
1340
|
-
yield* publishEvent(progressEvent);
|
|
1341
|
-
yield* Queue.offer(persistenceQueue, {
|
|
1342
|
-
runId: task.runId,
|
|
1343
|
-
artifactPath: task.snapshot.artifactPath,
|
|
1344
|
-
payload: progressEvent
|
|
1345
|
-
});
|
|
1346
|
-
}
|
|
1411
|
+
const totalEvaluations = task.testCases.reduce(
|
|
1412
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1413
|
+
0
|
|
1414
|
+
);
|
|
1415
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1416
|
+
const completedRef = yield* Ref.make(0);
|
|
1417
|
+
const passedRef = yield* Ref.make(0);
|
|
1418
|
+
const failedRef = yield* Ref.make(0);
|
|
1419
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
1420
|
+
task,
|
|
1421
|
+
testCaseItem,
|
|
1422
|
+
totalEvaluations,
|
|
1423
|
+
publishEvent,
|
|
1424
|
+
persistenceQueue,
|
|
1425
|
+
updateSnapshot,
|
|
1426
|
+
completedRef,
|
|
1427
|
+
passedRef,
|
|
1428
|
+
failedRef
|
|
1429
|
+
);
|
|
1430
|
+
yield* Effect.forEach(
|
|
1431
|
+
task.testCases,
|
|
1432
|
+
processTestCase,
|
|
1433
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1434
|
+
);
|
|
1435
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
1436
|
+
Ref.get(completedRef),
|
|
1437
|
+
Ref.get(passedRef),
|
|
1438
|
+
Ref.get(failedRef)
|
|
1439
|
+
]);
|
|
1347
1440
|
const finishedAt = Date.now();
|
|
1348
1441
|
const completedEvent = {
|
|
1349
1442
|
type: "RunCompleted",
|
|
1350
1443
|
runId: task.runId,
|
|
1351
1444
|
finishedAt,
|
|
1352
|
-
passedTestCases,
|
|
1353
|
-
failedTestCases,
|
|
1445
|
+
passedTestCases: passedUniqueTestCases,
|
|
1446
|
+
failedTestCases: failedUniqueTestCases,
|
|
1354
1447
|
totalTestCases: task.testCases.length,
|
|
1355
1448
|
artifactPath: task.snapshot.artifactPath
|
|
1356
1449
|
};
|
|
1357
1450
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
1358
1451
|
...snapshot,
|
|
1359
1452
|
status: "completed",
|
|
1360
|
-
completedTestCases,
|
|
1361
|
-
passedTestCases,
|
|
1362
|
-
failedTestCases,
|
|
1453
|
+
completedTestCases: completedEvaluations,
|
|
1454
|
+
passedTestCases: passedUniqueTestCases,
|
|
1455
|
+
failedTestCases: failedUniqueTestCases,
|
|
1363
1456
|
finishedAt
|
|
1364
1457
|
}));
|
|
1365
1458
|
yield* publishEvent(completedEvent);
|
|
@@ -1447,7 +1540,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1447
1540
|
const artifactPath = filePath;
|
|
1448
1541
|
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1449
1542
|
const progress = aggregateTestCaseProgress(lines);
|
|
1450
|
-
const completedTestCases = runCompleted
|
|
1543
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1451
1544
|
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1452
1545
|
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1453
1546
|
return {
|
|
@@ -1469,23 +1562,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1469
1562
|
}
|
|
1470
1563
|
function aggregateTestCaseProgress(lines) {
|
|
1471
1564
|
let completedTestCases = 0;
|
|
1472
|
-
|
|
1473
|
-
let failedTestCases = 0;
|
|
1565
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
1474
1566
|
for (const line of lines) {
|
|
1475
1567
|
try {
|
|
1476
1568
|
const event = JSON.parse(line);
|
|
1477
1569
|
if (event.type === "TestCaseProgress") {
|
|
1478
1570
|
const ev = event;
|
|
1479
1571
|
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
failedTestCases += 1;
|
|
1484
|
-
}
|
|
1572
|
+
const id = ev.testCaseId;
|
|
1573
|
+
const current = testCasePassedBy.get(id);
|
|
1574
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1485
1575
|
}
|
|
1486
1576
|
} catch {
|
|
1487
1577
|
}
|
|
1488
1578
|
}
|
|
1579
|
+
let passedTestCases = 0;
|
|
1580
|
+
let failedTestCases = 0;
|
|
1581
|
+
for (const passed of testCasePassedBy.values()) {
|
|
1582
|
+
if (passed) {
|
|
1583
|
+
passedTestCases += 1;
|
|
1584
|
+
} else {
|
|
1585
|
+
failedTestCases += 1;
|
|
1586
|
+
}
|
|
1587
|
+
}
|
|
1489
1588
|
return { completedTestCases, passedTestCases, failedTestCases };
|
|
1490
1589
|
}
|
|
1491
1590
|
async function parseArtifactFile(artifactPath) {
|
|
@@ -1503,6 +1602,8 @@ async function parseArtifactFile(artifactPath) {
|
|
|
1503
1602
|
testCaseName: ev.testCaseName,
|
|
1504
1603
|
completedTestCases: ev.completedTestCases,
|
|
1505
1604
|
totalTestCases: ev.totalTestCases,
|
|
1605
|
+
rerunIndex: ev.rerunIndex,
|
|
1606
|
+
rerunTotal: ev.rerunTotal,
|
|
1506
1607
|
passed: ev.passed,
|
|
1507
1608
|
durationMs: ev.durationMs,
|
|
1508
1609
|
evaluatorScores: ev.evaluatorScores ?? []
|
|
@@ -1708,6 +1809,10 @@ var EffectRunner = class {
|
|
|
1708
1809
|
throw new Error("No evaluators selected for run");
|
|
1709
1810
|
}
|
|
1710
1811
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1812
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1813
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1814
|
+
0
|
|
1815
|
+
);
|
|
1711
1816
|
const runId = `run-${randomUUID()}`;
|
|
1712
1817
|
const artifactPath = createArtifactPath(
|
|
1713
1818
|
this.config.artifactDirectory,
|
|
@@ -1720,7 +1825,7 @@ var EffectRunner = class {
|
|
|
1720
1825
|
datasetName: dataset.dataset.getName(),
|
|
1721
1826
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1722
1827
|
queuedAt: Date.now(),
|
|
1723
|
-
totalTestCases:
|
|
1828
|
+
totalTestCases: totalEvaluations,
|
|
1724
1829
|
completedTestCases: 0,
|
|
1725
1830
|
passedTestCases: 0,
|
|
1726
1831
|
failedTestCases: 0,
|
|
@@ -1734,7 +1839,7 @@ var EffectRunner = class {
|
|
|
1734
1839
|
datasetId: request.datasetId,
|
|
1735
1840
|
datasetName: dataset.dataset.getName(),
|
|
1736
1841
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1737
|
-
totalTestCases:
|
|
1842
|
+
totalTestCases: totalEvaluations,
|
|
1738
1843
|
artifactPath
|
|
1739
1844
|
};
|
|
1740
1845
|
await Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -1745,6 +1850,7 @@ var EffectRunner = class {
|
|
|
1745
1850
|
payload: queuedEvent
|
|
1746
1851
|
})
|
|
1747
1852
|
);
|
|
1853
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1748
1854
|
await Effect.runPromise(
|
|
1749
1855
|
Queue.offer(this.runQueue, {
|
|
1750
1856
|
runId,
|
|
@@ -1752,7 +1858,8 @@ var EffectRunner = class {
|
|
|
1752
1858
|
dataset: dataset.dataset,
|
|
1753
1859
|
evaluators: selectedEvaluators,
|
|
1754
1860
|
testCases: selectedTestCases,
|
|
1755
|
-
snapshot
|
|
1861
|
+
snapshot,
|
|
1862
|
+
maxConcurrency
|
|
1756
1863
|
})
|
|
1757
1864
|
);
|
|
1758
1865
|
return snapshot;
|
|
@@ -1949,6 +2056,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1949
2056
|
rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp6"));
|
|
1950
2057
|
rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
|
|
1951
2058
|
for (const tc of testCases) {
|
|
2059
|
+
const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
|
|
1952
2060
|
rows.push(
|
|
1953
2061
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1954
2062
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
@@ -1960,12 +2068,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1960
2068
|
] }),
|
|
1961
2069
|
" ",
|
|
1962
2070
|
tc.testCaseName,
|
|
2071
|
+
rerunPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: rerunPart }) : null,
|
|
1963
2072
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1964
2073
|
" (",
|
|
1965
2074
|
tc.durationMs,
|
|
1966
2075
|
"ms)"
|
|
1967
2076
|
] })
|
|
1968
|
-
] }, `tc-${tc.testCaseId}`)
|
|
2077
|
+
] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
|
|
1969
2078
|
);
|
|
1970
2079
|
for (const item of tc.evaluatorScores) {
|
|
1971
2080
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|