@m4trix/evals 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +599 -224
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +600 -225
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +214 -105
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +215 -106
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +217 -104
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +23 -5
- package/dist/index.js +218 -105
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.cjs
CHANGED
|
@@ -860,6 +860,7 @@ var Metric = {
|
|
|
860
860
|
const def = {
|
|
861
861
|
id: config.id,
|
|
862
862
|
name: config.name,
|
|
863
|
+
aggregate: config.aggregate,
|
|
863
864
|
format: config.format,
|
|
864
865
|
make: (data) => ({ id: config.id, data })
|
|
865
866
|
};
|
|
@@ -879,6 +880,7 @@ var Score = {
|
|
|
879
880
|
id: config.id,
|
|
880
881
|
name: config.name,
|
|
881
882
|
displayStrategy: config.displayStrategy,
|
|
883
|
+
aggregate: config.aggregate,
|
|
882
884
|
format: config.format,
|
|
883
885
|
make: (data, options) => {
|
|
884
886
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
@@ -897,23 +899,62 @@ function getScoreById(id) {
|
|
|
897
899
|
return registry2.get(id);
|
|
898
900
|
}
|
|
899
901
|
|
|
902
|
+
// src/evals/aggregators.ts
|
|
903
|
+
/**
 * Averages the numeric `value` field across a list of score payloads.
 *
 * @param {Array<{ value: number }>} values - Score payloads to combine.
 * @returns {{ value: number }} The arithmetic mean, or `{ value: 0 }` when
 *   the list is empty.
 */
function aggregateAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0 };
  }
  let total = 0;
  values.forEach((entry) => {
    total += entry.value;
  });
  return { value: total / count };
}
|
|
910
|
+
/**
 * Combines pass/fail payloads into a single verdict.
 *
 * @param {Array<{ passed: unknown }>} values - Payloads to combine.
 * @returns {{ passed: boolean }} `true` only when the list is non-empty and
 *   no entry has a falsy `passed`.
 */
function aggregateAll(values) {
  const hasEntries = values.length !== 0;
  const anyFailed = values.some((entry) => !entry.passed);
  return { passed: hasEntries && !anyFailed };
}
|
|
913
|
+
/**
 * Sums token-count payloads field by field.
 *
 * Fields that are `null` or `undefined` on an entry count as 0.
 *
 * @param {Array<{ input?: number, output?: number, inputCached?: number, outputCached?: number }>} values
 *   Token-count payloads to combine.
 * @returns {{ input: number, output: number, inputCached: number, outputCached: number }}
 *   Per-field totals (all zeros for an empty list).
 */
function aggregateTokenCountSum(values) {
  const totals = {
    input: 0,
    output: 0,
    inputCached: 0,
    outputCached: 0
  };
  values.forEach((entry) => {
    totals.input += entry.input ?? 0;
    totals.output += entry.output ?? 0;
    totals.inputCached += entry.inputCached ?? 0;
    totals.outputCached += entry.outputCached ?? 0;
  });
  return totals;
}
|
|
930
|
+
/**
 * Averages the `ms` field across a list of latency payloads.
 *
 * @param {Array<{ ms: number }>} values - Latency payloads to combine.
 * @returns {{ ms: number }} The arithmetic mean, or `{ ms: 0 }` when the
 *   list is empty.
 */
function aggregateLatencyAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { ms: 0 };
  }
  let totalMs = 0;
  values.forEach((entry) => {
    totalMs += entry.ms;
  });
  return { ms: totalMs / count };
}
|
|
937
|
+
|
|
900
938
|
// src/evals/metrics/standard.ts
|
|
901
939
|
// Registers the "token-count" metric: summed across runs and rendered as
// `in:<n> out:<n> cached:<n>`, prefixed with "Total: " when aggregated.
Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    // Missing counters are shown as 0.
    const tokensIn = data.input ?? 0;
    const tokensOut = data.output ?? 0;
    const cachedTotal = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const summary = `in:${tokensIn} out:${tokensOut} cached:${cachedTotal}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});
|
|
913
953
|
// Registers the "latency" metric: averaged across runs and rendered as
// `<ms>ms`, prefixed with "Avg: " when aggregated.
Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    if (options?.isAggregated) {
      return `Avg: ${data.ms}ms`;
    }
    return `${data.ms}ms`;
  }
});
|
|
918
959
|
|
|
919
960
|
// src/evals/scores/standard.ts
|
|
@@ -921,13 +962,15 @@ Score.of({
|
|
|
921
962
|
id: "percent",
|
|
922
963
|
name: "Score",
|
|
923
964
|
displayStrategy: "bar",
|
|
924
|
-
format: (data) => data.value.toFixed(2)
|
|
965
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
|
|
966
|
+
aggregate: aggregateAverage
|
|
925
967
|
});
|
|
926
968
|
// Registers the "binary" pass/fail score. Aggregated results report whether
// every run passed; single results report the individual verdict.
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data, options) => {
    if (options?.isAggregated) {
      return data.passed ? "All: PASSED" : "Some: FAILED";
    }
    return data.passed ? "PASSED" : "NOT PASSED";
  },
  aggregate: aggregateAll
});
|
|
932
975
|
function createDiffLogEntry(expected, actual, options) {
|
|
933
976
|
const diff = jsonDiff.diffString(expected, actual, { color: false });
|
|
@@ -1009,7 +1052,8 @@ var defaultRunnerConfig = {
|
|
|
1009
1052
|
],
|
|
1010
1053
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
1011
1054
|
},
|
|
1012
|
-
artifactDirectory: ".eval-results"
|
|
1055
|
+
artifactDirectory: ".eval-results",
|
|
1056
|
+
maxConcurrency: 1
|
|
1013
1057
|
};
|
|
1014
1058
|
function toRunnerConfigOverrides(config) {
|
|
1015
1059
|
if (!config) {
|
|
@@ -1042,6 +1086,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
1042
1086
|
if (config.artifactDirectory !== void 0) {
|
|
1043
1087
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
1044
1088
|
}
|
|
1089
|
+
if (config.maxConcurrency !== void 0) {
|
|
1090
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
1091
|
+
}
|
|
1045
1092
|
if (Object.keys(discovery).length > 0) {
|
|
1046
1093
|
overrides.discovery = discovery;
|
|
1047
1094
|
}
|
|
@@ -1276,6 +1323,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1276
1323
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1277
1324
|
);
|
|
1278
1325
|
}
|
|
1326
|
+
/**
 * Builds an effect that evaluates one test case with every evaluator of the
 * run, once per rerun, publishing and persisting a `TestCaseProgress` event
 * after each rerun.
 *
 * Counters are shared through refs so several test cases can be processed
 * concurrently:
 * - `completedRef` is incremented once per rerun (i.e. per evaluation),
 * - `passedRef` / `failedRef` are incremented once per test case; a test case
 *   passes only when every one of its reruns passed.
 *
 * @param task - Run task; `runId`, `evaluators` and `snapshot.artifactPath` are read.
 * @param testCaseItem - `{ id, testCase }` entry being evaluated.
 * @param totalEvaluations - Total number of evaluations in the run, reported
 *   as `totalTestCases` on each progress event.
 * @param publishEvent - Function returning an effect that publishes an event.
 * @param persistenceQueue - Queue receiving `{ runId, artifactPath, payload }` entries.
 * @param updateSnapshot - `(runId, updater) => void` snapshot mutator.
 * @param completedRef - Ref holding the number of completed evaluations.
 * @param passedRef - Ref holding the number of passed test cases.
 * @param failedRef - Ref holding the number of failed test cases.
 * @returns An effect that completes once all reruns have been processed.
 */
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
  // `Effect.promise` expects a promise that never rejects, and a failure of a
  // yielded effect is not thrown back into the generator, so a `try/catch`
  // around `yield*` would never see an evaluator error. Settle the promise
  // here and let the generator re-throw, so the `catch` below handles it.
  const settle = (thunk) => effect.Effect.promise(async () => {
    try {
      return { ok: true, value: await thunk() };
    } catch (error) {
      return { ok: false, error };
    }
  });
  return effect.Effect.gen(function* () {
    // Test cases without a `getReruns` method are evaluated exactly once.
    const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
    const rerunPassed = [];
    for (let r = 0; r < reruns; r++) {
      const started = Date.now();
      const evaluatorScores = [];
      let testCaseError;
      const output = readOutput(testCaseItem.testCase);
      for (const { id: evaluatorId, evaluator } of task.evaluators) {
        const evaluateFn = evaluator.getEvaluateFn();
        if (!evaluateFn) {
          continue;
        }
        try {
          const logs = [];
          const logDiff = (expected, actual, options) => {
            logs.push(createDiffLogEntry(expected, actual, options));
          };
          const ctxOutcome = yield* settle(() => evaluator.resolveContext());
          if (!ctxOutcome.ok) {
            throw ctxOutcome.error;
          }
          const ctx = ctxOutcome.value;
          const resultOutcome = yield* settle(
            () => evaluateFn({
              input: testCaseItem.testCase.getInput(),
              ctx,
              output,
              logDiff
            })
          );
          if (!resultOutcome.ok) {
            throw resultOutcome.error;
          }
          const result = resultOutcome.value;
          const { scores, metrics } = normalizeResult(result);
          const evaluatorPassed = computeEvaluatorPassed(evaluator, result, scores);
          evaluatorScores.push({
            evaluatorId,
            scores,
            passed: evaluatorPassed,
            metrics,
            logs: logs.length > 0 ? logs : void 0
          });
        } catch (error) {
          // A failing evaluator marks this rerun as failed but does not stop
          // the remaining evaluators; the last error message is reported.
          testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
          evaluatorScores.push({
            evaluatorId,
            scores: [],
            passed: false
          });
        }
      }
      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
      rerunPassed.push(rerunPassedThis);
      // Atomically bump the shared counter and read the new value.
      const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
        n + 1,
        n + 1
      ]);
      // Note: `completedTestCases` / `totalTestCases` on the event count
      // evaluations (test case x rerun), not unique test cases.
      const progressEvent = {
        type: "TestCaseProgress",
        runId: task.runId,
        testCaseId: testCaseItem.id,
        testCaseName: testCaseItem.testCase.getName(),
        completedTestCases: completedEvaluations,
        totalTestCases: totalEvaluations,
        rerunIndex: r + 1,
        rerunTotal: reruns,
        passed: rerunPassedThis,
        durationMs: Date.now() - started,
        evaluatorScores,
        output,
        errorMessage: testCaseError
      };
      updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        completedTestCases: completedEvaluations
      }));
      yield* publishEvent(progressEvent);
      yield* effect.Queue.offer(persistenceQueue, {
        runId: task.runId,
        artifactPath: task.snapshot.artifactPath,
        payload: progressEvent
      });
    }
    const testCasePassed = rerunPassed.every(Boolean);
    if (testCasePassed) {
      yield* effect.Ref.update(passedRef, (n) => n + 1);
    } else {
      yield* effect.Ref.update(failedRef, (n) => n + 1);
    }
    const [passed, failed] = yield* effect.Effect.all([
      effect.Ref.get(passedRef),
      effect.Ref.get(failedRef)
    ]);
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
|
|
1279
1425
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
1280
1426
|
const startedAt = Date.now();
|
|
1281
1427
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -1288,104 +1434,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1288
1434
|
runId: task.runId,
|
|
1289
1435
|
startedAt
|
|
1290
1436
|
});
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
)
|
|
1321
|
-
);
|
|
1322
|
-
const { scores, metrics } = normalizeResult(result);
|
|
1323
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
1324
|
-
evaluatorScores.push({
|
|
1325
|
-
evaluatorId,
|
|
1326
|
-
scores,
|
|
1327
|
-
passed,
|
|
1328
|
-
metrics,
|
|
1329
|
-
logs: logs.length > 0 ? logs : void 0
|
|
1330
|
-
});
|
|
1331
|
-
} catch (error) {
|
|
1332
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1333
|
-
evaluatorScores.push({
|
|
1334
|
-
evaluatorId,
|
|
1335
|
-
scores: [],
|
|
1336
|
-
passed: false
|
|
1337
|
-
});
|
|
1338
|
-
}
|
|
1339
|
-
}
|
|
1340
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
1341
|
-
completedTestCases += 1;
|
|
1342
|
-
if (testCasePassed) {
|
|
1343
|
-
passedTestCases += 1;
|
|
1344
|
-
} else {
|
|
1345
|
-
failedTestCases += 1;
|
|
1346
|
-
}
|
|
1347
|
-
const progressEvent = {
|
|
1348
|
-
type: "TestCaseProgress",
|
|
1349
|
-
runId: task.runId,
|
|
1350
|
-
testCaseId: testCaseItem.id,
|
|
1351
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
1352
|
-
completedTestCases,
|
|
1353
|
-
totalTestCases: task.testCases.length,
|
|
1354
|
-
passed: testCasePassed,
|
|
1355
|
-
durationMs: Date.now() - started,
|
|
1356
|
-
evaluatorScores,
|
|
1357
|
-
output,
|
|
1358
|
-
errorMessage: testCaseError
|
|
1359
|
-
};
|
|
1360
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1361
|
-
...snapshot,
|
|
1362
|
-
completedTestCases,
|
|
1363
|
-
passedTestCases,
|
|
1364
|
-
failedTestCases
|
|
1365
|
-
}));
|
|
1366
|
-
yield* publishEvent(progressEvent);
|
|
1367
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
1368
|
-
runId: task.runId,
|
|
1369
|
-
artifactPath: task.snapshot.artifactPath,
|
|
1370
|
-
payload: progressEvent
|
|
1371
|
-
});
|
|
1372
|
-
}
|
|
1437
|
+
const totalEvaluations = task.testCases.reduce(
|
|
1438
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1439
|
+
0
|
|
1440
|
+
);
|
|
1441
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1442
|
+
const completedRef = yield* effect.Ref.make(0);
|
|
1443
|
+
const passedRef = yield* effect.Ref.make(0);
|
|
1444
|
+
const failedRef = yield* effect.Ref.make(0);
|
|
1445
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
1446
|
+
task,
|
|
1447
|
+
testCaseItem,
|
|
1448
|
+
totalEvaluations,
|
|
1449
|
+
publishEvent,
|
|
1450
|
+
persistenceQueue,
|
|
1451
|
+
updateSnapshot,
|
|
1452
|
+
completedRef,
|
|
1453
|
+
passedRef,
|
|
1454
|
+
failedRef
|
|
1455
|
+
);
|
|
1456
|
+
yield* effect.Effect.forEach(
|
|
1457
|
+
task.testCases,
|
|
1458
|
+
processTestCase,
|
|
1459
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1460
|
+
);
|
|
1461
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
1462
|
+
effect.Ref.get(completedRef),
|
|
1463
|
+
effect.Ref.get(passedRef),
|
|
1464
|
+
effect.Ref.get(failedRef)
|
|
1465
|
+
]);
|
|
1373
1466
|
const finishedAt = Date.now();
|
|
1374
1467
|
const completedEvent = {
|
|
1375
1468
|
type: "RunCompleted",
|
|
1376
1469
|
runId: task.runId,
|
|
1377
1470
|
finishedAt,
|
|
1378
|
-
passedTestCases,
|
|
1379
|
-
failedTestCases,
|
|
1471
|
+
passedTestCases: passedUniqueTestCases,
|
|
1472
|
+
failedTestCases: failedUniqueTestCases,
|
|
1380
1473
|
totalTestCases: task.testCases.length,
|
|
1381
1474
|
artifactPath: task.snapshot.artifactPath
|
|
1382
1475
|
};
|
|
1383
1476
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
1384
1477
|
...snapshot,
|
|
1385
1478
|
status: "completed",
|
|
1386
|
-
completedTestCases,
|
|
1387
|
-
passedTestCases,
|
|
1388
|
-
failedTestCases,
|
|
1479
|
+
completedTestCases: completedEvaluations,
|
|
1480
|
+
passedTestCases: passedUniqueTestCases,
|
|
1481
|
+
failedTestCases: failedUniqueTestCases,
|
|
1389
1482
|
finishedAt
|
|
1390
1483
|
}));
|
|
1391
1484
|
yield* publishEvent(completedEvent);
|
|
@@ -1473,7 +1566,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1473
1566
|
const artifactPath = filePath;
|
|
1474
1567
|
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1475
1568
|
const progress = aggregateTestCaseProgress(lines);
|
|
1476
|
-
const completedTestCases = runCompleted
|
|
1569
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1477
1570
|
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1478
1571
|
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1479
1572
|
return {
|
|
@@ -1495,23 +1588,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1495
1588
|
}
|
|
1496
1589
|
/**
 * Derives progress counters from the raw JSONL lines of a run artifact.
 *
 * Only `TestCaseProgress` events are considered. A test case may appear
 * several times (one event per rerun); it counts as passed only when every
 * one of its events has a truthy `passed`.
 *
 * @param {string[]} lines - Artifact lines, one JSON document each.
 * @returns {{ completedTestCases: number, passedTestCases: number, failedTestCases: number }}
 *   `completedTestCases` is the value carried by the last progress event that
 *   has one; passed/failed count unique test case ids.
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const testCasePassedBy = /* @__PURE__ */ new Map();
  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Best-effort parsing: malformed lines are skipped.
      continue;
    }
    if (event?.type !== "TestCaseProgress") {
      continue;
    }
    completedTestCases = event.completedTestCases ?? completedTestCases;
    // Always store a real boolean: an event without `passed` must not be
    // mistaken for "not seen yet", otherwise a later passing rerun would
    // erase the earlier failure.
    const passedSoFar = testCasePassedBy.get(event.testCaseId) ?? true;
    testCasePassedBy.set(event.testCaseId, passedSoFar && Boolean(event.passed));
  }
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const passed of testCasePassedBy.values()) {
    if (passed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
  }
  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
1517
1616
|
async function parseArtifactFile(artifactPath) {
|
|
@@ -1529,6 +1628,8 @@ async function parseArtifactFile(artifactPath) {
|
|
|
1529
1628
|
testCaseName: ev.testCaseName,
|
|
1530
1629
|
completedTestCases: ev.completedTestCases,
|
|
1531
1630
|
totalTestCases: ev.totalTestCases,
|
|
1631
|
+
rerunIndex: ev.rerunIndex,
|
|
1632
|
+
rerunTotal: ev.rerunTotal,
|
|
1532
1633
|
passed: ev.passed,
|
|
1533
1634
|
durationMs: ev.durationMs,
|
|
1534
1635
|
evaluatorScores: ev.evaluatorScores ?? []
|
|
@@ -1734,6 +1835,10 @@ var EffectRunner = class {
|
|
|
1734
1835
|
throw new Error("No evaluators selected for run");
|
|
1735
1836
|
}
|
|
1736
1837
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1838
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1839
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1840
|
+
0
|
|
1841
|
+
);
|
|
1737
1842
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1738
1843
|
const artifactPath = createArtifactPath(
|
|
1739
1844
|
this.config.artifactDirectory,
|
|
@@ -1746,7 +1851,7 @@ var EffectRunner = class {
|
|
|
1746
1851
|
datasetName: dataset.dataset.getName(),
|
|
1747
1852
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1748
1853
|
queuedAt: Date.now(),
|
|
1749
|
-
totalTestCases:
|
|
1854
|
+
totalTestCases: totalEvaluations,
|
|
1750
1855
|
completedTestCases: 0,
|
|
1751
1856
|
passedTestCases: 0,
|
|
1752
1857
|
failedTestCases: 0,
|
|
@@ -1760,7 +1865,7 @@ var EffectRunner = class {
|
|
|
1760
1865
|
datasetId: request.datasetId,
|
|
1761
1866
|
datasetName: dataset.dataset.getName(),
|
|
1762
1867
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1763
|
-
totalTestCases:
|
|
1868
|
+
totalTestCases: totalEvaluations,
|
|
1764
1869
|
artifactPath
|
|
1765
1870
|
};
|
|
1766
1871
|
await effect.Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -1771,6 +1876,7 @@ var EffectRunner = class {
|
|
|
1771
1876
|
payload: queuedEvent
|
|
1772
1877
|
})
|
|
1773
1878
|
);
|
|
1879
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1774
1880
|
await effect.Effect.runPromise(
|
|
1775
1881
|
effect.Queue.offer(this.runQueue, {
|
|
1776
1882
|
runId,
|
|
@@ -1778,7 +1884,8 @@ var EffectRunner = class {
|
|
|
1778
1884
|
dataset: dataset.dataset,
|
|
1779
1885
|
evaluators: selectedEvaluators,
|
|
1780
1886
|
testCases: selectedTestCases,
|
|
1781
|
-
snapshot
|
|
1887
|
+
snapshot,
|
|
1888
|
+
maxConcurrency
|
|
1782
1889
|
})
|
|
1783
1890
|
);
|
|
1784
1891
|
return snapshot;
|
|
@@ -1975,6 +2082,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1975
2082
|
rows.push(/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp6"));
|
|
1976
2083
|
rows.push(/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
|
|
1977
2084
|
for (const tc of testCases) {
|
|
2085
|
+
const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
|
|
1978
2086
|
rows.push(
|
|
1979
2087
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1980
2088
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
@@ -1986,12 +2094,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1986
2094
|
] }),
|
|
1987
2095
|
" ",
|
|
1988
2096
|
tc.testCaseName,
|
|
2097
|
+
rerunPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: rerunPart }) : null,
|
|
1989
2098
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1990
2099
|
" (",
|
|
1991
2100
|
tc.durationMs,
|
|
1992
2101
|
"ms)"
|
|
1993
2102
|
] })
|
|
1994
|
-
] }, `tc-${tc.testCaseId}`)
|
|
2103
|
+
] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
|
|
1995
2104
|
);
|
|
1996
2105
|
for (const item of tc.evaluatorScores) {
|
|
1997
2106
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|