@m4trix/evals 0.21.1 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +200 -154
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +201 -155
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +196 -151
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +197 -152
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +196 -151
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +197 -152
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -1215,6 +1215,20 @@ function readOutput(testCase) {
|
|
|
1215
1215
|
}
|
|
1216
1216
|
return candidate.getOutput();
|
|
1217
1217
|
}
|
|
1218
|
+
function buildEvaluationUnits(testCases) {
|
|
1219
|
+
const units = [];
|
|
1220
|
+
for (const testCaseItem of testCases) {
|
|
1221
|
+
const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1222
|
+
for (let r = 0; r < rerunTotal; r++) {
|
|
1223
|
+
units.push({
|
|
1224
|
+
testCaseItem,
|
|
1225
|
+
rerunIndex: r + 1,
|
|
1226
|
+
rerunTotal
|
|
1227
|
+
});
|
|
1228
|
+
}
|
|
1229
|
+
}
|
|
1230
|
+
return units;
|
|
1231
|
+
}
|
|
1218
1232
|
function nowIsoForFile() {
|
|
1219
1233
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1220
1234
|
}
|
|
@@ -1224,157 +1238,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1224
1238
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1225
1239
|
);
|
|
1226
1240
|
}
|
|
1227
|
-
function
|
|
1241
|
+
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1242
|
+
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
1228
1243
|
return effect.Effect.gen(function* () {
|
|
1229
|
-
const
|
|
1230
|
-
const
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1293
|
-
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1294
|
-
testCaseError = result.message;
|
|
1295
|
-
evaluatorScores.push({
|
|
1296
|
-
evaluatorId,
|
|
1297
|
-
scores: [],
|
|
1298
|
-
passed: false,
|
|
1299
|
-
logs: logs.length > 0 ? logs : void 0
|
|
1300
|
-
});
|
|
1301
|
-
continue;
|
|
1302
|
-
}
|
|
1303
|
-
const { scores, metrics } = normalizeResult(result);
|
|
1304
|
-
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1305
|
-
evaluatorScores.push({
|
|
1306
|
-
evaluatorId,
|
|
1307
|
-
scores,
|
|
1308
|
-
passed: passed2,
|
|
1309
|
-
metrics,
|
|
1310
|
-
logs: logs.length > 0 ? logs : void 0
|
|
1311
|
-
});
|
|
1312
|
-
} catch (error) {
|
|
1313
|
-
if (error instanceof Error) {
|
|
1314
|
-
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1315
|
-
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1316
|
-
}
|
|
1317
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1244
|
+
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1245
|
+
const started = Date.now();
|
|
1246
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
1247
|
+
n + 1,
|
|
1248
|
+
n + 1
|
|
1249
|
+
]);
|
|
1250
|
+
yield* publishEvent({
|
|
1251
|
+
type: "TestCaseStarted",
|
|
1252
|
+
runId: task.runId,
|
|
1253
|
+
testCaseId: testCaseItem.id,
|
|
1254
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1255
|
+
startedTestCases: startedEvaluations,
|
|
1256
|
+
totalTestCases: totalEvaluations,
|
|
1257
|
+
rerunIndex,
|
|
1258
|
+
rerunTotal
|
|
1259
|
+
});
|
|
1260
|
+
const evaluatorScores = [];
|
|
1261
|
+
let testCaseError;
|
|
1262
|
+
const output = readOutput(testCaseItem.testCase);
|
|
1263
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
1264
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
1265
|
+
if (!evaluateFn) {
|
|
1266
|
+
continue;
|
|
1267
|
+
}
|
|
1268
|
+
const logs = [];
|
|
1269
|
+
const logDiff = (expected, actual, options) => {
|
|
1270
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1271
|
+
};
|
|
1272
|
+
const log = (message, options) => {
|
|
1273
|
+
logs.push(createLogEntry(message, options));
|
|
1274
|
+
};
|
|
1275
|
+
const createError = (message, options) => {
|
|
1276
|
+
const entry = createLogEntry(message, options);
|
|
1277
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1278
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1279
|
+
return error;
|
|
1280
|
+
};
|
|
1281
|
+
try {
|
|
1282
|
+
const ctx = yield* effect.Effect.promise(
|
|
1283
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
1284
|
+
);
|
|
1285
|
+
const result = yield* effect.Effect.promise(
|
|
1286
|
+
() => Promise.resolve().then(
|
|
1287
|
+
() => evaluateFn({
|
|
1288
|
+
input: testCaseItem.testCase.getInput(),
|
|
1289
|
+
ctx,
|
|
1290
|
+
output,
|
|
1291
|
+
meta: {
|
|
1292
|
+
triggerId: task.triggerId,
|
|
1293
|
+
runId: evaluatorRunId,
|
|
1294
|
+
datasetId: task.datasetId
|
|
1295
|
+
},
|
|
1296
|
+
logDiff,
|
|
1297
|
+
log,
|
|
1298
|
+
createError
|
|
1299
|
+
})
|
|
1300
|
+
)
|
|
1301
|
+
);
|
|
1302
|
+
if (result instanceof Error) {
|
|
1303
|
+
const evaluatorError = result;
|
|
1304
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1305
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1306
|
+
testCaseError = result.message;
|
|
1318
1307
|
evaluatorScores.push({
|
|
1319
1308
|
evaluatorId,
|
|
1320
1309
|
scores: [],
|
|
1321
1310
|
passed: false,
|
|
1322
1311
|
logs: logs.length > 0 ? logs : void 0
|
|
1323
1312
|
});
|
|
1313
|
+
continue;
|
|
1324
1314
|
}
|
|
1315
|
+
const { scores, metrics } = normalizeResult(result);
|
|
1316
|
+
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
1317
|
+
evaluatorScores.push({
|
|
1318
|
+
evaluatorId,
|
|
1319
|
+
scores,
|
|
1320
|
+
passed,
|
|
1321
|
+
metrics,
|
|
1322
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1323
|
+
});
|
|
1324
|
+
} catch (error) {
|
|
1325
|
+
if (error instanceof Error) {
|
|
1326
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1327
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1328
|
+
}
|
|
1329
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1330
|
+
evaluatorScores.push({
|
|
1331
|
+
evaluatorId,
|
|
1332
|
+
scores: [],
|
|
1333
|
+
passed: false,
|
|
1334
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1335
|
+
});
|
|
1325
1336
|
}
|
|
1326
|
-
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1327
|
-
rerunPassed.push(rerunPassedThis);
|
|
1328
|
-
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
1329
|
-
n + 1,
|
|
1330
|
-
n + 1
|
|
1331
|
-
]);
|
|
1332
|
-
const progressEvent = {
|
|
1333
|
-
type: "TestCaseProgress",
|
|
1334
|
-
runId: task.runId,
|
|
1335
|
-
testCaseId: testCaseItem.id,
|
|
1336
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
1337
|
-
completedTestCases: completedEvaluations,
|
|
1338
|
-
totalTestCases: totalEvaluations,
|
|
1339
|
-
rerunIndex: r + 1,
|
|
1340
|
-
rerunTotal: reruns,
|
|
1341
|
-
passed: rerunPassedThis,
|
|
1342
|
-
durationMs: Date.now() - started,
|
|
1343
|
-
evaluatorScores,
|
|
1344
|
-
output,
|
|
1345
|
-
errorMessage: testCaseError
|
|
1346
|
-
};
|
|
1347
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1348
|
-
...snapshot,
|
|
1349
|
-
completedTestCases: completedEvaluations
|
|
1350
|
-
}));
|
|
1351
|
-
yield* publishEvent(progressEvent);
|
|
1352
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
1353
|
-
runId: task.runId,
|
|
1354
|
-
artifactPath: task.snapshot.artifactPath,
|
|
1355
|
-
payload: progressEvent
|
|
1356
|
-
});
|
|
1357
|
-
}
|
|
1358
|
-
const testCasePassed = rerunPassed.every(Boolean);
|
|
1359
|
-
if (testCasePassed) {
|
|
1360
|
-
yield* effect.Ref.update(passedRef, (n) => n + 1);
|
|
1361
|
-
} else {
|
|
1362
|
-
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
1363
1337
|
}
|
|
1364
|
-
const
|
|
1365
|
-
|
|
1366
|
-
|
|
1338
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1339
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
1340
|
+
n + 1,
|
|
1341
|
+
n + 1
|
|
1367
1342
|
]);
|
|
1368
|
-
|
|
1343
|
+
const progressEvent = {
|
|
1344
|
+
type: "TestCaseProgress",
|
|
1345
|
+
runId: task.runId,
|
|
1346
|
+
testCaseId: testCaseItem.id,
|
|
1347
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1348
|
+
completedTestCases: completedEvaluations,
|
|
1349
|
+
totalTestCases: totalEvaluations,
|
|
1350
|
+
rerunIndex,
|
|
1351
|
+
rerunTotal,
|
|
1352
|
+
passed: rerunPassedThis,
|
|
1353
|
+
durationMs: Date.now() - started,
|
|
1354
|
+
evaluatorScores,
|
|
1355
|
+
output,
|
|
1356
|
+
errorMessage: testCaseError
|
|
1357
|
+
};
|
|
1358
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1369
1359
|
...snapshot,
|
|
1370
|
-
|
|
1371
|
-
failedTestCases: failed
|
|
1360
|
+
completedTestCases: completedEvaluations
|
|
1372
1361
|
}));
|
|
1362
|
+
yield* publishEvent(progressEvent);
|
|
1363
|
+
yield* effect.Queue.offer(persistenceQueue, {
|
|
1364
|
+
runId: task.runId,
|
|
1365
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1366
|
+
payload: progressEvent
|
|
1367
|
+
});
|
|
1368
|
+
const testCaseCompleted = yield* effect.Ref.modify(
|
|
1369
|
+
testCaseResultsRef,
|
|
1370
|
+
(map) => {
|
|
1371
|
+
const key = testCaseItem.id;
|
|
1372
|
+
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1373
|
+
const newResults = [...existing.results, rerunPassedThis];
|
|
1374
|
+
const newCompletedCount = existing.completedCount + 1;
|
|
1375
|
+
const isLast = newCompletedCount === rerunTotal;
|
|
1376
|
+
const newMap = new Map(map);
|
|
1377
|
+
newMap.set(key, {
|
|
1378
|
+
completedCount: newCompletedCount,
|
|
1379
|
+
results: newResults
|
|
1380
|
+
});
|
|
1381
|
+
const outcome = isLast ? newResults.every(Boolean) : null;
|
|
1382
|
+
return [outcome, newMap];
|
|
1383
|
+
}
|
|
1384
|
+
);
|
|
1385
|
+
if (testCaseCompleted !== null) {
|
|
1386
|
+
if (testCaseCompleted) {
|
|
1387
|
+
yield* effect.Ref.update(passedRef, (n) => n + 1);
|
|
1388
|
+
} else {
|
|
1389
|
+
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
1390
|
+
}
|
|
1391
|
+
const [passed, failed] = yield* effect.Effect.all([
|
|
1392
|
+
effect.Ref.get(passedRef),
|
|
1393
|
+
effect.Ref.get(failedRef)
|
|
1394
|
+
]);
|
|
1395
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1396
|
+
...snapshot,
|
|
1397
|
+
passedTestCases: passed,
|
|
1398
|
+
failedTestCases: failed
|
|
1399
|
+
}));
|
|
1400
|
+
}
|
|
1373
1401
|
});
|
|
1374
1402
|
}
|
|
1375
1403
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
1376
1404
|
const startedAt = Date.now();
|
|
1377
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1405
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1378
1406
|
...snapshot,
|
|
1379
1407
|
status: "running",
|
|
1380
1408
|
startedAt
|
|
@@ -1393,9 +1421,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1393
1421
|
const startedRef = yield* effect.Ref.make(0);
|
|
1394
1422
|
const passedRef = yield* effect.Ref.make(0);
|
|
1395
1423
|
const failedRef = yield* effect.Ref.make(0);
|
|
1396
|
-
const
|
|
1424
|
+
const testCaseResultsRef = yield* effect.Ref.make(
|
|
1425
|
+
/* @__PURE__ */ new Map()
|
|
1426
|
+
);
|
|
1427
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1428
|
+
const processEvaluation = (unit) => processOneEvaluation(
|
|
1397
1429
|
task,
|
|
1398
|
-
|
|
1430
|
+
unit,
|
|
1399
1431
|
totalEvaluations,
|
|
1400
1432
|
publishEvent,
|
|
1401
1433
|
persistenceQueue,
|
|
@@ -1403,11 +1435,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1403
1435
|
startedRef,
|
|
1404
1436
|
completedRef,
|
|
1405
1437
|
passedRef,
|
|
1406
|
-
failedRef
|
|
1438
|
+
failedRef,
|
|
1439
|
+
testCaseResultsRef
|
|
1407
1440
|
);
|
|
1408
1441
|
yield* effect.Effect.forEach(
|
|
1409
|
-
|
|
1410
|
-
|
|
1442
|
+
evaluationUnits,
|
|
1443
|
+
processEvaluation,
|
|
1411
1444
|
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1412
1445
|
);
|
|
1413
1446
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
@@ -1425,7 +1458,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1425
1458
|
totalTestCases: task.testCases.length,
|
|
1426
1459
|
artifactPath: task.snapshot.artifactPath
|
|
1427
1460
|
};
|
|
1428
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1461
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1429
1462
|
...snapshot,
|
|
1430
1463
|
status: "completed",
|
|
1431
1464
|
completedTestCases: completedEvaluations,
|
|
@@ -1678,7 +1711,9 @@ var EffectRunner = class {
|
|
|
1678
1711
|
this.persistenceQueue = effect.Effect.runSync(
|
|
1679
1712
|
effect.Queue.unbounded()
|
|
1680
1713
|
);
|
|
1681
|
-
this.
|
|
1714
|
+
this.snapshotsRef = effect.Effect.runSync(
|
|
1715
|
+
effect.Ref.make(/* @__PURE__ */ new Map())
|
|
1716
|
+
);
|
|
1682
1717
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1683
1718
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1684
1719
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
@@ -1781,7 +1816,13 @@ var EffectRunner = class {
|
|
|
1781
1816
|
status: "queued",
|
|
1782
1817
|
artifactPath
|
|
1783
1818
|
};
|
|
1784
|
-
|
|
1819
|
+
await effect.Effect.runPromise(
|
|
1820
|
+
effect.Ref.update(this.snapshotsRef, (map) => {
|
|
1821
|
+
const next = new Map(map);
|
|
1822
|
+
next.set(runId, snapshot);
|
|
1823
|
+
return next;
|
|
1824
|
+
})
|
|
1825
|
+
);
|
|
1785
1826
|
const queuedEvent = {
|
|
1786
1827
|
type: "RunQueued",
|
|
1787
1828
|
runId,
|
|
@@ -1822,12 +1863,12 @@ var EffectRunner = class {
|
|
|
1822
1863
|
};
|
|
1823
1864
|
}
|
|
1824
1865
|
getRunSnapshot(runId) {
|
|
1825
|
-
return this.
|
|
1866
|
+
return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
|
|
1826
1867
|
}
|
|
1827
1868
|
getAllRunSnapshots() {
|
|
1828
|
-
return Array.from(
|
|
1829
|
-
(
|
|
1830
|
-
);
|
|
1869
|
+
return Array.from(
|
|
1870
|
+
effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
|
|
1871
|
+
).sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1831
1872
|
}
|
|
1832
1873
|
async loadRunSnapshotsFromArtifacts() {
|
|
1833
1874
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1856,11 +1897,15 @@ var EffectRunner = class {
|
|
|
1856
1897
|
);
|
|
1857
1898
|
}
|
|
1858
1899
|
updateSnapshot(runId, updater) {
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
1900
|
+
return effect.Ref.modify(this.snapshotsRef, (map) => {
|
|
1901
|
+
const existing = map.get(runId);
|
|
1902
|
+
if (!existing) {
|
|
1903
|
+
return [void 0, map];
|
|
1904
|
+
}
|
|
1905
|
+
const next = new Map(map);
|
|
1906
|
+
next.set(runId, updater(existing));
|
|
1907
|
+
return [void 0, next];
|
|
1908
|
+
}).pipe(effect.Effect.asVoid);
|
|
1864
1909
|
}
|
|
1865
1910
|
publishEvent(event) {
|
|
1866
1911
|
return effect.Effect.sync(() => {
|