@m4trix/evals 0.21.1 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +196 -151
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +197 -152
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +196 -151
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +197 -152
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +196 -151
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +197 -152
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Effect, PubSub, Queue,
|
|
1
|
+
import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
|
|
2
2
|
export { Schema as S } from 'effect';
|
|
3
3
|
import { diffLines } from 'diff';
|
|
4
4
|
import stringify from 'fast-json-stable-stringify';
|
|
@@ -1190,6 +1190,20 @@ function readOutput(testCase) {
|
|
|
1190
1190
|
}
|
|
1191
1191
|
return candidate.getOutput();
|
|
1192
1192
|
}
|
|
1193
|
+
function buildEvaluationUnits(testCases) {
|
|
1194
|
+
const units = [];
|
|
1195
|
+
for (const testCaseItem of testCases) {
|
|
1196
|
+
const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1197
|
+
for (let r = 0; r < rerunTotal; r++) {
|
|
1198
|
+
units.push({
|
|
1199
|
+
testCaseItem,
|
|
1200
|
+
rerunIndex: r + 1,
|
|
1201
|
+
rerunTotal
|
|
1202
|
+
});
|
|
1203
|
+
}
|
|
1204
|
+
}
|
|
1205
|
+
return units;
|
|
1206
|
+
}
|
|
1193
1207
|
function nowIsoForFile() {
|
|
1194
1208
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1195
1209
|
}
|
|
@@ -1199,157 +1213,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1199
1213
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1200
1214
|
);
|
|
1201
1215
|
}
|
|
1202
|
-
function
|
|
1216
|
+
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1217
|
+
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
1203
1218
|
return Effect.gen(function* () {
|
|
1204
|
-
const
|
|
1205
|
-
const
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1268
|
-
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1269
|
-
testCaseError = result.message;
|
|
1270
|
-
evaluatorScores.push({
|
|
1271
|
-
evaluatorId,
|
|
1272
|
-
scores: [],
|
|
1273
|
-
passed: false,
|
|
1274
|
-
logs: logs.length > 0 ? logs : void 0
|
|
1275
|
-
});
|
|
1276
|
-
continue;
|
|
1277
|
-
}
|
|
1278
|
-
const { scores, metrics } = normalizeResult(result);
|
|
1279
|
-
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1280
|
-
evaluatorScores.push({
|
|
1281
|
-
evaluatorId,
|
|
1282
|
-
scores,
|
|
1283
|
-
passed: passed2,
|
|
1284
|
-
metrics,
|
|
1285
|
-
logs: logs.length > 0 ? logs : void 0
|
|
1286
|
-
});
|
|
1287
|
-
} catch (error) {
|
|
1288
|
-
if (error instanceof Error) {
|
|
1289
|
-
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1290
|
-
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1291
|
-
}
|
|
1292
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1219
|
+
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1220
|
+
const started = Date.now();
|
|
1221
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
1222
|
+
n + 1,
|
|
1223
|
+
n + 1
|
|
1224
|
+
]);
|
|
1225
|
+
yield* publishEvent({
|
|
1226
|
+
type: "TestCaseStarted",
|
|
1227
|
+
runId: task.runId,
|
|
1228
|
+
testCaseId: testCaseItem.id,
|
|
1229
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1230
|
+
startedTestCases: startedEvaluations,
|
|
1231
|
+
totalTestCases: totalEvaluations,
|
|
1232
|
+
rerunIndex,
|
|
1233
|
+
rerunTotal
|
|
1234
|
+
});
|
|
1235
|
+
const evaluatorScores = [];
|
|
1236
|
+
let testCaseError;
|
|
1237
|
+
const output = readOutput(testCaseItem.testCase);
|
|
1238
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
1239
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
1240
|
+
if (!evaluateFn) {
|
|
1241
|
+
continue;
|
|
1242
|
+
}
|
|
1243
|
+
const logs = [];
|
|
1244
|
+
const logDiff = (expected, actual, options) => {
|
|
1245
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1246
|
+
};
|
|
1247
|
+
const log = (message, options) => {
|
|
1248
|
+
logs.push(createLogEntry(message, options));
|
|
1249
|
+
};
|
|
1250
|
+
const createError = (message, options) => {
|
|
1251
|
+
const entry = createLogEntry(message, options);
|
|
1252
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1253
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1254
|
+
return error;
|
|
1255
|
+
};
|
|
1256
|
+
try {
|
|
1257
|
+
const ctx = yield* Effect.promise(
|
|
1258
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
1259
|
+
);
|
|
1260
|
+
const result = yield* Effect.promise(
|
|
1261
|
+
() => Promise.resolve().then(
|
|
1262
|
+
() => evaluateFn({
|
|
1263
|
+
input: testCaseItem.testCase.getInput(),
|
|
1264
|
+
ctx,
|
|
1265
|
+
output,
|
|
1266
|
+
meta: {
|
|
1267
|
+
triggerId: task.triggerId,
|
|
1268
|
+
runId: evaluatorRunId,
|
|
1269
|
+
datasetId: task.datasetId
|
|
1270
|
+
},
|
|
1271
|
+
logDiff,
|
|
1272
|
+
log,
|
|
1273
|
+
createError
|
|
1274
|
+
})
|
|
1275
|
+
)
|
|
1276
|
+
);
|
|
1277
|
+
if (result instanceof Error) {
|
|
1278
|
+
const evaluatorError = result;
|
|
1279
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1280
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1281
|
+
testCaseError = result.message;
|
|
1293
1282
|
evaluatorScores.push({
|
|
1294
1283
|
evaluatorId,
|
|
1295
1284
|
scores: [],
|
|
1296
1285
|
passed: false,
|
|
1297
1286
|
logs: logs.length > 0 ? logs : void 0
|
|
1298
1287
|
});
|
|
1288
|
+
continue;
|
|
1289
|
+
}
|
|
1290
|
+
const { scores, metrics } = normalizeResult(result);
|
|
1291
|
+
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
1292
|
+
evaluatorScores.push({
|
|
1293
|
+
evaluatorId,
|
|
1294
|
+
scores,
|
|
1295
|
+
passed,
|
|
1296
|
+
metrics,
|
|
1297
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1298
|
+
});
|
|
1299
|
+
} catch (error) {
|
|
1300
|
+
if (error instanceof Error) {
|
|
1301
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1302
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1299
1303
|
}
|
|
1304
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1305
|
+
evaluatorScores.push({
|
|
1306
|
+
evaluatorId,
|
|
1307
|
+
scores: [],
|
|
1308
|
+
passed: false,
|
|
1309
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1310
|
+
});
|
|
1300
1311
|
}
|
|
1301
|
-
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1302
|
-
rerunPassed.push(rerunPassedThis);
|
|
1303
|
-
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
1304
|
-
n + 1,
|
|
1305
|
-
n + 1
|
|
1306
|
-
]);
|
|
1307
|
-
const progressEvent = {
|
|
1308
|
-
type: "TestCaseProgress",
|
|
1309
|
-
runId: task.runId,
|
|
1310
|
-
testCaseId: testCaseItem.id,
|
|
1311
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
1312
|
-
completedTestCases: completedEvaluations,
|
|
1313
|
-
totalTestCases: totalEvaluations,
|
|
1314
|
-
rerunIndex: r + 1,
|
|
1315
|
-
rerunTotal: reruns,
|
|
1316
|
-
passed: rerunPassedThis,
|
|
1317
|
-
durationMs: Date.now() - started,
|
|
1318
|
-
evaluatorScores,
|
|
1319
|
-
output,
|
|
1320
|
-
errorMessage: testCaseError
|
|
1321
|
-
};
|
|
1322
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1323
|
-
...snapshot,
|
|
1324
|
-
completedTestCases: completedEvaluations
|
|
1325
|
-
}));
|
|
1326
|
-
yield* publishEvent(progressEvent);
|
|
1327
|
-
yield* Queue.offer(persistenceQueue, {
|
|
1328
|
-
runId: task.runId,
|
|
1329
|
-
artifactPath: task.snapshot.artifactPath,
|
|
1330
|
-
payload: progressEvent
|
|
1331
|
-
});
|
|
1332
1312
|
}
|
|
1333
|
-
const
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
yield* Ref.update(failedRef, (n) => n + 1);
|
|
1338
|
-
}
|
|
1339
|
-
const [passed, failed] = yield* Effect.all([
|
|
1340
|
-
Ref.get(passedRef),
|
|
1341
|
-
Ref.get(failedRef)
|
|
1313
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1314
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
1315
|
+
n + 1,
|
|
1316
|
+
n + 1
|
|
1342
1317
|
]);
|
|
1343
|
-
|
|
1318
|
+
const progressEvent = {
|
|
1319
|
+
type: "TestCaseProgress",
|
|
1320
|
+
runId: task.runId,
|
|
1321
|
+
testCaseId: testCaseItem.id,
|
|
1322
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1323
|
+
completedTestCases: completedEvaluations,
|
|
1324
|
+
totalTestCases: totalEvaluations,
|
|
1325
|
+
rerunIndex,
|
|
1326
|
+
rerunTotal,
|
|
1327
|
+
passed: rerunPassedThis,
|
|
1328
|
+
durationMs: Date.now() - started,
|
|
1329
|
+
evaluatorScores,
|
|
1330
|
+
output,
|
|
1331
|
+
errorMessage: testCaseError
|
|
1332
|
+
};
|
|
1333
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1344
1334
|
...snapshot,
|
|
1345
|
-
|
|
1346
|
-
failedTestCases: failed
|
|
1335
|
+
completedTestCases: completedEvaluations
|
|
1347
1336
|
}));
|
|
1337
|
+
yield* publishEvent(progressEvent);
|
|
1338
|
+
yield* Queue.offer(persistenceQueue, {
|
|
1339
|
+
runId: task.runId,
|
|
1340
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1341
|
+
payload: progressEvent
|
|
1342
|
+
});
|
|
1343
|
+
const testCaseCompleted = yield* Ref.modify(
|
|
1344
|
+
testCaseResultsRef,
|
|
1345
|
+
(map) => {
|
|
1346
|
+
const key = testCaseItem.id;
|
|
1347
|
+
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1348
|
+
const newResults = [...existing.results, rerunPassedThis];
|
|
1349
|
+
const newCompletedCount = existing.completedCount + 1;
|
|
1350
|
+
const isLast = newCompletedCount === rerunTotal;
|
|
1351
|
+
const newMap = new Map(map);
|
|
1352
|
+
newMap.set(key, {
|
|
1353
|
+
completedCount: newCompletedCount,
|
|
1354
|
+
results: newResults
|
|
1355
|
+
});
|
|
1356
|
+
const outcome = isLast ? newResults.every(Boolean) : null;
|
|
1357
|
+
return [outcome, newMap];
|
|
1358
|
+
}
|
|
1359
|
+
);
|
|
1360
|
+
if (testCaseCompleted !== null) {
|
|
1361
|
+
if (testCaseCompleted) {
|
|
1362
|
+
yield* Ref.update(passedRef, (n) => n + 1);
|
|
1363
|
+
} else {
|
|
1364
|
+
yield* Ref.update(failedRef, (n) => n + 1);
|
|
1365
|
+
}
|
|
1366
|
+
const [passed, failed] = yield* Effect.all([
|
|
1367
|
+
Ref.get(passedRef),
|
|
1368
|
+
Ref.get(failedRef)
|
|
1369
|
+
]);
|
|
1370
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1371
|
+
...snapshot,
|
|
1372
|
+
passedTestCases: passed,
|
|
1373
|
+
failedTestCases: failed
|
|
1374
|
+
}));
|
|
1375
|
+
}
|
|
1348
1376
|
});
|
|
1349
1377
|
}
|
|
1350
1378
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
|
|
1351
1379
|
const startedAt = Date.now();
|
|
1352
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1380
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1353
1381
|
...snapshot,
|
|
1354
1382
|
status: "running",
|
|
1355
1383
|
startedAt
|
|
@@ -1368,9 +1396,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1368
1396
|
const startedRef = yield* Ref.make(0);
|
|
1369
1397
|
const passedRef = yield* Ref.make(0);
|
|
1370
1398
|
const failedRef = yield* Ref.make(0);
|
|
1371
|
-
const
|
|
1399
|
+
const testCaseResultsRef = yield* Ref.make(
|
|
1400
|
+
/* @__PURE__ */ new Map()
|
|
1401
|
+
);
|
|
1402
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1403
|
+
const processEvaluation = (unit) => processOneEvaluation(
|
|
1372
1404
|
task,
|
|
1373
|
-
|
|
1405
|
+
unit,
|
|
1374
1406
|
totalEvaluations,
|
|
1375
1407
|
publishEvent,
|
|
1376
1408
|
persistenceQueue,
|
|
@@ -1378,11 +1410,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1378
1410
|
startedRef,
|
|
1379
1411
|
completedRef,
|
|
1380
1412
|
passedRef,
|
|
1381
|
-
failedRef
|
|
1413
|
+
failedRef,
|
|
1414
|
+
testCaseResultsRef
|
|
1382
1415
|
);
|
|
1383
1416
|
yield* Effect.forEach(
|
|
1384
|
-
|
|
1385
|
-
|
|
1417
|
+
evaluationUnits,
|
|
1418
|
+
processEvaluation,
|
|
1386
1419
|
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1387
1420
|
);
|
|
1388
1421
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
@@ -1400,7 +1433,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1400
1433
|
totalTestCases: task.testCases.length,
|
|
1401
1434
|
artifactPath: task.snapshot.artifactPath
|
|
1402
1435
|
};
|
|
1403
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1436
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1404
1437
|
...snapshot,
|
|
1405
1438
|
status: "completed",
|
|
1406
1439
|
completedTestCases: completedEvaluations,
|
|
@@ -1653,7 +1686,9 @@ var EffectRunner = class {
|
|
|
1653
1686
|
this.persistenceQueue = Effect.runSync(
|
|
1654
1687
|
Queue.unbounded()
|
|
1655
1688
|
);
|
|
1656
|
-
this.
|
|
1689
|
+
this.snapshotsRef = Effect.runSync(
|
|
1690
|
+
Ref.make(/* @__PURE__ */ new Map())
|
|
1691
|
+
);
|
|
1657
1692
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1658
1693
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1659
1694
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
@@ -1756,7 +1791,13 @@ var EffectRunner = class {
|
|
|
1756
1791
|
status: "queued",
|
|
1757
1792
|
artifactPath
|
|
1758
1793
|
};
|
|
1759
|
-
|
|
1794
|
+
await Effect.runPromise(
|
|
1795
|
+
Ref.update(this.snapshotsRef, (map) => {
|
|
1796
|
+
const next = new Map(map);
|
|
1797
|
+
next.set(runId, snapshot);
|
|
1798
|
+
return next;
|
|
1799
|
+
})
|
|
1800
|
+
);
|
|
1760
1801
|
const queuedEvent = {
|
|
1761
1802
|
type: "RunQueued",
|
|
1762
1803
|
runId,
|
|
@@ -1797,12 +1838,12 @@ var EffectRunner = class {
|
|
|
1797
1838
|
};
|
|
1798
1839
|
}
|
|
1799
1840
|
getRunSnapshot(runId) {
|
|
1800
|
-
return this.
|
|
1841
|
+
return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
|
|
1801
1842
|
}
|
|
1802
1843
|
getAllRunSnapshots() {
|
|
1803
|
-
return Array.from(
|
|
1804
|
-
(
|
|
1805
|
-
);
|
|
1844
|
+
return Array.from(
|
|
1845
|
+
Effect.runSync(Ref.get(this.snapshotsRef)).values()
|
|
1846
|
+
).sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1806
1847
|
}
|
|
1807
1848
|
async loadRunSnapshotsFromArtifacts() {
|
|
1808
1849
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1831,11 +1872,15 @@ var EffectRunner = class {
|
|
|
1831
1872
|
);
|
|
1832
1873
|
}
|
|
1833
1874
|
updateSnapshot(runId, updater) {
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
|
|
1838
|
-
|
|
1875
|
+
return Ref.modify(this.snapshotsRef, (map) => {
|
|
1876
|
+
const existing = map.get(runId);
|
|
1877
|
+
if (!existing) {
|
|
1878
|
+
return [void 0, map];
|
|
1879
|
+
}
|
|
1880
|
+
const next = new Map(map);
|
|
1881
|
+
next.set(runId, updater(existing));
|
|
1882
|
+
return [void 0, next];
|
|
1883
|
+
}).pipe(Effect.asVoid);
|
|
1839
1884
|
}
|
|
1840
1885
|
publishEvent(event) {
|
|
1841
1886
|
return Effect.sync(() => {
|