@m4trix/evals 0.21.1 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,4 +1,4 @@
1
- import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
1
+ import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
2
2
  export { Schema as S } from 'effect';
3
3
  import { diffLines } from 'diff';
4
4
  import stringify from 'fast-json-stable-stringify';
@@ -1190,6 +1190,20 @@ function readOutput(testCase) {
1190
1190
  }
1191
1191
  return candidate.getOutput();
1192
1192
  }
1193
+ function buildEvaluationUnits(testCases) {
1194
+ const units = [];
1195
+ for (const testCaseItem of testCases) {
1196
+ const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1197
+ for (let r = 0; r < rerunTotal; r++) {
1198
+ units.push({
1199
+ testCaseItem,
1200
+ rerunIndex: r + 1,
1201
+ rerunTotal
1202
+ });
1203
+ }
1204
+ }
1205
+ return units;
1206
+ }
1193
1207
  function nowIsoForFile() {
1194
1208
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1195
1209
  }
@@ -1199,157 +1213,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1199
1213
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1200
1214
  );
1201
1215
  }
1202
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1216
+ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1217
+ const { testCaseItem, rerunIndex, rerunTotal } = unit;
1203
1218
  return Effect.gen(function* () {
1204
- const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1205
- const rerunPassed = [];
1206
- for (let r = 0; r < reruns; r++) {
1207
- const evaluatorRunId = `run-${randomUUID()}`;
1208
- const started = Date.now();
1209
- const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1210
- n + 1,
1211
- n + 1
1212
- ]);
1213
- yield* publishEvent({
1214
- type: "TestCaseStarted",
1215
- runId: task.runId,
1216
- testCaseId: testCaseItem.id,
1217
- testCaseName: testCaseItem.testCase.getName(),
1218
- startedTestCases: startedEvaluations,
1219
- totalTestCases: totalEvaluations,
1220
- rerunIndex: r + 1,
1221
- rerunTotal: reruns
1222
- });
1223
- const evaluatorScores = [];
1224
- let testCaseError;
1225
- const output = readOutput(testCaseItem.testCase);
1226
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
1227
- const evaluateFn = evaluator.getEvaluateFn();
1228
- if (!evaluateFn) {
1229
- continue;
1230
- }
1231
- const logs = [];
1232
- const logDiff = (expected, actual, options) => {
1233
- logs.push(createDiffLogEntry(expected, actual, options));
1234
- };
1235
- const log = (message, options) => {
1236
- logs.push(createLogEntry(message, options));
1237
- };
1238
- const createError = (message, options) => {
1239
- const entry = createLogEntry(message, options);
1240
- const error = message instanceof Error ? message : new Error(entry.message);
1241
- error[evaluatorErrorLogEntryKey] = entry;
1242
- return error;
1243
- };
1244
- try {
1245
- const ctx = yield* Effect.promise(
1246
- () => Promise.resolve(evaluator.resolveContext())
1247
- );
1248
- const result = yield* Effect.promise(
1249
- () => Promise.resolve().then(
1250
- () => evaluateFn({
1251
- input: testCaseItem.testCase.getInput(),
1252
- ctx,
1253
- output,
1254
- meta: {
1255
- triggerId: task.triggerId,
1256
- runId: evaluatorRunId,
1257
- datasetId: task.datasetId
1258
- },
1259
- logDiff,
1260
- log,
1261
- createError
1262
- })
1263
- )
1264
- );
1265
- if (result instanceof Error) {
1266
- const evaluatorError = result;
1267
- const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1268
- logs.push(taggedEntry ?? createLogEntry(result));
1269
- testCaseError = result.message;
1270
- evaluatorScores.push({
1271
- evaluatorId,
1272
- scores: [],
1273
- passed: false,
1274
- logs: logs.length > 0 ? logs : void 0
1275
- });
1276
- continue;
1277
- }
1278
- const { scores, metrics } = normalizeResult(result);
1279
- const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1280
- evaluatorScores.push({
1281
- evaluatorId,
1282
- scores,
1283
- passed: passed2,
1284
- metrics,
1285
- logs: logs.length > 0 ? logs : void 0
1286
- });
1287
- } catch (error) {
1288
- if (error instanceof Error) {
1289
- const taggedEntry = error[evaluatorErrorLogEntryKey];
1290
- logs.push(taggedEntry ?? createLogEntry(error));
1291
- }
1292
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1219
+ const evaluatorRunId = `run-${randomUUID()}`;
1220
+ const started = Date.now();
1221
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1222
+ n + 1,
1223
+ n + 1
1224
+ ]);
1225
+ yield* publishEvent({
1226
+ type: "TestCaseStarted",
1227
+ runId: task.runId,
1228
+ testCaseId: testCaseItem.id,
1229
+ testCaseName: testCaseItem.testCase.getName(),
1230
+ startedTestCases: startedEvaluations,
1231
+ totalTestCases: totalEvaluations,
1232
+ rerunIndex,
1233
+ rerunTotal
1234
+ });
1235
+ const evaluatorScores = [];
1236
+ let testCaseError;
1237
+ const output = readOutput(testCaseItem.testCase);
1238
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
1239
+ const evaluateFn = evaluator.getEvaluateFn();
1240
+ if (!evaluateFn) {
1241
+ continue;
1242
+ }
1243
+ const logs = [];
1244
+ const logDiff = (expected, actual, options) => {
1245
+ logs.push(createDiffLogEntry(expected, actual, options));
1246
+ };
1247
+ const log = (message, options) => {
1248
+ logs.push(createLogEntry(message, options));
1249
+ };
1250
+ const createError = (message, options) => {
1251
+ const entry = createLogEntry(message, options);
1252
+ const error = message instanceof Error ? message : new Error(entry.message);
1253
+ error[evaluatorErrorLogEntryKey] = entry;
1254
+ return error;
1255
+ };
1256
+ try {
1257
+ const ctx = yield* Effect.promise(
1258
+ () => Promise.resolve(evaluator.resolveContext())
1259
+ );
1260
+ const result = yield* Effect.promise(
1261
+ () => Promise.resolve().then(
1262
+ () => evaluateFn({
1263
+ input: testCaseItem.testCase.getInput(),
1264
+ ctx,
1265
+ output,
1266
+ meta: {
1267
+ triggerId: task.triggerId,
1268
+ runId: evaluatorRunId,
1269
+ datasetId: task.datasetId
1270
+ },
1271
+ logDiff,
1272
+ log,
1273
+ createError
1274
+ })
1275
+ )
1276
+ );
1277
+ if (result instanceof Error) {
1278
+ const evaluatorError = result;
1279
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1280
+ logs.push(taggedEntry ?? createLogEntry(result));
1281
+ testCaseError = result.message;
1293
1282
  evaluatorScores.push({
1294
1283
  evaluatorId,
1295
1284
  scores: [],
1296
1285
  passed: false,
1297
1286
  logs: logs.length > 0 ? logs : void 0
1298
1287
  });
1288
+ continue;
1289
+ }
1290
+ const { scores, metrics } = normalizeResult(result);
1291
+ const passed = computeEvaluatorPassed(evaluator, result, scores);
1292
+ evaluatorScores.push({
1293
+ evaluatorId,
1294
+ scores,
1295
+ passed,
1296
+ metrics,
1297
+ logs: logs.length > 0 ? logs : void 0
1298
+ });
1299
+ } catch (error) {
1300
+ if (error instanceof Error) {
1301
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1302
+ logs.push(taggedEntry ?? createLogEntry(error));
1299
1303
  }
1304
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1305
+ evaluatorScores.push({
1306
+ evaluatorId,
1307
+ scores: [],
1308
+ passed: false,
1309
+ logs: logs.length > 0 ? logs : void 0
1310
+ });
1300
1311
  }
1301
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1302
- rerunPassed.push(rerunPassedThis);
1303
- const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1304
- n + 1,
1305
- n + 1
1306
- ]);
1307
- const progressEvent = {
1308
- type: "TestCaseProgress",
1309
- runId: task.runId,
1310
- testCaseId: testCaseItem.id,
1311
- testCaseName: testCaseItem.testCase.getName(),
1312
- completedTestCases: completedEvaluations,
1313
- totalTestCases: totalEvaluations,
1314
- rerunIndex: r + 1,
1315
- rerunTotal: reruns,
1316
- passed: rerunPassedThis,
1317
- durationMs: Date.now() - started,
1318
- evaluatorScores,
1319
- output,
1320
- errorMessage: testCaseError
1321
- };
1322
- updateSnapshot(task.runId, (snapshot) => ({
1323
- ...snapshot,
1324
- completedTestCases: completedEvaluations
1325
- }));
1326
- yield* publishEvent(progressEvent);
1327
- yield* Queue.offer(persistenceQueue, {
1328
- runId: task.runId,
1329
- artifactPath: task.snapshot.artifactPath,
1330
- payload: progressEvent
1331
- });
1332
1312
  }
1333
- const testCasePassed = rerunPassed.every(Boolean);
1334
- if (testCasePassed) {
1335
- yield* Ref.update(passedRef, (n) => n + 1);
1336
- } else {
1337
- yield* Ref.update(failedRef, (n) => n + 1);
1338
- }
1339
- const [passed, failed] = yield* Effect.all([
1340
- Ref.get(passedRef),
1341
- Ref.get(failedRef)
1313
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1314
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1315
+ n + 1,
1316
+ n + 1
1342
1317
  ]);
1343
- updateSnapshot(task.runId, (snapshot) => ({
1318
+ const progressEvent = {
1319
+ type: "TestCaseProgress",
1320
+ runId: task.runId,
1321
+ testCaseId: testCaseItem.id,
1322
+ testCaseName: testCaseItem.testCase.getName(),
1323
+ completedTestCases: completedEvaluations,
1324
+ totalTestCases: totalEvaluations,
1325
+ rerunIndex,
1326
+ rerunTotal,
1327
+ passed: rerunPassedThis,
1328
+ durationMs: Date.now() - started,
1329
+ evaluatorScores,
1330
+ output,
1331
+ errorMessage: testCaseError
1332
+ };
1333
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1344
1334
  ...snapshot,
1345
- passedTestCases: passed,
1346
- failedTestCases: failed
1335
+ completedTestCases: completedEvaluations
1347
1336
  }));
1337
+ yield* publishEvent(progressEvent);
1338
+ yield* Queue.offer(persistenceQueue, {
1339
+ runId: task.runId,
1340
+ artifactPath: task.snapshot.artifactPath,
1341
+ payload: progressEvent
1342
+ });
1343
+ const testCaseCompleted = yield* Ref.modify(
1344
+ testCaseResultsRef,
1345
+ (map) => {
1346
+ const key = testCaseItem.id;
1347
+ const existing = map.get(key) ?? { completedCount: 0, results: [] };
1348
+ const newResults = [...existing.results, rerunPassedThis];
1349
+ const newCompletedCount = existing.completedCount + 1;
1350
+ const isLast = newCompletedCount === rerunTotal;
1351
+ const newMap = new Map(map);
1352
+ newMap.set(key, {
1353
+ completedCount: newCompletedCount,
1354
+ results: newResults
1355
+ });
1356
+ const outcome = isLast ? newResults.every(Boolean) : null;
1357
+ return [outcome, newMap];
1358
+ }
1359
+ );
1360
+ if (testCaseCompleted !== null) {
1361
+ if (testCaseCompleted) {
1362
+ yield* Ref.update(passedRef, (n) => n + 1);
1363
+ } else {
1364
+ yield* Ref.update(failedRef, (n) => n + 1);
1365
+ }
1366
+ const [passed, failed] = yield* Effect.all([
1367
+ Ref.get(passedRef),
1368
+ Ref.get(failedRef)
1369
+ ]);
1370
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1371
+ ...snapshot,
1372
+ passedTestCases: passed,
1373
+ failedTestCases: failed
1374
+ }));
1375
+ }
1348
1376
  });
1349
1377
  }
1350
1378
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
1351
1379
  const startedAt = Date.now();
1352
- updateSnapshot(task.runId, (snapshot) => ({
1380
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1353
1381
  ...snapshot,
1354
1382
  status: "running",
1355
1383
  startedAt
@@ -1368,9 +1396,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1368
1396
  const startedRef = yield* Ref.make(0);
1369
1397
  const passedRef = yield* Ref.make(0);
1370
1398
  const failedRef = yield* Ref.make(0);
1371
- const processTestCase = (testCaseItem) => processOneTestCase(
1399
+ const testCaseResultsRef = yield* Ref.make(
1400
+ /* @__PURE__ */ new Map()
1401
+ );
1402
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
1403
+ const processEvaluation = (unit) => processOneEvaluation(
1372
1404
  task,
1373
- testCaseItem,
1405
+ unit,
1374
1406
  totalEvaluations,
1375
1407
  publishEvent,
1376
1408
  persistenceQueue,
@@ -1378,11 +1410,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1378
1410
  startedRef,
1379
1411
  completedRef,
1380
1412
  passedRef,
1381
- failedRef
1413
+ failedRef,
1414
+ testCaseResultsRef
1382
1415
  );
1383
1416
  yield* Effect.forEach(
1384
- task.testCases,
1385
- processTestCase,
1417
+ evaluationUnits,
1418
+ processEvaluation,
1386
1419
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1387
1420
  );
1388
1421
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
@@ -1400,7 +1433,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1400
1433
  totalTestCases: task.testCases.length,
1401
1434
  artifactPath: task.snapshot.artifactPath
1402
1435
  };
1403
- updateSnapshot(task.runId, (snapshot) => ({
1436
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1404
1437
  ...snapshot,
1405
1438
  status: "completed",
1406
1439
  completedTestCases: completedEvaluations,
@@ -1653,7 +1686,9 @@ var EffectRunner = class {
1653
1686
  this.persistenceQueue = Effect.runSync(
1654
1687
  Queue.unbounded()
1655
1688
  );
1656
- this.snapshots = /* @__PURE__ */ new Map();
1689
+ this.snapshotsRef = Effect.runSync(
1690
+ Ref.make(/* @__PURE__ */ new Map())
1691
+ );
1657
1692
  this.listeners = /* @__PURE__ */ new Set();
1658
1693
  this.datasetsById = /* @__PURE__ */ new Map();
1659
1694
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -1756,7 +1791,13 @@ var EffectRunner = class {
1756
1791
  status: "queued",
1757
1792
  artifactPath
1758
1793
  };
1759
- this.snapshots.set(runId, snapshot);
1794
+ await Effect.runPromise(
1795
+ Ref.update(this.snapshotsRef, (map) => {
1796
+ const next = new Map(map);
1797
+ next.set(runId, snapshot);
1798
+ return next;
1799
+ })
1800
+ );
1760
1801
  const queuedEvent = {
1761
1802
  type: "RunQueued",
1762
1803
  runId,
@@ -1797,12 +1838,12 @@ var EffectRunner = class {
1797
1838
  };
1798
1839
  }
1799
1840
  getRunSnapshot(runId) {
1800
- return this.snapshots.get(runId);
1841
+ return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
1801
1842
  }
1802
1843
  getAllRunSnapshots() {
1803
- return Array.from(this.snapshots.values()).sort(
1804
- (a, b) => b.queuedAt - a.queuedAt
1805
- );
1844
+ return Array.from(
1845
+ Effect.runSync(Ref.get(this.snapshotsRef)).values()
1846
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
1806
1847
  }
1807
1848
  async loadRunSnapshotsFromArtifacts() {
1808
1849
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1831,11 +1872,15 @@ var EffectRunner = class {
1831
1872
  );
1832
1873
  }
1833
1874
  updateSnapshot(runId, updater) {
1834
- const existing = this.snapshots.get(runId);
1835
- if (!existing) {
1836
- return;
1837
- }
1838
- this.snapshots.set(runId, updater(existing));
1875
+ return Ref.modify(this.snapshotsRef, (map) => {
1876
+ const existing = map.get(runId);
1877
+ if (!existing) {
1878
+ return [void 0, map];
1879
+ }
1880
+ const next = new Map(map);
1881
+ next.set(runId, updater(existing));
1882
+ return [void 0, next];
1883
+ }).pipe(Effect.asVoid);
1839
1884
  }
1840
1885
  publishEvent(event) {
1841
1886
  return Effect.sync(() => {