@m4trix/evals 0.21.1 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1215,6 +1215,20 @@ function readOutput(testCase) {
1215
1215
  }
1216
1216
  return candidate.getOutput();
1217
1217
  }
1218
+ function buildEvaluationUnits(testCases) {
1219
+ const units = [];
1220
+ for (const testCaseItem of testCases) {
1221
+ const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1222
+ for (let r = 0; r < rerunTotal; r++) {
1223
+ units.push({
1224
+ testCaseItem,
1225
+ rerunIndex: r + 1,
1226
+ rerunTotal
1227
+ });
1228
+ }
1229
+ }
1230
+ return units;
1231
+ }
1218
1232
  function nowIsoForFile() {
1219
1233
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1220
1234
  }
@@ -1224,157 +1238,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1224
1238
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1225
1239
  );
1226
1240
  }
1227
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1241
+ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1242
+ const { testCaseItem, rerunIndex, rerunTotal } = unit;
1228
1243
  return effect.Effect.gen(function* () {
1229
- const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1230
- const rerunPassed = [];
1231
- for (let r = 0; r < reruns; r++) {
1232
- const evaluatorRunId = `run-${crypto.randomUUID()}`;
1233
- const started = Date.now();
1234
- const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
1235
- n + 1,
1236
- n + 1
1237
- ]);
1238
- yield* publishEvent({
1239
- type: "TestCaseStarted",
1240
- runId: task.runId,
1241
- testCaseId: testCaseItem.id,
1242
- testCaseName: testCaseItem.testCase.getName(),
1243
- startedTestCases: startedEvaluations,
1244
- totalTestCases: totalEvaluations,
1245
- rerunIndex: r + 1,
1246
- rerunTotal: reruns
1247
- });
1248
- const evaluatorScores = [];
1249
- let testCaseError;
1250
- const output = readOutput(testCaseItem.testCase);
1251
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
1252
- const evaluateFn = evaluator.getEvaluateFn();
1253
- if (!evaluateFn) {
1254
- continue;
1255
- }
1256
- const logs = [];
1257
- const logDiff = (expected, actual, options) => {
1258
- logs.push(createDiffLogEntry(expected, actual, options));
1259
- };
1260
- const log = (message, options) => {
1261
- logs.push(createLogEntry(message, options));
1262
- };
1263
- const createError = (message, options) => {
1264
- const entry = createLogEntry(message, options);
1265
- const error = message instanceof Error ? message : new Error(entry.message);
1266
- error[evaluatorErrorLogEntryKey] = entry;
1267
- return error;
1268
- };
1269
- try {
1270
- const ctx = yield* effect.Effect.promise(
1271
- () => Promise.resolve(evaluator.resolveContext())
1272
- );
1273
- const result = yield* effect.Effect.promise(
1274
- () => Promise.resolve().then(
1275
- () => evaluateFn({
1276
- input: testCaseItem.testCase.getInput(),
1277
- ctx,
1278
- output,
1279
- meta: {
1280
- triggerId: task.triggerId,
1281
- runId: evaluatorRunId,
1282
- datasetId: task.datasetId
1283
- },
1284
- logDiff,
1285
- log,
1286
- createError
1287
- })
1288
- )
1289
- );
1290
- if (result instanceof Error) {
1291
- const evaluatorError = result;
1292
- const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1293
- logs.push(taggedEntry ?? createLogEntry(result));
1294
- testCaseError = result.message;
1295
- evaluatorScores.push({
1296
- evaluatorId,
1297
- scores: [],
1298
- passed: false,
1299
- logs: logs.length > 0 ? logs : void 0
1300
- });
1301
- continue;
1302
- }
1303
- const { scores, metrics } = normalizeResult(result);
1304
- const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1305
- evaluatorScores.push({
1306
- evaluatorId,
1307
- scores,
1308
- passed: passed2,
1309
- metrics,
1310
- logs: logs.length > 0 ? logs : void 0
1311
- });
1312
- } catch (error) {
1313
- if (error instanceof Error) {
1314
- const taggedEntry = error[evaluatorErrorLogEntryKey];
1315
- logs.push(taggedEntry ?? createLogEntry(error));
1316
- }
1317
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1244
+ const evaluatorRunId = `run-${crypto.randomUUID()}`;
1245
+ const started = Date.now();
1246
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
1247
+ n + 1,
1248
+ n + 1
1249
+ ]);
1250
+ yield* publishEvent({
1251
+ type: "TestCaseStarted",
1252
+ runId: task.runId,
1253
+ testCaseId: testCaseItem.id,
1254
+ testCaseName: testCaseItem.testCase.getName(),
1255
+ startedTestCases: startedEvaluations,
1256
+ totalTestCases: totalEvaluations,
1257
+ rerunIndex,
1258
+ rerunTotal
1259
+ });
1260
+ const evaluatorScores = [];
1261
+ let testCaseError;
1262
+ const output = readOutput(testCaseItem.testCase);
1263
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
1264
+ const evaluateFn = evaluator.getEvaluateFn();
1265
+ if (!evaluateFn) {
1266
+ continue;
1267
+ }
1268
+ const logs = [];
1269
+ const logDiff = (expected, actual, options) => {
1270
+ logs.push(createDiffLogEntry(expected, actual, options));
1271
+ };
1272
+ const log = (message, options) => {
1273
+ logs.push(createLogEntry(message, options));
1274
+ };
1275
+ const createError = (message, options) => {
1276
+ const entry = createLogEntry(message, options);
1277
+ const error = message instanceof Error ? message : new Error(entry.message);
1278
+ error[evaluatorErrorLogEntryKey] = entry;
1279
+ return error;
1280
+ };
1281
+ try {
1282
+ const ctx = yield* effect.Effect.promise(
1283
+ () => Promise.resolve(evaluator.resolveContext())
1284
+ );
1285
+ const result = yield* effect.Effect.promise(
1286
+ () => Promise.resolve().then(
1287
+ () => evaluateFn({
1288
+ input: testCaseItem.testCase.getInput(),
1289
+ ctx,
1290
+ output,
1291
+ meta: {
1292
+ triggerId: task.triggerId,
1293
+ runId: evaluatorRunId,
1294
+ datasetId: task.datasetId
1295
+ },
1296
+ logDiff,
1297
+ log,
1298
+ createError
1299
+ })
1300
+ )
1301
+ );
1302
+ if (result instanceof Error) {
1303
+ const evaluatorError = result;
1304
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1305
+ logs.push(taggedEntry ?? createLogEntry(result));
1306
+ testCaseError = result.message;
1318
1307
  evaluatorScores.push({
1319
1308
  evaluatorId,
1320
1309
  scores: [],
1321
1310
  passed: false,
1322
1311
  logs: logs.length > 0 ? logs : void 0
1323
1312
  });
1313
+ continue;
1324
1314
  }
1315
+ const { scores, metrics } = normalizeResult(result);
1316
+ const passed = computeEvaluatorPassed(evaluator, result, scores);
1317
+ evaluatorScores.push({
1318
+ evaluatorId,
1319
+ scores,
1320
+ passed,
1321
+ metrics,
1322
+ logs: logs.length > 0 ? logs : void 0
1323
+ });
1324
+ } catch (error) {
1325
+ if (error instanceof Error) {
1326
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1327
+ logs.push(taggedEntry ?? createLogEntry(error));
1328
+ }
1329
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1330
+ evaluatorScores.push({
1331
+ evaluatorId,
1332
+ scores: [],
1333
+ passed: false,
1334
+ logs: logs.length > 0 ? logs : void 0
1335
+ });
1325
1336
  }
1326
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1327
- rerunPassed.push(rerunPassedThis);
1328
- const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
1329
- n + 1,
1330
- n + 1
1331
- ]);
1332
- const progressEvent = {
1333
- type: "TestCaseProgress",
1334
- runId: task.runId,
1335
- testCaseId: testCaseItem.id,
1336
- testCaseName: testCaseItem.testCase.getName(),
1337
- completedTestCases: completedEvaluations,
1338
- totalTestCases: totalEvaluations,
1339
- rerunIndex: r + 1,
1340
- rerunTotal: reruns,
1341
- passed: rerunPassedThis,
1342
- durationMs: Date.now() - started,
1343
- evaluatorScores,
1344
- output,
1345
- errorMessage: testCaseError
1346
- };
1347
- updateSnapshot(task.runId, (snapshot) => ({
1348
- ...snapshot,
1349
- completedTestCases: completedEvaluations
1350
- }));
1351
- yield* publishEvent(progressEvent);
1352
- yield* effect.Queue.offer(persistenceQueue, {
1353
- runId: task.runId,
1354
- artifactPath: task.snapshot.artifactPath,
1355
- payload: progressEvent
1356
- });
1357
- }
1358
- const testCasePassed = rerunPassed.every(Boolean);
1359
- if (testCasePassed) {
1360
- yield* effect.Ref.update(passedRef, (n) => n + 1);
1361
- } else {
1362
- yield* effect.Ref.update(failedRef, (n) => n + 1);
1363
1337
  }
1364
- const [passed, failed] = yield* effect.Effect.all([
1365
- effect.Ref.get(passedRef),
1366
- effect.Ref.get(failedRef)
1338
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1339
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
1340
+ n + 1,
1341
+ n + 1
1367
1342
  ]);
1368
- updateSnapshot(task.runId, (snapshot) => ({
1343
+ const progressEvent = {
1344
+ type: "TestCaseProgress",
1345
+ runId: task.runId,
1346
+ testCaseId: testCaseItem.id,
1347
+ testCaseName: testCaseItem.testCase.getName(),
1348
+ completedTestCases: completedEvaluations,
1349
+ totalTestCases: totalEvaluations,
1350
+ rerunIndex,
1351
+ rerunTotal,
1352
+ passed: rerunPassedThis,
1353
+ durationMs: Date.now() - started,
1354
+ evaluatorScores,
1355
+ output,
1356
+ errorMessage: testCaseError
1357
+ };
1358
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1369
1359
  ...snapshot,
1370
- passedTestCases: passed,
1371
- failedTestCases: failed
1360
+ completedTestCases: completedEvaluations
1372
1361
  }));
1362
+ yield* publishEvent(progressEvent);
1363
+ yield* effect.Queue.offer(persistenceQueue, {
1364
+ runId: task.runId,
1365
+ artifactPath: task.snapshot.artifactPath,
1366
+ payload: progressEvent
1367
+ });
1368
+ const testCaseCompleted = yield* effect.Ref.modify(
1369
+ testCaseResultsRef,
1370
+ (map) => {
1371
+ const key = testCaseItem.id;
1372
+ const existing = map.get(key) ?? { completedCount: 0, results: [] };
1373
+ const newResults = [...existing.results, rerunPassedThis];
1374
+ const newCompletedCount = existing.completedCount + 1;
1375
+ const isLast = newCompletedCount === rerunTotal;
1376
+ const newMap = new Map(map);
1377
+ newMap.set(key, {
1378
+ completedCount: newCompletedCount,
1379
+ results: newResults
1380
+ });
1381
+ const outcome = isLast ? newResults.every(Boolean) : null;
1382
+ return [outcome, newMap];
1383
+ }
1384
+ );
1385
+ if (testCaseCompleted !== null) {
1386
+ if (testCaseCompleted) {
1387
+ yield* effect.Ref.update(passedRef, (n) => n + 1);
1388
+ } else {
1389
+ yield* effect.Ref.update(failedRef, (n) => n + 1);
1390
+ }
1391
+ const [passed, failed] = yield* effect.Effect.all([
1392
+ effect.Ref.get(passedRef),
1393
+ effect.Ref.get(failedRef)
1394
+ ]);
1395
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1396
+ ...snapshot,
1397
+ passedTestCases: passed,
1398
+ failedTestCases: failed
1399
+ }));
1400
+ }
1373
1401
  });
1374
1402
  }
1375
1403
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
1376
1404
  const startedAt = Date.now();
1377
- updateSnapshot(task.runId, (snapshot) => ({
1405
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1378
1406
  ...snapshot,
1379
1407
  status: "running",
1380
1408
  startedAt
@@ -1393,9 +1421,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1393
1421
  const startedRef = yield* effect.Ref.make(0);
1394
1422
  const passedRef = yield* effect.Ref.make(0);
1395
1423
  const failedRef = yield* effect.Ref.make(0);
1396
- const processTestCase = (testCaseItem) => processOneTestCase(
1424
+ const testCaseResultsRef = yield* effect.Ref.make(
1425
+ /* @__PURE__ */ new Map()
1426
+ );
1427
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
1428
+ const processEvaluation = (unit) => processOneEvaluation(
1397
1429
  task,
1398
- testCaseItem,
1430
+ unit,
1399
1431
  totalEvaluations,
1400
1432
  publishEvent,
1401
1433
  persistenceQueue,
@@ -1403,11 +1435,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1403
1435
  startedRef,
1404
1436
  completedRef,
1405
1437
  passedRef,
1406
- failedRef
1438
+ failedRef,
1439
+ testCaseResultsRef
1407
1440
  );
1408
1441
  yield* effect.Effect.forEach(
1409
- task.testCases,
1410
- processTestCase,
1442
+ evaluationUnits,
1443
+ processEvaluation,
1411
1444
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1412
1445
  );
1413
1446
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
@@ -1425,7 +1458,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1425
1458
  totalTestCases: task.testCases.length,
1426
1459
  artifactPath: task.snapshot.artifactPath
1427
1460
  };
1428
- updateSnapshot(task.runId, (snapshot) => ({
1461
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1429
1462
  ...snapshot,
1430
1463
  status: "completed",
1431
1464
  completedTestCases: completedEvaluations,
@@ -1678,7 +1711,9 @@ var EffectRunner = class {
1678
1711
  this.persistenceQueue = effect.Effect.runSync(
1679
1712
  effect.Queue.unbounded()
1680
1713
  );
1681
- this.snapshots = /* @__PURE__ */ new Map();
1714
+ this.snapshotsRef = effect.Effect.runSync(
1715
+ effect.Ref.make(/* @__PURE__ */ new Map())
1716
+ );
1682
1717
  this.listeners = /* @__PURE__ */ new Set();
1683
1718
  this.datasetsById = /* @__PURE__ */ new Map();
1684
1719
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -1781,7 +1816,13 @@ var EffectRunner = class {
1781
1816
  status: "queued",
1782
1817
  artifactPath
1783
1818
  };
1784
- this.snapshots.set(runId, snapshot);
1819
+ await effect.Effect.runPromise(
1820
+ effect.Ref.update(this.snapshotsRef, (map) => {
1821
+ const next = new Map(map);
1822
+ next.set(runId, snapshot);
1823
+ return next;
1824
+ })
1825
+ );
1785
1826
  const queuedEvent = {
1786
1827
  type: "RunQueued",
1787
1828
  runId,
@@ -1822,12 +1863,12 @@ var EffectRunner = class {
1822
1863
  };
1823
1864
  }
1824
1865
  getRunSnapshot(runId) {
1825
- return this.snapshots.get(runId);
1866
+ return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
1826
1867
  }
1827
1868
  getAllRunSnapshots() {
1828
- return Array.from(this.snapshots.values()).sort(
1829
- (a, b) => b.queuedAt - a.queuedAt
1830
- );
1869
+ return Array.from(
1870
+ effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
1871
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
1831
1872
  }
1832
1873
  async loadRunSnapshotsFromArtifacts() {
1833
1874
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1856,11 +1897,15 @@ var EffectRunner = class {
1856
1897
  );
1857
1898
  }
1858
1899
  updateSnapshot(runId, updater) {
1859
- const existing = this.snapshots.get(runId);
1860
- if (!existing) {
1861
- return;
1862
- }
1863
- this.snapshots.set(runId, updater(existing));
1900
+ return effect.Ref.modify(this.snapshotsRef, (map) => {
1901
+ const existing = map.get(runId);
1902
+ if (!existing) {
1903
+ return [void 0, map];
1904
+ }
1905
+ const next = new Map(map);
1906
+ next.set(runId, updater(existing));
1907
+ return [void 0, next];
1908
+ }).pipe(effect.Effect.asVoid);
1864
1909
  }
1865
1910
  publishEvent(event) {
1866
1911
  return effect.Effect.sync(() => {