@m4trix/evals 0.21.1 → 0.22.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -1443,6 +1443,20 @@ function readOutput(testCase) {
1443
1443
  }
1444
1444
  return candidate.getOutput();
1445
1445
  }
1446
+ function buildEvaluationUnits(testCases) {
1447
+ const units = [];
1448
+ for (const testCaseItem of testCases) {
1449
+ const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1450
+ for (let r = 0; r < rerunTotal; r++) {
1451
+ units.push({
1452
+ testCaseItem,
1453
+ rerunIndex: r + 1,
1454
+ rerunTotal
1455
+ });
1456
+ }
1457
+ }
1458
+ return units;
1459
+ }
1446
1460
  function nowIsoForFile() {
1447
1461
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1448
1462
  }
@@ -1452,157 +1466,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1452
1466
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1453
1467
  );
1454
1468
  }
1455
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1469
+ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1470
+ const { testCaseItem, rerunIndex, rerunTotal } = unit;
1456
1471
  return effect.Effect.gen(function* () {
1457
- const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1458
- const rerunPassed = [];
1459
- for (let r = 0; r < reruns; r++) {
1460
- const evaluatorRunId = `run-${crypto.randomUUID()}`;
1461
- const started = Date.now();
1462
- const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
1463
- n + 1,
1464
- n + 1
1465
- ]);
1466
- yield* publishEvent({
1467
- type: "TestCaseStarted",
1468
- runId: task.runId,
1469
- testCaseId: testCaseItem.id,
1470
- testCaseName: testCaseItem.testCase.getName(),
1471
- startedTestCases: startedEvaluations,
1472
- totalTestCases: totalEvaluations,
1473
- rerunIndex: r + 1,
1474
- rerunTotal: reruns
1475
- });
1476
- const evaluatorScores = [];
1477
- let testCaseError;
1478
- const output = readOutput(testCaseItem.testCase);
1479
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
1480
- const evaluateFn = evaluator.getEvaluateFn();
1481
- if (!evaluateFn) {
1482
- continue;
1483
- }
1484
- const logs = [];
1485
- const logDiff = (expected, actual, options) => {
1486
- logs.push(createDiffLogEntry(expected, actual, options));
1487
- };
1488
- const log = (message, options) => {
1489
- logs.push(createLogEntry(message, options));
1490
- };
1491
- const createError = (message, options) => {
1492
- const entry = createLogEntry(message, options);
1493
- const error = message instanceof Error ? message : new Error(entry.message);
1494
- error[evaluatorErrorLogEntryKey] = entry;
1495
- return error;
1496
- };
1497
- try {
1498
- const ctx = yield* effect.Effect.promise(
1499
- () => Promise.resolve(evaluator.resolveContext())
1500
- );
1501
- const result = yield* effect.Effect.promise(
1502
- () => Promise.resolve().then(
1503
- () => evaluateFn({
1504
- input: testCaseItem.testCase.getInput(),
1505
- ctx,
1506
- output,
1507
- meta: {
1508
- triggerId: task.triggerId,
1509
- runId: evaluatorRunId,
1510
- datasetId: task.datasetId
1511
- },
1512
- logDiff,
1513
- log,
1514
- createError
1515
- })
1516
- )
1517
- );
1518
- if (result instanceof Error) {
1519
- const evaluatorError = result;
1520
- const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1521
- logs.push(taggedEntry ?? createLogEntry(result));
1522
- testCaseError = result.message;
1523
- evaluatorScores.push({
1524
- evaluatorId,
1525
- scores: [],
1526
- passed: false,
1527
- logs: logs.length > 0 ? logs : void 0
1528
- });
1529
- continue;
1530
- }
1531
- const { scores, metrics } = normalizeResult(result);
1532
- const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1533
- evaluatorScores.push({
1534
- evaluatorId,
1535
- scores,
1536
- passed: passed2,
1537
- metrics,
1538
- logs: logs.length > 0 ? logs : void 0
1539
- });
1540
- } catch (error) {
1541
- if (error instanceof Error) {
1542
- const taggedEntry = error[evaluatorErrorLogEntryKey];
1543
- logs.push(taggedEntry ?? createLogEntry(error));
1544
- }
1545
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1472
+ const evaluatorRunId = `run-${crypto.randomUUID()}`;
1473
+ const started = Date.now();
1474
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
1475
+ n + 1,
1476
+ n + 1
1477
+ ]);
1478
+ yield* publishEvent({
1479
+ type: "TestCaseStarted",
1480
+ runId: task.runId,
1481
+ testCaseId: testCaseItem.id,
1482
+ testCaseName: testCaseItem.testCase.getName(),
1483
+ startedTestCases: startedEvaluations,
1484
+ totalTestCases: totalEvaluations,
1485
+ rerunIndex,
1486
+ rerunTotal
1487
+ });
1488
+ const evaluatorScores = [];
1489
+ let testCaseError;
1490
+ const output = readOutput(testCaseItem.testCase);
1491
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
1492
+ const evaluateFn = evaluator.getEvaluateFn();
1493
+ if (!evaluateFn) {
1494
+ continue;
1495
+ }
1496
+ const logs = [];
1497
+ const logDiff = (expected, actual, options) => {
1498
+ logs.push(createDiffLogEntry(expected, actual, options));
1499
+ };
1500
+ const log = (message, options) => {
1501
+ logs.push(createLogEntry(message, options));
1502
+ };
1503
+ const createError = (message, options) => {
1504
+ const entry = createLogEntry(message, options);
1505
+ const error = message instanceof Error ? message : new Error(entry.message);
1506
+ error[evaluatorErrorLogEntryKey] = entry;
1507
+ return error;
1508
+ };
1509
+ try {
1510
+ const ctx = yield* effect.Effect.promise(
1511
+ () => Promise.resolve(evaluator.resolveContext())
1512
+ );
1513
+ const result = yield* effect.Effect.promise(
1514
+ () => Promise.resolve().then(
1515
+ () => evaluateFn({
1516
+ input: testCaseItem.testCase.getInput(),
1517
+ ctx,
1518
+ output,
1519
+ meta: {
1520
+ triggerId: task.triggerId,
1521
+ runId: evaluatorRunId,
1522
+ datasetId: task.datasetId
1523
+ },
1524
+ logDiff,
1525
+ log,
1526
+ createError
1527
+ })
1528
+ )
1529
+ );
1530
+ if (result instanceof Error) {
1531
+ const evaluatorError = result;
1532
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1533
+ logs.push(taggedEntry ?? createLogEntry(result));
1534
+ testCaseError = result.message;
1546
1535
  evaluatorScores.push({
1547
1536
  evaluatorId,
1548
1537
  scores: [],
1549
1538
  passed: false,
1550
1539
  logs: logs.length > 0 ? logs : void 0
1551
1540
  });
1541
+ continue;
1542
+ }
1543
+ const { scores, metrics } = normalizeResult(result);
1544
+ const passed = computeEvaluatorPassed(evaluator, result, scores);
1545
+ evaluatorScores.push({
1546
+ evaluatorId,
1547
+ scores,
1548
+ passed,
1549
+ metrics,
1550
+ logs: logs.length > 0 ? logs : void 0
1551
+ });
1552
+ } catch (error) {
1553
+ if (error instanceof Error) {
1554
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1555
+ logs.push(taggedEntry ?? createLogEntry(error));
1552
1556
  }
1557
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1558
+ evaluatorScores.push({
1559
+ evaluatorId,
1560
+ scores: [],
1561
+ passed: false,
1562
+ logs: logs.length > 0 ? logs : void 0
1563
+ });
1553
1564
  }
1554
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1555
- rerunPassed.push(rerunPassedThis);
1556
- const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
1557
- n + 1,
1558
- n + 1
1559
- ]);
1560
- const progressEvent = {
1561
- type: "TestCaseProgress",
1562
- runId: task.runId,
1563
- testCaseId: testCaseItem.id,
1564
- testCaseName: testCaseItem.testCase.getName(),
1565
- completedTestCases: completedEvaluations,
1566
- totalTestCases: totalEvaluations,
1567
- rerunIndex: r + 1,
1568
- rerunTotal: reruns,
1569
- passed: rerunPassedThis,
1570
- durationMs: Date.now() - started,
1571
- evaluatorScores,
1572
- output,
1573
- errorMessage: testCaseError
1574
- };
1575
- updateSnapshot(task.runId, (snapshot) => ({
1576
- ...snapshot,
1577
- completedTestCases: completedEvaluations
1578
- }));
1579
- yield* publishEvent(progressEvent);
1580
- yield* effect.Queue.offer(persistenceQueue, {
1581
- runId: task.runId,
1582
- artifactPath: task.snapshot.artifactPath,
1583
- payload: progressEvent
1584
- });
1585
- }
1586
- const testCasePassed = rerunPassed.every(Boolean);
1587
- if (testCasePassed) {
1588
- yield* effect.Ref.update(passedRef, (n) => n + 1);
1589
- } else {
1590
- yield* effect.Ref.update(failedRef, (n) => n + 1);
1591
1565
  }
1592
- const [passed, failed] = yield* effect.Effect.all([
1593
- effect.Ref.get(passedRef),
1594
- effect.Ref.get(failedRef)
1566
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1567
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
1568
+ n + 1,
1569
+ n + 1
1595
1570
  ]);
1596
- updateSnapshot(task.runId, (snapshot) => ({
1571
+ const progressEvent = {
1572
+ type: "TestCaseProgress",
1573
+ runId: task.runId,
1574
+ testCaseId: testCaseItem.id,
1575
+ testCaseName: testCaseItem.testCase.getName(),
1576
+ completedTestCases: completedEvaluations,
1577
+ totalTestCases: totalEvaluations,
1578
+ rerunIndex,
1579
+ rerunTotal,
1580
+ passed: rerunPassedThis,
1581
+ durationMs: Date.now() - started,
1582
+ evaluatorScores,
1583
+ output,
1584
+ errorMessage: testCaseError
1585
+ };
1586
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1597
1587
  ...snapshot,
1598
- passedTestCases: passed,
1599
- failedTestCases: failed
1588
+ completedTestCases: completedEvaluations
1600
1589
  }));
1590
+ yield* publishEvent(progressEvent);
1591
+ yield* effect.Queue.offer(persistenceQueue, {
1592
+ runId: task.runId,
1593
+ artifactPath: task.snapshot.artifactPath,
1594
+ payload: progressEvent
1595
+ });
1596
+ const testCaseCompleted = yield* effect.Ref.modify(
1597
+ testCaseResultsRef,
1598
+ (map) => {
1599
+ const key = testCaseItem.id;
1600
+ const existing = map.get(key) ?? { completedCount: 0, results: [] };
1601
+ const newResults = [...existing.results, rerunPassedThis];
1602
+ const newCompletedCount = existing.completedCount + 1;
1603
+ const isLast = newCompletedCount === rerunTotal;
1604
+ const newMap = new Map(map);
1605
+ newMap.set(key, {
1606
+ completedCount: newCompletedCount,
1607
+ results: newResults
1608
+ });
1609
+ const outcome = isLast ? newResults.every(Boolean) : null;
1610
+ return [outcome, newMap];
1611
+ }
1612
+ );
1613
+ if (testCaseCompleted !== null) {
1614
+ if (testCaseCompleted) {
1615
+ yield* effect.Ref.update(passedRef, (n) => n + 1);
1616
+ } else {
1617
+ yield* effect.Ref.update(failedRef, (n) => n + 1);
1618
+ }
1619
+ const [passed, failed] = yield* effect.Effect.all([
1620
+ effect.Ref.get(passedRef),
1621
+ effect.Ref.get(failedRef)
1622
+ ]);
1623
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1624
+ ...snapshot,
1625
+ passedTestCases: passed,
1626
+ failedTestCases: failed
1627
+ }));
1628
+ }
1601
1629
  });
1602
1630
  }
1603
1631
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
1604
1632
  const startedAt = Date.now();
1605
- updateSnapshot(task.runId, (snapshot) => ({
1633
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1606
1634
  ...snapshot,
1607
1635
  status: "running",
1608
1636
  startedAt
@@ -1621,9 +1649,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1621
1649
  const startedRef = yield* effect.Ref.make(0);
1622
1650
  const passedRef = yield* effect.Ref.make(0);
1623
1651
  const failedRef = yield* effect.Ref.make(0);
1624
- const processTestCase = (testCaseItem) => processOneTestCase(
1652
+ const testCaseResultsRef = yield* effect.Ref.make(
1653
+ /* @__PURE__ */ new Map()
1654
+ );
1655
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
1656
+ const processEvaluation = (unit) => processOneEvaluation(
1625
1657
  task,
1626
- testCaseItem,
1658
+ unit,
1627
1659
  totalEvaluations,
1628
1660
  publishEvent,
1629
1661
  persistenceQueue,
@@ -1631,11 +1663,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1631
1663
  startedRef,
1632
1664
  completedRef,
1633
1665
  passedRef,
1634
- failedRef
1666
+ failedRef,
1667
+ testCaseResultsRef
1635
1668
  );
1636
1669
  yield* effect.Effect.forEach(
1637
- task.testCases,
1638
- processTestCase,
1670
+ evaluationUnits,
1671
+ processEvaluation,
1639
1672
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1640
1673
  );
1641
1674
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
@@ -1653,7 +1686,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1653
1686
  totalTestCases: task.testCases.length,
1654
1687
  artifactPath: task.snapshot.artifactPath
1655
1688
  };
1656
- updateSnapshot(task.runId, (snapshot) => ({
1689
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1657
1690
  ...snapshot,
1658
1691
  status: "completed",
1659
1692
  completedTestCases: completedEvaluations,
@@ -1936,7 +1969,9 @@ var EffectRunner = class {
1936
1969
  this.persistenceQueue = effect.Effect.runSync(
1937
1970
  effect.Queue.unbounded()
1938
1971
  );
1939
- this.snapshots = /* @__PURE__ */ new Map();
1972
+ this.snapshotsRef = effect.Effect.runSync(
1973
+ effect.Ref.make(/* @__PURE__ */ new Map())
1974
+ );
1940
1975
  this.listeners = /* @__PURE__ */ new Set();
1941
1976
  this.datasetsById = /* @__PURE__ */ new Map();
1942
1977
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -2039,7 +2074,13 @@ var EffectRunner = class {
2039
2074
  status: "queued",
2040
2075
  artifactPath
2041
2076
  };
2042
- this.snapshots.set(runId, snapshot);
2077
+ await effect.Effect.runPromise(
2078
+ effect.Ref.update(this.snapshotsRef, (map) => {
2079
+ const next = new Map(map);
2080
+ next.set(runId, snapshot);
2081
+ return next;
2082
+ })
2083
+ );
2043
2084
  const queuedEvent = {
2044
2085
  type: "RunQueued",
2045
2086
  runId,
@@ -2080,12 +2121,12 @@ var EffectRunner = class {
2080
2121
  };
2081
2122
  }
2082
2123
  getRunSnapshot(runId) {
2083
- return this.snapshots.get(runId);
2124
+ return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
2084
2125
  }
2085
2126
  getAllRunSnapshots() {
2086
- return Array.from(this.snapshots.values()).sort(
2087
- (a, b) => b.queuedAt - a.queuedAt
2088
- );
2127
+ return Array.from(
2128
+ effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
2129
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
2089
2130
  }
2090
2131
  async loadRunSnapshotsFromArtifacts() {
2091
2132
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -2114,11 +2155,15 @@ var EffectRunner = class {
2114
2155
  );
2115
2156
  }
2116
2157
  updateSnapshot(runId, updater) {
2117
- const existing = this.snapshots.get(runId);
2118
- if (!existing) {
2119
- return;
2120
- }
2121
- this.snapshots.set(runId, updater(existing));
2158
+ return effect.Ref.modify(this.snapshotsRef, (map) => {
2159
+ const existing = map.get(runId);
2160
+ if (!existing) {
2161
+ return [void 0, map];
2162
+ }
2163
+ const next = new Map(map);
2164
+ next.set(runId, updater(existing));
2165
+ return [void 0, next];
2166
+ }).pipe(effect.Effect.asVoid);
2122
2167
  }
2123
2168
  publishEvent(event) {
2124
2169
  return effect.Effect.sync(() => {