@m4trix/evals 0.21.1 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -6,7 +6,7 @@ import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
6
6
  import { resolve, relative, join, dirname } from 'path';
7
7
  import { LineGraph } from '@pppp606/ink-chart';
8
8
  import { randomUUID } from 'crypto';
9
- import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
9
+ import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
10
10
  import { existsSync } from 'fs';
11
11
  import * as jitiModule from 'jiti';
12
12
  import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
@@ -1416,6 +1416,20 @@ function readOutput(testCase) {
1416
1416
  }
1417
1417
  return candidate.getOutput();
1418
1418
  }
1419
+ function buildEvaluationUnits(testCases) {
1420
+ const units = [];
1421
+ for (const testCaseItem of testCases) {
1422
+ const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1423
+ for (let r = 0; r < rerunTotal; r++) {
1424
+ units.push({
1425
+ testCaseItem,
1426
+ rerunIndex: r + 1,
1427
+ rerunTotal
1428
+ });
1429
+ }
1430
+ }
1431
+ return units;
1432
+ }
1419
1433
  function nowIsoForFile() {
1420
1434
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1421
1435
  }
@@ -1425,157 +1439,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1425
1439
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1426
1440
  );
1427
1441
  }
1428
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1442
+ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1443
+ const { testCaseItem, rerunIndex, rerunTotal } = unit;
1429
1444
  return Effect.gen(function* () {
1430
- const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1431
- const rerunPassed = [];
1432
- for (let r = 0; r < reruns; r++) {
1433
- const evaluatorRunId = `run-${randomUUID()}`;
1434
- const started = Date.now();
1435
- const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1436
- n + 1,
1437
- n + 1
1438
- ]);
1439
- yield* publishEvent({
1440
- type: "TestCaseStarted",
1441
- runId: task.runId,
1442
- testCaseId: testCaseItem.id,
1443
- testCaseName: testCaseItem.testCase.getName(),
1444
- startedTestCases: startedEvaluations,
1445
- totalTestCases: totalEvaluations,
1446
- rerunIndex: r + 1,
1447
- rerunTotal: reruns
1448
- });
1449
- const evaluatorScores = [];
1450
- let testCaseError;
1451
- const output = readOutput(testCaseItem.testCase);
1452
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
1453
- const evaluateFn = evaluator.getEvaluateFn();
1454
- if (!evaluateFn) {
1455
- continue;
1456
- }
1457
- const logs = [];
1458
- const logDiff = (expected, actual, options) => {
1459
- logs.push(createDiffLogEntry(expected, actual, options));
1460
- };
1461
- const log = (message, options) => {
1462
- logs.push(createLogEntry(message, options));
1463
- };
1464
- const createError = (message, options) => {
1465
- const entry = createLogEntry(message, options);
1466
- const error = message instanceof Error ? message : new Error(entry.message);
1467
- error[evaluatorErrorLogEntryKey] = entry;
1468
- return error;
1469
- };
1470
- try {
1471
- const ctx = yield* Effect.promise(
1472
- () => Promise.resolve(evaluator.resolveContext())
1473
- );
1474
- const result = yield* Effect.promise(
1475
- () => Promise.resolve().then(
1476
- () => evaluateFn({
1477
- input: testCaseItem.testCase.getInput(),
1478
- ctx,
1479
- output,
1480
- meta: {
1481
- triggerId: task.triggerId,
1482
- runId: evaluatorRunId,
1483
- datasetId: task.datasetId
1484
- },
1485
- logDiff,
1486
- log,
1487
- createError
1488
- })
1489
- )
1490
- );
1491
- if (result instanceof Error) {
1492
- const evaluatorError = result;
1493
- const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1494
- logs.push(taggedEntry ?? createLogEntry(result));
1495
- testCaseError = result.message;
1496
- evaluatorScores.push({
1497
- evaluatorId,
1498
- scores: [],
1499
- passed: false,
1500
- logs: logs.length > 0 ? logs : void 0
1501
- });
1502
- continue;
1503
- }
1504
- const { scores, metrics } = normalizeResult(result);
1505
- const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1506
- evaluatorScores.push({
1507
- evaluatorId,
1508
- scores,
1509
- passed: passed2,
1510
- metrics,
1511
- logs: logs.length > 0 ? logs : void 0
1512
- });
1513
- } catch (error) {
1514
- if (error instanceof Error) {
1515
- const taggedEntry = error[evaluatorErrorLogEntryKey];
1516
- logs.push(taggedEntry ?? createLogEntry(error));
1517
- }
1518
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1445
+ const evaluatorRunId = `run-${randomUUID()}`;
1446
+ const started = Date.now();
1447
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1448
+ n + 1,
1449
+ n + 1
1450
+ ]);
1451
+ yield* publishEvent({
1452
+ type: "TestCaseStarted",
1453
+ runId: task.runId,
1454
+ testCaseId: testCaseItem.id,
1455
+ testCaseName: testCaseItem.testCase.getName(),
1456
+ startedTestCases: startedEvaluations,
1457
+ totalTestCases: totalEvaluations,
1458
+ rerunIndex,
1459
+ rerunTotal
1460
+ });
1461
+ const evaluatorScores = [];
1462
+ let testCaseError;
1463
+ const output = readOutput(testCaseItem.testCase);
1464
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
1465
+ const evaluateFn = evaluator.getEvaluateFn();
1466
+ if (!evaluateFn) {
1467
+ continue;
1468
+ }
1469
+ const logs = [];
1470
+ const logDiff = (expected, actual, options) => {
1471
+ logs.push(createDiffLogEntry(expected, actual, options));
1472
+ };
1473
+ const log = (message, options) => {
1474
+ logs.push(createLogEntry(message, options));
1475
+ };
1476
+ const createError = (message, options) => {
1477
+ const entry = createLogEntry(message, options);
1478
+ const error = message instanceof Error ? message : new Error(entry.message);
1479
+ error[evaluatorErrorLogEntryKey] = entry;
1480
+ return error;
1481
+ };
1482
+ try {
1483
+ const ctx = yield* Effect.promise(
1484
+ () => Promise.resolve(evaluator.resolveContext())
1485
+ );
1486
+ const result = yield* Effect.promise(
1487
+ () => Promise.resolve().then(
1488
+ () => evaluateFn({
1489
+ input: testCaseItem.testCase.getInput(),
1490
+ ctx,
1491
+ output,
1492
+ meta: {
1493
+ triggerId: task.triggerId,
1494
+ runId: evaluatorRunId,
1495
+ datasetId: task.datasetId
1496
+ },
1497
+ logDiff,
1498
+ log,
1499
+ createError
1500
+ })
1501
+ )
1502
+ );
1503
+ if (result instanceof Error) {
1504
+ const evaluatorError = result;
1505
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1506
+ logs.push(taggedEntry ?? createLogEntry(result));
1507
+ testCaseError = result.message;
1519
1508
  evaluatorScores.push({
1520
1509
  evaluatorId,
1521
1510
  scores: [],
1522
1511
  passed: false,
1523
1512
  logs: logs.length > 0 ? logs : void 0
1524
1513
  });
1514
+ continue;
1515
+ }
1516
+ const { scores, metrics } = normalizeResult(result);
1517
+ const passed = computeEvaluatorPassed(evaluator, result, scores);
1518
+ evaluatorScores.push({
1519
+ evaluatorId,
1520
+ scores,
1521
+ passed,
1522
+ metrics,
1523
+ logs: logs.length > 0 ? logs : void 0
1524
+ });
1525
+ } catch (error) {
1526
+ if (error instanceof Error) {
1527
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1528
+ logs.push(taggedEntry ?? createLogEntry(error));
1525
1529
  }
1530
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1531
+ evaluatorScores.push({
1532
+ evaluatorId,
1533
+ scores: [],
1534
+ passed: false,
1535
+ logs: logs.length > 0 ? logs : void 0
1536
+ });
1526
1537
  }
1527
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1528
- rerunPassed.push(rerunPassedThis);
1529
- const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1530
- n + 1,
1531
- n + 1
1532
- ]);
1533
- const progressEvent = {
1534
- type: "TestCaseProgress",
1535
- runId: task.runId,
1536
- testCaseId: testCaseItem.id,
1537
- testCaseName: testCaseItem.testCase.getName(),
1538
- completedTestCases: completedEvaluations,
1539
- totalTestCases: totalEvaluations,
1540
- rerunIndex: r + 1,
1541
- rerunTotal: reruns,
1542
- passed: rerunPassedThis,
1543
- durationMs: Date.now() - started,
1544
- evaluatorScores,
1545
- output,
1546
- errorMessage: testCaseError
1547
- };
1548
- updateSnapshot(task.runId, (snapshot) => ({
1549
- ...snapshot,
1550
- completedTestCases: completedEvaluations
1551
- }));
1552
- yield* publishEvent(progressEvent);
1553
- yield* Queue.offer(persistenceQueue, {
1554
- runId: task.runId,
1555
- artifactPath: task.snapshot.artifactPath,
1556
- payload: progressEvent
1557
- });
1558
- }
1559
- const testCasePassed = rerunPassed.every(Boolean);
1560
- if (testCasePassed) {
1561
- yield* Ref.update(passedRef, (n) => n + 1);
1562
- } else {
1563
- yield* Ref.update(failedRef, (n) => n + 1);
1564
1538
  }
1565
- const [passed, failed] = yield* Effect.all([
1566
- Ref.get(passedRef),
1567
- Ref.get(failedRef)
1539
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1540
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1541
+ n + 1,
1542
+ n + 1
1568
1543
  ]);
1569
- updateSnapshot(task.runId, (snapshot) => ({
1544
+ const progressEvent = {
1545
+ type: "TestCaseProgress",
1546
+ runId: task.runId,
1547
+ testCaseId: testCaseItem.id,
1548
+ testCaseName: testCaseItem.testCase.getName(),
1549
+ completedTestCases: completedEvaluations,
1550
+ totalTestCases: totalEvaluations,
1551
+ rerunIndex,
1552
+ rerunTotal,
1553
+ passed: rerunPassedThis,
1554
+ durationMs: Date.now() - started,
1555
+ evaluatorScores,
1556
+ output,
1557
+ errorMessage: testCaseError
1558
+ };
1559
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1570
1560
  ...snapshot,
1571
- passedTestCases: passed,
1572
- failedTestCases: failed
1561
+ completedTestCases: completedEvaluations
1573
1562
  }));
1563
+ yield* publishEvent(progressEvent);
1564
+ yield* Queue.offer(persistenceQueue, {
1565
+ runId: task.runId,
1566
+ artifactPath: task.snapshot.artifactPath,
1567
+ payload: progressEvent
1568
+ });
1569
+ const testCaseCompleted = yield* Ref.modify(
1570
+ testCaseResultsRef,
1571
+ (map) => {
1572
+ const key = testCaseItem.id;
1573
+ const existing = map.get(key) ?? { completedCount: 0, results: [] };
1574
+ const newResults = [...existing.results, rerunPassedThis];
1575
+ const newCompletedCount = existing.completedCount + 1;
1576
+ const isLast = newCompletedCount === rerunTotal;
1577
+ const newMap = new Map(map);
1578
+ newMap.set(key, {
1579
+ completedCount: newCompletedCount,
1580
+ results: newResults
1581
+ });
1582
+ const outcome = isLast ? newResults.every(Boolean) : null;
1583
+ return [outcome, newMap];
1584
+ }
1585
+ );
1586
+ if (testCaseCompleted !== null) {
1587
+ if (testCaseCompleted) {
1588
+ yield* Ref.update(passedRef, (n) => n + 1);
1589
+ } else {
1590
+ yield* Ref.update(failedRef, (n) => n + 1);
1591
+ }
1592
+ const [passed, failed] = yield* Effect.all([
1593
+ Ref.get(passedRef),
1594
+ Ref.get(failedRef)
1595
+ ]);
1596
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1597
+ ...snapshot,
1598
+ passedTestCases: passed,
1599
+ failedTestCases: failed
1600
+ }));
1601
+ }
1574
1602
  });
1575
1603
  }
1576
1604
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
1577
1605
  const startedAt = Date.now();
1578
- updateSnapshot(task.runId, (snapshot) => ({
1606
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1579
1607
  ...snapshot,
1580
1608
  status: "running",
1581
1609
  startedAt
@@ -1594,9 +1622,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1594
1622
  const startedRef = yield* Ref.make(0);
1595
1623
  const passedRef = yield* Ref.make(0);
1596
1624
  const failedRef = yield* Ref.make(0);
1597
- const processTestCase = (testCaseItem) => processOneTestCase(
1625
+ const testCaseResultsRef = yield* Ref.make(
1626
+ /* @__PURE__ */ new Map()
1627
+ );
1628
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
1629
+ const processEvaluation = (unit) => processOneEvaluation(
1598
1630
  task,
1599
- testCaseItem,
1631
+ unit,
1600
1632
  totalEvaluations,
1601
1633
  publishEvent,
1602
1634
  persistenceQueue,
@@ -1604,11 +1636,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1604
1636
  startedRef,
1605
1637
  completedRef,
1606
1638
  passedRef,
1607
- failedRef
1639
+ failedRef,
1640
+ testCaseResultsRef
1608
1641
  );
1609
1642
  yield* Effect.forEach(
1610
- task.testCases,
1611
- processTestCase,
1643
+ evaluationUnits,
1644
+ processEvaluation,
1612
1645
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1613
1646
  );
1614
1647
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
@@ -1626,7 +1659,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1626
1659
  totalTestCases: task.testCases.length,
1627
1660
  artifactPath: task.snapshot.artifactPath
1628
1661
  };
1629
- updateSnapshot(task.runId, (snapshot) => ({
1662
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1630
1663
  ...snapshot,
1631
1664
  status: "completed",
1632
1665
  completedTestCases: completedEvaluations,
@@ -1909,7 +1942,9 @@ var EffectRunner = class {
1909
1942
  this.persistenceQueue = Effect.runSync(
1910
1943
  Queue.unbounded()
1911
1944
  );
1912
- this.snapshots = /* @__PURE__ */ new Map();
1945
+ this.snapshotsRef = Effect.runSync(
1946
+ Ref.make(/* @__PURE__ */ new Map())
1947
+ );
1913
1948
  this.listeners = /* @__PURE__ */ new Set();
1914
1949
  this.datasetsById = /* @__PURE__ */ new Map();
1915
1950
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -2012,7 +2047,13 @@ var EffectRunner = class {
2012
2047
  status: "queued",
2013
2048
  artifactPath
2014
2049
  };
2015
- this.snapshots.set(runId, snapshot);
2050
+ await Effect.runPromise(
2051
+ Ref.update(this.snapshotsRef, (map) => {
2052
+ const next = new Map(map);
2053
+ next.set(runId, snapshot);
2054
+ return next;
2055
+ })
2056
+ );
2016
2057
  const queuedEvent = {
2017
2058
  type: "RunQueued",
2018
2059
  runId,
@@ -2053,12 +2094,12 @@ var EffectRunner = class {
2053
2094
  };
2054
2095
  }
2055
2096
  getRunSnapshot(runId) {
2056
- return this.snapshots.get(runId);
2097
+ return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
2057
2098
  }
2058
2099
  getAllRunSnapshots() {
2059
- return Array.from(this.snapshots.values()).sort(
2060
- (a, b) => b.queuedAt - a.queuedAt
2061
- );
2100
+ return Array.from(
2101
+ Effect.runSync(Ref.get(this.snapshotsRef)).values()
2102
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
2062
2103
  }
2063
2104
  async loadRunSnapshotsFromArtifacts() {
2064
2105
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -2087,11 +2128,15 @@ var EffectRunner = class {
2087
2128
  );
2088
2129
  }
2089
2130
  updateSnapshot(runId, updater) {
2090
- const existing = this.snapshots.get(runId);
2091
- if (!existing) {
2092
- return;
2093
- }
2094
- this.snapshots.set(runId, updater(existing));
2131
+ return Ref.modify(this.snapshotsRef, (map) => {
2132
+ const existing = map.get(runId);
2133
+ if (!existing) {
2134
+ return [void 0, map];
2135
+ }
2136
+ const next = new Map(map);
2137
+ next.set(runId, updater(existing));
2138
+ return [void 0, next];
2139
+ }).pipe(Effect.asVoid);
2095
2140
  }
2096
2141
  publishEvent(event) {
2097
2142
  return Effect.sync(() => {