@m4trix/evals 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -6,12 +6,13 @@ import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
6
6
  import { resolve, relative, join, dirname } from 'path';
7
7
  import { LineGraph } from '@pppp606/ink-chart';
8
8
  import { randomUUID } from 'crypto';
9
- import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
9
+ import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
10
10
  import { existsSync } from 'fs';
11
11
  import * as jitiModule from 'jiti';
12
12
  import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
13
13
  import { pathToFileURL } from 'url';
14
- import { diffString } from 'json-diff';
14
+ import { diffLines } from 'diff';
15
+ import stringify from 'fast-json-stable-stringify';
15
16
 
16
17
  var SEP = " ";
17
18
  var ARROW = "\u203A";
@@ -978,10 +979,102 @@ async function collectTestCasesFromFiles(config) {
978
979
  );
979
980
  return found.flat();
980
981
  }
/**
 * Normalizes a value before it is serialized for diffing.
 *
 * The value is walked recursively and the supported diff options are applied:
 * - `options.excludeKeys` (array, or comma-separated string): matching keys
 *   are dropped from every object that is visited.
 * - `options.precision`: every number is rounded via `toFixed(precision)`.
 * - `options.sort`: every array is ordered by the stable JSON form of its
 *   already-normalized elements.
 *
 * Arrays are always traversed, even when `sort` is off, so key exclusion and
 * rounding also reach objects and numbers nested inside arrays.
 *
 * @param {unknown} value - Value to normalize; it is never mutated.
 * @param {{sort?: boolean, excludeKeys?: string[] | string, precision?: number}} [options]
 * @returns {unknown} A normalized copy (primitives are returned unchanged).
 */
function preprocessForDiff(value, options) {
  const rawExcluded = options?.excludeKeys;
  // Parse the exclusion list once instead of on every visited object.
  const excludedKeys = rawExcluded
    ? new Set(
        Array.isArray(rawExcluded)
          ? rawExcluded
          : rawExcluded.split(",").map((key) => key.trim())
      )
    : null;
  const precision = options?.precision;
  const shouldSort = Boolean(options?.sort);
  // String() keeps the comparator safe when stringify() yields undefined.
  const sortKey = (item) => String(stringify(item));

  const normalize = (node) => {
    if (Array.isArray(node)) {
      // Elements are normalized first so sorting compares their final form.
      const items = node.map((item) => normalize(item));
      return shouldSort
        ? items.sort((a, b) => sortKey(a).localeCompare(sortKey(b)))
        : items;
    }
    if (node !== null && typeof node === "object") {
      // Object.fromEntries defines own properties, so a "__proto__" key is
      // kept as data instead of going through the prototype setter.
      return Object.fromEntries(
        Object.entries(node)
          .filter(([key]) => !excludedKeys?.has(key))
          .map(([key, child]) => [key, normalize(child)])
      );
    }
    if (typeof node === "number" && precision !== undefined) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return normalize(value);
}
/**
 * Renders a value as indented JSON text for line-based diffing.
 *
 * The value is first serialized with `stringify` and then re-parsed and
 * pretty-printed with two-space indentation.
 *
 * @param {unknown} value - Value to render.
 * @returns {string} Always a string: the pretty JSON, the raw serialized text
 *   if it cannot be re-parsed, or the string form of a non-string serializer
 *   result (e.g. "undefined" when nothing was produced).
 */
function toPrettyJson(value) {
  const stable = stringify(value);
  if (typeof stable !== "string") {
    // Callers feed the result to a text diff, so never hand back a non-string.
    return String(stable);
  }
  try {
    return JSON.stringify(JSON.parse(stable), null, 2);
  } catch {
    return stable;
  }
}
/**
 * Flattens diff parts into one newline-joined string.
 *
 * Each line of an added part is prefixed with "+ ", each line of a removed
 * part with "- ", and lines of unchanged parts get no prefix. The empty
 * segment produced by a part's trailing newline is dropped.
 *
 * @param {Array<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The formatted diff text.
 */
function formatDiffParts(parts) {
  const formatted = parts.flatMap((part) => {
    let marker = "";
    if (part.added) {
      marker = "+ ";
    } else if (part.removed) {
      marker = "- ";
    }
    const segments = part.value.split("\n");
    // A trailing "\n" leaves one empty segment at the end; skip only that one.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    return segments.map((segment) => marker + segment);
  });
  return formatted.join("\n");
}
/**
 * Builds a textual line diff between an expected and an actual value.
 *
 * Both values are normalized with `preprocessForDiff` and rendered through
 * `toPrettyJson`. With `diffOptions.keysOnly` only the key structure (see
 * `extractKeys`) is compared. With `diffOptions.outputNewOnly` only added
 * lines are kept. Both modes share one code path, so identical inputs yield
 * an empty string in either mode and key order never produces a diff.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value to compare against the reference.
 * @param {object} [diffOptions] - Options; also forwarded to `preprocessForDiff`.
 * @returns {string} The formatted diff, or "" when the rendered values match.
 */
function createDiffString(expected, actual, diffOptions) {
  const keysOnly = Boolean(diffOptions?.keysOnly);
  const render = (value) => {
    const normalized = preprocessForDiff(value, diffOptions);
    return toPrettyJson(keysOnly ? extractKeys(normalized) : normalized);
  };
  const expectedStr = render(expected);
  const actualStr = render(actual);
  if (expectedStr === actualStr) {
    return "";
  }
  const parts = diffLines(expectedStr, actualStr);
  const visibleParts = diffOptions?.outputNewOnly
    ? parts.filter((part) => part.added === true)
    : parts;
  return formatDiffParts(visibleParts);
}
/**
 * Reduces a value to its key structure.
 *
 * Objects keep their keys and arrays keep their length, while every leaf
 * (primitive or null) is replaced by the placeholder "\xB7".
 *
 * @param {unknown} value - Value to reduce.
 * @returns {unknown} The placeholder string, an array, or a plain object.
 */
function extractKeys(value) {
  const isContainer = typeof value === "object" && value !== null;
  if (!isContainer) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((element) => extractKeys(element));
  }
  const skeleton = {};
  for (const key of Object.keys(value)) {
    skeleton[key] = extractKeys(value[key]);
  }
  return skeleton;
}
986
1079
  function formatLogMessage(msg) {
987
1080
  if (typeof msg === "string")
@@ -1323,6 +1416,20 @@ function readOutput(testCase) {
1323
1416
  }
1324
1417
  return candidate.getOutput();
1325
1418
  }
/**
 * Expands test cases into one evaluation unit per rerun.
 *
 * The rerun count comes from `testCase.getReruns()` when that method exists
 * and is 1 otherwise. Units keep the order of the input and carry a 1-based
 * `rerunIndex` together with the `rerunTotal` of their test case.
 *
 * @param {Array<{testCase: object}>} testCases - Collected test case items.
 * @returns {Array<{testCaseItem: object, rerunIndex: number, rerunTotal: number}>}
 */
function buildEvaluationUnits(testCases) {
  const expanded = [];
  for (const testCaseItem of testCases) {
    const { testCase } = testCaseItem;
    let rerunTotal = 1;
    if (typeof testCase.getReruns === "function") {
      rerunTotal = testCase.getReruns();
    }
    for (let done = 0; done < rerunTotal; done += 1) {
      expanded.push({ testCaseItem, rerunIndex: done + 1, rerunTotal });
    }
  }
  return expanded;
}
/**
 * Returns the current time as an ISO-8601 string with every ":" and "."
 * replaced by "-".
 *
 * @returns {string} For example "2024-01-31T12-34-56-789Z".
 */
function nowIsoForFile() {
  const iso = new Date().toISOString();
  return iso.split(/[:.]/).join("-");
}
@@ -1332,157 +1439,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1332
1439
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1333
1440
  );
1334
1441
  }
/**
 * Runs every evaluator of a task against one evaluation unit (one rerun of
 * one test case) and reports the outcome.
 *
 * Steps, in order:
 * 1. Increment the started counter and publish a "TestCaseStarted" event.
 * 2. Call each evaluator that has an evaluate function and collect its
 *    scores, pass flag, metrics and logs. An evaluator that returns an Error,
 *    throws, or rejects is recorded as a failed score instead of aborting.
 * 3. Increment the completed counter, update the run snapshot, publish a
 *    "TestCaseProgress" event and enqueue it for persistence.
 * 4. Record this rerun's result per test case id. Once `rerunTotal` results
 *    exist, the test case counts as passed only if all reruns passed, and
 *    the passed/failed counters and snapshot are updated.
 *
 * @param task - Run task (`runId`, `triggerId`, `datasetId`, `evaluators`, `snapshot`).
 * @param unit - `{ testCaseItem, rerunIndex, rerunTotal }` from `buildEvaluationUnits`.
 * @param totalEvaluations - Total number of evaluation units in the run.
 * @param publishEvent - Function returning an Effect that publishes an event.
 * @param persistenceQueue - Queue that receives progress events to persist.
 * @param updateSnapshot - Function returning an Effect that updates the run snapshot.
 * @param startedRef - Ref counting started evaluations.
 * @param completedRef - Ref counting completed evaluations.
 * @param passedRef - Ref counting passed test cases.
 * @param failedRef - Ref counting failed test cases.
 * @param testCaseResultsRef - Ref holding a Map of test case id to rerun results.
 * @returns An Effect that performs the evaluation.
 */
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
  const { testCaseItem, rerunIndex, rerunTotal } = unit;
  // Effect.promise turns a rejected promise into a defect, which a try/catch
  // inside Effect.gen does not observe. Folding the outcome into a plain
  // value lets a throwing or rejecting evaluator be recorded as a failure.
  const settle = (thunk) => Effect.promise(
    () => Promise.resolve().then(thunk).then(
      (value) => ({ ok: true, value }),
      (error) => ({ ok: false, error })
    )
  );
  return Effect.gen(function* () {
    const evaluatorRunId = `run-${randomUUID()}`;
    const started = Date.now();
    const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
      n + 1,
      n + 1
    ]);
    yield* publishEvent({
      type: "TestCaseStarted",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      startedTestCases: startedEvaluations,
      totalTestCases: totalEvaluations,
      rerunIndex,
      rerunTotal
    });
    const evaluatorScores = [];
    let testCaseError;
    const output = readOutput(testCaseItem.testCase);
    for (const { id: evaluatorId, evaluator } of task.evaluators) {
      const evaluateFn = evaluator.getEvaluateFn();
      if (!evaluateFn) {
        continue;
      }
      const logs = [];
      const logDiff = (expected, actual, options) => {
        logs.push(createDiffLogEntry(expected, actual, options));
      };
      const log = (message, options) => {
        logs.push(createLogEntry(message, options));
      };
      const createError = (message, options) => {
        const entry = createLogEntry(message, options);
        const error = message instanceof Error ? message : new Error(entry.message);
        error[evaluatorErrorLogEntryKey] = entry;
        return error;
      };
      // Records a thrown or rejected evaluator as a failed, score-less result.
      const recordFailure = (error) => {
        if (error instanceof Error) {
          const taggedEntry = error[evaluatorErrorLogEntryKey];
          logs.push(taggedEntry ?? createLogEntry(error));
        }
        testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
        evaluatorScores.push({
          evaluatorId,
          scores: [],
          passed: false,
          logs: logs.length > 0 ? logs : void 0
        });
      };
      const ctxOutcome = yield* settle(() => evaluator.resolveContext());
      if (!ctxOutcome.ok) {
        recordFailure(ctxOutcome.error);
        continue;
      }
      const evaluationOutcome = yield* settle(
        () => evaluateFn({
          input: testCaseItem.testCase.getInput(),
          ctx: ctxOutcome.value,
          output,
          meta: {
            triggerId: task.triggerId,
            runId: evaluatorRunId,
            datasetId: task.datasetId
          },
          logDiff,
          log,
          createError
        })
      );
      if (!evaluationOutcome.ok) {
        recordFailure(evaluationOutcome.error);
        continue;
      }
      const result = evaluationOutcome.value;
      if (result instanceof Error) {
        // An Error returned (not thrown) by the evaluator is a reported failure.
        const taggedEntry = result[evaluatorErrorLogEntryKey];
        logs.push(taggedEntry ?? createLogEntry(result));
        testCaseError = result.message;
        evaluatorScores.push({
          evaluatorId,
          scores: [],
          passed: false,
          logs: logs.length > 0 ? logs : void 0
        });
        continue;
      }
      try {
        const { scores, metrics } = normalizeResult(result);
        const evaluatorPassed = computeEvaluatorPassed(evaluator, result, scores);
        evaluatorScores.push({
          evaluatorId,
          scores,
          passed: evaluatorPassed,
          metrics,
          logs: logs.length > 0 ? logs : void 0
        });
      } catch (error) {
        recordFailure(error);
      }
    }
    const rerunPassedThis = evaluatorScores.every((s) => s.passed);
    const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
      n + 1,
      n + 1
    ]);
    const progressEvent = {
      type: "TestCaseProgress",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      completedTestCases: completedEvaluations,
      totalTestCases: totalEvaluations,
      rerunIndex,
      rerunTotal,
      passed: rerunPassedThis,
      durationMs: Date.now() - started,
      evaluatorScores,
      output,
      errorMessage: testCaseError
    };
    yield* updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      completedTestCases: completedEvaluations
    }));
    yield* publishEvent(progressEvent);
    yield* Queue.offer(persistenceQueue, {
      runId: task.runId,
      artifactPath: task.snapshot.artifactPath,
      payload: progressEvent
    });
    // Yields null until the last rerun of this test case has reported, then
    // the overall verdict (true only when every rerun passed).
    const testCaseCompleted = yield* Ref.modify(
      testCaseResultsRef,
      (map) => {
        const key = testCaseItem.id;
        const existing = map.get(key) ?? { completedCount: 0, results: [] };
        const newResults = [...existing.results, rerunPassedThis];
        const newCompletedCount = existing.completedCount + 1;
        const isLast = newCompletedCount === rerunTotal;
        const newMap = new Map(map);
        newMap.set(key, {
          completedCount: newCompletedCount,
          results: newResults
        });
        const outcome = isLast ? newResults.every(Boolean) : null;
        return [outcome, newMap];
      }
    );
    if (testCaseCompleted !== null) {
      if (testCaseCompleted) {
        yield* Ref.update(passedRef, (n) => n + 1);
      } else {
        yield* Ref.update(failedRef, (n) => n + 1);
      }
      const [passedCount, failedCount] = yield* Effect.all([
        Ref.get(passedRef),
        Ref.get(failedRef)
      ]);
      yield* updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        passedTestCases: passedCount,
        failedTestCases: failedCount
      }));
    }
  });
}
1483
1604
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
1484
1605
  const startedAt = Date.now();
1485
- updateSnapshot(task.runId, (snapshot) => ({
1606
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1486
1607
  ...snapshot,
1487
1608
  status: "running",
1488
1609
  startedAt
@@ -1501,9 +1622,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1501
1622
  const startedRef = yield* Ref.make(0);
1502
1623
  const passedRef = yield* Ref.make(0);
1503
1624
  const failedRef = yield* Ref.make(0);
1504
- const processTestCase = (testCaseItem) => processOneTestCase(
1625
+ const testCaseResultsRef = yield* Ref.make(
1626
+ /* @__PURE__ */ new Map()
1627
+ );
1628
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
1629
+ const processEvaluation = (unit) => processOneEvaluation(
1505
1630
  task,
1506
- testCaseItem,
1631
+ unit,
1507
1632
  totalEvaluations,
1508
1633
  publishEvent,
1509
1634
  persistenceQueue,
@@ -1511,11 +1636,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1511
1636
  startedRef,
1512
1637
  completedRef,
1513
1638
  passedRef,
1514
- failedRef
1639
+ failedRef,
1640
+ testCaseResultsRef
1515
1641
  );
1516
1642
  yield* Effect.forEach(
1517
- task.testCases,
1518
- processTestCase,
1643
+ evaluationUnits,
1644
+ processEvaluation,
1519
1645
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1520
1646
  );
1521
1647
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
@@ -1533,7 +1659,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1533
1659
  totalTestCases: task.testCases.length,
1534
1660
  artifactPath: task.snapshot.artifactPath
1535
1661
  };
1536
- updateSnapshot(task.runId, (snapshot) => ({
1662
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1537
1663
  ...snapshot,
1538
1664
  status: "completed",
1539
1665
  completedTestCases: completedEvaluations,
@@ -1816,7 +1942,9 @@ var EffectRunner = class {
1816
1942
  this.persistenceQueue = Effect.runSync(
1817
1943
  Queue.unbounded()
1818
1944
  );
1819
- this.snapshots = /* @__PURE__ */ new Map();
1945
+ this.snapshotsRef = Effect.runSync(
1946
+ Ref.make(/* @__PURE__ */ new Map())
1947
+ );
1820
1948
  this.listeners = /* @__PURE__ */ new Set();
1821
1949
  this.datasetsById = /* @__PURE__ */ new Map();
1822
1950
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -1919,7 +2047,13 @@ var EffectRunner = class {
1919
2047
  status: "queued",
1920
2048
  artifactPath
1921
2049
  };
1922
- this.snapshots.set(runId, snapshot);
2050
+ await Effect.runPromise(
2051
+ Ref.update(this.snapshotsRef, (map) => {
2052
+ const next = new Map(map);
2053
+ next.set(runId, snapshot);
2054
+ return next;
2055
+ })
2056
+ );
1923
2057
  const queuedEvent = {
1924
2058
  type: "RunQueued",
1925
2059
  runId,
@@ -1960,12 +2094,12 @@ var EffectRunner = class {
1960
2094
  };
1961
2095
  }
1962
2096
  getRunSnapshot(runId) {
1963
- return this.snapshots.get(runId);
2097
+ return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
1964
2098
  }
1965
2099
  getAllRunSnapshots() {
1966
- return Array.from(this.snapshots.values()).sort(
1967
- (a, b) => b.queuedAt - a.queuedAt
1968
- );
2100
+ return Array.from(
2101
+ Effect.runSync(Ref.get(this.snapshotsRef)).values()
2102
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
1969
2103
  }
1970
2104
  async loadRunSnapshotsFromArtifacts() {
1971
2105
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1994,11 +2128,15 @@ var EffectRunner = class {
1994
2128
  );
1995
2129
  }
1996
2130
  updateSnapshot(runId, updater) {
1997
- const existing = this.snapshots.get(runId);
1998
- if (!existing) {
1999
- return;
2000
- }
2001
- this.snapshots.set(runId, updater(existing));
2131
+ return Ref.modify(this.snapshotsRef, (map) => {
2132
+ const existing = map.get(runId);
2133
+ if (!existing) {
2134
+ return [void 0, map];
2135
+ }
2136
+ const next = new Map(map);
2137
+ next.set(runId, updater(existing));
2138
+ return [void 0, next];
2139
+ }).pipe(Effect.asVoid);
2002
2140
  }
2003
2141
  publishEvent(event) {
2004
2142
  return Effect.sync(() => {