@m4trix/evals 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -13,7 +13,8 @@ var fs = require('fs');
13
13
  var jitiModule = require('jiti');
14
14
  var promises = require('fs/promises');
15
15
  var url = require('url');
16
- var jsonDiff = require('json-diff');
16
+ var diff = require('diff');
17
+ var stringify = require('fast-json-stable-stringify');
17
18
 
18
19
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
19
20
  function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
@@ -38,6 +39,7 @@ function _interopNamespace(e) {
38
39
 
39
40
  var React2__default = /*#__PURE__*/_interopDefault(React2);
40
41
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
42
+ var stringify__default = /*#__PURE__*/_interopDefault(stringify);
41
43
 
42
44
  var SEP = " ";
43
45
  var ARROW = "\u203A";
@@ -1004,10 +1006,102 @@ async function collectTestCasesFromFiles(config) {
1004
1006
  );
1005
1007
  return found.flat();
1006
1008
  }
1009
/**
 * Returns a normalised deep copy of `value` that is ready to be serialised
 * and diffed.
 *
 * Normalisation steps (each one is optional and driven by `options`):
 * - `excludeKeys` (array, or comma-separated string): object properties with
 *   these names are dropped at every nesting level.
 * - `precision`: every number is rounded with `toFixed(precision)`.
 * - `sort`: every array is ordered by the stable JSON form of its
 *   (already normalised) items, so element order does not produce a diff.
 *
 * Arrays are always traversed, so `excludeKeys` and `precision` also reach
 * objects and numbers nested inside arrays when `sort` is not requested.
 *
 * @param {*} value - Any JSON-like value.
 * @param {{sort?: boolean, excludeKeys?: string[]|string, precision?: number}} [options]
 * @returns {*} The normalised copy; primitives other than numbers are
 *   returned unchanged.
 */
function preprocessForDiff(value, options) {
  const shouldSort = Boolean(options?.sort);
  const precision = options?.precision;
  const rawExcluded = options?.excludeKeys;
  // Parse the exclusion list once instead of at every recursion level.
  let excludedNames = [];
  if (rawExcluded) {
    excludedNames = Array.isArray(rawExcluded)
      ? rawExcluded
      : rawExcluded.split(",").map((k) => k.trim());
  }
  const excluded = new Set(excludedNames);

  const normalise = (node) => {
    if (Array.isArray(node)) {
      const items = node.map((item) => normalise(item));
      if (!shouldSort) {
        return items;
      }
      // Items are already normalised, so the comparator only has to
      // serialise them; String() keeps localeCompare safe when the
      // serialiser yields no string (e.g. for a function element).
      return items.sort((a, b) => {
        const aKey = String(stringify__default.default(a));
        const bKey = String(stringify__default.default(b));
        return aKey.localeCompare(bKey);
      });
    }
    if (node !== null && typeof node === "object") {
      const copy = {};
      for (const [k, v] of Object.entries(node)) {
        if (!excluded.has(k)) {
          copy[k] = normalise(v);
        }
      }
      return copy;
    }
    if (typeof node === "number" && precision !== void 0) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return normalise(value);
}
1039
/**
 * Serialises `value` as 2-space indented JSON with a stable key order.
 *
 * The value is first passed through the stable stringifier and then
 * re-parsed and pretty-printed with `JSON.stringify`.
 *
 * @param {*} value - The value to serialise.
 * @returns {string} Always a string. When the stable stringifier produces no
 *   string (it returns `undefined` for `undefined` or function input) the
 *   textual form of that result, e.g. `"undefined"`, is returned so callers
 *   can safely hand the result to a text diff. When the stable output cannot
 *   be re-parsed it is returned as is.
 */
function toPrettyJson(value) {
  const stable = stringify__default.default(value);
  if (typeof stable !== "string") {
    // Nothing to parse; JSON.parse(undefined) would throw and the original
    // fallback would leak a non-string to the caller.
    return String(stable);
  }
  try {
    return JSON.stringify(JSON.parse(stable), null, 2);
  } catch {
    return stable;
  }
}
1048
/**
 * Renders a list of diff parts as plain text, one output line per input line.
 *
 * Lines of parts flagged `added` are prefixed with "+ ", lines of parts
 * flagged `removed` with "- ", and all other lines carry no prefix. The empty
 * segment produced by a part's trailing newline is dropped.
 *
 * @param {Array<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The rendered lines joined with "\n".
 */
function formatDiffParts(parts) {
  return parts
    .flatMap((part) => {
      let marker = "";
      if (part.added) {
        marker = "+ ";
      } else if (part.removed) {
        marker = "- ";
      }
      const segments = part.value.split("\n");
      // Only the final segment is ever discarded, and only when it is empty.
      if (segments[segments.length - 1] === "") {
        segments.pop();
      }
      return segments.map((segment) => marker + segment);
    })
    .join("\n");
}
1007
1062
  function createDiffString(expected, actual, diffOptions) {
1008
- const opts = { ...diffOptions, color: false };
1009
- const result = jsonDiff.diffString(expected, actual, opts);
1010
- return typeof result === "string" ? result : "";
1063
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
1064
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
1065
+ if (diffOptions?.keysOnly) {
1066
+ const expectedKeys = JSON.stringify(
1067
+ extractKeys(expectedProcessed),
1068
+ null,
1069
+ 2
1070
+ );
1071
+ const actualKeys = JSON.stringify(
1072
+ extractKeys(actualProcessed),
1073
+ null,
1074
+ 2
1075
+ );
1076
+ const parts2 = diff.diffLines(expectedKeys, actualKeys);
1077
+ return formatDiffParts(parts2);
1078
+ }
1079
+ const expectedStr = toPrettyJson(expectedProcessed);
1080
+ const actualStr = toPrettyJson(actualProcessed);
1081
+ if (expectedStr === actualStr) {
1082
+ return "";
1083
+ }
1084
+ const parts = diff.diffLines(expectedStr, actualStr);
1085
+ if (diffOptions?.outputNewOnly) {
1086
+ const filtered = parts.filter(
1087
+ (p) => p.added === true
1088
+ );
1089
+ return formatDiffParts(filtered);
1090
+ }
1091
+ return formatDiffParts(parts);
1092
+ }
1093
+ function extractKeys(value) {
1094
+ if (value === null || typeof value !== "object") {
1095
+ return "\xB7";
1096
+ }
1097
+ if (Array.isArray(value)) {
1098
+ return value.map(extractKeys);
1099
+ }
1100
+ const result = {};
1101
+ for (const [k, v] of Object.entries(value)) {
1102
+ result[k] = extractKeys(v);
1103
+ }
1104
+ return result;
1011
1105
  }
1012
1106
  function formatLogMessage(msg) {
1013
1107
  if (typeof msg === "string")
@@ -1349,6 +1443,20 @@ function readOutput(testCase) {
1349
1443
  }
1350
1444
  return candidate.getOutput();
1351
1445
  }
1446
/**
 * Expands test cases into one evaluation unit per rerun.
 *
 * A test case whose `testCase` exposes `getReruns()` yields that many units;
 * any other test case yields exactly one. The declared count is coerced to a
 * number and rounded up, so the `rerunTotal` stored on each unit always
 * equals the number of units actually emitted for that test case. Consumers
 * that wait for `rerunTotal` completions would otherwise never see a
 * fractional or string count reached.
 *
 * @param {Array<{testCase: object}>} testCases - Items to expand.
 * @returns {Array<{testCaseItem: object, rerunIndex: number, rerunTotal: number}>}
 *   Units in input order; `rerunIndex` is 1-based.
 */
function buildEvaluationUnits(testCases) {
  const units = [];
  for (const testCaseItem of testCases) {
    const { testCase } = testCaseItem;
    const declaredReruns = typeof testCase.getReruns === "function" ? testCase.getReruns() : 1;
    // Matches the number of iterations the loop below performs.
    const rerunTotal = Math.ceil(Number(declaredReruns));
    for (let rerunIndex = 1; rerunIndex <= rerunTotal; rerunIndex++) {
      units.push({ testCaseItem, rerunIndex, rerunTotal });
    }
  }
  return units;
}
1352
1460
  function nowIsoForFile() {
1353
1461
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1354
1462
  }
@@ -1358,157 +1466,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1358
1466
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1359
1467
  );
1360
1468
  }
1361
/**
 * Builds the effect that executes one evaluation unit: a single rerun of a
 * single test case, run through every evaluator of the task.
 *
 * The effect, in order:
 * 1. increments `startedRef` and publishes a "TestCaseStarted" event;
 * 2. runs each evaluator that has an evaluate function, collecting scores,
 *    metrics and logs; an evaluator that returns an `Error`, throws or
 *    rejects is recorded as a failed evaluator instead of aborting the unit;
 * 3. increments `completedRef`, updates the snapshot, publishes a
 *    "TestCaseProgress" event and offers it to `persistenceQueue`;
 * 4. records the rerun result in `testCaseResultsRef`; when the recorded
 *    count for the test case reaches `rerunTotal`, the test case is counted
 *    as passed (every rerun passed) or failed and the snapshot totals are
 *    refreshed.
 *
 * @param {object} task - Run task; reads `runId`, `triggerId`, `datasetId`,
 *   `evaluators` and `snapshot.artifactPath`.
 * @param {{testCaseItem: object, rerunIndex: number, rerunTotal: number}} unit
 * @param {number} totalEvaluations - Reported as `totalTestCases` in events.
 * @param {Function} publishEvent - Returns an effect that publishes an event.
 * @param {*} persistenceQueue - Queue receiving the progress payloads.
 * @param {Function} updateSnapshot - `(runId, updater)`, returns an effect.
 * @param {*} startedRef - Ref counting started evaluations.
 * @param {*} completedRef - Ref counting completed evaluations.
 * @param {*} passedRef - Ref counting passed test cases.
 * @param {*} failedRef - Ref counting failed test cases.
 * @param {*} testCaseResultsRef - Ref holding a Map keyed by test case id
 *   with `{ completedCount, results }` entries.
 * @returns {*} The effect described above.
 */
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
  const { testCaseItem, rerunIndex, rerunTotal } = unit;
  return effect.Effect.gen(function* () {
    const evaluatorRunId = `run-${crypto.randomUUID()}`;
    const started = Date.now();
    const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
      n + 1,
      n + 1
    ]);
    yield* publishEvent({
      type: "TestCaseStarted",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      startedTestCases: startedEvaluations,
      totalTestCases: totalEvaluations,
      rerunIndex,
      rerunTotal
    });
    const evaluatorScores = [];
    let testCaseError;
    const output = readOutput(testCaseItem.testCase);
    for (const { id: evaluatorId, evaluator } of task.evaluators) {
      const evaluateFn = evaluator.getEvaluateFn();
      if (!evaluateFn) {
        continue;
      }
      const logs = [];
      const logDiff = (expected, actual, options) => {
        logs.push(createDiffLogEntry(expected, actual, options));
      };
      const log = (message, options) => {
        logs.push(createLogEntry(message, options));
      };
      const createError = (message, options) => {
        const entry = createLogEntry(message, options);
        const error = message instanceof Error ? message : new Error(entry.message);
        error[evaluatorErrorLogEntryKey] = entry;
        return error;
      };
      // Single place that records a failed evaluator, whether the failure
      // was returned, thrown or rejected.
      const recordFailure = (error) => {
        if (error instanceof Error) {
          const taggedEntry = error[evaluatorErrorLogEntryKey];
          logs.push(taggedEntry ?? createLogEntry(error));
          testCaseError = error.message;
        } else {
          testCaseError = "Evaluator execution failed";
        }
        evaluatorScores.push({
          evaluatorId,
          scores: [],
          passed: false,
          logs: logs.length > 0 ? logs : void 0
        });
      };
      // Rejections are turned into values inside the promise chain: a
      // rejected promise handed to Effect.promise surfaces as a defect of
      // the fiber rather than as an exception at the `yield*` site, so a
      // surrounding try/catch would not see it.
      const settled = yield* effect.Effect.promise(
        () => Promise.resolve().then(() => evaluator.resolveContext()).then(
          (ctx) => evaluateFn({
            input: testCaseItem.testCase.getInput(),
            ctx,
            output,
            meta: {
              triggerId: task.triggerId,
              runId: evaluatorRunId,
              datasetId: task.datasetId
            },
            logDiff,
            log,
            createError
          })
        ).then(
          (value) => ({ ok: true, value }),
          (error) => ({ ok: false, error })
        )
      );
      if (!settled.ok) {
        recordFailure(settled.error);
        continue;
      }
      const result = settled.value;
      if (result instanceof Error) {
        recordFailure(result);
        continue;
      }
      try {
        const { scores, metrics } = normalizeResult(result);
        const passed = computeEvaluatorPassed(evaluator, result, scores);
        evaluatorScores.push({
          evaluatorId,
          scores,
          passed,
          metrics,
          logs: logs.length > 0 ? logs : void 0
        });
      } catch (error) {
        recordFailure(error);
      }
    }
    const rerunPassedThis = evaluatorScores.every((s) => s.passed);
    const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
      n + 1,
      n + 1
    ]);
    const progressEvent = {
      type: "TestCaseProgress",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      completedTestCases: completedEvaluations,
      totalTestCases: totalEvaluations,
      rerunIndex,
      rerunTotal,
      passed: rerunPassedThis,
      durationMs: Date.now() - started,
      evaluatorScores,
      output,
      errorMessage: testCaseError
    };
    yield* updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      completedTestCases: completedEvaluations
    }));
    yield* publishEvent(progressEvent);
    yield* effect.Queue.offer(persistenceQueue, {
      runId: task.runId,
      artifactPath: task.snapshot.artifactPath,
      payload: progressEvent
    });
    // Yields `null` until the last rerun of this test case has reported,
    // then the overall verdict (true only when every rerun passed).
    const testCaseCompleted = yield* effect.Ref.modify(
      testCaseResultsRef,
      (map) => {
        const key = testCaseItem.id;
        const existing = map.get(key) ?? { completedCount: 0, results: [] };
        const newResults = [...existing.results, rerunPassedThis];
        const newCompletedCount = existing.completedCount + 1;
        const isLast = newCompletedCount === rerunTotal;
        const newMap = new Map(map);
        newMap.set(key, {
          completedCount: newCompletedCount,
          results: newResults
        });
        const outcome = isLast ? newResults.every(Boolean) : null;
        return [outcome, newMap];
      }
    );
    if (testCaseCompleted !== null) {
      if (testCaseCompleted) {
        yield* effect.Ref.update(passedRef, (n) => n + 1);
      } else {
        yield* effect.Ref.update(failedRef, (n) => n + 1);
      }
      const [passed, failed] = yield* effect.Effect.all([
        effect.Ref.get(passedRef),
        effect.Ref.get(failedRef)
      ]);
      yield* updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        passedTestCases: passed,
        failedTestCases: failed
      }));
    }
  });
}
1509
1631
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
1510
1632
  const startedAt = Date.now();
1511
- updateSnapshot(task.runId, (snapshot) => ({
1633
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1512
1634
  ...snapshot,
1513
1635
  status: "running",
1514
1636
  startedAt
@@ -1527,9 +1649,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1527
1649
  const startedRef = yield* effect.Ref.make(0);
1528
1650
  const passedRef = yield* effect.Ref.make(0);
1529
1651
  const failedRef = yield* effect.Ref.make(0);
1530
- const processTestCase = (testCaseItem) => processOneTestCase(
1652
+ const testCaseResultsRef = yield* effect.Ref.make(
1653
+ /* @__PURE__ */ new Map()
1654
+ );
1655
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
1656
+ const processEvaluation = (unit) => processOneEvaluation(
1531
1657
  task,
1532
- testCaseItem,
1658
+ unit,
1533
1659
  totalEvaluations,
1534
1660
  publishEvent,
1535
1661
  persistenceQueue,
@@ -1537,11 +1663,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1537
1663
  startedRef,
1538
1664
  completedRef,
1539
1665
  passedRef,
1540
- failedRef
1666
+ failedRef,
1667
+ testCaseResultsRef
1541
1668
  );
1542
1669
  yield* effect.Effect.forEach(
1543
- task.testCases,
1544
- processTestCase,
1670
+ evaluationUnits,
1671
+ processEvaluation,
1545
1672
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1546
1673
  );
1547
1674
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
@@ -1559,7 +1686,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1559
1686
  totalTestCases: task.testCases.length,
1560
1687
  artifactPath: task.snapshot.artifactPath
1561
1688
  };
1562
- updateSnapshot(task.runId, (snapshot) => ({
1689
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1563
1690
  ...snapshot,
1564
1691
  status: "completed",
1565
1692
  completedTestCases: completedEvaluations,
@@ -1842,7 +1969,9 @@ var EffectRunner = class {
1842
1969
  this.persistenceQueue = effect.Effect.runSync(
1843
1970
  effect.Queue.unbounded()
1844
1971
  );
1845
- this.snapshots = /* @__PURE__ */ new Map();
1972
+ this.snapshotsRef = effect.Effect.runSync(
1973
+ effect.Ref.make(/* @__PURE__ */ new Map())
1974
+ );
1846
1975
  this.listeners = /* @__PURE__ */ new Set();
1847
1976
  this.datasetsById = /* @__PURE__ */ new Map();
1848
1977
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -1945,7 +2074,13 @@ var EffectRunner = class {
1945
2074
  status: "queued",
1946
2075
  artifactPath
1947
2076
  };
1948
- this.snapshots.set(runId, snapshot);
2077
+ await effect.Effect.runPromise(
2078
+ effect.Ref.update(this.snapshotsRef, (map) => {
2079
+ const next = new Map(map);
2080
+ next.set(runId, snapshot);
2081
+ return next;
2082
+ })
2083
+ );
1949
2084
  const queuedEvent = {
1950
2085
  type: "RunQueued",
1951
2086
  runId,
@@ -1986,12 +2121,12 @@ var EffectRunner = class {
1986
2121
  };
1987
2122
  }
1988
2123
  getRunSnapshot(runId) {
1989
- return this.snapshots.get(runId);
2124
+ return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
1990
2125
  }
1991
2126
  getAllRunSnapshots() {
1992
- return Array.from(this.snapshots.values()).sort(
1993
- (a, b) => b.queuedAt - a.queuedAt
1994
- );
2127
+ return Array.from(
2128
+ effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
2129
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
1995
2130
  }
1996
2131
  async loadRunSnapshotsFromArtifacts() {
1997
2132
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -2020,11 +2155,15 @@ var EffectRunner = class {
2020
2155
  );
2021
2156
  }
2022
2157
  updateSnapshot(runId, updater) {
2023
- const existing = this.snapshots.get(runId);
2024
- if (!existing) {
2025
- return;
2026
- }
2027
- this.snapshots.set(runId, updater(existing));
2158
+ return effect.Ref.modify(this.snapshotsRef, (map) => {
2159
+ const existing = map.get(runId);
2160
+ if (!existing) {
2161
+ return [void 0, map];
2162
+ }
2163
+ const next = new Map(map);
2164
+ next.set(runId, updater(existing));
2165
+ return [void 0, next];
2166
+ }).pipe(effect.Effect.asVoid);
2028
2167
  }
2029
2168
  publishEvent(event) {
2030
2169
  return effect.Effect.sync(() => {