@m4trix/evals 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1,7 +1,8 @@
1
1
  'use strict';
2
2
 
3
3
  var effect = require('effect');
4
- var jsonDiff = require('json-diff');
4
+ var diff = require('diff');
5
+ var stringify = require('fast-json-stable-stringify');
5
6
  var crypto = require('crypto');
6
7
  var fs = require('fs');
7
8
  var path = require('path');
@@ -10,6 +11,8 @@ var promises = require('fs/promises');
10
11
  var url = require('url');
11
12
 
12
13
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
14
/**
 * Makes a required module readable through a `default` property.
 * Objects already flagged with `__esModule` are handed back untouched;
 * anything else (including falsy values) is wrapped as `{ default: e }`.
 */
function _interopDefault (e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
15
+
13
16
  function _interopNamespace(e) {
14
17
  if (e && e.__esModule) return e;
15
18
  var n = Object.create(null);
@@ -28,6 +31,7 @@ function _interopNamespace(e) {
28
31
  return Object.freeze(n);
29
32
  }
30
33
 
34
+ var stringify__default = /*#__PURE__*/_interopDefault(stringify);
31
35
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
32
36
 
33
37
  // src/cli/data.mock.json
@@ -730,10 +734,102 @@ var binaryScore = Score.of({
730
734
  },
731
735
  aggregateValues: Score.aggregate.all
732
736
  });
737
/**
 * Builds a normalized copy of `value` for diffing.
 *
 * Recognized options (all optional):
 * - `excludeKeys`: array of key names, or a comma-separated string, removed
 *   from every object at any depth.
 * - `precision`: number of decimals every number is rounded to.
 * - `sort`: when truthy, every array is ordered by the stable JSON form of
 *   its (already normalized) elements.
 *
 * Arrays are always traversed, so `excludeKeys` and `precision` also apply to
 * elements of arrays when `sort` is not set.
 *
 * @param {*} value - Value to normalize; it is not mutated.
 * @param {object} [options] - Diff options as described above.
 * @returns {*} The normalized copy (primitives are returned as-is).
 */
function preprocessForDiff(value, options) {
  const rawExcludeKeys = options?.excludeKeys;
  // Normalize the exclusion list once instead of once per visited object.
  const excluded = rawExcludeKeys
    ? new Set(
        Array.isArray(rawExcludeKeys)
          ? rawExcludeKeys
          : rawExcludeKeys.split(",").map((k) => k.trim())
      )
    : null;
  const precision = options?.precision;
  const shouldSort = Boolean(options?.sort);

  // Serialize each element once, then sort on the cached key.
  const sortBySerializedForm = (items) =>
    items
      .map((item) => ({ item, key: stringify__default.default(item) ?? "" }))
      .sort((a, b) => {
        // Keep `undefined` elements last, as a plain Array#sort would.
        if (a.item === void 0) {
          return b.item === void 0 ? 0 : 1;
        }
        if (b.item === void 0) {
          return -1;
        }
        return a.key.localeCompare(b.key);
      })
      .map(({ item }) => item);

  const visit = (node) => {
    if (Array.isArray(node)) {
      const items = Array.from(node, (item) => visit(item));
      return shouldSort ? sortBySerializedForm(items) : items;
    }
    if (node !== null && typeof node === "object") {
      // Object.fromEntries defines own properties, so a key literally named
      // "__proto__" is kept as data instead of replacing the prototype.
      return Object.fromEntries(
        Object.entries(node)
          .filter(([key]) => !excluded?.has(key))
          .map(([key, child]) => [key, visit(child)])
      );
    }
    if (typeof node === "number" && precision !== void 0) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return visit(value);
}
767
/**
 * Serializes `value` as two-space indented JSON.
 *
 * The value is first passed through the stable stringifier and the result is
 * re-parsed and pretty-printed with `JSON.stringify`.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} Always a string. When the stringifier yields no string
 *   (for example for `undefined`), the literal text "undefined" is returned so
 *   callers can compare and diff the result safely.
 */
function toPrettyJson(value) {
  const str = stringify__default.default(value);
  if (typeof str !== "string") {
    return "undefined";
  }
  try {
    return JSON.stringify(JSON.parse(str), null, 2);
  } catch {
    // Fall back to the compact form if it cannot be re-parsed.
    return str;
  }
}
776
/**
 * Renders diff parts as text, one output line per source line.
 *
 * @param {Iterable<{value: string, added?: boolean, removed?: boolean}>} parts
 *   Parts to render, in order.
 * @param {{added?: string, removed?: string, unchanged?: string}} [prefixes]
 *   Optional line prefixes. Defaults: "+ " for added parts, "- " for removed
 *   parts and "" for unchanged parts.
 * @returns {string} The prefixed lines joined with "\n" ("" when there are none).
 */
function formatDiffParts(parts, prefixes) {
  const { added = "+ ", removed = "- ", unchanged = "" } = prefixes ?? {};
  const lines = [];
  for (const part of parts) {
    const prefix = part.added ? added : part.removed ? removed : unchanged;
    const partLines = part.value.split("\n");
    // A value ending in "\n" splits into a trailing "" that is not a real line.
    if (partLines[partLines.length - 1] === "") {
      partLines.pop();
    }
    for (const line of partLines) {
      lines.push(prefix + line);
    }
  }
  return lines.join("\n");
}
733
790
  function createDiffString(expected, actual, diffOptions) {
734
- const opts = { ...diffOptions, color: false };
735
- const result = jsonDiff.diffString(expected, actual, opts);
736
- return typeof result === "string" ? result : "";
791
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
792
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
793
+ if (diffOptions?.keysOnly) {
794
+ const expectedKeys = JSON.stringify(
795
+ extractKeys(expectedProcessed),
796
+ null,
797
+ 2
798
+ );
799
+ const actualKeys = JSON.stringify(
800
+ extractKeys(actualProcessed),
801
+ null,
802
+ 2
803
+ );
804
+ const parts2 = diff.diffLines(expectedKeys, actualKeys);
805
+ return formatDiffParts(parts2);
806
+ }
807
+ const expectedStr = toPrettyJson(expectedProcessed);
808
+ const actualStr = toPrettyJson(actualProcessed);
809
+ if (expectedStr === actualStr) {
810
+ return "";
811
+ }
812
+ const parts = diff.diffLines(expectedStr, actualStr);
813
+ if (diffOptions?.outputNewOnly) {
814
+ const filtered = parts.filter(
815
+ (p) => p.added === true
816
+ );
817
+ return formatDiffParts(filtered);
818
+ }
819
+ return formatDiffParts(parts);
820
+ }
821
/**
 * Reduces a value to its key structure.
 *
 * Every non-object leaf (including `null` and `undefined`) becomes the
 * placeholder "\xB7"; arrays and objects keep their shape.
 *
 * @param {*} value - Value to reduce.
 * @returns {*} A structure of the same shape whose leaves are all "\xB7".
 */
function extractKeys(value) {
  if (value === null || typeof value !== "object") {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((item) => extractKeys(item));
  }
  // Object.fromEntries defines own properties, so a key literally named
  // "__proto__" is kept instead of being swallowed by the prototype setter.
  return Object.fromEntries(
    Object.entries(value).map(([k, v]) => [k, extractKeys(v)])
  );
}
738
834
  function formatLogMessage(msg) {
739
835
  if (typeof msg === "string")
@@ -1119,6 +1215,20 @@ function readOutput(testCase) {
1119
1215
  }
1120
1216
  return candidate.getOutput();
1121
1217
  }
1218
/**
 * Expands test cases into one evaluation unit per rerun.
 *
 * The rerun count comes from `testCase.getReruns()` when that method exists
 * and is 1 otherwise. Each unit carries its test case item, a 1-based
 * `rerunIndex` and the `rerunTotal` of that test case.
 *
 * @param {Iterable<{testCase: object}>} testCases - Test case items to expand.
 * @returns {Array<{testCaseItem: object, rerunIndex: number, rerunTotal: number}>}
 *   Units in test-case order, reruns of one test case adjacent.
 */
function buildEvaluationUnits(testCases) {
  const rerunsOf = ({ testCase }) =>
    typeof testCase.getReruns === "function" ? testCase.getReruns() : 1;

  const units = [];
  for (const testCaseItem of testCases) {
    const rerunTotal = rerunsOf(testCaseItem);
    let attempt = 0;
    while (attempt < rerunTotal) {
      attempt += 1;
      units.push({ testCaseItem, rerunIndex: attempt, rerunTotal });
    }
  }
  return units;
}
1122
1232
  function nowIsoForFile() {
1123
1233
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1124
1234
  }
@@ -1128,157 +1238,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1128
1238
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1129
1239
  );
1130
1240
  }
1131
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1241
/**
 * Builds the Effect that processes one evaluation unit, i.e. one rerun of one
 * test case.
 *
 * When run, the effect:
 * 1. increments the started counter and publishes "TestCaseStarted";
 * 2. calls every evaluator of the task that has an evaluate function and
 *    collects scores, metrics and logs (a thrown, rejected or returned Error
 *    is recorded as a failed evaluator, not propagated);
 * 3. increments the completed counter, updates the run snapshot, publishes
 *    "TestCaseProgress" and offers the same event to the persistence queue;
 * 4. records the rerun outcome per test case id and, once `rerunTotal`
 *    outcomes have been recorded for that id, bumps the passed or failed
 *    counter and writes both counters to the snapshot.
 *
 * @param task - Run task; `runId`, `triggerId`, `datasetId`, `evaluators` and
 *   `snapshot.artifactPath` are read here.
 * @param unit - `{ testCaseItem, rerunIndex, rerunTotal }`.
 * @param totalEvaluations - Reported as `totalTestCases` in both events.
 * @param publishEvent - Function returning an effect that publishes an event.
 * @param persistenceQueue - Queue receiving `{ runId, artifactPath, payload }`.
 * @param updateSnapshot - `(runId, updater)` returning an effect.
 * @param startedRef - Ref counting started evaluations.
 * @param completedRef - Ref counting completed evaluations.
 * @param passedRef - Ref counting test cases whose reruns all passed.
 * @param failedRef - Ref counting test cases with at least one failed rerun.
 * @param testCaseResultsRef - Ref holding a Map from test case id to
 *   `{ completedCount, results }`.
 * @returns The effect described above.
 */
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
  const { testCaseItem, rerunIndex, rerunTotal } = unit;
  return effect.Effect.gen(function* () {
    const evaluatorRunId = `run-${crypto.randomUUID()}`;
    const started = Date.now();
    const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
      n + 1,
      n + 1
    ]);
    yield* publishEvent({
      type: "TestCaseStarted",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      startedTestCases: startedEvaluations,
      totalTestCases: totalEvaluations,
      rerunIndex,
      rerunTotal
    });
    const evaluatorScores = [];
    let testCaseError;
    const output = readOutput(testCaseItem.testCase);
    for (const { id: evaluatorId, evaluator } of task.evaluators) {
      const evaluateFn = evaluator.getEvaluateFn();
      if (!evaluateFn) {
        continue;
      }
      const logs = [];
      const logDiff = (expected, actual, options) => {
        logs.push(createDiffLogEntry(expected, actual, options));
      };
      const log = (message, options) => {
        logs.push(createLogEntry(message, options));
      };
      const createError = (message, options) => {
        const entry = createLogEntry(message, options);
        const error = message instanceof Error ? message : new Error(entry.message);
        // Tag the error so the exact log entry can be recovered later.
        error[evaluatorErrorLogEntryKey] = entry;
        return error;
      };
      try {
        // Failures are captured inside the promise and re-thrown below in the
        // generator body, so the catch clause sees a throwing or rejecting
        // evaluator regardless of how Effect surfaces a rejected promise.
        const settled = yield* effect.Effect.promise(async () => {
          try {
            const ctx = await evaluator.resolveContext();
            const value = await evaluateFn({
              input: testCaseItem.testCase.getInput(),
              ctx,
              output,
              meta: {
                triggerId: task.triggerId,
                runId: evaluatorRunId,
                datasetId: task.datasetId
              },
              logDiff,
              log,
              createError
            });
            return { ok: true, value };
          } catch (error) {
            return { ok: false, error };
          }
        });
        if (!settled.ok) {
          throw settled.error;
        }
        const result = settled.value;
        if (result instanceof Error) {
          // An evaluator may return (rather than throw) an Error to signal failure.
          const taggedEntry = result[evaluatorErrorLogEntryKey];
          logs.push(taggedEntry ?? createLogEntry(result));
          testCaseError = result.message;
          evaluatorScores.push({
            evaluatorId,
            scores: [],
            passed: false,
            logs: logs.length > 0 ? logs : void 0
          });
          continue;
        }
        const { scores, metrics } = normalizeResult(result);
        const evaluatorPassed = computeEvaluatorPassed(evaluator, result, scores);
        evaluatorScores.push({
          evaluatorId,
          scores,
          passed: evaluatorPassed,
          metrics,
          logs: logs.length > 0 ? logs : void 0
        });
      } catch (error) {
        if (error instanceof Error) {
          const taggedEntry = error[evaluatorErrorLogEntryKey];
          logs.push(taggedEntry ?? createLogEntry(error));
        }
        testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
        evaluatorScores.push({
          evaluatorId,
          scores: [],
          passed: false,
          logs: logs.length > 0 ? logs : void 0
        });
      }
    }
    const rerunPassedThis = evaluatorScores.every((s) => s.passed);
    const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
      n + 1,
      n + 1
    ]);
    const progressEvent = {
      type: "TestCaseProgress",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      completedTestCases: completedEvaluations,
      totalTestCases: totalEvaluations,
      rerunIndex,
      rerunTotal,
      passed: rerunPassedThis,
      durationMs: Date.now() - started,
      evaluatorScores,
      output,
      errorMessage: testCaseError
    };
    yield* updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      completedTestCases: completedEvaluations
    }));
    yield* publishEvent(progressEvent);
    yield* effect.Queue.offer(persistenceQueue, {
      runId: task.runId,
      artifactPath: task.snapshot.artifactPath,
      payload: progressEvent
    });
    // Record this rerun's outcome; the result is null until the last rerun of
    // the test case reports, then true/false for the test case as a whole.
    const testCaseCompleted = yield* effect.Ref.modify(
      testCaseResultsRef,
      (map) => {
        const key = testCaseItem.id;
        const existing = map.get(key) ?? { completedCount: 0, results: [] };
        const newResults = [...existing.results, rerunPassedThis];
        const newCompletedCount = existing.completedCount + 1;
        const isLast = newCompletedCount === rerunTotal;
        const newMap = new Map(map);
        newMap.set(key, {
          completedCount: newCompletedCount,
          results: newResults
        });
        const outcome = isLast ? newResults.every(Boolean) : null;
        return [outcome, newMap];
      }
    );
    if (testCaseCompleted !== null) {
      if (testCaseCompleted) {
        yield* effect.Ref.update(passedRef, (n) => n + 1);
      } else {
        yield* effect.Ref.update(failedRef, (n) => n + 1);
      }
      const [passed, failed] = yield* effect.Effect.all([
        effect.Ref.get(passedRef),
        effect.Ref.get(failedRef)
      ]);
      yield* updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        passedTestCases: passed,
        failedTestCases: failed
      }));
    }
  });
}
1279
1403
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
1280
1404
  const startedAt = Date.now();
1281
- updateSnapshot(task.runId, (snapshot) => ({
1405
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1282
1406
  ...snapshot,
1283
1407
  status: "running",
1284
1408
  startedAt
@@ -1297,9 +1421,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1297
1421
  const startedRef = yield* effect.Ref.make(0);
1298
1422
  const passedRef = yield* effect.Ref.make(0);
1299
1423
  const failedRef = yield* effect.Ref.make(0);
1300
- const processTestCase = (testCaseItem) => processOneTestCase(
1424
+ const testCaseResultsRef = yield* effect.Ref.make(
1425
+ /* @__PURE__ */ new Map()
1426
+ );
1427
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
1428
+ const processEvaluation = (unit) => processOneEvaluation(
1301
1429
  task,
1302
- testCaseItem,
1430
+ unit,
1303
1431
  totalEvaluations,
1304
1432
  publishEvent,
1305
1433
  persistenceQueue,
@@ -1307,11 +1435,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1307
1435
  startedRef,
1308
1436
  completedRef,
1309
1437
  passedRef,
1310
- failedRef
1438
+ failedRef,
1439
+ testCaseResultsRef
1311
1440
  );
1312
1441
  yield* effect.Effect.forEach(
1313
- task.testCases,
1314
- processTestCase,
1442
+ evaluationUnits,
1443
+ processEvaluation,
1315
1444
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1316
1445
  );
1317
1446
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
@@ -1329,7 +1458,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1329
1458
  totalTestCases: task.testCases.length,
1330
1459
  artifactPath: task.snapshot.artifactPath
1331
1460
  };
1332
- updateSnapshot(task.runId, (snapshot) => ({
1461
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1333
1462
  ...snapshot,
1334
1463
  status: "completed",
1335
1464
  completedTestCases: completedEvaluations,
@@ -1582,7 +1711,9 @@ var EffectRunner = class {
1582
1711
  this.persistenceQueue = effect.Effect.runSync(
1583
1712
  effect.Queue.unbounded()
1584
1713
  );
1585
- this.snapshots = /* @__PURE__ */ new Map();
1714
+ this.snapshotsRef = effect.Effect.runSync(
1715
+ effect.Ref.make(/* @__PURE__ */ new Map())
1716
+ );
1586
1717
  this.listeners = /* @__PURE__ */ new Set();
1587
1718
  this.datasetsById = /* @__PURE__ */ new Map();
1588
1719
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -1685,7 +1816,13 @@ var EffectRunner = class {
1685
1816
  status: "queued",
1686
1817
  artifactPath
1687
1818
  };
1688
- this.snapshots.set(runId, snapshot);
1819
+ await effect.Effect.runPromise(
1820
+ effect.Ref.update(this.snapshotsRef, (map) => {
1821
+ const next = new Map(map);
1822
+ next.set(runId, snapshot);
1823
+ return next;
1824
+ })
1825
+ );
1689
1826
  const queuedEvent = {
1690
1827
  type: "RunQueued",
1691
1828
  runId,
@@ -1726,12 +1863,12 @@ var EffectRunner = class {
1726
1863
  };
1727
1864
  }
1728
1865
  getRunSnapshot(runId) {
1729
- return this.snapshots.get(runId);
1866
+ return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
1730
1867
  }
1731
1868
  getAllRunSnapshots() {
1732
- return Array.from(this.snapshots.values()).sort(
1733
- (a, b) => b.queuedAt - a.queuedAt
1734
- );
1869
+ return Array.from(
1870
+ effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
1871
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
1735
1872
  }
1736
1873
  async loadRunSnapshotsFromArtifacts() {
1737
1874
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1760,11 +1897,15 @@ var EffectRunner = class {
1760
1897
  );
1761
1898
  }
1762
1899
  updateSnapshot(runId, updater) {
1763
- const existing = this.snapshots.get(runId);
1764
- if (!existing) {
1765
- return;
1766
- }
1767
- this.snapshots.set(runId, updater(existing));
1900
+ return effect.Ref.modify(this.snapshotsRef, (map) => {
1901
+ const existing = map.get(runId);
1902
+ if (!existing) {
1903
+ return [void 0, map];
1904
+ }
1905
+ const next = new Map(map);
1906
+ next.set(runId, updater(existing));
1907
+ return [void 0, next];
1908
+ }).pipe(effect.Effect.asVoid);
1768
1909
  }
1769
1910
  publishEvent(event) {
1770
1911
  return effect.Effect.sync(() => {