@m4trix/evals 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -169,10 +169,9 @@ declare class Dataset {
169
169
 
170
170
  /**
171
171
  * Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
172
- * @see https://www.npmjs.com/package/json-diff
173
172
  */
174
173
  interface JsonDiffOptions {
175
- /** Include equal sections of the document, not just deltas */
174
+ /** Include equal sections of the document, not just deltas (always true with current implementation) */
176
175
  full?: boolean;
177
176
  /** Sort primitive values in arrays before comparing */
178
177
  sort?: boolean;
package/dist/index.js CHANGED
@@ -1,6 +1,7 @@
1
- import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
1
+ import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
2
2
  export { Schema as S } from 'effect';
3
- import { diffString } from 'json-diff';
3
+ import { diffLines } from 'diff';
4
+ import stringify from 'fast-json-stable-stringify';
4
5
  import { randomUUID } from 'crypto';
5
6
  import { existsSync } from 'fs';
6
7
  import { resolve as resolve$1, relative, join, dirname } from 'path';
@@ -708,10 +709,102 @@ var binaryScore = Score.of({
708
709
  },
709
710
  aggregateValues: Score.aggregate.all
710
711
  });
/**
 * Recursively normalizes a value before it is serialized and diffed.
 *
 * Applied at every depth, including inside arrays regardless of `sort`:
 * - `options.excludeKeys` (array, or comma-separated string) drops matching
 *   object keys;
 * - `options.precision` rounds numbers via `toFixed(precision)`;
 * - `options.sort` orders array items by their stable-stringified form.
 *
 * The input is never mutated; arrays and plain objects are copied.
 *
 * @param {unknown} value - Value to normalize.
 * @param {{ sort?: boolean, excludeKeys?: string[] | string, precision?: number }} [options]
 * @returns {unknown} Normalized copy of `value` (primitives are returned as-is).
 */
function preprocessForDiff(value, options) {
  const rawExcluded = options?.excludeKeys;
  // Parse the exclusion list once per call instead of once per visited object.
  const excludedKeys = rawExcluded
    ? new Set(
        Array.isArray(rawExcluded)
          ? rawExcluded
          : rawExcluded.split(",").map((k) => k.trim())
      )
    : null;
  const precision = options?.precision;
  const shouldSort = Boolean(options?.sort);

  const normalize = (node) => {
    if (Array.isArray(node)) {
      // Always descend into arrays so excludeKeys/precision reach nested items,
      // not only when sorting was requested. Array.from turns holes into
      // undefined, matching what spreading the array would do.
      const items = Array.from(node, (item) => normalize(item));
      if (!shouldSort) {
        return items;
      }
      // Each item is normalized and stringified once, not once per comparison.
      const sortKeys = new Map();
      const sortKeyOf = (item) => {
        if (!sortKeys.has(item)) {
          // String(...) guards against stringify yielding a non-string.
          sortKeys.set(item, String(stringify(item)));
        }
        return sortKeys.get(item);
      };
      return items.sort((a, b) => sortKeyOf(a).localeCompare(sortKeyOf(b)));
    }
    if (node !== null && typeof node === "object") {
      const copy = {};
      for (const [key, child] of Object.entries(node)) {
        if (excludedKeys?.has(key)) {
          continue;
        }
        copy[key] = normalize(child);
      }
      return copy;
    }
    if (typeof node === "number" && precision !== void 0) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return normalize(value);
}
/**
 * Serializes a value as 2-space-indented JSON, using the file's `stringify`
 * import for the initial serialization.
 *
 * Always returns a string so the result can be compared and line-diffed
 * safely: if `stringify` yields a non-string (presumably the case for an
 * `undefined` or function root value — confirm against the library), the
 * result is coerced with `String(...)` instead of being returned as-is.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} Pretty-printed JSON, or the compact/coerced form when the
 *   compact output cannot be re-parsed.
 */
function toPrettyJson(value) {
  const compact = stringify(value);
  if (typeof compact !== "string") {
    return String(compact);
  }
  try {
    // Round-trip through JSON.parse to re-indent the serialized output.
    return JSON.stringify(JSON.parse(compact), null, 2);
  } catch {
    return compact;
  }
}
/**
 * Renders diff parts as newline-joined text.
 *
 * Lines from added parts are prefixed with "+ ", lines from removed parts
 * with "- ", and all other lines are emitted without a prefix. The empty
 * segment produced by a part's trailing newline is dropped.
 *
 * @param {Array<{ value: string, added?: boolean, removed?: boolean }>} parts
 * @returns {string} The formatted diff text.
 */
function formatDiffParts(parts) {
  const rendered = [];
  for (const { added, removed, value } of parts) {
    let marker = "";
    if (added) {
      marker = "+ ";
    } else if (removed) {
      marker = "- ";
    }
    const segments = value.split("\n");
    // A value ending in "\n" yields one empty trailing segment; skip it.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    for (const segment of segments) {
      rendered.push(marker + segment);
    }
  }
  return rendered.join("\n");
}
711
765
  function createDiffString(expected, actual, diffOptions) {
712
- const opts = { ...diffOptions, color: false };
713
- const result = diffString(expected, actual, opts);
714
- return typeof result === "string" ? result : "";
766
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
767
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
768
+ if (diffOptions?.keysOnly) {
769
+ const expectedKeys = JSON.stringify(
770
+ extractKeys(expectedProcessed),
771
+ null,
772
+ 2
773
+ );
774
+ const actualKeys = JSON.stringify(
775
+ extractKeys(actualProcessed),
776
+ null,
777
+ 2
778
+ );
779
+ const parts2 = diffLines(expectedKeys, actualKeys);
780
+ return formatDiffParts(parts2);
781
+ }
782
+ const expectedStr = toPrettyJson(expectedProcessed);
783
+ const actualStr = toPrettyJson(actualProcessed);
784
+ if (expectedStr === actualStr) {
785
+ return "";
786
+ }
787
+ const parts = diffLines(expectedStr, actualStr);
788
+ if (diffOptions?.outputNewOnly) {
789
+ const filtered = parts.filter(
790
+ (p) => p.added === true
791
+ );
792
+ return formatDiffParts(filtered);
793
+ }
794
+ return formatDiffParts(parts);
795
+ }
/**
 * Produces a structure-only copy of a value: every leaf (null or any
 * non-object) is replaced by the placeholder "\xB7", while arrays and
 * objects keep their shape and keys.
 *
 * @param {unknown} value - Value whose key structure is wanted.
 * @returns {unknown} The placeholder, or an array/object of the same shape.
 */
function extractKeys(value) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((item) => extractKeys(item));
  }
  return Object.keys(value).reduce((shape, key) => {
    shape[key] = extractKeys(value[key]);
    return shape;
  }, {});
}
716
809
  function formatLogMessage(msg) {
717
810
  if (typeof msg === "string")
@@ -1097,6 +1190,20 @@ function readOutput(testCase) {
1097
1190
  }
1098
1191
  return candidate.getOutput();
1099
1192
  }
/**
 * Expands each test case item into one unit per rerun.
 *
 * The rerun count comes from `testCase.getReruns()` when that method exists,
 * otherwise it is 1. Units keep the input order: all reruns of the first item,
 * then all reruns of the next, each with a 1-based `rerunIndex`.
 *
 * @param {Iterable<{ testCase: { getReruns?: () => number } }>} testCases
 * @returns {Array<{ testCaseItem: object, rerunIndex: number, rerunTotal: number }>}
 */
function buildEvaluationUnits(testCases) {
  const expandReruns = (testCaseItem) => {
    let rerunTotal = 1;
    if (typeof testCaseItem.testCase.getReruns === "function") {
      rerunTotal = testCaseItem.testCase.getReruns();
    }
    const expanded = [];
    let attempt = 0;
    while (attempt < rerunTotal) {
      attempt += 1;
      expanded.push({ testCaseItem, rerunIndex: attempt, rerunTotal });
    }
    return expanded;
  };

  const units = [];
  for (const item of testCases) {
    units.push(...expandReruns(item));
  }
  return units;
}
1100
1207
  function nowIsoForFile() {
1101
1208
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1102
1209
  }
@@ -1106,157 +1213,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1106
1213
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1107
1214
  );
1108
1215
  }
1109
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1216
+ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1217
+ const { testCaseItem, rerunIndex, rerunTotal } = unit;
1110
1218
  return Effect.gen(function* () {
1111
- const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1112
- const rerunPassed = [];
1113
- for (let r = 0; r < reruns; r++) {
1114
- const evaluatorRunId = `run-${randomUUID()}`;
1115
- const started = Date.now();
1116
- const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1117
- n + 1,
1118
- n + 1
1119
- ]);
1120
- yield* publishEvent({
1121
- type: "TestCaseStarted",
1122
- runId: task.runId,
1123
- testCaseId: testCaseItem.id,
1124
- testCaseName: testCaseItem.testCase.getName(),
1125
- startedTestCases: startedEvaluations,
1126
- totalTestCases: totalEvaluations,
1127
- rerunIndex: r + 1,
1128
- rerunTotal: reruns
1129
- });
1130
- const evaluatorScores = [];
1131
- let testCaseError;
1132
- const output = readOutput(testCaseItem.testCase);
1133
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
1134
- const evaluateFn = evaluator.getEvaluateFn();
1135
- if (!evaluateFn) {
1136
- continue;
1137
- }
1138
- const logs = [];
1139
- const logDiff = (expected, actual, options) => {
1140
- logs.push(createDiffLogEntry(expected, actual, options));
1141
- };
1142
- const log = (message, options) => {
1143
- logs.push(createLogEntry(message, options));
1144
- };
1145
- const createError = (message, options) => {
1146
- const entry = createLogEntry(message, options);
1147
- const error = message instanceof Error ? message : new Error(entry.message);
1148
- error[evaluatorErrorLogEntryKey] = entry;
1149
- return error;
1150
- };
1151
- try {
1152
- const ctx = yield* Effect.promise(
1153
- () => Promise.resolve(evaluator.resolveContext())
1154
- );
1155
- const result = yield* Effect.promise(
1156
- () => Promise.resolve().then(
1157
- () => evaluateFn({
1158
- input: testCaseItem.testCase.getInput(),
1159
- ctx,
1160
- output,
1161
- meta: {
1162
- triggerId: task.triggerId,
1163
- runId: evaluatorRunId,
1164
- datasetId: task.datasetId
1165
- },
1166
- logDiff,
1167
- log,
1168
- createError
1169
- })
1170
- )
1171
- );
1172
- if (result instanceof Error) {
1173
- const evaluatorError = result;
1174
- const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1175
- logs.push(taggedEntry ?? createLogEntry(result));
1176
- testCaseError = result.message;
1177
- evaluatorScores.push({
1178
- evaluatorId,
1179
- scores: [],
1180
- passed: false,
1181
- logs: logs.length > 0 ? logs : void 0
1182
- });
1183
- continue;
1184
- }
1185
- const { scores, metrics } = normalizeResult(result);
1186
- const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1187
- evaluatorScores.push({
1188
- evaluatorId,
1189
- scores,
1190
- passed: passed2,
1191
- metrics,
1192
- logs: logs.length > 0 ? logs : void 0
1193
- });
1194
- } catch (error) {
1195
- if (error instanceof Error) {
1196
- const taggedEntry = error[evaluatorErrorLogEntryKey];
1197
- logs.push(taggedEntry ?? createLogEntry(error));
1198
- }
1199
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1219
+ const evaluatorRunId = `run-${randomUUID()}`;
1220
+ const started = Date.now();
1221
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1222
+ n + 1,
1223
+ n + 1
1224
+ ]);
1225
+ yield* publishEvent({
1226
+ type: "TestCaseStarted",
1227
+ runId: task.runId,
1228
+ testCaseId: testCaseItem.id,
1229
+ testCaseName: testCaseItem.testCase.getName(),
1230
+ startedTestCases: startedEvaluations,
1231
+ totalTestCases: totalEvaluations,
1232
+ rerunIndex,
1233
+ rerunTotal
1234
+ });
1235
+ const evaluatorScores = [];
1236
+ let testCaseError;
1237
+ const output = readOutput(testCaseItem.testCase);
1238
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
1239
+ const evaluateFn = evaluator.getEvaluateFn();
1240
+ if (!evaluateFn) {
1241
+ continue;
1242
+ }
1243
+ const logs = [];
1244
+ const logDiff = (expected, actual, options) => {
1245
+ logs.push(createDiffLogEntry(expected, actual, options));
1246
+ };
1247
+ const log = (message, options) => {
1248
+ logs.push(createLogEntry(message, options));
1249
+ };
1250
+ const createError = (message, options) => {
1251
+ const entry = createLogEntry(message, options);
1252
+ const error = message instanceof Error ? message : new Error(entry.message);
1253
+ error[evaluatorErrorLogEntryKey] = entry;
1254
+ return error;
1255
+ };
1256
+ try {
1257
+ const ctx = yield* Effect.promise(
1258
+ () => Promise.resolve(evaluator.resolveContext())
1259
+ );
1260
+ const result = yield* Effect.promise(
1261
+ () => Promise.resolve().then(
1262
+ () => evaluateFn({
1263
+ input: testCaseItem.testCase.getInput(),
1264
+ ctx,
1265
+ output,
1266
+ meta: {
1267
+ triggerId: task.triggerId,
1268
+ runId: evaluatorRunId,
1269
+ datasetId: task.datasetId
1270
+ },
1271
+ logDiff,
1272
+ log,
1273
+ createError
1274
+ })
1275
+ )
1276
+ );
1277
+ if (result instanceof Error) {
1278
+ const evaluatorError = result;
1279
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1280
+ logs.push(taggedEntry ?? createLogEntry(result));
1281
+ testCaseError = result.message;
1200
1282
  evaluatorScores.push({
1201
1283
  evaluatorId,
1202
1284
  scores: [],
1203
1285
  passed: false,
1204
1286
  logs: logs.length > 0 ? logs : void 0
1205
1287
  });
1288
+ continue;
1289
+ }
1290
+ const { scores, metrics } = normalizeResult(result);
1291
+ const passed = computeEvaluatorPassed(evaluator, result, scores);
1292
+ evaluatorScores.push({
1293
+ evaluatorId,
1294
+ scores,
1295
+ passed,
1296
+ metrics,
1297
+ logs: logs.length > 0 ? logs : void 0
1298
+ });
1299
+ } catch (error) {
1300
+ if (error instanceof Error) {
1301
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1302
+ logs.push(taggedEntry ?? createLogEntry(error));
1206
1303
  }
1304
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1305
+ evaluatorScores.push({
1306
+ evaluatorId,
1307
+ scores: [],
1308
+ passed: false,
1309
+ logs: logs.length > 0 ? logs : void 0
1310
+ });
1207
1311
  }
1208
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1209
- rerunPassed.push(rerunPassedThis);
1210
- const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1211
- n + 1,
1212
- n + 1
1213
- ]);
1214
- const progressEvent = {
1215
- type: "TestCaseProgress",
1216
- runId: task.runId,
1217
- testCaseId: testCaseItem.id,
1218
- testCaseName: testCaseItem.testCase.getName(),
1219
- completedTestCases: completedEvaluations,
1220
- totalTestCases: totalEvaluations,
1221
- rerunIndex: r + 1,
1222
- rerunTotal: reruns,
1223
- passed: rerunPassedThis,
1224
- durationMs: Date.now() - started,
1225
- evaluatorScores,
1226
- output,
1227
- errorMessage: testCaseError
1228
- };
1229
- updateSnapshot(task.runId, (snapshot) => ({
1230
- ...snapshot,
1231
- completedTestCases: completedEvaluations
1232
- }));
1233
- yield* publishEvent(progressEvent);
1234
- yield* Queue.offer(persistenceQueue, {
1235
- runId: task.runId,
1236
- artifactPath: task.snapshot.artifactPath,
1237
- payload: progressEvent
1238
- });
1239
- }
1240
- const testCasePassed = rerunPassed.every(Boolean);
1241
- if (testCasePassed) {
1242
- yield* Ref.update(passedRef, (n) => n + 1);
1243
- } else {
1244
- yield* Ref.update(failedRef, (n) => n + 1);
1245
1312
  }
1246
- const [passed, failed] = yield* Effect.all([
1247
- Ref.get(passedRef),
1248
- Ref.get(failedRef)
1313
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1314
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1315
+ n + 1,
1316
+ n + 1
1249
1317
  ]);
1250
- updateSnapshot(task.runId, (snapshot) => ({
1318
+ const progressEvent = {
1319
+ type: "TestCaseProgress",
1320
+ runId: task.runId,
1321
+ testCaseId: testCaseItem.id,
1322
+ testCaseName: testCaseItem.testCase.getName(),
1323
+ completedTestCases: completedEvaluations,
1324
+ totalTestCases: totalEvaluations,
1325
+ rerunIndex,
1326
+ rerunTotal,
1327
+ passed: rerunPassedThis,
1328
+ durationMs: Date.now() - started,
1329
+ evaluatorScores,
1330
+ output,
1331
+ errorMessage: testCaseError
1332
+ };
1333
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1251
1334
  ...snapshot,
1252
- passedTestCases: passed,
1253
- failedTestCases: failed
1335
+ completedTestCases: completedEvaluations
1254
1336
  }));
1337
+ yield* publishEvent(progressEvent);
1338
+ yield* Queue.offer(persistenceQueue, {
1339
+ runId: task.runId,
1340
+ artifactPath: task.snapshot.artifactPath,
1341
+ payload: progressEvent
1342
+ });
1343
+ const testCaseCompleted = yield* Ref.modify(
1344
+ testCaseResultsRef,
1345
+ (map) => {
1346
+ const key = testCaseItem.id;
1347
+ const existing = map.get(key) ?? { completedCount: 0, results: [] };
1348
+ const newResults = [...existing.results, rerunPassedThis];
1349
+ const newCompletedCount = existing.completedCount + 1;
1350
+ const isLast = newCompletedCount === rerunTotal;
1351
+ const newMap = new Map(map);
1352
+ newMap.set(key, {
1353
+ completedCount: newCompletedCount,
1354
+ results: newResults
1355
+ });
1356
+ const outcome = isLast ? newResults.every(Boolean) : null;
1357
+ return [outcome, newMap];
1358
+ }
1359
+ );
1360
+ if (testCaseCompleted !== null) {
1361
+ if (testCaseCompleted) {
1362
+ yield* Ref.update(passedRef, (n) => n + 1);
1363
+ } else {
1364
+ yield* Ref.update(failedRef, (n) => n + 1);
1365
+ }
1366
+ const [passed, failed] = yield* Effect.all([
1367
+ Ref.get(passedRef),
1368
+ Ref.get(failedRef)
1369
+ ]);
1370
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1371
+ ...snapshot,
1372
+ passedTestCases: passed,
1373
+ failedTestCases: failed
1374
+ }));
1375
+ }
1255
1376
  });
1256
1377
  }
1257
1378
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
1258
1379
  const startedAt = Date.now();
1259
- updateSnapshot(task.runId, (snapshot) => ({
1380
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1260
1381
  ...snapshot,
1261
1382
  status: "running",
1262
1383
  startedAt
@@ -1275,9 +1396,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1275
1396
  const startedRef = yield* Ref.make(0);
1276
1397
  const passedRef = yield* Ref.make(0);
1277
1398
  const failedRef = yield* Ref.make(0);
1278
- const processTestCase = (testCaseItem) => processOneTestCase(
1399
+ const testCaseResultsRef = yield* Ref.make(
1400
+ /* @__PURE__ */ new Map()
1401
+ );
1402
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
1403
+ const processEvaluation = (unit) => processOneEvaluation(
1279
1404
  task,
1280
- testCaseItem,
1405
+ unit,
1281
1406
  totalEvaluations,
1282
1407
  publishEvent,
1283
1408
  persistenceQueue,
@@ -1285,11 +1410,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1285
1410
  startedRef,
1286
1411
  completedRef,
1287
1412
  passedRef,
1288
- failedRef
1413
+ failedRef,
1414
+ testCaseResultsRef
1289
1415
  );
1290
1416
  yield* Effect.forEach(
1291
- task.testCases,
1292
- processTestCase,
1417
+ evaluationUnits,
1418
+ processEvaluation,
1293
1419
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1294
1420
  );
1295
1421
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
@@ -1307,7 +1433,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1307
1433
  totalTestCases: task.testCases.length,
1308
1434
  artifactPath: task.snapshot.artifactPath
1309
1435
  };
1310
- updateSnapshot(task.runId, (snapshot) => ({
1436
+ yield* updateSnapshot(task.runId, (snapshot) => ({
1311
1437
  ...snapshot,
1312
1438
  status: "completed",
1313
1439
  completedTestCases: completedEvaluations,
@@ -1560,7 +1686,9 @@ var EffectRunner = class {
1560
1686
  this.persistenceQueue = Effect.runSync(
1561
1687
  Queue.unbounded()
1562
1688
  );
1563
- this.snapshots = /* @__PURE__ */ new Map();
1689
+ this.snapshotsRef = Effect.runSync(
1690
+ Ref.make(/* @__PURE__ */ new Map())
1691
+ );
1564
1692
  this.listeners = /* @__PURE__ */ new Set();
1565
1693
  this.datasetsById = /* @__PURE__ */ new Map();
1566
1694
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -1663,7 +1791,13 @@ var EffectRunner = class {
1663
1791
  status: "queued",
1664
1792
  artifactPath
1665
1793
  };
1666
- this.snapshots.set(runId, snapshot);
1794
+ await Effect.runPromise(
1795
+ Ref.update(this.snapshotsRef, (map) => {
1796
+ const next = new Map(map);
1797
+ next.set(runId, snapshot);
1798
+ return next;
1799
+ })
1800
+ );
1667
1801
  const queuedEvent = {
1668
1802
  type: "RunQueued",
1669
1803
  runId,
@@ -1704,12 +1838,12 @@ var EffectRunner = class {
1704
1838
  };
1705
1839
  }
1706
1840
  getRunSnapshot(runId) {
1707
- return this.snapshots.get(runId);
1841
+ return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
1708
1842
  }
1709
1843
  getAllRunSnapshots() {
1710
- return Array.from(this.snapshots.values()).sort(
1711
- (a, b) => b.queuedAt - a.queuedAt
1712
- );
1844
+ return Array.from(
1845
+ Effect.runSync(Ref.get(this.snapshotsRef)).values()
1846
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
1713
1847
  }
1714
1848
  async loadRunSnapshotsFromArtifacts() {
1715
1849
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1738,11 +1872,15 @@ var EffectRunner = class {
1738
1872
  );
1739
1873
  }
1740
1874
  updateSnapshot(runId, updater) {
1741
- const existing = this.snapshots.get(runId);
1742
- if (!existing) {
1743
- return;
1744
- }
1745
- this.snapshots.set(runId, updater(existing));
1875
+ return Ref.modify(this.snapshotsRef, (map) => {
1876
+ const existing = map.get(runId);
1877
+ if (!existing) {
1878
+ return [void 0, map];
1879
+ }
1880
+ const next = new Map(map);
1881
+ next.set(runId, updater(existing));
1882
+ return [void 0, next];
1883
+ }).pipe(Effect.asVoid);
1746
1884
  }
1747
1885
  publishEvent(event) {
1748
1886
  return Effect.sync(() => {