@m4trix/evals 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -279,11 +279,17 @@ function toEvaluatorOption(item) {
279
279
  };
280
280
  }
281
281
  async function loadRunnerData(runner) {
282
- const [datasets, evaluators] = await Promise.all([
282
+ const [datasets, evaluators, diskSnapshots] = await Promise.all([
283
283
  runner.collectDatasets(),
284
- runner.collectEvaluators()
284
+ runner.collectEvaluators(),
285
+ runner.loadRunSnapshotsFromArtifacts()
285
286
  ]);
286
- const snapshots = runner.getAllRunSnapshots();
287
+ const memSnapshots = runner.getAllRunSnapshots();
288
+ const seen = new Set(memSnapshots.map((s) => s.runId));
289
+ const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
290
+ const snapshots = [...memSnapshots, ...fromDisk].sort(
291
+ (a, b) => b.queuedAt - a.queuedAt
292
+ );
287
293
  if (datasets.length === 0 && evaluators.length === 0) {
288
294
  return loadMockData();
289
295
  }
@@ -576,6 +582,16 @@ var binaryScore = Score.of({
576
582
  displayStrategy: "passFail",
577
583
  format: (data) => data.passed ? "PASSED" : "NOT PASSED"
578
584
  });
585
+ function createDiffLogEntry(expected, actual, options) {
586
+ const diff = jsonDiff.diffString(expected, actual, { color: false });
587
+ return {
588
+ type: "diff",
589
+ label: options?.label,
590
+ expected,
591
+ actual,
592
+ diff: diff || "(no differences)"
593
+ };
594
+ }
579
595
  function printJsonDiff(expected, actual, options = {}) {
580
596
  const { color = true } = options;
581
597
  const diff = jsonDiff.diffString(expected, actual, { color });
@@ -937,6 +953,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
937
953
  continue;
938
954
  }
939
955
  try {
956
+ const logs = [];
957
+ const logDiff = (expected, actual, options) => {
958
+ logs.push(createDiffLogEntry(expected, actual, options));
959
+ };
940
960
  const ctx = yield* effect.Effect.promise(
941
961
  () => Promise.resolve(evaluator.resolveContext())
942
962
  );
@@ -945,13 +965,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
945
965
  evaluateFn({
946
966
  input: testCaseItem.testCase.getInput(),
947
967
  ctx,
948
- output
968
+ output,
969
+ logDiff
949
970
  })
950
971
  )
951
972
  );
952
973
  const { scores, metrics } = normalizeResult(result);
953
974
  const passed = computeEvaluatorPassed(evaluator, result, scores);
954
- evaluatorScores.push({ evaluatorId, scores, passed, metrics });
975
+ evaluatorScores.push({
976
+ evaluatorId,
977
+ scores,
978
+ passed,
979
+ metrics,
980
+ logs: logs.length > 0 ? logs : void 0
981
+ });
955
982
  } catch (error) {
956
983
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
957
984
  evaluatorScores.push({
@@ -1024,6 +1051,120 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1024
1051
  artifactPath: task.snapshot.artifactPath
1025
1052
  });
1026
1053
  });
1054
+ async function loadRunSnapshotsFromArtifacts(config) {
1055
+ const baseDir = path.resolve(config.artifactDirectory);
1056
+ let entries;
1057
+ try {
1058
+ entries = await promises.readdir(baseDir);
1059
+ } catch {
1060
+ return [];
1061
+ }
1062
+ const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1063
+ const snapshots = [];
1064
+ for (const fileName of jsonlFiles) {
1065
+ const filePath = path.join(baseDir, fileName);
1066
+ try {
1067
+ const snapshot = await parseArtifactToSnapshot(filePath, config);
1068
+ if (snapshot) {
1069
+ snapshots.push(snapshot);
1070
+ }
1071
+ } catch {
1072
+ }
1073
+ }
1074
+ return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1075
+ }
1076
+ async function parseArtifactToSnapshot(filePath, _config) {
1077
+ const content = await promises.readFile(filePath, "utf8");
1078
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
1079
+ if (lines.length === 0) {
1080
+ return null;
1081
+ }
1082
+ let runQueued = null;
1083
+ let runCompleted = null;
1084
+ let runFailed = null;
1085
+ let runStarted = null;
1086
+ for (const line of lines) {
1087
+ try {
1088
+ const event = JSON.parse(line);
1089
+ const type = event.type;
1090
+ if (type === "RunQueued") {
1091
+ runQueued = {
1092
+ runId: event.runId,
1093
+ datasetId: event.datasetId,
1094
+ datasetName: event.datasetName,
1095
+ evaluatorIds: event.evaluatorIds,
1096
+ totalTestCases: event.totalTestCases ?? 0,
1097
+ artifactPath: event.artifactPath ?? filePath,
1098
+ ts: event.ts
1099
+ };
1100
+ }
1101
+ if (type === "RunStarted") {
1102
+ runStarted = { startedAt: event.startedAt };
1103
+ }
1104
+ if (type === "RunCompleted") {
1105
+ runCompleted = {
1106
+ passedTestCases: event.passedTestCases,
1107
+ failedTestCases: event.failedTestCases,
1108
+ totalTestCases: event.totalTestCases,
1109
+ finishedAt: event.finishedAt
1110
+ };
1111
+ }
1112
+ if (type === "RunFailed") {
1113
+ runFailed = {
1114
+ finishedAt: event.finishedAt,
1115
+ errorMessage: event.errorMessage
1116
+ };
1117
+ }
1118
+ } catch {
1119
+ }
1120
+ }
1121
+ if (!runQueued) {
1122
+ return null;
1123
+ }
1124
+ const artifactPath = filePath;
1125
+ const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1126
+ const progress = aggregateTestCaseProgress(lines);
1127
+ const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
1128
+ const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1129
+ const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1130
+ return {
1131
+ runId: runQueued.runId,
1132
+ datasetId: runQueued.datasetId,
1133
+ datasetName: runQueued.datasetName,
1134
+ evaluatorIds: runQueued.evaluatorIds,
1135
+ queuedAt: runQueued.ts ?? 0,
1136
+ startedAt: runStarted?.startedAt,
1137
+ finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1138
+ totalTestCases: runQueued.totalTestCases,
1139
+ completedTestCases,
1140
+ passedTestCases,
1141
+ failedTestCases,
1142
+ status,
1143
+ artifactPath,
1144
+ errorMessage: runFailed?.errorMessage
1145
+ };
1146
+ }
1147
+ function aggregateTestCaseProgress(lines) {
1148
+ let completedTestCases = 0;
1149
+ let passedTestCases = 0;
1150
+ let failedTestCases = 0;
1151
+ for (const line of lines) {
1152
+ try {
1153
+ const event = JSON.parse(line);
1154
+ if (event.type === "TestCaseProgress") {
1155
+ const ev = event;
1156
+ completedTestCases = ev.completedTestCases ?? completedTestCases;
1157
+ if (ev.passed) {
1158
+ passedTestCases += 1;
1159
+ } else {
1160
+ failedTestCases += 1;
1161
+ }
1162
+ }
1163
+ } catch {
1164
+ }
1165
+ }
1166
+ return { completedTestCases, passedTestCases, failedTestCases };
1167
+ }
1027
1168
  async function appendJsonLine(artifactPath, payload) {
1028
1169
  await promises.mkdir(path.dirname(artifactPath), { recursive: true });
1029
1170
  await promises.appendFile(artifactPath, `${JSON.stringify(payload)}
@@ -1280,6 +1421,9 @@ var EffectRunner = class {
1280
1421
  (a, b) => b.queuedAt - a.queuedAt
1281
1422
  );
1282
1423
  }
1424
+ async loadRunSnapshotsFromArtifacts() {
1425
+ return loadRunSnapshotsFromArtifacts(this.config);
1426
+ }
1283
1427
  async shutdown() {
1284
1428
  await effect.Effect.runPromise(effect.Fiber.interrupt(this.schedulerFiber));
1285
1429
  await effect.Effect.runPromise(effect.Fiber.interrupt(this.persistenceFiber));