@m4trix/evals 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ var path = require('path');
8
8
  var jitiModule = require('jiti');
9
9
  var promises = require('fs/promises');
10
10
  var url = require('url');
11
- require('json-diff');
11
+ var jsonDiff = require('json-diff');
12
12
  var React2 = require('react');
13
13
  var ink = require('ink');
14
14
  var jsxRuntime = require('react/jsx-runtime');
@@ -282,6 +282,29 @@ async function collectTestCasesFromFiles(config) {
282
282
  );
283
283
  return found.flat();
284
284
  }
285
/**
 * Builds a "diff" log entry describing how `actual` deviates from `expected`.
 *
 * The textual diff is rendered by json-diff with colors disabled. When the
 * renderer returns an empty (falsy) string, the entry carries the
 * "(no differences)" placeholder instead.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value produced by the code under evaluation.
 * @param {{ label?: string }} [options] - Optional settings; only `label` is read.
 * @returns {{ type: "diff", label: (string|undefined), expected: unknown, actual: unknown, diff: string }}
 */
function createDiffLogEntry(expected, actual, options) {
  const rendered = jsonDiff.diffString(expected, actual, { color: false });
  const entry = {
    type: "diff",
    // The key is always present, even when no label was supplied.
    label: options?.label,
    expected,
    actual,
    diff: "(no differences)"
  };
  if (rendered) {
    entry.diff = rendered;
  }
  return entry;
}
295
/**
 * Splits the textual diff of a diff log entry into lines tagged for display.
 *
 * The diff is recomputed from `entry.expected` and `entry.actual` with colors
 * disabled; an empty diff is replaced by the "(no differences)" placeholder,
 * which is reported as a single context line.
 *
 * @param {{ expected: unknown, actual: unknown }} entry - Diff log entry.
 * @returns {Array<{ type: "remove" | "add" | "context", line: string }>}
 *   One item per diff line, in order, with the original line text untouched.
 */
function getDiffLines(entry) {
  const raw = jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
  return raw.split("\n").map((line) => {
    // json-diff puts its "-" / "+" / " " marker in the first column, ahead of
    // the indentation. Classify on that column only: stripping leading
    // whitespace first would tag an unchanged line whose content begins with
    // "-" (e.g. a negative number inside an array) as a removal.
    if (line.startsWith("-") && !line.startsWith("---")) {
      return { type: "remove", line };
    }
    if (line.startsWith("+") && !line.startsWith("+++")) {
      return { type: "add", line };
    }
    return { type: "context", line };
  });
}
285
308
 
286
309
  // src/evals/metric.ts
287
310
  var registry = /* @__PURE__ */ new Map();
@@ -465,6 +488,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
465
488
  continue;
466
489
  }
467
490
  try {
491
+ const logs = [];
492
+ const logDiff = (expected, actual, options) => {
493
+ logs.push(createDiffLogEntry(expected, actual, options));
494
+ };
468
495
  const ctx = yield* effect.Effect.promise(
469
496
  () => Promise.resolve(evaluator.resolveContext())
470
497
  );
@@ -473,13 +500,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
473
500
  evaluateFn({
474
501
  input: testCaseItem.testCase.getInput(),
475
502
  ctx,
476
- output
503
+ output,
504
+ logDiff
477
505
  })
478
506
  )
479
507
  );
480
508
  const { scores, metrics } = normalizeResult(result);
481
509
  const passed = computeEvaluatorPassed(evaluator, result, scores);
482
- evaluatorScores.push({ evaluatorId, scores, passed, metrics });
510
+ evaluatorScores.push({
511
+ evaluatorId,
512
+ scores,
513
+ passed,
514
+ metrics,
515
+ logs: logs.length > 0 ? logs : void 0
516
+ });
483
517
  } catch (error) {
484
518
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
485
519
  evaluatorScores.push({
@@ -552,6 +586,120 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
552
586
  artifactPath: task.snapshot.artifactPath
553
587
  });
554
588
  });
589
/**
 * Reads every `.jsonl` artifact in the configured artifact directory and
 * turns each one into a run snapshot.
 *
 * Loading is best-effort: if the directory cannot be listed the result is an
 * empty array, and artifacts that fail to parse (or yield no snapshot) are
 * left out.
 *
 * @param {{ artifactDirectory: string }} config - Runner configuration.
 * @returns {Promise<Array<object>>} Snapshots ordered by `queuedAt`, newest first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = path.resolve(config.artifactDirectory);
  let directoryEntries;
  try {
    directoryEntries = await promises.readdir(artifactRoot);
  } catch {
    // Directory missing or unreadable: report no runs.
    return [];
  }
  const collected = [];
  for (const entryName of directoryEntries) {
    if (!entryName.endsWith(".jsonl")) {
      continue;
    }
    try {
      const parsed = await parseArtifactToSnapshot(path.join(artifactRoot, entryName), config);
      if (parsed) {
        collected.push(parsed);
      }
    } catch {
      // A broken artifact must not prevent the remaining ones from loading.
    }
  }
  collected.sort((left, right) => right.queuedAt - left.queuedAt);
  return collected;
}
611
/**
 * Rebuilds a run snapshot from a JSON-lines artifact file.
 *
 * Every non-blank line is parsed as a JSON event; lines that are not valid
 * JSON are ignored. The last `RunQueued`, `RunStarted`, `RunCompleted` and
 * `RunFailed` events found decide the snapshot fields. Counters come from the
 * `RunCompleted` event when it provides them, otherwise from the per-test-case
 * progress aggregated over the same lines.
 *
 * @param {string} filePath - Path of the artifact; also reported as `artifactPath`.
 * @param {object} _config - Unused.
 * @returns {Promise<object|null>} The snapshot, or `null` when the file has no
 *   non-blank lines or no `RunQueued` event.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const text = await promises.readFile(filePath, "utf8");
  const lines = text.split("\n").filter((entry) => entry.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }
  // Latest event seen for each lifecycle type; later lines overwrite earlier ones.
  let queued = null;
  let started = null;
  let completed = null;
  let failed = null;
  for (const entry of lines) {
    let event;
    try {
      event = JSON.parse(entry);
    } catch {
      continue;
    }
    switch (event?.type) {
      case "RunQueued":
        queued = event;
        break;
      case "RunStarted":
        started = event;
        break;
      case "RunCompleted":
        completed = event;
        break;
      case "RunFailed":
        failed = event;
        break;
      default:
        break;
    }
  }
  if (!queued) {
    return null;
  }
  // A failure outranks completion, which outranks a mere start.
  let status = "queued";
  if (failed) {
    status = "failed";
  } else if (completed) {
    status = "completed";
  } else if (started) {
    status = "running";
  }
  const progress = aggregateTestCaseProgress(lines);
  return {
    runId: queued.runId,
    datasetId: queued.datasetId,
    datasetName: queued.datasetName,
    evaluatorIds: queued.evaluatorIds,
    queuedAt: queued.ts ?? 0,
    startedAt: started?.startedAt,
    finishedAt: completed?.finishedAt ?? failed?.finishedAt,
    totalTestCases: queued.totalTestCases ?? 0,
    completedTestCases: completed?.totalTestCases ?? progress.completedTestCases,
    passedTestCases: completed?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: completed?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: failed?.errorMessage
  };
}
682
/**
 * Tallies test-case progress from the raw JSON lines of a run artifact.
 *
 * Only events whose `type` is "TestCaseProgress" are counted; lines that are
 * not valid JSON are ignored. `completedTestCases` follows the most recent
 * event that carries that field, while passes and failures are counted one per
 * event according to the truthiness of `passed`.
 *
 * @param {string[]} lines - Artifact lines, one JSON document each.
 * @returns {{ completedTestCases: number, passedTestCases: number, failedTestCases: number }}
 */
function aggregateTestCaseProgress(lines) {
  const totals = { completedTestCases: 0, passedTestCases: 0, failedTestCases: 0 };
  for (const rawLine of lines) {
    let event;
    try {
      event = JSON.parse(rawLine);
    } catch {
      continue;
    }
    if (event?.type !== "TestCaseProgress") {
      continue;
    }
    // Keep the previous count when the event does not report one.
    totals.completedTestCases = event.completedTestCases ?? totals.completedTestCases;
    if (event.passed) {
      totals.passedTestCases += 1;
    } else {
      totals.failedTestCases += 1;
    }
  }
  return totals;
}
555
703
  async function appendJsonLine(artifactPath, payload) {
556
704
  await promises.mkdir(path.dirname(artifactPath), { recursive: true });
557
705
  await promises.appendFile(artifactPath, `${JSON.stringify(payload)}
@@ -808,6 +956,9 @@ var EffectRunner = class {
808
956
  (a, b) => b.queuedAt - a.queuedAt
809
957
  );
810
958
  }
959
+ async loadRunSnapshotsFromArtifacts() {
960
+ return loadRunSnapshotsFromArtifacts(this.config);
961
+ }
811
962
  async shutdown() {
812
963
  await effect.Effect.runPromise(effect.Fiber.interrupt(this.schedulerFiber));
813
964
  await effect.Effect.runPromise(effect.Fiber.interrupt(this.persistenceFiber));
@@ -939,7 +1090,7 @@ function GenerateView({
939
1090
  return;
940
1091
  }
941
1092
  const { writeFile: writeFile2 } = await import('fs/promises');
942
- const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
1093
+ const { join: join4, parse: parse2, resolve: resolve5 } = await import('path');
943
1094
  const testCases = await runner.collectDatasetTestCases(dataset.id);
944
1095
  const payload = testCases.map((item) => {
945
1096
  const tc = item.testCase;
@@ -949,9 +1100,9 @@ function GenerateView({
949
1100
  output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
950
1101
  };
951
1102
  });
952
- const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
1103
+ const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
953
1104
  const parsed = parse2(absoluteDatasetPath);
954
- const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
1105
+ const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
955
1106
  await writeFile2(
956
1107
  outputPath,
957
1108
  `${JSON.stringify(payload, null, 2)}
@@ -1026,7 +1177,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1026
1177
  console.log(`Wrote ${outputPath}`);
1027
1178
  }
1028
1179
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1029
- return new Promise((resolve4, reject) => {
1180
+ return new Promise((resolve5, reject) => {
1030
1181
  const app = ink.render(
1031
1182
  React2__default.default.createElement(GenerateView, {
1032
1183
  runner,
@@ -1036,7 +1187,7 @@ async function generateDatasetJsonCommandInk(runner, datasetName) {
1036
1187
  if (err) {
1037
1188
  reject(err);
1038
1189
  } else {
1039
- resolve4();
1190
+ resolve5();
1040
1191
  }
1041
1192
  }
1042
1193
  })
@@ -1164,7 +1315,7 @@ function RunView({
1164
1315
  const aggregates = /* @__PURE__ */ new Map();
1165
1316
  let overallScoreTotal = 0;
1166
1317
  let overallScoreCount = 0;
1167
- const done = new Promise((resolve4) => {
1318
+ const done = new Promise((resolve5) => {
1168
1319
  const unsubscribe = runner.subscribeRunEvents((event) => {
1169
1320
  if (event.type === "TestCaseProgress") {
1170
1321
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
@@ -1202,14 +1353,15 @@ function RunView({
1202
1353
  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
1203
1354
  scores: item.scores,
1204
1355
  passed: item.passed,
1205
- metrics: item.metrics
1356
+ metrics: item.metrics,
1357
+ logs: item.logs
1206
1358
  }))
1207
1359
  }
1208
1360
  ]);
1209
1361
  }
1210
1362
  if (event.type === "RunCompleted" || event.type === "RunFailed") {
1211
1363
  unsubscribe();
1212
- resolve4(event);
1364
+ resolve5(event);
1213
1365
  }
1214
1366
  });
1215
1367
  });
@@ -1289,30 +1441,42 @@ function RunView({
1289
1441
  "ms)"
1290
1442
  ] })
1291
1443
  ] }),
1292
- tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1293
- item.evaluatorName,
1294
- ":",
1295
- " ",
1296
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1297
- " ",
1298
- item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
1299
- formatScorePart(s),
1300
- " "
1301
- ] }, s.id)),
1302
- item.metrics?.map((m) => {
1303
- const def = getMetricById(m.id);
1304
- if (!def)
1305
- return null;
1306
- const formatted = def.format(m.data);
1307
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1308
- "[",
1309
- def.name ? `${def.name}: ` : "",
1310
- formatted,
1311
- "]",
1444
+ tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
1445
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1446
+ item.evaluatorName,
1447
+ ":",
1448
+ " ",
1449
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1450
+ " ",
1451
+ item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
1452
+ formatScorePart(s),
1312
1453
  " "
1313
- ] }, m.id);
1314
- })
1315
- ] }) }, item.evaluatorId))
1454
+ ] }, s.id)),
1455
+ item.metrics?.map((m) => {
1456
+ const def = getMetricById(m.id);
1457
+ if (!def)
1458
+ return null;
1459
+ const formatted = def.format(m.data);
1460
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1461
+ "[",
1462
+ def.name ? `${def.name}: ` : "",
1463
+ formatted,
1464
+ "]",
1465
+ " "
1466
+ ] }, m.id);
1467
+ })
1468
+ ] }),
1469
+ !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1470
+ (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
1471
+ ink.Text,
1472
+ {
1473
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1474
+ children: line
1475
+ },
1476
+ lineIdx
1477
+ )) }, logIdx) : null
1478
+ ) })
1479
+ ] }, item.evaluatorId))
1316
1480
  ] }, i)) }),
1317
1481
  phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
1318
1482
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
@@ -1541,7 +1705,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1541
1705
  );
1542
1706
  }
1543
1707
  let spinnerTimer;
1544
- const done = new Promise((resolve4) => {
1708
+ const done = new Promise((resolve5) => {
1545
1709
  const unsubscribe = runner.subscribeRunEvents((event) => {
1546
1710
  if (event.type === "TestCaseProgress") {
1547
1711
  completedCount = event.completedTestCases;
@@ -1561,6 +1725,17 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1561
1725
  item.metrics
1562
1726
  )
1563
1727
  );
1728
+ if (!item.passed && item.logs && item.logs.length > 0) {
1729
+ for (const log of item.logs) {
1730
+ if (log.type === "diff") {
1731
+ const useColor = process.stdout.isTTY;
1732
+ for (const { type, line } of getDiffLines(log)) {
1733
+ const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
1734
+ console.log(colored);
1735
+ }
1736
+ }
1737
+ }
1738
+ }
1564
1739
  const numeric = toNumericScoreFromScores(item.scores);
1565
1740
  if (numeric !== void 0) {
1566
1741
  const current = aggregates.get(item.evaluatorId) ?? {
@@ -1591,7 +1766,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1591
1766
  runFinished = true;
1592
1767
  clearLine();
1593
1768
  unsubscribe();
1594
- resolve4(event);
1769
+ resolve5(event);
1595
1770
  }
1596
1771
  });
1597
1772
  });
@@ -1669,7 +1844,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1669
1844
  console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
1670
1845
  }
1671
1846
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
1672
- return new Promise((resolve4, reject) => {
1847
+ return new Promise((resolve5, reject) => {
1673
1848
  const app = ink.render(
1674
1849
  React2__default.default.createElement(RunView, {
1675
1850
  runner,
@@ -1680,7 +1855,7 @@ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
1680
1855
  if (err) {
1681
1856
  reject(err);
1682
1857
  } else {
1683
- resolve4();
1858
+ resolve5();
1684
1859
  }
1685
1860
  }
1686
1861
  })