@m4trix/evals 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ var path = require('path');
8
8
  var jitiModule = require('jiti');
9
9
  var promises = require('fs/promises');
10
10
  var url = require('url');
11
- require('json-diff');
11
+ var jsonDiff = require('json-diff');
12
12
  var React2 = require('react');
13
13
  var ink = require('ink');
14
14
  var jsxRuntime = require('react/jsx-runtime');
@@ -282,6 +282,29 @@ async function collectTestCasesFromFiles(config) {
282
282
  );
283
283
  return found.flat();
284
284
  }
285
/**
 * Builds a log entry of type "diff" describing how `actual` deviates from `expected`.
 *
 * The textual diff is produced by `jsonDiff.diffString` with colors disabled;
 * when that call yields a falsy result the placeholder "(no differences)" is
 * stored instead.
 *
 * @param {*} expected - The value the evaluator expected.
 * @param {*} actual - The value that was actually produced.
 * @param {{ label?: string }} [options] - Optional settings; only `label` is read.
 * @returns {{ type: "diff", label: (string|undefined), expected: *, actual: *, diff: string }}
 *   The entry, holding both raw values alongside the rendered diff text.
 */
function createDiffLogEntry(expected, actual, options) {
  const label = options == null ? void 0 : options.label;
  let diffText = jsonDiff.diffString(expected, actual, { color: false });
  if (!diffText) {
    diffText = "(no differences)";
  }
  return { type: "diff", label, expected, actual, diff: diffText };
}
295
/**
 * Splits the textual diff of a "diff" log entry into lines tagged for rendering.
 *
 * The diff text already stored on the entry (`entry.diff`) is reused when it is
 * a non-empty string, so repeated calls (e.g. on every UI re-render) do not
 * recompute the diff. Otherwise the diff is computed from `entry.expected` and
 * `entry.actual` via `jsonDiff.diffString`, falling back to "(no differences)".
 *
 * Classification looks at the FIRST character of the untrimmed line: json-diff
 * puts its "+" / "-" / " " marker in column 0 and the indentation after it.
 * Trimming first would misreport unchanged content that itself begins with
 * "-" or "+" (a negative number inside an array, a key such as "-foo") as a
 * removal or addition.
 *
 * @param {{ expected?: *, actual?: *, diff?: string }} entry - A diff log entry.
 * @returns {Array<{ type: "remove" | "add" | "context", line: string }>}
 *   One item per line of the diff text, in order, with `line` left unmodified.
 */
function getDiffLines(entry) {
  const hasStoredDiff = typeof entry.diff === "string" && entry.diff.length > 0;
  const raw = hasStoredDiff
    ? entry.diff
    : jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";

  return raw.split("\n").map((line) => {
    // "---" / "+++" prefixes are kept as context, as before (unified-diff style headers).
    if (line.startsWith("-") && !line.startsWith("---")) {
      return { type: "remove", line };
    }
    if (line.startsWith("+") && !line.startsWith("+++")) {
      return { type: "add", line };
    }
    return { type: "context", line };
  });
}
285
308
 
286
309
  // src/evals/metric.ts
287
310
  var registry = /* @__PURE__ */ new Map();
@@ -465,6 +488,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
465
488
  continue;
466
489
  }
467
490
  try {
491
+ const logs = [];
492
+ const logDiff = (expected, actual, options) => {
493
+ logs.push(createDiffLogEntry(expected, actual, options));
494
+ };
468
495
  const ctx = yield* effect.Effect.promise(
469
496
  () => Promise.resolve(evaluator.resolveContext())
470
497
  );
@@ -473,13 +500,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
473
500
  evaluateFn({
474
501
  input: testCaseItem.testCase.getInput(),
475
502
  ctx,
476
- output
503
+ output,
504
+ logDiff
477
505
  })
478
506
  )
479
507
  );
480
508
  const { scores, metrics } = normalizeResult(result);
481
509
  const passed = computeEvaluatorPassed(evaluator, result, scores);
482
- evaluatorScores.push({ evaluatorId, scores, passed, metrics });
510
+ evaluatorScores.push({
511
+ evaluatorId,
512
+ scores,
513
+ passed,
514
+ metrics,
515
+ logs: logs.length > 0 ? logs : void 0
516
+ });
483
517
  } catch (error) {
484
518
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
485
519
  evaluatorScores.push({
@@ -1202,7 +1236,8 @@ function RunView({
1202
1236
  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
1203
1237
  scores: item.scores,
1204
1238
  passed: item.passed,
1205
- metrics: item.metrics
1239
+ metrics: item.metrics,
1240
+ logs: item.logs
1206
1241
  }))
1207
1242
  }
1208
1243
  ]);
@@ -1289,30 +1324,42 @@ function RunView({
1289
1324
  "ms)"
1290
1325
  ] })
1291
1326
  ] }),
1292
- tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1293
- item.evaluatorName,
1294
- ":",
1295
- " ",
1296
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1297
- " ",
1298
- item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
1299
- formatScorePart(s),
1300
- " "
1301
- ] }, s.id)),
1302
- item.metrics?.map((m) => {
1303
- const def = getMetricById(m.id);
1304
- if (!def)
1305
- return null;
1306
- const formatted = def.format(m.data);
1307
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1308
- "[",
1309
- def.name ? `${def.name}: ` : "",
1310
- formatted,
1311
- "]",
1327
+ tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
1328
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1329
+ item.evaluatorName,
1330
+ ":",
1331
+ " ",
1332
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1333
+ " ",
1334
+ item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
1335
+ formatScorePart(s),
1312
1336
  " "
1313
- ] }, m.id);
1314
- })
1315
- ] }) }, item.evaluatorId))
1337
+ ] }, s.id)),
1338
+ item.metrics?.map((m) => {
1339
+ const def = getMetricById(m.id);
1340
+ if (!def)
1341
+ return null;
1342
+ const formatted = def.format(m.data);
1343
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1344
+ "[",
1345
+ def.name ? `${def.name}: ` : "",
1346
+ formatted,
1347
+ "]",
1348
+ " "
1349
+ ] }, m.id);
1350
+ })
1351
+ ] }),
1352
+ !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1353
+ (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
1354
+ ink.Text,
1355
+ {
1356
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1357
+ children: line
1358
+ },
1359
+ lineIdx
1360
+ )) }, logIdx) : null
1361
+ ) })
1362
+ ] }, item.evaluatorId))
1316
1363
  ] }, i)) }),
1317
1364
  phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
1318
1365
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
@@ -1561,6 +1608,17 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1561
1608
  item.metrics
1562
1609
  )
1563
1610
  );
1611
+ if (!item.passed && item.logs && item.logs.length > 0) {
1612
+ for (const log of item.logs) {
1613
+ if (log.type === "diff") {
1614
+ const useColor = process.stdout.isTTY;
1615
+ for (const { type, line } of getDiffLines(log)) {
1616
+ const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
1617
+ console.log(colored);
1618
+ }
1619
+ }
1620
+ }
1621
+ }
1564
1622
  const numeric = toNumericScoreFromScores(item.scores);
1565
1623
  if (numeric !== void 0) {
1566
1624
  const current = aggregates.get(item.evaluatorId) ?? {