@m4trix/evals 0.10.0 → 0.11.0

This diff compares the contents of two publicly available versions of the package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
7
7
  import { writeFile, mkdir, appendFile, readdir } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
- import 'json-diff';
9
+ import { diffString } from 'json-diff';
10
10
  import React2, { useState, useEffect, useCallback } from 'react';
11
11
  import { render, Box, Text } from 'ink';
12
12
  import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
@@ -256,6 +256,29 @@ async function collectTestCasesFromFiles(config) {
256
256
  );
257
257
  return found.flat();
258
258
  }
259
+ function createDiffLogEntry(expected, actual, options) {
260
+ const diff = diffString(expected, actual, { color: false });
261
+ return {
262
+ type: "diff",
263
+ label: options?.label,
264
+ expected,
265
+ actual,
266
+ diff: diff || "(no differences)"
267
+ };
268
+ }
269
+ function getDiffLines(entry) {
270
+ const raw = diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
271
+ return raw.split("\n").map((line) => {
272
+ const trimmed = line.trimStart();
273
+ if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
274
+ return { type: "remove", line };
275
+ }
276
+ if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
277
+ return { type: "add", line };
278
+ }
279
+ return { type: "context", line };
280
+ });
281
+ }
259
282
 
260
283
  // src/evals/metric.ts
261
284
  var registry = /* @__PURE__ */ new Map();
@@ -439,6 +462,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
439
462
  continue;
440
463
  }
441
464
  try {
465
+ const logs = [];
466
+ const logDiff = (expected, actual, options) => {
467
+ logs.push(createDiffLogEntry(expected, actual, options));
468
+ };
442
469
  const ctx = yield* Effect.promise(
443
470
  () => Promise.resolve(evaluator.resolveContext())
444
471
  );
@@ -447,13 +474,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
447
474
  evaluateFn({
448
475
  input: testCaseItem.testCase.getInput(),
449
476
  ctx,
450
- output
477
+ output,
478
+ logDiff
451
479
  })
452
480
  )
453
481
  );
454
482
  const { scores, metrics } = normalizeResult(result);
455
483
  const passed = computeEvaluatorPassed(evaluator, result, scores);
456
- evaluatorScores.push({ evaluatorId, scores, passed, metrics });
484
+ evaluatorScores.push({
485
+ evaluatorId,
486
+ scores,
487
+ passed,
488
+ metrics,
489
+ logs: logs.length > 0 ? logs : void 0
490
+ });
457
491
  } catch (error) {
458
492
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
459
493
  evaluatorScores.push({
@@ -1176,7 +1210,8 @@ function RunView({
1176
1210
  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
1177
1211
  scores: item.scores,
1178
1212
  passed: item.passed,
1179
- metrics: item.metrics
1213
+ metrics: item.metrics,
1214
+ logs: item.logs
1180
1215
  }))
1181
1216
  }
1182
1217
  ]);
@@ -1263,30 +1298,42 @@ function RunView({
1263
1298
  "ms)"
1264
1299
  ] })
1265
1300
  ] }),
1266
- tc.evaluatorScores.map((item) => /* @__PURE__ */ jsx(Box, { marginLeft: 2, children: /* @__PURE__ */ jsxs(Text, { children: [
1267
- item.evaluatorName,
1268
- ":",
1269
- " ",
1270
- /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1271
- " ",
1272
- item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
1273
- formatScorePart(s),
1274
- " "
1275
- ] }, s.id)),
1276
- item.metrics?.map((m) => {
1277
- const def = getMetricById(m.id);
1278
- if (!def)
1279
- return null;
1280
- const formatted = def.format(m.data);
1281
- return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1282
- "[",
1283
- def.name ? `${def.name}: ` : "",
1284
- formatted,
1285
- "]",
1301
+ tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [
1302
+ /* @__PURE__ */ jsxs(Text, { children: [
1303
+ item.evaluatorName,
1304
+ ":",
1305
+ " ",
1306
+ /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1307
+ " ",
1308
+ item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
1309
+ formatScorePart(s),
1286
1310
  " "
1287
- ] }, m.id);
1288
- })
1289
- ] }) }, item.evaluatorId))
1311
+ ] }, s.id)),
1312
+ item.metrics?.map((m) => {
1313
+ const def = getMetricById(m.id);
1314
+ if (!def)
1315
+ return null;
1316
+ const formatted = def.format(m.data);
1317
+ return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1318
+ "[",
1319
+ def.name ? `${def.name}: ` : "",
1320
+ formatted,
1321
+ "]",
1322
+ " "
1323
+ ] }, m.id);
1324
+ })
1325
+ ] }),
1326
+ !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1327
+ (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
1328
+ Text,
1329
+ {
1330
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
1331
+ children: line
1332
+ },
1333
+ lineIdx
1334
+ )) }, logIdx) : null
1335
+ ) })
1336
+ ] }, item.evaluatorId))
1290
1337
  ] }, i)) }),
1291
1338
  phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
1292
1339
  /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
@@ -1535,6 +1582,17 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1535
1582
  item.metrics
1536
1583
  )
1537
1584
  );
1585
+ if (!item.passed && item.logs && item.logs.length > 0) {
1586
+ for (const log of item.logs) {
1587
+ if (log.type === "diff") {
1588
+ const useColor = process.stdout.isTTY;
1589
+ for (const { type, line } of getDiffLines(log)) {
1590
+ const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
1591
+ console.log(colored);
1592
+ }
1593
+ }
1594
+ }
1595
+ }
1538
1596
  const numeric = toNumericScoreFromScores(item.scores);
1539
1597
  if (numeric !== void 0) {
1540
1598
  const current = aggregates.get(item.evaluatorId) ?? {