npm - @m4trix/evals - Versions diffs - 0.10.0 → 0.11.0 - Mend

@m4trix/evals 0.10.0 → 0.11.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/dist/cli-simple.cjs CHANGED Viewed

@@ -8,7 +8,7 @@ var path = require('path');
 var jitiModule = require('jiti');
 var promises = require('fs/promises');
 var url = require('url');
-require('json-diff');
+var jsonDiff = require('json-diff');
 var React2 = require('react');
 var ink = require('ink');
 var jsxRuntime = require('react/jsx-runtime');
@@ -282,6 +282,29 @@ async function collectTestCasesFromFiles(config) {
   );
   return found.flat();
 }
+function createDiffLogEntry(expected, actual, options) {
+  const diff = jsonDiff.diffString(expected, actual, { color: false });
+  return {
+    type: "diff",
+    label: options?.label,
+    expected,
+    actual,
+    diff: diff || "(no differences)"
+  };
+}
+function getDiffLines(entry) {
+  const raw = jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
+  return raw.split("\n").map((line) => {
+    const trimmed = line.trimStart();
+    if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
+      return { type: "remove", line };
+    }
+    if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
+      return { type: "add", line };
+    }
+    return { type: "context", line };
+  });
+}
 // src/evals/metric.ts
 var registry = /* @__PURE__ */ new Map();
@@ -465,6 +488,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
         continue;
       }
       try {
+        const logs = [];
+        const logDiff = (expected, actual, options) => {
+          logs.push(createDiffLogEntry(expected, actual, options));
+        };
         const ctx = yield* effect.Effect.promise(
           () => Promise.resolve(evaluator.resolveContext())
         );
@@ -473,13 +500,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
             evaluateFn({
               input: testCaseItem.testCase.getInput(),
               ctx,
-              output
+              output,
+              logDiff
             })
           )
         );
         const { scores, metrics } = normalizeResult(result);
         const passed = computeEvaluatorPassed(evaluator, result, scores);
-        evaluatorScores.push({ evaluatorId, scores, passed, metrics });
+        evaluatorScores.push({
+          evaluatorId,
+          scores,
+          passed,
+          metrics,
+          logs: logs.length > 0 ? logs : void 0
+        });
       } catch (error) {
         testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
         evaluatorScores.push({
@@ -1202,7 +1236,8 @@ function RunView({
                 evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
                 scores: item.scores,
                 passed: item.passed,
-                metrics: item.metrics
+                metrics: item.metrics,
+                logs: item.logs
               }))
             }
           ]);
@@ -1289,30 +1324,42 @@ function RunView({
           "ms)"
         ] })
       ] }),
-      tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
-        item.evaluatorName,
-        ":",
-        " ",
-        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
-        " ",
-        item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
-          formatScorePart(s),
-          " "
-        ] }, s.id)),
-        item.metrics?.map((m) => {
-          const def = getMetricById(m.id);
-          if (!def)
-            return null;
-          const formatted = def.format(m.data);
-          return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
-            "[",
-            def.name ? `${def.name}: ` : "",
-            formatted,
-            "]",
+      tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
+        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
+          item.evaluatorName,
+          ":",
+          " ",
+          /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
+          " ",
+          item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
+            formatScorePart(s),
             " "
-          ] }, m.id);
-        })
-      ] }) }, item.evaluatorId))
+          ] }, s.id)),
+          item.metrics?.map((m) => {
+            const def = getMetricById(m.id);
+            if (!def)
+              return null;
+            const formatted = def.format(m.data);
+            return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
+              "[",
+              def.name ? `${def.name}: ` : "",
+              formatted,
+              "]",
+              " "
+            ] }, m.id);
+          })
+        ] }),
+        !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
+          (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
+            ink.Text,
+            {
+              color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
+              children: line
+            },
+            lineIdx
+          )) }, logIdx) : null
+        ) })
+      ] }, item.evaluatorId))
     ] }, i)) }),
     phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
       /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
@@ -1561,6 +1608,17 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
               item.metrics
             )
           );
+          if (!item.passed && item.logs && item.logs.length > 0) {
+            for (const log of item.logs) {
+              if (log.type === "diff") {
+                const useColor = process.stdout.isTTY;
+                for (const { type, line } of getDiffLines(log)) {
+                  const colored = useColor && type === "remove" ? colorize(`      ${line}`, ansi2.red) : useColor && type === "add" ? colorize(`      ${line}`, ansi2.green) : `      ${line}`;
+                  console.log(colored);
+                }
+              }
+            }
+          }
           const numeric = toNumericScoreFromScores(item.scores);
           if (numeric !== void 0) {
             const current = aggregates.get(item.evaluatorId) ?? {