npm - @m4trix/evals - Versions diffs - 0.9.0 → 0.10.0 - Mend

@m4trix/evals 0.9.0 → 0.10.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (8)

package/dist/cli-simple.js CHANGED Viewed

@@ -7,6 +7,9 @@ import * as jitiModule from 'jiti';
 import { writeFile, mkdir, appendFile, readdir } from 'fs/promises';
 import { pathToFileURL } from 'url';
 import 'json-diff';
+import React2, { useState, useEffect, useCallback } from 'react';
+import { render, Box, Text } from 'ink';
+import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
 // src/runner/config.ts
 var defaultRunnerConfig = {
@@ -867,6 +870,107 @@ function getSimpleCliUsage() {
     '  "/score/i"            regex literal'
   ].join("\n");
 }
+// src/cli-simple/banner.ts
+var ansi = {
+  reset: "\x1B[0m",
+  dim: "\x1B[2m",
+  cyan: "\x1B[36m"
+};
+function printBanner() {
+  const c = (s) => `${ansi.cyan}${s}${ansi.reset}`;
+  const d = (s) => `${ansi.dim}${s}${ansi.reset}`;
+  const lines = [
+    "",
+    `  ${c("\u256D\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256E")}`,
+    `  ${c("\u2502")}  ${d("@m4trix/evals")}  ${c("\xB7")}  ${d("eval-agents-simple")}  ${c("\u2502")}`,
+    `  ${c("\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256F")}`,
+    ""
+  ];
+  console.log(lines.join("\n"));
+}
+function Banner() {
+  return /* @__PURE__ */ jsxs(Box, { borderStyle: "round", borderColor: "cyan", paddingX: 1, paddingY: 0, children: [
+    /* @__PURE__ */ jsx(Text, { color: "gray", children: "@m4trix/evals" }),
+    /* @__PURE__ */ jsx(Text, { color: "cyan", children: " \xB7 " }),
+    /* @__PURE__ */ jsx(Text, { color: "gray", children: "eval-agents-simple" })
+  ] });
+}
+function GenerateView({
+  runner,
+  datasetName,
+  onComplete
+}) {
+  const [result, setResult] = useState(null);
+  const [error, setError] = useState(null);
+  useEffect(() => {
+    let cancelled = false;
+    async function run() {
+      const dataset = await runner.resolveDatasetByName(datasetName);
+      if (!dataset) {
+        setError(new Error(`Dataset "${datasetName}" not found.`));
+        onComplete(new Error(`Dataset "${datasetName}" not found.`));
+        return;
+      }
+      const { writeFile: writeFile2 } = await import('fs/promises');
+      const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
+      const testCases = await runner.collectDatasetTestCases(dataset.id);
+      const payload = testCases.map((item) => {
+        const tc = item.testCase;
+        return {
+          name: item.testCase.getName(),
+          input: item.testCase.getInput(),
+          output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
+        };
+      });
+      const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
+      const parsed = parse2(absoluteDatasetPath);
+      const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
+      await writeFile2(
+        outputPath,
+        `${JSON.stringify(payload, null, 2)}
+`,
+        "utf8"
+      );
+      if (!cancelled) {
+        setResult({
+          count: payload.length,
+          datasetName: dataset.dataset.getName(),
+          outputPath
+        });
+        setTimeout(() => onComplete(), 200);
+      }
+    }
+    void run();
+    return () => {
+      cancelled = true;
+    };
+  }, [runner, datasetName, onComplete]);
+  if (error) {
+    return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
+      /* @__PURE__ */ jsx(Banner, {}),
+      /* @__PURE__ */ jsx(Text, { color: "red", children: error.message })
+    ] });
+  }
+  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
+    /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
+    result && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
+      /* @__PURE__ */ jsxs(Text, { color: "green", children: [
+        "Generated ",
+        result.count,
+        ' test cases for dataset "',
+        result.datasetName,
+        '".'
+      ] }),
+      /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+        "Wrote ",
+        result.outputPath
+      ] })
+    ] })
+  ] });
+}
+// src/cli-simple/generate.ts
 function readOutput2(testCase) {
   if (typeof testCase.getOutput !== "function") {
     return void 0;
@@ -877,7 +981,7 @@ function createOutputPath(datasetFilePath) {
   const parsed = parse(datasetFilePath);
   return join(parsed.dir, `${parsed.name}.cases.json`);
 }
-async function generateDatasetJsonCommand(runner, datasetName) {
+async function generateDatasetJsonCommandPlain(runner, datasetName) {
   const dataset = await runner.resolveDatasetByName(datasetName);
   if (!dataset) {
     throw new Error(`Dataset "${datasetName}" not found.`);
@@ -895,9 +999,383 @@ async function generateDatasetJsonCommand(runner, datasetName) {
   console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
   console.log(`Wrote ${outputPath}`);
 }
+async function generateDatasetJsonCommandInk(runner, datasetName) {
+  return new Promise((resolve4, reject) => {
+    const app = render(
+      React2.createElement(GenerateView, {
+        runner,
+        datasetName,
+        onComplete: (err) => {
+          app.unmount();
+          if (err) {
+            reject(err);
+          } else {
+            resolve4();
+          }
+        }
+      })
+    );
+  });
+}
+function barColor(pct) {
+  if (pct >= 70)
+    return "green";
+  if (pct >= 40)
+    return "yellow";
+  return "red";
+}
+function TextBar({
+  label,
+  value,
+  max = 100,
+  labelWidth = 14,
+  barWidth = 20,
+  format = (v) => String(v),
+  colorByValue = true
+}) {
+  const clamped = Math.max(0, Math.min(max, value));
+  const pct = max > 0 ? clamped / max * 100 : 0;
+  const filled = Math.round(clamped / max * barWidth);
+  const filledBar = "\u2588".repeat(filled);
+  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
+  const color = colorByValue ? barColor(pct) : void 0;
+  return /* @__PURE__ */ jsxs(Text, { children: [
+    /* @__PURE__ */ jsx(Text, { color: "gray", children: label.padEnd(labelWidth) }),
+    " [",
+    color ? /* @__PURE__ */ jsxs(Fragment, { children: [
+      /* @__PURE__ */ jsx(Text, { color, children: filledBar }),
+      /* @__PURE__ */ jsx(Text, { color: "gray", children: emptyBar })
+    ] }) : filledBar + emptyBar,
+    "] ",
+    /* @__PURE__ */ jsx(Text, { color: color ?? "white", bold: true, children: format(value) })
+  ] });
+}
+var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
+function Spinner({ label = "Running" }) {
+  const [frame, setFrame] = useState(0);
+  useEffect(() => {
+    const timer = setInterval(() => {
+      setFrame((f) => (f + 1) % FRAMES.length);
+    }, 100);
+    return () => clearInterval(timer);
+  }, []);
+  return /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
+    FRAMES[frame],
+    " ",
+    label
+  ] });
+}
+function scoreColor(score) {
+  if (score >= 80)
+    return "green";
+  if (score >= 50)
+    return "yellow";
+  return "red";
+}
+function createBar(value, max = 100, width = 20) {
+  const safe = Math.max(0, Math.min(max, value));
+  const filled = Math.round(safe / max * width);
+  return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
+}
+function formatScorePart(item, scoreToColor2) {
+  const def = getScoreById(item.id);
+  if (!def) {
+    const numeric = toNumericScore(item.data);
+    return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
+  }
+  const formatted = def.format(item.data);
+  if (def.displayStrategy === "bar") {
+    const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
+    if (typeof numeric === "number" && Number.isFinite(numeric)) {
+      return `${formatted} ${createBar(numeric)}`;
+    }
+  }
+  return formatted;
+}
+function RunView({
+  runner,
+  datasetName,
+  evaluatorPattern,
+  onComplete
+}) {
+  const [phase, setPhase] = useState(
+    "loading"
+  );
+  const [runInfo, setRunInfo] = useState(null);
+  const [testCases, setTestCases] = useState([]);
+  const [summary, setSummary] = useState(null);
+  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
+  const runEval = useCallback(async () => {
+    const dataset = await runner.resolveDatasetByName(datasetName);
+    if (!dataset) {
+      const known = await runner.collectDatasets();
+      const available = known.map((item) => item.dataset.getName()).sort();
+      onComplete(
+        new Error(
+          available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
+        )
+      );
+      return;
+    }
+    const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
+    if (evaluators.length === 0) {
+      const known = await runner.collectEvaluators();
+      const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
+      onComplete(
+        new Error(
+          available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
+        )
+      );
+      return;
+    }
+    const nameById = new Map(
+      evaluators.map((item) => [
+        item.id,
+        item.evaluator.getName() ?? item.id
+      ])
+    );
+    setEvaluatorNameById(nameById);
+    const aggregates = /* @__PURE__ */ new Map();
+    let overallScoreTotal = 0;
+    let overallScoreCount = 0;
+    const done = new Promise((resolve4) => {
+      const unsubscribe = runner.subscribeRunEvents((event) => {
+        if (event.type === "TestCaseProgress") {
+          const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
+          const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
+          for (const item of event.evaluatorScores) {
+            const numeric = toNumericScoreFromScores(item.scores);
+            if (numeric !== void 0) {
+              const current = aggregates.get(item.evaluatorId) ?? {
+                total: 0,
+                count: 0,
+                passed: 0,
+                failed: 0
+              };
+              aggregates.set(item.evaluatorId, {
+                total: current.total + numeric,
+                count: current.count + 1,
+                passed: current.passed + (item.passed ? 1 : 0),
+                failed: current.failed + (item.passed ? 0 : 1)
+              });
+              overallScoreTotal += numeric;
+              overallScoreCount += 1;
+            }
+          }
+          setTestCases((prev) => [
+            ...prev,
+            {
+              name: event.testCaseName,
+              completedTestCases: event.completedTestCases,
+              totalTestCases: event.totalTestCases,
+              durationMs: event.durationMs,
+              passed: event.passed,
+              averageScore,
+              evaluatorScores: event.evaluatorScores.map((item) => ({
+                evaluatorId: item.evaluatorId,
+                evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
+                scores: item.scores,
+                passed: item.passed,
+                metrics: item.metrics
+              }))
+            }
+          ]);
+        }
+        if (event.type === "RunCompleted" || event.type === "RunFailed") {
+          unsubscribe();
+          resolve4(event);
+        }
+      });
+    });
+    const snapshot = await runner.runDatasetWith({
+      datasetId: dataset.id,
+      evaluatorIds: evaluators.map((item) => item.id)
+    });
+    setRunInfo({
+      runId: snapshot.runId,
+      datasetName: snapshot.datasetName,
+      evaluatorNames: evaluators.map(
+        (e) => e.evaluator.getName() ?? e.id
+      ),
+      totalTestCases: snapshot.totalTestCases
+    });
+    setPhase("running");
+    const finalEvent = await done;
+    if (finalEvent.type === "RunFailed") {
+      onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
+      return;
+    }
+    setSummary({
+      passedTestCases: finalEvent.passedTestCases,
+      failedTestCases: finalEvent.failedTestCases,
+      totalTestCases: finalEvent.totalTestCases,
+      overallScoreTotal,
+      overallScoreCount,
+      aggregates: new Map(aggregates),
+      artifactPath: finalEvent.artifactPath
+    });
+    setPhase("completed");
+    setTimeout(() => onComplete(), 200);
+  }, [runner, datasetName, evaluatorPattern, onComplete]);
+  useEffect(() => {
+    void runEval();
+  }, [runEval]);
+  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
+    /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
+    runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
+      /* @__PURE__ */ jsxs(Text, { children: [
+        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run " }),
+        /* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
+      ] }),
+      /* @__PURE__ */ jsxs(Text, { children: [
+        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Dataset " }),
+        runInfo.datasetName
+      ] }),
+      /* @__PURE__ */ jsxs(Text, { children: [
+        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Evaluators " }),
+        runInfo.evaluatorNames.join(", ")
+      ] }),
+      /* @__PURE__ */ jsxs(Text, { children: [
+        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Test cases " }),
+        runInfo.totalTestCases
+      ] })
+    ] }),
+    phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
+      Spinner,
+      {
+        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
+      }
+    ) }),
+    testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
+      /* @__PURE__ */ jsxs(Text, { children: [
+        /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
+          "[",
+          tc.completedTestCases,
+          "/",
+          tc.totalTestCases,
+          "]"
+        ] }),
+        " ",
+        tc.name,
+        /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+          " (",
+          tc.durationMs,
+          "ms)"
+        ] })
+      ] }),
+      tc.evaluatorScores.map((item) => /* @__PURE__ */ jsx(Box, { marginLeft: 2, children: /* @__PURE__ */ jsxs(Text, { children: [
+        item.evaluatorName,
+        ":",
+        " ",
+        /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
+        " ",
+        item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
+          formatScorePart(s),
+          " "
+        ] }, s.id)),
+        item.metrics?.map((m) => {
+          const def = getMetricById(m.id);
+          if (!def)
+            return null;
+          const formatted = def.format(m.data);
+          return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+            "[",
+            def.name ? `${def.name}: ` : "",
+            formatted,
+            "]",
+            " "
+          ] }, m.id);
+        })
+      ] }) }, item.evaluatorId))
+    ] }, i)) }),
+    phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
+      /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
+      /* @__PURE__ */ jsxs(Box, { marginTop: 1, children: [
+        /* @__PURE__ */ jsx(Text, { color: "green", children: "passed" }),
+        /* @__PURE__ */ jsxs(Text, { children: [
+          " ",
+          summary.passedTestCases,
+          "/",
+          summary.totalTestCases
+        ] })
+      ] }),
+      /* @__PURE__ */ jsxs(Box, { children: [
+        /* @__PURE__ */ jsx(Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
+        /* @__PURE__ */ jsxs(Text, { children: [
+          " ",
+          summary.failedTestCases,
+          "/",
+          summary.totalTestCases
+        ] })
+      ] }),
+      summary.overallScoreCount > 0 && /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsx(
+        TextBar,
+        {
+          label: "overall avg",
+          value: summary.overallScoreTotal / summary.overallScoreCount,
+          barWidth: 20,
+          format: (v) => v.toFixed(2)
+        }
+      ) }),
+      /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
+        /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
+        Array.from(evaluatorNameById.entries()).map(([id, name]) => {
+          const agg = summary.aggregates.get(id);
+          if (!agg || agg.count === 0) {
+            return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+              "- ",
+              name.padEnd(28),
+              " no numeric scores"
+            ] }, id);
+          }
+          const mean = agg.total / agg.count;
+          return /* @__PURE__ */ jsxs(Text, { children: [
+            "- ",
+            name.padEnd(28),
+            " avg=",
+            /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
+            " passed=",
+            agg.passed,
+            " failed=",
+            agg.failed
+          ] }, id);
+        })
+      ] }),
+      /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
+        /* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
+        testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { children: [
+          /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
+          /* @__PURE__ */ jsxs(Text, { children: [
+            " ",
+            tc.name.padEnd(24)
+          ] }),
+          tc.averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
+            /* @__PURE__ */ jsxs(Text, { color: scoreColor(tc.averageScore), children: [
+              "score=",
+              tc.averageScore.toFixed(2)
+            ] }),
+            /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+              " ",
+              createBar(tc.averageScore, 100, 14)
+            ] })
+          ] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
+          /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+            " (",
+            tc.durationMs,
+            "ms)"
+          ] })
+        ] }, i))
+      ] }),
+      /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+        "artifact: ",
+        summary.artifactPath
+      ] }) })
+    ] })
+  ] });
+}
 // src/cli-simple/run.ts
-var ansi = {
+var ansi2 = {
   reset: "\x1B[0m",
   bold: "\x1B[1m",
   dim: "\x1B[2m",
@@ -908,16 +1386,16 @@ var ansi = {
   magenta: "\x1B[35m"
 };
 function colorize(text, color) {
-  return `${color}${text}${ansi.reset}`;
+  return `${color}${text}${ansi2.reset}`;
 }
 function scoreToColor(score) {
   if (score >= 80) {
-    return ansi.green;
+    return ansi2.green;
   }
   if (score >= 50) {
-    return ansi.yellow;
+    return ansi2.yellow;
   }
-  return ansi.red;
+  return ansi2.red;
 }
 function getEvaluatorSummaryLine(evaluatorName, aggregate) {
   if (!aggregate || aggregate.count === 0) {
@@ -926,13 +1404,13 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
   const mean = aggregate.total / aggregate.count;
   return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
 }
-function createBar(value, max = 100, width = 20) {
+function createBar2(value, max = 100, width = 20) {
   const safe = Math.max(0, Math.min(max, value));
   const filled = Math.round(safe / max * width);
   return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
 }
 function formatEvaluatorScoreLine(name, scores, passed, metrics) {
-  const passLabel = passed ? colorize("PASS", `${ansi.bold}${ansi.green}`) : colorize("FAIL", `${ansi.bold}${ansi.red}`);
+  const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
   const scoreParts = [];
   for (const item of scores) {
     const def = getScoreById(item.id);
@@ -949,7 +1427,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
         const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
         if (typeof numeric === "number" && Number.isFinite(numeric)) {
           scoreParts.push(
-            `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar(numeric), ansi.dim)}`
+            `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
           );
         } else {
           scoreParts.push(formatted);
@@ -963,7 +1441,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
         scoreParts.push(
           colorize(
             formatted,
-            item.passed === true ? `${ansi.bold}${ansi.green}` : item.passed === false ? `${ansi.bold}${ansi.red}` : ansi.dim
+            item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
           )
         );
         break;
@@ -988,7 +1466,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
   }
   return line;
 }
-async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
+async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
   const dataset = await runner.resolveDatasetByName(datasetName);
   if (!dataset) {
     const known = await runner.collectDatasets();
@@ -1030,10 +1508,10 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
     const frame = spinnerFrames[spinnerIndex % spinnerFrames.length];
     spinnerIndex += 1;
     process.stdout.write(
-      `\r${colorize(frame, ansi.cyan)} Running evaluations ${colorize(
+      `\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
         `${completedCount}/${totalCount}`,
-        ansi.bold
-      )} ${colorize("(live)", ansi.dim)}`
+        ansi2.bold
+      )} ${colorize("(live)", ansi2.dim)}`
     );
   }
   let spinnerTimer;
@@ -1045,7 +1523,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
         const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
         clearLine();
         console.log(
-          `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi.dim)}`
+          `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi2.dim)}`
         );
         for (const item of event.evaluatorScores) {
           const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
@@ -1096,14 +1574,14 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
     evaluatorIds: evaluators.map((item) => item.id)
   });
   totalCount = snapshot.totalTestCases;
-  console.log(colorize("=== Eval Run Started ===", `${ansi.bold}${ansi.cyan}`));
-  console.log(`Run: ${colorize(snapshot.runId, ansi.cyan)}`);
-  console.log(`Dataset: ${colorize(snapshot.datasetName, ansi.bold)}`);
+  console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
+  console.log(`Run: ${colorize(snapshot.runId, ansi2.cyan)}`);
+  console.log(`Dataset: ${colorize(snapshot.datasetName, ansi2.bold)}`);
   console.log(
     `Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
   );
   console.log(
-    `Total test cases: ${colorize(String(snapshot.totalTestCases), ansi.bold)}`
+    `Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`
   );
   console.log("");
   drawSpinner();
@@ -1116,17 +1594,17 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
     throw new Error(`Run failed: ${finalEvent.errorMessage}`);
   }
   console.log("");
-  console.log(colorize("=== Run Summary ===", `${ansi.bold}${ansi.cyan}`));
+  console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
   console.log(
     `- passed: ${colorize(
       `${finalEvent.passedTestCases}/${finalEvent.totalTestCases}`,
-      ansi.green
+      ansi2.green
     )}`
   );
   console.log(
     `- failed: ${colorize(
       `${finalEvent.failedTestCases}/${finalEvent.totalTestCases}`,
-      finalEvent.failedTestCases > 0 ? ansi.red : ansi.dim
+      finalEvent.failedTestCases > 0 ? ansi2.red : ansi2.dim
     )}`
   );
   if (overallScoreCount > 0) {
@@ -1135,22 +1613,22 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
       `- overall avg score: ${colorize(
         overallAverage.toFixed(2),
         scoreToColor(overallAverage)
-      )} ${colorize(createBar(overallAverage), ansi.dim)}`
+      )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
     );
   }
-  console.log(colorize("- evaluator averages:", ansi.magenta));
+  console.log(colorize("- evaluator averages:", ansi2.magenta));
   for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
     console.log(
       getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
     );
   }
   if (testCaseSummaries.length > 0) {
-    console.log(colorize("- test case scores:", ansi.magenta));
+    console.log(colorize("- test case scores:", ansi2.magenta));
     for (const summary of testCaseSummaries) {
-      const status = summary.passed ? colorize("PASS", ansi.green) : colorize("FAIL", ansi.red);
+      const status = summary.passed ? colorize("PASS", ansi2.green) : colorize("FAIL", ansi2.red);
       if (summary.averageScore === void 0) {
         console.log(
-          `  ${status} ${summary.name.padEnd(24)} score=n/a ${colorize(`(${summary.durationMs}ms)`, ansi.dim)}`
+          `  ${status} ${summary.name.padEnd(24)} score=n/a ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
         );
         continue;
       }
@@ -1158,11 +1636,30 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
         `  ${status} ${summary.name.padEnd(24)} score=${colorize(
           summary.averageScore.toFixed(2),
           scoreToColor(summary.averageScore)
-        )} ${colorize(createBar(summary.averageScore, 100, 14), ansi.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi.dim)}`
+        )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
       );
     }
   }
-  console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi.dim)}`);
+  console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
+}
+async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
+  return new Promise((resolve4, reject) => {
+    const app = render(
+      React2.createElement(RunView, {
+        runner,
+        datasetName,
+        evaluatorPattern,
+        onComplete: (err) => {
+          app.unmount();
+          if (err) {
+            reject(err);
+          } else {
+            resolve4();
+          }
+        }
+      })
+    );
+  });
 }
 // src/cli-simple/index.ts
@@ -1187,17 +1684,28 @@ async function main() {
     console.error("Missing required --dataset <datasetName> argument.");
     printUsageAndExit(1);
   }
+  if (args.command === "run" && !args.evaluatorPattern) {
+    console.error("Missing required --evaluator <name-or-pattern> argument.");
+    printUsageAndExit(1);
+  }
+  const useInk = process.stdout.isTTY === true;
+  if (!useInk) {
+    printBanner();
+  }
   const runner = createRunner();
   try {
     if (args.command === "run") {
-      if (!args.evaluatorPattern) {
-        console.error("Missing required --evaluator <name-or-pattern> argument.");
-        printUsageAndExit(1);
-      }
-      await runSimpleEvalCommand(runner, args.datasetName, args.evaluatorPattern);
+      await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
+        runner,
+        args.datasetName,
+        args.evaluatorPattern
+      );
       return;
     }
-    await generateDatasetJsonCommand(runner, args.datasetName);
+    await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
+      runner,
+      args.datasetName
+    );
   } finally {
     await runner.shutdown();
   }