npm - @m4trix/evals - Versions diffs - 0.9.1 → 0.10.0 - Mend

@m4trix/evals 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/cli-simple.js CHANGED Viewed

@@ -7,6 +7,9 @@ import * as jitiModule from 'jiti';
 import { writeFile, mkdir, appendFile, readdir } from 'fs/promises';
 import { pathToFileURL } from 'url';
 import 'json-diff';
+import React2, { useState, useEffect, useCallback } from 'react';
+import { render, Box, Text } from 'ink';
+import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
 // src/runner/config.ts
 var defaultRunnerConfig = {
@@ -886,6 +889,88 @@ function printBanner() {
   ];
   console.log(lines.join("\n"));
 }
+/**
+ * Ink banner: a rounded, cyan-bordered box containing the package name and
+ * the CLI name, separated by a cyan middle dot.
+ */
+function Banner() {
+  const segments = [
+    /* @__PURE__ */ jsx(Text, { color: "gray", children: "@m4trix/evals" }),
+    /* @__PURE__ */ jsx(Text, { color: "cyan", children: " \xB7 " }),
+    /* @__PURE__ */ jsx(Text, { color: "gray", children: "eval-agents-simple" })
+  ];
+  const boxProps = {
+    borderStyle: "round",
+    borderColor: "cyan",
+    paddingX: 1,
+    paddingY: 0,
+    children: segments
+  };
+  return /* @__PURE__ */ jsxs(Box, boxProps);
+}
+/**
+ * Ink view for the "generate" command.
+ *
+ * On mount it resolves the dataset by name, collects its test cases, writes
+ * them as pretty-printed JSON to `<dataset file name>.cases.json` next to the
+ * dataset file, and then renders a short success summary.
+ *
+ * @param {object} props
+ * @param {object} props.runner - Runner exposing `resolveDatasetByName` and
+ *   `collectDatasetTestCases`.
+ * @param {string} props.datasetName - Name of the dataset to export.
+ * @param {(err?: Error) => void} props.onComplete - Called without arguments
+ *   after a successful export, or with an Error when the dataset is missing
+ *   or any step of the export fails.
+ */
+function GenerateView({
+  runner,
+  datasetName,
+  onComplete
+}) {
+  const [result, setResult] = useState(null);
+  const [error, setError] = useState(null);
+  useEffect(() => {
+    // Set by the effect cleanup so a stale run does not touch state.
+    let cancelled = false;
+    async function run() {
+      const dataset = await runner.resolveDatasetByName(datasetName);
+      if (!dataset) {
+        throw new Error(`Dataset "${datasetName}" not found.`);
+      }
+      const { writeFile: writeFile2 } = await import('fs/promises');
+      const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
+      const testCases = await runner.collectDatasetTestCases(dataset.id);
+      const payload = testCases.map(({ testCase }) => ({
+        name: testCase.getName(),
+        input: testCase.getInput(),
+        // getOutput is optional on a test case.
+        output: typeof testCase.getOutput === "function" ? testCase.getOutput() : void 0
+      }));
+      const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
+      const parsed = parse2(absoluteDatasetPath);
+      const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
+      await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
+      if (!cancelled) {
+        setResult({
+          count: payload.length,
+          datasetName: dataset.dataset.getName(),
+          outputPath
+        });
+        // NOTE(review): the 200ms delay presumably lets Ink paint the result
+        // before the caller unmounts the app — confirm.
+        setTimeout(() => onComplete(), 200);
+      }
+    }
+    // Every failure (missing dataset, runner error, write error) is routed to
+    // onComplete so the caller's promise always settles instead of hanging on
+    // an unhandled rejection.
+    run().catch((cause) => {
+      const failure = cause instanceof Error ? cause : new Error(String(cause));
+      if (!cancelled) {
+        setError(failure);
+      }
+      onComplete(failure);
+    });
+    return () => {
+      cancelled = true;
+    };
+  }, [runner, datasetName, onComplete]);
+  if (error) {
+    return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
+      /* @__PURE__ */ jsx(Banner, {}),
+      /* @__PURE__ */ jsx(Text, { color: "red", children: error.message })
+    ] });
+  }
+  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
+    /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
+    result && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
+      /* @__PURE__ */ jsxs(Text, { color: "green", children: [
+        "Generated ",
+        result.count,
+        ' test cases for dataset "',
+        result.datasetName,
+        '".'
+      ] }),
+      /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+        "Wrote ",
+        result.outputPath
+      ] })
+    ] })
+  ] });
+}
+// src/cli-simple/generate.ts
 function readOutput2(testCase) {
   if (typeof testCase.getOutput !== "function") {
     return void 0;
@@ -896,7 +981,7 @@ function createOutputPath(datasetFilePath) {
   const parsed = parse(datasetFilePath);
   return join(parsed.dir, `${parsed.name}.cases.json`);
 }
-async function generateDatasetJsonCommand(runner, datasetName) {
+async function generateDatasetJsonCommandPlain(runner, datasetName) {
   const dataset = await runner.resolveDatasetByName(datasetName);
   if (!dataset) {
     throw new Error(`Dataset "${datasetName}" not found.`);
@@ -914,6 +999,380 @@ async function generateDatasetJsonCommand(runner, datasetName) {
   console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
   console.log(`Wrote ${outputPath}`);
 }
+/**
+ * Renders GenerateView with Ink and returns a promise that settles when the
+ * view reports completion: resolved on success, rejected with the reported
+ * error otherwise. The Ink app is unmounted before the promise settles.
+ */
+async function generateDatasetJsonCommandInk(runner, datasetName) {
+  return new Promise((onSuccess, onFailure) => {
+    const handleComplete = (err) => {
+      app.unmount();
+      if (err) {
+        onFailure(err);
+        return;
+      }
+      onSuccess();
+    };
+    const element = React2.createElement(GenerateView, {
+      runner,
+      datasetName,
+      onComplete: handleComplete
+    });
+    const app = render(element);
+  });
+}
+/**
+ * Maps a percentage to a bar color: "green" at 70 and above, "yellow" at 40
+ * and above, "red" for everything else.
+ */
+function barColor(pct) {
+  return pct >= 70 ? "green" : pct >= 40 ? "yellow" : "red";
+}
+/**
+ * Renders one labelled text progress bar: `label [████░░░░] formatted-value`.
+ *
+ * @param {object} props
+ * @param {string} props.label - Label, right-padded to `labelWidth`.
+ * @param {number} props.value - Value to plot; clamped to `[0, max]` for the
+ *   bar, but passed unclamped to `format` for the trailing text.
+ * @param {number} [props.max=100] - Value that corresponds to a full bar.
+ * @param {number} [props.labelWidth=14] - Width the label is padded to.
+ * @param {number} [props.barWidth=20] - Number of cells in the bar.
+ * @param {(v: number) => string} [props.format] - Formats the trailing value.
+ * @param {boolean} [props.colorByValue=true] - Color the filled part via
+ *   `barColor`; when false the bar is rendered as one uncolored string.
+ */
+function TextBar({
+  label,
+  value,
+  max = 100,
+  labelWidth = 14,
+  barWidth = 20,
+  format = (v) => String(v),
+  colorByValue = true
+}) {
+  // A negative width would make String.prototype.repeat throw a RangeError.
+  const width = Math.max(0, barWidth);
+  const clamped = Math.max(0, Math.min(max, value));
+  // One guarded ratio feeds both the color and the fill, so a non-positive
+  // max or a NaN value yields an empty bar of full width rather than NaN.
+  const ratio = max > 0 && Number.isFinite(clamped) ? clamped / max : 0;
+  const pct = ratio * 100;
+  const filled = Math.round(ratio * width);
+  const filledBar = "\u2588".repeat(filled);
+  const emptyBar = "\u2591".repeat(Math.max(0, width - filled));
+  const color = colorByValue ? barColor(pct) : void 0;
+  return /* @__PURE__ */ jsxs(Text, { children: [
+    /* @__PURE__ */ jsx(Text, { color: "gray", children: label.padEnd(labelWidth) }),
+    " [",
+    color ? /* @__PURE__ */ jsxs(Fragment, { children: [
+      /* @__PURE__ */ jsx(Text, { color, children: filledBar }),
+      /* @__PURE__ */ jsx(Text, { color: "gray", children: emptyBar })
+    ] }) : filledBar + emptyBar,
+    "] ",
+    /* @__PURE__ */ jsx(Text, { color: color ?? "white", bold: true, children: format(value) })
+  ] });
+}
+// Braille glyphs cycled by Spinner, one per tick.
+var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
+/**
+ * Animated cyan spinner followed by a label. Advances one frame every 100ms
+ * and clears its interval when the component unmounts.
+ */
+function Spinner({ label = "Running" }) {
+  const [frameIndex, setFrameIndex] = useState(0);
+  useEffect(() => {
+    const advance = () => {
+      setFrameIndex((current) => (current + 1) % FRAMES.length);
+    };
+    const intervalId = setInterval(advance, 100);
+    return () => {
+      clearInterval(intervalId);
+    };
+  }, []);
+  const glyph = FRAMES[frameIndex];
+  return /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
+    glyph,
+    " ",
+    label
+  ] });
+}
+/**
+ * Maps a score to a display color: "green" at 80 and above, "yellow" at 50
+ * and above, "red" for everything else.
+ */
+function scoreColor(score) {
+  return score >= 80 ? "green" : score >= 50 ? "yellow" : "red";
+}
+/**
+ * Builds a text bar of `width` cells: filled blocks for the share of `value`
+ * in `[0, max]`, light-shade blocks for the remainder.
+ *
+ * @param {number} value - Value to plot; clamped to `[0, max]`.
+ * @param {number} [max=100] - Value that corresponds to a full bar.
+ * @param {number} [width=20] - Total number of cells.
+ * @returns {string} The bar. A non-positive `max` or a NaN `value` gives an
+ *   all-empty bar of `width` cells; a negative `width` gives an empty string.
+ */
+function createBar(value, max = 100, width = 20) {
+  // A negative count would make String.prototype.repeat throw a RangeError.
+  const cells = Math.max(0, width);
+  const safe = Math.max(0, Math.min(max, value));
+  // Guard the division so max <= 0 or NaN input cannot produce a NaN fill,
+  // which repeat() would silently turn into a zero-length bar.
+  const ratio = max > 0 && Number.isFinite(safe) ? safe / max : 0;
+  const filled = Math.round(ratio * cells);
+  return "\u2588".repeat(filled) + "\u2591".repeat(cells - filled);
+}
+/**
+ * Formats one score item for display.
+ *
+ * Without a registered score definition, falls back to the numeric score with
+ * two decimals, or "n/a" when no numeric score can be derived. With a
+ * definition, uses its `format` output and, for the "bar" display strategy,
+ * appends a text bar when a finite numeric value is available.
+ *
+ * The second parameter is not read by this function.
+ */
+function formatScorePart(item, scoreToColor2) {
+  const { id, data } = item;
+  const definition = getScoreById(id);
+  if (!definition) {
+    const fallback = toNumericScore(data);
+    if (fallback === void 0) {
+      return "n/a";
+    }
+    return `${fallback.toFixed(2)}`;
+  }
+  const text = definition.format(data);
+  if (definition.displayStrategy !== "bar") {
+    return text;
+  }
+  // Prefer an explicit `value` field on object payloads over the generic
+  // numeric conversion.
+  const hasValueField = typeof data === "object" && data !== null && "value" in data;
+  const barValue = hasValueField ? data.value : toNumericScore(data);
+  const isPlottable = typeof barValue === "number" && Number.isFinite(barValue);
+  return isPlottable ? `${text} ${createBar(barValue)}` : text;
+}
+/**
+ * Ink view for the "run" command.
+ *
+ * On mount it resolves the dataset and the evaluators matching
+ * `evaluatorPattern`, subscribes to run events, starts the run, and renders
+ * the run header, a progress spinner, one entry per finished test case, and a
+ * final summary with per-evaluator averages.
+ *
+ * @param {object} props
+ * @param {object} props.runner - Runner used to resolve datasets/evaluators,
+ *   subscribe to run events and start the run.
+ * @param {string} props.datasetName - Name of the dataset to run.
+ * @param {string} props.evaluatorPattern - Evaluator name or pattern.
+ * @param {(err?: Error) => void} props.onComplete - Called without arguments
+ *   once the run completed, or with an Error when the dataset/evaluators
+ *   cannot be resolved, the run fails, or any runner call rejects.
+ */
+function RunView({
+  runner,
+  datasetName,
+  evaluatorPattern,
+  onComplete
+}) {
+  const [phase, setPhase] = useState(
+    "loading"
+  );
+  const [runInfo, setRunInfo] = useState(null);
+  const [testCases, setTestCases] = useState([]);
+  const [summary, setSummary] = useState(null);
+  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
+  const runEval = useCallback(async () => {
+    // Held outside the try block so the subscription can be released on every
+    // exit path, including a rejected runner call.
+    let unsubscribe;
+    const stopListening = () => {
+      const stop = unsubscribe;
+      unsubscribe = void 0;
+      stop?.();
+    };
+    try {
+      const dataset = await runner.resolveDatasetByName(datasetName);
+      if (!dataset) {
+        const known = await runner.collectDatasets();
+        const available = known.map((item) => item.dataset.getName()).sort();
+        onComplete(
+          new Error(
+            available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
+          )
+        );
+        return;
+      }
+      const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
+      if (evaluators.length === 0) {
+        const known = await runner.collectEvaluators();
+        const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
+        onComplete(
+          new Error(
+            available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
+          )
+        );
+        return;
+      }
+      const nameById = new Map(
+        evaluators.map((item) => [
+          item.id,
+          item.evaluator.getName() ?? item.id
+        ])
+      );
+      setEvaluatorNameById(nameById);
+      // Running totals per evaluator id, plus overall totals across all
+      // numeric scores; filled from TestCaseProgress events.
+      const aggregates = /* @__PURE__ */ new Map();
+      let overallScoreTotal = 0;
+      let overallScoreCount = 0;
+      // Subscribe before starting the run; `done` resolves with the terminal
+      // event (RunCompleted or RunFailed).
+      const done = new Promise((resolve4) => {
+        unsubscribe = runner.subscribeRunEvents((event) => {
+          if (event.type === "TestCaseProgress") {
+            const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
+            const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
+            for (const item of event.evaluatorScores) {
+              const numeric = toNumericScoreFromScores(item.scores);
+              if (numeric !== void 0) {
+                const current = aggregates.get(item.evaluatorId) ?? {
+                  total: 0,
+                  count: 0,
+                  passed: 0,
+                  failed: 0
+                };
+                aggregates.set(item.evaluatorId, {
+                  total: current.total + numeric,
+                  count: current.count + 1,
+                  passed: current.passed + (item.passed ? 1 : 0),
+                  failed: current.failed + (item.passed ? 0 : 1)
+                });
+                overallScoreTotal += numeric;
+                overallScoreCount += 1;
+              }
+            }
+            setTestCases((prev) => [
+              ...prev,
+              {
+                name: event.testCaseName,
+                completedTestCases: event.completedTestCases,
+                totalTestCases: event.totalTestCases,
+                durationMs: event.durationMs,
+                passed: event.passed,
+                averageScore,
+                evaluatorScores: event.evaluatorScores.map((item) => ({
+                  evaluatorId: item.evaluatorId,
+                  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
+                  scores: item.scores,
+                  passed: item.passed,
+                  metrics: item.metrics
+                }))
+              }
+            ]);
+          }
+          if (event.type === "RunCompleted" || event.type === "RunFailed") {
+            stopListening();
+            resolve4(event);
+          }
+        });
+      });
+      const snapshot = await runner.runDatasetWith({
+        datasetId: dataset.id,
+        evaluatorIds: evaluators.map((item) => item.id)
+      });
+      setRunInfo({
+        runId: snapshot.runId,
+        datasetName: snapshot.datasetName,
+        evaluatorNames: evaluators.map(
+          (e) => e.evaluator.getName() ?? e.id
+        ),
+        totalTestCases: snapshot.totalTestCases
+      });
+      setPhase("running");
+      const finalEvent = await done;
+      if (finalEvent.type === "RunFailed") {
+        onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
+        return;
+      }
+      setSummary({
+        passedTestCases: finalEvent.passedTestCases,
+        failedTestCases: finalEvent.failedTestCases,
+        totalTestCases: finalEvent.totalTestCases,
+        overallScoreTotal,
+        overallScoreCount,
+        aggregates: new Map(aggregates),
+        artifactPath: finalEvent.artifactPath
+      });
+      setPhase("completed");
+      // NOTE(review): the 200ms delay presumably lets Ink paint the summary
+      // before the caller unmounts the app — confirm.
+      setTimeout(() => onComplete(), 200);
+    } catch (cause) {
+      // Without this, a rejected runner call is an unhandled rejection and
+      // onComplete is never invoked, leaving the CLI promise pending.
+      onComplete(cause instanceof Error ? cause : new Error(String(cause)));
+    } finally {
+      stopListening();
+    }
+  }, [runner, datasetName, evaluatorPattern, onComplete]);
+  useEffect(() => {
+    void runEval();
+  }, [runEval]);
+  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
+    /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
+    runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
+      /* @__PURE__ */ jsxs(Text, { children: [
+        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run " }),
+        /* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
+      ] }),
+      /* @__PURE__ */ jsxs(Text, { children: [
+        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Dataset " }),
+        runInfo.datasetName
+      ] }),
+      /* @__PURE__ */ jsxs(Text, { children: [
+        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Evaluators " }),
+        runInfo.evaluatorNames.join(", ")
+      ] }),
+      /* @__PURE__ */ jsxs(Text, { children: [
+        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Test cases " }),
+        runInfo.totalTestCases
+      ] })
+    ] }),
+    phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
+      Spinner,
+      {
+        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
+      }
+    ) }),
+    testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
+      /* @__PURE__ */ jsxs(Text, { children: [
+        /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
+          "[",
+          tc.completedTestCases,
+          "/",
+          tc.totalTestCases,
+          "]"
+        ] }),
+        " ",
+        tc.name,
+        /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+          " (",
+          tc.durationMs,
+          "ms)"
+        ] })
+      ] }),
+      tc.evaluatorScores.map((item) => /* @__PURE__ */ jsx(Box, { marginLeft: 2, children: /* @__PURE__ */ jsxs(Text, { children: [
+        item.evaluatorName,
+        ":",
+        " ",
+        /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
+        " ",
+        item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
+          formatScorePart(s),
+          " "
+        ] }, s.id)),
+        item.metrics?.map((m) => {
+          const def = getMetricById(m.id);
+          if (!def)
+            return null;
+          const formatted = def.format(m.data);
+          return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+            "[",
+            def.name ? `${def.name}: ` : "",
+            formatted,
+            "]",
+            " "
+          ] }, m.id);
+        })
+      ] }) }, item.evaluatorId))
+    ] }, i)) }),
+    phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
+      /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
+      /* @__PURE__ */ jsxs(Box, { marginTop: 1, children: [
+        /* @__PURE__ */ jsx(Text, { color: "green", children: "passed" }),
+        /* @__PURE__ */ jsxs(Text, { children: [
+          " ",
+          summary.passedTestCases,
+          "/",
+          summary.totalTestCases
+        ] })
+      ] }),
+      /* @__PURE__ */ jsxs(Box, { children: [
+        /* @__PURE__ */ jsx(Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
+        /* @__PURE__ */ jsxs(Text, { children: [
+          " ",
+          summary.failedTestCases,
+          "/",
+          summary.totalTestCases
+        ] })
+      ] }),
+      summary.overallScoreCount > 0 && /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsx(
+        TextBar,
+        {
+          label: "overall avg",
+          value: summary.overallScoreTotal / summary.overallScoreCount,
+          barWidth: 20,
+          format: (v) => v.toFixed(2)
+        }
+      ) }),
+      /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
+        /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
+        Array.from(evaluatorNameById.entries()).map(([id, name]) => {
+          const agg = summary.aggregates.get(id);
+          if (!agg || agg.count === 0) {
+            return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+              "- ",
+              name.padEnd(28),
+              " no numeric scores"
+            ] }, id);
+          }
+          const mean = agg.total / agg.count;
+          return /* @__PURE__ */ jsxs(Text, { children: [
+            "- ",
+            name.padEnd(28),
+            " avg=",
+            /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
+            " passed=",
+            agg.passed,
+            " failed=",
+            agg.failed
+          ] }, id);
+        })
+      ] }),
+      /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
+        /* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
+        testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { children: [
+          /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
+          /* @__PURE__ */ jsxs(Text, { children: [
+            " ",
+            tc.name.padEnd(24)
+          ] }),
+          tc.averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
+            /* @__PURE__ */ jsxs(Text, { color: scoreColor(tc.averageScore), children: [
+              "score=",
+              tc.averageScore.toFixed(2)
+            ] }),
+            /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+              " ",
+              createBar(tc.averageScore, 100, 14)
+            ] })
+          ] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
+          /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+            " (",
+            tc.durationMs,
+            "ms)"
+          ] })
+        ] }, i))
+      ] }),
+      /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+        "artifact: ",
+        summary.artifactPath
+      ] }) })
+    ] })
+  ] });
+}
 // src/cli-simple/run.ts
 var ansi2 = {
@@ -945,7 +1404,7 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
   const mean = aggregate.total / aggregate.count;
   return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
 }
-function createBar(value, max = 100, width = 20) {
+function createBar2(value, max = 100, width = 20) {
   const safe = Math.max(0, Math.min(max, value));
   const filled = Math.round(safe / max * width);
   return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
@@ -968,7 +1427,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
         const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
         if (typeof numeric === "number" && Number.isFinite(numeric)) {
           scoreParts.push(
-            `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar(numeric), ansi2.dim)}`
+            `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
           );
         } else {
           scoreParts.push(formatted);
@@ -1007,7 +1466,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
   }
   return line;
 }
-async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
+async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
   const dataset = await runner.resolveDatasetByName(datasetName);
   if (!dataset) {
     const known = await runner.collectDatasets();
@@ -1154,7 +1613,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
       `- overall avg score: ${colorize(
         overallAverage.toFixed(2),
         scoreToColor(overallAverage)
-      )} ${colorize(createBar(overallAverage), ansi2.dim)}`
+      )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
     );
   }
   console.log(colorize("- evaluator averages:", ansi2.magenta));
@@ -1177,12 +1636,31 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
         `  ${status} ${summary.name.padEnd(24)} score=${colorize(
           summary.averageScore.toFixed(2),
           scoreToColor(summary.averageScore)
-        )} ${colorize(createBar(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
+        )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
       );
     }
   }
   console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
 }
+/**
+ * Renders RunView with Ink and returns a promise that settles when the view
+ * reports completion: resolved on success, rejected with the reported error
+ * otherwise. The Ink app is unmounted before the promise settles.
+ */
+async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
+  return new Promise((onSuccess, onFailure) => {
+    const handleComplete = (err) => {
+      app.unmount();
+      if (err) {
+        onFailure(err);
+        return;
+      }
+      onSuccess();
+    };
+    const element = React2.createElement(RunView, {
+      runner,
+      datasetName,
+      evaluatorPattern,
+      onComplete: handleComplete
+    });
+    const app = render(element);
+  });
+}
 // src/cli-simple/index.ts
 function printUsageAndExit(exitCode) {
@@ -1210,14 +1688,24 @@ async function main() {
     console.error("Missing required --evaluator <name-or-pattern> argument.");
     printUsageAndExit(1);
   }
-  printBanner();
+  const useInk = process.stdout.isTTY === true;
+  if (!useInk) {
+    printBanner();
+  }
   const runner = createRunner();
   try {
     if (args.command === "run") {
-      await runSimpleEvalCommand(runner, args.datasetName, args.evaluatorPattern);
+      await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
+        runner,
+        args.datasetName,
+        args.evaluatorPattern
+      );
       return;
     }
-    await generateDatasetJsonCommand(runner, args.datasetName);
+    await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
+      runner,
+      args.datasetName
+    );
   } finally {
     await runner.shutdown();
   }