npm - @m4trix/evals - Versions diffs - 0.14.0 → 0.15.0 - Mend

@m4trix/evals 0.14.0 → 0.15.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/dist/cli-simple.js CHANGED Viewed

@@ -1518,6 +1518,7 @@ function RunView({
     );
     setEvaluatorNameById(nameById);
     const aggregates = /* @__PURE__ */ new Map();
+    const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
     let overallScoreTotal = 0;
     let overallScoreSumSq = 0;
     let overallScoreCount = 0;
@@ -1547,6 +1548,12 @@ function RunView({
               overallScoreSumSq += numeric * numeric;
               overallScoreCount += 1;
             }
+            for (const s of item.scores) {
+              const key = `${item.evaluatorId}:${s.id}`;
+              const list = scoreItemsByEvaluatorScore.get(key) ?? [];
+              list.push(s);
+              scoreItemsByEvaluatorScore.set(key, list);
+            }
           }
           setTestCases((prev) => {
             const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
@@ -1617,6 +1624,7 @@ function RunView({
       overallScoreSumSq,
       overallScoreCount,
       aggregates: new Map(aggregates),
+      scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
       artifactPath: finalEvent.artifactPath
     });
     setPhase("completed");
@@ -1699,36 +1707,45 @@ function RunView({
               ":",
               " ",
               /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
-              " ",
-              item.scores.map((s) => /* @__PURE__ */ jsxs(
+              item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
+                " ",
+                item.metrics.map((m) => {
+                  const def = getMetricById(m.id);
+                  if (!def)
+                    return null;
+                  const formatted = def.format(m.data, {
+                    isAggregated: tc.isAggregated
+                  });
+                  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
+                    "[",
+                    def.name ? `${def.name}: ` : "",
+                    formatted,
+                    "]",
+                    " "
+                  ] }, m.id);
+                })
+              ] }) : null
+            ] }),
+            item.scores.length > 0 ? item.scores.map((s, idx) => {
+              const def = getScoreById(s.id);
+              const scoreLabel = def ? def.name ?? def.id : s.id;
+              return /* @__PURE__ */ jsxs(
                 Text,
                 {
                   color: scoreColor(toNumericScore(s.data) ?? 0),
                   children: [
+                    "      ",
+                    scoreLabel,
+                    ":",
+                    " ",
                     formatScorePart(s, scoreColor, {
                       isAggregated: tc.isAggregated
-                    }),
-                    " "
+                    })
                   ]
                 },
-                s.id
-              )),
-              item.metrics?.map((m) => {
-                const def = getMetricById(m.id);
-                if (!def)
-                  return null;
-                const formatted = def.format(m.data, {
-                  isAggregated: tc.isAggregated
-                });
-                return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
-                  "[",
-                  def.name ? `${def.name}: ` : "",
-                  formatted,
-                  "]",
-                  " "
-                ] }, m.id);
-              })
-            ] }),
+                `${item.evaluatorId}-${s.id}-${idx}`
+              );
+            }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "      n/a" }),
             !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
               (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
                 ({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
@@ -1786,26 +1803,54 @@ function RunView({
         /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
         Array.from(evaluatorNameById.entries()).map(([id, name]) => {
           const agg = summary.aggregates.get(id);
-          if (!agg || agg.count === 0) {
+          const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
+            (k) => k.startsWith(`${id}:`)
+          );
+          if (scoreKeys.length === 0) {
             return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
               "- ",
               name.padEnd(28),
-              " no numeric scores"
+              " no scores"
             ] }, id);
           }
-          const mean = agg.total / agg.count;
-          const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
-          const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
-          return /* @__PURE__ */ jsxs(Text, { children: [
-            "- ",
-            name.padEnd(28),
-            " avg=",
-            /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: meanStr }),
+          const passedFailed = agg != null ? /* @__PURE__ */ jsxs(Text, { children: [
             " ",
             "passed=",
             agg.passed,
             " failed=",
             agg.failed
+          ] }) : null;
+          return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
+            /* @__PURE__ */ jsxs(Text, { children: [
+              "- ",
+              name.padEnd(28),
+              passedFailed
+            ] }),
+            scoreKeys.map((key) => {
+              const items = summary.scoreItemsByEvaluatorScore?.get(key) ?? [];
+              const aggregated = aggregateScoreItems(items);
+              if (!aggregated)
+                return null;
+              const def = getScoreById(aggregated.id);
+              const label = def ? def.name ?? def.id : aggregated.id;
+              const formatted = def?.format(aggregated.data, {
+                isAggregated: true
+              }) ?? "n/a";
+              const numeric = toNumericScore(aggregated.data);
+              return /* @__PURE__ */ jsxs(
+                Text,
+                {
+                  color: numeric !== void 0 ? scoreColor(numeric) : "gray",
+                  children: [
+                    "    ",
+                    label,
+                    ": ",
+                    formatted
+                  ]
+                },
+                key
+              );
+            })
           ] }, id);
         })
       ] }),
@@ -1933,14 +1978,36 @@ function scoreToColor(score) {
   }
   return ansi2.red;
 }
-function getEvaluatorSummaryLine(evaluatorName, aggregate) {
-  if (!aggregate || aggregate.count === 0) {
-    return `- ${evaluatorName.padEnd(28)} no numeric scores`;
+function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
+  const lines = [];
+  const scoreKeys = [...scoreItemsByKey.keys()].filter(
+    (k) => k.startsWith(`${evaluatorId}:`)
+  );
+  if (scoreKeys.length === 0) {
+    lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
+    return lines;
+  }
+  const passedFailed = aggregate != null ? ` passed=${aggregate.passed} failed=${aggregate.failed}` : "";
+  const scoreLines = [];
+  for (const key of scoreKeys) {
+    const items = scoreItemsByKey.get(key) ?? [];
+    const agg = aggregateScoreItems(items);
+    if (!agg)
+      continue;
+    const def = getScoreById(agg.id);
+    const label = def ? def.name ?? def.id : agg.id;
+    const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
+    const numeric = toNumericScore(agg.data);
+    const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
+    scoreLines.push(`    ${label}: ${colored}`);
+  }
+  if (scoreLines.length > 0) {
+    lines.push(`- ${evaluatorName.padEnd(28)}${passedFailed}`);
+    lines.push(...scoreLines);
+  } else {
+    lines.push(`- ${evaluatorName.padEnd(28)} no numeric scores${passedFailed}`);
   }
-  const mean = aggregate.total / aggregate.count;
-  const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
-  const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
-  return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
+  return lines;
 }
 function createBar2(value, max = 100, width = 20) {
   const safe = Math.max(0, Math.min(max, value));
@@ -1992,46 +2059,8 @@ function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
 }
 function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
   const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
-  const scoreParts = [];
-  for (const item of scores) {
-    const def = getScoreById(item.id);
-    if (!def) {
-      const numeric = toNumericScore(item.data);
-      scoreParts.push(
-        numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
-      );
-      continue;
-    }
-    const formatted = def.format(item.data, options);
-    switch (def.displayStrategy) {
-      case "bar": {
-        const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
-        if (typeof numeric === "number" && Number.isFinite(numeric)) {
-          scoreParts.push(
-            `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
-          );
-        } else {
-          scoreParts.push(formatted);
-        }
-        break;
-      }
-      case "number":
-        scoreParts.push(formatted);
-        break;
-      case "passFail":
-        scoreParts.push(
-          colorize(
-            formatted,
-            item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
-          )
-        );
-        break;
-    }
-  }
-  const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
-  let line = `   ${name}: ${passLabel} ${scoreStr}`;
+  const metricParts = [];
   if (metrics && metrics.length > 0) {
-    const metricParts = [];
     for (const { id, data } of metrics) {
       const def = getMetricById(id);
       if (def) {
@@ -2041,11 +2070,49 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
         );
       }
     }
-    if (metricParts.length > 0) {
-      line += ` ${metricParts.join(" ")}`;
+  }
+  const scoreLines = [];
+  for (const item of scores) {
+    const def = getScoreById(item.id);
+    const scoreLabel = def ? def.name ?? def.id : item.id;
+    let formatted;
+    if (!def) {
+      const numeric = toNumericScore(item.data);
+      formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
+    } else {
+      const raw = def.format(item.data, options);
+      switch (def.displayStrategy) {
+        case "bar": {
+          const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
+          if (typeof numeric === "number" && Number.isFinite(numeric)) {
+            formatted = `${colorize(raw, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`;
+          } else {
+            formatted = raw;
+          }
+          break;
+        }
+        case "number":
+          formatted = raw;
+          break;
+        case "passFail":
+          formatted = colorize(
+            raw,
+            item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
+          );
+          break;
+      }
     }
+    scoreLines.push(`      ${scoreLabel}: ${formatted}`);
+  }
+  const lines = [];
+  const metricStr = metricParts.length > 0 ? ` ${metricParts.join(" ")}` : "";
+  lines.push(`   ${name}: ${passLabel}${metricStr}`);
+  if (scoreLines.length > 0) {
+    lines.push(...scoreLines);
+  } else {
+    lines.push(`      n/a`);
   }
-  return line;
+  return lines;
 }
 async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
   const dataset = await runner.resolveDatasetByName(datasetName);
@@ -2068,6 +2135,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
     evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
   );
   const aggregates = /* @__PURE__ */ new Map();
+  const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
   const testCaseByTestId = /* @__PURE__ */ new Map();
   let overallScoreTotal = 0;
   let overallScoreSumSq = 0;
@@ -2143,6 +2211,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
             overallScoreSumSq += numeric * numeric;
             overallScoreCount += 1;
           }
+          for (const s of item.scores) {
+            const key = `${item.evaluatorId}:${s.id}`;
+            const list = scoreItemsByEvaluatorScore.get(key) ?? [];
+            list.push(s);
+            scoreItemsByEvaluatorScore.set(key, list);
+          }
         }
         const isSameTestCase = lastPrintedTestCaseId === testCaseId;
         const isLastRerun = event.rerunIndex >= event.rerunTotal;
@@ -2166,7 +2240,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
         for (const item of aggregatedScores) {
           const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
           lines.push(
-            formatEvaluatorScoreLine(
+            ...formatEvaluatorScoreLine(
               name,
               item.scores,
               item.passed,
@@ -2263,9 +2337,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
   }
   console.log(colorize("- evaluator averages:", ansi2.magenta));
   for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
-    console.log(
-      getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
+    const evaluatorLines = getEvaluatorSummaryLines(
+      evaluatorId,
+      evaluatorName,
+      aggregates.get(evaluatorId),
+      scoreItemsByEvaluatorScore
     );
+    for (const line of evaluatorLines) {
+      console.log(line);
+    }
   }
   const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
   if (testCaseSummaries.length > 0) {