npm - @darkrishabh/bench-ai - Versions diffs - 1.0.0 - Mend

@darkrishabh/bench-ai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (112) hide show

package/README.md +333 -0
package/dist/cli/app.d.ts +11 -0
package/dist/cli/app.d.ts.map +1 -0
package/dist/cli/app.js +48 -0
package/dist/cli/app.js.map +1 -0
package/dist/cli/components/DiffView.d.ts +5 -0
package/dist/cli/components/DiffView.d.ts.map +1 -0
package/dist/cli/components/DiffView.js +14 -0
package/dist/cli/components/DiffView.js.map +1 -0
package/dist/cli/components/EvalView.d.ts +6 -0
package/dist/cli/components/EvalView.d.ts.map +1 -0
package/dist/cli/components/EvalView.js +82 -0
package/dist/cli/components/EvalView.js.map +1 -0
package/dist/cli/components/Spinner.d.ts +4 -0
package/dist/cli/components/Spinner.d.ts.map +1 -0
package/dist/cli/components/Spinner.js +15 -0
package/dist/cli/components/Spinner.js.map +1 -0
package/dist/cli/index.d.ts +3 -0
package/dist/cli/index.d.ts.map +1 -0
package/dist/cli/index.js +117 -0
package/dist/cli/index.js.map +1 -0
package/dist/cli/run-command.d.ts +11 -0
package/dist/cli/run-command.d.ts.map +1 -0
package/dist/cli/run-command.js +119 -0
package/dist/cli/run-command.js.map +1 -0
package/dist/engine/cost.d.ts +3 -0
package/dist/engine/cost.d.ts.map +1 -0
package/dist/engine/cost.js +52 -0
package/dist/engine/cost.js.map +1 -0
package/dist/engine/diff.d.ts +6 -0
package/dist/engine/diff.d.ts.map +1 -0
package/dist/engine/diff.js +43 -0
package/dist/engine/diff.js.map +1 -0
package/dist/engine/eval.d.ts +14 -0
package/dist/engine/eval.d.ts.map +1 -0
package/dist/engine/eval.js +194 -0
package/dist/engine/eval.js.map +1 -0
package/dist/engine/index.d.ts +15 -0
package/dist/engine/index.d.ts.map +1 -0
package/dist/engine/index.js +10 -0
package/dist/engine/index.js.map +1 -0
package/dist/engine/providers/base.d.ts +7 -0
package/dist/engine/providers/base.d.ts.map +1 -0
package/dist/engine/providers/base.js +2 -0
package/dist/engine/providers/base.js.map +1 -0
package/dist/engine/providers/claude.d.ts +15 -0
package/dist/engine/providers/claude.d.ts.map +1 -0
package/dist/engine/providers/claude.js +53 -0
package/dist/engine/providers/claude.js.map +1 -0
package/dist/engine/providers/minimax.d.ts +16 -0
package/dist/engine/providers/minimax.d.ts.map +1 -0
package/dist/engine/providers/minimax.js +67 -0
package/dist/engine/providers/minimax.js.map +1 -0
package/dist/engine/providers/ollama.d.ts +14 -0
package/dist/engine/providers/ollama.d.ts.map +1 -0
package/dist/engine/providers/ollama.js +60 -0
package/dist/engine/providers/ollama.js.map +1 -0
package/dist/engine/providers/openai-compatible.d.ts +19 -0
package/dist/engine/providers/openai-compatible.d.ts.map +1 -0
package/dist/engine/providers/openai-compatible.js +109 -0
package/dist/engine/providers/openai-compatible.js.map +1 -0
package/dist/engine/providers/subprocess.d.ts +55 -0
package/dist/engine/providers/subprocess.d.ts.map +1 -0
package/dist/engine/providers/subprocess.js +111 -0
package/dist/engine/providers/subprocess.js.map +1 -0
package/dist/engine/suite-loader.d.ts +11 -0
package/dist/engine/suite-loader.d.ts.map +1 -0
package/dist/engine/suite-loader.js +75 -0
package/dist/engine/suite-loader.js.map +1 -0
package/dist/engine/types.d.ts +104 -0
package/dist/engine/types.d.ts.map +1 -0
package/dist/engine/types.js +2 -0
package/dist/engine/types.js.map +1 -0
package/next-env.d.ts +6 -0
package/next.config.ts +26 -0
package/package.json +72 -0
package/public/icon.svg +14 -0
package/src/app/api/diff/route.ts +135 -0
package/src/app/api/models/route.ts +96 -0
package/src/app/api/suite/route.ts +314 -0
package/src/app/globals.css +215 -0
package/src/app/icon.svg +14 -0
package/src/app/layout.tsx +44 -0
package/src/app/opengraph-image.tsx +73 -0
package/src/app/page.tsx +952 -0
package/src/app/suite/layout.tsx +12 -0
package/src/app/suite/page.tsx +206 -0
package/src/app/twitter-image.tsx +1 -0
package/src/components/BenchAiLogo.tsx +38 -0
package/src/components/ComparePanel.tsx +643 -0
package/src/components/ConfigPanel.tsx +809 -0
package/src/components/MarkdownOutput.tsx +16 -0
package/src/components/ModelResponseCard.tsx +313 -0
package/src/components/QuickComparisonBar.tsx +184 -0
package/src/components/ResponsesLineDiff.tsx +149 -0
package/src/components/SettingsPanel.tsx +591 -0
package/src/components/SuitePanel.tsx +875 -0
package/src/lib/brand.ts +4 -0
package/src/lib/config-yaml.ts +70 -0
package/src/lib/consume-suite-sse.ts +70 -0
package/src/lib/describe-judge.ts +23 -0
package/src/lib/model-chip-palette.ts +9 -0
package/src/lib/openai-model-list.ts +33 -0
package/src/lib/provider-ui.ts +30 -0
package/src/lib/resolve-credentials.ts +80 -0
package/src/lib/run-history.ts +66 -0
package/src/lib/simple-line-diff.ts +50 -0
package/src/lib/storage.ts +100 -0
package/src/lib/suite-judge-meta.ts +13 -0
package/src/lib/suite-run-history.ts +81 -0
package/src/types.ts +170 -0
package/vercel.json +5 -0

package/src/lib/brand.ts ADDED Viewed

@@ -0,0 +1,4 @@
+/** Product name and copy for UI + site metadata */
+// Product name string.
+export const BRAND_NAME = "Bench AI";
+// One-sentence tagline string.
+export const BRAND_TAGLINE = "One prompt, many models — compare quality, speed, and cost";
+// Subtitle copy for the suite feature (where it is rendered is not visible from this file).
+export const BRAND_SUITE_SUBTITLE = "YAML evaluations against your enabled models";

package/src/lib/config-yaml.ts ADDED Viewed

@@ -0,0 +1,70 @@
+import { load } from "js-yaml";
+import { dump } from "js-yaml";
+import type { AppConfigYaml, JudgeSettings, LLMInstance, SecretsMap } from "../types";
+import { APP_CONFIG_VERSION, DEFAULT_JUDGE_SETTINGS } from "../types";
+/**
+ * Serialize secrets, judge settings, and instances into a versioned YAML document.
+ *
+ * Shallow copies of every input are written, so the caller's objects are not
+ * handed to the serializer directly. Secret values are emitted as-is.
+ *
+ * @param params Current secrets map, judge settings, and configured instances.
+ * @returns YAML text stamped with `APP_CONFIG_VERSION`.
+ */
+export function exportAppConfigYaml(params: {
+  secrets: SecretsMap;
+  judge: JudgeSettings;
+  instances: LLMInstance[];
+}): string {
+  const secretsCopy = { ...params.secrets };
+  const judgeCopy = { ...params.judge };
+  const instanceCopies = params.instances.map((instance) => ({ ...instance }));
+  const doc: AppConfigYaml = {
+    version: APP_CONFIG_VERSION,
+    secrets: secretsCopy,
+    judge: judgeCopy,
+    instances: instanceCopies,
+  };
+  return dump(doc, { lineWidth: 120, noRefs: true, quotingType: '"' });
+}
+/**
+ * Parse an app-config YAML document and keep only the parts that pass basic shape checks.
+ *
+ * - `version`: a non-numeric or missing value is treated as 1.
+ * - `secrets`: only string-valued entries of a mapping are kept; anything else yields `{}`.
+ * - `judge`: kept only when it is a mapping (individual fields are not validated here).
+ * - `instances`: kept only when it is an array; entries lacking a string `id`,
+ *   `provider`, `model` or a boolean `enabled` are dropped.
+ *
+ * @param yaml Raw YAML text.
+ * @returns The validated config; `judge` / `instances` are `undefined` when absent or malformed.
+ * @throws Error when the top-level value is not a mapping, or the version differs
+ *   from `APP_CONFIG_VERSION`.
+ */
+export function parseAppConfigYaml(yaml: string): AppConfigYaml {
+  const isMapping = (value: unknown): value is Record<string, unknown> =>
+    typeof value === "object" && value !== null && !Array.isArray(value);
+  // NOTE(review): `yaml` is user-supplied. js-yaml 4.x `load` uses a safe default schema;
+  // confirm the pinned major version before relying on that.
+  const raw = load(yaml) as unknown;
+  // Arrays are objects too, so a top-level sequence has to be rejected explicitly.
+  if (!isMapping(raw)) {
+    throw new Error("Config must be a YAML mapping");
+  }
+  const o = raw;
+  const version = typeof o.version === "number" ? o.version : 1;
+  if (version !== APP_CONFIG_VERSION) {
+    throw new Error(`Unsupported config version: ${version} (expected ${APP_CONFIG_VERSION})`);
+  }
+  const secrets: SecretsMap = isMapping(o.secrets)
+    ? Object.fromEntries(
+        Object.entries(o.secrets).filter(
+          (entry): entry is [string, string] => typeof entry[1] === "string"
+        )
+      )
+    : {};
+  let judge: Partial<JudgeSettings> | undefined;
+  if (isMapping(o.judge)) {
+    judge = o.judge as Partial<JudgeSettings>;
+  }
+  let instances: LLMInstance[] | undefined;
+  if (Array.isArray(o.instances)) {
+    instances = o.instances.filter(
+      (x): x is LLMInstance =>
+        x !== null &&
+        typeof x === "object" &&
+        typeof (x as LLMInstance).id === "string" &&
+        typeof (x as LLMInstance).provider === "string" &&
+        typeof (x as LLMInstance).model === "string" &&
+        typeof (x as LLMInstance).enabled === "boolean"
+    );
+  }
+  return { version, secrets, judge, instances };
+}
+/**
+ * Overlay an imported config on the current state.
+ *
+ * Imported secrets and judge fields win key-by-key over the current ones (judge
+ * is additionally seeded with `DEFAULT_JUDGE_SETTINGS`). Imported instances
+ * replace the current list wholesale, but only when the import carries an array.
+ *
+ * @param parsed Config produced by parsing an imported YAML document.
+ * @param current State currently held by the app.
+ * @returns The merged state; neither input is mutated.
+ */
+export function mergeImportedConfig(
+  parsed: AppConfigYaml,
+  current: { secrets: SecretsMap; judge: JudgeSettings; instances: LLMInstance[] }
+): { secrets: SecretsMap; judge: JudgeSettings; instances: LLMInstance[] } {
+  const importedSecrets = parsed.secrets ?? {};
+  const importedJudge = parsed.judge ?? {};
+  const secrets = { ...current.secrets, ...importedSecrets };
+  const judge = { ...DEFAULT_JUDGE_SETTINGS, ...current.judge, ...importedJudge };
+  const instances = Array.isArray(parsed.instances) ? parsed.instances : current.instances;
+  return { secrets, judge, instances };
+}

package/src/lib/consume-suite-sse.ts ADDED Viewed

@@ -0,0 +1,70 @@
+import type { SuiteResult } from "@darkrishabh/bench-ai";
+import type { SuiteJudgeMeta } from "./suite-judge-meta";
+/** JSON payload carried in one SSE event, discriminated by `type`. */
+type SsePayload =
+  // One log line, forwarded to the caller's callback as it arrives.
+  | { type: "log"; line: string }
+  // Final event carrying the result, the run log, and judge metadata.
+  | { type: "done"; result: SuiteResult; runLog: string[]; judgeMeta: SuiteJudgeMeta }
+  // Failure event; the consumer throws an Error with this message.
+  | { type: "error"; message: string };
+/**
+ * Read a POST /api/suite response with Content-Type: text/event-stream.
+ * Invokes onLogLine for each log line as it arrives.
+ *
+ * Events are blocks separated by a blank line (`\n\n`). The `data:` lines of a
+ * block are joined with newlines and parsed as one JSON payload; blocks whose
+ * payload is not valid JSON are skipped (best effort).
+ *
+ * @param res Response whose body is the event stream.
+ * @param onLogLine Called once per `log` event, in arrival order.
+ * @returns The payload of the last `done` event (`judgeMeta` is `null` when absent).
+ * @throws Error when the response has no body, when an `error` event arrives
+ *   (using its message), or when the stream ends without a `done` event.
+ */
+export async function consumeSuiteSseStream(
+  res: Response,
+  onLogLine: (line: string) => void
+): Promise<{ result: SuiteResult; runLog: string[]; judgeMeta: SuiteJudgeMeta | null }> {
+  const reader = res.body?.getReader();
+  if (!reader) {
+    throw new Error("No response body");
+  }
+  const decoder = new TextDecoder();
+  let buffer = "";
+  let final: { result: SuiteResult; runLog: string[]; judgeMeta: SuiteJudgeMeta | null } | null = null;
+  const parseBlock = (block: string) => {
+    const lines = block.split("\n").filter((l) => l.startsWith("data:"));
+    if (lines.length === 0) return;
+    const jsonStr = lines.map((l) => l.replace(/^data:\s?/, "")).join("\n");
+    let msg: SsePayload;
+    try {
+      msg = JSON.parse(jsonStr) as SsePayload;
+    } catch {
+      // Malformed payloads are skipped rather than aborting the whole run.
+      return;
+    }
+    if (msg.type === "log") onLogLine(msg.line);
+    if (msg.type === "error") throw new Error(msg.message);
+    if (msg.type === "done") {
+      final = {
+        result: msg.result,
+        runLog: msg.runLog,
+        judgeMeta: msg.judgeMeta ?? null,
+      };
+    }
+  };
+  try {
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      buffer += decoder.decode(value, { stream: true });
+      // Drain every complete event currently in the buffer; keep the partial tail.
+      for (;;) {
+        const idx = buffer.indexOf("\n\n");
+        if (idx === -1) break;
+        const block = buffer.slice(0, idx);
+        buffer = buffer.slice(idx + 2);
+        parseBlock(block);
+      }
+    }
+    // Flush bytes still held by the decoder, then handle a final unterminated event.
+    buffer += decoder.decode();
+    if (buffer.trim()) {
+      for (const block of buffer.split("\n\n")) {
+        if (block.trim()) parseBlock(block);
+      }
+    }
+  } catch (err) {
+    // An `error` event or a throwing callback aborts mid-stream: cancel the body
+    // so the underlying connection is not left open and unread.
+    await reader.cancel().catch(() => undefined);
+    throw err;
+  } finally {
+    // Always give the stream lock back, on success and on failure.
+    reader.releaseLock();
+  }
+  if (!final) {
+    throw new Error("Stream ended without a result event");
+  }
+  return final;
+}

package/src/lib/describe-judge.ts ADDED Viewed

@@ -0,0 +1,23 @@
+import type { JudgeSettings, SecretsMap } from "../types";
+/**
+ * Build the one-line judge summary shown in the suite “run target” UI.
+ *
+ * @param judge Judge settings; `mode` selects the wording, and any mode other
+ *   than "none", "ollama" or "claude" is described as "auto".
+ * @param secrets Secrets map, consulted only to tell whether the referenced
+ *   Anthropic secret is non-blank.
+ * @returns A single human-readable sentence.
+ */
+export function describeJudgeForUi(judge: JudgeSettings, secrets: SecretsMap): string {
+  // A blank or missing ref falls back to the conventional "anthropic" secret name.
+  const secretName = judge.anthropicSecretRef?.trim() || "anthropic";
+  const secretIsSet = Boolean(secrets[secretName]?.trim());
+
+  if (judge.mode === "none") {
+    return "No judge — llm-rubric assertions will not be graded.";
+  }
+  if (judge.mode === "ollama") {
+    const baseUrl = judge.ollamaBaseUrl || "http://localhost:11434";
+    const model = judge.ollamaModel || "llama3.2";
+    return `Ollama judge at ${baseUrl} · model ${model}`;
+  }
+  if (judge.mode === "claude") {
+    const label = `Claude (${judge.claudeModel || "default"})`;
+    if (secretIsSet) {
+      return `${label} · API key from secret “${secretName}”`;
+    }
+    return `${label} · key from secret “${secretName}” or server ANTHROPIC_API_KEY`;
+  }
+  // "auto" and anything unrecognized.
+  if (secretIsSet) {
+    return `Auto · Claude when available (secret “${secretName}”, ${judge.claudeModel || "default model"})`;
+  }
+  return `Auto · Claude if ANTHROPIC_API_KEY is set on the server (secret “${secretName}” is empty)`;
+}

package/src/lib/model-chip-palette.ts ADDED Viewed

@@ -0,0 +1,9 @@
+/** Ordinal dot colors for enabled model chips in the prompt bar (distinct slots). */
+// Six hex colors; `as const` keeps the array readonly with literal element types.
+// How callers handle more enabled models than colors is not visible from this file.
+export const MODEL_CHIP_PALETTE = [
+  "#16a34a",
+  "#2563eb",
+  "#ea580c",
+  "#7c3aed",
+  "#0891b2",
+  "#db2777",
+] as const;

package/src/lib/openai-model-list.ts ADDED Viewed

@@ -0,0 +1,33 @@
+/**
+ * Filter OpenAI `GET /v1/models` IDs for chat-style use in our UI.
+ *
+ * Note: OpenAI does not always expose every chat-capable model in this list (tier,
+ * product, or API surface). If a model is missing here, use **Other…** in Settings
+ * and paste the exact model id from the OpenAI docs.
+ */
+
+/**
+ * Lower-case fragments that mark a non-chat SKU. The search entries are kept
+ * narrow on purpose: a bare "search" substring can appear inside unrelated id
+ * segments and would hide GPT-5+ ids.
+ */
+const DROPPED_FRAGMENTS: readonly string[] = [
+  "embedding",
+  "whisper",
+  "tts",
+  "dall-e",
+  "dalle",
+  "moderation",
+  "realtime",
+  "transcribe",
+  "speech",
+  "computer-use",
+  "search-preview",
+  "search-api",
+];
+/** Regex rules: whole-word "audio" (audio / non-text completion SKUs) and gpt-4o search SKUs. */
+const DROPPED_PATTERNS: readonly RegExp[] = [/\baudio\b/, /gpt-4o-search/];
+/** True when a model id should be hidden from the chat model list (case-insensitive). */
+const DROP = (id: string) => {
+  const lowered = id.toLowerCase();
+  // Ids starting with "ft:" are always dropped.
+  if (lowered.startsWith("ft:")) return true;
+  return (
+    DROPPED_FRAGMENTS.some((fragment) => lowered.includes(fragment)) ||
+    DROPPED_PATTERNS.some((pattern) => pattern.test(lowered))
+  );
+};
+/**
+ * Keep chat-style model ids: drops empty and filtered ids, de-duplicates, and
+ * sorts with `localeCompare`.
+ *
+ * @param ids Raw model ids.
+ * @returns A new sorted array; `ids` is not mutated.
+ */
+export function filterOpenAiChatModelIds(ids: string[]): string[] {
+  const kept = new Set<string>();
+  for (const id of ids) {
+    if (id && !DROP(id)) kept.add(id);
+  }
+  return Array.from(kept).sort((left, right) => left.localeCompare(right));
+}

package/src/lib/provider-ui.ts ADDED Viewed

@@ -0,0 +1,30 @@
+/** Shared provider colors for chips, cards, and run-target UI */
+// Keyed by provider id; each value is a pair of CSS custom-property references
+// (text/accent color and border color).
+export const PROVIDER_UI: Record<string, { color: string; border: string }> = {
+  claude:      { color: "var(--claude)",      border: "var(--claude-border)"      },
+  ollama:      { color: "var(--ollama)",      border: "var(--ollama-border)"      },
+  minimax:     { color: "var(--minimax)",     border: "var(--minimax-border)"     },
+  openai:      { color: "var(--openai)",      border: "var(--openai-border)"      },
+  groq:        { color: "var(--groq)",        border: "var(--groq-border)"        },
+  openrouter:  { color: "var(--openrouter)",  border: "var(--openrouter-border)"  },
+  "nvidia-nim":{ color: "var(--nvidia-nim)",  border: "var(--nvidia-nim-border)"  },
+  together:    { color: "var(--together)",    border: "var(--together-border)"    },
+  perplexity:  { color: "var(--perplexity)",  border: "var(--perplexity-border)"  },
+  custom:      { color: "var(--custom)",      border: "var(--custom-border)"      },
+  // The two entries below reuse the claude / openai color variables.
+  "claude-cli":{ color: "var(--claude)",      border: "var(--claude-border)"      },
+  codex:       { color: "var(--openai)",      border: "var(--openai-border)"      },
+};
+/**
+ * Header-style label, e.g. `nvidia-nim` → `NVIDIA-NIM`.
+ * Underscores become hyphens before the id is upper-cased.
+ */
+export function formatProviderDisplayName(provider: string): string {
+  const hyphenated = provider.split("_").join("-");
+  return hyphenated.toUpperCase();
+}
+/**
+ * Look up the color pair for a provider id.
+ *
+ * Only keys defined directly on `PROVIDER_UI` count: names inherited from
+ * `Object.prototype` (e.g. "constructor", "toString") get the neutral fallback
+ * instead of leaking a prototype member.
+ *
+ * @param provider Provider id.
+ * @returns The provider's colors, or the neutral text/border pair for unknown ids.
+ */
+export function providerUi(provider: string) {
+  const known = Object.prototype.hasOwnProperty.call(PROVIDER_UI, provider)
+    ? PROVIDER_UI[provider]
+    : undefined;
+  return (
+    known ?? {
+      color: "var(--text-3)",
+      border: "var(--border)",
+    }
+  );
+}

package/src/lib/resolve-credentials.ts ADDED Viewed

@@ -0,0 +1,80 @@
+import type { JudgeSettings, LLMInstance } from "../types";
+import { DEFAULT_JUDGE_SETTINGS } from "../types";
+/**
+ * Merge inline keys + secret variables for API routes. Omits ref fields from the payload.
+ * Each instance is resolved independently; the input array is not mutated.
+ */
+export function resolveInstancesForApi(
+  instances: LLMInstance[],
+  secrets: Record<string, string>
+): LLMInstance[] {
+  const resolved: LLMInstance[] = [];
+  for (const instance of instances) {
+    resolved.push(resolveInstanceForApi(instance, secrets));
+  }
+  return resolved;
+}
+/**
+ * Resolve one instance's credentials for an API call.
+ *
+ * For both `apiKey` and `groupId`, the trimmed value of the referenced secret
+ * wins; otherwise the trimmed inline value is used; otherwise the field is
+ * `undefined`. The `apiKeySecretRef` / `groupIdSecretRef` fields are left out
+ * of the returned object.
+ *
+ * @param instance Instance as configured.
+ * @param secrets Secret name → value map.
+ * @returns A copy of the instance with resolved `apiKey` / `groupId` and no ref fields.
+ */
+export function resolveInstanceForApi(
+  instance: LLMInstance,
+  secrets: Record<string, string>
+): LLMInstance {
+  // Trimmed secret value for a ref, or "" when the ref is blank or the secret is missing.
+  const lookupSecret = (ref: string | undefined): string => {
+    const name = ref?.trim();
+    if (!name) return "";
+    return secrets[name]?.trim() ?? "";
+  };
+  const keyFromSecret = lookupSecret(instance.apiKeySecretRef);
+  const groupFromSecret = lookupSecret(instance.groupIdSecretRef);
+
+  const apiKey = keyFromSecret || instance.apiKey?.trim() || undefined;
+  const groupId = groupFromSecret || instance.groupId?.trim() || undefined;
+
+  // Strip the ref fields so they are not part of the API payload.
+  const { apiKeySecretRef: _droppedKeyRef, groupIdSecretRef: _droppedGroupRef, ...payload } = instance;
+  return { ...payload, apiKey, groupId };
+}
+/** Payload sent to /api/suite for llm-rubric judge construction. */
+export interface JudgeApiPayload {
+  /** Judge mode; the builder in this file emits "none", "ollama", "claude" or "auto". */
+  mode: string;
+  /** Anthropic API key resolved from the secrets map; set only for "claude" / "auto", and may be undefined. */
+  anthropicApiKey?: string;
+  /** Claude model id; set only for "claude" / "auto". */
+  claudeModel?: string;
+  /** Ollama base URL; set only for "ollama". */
+  ollamaBaseUrl?: string;
+  /** Ollama model name; set only for "ollama". */
+  ollamaModel?: string;
+}
+/**
+ * Build the judge payload for /api/suite from the saved judge settings.
+ *
+ * Settings are layered over `DEFAULT_JUDGE_SETTINGS`. The result carries only
+ * the fields relevant to the selected mode: nothing for "none", the Ollama URL
+ * and model for "ollama", and the Anthropic key and Claude model for "claude".
+ * Every other mode is sent as "auto" with the same Claude fields.
+ *
+ * @param judge Saved judge settings.
+ * @param secrets Secret name → value map used to resolve the Anthropic key.
+ * @returns The payload for the selected mode.
+ */
+export function buildJudgeApiPayload(
+  judge: JudgeSettings,
+  secrets: Record<string, string>
+): JudgeApiPayload {
+  const settings = { ...DEFAULT_JUDGE_SETTINGS, ...judge };
+  // A blank or missing ref falls back to the "anthropic" secret name.
+  const secretName = settings.anthropicSecretRef?.trim() || "anthropic";
+  const anthropicApiKey = secrets[secretName]?.trim() || undefined;
+
+  switch (settings.mode) {
+    case "none":
+      return { mode: "none" };
+    case "ollama":
+      return {
+        mode: "ollama",
+        ollamaBaseUrl: settings.ollamaBaseUrl?.trim() || "http://localhost:11434",
+        ollamaModel: settings.ollamaModel?.trim() || "llama3.2",
+      };
+    default: {
+      // "claude" keeps its own mode; everything else is reported as "auto".
+      const mode = settings.mode === "claude" ? "claude" : "auto";
+      const claudeModel = settings.claudeModel?.trim() || DEFAULT_JUDGE_SETTINGS.claudeModel;
+      return { mode, anthropicApiKey, claudeModel };
+    }
+  }
+}

package/src/lib/run-history.ts ADDED Viewed

@@ -0,0 +1,66 @@
+import type { WebDiffResult } from "../types";
+// localStorage key under which run history is stored.
+const HISTORY_KEY = "bench-ai:run-history";
+// Older key names; a value found under one of these is copied to HISTORY_KEY on read.
+const LEGACY_PROMPT_DIFF = "prompt-diff:run-history";
+const LEGACY_LLM_DIFF = "llm-diff:run-history";
+// Maximum number of history entries persisted.
+const MAX_ENTRIES = 25;
+/** One persisted run in the run history. */
+export interface RunHistoryEntry {
+  /** Locally generated identifier (timestamp plus random suffix). */
+  id: string;
+  /** Copied from `result.ranAt` when the entry is appended. */
+  ranAt: string;
+  /** Trimmed prompt, shortened with an ellipsis when longer than 120 characters. */
+  promptPreview: string;
+  /** The full result being recorded. */
+  result: WebDiffResult;
+}
+/** Generate an entry id: current epoch milliseconds, a dash, then up to 7 random base-36 characters. */
+function uid(): string {
+  const stamp = Date.now();
+  const suffix = Math.random().toString(36).slice(2, 9);
+  return `${stamp}-${suffix}`;
+}
+/**
+ * Read the raw history JSON from localStorage.
+ *
+ * The current key is tried first; otherwise the legacy keys are tried in order
+ * and the first hit is copied to the current key before being returned.
+ *
+ * @returns The stored string, or `null` when nothing is stored or `window` is undefined.
+ */
+function readHistoryRaw(): string | null {
+  if (typeof window === "undefined") return null;
+  const current = localStorage.getItem(HISTORY_KEY);
+  if (current != null) return current;
+  for (const legacyKey of [LEGACY_PROMPT_DIFF, LEGACY_LLM_DIFF]) {
+    const legacy = localStorage.getItem(legacyKey);
+    if (legacy == null) continue;
+    // Copy forward so later reads hit the current key directly.
+    localStorage.setItem(HISTORY_KEY, legacy);
+    return legacy;
+  }
+  return null;
+}
+/**
+ * Load the persisted run history.
+ *
+ * @returns The stored entries, or `[]` when `window` is undefined, nothing is
+ *   stored, the JSON is invalid, or the stored value is not an array.
+ */
+export function loadRunHistory(): RunHistoryEntry[] {
+  if (typeof window === "undefined") return [];
+  try {
+    const raw = readHistoryRaw();
+    if (!raw) return [];
+    const parsed = JSON.parse(raw) as RunHistoryEntry[];
+    if (Array.isArray(parsed)) return parsed;
+    return [];
+  } catch {
+    // Unreadable storage or corrupt JSON is treated as "no history".
+    return [];
+  }
+}
+/**
+ * Persist the run history, keeping only the first `MAX_ENTRIES` entries.
+ * Does nothing when `window` is undefined; storage failures are ignored.
+ */
+export function saveRunHistory(entries: RunHistoryEntry[]): void {
+  if (typeof window !== "undefined") {
+    try {
+      const capped = entries.slice(0, MAX_ENTRIES);
+      localStorage.setItem(HISTORY_KEY, JSON.stringify(capped));
+    } catch {
+      /* ignore quota */
+    }
+  }
+}
+/**
+ * Record a finished run at the front of the history.
+ *
+ * The preview is the trimmed prompt: cut to 117 characters plus "…" when longer
+ * than 120, or "(empty prompt)" when blank.
+ *
+ * @param result The run to record; its `ranAt` becomes the entry's timestamp.
+ */
+export function appendRunHistory(result: WebDiffResult): void {
+  const trimmedPrompt = result.prompt.trim();
+  let promptPreview: string;
+  if (trimmedPrompt.length > 120) {
+    promptPreview = `${trimmedPrompt.slice(0, 117)}…`;
+  } else {
+    promptPreview = trimmedPrompt || "(empty prompt)";
+  }
+  const entry: RunHistoryEntry = {
+    id: uid(),
+    ranAt: result.ranAt,
+    promptPreview,
+    result,
+  };
+  const existing = loadRunHistory();
+  const updated = [entry, ...existing].slice(0, MAX_ENTRIES);
+  saveRunHistory(updated);
+}

package/src/lib/simple-line-diff.ts ADDED Viewed

@@ -0,0 +1,50 @@
+/** One line of diff output: unchanged, present only in `b` ("add"), or present only in `a` ("remove"). */
+export type DiffLine = { type: "same" | "add" | "remove"; text: string };
+/** Inputs with more lines than this on either side are not diffed. */
+const MAX_LINES = 1200;
+/**
+ * Line-level LCS diff for two strings (split on newlines).
+ *
+ * When either side exceeds `MAX_LINES` lines, a single "same" entry carrying an
+ * explanatory message is returned instead of a diff.
+ *
+ * @param a Original text.
+ * @param b Revised text.
+ * @returns Diff entries in output order; where a removal and an addition are
+ *   equally good, the removal is emitted first.
+ */
+export function diffLines(a: string, b: string): DiffLine[] {
+  const left = a.split("\n");
+  const right = b.split("\n");
+  const leftLen = left.length;
+  const rightLen = right.length;
+  if (leftLen > MAX_LINES || rightLen > MAX_LINES) {
+    const notice: DiffLine = {
+      type: "same",
+      text: `Outputs are too long to diff inline (${leftLen} vs ${rightLen} lines). Use side-by-side or shorten the text.`,
+    };
+    return [notice];
+  }
+
+  // lcs[i][j] = length of the longest common subsequence of left[i..] and right[j..].
+  const lcs: number[][] = [];
+  for (let row = 0; row <= leftLen; row++) {
+    lcs.push(new Array(rightLen + 1).fill(0));
+  }
+  for (let i = leftLen - 1; i >= 0; i--) {
+    for (let j = rightLen - 1; j >= 0; j--) {
+      if (left[i] === right[j]) {
+        lcs[i][j] = 1 + lcs[i + 1][j + 1];
+      } else {
+        lcs[i][j] = Math.max(lcs[i + 1][j], lcs[i][j + 1]);
+      }
+    }
+  }
+
+  // Walk the table from the top-left corner, emitting one entry per step.
+  const result: DiffLine[] = [];
+  let i = 0;
+  let j = 0;
+  while (i < leftLen || j < rightLen) {
+    if (i < leftLen && j < rightLen && left[i] === right[j]) {
+      result.push({ type: "same", text: left[i] });
+      i += 1;
+      j += 1;
+    } else if (j < rightLen && (i === leftLen || lcs[i + 1][j] < lcs[i][j + 1])) {
+      // Skipping the right-hand line keeps the longer common subsequence (or left is exhausted).
+      result.push({ type: "add", text: right[j] });
+      j += 1;
+    } else {
+      // Left cannot be exhausted here: that case always takes the branch above.
+      result.push({ type: "remove", text: left[i] });
+      i += 1;
+    }
+  }
+  return result;
+}

package/src/lib/storage.ts ADDED Viewed

@@ -0,0 +1,100 @@
+import type { JudgeSettings, LLMInstance, SecretsMap } from "../types";
+import { DEFAULT_JUDGE_SETTINGS } from "../types";
+// Current localStorage keys for instances, secrets, and judge settings.
+const KEY = "bench-ai:instances";
+const SECRETS_KEY = "bench-ai:secrets";
+const JUDGE_KEY = "bench-ai:judge";
+// Legacy key names using the "prompt-diff:" prefix; passed to the loaders as fallback keys.
+const LEGACY_PROMPT_DIFF_INSTANCES = "prompt-diff:instances";
+const LEGACY_PROMPT_DIFF_SECRETS = "prompt-diff:secrets";
+const LEGACY_PROMPT_DIFF_JUDGE = "prompt-diff:judge";
+// Legacy key names using the "llm-diff:" prefix; tried after the "prompt-diff:" keys.
+const LEGACY_LLM_DIFF_INSTANCES = "llm-diff:instances";
+const LEGACY_LLM_DIFF_SECRETS = "llm-diff:secrets";
+const LEGACY_LLM_DIFF_JUDGE = "llm-diff:judge";
+/**
+ * Instances returned by the instance loader when nothing usable is stored:
+ * one Claude entry and one Ollama entry pointing at localhost, both enabled.
+ */
+export const DEFAULT_INSTANCES: LLMInstance[] = [
+  {
+    id: "claude-default",
+    provider: "claude",
+    model: "claude-3-5-haiku-20241022",
+    enabled: true,
+    maxTokens: 2048,
+    temperature: 0.7,
+  },
+  {
+    id: "ollama-default",
+    provider: "ollama",
+    model: "llama3.2",
+    enabled: true,
+    // Ollama's conventional local endpoint.
+    baseUrl: "http://localhost:11434",
+    temperature: 0.7,
+  },
+];
+/**
+ * Read a value from localStorage, falling back to legacy keys.
+ *
+ * The first legacy key holding a value wins; that value is copied to `primary`
+ * before being returned. Legacy keys are left in place.
+ *
+ * @param primary Current key.
+ * @param legacyKeys Older keys, tried in the order given.
+ * @returns The stored string, or `null` when none of the keys has a value or
+ *   `window` is undefined.
+ */
+function readLocalStorage(primary: string, ...legacyKeys: string[]): string | null {
+  if (typeof window === "undefined") return null;
+  const current = localStorage.getItem(primary);
+  if (current != null) return current;
+  for (const legacyKey of legacyKeys) {
+    const legacy = localStorage.getItem(legacyKey);
+    if (legacy == null) continue;
+    localStorage.setItem(primary, legacy);
+    return legacy;
+  }
+  return null;
+}
+/**
+ * Load the configured instances from localStorage (current key, then legacy keys).
+ *
+ * @returns The stored instances, or `DEFAULT_INSTANCES` when `window` is
+ *   undefined, nothing is stored, the JSON is invalid, or the stored value is
+ *   not an array. Individual entries are not validated.
+ */
+export function loadInstances(): LLMInstance[] {
+  if (typeof window === "undefined") return DEFAULT_INSTANCES;
+  try {
+    const raw = readLocalStorage(
+      KEY,
+      LEGACY_PROMPT_DIFF_INSTANCES,
+      LEGACY_LLM_DIFF_INSTANCES
+    );
+    if (!raw) return DEFAULT_INSTANCES;
+    const parsed: unknown = JSON.parse(raw);
+    // A stored "null" or "{}" would otherwise be handed to callers as if it were a list.
+    return Array.isArray(parsed) ? (parsed as LLMInstance[]) : DEFAULT_INSTANCES;
+  } catch {
+    return DEFAULT_INSTANCES;
+  }
+}
+/** Persist the instance list as JSON under the instances key; does nothing when `window` is undefined. */
+export function saveInstances(instances: LLMInstance[]): void {
+  if (typeof window !== "undefined") {
+    const serialized = JSON.stringify(instances);
+    localStorage.setItem(KEY, serialized);
+  }
+}
+/**
+ * Load the secrets map from localStorage (current key, then legacy keys).
+ *
+ * @returns The stored map, or `{}` when `window` is undefined, nothing is
+ *   stored, the JSON is invalid, or the stored value is not a plain mapping.
+ *   Value types are not checked.
+ */
+export function loadSecrets(): SecretsMap {
+  if (typeof window === "undefined") return {};
+  try {
+    const raw = readLocalStorage(SECRETS_KEY, LEGACY_PROMPT_DIFF_SECRETS, LEGACY_LLM_DIFF_SECRETS);
+    if (!raw) return {};
+    const parsed = JSON.parse(raw) as unknown;
+    const isMapping = Boolean(parsed) && typeof parsed === "object" && !Array.isArray(parsed);
+    return isMapping ? (parsed as SecretsMap) : {};
+  } catch {
+    return {};
+  }
+}
+/** Persist the secrets map as JSON under the secrets key; does nothing when `window` is undefined. */
+export function saveSecrets(secrets: SecretsMap): void {
+  if (typeof window !== "undefined") {
+    const serialized = JSON.stringify(secrets);
+    localStorage.setItem(SECRETS_KEY, serialized);
+  }
+}
+/**
+ * Load judge settings from localStorage (current key, then legacy keys),
+ * layered over `DEFAULT_JUDGE_SETTINGS`.
+ *
+ * @returns The merged settings, or `DEFAULT_JUDGE_SETTINGS` when `window` is
+ *   undefined, nothing is stored, the JSON is invalid, or the stored value is
+ *   not a plain mapping. Individual fields are not validated.
+ */
+export function loadJudgeSettings(): JudgeSettings {
+  if (typeof window === "undefined") return DEFAULT_JUDGE_SETTINGS;
+  try {
+    const raw = readLocalStorage(JUDGE_KEY, LEGACY_PROMPT_DIFF_JUDGE, LEGACY_LLM_DIFF_JUDGE);
+    if (!raw) return DEFAULT_JUDGE_SETTINGS;
+    const parsed: unknown = JSON.parse(raw);
+    // Spreading an array or a string would inject numeric index keys into the settings.
+    if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
+      return DEFAULT_JUDGE_SETTINGS;
+    }
+    return { ...DEFAULT_JUDGE_SETTINGS, ...(parsed as Partial<JudgeSettings>) };
+  } catch {
+    return DEFAULT_JUDGE_SETTINGS;
+  }
+}
+/** Persist judge settings as JSON under the judge key; does nothing when `window` is undefined. */
+export function saveJudgeSettings(judge: JudgeSettings): void {
+  if (typeof window !== "undefined") {
+    const serialized = JSON.stringify(judge);
+    localStorage.setItem(JUDGE_KEY, serialized);
+  }
+}

package/src/lib/suite-judge-meta.ts ADDED Viewed

@@ -0,0 +1,13 @@
+/** Returned with POST /api/suite so the UI can show whether llm-rubric actually invoked a judge LLM. */
+export interface SuiteJudgeMeta {
+  /** Number of `llm-rubric` assertions in the parsed suite YAML */
+  rubricAssertionCount: number;
+  /** True only if a judge provider was constructed and the suite has at least one llm-rubric */
+  willEvaluateRubrics: boolean;
+  /** Judge mode as a string — presumably the mode requested by the client; confirm against the /api/suite route. */
+  judgeMode: string;
+  /** Judge backend: "claude", "ollama", or "off". */
+  judgeBackend: "claude" | "ollama" | "off";
+  /** e.g. claude/claude-3-5-haiku-20241022 when active */
+  judgeLabel?: string;
+  /** Short human-readable summary for the banner */
+  summary: string;
+}

package/src/lib/suite-run-history.ts ADDED Viewed

@@ -0,0 +1,81 @@
+import type { SuiteResult } from "@darkrishabh/bench-ai";
+import type { SuiteJudgeMeta } from "./suite-judge-meta";
+// localStorage key under which suite run history is stored.
+const HISTORY_KEY = "bench-ai:suite-run-history";
+// Older key names; a value found under one of these is copied to HISTORY_KEY on read.
+const LEGACY_PROMPT_DIFF = "prompt-diff:suite-run-history";
+const LEGACY_LLM_DIFF = "llm-diff:suite-run-history";
+/** Fewer than diff runs — suite payloads include full outputs per case. */
+const MAX_ENTRIES = 15;
+/** One persisted suite run. */
+export interface SuiteRunHistoryEntry {
+  /** Locally generated identifier (timestamp plus random suffix). */
+  id: string;
+  /** Timestamp string supplied by the caller when the run is appended. */
+  ranAt: string;
+  /** Short label for the list (first meaningful YAML line) */
+  yamlPreview: string;
+  /** Full suite YAML text; also the source of `yamlPreview`. */
+  yaml: string;
+  /** Suite result as supplied by the caller. */
+  result: SuiteResult;
+  /** Log lines as supplied by the caller. */
+  runLog: string[];
+  /** Judge metadata for the run, or `null` when none was provided. */
+  judgeMeta: SuiteJudgeMeta | null;
+}
+/** Generate an entry id: current epoch milliseconds, a dash, then up to 7 random base-36 characters. */
+function uid(): string {
+  const stamp = Date.now();
+  const suffix = Math.random().toString(36).slice(2, 9);
+  return `${stamp}-${suffix}`;
+}
+/**
+ * Derive a short list label from suite YAML.
+ *
+ * Uses the first line that, once trimmed, is neither empty nor starts with "#".
+ * Lines longer than 115 characters are cut to 112 plus "…".
+ *
+ * @param yaml Suite YAML text.
+ * @returns The label, or "(empty suite)" when no such line exists.
+ */
+function previewFromYaml(yaml: string): string {
+  const firstMeaningful = yaml
+    .split(/\n/)
+    .map((line) => line.trim())
+    .find((text) => text !== "" && !text.startsWith("#"));
+  if (firstMeaningful === undefined) return "(empty suite)";
+  if (firstMeaningful.length > 115) return `${firstMeaningful.slice(0, 112)}…`;
+  return firstMeaningful;
+}
+/**
+ * Read the raw suite-history JSON from localStorage.
+ *
+ * The current key is tried first; otherwise the legacy keys are tried in order
+ * and the first hit is copied to the current key before being returned.
+ *
+ * @returns The stored string, or `null` when nothing is stored or `window` is undefined.
+ */
+function readSuiteHistoryRaw(): string | null {
+  if (typeof window === "undefined") return null;
+  const current = localStorage.getItem(HISTORY_KEY);
+  if (current != null) return current;
+  for (const legacyKey of [LEGACY_PROMPT_DIFF, LEGACY_LLM_DIFF]) {
+    const legacy = localStorage.getItem(legacyKey);
+    if (legacy == null) continue;
+    // Copy forward so later reads hit the current key directly.
+    localStorage.setItem(HISTORY_KEY, legacy);
+    return legacy;
+  }
+  return null;
+}
+/**
+ * Load the persisted suite run history.
+ *
+ * @returns The stored entries, or `[]` when `window` is undefined, nothing is
+ *   stored, the JSON is invalid, or the stored value is not an array.
+ */
+export function loadSuiteRunHistory(): SuiteRunHistoryEntry[] {
+  if (typeof window === "undefined") return [];
+  try {
+    const raw = readSuiteHistoryRaw();
+    if (!raw) return [];
+    const parsed = JSON.parse(raw) as SuiteRunHistoryEntry[];
+    if (Array.isArray(parsed)) return parsed;
+    return [];
+  } catch {
+    // Unreadable storage or corrupt JSON is treated as "no history".
+    return [];
+  }
+}
+/**
+ * Persist the suite run history, keeping only the first `MAX_ENTRIES` entries.
+ * Does nothing when `window` is undefined; storage failures are ignored.
+ */
+export function saveSuiteRunHistory(entries: SuiteRunHistoryEntry[]): void {
+  if (typeof window !== "undefined") {
+    try {
+      const capped = entries.slice(0, MAX_ENTRIES);
+      localStorage.setItem(HISTORY_KEY, JSON.stringify(capped));
+    } catch {
+      /* ignore quota */
+    }
+  }
+}
+/**
+ * Record a finished suite run at the front of the history.
+ *
+ * The entry gets a fresh id and a preview derived from `payload.yaml`; the
+ * remaining fields are taken from `payload`.
+ *
+ * @param payload Entry fields other than `id` and `yamlPreview`.
+ */
+export function appendSuiteRunHistory(
+  payload: Omit<SuiteRunHistoryEntry, "id" | "yamlPreview">
+): void {
+  const label = previewFromYaml(payload.yaml);
+  const entry: SuiteRunHistoryEntry = { id: uid(), yamlPreview: label, ...payload };
+  const history = loadSuiteRunHistory();
+  // Newest first; the saver enforces the size cap.
+  saveSuiteRunHistory([entry, ...history]);
+}