@darkrishabh/bench-ai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. package/README.md +333 -0
  2. package/dist/cli/app.d.ts +11 -0
  3. package/dist/cli/app.d.ts.map +1 -0
  4. package/dist/cli/app.js +48 -0
  5. package/dist/cli/app.js.map +1 -0
  6. package/dist/cli/components/DiffView.d.ts +5 -0
  7. package/dist/cli/components/DiffView.d.ts.map +1 -0
  8. package/dist/cli/components/DiffView.js +14 -0
  9. package/dist/cli/components/DiffView.js.map +1 -0
  10. package/dist/cli/components/EvalView.d.ts +6 -0
  11. package/dist/cli/components/EvalView.d.ts.map +1 -0
  12. package/dist/cli/components/EvalView.js +82 -0
  13. package/dist/cli/components/EvalView.js.map +1 -0
  14. package/dist/cli/components/Spinner.d.ts +4 -0
  15. package/dist/cli/components/Spinner.d.ts.map +1 -0
  16. package/dist/cli/components/Spinner.js +15 -0
  17. package/dist/cli/components/Spinner.js.map +1 -0
  18. package/dist/cli/index.d.ts +3 -0
  19. package/dist/cli/index.d.ts.map +1 -0
  20. package/dist/cli/index.js +117 -0
  21. package/dist/cli/index.js.map +1 -0
  22. package/dist/cli/run-command.d.ts +11 -0
  23. package/dist/cli/run-command.d.ts.map +1 -0
  24. package/dist/cli/run-command.js +119 -0
  25. package/dist/cli/run-command.js.map +1 -0
  26. package/dist/engine/cost.d.ts +3 -0
  27. package/dist/engine/cost.d.ts.map +1 -0
  28. package/dist/engine/cost.js +52 -0
  29. package/dist/engine/cost.js.map +1 -0
  30. package/dist/engine/diff.d.ts +6 -0
  31. package/dist/engine/diff.d.ts.map +1 -0
  32. package/dist/engine/diff.js +43 -0
  33. package/dist/engine/diff.js.map +1 -0
  34. package/dist/engine/eval.d.ts +14 -0
  35. package/dist/engine/eval.d.ts.map +1 -0
  36. package/dist/engine/eval.js +194 -0
  37. package/dist/engine/eval.js.map +1 -0
  38. package/dist/engine/index.d.ts +15 -0
  39. package/dist/engine/index.d.ts.map +1 -0
  40. package/dist/engine/index.js +10 -0
  41. package/dist/engine/index.js.map +1 -0
  42. package/dist/engine/providers/base.d.ts +7 -0
  43. package/dist/engine/providers/base.d.ts.map +1 -0
  44. package/dist/engine/providers/base.js +2 -0
  45. package/dist/engine/providers/base.js.map +1 -0
  46. package/dist/engine/providers/claude.d.ts +15 -0
  47. package/dist/engine/providers/claude.d.ts.map +1 -0
  48. package/dist/engine/providers/claude.js +53 -0
  49. package/dist/engine/providers/claude.js.map +1 -0
  50. package/dist/engine/providers/minimax.d.ts +16 -0
  51. package/dist/engine/providers/minimax.d.ts.map +1 -0
  52. package/dist/engine/providers/minimax.js +67 -0
  53. package/dist/engine/providers/minimax.js.map +1 -0
  54. package/dist/engine/providers/ollama.d.ts +14 -0
  55. package/dist/engine/providers/ollama.d.ts.map +1 -0
  56. package/dist/engine/providers/ollama.js +60 -0
  57. package/dist/engine/providers/ollama.js.map +1 -0
  58. package/dist/engine/providers/openai-compatible.d.ts +19 -0
  59. package/dist/engine/providers/openai-compatible.d.ts.map +1 -0
  60. package/dist/engine/providers/openai-compatible.js +109 -0
  61. package/dist/engine/providers/openai-compatible.js.map +1 -0
  62. package/dist/engine/providers/subprocess.d.ts +55 -0
  63. package/dist/engine/providers/subprocess.d.ts.map +1 -0
  64. package/dist/engine/providers/subprocess.js +111 -0
  65. package/dist/engine/providers/subprocess.js.map +1 -0
  66. package/dist/engine/suite-loader.d.ts +11 -0
  67. package/dist/engine/suite-loader.d.ts.map +1 -0
  68. package/dist/engine/suite-loader.js +75 -0
  69. package/dist/engine/suite-loader.js.map +1 -0
  70. package/dist/engine/types.d.ts +104 -0
  71. package/dist/engine/types.d.ts.map +1 -0
  72. package/dist/engine/types.js +2 -0
  73. package/dist/engine/types.js.map +1 -0
  74. package/next-env.d.ts +6 -0
  75. package/next.config.ts +26 -0
  76. package/package.json +72 -0
  77. package/public/icon.svg +14 -0
  78. package/src/app/api/diff/route.ts +135 -0
  79. package/src/app/api/models/route.ts +96 -0
  80. package/src/app/api/suite/route.ts +314 -0
  81. package/src/app/globals.css +215 -0
  82. package/src/app/icon.svg +14 -0
  83. package/src/app/layout.tsx +44 -0
  84. package/src/app/opengraph-image.tsx +73 -0
  85. package/src/app/page.tsx +952 -0
  86. package/src/app/suite/layout.tsx +12 -0
  87. package/src/app/suite/page.tsx +206 -0
  88. package/src/app/twitter-image.tsx +1 -0
  89. package/src/components/BenchAiLogo.tsx +38 -0
  90. package/src/components/ComparePanel.tsx +643 -0
  91. package/src/components/ConfigPanel.tsx +809 -0
  92. package/src/components/MarkdownOutput.tsx +16 -0
  93. package/src/components/ModelResponseCard.tsx +313 -0
  94. package/src/components/QuickComparisonBar.tsx +184 -0
  95. package/src/components/ResponsesLineDiff.tsx +149 -0
  96. package/src/components/SettingsPanel.tsx +591 -0
  97. package/src/components/SuitePanel.tsx +875 -0
  98. package/src/lib/brand.ts +4 -0
  99. package/src/lib/config-yaml.ts +70 -0
  100. package/src/lib/consume-suite-sse.ts +70 -0
  101. package/src/lib/describe-judge.ts +23 -0
  102. package/src/lib/model-chip-palette.ts +9 -0
  103. package/src/lib/openai-model-list.ts +33 -0
  104. package/src/lib/provider-ui.ts +30 -0
  105. package/src/lib/resolve-credentials.ts +80 -0
  106. package/src/lib/run-history.ts +66 -0
  107. package/src/lib/simple-line-diff.ts +50 -0
  108. package/src/lib/storage.ts +100 -0
  109. package/src/lib/suite-judge-meta.ts +13 -0
  110. package/src/lib/suite-run-history.ts +81 -0
  111. package/src/types.ts +170 -0
  112. package/vercel.json +5 -0
@@ -0,0 +1,4 @@
1
+ /** Product name and copy for UI + site metadata */
2
+ export const BRAND_NAME = "Bench AI";
3
+ export const BRAND_TAGLINE = "One prompt, many models — compare quality, speed, and cost";
4
+ export const BRAND_SUITE_SUBTITLE = "YAML evaluations against your enabled models";
@@ -0,0 +1,70 @@
1
+ import { load } from "js-yaml";
2
+ import { dump } from "js-yaml";
3
+ import type { AppConfigYaml, JudgeSettings, LLMInstance, SecretsMap } from "../types";
4
+ import { APP_CONFIG_VERSION, DEFAULT_JUDGE_SETTINGS } from "../types";
5
+
6
/**
 * Serialize the current secrets, judge settings, and model instances into a
 * versioned YAML document.
 *
 * Every top-level section is shallow-copied before dumping so the caller's
 * objects are never handed to the YAML serializer directly.
 *
 * @param params - The three pieces of app state to export.
 * @returns YAML text (120-column line width, no anchors/aliases, double quotes).
 */
export function exportAppConfigYaml(params: {
  secrets: SecretsMap;
  judge: JudgeSettings;
  instances: LLMInstance[];
}): string {
  const { secrets, judge, instances } = params;
  const instanceCopies = instances.map((instance) => {
    return { ...instance };
  });

  // Key order here is the key order of the emitted YAML.
  const doc: AppConfigYaml = {
    version: APP_CONFIG_VERSION,
    secrets: { ...secrets },
    judge: { ...judge },
    instances: instanceCopies,
  };

  const yamlText = dump(doc, { lineWidth: 120, noRefs: true, quotingType: '"' });
  return yamlText;
}
19
+
20
/**
 * Parse and validate an exported app-config YAML document.
 *
 * Validation is deliberately lenient per section: malformed sections are
 * dropped rather than rejected, but the document itself must be a mapping and
 * must carry a supported version.
 *
 * @param yaml - Raw YAML text (typically pasted or uploaded by the user).
 * @returns The parsed config; `judge` / `instances` are `undefined` when the
 *          corresponding section is absent or has the wrong shape.
 * @throws Error when the document is not a YAML mapping (scalars, `null`, and
 *         top-level sequences are all rejected) or the version is unsupported.
 *         Syntax errors from the YAML parser propagate unchanged.
 */
export function parseAppConfigYaml(yaml: string): AppConfigYaml {
  /** True for non-null, non-array objects, i.e. what YAML calls a mapping. */
  const isMapping = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  const raw: unknown = load(yaml);
  // Arrays are objects too; without the explicit array check a top-level YAML
  // list would be accepted and silently produce an empty config.
  if (!isMapping(raw)) {
    throw new Error("Config must be a YAML mapping");
  }
  const o = raw;

  // A missing (or non-numeric) version is treated as version 1.
  const version = typeof o.version === "number" ? o.version : 1;
  if (version !== APP_CONFIG_VERSION) {
    throw new Error(`Unsupported config version: ${version} (expected ${APP_CONFIG_VERSION})`);
  }

  // Keep only string-valued secrets; anything else cannot be used as a key.
  const secrets: SecretsMap = isMapping(o.secrets)
    ? Object.fromEntries(
        Object.entries(o.secrets).filter(
          (entry): entry is [string, string] => typeof entry[1] === "string"
        )
      )
    : {};

  // Judge fields are not validated individually; consumers merge them over defaults.
  const judge: Partial<JudgeSettings> | undefined = isMapping(o.judge)
    ? (o.judge as Partial<JudgeSettings>)
    : undefined;

  // Drop instance entries that lack the four required, correctly-typed fields.
  let instances: LLMInstance[] | undefined;
  if (Array.isArray(o.instances)) {
    instances = o.instances.filter(
      (x): x is LLMInstance =>
        x !== null &&
        typeof x === "object" &&
        typeof (x as LLMInstance).id === "string" &&
        typeof (x as LLMInstance).provider === "string" &&
        typeof (x as LLMInstance).model === "string" &&
        typeof (x as LLMInstance).enabled === "boolean"
    );
  }

  return { version, secrets, judge, instances };
}
60
+
61
/**
 * Combine an imported config with the current in-app state.
 *
 * - Secrets: imported entries override current ones with the same name.
 * - Judge: defaults, then current settings, then imported settings.
 * - Instances: replaced wholesale when the import carries an array,
 *   otherwise the current list is kept.
 *
 * @param parsed - Result of parsing an imported YAML config.
 * @param current - The state currently held by the app.
 * @returns A new state object; neither input is mutated.
 */
export function mergeImportedConfig(
  parsed: AppConfigYaml,
  current: { secrets: SecretsMap; judge: JudgeSettings; instances: LLMInstance[] }
): { secrets: SecretsMap; judge: JudgeSettings; instances: LLMInstance[] } {
  const importedSecrets = parsed.secrets ?? {};
  const importedJudge = parsed.judge ?? {};

  const secrets: SecretsMap = { ...current.secrets, ...importedSecrets };
  const judge: JudgeSettings = {
    ...DEFAULT_JUDGE_SETTINGS,
    ...current.judge,
    ...importedJudge,
  };

  let instances: LLMInstance[];
  if (Array.isArray(parsed.instances)) {
    instances = parsed.instances;
  } else {
    instances = current.instances;
  }

  return { secrets, judge, instances };
}
@@ -0,0 +1,70 @@
1
+ import type { SuiteResult } from "@darkrishabh/bench-ai";
2
+ import type { SuiteJudgeMeta } from "./suite-judge-meta";
3
+
4
+ type SsePayload =
5
+ | { type: "log"; line: string }
6
+ | { type: "done"; result: SuiteResult; runLog: string[]; judgeMeta: SuiteJudgeMeta }
7
+ | { type: "error"; message: string };
8
+
9
/**
 * Read a POST /api/suite response with Content-Type: text/event-stream.
 *
 * Events are separated by a blank line; both LF and CRLF line endings are
 * accepted. Each event's `data:` lines are joined and parsed as one JSON
 * payload:
 * - `log`   → `onLogLine` is invoked immediately with the line,
 * - `error` → the stream is cancelled and an `Error` with the message is thrown,
 * - `done`  → the payload is remembered and returned once the stream ends.
 * Events whose data is not valid JSON are skipped.
 *
 * @param res - The fetch response whose body is the event stream.
 * @param onLogLine - Callback invoked for each log line as it arrives.
 * @returns The final result, run log, and judge metadata (`null` when the
 *          server did not send any).
 * @throws Error when the response has no body, when the server emits an
 *         `error` event, or when the stream ends without a `done` event.
 */
export async function consumeSuiteSseStream(
  res: Response,
  onLogLine: (line: string) => void
): Promise<{ result: SuiteResult; runLog: string[]; judgeMeta: SuiteJudgeMeta | null }> {
  type FinalPayload = { result: SuiteResult; runLog: string[]; judgeMeta: SuiteJudgeMeta | null };

  const reader = res.body?.getReader();
  if (!reader) {
    throw new Error("No response body");
  }

  const decoder = new TextDecoder();
  let buffer = "";
  // Held in an object so assignments made inside the closure stay visible to
  // the type checker (a bare `let` would be narrowed to `null` below).
  const state: { final: FinalPayload | null } = { final: null };

  /** Parse one event block (the text between two blank lines). */
  const parseBlock = (block: string): void => {
    const dataLines = block.split("\n").filter((l) => l.startsWith("data:"));
    if (dataLines.length === 0) return;
    const jsonStr = dataLines.map((l) => l.replace(/^data:\s?/, "")).join("\n");
    let msg: SsePayload;
    try {
      msg = JSON.parse(jsonStr) as SsePayload;
    } catch {
      // Malformed payloads are ignored on purpose (best-effort streaming).
      return;
    }
    if (msg.type === "log") onLogLine(msg.line);
    if (msg.type === "error") throw new Error(msg.message);
    if (msg.type === "done") {
      state.final = {
        result: msg.result,
        runLog: msg.runLog,
        judgeMeta: msg.judgeMeta ?? null,
      };
    }
  };

  /**
   * Dispatch every complete event in the buffer. With `flush`, whatever is
   * left (an unterminated last event) is dispatched too.
   */
  const drain = (flush: boolean): void => {
    // Normalize CRLF so "\r\n\r\n" separators are recognized. A lone trailing
    // "\r" (CRLF split across chunks) stays in the buffer until its "\n" arrives.
    buffer = buffer.replace(/\r\n/g, "\n");
    for (;;) {
      const idx = buffer.indexOf("\n\n");
      if (idx === -1) break;
      const block = buffer.slice(0, idx);
      buffer = buffer.slice(idx + 2);
      parseBlock(block);
    }
    if (flush && buffer.trim()) {
      const rest = buffer;
      buffer = "";
      parseBlock(rest);
    }
  };

  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });
      drain(false);
    }
    buffer += decoder.decode();
    drain(true);
  } catch (err) {
    // Stop the underlying request instead of leaving it streaming unread.
    await reader.cancel().catch(() => undefined);
    throw err;
  } finally {
    reader.releaseLock();
  }

  if (!state.final) {
    throw new Error("Stream ended without a result event");
  }
  return state.final;
}
@@ -0,0 +1,23 @@
1
+ import type { JudgeSettings, SecretsMap } from "../types";
2
+
3
/**
 * One-line summary for suite “run target” UI.
 *
 * Configured values are trimmed before use so that whitespace-only settings
 * fall back to the same defaults shown here as when they are empty.
 *
 * @param judge - Current judge settings.
 * @param secrets - Named secrets; only the one referenced by
 *                  `judge.anthropicSecretRef` (default `"anthropic"`) is inspected.
 * @returns Human-readable description of which judge will grade llm-rubric assertions.
 */
export function describeJudgeForUi(judge: JudgeSettings, secrets: SecretsMap): string {
  const ref = judge.anthropicSecretRef?.trim() || "anthropic";
  const hasAnthropicSecret = Boolean(secrets[ref]?.trim());
  const claudeModel = judge.claudeModel?.trim();

  switch (judge.mode) {
    case "none":
      return "No judge — llm-rubric assertions will not be graded.";
    case "ollama": {
      const baseUrl = judge.ollamaBaseUrl?.trim() || "http://localhost:11434";
      const model = judge.ollamaModel?.trim() || "llama3.2";
      return `Ollama judge at ${baseUrl} · model ${model}`;
    }
    case "claude":
      return hasAnthropicSecret
        ? `Claude (${claudeModel || "default"}) · API key from secret “${ref}”`
        : `Claude (${claudeModel || "default"}) · key from secret “${ref}” or server ANTHROPIC_API_KEY`;
    case "auto":
    default:
      // Any unrecognized mode is described like "auto".
      return hasAnthropicSecret
        ? `Auto · Claude when available (secret “${ref}”, ${claudeModel || "default model"})`
        : `Auto · Claude if ANTHROPIC_API_KEY is set on the server (secret “${ref}” is empty)`;
  }
}
@@ -0,0 +1,9 @@
1
+ /** Ordinal dot colors for enabled model chips in the prompt bar (distinct slots). */
2
+ export const MODEL_CHIP_PALETTE = [
3
+ "#16a34a",
4
+ "#2563eb",
5
+ "#ea580c",
6
+ "#7c3aed",
7
+ "#0891b2",
8
+ "#db2777",
9
+ ] as const;
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Filter OpenAI `GET /v1/models` IDs for chat-style use in our UI.
3
+ *
4
+ * Note: OpenAI does not always expose every chat-capable model in this list (tier,
5
+ * product, or API surface). If a model is missing here, use **Other…** in Settings
6
+ * and paste the exact model id from the OpenAI docs.
7
+ */
8
+
9
/**
 * Decide whether a model id should be hidden from the chat-model picker.
 * Matching is case-insensitive.
 */
const DROP = (id: string): boolean => {
  const lower = id.toLowerCase();

  // Substrings that mark non-chat SKUs: embeddings, speech/audio pipelines,
  // image generation, moderation, realtime, computer-use, and the
  // search-augmented variants. The bare substring "search" is intentionally
  // NOT listed — it can appear inside unrelated id segments and would hide
  // GPT-5+ ids.
  const nonChatFragments = [
    "embedding",
    "whisper",
    "tts",
    "dall-e",
    "dalle",
    "moderation",
    "realtime",
    "transcribe",
    "speech",
    "computer-use",
    "search-preview",
    "search-api",
  ];
  if (nonChatFragments.some((fragment) => lower.includes(fragment))) {
    return true;
  }

  // Fine-tuned models are listed with an "ft:" prefix.
  if (lower.startsWith("ft:")) {
    return true;
  }

  // "audio" only as a whole word (audio / non-text completion SKUs), plus the
  // gpt-4o search family.
  return /\baudio\b/.test(lower) || /gpt-4o-search/.test(lower);
};

/**
 * Reduce a raw list of model ids to the chat-capable ones.
 *
 * @param ids - Model ids as returned by the models endpoint; empty entries are ignored.
 * @returns De-duplicated ids that survive the non-chat filter, sorted with `localeCompare`.
 */
export function filterOpenAiChatModelIds(ids: string[]): string[] {
  const kept = new Set<string>();
  for (const id of ids) {
    if (id && !DROP(id)) {
      kept.add(id);
    }
  }
  return Array.from(kept).sort((left, right) => left.localeCompare(right));
}
@@ -0,0 +1,30 @@
1
+ /** Shared provider colors for chips, cards, and run-target UI */
2
+
3
/**
 * Color and border CSS variables per provider id. CLI-backed providers reuse
 * the colors of the vendor they wrap (`claude-cli` → claude, `codex` → openai).
 */
export const PROVIDER_UI: Record<string, { color: string; border: string }> = {
  claude: { color: "var(--claude)", border: "var(--claude-border)" },
  ollama: { color: "var(--ollama)", border: "var(--ollama-border)" },
  minimax: { color: "var(--minimax)", border: "var(--minimax-border)" },
  openai: { color: "var(--openai)", border: "var(--openai-border)" },
  groq: { color: "var(--groq)", border: "var(--groq-border)" },
  openrouter: { color: "var(--openrouter)", border: "var(--openrouter-border)" },
  "nvidia-nim": { color: "var(--nvidia-nim)", border: "var(--nvidia-nim-border)" },
  together: { color: "var(--together)", border: "var(--together-border)" },
  perplexity: { color: "var(--perplexity)", border: "var(--perplexity-border)" },
  custom: { color: "var(--custom)", border: "var(--custom-border)" },
  "claude-cli": { color: "var(--claude)", border: "var(--claude-border)" },
  codex: { color: "var(--openai)", border: "var(--openai-border)" },
};

/**
 * Header-style label: underscores become hyphens and the result is
 * upper-cased, e.g. `nvidia-nim` → `NVIDIA-NIM`.
 */
export function formatProviderDisplayName(provider: string): string {
  return provider.replace(/_/g, "-").toUpperCase();
}

/**
 * Look up the UI colors for a provider id.
 *
 * Only the table's own keys count: ids that merely collide with inherited
 * object members (e.g. `"constructor"`, `"toString"`) get the neutral
 * fallback like any other unknown provider.
 *
 * @param provider - Provider id, possibly user-supplied.
 * @returns The provider's colors, or neutral text/border colors when unknown.
 */
export function providerUi(provider: string): { color: string; border: string } {
  if (Object.prototype.hasOwnProperty.call(PROVIDER_UI, provider)) {
    const known = PROVIDER_UI[provider];
    if (known) return known;
  }
  return {
    color: "var(--text-3)",
    border: "var(--border)",
  };
}
@@ -0,0 +1,80 @@
1
+ import type { JudgeSettings, LLMInstance } from "../types";
2
+ import { DEFAULT_JUDGE_SETTINGS } from "../types";
3
+
4
/**
 * Resolve credentials for a whole list of instances before sending them to an
 * API route: each instance is passed through `resolveInstanceForApi`, which
 * merges inline keys with secret variables and omits the ref fields.
 *
 * @param instances - Instances as stored in the UI.
 * @param secrets - Named secret values.
 * @returns A new array, in the same order, of resolved instances.
 */
export function resolveInstancesForApi(
  instances: LLMInstance[],
  secrets: Record<string, string>
): LLMInstance[] {
  const resolved: LLMInstance[] = [];
  for (const instance of instances) {
    resolved.push(resolveInstanceForApi(instance, secrets));
  }
  return resolved;
}
11
+
12
/**
 * Resolve one instance's credentials for an API route.
 *
 * For both the API key and the group id, a non-empty secret referenced by the
 * `*SecretRef` field wins over the inline value; values are trimmed, and an
 * empty result becomes `undefined`. The ref fields themselves are removed from
 * the returned object.
 *
 * Secret lookup only considers the map's own string entries, so a ref that
 * names an inherited object member (e.g. `"constructor"`) or a non-string
 * stored value resolves to "no secret" instead of throwing.
 *
 * @param instance - Instance as stored in the UI.
 * @param secrets - Named secret values.
 * @returns A copy of the instance with `apiKey` / `groupId` resolved.
 */
export function resolveInstanceForApi(
  instance: LLMInstance,
  secrets: Record<string, string>
): LLMInstance {
  /** Trimmed secret value for a ref name, or "" when unset/unknown. */
  const fromRef = (ref: string | undefined): string => {
    const name = typeof ref === "string" ? ref.trim() : "";
    if (!name || !Object.prototype.hasOwnProperty.call(secrets, name)) return "";
    const value: unknown = secrets[name];
    return typeof value === "string" ? value.trim() : "";
  };

  /** Trimmed inline value, or "" when absent or not a string. */
  const inline = (value: unknown): string =>
    typeof value === "string" ? value.trim() : "";

  const apiKey = fromRef(instance.apiKeySecretRef) || inline(instance.apiKey) || undefined;
  const groupId = fromRef(instance.groupIdSecretRef) || inline(instance.groupId) || undefined;

  // Strip the ref fields; the two bindings exist only to exclude them from `rest`.
  const {
    apiKeySecretRef: _apiKeySecretRef,
    groupIdSecretRef: _groupIdSecretRef,
    ...rest
  } = instance;

  return {
    ...rest,
    apiKey,
    groupId,
  };
}
39
+
40
+ /** Payload sent to /api/suite for llm-rubric judge construction. */
41
+ export interface JudgeApiPayload {
42
+ mode: string;
43
+ anthropicApiKey?: string;
44
+ claudeModel?: string;
45
+ ollamaBaseUrl?: string;
46
+ ollamaModel?: string;
47
+ }
48
+
49
/**
 * Build the judge section of the /api/suite request from UI settings.
 *
 * Settings are first merged over the defaults. `none` and `ollama` produce
 * payloads without any Anthropic key; `claude` and every other mode (reported
 * as `auto`) carry the key looked up under `anthropicSecretRef` (default
 * `"anthropic"`) plus the Claude model.
 *
 * @param judge - Judge settings from the UI.
 * @param secrets - Named secret values.
 * @returns The payload for the selected mode.
 */
export function buildJudgeApiPayload(
  judge: JudgeSettings,
  secrets: Record<string, string>
): JudgeApiPayload {
  const merged = { ...DEFAULT_JUDGE_SETTINGS, ...judge };
  const secretName = merged.anthropicSecretRef?.trim() || "anthropic";
  const anthropicApiKey = secrets[secretName]?.trim() || undefined;

  switch (merged.mode) {
    case "none":
      return { mode: "none" };
    case "ollama":
      return {
        mode: "ollama",
        ollamaBaseUrl: merged.ollamaBaseUrl?.trim() || "http://localhost:11434",
        ollamaModel: merged.ollamaModel?.trim() || "llama3.2",
      };
    default:
      break;
  }

  // "claude" keeps its own mode; anything else falls through to "auto".
  const claudeModel = merged.claudeModel?.trim() || DEFAULT_JUDGE_SETTINGS.claudeModel;
  return {
    mode: merged.mode === "claude" ? "claude" : "auto",
    anthropicApiKey,
    claudeModel,
  };
}
@@ -0,0 +1,66 @@
1
+ import type { WebDiffResult } from "../types";
2
+
3
+ const HISTORY_KEY = "bench-ai:run-history";
4
+ const LEGACY_PROMPT_DIFF = "prompt-diff:run-history";
5
+ const LEGACY_LLM_DIFF = "llm-diff:run-history";
6
+ const MAX_ENTRIES = 25;
7
+
8
+ export interface RunHistoryEntry {
9
+ id: string;
10
+ ranAt: string;
11
+ promptPreview: string;
12
+ result: WebDiffResult;
13
+ }
14
+
15
/**
 * Generate an id for a history entry: the current epoch milliseconds, a
 * hyphen, and up to seven random base-36 characters.
 */
function uid(): string {
  const stamp = Date.now();
  const suffix = Math.random().toString(36).slice(2, 9);
  return `${stamp}-${suffix}`;
}
18
+
19
/**
 * Read the serialized run history, falling back to the legacy storage keys.
 *
 * When the value is found under a legacy key it is copied to the current key
 * on a best-effort basis: a failed write (quota exceeded, storage blocked)
 * must not prevent the legacy data from being returned.
 *
 * @returns The raw JSON string, or `null` when running without `window` or
 *          when no key holds a value.
 */
function readHistoryRaw(): string | null {
  if (typeof window === "undefined") return null;

  const current = localStorage.getItem(HISTORY_KEY);
  if (current != null) return current;

  for (const legacyKey of [LEGACY_PROMPT_DIFF, LEGACY_LLM_DIFF]) {
    const legacy = localStorage.getItem(legacyKey);
    if (legacy == null) continue;
    try {
      localStorage.setItem(HISTORY_KEY, legacy);
    } catch {
      /* migration is optional; the value will be re-read from the legacy key next time */
    }
    return legacy;
  }
  return null;
}
32
+
33
/**
 * Load the stored run history.
 *
 * @returns The stored entries, or an empty array when running without
 *          `window`, when nothing is stored, when the stored JSON is not an
 *          array, or when reading/parsing throws.
 */
export function loadRunHistory(): RunHistoryEntry[] {
  if (typeof window === "undefined") {
    return [];
  }

  let entries: RunHistoryEntry[] = [];
  try {
    const serialized = readHistoryRaw();
    if (serialized) {
      const decoded = JSON.parse(serialized) as RunHistoryEntry[];
      if (Array.isArray(decoded)) {
        entries = decoded;
      }
    }
  } catch {
    entries = [];
  }
  return entries;
}
44
+
45
/**
 * Persist the run history, keeping only the first `MAX_ENTRIES` entries.
 * Does nothing without `window`; storage errors are swallowed.
 *
 * @param entries - History entries in the order they should be stored.
 */
export function saveRunHistory(entries: RunHistoryEntry[]): void {
  if (typeof window !== "undefined") {
    const capped = entries.slice(0, MAX_ENTRIES);
    try {
      localStorage.setItem(HISTORY_KEY, JSON.stringify(capped));
    } catch {
      /* ignore quota */
    }
  }
}
53
+
54
/**
 * Record a finished run at the front of the stored history.
 *
 * The entry's preview is the trimmed prompt, shortened to 117 characters plus
 * an ellipsis when longer than 120, or a placeholder when the prompt is empty.
 *
 * @param result - The run to record; its `ranAt` becomes the entry timestamp.
 */
export function appendRunHistory(result: WebDiffResult): void {
  const trimmedPrompt = result.prompt.trim();

  let promptPreview: string;
  if (trimmedPrompt.length > 120) {
    promptPreview = `${trimmedPrompt.slice(0, 117)}…`;
  } else {
    promptPreview = trimmedPrompt || "(empty prompt)";
  }

  const newest: RunHistoryEntry = {
    id: uid(),
    ranAt: result.ranAt,
    promptPreview,
    result,
  };

  const combined = [newest].concat(loadRunHistory());
  saveRunHistory(combined.slice(0, MAX_ENTRIES));
}
@@ -0,0 +1,50 @@
1
/** One line of a diff: unchanged, present only in `b`, or present only in `a`. */
export type DiffLine = { type: "same" | "add" | "remove"; text: string };

/** Inputs with more lines than this (on either side) are not diffed. */
const MAX_LINES = 1200;

/**
 * Line-level LCS diff for two strings (split on newlines).
 *
 * When either side exceeds `MAX_LINES` lines, a single `same` line carrying an
 * explanatory message is returned instead of a diff. When a removal and an
 * addition are equally good, the removal is emitted first.
 *
 * @param a - The "before" text.
 * @param b - The "after" text.
 * @returns The diff as an ordered list of lines.
 */
export function diffLines(a: string, b: string): DiffLine[] {
  const A = a.split("\n");
  const B = b.split("\n");
  if (A.length > MAX_LINES || B.length > MAX_LINES) {
    return [
      {
        type: "same",
        text: `Outputs are too long to diff inline (${A.length} vs ${B.length} lines). Use side-by-side or shorten the text.`,
      },
    ];
  }

  const n = A.length;
  const m = B.length;

  // dp[i][j] = length of the longest common subsequence of A[i..] and B[j..].
  const dp: number[][] = Array.from({ length: n + 1 }, () => new Array<number>(m + 1).fill(0));
  for (let i = n - 1; i >= 0; i--) {
    for (let j = m - 1; j >= 0; j--) {
      dp[i][j] =
        A[i] === B[j] ? 1 + dp[i + 1][j + 1] : Math.max(dp[i + 1][j], dp[i][j + 1]);
    }
  }

  // Walk the table from the top-left with a loop rather than recursion, so the
  // call stack does not grow with the number of output lines.
  const out: DiffLine[] = [];
  let i = 0;
  let j = 0;
  while (i < n || j < m) {
    if (i < n && j < m && A[i] === B[j]) {
      out.push({ type: "same", text: A[i] });
      i++;
      j++;
    } else if (j < m && (i === n || dp[i + 1][j] < dp[i][j + 1])) {
      // Skipping B[j] keeps the longer common subsequence (or A is exhausted).
      out.push({ type: "add", text: B[j] });
      j++;
    } else {
      // Here i < n is guaranteed: if A were exhausted the branch above would have run.
      out.push({ type: "remove", text: A[i] });
      i++;
    }
  }
  return out;
}
@@ -0,0 +1,100 @@
1
+ import type { JudgeSettings, LLMInstance, SecretsMap } from "../types";
2
+ import { DEFAULT_JUDGE_SETTINGS } from "../types";
3
+
4
+ const KEY = "bench-ai:instances";
5
+ const SECRETS_KEY = "bench-ai:secrets";
6
+ const JUDGE_KEY = "bench-ai:judge";
7
+
8
+ const LEGACY_PROMPT_DIFF_INSTANCES = "prompt-diff:instances";
9
+ const LEGACY_PROMPT_DIFF_SECRETS = "prompt-diff:secrets";
10
+ const LEGACY_PROMPT_DIFF_JUDGE = "prompt-diff:judge";
11
+ const LEGACY_LLM_DIFF_INSTANCES = "llm-diff:instances";
12
+ const LEGACY_LLM_DIFF_SECRETS = "llm-diff:secrets";
13
+ const LEGACY_LLM_DIFF_JUDGE = "llm-diff:judge";
14
+
15
+ export const DEFAULT_INSTANCES: LLMInstance[] = [
16
+ {
17
+ id: "claude-default",
18
+ provider: "claude",
19
+ model: "claude-3-5-haiku-20241022",
20
+ enabled: true,
21
+ maxTokens: 2048,
22
+ temperature: 0.7,
23
+ },
24
+ {
25
+ id: "ollama-default",
26
+ provider: "ollama",
27
+ model: "llama3.2",
28
+ enabled: true,
29
+ baseUrl: "http://localhost:11434",
30
+ temperature: 0.7,
31
+ },
32
+ ];
33
+
34
/**
 * Read a localStorage value, falling back to legacy keys in order.
 *
 * When the value is found under a legacy key it is copied to `primary` on a
 * best-effort basis: a failed write (quota exceeded, storage blocked) must not
 * prevent the legacy value from being returned.
 *
 * @param primary - The current storage key.
 * @param legacyKeys - Older keys to try, in priority order, when `primary` is unset.
 * @returns The stored string, or `null` when running without `window` or when
 *          no key holds a value.
 */
function readLocalStorage(primary: string, ...legacyKeys: string[]): string | null {
  if (typeof window === "undefined") return null;

  const current = localStorage.getItem(primary);
  if (current != null) return current;

  for (const legacyKey of legacyKeys) {
    const legacy = localStorage.getItem(legacyKey);
    if (legacy == null) continue;
    try {
      localStorage.setItem(primary, legacy);
    } catch {
      /* migration is optional; the value will be re-read from the legacy key next time */
    }
    return legacy;
  }
  return null;
}
47
+
48
/**
 * Load the configured model instances from localStorage (current key first,
 * then the legacy keys).
 *
 * @returns The stored instances, or `DEFAULT_INSTANCES` when running without
 *          `window`, when nothing is stored, when the stored JSON is not an
 *          array, or when reading/parsing throws.
 */
export function loadInstances(): LLMInstance[] {
  if (typeof window === "undefined") return DEFAULT_INSTANCES;
  try {
    const raw = readLocalStorage(
      KEY,
      LEGACY_PROMPT_DIFF_INSTANCES,
      LEGACY_LLM_DIFF_INSTANCES
    );
    if (!raw) return DEFAULT_INSTANCES;
    const parsed: unknown = JSON.parse(raw);
    // Valid JSON that is not a list (e.g. "null" or "{}") would otherwise be
    // handed to callers that immediately iterate over it.
    return Array.isArray(parsed) ? (parsed as LLMInstance[]) : DEFAULT_INSTANCES;
  } catch {
    return DEFAULT_INSTANCES;
  }
}
61
+
62
/**
 * Persist the model instances as JSON under the instances key.
 * Does nothing without `window`; storage errors propagate to the caller.
 */
export function saveInstances(instances: LLMInstance[]): void {
  if (typeof window !== "undefined") {
    const serialized = JSON.stringify(instances);
    localStorage.setItem(KEY, serialized);
  }
}
66
+
67
/**
 * Load the named secrets from localStorage (current key first, then the
 * legacy keys).
 *
 * Only string-valued entries are kept, so consumers can safely call string
 * methods on every value.
 *
 * @returns The stored secrets, or an empty map when running without `window`,
 *          when nothing is stored, when the stored JSON is not a plain object,
 *          or when reading/parsing throws.
 */
export function loadSecrets(): SecretsMap {
  if (typeof window === "undefined") return {};
  try {
    const raw = readLocalStorage(SECRETS_KEY, LEGACY_PROMPT_DIFF_SECRETS, LEGACY_LLM_DIFF_SECRETS);
    if (!raw) return {};
    const parsed: unknown = JSON.parse(raw);
    if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) return {};
    const stringEntries = Object.entries(parsed as Record<string, unknown>).filter(
      (entry): entry is [string, string] => typeof entry[1] === "string"
    );
    return Object.fromEntries(stringEntries);
  } catch {
    return {};
  }
}
79
+
80
/**
 * Persist the named secrets as JSON under the secrets key.
 * Does nothing without `window`; storage errors propagate to the caller.
 */
export function saveSecrets(secrets: SecretsMap): void {
  if (typeof window !== "undefined") {
    const serialized = JSON.stringify(secrets);
    localStorage.setItem(SECRETS_KEY, serialized);
  }
}
84
+
85
/**
 * Load the judge settings from localStorage (current key first, then the
 * legacy keys), layered over the defaults so missing fields are filled in.
 *
 * @returns The merged settings, or `DEFAULT_JUDGE_SETTINGS` when running
 *          without `window`, when nothing is stored, when the stored JSON is
 *          not a plain object, or when reading/parsing throws.
 */
export function loadJudgeSettings(): JudgeSettings {
  if (typeof window === "undefined") return DEFAULT_JUDGE_SETTINGS;
  try {
    const raw = readLocalStorage(JUDGE_KEY, LEGACY_PROMPT_DIFF_JUDGE, LEGACY_LLM_DIFF_JUDGE);
    if (!raw) return DEFAULT_JUDGE_SETTINGS;
    const parsed: unknown = JSON.parse(raw);
    // Spreading an array or a string would add index keys to the settings.
    if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
      return DEFAULT_JUDGE_SETTINGS;
    }
    return { ...DEFAULT_JUDGE_SETTINGS, ...(parsed as Partial<JudgeSettings>) };
  } catch {
    return DEFAULT_JUDGE_SETTINGS;
  }
}
96
+
97
/**
 * Persist the judge settings as JSON under the judge key.
 * Does nothing without `window`; storage errors propagate to the caller.
 */
export function saveJudgeSettings(judge: JudgeSettings): void {
  if (typeof window !== "undefined") {
    const serialized = JSON.stringify(judge);
    localStorage.setItem(JUDGE_KEY, serialized);
  }
}
@@ -0,0 +1,13 @@
1
+ /** Returned with POST /api/suite so the UI can show whether llm-rubric actually invoked a judge LLM. */
2
+ export interface SuiteJudgeMeta {
3
+ /** Number of `llm-rubric` assertions in the parsed suite YAML */
4
+ rubricAssertionCount: number;
5
+ /** True only if a judge provider was constructed and the suite has at least one llm-rubric */
6
+ willEvaluateRubrics: boolean;
7
+ judgeMode: string;
8
+ judgeBackend: "claude" | "ollama" | "off";
9
+ /** e.g. claude/claude-3-5-haiku-20241022 when active */
10
+ judgeLabel?: string;
11
+ /** Short human-readable summary for the banner */
12
+ summary: string;
13
+ }
@@ -0,0 +1,81 @@
1
+ import type { SuiteResult } from "@darkrishabh/bench-ai";
2
+ import type { SuiteJudgeMeta } from "./suite-judge-meta";
3
+
4
+ const HISTORY_KEY = "bench-ai:suite-run-history";
5
+ const LEGACY_PROMPT_DIFF = "prompt-diff:suite-run-history";
6
+ const LEGACY_LLM_DIFF = "llm-diff:suite-run-history";
7
+ /** Fewer than diff runs — suite payloads include full outputs per case. */
8
+ const MAX_ENTRIES = 15;
9
+
10
+ export interface SuiteRunHistoryEntry {
11
+ id: string;
12
+ ranAt: string;
13
+ /** Short label for the list (first meaningful YAML line) */
14
+ yamlPreview: string;
15
+ yaml: string;
16
+ result: SuiteResult;
17
+ runLog: string[];
18
+ judgeMeta: SuiteJudgeMeta | null;
19
+ }
20
+
21
/**
 * Generate an id for a suite-history entry: the current epoch milliseconds, a
 * hyphen, and up to seven random base-36 characters.
 */
function uid(): string {
  const stamp = Date.now();
  const suffix = Math.random().toString(36).slice(2, 9);
  return `${stamp}-${suffix}`;
}
24
+
25
/**
 * Derive a short list label from suite YAML: the first line that is neither
 * blank nor a `#` comment, trimmed, and cut to 112 characters plus an ellipsis
 * when longer than 115.
 *
 * @param yaml - The suite YAML text.
 * @returns The label, or `"(empty suite)"` when no such line exists.
 */
function previewFromYaml(yaml: string): string {
  const firstMeaningful = yaml
    .split(/\n/)
    .map((line) => line.trim())
    .find((text) => text !== "" && !text.startsWith("#"));

  if (firstMeaningful === undefined) {
    return "(empty suite)";
  }
  if (firstMeaningful.length > 115) {
    return `${firstMeaningful.slice(0, 112)}…`;
  }
  return firstMeaningful;
}
34
+
35
/**
 * Read the serialized suite-run history, falling back to the legacy storage keys.
 *
 * When the value is found under a legacy key it is copied to the current key
 * on a best-effort basis: a failed write (quota exceeded, storage blocked)
 * must not prevent the legacy data from being returned.
 *
 * @returns The raw JSON string, or `null` when running without `window` or
 *          when no key holds a value.
 */
function readSuiteHistoryRaw(): string | null {
  if (typeof window === "undefined") return null;

  const current = localStorage.getItem(HISTORY_KEY);
  if (current != null) return current;

  for (const legacyKey of [LEGACY_PROMPT_DIFF, LEGACY_LLM_DIFF]) {
    const legacy = localStorage.getItem(legacyKey);
    if (legacy == null) continue;
    try {
      localStorage.setItem(HISTORY_KEY, legacy);
    } catch {
      /* migration is optional; the value will be re-read from the legacy key next time */
    }
    return legacy;
  }
  return null;
}
48
+
49
/**
 * Load the stored suite-run history.
 *
 * @returns The stored entries, or an empty array when running without
 *          `window`, when nothing is stored, when the stored JSON is not an
 *          array, or when reading/parsing throws.
 */
export function loadSuiteRunHistory(): SuiteRunHistoryEntry[] {
  if (typeof window === "undefined") {
    return [];
  }

  let entries: SuiteRunHistoryEntry[] = [];
  try {
    const serialized = readSuiteHistoryRaw();
    if (serialized) {
      const decoded = JSON.parse(serialized) as SuiteRunHistoryEntry[];
      if (Array.isArray(decoded)) {
        entries = decoded;
      }
    }
  } catch {
    entries = [];
  }
  return entries;
}
60
+
61
/**
 * Persist the suite-run history, keeping only the first `MAX_ENTRIES` entries.
 * Does nothing without `window`; storage errors are swallowed.
 *
 * @param entries - History entries in the order they should be stored.
 */
export function saveSuiteRunHistory(entries: SuiteRunHistoryEntry[]): void {
  if (typeof window !== "undefined") {
    const capped = entries.slice(0, MAX_ENTRIES);
    try {
      localStorage.setItem(HISTORY_KEY, JSON.stringify(capped));
    } catch {
      /* ignore quota */
    }
  }
}
69
+
70
/**
 * Record a finished suite run at the front of the stored history.
 *
 * A fresh id and a preview label derived from the YAML are added to the
 * payload; fields supplied in the payload are written after those two.
 *
 * @param payload - Everything about the run except the generated `id` and `yamlPreview`.
 */
export function appendSuiteRunHistory(
  payload: Omit<SuiteRunHistoryEntry, "id" | "yamlPreview">
): void {
  const newest: SuiteRunHistoryEntry = {
    id: uid(),
    yamlPreview: previewFromYaml(payload.yaml),
    ...payload,
  };
  const combined = [newest].concat(loadSuiteRunHistory());
  saveSuiteRunHistory(combined);
}