@darkrishabh/bench-ai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. package/README.md +333 -0
  2. package/dist/cli/app.d.ts +11 -0
  3. package/dist/cli/app.d.ts.map +1 -0
  4. package/dist/cli/app.js +48 -0
  5. package/dist/cli/app.js.map +1 -0
  6. package/dist/cli/components/DiffView.d.ts +5 -0
  7. package/dist/cli/components/DiffView.d.ts.map +1 -0
  8. package/dist/cli/components/DiffView.js +14 -0
  9. package/dist/cli/components/DiffView.js.map +1 -0
  10. package/dist/cli/components/EvalView.d.ts +6 -0
  11. package/dist/cli/components/EvalView.d.ts.map +1 -0
  12. package/dist/cli/components/EvalView.js +82 -0
  13. package/dist/cli/components/EvalView.js.map +1 -0
  14. package/dist/cli/components/Spinner.d.ts +4 -0
  15. package/dist/cli/components/Spinner.d.ts.map +1 -0
  16. package/dist/cli/components/Spinner.js +15 -0
  17. package/dist/cli/components/Spinner.js.map +1 -0
  18. package/dist/cli/index.d.ts +3 -0
  19. package/dist/cli/index.d.ts.map +1 -0
  20. package/dist/cli/index.js +117 -0
  21. package/dist/cli/index.js.map +1 -0
  22. package/dist/cli/run-command.d.ts +11 -0
  23. package/dist/cli/run-command.d.ts.map +1 -0
  24. package/dist/cli/run-command.js +119 -0
  25. package/dist/cli/run-command.js.map +1 -0
  26. package/dist/engine/cost.d.ts +3 -0
  27. package/dist/engine/cost.d.ts.map +1 -0
  28. package/dist/engine/cost.js +52 -0
  29. package/dist/engine/cost.js.map +1 -0
  30. package/dist/engine/diff.d.ts +6 -0
  31. package/dist/engine/diff.d.ts.map +1 -0
  32. package/dist/engine/diff.js +43 -0
  33. package/dist/engine/diff.js.map +1 -0
  34. package/dist/engine/eval.d.ts +14 -0
  35. package/dist/engine/eval.d.ts.map +1 -0
  36. package/dist/engine/eval.js +194 -0
  37. package/dist/engine/eval.js.map +1 -0
  38. package/dist/engine/index.d.ts +15 -0
  39. package/dist/engine/index.d.ts.map +1 -0
  40. package/dist/engine/index.js +10 -0
  41. package/dist/engine/index.js.map +1 -0
  42. package/dist/engine/providers/base.d.ts +7 -0
  43. package/dist/engine/providers/base.d.ts.map +1 -0
  44. package/dist/engine/providers/base.js +2 -0
  45. package/dist/engine/providers/base.js.map +1 -0
  46. package/dist/engine/providers/claude.d.ts +15 -0
  47. package/dist/engine/providers/claude.d.ts.map +1 -0
  48. package/dist/engine/providers/claude.js +53 -0
  49. package/dist/engine/providers/claude.js.map +1 -0
  50. package/dist/engine/providers/minimax.d.ts +16 -0
  51. package/dist/engine/providers/minimax.d.ts.map +1 -0
  52. package/dist/engine/providers/minimax.js +67 -0
  53. package/dist/engine/providers/minimax.js.map +1 -0
  54. package/dist/engine/providers/ollama.d.ts +14 -0
  55. package/dist/engine/providers/ollama.d.ts.map +1 -0
  56. package/dist/engine/providers/ollama.js +60 -0
  57. package/dist/engine/providers/ollama.js.map +1 -0
  58. package/dist/engine/providers/openai-compatible.d.ts +19 -0
  59. package/dist/engine/providers/openai-compatible.d.ts.map +1 -0
  60. package/dist/engine/providers/openai-compatible.js +109 -0
  61. package/dist/engine/providers/openai-compatible.js.map +1 -0
  62. package/dist/engine/providers/subprocess.d.ts +55 -0
  63. package/dist/engine/providers/subprocess.d.ts.map +1 -0
  64. package/dist/engine/providers/subprocess.js +111 -0
  65. package/dist/engine/providers/subprocess.js.map +1 -0
  66. package/dist/engine/suite-loader.d.ts +11 -0
  67. package/dist/engine/suite-loader.d.ts.map +1 -0
  68. package/dist/engine/suite-loader.js +75 -0
  69. package/dist/engine/suite-loader.js.map +1 -0
  70. package/dist/engine/types.d.ts +104 -0
  71. package/dist/engine/types.d.ts.map +1 -0
  72. package/dist/engine/types.js +2 -0
  73. package/dist/engine/types.js.map +1 -0
  74. package/next-env.d.ts +6 -0
  75. package/next.config.ts +26 -0
  76. package/package.json +72 -0
  77. package/public/icon.svg +14 -0
  78. package/src/app/api/diff/route.ts +135 -0
  79. package/src/app/api/models/route.ts +96 -0
  80. package/src/app/api/suite/route.ts +314 -0
  81. package/src/app/globals.css +215 -0
  82. package/src/app/icon.svg +14 -0
  83. package/src/app/layout.tsx +44 -0
  84. package/src/app/opengraph-image.tsx +73 -0
  85. package/src/app/page.tsx +952 -0
  86. package/src/app/suite/layout.tsx +12 -0
  87. package/src/app/suite/page.tsx +206 -0
  88. package/src/app/twitter-image.tsx +1 -0
  89. package/src/components/BenchAiLogo.tsx +38 -0
  90. package/src/components/ComparePanel.tsx +643 -0
  91. package/src/components/ConfigPanel.tsx +809 -0
  92. package/src/components/MarkdownOutput.tsx +16 -0
  93. package/src/components/ModelResponseCard.tsx +313 -0
  94. package/src/components/QuickComparisonBar.tsx +184 -0
  95. package/src/components/ResponsesLineDiff.tsx +149 -0
  96. package/src/components/SettingsPanel.tsx +591 -0
  97. package/src/components/SuitePanel.tsx +875 -0
  98. package/src/lib/brand.ts +4 -0
  99. package/src/lib/config-yaml.ts +70 -0
  100. package/src/lib/consume-suite-sse.ts +70 -0
  101. package/src/lib/describe-judge.ts +23 -0
  102. package/src/lib/model-chip-palette.ts +9 -0
  103. package/src/lib/openai-model-list.ts +33 -0
  104. package/src/lib/provider-ui.ts +30 -0
  105. package/src/lib/resolve-credentials.ts +80 -0
  106. package/src/lib/run-history.ts +66 -0
  107. package/src/lib/simple-line-diff.ts +50 -0
  108. package/src/lib/storage.ts +100 -0
  109. package/src/lib/suite-judge-meta.ts +13 -0
  110. package/src/lib/suite-run-history.ts +81 -0
  111. package/src/types.ts +170 -0
  112. package/vercel.json +5 -0
@@ -0,0 +1,875 @@
1
+ "use client";
2
+
3
+ import React, { useState, useRef, useEffect, useMemo } from "react";
4
+ import type { SuiteResult, TestCaseResult, ProviderSummary, AssertionResult } from "@darkrishabh/bench-ai";
5
+ import type { JudgeSettings, LLMInstance, SecretsMap } from "../types";
6
+ import { formatCost } from "@darkrishabh/bench-ai";
7
+ import { buildJudgeApiPayload, resolveInstancesForApi } from "../lib/resolve-credentials";
8
+ import { providerUi } from "../lib/provider-ui";
9
+ import { describeJudgeForUi } from "../lib/describe-judge";
10
+ import type { SuiteJudgeMeta } from "../lib/suite-judge-meta";
11
+ import { consumeSuiteSseStream } from "../lib/consume-suite-sse";
12
+ import {
13
+ appendSuiteRunHistory,
14
+ loadSuiteRunHistory,
15
+ type SuiteRunHistoryEntry,
16
+ } from "../lib/suite-run-history";
17
+
18
+ // ─── Helpers ──────────────────────────────────────────────────────────────────
19
+
20
/**
 * Starter suite definition used as the initial content of the YAML editor.
 *
 * Indentation is significant: each `tests` item nests its `vars` and `assert`
 * keys, and every assertion nests its `value` / `threshold` under `- type:`.
 */
const EXAMPLE_YAML = `prompts:
  - "Explain {{concept}} in one paragraph"
  - "Write a {{language}} function that {{task}}"

tests:
  - vars:
      concept: "recursion"
      language: "Python"
      task: "reverses a linked list"
    assert:
      - type: contains
        value: "base case"
      - type: latency
        threshold: 8000
      - type: cost
        threshold: 0.01

  - vars:
      concept: "gradient descent"
      language: "TypeScript"
      task: "debounces a function"
    assert:
      - type: llm-rubric
        value: "explains it clearly without excessive jargon"
      - type: latency
        threshold: 8000
`;
47
+
48
/**
 * Maps a score to a CSS color value.
 *
 * @param score - Score to classify; compared against 0.8 and 0.5.
 * @returns Green for scores of at least 0.8, amber for scores of at least 0.5,
 *          and red for everything else (including NaN, which fails both checks).
 */
function scoreColor(score: number): string {
  const tone =
    score >= 0.8
      ? "var(--green)"
      : score >= 0.5
        ? "var(--amber, #f59e0b)"
        : "var(--red)";
  return tone;
}
53
+
54
/**
 * Pill that shows a score as a whole-number percentage, tinted by `scoreColor`.
 *
 * `scoreColor` returns `var(--…)` references, so a hex alpha suffix cannot be
 * appended to it (`var(--green)18` is not a valid CSS color and the declaration
 * would be discarded). The translucent background and border are produced with
 * `color-mix()` instead: 9% and 21% approximate the intended 0x18 and 0x35 alpha.
 *
 * @param score - Score to display; multiplied by 100 and rounded.
 */
function ScoreBadge({ score }: { score: number }) {
  const color = scoreColor(score);
  return (
    <span style={{
      display: "inline-flex", alignItems: "center", gap: "0.3rem",
      padding: "0.15rem 0.5rem", borderRadius: 5,
      background: `color-mix(in srgb, ${color} 9%, transparent)`,
      border: `1px solid color-mix(in srgb, ${color} 21%, transparent)`,
      color, fontWeight: 700, fontSize: "0.78rem",
    }}>
      {Math.round(score * 100)}%
    </span>
  );
}
67
+
68
/**
 * Renders a bold check mark (green) when `pass` is true, otherwise a bold cross (red).
 */
function PassFail({ pass }: { pass: boolean }) {
  const glyph = pass ? "✓" : "✗";
  const tone = pass ? "var(--green)" : "var(--red)";
  return (
    <span style={{ color: tone, fontWeight: 700, fontSize: "0.82rem" }}>
      {glyph}
    </span>
  );
}
78
+
79
+ // ─── Summary table ────────────────────────────────────────────────────────────
80
+
81
/**
 * Card containing one table row per provider summary: provider, model, score badge,
 * passed / failed counts, average latency and total cost.
 *
 * @param summary - Per-provider aggregates to list, in the order given.
 */
function SummaryTable({ summary }: { summary: ProviderSummary[] }) {
  const columnLabels = ["Provider", "Model", "Score", "Passed", "Failed", "Avg Latency", "Total Cost"];
  const cellPadding = "0.65rem 1rem";
  const mutedCell: React.CSSProperties = { padding: cellPadding, color: "var(--text-2)" };
  const headingCell: React.CSSProperties = {
    padding: "0.6rem 1rem",
    textAlign: "left",
    color: "var(--text-3)",
    fontWeight: 600,
    fontSize: "0.72rem",
    textTransform: "uppercase",
    letterSpacing: "0.04em",
    borderBottom: "1px solid var(--border)",
    whiteSpace: "nowrap",
  };

  return (
    <div
      style={{
        background: "var(--surface)",
        border: "1px solid var(--border)",
        borderRadius: "var(--r-xl)",
        overflow: "hidden",
        boxShadow: "var(--shadow-sm)",
      }}
    >
      <div
        style={{
          padding: "0.75rem 1.2rem",
          borderBottom: "1px solid var(--border)",
          fontWeight: 600,
          fontSize: "0.8125rem",
          letterSpacing: "-0.01em",
          color: "var(--text-1)",
          background: "var(--surface-subtle)",
        }}
      >
        Provider summary
      </div>
      <div style={{ overflowX: "auto" }}>
        <table style={{ width: "100%", borderCollapse: "collapse", fontSize: "0.82rem" }}>
          <thead>
            <tr style={{ background: "var(--surface-subtle)" }}>
              {columnLabels.map((label) => (
                <th key={label} style={headingCell}>
                  {label}
                </th>
              ))}
            </tr>
          </thead>
          <tbody>
            {summary.map((row) => {
              const hasFailures = row.failed > 0;
              // A zero total is shown as a fixed "$0.00" rather than passed through formatCost.
              const costLabel = row.totalCostUsd === 0 ? "$0.00" : formatCost(row.totalCostUsd);
              return (
                <tr key={`${row.provider}/${row.model}`} style={{ borderBottom: "1px solid var(--border)" }}>
                  <td style={{ padding: cellPadding, fontWeight: 600, color: "var(--text-1)" }}>{row.provider}</td>
                  <td style={{ ...mutedCell, fontFamily: "monospace", fontSize: "0.78rem" }}>{row.model}</td>
                  <td style={{ padding: cellPadding }}><ScoreBadge score={row.score} /></td>
                  <td style={{ padding: cellPadding, color: "var(--green)", fontWeight: 600 }}>{row.passed}</td>
                  <td
                    style={{
                      padding: cellPadding,
                      color: hasFailures ? "var(--red)" : "var(--text-3)",
                      fontWeight: hasFailures ? 600 : 400,
                    }}
                  >
                    {row.failed}
                  </td>
                  <td style={mutedCell}>{row.avgLatencyMs.toLocaleString()}ms</td>
                  <td style={mutedCell}>{costLabel}</td>
                </tr>
              );
            })}
          </tbody>
        </table>
      </div>
    </div>
  );
}
119
+
120
+ // ─── Results table (rows = cases, columns = providers) ────────────────────────
121
+
122
/**
 * Compact one-line assertion result: a pass/fail mark followed by `[type]` and,
 * when a reason is present, ` — reason`.
 */
function AssertionChip({ a }: { a: AssertionResult }) {
  const mark = a.pass ? "✓" : "✗";
  const markColor = a.pass ? "var(--green)" : "var(--red)";
  const detail = a.reason ? ` — ${a.reason}` : "";

  return (
    <div style={{ display: "flex", alignItems: "flex-start", gap: "0.3rem", fontSize: "0.72rem" }}>
      <span style={{ color: markColor, flexShrink: 0 }}>{mark}</span>
      <span style={{ color: "var(--text-3)" }}>[{a.type}]{detail}</span>
    </div>
  );
}
130
+
131
/**
 * Rich block for a judge-backed rubric assertion: the rubric criterion, the
 * pass/fail verdict, and the rationale.
 *
 * - When `a.rubricCriterion` is missing or blank, a placeholder explaining that
 *   the criterion was not stored is shown instead.
 * - The rationale section is rendered only when `a.reason` contains non-blank
 *   text, so the card never shows a "Why it passed/failed" heading above an
 *   empty paragraph (AssertionChip likewise treats the reason as optional).
 */
function LlmRubricJudgmentCard({ a }: { a: AssertionResult }) {
  const criterion =
    a.rubricCriterion?.trim() ||
    "Criterion text was not stored (re-run the suite with the current app version).";
  const hasRationale = Boolean(a.reason?.trim());
  const verdictColor = a.pass ? "var(--green)" : "var(--red)";
  const verdictBg = a.pass ? "var(--green-subtle)" : "var(--red-subtle)";

  return (
    <div
      style={{
        border: "1px solid var(--border)",
        borderRadius: "var(--r-md)",
        background: "var(--surface)",
        padding: "0.65rem 0.75rem",
        display: "flex",
        flexDirection: "column",
        gap: "0.55rem",
      }}
    >
      <div style={{ fontSize: "0.65rem", fontWeight: 700, color: "var(--text-3)", letterSpacing: "0.06em", textTransform: "uppercase" }}>
        LLM rubric (judge)
      </div>

      <div
        style={{
          padding: "0.5rem 0.6rem",
          borderRadius: 6,
          background: "var(--surface-muted)",
          border: "1px solid var(--border)",
        }}
      >
        <div style={{ fontSize: "0.65rem", fontWeight: 600, color: "var(--text-3)", marginBottom: "0.25rem" }}>Rubric detail</div>
        <p style={{ margin: 0, fontSize: "0.78rem", color: "var(--text-1)", lineHeight: 1.5 }}>{criterion}</p>
      </div>

      <div
        style={{
          display: "flex",
          alignItems: "center",
          gap: "0.5rem",
          flexWrap: "wrap",
        }}
      >
        <span style={{ fontSize: "0.65rem", fontWeight: 600, color: "var(--text-3)", letterSpacing: "0.04em", textTransform: "uppercase" }}>
          Verdict
        </span>
        <span
          style={{
            fontSize: "0.8rem",
            fontWeight: 700,
            color: verdictColor,
            background: verdictBg,
            padding: "0.2rem 0.55rem",
            borderRadius: 6,
            border: `1px solid ${a.pass ? "rgba(4, 120, 87, 0.22)" : "rgba(185, 28, 28, 0.2)"}`,
          }}
        >
          {a.pass ? "Pass" : "Fail"}
        </span>
      </div>

      {hasRationale && (
        <div>
          <div style={{ fontSize: "0.65rem", fontWeight: 600, color: "var(--text-3)", marginBottom: "0.3rem", letterSpacing: "0.04em", textTransform: "uppercase" }}>
            {a.pass ? "Why it passed" : "Why it failed"}
          </div>
          <p style={{ margin: 0, fontSize: "0.76rem", color: "var(--text-2)", lineHeight: 1.55 }}>{a.reason}</p>
        </div>
      )}
    </div>
  );
}
202
+
203
/**
 * Chooses how to render an assertion result: `llm-rubric` assertions get the
 * detailed judgment card, every other type gets the compact chip.
 */
function AssertionBlock({ a }: { a: AssertionResult }) {
  const Renderer = a.type === "llm-rubric" ? LlmRubricJudgmentCard : AssertionChip;
  return <Renderer a={a} />;
}
207
+
208
/**
 * Banner describing how `llm-rubric` assertions were handled for a run.
 *
 * Three visual states are derived from `meta`:
 * - active: `willEvaluateRubrics` is truthy (green palette);
 * - misconfigured: rubric assertions exist but will not be evaluated (amber palette);
 * - otherwise a neutral palette.
 * The trailing hint paragraph is omitted when the suite has no rubric assertions.
 */
function JudgeRunSummary({ meta }: { meta: SuiteJudgeMeta }) {
  const isActive = meta.willEvaluateRubrics;
  const isMisconfigured = meta.rubricAssertionCount > 0 && !meta.willEvaluateRubrics;
  const hasNoRubrics = meta.rubricAssertionCount === 0;

  let palette: { bg: string; border: string; title: string; titleColor: string };
  if (isActive) {
    palette = {
      bg: "var(--green-subtle)",
      border: "rgba(4, 120, 87, 0.22)",
      title: "Rubric judge: active",
      titleColor: "var(--green)",
    };
  } else if (isMisconfigured) {
    palette = {
      bg: "var(--amber-subtle)",
      border: "rgba(180, 83, 9, 0.28)",
      title: "Rubric judge: not calling an LLM",
      titleColor: "var(--amber)",
    };
  } else {
    palette = {
      bg: "var(--surface-muted)",
      border: "var(--border)",
      title: "llm-rubric",
      titleColor: "var(--text-2)",
    };
  }

  const inlineCode: React.CSSProperties = { fontSize: "0.68rem", fontFamily: "var(--font-mono)" };
  const hint = isActive
    ? 'Confirm in the run log: each rubric should show "→ Judge LLM" before "← Judge".'
    : 'Fix judge settings (Secrets + Judge tab), then re-run. Assertions may show "No judge provider configured".';

  return (
    <div
      style={{
        padding: "0.85rem 1.1rem",
        borderRadius: "var(--r-lg)",
        background: palette.bg,
        border: `1px solid ${palette.border}`,
      }}
    >
      <div
        style={{
          fontSize: "0.72rem",
          fontWeight: 700,
          color: "var(--text-3)",
          letterSpacing: "0.06em",
          textTransform: "uppercase",
          marginBottom: "0.35rem",
        }}
      >
        {palette.title}
      </div>
      <p style={{ margin: "0 0 0.5rem", fontSize: "0.84rem", color: "var(--text-1)", lineHeight: 1.55, fontWeight: 600 }}>
        {meta.summary}
      </p>
      <div style={{ fontSize: "0.72rem", color: "var(--text-3)", lineHeight: 1.5 }}>
        <span style={{ fontWeight: 600, color: palette.titleColor }}>{meta.rubricAssertionCount}</span>
        {" · "}
        <code style={inlineCode}>llm-rubric</code> assertion(s) in YAML · Judge mode: <code style={inlineCode}>{meta.judgeMode}</code>
        {meta.judgeLabel ? (
          <>
            {" · "}
            Backend: <code style={inlineCode}>{meta.judgeLabel}</code>
          </>
        ) : null}
      </div>
      {!hasNoRubrics && (
        <p style={{ margin: "0.55rem 0 0", fontSize: "0.72rem", color: "var(--text-3)", lineHeight: 1.45 }}>
          {hint}
        </p>
      )}
    </div>
  );
}
271
+
272
/**
 * Test matrix: one row per test case, one column per provider/model taken from
 * `summary`. Clicking a row toggles an expanded detail row that shows, per
 * provider, the latency, output token count, raw output and assertion results
 * (or the error message when the provider call failed).
 *
 * @param cases   - Test case results; each carries its prompt, vars and per-provider results.
 * @param summary - Provider summaries; their order defines the column order.
 */
function ResultsTable({ cases, summary }: { cases: TestCaseResult[]; summary: ProviderSummary[] }) {
  const [expandedCase, setExpandedCase] = useState<number | null>(null);
  // Keep provider and model alongside the joined key so the header does not have
  // to split the key on "/" (which would mis-split a provider name containing "/").
  const columns = summary.map((s) => ({ key: `${s.provider}/${s.model}`, provider: s.provider, model: s.model }));

  return (
    <div style={{
      background: "var(--surface)", border: "1px solid var(--border)",
      borderRadius: "var(--r-xl)", overflow: "hidden", boxShadow: "var(--shadow-sm)",
    }}>
      <div style={{ padding: "0.75rem 1.2rem", borderBottom: "1px solid var(--border)", fontWeight: 600, fontSize: "0.8125rem", letterSpacing: "-0.01em", color: "var(--text-1)", background: "var(--surface-subtle)" }}>
        Test matrix
      </div>
      <div style={{ overflowX: "auto" }}>
        <table style={{ width: "100%", borderCollapse: "collapse", fontSize: "0.82rem" }}>
          <thead>
            <tr style={{ background: "var(--surface-subtle)" }}>
              <th style={{ padding: "0.6rem 1rem", textAlign: "left", color: "var(--text-3)", fontWeight: 600, fontSize: "0.72rem", textTransform: "uppercase", letterSpacing: "0.04em", borderBottom: "1px solid var(--border)", minWidth: 220 }}>
                Prompt / Vars
              </th>
              {columns.map((col) => (
                <th key={col.key} style={{ padding: "0.6rem 1rem", textAlign: "center", color: "var(--text-3)", fontWeight: 600, fontSize: "0.72rem", textTransform: "uppercase", letterSpacing: "0.04em", borderBottom: "1px solid var(--border)", minWidth: 120, whiteSpace: "nowrap" }}>
                  {col.provider}<br />
                  <span style={{ fontWeight: 400, fontFamily: "monospace" }}>{col.model}</span>
                </th>
              ))}
            </tr>
          </thead>
          <tbody>
            {cases.map((c, i) => {
              const isExpanded = expandedCase === i;
              const varStr = Object.entries(c.vars).map(([k, v]) => `${k}=${v}`).join(", ");
              // Index this case's results once; the first result for a key wins,
              // matching what a linear `find` would have returned.
              const resultByKey = new Map<string, (typeof c.providerResults)[number]>();
              for (const r of c.providerResults) {
                const key = `${r.provider}/${r.model}`;
                if (!resultByKey.has(key)) resultByKey.set(key, r);
              }
              return (
                <React.Fragment key={i}>
                  <tr
                    style={{ borderBottom: "1px solid var(--border)", cursor: "pointer", transition: "background 0.1s" }}
                    onClick={() => setExpandedCase(isExpanded ? null : i)}
                    onMouseEnter={(e) => (e.currentTarget.style.background = "var(--surface-hover)")}
                    onMouseLeave={(e) => (e.currentTarget.style.background = "")}
                  >
                    <td style={{ padding: "0.65rem 1rem" }}>
                      <div style={{ color: "var(--text-1)", fontSize: "0.8rem", maxWidth: 280, overflow: "hidden", textOverflow: "ellipsis", whiteSpace: "nowrap" }} title={c.prompt}>
                        {c.prompt}
                      </div>
                      {varStr && (
                        <div style={{ color: "var(--text-3)", fontSize: "0.72rem", marginTop: "0.15rem" }}>{varStr}</div>
                      )}
                    </td>
                    {columns.map(({ key }) => {
                      const pr = resultByKey.get(key);
                      if (!pr) return <td key={key} style={{ padding: "0.65rem 1rem", textAlign: "center", color: "var(--text-3)" }}>—</td>;
                      return (
                        <td key={key} style={{ padding: "0.65rem 1rem", textAlign: "center" }}>
                          {pr.error ? (
                            <span style={{ color: "var(--red)", fontSize: "0.72rem" }}>Error</span>
                          ) : (
                            <div style={{ display: "flex", flexDirection: "column", alignItems: "center", gap: "0.25rem" }}>
                              <PassFail pass={pr.pass} />
                              {pr.assertions.length > 0 && (
                                <span style={{ fontSize: "0.7rem", color: "var(--text-3)" }}>
                                  {pr.assertions.filter((a) => a.pass).length}/{pr.assertions.length}
                                </span>
                              )}
                            </div>
                          )}
                        </td>
                      );
                    })}
                  </tr>
                  {isExpanded && (
                    <tr style={{ borderBottom: "1px solid var(--border)", background: "var(--surface-subtle)" }}>
                      <td colSpan={columns.length + 1} style={{ padding: "0.75rem 1rem" }}>
                        <div style={{ display: "grid", gridTemplateColumns: `repeat(${columns.length}, 1fr)`, gap: "1rem" }}>
                          {columns.map(({ key }) => {
                            const pr = resultByKey.get(key);
                            if (!pr) return <div key={key} />;
                            return (
                              <div key={key} style={{ display: "flex", flexDirection: "column", gap: "0.4rem" }}>
                                <div style={{ fontWeight: 600, fontSize: "0.75rem", color: "var(--text-2)", textTransform: "uppercase", letterSpacing: "0.04em" }}>{key}</div>
                                {pr.error ? (
                                  <div style={{ color: "var(--red)", fontSize: "0.78rem" }}>{pr.error}</div>
                                ) : (
                                  <>
                                    <div style={{ color: "var(--text-3)", fontSize: "0.72rem" }}>
                                      {/* The separator belongs to the token count so no dangling "·" is left when there are no tokens. */}
                                      {pr.latencyMs}ms{pr.outputTokens > 0 ? ` · ${pr.outputTokens} tokens` : ""}
                                    </div>
                                    <div style={{ fontSize: "0.78rem", color: "var(--text-2)", maxHeight: 120, overflowY: "auto", lineHeight: 1.5, whiteSpace: "pre-wrap", wordBreak: "break-word" }}>
                                      {pr.output}
                                    </div>
                                    {pr.assertions.length > 0 && (
                                      <div style={{ display: "flex", flexDirection: "column", gap: "0.5rem", marginTop: "0.25rem", paddingTop: "0.25rem", borderTop: "1px solid var(--border)" }}>
                                        {pr.assertions.map((a, ai) => <AssertionBlock key={ai} a={a} />)}
                                      </div>
                                    )}
                                  </>
                                )}
                              </div>
                            );
                          })}
                        </div>
                      </td>
                    </tr>
                  )}
                </React.Fragment>
              );
            })}
          </tbody>
        </table>
      </div>
    </div>
  );
}
383
+
384
+ // ─── Run target (what the suite executes against) ─────────────────────────────
385
+
386
/**
 * Card summarising what a suite run will execute against: a chip per enabled
 * model instance and a one-line description of the judge used for `llm-rubric`.
 *
 * @param instances      - All configured instances; only those with `enabled` set are listed.
 * @param secrets        - Passed with `judge` to `describeJudgeForUi` to build the judge line.
 * @param judge          - Judge settings to describe.
 * @param onOpenSettings - When provided, a "Change in Settings" button invoking it is shown.
 */
function RunTargetBanner({
  instances,
  secrets,
  judge,
  onOpenSettings,
}: {
  instances: LLMInstance[];
  secrets: SecretsMap;
  judge: JudgeSettings;
  onOpenSettings?: () => void;
}) {
  const activeInstances = instances.filter((inst) => inst.enabled);
  const judgeDescription = describeJudgeForUi(judge, secrets);

  // The two section labels differ only in their bottom margin.
  const sectionLabel = (marginBottom: string): React.CSSProperties => ({
    fontSize: "0.72rem",
    fontWeight: 600,
    color: "var(--text-3)",
    marginBottom,
    letterSpacing: "0.03em",
  });

  const settingsButton = onOpenSettings && (
    <button
      type="button"
      onClick={onOpenSettings}
      style={{
        padding: "0.25rem 0.65rem",
        borderRadius: 6,
        border: "1px solid var(--border)",
        background: "var(--surface)",
        color: "var(--text-2)",
        fontSize: "0.75rem",
        fontWeight: 600,
        cursor: "pointer",
        fontFamily: "inherit",
      }}
    >
      Change in Settings
    </button>
  );

  const modelChips =
    activeInstances.length === 0 ? (
      <span style={{ fontSize: "0.8125rem", color: "var(--text-2)" }}>No models enabled — enable at least one in Settings.</span>
    ) : (
      <div style={{ display: "flex", flexWrap: "wrap", gap: "0.35rem" }}>
        {activeInstances.map((inst) => {
          const { color, border } = providerUi(inst.provider);
          return (
            <span
              key={inst.id}
              style={{
                padding: "0.28rem 0.65rem",
                borderRadius: 8,
                fontSize: "0.72rem",
                fontWeight: 600,
                background: "var(--surface-muted)",
                border: `1px solid ${border}`,
                color,
                whiteSpace: "nowrap",
                boxShadow: "var(--shadow-xs)",
              }}
            >
              {inst.provider}
              <span style={{ color: "var(--text-3)", fontWeight: 500, margin: "0 0.25rem" }}>·</span>
              <span style={{ fontFamily: "var(--font-mono)", fontWeight: 500, fontSize: "0.68rem" }}>{inst.model}</span>
            </span>
          );
        })}
      </div>
    );

  return (
    <div
      style={{
        background: "var(--surface)",
        border: "1px solid var(--border)",
        borderRadius: "var(--r-xl)",
        overflow: "hidden",
        boxShadow: "var(--shadow-sm)",
      }}
    >
      <div
        style={{
          padding: "0.65rem 1.15rem",
          borderBottom: "1px solid var(--border)",
          background: "var(--surface-subtle)",
          display: "flex",
          alignItems: "center",
          justifyContent: "space-between",
          gap: "0.75rem",
          flexWrap: "wrap",
        }}
      >
        <span
          style={{
            fontSize: "0.7rem",
            fontWeight: 600,
            color: "var(--text-3)",
            letterSpacing: "0.06em",
            textTransform: "uppercase",
          }}
        >
          Run target
        </span>
        {settingsButton}
      </div>
      <div style={{ padding: "0.9rem 1.15rem", display: "flex", flexDirection: "column", gap: "0.85rem" }}>
        <div>
          <div style={sectionLabel("0.45rem")}>
            Enabled models
          </div>
          {modelChips}
        </div>
        <div>
          <div style={sectionLabel("0.35rem")}>
            Judge (llm-rubric)
          </div>
          <p style={{ margin: 0, fontSize: "0.8125rem", color: "var(--text-2)", lineHeight: 1.55 }}>{judgeDescription}</p>
        </div>
      </div>
    </div>
  );
}
498
+
499
+ // ─── Main SuitePanel ──────────────────────────────────────────────────────────
500
+
501
/** Props accepted by `SuitePanel`. */
interface SuitePanelProps {
  /** Configured model instances; the panel runs against those with `enabled` set. */
  instances: LLMInstance[];
  /** Secrets map handed to the credential resolvers when building the API request. */
  secrets: SecretsMap;
  /** Judge settings, shown in the run-target banner and sent with the run request. */
  judge: JudgeSettings;
  /** Optional callback forwarded to the run-target banner's "Change in Settings" button. */
  onOpenSettings?: () => void;
}
507
+
508
/**
 * Body of a non-streaming (JSON) response from `/api/suite`: the suite result,
 * optionally accompanied by the run log lines and judge metadata.
 */
type SuiteApiResponse = SuiteResult & {
  runLog?: string[];
  judgeMeta?: SuiteJudgeMeta;
};
509
+
510
+ export function SuitePanel({ instances, secrets, judge, onOpenSettings }: SuitePanelProps) {
511
+ const [yaml, setYaml] = useState(EXAMPLE_YAML);
512
+ const [loading, setLoading] = useState(false);
513
+ const [result, setResult] = useState<SuiteResult | null>(null);
514
+ const [runLog, setRunLog] = useState<string[]>([]);
515
+ const [judgeMeta, setJudgeMeta] = useState<SuiteJudgeMeta | null>(null);
516
+ const [error, setError] = useState<string | null>(null);
517
+ const [historyVersion, setHistoryVersion] = useState(0);
518
+ const logPreRef = useRef<HTMLPreElement>(null);
519
+
520
+ const historyEntries = useMemo(() => loadSuiteRunHistory(), [historyVersion]);
521
+
522
+ useEffect(() => {
523
+ if (!loading || !logPreRef.current) return;
524
+ logPreRef.current.scrollTop = logPreRef.current.scrollHeight;
525
+ }, [runLog, loading]);
526
+
527
+ const enabled = instances.filter((i) => i.enabled);
528
+ const canRun = enabled.length > 0 && yaml.trim().length > 0 && !loading;
529
+
530
+ const restoreHistoryEntry = (e: SuiteRunHistoryEntry) => {
531
+ setYaml(e.yaml);
532
+ setResult(e.result);
533
+ setRunLog(e.runLog);
534
+ setJudgeMeta(e.judgeMeta);
535
+ setError(null);
536
+ };
537
+
538
+ const run = async () => {
539
+ if (!canRun) return;
540
+ setLoading(true);
541
+ setError(null);
542
+ setResult(null);
543
+ setRunLog([]);
544
+ setJudgeMeta(null);
545
+ try {
546
+ const resolved = resolveInstancesForApi(instances, secrets);
547
+ const judgePayload = buildJudgeApiPayload(judge, secrets);
548
+ const res = await fetch("/api/suite", {
549
+ method: "POST",
550
+ headers: { "Content-Type": "application/json", Accept: "text/event-stream" },
551
+ body: JSON.stringify({
552
+ yaml,
553
+ instances: resolved,
554
+ judge: judgePayload,
555
+ stream: true,
556
+ }),
557
+ });
558
+
559
+ const ct = res.headers.get("content-type") ?? "";
560
+
561
+ if (!res.ok) {
562
+ const body = (await res.json().catch(() => ({}))) as { error?: string };
563
+ throw new Error(body.error ?? res.statusText);
564
+ }
565
+
566
+ if (!ct.includes("text/event-stream")) {
567
+ const body = (await res.json()) as SuiteApiResponse | { error?: string };
568
+ if ("error" in body && body.error) throw new Error(body.error);
569
+ const { runLog: lines, judgeMeta: jm, cases, summary } = body as SuiteApiResponse;
570
+ if (!Array.isArray(cases) || !Array.isArray(summary)) throw new Error("Invalid suite response");
571
+ const suiteResult = { cases, summary };
572
+ const logLines = Array.isArray(lines) ? lines : [];
573
+ const meta = jm ?? null;
574
+ setResult(suiteResult);
575
+ setRunLog(logLines);
576
+ setJudgeMeta(meta);
577
+ appendSuiteRunHistory({
578
+ yaml,
579
+ result: suiteResult,
580
+ runLog: logLines,
581
+ judgeMeta: meta,
582
+ ranAt: new Date().toISOString(),
583
+ });
584
+ setHistoryVersion((v) => v + 1);
585
+ } else {
586
+ const out = await consumeSuiteSseStream(res, (line) => {
587
+ setRunLog((prev) => [...prev, line]);
588
+ });
589
+ const suiteResult = { cases: out.result.cases, summary: out.result.summary };
590
+ const meta = out.judgeMeta;
591
+ setResult(suiteResult);
592
+ setRunLog(out.runLog);
593
+ setJudgeMeta(meta);
594
+ appendSuiteRunHistory({
595
+ yaml,
596
+ result: suiteResult,
597
+ runLog: out.runLog,
598
+ judgeMeta: meta,
599
+ ranAt: new Date().toISOString(),
600
+ });
601
+ setHistoryVersion((v) => v + 1);
602
+ }
603
+ } catch (err) {
604
+ setError(err instanceof Error ? err.message : String(err));
605
+ } finally {
606
+ setLoading(false);
607
+ }
608
+ };
609
+
610
+ const allPassed = result ? result.summary.every((s) => s.failed === 0) : null;
611
+
612
+ return (
613
+ <div style={{ display: "flex", flexDirection: "column", gap: "1.35rem" }}>
614
+ <RunTargetBanner instances={instances} secrets={secrets} judge={judge} onOpenSettings={onOpenSettings} />
615
+
616
+ <div
617
+ style={{
618
+ background: "var(--surface)",
619
+ border: "1px solid var(--border)",
620
+ borderRadius: "var(--r-xl)",
621
+ overflow: "hidden",
622
+ boxShadow: "var(--shadow-sm)",
623
+ }}
624
+ >
625
+ <div
626
+ style={{
627
+ padding: "0.65rem 1.1rem",
628
+ borderBottom: "1px solid var(--border)",
629
+ background: "var(--surface-subtle)",
630
+ fontWeight: 600,
631
+ fontSize: "0.72rem",
632
+ letterSpacing: "0.06em",
633
+ textTransform: "uppercase",
634
+ color: "var(--text-3)",
635
+ }}
636
+ >
637
+ Recent suite runs
638
+ </div>
639
+ {historyEntries.length === 0 ? (
640
+ <div style={{ padding: "1.1rem 1.15rem", color: "var(--text-3)", fontSize: "0.8125rem", margin: 0 }}>
641
+ No saved runs yet. Each successful suite run is stored in this browser (up to 15).
642
+ </div>
643
+ ) : (
644
+ <ul style={{ listStyle: "none", margin: 0, padding: 0 }}>
645
+ {historyEntries.map((entry, idx) => (
646
+ <li
647
+ key={entry.id}
648
+ style={{
649
+ borderBottom: idx < historyEntries.length - 1 ? "1px solid var(--border)" : "none",
650
+ }}
651
+ >
652
+ <button
653
+ type="button"
654
+ onClick={() => restoreHistoryEntry(entry)}
655
+ style={{
656
+ width: "100%",
657
+ textAlign: "left",
658
+ padding: "0.75rem 1.1rem",
659
+ border: "none",
660
+ background: "transparent",
661
+ cursor: "pointer",
662
+ fontFamily: "inherit",
663
+ transition: "background 0.12s ease",
664
+ }}
665
+ onMouseEnter={(e) => {
666
+ e.currentTarget.style.background = "var(--surface-subtle)";
667
+ }}
668
+ onMouseLeave={(e) => {
669
+ e.currentTarget.style.background = "transparent";
670
+ }}
671
+ >
672
+ <div style={{ fontSize: "0.72rem", color: "var(--text-3)", marginBottom: "0.25rem" }}>
673
+ {new Date(entry.ranAt).toLocaleString()} · {entry.result.cases.length} case
674
+ {entry.result.cases.length !== 1 ? "s" : ""} · {entry.result.summary.length} model
675
+ {entry.result.summary.length !== 1 ? "s" : ""}
676
+ </div>
677
+ <div
678
+ style={{
679
+ fontSize: "0.84rem",
680
+ color: "var(--text-1)",
681
+ fontWeight: 500,
682
+ lineHeight: 1.45,
683
+ fontFamily: "var(--font-mono)",
684
+ }}
685
+ >
686
+ {entry.yamlPreview}
687
+ </div>
688
+ </button>
689
+ </li>
690
+ ))}
691
+ </ul>
692
+ )}
693
+ </div>
694
+
695
+ <div style={{ background: "var(--surface)", border: "1px solid var(--border)", borderRadius: "var(--r-2xl)", overflow: "hidden", boxShadow: "var(--shadow-md)" }}>
696
+ <div style={{ padding: "0.7rem 1.2rem", borderBottom: "1px solid var(--border)", background: "var(--surface-subtle)", display: "flex", justifyContent: "space-between", alignItems: "center", flexWrap: "wrap", gap: "0.65rem" }}>
697
+ <div style={{ display: "flex", flexDirection: "column", gap: "0.15rem" }}>
698
+ <span style={{ fontSize: "0.7rem", fontWeight: 600, color: "var(--text-3)", letterSpacing: "0.06em", textTransform: "uppercase" }}>Suite</span>
699
+ <span style={{ fontSize: "0.875rem", fontWeight: 600, color: "var(--text-1)", letterSpacing: "-0.02em" }}>Eval configuration (YAML)</span>
700
+ </div>
701
+ <div style={{ display: "flex", alignItems: "center", gap: "0.75rem" }}>
702
+ {enabled.length === 0 && (
703
+ <span style={{ fontSize: "0.78rem", color: "var(--text-3)", fontWeight: 500 }}>Enable models in Settings</span>
704
+ )}
705
+ <button
706
+ type="button"
707
+ onClick={run}
708
+ disabled={!canRun}
709
+ style={{
710
+ padding: "0.5rem 1.2rem",
711
+ borderRadius: "var(--r-md)",
712
+ border: "none",
713
+ background: canRun ? "var(--accent)" : "var(--surface-hover)",
714
+ color: canRun ? "#fff" : "var(--text-3)",
715
+ fontWeight: 600,
716
+ fontSize: "0.875rem",
717
+ cursor: canRun ? "pointer" : "not-allowed",
718
+ fontFamily: "inherit",
719
+ display: "flex",
720
+ alignItems: "center",
721
+ gap: "0.45rem",
722
+ boxShadow: canRun ? "0 2px 8px rgba(30, 64, 175, 0.28)" : "none",
723
+ transition: "background 0.15s",
724
+ }}
725
+ >
726
+ {loading ? (
727
+ <>
728
+ <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2.5" strokeLinecap="round" style={{ animation: "spin 1s linear infinite" }}>
729
+ <path d="M12 2v4M12 18v4M4.93 4.93l2.83 2.83M16.24 16.24l2.83 2.83M2 12h4M18 12h4M4.93 19.07l2.83-2.83M16.24 7.76l2.83-2.83" />
730
+ </svg>
731
+ Running…
732
+ </>
733
+ ) : "Run Suite"}
734
+ </button>
735
+ </div>
736
+ </div>
737
+ <textarea
738
+ value={yaml}
739
+ onChange={(e) => setYaml(e.target.value)}
740
+ spellCheck={false}
741
+ style={{
742
+ display: "block",
743
+ width: "100%",
744
+ minHeight: 280,
745
+ background: "var(--surface-muted)",
746
+ border: "none",
747
+ outline: "none",
748
+ color: "var(--text-1)",
749
+ fontSize: "0.8125rem",
750
+ lineHeight: 1.65,
751
+ padding: "1.1rem 1.25rem",
752
+ resize: "vertical",
753
+ fontFamily: "var(--font-mono)",
754
+ boxSizing: "border-box",
755
+ }}
756
+ />
757
+ </div>
758
+
759
+ {error && (
760
+ <div style={{ background: "var(--red-subtle)", border: "1px solid rgba(185, 28, 28, 0.2)", color: "var(--red)", borderRadius: "var(--r-lg)", padding: "0.85rem 1.1rem", fontSize: "0.875rem", lineHeight: 1.55, fontWeight: 500 }}>
761
+ {error}
762
+ </div>
763
+ )}
764
+
765
+ {loading && (
766
+ <div
767
+ style={{
768
+ background: "var(--accent-subtle)",
769
+ border: "1px solid rgba(30, 64, 175, 0.15)",
770
+ borderRadius: "var(--r-lg)",
771
+ padding: "0.85rem 1.1rem",
772
+ fontSize: "0.8125rem",
773
+ lineHeight: 1.55,
774
+ color: "var(--accent-text)",
775
+ fontWeight: 500,
776
+ }}
777
+ >
778
+ Running suite — logs stream live below as each LLM and judge request starts and finishes.
779
+ </div>
780
+ )}
781
+
782
+ {(loading || runLog.length > 0) && (
783
+ <div
784
+ style={{
785
+ background: "var(--surface)",
786
+ border: "1px solid var(--border)",
787
+ borderRadius: "var(--r-xl)",
788
+ overflow: "hidden",
789
+ boxShadow: "var(--shadow-sm)",
790
+ }}
791
+ >
792
+ <div
793
+ style={{
794
+ padding: "0.65rem 1.1rem",
795
+ borderBottom: "1px solid var(--border)",
796
+ background: "var(--surface-subtle)",
797
+ fontWeight: 600,
798
+ fontSize: "0.8125rem",
799
+ letterSpacing: "-0.01em",
800
+ color: "var(--text-1)",
801
+ }}
802
+ >
803
+ {loading ? "Live run log" : "Run log"}
804
+ </div>
805
+ <p
806
+ style={{
807
+ margin: 0,
808
+ padding: "0.45rem 1.1rem 0",
809
+ fontSize: "0.68rem",
810
+ color: "var(--text-3)",
811
+ lineHeight: 1.45,
812
+ }}
813
+ >
814
+ Needs a Node server (not static HTML export). On Vercel, use the Node runtime and a long enough function
815
+ timeout (this route sets maxDuration to 300s). If hosts buffer SSE, disable buffering or run very long suites
816
+ on a dedicated server.
817
+ </p>
818
+ <pre
819
+ ref={logPreRef}
820
+ style={{
821
+ margin: 0,
822
+ padding: "0.85rem 1.1rem",
823
+ maxHeight: 280,
824
+ overflow: "auto",
825
+ fontSize: "0.72rem",
826
+ lineHeight: 1.55,
827
+ fontFamily: "var(--font-mono)",
828
+ color: "var(--text-2)",
829
+ background: "var(--surface-muted)",
830
+ whiteSpace: "pre-wrap",
831
+ wordBreak: "break-word",
832
+ }}
833
+ >
834
+ {runLog.length > 0 ? runLog.join("\n") : loading ? "Connecting…" : ""}
835
+ </pre>
836
+ </div>
837
+ )}
838
+
839
+ {/* Results */}
840
+ {result && (
841
+ <div style={{ display: "flex", flexDirection: "column", gap: "1rem" }}>
842
+ {judgeMeta && <JudgeRunSummary meta={judgeMeta} />}
843
+
844
+ {/* Status banner */}
845
+ <div style={{
846
+ display: "flex", alignItems: "center", gap: "0.75rem",
847
+ padding: "0.85rem 1.15rem", borderRadius: "var(--r-lg)",
848
+ background: allPassed ? "var(--green-subtle)" : "var(--red-subtle)",
849
+ border: `1px solid ${allPassed ? "rgba(4, 120, 87, 0.2)" : "rgba(185, 28, 28, 0.2)"}`,
850
+ }}>
851
+ <span style={{ fontSize: "1.05rem", fontWeight: 700 }}>{allPassed ? "✓" : "✗"}</span>
852
+ <span style={{ fontWeight: 600, color: allPassed ? "var(--green)" : "var(--red)", fontSize: "0.9rem" }}>
853
+ {allPassed
854
+ ? "All assertions passed"
855
+ : `${result.summary.reduce((s, p) => s + p.failed, 0)} assertion(s) failed`}
856
+ </span>
857
+ <span style={{ color: "var(--text-3)", fontSize: "0.78rem", marginLeft: "auto" }}>
858
+ {result.cases.length} test case{result.cases.length !== 1 ? "s" : ""} · {result.summary.length} provider{result.summary.length !== 1 ? "s" : ""}
859
+ </span>
860
+ </div>
861
+
862
+ <SummaryTable summary={result.summary} />
863
+ <ResultsTable cases={result.cases} summary={result.summary} />
864
+ </div>
865
+ )}
866
+
867
+ {/* Empty state */}
868
+ {!result && !error && !loading && (
869
+ <div style={{ textAlign: "center", padding: "2.25rem 1rem", color: "var(--text-2)", fontSize: "0.875rem", lineHeight: 1.65, maxWidth: 440, margin: "0 auto" }}>
870
+ Define prompts and assertions in YAML, then run against your enabled models. Expand rows in the results table to inspect outputs and rubric checks.
871
+ </div>
872
+ )}
873
+ </div>
874
+ );
875
+ }