@darkrishabh/bench-ai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +333 -0
- package/dist/cli/app.d.ts +11 -0
- package/dist/cli/app.d.ts.map +1 -0
- package/dist/cli/app.js +48 -0
- package/dist/cli/app.js.map +1 -0
- package/dist/cli/components/DiffView.d.ts +5 -0
- package/dist/cli/components/DiffView.d.ts.map +1 -0
- package/dist/cli/components/DiffView.js +14 -0
- package/dist/cli/components/DiffView.js.map +1 -0
- package/dist/cli/components/EvalView.d.ts +6 -0
- package/dist/cli/components/EvalView.d.ts.map +1 -0
- package/dist/cli/components/EvalView.js +82 -0
- package/dist/cli/components/EvalView.js.map +1 -0
- package/dist/cli/components/Spinner.d.ts +4 -0
- package/dist/cli/components/Spinner.d.ts.map +1 -0
- package/dist/cli/components/Spinner.js +15 -0
- package/dist/cli/components/Spinner.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +117 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/run-command.d.ts +11 -0
- package/dist/cli/run-command.d.ts.map +1 -0
- package/dist/cli/run-command.js +119 -0
- package/dist/cli/run-command.js.map +1 -0
- package/dist/engine/cost.d.ts +3 -0
- package/dist/engine/cost.d.ts.map +1 -0
- package/dist/engine/cost.js +52 -0
- package/dist/engine/cost.js.map +1 -0
- package/dist/engine/diff.d.ts +6 -0
- package/dist/engine/diff.d.ts.map +1 -0
- package/dist/engine/diff.js +43 -0
- package/dist/engine/diff.js.map +1 -0
- package/dist/engine/eval.d.ts +14 -0
- package/dist/engine/eval.d.ts.map +1 -0
- package/dist/engine/eval.js +194 -0
- package/dist/engine/eval.js.map +1 -0
- package/dist/engine/index.d.ts +15 -0
- package/dist/engine/index.d.ts.map +1 -0
- package/dist/engine/index.js +10 -0
- package/dist/engine/index.js.map +1 -0
- package/dist/engine/providers/base.d.ts +7 -0
- package/dist/engine/providers/base.d.ts.map +1 -0
- package/dist/engine/providers/base.js +2 -0
- package/dist/engine/providers/base.js.map +1 -0
- package/dist/engine/providers/claude.d.ts +15 -0
- package/dist/engine/providers/claude.d.ts.map +1 -0
- package/dist/engine/providers/claude.js +53 -0
- package/dist/engine/providers/claude.js.map +1 -0
- package/dist/engine/providers/minimax.d.ts +16 -0
- package/dist/engine/providers/minimax.d.ts.map +1 -0
- package/dist/engine/providers/minimax.js +67 -0
- package/dist/engine/providers/minimax.js.map +1 -0
- package/dist/engine/providers/ollama.d.ts +14 -0
- package/dist/engine/providers/ollama.d.ts.map +1 -0
- package/dist/engine/providers/ollama.js +60 -0
- package/dist/engine/providers/ollama.js.map +1 -0
- package/dist/engine/providers/openai-compatible.d.ts +19 -0
- package/dist/engine/providers/openai-compatible.d.ts.map +1 -0
- package/dist/engine/providers/openai-compatible.js +109 -0
- package/dist/engine/providers/openai-compatible.js.map +1 -0
- package/dist/engine/providers/subprocess.d.ts +55 -0
- package/dist/engine/providers/subprocess.d.ts.map +1 -0
- package/dist/engine/providers/subprocess.js +111 -0
- package/dist/engine/providers/subprocess.js.map +1 -0
- package/dist/engine/suite-loader.d.ts +11 -0
- package/dist/engine/suite-loader.d.ts.map +1 -0
- package/dist/engine/suite-loader.js +75 -0
- package/dist/engine/suite-loader.js.map +1 -0
- package/dist/engine/types.d.ts +104 -0
- package/dist/engine/types.d.ts.map +1 -0
- package/dist/engine/types.js +2 -0
- package/dist/engine/types.js.map +1 -0
- package/next-env.d.ts +6 -0
- package/next.config.ts +26 -0
- package/package.json +72 -0
- package/public/icon.svg +14 -0
- package/src/app/api/diff/route.ts +135 -0
- package/src/app/api/models/route.ts +96 -0
- package/src/app/api/suite/route.ts +314 -0
- package/src/app/globals.css +215 -0
- package/src/app/icon.svg +14 -0
- package/src/app/layout.tsx +44 -0
- package/src/app/opengraph-image.tsx +73 -0
- package/src/app/page.tsx +952 -0
- package/src/app/suite/layout.tsx +12 -0
- package/src/app/suite/page.tsx +206 -0
- package/src/app/twitter-image.tsx +1 -0
- package/src/components/BenchAiLogo.tsx +38 -0
- package/src/components/ComparePanel.tsx +643 -0
- package/src/components/ConfigPanel.tsx +809 -0
- package/src/components/MarkdownOutput.tsx +16 -0
- package/src/components/ModelResponseCard.tsx +313 -0
- package/src/components/QuickComparisonBar.tsx +184 -0
- package/src/components/ResponsesLineDiff.tsx +149 -0
- package/src/components/SettingsPanel.tsx +591 -0
- package/src/components/SuitePanel.tsx +875 -0
- package/src/lib/brand.ts +4 -0
- package/src/lib/config-yaml.ts +70 -0
- package/src/lib/consume-suite-sse.ts +70 -0
- package/src/lib/describe-judge.ts +23 -0
- package/src/lib/model-chip-palette.ts +9 -0
- package/src/lib/openai-model-list.ts +33 -0
- package/src/lib/provider-ui.ts +30 -0
- package/src/lib/resolve-credentials.ts +80 -0
- package/src/lib/run-history.ts +66 -0
- package/src/lib/simple-line-diff.ts +50 -0
- package/src/lib/storage.ts +100 -0
- package/src/lib/suite-judge-meta.ts +13 -0
- package/src/lib/suite-run-history.ts +81 -0
- package/src/types.ts +170 -0
- package/vercel.json +5 -0
|
@@ -0,0 +1,875 @@
|
|
|
1
|
+
"use client";
|
|
2
|
+
|
|
3
|
+
import React, { useState, useRef, useEffect, useMemo } from "react";
|
|
4
|
+
import type { SuiteResult, TestCaseResult, ProviderSummary, AssertionResult } from "@darkrishabh/bench-ai";
|
|
5
|
+
import type { JudgeSettings, LLMInstance, SecretsMap } from "../types";
|
|
6
|
+
import { formatCost } from "@darkrishabh/bench-ai";
|
|
7
|
+
import { buildJudgeApiPayload, resolveInstancesForApi } from "../lib/resolve-credentials";
|
|
8
|
+
import { providerUi } from "../lib/provider-ui";
|
|
9
|
+
import { describeJudgeForUi } from "../lib/describe-judge";
|
|
10
|
+
import type { SuiteJudgeMeta } from "../lib/suite-judge-meta";
|
|
11
|
+
import { consumeSuiteSseStream } from "../lib/consume-suite-sse";
|
|
12
|
+
import {
|
|
13
|
+
appendSuiteRunHistory,
|
|
14
|
+
loadSuiteRunHistory,
|
|
15
|
+
type SuiteRunHistoryEntry,
|
|
16
|
+
} from "../lib/suite-run-history";
|
|
17
|
+
|
|
18
|
+
// ─── Helpers ──────────────────────────────────────────────────────────────────
|
|
19
|
+
|
|
20
|
+
const EXAMPLE_YAML = `prompts:
|
|
21
|
+
- "Explain {{concept}} in one paragraph"
|
|
22
|
+
- "Write a {{language}} function that {{task}}"
|
|
23
|
+
|
|
24
|
+
tests:
|
|
25
|
+
- vars:
|
|
26
|
+
concept: "recursion"
|
|
27
|
+
language: "Python"
|
|
28
|
+
task: "reverses a linked list"
|
|
29
|
+
assert:
|
|
30
|
+
- type: contains
|
|
31
|
+
value: "base case"
|
|
32
|
+
- type: latency
|
|
33
|
+
threshold: 8000
|
|
34
|
+
- type: cost
|
|
35
|
+
threshold: 0.01
|
|
36
|
+
|
|
37
|
+
- vars:
|
|
38
|
+
concept: "gradient descent"
|
|
39
|
+
language: "TypeScript"
|
|
40
|
+
task: "debounces a function"
|
|
41
|
+
assert:
|
|
42
|
+
- type: llm-rubric
|
|
43
|
+
value: "explains it clearly without excessive jargon"
|
|
44
|
+
- type: latency
|
|
45
|
+
threshold: 8000
|
|
46
|
+
`;
|
|
47
|
+
|
|
48
|
+
/**
 * Maps a 0–1 score to the CSS color expression used to tint it.
 *
 * Thresholds are checked from highest to lowest: >= 0.8 is green, >= 0.5 is
 * amber (with a hex fallback inside the `var()`), anything else — including
 * NaN, which fails every comparison — is red.
 */
function scoreColor(score: number): string {
  const tiers: Array<[number, string]> = [
    [0.8, "var(--green)"],
    [0.5, "var(--amber, #f59e0b)"],
  ];
  const hit = tiers.find(([floor]) => score >= floor);
  return hit ? hit[1] : "var(--red)";
}
|
|
53
|
+
|
|
54
|
+
/**
 * Small pill that shows a 0–1 score as a rounded whole-number percentage,
 * tinted with the color returned by `scoreColor`.
 *
 * @param score Fractional score; rendered as `Math.round(score * 100)` followed by "%".
 */
function ScoreBadge({ score }: { score: number }) {
  const color = scoreColor(score);
  // `scoreColor` returns `var(--…)` expressions, so appending a hex alpha
  // suffix (e.g. `var(--green)18`) yields an invalid CSS value that the browser
  // discards. `color-mix()` applies the translucency to any color expression.
  // 9% and 21% correspond to the previously intended 0x18 and 0x35 alphas.
  const translucent = (percent: number) => `color-mix(in srgb, ${color} ${percent}%, transparent)`;

  return (
    <span
      style={{
        display: "inline-flex",
        alignItems: "center",
        gap: "0.3rem",
        padding: "0.15rem 0.5rem",
        borderRadius: 5,
        background: translucent(9),
        border: `1px solid ${translucent(21)}`,
        color,
        fontWeight: 700,
        fontSize: "0.78rem",
      }}
    >
      {Math.round(score * 100)}%
    </span>
  );
}
|
|
67
|
+
|
|
68
|
+
/**
 * Bold check mark (green) or cross (red) reflecting a boolean pass state.
 */
function PassFail({ pass }: { pass: boolean }) {
  const glyph = pass ? "✓" : "✗";
  const tone = pass ? "var(--green)" : "var(--red)";
  return <span style={{ color: tone, fontWeight: 700, fontSize: "0.82rem" }}>{glyph}</span>;
}
|
|
78
|
+
|
|
79
|
+
// ─── Summary table ────────────────────────────────────────────────────────────
|
|
80
|
+
|
|
81
|
+
/**
 * Card containing one table row per provider/model summary: score badge,
 * passed/failed counts, average latency and total cost.
 *
 * A total cost of exactly 0 is printed as "$0.00"; any other value goes
 * through `formatCost`.
 */
function SummaryTable({ summary }: { summary: ProviderSummary[] }) {
  const headings = ["Provider", "Model", "Score", "Passed", "Failed", "Avg Latency", "Total Cost"];

  const headingStyle: React.CSSProperties = {
    padding: "0.6rem 1rem",
    textAlign: "left",
    color: "var(--text-3)",
    fontWeight: 600,
    fontSize: "0.72rem",
    textTransform: "uppercase",
    letterSpacing: "0.04em",
    borderBottom: "1px solid var(--border)",
    whiteSpace: "nowrap",
  };
  // Every body cell shares the same padding; per-column tweaks are spread on top.
  const cell: React.CSSProperties = { padding: "0.65rem 1rem" };

  const rows = summary.map((s) => {
    const hasFailures = s.failed > 0;
    const cost = s.totalCostUsd === 0 ? "$0.00" : formatCost(s.totalCostUsd);
    return (
      <tr key={`${s.provider}/${s.model}`} style={{ borderBottom: "1px solid var(--border)" }}>
        <td style={{ ...cell, fontWeight: 600, color: "var(--text-1)" }}>{s.provider}</td>
        <td style={{ ...cell, color: "var(--text-2)", fontFamily: "monospace", fontSize: "0.78rem" }}>{s.model}</td>
        <td style={cell}><ScoreBadge score={s.score} /></td>
        <td style={{ ...cell, color: "var(--green)", fontWeight: 600 }}>{s.passed}</td>
        <td style={{ ...cell, color: hasFailures ? "var(--red)" : "var(--text-3)", fontWeight: hasFailures ? 600 : 400 }}>{s.failed}</td>
        <td style={{ ...cell, color: "var(--text-2)" }}>{s.avgLatencyMs.toLocaleString()}ms</td>
        <td style={{ ...cell, color: "var(--text-2)" }}>{cost}</td>
      </tr>
    );
  });

  return (
    <div
      style={{
        background: "var(--surface)",
        border: "1px solid var(--border)",
        borderRadius: "var(--r-xl)",
        overflow: "hidden",
        boxShadow: "var(--shadow-sm)",
      }}
    >
      <div
        style={{
          padding: "0.75rem 1.2rem",
          borderBottom: "1px solid var(--border)",
          fontWeight: 600,
          fontSize: "0.8125rem",
          letterSpacing: "-0.01em",
          color: "var(--text-1)",
          background: "var(--surface-subtle)",
        }}
      >
        Provider summary
      </div>
      <div style={{ overflowX: "auto" }}>
        <table style={{ width: "100%", borderCollapse: "collapse", fontSize: "0.82rem" }}>
          <thead>
            <tr style={{ background: "var(--surface-subtle)" }}>
              {headings.map((label) => (
                <th key={label} style={headingStyle}>
                  {label}
                </th>
              ))}
            </tr>
          </thead>
          <tbody>{rows}</tbody>
        </table>
      </div>
    </div>
  );
}
|
|
119
|
+
|
|
120
|
+
// ─── Results table (rows = cases, columns = providers) ────────────────────────
|
|
121
|
+
|
|
122
|
+
/**
 * Compact single-line rendering of an assertion: a pass/fail glyph, the
 * assertion type in brackets, and the reason (prefixed with an em dash) when
 * one is present.
 */
function AssertionChip({ a }: { a: AssertionResult }) {
  const { pass, type, reason } = a;
  const glyphColor = pass ? "var(--green)" : "var(--red)";
  const detail = reason ? ` — ${reason}` : "";

  return (
    <div style={{ display: "flex", alignItems: "flex-start", gap: "0.3rem", fontSize: "0.72rem" }}>
      <span style={{ color: glyphColor, flexShrink: 0 }}>{pass ? "✓" : "✗"}</span>
      <span style={{ color: "var(--text-3)" }}>[{type}]{detail}</span>
    </div>
  );
}
|
|
130
|
+
|
|
131
|
+
/**
 * Detailed card for a single `llm-rubric` assertion.
 *
 * Shows three sections: the rubric criterion (or a fallback notice when the
 * stored criterion is missing or blank), a Pass/Fail verdict pill, and the
 * assertion's `reason` under a "Why it passed" / "Why it failed" heading.
 */
function LlmRubricJudgmentCard({ a }: { a: AssertionResult }) {
  // `||` rather than `??`: a whitespace-only criterion trims to "" and must
  // also fall back to the notice.
  const criterion =
    a.rubricCriterion?.trim() ||
    "Criterion text was not stored (re-run the suite with the current app version).";

  const verdict = a.pass
    ? {
        label: "Pass",
        heading: "Why it passed",
        color: "var(--green)",
        bg: "var(--green-subtle)",
        edge: "rgba(4, 120, 87, 0.22)",
      }
    : {
        label: "Fail",
        heading: "Why it failed",
        color: "var(--red)",
        bg: "var(--red-subtle)",
        edge: "rgba(185, 28, 28, 0.2)",
      };

  // Shared look of the small grey captions; each use spreads its own extras.
  const caption: React.CSSProperties = {
    fontSize: "0.65rem",
    fontWeight: 600,
    color: "var(--text-3)",
  };

  return (
    <div
      style={{
        border: "1px solid var(--border)",
        borderRadius: "var(--r-md)",
        background: "var(--surface)",
        padding: "0.65rem 0.75rem",
        display: "flex",
        flexDirection: "column",
        gap: "0.55rem",
      }}
    >
      <div style={{ ...caption, fontWeight: 700, letterSpacing: "0.06em", textTransform: "uppercase" }}>
        LLM rubric (judge)
      </div>

      <div
        style={{
          padding: "0.5rem 0.6rem",
          borderRadius: 6,
          background: "var(--surface-muted)",
          border: "1px solid var(--border)",
        }}
      >
        <div style={{ ...caption, marginBottom: "0.25rem" }}>Rubric detail</div>
        <p style={{ margin: 0, fontSize: "0.78rem", color: "var(--text-1)", lineHeight: 1.5 }}>{criterion}</p>
      </div>

      <div style={{ display: "flex", alignItems: "center", gap: "0.5rem", flexWrap: "wrap" }}>
        <span style={{ ...caption, letterSpacing: "0.04em", textTransform: "uppercase" }}>
          Verdict
        </span>
        <span
          style={{
            fontSize: "0.8rem",
            fontWeight: 700,
            color: verdict.color,
            background: verdict.bg,
            padding: "0.2rem 0.55rem",
            borderRadius: 6,
            border: `1px solid ${verdict.edge}`,
          }}
        >
          {verdict.label}
        </span>
      </div>

      <div>
        <div style={{ ...caption, marginBottom: "0.3rem", letterSpacing: "0.04em", textTransform: "uppercase" }}>
          {verdict.heading}
        </div>
        <p style={{ margin: 0, fontSize: "0.76rem", color: "var(--text-2)", lineHeight: 1.55 }}>{a.reason}</p>
      </div>
    </div>
  );
}
|
|
202
|
+
|
|
203
|
+
/**
 * Chooses the renderer for an assertion: the rich judgment card for
 * `llm-rubric` assertions, the compact chip for every other type.
 */
function AssertionBlock({ a }: { a: AssertionResult }) {
  const Renderer = a.type === "llm-rubric" ? LlmRubricJudgmentCard : AssertionChip;
  return <Renderer a={a} />;
}
|
|
207
|
+
|
|
208
|
+
/**
 * Banner describing how `llm-rubric` assertions were handled for a run.
 *
 * Three visual states are derived from `meta`:
 *  - active:  `willEvaluateRubrics` is truthy (green);
 *  - missing: there are rubric assertions but they will not be evaluated (amber);
 *  - neutral: neither of the above.
 * A follow-up hint paragraph is shown whenever the rubric assertion count is
 * not zero.
 */
function JudgeRunSummary({ meta }: { meta: SuiteJudgeMeta }) {
  const { rubricAssertionCount, willEvaluateRubrics, judgeMode, judgeLabel, summary } = meta;
  const isMissing = rubricAssertionCount > 0 && !willEvaluateRubrics;
  const isIdle = rubricAssertionCount === 0;

  let palette: { bg: string; border: string; title: string; titleColor: string };
  if (willEvaluateRubrics) {
    palette = {
      bg: "var(--green-subtle)",
      border: "rgba(4, 120, 87, 0.22)",
      title: "Rubric judge: active",
      titleColor: "var(--green)",
    };
  } else if (isMissing) {
    palette = {
      bg: "var(--amber-subtle)",
      border: "rgba(180, 83, 9, 0.28)",
      title: "Rubric judge: not calling an LLM",
      titleColor: "var(--amber)",
    };
  } else {
    palette = {
      bg: "var(--surface-muted)",
      border: "var(--border)",
      title: "llm-rubric",
      titleColor: "var(--text-2)",
    };
  }

  // Inline `<code>` fragments in the detail line all share this look.
  const mono: React.CSSProperties = { fontSize: "0.68rem", fontFamily: "var(--font-mono)" };

  const hint = willEvaluateRubrics
    ? 'Confirm in the run log: each rubric should show "→ Judge LLM" before "← Judge".'
    : 'Fix judge settings (Secrets + Judge tab), then re-run. Assertions may show "No judge provider configured".';

  return (
    <div
      style={{
        padding: "0.85rem 1.1rem",
        borderRadius: "var(--r-lg)",
        background: palette.bg,
        border: `1px solid ${palette.border}`,
      }}
    >
      <div
        style={{
          fontSize: "0.72rem",
          fontWeight: 700,
          color: "var(--text-3)",
          letterSpacing: "0.06em",
          textTransform: "uppercase",
          marginBottom: "0.35rem",
        }}
      >
        {palette.title}
      </div>
      <p style={{ margin: "0 0 0.5rem", fontSize: "0.84rem", color: "var(--text-1)", lineHeight: 1.55, fontWeight: 600 }}>
        {summary}
      </p>
      <div style={{ fontSize: "0.72rem", color: "var(--text-3)", lineHeight: 1.5 }}>
        <span style={{ fontWeight: 600, color: palette.titleColor }}>{rubricAssertionCount}</span>
        {" · "}
        <code style={mono}>llm-rubric</code>
        {" assertion(s) in YAML · Judge mode: "}
        <code style={mono}>{judgeMode}</code>
        {judgeLabel ? (
          <>
            {" · "}
            Backend: <code style={mono}>{judgeLabel}</code>
          </>
        ) : null}
      </div>
      {!isIdle && (
        <p style={{ margin: "0.55rem 0 0", fontSize: "0.72rem", color: "var(--text-3)", lineHeight: 1.45 }}>
          {hint}
        </p>
      )}
    </div>
  );
}
|
|
271
|
+
|
|
272
|
+
/**
 * "Test matrix" card: one row per test case, one column per provider/model
 * taken from `summary`.
 *
 * Each cell shows a pass/fail glyph and the passed/total assertion count (or
 * "Error" / "—" when the provider errored or has no result for that case).
 * Clicking a row toggles an expanded detail row with, per provider, the
 * latency, output token count, raw output and every assertion result. Only one
 * case is expanded at a time.
 *
 * @param cases   Test case results; the array index is used as row identity.
 * @param summary Provider summaries; defines the column set and order.
 */
function ResultsTable({ cases, summary }: { cases: TestCaseResult[]; summary: ProviderSummary[] }) {
  const [expandedCase, setExpandedCase] = useState<number | null>(null);

  // Keep provider and model as separate fields instead of re-splitting the
  // joined key on "/", which mislabels columns when a provider name itself
  // contains a slash.
  const columns = summary.map((s) => ({
    key: `${s.provider}/${s.model}`,
    provider: s.provider,
    model: s.model,
  }));

  const headStyle: React.CSSProperties = {
    padding: "0.6rem 1rem",
    color: "var(--text-3)",
    fontWeight: 600,
    fontSize: "0.72rem",
    textTransform: "uppercase",
    letterSpacing: "0.04em",
    borderBottom: "1px solid var(--border)",
  };

  return (
    <div
      style={{
        background: "var(--surface)",
        border: "1px solid var(--border)",
        borderRadius: "var(--r-xl)",
        overflow: "hidden",
        boxShadow: "var(--shadow-sm)",
      }}
    >
      <div
        style={{
          padding: "0.75rem 1.2rem",
          borderBottom: "1px solid var(--border)",
          fontWeight: 600,
          fontSize: "0.8125rem",
          letterSpacing: "-0.01em",
          color: "var(--text-1)",
          background: "var(--surface-subtle)",
        }}
      >
        Test matrix
      </div>
      <div style={{ overflowX: "auto" }}>
        <table style={{ width: "100%", borderCollapse: "collapse", fontSize: "0.82rem" }}>
          <thead>
            <tr style={{ background: "var(--surface-subtle)" }}>
              <th style={{ ...headStyle, textAlign: "left", minWidth: 220 }}>
                Prompt / Vars
              </th>
              {columns.map((col) => (
                <th key={col.key} style={{ ...headStyle, textAlign: "center", minWidth: 120, whiteSpace: "nowrap" }}>
                  {col.provider}<br />
                  <span style={{ fontWeight: 400, fontFamily: "monospace" }}>{col.model}</span>
                </th>
              ))}
            </tr>
          </thead>
          <tbody>
            {cases.map((c, i) => {
              const isExpanded = expandedCase === i;
              const varStr = Object.entries(c.vars).map(([k, v]) => `${k}=${v}`).join(", ");
              // First result whose provider/model pair matches the column key.
              const resultFor = (key: string) =>
                c.providerResults.find((r) => `${r.provider}/${r.model}` === key);

              return (
                <React.Fragment key={i}>
                  <tr
                    style={{ borderBottom: "1px solid var(--border)", cursor: "pointer", transition: "background 0.1s" }}
                    onClick={() => setExpandedCase(isExpanded ? null : i)}
                    onMouseEnter={(e) => (e.currentTarget.style.background = "var(--surface-hover)")}
                    onMouseLeave={(e) => (e.currentTarget.style.background = "")}
                  >
                    <td style={{ padding: "0.65rem 1rem" }}>
                      <div
                        style={{
                          color: "var(--text-1)",
                          fontSize: "0.8rem",
                          maxWidth: 280,
                          overflow: "hidden",
                          textOverflow: "ellipsis",
                          whiteSpace: "nowrap",
                        }}
                        title={c.prompt}
                      >
                        {c.prompt}
                      </div>
                      {varStr && (
                        <div style={{ color: "var(--text-3)", fontSize: "0.72rem", marginTop: "0.15rem" }}>{varStr}</div>
                      )}
                    </td>
                    {columns.map(({ key }) => {
                      const pr = resultFor(key);
                      if (!pr) {
                        return (
                          <td key={key} style={{ padding: "0.65rem 1rem", textAlign: "center", color: "var(--text-3)" }}>—</td>
                        );
                      }
                      return (
                        <td key={key} style={{ padding: "0.65rem 1rem", textAlign: "center" }}>
                          {pr.error ? (
                            <span style={{ color: "var(--red)", fontSize: "0.72rem" }}>Error</span>
                          ) : (
                            <div style={{ display: "flex", flexDirection: "column", alignItems: "center", gap: "0.25rem" }}>
                              <PassFail pass={pr.pass} />
                              {pr.assertions.length > 0 && (
                                <span style={{ fontSize: "0.7rem", color: "var(--text-3)" }}>
                                  {pr.assertions.filter((a) => a.pass).length}/{pr.assertions.length}
                                </span>
                              )}
                            </div>
                          )}
                        </td>
                      );
                    })}
                  </tr>
                  {isExpanded && (
                    <tr style={{ borderBottom: "1px solid var(--border)", background: "var(--surface-subtle)" }}>
                      <td colSpan={columns.length + 1} style={{ padding: "0.75rem 1rem" }}>
                        <div style={{ display: "grid", gridTemplateColumns: `repeat(${columns.length}, 1fr)`, gap: "1rem" }}>
                          {columns.map(({ key }) => {
                            const pr = resultFor(key);
                            // Empty placeholder keeps the grid columns aligned with the header.
                            if (!pr) return <div key={key} />;
                            return (
                              <div key={key} style={{ display: "flex", flexDirection: "column", gap: "0.4rem" }}>
                                <div
                                  style={{
                                    fontWeight: 600,
                                    fontSize: "0.75rem",
                                    color: "var(--text-2)",
                                    textTransform: "uppercase",
                                    letterSpacing: "0.04em",
                                  }}
                                >
                                  {key}
                                </div>
                                {pr.error ? (
                                  <div style={{ color: "var(--red)", fontSize: "0.78rem" }}>{pr.error}</div>
                                ) : (
                                  <>
                                    <div style={{ color: "var(--text-3)", fontSize: "0.72rem" }}>
                                      {/* The separator is emitted only together with the token count, so a zero count no longer leaves a dangling " · ". */}
                                      {pr.latencyMs}ms{pr.outputTokens > 0 ? ` · ${pr.outputTokens} tokens` : ""}
                                    </div>
                                    <div
                                      style={{
                                        fontSize: "0.78rem",
                                        color: "var(--text-2)",
                                        maxHeight: 120,
                                        overflowY: "auto",
                                        lineHeight: 1.5,
                                        whiteSpace: "pre-wrap",
                                        wordBreak: "break-word",
                                      }}
                                    >
                                      {pr.output}
                                    </div>
                                    {pr.assertions.length > 0 && (
                                      <div
                                        style={{
                                          display: "flex",
                                          flexDirection: "column",
                                          gap: "0.5rem",
                                          marginTop: "0.25rem",
                                          paddingTop: "0.25rem",
                                          borderTop: "1px solid var(--border)",
                                        }}
                                      >
                                        {pr.assertions.map((a, ai) => <AssertionBlock key={ai} a={a} />)}
                                      </div>
                                    )}
                                  </>
                                )}
                              </div>
                            );
                          })}
                        </div>
                      </td>
                    </tr>
                  )}
                </React.Fragment>
              );
            })}
          </tbody>
        </table>
      </div>
    </div>
  );
}
|
|
383
|
+
|
|
384
|
+
// ─── Run target (what the suite executes against) ─────────────────────────────
|
|
385
|
+
|
|
386
|
+
/**
 * Card summarising what a suite run will execute against: a chip for every
 * enabled model instance (or a notice when none is enabled) and a one-line
 * description of the judge produced by `describeJudgeForUi`.
 *
 * The "Change in Settings" button is rendered only when `onOpenSettings` is
 * supplied.
 */
function RunTargetBanner({
  instances,
  secrets,
  judge,
  onOpenSettings,
}: {
  instances: LLMInstance[];
  secrets: SecretsMap;
  judge: JudgeSettings;
  onOpenSettings?: () => void;
}) {
  const activeInstances = instances.filter((inst) => inst.enabled);
  const judgeLine = describeJudgeForUi(judge, secrets);

  // Shared base for the two section captions; only the bottom margin differs.
  const sectionLabel: React.CSSProperties = {
    fontSize: "0.72rem",
    fontWeight: 600,
    color: "var(--text-3)",
    letterSpacing: "0.03em",
  };

  const modelChips = activeInstances.map((inst) => {
    const { color, border } = providerUi(inst.provider);
    return (
      <span
        key={inst.id}
        style={{
          padding: "0.28rem 0.65rem",
          borderRadius: 8,
          fontSize: "0.72rem",
          fontWeight: 600,
          background: "var(--surface-muted)",
          border: `1px solid ${border}`,
          color,
          whiteSpace: "nowrap",
          boxShadow: "var(--shadow-xs)",
        }}
      >
        {inst.provider}
        <span style={{ color: "var(--text-3)", fontWeight: 500, margin: "0 0.25rem" }}>·</span>
        <span style={{ fontFamily: "var(--font-mono)", fontWeight: 500, fontSize: "0.68rem" }}>{inst.model}</span>
      </span>
    );
  });

  return (
    <div
      style={{
        background: "var(--surface)",
        border: "1px solid var(--border)",
        borderRadius: "var(--r-xl)",
        overflow: "hidden",
        boxShadow: "var(--shadow-sm)",
      }}
    >
      <div
        style={{
          padding: "0.65rem 1.15rem",
          borderBottom: "1px solid var(--border)",
          background: "var(--surface-subtle)",
          display: "flex",
          alignItems: "center",
          justifyContent: "space-between",
          gap: "0.75rem",
          flexWrap: "wrap",
        }}
      >
        <span
          style={{
            fontSize: "0.7rem",
            fontWeight: 600,
            color: "var(--text-3)",
            letterSpacing: "0.06em",
            textTransform: "uppercase",
          }}
        >
          Run target
        </span>
        {onOpenSettings && (
          <button
            type="button"
            onClick={onOpenSettings}
            style={{
              padding: "0.25rem 0.65rem",
              borderRadius: 6,
              border: "1px solid var(--border)",
              background: "var(--surface)",
              color: "var(--text-2)",
              fontSize: "0.75rem",
              fontWeight: 600,
              cursor: "pointer",
              fontFamily: "inherit",
            }}
          >
            Change in Settings
          </button>
        )}
      </div>
      <div style={{ padding: "0.9rem 1.15rem", display: "flex", flexDirection: "column", gap: "0.85rem" }}>
        <div>
          <div style={{ ...sectionLabel, marginBottom: "0.45rem" }}>
            Enabled models
          </div>
          {activeInstances.length === 0 ? (
            <span style={{ fontSize: "0.8125rem", color: "var(--text-2)" }}>No models enabled — enable at least one in Settings.</span>
          ) : (
            <div style={{ display: "flex", flexWrap: "wrap", gap: "0.35rem" }}>
              {modelChips}
            </div>
          )}
        </div>
        <div>
          <div style={{ ...sectionLabel, marginBottom: "0.35rem" }}>
            Judge (llm-rubric)
          </div>
          <p style={{ margin: 0, fontSize: "0.8125rem", color: "var(--text-2)", lineHeight: 1.55 }}>{judgeLine}</p>
        </div>
      </div>
    </div>
  );
}
|
|
498
|
+
|
|
499
|
+
// ─── Main SuitePanel ──────────────────────────────────────────────────────────
|
|
500
|
+
|
|
501
|
+
/** Props accepted by the `SuitePanel` component. */
interface SuitePanelProps {
  /** Configured model instances; those with `enabled` set are shown as run targets. */
  instances: LLMInstance[];
  /** Secrets map passed along to the credential and judge helpers. */
  secrets: SecretsMap;
  /** Judge settings used for `llm-rubric` assertions. */
  judge: JudgeSettings;
  /** Optional callback for the "Change in Settings" button; the button is hidden when omitted. */
  onOpenSettings?: () => void;
}
|
|
507
|
+
|
|
508
|
+
/** A `SuiteResult` optionally carrying the run log lines and the judge metadata. */
type SuiteApiResponse = SuiteResult & {
  runLog?: string[];
  judgeMeta?: SuiteJudgeMeta;
};
|
|
509
|
+
|
|
510
|
+
export function SuitePanel({ instances, secrets, judge, onOpenSettings }: SuitePanelProps) {
|
|
511
|
+
const [yaml, setYaml] = useState(EXAMPLE_YAML);
|
|
512
|
+
const [loading, setLoading] = useState(false);
|
|
513
|
+
const [result, setResult] = useState<SuiteResult | null>(null);
|
|
514
|
+
const [runLog, setRunLog] = useState<string[]>([]);
|
|
515
|
+
const [judgeMeta, setJudgeMeta] = useState<SuiteJudgeMeta | null>(null);
|
|
516
|
+
const [error, setError] = useState<string | null>(null);
|
|
517
|
+
const [historyVersion, setHistoryVersion] = useState(0);
|
|
518
|
+
const logPreRef = useRef<HTMLPreElement>(null);
|
|
519
|
+
|
|
520
|
+
const historyEntries = useMemo(() => loadSuiteRunHistory(), [historyVersion]);
|
|
521
|
+
|
|
522
|
+
useEffect(() => {
|
|
523
|
+
if (!loading || !logPreRef.current) return;
|
|
524
|
+
logPreRef.current.scrollTop = logPreRef.current.scrollHeight;
|
|
525
|
+
}, [runLog, loading]);
|
|
526
|
+
|
|
527
|
+
const enabled = instances.filter((i) => i.enabled);
|
|
528
|
+
const canRun = enabled.length > 0 && yaml.trim().length > 0 && !loading;
|
|
529
|
+
|
|
530
|
+
const restoreHistoryEntry = (e: SuiteRunHistoryEntry) => {
|
|
531
|
+
setYaml(e.yaml);
|
|
532
|
+
setResult(e.result);
|
|
533
|
+
setRunLog(e.runLog);
|
|
534
|
+
setJudgeMeta(e.judgeMeta);
|
|
535
|
+
setError(null);
|
|
536
|
+
};
|
|
537
|
+
|
|
538
|
+
const run = async () => {
|
|
539
|
+
if (!canRun) return;
|
|
540
|
+
setLoading(true);
|
|
541
|
+
setError(null);
|
|
542
|
+
setResult(null);
|
|
543
|
+
setRunLog([]);
|
|
544
|
+
setJudgeMeta(null);
|
|
545
|
+
try {
|
|
546
|
+
const resolved = resolveInstancesForApi(instances, secrets);
|
|
547
|
+
const judgePayload = buildJudgeApiPayload(judge, secrets);
|
|
548
|
+
const res = await fetch("/api/suite", {
|
|
549
|
+
method: "POST",
|
|
550
|
+
headers: { "Content-Type": "application/json", Accept: "text/event-stream" },
|
|
551
|
+
body: JSON.stringify({
|
|
552
|
+
yaml,
|
|
553
|
+
instances: resolved,
|
|
554
|
+
judge: judgePayload,
|
|
555
|
+
stream: true,
|
|
556
|
+
}),
|
|
557
|
+
});
|
|
558
|
+
|
|
559
|
+
const ct = res.headers.get("content-type") ?? "";
|
|
560
|
+
|
|
561
|
+
if (!res.ok) {
|
|
562
|
+
const body = (await res.json().catch(() => ({}))) as { error?: string };
|
|
563
|
+
throw new Error(body.error ?? res.statusText);
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
if (!ct.includes("text/event-stream")) {
|
|
567
|
+
const body = (await res.json()) as SuiteApiResponse | { error?: string };
|
|
568
|
+
if ("error" in body && body.error) throw new Error(body.error);
|
|
569
|
+
const { runLog: lines, judgeMeta: jm, cases, summary } = body as SuiteApiResponse;
|
|
570
|
+
if (!Array.isArray(cases) || !Array.isArray(summary)) throw new Error("Invalid suite response");
|
|
571
|
+
const suiteResult = { cases, summary };
|
|
572
|
+
const logLines = Array.isArray(lines) ? lines : [];
|
|
573
|
+
const meta = jm ?? null;
|
|
574
|
+
setResult(suiteResult);
|
|
575
|
+
setRunLog(logLines);
|
|
576
|
+
setJudgeMeta(meta);
|
|
577
|
+
appendSuiteRunHistory({
|
|
578
|
+
yaml,
|
|
579
|
+
result: suiteResult,
|
|
580
|
+
runLog: logLines,
|
|
581
|
+
judgeMeta: meta,
|
|
582
|
+
ranAt: new Date().toISOString(),
|
|
583
|
+
});
|
|
584
|
+
setHistoryVersion((v) => v + 1);
|
|
585
|
+
} else {
|
|
586
|
+
const out = await consumeSuiteSseStream(res, (line) => {
|
|
587
|
+
setRunLog((prev) => [...prev, line]);
|
|
588
|
+
});
|
|
589
|
+
const suiteResult = { cases: out.result.cases, summary: out.result.summary };
|
|
590
|
+
const meta = out.judgeMeta;
|
|
591
|
+
setResult(suiteResult);
|
|
592
|
+
setRunLog(out.runLog);
|
|
593
|
+
setJudgeMeta(meta);
|
|
594
|
+
appendSuiteRunHistory({
|
|
595
|
+
yaml,
|
|
596
|
+
result: suiteResult,
|
|
597
|
+
runLog: out.runLog,
|
|
598
|
+
judgeMeta: meta,
|
|
599
|
+
ranAt: new Date().toISOString(),
|
|
600
|
+
});
|
|
601
|
+
setHistoryVersion((v) => v + 1);
|
|
602
|
+
}
|
|
603
|
+
} catch (err) {
|
|
604
|
+
setError(err instanceof Error ? err.message : String(err));
|
|
605
|
+
} finally {
|
|
606
|
+
setLoading(false);
|
|
607
|
+
}
|
|
608
|
+
};
|
|
609
|
+
|
|
610
|
+
const allPassed = result ? result.summary.every((s) => s.failed === 0) : null;
|
|
611
|
+
|
|
612
|
+
return (
|
|
613
|
+
<div style={{ display: "flex", flexDirection: "column", gap: "1.35rem" }}>
|
|
614
|
+
<RunTargetBanner instances={instances} secrets={secrets} judge={judge} onOpenSettings={onOpenSettings} />
|
|
615
|
+
|
|
616
|
+
<div
|
|
617
|
+
style={{
|
|
618
|
+
background: "var(--surface)",
|
|
619
|
+
border: "1px solid var(--border)",
|
|
620
|
+
borderRadius: "var(--r-xl)",
|
|
621
|
+
overflow: "hidden",
|
|
622
|
+
boxShadow: "var(--shadow-sm)",
|
|
623
|
+
}}
|
|
624
|
+
>
|
|
625
|
+
<div
|
|
626
|
+
style={{
|
|
627
|
+
padding: "0.65rem 1.1rem",
|
|
628
|
+
borderBottom: "1px solid var(--border)",
|
|
629
|
+
background: "var(--surface-subtle)",
|
|
630
|
+
fontWeight: 600,
|
|
631
|
+
fontSize: "0.72rem",
|
|
632
|
+
letterSpacing: "0.06em",
|
|
633
|
+
textTransform: "uppercase",
|
|
634
|
+
color: "var(--text-3)",
|
|
635
|
+
}}
|
|
636
|
+
>
|
|
637
|
+
Recent suite runs
|
|
638
|
+
</div>
|
|
639
|
+
{historyEntries.length === 0 ? (
|
|
640
|
+
<div style={{ padding: "1.1rem 1.15rem", color: "var(--text-3)", fontSize: "0.8125rem", margin: 0 }}>
|
|
641
|
+
No saved runs yet. Each successful suite run is stored in this browser (up to 15).
|
|
642
|
+
</div>
|
|
643
|
+
) : (
|
|
644
|
+
<ul style={{ listStyle: "none", margin: 0, padding: 0 }}>
|
|
645
|
+
{historyEntries.map((entry, idx) => (
|
|
646
|
+
<li
|
|
647
|
+
key={entry.id}
|
|
648
|
+
style={{
|
|
649
|
+
borderBottom: idx < historyEntries.length - 1 ? "1px solid var(--border)" : "none",
|
|
650
|
+
}}
|
|
651
|
+
>
|
|
652
|
+
<button
|
|
653
|
+
type="button"
|
|
654
|
+
onClick={() => restoreHistoryEntry(entry)}
|
|
655
|
+
style={{
|
|
656
|
+
width: "100%",
|
|
657
|
+
textAlign: "left",
|
|
658
|
+
padding: "0.75rem 1.1rem",
|
|
659
|
+
border: "none",
|
|
660
|
+
background: "transparent",
|
|
661
|
+
cursor: "pointer",
|
|
662
|
+
fontFamily: "inherit",
|
|
663
|
+
transition: "background 0.12s ease",
|
|
664
|
+
}}
|
|
665
|
+
onMouseEnter={(e) => {
|
|
666
|
+
e.currentTarget.style.background = "var(--surface-subtle)";
|
|
667
|
+
}}
|
|
668
|
+
onMouseLeave={(e) => {
|
|
669
|
+
e.currentTarget.style.background = "transparent";
|
|
670
|
+
}}
|
|
671
|
+
>
|
|
672
|
+
<div style={{ fontSize: "0.72rem", color: "var(--text-3)", marginBottom: "0.25rem" }}>
|
|
673
|
+
{new Date(entry.ranAt).toLocaleString()} · {entry.result.cases.length} case
|
|
674
|
+
{entry.result.cases.length !== 1 ? "s" : ""} · {entry.result.summary.length} model
|
|
675
|
+
{entry.result.summary.length !== 1 ? "s" : ""}
|
|
676
|
+
</div>
|
|
677
|
+
<div
|
|
678
|
+
style={{
|
|
679
|
+
fontSize: "0.84rem",
|
|
680
|
+
color: "var(--text-1)",
|
|
681
|
+
fontWeight: 500,
|
|
682
|
+
lineHeight: 1.45,
|
|
683
|
+
fontFamily: "var(--font-mono)",
|
|
684
|
+
}}
|
|
685
|
+
>
|
|
686
|
+
{entry.yamlPreview}
|
|
687
|
+
</div>
|
|
688
|
+
</button>
|
|
689
|
+
</li>
|
|
690
|
+
))}
|
|
691
|
+
</ul>
|
|
692
|
+
)}
|
|
693
|
+
</div>
|
|
694
|
+
|
|
695
|
+
<div style={{ background: "var(--surface)", border: "1px solid var(--border)", borderRadius: "var(--r-2xl)", overflow: "hidden", boxShadow: "var(--shadow-md)" }}>
|
|
696
|
+
<div style={{ padding: "0.7rem 1.2rem", borderBottom: "1px solid var(--border)", background: "var(--surface-subtle)", display: "flex", justifyContent: "space-between", alignItems: "center", flexWrap: "wrap", gap: "0.65rem" }}>
|
|
697
|
+
<div style={{ display: "flex", flexDirection: "column", gap: "0.15rem" }}>
|
|
698
|
+
<span style={{ fontSize: "0.7rem", fontWeight: 600, color: "var(--text-3)", letterSpacing: "0.06em", textTransform: "uppercase" }}>Suite</span>
|
|
699
|
+
<span style={{ fontSize: "0.875rem", fontWeight: 600, color: "var(--text-1)", letterSpacing: "-0.02em" }}>Eval configuration (YAML)</span>
|
|
700
|
+
</div>
|
|
701
|
+
<div style={{ display: "flex", alignItems: "center", gap: "0.75rem" }}>
|
|
702
|
+
{enabled.length === 0 && (
|
|
703
|
+
<span style={{ fontSize: "0.78rem", color: "var(--text-3)", fontWeight: 500 }}>Enable models in Settings</span>
|
|
704
|
+
)}
|
|
705
|
+
<button
|
|
706
|
+
type="button"
|
|
707
|
+
onClick={run}
|
|
708
|
+
disabled={!canRun}
|
|
709
|
+
style={{
|
|
710
|
+
padding: "0.5rem 1.2rem",
|
|
711
|
+
borderRadius: "var(--r-md)",
|
|
712
|
+
border: "none",
|
|
713
|
+
background: canRun ? "var(--accent)" : "var(--surface-hover)",
|
|
714
|
+
color: canRun ? "#fff" : "var(--text-3)",
|
|
715
|
+
fontWeight: 600,
|
|
716
|
+
fontSize: "0.875rem",
|
|
717
|
+
cursor: canRun ? "pointer" : "not-allowed",
|
|
718
|
+
fontFamily: "inherit",
|
|
719
|
+
display: "flex",
|
|
720
|
+
alignItems: "center",
|
|
721
|
+
gap: "0.45rem",
|
|
722
|
+
boxShadow: canRun ? "0 2px 8px rgba(30, 64, 175, 0.28)" : "none",
|
|
723
|
+
transition: "background 0.15s",
|
|
724
|
+
}}
|
|
725
|
+
>
|
|
726
|
+
{loading ? (
|
|
727
|
+
<>
|
|
728
|
+
<svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2.5" strokeLinecap="round" style={{ animation: "spin 1s linear infinite" }}>
|
|
729
|
+
<path d="M12 2v4M12 18v4M4.93 4.93l2.83 2.83M16.24 16.24l2.83 2.83M2 12h4M18 12h4M4.93 19.07l2.83-2.83M16.24 7.76l2.83-2.83" />
|
|
730
|
+
</svg>
|
|
731
|
+
Running…
|
|
732
|
+
</>
|
|
733
|
+
) : "Run Suite"}
|
|
734
|
+
</button>
|
|
735
|
+
</div>
|
|
736
|
+
</div>
|
|
737
|
+
<textarea
|
|
738
|
+
value={yaml}
|
|
739
|
+
onChange={(e) => setYaml(e.target.value)}
|
|
740
|
+
spellCheck={false}
|
|
741
|
+
style={{
|
|
742
|
+
display: "block",
|
|
743
|
+
width: "100%",
|
|
744
|
+
minHeight: 280,
|
|
745
|
+
background: "var(--surface-muted)",
|
|
746
|
+
border: "none",
|
|
747
|
+
outline: "none",
|
|
748
|
+
color: "var(--text-1)",
|
|
749
|
+
fontSize: "0.8125rem",
|
|
750
|
+
lineHeight: 1.65,
|
|
751
|
+
padding: "1.1rem 1.25rem",
|
|
752
|
+
resize: "vertical",
|
|
753
|
+
fontFamily: "var(--font-mono)",
|
|
754
|
+
boxSizing: "border-box",
|
|
755
|
+
}}
|
|
756
|
+
/>
|
|
757
|
+
</div>
|
|
758
|
+
|
|
759
|
+
{error && (
|
|
760
|
+
<div style={{ background: "var(--red-subtle)", border: "1px solid rgba(185, 28, 28, 0.2)", color: "var(--red)", borderRadius: "var(--r-lg)", padding: "0.85rem 1.1rem", fontSize: "0.875rem", lineHeight: 1.55, fontWeight: 500 }}>
|
|
761
|
+
{error}
|
|
762
|
+
</div>
|
|
763
|
+
)}
|
|
764
|
+
|
|
765
|
+
{loading && (
|
|
766
|
+
<div
|
|
767
|
+
style={{
|
|
768
|
+
background: "var(--accent-subtle)",
|
|
769
|
+
border: "1px solid rgba(30, 64, 175, 0.15)",
|
|
770
|
+
borderRadius: "var(--r-lg)",
|
|
771
|
+
padding: "0.85rem 1.1rem",
|
|
772
|
+
fontSize: "0.8125rem",
|
|
773
|
+
lineHeight: 1.55,
|
|
774
|
+
color: "var(--accent-text)",
|
|
775
|
+
fontWeight: 500,
|
|
776
|
+
}}
|
|
777
|
+
>
|
|
778
|
+
Running suite — logs stream live below as each LLM and judge request starts and finishes.
|
|
779
|
+
</div>
|
|
780
|
+
)}
|
|
781
|
+
|
|
782
|
+
{(loading || runLog.length > 0) && (
|
|
783
|
+
<div
|
|
784
|
+
style={{
|
|
785
|
+
background: "var(--surface)",
|
|
786
|
+
border: "1px solid var(--border)",
|
|
787
|
+
borderRadius: "var(--r-xl)",
|
|
788
|
+
overflow: "hidden",
|
|
789
|
+
boxShadow: "var(--shadow-sm)",
|
|
790
|
+
}}
|
|
791
|
+
>
|
|
792
|
+
<div
|
|
793
|
+
style={{
|
|
794
|
+
padding: "0.65rem 1.1rem",
|
|
795
|
+
borderBottom: "1px solid var(--border)",
|
|
796
|
+
background: "var(--surface-subtle)",
|
|
797
|
+
fontWeight: 600,
|
|
798
|
+
fontSize: "0.8125rem",
|
|
799
|
+
letterSpacing: "-0.01em",
|
|
800
|
+
color: "var(--text-1)",
|
|
801
|
+
}}
|
|
802
|
+
>
|
|
803
|
+
{loading ? "Live run log" : "Run log"}
|
|
804
|
+
</div>
|
|
805
|
+
<p
|
|
806
|
+
style={{
|
|
807
|
+
margin: 0,
|
|
808
|
+
padding: "0.45rem 1.1rem 0",
|
|
809
|
+
fontSize: "0.68rem",
|
|
810
|
+
color: "var(--text-3)",
|
|
811
|
+
lineHeight: 1.45,
|
|
812
|
+
}}
|
|
813
|
+
>
|
|
814
|
+
Needs a Node server (not static HTML export). On Vercel, use the Node runtime and a long enough function
|
|
815
|
+
timeout (this route sets maxDuration to 300s). If hosts buffer SSE, disable buffering or run very long suites
|
|
816
|
+
on a dedicated server.
|
|
817
|
+
</p>
|
|
818
|
+
<pre
|
|
819
|
+
ref={logPreRef}
|
|
820
|
+
style={{
|
|
821
|
+
margin: 0,
|
|
822
|
+
padding: "0.85rem 1.1rem",
|
|
823
|
+
maxHeight: 280,
|
|
824
|
+
overflow: "auto",
|
|
825
|
+
fontSize: "0.72rem",
|
|
826
|
+
lineHeight: 1.55,
|
|
827
|
+
fontFamily: "var(--font-mono)",
|
|
828
|
+
color: "var(--text-2)",
|
|
829
|
+
background: "var(--surface-muted)",
|
|
830
|
+
whiteSpace: "pre-wrap",
|
|
831
|
+
wordBreak: "break-word",
|
|
832
|
+
}}
|
|
833
|
+
>
|
|
834
|
+
{runLog.length > 0 ? runLog.join("\n") : loading ? "Connecting…" : ""}
|
|
835
|
+
</pre>
|
|
836
|
+
</div>
|
|
837
|
+
)}
|
|
838
|
+
|
|
839
|
+
{/* Results */}
|
|
840
|
+
{result && (
|
|
841
|
+
<div style={{ display: "flex", flexDirection: "column", gap: "1rem" }}>
|
|
842
|
+
{judgeMeta && <JudgeRunSummary meta={judgeMeta} />}
|
|
843
|
+
|
|
844
|
+
{/* Status banner */}
|
|
845
|
+
<div style={{
|
|
846
|
+
display: "flex", alignItems: "center", gap: "0.75rem",
|
|
847
|
+
padding: "0.85rem 1.15rem", borderRadius: "var(--r-lg)",
|
|
848
|
+
background: allPassed ? "var(--green-subtle)" : "var(--red-subtle)",
|
|
849
|
+
border: `1px solid ${allPassed ? "rgba(4, 120, 87, 0.2)" : "rgba(185, 28, 28, 0.2)"}`,
|
|
850
|
+
}}>
|
|
851
|
+
<span style={{ fontSize: "1.05rem", fontWeight: 700 }}>{allPassed ? "✓" : "✗"}</span>
|
|
852
|
+
<span style={{ fontWeight: 600, color: allPassed ? "var(--green)" : "var(--red)", fontSize: "0.9rem" }}>
|
|
853
|
+
{allPassed
|
|
854
|
+
? "All assertions passed"
|
|
855
|
+
: `${result.summary.reduce((s, p) => s + p.failed, 0)} assertion(s) failed`}
|
|
856
|
+
</span>
|
|
857
|
+
<span style={{ color: "var(--text-3)", fontSize: "0.78rem", marginLeft: "auto" }}>
|
|
858
|
+
{result.cases.length} test case{result.cases.length !== 1 ? "s" : ""} · {result.summary.length} provider{result.summary.length !== 1 ? "s" : ""}
|
|
859
|
+
</span>
|
|
860
|
+
</div>
|
|
861
|
+
|
|
862
|
+
<SummaryTable summary={result.summary} />
|
|
863
|
+
<ResultsTable cases={result.cases} summary={result.summary} />
|
|
864
|
+
</div>
|
|
865
|
+
)}
|
|
866
|
+
|
|
867
|
+
{/* Empty state */}
|
|
868
|
+
{!result && !error && !loading && (
|
|
869
|
+
<div style={{ textAlign: "center", padding: "2.25rem 1rem", color: "var(--text-2)", fontSize: "0.875rem", lineHeight: 1.65, maxWidth: 440, margin: "0 auto" }}>
|
|
870
|
+
Define prompts and assertions in YAML, then run against your enabled models. Expand rows in the results table to inspect outputs and rubric checks.
|
|
871
|
+
</div>
|
|
872
|
+
)}
|
|
873
|
+
</div>
|
|
874
|
+
);
|
|
875
|
+
}
|