waypoi 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260):
  1. package/.github/instructions/ui.instructions.md +42 -0
  2. package/.github/workflows/ci.yml +35 -0
  3. package/.github/workflows/publish.yml +71 -0
  4. package/.github/workflows/release.yml +48 -0
  5. package/.playwright-mcp/console-2026-04-04T01-41-10-746Z.log +2 -0
  6. package/.playwright-mcp/console-2026-04-04T01-41-28-799Z.log +3 -0
  7. package/.playwright-mcp/console-2026-04-05T02-26-51-909Z.log +76 -0
  8. package/.playwright-mcp/page-2026-04-04T01-41-10-816Z.yml +1 -0
  9. package/.playwright-mcp/page-2026-04-04T01-41-29-141Z.yml +77 -0
  10. package/.playwright-mcp/page-2026-04-04T01-41-42-633Z.yml +190 -0
  11. package/.playwright-mcp/page-2026-04-04T01-42-03-929Z.yml +262 -0
  12. package/.playwright-mcp/page-2026-04-04T02-12-54-813Z.yml +6 -0
  13. package/.playwright-mcp/page-2026-04-04T02-14-58-600Z.yml +190 -0
  14. package/.playwright-mcp/page-2026-04-04T02-15-03-923Z.yml +190 -0
  15. package/.playwright-mcp/page-2026-04-04T02-15-07-426Z.yml +190 -0
  16. package/.playwright-mcp/page-2026-04-04T02-15-25-729Z.yml +262 -0
  17. package/.playwright-mcp/page-2026-04-04T02-16-22-984Z.yml +262 -0
  18. package/.playwright-mcp/page-2026-04-04T02-17-00-599Z.yml +190 -0
  19. package/.playwright-mcp/page-2026-04-04T02-17-50-874Z.yml +190 -0
  20. package/.playwright-mcp/page-2026-04-05T02-26-55-570Z.yml +6 -0
  21. package/AGENTS.md +48 -0
  22. package/CHANGELOG.md +131 -0
  23. package/README.md +552 -0
  24. package/assets/agent-mode.png +0 -0
  25. package/assets/categorize.png +0 -0
  26. package/assets/dashboard.png +0 -0
  27. package/assets/endpoint-proxy.png +0 -0
  28. package/assets/icon.png +0 -0
  29. package/assets/mcp-generate-image.png +0 -0
  30. package/assets/mcp-understand-image.png +0 -0
  31. package/assets/peek-token-flow.png +0 -0
  32. package/assets/playground.png +0 -0
  33. package/assets/sankey.png +0 -0
  34. package/cli/index.ts +2805 -0
  35. package/cli/legacyRewrite.ts +108 -0
  36. package/cli/modelRef.ts +24 -0
  37. package/dist/cli/index.js +2536 -0
  38. package/dist/cli/legacyRewrite.js +92 -0
  39. package/dist/cli/modelRef.js +20 -0
  40. package/dist/src/benchmark/artifacts.js +131 -0
  41. package/dist/src/benchmark/capabilityClassifier.js +81 -0
  42. package/dist/src/benchmark/capabilityStore.js +144 -0
  43. package/dist/src/benchmark/config.js +238 -0
  44. package/dist/src/benchmark/gates.js +118 -0
  45. package/dist/src/benchmark/jobs.js +252 -0
  46. package/dist/src/benchmark/runner.js +1847 -0
  47. package/dist/src/benchmark/schema.js +353 -0
  48. package/dist/src/benchmark/suites.js +314 -0
  49. package/dist/src/benchmark/tinyQaDataset.js +422 -0
  50. package/dist/src/benchmark/types.js +25 -0
  51. package/dist/src/config.js +47 -0
  52. package/dist/src/index.js +178 -0
  53. package/dist/src/mcp/client.js +215 -0
  54. package/dist/src/mcp/discovery.js +226 -0
  55. package/dist/src/mcp/policy.js +65 -0
  56. package/dist/src/mcp/registry.js +129 -0
  57. package/dist/src/mcp/service.js +460 -0
  58. package/dist/src/middleware/auth.js +179 -0
  59. package/dist/src/middleware/requestCapture.js +192 -0
  60. package/dist/src/middleware/requestStats.js +118 -0
  61. package/dist/src/pools/builder.js +132 -0
  62. package/dist/src/pools/repository.js +69 -0
  63. package/dist/src/pools/scheduler.js +360 -0
  64. package/dist/src/pools/types.js +2 -0
  65. package/dist/src/protocols/adapters/dashscope.js +267 -0
  66. package/dist/src/protocols/adapters/inferenceV2.js +346 -0
  67. package/dist/src/protocols/adapters/openai.js +27 -0
  68. package/dist/src/protocols/registry.js +99 -0
  69. package/dist/src/protocols/types.js +2 -0
  70. package/dist/src/providers/health.js +153 -0
  71. package/dist/src/providers/importer.js +289 -0
  72. package/dist/src/providers/modelRegistry.js +313 -0
  73. package/dist/src/providers/repository.js +361 -0
  74. package/dist/src/providers/types.js +2 -0
  75. package/dist/src/routes/admin.js +531 -0
  76. package/dist/src/routes/audio.js +295 -0
  77. package/dist/src/routes/chat.js +240 -0
  78. package/dist/src/routes/embeddings.js +157 -0
  79. package/dist/src/routes/images.js +288 -0
  80. package/dist/src/routes/mcp.js +256 -0
  81. package/dist/src/routes/mcpService.js +100 -0
  82. package/dist/src/routes/models.js +48 -0
  83. package/dist/src/routes/responses.js +711 -0
  84. package/dist/src/routes/sessions.js +450 -0
  85. package/dist/src/routes/stats.js +270 -0
  86. package/dist/src/routes/ui.js +97 -0
  87. package/dist/src/routes/videos.js +107 -0
  88. package/dist/src/routing/router.js +338 -0
  89. package/dist/src/services/imageGeneration.js +280 -0
  90. package/dist/src/services/imageUnderstanding.js +352 -0
  91. package/dist/src/services/videoGeneration.js +79 -0
  92. package/dist/src/storage/captureRepository.js +1591 -0
  93. package/dist/src/storage/files.js +157 -0
  94. package/dist/src/storage/imageCache.js +346 -0
  95. package/dist/src/storage/repositories.js +388 -0
  96. package/dist/src/storage/sessionRepository.js +370 -0
  97. package/dist/src/storage/statsRepository.js +204 -0
  98. package/dist/src/transport/httpClient.js +126 -0
  99. package/dist/src/types.js +2 -0
  100. package/dist/src/utils/messageMedia.js +285 -0
  101. package/dist/src/utils/modelCapabilities.js +108 -0
  102. package/dist/src/utils/modelDiscovery.js +170 -0
  103. package/dist/src/version.js +5 -0
  104. package/dist/src/workers/captureRetention.js +25 -0
  105. package/dist/src/workers/configWatcher.js +91 -0
  106. package/dist/src/workers/healthChecker.js +21 -0
  107. package/dist/src/workers/statsRotation.js +41 -0
  108. package/docs/LLM/output_schema.md +312 -0
  109. package/docs/benchmark.md +208 -0
  110. package/docs/mcp-guidelines.md +125 -0
  111. package/docs/mcp-service.md +178 -0
  112. package/docs/opencode.md +86 -0
  113. package/docs/providers.md +79 -0
  114. package/examples/benchmark.config.yaml +28 -0
  115. package/examples/providers/alibaba-dashscope.yaml +88 -0
  116. package/examples/providers/alibaba-llm.yaml +64 -0
  117. package/examples/providers/alibaba-registry.yaml +7 -0
  118. package/examples/providers/inference-v2-ray.yaml +29 -0
  119. package/examples/scenarios/assets/omni-call-sample.wav +0 -0
  120. package/examples/scenarios/custom.jsonl +5 -0
  121. package/examples/scenarios/custom.yaml +40 -0
  122. package/model-form-v2.png +0 -0
  123. package/package.json +66 -0
  124. package/provider-form-v2.png +0 -0
  125. package/provider-form.png +0 -0
  126. package/scripts/manual-test.sh +11 -0
  127. package/scripts/version-from-git.js +23 -0
  128. package/src/benchmark/artifacts.ts +149 -0
  129. package/src/benchmark/capabilityClassifier.ts +99 -0
  130. package/src/benchmark/capabilityStore.ts +174 -0
  131. package/src/benchmark/config.ts +337 -0
  132. package/src/benchmark/gates.ts +164 -0
  133. package/src/benchmark/jobs.ts +312 -0
  134. package/src/benchmark/runner.ts +2519 -0
  135. package/src/benchmark/schema.ts +443 -0
  136. package/src/benchmark/suites.ts +323 -0
  137. package/src/benchmark/tinyQaDataset.ts +428 -0
  138. package/src/benchmark/types.ts +442 -0
  139. package/src/config.ts +44 -0
  140. package/src/index.ts +195 -0
  141. package/src/mcp/client.ts +305 -0
  142. package/src/mcp/discovery.ts +266 -0
  143. package/src/mcp/policy.ts +105 -0
  144. package/src/mcp/registry.ts +164 -0
  145. package/src/mcp/service.ts +611 -0
  146. package/src/middleware/auth.ts +251 -0
  147. package/src/middleware/requestCapture.ts +245 -0
  148. package/src/middleware/requestStats.ts +163 -0
  149. package/src/pools/builder.ts +159 -0
  150. package/src/pools/repository.ts +71 -0
  151. package/src/pools/scheduler.ts +425 -0
  152. package/src/pools/types.ts +117 -0
  153. package/src/protocols/adapters/dashscope.ts +335 -0
  154. package/src/protocols/adapters/inferenceV2.ts +428 -0
  155. package/src/protocols/adapters/openai.ts +32 -0
  156. package/src/protocols/registry.ts +117 -0
  157. package/src/protocols/types.ts +81 -0
  158. package/src/providers/health.ts +207 -0
  159. package/src/providers/importer.ts +402 -0
  160. package/src/providers/modelRegistry.ts +415 -0
  161. package/src/providers/repository.ts +439 -0
  162. package/src/providers/types.ts +113 -0
  163. package/src/routes/admin.ts +666 -0
  164. package/src/routes/audio.ts +372 -0
  165. package/src/routes/chat.ts +301 -0
  166. package/src/routes/embeddings.ts +197 -0
  167. package/src/routes/images.ts +356 -0
  168. package/src/routes/mcp.ts +320 -0
  169. package/src/routes/mcpService.ts +114 -0
  170. package/src/routes/models.ts +50 -0
  171. package/src/routes/responses.ts +872 -0
  172. package/src/routes/sessions.ts +558 -0
  173. package/src/routes/stats.ts +312 -0
  174. package/src/routes/ui.ts +96 -0
  175. package/src/routes/videos.ts +132 -0
  176. package/src/routing/router.ts +501 -0
  177. package/src/services/imageGeneration.ts +396 -0
  178. package/src/services/imageUnderstanding.ts +449 -0
  179. package/src/services/videoGeneration.ts +127 -0
  180. package/src/storage/captureRepository.ts +1835 -0
  181. package/src/storage/files.ts +178 -0
  182. package/src/storage/imageCache.ts +405 -0
  183. package/src/storage/repositories.ts +494 -0
  184. package/src/storage/sessionRepository.ts +419 -0
  185. package/src/storage/statsRepository.ts +238 -0
  186. package/src/transport/httpClient.ts +145 -0
  187. package/src/types.ts +322 -0
  188. package/src/utils/messageMedia.ts +293 -0
  189. package/src/utils/modelCapabilities.ts +161 -0
  190. package/src/utils/modelDiscovery.ts +203 -0
  191. package/src/workers/captureRetention.ts +25 -0
  192. package/src/workers/configWatcher.ts +115 -0
  193. package/src/workers/healthChecker.ts +22 -0
  194. package/src/workers/statsRotation.ts +49 -0
  195. package/tests/benchmarkAdminRoutes.test.ts +82 -0
  196. package/tests/benchmarkBasics.test.ts +116 -0
  197. package/tests/captureAdminRoutes.test.ts +420 -0
  198. package/tests/captureRepository.test.ts +797 -0
  199. package/tests/cliLegacyRewrite.test.ts +45 -0
  200. package/tests/imageGeneration.service.test.ts +107 -0
  201. package/tests/imageUnderstanding.service.test.ts +123 -0
  202. package/tests/mcpPolicy.test.ts +105 -0
  203. package/tests/mcpService.test.ts +1245 -0
  204. package/tests/modelRef.test.ts +23 -0
  205. package/tests/modelsRoutes.test.ts +154 -0
  206. package/tests/sessionMediaCache.test.ts +167 -0
  207. package/tests/statsRoutes.test.ts +323 -0
  208. package/tsconfig.json +15 -0
  209. package/ui/index.html +16 -0
  210. package/ui/package-lock.json +8521 -0
  211. package/ui/package.json +52 -0
  212. package/ui/postcss.config.js +6 -0
  213. package/ui/public/assets/apple-touch-icon.png +0 -0
  214. package/ui/public/assets/favicon-16.png +0 -0
  215. package/ui/public/assets/favicon-32.png +0 -0
  216. package/ui/public/assets/icon-192.png +0 -0
  217. package/ui/public/assets/icon-512.png +0 -0
  218. package/ui/src/App.tsx +27 -0
  219. package/ui/src/api/client.ts +1503 -0
  220. package/ui/src/components/EndpointUsageGuide.tsx +361 -0
  221. package/ui/src/components/Layout.tsx +124 -0
  222. package/ui/src/components/MessageContent.tsx +365 -0
  223. package/ui/src/components/ToolCallMessage.tsx +179 -0
  224. package/ui/src/components/ToolPicker.tsx +442 -0
  225. package/ui/src/components/messageContentParser.test.ts +41 -0
  226. package/ui/src/components/messageContentParser.ts +73 -0
  227. package/ui/src/components/thinkingPreview.test.ts +27 -0
  228. package/ui/src/components/thinkingPreview.ts +15 -0
  229. package/ui/src/components/toMermaidSankey.test.ts +78 -0
  230. package/ui/src/components/toMermaidSankey.ts +56 -0
  231. package/ui/src/components/ui/button.tsx +58 -0
  232. package/ui/src/components/ui/input.tsx +21 -0
  233. package/ui/src/components/ui/textarea.tsx +21 -0
  234. package/ui/src/lib/utils.ts +6 -0
  235. package/ui/src/main.tsx +9 -0
  236. package/ui/src/pages/AgentPlayground.tsx +2010 -0
  237. package/ui/src/pages/Benchmark.tsx +988 -0
  238. package/ui/src/pages/Dashboard.tsx +581 -0
  239. package/ui/src/pages/Peek.tsx +962 -0
  240. package/ui/src/pages/Settings.tsx +2013 -0
  241. package/ui/src/pages/agentPlaygroundPayload.test.ts +109 -0
  242. package/ui/src/pages/agentPlaygroundPayload.ts +97 -0
  243. package/ui/src/pages/agentThinkingContent.test.ts +50 -0
  244. package/ui/src/pages/agentThinkingContent.ts +57 -0
  245. package/ui/src/pages/dashboardTokenUsage.test.ts +66 -0
  246. package/ui/src/pages/dashboardTokenUsage.ts +36 -0
  247. package/ui/src/pages/imageUpload.test.ts +39 -0
  248. package/ui/src/pages/imageUpload.ts +71 -0
  249. package/ui/src/pages/peekFilters.test.ts +29 -0
  250. package/ui/src/pages/peekFilters.ts +13 -0
  251. package/ui/src/pages/peekMedia.test.ts +58 -0
  252. package/ui/src/pages/peekMedia.ts +148 -0
  253. package/ui/src/pages/sessionAutoTitle.test.ts +128 -0
  254. package/ui/src/pages/sessionAutoTitle.ts +106 -0
  255. package/ui/src/stores/settings.ts +58 -0
  256. package/ui/src/styles/globals.css +223 -0
  257. package/ui/src/vite-env.d.ts +8 -0
  258. package/ui/tailwind.config.js +106 -0
  259. package/ui/tsconfig.json +32 -0
  260. package/ui/vite.config.ts +37 -0
@@ -0,0 +1,988 @@
1
+ import { useEffect, useMemo, useRef, useState } from 'react'
2
+ import { Gauge, Loader2, Play, RefreshCw, MessageSquareText } from 'lucide-react'
3
+ import { Button } from '@/components/ui/button'
4
+ import { cn } from '@/lib/utils'
5
+ import {
6
+ BenchmarkCapabilityMatrix,
7
+ BenchmarkExampleSummary,
8
+ BenchmarkRunEvent,
9
+ BenchmarkRunRecord,
10
+ BenchmarkRunSummary,
11
+ getBenchmarkRun,
12
+ listBenchmarkCapabilities,
13
+ listBenchmarkExamples,
14
+ listBenchmarkRuns,
15
+ listModels,
16
+ startBenchmarkRun,
17
+ type Model,
18
+ } from '@/api/client'
19
+
20
/** Status values for a benchmark run; this page compares run summaries against 'running' and 'completed'. */
type RunStatus =
  | 'running'
  | 'completed'
  | 'failed'
21
+
22
/**
 * One row of the per-model leaderboard kept in the `modelLeaderboard` state.
 * Rows are produced by `aggregateModelLeaderboard` from the details of completed runs.
 */
type ModelLeaderboardRow = {
  /** Model identifier the row aggregates over. */
  model: string
  /** Count of runs contributing to this row. */
  runCount: number
  /** Count of scenarios contributing to this row. */
  scenarioCount: number
  /** Average pass rate — NOTE(review): scale (0-1 vs. percent) is not visible here; confirm. */
  avgPassRate: number
  /** Average p95 latency, in milliseconds per the field name. */
  avgP95LatencyMs: number
  /** Total tokens across the aggregated runs. */
  totalTokens: number
  /** Total failovers across the aggregated runs. */
  totalFailovers: number
}
31
+
32
/**
 * Normalised view of one request/response exchange shown in the live trace.
 * Built either from streamed `exchange` run events or from the exchanges stored
 * on a finished scenario detail.
 */
type ShowcaseExchange = {
  /** Identifier for the entry, derived from a timestamp or scenario id plus the list index. */
  id: string
  timestamp?: string
  mode: string
  model: string
  /** Scenario input text; empty string when the source provides none. */
  scenarioInput: string
  requestPath: string
  /** HTTP status code; 0 when the source event carries none. */
  statusCode: number
  contentType: string
  endpointName?: string
  upstreamModel?: string
  /** Tool calls and tool results recorded for this exchange. */
  toolTrace: {
    kind: 'tool_call' | 'tool_result'
    toolName: string
    toolCallId?: string
    argumentsText?: string
    contentText?: string
  }[]
  /** Request body: raw or sanitized for live events (per the "show raw" toggle), sanitized for stored details. */
  requestPayload: unknown
  /** Response body, selected the same way as `requestPayload`. */
  responsePayload: unknown
}
53
+
54
// Suite whose selection switches the page into showcase mode.
const SHOWCASE_SUITE = 'showcase'

// Every other suite offered in the suite selector.
const DIAGNOSTIC_SUITES = [
  'smoke',
  'proxy',
  'agent',
  'pool_smoke',
  'omni_call_smoke',
  'capabilities',
]

// Options of the suite <select>, showcase first.
const ALL_SUITES = [SHOWCASE_SUITE].concat(DIAGNOSTIC_SUITES)

// Display names for suites; the selector falls back to the raw suite id when a label is missing.
const SUITE_LABELS: Record<string, string> = {
  showcase: 'Showcase',
  smoke: 'Smoke',
  proxy: 'Proxy',
  agent: 'Agent',
  pool_smoke: 'Pool Smoke',
  omni_call_smoke: 'Omni Call Smoke',
  capabilities: 'Capabilities',
}

// Options of the profile <select> in the advanced section.
const PROFILES = [
  'local',
  'ci',
]
67
+
68
/**
 * Raw text of the generation-parameter inputs, one string per form field.
 * An empty string means the field was left blank.
 */
type GenerationParamDraft = {
  temperature: string
  topP: string
  maxTokens: string
  presencePenalty: string
  frequencyPenalty: string
  seed: string
  /** Comma-separated list of stop sequences. */
  stop: string
}
77
+
78
/**
 * Parsed generation parameters in the snake_case form that is spread into the
 * start-run request. Every field is optional; fields left blank are omitted.
 */
type GenerationParamPayload = {
  temperature?: number
  top_p?: number
  max_tokens?: number
  presence_penalty?: number
  frequency_penalty?: number
  seed?: number
  /** A single stop sequence, or a list when several were entered. */
  stop?: string | string[]
}
87
+
88
/** Payload keys that carry numeric values (every generation parameter except `stop`). */
type NumericGenerationParamKey =
  'temperature' | 'top_p' | 'max_tokens' | 'presence_penalty' | 'frequency_penalty' | 'seed'
95
+
96
/**
 * Converts the free-text generation-parameter inputs into a typed request payload.
 *
 * Blank numeric fields are skipped. A numeric field that fails validation adds one
 * message to `errors` and is left out of `payload`; the remaining fields are still
 * validated so every problem is collected in a single pass.
 *
 * Stop sequences are split on commas, trimmed, and empty entries are dropped. One
 * sequence is sent as a plain string, several as an array, none omits the field.
 *
 * @param draft - Raw strings exactly as typed into the form inputs.
 * @returns `payload` with the fields that parsed cleanly, and `errors` with
 *   human-readable messages in field order (empty when the draft is valid).
 */
function parseGenerationParamDraft(draft: GenerationParamDraft): { payload: GenerationParamPayload; errors: string[] } {
  const errors: string[] = []
  const payload: GenerationParamPayload = {}

  // Validates one numeric field and stores it under `key`; on failure records the
  // first violated rule for that field and stores nothing.
  const parseNumber = (
    raw: string,
    fieldLabel: string,
    key: NumericGenerationParamKey,
    opts?: { min?: number; max?: number; integer?: boolean }
  ) => {
    const trimmed = raw.trim()
    if (!trimmed) return
    const parsed = Number(trimmed)
    // Rejects NaN (unparseable text) as well as +/-Infinity.
    if (!Number.isFinite(parsed)) {
      errors.push(`${fieldLabel} must be a valid number.`)
      return
    }
    if (opts?.integer && !Number.isInteger(parsed)) {
      errors.push(`${fieldLabel} must be an integer.`)
      return
    }
    if (typeof opts?.min === 'number' && parsed < opts.min) {
      errors.push(`${fieldLabel} must be >= ${opts.min}.`)
      return
    }
    if (typeof opts?.max === 'number' && parsed > opts.max) {
      errors.push(`${fieldLabel} must be <= ${opts.max}.`)
      return
    }
    payload[key] = parsed
  }

  // A negative sampling temperature is never valid, so reject it here like the other
  // range-checked fields. No upper bound is enforced because provider limits differ.
  parseNumber(draft.temperature, 'Temperature', 'temperature', { min: 0 })
  parseNumber(draft.topP, 'Top P', 'top_p', { min: 0, max: 1 })
  parseNumber(draft.maxTokens, 'Max Tokens', 'max_tokens', { min: 1, integer: true })
  parseNumber(draft.presencePenalty, 'Presence Penalty', 'presence_penalty', { min: -2, max: 2 })
  parseNumber(draft.frequencyPenalty, 'Frequency Penalty', 'frequency_penalty', { min: -2, max: 2 })
  parseNumber(draft.seed, 'Seed', 'seed', { min: 0, integer: true })

  const stopValues = draft.stop
    .split(',')
    .map((value) => value.trim())
    .filter(Boolean)
  if (stopValues.length === 1) {
    payload.stop = stopValues[0]
  } else if (stopValues.length > 1) {
    payload.stop = stopValues
  }

  return { payload, errors }
}
147
+
148
+ export function Benchmark() {
149
+ const [runs, setRuns] = useState<BenchmarkRunSummary[]>([])
150
+ const [models, setModels] = useState<Model[]>([])
151
+ const [examples, setExamples] = useState<BenchmarkExampleSummary[]>([])
152
+ const [selectedRunId, setSelectedRunId] = useState<string | null>(null)
153
+ const [selectedRun, setSelectedRun] = useState<BenchmarkRunRecord | null>(null)
154
+ const [events, setEvents] = useState<BenchmarkRunEvent[]>([])
155
+ const [loading, setLoading] = useState(true)
156
+ const [starting, setStarting] = useState(false)
157
+ const [suite, setSuite] = useState('showcase')
158
+ const [profile, setProfile] = useState('local')
159
+ const [scenarioPath, setScenarioPath] = useState('')
160
+ const [selectedModel, setSelectedModel] = useState('')
161
+ const [selectedExampleId, setSelectedExampleId] = useState('')
162
+ const [advancedOpen, setAdvancedOpen] = useState(false)
163
+ const [updateCapCache, setUpdateCapCache] = useState(false)
164
+ const [capTtlDays, setCapTtlDays] = useState('7')
165
+ const [genParams, setGenParams] = useState<GenerationParamDraft>({
166
+ temperature: '',
167
+ topP: '',
168
+ maxTokens: '',
169
+ presencePenalty: '',
170
+ frequencyPenalty: '',
171
+ seed: '',
172
+ stop: '',
173
+ })
174
+ const [showRaw, setShowRaw] = useState(false)
175
+ const [error, setError] = useState<string | null>(null)
176
+ const [modelLeaderboard, setModelLeaderboard] = useState<ModelLeaderboardRow[]>([])
177
+ const [capabilityMatrix, setCapabilityMatrix] = useState<BenchmarkCapabilityMatrix | null>(null)
178
+ const eventSourceRef = useRef<EventSource | null>(null)
179
+
180
+ const loadRuns = async () => {
181
+ try {
182
+ const response = await listBenchmarkRuns()
183
+ setRuns(response.data)
184
+ if (!selectedRunId && response.data.length > 0) {
185
+ setSelectedRunId(response.data[0].id)
186
+ }
187
+ } catch (err) {
188
+ console.error('Failed to load benchmark runs:', err)
189
+ setError((err as Error).message)
190
+ } finally {
191
+ setLoading(false)
192
+ }
193
+ }
194
+
195
+ const loadCapabilities = async () => {
196
+ try {
197
+ const response = await listBenchmarkCapabilities(7)
198
+ setCapabilityMatrix(response)
199
+ } catch (err) {
200
+ console.error('Failed to load benchmark capabilities:', err)
201
+ }
202
+ }
203
+
204
+ const loadModels = async () => {
205
+ try {
206
+ const response = await listModels()
207
+ setModels(response.data)
208
+ } catch (err) {
209
+ console.error('Failed to load models:', err)
210
+ }
211
+ }
212
+
213
+ const loadExamples = async (suiteName: string) => {
214
+ try {
215
+ const response = await listBenchmarkExamples(suiteName)
216
+ setExamples(response.data)
217
+ setSelectedExampleId((current) => {
218
+ if (response.data.some((example) => example.id === current)) return current
219
+ return response.data[0]?.id ?? ''
220
+ })
221
+ } catch (err) {
222
+ console.error('Failed to load benchmark examples:', err)
223
+ setExamples([])
224
+ setSelectedExampleId('')
225
+ }
226
+ }
227
+
228
+ useEffect(() => {
229
+ void loadRuns()
230
+ void loadModels()
231
+ void loadCapabilities()
232
+ void loadExamples(suite)
233
+ const timer = setInterval(() => {
234
+ void loadRuns()
235
+ void loadCapabilities()
236
+ }, 5000)
237
+ return () => clearInterval(timer)
238
+ }, [])
239
+
240
+ useEffect(() => {
241
+ void loadExamples(suite)
242
+ }, [suite])
243
+
244
+ useEffect(() => {
245
+ if (suite === 'capabilities') {
246
+ setUpdateCapCache(true)
247
+ }
248
+ }, [suite])
249
+
250
+ useEffect(() => {
251
+ if (!selectedRunId) {
252
+ setSelectedRun(null)
253
+ setEvents([])
254
+ return
255
+ }
256
+
257
+ const loadRun = async () => {
258
+ try {
259
+ const run = await getBenchmarkRun(selectedRunId)
260
+ setSelectedRun(run)
261
+ setEvents((run.events ?? []).slice(-500))
262
+ } catch (err) {
263
+ console.error('Failed to load benchmark run:', err)
264
+ }
265
+ }
266
+
267
+ void loadRun()
268
+ const pollTimer = setInterval(() => {
269
+ void loadRun()
270
+ }, 2500)
271
+
272
+ eventSourceRef.current?.close()
273
+ eventSourceRef.current = null
274
+
275
+ const selectedSummary = runs.find((run) => run.id === selectedRunId)
276
+ if (selectedSummary?.status === 'running') {
277
+ const source = new EventSource(`/admin/benchmarks/runs/${encodeURIComponent(selectedRunId)}/events`)
278
+ source.onmessage = (message) => {
279
+ try {
280
+ const event = JSON.parse(message.data) as BenchmarkRunEvent
281
+ setEvents((prev) => [...prev, event].slice(-500))
282
+ } catch {
283
+ // Ignore malformed events.
284
+ }
285
+ }
286
+ source.onerror = () => {
287
+ source.close()
288
+ }
289
+ eventSourceRef.current = source
290
+ }
291
+
292
+ return () => {
293
+ clearInterval(pollTimer)
294
+ eventSourceRef.current?.close()
295
+ eventSourceRef.current = null
296
+ }
297
+ }, [selectedRunId, runs])
298
+
299
+ useEffect(() => {
300
+ const buildLeaderboard = async () => {
301
+ const completedRunIds = runs
302
+ .filter((run) => run.status === 'completed')
303
+ .slice(0, 20)
304
+ .map((run) => run.id)
305
+ const details = await Promise.all(
306
+ completedRunIds.map(async (id) => {
307
+ try {
308
+ return await getBenchmarkRun(id)
309
+ } catch {
310
+ return null
311
+ }
312
+ })
313
+ )
314
+ const rows = aggregateModelLeaderboard(details.filter((item): item is BenchmarkRunRecord => item !== null))
315
+ setModelLeaderboard(rows)
316
+ }
317
+ void buildLeaderboard()
318
+ }, [runs])
319
+
320
+ const startRun = async () => {
321
+ setStarting(true)
322
+ setError(null)
323
+ try {
324
+ const { payload: paramPayload, errors: paramErrors } = parseGenerationParamDraft(genParams)
325
+ if (paramErrors.length > 0) {
326
+ throw new Error(paramErrors[0])
327
+ }
328
+ const executionMode = isShowcase ? 'showcase' : 'diagnostic'
329
+ const parsedCapTtl = Number(capTtlDays.trim())
330
+ const run = await startBenchmarkRun({
331
+ suite,
332
+ exampleId: isShowcase && selectedExampleId ? selectedExampleId : undefined,
333
+ profile,
334
+ scenarioPath: scenarioPath.trim() || undefined,
335
+ modelOverride: selectedModel || undefined,
336
+ executionMode,
337
+ updateCapCache: updateCapCache || suite === 'capabilities',
338
+ capTtlDays:
339
+ capTtlDays.trim().length > 0 && Number.isFinite(parsedCapTtl) && parsedCapTtl >= 1
340
+ ? Math.trunc(parsedCapTtl)
341
+ : 7,
342
+ ...paramPayload,
343
+ })
344
+ setSelectedRunId(run.id)
345
+ await loadRuns()
346
+ await loadCapabilities()
347
+ } catch (err) {
348
+ setError((err as Error).message)
349
+ } finally {
350
+ setStarting(false)
351
+ }
352
+ }
353
+
354
+ const progress = useMemo(() => {
355
+ const total = selectedRun?.progress?.totalScenarios ?? 0
356
+ const complete = selectedRun?.progress?.completedScenarios ?? 0
357
+ const percent = total > 0 ? Math.min(100, Math.round((complete / total) * 100)) : 0
358
+ return { total, complete, percent }
359
+ }, [selectedRun])
360
+
361
+ const activeExample = useMemo(() => {
362
+ const reportDetails = selectedRun?.report?.scenarioDetails ?? []
363
+ const fromRun = reportDetails.find((detail) => detail.id === selectedExampleId)?.example ?? reportDetails[0]?.example
364
+ if (fromRun) return fromRun
365
+ return examples.find((example) => example.id === selectedExampleId) ?? examples[0] ?? null
366
+ }, [examples, selectedExampleId, selectedRun])
367
+
368
+ const activeScenarioDetail = useMemo(() => {
369
+ const details = selectedRun?.report?.scenarioDetails ?? []
370
+ if (details.length === 0) return null
371
+ return details.find((detail) => detail.id === selectedExampleId) ?? details[0]
372
+ }, [selectedExampleId, selectedRun])
373
+
374
+ const liveTrace = useMemo<ShowcaseExchange[]>(() => {
375
+ const traceEvents = events.filter((event) => event.type === 'exchange' && event.exchange)
376
+ if (traceEvents.length > 0) {
377
+ return traceEvents.map((event, index) => ({
378
+ id: `${event.timestamp}-${index}`,
379
+ timestamp: event.timestamp,
380
+ mode: event.exchange?.mode ?? 'unknown',
381
+ model: event.exchange?.model ?? 'unknown',
382
+ scenarioInput: event.exchange?.scenarioInput ?? '',
383
+ requestPath: event.exchange?.requestPath ?? '',
384
+ statusCode: event.exchange?.statusCode ?? 0,
385
+ contentType: event.exchange?.contentType ?? '',
386
+ endpointName: event.exchange?.endpointName,
387
+ upstreamModel: event.exchange?.upstreamModel,
388
+ toolTrace: event.exchange?.toolTrace ?? [],
389
+ requestPayload: showRaw ? event.exchange?.requestRaw : event.exchange?.requestSanitized,
390
+ responsePayload: showRaw ? event.exchange?.responseRaw : event.exchange?.responseSanitized,
391
+ }))
392
+ }
393
+
394
+ return (activeScenarioDetail?.exchanges ?? []).map((exchange, index) => ({
395
+ id: `${activeScenarioDetail?.id ?? 'detail'}-${index}`,
396
+ timestamp: exchange.timestamp,
397
+ mode: exchange.mode,
398
+ model: exchange.model,
399
+ scenarioInput: activeScenarioDetail?.example?.inputPreview ?? '',
400
+ requestPath: exchange.requestPath,
401
+ statusCode: exchange.statusCode,
402
+ contentType: exchange.contentType,
403
+ endpointName: exchange.endpointName,
404
+ upstreamModel: exchange.upstreamModel,
405
+ toolTrace: exchange.toolTrace,
406
+ requestPayload: exchange.requestSanitized,
407
+ responsePayload: exchange.responseSanitized,
408
+ }))
409
+ }, [activeScenarioDetail, events, showRaw])
410
+
411
+ const isShowcase = suite === SHOWCASE_SUITE
412
+ const selectedExample = useMemo(
413
+ () => examples.find((example) => example.id === selectedExampleId) ?? null,
414
+ [examples, selectedExampleId]
415
+ )
416
+
417
+ return (
418
+ <div className="flex-1 flex flex-col h-full min-h-0">
419
+ <header className="sticky top-0 z-20 h-14 border-b border-border bg-background/95 backdrop-blur flex items-center px-6 gap-4 shrink-0">
420
+ <div className="flex items-center gap-2">
421
+ <Gauge className="w-4 h-4 text-primary" />
422
+ <h2 className="font-mono font-semibold text-sm uppercase tracking-wider">Benchmark</h2>
423
+ </div>
424
+ <div className="text-xs text-muted-foreground font-mono">Live examples first, diagnostics second</div>
425
+ <div className="flex-1" />
426
+ <Button variant="outline" size="sm" onClick={() => { void loadRuns(); void loadModels(); void loadExamples(suite) }} disabled={loading}>
427
+ <RefreshCw className={cn('w-3 h-3 mr-2', loading && 'animate-spin')} />
428
+ Refresh
429
+ </Button>
430
+ </header>
431
+
432
+ <div className="flex-1 min-h-0 grid grid-cols-[320px_1fr] gap-0">
433
+ <aside className="border-r border-border p-4 space-y-4 overflow-auto">
434
+ <div className="panel">
435
+ <div className="panel-header">
436
+ <span className="panel-title">Run Setup</span>
437
+ </div>
438
+ <div className="p-3 space-y-3">
439
+ <label className="text-xs text-muted-foreground block">
440
+ Suite
441
+ <select
442
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
443
+ value={suite}
444
+ onChange={(event) => setSuite(event.target.value)}
445
+ >
446
+ {ALL_SUITES.map((item) => (
447
+ <option key={item} value={item}>{SUITE_LABELS[item] ?? item}</option>
448
+ ))}
449
+ </select>
450
+ </label>
451
+
452
+ {isShowcase ? (
453
+ <div className="rounded border border-border/70 bg-secondary/20 p-2">
454
+ <p className="text-2xs uppercase text-muted-foreground">Question Source</p>
455
+ <p className="text-xs font-mono">vincentkoc/tiny_qa_benchmark (Hugging Face)</p>
456
+ </div>
457
+ ) : null}
458
+
459
+ {isShowcase && (
460
+ <label className="text-xs text-muted-foreground block">
461
+ Example
462
+ <select
463
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
464
+ value={selectedExampleId}
465
+ onChange={(event) => setSelectedExampleId(event.target.value)}
466
+ >
467
+ {examples.map((example) => (
468
+ <option key={example.id} value={example.id}>
469
+ {example.title}
470
+ </option>
471
+ ))}
472
+ {examples.length === 0 && <option value="">No examples</option>}
473
+ </select>
474
+ </label>
475
+ )}
476
+
477
+ <div className="grid grid-cols-2 gap-2">
478
+ <label className="text-xs text-muted-foreground block">
479
+ Temperature
480
+ <input
481
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
482
+ value={genParams.temperature}
483
+ onChange={(event) =>
484
+ setGenParams((prev) => ({ ...prev, temperature: event.target.value }))
485
+ }
486
+ placeholder="e.g. 0.7"
487
+ />
488
+ </label>
489
+ <label className="text-xs text-muted-foreground block">
490
+ Top P
491
+ <input
492
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
493
+ value={genParams.topP}
494
+ onChange={(event) =>
495
+ setGenParams((prev) => ({ ...prev, topP: event.target.value }))
496
+ }
497
+ placeholder="e.g. 1"
498
+ />
499
+ </label>
500
+ <label className="text-xs text-muted-foreground block">
501
+ Max Tokens
502
+ <input
503
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
504
+ value={genParams.maxTokens}
505
+ onChange={(event) =>
506
+ setGenParams((prev) => ({ ...prev, maxTokens: event.target.value }))
507
+ }
508
+ placeholder="e.g. 512"
509
+ />
510
+ </label>
511
+ <label className="text-xs text-muted-foreground block">
512
+ Presence Penalty
513
+ <input
514
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
515
+ value={genParams.presencePenalty}
516
+ onChange={(event) =>
517
+ setGenParams((prev) => ({ ...prev, presencePenalty: event.target.value }))
518
+ }
519
+ placeholder="-2 to 2"
520
+ />
521
+ </label>
522
+ <label className="text-xs text-muted-foreground block">
523
+ Frequency Penalty
524
+ <input
525
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
526
+ value={genParams.frequencyPenalty}
527
+ onChange={(event) =>
528
+ setGenParams((prev) => ({ ...prev, frequencyPenalty: event.target.value }))
529
+ }
530
+ placeholder="-2 to 2"
531
+ />
532
+ </label>
533
+ <label className="text-xs text-muted-foreground block">
534
+ Seed
535
+ <input
536
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
537
+ value={genParams.seed}
538
+ onChange={(event) =>
539
+ setGenParams((prev) => ({ ...prev, seed: event.target.value }))
540
+ }
541
+ placeholder="integer"
542
+ />
543
+ </label>
544
+ </div>
545
+
546
+ <label className="text-xs text-muted-foreground block">
547
+ Stop Sequences (comma-separated)
548
+ <input
549
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
550
+ value={genParams.stop}
551
+ onChange={(event) =>
552
+ setGenParams((prev) => ({ ...prev, stop: event.target.value }))
553
+ }
554
+ placeholder="END, STOP"
555
+ />
556
+ </label>
557
+
558
+ <button
559
+ type="button"
560
+ className="w-full text-left text-xs font-mono text-muted-foreground border border-border rounded px-2 py-1 hover:bg-secondary/50"
561
+ onClick={() => setAdvancedOpen((prev) => !prev)}
562
+ >
563
+ {advancedOpen ? 'Hide Advanced' : 'Show Advanced'}
564
+ </button>
565
+
566
+ {advancedOpen && (
567
+ <div className="space-y-3 rounded border border-border/60 bg-secondary/20 p-2">
568
+ <label className="text-xs text-muted-foreground block">
569
+ Profile
570
+ <select
571
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
572
+ value={profile}
573
+ onChange={(event) => setProfile(event.target.value)}
574
+ >
575
+ {PROFILES.map((item) => (
576
+ <option key={item} value={item}>{item}</option>
577
+ ))}
578
+ </select>
579
+ </label>
580
+
581
+ <label className="text-xs text-muted-foreground block">
582
+ Model Override
583
+ <select
584
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
585
+ value={selectedModel}
586
+ onChange={(event) => setSelectedModel(event.target.value)}
587
+ >
588
+ <option value="">(auto)</option>
589
+ {models.map((model) => (
590
+ <option key={model.id} value={model.id}>{model.id}</option>
591
+ ))}
592
+ </select>
593
+ </label>
594
+
595
+ <label className="text-xs text-muted-foreground block">
596
+ Scenario File (Optional)
597
+ <input
598
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
599
+ value={scenarioPath}
600
+ onChange={(event) => setScenarioPath(event.target.value)}
601
+ placeholder="./examples/scenarios/custom.yaml"
602
+ />
603
+ </label>
604
+
605
+ <label className="text-xs text-muted-foreground block">
606
+ Capability TTL Days
607
+ <input
608
+ className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
609
+ value={capTtlDays}
610
+ onChange={(event) => setCapTtlDays(event.target.value)}
611
+ placeholder="7"
612
+ />
613
+ </label>
614
+
615
+ <label className="flex items-center gap-2 text-xs text-muted-foreground">
616
+ <input
617
+ type="checkbox"
618
+ checked={updateCapCache}
619
+ onChange={(event) => setUpdateCapCache(event.target.checked)}
620
+ />
621
+ Update Capability Cache
622
+ </label>
623
+ </div>
624
+ )}
625
+
626
+ <p className="text-2xs text-muted-foreground">
627
+ {isShowcase
628
+ ? `Showcase runs one Tiny QA question at a time. Expected answer: ${selectedExample?.successCriteria ?? 'n/a'}`
629
+ : 'Diagnostic suites keep pass-rate, latency, and capability diagnostics.'}
630
+ </p>
631
+
632
+ <Button className="w-full" onClick={startRun} disabled={starting || (isShowcase && !selectedExampleId)}>
633
+ {starting ? <Loader2 className="w-4 h-4 animate-spin mr-2" /> : <Play className="w-4 h-4 mr-2" />}
634
+ {isShowcase ? 'Run Showcase' : 'Run Diagnostic'}
635
+ </Button>
636
+ {error && <p className="text-xs text-destructive">{error}</p>}
637
+ </div>
638
+ </div>
639
+
640
+ <div className="panel">
641
+ <div className="panel-header">
642
+ <span className="panel-title">Runs</span>
643
+ </div>
644
+ <div className="max-h-[56vh] overflow-auto divide-y divide-border">
645
+ {runs.map((run) => (
646
+ <button
647
+ key={run.id}
648
+ className={cn(
649
+ 'w-full text-left px-3 py-2 hover:bg-secondary/50 transition-colors',
650
+ selectedRunId === run.id && 'bg-secondary'
651
+ )}
652
+ onClick={() => setSelectedRunId(run.id)}
653
+ >
654
+ <p className="text-xs font-mono truncate">{run.id}</p>
655
+ <p className="text-2xs text-muted-foreground">{run.suite ?? 'custom'}{run.exampleId ? ` • ${run.exampleId}` : ''}</p>
656
+ <p className={cn('text-2xs uppercase font-mono', statusClass(run.status))}>{run.status}</p>
657
+ </button>
658
+ ))}
659
+ {runs.length === 0 && (
660
+ <div className="px-3 py-4 text-xs text-muted-foreground">No benchmark runs yet.</div>
661
+ )}
662
+ </div>
663
+ </div>
664
+ </aside>
665
+
666
+ <section className="min-h-0 overflow-auto p-6 space-y-6">
667
+ <div className="panel">
668
+ <div className="panel-header">
669
+ <span className="panel-title">Progress</span>
670
+ <span className="text-2xs text-muted-foreground ml-auto">
671
+ {progress.complete}/{progress.total}
672
+ </span>
673
+ </div>
674
+ <div className="p-4 space-y-2">
675
+ <div className="w-full h-2 rounded bg-secondary overflow-hidden">
676
+ <div className="h-full bg-primary transition-all duration-300" style={{ width: `${progress.percent}%` }} />
677
+ </div>
678
+ <p className="text-xs text-muted-foreground font-mono">
679
+ {selectedRun?.progress?.currentScenarioId
680
+ ? `Current: ${selectedRun.progress.currentScenarioId}`
681
+ : 'Idle'}
682
+ </p>
683
+ </div>
684
+ </div>
685
+
686
+ <div className="grid grid-cols-2 gap-6">
687
+ <div className="panel min-h-[320px]">
688
+ <div className="panel-header">
689
+ <span className="panel-title">What This Demonstrates</span>
690
+ </div>
691
+ <div className="p-4 space-y-3 text-sm">
692
+ {activeExample ? (
693
+ <>
694
+ <div>
695
+ <p className="text-2xs uppercase text-muted-foreground mb-1">Title</p>
696
+ <p className="font-medium">{activeExample.title}</p>
697
+ </div>
698
+ <div>
699
+ <p className="text-2xs uppercase text-muted-foreground mb-1">Goal</p>
700
+ <p>{activeExample.userVisibleGoal}</p>
701
+ </div>
702
+ <div>
703
+ <p className="text-2xs uppercase text-muted-foreground mb-1">Input</p>
704
+ <pre className="text-xs font-mono whitespace-pre-wrap break-words">{activeExample.inputPreview}</pre>
705
+ </div>
706
+ <div>
707
+ <p className="text-2xs uppercase text-muted-foreground mb-1">Success</p>
708
+ <p>{activeExample.successCriteria}</p>
709
+ </div>
710
+ <div>
711
+ <p className="text-2xs uppercase text-muted-foreground mb-1">Expected Highlights</p>
712
+ <div className="flex flex-wrap gap-2">
713
+ {activeExample.expectedHighlights.map((item) => (
714
+ <span key={item} className="rounded border border-border px-2 py-1 text-2xs font-mono text-muted-foreground">{item}</span>
715
+ ))}
716
+ </div>
717
+ </div>
718
+ </>
719
+ ) : (
720
+ <p className="text-sm text-muted-foreground">No showcase example selected.</p>
721
+ )}
722
+ </div>
723
+ </div>
724
+
725
+ <div className="panel min-h-[320px]">
726
+ <div className="panel-header">
727
+ <span className="panel-title">Verdict</span>
728
+ </div>
729
+ <div className="p-4 space-y-3 text-sm">
730
+ {activeScenarioDetail ? (
731
+ <>
732
+ <div className="flex items-center gap-2 flex-wrap">
733
+ <span className={cn(
734
+ 'rounded border px-2 py-1 text-2xs font-mono',
735
+ activeScenarioDetail.status === 'passed'
736
+ ? 'border-emerald-500/40 text-emerald-300 bg-emerald-500/10'
737
+ : activeScenarioDetail.status === 'skipped'
738
+ ? 'border-amber-500/40 text-amber-300 bg-amber-500/10'
739
+ : 'border-red-500/40 text-red-300 bg-red-500/10'
740
+ )}>
741
+ {activeScenarioDetail.status}
742
+ </span>
743
+ <span className="text-2xs font-mono text-muted-foreground">{activeScenarioDetail.model}</span>
744
+ </div>
745
+ <div>
746
+ <p className="text-2xs uppercase text-muted-foreground mb-1">Reason</p>
747
+ <p>{activeScenarioDetail.verdict}</p>
748
+ </div>
749
+ <div>
750
+ <p className="text-2xs uppercase text-muted-foreground mb-1">Final Response</p>
751
+ <pre className="text-xs font-mono whitespace-pre-wrap break-words">{activeScenarioDetail.finalResponsePreview || 'n/a'}</pre>
752
+ </div>
753
+ <div>
754
+ <p className="text-2xs uppercase text-muted-foreground mb-1">Tools Used</p>
755
+ <p className="text-xs font-mono">{activeScenarioDetail.usedToolNames.length > 0 ? activeScenarioDetail.usedToolNames.join(', ') : 'none'}</p>
756
+ </div>
757
+ </>
758
+ ) : (
759
+ <p className="text-sm text-muted-foreground">Run an example to capture a verdict.</p>
760
+ )}
761
+ </div>
762
+ </div>
763
+ </div>
764
+
765
+ <div className="panel min-h-[420px]">
766
+ <div className="panel-header">
767
+ <MessageSquareText className="w-4 h-4 text-muted-foreground" />
768
+ <span className="panel-title">Live Show</span>
769
+ <button
770
+ className="ml-auto text-2xs px-2 py-1 rounded bg-secondary hover:bg-secondary/80"
771
+ onClick={() => setShowRaw((prev) => !prev)}
772
+ >
773
+ {showRaw ? 'Raw' : 'Sanitized'}
774
+ </button>
775
+ </div>
776
+ <div className="p-4 space-y-3 max-h-[640px] overflow-auto">
777
+ {liveTrace.map((exchange) => (
778
+ <TraceCard key={exchange.id} exchange={exchange} />
779
+ ))}
780
+ {liveTrace.length === 0 && (
781
+ <p className="text-xs text-muted-foreground">No request/response trace yet.</p>
782
+ )}
783
+ </div>
784
+ </div>
785
+
786
+ <div className="grid grid-cols-2 gap-6">
787
+ <div className="panel min-h-[320px]">
788
+ <div className="panel-header">
789
+ <span className="panel-title">Model Leaderboard (Diagnostics)</span>
790
+ </div>
791
+ <div className="p-4 overflow-auto max-h-[360px]">
792
+ <table className="w-full text-xs">
793
+ <thead className="text-muted-foreground">
794
+ <tr>
795
+ <th className="text-left py-1">Model</th>
796
+ <th className="text-right py-1">Runs</th>
797
+ <th className="text-right py-1">Scenarios</th>
798
+ <th className="text-right py-1">Pass</th>
799
+ <th className="text-right py-1">P95</th>
800
+ <th className="text-right py-1">Tokens</th>
801
+ <th className="text-right py-1">Failovers</th>
802
+ </tr>
803
+ </thead>
804
+ <tbody>
805
+ {modelLeaderboard.map((row) => (
806
+ <tr key={row.model} className="border-t border-border/40">
807
+ <td className="py-1 pr-2 font-mono">{row.model}</td>
808
+ <td className="py-1 text-right">{row.runCount}</td>
809
+ <td className="py-1 text-right">{row.scenarioCount}</td>
810
+ <td className="py-1 text-right">{`${Math.round(row.avgPassRate * 100)}%`}</td>
811
+ <td className="py-1 text-right">{`${Math.round(row.avgP95LatencyMs)}ms`}</td>
812
+ <td className="py-1 text-right">{row.totalTokens}</td>
813
+ <td className="py-1 text-right">{row.totalFailovers}</td>
814
+ </tr>
815
+ ))}
816
+ {modelLeaderboard.length === 0 && (
817
+ <tr>
818
+ <td colSpan={7} className="py-3 text-center text-muted-foreground">
819
+ No model history available yet.
820
+ </td>
821
+ </tr>
822
+ )}
823
+ </tbody>
824
+ </table>
825
+ </div>
826
+ </div>
827
+
828
+ <div className="panel min-h-[320px]">
829
+ <div className="panel-header">
830
+ <span className="panel-title">Capabilities (Diagnostics)</span>
831
+ <span className="text-2xs text-muted-foreground ml-auto">
832
+ TTL {capabilityMatrix?.ttlDays ?? 7}d
833
+ </span>
834
+ </div>
835
+ <div className="p-4 overflow-auto max-h-[360px]">
836
+ <table className="w-full text-xs">
837
+ <thead className="text-muted-foreground">
838
+ <tr>
839
+ <th className="text-left py-1">Model</th>
840
+ <th className="text-left py-1">Freshness</th>
841
+ <th className="text-left py-1">Chat</th>
842
+ <th className="text-left py-1">Tools</th>
843
+ <th className="text-left py-1">Embed</th>
844
+ <th className="text-left py-1">Image</th>
845
+ </tr>
846
+ </thead>
847
+ <tbody>
848
+ {(capabilityMatrix?.models ?? []).map((model) => (
849
+ <tr key={model.model} className="border-t border-border/40">
850
+ <td className="py-1 pr-2 font-mono">{model.model}</td>
851
+ <td className={cn('py-1', model.freshness === 'fresh' ? 'text-success' : 'text-warning')}>
852
+ {model.freshness}
853
+ </td>
854
+ <td className="py-1">{model.findings.chat_basic.status}</td>
855
+ <td className="py-1">{model.findings.chat_tool_calls.status}</td>
856
+ <td className="py-1">{model.findings.embeddings.status}</td>
857
+ <td className="py-1">{model.findings.images_generation.status}</td>
858
+ </tr>
859
+ ))}
860
+ {(capabilityMatrix?.models.length ?? 0) === 0 && (
861
+ <tr>
862
+ <td colSpan={6} className="py-3 text-center text-muted-foreground">
863
+ No capability snapshots yet. Run suite "capabilities" to populate cache.
864
+ </td>
865
+ </tr>
866
+ )}
867
+ </tbody>
868
+ </table>
869
+ </div>
870
+ </div>
871
+ </div>
872
+ </section>
873
+ </div>
874
+ </div>
875
+ )
876
+ }
877
+
878
/**
 * Card showing a single request/response exchange from a run trace.
 *
 * Sections, top to bottom: a metadata strip (timestamp, mode, model and, when
 * present, endpoint name), the scenario input, the wire request payload, the
 * tool-trace steps (only when there is at least one), and the response payload
 * with its status code and content type.
 */
function TraceCard({ exchange }: { exchange: ShowcaseExchange }) {
  const {
    timestamp,
    mode,
    model,
    endpointName,
    scenarioInput,
    requestPath,
    requestPayload,
    toolTrace,
    statusCode,
    contentType,
    responsePayload,
  } = exchange

  // Class strings shared by several sections.
  const sectionClass = 'bg-secondary/30 rounded p-2'
  const headingClass = 'text-2xs uppercase text-muted-foreground mb-1'
  const preClass = 'text-2xs font-mono whitespace-pre-wrap break-words'

  // Endpoint name is appended to the metadata strip only when it is set.
  const endpointMeta = endpointName && (
    <>
      <span>•</span>
      <span>{endpointName}</span>
    </>
  )

  // Tool-trace section is rendered only when at least one step was recorded.
  const toolTraceSection = toolTrace.length > 0 && (
    <div className={`${sectionClass} space-y-2`}>
      <p className="text-2xs uppercase text-muted-foreground">Tool Trace</p>
      {toolTrace.map((entry, position) => (
        <div
          key={`${entry.kind}-${entry.toolName}-${position}`}
          className="border border-border/60 rounded p-2"
        >
          <p className="text-2xs font-mono text-muted-foreground">{entry.kind} • {entry.toolName}</p>
          {entry.argumentsText && <pre className={`${preClass} mt-1`}>{entry.argumentsText}</pre>}
          {entry.contentText && <pre className={`${preClass} mt-1`}>{entry.contentText}</pre>}
        </div>
      ))}
    </div>
  )

  return (
    <div className="border border-border rounded-md p-3 space-y-3">
      <div className="flex items-center gap-2 flex-wrap text-2xs text-muted-foreground font-mono">
        <span>{timestamp ?? 'saved-trace'}</span>
        <span>•</span>
        <span>{mode}</span>
        <span>•</span>
        <span>{model}</span>
        {endpointMeta}
      </div>

      <div className={sectionClass}>
        <p className={headingClass}>Scenario Input</p>
        <pre className={preClass}>{scenarioInput}</pre>
      </div>

      <div className={sectionClass}>
        <p className={headingClass}>Wire Request {requestPath}</p>
        <pre className={preClass}>{safeStringify(requestPayload)}</pre>
      </div>

      {toolTraceSection}

      <div className={sectionClass}>
        <p className={headingClass}>
          Response {statusCode} ({contentType || 'unknown'})
        </p>
        <pre className={preClass}>{safeStringify(responsePayload)}</pre>
      </div>
    </div>
  )
}
927
+
928
/**
 * Roll benchmark runs up into one leaderboard row per model.
 *
 * Every non-skipped result that names a model contributes to that model's
 * totals. Results without a model, and runs without a report, are ignored.
 *
 * @param runs Benchmark run records; `run.report?.results` may be absent.
 * @returns One row per model, ordered by average pass rate (highest first),
 *   then by average P95 latency (lowest first).
 */
function aggregateModelLeaderboard(runs: BenchmarkRunRecord[]): ModelLeaderboardRow[] {
  // Coerce a missing or malformed metric to a finite number. Without this, one
  // bad value (e.g. a non-numeric string) turns the model's sums into NaN,
  // which renders as "NaN%" and makes the sort comparator below return NaN.
  const toFiniteNumber = (value: unknown): number => {
    const parsed = Number(value ?? 0)
    return Number.isFinite(parsed) ? parsed : 0
  }

  const byModel = new Map<string, {
    runIds: Set<string>
    scenarios: number
    passRateSum: number
    p95Sum: number
    tokens: number
    failovers: number
  }>()

  for (const run of runs) {
    const results = run.report?.results ?? []
    for (const result of results) {
      // Skipped scenarios never executed, so they must not dilute the averages.
      if (result.status === 'skipped') continue
      const model = String(result.model ?? '')
      if (!model) continue

      let totals = byModel.get(model)
      if (!totals) {
        totals = {
          runIds: new Set<string>(),
          scenarios: 0,
          passRateSum: 0,
          p95Sum: 0,
          tokens: 0,
          failovers: 0,
        }
        byModel.set(model, totals)
      }

      // A Set de-duplicates run ids: a run with several scenarios counts once.
      totals.runIds.add(run.id)
      totals.scenarios += 1
      totals.passRateSum += toFiniteNumber(result.passRate)
      totals.p95Sum += toFiniteNumber(result.p95LatencyMs)
      totals.tokens += toFiniteNumber(result.totalTokens)
      totals.failovers += toFiniteNumber(result.failovers)
    }
  }

  return Array.from(byModel.entries())
    .map(([model, value]) => ({
      model,
      runCount: value.runIds.size,
      scenarioCount: value.scenarios,
      avgPassRate: value.scenarios > 0 ? value.passRateSum / value.scenarios : 0,
      avgP95LatencyMs: value.scenarios > 0 ? value.p95Sum / value.scenarios : 0,
      totalTokens: value.tokens,
      totalFailovers: value.failovers,
    }))
    .sort((a, b) => {
      if (b.avgPassRate !== a.avgPassRate) return b.avgPassRate - a.avgPassRate
      return a.avgP95LatencyMs - b.avgP95LatencyMs
    })
}
975
+
976
/**
 * Render an arbitrary payload as pretty-printed (2-space) JSON for display.
 *
 * Always returns a string: when the payload cannot be serialised, the result
 * falls back to `String(payload)`.
 *
 * @param payload Any value, typically a request or response body.
 * @returns Indented JSON text, or the `String()` form of the payload.
 */
function safeStringify(payload: unknown): string {
  try {
    // JSON.stringify returns undefined (not a string) for undefined, functions
    // and symbols; fall back so the declared return type holds.
    return JSON.stringify(payload, null, 2) ?? String(payload)
  } catch {
    // Circular structures and BigInt values make JSON.stringify throw.
    return String(payload)
  }
}
983
+
984
/**
 * Map a run status to the text-colour class used to display it.
 *
 * `completed` maps to the success colour and `failed` to the destructive
 * colour; every other status falls through to the warning colour.
 */
function statusClass(status: RunStatus): string {
  switch (status) {
    case 'completed':
      return 'text-success'
    case 'failed':
      return 'text-destructive'
    default:
      return 'text-warning'
  }
}