waypoi 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/instructions/ui.instructions.md +42 -0
- package/.github/workflows/ci.yml +35 -0
- package/.github/workflows/publish.yml +71 -0
- package/.github/workflows/release.yml +48 -0
- package/.playwright-mcp/console-2026-04-04T01-41-10-746Z.log +2 -0
- package/.playwright-mcp/console-2026-04-04T01-41-28-799Z.log +3 -0
- package/.playwright-mcp/console-2026-04-05T02-26-51-909Z.log +76 -0
- package/.playwright-mcp/page-2026-04-04T01-41-10-816Z.yml +1 -0
- package/.playwright-mcp/page-2026-04-04T01-41-29-141Z.yml +77 -0
- package/.playwright-mcp/page-2026-04-04T01-41-42-633Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T01-42-03-929Z.yml +262 -0
- package/.playwright-mcp/page-2026-04-04T02-12-54-813Z.yml +6 -0
- package/.playwright-mcp/page-2026-04-04T02-14-58-600Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T02-15-03-923Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T02-15-07-426Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T02-15-25-729Z.yml +262 -0
- package/.playwright-mcp/page-2026-04-04T02-16-22-984Z.yml +262 -0
- package/.playwright-mcp/page-2026-04-04T02-17-00-599Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T02-17-50-874Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-05T02-26-55-570Z.yml +6 -0
- package/AGENTS.md +48 -0
- package/CHANGELOG.md +131 -0
- package/README.md +552 -0
- package/assets/agent-mode.png +0 -0
- package/assets/categorize.png +0 -0
- package/assets/dashboard.png +0 -0
- package/assets/endpoint-proxy.png +0 -0
- package/assets/icon.png +0 -0
- package/assets/mcp-generate-image.png +0 -0
- package/assets/mcp-understand-image.png +0 -0
- package/assets/peek-token-flow.png +0 -0
- package/assets/playground.png +0 -0
- package/assets/sankey.png +0 -0
- package/cli/index.ts +2805 -0
- package/cli/legacyRewrite.ts +108 -0
- package/cli/modelRef.ts +24 -0
- package/dist/cli/index.js +2536 -0
- package/dist/cli/legacyRewrite.js +92 -0
- package/dist/cli/modelRef.js +20 -0
- package/dist/src/benchmark/artifacts.js +131 -0
- package/dist/src/benchmark/capabilityClassifier.js +81 -0
- package/dist/src/benchmark/capabilityStore.js +144 -0
- package/dist/src/benchmark/config.js +238 -0
- package/dist/src/benchmark/gates.js +118 -0
- package/dist/src/benchmark/jobs.js +252 -0
- package/dist/src/benchmark/runner.js +1847 -0
- package/dist/src/benchmark/schema.js +353 -0
- package/dist/src/benchmark/suites.js +314 -0
- package/dist/src/benchmark/tinyQaDataset.js +422 -0
- package/dist/src/benchmark/types.js +25 -0
- package/dist/src/config.js +47 -0
- package/dist/src/index.js +178 -0
- package/dist/src/mcp/client.js +215 -0
- package/dist/src/mcp/discovery.js +226 -0
- package/dist/src/mcp/policy.js +65 -0
- package/dist/src/mcp/registry.js +129 -0
- package/dist/src/mcp/service.js +460 -0
- package/dist/src/middleware/auth.js +179 -0
- package/dist/src/middleware/requestCapture.js +192 -0
- package/dist/src/middleware/requestStats.js +118 -0
- package/dist/src/pools/builder.js +132 -0
- package/dist/src/pools/repository.js +69 -0
- package/dist/src/pools/scheduler.js +360 -0
- package/dist/src/pools/types.js +2 -0
- package/dist/src/protocols/adapters/dashscope.js +267 -0
- package/dist/src/protocols/adapters/inferenceV2.js +346 -0
- package/dist/src/protocols/adapters/openai.js +27 -0
- package/dist/src/protocols/registry.js +99 -0
- package/dist/src/protocols/types.js +2 -0
- package/dist/src/providers/health.js +153 -0
- package/dist/src/providers/importer.js +289 -0
- package/dist/src/providers/modelRegistry.js +313 -0
- package/dist/src/providers/repository.js +361 -0
- package/dist/src/providers/types.js +2 -0
- package/dist/src/routes/admin.js +531 -0
- package/dist/src/routes/audio.js +295 -0
- package/dist/src/routes/chat.js +240 -0
- package/dist/src/routes/embeddings.js +157 -0
- package/dist/src/routes/images.js +288 -0
- package/dist/src/routes/mcp.js +256 -0
- package/dist/src/routes/mcpService.js +100 -0
- package/dist/src/routes/models.js +48 -0
- package/dist/src/routes/responses.js +711 -0
- package/dist/src/routes/sessions.js +450 -0
- package/dist/src/routes/stats.js +270 -0
- package/dist/src/routes/ui.js +97 -0
- package/dist/src/routes/videos.js +107 -0
- package/dist/src/routing/router.js +338 -0
- package/dist/src/services/imageGeneration.js +280 -0
- package/dist/src/services/imageUnderstanding.js +352 -0
- package/dist/src/services/videoGeneration.js +79 -0
- package/dist/src/storage/captureRepository.js +1591 -0
- package/dist/src/storage/files.js +157 -0
- package/dist/src/storage/imageCache.js +346 -0
- package/dist/src/storage/repositories.js +388 -0
- package/dist/src/storage/sessionRepository.js +370 -0
- package/dist/src/storage/statsRepository.js +204 -0
- package/dist/src/transport/httpClient.js +126 -0
- package/dist/src/types.js +2 -0
- package/dist/src/utils/messageMedia.js +285 -0
- package/dist/src/utils/modelCapabilities.js +108 -0
- package/dist/src/utils/modelDiscovery.js +170 -0
- package/dist/src/version.js +5 -0
- package/dist/src/workers/captureRetention.js +25 -0
- package/dist/src/workers/configWatcher.js +91 -0
- package/dist/src/workers/healthChecker.js +21 -0
- package/dist/src/workers/statsRotation.js +41 -0
- package/docs/LLM/output_schema.md +312 -0
- package/docs/benchmark.md +208 -0
- package/docs/mcp-guidelines.md +125 -0
- package/docs/mcp-service.md +178 -0
- package/docs/opencode.md +86 -0
- package/docs/providers.md +79 -0
- package/examples/benchmark.config.yaml +28 -0
- package/examples/providers/alibaba-dashscope.yaml +88 -0
- package/examples/providers/alibaba-llm.yaml +64 -0
- package/examples/providers/alibaba-registry.yaml +7 -0
- package/examples/providers/inference-v2-ray.yaml +29 -0
- package/examples/scenarios/assets/omni-call-sample.wav +0 -0
- package/examples/scenarios/custom.jsonl +5 -0
- package/examples/scenarios/custom.yaml +40 -0
- package/model-form-v2.png +0 -0
- package/package.json +66 -0
- package/provider-form-v2.png +0 -0
- package/provider-form.png +0 -0
- package/scripts/manual-test.sh +11 -0
- package/scripts/version-from-git.js +23 -0
- package/src/benchmark/artifacts.ts +149 -0
- package/src/benchmark/capabilityClassifier.ts +99 -0
- package/src/benchmark/capabilityStore.ts +174 -0
- package/src/benchmark/config.ts +337 -0
- package/src/benchmark/gates.ts +164 -0
- package/src/benchmark/jobs.ts +312 -0
- package/src/benchmark/runner.ts +2519 -0
- package/src/benchmark/schema.ts +443 -0
- package/src/benchmark/suites.ts +323 -0
- package/src/benchmark/tinyQaDataset.ts +428 -0
- package/src/benchmark/types.ts +442 -0
- package/src/config.ts +44 -0
- package/src/index.ts +195 -0
- package/src/mcp/client.ts +305 -0
- package/src/mcp/discovery.ts +266 -0
- package/src/mcp/policy.ts +105 -0
- package/src/mcp/registry.ts +164 -0
- package/src/mcp/service.ts +611 -0
- package/src/middleware/auth.ts +251 -0
- package/src/middleware/requestCapture.ts +245 -0
- package/src/middleware/requestStats.ts +163 -0
- package/src/pools/builder.ts +159 -0
- package/src/pools/repository.ts +71 -0
- package/src/pools/scheduler.ts +425 -0
- package/src/pools/types.ts +117 -0
- package/src/protocols/adapters/dashscope.ts +335 -0
- package/src/protocols/adapters/inferenceV2.ts +428 -0
- package/src/protocols/adapters/openai.ts +32 -0
- package/src/protocols/registry.ts +117 -0
- package/src/protocols/types.ts +81 -0
- package/src/providers/health.ts +207 -0
- package/src/providers/importer.ts +402 -0
- package/src/providers/modelRegistry.ts +415 -0
- package/src/providers/repository.ts +439 -0
- package/src/providers/types.ts +113 -0
- package/src/routes/admin.ts +666 -0
- package/src/routes/audio.ts +372 -0
- package/src/routes/chat.ts +301 -0
- package/src/routes/embeddings.ts +197 -0
- package/src/routes/images.ts +356 -0
- package/src/routes/mcp.ts +320 -0
- package/src/routes/mcpService.ts +114 -0
- package/src/routes/models.ts +50 -0
- package/src/routes/responses.ts +872 -0
- package/src/routes/sessions.ts +558 -0
- package/src/routes/stats.ts +312 -0
- package/src/routes/ui.ts +96 -0
- package/src/routes/videos.ts +132 -0
- package/src/routing/router.ts +501 -0
- package/src/services/imageGeneration.ts +396 -0
- package/src/services/imageUnderstanding.ts +449 -0
- package/src/services/videoGeneration.ts +127 -0
- package/src/storage/captureRepository.ts +1835 -0
- package/src/storage/files.ts +178 -0
- package/src/storage/imageCache.ts +405 -0
- package/src/storage/repositories.ts +494 -0
- package/src/storage/sessionRepository.ts +419 -0
- package/src/storage/statsRepository.ts +238 -0
- package/src/transport/httpClient.ts +145 -0
- package/src/types.ts +322 -0
- package/src/utils/messageMedia.ts +293 -0
- package/src/utils/modelCapabilities.ts +161 -0
- package/src/utils/modelDiscovery.ts +203 -0
- package/src/workers/captureRetention.ts +25 -0
- package/src/workers/configWatcher.ts +115 -0
- package/src/workers/healthChecker.ts +22 -0
- package/src/workers/statsRotation.ts +49 -0
- package/tests/benchmarkAdminRoutes.test.ts +82 -0
- package/tests/benchmarkBasics.test.ts +116 -0
- package/tests/captureAdminRoutes.test.ts +420 -0
- package/tests/captureRepository.test.ts +797 -0
- package/tests/cliLegacyRewrite.test.ts +45 -0
- package/tests/imageGeneration.service.test.ts +107 -0
- package/tests/imageUnderstanding.service.test.ts +123 -0
- package/tests/mcpPolicy.test.ts +105 -0
- package/tests/mcpService.test.ts +1245 -0
- package/tests/modelRef.test.ts +23 -0
- package/tests/modelsRoutes.test.ts +154 -0
- package/tests/sessionMediaCache.test.ts +167 -0
- package/tests/statsRoutes.test.ts +323 -0
- package/tsconfig.json +15 -0
- package/ui/index.html +16 -0
- package/ui/package-lock.json +8521 -0
- package/ui/package.json +52 -0
- package/ui/postcss.config.js +6 -0
- package/ui/public/assets/apple-touch-icon.png +0 -0
- package/ui/public/assets/favicon-16.png +0 -0
- package/ui/public/assets/favicon-32.png +0 -0
- package/ui/public/assets/icon-192.png +0 -0
- package/ui/public/assets/icon-512.png +0 -0
- package/ui/src/App.tsx +27 -0
- package/ui/src/api/client.ts +1503 -0
- package/ui/src/components/EndpointUsageGuide.tsx +361 -0
- package/ui/src/components/Layout.tsx +124 -0
- package/ui/src/components/MessageContent.tsx +365 -0
- package/ui/src/components/ToolCallMessage.tsx +179 -0
- package/ui/src/components/ToolPicker.tsx +442 -0
- package/ui/src/components/messageContentParser.test.ts +41 -0
- package/ui/src/components/messageContentParser.ts +73 -0
- package/ui/src/components/thinkingPreview.test.ts +27 -0
- package/ui/src/components/thinkingPreview.ts +15 -0
- package/ui/src/components/toMermaidSankey.test.ts +78 -0
- package/ui/src/components/toMermaidSankey.ts +56 -0
- package/ui/src/components/ui/button.tsx +58 -0
- package/ui/src/components/ui/input.tsx +21 -0
- package/ui/src/components/ui/textarea.tsx +21 -0
- package/ui/src/lib/utils.ts +6 -0
- package/ui/src/main.tsx +9 -0
- package/ui/src/pages/AgentPlayground.tsx +2010 -0
- package/ui/src/pages/Benchmark.tsx +988 -0
- package/ui/src/pages/Dashboard.tsx +581 -0
- package/ui/src/pages/Peek.tsx +962 -0
- package/ui/src/pages/Settings.tsx +2013 -0
- package/ui/src/pages/agentPlaygroundPayload.test.ts +109 -0
- package/ui/src/pages/agentPlaygroundPayload.ts +97 -0
- package/ui/src/pages/agentThinkingContent.test.ts +50 -0
- package/ui/src/pages/agentThinkingContent.ts +57 -0
- package/ui/src/pages/dashboardTokenUsage.test.ts +66 -0
- package/ui/src/pages/dashboardTokenUsage.ts +36 -0
- package/ui/src/pages/imageUpload.test.ts +39 -0
- package/ui/src/pages/imageUpload.ts +71 -0
- package/ui/src/pages/peekFilters.test.ts +29 -0
- package/ui/src/pages/peekFilters.ts +13 -0
- package/ui/src/pages/peekMedia.test.ts +58 -0
- package/ui/src/pages/peekMedia.ts +148 -0
- package/ui/src/pages/sessionAutoTitle.test.ts +128 -0
- package/ui/src/pages/sessionAutoTitle.ts +106 -0
- package/ui/src/stores/settings.ts +58 -0
- package/ui/src/styles/globals.css +223 -0
- package/ui/src/vite-env.d.ts +8 -0
- package/ui/tailwind.config.js +106 -0
- package/ui/tsconfig.json +32 -0
- package/ui/vite.config.ts +37 -0
|
@@ -0,0 +1,988 @@
|
|
|
1
|
+
import { useEffect, useMemo, useRef, useState } from 'react'
|
|
2
|
+
import { Gauge, Loader2, Play, RefreshCw, MessageSquareText } from 'lucide-react'
|
|
3
|
+
import { Button } from '@/components/ui/button'
|
|
4
|
+
import { cn } from '@/lib/utils'
|
|
5
|
+
import {
|
|
6
|
+
BenchmarkCapabilityMatrix,
|
|
7
|
+
BenchmarkExampleSummary,
|
|
8
|
+
BenchmarkRunEvent,
|
|
9
|
+
BenchmarkRunRecord,
|
|
10
|
+
BenchmarkRunSummary,
|
|
11
|
+
getBenchmarkRun,
|
|
12
|
+
listBenchmarkCapabilities,
|
|
13
|
+
listBenchmarkExamples,
|
|
14
|
+
listBenchmarkRuns,
|
|
15
|
+
listModels,
|
|
16
|
+
startBenchmarkRun,
|
|
17
|
+
type Model,
|
|
18
|
+
} from '@/api/client'
|
|
19
|
+
|
|
20
|
+
/** Status literals for a benchmark run as used by this page. */
type RunStatus =
  | 'running'
  | 'completed'
  | 'failed'
|
|
21
|
+
|
|
22
|
+
/**
 * One row of the per-model leaderboard.
 *
 * Rows are produced by `aggregateModelLeaderboard` from completed run records
 * and kept in the `modelLeaderboard` state of the Benchmark page.
 */
type ModelLeaderboardRow = {
  /** Model identifier the row aggregates over. */
  model: string
  // Counters and aggregates; exact units/semantics are defined by the
  // aggregation helper (not shown here) — NOTE(review): confirm there.
  runCount: number
  scenarioCount: number
  avgPassRate: number
  avgP95LatencyMs: number
  totalTokens: number
  totalFailovers: number
}
|
|
31
|
+
|
|
32
|
+
/** A single entry of an exchange's tool trace: either a tool call or its result. */
type ShowcaseToolTraceEntry = {
  kind: 'tool_call' | 'tool_result'
  toolName: string
  toolCallId?: string
  argumentsText?: string
  contentText?: string
}

/**
 * Normalised view of one request/response exchange shown in the live trace.
 *
 * Instances are built either from streamed run events of type `exchange` or,
 * when no such events are available, from the exchanges stored in the selected
 * scenario's report detail.
 */
type ShowcaseExchange = {
  /** Synthetic key derived from a timestamp or scenario id plus the list index. */
  id: string
  timestamp?: string
  mode: string
  model: string
  scenarioInput: string
  requestPath: string
  statusCode: number
  contentType: string
  endpointName?: string
  upstreamModel?: string
  toolTrace: ShowcaseToolTraceEntry[]
  /** Request body as provided by the source (raw or sanitized); shape is not constrained here. */
  requestPayload: unknown
  /** Response body as provided by the source (raw or sanitized); shape is not constrained here. */
  responsePayload: unknown
}
|
|
53
|
+
|
|
54
|
+
/** Identifier of the showcase suite; also the initial value of the suite selector. */
const SHOWCASE_SUITE = 'showcase'

/** Identifiers of every non-showcase suite, in the order they are offered. */
const DIAGNOSTIC_SUITES = [
  'smoke',
  'proxy',
  'agent',
  'pool_smoke',
  'omni_call_smoke',
  'capabilities',
]

/** All selectable suites: the showcase suite first, followed by the diagnostic ones. */
const ALL_SUITES = [SHOWCASE_SUITE].concat(DIAGNOSTIC_SUITES)

/** Display labels keyed by suite identifier; consumers fall back to the raw id when a key is missing. */
const SUITE_LABELS: Record<string, string> = {
  showcase: 'Showcase',
  smoke: 'Smoke',
  proxy: 'Proxy',
  agent: 'Agent',
  pool_smoke: 'Pool Smoke',
  omni_call_smoke: 'Omni Call Smoke',
  capabilities: 'Capabilities',
}

/** Run profiles known to this page. */
const PROFILES = ['local', 'ci']
|
|
67
|
+
|
|
68
|
+
/**
 * Raw text of the generation-parameter form fields, exactly as typed.
 * An empty or whitespace-only string means "not set".
 */
type GenerationParamDraft = {
  temperature: string
  topP: string
  maxTokens: string
  presencePenalty: string
  frequencyPenalty: string
  seed: string
  /** Comma-separated list of stop sequences. */
  stop: string
}

/**
 * Validated generation parameters keyed with snake_case names.
 * Only parameters the user actually provided are present.
 */
type GenerationParamPayload = {
  temperature?: number
  top_p?: number
  max_tokens?: number
  presence_penalty?: number
  frequency_penalty?: number
  seed?: number
  /** A single stop sequence is sent as a string, several as an array. */
  stop?: string | string[]
}

/** Keys of {@link GenerationParamPayload} that hold a numeric value. */
type NumericGenerationParamKey =
  | 'temperature'
  | 'top_p'
  | 'max_tokens'
  | 'presence_penalty'
  | 'frequency_penalty'
  | 'seed'

/**
 * Converts the free-text generation-parameter draft into a typed payload.
 *
 * Blank fields are skipped. Each numeric field is validated independently; an
 * invalid field adds one message to `errors` and is left out of `payload`, so
 * the payload only ever contains values that passed validation.
 *
 * @param draft - Raw form values.
 * @returns The validated `payload` plus a list of human-readable validation
 *   `errors` (empty when every provided field is valid).
 */
function parseGenerationParamDraft(draft: GenerationParamDraft): { payload: GenerationParamPayload; errors: string[] } {
  const errors: string[] = []
  const payload: GenerationParamPayload = {}

  // `Number()` also accepts hex/binary/octal literals ("0x10" -> 16), which is
  // never what a user typing a sampling parameter means. Restrict the input to
  // plain decimal notation (optional sign, fraction and exponent).
  const decimalPattern = /^[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$/

  const parseNumber = (
    raw: string,
    fieldLabel: string,
    key: NumericGenerationParamKey,
    opts?: { min?: number; max?: number; integer?: boolean }
  ) => {
    const trimmed = raw.trim()
    if (!trimmed) return
    const parsed = decimalPattern.test(trimmed) ? Number(trimmed) : Number.NaN
    // The finiteness check also rejects overflowing literals such as "1e999".
    if (!Number.isFinite(parsed)) {
      errors.push(`${fieldLabel} must be a valid number.`)
      return
    }
    if (opts?.integer && !Number.isInteger(parsed)) {
      errors.push(`${fieldLabel} must be an integer.`)
      return
    }
    if (typeof opts?.min === 'number' && parsed < opts.min) {
      errors.push(`${fieldLabel} must be >= ${opts.min}.`)
      return
    }
    if (typeof opts?.max === 'number' && parsed > opts.max) {
      errors.push(`${fieldLabel} must be <= ${opts.max}.`)
      return
    }
    payload[key] = parsed
  }

  // Temperature previously had no bounds at all, so negative values were sent
  // upstream. The 0..2 range follows the same OpenAI-style limits already used
  // for the penalty fields below.
  parseNumber(draft.temperature, 'Temperature', 'temperature', { min: 0, max: 2 })
  parseNumber(draft.topP, 'Top P', 'top_p', { min: 0, max: 1 })
  parseNumber(draft.maxTokens, 'Max Tokens', 'max_tokens', { min: 1, integer: true })
  parseNumber(draft.presencePenalty, 'Presence Penalty', 'presence_penalty', { min: -2, max: 2 })
  parseNumber(draft.frequencyPenalty, 'Frequency Penalty', 'frequency_penalty', { min: -2, max: 2 })
  parseNumber(draft.seed, 'Seed', 'seed', { min: 0, integer: true })

  // Stop sequences: comma-separated, surrounding whitespace trimmed, blanks dropped.
  const stopValues = draft.stop
    .split(',')
    .map((value) => value.trim())
    .filter(Boolean)
  if (stopValues.length === 1) {
    payload.stop = stopValues[0]
  } else if (stopValues.length > 1) {
    payload.stop = stopValues
  }

  return { payload, errors }
}
|
|
147
|
+
|
|
148
|
+
export function Benchmark() {
|
|
149
|
+
const [runs, setRuns] = useState<BenchmarkRunSummary[]>([])
|
|
150
|
+
const [models, setModels] = useState<Model[]>([])
|
|
151
|
+
const [examples, setExamples] = useState<BenchmarkExampleSummary[]>([])
|
|
152
|
+
const [selectedRunId, setSelectedRunId] = useState<string | null>(null)
|
|
153
|
+
const [selectedRun, setSelectedRun] = useState<BenchmarkRunRecord | null>(null)
|
|
154
|
+
const [events, setEvents] = useState<BenchmarkRunEvent[]>([])
|
|
155
|
+
const [loading, setLoading] = useState(true)
|
|
156
|
+
const [starting, setStarting] = useState(false)
|
|
157
|
+
const [suite, setSuite] = useState('showcase')
|
|
158
|
+
const [profile, setProfile] = useState('local')
|
|
159
|
+
const [scenarioPath, setScenarioPath] = useState('')
|
|
160
|
+
const [selectedModel, setSelectedModel] = useState('')
|
|
161
|
+
const [selectedExampleId, setSelectedExampleId] = useState('')
|
|
162
|
+
const [advancedOpen, setAdvancedOpen] = useState(false)
|
|
163
|
+
const [updateCapCache, setUpdateCapCache] = useState(false)
|
|
164
|
+
const [capTtlDays, setCapTtlDays] = useState('7')
|
|
165
|
+
const [genParams, setGenParams] = useState<GenerationParamDraft>({
|
|
166
|
+
temperature: '',
|
|
167
|
+
topP: '',
|
|
168
|
+
maxTokens: '',
|
|
169
|
+
presencePenalty: '',
|
|
170
|
+
frequencyPenalty: '',
|
|
171
|
+
seed: '',
|
|
172
|
+
stop: '',
|
|
173
|
+
})
|
|
174
|
+
const [showRaw, setShowRaw] = useState(false)
|
|
175
|
+
const [error, setError] = useState<string | null>(null)
|
|
176
|
+
const [modelLeaderboard, setModelLeaderboard] = useState<ModelLeaderboardRow[]>([])
|
|
177
|
+
const [capabilityMatrix, setCapabilityMatrix] = useState<BenchmarkCapabilityMatrix | null>(null)
|
|
178
|
+
const eventSourceRef = useRef<EventSource | null>(null)
|
|
179
|
+
|
|
180
|
+
const loadRuns = async () => {
|
|
181
|
+
try {
|
|
182
|
+
const response = await listBenchmarkRuns()
|
|
183
|
+
setRuns(response.data)
|
|
184
|
+
if (!selectedRunId && response.data.length > 0) {
|
|
185
|
+
setSelectedRunId(response.data[0].id)
|
|
186
|
+
}
|
|
187
|
+
} catch (err) {
|
|
188
|
+
console.error('Failed to load benchmark runs:', err)
|
|
189
|
+
setError((err as Error).message)
|
|
190
|
+
} finally {
|
|
191
|
+
setLoading(false)
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
const loadCapabilities = async () => {
|
|
196
|
+
try {
|
|
197
|
+
const response = await listBenchmarkCapabilities(7)
|
|
198
|
+
setCapabilityMatrix(response)
|
|
199
|
+
} catch (err) {
|
|
200
|
+
console.error('Failed to load benchmark capabilities:', err)
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
const loadModels = async () => {
|
|
205
|
+
try {
|
|
206
|
+
const response = await listModels()
|
|
207
|
+
setModels(response.data)
|
|
208
|
+
} catch (err) {
|
|
209
|
+
console.error('Failed to load models:', err)
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
const loadExamples = async (suiteName: string) => {
|
|
214
|
+
try {
|
|
215
|
+
const response = await listBenchmarkExamples(suiteName)
|
|
216
|
+
setExamples(response.data)
|
|
217
|
+
setSelectedExampleId((current) => {
|
|
218
|
+
if (response.data.some((example) => example.id === current)) return current
|
|
219
|
+
return response.data[0]?.id ?? ''
|
|
220
|
+
})
|
|
221
|
+
} catch (err) {
|
|
222
|
+
console.error('Failed to load benchmark examples:', err)
|
|
223
|
+
setExamples([])
|
|
224
|
+
setSelectedExampleId('')
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
useEffect(() => {
|
|
229
|
+
void loadRuns()
|
|
230
|
+
void loadModels()
|
|
231
|
+
void loadCapabilities()
|
|
232
|
+
void loadExamples(suite)
|
|
233
|
+
const timer = setInterval(() => {
|
|
234
|
+
void loadRuns()
|
|
235
|
+
void loadCapabilities()
|
|
236
|
+
}, 5000)
|
|
237
|
+
return () => clearInterval(timer)
|
|
238
|
+
}, [])
|
|
239
|
+
|
|
240
|
+
useEffect(() => {
|
|
241
|
+
void loadExamples(suite)
|
|
242
|
+
}, [suite])
|
|
243
|
+
|
|
244
|
+
useEffect(() => {
|
|
245
|
+
if (suite === 'capabilities') {
|
|
246
|
+
setUpdateCapCache(true)
|
|
247
|
+
}
|
|
248
|
+
}, [suite])
|
|
249
|
+
|
|
250
|
+
useEffect(() => {
|
|
251
|
+
if (!selectedRunId) {
|
|
252
|
+
setSelectedRun(null)
|
|
253
|
+
setEvents([])
|
|
254
|
+
return
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
const loadRun = async () => {
|
|
258
|
+
try {
|
|
259
|
+
const run = await getBenchmarkRun(selectedRunId)
|
|
260
|
+
setSelectedRun(run)
|
|
261
|
+
setEvents((run.events ?? []).slice(-500))
|
|
262
|
+
} catch (err) {
|
|
263
|
+
console.error('Failed to load benchmark run:', err)
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
void loadRun()
|
|
268
|
+
const pollTimer = setInterval(() => {
|
|
269
|
+
void loadRun()
|
|
270
|
+
}, 2500)
|
|
271
|
+
|
|
272
|
+
eventSourceRef.current?.close()
|
|
273
|
+
eventSourceRef.current = null
|
|
274
|
+
|
|
275
|
+
const selectedSummary = runs.find((run) => run.id === selectedRunId)
|
|
276
|
+
if (selectedSummary?.status === 'running') {
|
|
277
|
+
const source = new EventSource(`/admin/benchmarks/runs/${encodeURIComponent(selectedRunId)}/events`)
|
|
278
|
+
source.onmessage = (message) => {
|
|
279
|
+
try {
|
|
280
|
+
const event = JSON.parse(message.data) as BenchmarkRunEvent
|
|
281
|
+
setEvents((prev) => [...prev, event].slice(-500))
|
|
282
|
+
} catch {
|
|
283
|
+
// Ignore malformed events.
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
source.onerror = () => {
|
|
287
|
+
source.close()
|
|
288
|
+
}
|
|
289
|
+
eventSourceRef.current = source
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
return () => {
|
|
293
|
+
clearInterval(pollTimer)
|
|
294
|
+
eventSourceRef.current?.close()
|
|
295
|
+
eventSourceRef.current = null
|
|
296
|
+
}
|
|
297
|
+
}, [selectedRunId, runs])
|
|
298
|
+
|
|
299
|
+
useEffect(() => {
|
|
300
|
+
const buildLeaderboard = async () => {
|
|
301
|
+
const completedRunIds = runs
|
|
302
|
+
.filter((run) => run.status === 'completed')
|
|
303
|
+
.slice(0, 20)
|
|
304
|
+
.map((run) => run.id)
|
|
305
|
+
const details = await Promise.all(
|
|
306
|
+
completedRunIds.map(async (id) => {
|
|
307
|
+
try {
|
|
308
|
+
return await getBenchmarkRun(id)
|
|
309
|
+
} catch {
|
|
310
|
+
return null
|
|
311
|
+
}
|
|
312
|
+
})
|
|
313
|
+
)
|
|
314
|
+
const rows = aggregateModelLeaderboard(details.filter((item): item is BenchmarkRunRecord => item !== null))
|
|
315
|
+
setModelLeaderboard(rows)
|
|
316
|
+
}
|
|
317
|
+
void buildLeaderboard()
|
|
318
|
+
}, [runs])
|
|
319
|
+
|
|
320
|
+
const startRun = async () => {
|
|
321
|
+
setStarting(true)
|
|
322
|
+
setError(null)
|
|
323
|
+
try {
|
|
324
|
+
const { payload: paramPayload, errors: paramErrors } = parseGenerationParamDraft(genParams)
|
|
325
|
+
if (paramErrors.length > 0) {
|
|
326
|
+
throw new Error(paramErrors[0])
|
|
327
|
+
}
|
|
328
|
+
const executionMode = isShowcase ? 'showcase' : 'diagnostic'
|
|
329
|
+
const parsedCapTtl = Number(capTtlDays.trim())
|
|
330
|
+
const run = await startBenchmarkRun({
|
|
331
|
+
suite,
|
|
332
|
+
exampleId: isShowcase && selectedExampleId ? selectedExampleId : undefined,
|
|
333
|
+
profile,
|
|
334
|
+
scenarioPath: scenarioPath.trim() || undefined,
|
|
335
|
+
modelOverride: selectedModel || undefined,
|
|
336
|
+
executionMode,
|
|
337
|
+
updateCapCache: updateCapCache || suite === 'capabilities',
|
|
338
|
+
capTtlDays:
|
|
339
|
+
capTtlDays.trim().length > 0 && Number.isFinite(parsedCapTtl) && parsedCapTtl >= 1
|
|
340
|
+
? Math.trunc(parsedCapTtl)
|
|
341
|
+
: 7,
|
|
342
|
+
...paramPayload,
|
|
343
|
+
})
|
|
344
|
+
setSelectedRunId(run.id)
|
|
345
|
+
await loadRuns()
|
|
346
|
+
await loadCapabilities()
|
|
347
|
+
} catch (err) {
|
|
348
|
+
setError((err as Error).message)
|
|
349
|
+
} finally {
|
|
350
|
+
setStarting(false)
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
const progress = useMemo(() => {
|
|
355
|
+
const total = selectedRun?.progress?.totalScenarios ?? 0
|
|
356
|
+
const complete = selectedRun?.progress?.completedScenarios ?? 0
|
|
357
|
+
const percent = total > 0 ? Math.min(100, Math.round((complete / total) * 100)) : 0
|
|
358
|
+
return { total, complete, percent }
|
|
359
|
+
}, [selectedRun])
|
|
360
|
+
|
|
361
|
+
const activeExample = useMemo(() => {
|
|
362
|
+
const reportDetails = selectedRun?.report?.scenarioDetails ?? []
|
|
363
|
+
const fromRun = reportDetails.find((detail) => detail.id === selectedExampleId)?.example ?? reportDetails[0]?.example
|
|
364
|
+
if (fromRun) return fromRun
|
|
365
|
+
return examples.find((example) => example.id === selectedExampleId) ?? examples[0] ?? null
|
|
366
|
+
}, [examples, selectedExampleId, selectedRun])
|
|
367
|
+
|
|
368
|
+
const activeScenarioDetail = useMemo(() => {
|
|
369
|
+
const details = selectedRun?.report?.scenarioDetails ?? []
|
|
370
|
+
if (details.length === 0) return null
|
|
371
|
+
return details.find((detail) => detail.id === selectedExampleId) ?? details[0]
|
|
372
|
+
}, [selectedExampleId, selectedRun])
|
|
373
|
+
|
|
374
|
+
const liveTrace = useMemo<ShowcaseExchange[]>(() => {
|
|
375
|
+
const traceEvents = events.filter((event) => event.type === 'exchange' && event.exchange)
|
|
376
|
+
if (traceEvents.length > 0) {
|
|
377
|
+
return traceEvents.map((event, index) => ({
|
|
378
|
+
id: `${event.timestamp}-${index}`,
|
|
379
|
+
timestamp: event.timestamp,
|
|
380
|
+
mode: event.exchange?.mode ?? 'unknown',
|
|
381
|
+
model: event.exchange?.model ?? 'unknown',
|
|
382
|
+
scenarioInput: event.exchange?.scenarioInput ?? '',
|
|
383
|
+
requestPath: event.exchange?.requestPath ?? '',
|
|
384
|
+
statusCode: event.exchange?.statusCode ?? 0,
|
|
385
|
+
contentType: event.exchange?.contentType ?? '',
|
|
386
|
+
endpointName: event.exchange?.endpointName,
|
|
387
|
+
upstreamModel: event.exchange?.upstreamModel,
|
|
388
|
+
toolTrace: event.exchange?.toolTrace ?? [],
|
|
389
|
+
requestPayload: showRaw ? event.exchange?.requestRaw : event.exchange?.requestSanitized,
|
|
390
|
+
responsePayload: showRaw ? event.exchange?.responseRaw : event.exchange?.responseSanitized,
|
|
391
|
+
}))
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
return (activeScenarioDetail?.exchanges ?? []).map((exchange, index) => ({
|
|
395
|
+
id: `${activeScenarioDetail?.id ?? 'detail'}-${index}`,
|
|
396
|
+
timestamp: exchange.timestamp,
|
|
397
|
+
mode: exchange.mode,
|
|
398
|
+
model: exchange.model,
|
|
399
|
+
scenarioInput: activeScenarioDetail?.example?.inputPreview ?? '',
|
|
400
|
+
requestPath: exchange.requestPath,
|
|
401
|
+
statusCode: exchange.statusCode,
|
|
402
|
+
contentType: exchange.contentType,
|
|
403
|
+
endpointName: exchange.endpointName,
|
|
404
|
+
upstreamModel: exchange.upstreamModel,
|
|
405
|
+
toolTrace: exchange.toolTrace,
|
|
406
|
+
requestPayload: exchange.requestSanitized,
|
|
407
|
+
responsePayload: exchange.responseSanitized,
|
|
408
|
+
}))
|
|
409
|
+
}, [activeScenarioDetail, events, showRaw])
|
|
410
|
+
|
|
411
|
+
const isShowcase = suite === SHOWCASE_SUITE
|
|
412
|
+
const selectedExample = useMemo(
|
|
413
|
+
() => examples.find((example) => example.id === selectedExampleId) ?? null,
|
|
414
|
+
[examples, selectedExampleId]
|
|
415
|
+
)
|
|
416
|
+
|
|
417
|
+
return (
|
|
418
|
+
<div className="flex-1 flex flex-col h-full min-h-0">
|
|
419
|
+
<header className="sticky top-0 z-20 h-14 border-b border-border bg-background/95 backdrop-blur flex items-center px-6 gap-4 shrink-0">
|
|
420
|
+
<div className="flex items-center gap-2">
|
|
421
|
+
<Gauge className="w-4 h-4 text-primary" />
|
|
422
|
+
<h2 className="font-mono font-semibold text-sm uppercase tracking-wider">Benchmark</h2>
|
|
423
|
+
</div>
|
|
424
|
+
<div className="text-xs text-muted-foreground font-mono">Live examples first, diagnostics second</div>
|
|
425
|
+
<div className="flex-1" />
|
|
426
|
+
<Button variant="outline" size="sm" onClick={() => { void loadRuns(); void loadModels(); void loadExamples(suite) }} disabled={loading}>
|
|
427
|
+
<RefreshCw className={cn('w-3 h-3 mr-2', loading && 'animate-spin')} />
|
|
428
|
+
Refresh
|
|
429
|
+
</Button>
|
|
430
|
+
</header>
|
|
431
|
+
|
|
432
|
+
<div className="flex-1 min-h-0 grid grid-cols-[320px_1fr] gap-0">
|
|
433
|
+
<aside className="border-r border-border p-4 space-y-4 overflow-auto">
|
|
434
|
+
<div className="panel">
|
|
435
|
+
<div className="panel-header">
|
|
436
|
+
<span className="panel-title">Run Setup</span>
|
|
437
|
+
</div>
|
|
438
|
+
<div className="p-3 space-y-3">
|
|
439
|
+
<label className="text-xs text-muted-foreground block">
|
|
440
|
+
Suite
|
|
441
|
+
<select
|
|
442
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
443
|
+
value={suite}
|
|
444
|
+
onChange={(event) => setSuite(event.target.value)}
|
|
445
|
+
>
|
|
446
|
+
{ALL_SUITES.map((item) => (
|
|
447
|
+
<option key={item} value={item}>{SUITE_LABELS[item] ?? item}</option>
|
|
448
|
+
))}
|
|
449
|
+
</select>
|
|
450
|
+
</label>
|
|
451
|
+
|
|
452
|
+
{isShowcase ? (
|
|
453
|
+
<div className="rounded border border-border/70 bg-secondary/20 p-2">
|
|
454
|
+
<p className="text-2xs uppercase text-muted-foreground">Question Source</p>
|
|
455
|
+
<p className="text-xs font-mono">vincentkoc/tiny_qa_benchmark (Hugging Face)</p>
|
|
456
|
+
</div>
|
|
457
|
+
) : null}
|
|
458
|
+
|
|
459
|
+
{isShowcase && (
|
|
460
|
+
<label className="text-xs text-muted-foreground block">
|
|
461
|
+
Example
|
|
462
|
+
<select
|
|
463
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
464
|
+
value={selectedExampleId}
|
|
465
|
+
onChange={(event) => setSelectedExampleId(event.target.value)}
|
|
466
|
+
>
|
|
467
|
+
{examples.map((example) => (
|
|
468
|
+
<option key={example.id} value={example.id}>
|
|
469
|
+
{example.title}
|
|
470
|
+
</option>
|
|
471
|
+
))}
|
|
472
|
+
{examples.length === 0 && <option value="">No examples</option>}
|
|
473
|
+
</select>
|
|
474
|
+
</label>
|
|
475
|
+
)}
|
|
476
|
+
|
|
477
|
+
<div className="grid grid-cols-2 gap-2">
|
|
478
|
+
<label className="text-xs text-muted-foreground block">
|
|
479
|
+
Temperature
|
|
480
|
+
<input
|
|
481
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
482
|
+
value={genParams.temperature}
|
|
483
|
+
onChange={(event) =>
|
|
484
|
+
setGenParams((prev) => ({ ...prev, temperature: event.target.value }))
|
|
485
|
+
}
|
|
486
|
+
placeholder="e.g. 0.7"
|
|
487
|
+
/>
|
|
488
|
+
</label>
|
|
489
|
+
<label className="text-xs text-muted-foreground block">
|
|
490
|
+
Top P
|
|
491
|
+
<input
|
|
492
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
493
|
+
value={genParams.topP}
|
|
494
|
+
onChange={(event) =>
|
|
495
|
+
setGenParams((prev) => ({ ...prev, topP: event.target.value }))
|
|
496
|
+
}
|
|
497
|
+
placeholder="e.g. 1"
|
|
498
|
+
/>
|
|
499
|
+
</label>
|
|
500
|
+
<label className="text-xs text-muted-foreground block">
|
|
501
|
+
Max Tokens
|
|
502
|
+
<input
|
|
503
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
504
|
+
value={genParams.maxTokens}
|
|
505
|
+
onChange={(event) =>
|
|
506
|
+
setGenParams((prev) => ({ ...prev, maxTokens: event.target.value }))
|
|
507
|
+
}
|
|
508
|
+
placeholder="e.g. 512"
|
|
509
|
+
/>
|
|
510
|
+
</label>
|
|
511
|
+
<label className="text-xs text-muted-foreground block">
|
|
512
|
+
Presence Penalty
|
|
513
|
+
<input
|
|
514
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
515
|
+
value={genParams.presencePenalty}
|
|
516
|
+
onChange={(event) =>
|
|
517
|
+
setGenParams((prev) => ({ ...prev, presencePenalty: event.target.value }))
|
|
518
|
+
}
|
|
519
|
+
placeholder="-2 to 2"
|
|
520
|
+
/>
|
|
521
|
+
</label>
|
|
522
|
+
<label className="text-xs text-muted-foreground block">
|
|
523
|
+
Frequency Penalty
|
|
524
|
+
<input
|
|
525
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
526
|
+
value={genParams.frequencyPenalty}
|
|
527
|
+
onChange={(event) =>
|
|
528
|
+
setGenParams((prev) => ({ ...prev, frequencyPenalty: event.target.value }))
|
|
529
|
+
}
|
|
530
|
+
placeholder="-2 to 2"
|
|
531
|
+
/>
|
|
532
|
+
</label>
|
|
533
|
+
<label className="text-xs text-muted-foreground block">
|
|
534
|
+
Seed
|
|
535
|
+
<input
|
|
536
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
537
|
+
value={genParams.seed}
|
|
538
|
+
onChange={(event) =>
|
|
539
|
+
setGenParams((prev) => ({ ...prev, seed: event.target.value }))
|
|
540
|
+
}
|
|
541
|
+
placeholder="integer"
|
|
542
|
+
/>
|
|
543
|
+
</label>
|
|
544
|
+
</div>
|
|
545
|
+
|
|
546
|
+
<label className="text-xs text-muted-foreground block">
|
|
547
|
+
Stop Sequences (comma-separated)
|
|
548
|
+
<input
|
|
549
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
550
|
+
value={genParams.stop}
|
|
551
|
+
onChange={(event) =>
|
|
552
|
+
setGenParams((prev) => ({ ...prev, stop: event.target.value }))
|
|
553
|
+
}
|
|
554
|
+
placeholder="END, STOP"
|
|
555
|
+
/>
|
|
556
|
+
</label>
|
|
557
|
+
|
|
558
|
+
<button
|
|
559
|
+
type="button"
|
|
560
|
+
className="w-full text-left text-xs font-mono text-muted-foreground border border-border rounded px-2 py-1 hover:bg-secondary/50"
|
|
561
|
+
onClick={() => setAdvancedOpen((prev) => !prev)}
|
|
562
|
+
>
|
|
563
|
+
{advancedOpen ? 'Hide Advanced' : 'Show Advanced'}
|
|
564
|
+
</button>
|
|
565
|
+
|
|
566
|
+
{advancedOpen && (
|
|
567
|
+
<div className="space-y-3 rounded border border-border/60 bg-secondary/20 p-2">
|
|
568
|
+
<label className="text-xs text-muted-foreground block">
|
|
569
|
+
Profile
|
|
570
|
+
<select
|
|
571
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
572
|
+
value={profile}
|
|
573
|
+
onChange={(event) => setProfile(event.target.value)}
|
|
574
|
+
>
|
|
575
|
+
{PROFILES.map((item) => (
|
|
576
|
+
<option key={item} value={item}>{item}</option>
|
|
577
|
+
))}
|
|
578
|
+
</select>
|
|
579
|
+
</label>
|
|
580
|
+
|
|
581
|
+
<label className="text-xs text-muted-foreground block">
|
|
582
|
+
Model Override
|
|
583
|
+
<select
|
|
584
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
585
|
+
value={selectedModel}
|
|
586
|
+
onChange={(event) => setSelectedModel(event.target.value)}
|
|
587
|
+
>
|
|
588
|
+
<option value="">(auto)</option>
|
|
589
|
+
{models.map((model) => (
|
|
590
|
+
<option key={model.id} value={model.id}>{model.id}</option>
|
|
591
|
+
))}
|
|
592
|
+
</select>
|
|
593
|
+
</label>
|
|
594
|
+
|
|
595
|
+
<label className="text-xs text-muted-foreground block">
|
|
596
|
+
Scenario File (Optional)
|
|
597
|
+
<input
|
|
598
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
599
|
+
value={scenarioPath}
|
|
600
|
+
onChange={(event) => setScenarioPath(event.target.value)}
|
|
601
|
+
placeholder="./examples/scenarios/custom.yaml"
|
|
602
|
+
/>
|
|
603
|
+
</label>
|
|
604
|
+
|
|
605
|
+
<label className="text-xs text-muted-foreground block">
|
|
606
|
+
Capability TTL Days
|
|
607
|
+
<input
|
|
608
|
+
className="mt-1 w-full bg-input border border-border rounded px-2 py-1 text-sm font-mono"
|
|
609
|
+
value={capTtlDays}
|
|
610
|
+
onChange={(event) => setCapTtlDays(event.target.value)}
|
|
611
|
+
placeholder="7"
|
|
612
|
+
/>
|
|
613
|
+
</label>
|
|
614
|
+
|
|
615
|
+
<label className="flex items-center gap-2 text-xs text-muted-foreground">
|
|
616
|
+
<input
|
|
617
|
+
type="checkbox"
|
|
618
|
+
checked={updateCapCache}
|
|
619
|
+
onChange={(event) => setUpdateCapCache(event.target.checked)}
|
|
620
|
+
/>
|
|
621
|
+
Update Capability Cache
|
|
622
|
+
</label>
|
|
623
|
+
</div>
|
|
624
|
+
)}
|
|
625
|
+
|
|
626
|
+
<p className="text-2xs text-muted-foreground">
|
|
627
|
+
{isShowcase
|
|
628
|
+
? `Showcase runs one Tiny QA question at a time. Expected answer: ${selectedExample?.successCriteria ?? 'n/a'}`
|
|
629
|
+
: 'Diagnostic suites keep pass-rate, latency, and capability diagnostics.'}
|
|
630
|
+
</p>
|
|
631
|
+
|
|
632
|
+
<Button className="w-full" onClick={startRun} disabled={starting || (isShowcase && !selectedExampleId)}>
|
|
633
|
+
{starting ? <Loader2 className="w-4 h-4 animate-spin mr-2" /> : <Play className="w-4 h-4 mr-2" />}
|
|
634
|
+
{isShowcase ? 'Run Showcase' : 'Run Diagnostic'}
|
|
635
|
+
</Button>
|
|
636
|
+
{error && <p className="text-xs text-destructive">{error}</p>}
|
|
637
|
+
</div>
|
|
638
|
+
</div>
|
|
639
|
+
|
|
640
|
+
<div className="panel">
|
|
641
|
+
<div className="panel-header">
|
|
642
|
+
<span className="panel-title">Runs</span>
|
|
643
|
+
</div>
|
|
644
|
+
<div className="max-h-[56vh] overflow-auto divide-y divide-border">
|
|
645
|
+
{runs.map((run) => (
|
|
646
|
+
<button
|
|
647
|
+
key={run.id}
|
|
648
|
+
className={cn(
|
|
649
|
+
'w-full text-left px-3 py-2 hover:bg-secondary/50 transition-colors',
|
|
650
|
+
selectedRunId === run.id && 'bg-secondary'
|
|
651
|
+
)}
|
|
652
|
+
onClick={() => setSelectedRunId(run.id)}
|
|
653
|
+
>
|
|
654
|
+
<p className="text-xs font-mono truncate">{run.id}</p>
|
|
655
|
+
<p className="text-2xs text-muted-foreground">{run.suite ?? 'custom'}{run.exampleId ? ` • ${run.exampleId}` : ''}</p>
|
|
656
|
+
<p className={cn('text-2xs uppercase font-mono', statusClass(run.status))}>{run.status}</p>
|
|
657
|
+
</button>
|
|
658
|
+
))}
|
|
659
|
+
{runs.length === 0 && (
|
|
660
|
+
<div className="px-3 py-4 text-xs text-muted-foreground">No benchmark runs yet.</div>
|
|
661
|
+
)}
|
|
662
|
+
</div>
|
|
663
|
+
</div>
|
|
664
|
+
</aside>
|
|
665
|
+
|
|
666
|
+
<section className="min-h-0 overflow-auto p-6 space-y-6">
|
|
667
|
+
<div className="panel">
|
|
668
|
+
<div className="panel-header">
|
|
669
|
+
<span className="panel-title">Progress</span>
|
|
670
|
+
<span className="text-2xs text-muted-foreground ml-auto">
|
|
671
|
+
{progress.complete}/{progress.total}
|
|
672
|
+
</span>
|
|
673
|
+
</div>
|
|
674
|
+
<div className="p-4 space-y-2">
|
|
675
|
+
<div className="w-full h-2 rounded bg-secondary overflow-hidden">
|
|
676
|
+
<div className="h-full bg-primary transition-all duration-300" style={{ width: `${progress.percent}%` }} />
|
|
677
|
+
</div>
|
|
678
|
+
<p className="text-xs text-muted-foreground font-mono">
|
|
679
|
+
{selectedRun?.progress?.currentScenarioId
|
|
680
|
+
? `Current: ${selectedRun.progress.currentScenarioId}`
|
|
681
|
+
: 'Idle'}
|
|
682
|
+
</p>
|
|
683
|
+
</div>
|
|
684
|
+
</div>
|
|
685
|
+
|
|
686
|
+
<div className="grid grid-cols-2 gap-6">
|
|
687
|
+
<div className="panel min-h-[320px]">
|
|
688
|
+
<div className="panel-header">
|
|
689
|
+
<span className="panel-title">What This Demonstrates</span>
|
|
690
|
+
</div>
|
|
691
|
+
<div className="p-4 space-y-3 text-sm">
|
|
692
|
+
{activeExample ? (
|
|
693
|
+
<>
|
|
694
|
+
<div>
|
|
695
|
+
<p className="text-2xs uppercase text-muted-foreground mb-1">Title</p>
|
|
696
|
+
<p className="font-medium">{activeExample.title}</p>
|
|
697
|
+
</div>
|
|
698
|
+
<div>
|
|
699
|
+
<p className="text-2xs uppercase text-muted-foreground mb-1">Goal</p>
|
|
700
|
+
<p>{activeExample.userVisibleGoal}</p>
|
|
701
|
+
</div>
|
|
702
|
+
<div>
|
|
703
|
+
<p className="text-2xs uppercase text-muted-foreground mb-1">Input</p>
|
|
704
|
+
<pre className="text-xs font-mono whitespace-pre-wrap break-words">{activeExample.inputPreview}</pre>
|
|
705
|
+
</div>
|
|
706
|
+
<div>
|
|
707
|
+
<p className="text-2xs uppercase text-muted-foreground mb-1">Success</p>
|
|
708
|
+
<p>{activeExample.successCriteria}</p>
|
|
709
|
+
</div>
|
|
710
|
+
<div>
|
|
711
|
+
<p className="text-2xs uppercase text-muted-foreground mb-1">Expected Highlights</p>
|
|
712
|
+
<div className="flex flex-wrap gap-2">
|
|
713
|
+
{activeExample.expectedHighlights.map((item) => (
|
|
714
|
+
<span key={item} className="rounded border border-border px-2 py-1 text-2xs font-mono text-muted-foreground">{item}</span>
|
|
715
|
+
))}
|
|
716
|
+
</div>
|
|
717
|
+
</div>
|
|
718
|
+
</>
|
|
719
|
+
) : (
|
|
720
|
+
<p className="text-sm text-muted-foreground">No showcase example selected.</p>
|
|
721
|
+
)}
|
|
722
|
+
</div>
|
|
723
|
+
</div>
|
|
724
|
+
|
|
725
|
+
<div className="panel min-h-[320px]">
|
|
726
|
+
<div className="panel-header">
|
|
727
|
+
<span className="panel-title">Verdict</span>
|
|
728
|
+
</div>
|
|
729
|
+
<div className="p-4 space-y-3 text-sm">
|
|
730
|
+
{activeScenarioDetail ? (
|
|
731
|
+
<>
|
|
732
|
+
<div className="flex items-center gap-2 flex-wrap">
|
|
733
|
+
<span className={cn(
|
|
734
|
+
'rounded border px-2 py-1 text-2xs font-mono',
|
|
735
|
+
activeScenarioDetail.status === 'passed'
|
|
736
|
+
? 'border-emerald-500/40 text-emerald-300 bg-emerald-500/10'
|
|
737
|
+
: activeScenarioDetail.status === 'skipped'
|
|
738
|
+
? 'border-amber-500/40 text-amber-300 bg-amber-500/10'
|
|
739
|
+
: 'border-red-500/40 text-red-300 bg-red-500/10'
|
|
740
|
+
)}>
|
|
741
|
+
{activeScenarioDetail.status}
|
|
742
|
+
</span>
|
|
743
|
+
<span className="text-2xs font-mono text-muted-foreground">{activeScenarioDetail.model}</span>
|
|
744
|
+
</div>
|
|
745
|
+
<div>
|
|
746
|
+
<p className="text-2xs uppercase text-muted-foreground mb-1">Reason</p>
|
|
747
|
+
<p>{activeScenarioDetail.verdict}</p>
|
|
748
|
+
</div>
|
|
749
|
+
<div>
|
|
750
|
+
<p className="text-2xs uppercase text-muted-foreground mb-1">Final Response</p>
|
|
751
|
+
<pre className="text-xs font-mono whitespace-pre-wrap break-words">{activeScenarioDetail.finalResponsePreview || 'n/a'}</pre>
|
|
752
|
+
</div>
|
|
753
|
+
<div>
|
|
754
|
+
<p className="text-2xs uppercase text-muted-foreground mb-1">Tools Used</p>
|
|
755
|
+
<p className="text-xs font-mono">{activeScenarioDetail.usedToolNames.length > 0 ? activeScenarioDetail.usedToolNames.join(', ') : 'none'}</p>
|
|
756
|
+
</div>
|
|
757
|
+
</>
|
|
758
|
+
) : (
|
|
759
|
+
<p className="text-sm text-muted-foreground">Run an example to capture a verdict.</p>
|
|
760
|
+
)}
|
|
761
|
+
</div>
|
|
762
|
+
</div>
|
|
763
|
+
</div>
|
|
764
|
+
|
|
765
|
+
<div className="panel min-h-[420px]">
|
|
766
|
+
<div className="panel-header">
|
|
767
|
+
<MessageSquareText className="w-4 h-4 text-muted-foreground" />
|
|
768
|
+
<span className="panel-title">Live Show</span>
|
|
769
|
+
<button
|
|
770
|
+
className="ml-auto text-2xs px-2 py-1 rounded bg-secondary hover:bg-secondary/80"
|
|
771
|
+
onClick={() => setShowRaw((prev) => !prev)}
|
|
772
|
+
>
|
|
773
|
+
{showRaw ? 'Raw' : 'Sanitized'}
|
|
774
|
+
</button>
|
|
775
|
+
</div>
|
|
776
|
+
<div className="p-4 space-y-3 max-h-[640px] overflow-auto">
|
|
777
|
+
{liveTrace.map((exchange) => (
|
|
778
|
+
<TraceCard key={exchange.id} exchange={exchange} />
|
|
779
|
+
))}
|
|
780
|
+
{liveTrace.length === 0 && (
|
|
781
|
+
<p className="text-xs text-muted-foreground">No request/response trace yet.</p>
|
|
782
|
+
)}
|
|
783
|
+
</div>
|
|
784
|
+
</div>
|
|
785
|
+
|
|
786
|
+
<div className="grid grid-cols-2 gap-6">
|
|
787
|
+
<div className="panel min-h-[320px]">
|
|
788
|
+
<div className="panel-header">
|
|
789
|
+
<span className="panel-title">Model Leaderboard (Diagnostics)</span>
|
|
790
|
+
</div>
|
|
791
|
+
<div className="p-4 overflow-auto max-h-[360px]">
|
|
792
|
+
<table className="w-full text-xs">
|
|
793
|
+
<thead className="text-muted-foreground">
|
|
794
|
+
<tr>
|
|
795
|
+
<th className="text-left py-1">Model</th>
|
|
796
|
+
<th className="text-right py-1">Runs</th>
|
|
797
|
+
<th className="text-right py-1">Scenarios</th>
|
|
798
|
+
<th className="text-right py-1">Pass</th>
|
|
799
|
+
<th className="text-right py-1">P95</th>
|
|
800
|
+
<th className="text-right py-1">Tokens</th>
|
|
801
|
+
<th className="text-right py-1">Failovers</th>
|
|
802
|
+
</tr>
|
|
803
|
+
</thead>
|
|
804
|
+
<tbody>
|
|
805
|
+
{modelLeaderboard.map((row) => (
|
|
806
|
+
<tr key={row.model} className="border-t border-border/40">
|
|
807
|
+
<td className="py-1 pr-2 font-mono">{row.model}</td>
|
|
808
|
+
<td className="py-1 text-right">{row.runCount}</td>
|
|
809
|
+
<td className="py-1 text-right">{row.scenarioCount}</td>
|
|
810
|
+
<td className="py-1 text-right">{`${Math.round(row.avgPassRate * 100)}%`}</td>
|
|
811
|
+
<td className="py-1 text-right">{`${Math.round(row.avgP95LatencyMs)}ms`}</td>
|
|
812
|
+
<td className="py-1 text-right">{row.totalTokens}</td>
|
|
813
|
+
<td className="py-1 text-right">{row.totalFailovers}</td>
|
|
814
|
+
</tr>
|
|
815
|
+
))}
|
|
816
|
+
{modelLeaderboard.length === 0 && (
|
|
817
|
+
<tr>
|
|
818
|
+
<td colSpan={7} className="py-3 text-center text-muted-foreground">
|
|
819
|
+
No model history available yet.
|
|
820
|
+
</td>
|
|
821
|
+
</tr>
|
|
822
|
+
)}
|
|
823
|
+
</tbody>
|
|
824
|
+
</table>
|
|
825
|
+
</div>
|
|
826
|
+
</div>
|
|
827
|
+
|
|
828
|
+
<div className="panel min-h-[320px]">
|
|
829
|
+
<div className="panel-header">
|
|
830
|
+
<span className="panel-title">Capabilities (Diagnostics)</span>
|
|
831
|
+
<span className="text-2xs text-muted-foreground ml-auto">
|
|
832
|
+
TTL {capabilityMatrix?.ttlDays ?? 7}d
|
|
833
|
+
</span>
|
|
834
|
+
</div>
|
|
835
|
+
<div className="p-4 overflow-auto max-h-[360px]">
|
|
836
|
+
<table className="w-full text-xs">
|
|
837
|
+
<thead className="text-muted-foreground">
|
|
838
|
+
<tr>
|
|
839
|
+
<th className="text-left py-1">Model</th>
|
|
840
|
+
<th className="text-left py-1">Freshness</th>
|
|
841
|
+
<th className="text-left py-1">Chat</th>
|
|
842
|
+
<th className="text-left py-1">Tools</th>
|
|
843
|
+
<th className="text-left py-1">Embed</th>
|
|
844
|
+
<th className="text-left py-1">Image</th>
|
|
845
|
+
</tr>
|
|
846
|
+
</thead>
|
|
847
|
+
<tbody>
|
|
848
|
+
{(capabilityMatrix?.models ?? []).map((model) => (
|
|
849
|
+
<tr key={model.model} className="border-t border-border/40">
|
|
850
|
+
<td className="py-1 pr-2 font-mono">{model.model}</td>
|
|
851
|
+
<td className={cn('py-1', model.freshness === 'fresh' ? 'text-success' : 'text-warning')}>
|
|
852
|
+
{model.freshness}
|
|
853
|
+
</td>
|
|
854
|
+
<td className="py-1">{model.findings.chat_basic.status}</td>
|
|
855
|
+
<td className="py-1">{model.findings.chat_tool_calls.status}</td>
|
|
856
|
+
<td className="py-1">{model.findings.embeddings.status}</td>
|
|
857
|
+
<td className="py-1">{model.findings.images_generation.status}</td>
|
|
858
|
+
</tr>
|
|
859
|
+
))}
|
|
860
|
+
{(capabilityMatrix?.models.length ?? 0) === 0 && (
|
|
861
|
+
<tr>
|
|
862
|
+
<td colSpan={6} className="py-3 text-center text-muted-foreground">
|
|
863
|
+
No capability snapshots yet. Run suite "capabilities" to populate cache.
|
|
864
|
+
</td>
|
|
865
|
+
</tr>
|
|
866
|
+
)}
|
|
867
|
+
</tbody>
|
|
868
|
+
</table>
|
|
869
|
+
</div>
|
|
870
|
+
</div>
|
|
871
|
+
</div>
|
|
872
|
+
</section>
|
|
873
|
+
</div>
|
|
874
|
+
</div>
|
|
875
|
+
)
|
|
876
|
+
}
|
|
877
|
+
|
|
878
|
+
/**
 * Card for a single traced exchange.
 *
 * Renders, top to bottom: a metadata line (timestamp, mode, model and, when
 * present, the endpoint name), the scenario input, the wire request, the tool
 * trace (only when it has at least one step) and the response with its status
 * code and content type.
 */
function TraceCard({ exchange }: { exchange: ShowcaseExchange }) {
  const {
    timestamp,
    mode,
    model,
    endpointName,
    scenarioInput,
    requestPath,
    requestPayload,
    toolTrace,
    statusCode,
    contentType,
    responsePayload,
  } = exchange

  // Class lists shared by several sections of the card.
  const sectionClass = 'bg-secondary/30 rounded p-2'
  const headingClass = 'text-2xs uppercase text-muted-foreground mb-1'
  const preClass = 'text-2xs font-mono whitespace-pre-wrap break-words'
  const toolPreClass = 'text-2xs font-mono whitespace-pre-wrap break-words mt-1'

  // Steps have no id of their own, so the key combines kind, tool name and position.
  const toolSteps = toolTrace.map((step, index) => (
    <div key={`${step.kind}-${step.toolName}-${index}`} className="border border-border/60 rounded p-2">
      <p className="text-2xs font-mono text-muted-foreground">{step.kind} • {step.toolName}</p>
      {step.argumentsText && <pre className={toolPreClass}>{step.argumentsText}</pre>}
      {step.contentText && <pre className={toolPreClass}>{step.contentText}</pre>}
    </div>
  ))

  return (
    <div className="border border-border rounded-md p-3 space-y-3">
      <div className="flex items-center gap-2 flex-wrap text-2xs text-muted-foreground font-mono">
        <span>{timestamp ?? 'saved-trace'}</span>
        <span>•</span>
        <span>{mode}</span>
        <span>•</span>
        <span>{model}</span>
        {endpointName && (
          <>
            <span>•</span>
            <span>{endpointName}</span>
          </>
        )}
      </div>

      <div className={sectionClass}>
        <p className={headingClass}>Scenario Input</p>
        <pre className={preClass}>{scenarioInput}</pre>
      </div>

      <div className={sectionClass}>
        <p className={headingClass}>Wire Request {requestPath}</p>
        <pre className={preClass}>{safeStringify(requestPayload)}</pre>
      </div>

      {toolTrace.length > 0 && (
        <div className="bg-secondary/30 rounded p-2 space-y-2">
          <p className="text-2xs uppercase text-muted-foreground">Tool Trace</p>
          {toolSteps}
        </div>
      )}

      <div className={sectionClass}>
        <p className={headingClass}>
          Response {statusCode} ({contentType || 'unknown'})
        </p>
        <pre className={preClass}>{safeStringify(responsePayload)}</pre>
      </div>
    </div>
  )
}
|
|
927
|
+
|
|
928
|
+
/**
 * Collapse the per-scenario results of every run into one leaderboard row per model.
 *
 * Results whose status is `'skipped'` and results without a model name are
 * ignored. For each remaining model the row reports the number of distinct
 * runs it appeared in, the number of counted scenarios, the mean pass rate,
 * the mean p95 latency, and the summed token and failover counts.
 *
 * @param runs Benchmark runs; a run without a report contributes nothing.
 * @returns Rows ordered by mean pass rate (highest first), ties broken by
 *          mean p95 latency (lowest first).
 */
function aggregateModelLeaderboard(runs: BenchmarkRunRecord[]): ModelLeaderboardRow[] {
  type ModelTotals = {
    runIds: Set<string>
    scenarios: number
    passRateSum: number
    p95Sum: number
    tokens: number
    failovers: number
  }

  const totalsByModel = new Map<string, ModelTotals>()

  for (const run of runs) {
    for (const result of run.report?.results ?? []) {
      if (result.status === 'skipped') continue

      const modelName = String(result.model ?? '')
      if (modelName === '') continue

      // Get-or-create the accumulator so the map keeps first-seen order.
      let totals = totalsByModel.get(modelName)
      if (totals === undefined) {
        totals = {
          runIds: new Set<string>(),
          scenarios: 0,
          passRateSum: 0,
          p95Sum: 0,
          tokens: 0,
          failovers: 0,
        }
        totalsByModel.set(modelName, totals)
      }

      totals.runIds.add(run.id)
      totals.scenarios += 1
      totals.passRateSum += Number(result.passRate ?? 0)
      totals.p95Sum += Number(result.p95LatencyMs ?? 0)
      totals.tokens += Number(result.totalTokens ?? 0)
      totals.failovers += Number(result.failovers ?? 0)
    }
  }

  const mean = (sum: number, count: number): number => (count > 0 ? sum / count : 0)

  const rows = Array.from(totalsByModel, ([model, totals]) => ({
    model,
    runCount: totals.runIds.size,
    scenarioCount: totals.scenarios,
    avgPassRate: mean(totals.passRateSum, totals.scenarios),
    avgP95LatencyMs: mean(totals.p95Sum, totals.scenarios),
    totalTokens: totals.tokens,
    totalFailovers: totals.failovers,
  }))

  return rows.sort((left, right) =>
    left.avgPassRate === right.avgPassRate
      ? left.avgP95LatencyMs - right.avgP95LatencyMs
      : right.avgPassRate - left.avgPassRate
  )
}
|
|
975
|
+
|
|
976
|
+
/**
 * Turn an arbitrary payload into display text without ever throwing.
 *
 * - Serializable values are returned as JSON indented by two spaces.
 * - Values `JSON.stringify` has no representation for (`undefined`,
 *   functions, symbols) produce an empty string, so the declared `string`
 *   return type actually holds.
 * - Values that make `JSON.stringify` throw (e.g. circular structures or
 *   BigInt) fall back to `String(payload)`.
 *
 * @param payload Any value to show in a trace view.
 * @returns Text for the payload; never `undefined`.
 */
function safeStringify(payload: unknown): string {
  try {
    // The lib typing says `string`, but at runtime the result is `undefined`
    // for undefined/function/symbol input, so widen it and normalize.
    const serialized: string | undefined = JSON.stringify(payload, null, 2)
    return serialized ?? ''
  } catch {
    return String(payload)
  }
}
|
|
983
|
+
|
|
984
|
+
/**
 * Pick the text-colour class used to display a run status.
 *
 * @param status Status of a benchmark run.
 * @returns `'text-success'` for `'completed'`, `'text-destructive'` for
 *          `'failed'`, and `'text-warning'` for every other status.
 */
function statusClass(status: RunStatus): string {
  switch (status) {
    case 'completed':
      return 'text-success'
    case 'failed':
      return 'text-destructive'
    default:
      return 'text-warning'
  }
}
|