waypoi 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260) hide show
  1. package/.github/instructions/ui.instructions.md +42 -0
  2. package/.github/workflows/ci.yml +35 -0
  3. package/.github/workflows/publish.yml +71 -0
  4. package/.github/workflows/release.yml +48 -0
  5. package/.playwright-mcp/console-2026-04-04T01-41-10-746Z.log +2 -0
  6. package/.playwright-mcp/console-2026-04-04T01-41-28-799Z.log +3 -0
  7. package/.playwright-mcp/console-2026-04-05T02-26-51-909Z.log +76 -0
  8. package/.playwright-mcp/page-2026-04-04T01-41-10-816Z.yml +1 -0
  9. package/.playwright-mcp/page-2026-04-04T01-41-29-141Z.yml +77 -0
  10. package/.playwright-mcp/page-2026-04-04T01-41-42-633Z.yml +190 -0
  11. package/.playwright-mcp/page-2026-04-04T01-42-03-929Z.yml +262 -0
  12. package/.playwright-mcp/page-2026-04-04T02-12-54-813Z.yml +6 -0
  13. package/.playwright-mcp/page-2026-04-04T02-14-58-600Z.yml +190 -0
  14. package/.playwright-mcp/page-2026-04-04T02-15-03-923Z.yml +190 -0
  15. package/.playwright-mcp/page-2026-04-04T02-15-07-426Z.yml +190 -0
  16. package/.playwright-mcp/page-2026-04-04T02-15-25-729Z.yml +262 -0
  17. package/.playwright-mcp/page-2026-04-04T02-16-22-984Z.yml +262 -0
  18. package/.playwright-mcp/page-2026-04-04T02-17-00-599Z.yml +190 -0
  19. package/.playwright-mcp/page-2026-04-04T02-17-50-874Z.yml +190 -0
  20. package/.playwright-mcp/page-2026-04-05T02-26-55-570Z.yml +6 -0
  21. package/AGENTS.md +48 -0
  22. package/CHANGELOG.md +131 -0
  23. package/README.md +552 -0
  24. package/assets/agent-mode.png +0 -0
  25. package/assets/categorize.png +0 -0
  26. package/assets/dashboard.png +0 -0
  27. package/assets/endpoint-proxy.png +0 -0
  28. package/assets/icon.png +0 -0
  29. package/assets/mcp-generate-image.png +0 -0
  30. package/assets/mcp-understand-image.png +0 -0
  31. package/assets/peek-token-flow.png +0 -0
  32. package/assets/playground.png +0 -0
  33. package/assets/sankey.png +0 -0
  34. package/cli/index.ts +2805 -0
  35. package/cli/legacyRewrite.ts +108 -0
  36. package/cli/modelRef.ts +24 -0
  37. package/dist/cli/index.js +2536 -0
  38. package/dist/cli/legacyRewrite.js +92 -0
  39. package/dist/cli/modelRef.js +20 -0
  40. package/dist/src/benchmark/artifacts.js +131 -0
  41. package/dist/src/benchmark/capabilityClassifier.js +81 -0
  42. package/dist/src/benchmark/capabilityStore.js +144 -0
  43. package/dist/src/benchmark/config.js +238 -0
  44. package/dist/src/benchmark/gates.js +118 -0
  45. package/dist/src/benchmark/jobs.js +252 -0
  46. package/dist/src/benchmark/runner.js +1847 -0
  47. package/dist/src/benchmark/schema.js +353 -0
  48. package/dist/src/benchmark/suites.js +314 -0
  49. package/dist/src/benchmark/tinyQaDataset.js +422 -0
  50. package/dist/src/benchmark/types.js +25 -0
  51. package/dist/src/config.js +47 -0
  52. package/dist/src/index.js +178 -0
  53. package/dist/src/mcp/client.js +215 -0
  54. package/dist/src/mcp/discovery.js +226 -0
  55. package/dist/src/mcp/policy.js +65 -0
  56. package/dist/src/mcp/registry.js +129 -0
  57. package/dist/src/mcp/service.js +460 -0
  58. package/dist/src/middleware/auth.js +179 -0
  59. package/dist/src/middleware/requestCapture.js +192 -0
  60. package/dist/src/middleware/requestStats.js +118 -0
  61. package/dist/src/pools/builder.js +132 -0
  62. package/dist/src/pools/repository.js +69 -0
  63. package/dist/src/pools/scheduler.js +360 -0
  64. package/dist/src/pools/types.js +2 -0
  65. package/dist/src/protocols/adapters/dashscope.js +267 -0
  66. package/dist/src/protocols/adapters/inferenceV2.js +346 -0
  67. package/dist/src/protocols/adapters/openai.js +27 -0
  68. package/dist/src/protocols/registry.js +99 -0
  69. package/dist/src/protocols/types.js +2 -0
  70. package/dist/src/providers/health.js +153 -0
  71. package/dist/src/providers/importer.js +289 -0
  72. package/dist/src/providers/modelRegistry.js +313 -0
  73. package/dist/src/providers/repository.js +361 -0
  74. package/dist/src/providers/types.js +2 -0
  75. package/dist/src/routes/admin.js +531 -0
  76. package/dist/src/routes/audio.js +295 -0
  77. package/dist/src/routes/chat.js +240 -0
  78. package/dist/src/routes/embeddings.js +157 -0
  79. package/dist/src/routes/images.js +288 -0
  80. package/dist/src/routes/mcp.js +256 -0
  81. package/dist/src/routes/mcpService.js +100 -0
  82. package/dist/src/routes/models.js +48 -0
  83. package/dist/src/routes/responses.js +711 -0
  84. package/dist/src/routes/sessions.js +450 -0
  85. package/dist/src/routes/stats.js +270 -0
  86. package/dist/src/routes/ui.js +97 -0
  87. package/dist/src/routes/videos.js +107 -0
  88. package/dist/src/routing/router.js +338 -0
  89. package/dist/src/services/imageGeneration.js +280 -0
  90. package/dist/src/services/imageUnderstanding.js +352 -0
  91. package/dist/src/services/videoGeneration.js +79 -0
  92. package/dist/src/storage/captureRepository.js +1591 -0
  93. package/dist/src/storage/files.js +157 -0
  94. package/dist/src/storage/imageCache.js +346 -0
  95. package/dist/src/storage/repositories.js +388 -0
  96. package/dist/src/storage/sessionRepository.js +370 -0
  97. package/dist/src/storage/statsRepository.js +204 -0
  98. package/dist/src/transport/httpClient.js +126 -0
  99. package/dist/src/types.js +2 -0
  100. package/dist/src/utils/messageMedia.js +285 -0
  101. package/dist/src/utils/modelCapabilities.js +108 -0
  102. package/dist/src/utils/modelDiscovery.js +170 -0
  103. package/dist/src/version.js +5 -0
  104. package/dist/src/workers/captureRetention.js +25 -0
  105. package/dist/src/workers/configWatcher.js +91 -0
  106. package/dist/src/workers/healthChecker.js +21 -0
  107. package/dist/src/workers/statsRotation.js +41 -0
  108. package/docs/LLM/output_schema.md +312 -0
  109. package/docs/benchmark.md +208 -0
  110. package/docs/mcp-guidelines.md +125 -0
  111. package/docs/mcp-service.md +178 -0
  112. package/docs/opencode.md +86 -0
  113. package/docs/providers.md +79 -0
  114. package/examples/benchmark.config.yaml +28 -0
  115. package/examples/providers/alibaba-dashscope.yaml +88 -0
  116. package/examples/providers/alibaba-llm.yaml +64 -0
  117. package/examples/providers/alibaba-registry.yaml +7 -0
  118. package/examples/providers/inference-v2-ray.yaml +29 -0
  119. package/examples/scenarios/assets/omni-call-sample.wav +0 -0
  120. package/examples/scenarios/custom.jsonl +5 -0
  121. package/examples/scenarios/custom.yaml +40 -0
  122. package/model-form-v2.png +0 -0
  123. package/package.json +66 -0
  124. package/provider-form-v2.png +0 -0
  125. package/provider-form.png +0 -0
  126. package/scripts/manual-test.sh +11 -0
  127. package/scripts/version-from-git.js +23 -0
  128. package/src/benchmark/artifacts.ts +149 -0
  129. package/src/benchmark/capabilityClassifier.ts +99 -0
  130. package/src/benchmark/capabilityStore.ts +174 -0
  131. package/src/benchmark/config.ts +337 -0
  132. package/src/benchmark/gates.ts +164 -0
  133. package/src/benchmark/jobs.ts +312 -0
  134. package/src/benchmark/runner.ts +2519 -0
  135. package/src/benchmark/schema.ts +443 -0
  136. package/src/benchmark/suites.ts +323 -0
  137. package/src/benchmark/tinyQaDataset.ts +428 -0
  138. package/src/benchmark/types.ts +442 -0
  139. package/src/config.ts +44 -0
  140. package/src/index.ts +195 -0
  141. package/src/mcp/client.ts +305 -0
  142. package/src/mcp/discovery.ts +266 -0
  143. package/src/mcp/policy.ts +105 -0
  144. package/src/mcp/registry.ts +164 -0
  145. package/src/mcp/service.ts +611 -0
  146. package/src/middleware/auth.ts +251 -0
  147. package/src/middleware/requestCapture.ts +245 -0
  148. package/src/middleware/requestStats.ts +163 -0
  149. package/src/pools/builder.ts +159 -0
  150. package/src/pools/repository.ts +71 -0
  151. package/src/pools/scheduler.ts +425 -0
  152. package/src/pools/types.ts +117 -0
  153. package/src/protocols/adapters/dashscope.ts +335 -0
  154. package/src/protocols/adapters/inferenceV2.ts +428 -0
  155. package/src/protocols/adapters/openai.ts +32 -0
  156. package/src/protocols/registry.ts +117 -0
  157. package/src/protocols/types.ts +81 -0
  158. package/src/providers/health.ts +207 -0
  159. package/src/providers/importer.ts +402 -0
  160. package/src/providers/modelRegistry.ts +415 -0
  161. package/src/providers/repository.ts +439 -0
  162. package/src/providers/types.ts +113 -0
  163. package/src/routes/admin.ts +666 -0
  164. package/src/routes/audio.ts +372 -0
  165. package/src/routes/chat.ts +301 -0
  166. package/src/routes/embeddings.ts +197 -0
  167. package/src/routes/images.ts +356 -0
  168. package/src/routes/mcp.ts +320 -0
  169. package/src/routes/mcpService.ts +114 -0
  170. package/src/routes/models.ts +50 -0
  171. package/src/routes/responses.ts +872 -0
  172. package/src/routes/sessions.ts +558 -0
  173. package/src/routes/stats.ts +312 -0
  174. package/src/routes/ui.ts +96 -0
  175. package/src/routes/videos.ts +132 -0
  176. package/src/routing/router.ts +501 -0
  177. package/src/services/imageGeneration.ts +396 -0
  178. package/src/services/imageUnderstanding.ts +449 -0
  179. package/src/services/videoGeneration.ts +127 -0
  180. package/src/storage/captureRepository.ts +1835 -0
  181. package/src/storage/files.ts +178 -0
  182. package/src/storage/imageCache.ts +405 -0
  183. package/src/storage/repositories.ts +494 -0
  184. package/src/storage/sessionRepository.ts +419 -0
  185. package/src/storage/statsRepository.ts +238 -0
  186. package/src/transport/httpClient.ts +145 -0
  187. package/src/types.ts +322 -0
  188. package/src/utils/messageMedia.ts +293 -0
  189. package/src/utils/modelCapabilities.ts +161 -0
  190. package/src/utils/modelDiscovery.ts +203 -0
  191. package/src/workers/captureRetention.ts +25 -0
  192. package/src/workers/configWatcher.ts +115 -0
  193. package/src/workers/healthChecker.ts +22 -0
  194. package/src/workers/statsRotation.ts +49 -0
  195. package/tests/benchmarkAdminRoutes.test.ts +82 -0
  196. package/tests/benchmarkBasics.test.ts +116 -0
  197. package/tests/captureAdminRoutes.test.ts +420 -0
  198. package/tests/captureRepository.test.ts +797 -0
  199. package/tests/cliLegacyRewrite.test.ts +45 -0
  200. package/tests/imageGeneration.service.test.ts +107 -0
  201. package/tests/imageUnderstanding.service.test.ts +123 -0
  202. package/tests/mcpPolicy.test.ts +105 -0
  203. package/tests/mcpService.test.ts +1245 -0
  204. package/tests/modelRef.test.ts +23 -0
  205. package/tests/modelsRoutes.test.ts +154 -0
  206. package/tests/sessionMediaCache.test.ts +167 -0
  207. package/tests/statsRoutes.test.ts +323 -0
  208. package/tsconfig.json +15 -0
  209. package/ui/index.html +16 -0
  210. package/ui/package-lock.json +8521 -0
  211. package/ui/package.json +52 -0
  212. package/ui/postcss.config.js +6 -0
  213. package/ui/public/assets/apple-touch-icon.png +0 -0
  214. package/ui/public/assets/favicon-16.png +0 -0
  215. package/ui/public/assets/favicon-32.png +0 -0
  216. package/ui/public/assets/icon-192.png +0 -0
  217. package/ui/public/assets/icon-512.png +0 -0
  218. package/ui/src/App.tsx +27 -0
  219. package/ui/src/api/client.ts +1503 -0
  220. package/ui/src/components/EndpointUsageGuide.tsx +361 -0
  221. package/ui/src/components/Layout.tsx +124 -0
  222. package/ui/src/components/MessageContent.tsx +365 -0
  223. package/ui/src/components/ToolCallMessage.tsx +179 -0
  224. package/ui/src/components/ToolPicker.tsx +442 -0
  225. package/ui/src/components/messageContentParser.test.ts +41 -0
  226. package/ui/src/components/messageContentParser.ts +73 -0
  227. package/ui/src/components/thinkingPreview.test.ts +27 -0
  228. package/ui/src/components/thinkingPreview.ts +15 -0
  229. package/ui/src/components/toMermaidSankey.test.ts +78 -0
  230. package/ui/src/components/toMermaidSankey.ts +56 -0
  231. package/ui/src/components/ui/button.tsx +58 -0
  232. package/ui/src/components/ui/input.tsx +21 -0
  233. package/ui/src/components/ui/textarea.tsx +21 -0
  234. package/ui/src/lib/utils.ts +6 -0
  235. package/ui/src/main.tsx +9 -0
  236. package/ui/src/pages/AgentPlayground.tsx +2010 -0
  237. package/ui/src/pages/Benchmark.tsx +988 -0
  238. package/ui/src/pages/Dashboard.tsx +581 -0
  239. package/ui/src/pages/Peek.tsx +962 -0
  240. package/ui/src/pages/Settings.tsx +2013 -0
  241. package/ui/src/pages/agentPlaygroundPayload.test.ts +109 -0
  242. package/ui/src/pages/agentPlaygroundPayload.ts +97 -0
  243. package/ui/src/pages/agentThinkingContent.test.ts +50 -0
  244. package/ui/src/pages/agentThinkingContent.ts +57 -0
  245. package/ui/src/pages/dashboardTokenUsage.test.ts +66 -0
  246. package/ui/src/pages/dashboardTokenUsage.ts +36 -0
  247. package/ui/src/pages/imageUpload.test.ts +39 -0
  248. package/ui/src/pages/imageUpload.ts +71 -0
  249. package/ui/src/pages/peekFilters.test.ts +29 -0
  250. package/ui/src/pages/peekFilters.ts +13 -0
  251. package/ui/src/pages/peekMedia.test.ts +58 -0
  252. package/ui/src/pages/peekMedia.ts +148 -0
  253. package/ui/src/pages/sessionAutoTitle.test.ts +128 -0
  254. package/ui/src/pages/sessionAutoTitle.ts +106 -0
  255. package/ui/src/stores/settings.ts +58 -0
  256. package/ui/src/styles/globals.css +223 -0
  257. package/ui/src/vite-env.d.ts +8 -0
  258. package/ui/tailwind.config.js +106 -0
  259. package/ui/tsconfig.json +32 -0
  260. package/ui/vite.config.ts +37 -0
@@ -0,0 +1,2519 @@
1
+ import { promises as fs } from "fs";
2
+ import path from "path";
3
+ import { randomUUID } from "crypto";
4
+ import YAML from "yaml";
5
+ import { routeRequest } from "../routing/router";
6
+ import { pickBestModelByCapabilities } from "../storage/repositories";
7
+ import { StoragePaths } from "../storage/files";
8
+ import {
9
+ discoverAllTools,
10
+ disconnectAllServers,
11
+ executeTool,
12
+ getCachedTools,
13
+ summarizeMcpError,
14
+ } from "../mcp/discovery";
15
+ import { writeBenchmarkArtifacts } from "./artifacts";
16
+ import { classifyCapabilityStatus } from "./capabilityClassifier";
17
+ import { computeConfigFingerprint, writeCapabilitySnapshots } from "./capabilityStore";
18
+ import { resolveBenchmarkConfig } from "./config";
19
+ import { evaluateGates } from "./gates";
20
+ import { validateScenarioCollection } from "./schema";
21
+ import { builtInSuite, listSuiteExamples } from "./suites";
22
+ import { listProviders } from "../providers/repository";
23
+ import { ProviderModelRecord } from "../providers/types";
24
+ import {
25
+ BENCHMARK_CAPABILITY_KEYS,
26
+ BENCHMARK_MODES,
27
+ BenchmarkCliOptions,
28
+ BenchmarkCapabilityKey,
29
+ BenchmarkCapabilityMatrix,
30
+ BenchmarkCapabilityStatus,
31
+ BenchmarkExchangeSummary,
32
+ BenchmarkMode,
33
+ BenchmarkModeRequirements,
34
+ BenchmarkReport,
35
+ BenchmarkRunOutput,
36
+ BenchmarkScenario,
37
+ BenchmarkScenarioDetail,
38
+ BenchmarkScenarioSummary,
39
+ BenchmarkToolTraceStep,
40
+ BenchmarkModelCapabilitySnapshot,
41
+ EffectiveBenchmarkConfig,
42
+ ScenarioResult,
43
+ ScenarioRunSample,
44
+ } from "./types";
45
+
46
/**
 * Discriminator carried by every {@link BenchmarkProgressEvent}.
 *
 * `run_started` / `run_completed` bracket a whole run, `scenario_started` /
 * `scenario_completed` bracket one scenario, `sample_completed` and `exchange`
 * are emitted per run of a scenario, and `warning` carries a warning message.
 */
export type BenchmarkProgressEventType =
  | "run_started"
  | "scenario_started"
  | "exchange"
  | "sample_completed"
  | "scenario_completed"
  | "warning"
  | "run_completed";
54
+
55
/**
 * Payload attached to `exchange` progress events: one request/response pair
 * observed while a scenario runs.
 *
 * NOTE(review): the producer of these events is outside this view; the
 * per-field notes below are read off the field names and should be confirmed.
 */
export interface BenchmarkExchangeEvent {
  // --- What was asked -----------------------------------------------------
  scenarioInput: string;
  mode: BenchmarkMode;
  model: string;
  requestPath: string;

  // --- Short previews (presumably truncated text for display) -------------
  requestPreview: string;
  responsePreview: string;

  // --- Transport-level result ---------------------------------------------
  statusCode: number;
  contentType: string;

  // --- Routing details; optional because they may be unknown --------------
  endpointId?: string;
  endpointName?: string;
  upstreamModel?: string;

  /** Tool invocations recorded for this exchange. */
  toolTrace: BenchmarkToolTraceStep[];

  // --- Full payloads, each in a raw and a sanitized variant ---------------
  requestRaw: unknown;
  requestSanitized: unknown;
  responseRaw: unknown;
  responseSanitized: unknown;
}
73
+
74
/**
 * A single progress notification delivered to `BenchmarkRunHooks.onEvent`.
 *
 * Only `type` and `timestamp` are always present; every other field is
 * populated depending on the event type.
 */
export interface BenchmarkProgressEvent {
  /** Which kind of event this is. */
  type: BenchmarkProgressEventType;
  /** Emission time; the runner fills this with `new Date().toISOString()`. */
  timestamp: string;
  /** Identifier of the run the event belongs to, when known. */
  runId?: string;

  // --- Scenario position (the runner reports 1-based indices) -------------
  scenarioId?: string;
  scenarioIndex?: number;
  totalScenarios?: number;

  // --- Position within the scenario's warmup + measured runs --------------
  runIndex?: number;
  totalRuns?: number;
  phase?: "warmup" | "measured";

  // --- Type-specific payloads ---------------------------------------------
  /** Set on `scenario_started`. */
  scenario?: BenchmarkScenarioSummary;
  /** Set on `exchange`. */
  exchange?: BenchmarkExchangeEvent;
  /** Set on `sample_completed`. */
  sample?: ScenarioRunSample;
  /** Set on `scenario_completed`. */
  result?: ScenarioResult;
  /** Set on `warning`. */
  warning?: string;
  /** Set on `run_completed`: headline counters copied from the report. */
  summary?: Pick<BenchmarkReport, "total" | "executed" | "succeeded" | "failed" | "successRate">;
}
91
+
92
/** Optional integration points a caller can hand to `runBenchmark`. */
export interface BenchmarkRunHooks {
  /** Caller-chosen run identifier; echoed on emitted events and passed to the report builder. */
  runId?: string;
  /** Receives each progress event as the run advances. */
  onEvent?: (event: BenchmarkProgressEvent) => void;
}
96
+
97
/**
 * One entry of a chat message's `tool_calls` array.
 * Every field is optional, so consumers must check before use.
 */
interface ChatToolCall {
  id?: string;
  function?: {
    name?: string;
    // NOTE(review): presumably a JSON-encoded argument object — confirm against the parser.
    arguments?: string;
  };
}
104
+
105
/**
 * The subset of a chat-completion response body that this module reads.
 * Every level is optional, so consumers must check before use.
 */
interface ChatResponse {
  choices?: {
    message?: {
      // Left as `unknown`: the type places no constraint on the content shape.
      content?: unknown;
      tool_calls?: ChatToolCall[];
    };
  }[];
  usage?: {
    total_tokens?: number;
  };
}
116
+
117
/**
 * Result of a benchmark request whose response body is kept as a parsed
 * payload, bundled with the request payload and routing metadata.
 */
interface JsonResponseEnvelope {
  // --- Response -------------------------------------------------------------
  statusCode: number;
  contentType: string;
  payload: unknown;

  // --- Request --------------------------------------------------------------
  requestPayload: Record<string, unknown>;

  /** Endpoint that handled the request. */
  route: {
    endpointId: string;
    endpointName?: string;
    upstreamModel?: string;
  };

  /** Optional counters (attempts, failovers, rate-limit switches, distinct providers/models). */
  poolMetrics?: {
    candidateAttempts: number;
    failovers: number;
    rateLimitSwitches: number;
    distinctProviders: number;
    distinctModels: number;
  };
}
135
+
136
/**
 * Result of a benchmark request whose response body is kept as raw bytes,
 * bundled with the request payload and routing metadata.
 */
interface BinaryResponseEnvelope {
  // --- Response -------------------------------------------------------------
  statusCode: number;
  contentType: string;
  buffer: Buffer;

  // --- Request --------------------------------------------------------------
  requestPayload: Record<string, unknown>;

  /** Endpoint that handled the request. */
  route: {
    endpointId: string;
    endpointName?: string;
    upstreamModel?: string;
  };

  /** Optional counters (attempts, failovers, rate-limit switches, distinct providers/models). */
  poolMetrics?: {
    candidateAttempts: number;
    failovers: number;
    rateLimitSwitches: number;
    distinctProviders: number;
    distinctModels: number;
  };
}
154
+
155
/** Everything the runner keeps about one scenario after it ran or was skipped. */
interface ScenarioExecution {
  /** The scenario definition that was executed. */
  scenario: BenchmarkScenario;
  /** Summary form of the scenario. */
  example: BenchmarkScenarioSummary;
  /** Aggregated outcome for the scenario. */
  result: ScenarioResult;
  /** Samples from measured runs; empty when the scenario was skipped. */
  samples: ScenarioRunSample[];
  /** Exchange summaries from measured runs; empty when the scenario was skipped. */
  exchanges: BenchmarkExchangeSummary[];
  /** Warnings raised while handling this scenario (for example skip reasons). */
  warnings: string[];
}
163
+
164
/** Listener invoked with each exchange event observed during a single scenario run. */
type ScenarioExchangeCallback = (
  event: BenchmarkExchangeEvent
) => void;
165
+
166
/**
 * Lists the example scenarios of a built-in suite.
 *
 * @param suite Suite name to list; defaults to `"showcase"`.
 * @returns Whatever `listSuiteExamples` reports for that suite.
 */
export function listBenchmarkExamples(suite = "showcase"): BenchmarkScenarioSummary[] {
  const examples = listSuiteExamples(suite);
  return examples;
}
169
+
170
/**
 * Runs a benchmark end to end.
 *
 * Steps: resolve the effective configuration, load scenarios, discover MCP
 * tools when any scenario uses `agent` mode, execute every scenario in order,
 * build the capability matrix (optionally persisting snapshots), build the
 * report, evaluate gates, write artifacts and finally disconnect MCP servers.
 * Progress is reported through `hooks.onEvent` along the way.
 *
 * @param paths Storage locations handed through to the helpers.
 * @param options Options from which the effective configuration is resolved.
 * @param hooks Optional run id and progress listener.
 * @returns The final report plus the paths of the written JSON and text artifacts.
 * @throws Error when no scenarios were loaded, and rethrows any failure from the
 *   steps above. On failure the MCP servers are still disconnected (best effort)
 *   so a failed run does not leave connections open.
 */
export async function runBenchmark(
  paths: StoragePaths,
  options: BenchmarkCliOptions,
  hooks?: BenchmarkRunHooks
): Promise<BenchmarkRunOutput> {
  const effective = await resolveBenchmarkConfig(paths, options);
  const loaded = await loadScenarios(paths, effective);
  const runId = hooks?.runId;

  if (loaded.scenarios.length === 0) {
    throw new Error("No benchmark scenarios found. Use --suite and/or --scenario.");
  }

  // Becomes true once the regular (success-path) disconnect has been started,
  // so the failure handler below never disconnects a second time.
  let disconnectStarted = false;

  try {
    emitEvent(hooks, {
      type: "run_started",
      timestamp: new Date().toISOString(),
      runId,
      totalScenarios: loaded.scenarios.length,
    });

    const warnings = [...loaded.warnings];
    for (const warning of loaded.warnings) {
      emitEvent(hooks, {
        type: "warning",
        timestamp: new Date().toISOString(),
        runId,
        warning,
      });
    }

    // Tool discovery is only needed for agent scenarios. A discovery failure is
    // downgraded to a warning so the remaining scenarios can still run.
    const hasAgentScenarios = loaded.scenarios.some((scenario) => scenario.mode === "agent");
    if (hasAgentScenarios) {
      try {
        await discoverAllTools(paths);
      } catch (error) {
        const warning = `MCP discovery failed for benchmark: ${summarizeMcpError(error)}`;
        warnings.push(warning);
        emitEvent(hooks, {
          type: "warning",
          timestamp: new Date().toISOString(),
          runId,
          warning,
        });
        if (process.env.WAYPOI_DEBUG_ERRORS === "1") {
          console.error(error);
        }
      }
    }

    const executions: ScenarioExecution[] = [];
    for (const [scenarioIndex, scenario] of loaded.scenarios.entries()) {
      emitEvent(hooks, {
        type: "scenario_started",
        timestamp: new Date().toISOString(),
        runId,
        scenarioId: scenario.id,
        scenarioIndex: scenarioIndex + 1,
        totalScenarios: loaded.scenarios.length,
        scenario: scenarioToSummary(scenario, effective.run.suite),
      });

      const execution = await runScenarioWithSampling(
        paths,
        scenario,
        effective,
        (sample, runIndex, phase, totalRuns) => {
          emitEvent(hooks, {
            type: "sample_completed",
            timestamp: new Date().toISOString(),
            runId,
            scenarioId: scenario.id,
            scenarioIndex: scenarioIndex + 1,
            totalScenarios: loaded.scenarios.length,
            runIndex,
            totalRuns,
            phase,
            sample,
          });
        },
        (exchange, runIndex, phase, totalRuns) => {
          emitEvent(hooks, {
            type: "exchange",
            timestamp: new Date().toISOString(),
            runId,
            scenarioId: scenario.id,
            scenarioIndex: scenarioIndex + 1,
            totalScenarios: loaded.scenarios.length,
            runIndex,
            totalRuns,
            phase,
            exchange,
          });
        }
      );
      warnings.push(...execution.warnings);
      for (const warning of execution.warnings) {
        emitEvent(hooks, {
          type: "warning",
          timestamp: new Date().toISOString(),
          runId,
          scenarioId: scenario.id,
          scenarioIndex: scenarioIndex + 1,
          totalScenarios: loaded.scenarios.length,
          warning,
        });
      }
      emitEvent(hooks, {
        type: "scenario_completed",
        timestamp: new Date().toISOString(),
        runId,
        scenarioId: scenario.id,
        scenarioIndex: scenarioIndex + 1,
        totalScenarios: loaded.scenarios.length,
        result: execution.result,
      });
      executions.push(execution);
    }

    const capabilityMatrix = buildCapabilityMatrix(effective, executions);
    if (effective.run.updateCapCache && capabilityMatrix && capabilityMatrix.models.length > 0) {
      await writeCapabilitySnapshots(paths, capabilityMatrix.models);
    }

    const reportBase = buildReport(
      effective,
      warnings,
      loaded.scenarioPath,
      executions,
      capabilityMatrix,
      runId
    );
    const gateResults = await evaluateGates(reportBase, effective);
    const report: BenchmarkReport = {
      ...reportBase,
      gateResults,
    };

    const artifacts = await writeBenchmarkArtifacts(paths, report, effective.run.outPath);

    disconnectStarted = true;
    await disconnectAllServers();

    // The completion event carries the report's own id rather than hooks.runId.
    emitEvent(hooks, {
      type: "run_completed",
      timestamp: new Date().toISOString(),
      runId: report.id,
      summary: {
        total: report.total,
        executed: report.executed,
        succeeded: report.succeeded,
        failed: report.failed,
        successRate: report.successRate,
      },
    });

    return {
      report,
      artifactPath: artifacts.jsonPath,
      textArtifactPath: artifacts.textPath,
    };
  } catch (error) {
    if (!disconnectStarted) {
      // Best-effort cleanup: a secondary disconnect failure must not replace
      // the error that actually aborted the run.
      try {
        await disconnectAllServers();
      } catch (disconnectError) {
        if (process.env.WAYPOI_DEBUG_ERRORS === "1") {
          console.error(disconnectError);
        }
      }
    }
    throw error;
  }
}
330
+
331
/**
 * Assembles the scenario list for a run.
 *
 * Order of operations: start from the configured suite (the `capabilities`
 * suite is generated per model, any other suite comes from `builtInSuite`),
 * narrow to a single example when `exampleId` is set, then append scenarios
 * from the scenario file when one is configured. Because the example filter
 * runs before the file is read, it only applies to suite scenarios.
 *
 * @returns The scenarios, validation warnings from the scenario file, and the
 *   resolved scenario file path (undefined when no file was configured).
 * @throws Error when `exampleId` matches nothing, when the scenario file cannot
 *   be read or parsed, or when two scenarios share an id.
 */
async function loadScenarios(paths: StoragePaths, effective: EffectiveBenchmarkConfig): Promise<{
  scenarios: BenchmarkScenario[];
  warnings: string[];
  scenarioPath?: string;
}> {
  const { suite, exampleId, scenarioPath } = effective.run;
  const warnings: string[] = [];
  let scenarios: BenchmarkScenario[] = [];

  if (suite) {
    scenarios =
      suite === "capabilities"
        ? await buildCapabilitySuiteScenarios(paths, effective)
        : [...builtInSuite(suite)];
  }

  if (exampleId) {
    scenarios = scenarios.filter((candidate) => candidate.id === exampleId);
    if (scenarios.length === 0) {
      throw new Error(
        `Example '${exampleId}' not found in suite '${suite ?? "showcase"}'.`
      );
    }
  }

  let resolvedScenarioPath: string | undefined;
  if (scenarioPath) {
    resolvedScenarioPath = path.resolve(scenarioPath);
    const rawEntries = await loadScenarioFile(resolvedScenarioPath);
    const validated = validateScenarioCollection(rawEntries, resolvedScenarioPath);
    // Tag file-provided scenarios that did not declare their own source.
    validated.scenarios.forEach((fileScenario) => {
      if (!fileScenario.exampleSource) {
        fileScenario.exampleSource = "file";
      }
    });
    scenarios.push(...validated.scenarios);
    warnings.push(...validated.warnings);
  }

  ensureUniqueScenarioIds(scenarios);

  return {
    scenarios,
    warnings,
    scenarioPath: resolvedScenarioPath,
  };
}
377
+
378
/**
 * Expands the `capabilities` suite template into concrete scenarios.
 *
 * With a model override, the template is materialized once for that model and
 * no declared-capability filtering happens. Otherwise it is materialized for
 * each enabled model of each enabled provider, once per distinct
 * `providerId/modelId` reference.
 */
async function buildCapabilitySuiteScenarios(
  paths: StoragePaths,
  effective: EffectiveBenchmarkConfig
): Promise<BenchmarkScenario[]> {
  const template = builtInSuite("capabilities");
  const override = effective.run.modelOverride;
  if (override) {
    return materializeCapabilityScenariosForModel(template, override);
  }

  const collected: BenchmarkScenario[] = [];
  const visitedRefs = new Set<string>();
  for (const provider of await listProviders(paths)) {
    if (provider.enabled) {
      for (const model of provider.models) {
        // Only an explicit `false` disables a model.
        const modelEnabled = model.enabled !== false;
        const modelRef = `${provider.id}/${model.modelId}`;
        if (modelEnabled && !visitedRefs.has(modelRef)) {
          visitedRefs.add(modelRef);
          collected.push(...materializeCapabilityScenariosForModel(template, modelRef, model));
        }
      }
    }
  }

  return collected;
}
409
+
410
/**
 * Produces per-model copies of the capability template scenarios.
 *
 * `cap.chat_vision_input` and `cap.images_edit` are always left out. When
 * `providerModel` is given, scenarios its declared capabilities do not support
 * are left out as well. Each kept scenario is copied with its id suffixed by
 * `::<model>`, its `model` set, and a shallow copy of its assertions.
 */
function materializeCapabilityScenariosForModel(
  template: BenchmarkScenario[],
  model: string,
  providerModel?: ProviderModelRecord
): BenchmarkScenario[] {
  const alwaysExcluded = new Set<string>(["cap.chat_vision_input", "cap.images_edit"]);
  const materialized: BenchmarkScenario[] = [];

  for (const entry of template) {
    if (alwaysExcluded.has(entry.id)) {
      continue;
    }
    if (providerModel && !supportsScenarioByDeclaredCapabilities(entry, providerModel)) {
      continue;
    }
    materialized.push({
      ...entry,
      id: `${entry.id}::${model}`,
      model,
      assertions: { ...entry.assertions },
    });
  }

  return materialized;
}
435
+
436
/**
 * Checks a scenario's mode against the input/output modalities the model declares.
 *
 * Modes without an explicit rule below are treated as supported.
 */
function supportsScenarioByDeclaredCapabilities(
  scenario: BenchmarkScenario,
  providerModel: ProviderModelRecord
): boolean {
  const accepts = new Set(providerModel.capabilities.input);
  const produces = new Set(providerModel.capabilities.output);

  switch (scenario.mode) {
    case "chat":
    case "agent":
      return accepts.has("text") && produces.has("text");
    case "embeddings":
      return accepts.has("text") && produces.has("embedding");
    case "image_generation":
      // Only the output side is checked for image generation.
      return produces.has("image");
    case "audio_transcription":
    case "omni_call":
      return accepts.has("audio") && produces.has("text");
    case "audio_speech":
      return accepts.has("text") && produces.has("audio");
    default:
      return true;
  }
}
462
+
463
/**
 * Reads a scenario file and returns its raw (not yet validated) entries.
 *
 * The format is chosen by file extension:
 * - `.jsonl`: one JSON value per non-blank line;
 * - `.yaml` / `.yml`: a YAML document;
 * - anything else: a JSON document.
 * YAML and JSON documents must be an array or an object with a `scenarios` array.
 *
 * @throws Error naming the file (and, for JSONL, the 1-based line) on a parse failure.
 */
async function loadScenarioFile(filePath: string): Promise<unknown[]> {
  const raw = await fs.readFile(filePath, "utf8");

  switch (path.extname(filePath).toLowerCase()) {
    case ".jsonl": {
      const entries: unknown[] = [];
      for (const [index, rawLine] of raw.split("\n").entries()) {
        // Trimming also removes a trailing "\r" from CRLF files.
        const text = rawLine.trim();
        if (text.length === 0) {
          continue;
        }
        try {
          entries.push(JSON.parse(text) as unknown);
        } catch (error) {
          throw new Error(
            `Failed to parse scenario JSONL ${filePath}:${index + 1}: ${(error as Error).message}`
          );
        }
      }
      return entries;
    }

    case ".yaml":
    case ".yml": {
      let document: unknown;
      try {
        document = YAML.parse(raw) as unknown;
      } catch (error) {
        throw new Error(`Failed to parse YAML scenario file ${filePath}: ${(error as Error).message}`);
      }
      return extractScenarioArray(document, filePath);
    }

    default: {
      let document: unknown;
      try {
        document = JSON.parse(raw) as unknown;
      } catch (error) {
        throw new Error(`Failed to parse JSON scenario file ${filePath}: ${(error as Error).message}`);
      }
      return extractScenarioArray(document, filePath);
    }
  }
}
502
+
503
/**
 * Pulls the scenario list out of a parsed document.
 *
 * Accepts either a top-level array or an object whose `scenarios` property is
 * an array.
 *
 * @param parsed The parsed document.
 * @param source Label (the file path) used as a prefix in the error message.
 * @throws Error when the document has neither accepted shape.
 */
function extractScenarioArray(parsed: unknown, source: string): unknown[] {
  if (Array.isArray(parsed)) {
    return parsed;
  }

  const isObject = Boolean(parsed) && typeof parsed === "object";
  if (isObject) {
    const nested = (parsed as { scenarios?: unknown[] }).scenarios;
    if (Array.isArray(nested)) {
      return nested;
    }
  }

  throw new Error(`${source}: scenario file must be an array or an object with 'scenarios' array.`);
}
518
+
519
/**
 * Verifies that no two scenarios share an id.
 *
 * @throws Error naming the first id that occurs a second time.
 */
function ensureUniqueScenarioIds(scenarios: BenchmarkScenario[]): void {
  const seenIds = new Set<string>();
  scenarios.forEach(({ id }) => {
    if (seenIds.has(id)) {
      throw new Error(`Scenario ID '${id}' is duplicated.`);
    }
    seenIds.add(id);
  });
}
528
+
529
/**
 * Runs one scenario for its configured number of warmup and measured runs and
 * aggregates the measured runs into a result.
 *
 * The model is taken from the run-level override, then the scenario itself,
 * then capability-based selection. The scenario is skipped, with a warning and
 * a skipped result, when no model is available or when it requires MCP tools
 * and none are selected. In `showcase` execution mode the profile is fixed to
 * zero warmup runs and one measured run.
 *
 * @param onSampleComplete Called after every run (warmup included) with the
 *   sample, the 1-based run index, the phase and the total run count.
 * @param onExchange Called for every exchange of every run (warmup included)
 *   with the same positional information.
 * @returns The execution record; only measured runs contribute samples and exchanges.
 */
async function runScenarioWithSampling(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  effective: EffectiveBenchmarkConfig,
  onSampleComplete?: (
    sample: ScenarioRunSample,
    runIndex: number,
    phase: "warmup" | "measured",
    totalRuns: number
  ) => void,
  onExchange?: (
    event: BenchmarkExchangeEvent,
    runIndex: number,
    phase: "warmup" | "measured",
    totalRuns: number
  ) => void
): Promise<ScenarioExecution> {
  const warnings: string[] = [];
  const example = scenarioToSummary(scenario, effective.run.suite);

  // Shared shape for both skip paths: record the warning, return an empty execution.
  const skip = (reason: string): ScenarioExecution => {
    warnings.push(`Scenario '${scenario.id}' skipped: ${reason}`);
    return {
      scenario,
      example,
      result: buildSkippedScenarioResult(scenario, reason),
      samples: [],
      exchanges: [],
      warnings,
    };
  };

  const model =
    effective.run.modelOverride ||
    scenario.model ||
    (await pickBestModelForScenario(paths, scenario));
  if (!model) {
    return skip(`No model available for mode '${scenario.mode}'.`);
  }

  const runProfile =
    effective.run.executionMode === "showcase"
      ? { warmupRuns: 0, measuredRuns: 1, minScenarioPassRate: 1 }
      : effective.profileSettings;
  const totalRuns = runProfile.warmupRuns + runProfile.measuredRuns;
  const measuredSamples: ScenarioRunSample[] = [];
  const measuredExchanges: BenchmarkExchangeSummary[] = [];

  const selectedTools = getSelectedTools(scenario.tools);
  if (scenario.requiresAvailableTools && selectedTools.length === 0) {
    return skip("No MCP tools are available for this tool-driven example.");
  }

  for (let index = 0; index < totalRuns; index++) {
    const isWarmup = index < runProfile.warmupRuns;
    const phase: "warmup" | "measured" = isWarmup ? "warmup" : "measured";
    const runIndex = index + 1;
    const exchangesThisRun: BenchmarkExchangeSummary[] = [];

    const sample = await runSingleScenario(
      paths,
      scenario,
      model,
      effective,
      runIndex,
      (event) => {
        // Warmup exchanges are still reported to the listener but not retained.
        if (!isWarmup) {
          exchangesThisRun.push(toExchangeSummary(event));
        }
        onExchange?.(event, runIndex, phase, totalRuns);
      }
    );

    onSampleComplete?.(sample, runIndex, phase, totalRuns);
    if (!isWarmup) {
      measuredSamples.push(sample);
      measuredExchanges.push(...exchangesThisRun);
    }
  }

  const result = buildScenarioResult(
    scenario,
    model,
    measuredSamples,
    runProfile.minScenarioPassRate
  );
  return {
    scenario,
    example,
    result,
    exchanges: measuredExchanges,
    samples: measuredSamples,
    warnings,
  };
}
626
+
627
/**
 * Chooses a model for a scenario that did not name one explicitly.
 *
 * Looks up the capability requirements for the scenario's mode and hands the
 * required input/output capabilities plus the preferred endpoint type to
 * `pickBestModelByCapabilities`.
 *
 * @param paths - Storage paths forwarded to the capability lookup.
 * @param scenario - Scenario whose `mode` selects the requirements.
 * @returns Whatever `pickBestModelByCapabilities` resolves to (a model name or `null`).
 */
async function pickBestModelForScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario
): Promise<string | null> {
  const { requiredInput, requiredOutput, preferredEndpointType } = getModeRequirements(
    scenario.mode
  );
  const capabilities = { requiredInput, requiredOutput };
  return pickBestModelByCapabilities(paths, capabilities, preferredEndpointType);
}
641
+
642
/**
 * Maps a benchmark mode to the capabilities a model must offer for it.
 *
 * A fresh requirements object (with fresh arrays) is built on every call, so
 * callers never share state through the return value.
 *
 * @param mode - Benchmark mode to describe.
 * @returns Required input/output capabilities and the preferred endpoint type.
 */
function getModeRequirements(mode: BenchmarkMode): BenchmarkModeRequirements {
  const describe = (
    requiredInput: BenchmarkModeRequirements["requiredInput"],
    requiredOutput: BenchmarkModeRequirements["requiredOutput"],
    preferredEndpointType: BenchmarkModeRequirements["preferredEndpointType"]
  ): BenchmarkModeRequirements => ({ requiredInput, requiredOutput, preferredEndpointType });

  switch (mode) {
    // The three text-in/text-out modes share one requirement set.
    case "chat":
    case "agent":
    case "responses":
      return describe(["text"], ["text"], "llm");
    case "embeddings":
      return describe(["text"], ["embedding"], "embedding");
    case "image_generation":
      return describe(["text"], ["image"], "diffusion");
    case "audio_transcription":
      return describe(["audio"], ["text"], "audio");
    case "audio_speech":
      return describe(["text"], ["audio"], "audio");
    case "omni_call":
      return describe(["text", "audio"], ["text"], "llm");
  }
}
660
+
661
/**
 * Executes one run of a scenario and always resolves to a sample.
 *
 * The mode-specific runner is awaited inside a `try`; anything it throws is
 * converted into a failed sample (status code 0, zeroed counters) so a single
 * failing run never rejects the caller.
 *
 * @param paths - Storage paths forwarded to the mode runner.
 * @param scenario - Scenario to execute.
 * @param model - Model name to use for the run.
 * @param effective - Effective benchmark configuration.
 * @param runIndex - Run number stamped onto the returned sample.
 * @param onExchange - Optional callback forwarded to the mode runner.
 * @returns The runner's sample with `runIndex` added, or a failure sample.
 */
async function runSingleScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  runIndex: number,
  onExchange?: ScenarioExchangeCallback
): Promise<ScenarioRunSample> {
  const startTime = Date.now();

  try {
    const sample = await runModeScenario(paths, scenario, model, effective, startTime, onExchange);
    return { ...sample, runIndex };
  } catch (error: unknown) {
    const latencyMs = Date.now() - startTime;
    // Thrown values are not guaranteed to be Error instances; narrow before
    // reading `.message` so `verdict`/`error` are always strings.
    const message = error instanceof Error ? error.message : String(error);
    return {
      runIndex,
      success: false,
      latencyMs,
      statusCode: 0,
      tokens: 0,
      toolCalls: 0,
      throughputTokensPerSec: 0,
      finalOutput: "",
      outputPreview: "",
      verdict: message,
      usedToolNames: [],
      error: message,
      candidateAttempts: 0,
      failovers: 0,
      rateLimitSwitches: 0,
      distinctProviders: 0,
      distinctModels: 0,
    };
  }
}
697
+
698
/**
 * Dispatches a scenario to the runner that implements its mode.
 *
 * Every runner receives the same argument list; only the function chosen
 * depends on `scenario.mode`.
 *
 * @param paths - Storage paths forwarded to the runner.
 * @param scenario - Scenario to execute; its `mode` selects the runner.
 * @param model - Model name to use for the run.
 * @param effective - Effective benchmark configuration.
 * @param startTime - Epoch milliseconds at which the run started.
 * @param onExchange - Optional callback forwarded to the runner.
 * @returns The sample produced by the selected runner (without `runIndex`).
 */
async function runModeScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  const runnerArgs = [paths, scenario, model, effective, startTime, onExchange] as const;

  switch (scenario.mode) {
    case "chat":
      return runChatScenario(...runnerArgs);
    case "agent":
      return runAgentScenario(...runnerArgs);
    case "responses":
      return runResponsesScenario(...runnerArgs);
    case "embeddings":
      return runEmbeddingsScenario(...runnerArgs);
    case "image_generation":
      return runImageScenario(...runnerArgs);
    case "audio_transcription":
      return runAudioTranscriptionScenario(...runnerArgs);
    case "audio_speech":
      return runAudioSpeechScenario(...runnerArgs);
    case "omni_call":
      return runOmniCallScenario(...runnerArgs);
  }
}
725
+
726
/**
 * Runs a single-turn chat scenario against `/v1/chat/completions`.
 *
 * Sends the scenario prompt as one non-streaming user message, reports the
 * exchange through `onExchange`, then evaluates the scenario's assertions
 * against the assistant text, latency, and status code.
 *
 * @param paths - Storage paths forwarded to the request helper.
 * @param scenario - Scenario providing the prompt, timeout, and assertions.
 * @param model - Model name to request.
 * @param effective - Effective configuration (default timeout, generation params).
 * @param startTime - Epoch milliseconds used as the latency baseline.
 * @param onExchange - Optional callback invoked with the request/response exchange.
 * @returns The run sample; `success` is false when an assertion reports an error.
 */
async function runChatScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  const requestPath = "/v1/chat/completions";
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const body: Record<string, unknown> = {
    model,
    messages: [{ role: "user", content: scenario.prompt }],
    stream: false,
    ...buildGenerationParams(scenario, effective),
  };

  const envelope = await requestJson(
    paths,
    model,
    requestPath,
    body,
    timeoutMs,
    getModeRequirements("chat")
  );
  const { statusCode } = envelope;

  onExchange?.(
    buildExchangeEvent({
      scenario,
      mode: "chat",
      model,
      requestPath,
      requestPayload: envelope.requestPayload,
      responsePayload: envelope.payload,
      statusCode,
      contentType: envelope.contentType,
      endpointId: envelope.route.endpointId,
      endpointName: envelope.route.endpointName,
      upstreamModel: envelope.route.upstreamModel,
    })
  );

  const chat = (envelope.payload as ChatResponse) ?? {};
  const output = parseAssistantContent(chat);
  const tokens = Number(chat.usage?.total_tokens ?? 0);
  const latencyMs = Date.now() - startTime;

  const assertionError = evaluateAssertions(scenario, {
    output,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode,
  });

  const pool = envelope.poolMetrics;
  return {
    success: !assertionError,
    latencyMs,
    statusCode,
    tokens,
    toolCalls: 0,
    throughputTokensPerSec: calculateThroughput(tokens, latencyMs),
    finalOutput: output,
    outputPreview: truncate(output, 180),
    verdict: assertionError ?? "All assertions passed.",
    usedToolNames: [],
    error: assertionError ?? undefined,
    candidateAttempts: pool?.candidateAttempts ?? 0,
    failovers: pool?.failovers ?? 0,
    rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
    distinctProviders: pool?.distinctProviders ?? 0,
    distinctModels: pool?.distinctModels ?? 0,
  };
}
798
+
799
/**
 * Runs a scenario against the `/v1/responses` endpoint.
 *
 * Sends the scenario prompt as `input` (non-streaming), reports the exchange
 * through `onExchange`, extracts the output text with
 * `extractResponsesOutputText`, and evaluates the scenario's assertions.
 *
 * @param paths - Storage paths forwarded to the request helper.
 * @param scenario - Scenario providing the prompt, timeout, and assertions.
 * @param model - Model name to request.
 * @param effective - Effective configuration (default timeout, generation params).
 * @param startTime - Epoch milliseconds used as the latency baseline.
 * @param onExchange - Optional callback invoked with the request/response exchange.
 * @returns The run sample; `success` is false when an assertion reports an error.
 */
async function runResponsesScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  // Shape of the fields this runner reads from the response payload.
  type ResponsesPayload = {
    output?: Array<{
      type?: string;
      content?: Array<{ type?: string; text?: string }>;
      arguments?: string;
      name?: string;
    }>;
    usage?: { total_tokens?: number };
  };

  const requestPath = "/v1/responses";
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const body: Record<string, unknown> = {
    model,
    input: scenario.prompt,
    stream: false,
    ...buildGenerationParams(scenario, effective),
  };

  const envelope = await requestJson(
    paths,
    model,
    requestPath,
    body,
    timeoutMs,
    getModeRequirements("responses")
  );
  const { statusCode } = envelope;

  onExchange?.(
    buildExchangeEvent({
      scenario,
      mode: "responses",
      model,
      requestPath,
      requestPayload: envelope.requestPayload,
      responsePayload: envelope.payload,
      statusCode,
      contentType: envelope.contentType,
      endpointId: envelope.route.endpointId,
      endpointName: envelope.route.endpointName,
      upstreamModel: envelope.route.upstreamModel,
    })
  );

  const parsed = envelope.payload as ResponsesPayload;
  const output = extractResponsesOutputText(parsed);
  const tokens = Number(parsed?.usage?.total_tokens ?? 0);
  const latencyMs = Date.now() - startTime;

  const assertionError = evaluateAssertions(scenario, {
    output,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode,
  });

  const pool = envelope.poolMetrics;
  return {
    success: !assertionError,
    latencyMs,
    statusCode,
    tokens,
    toolCalls: 0,
    throughputTokensPerSec: calculateThroughput(tokens, latencyMs),
    finalOutput: output,
    outputPreview: truncate(output, 180),
    verdict: assertionError ?? "All assertions passed.",
    usedToolNames: [],
    error: assertionError ?? undefined,
    candidateAttempts: pool?.candidateAttempts ?? 0,
    failovers: pool?.failovers ?? 0,
    rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
    distinctProviders: pool?.distinctProviders ?? 0,
    distinctModels: pool?.distinctModels ?? 0,
  };
}
879
+
880
/**
 * Runs a tool-using agent loop over `/v1/chat/completions`.
 *
 * Each iteration sends the conversation so far (offering the selected tools
 * when any exist), records the exchange, and either stops when the assistant
 * replies without tool calls or executes every requested tool and appends the
 * results to the conversation. Exhausting the iteration budget without a
 * tool-free reply is reported as `max_iterations_reached`.
 *
 * @param paths - Storage paths forwarded to the request helper.
 * @param scenario - Scenario providing prompt, tools, limits, and assertions.
 * @param model - Model name to request.
 * @param effective - Effective configuration (timeouts, max iterations, generation params).
 * @param startTime - Epoch milliseconds used as the latency baseline.
 * @param onExchange - Optional callback invoked for every model and tool exchange.
 * @returns The run sample with token, tool-call, and pool metrics aggregated across iterations.
 * @throws Error when a tool call lacks `function.name` or carries unparsable
 *   arguments; errors from tool execution or its timeout wrapper also propagate.
 */
async function runAgentScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  const chatPath = "/v1/chat/completions";
  const availableTools = getSelectedTools(scenario.tools);
  const conversation: Array<Record<string, unknown>> = [
    { role: "user", content: scenario.prompt },
  ];
  const iterationLimit = scenario.maxIterations ?? effective.defaults.maxIterations;
  const requestTimeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const toolTimeoutMs = effective.defaults.toolTimeoutMs;

  // Decodes the JSON argument string of a tool call; blank input means "no arguments".
  const decodeToolArguments = (
    raw: string | undefined,
    toolName: string
  ): Record<string, unknown> => {
    if (!raw || raw.trim().length === 0) {
      return {};
    }
    try {
      return JSON.parse(raw) as Record<string, unknown>;
    } catch {
      throw new Error(`Scenario ${scenario.id}: invalid tool arguments for ${toolName}.`);
    }
  };

  let toolCalls = 0;
  let totalTokens = 0;
  let finalOutput = "";
  const usedToolNames = new Set<string>();
  let statusCode = 200;
  // Stays true unless an iteration ends with a tool-free assistant reply.
  let reachedIterationCap = true;
  const poolTotals = {
    candidateAttempts: 0,
    failovers: 0,
    rateLimitSwitches: 0,
    distinctProviders: 0,
    distinctModels: 0,
  };

  for (let iteration = 0; iteration < iterationLimit; iteration++) {
    const payload: Record<string, unknown> = {
      model,
      messages: conversation,
      stream: false,
      ...buildGenerationParams(scenario, effective),
    };

    if (availableTools.length > 0) {
      payload.tools = availableTools.map((tool) => ({
        type: "function",
        function: {
          name: tool.name,
          description: tool.description ?? "",
          parameters: tool.inputSchema,
        },
      }));
      payload.tool_choice = "auto";
    }

    const envelope = await requestJson(
      paths,
      model,
      chatPath,
      payload,
      requestTimeoutMs,
      getModeRequirements("agent")
    );

    statusCode = envelope.statusCode;
    const chat = (envelope.payload as ChatResponse) ?? {};
    totalTokens += Number(chat.usage?.total_tokens ?? 0);

    // Attempt counters are summed; the "distinct" counters keep the per-request maximum.
    const pool = envelope.poolMetrics;
    poolTotals.candidateAttempts += pool?.candidateAttempts ?? 0;
    poolTotals.failovers += pool?.failovers ?? 0;
    poolTotals.rateLimitSwitches += pool?.rateLimitSwitches ?? 0;
    poolTotals.distinctProviders = Math.max(
      poolTotals.distinctProviders,
      pool?.distinctProviders ?? 0
    );
    poolTotals.distinctModels = Math.max(poolTotals.distinctModels, pool?.distinctModels ?? 0);

    const assistantMessage = chat.choices?.[0]?.message;
    const assistantText = parseMessageContent(assistantMessage?.content);
    // Keep the last non-empty assistant text as the final output.
    finalOutput = assistantText || finalOutput;
    const requestedCalls = Array.isArray(assistantMessage?.tool_calls)
      ? assistantMessage.tool_calls
      : [];

    onExchange?.(
      buildExchangeEvent({
        scenario,
        mode: "agent",
        model,
        requestPath: chatPath,
        requestPayload: envelope.requestPayload,
        responsePayload: envelope.payload,
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
        toolTrace: buildToolTrace(requestedCalls, []),
      })
    );

    if (requestedCalls.length === 0) {
      conversation.push({ role: "assistant", content: assistantText });
      reachedIterationCap = false;
      break;
    }

    conversation.push({
      role: "assistant",
      content: assistantText || null,
      tool_calls: requestedCalls,
    });

    for (const call of requestedCalls) {
      const name = call.function?.name;
      if (!name) {
        throw new Error(`Scenario ${scenario.id}: tool call is missing function.name.`);
      }

      const args = decodeToolArguments(call.function?.arguments, name);

      const result = await withTimeout(
        executeTool(name, args),
        toolTimeoutMs,
        `Tool execution timed out for ${name} after ${toolTimeoutMs}ms`
      );

      toolCalls += 1;
      usedToolNames.add(name);
      // The fallback id is derived after the counter increment so the tool
      // message and the trace entry share the same value.
      const toolCallId = call.id ?? `tool-${iteration + 1}-${toolCalls}`;
      conversation.push({
        role: "tool",
        tool_call_id: toolCallId,
        content: result.content,
      });
      onExchange?.(
        buildExchangeEvent({
          scenario,
          mode: "agent",
          model,
          requestPath: "/mcp/tools/call",
          requestPayload: {
            tool_name: name,
            arguments: args,
          },
          responsePayload: result.content,
          statusCode: 200,
          contentType: "application/json",
          toolTrace: buildToolTrace([call], [
            {
              name,
              toolCallId,
              content: result.content,
            },
          ]),
        })
      );
    }
  }

  const latencyMs = Date.now() - startTime;
  const capError = reachedIterationCap ? "max_iterations_reached" : null;
  const assertionError = evaluateAssertions(scenario, {
    output: finalOutput,
    toolCalls,
    toolNames: Array.from(usedToolNames),
    latencyMs,
    statusCode,
  });
  // The iteration-cap failure takes precedence over assertion failures.
  const error = capError ?? assertionError;

  return {
    success: !error,
    latencyMs,
    statusCode,
    tokens: totalTokens,
    toolCalls,
    throughputTokensPerSec: calculateThroughput(totalTokens, latencyMs),
    finalOutput,
    outputPreview: truncate(finalOutput, 180),
    verdict: error ?? "All assertions passed.",
    usedToolNames: Array.from(usedToolNames),
    error: error ?? undefined,
    ...poolTotals,
  };
}
1063
+
1064
/**
 * Runs an embeddings scenario against `/v1/embeddings`.
 *
 * Sends `scenario.input`, reports the exchange through `onExchange`, and
 * summarizes the response as `items=<count>,vectorLength=<length of the first
 * embedding>` for assertion evaluation and output display.
 *
 * @param paths - Storage paths forwarded to the request helper.
 * @param scenario - Scenario providing the input, timeout, and assertions.
 * @param model - Model name to request.
 * @param effective - Effective configuration (default request timeout).
 * @param startTime - Epoch milliseconds used as the latency baseline.
 * @param onExchange - Optional callback invoked with the request/response exchange.
 * @returns The run sample; `success` is false when an assertion reports an error.
 */
async function runEmbeddingsScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  type EmbeddingsPayload = {
    data?: Array<{ embedding?: number[] }>;
    usage?: { total_tokens?: number };
  };

  const requestPath = "/v1/embeddings";
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const body: Record<string, unknown> = { model, input: scenario.input };

  const envelope = await requestJson(
    paths,
    model,
    requestPath,
    body,
    timeoutMs,
    getModeRequirements("embeddings")
  );
  const { statusCode } = envelope;

  onExchange?.(
    buildExchangeEvent({
      scenario,
      mode: "embeddings",
      model,
      requestPath,
      requestPayload: envelope.requestPayload,
      responsePayload: envelope.payload,
      statusCode,
      contentType: envelope.contentType,
      endpointId: envelope.route.endpointId,
      endpointName: envelope.route.endpointName,
      upstreamModel: envelope.route.upstreamModel,
    })
  );

  const parsed = envelope.payload as EmbeddingsPayload;
  const items = Array.isArray(parsed?.data) ? parsed.data : [];
  const firstEmbedding = items[0]?.embedding;
  const vectorLength = Array.isArray(firstEmbedding) ? firstEmbedding.length : 0;
  const summary = `items=${items.length},vectorLength=${vectorLength}`;
  const tokens = Number(parsed?.usage?.total_tokens ?? 0);
  const latencyMs = Date.now() - startTime;

  const assertionError = evaluateAssertions(scenario, {
    output: summary,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode,
    embeddingsItems: items.length,
    embeddingsVectorLength: vectorLength,
  });

  const pool = envelope.poolMetrics;
  return {
    success: !assertionError,
    latencyMs,
    statusCode,
    tokens,
    toolCalls: 0,
    throughputTokensPerSec: calculateThroughput(tokens, latencyMs),
    finalOutput: summary,
    outputPreview: truncate(summary, 180),
    verdict: assertionError ?? "All assertions passed.",
    usedToolNames: [],
    error: assertionError ?? undefined,
    candidateAttempts: pool?.candidateAttempts ?? 0,
    failovers: pool?.failovers ?? 0,
    rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
    distinctProviders: pool?.distinctProviders ?? 0,
    distinctModels: pool?.distinctModels ?? 0,
  };
}
1142
+
1143
/**
 * Runs an image-generation scenario against `/v1/images/generations`.
 *
 * Sends the prompt together with the scenario's `n` and `size`, reports the
 * exchange through `onExchange`, and summarizes the response as
 * `images=<count>`. Token count and throughput are always reported as 0.
 *
 * @param paths - Storage paths forwarded to the request helper.
 * @param scenario - Scenario providing prompt, `n`, `size`, timeout, and assertions.
 * @param model - Model name to request.
 * @param effective - Effective configuration (default request timeout).
 * @param startTime - Epoch milliseconds used as the latency baseline.
 * @param onExchange - Optional callback invoked with the request/response exchange.
 * @returns The run sample; `success` is false when an assertion reports an error.
 */
async function runImageScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  const requestPath = "/v1/images/generations";
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const body: Record<string, unknown> = {
    model,
    prompt: scenario.prompt,
    n: scenario.n,
    size: scenario.size,
  };

  const envelope = await requestJson(
    paths,
    model,
    requestPath,
    body,
    timeoutMs,
    getModeRequirements("image_generation")
  );
  const { statusCode } = envelope;

  onExchange?.(
    buildExchangeEvent({
      scenario,
      mode: "image_generation",
      model,
      requestPath,
      requestPayload: envelope.requestPayload,
      responsePayload: envelope.payload,
      statusCode,
      contentType: envelope.contentType,
      endpointId: envelope.route.endpointId,
      endpointName: envelope.route.endpointName,
      upstreamModel: envelope.route.upstreamModel,
    })
  );

  const parsed = envelope.payload as { data?: Array<{ url?: string; b64_json?: string }> };
  const imageCount = (Array.isArray(parsed?.data) ? parsed.data : []).length;
  const summary = `images=${imageCount}`;
  const latencyMs = Date.now() - startTime;

  const assertionError = evaluateAssertions(scenario, {
    output: summary,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode,
    imagesCount: imageCount,
  });

  const pool = envelope.poolMetrics;
  return {
    success: !assertionError,
    latencyMs,
    statusCode,
    tokens: 0,
    toolCalls: 0,
    throughputTokensPerSec: 0,
    finalOutput: summary,
    outputPreview: truncate(summary, 180),
    verdict: assertionError ?? "All assertions passed.",
    usedToolNames: [],
    error: assertionError ?? undefined,
    candidateAttempts: pool?.candidateAttempts ?? 0,
    failovers: pool?.failovers ?? 0,
    rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
    distinctProviders: pool?.distinctProviders ?? 0,
    distinctModels: pool?.distinctModels ?? 0,
  };
}
1216
+
1217
/**
 * Runs an audio-transcription scenario against `/v1/audio/transcriptions`.
 *
 * Reads the scenario's audio file, sends it base64-encoded in a JSON payload
 * with `response_format: "json"`, reports the exchange through `onExchange`,
 * and evaluates the scenario's assertions against the returned `text`.
 * Token count and throughput are always reported as 0.
 *
 * @param paths - Storage paths forwarded to the request helper.
 * @param scenario - Scenario providing `audioFile`, timeout, and assertions.
 * @param model - Model name to request.
 * @param effective - Effective configuration (default request timeout).
 * @param startTime - Epoch milliseconds used as the latency baseline.
 * @param onExchange - Optional callback invoked with the request/response exchange.
 * @returns The run sample; `success` is false when an assertion reports an error.
 * @throws Error when `scenario.audioFile` is missing or empty; file-read and
 *   request errors also propagate.
 */
async function runAudioTranscriptionScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  // Fail with a scenario-scoped message instead of the opaque TypeError that
  // path.resolve(undefined) raises (or an EISDIR from resolving "" to the cwd).
  const audioFile = scenario.audioFile;
  if (typeof audioFile !== "string" || audioFile.length === 0) {
    throw new Error(
      `Scenario ${scenario.id}: audioFile is required for audio_transcription scenarios.`
    );
  }

  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const audioPath = path.resolve(audioFile);
  const audioBuffer = await fs.readFile(audioPath);

  const payload: Record<string, unknown> = {
    model,
    file: audioBuffer.toString("base64"),
    response_format: "json",
  };

  const envelope = await requestJson(
    paths,
    model,
    "/v1/audio/transcriptions",
    payload,
    timeoutMs,
    getModeRequirements("audio_transcription")
  );
  onExchange?.(
    buildExchangeEvent({
      scenario,
      mode: "audio_transcription",
      model,
      requestPath: "/v1/audio/transcriptions",
      requestPayload: envelope.requestPayload,
      responsePayload: envelope.payload,
      statusCode: envelope.statusCode,
      contentType: envelope.contentType,
      endpointId: envelope.route.endpointId,
      endpointName: envelope.route.endpointName,
      upstreamModel: envelope.route.upstreamModel,
    })
  );

  const response = envelope.payload as { text?: string };
  const text = response?.text ?? "";
  const latencyMs = Date.now() - startTime;

  const assertionError = evaluateAssertions(scenario, {
    output: text,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode: envelope.statusCode,
  });

  return {
    success: !assertionError,
    latencyMs,
    statusCode: envelope.statusCode,
    tokens: 0,
    toolCalls: 0,
    throughputTokensPerSec: 0,
    finalOutput: text,
    outputPreview: truncate(text, 180),
    verdict: assertionError ?? "All assertions passed.",
    usedToolNames: [],
    error: assertionError ?? undefined,
    candidateAttempts: envelope.poolMetrics?.candidateAttempts ?? 0,
    failovers: envelope.poolMetrics?.failovers ?? 0,
    rateLimitSwitches: envelope.poolMetrics?.rateLimitSwitches ?? 0,
    distinctProviders: envelope.poolMetrics?.distinctProviders ?? 0,
    distinctModels: envelope.poolMetrics?.distinctModels ?? 0,
  };
}
1290
+
1291
/**
 * Runs a text-to-speech scenario against `/v1/audio/speech`.
 *
 * Sends the scenario's input text, voice, and response format, reads the
 * binary reply, and summarizes it as `bytes=<length>`. Only the byte count,
 * not the audio itself, is reported to `onExchange`. Token count and
 * throughput are always reported as 0.
 *
 * @param paths - Storage paths forwarded to the request helper.
 * @param scenario - Scenario providing `inputText`, `voice`, `response_format`, timeout, and assertions.
 * @param model - Model name to request.
 * @param effective - Effective configuration (default request timeout).
 * @param startTime - Epoch milliseconds used as the latency baseline.
 * @param onExchange - Optional callback invoked with the request/response exchange.
 * @returns The run sample; `success` is false when an assertion reports an error.
 */
async function runAudioSpeechScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  const requestPath = "/v1/audio/speech";
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const body: Record<string, unknown> = {
    model,
    input: scenario.inputText,
    voice: scenario.voice,
    response_format: scenario.response_format,
  };

  const envelope = await requestBinary(
    paths,
    model,
    requestPath,
    body,
    timeoutMs,
    getModeRequirements("audio_speech")
  );
  const { statusCode, contentType } = envelope;

  onExchange?.(
    buildExchangeEvent({
      scenario,
      mode: "audio_speech",
      model,
      requestPath,
      requestPayload: envelope.requestPayload,
      responsePayload: {
        bytes: envelope.buffer.length,
      },
      statusCode,
      contentType,
      endpointId: envelope.route.endpointId,
      endpointName: envelope.route.endpointName,
      upstreamModel: envelope.route.upstreamModel,
    })
  );

  const latencyMs = Date.now() - startTime;
  const byteCount = envelope.buffer.length;
  const summary = `bytes=${byteCount}`;
  const assertionError = evaluateAssertions(scenario, {
    output: summary,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode,
    bytesLength: byteCount,
    contentType,
  });

  const pool = envelope.poolMetrics;
  return {
    success: !assertionError,
    latencyMs,
    statusCode,
    tokens: 0,
    toolCalls: 0,
    throughputTokensPerSec: 0,
    finalOutput: summary,
    outputPreview: truncate(summary, 180),
    verdict: assertionError ?? "All assertions passed.",
    usedToolNames: [],
    error: assertionError ?? undefined,
    candidateAttempts: pool?.candidateAttempts ?? 0,
    failovers: pool?.failovers ?? 0,
    rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
    distinctProviders: pool?.distinctProviders ?? 0,
    distinctModels: pool?.distinctModels ?? 0,
  };
}
1364
+
1365
/**
 * Runs an audio-in chat ("omni") scenario against `/v1/chat/completions`.
 *
 * Reads the scenario's audio file and sends it base64-encoded as an
 * `input_audio` content part, followed by a text part holding the scenario
 * prompt (or a default summarization instruction). The exchange is reported
 * through `onExchange`, assertions are evaluated against the assistant text,
 * and the sample notes whether the reply carried audio output.
 *
 * @param paths - Storage paths forwarded to the request helper.
 * @param scenario - Scenario providing `audioFile`, optional prompt, timeout, and assertions.
 * @param model - Model name to request.
 * @param effective - Effective configuration (default timeout, generation params).
 * @param startTime - Epoch milliseconds used as the latency baseline.
 * @param onExchange - Optional callback invoked with the request/response exchange.
 * @returns The run sample, including `audioOutputPresent`.
 * @throws Error when `scenario.audioFile` is missing or empty; file-read and
 *   request errors also propagate.
 */
async function runOmniCallScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  // Fail with a scenario-scoped message instead of the opaque TypeError that
  // path.resolve(undefined) raises (or an EISDIR from resolving "" to the cwd).
  const audioFile = scenario.audioFile;
  if (typeof audioFile !== "string" || audioFile.length === 0) {
    throw new Error(`Scenario ${scenario.id}: audioFile is required for omni_call scenarios.`);
  }

  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const audioPath = path.resolve(audioFile);
  const audioBuffer = await fs.readFile(audioPath);
  const audioFormat = audioFormatFromFile(audioPath);

  const payload: Record<string, unknown> = {
    model,
    messages: [
      {
        role: "user",
        content: [
          {
            type: "input_audio",
            input_audio: {
              data: audioBuffer.toString("base64"),
              format: audioFormat,
            },
          },
          {
            type: "text",
            text:
              scenario.prompt ??
              "Summarize this audio briefly and answer as transcript text.",
          },
        ],
      },
    ],
    stream: false,
    ...buildGenerationParams(scenario, effective),
  };

  const envelope = await requestJson(
    paths,
    model,
    "/v1/chat/completions",
    payload,
    timeoutMs,
    getModeRequirements("omni_call")
  );
  onExchange?.(
    buildExchangeEvent({
      scenario,
      mode: "omni_call",
      model,
      requestPath: "/v1/chat/completions",
      requestPayload: envelope.requestPayload,
      responsePayload: envelope.payload,
      statusCode: envelope.statusCode,
      contentType: envelope.contentType,
      endpointId: envelope.route.endpointId,
      endpointName: envelope.route.endpointName,
      upstreamModel: envelope.route.upstreamModel,
    })
  );

  const response = (envelope.payload as ChatResponse) ?? {};
  const output = parseAssistantContent(response);
  const tokens = Number(response.usage?.total_tokens ?? 0);
  const audioOutputPresent = responseHasAudioOutput(response);
  const latencyMs = Date.now() - startTime;

  const assertionError = evaluateAssertions(scenario, {
    output,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode: envelope.statusCode,
  });

  return {
    success: !assertionError,
    latencyMs,
    statusCode: envelope.statusCode,
    tokens,
    toolCalls: 0,
    throughputTokensPerSec: calculateThroughput(tokens, latencyMs),
    finalOutput: output,
    // The preview (not the final output) carries the audio-output marker.
    outputPreview: truncate(`${output}\naudio_output=${audioOutputPresent ? "yes" : "no"}`, 180),
    verdict: assertionError ?? "All assertions passed.",
    usedToolNames: [],
    error: assertionError ?? undefined,
    candidateAttempts: envelope.poolMetrics?.candidateAttempts ?? 0,
    failovers: envelope.poolMetrics?.failovers ?? 0,
    rateLimitSwitches: envelope.poolMetrics?.rateLimitSwitches ?? 0,
    distinctProviders: envelope.poolMetrics?.distinctProviders ?? 0,
    distinctModels: envelope.poolMetrics?.distinctModels ?? 0,
    audioOutputPresent,
  };
}
1462
+
1463
/**
 * Derives the audio format label from a file's extension.
 *
 * The extension is compared case-insensitively. `mp4` is reported as `m4a`,
 * and any unrecognized or missing extension falls back to `wav`.
 *
 * @param filePath - Path or file name of the audio file.
 * @returns One of `mp3`, `wav`, `ogg`, `m4a`, or `webm`.
 */
function audioFormatFromFile(filePath: string): string {
  const extension = path.extname(filePath).slice(1).toLowerCase();
  switch (extension) {
    case "mp3":
    case "wav":
    case "ogg":
    case "webm":
      return extension;
    case "m4a":
    case "mp4":
      return "m4a";
    default:
      return "wav";
  }
}
1472
+
1473
/**
 * Reports whether the first choice of a chat response carries audio output.
 *
 * Audio counts as present when the message has an `audio` object with a
 * string `url` or `data`, or when its `content` array contains a part of type
 * `audio` / `output_audio` whose `audio` object has a string `url` or `data`.
 *
 * @param response - Chat response to inspect.
 * @returns `true` when audio output is found, otherwise `false`.
 */
function responseHasAudioOutput(response: ChatResponse): boolean {
  // True for an object exposing a string `url` or a string `data`.
  const carriesAudio = (candidate: unknown): boolean => {
    if (!candidate || typeof candidate !== "object") {
      return false;
    }
    const { url, data } = candidate as { url?: unknown; data?: unknown };
    return typeof url === "string" || typeof data === "string";
  };

  const message = response.choices?.[0]?.message;
  if (!message || typeof message !== "object") {
    return false;
  }
  if (carriesAudio((message as { audio?: unknown }).audio)) {
    return true;
  }

  const parts = (message as { content?: unknown }).content;
  if (!Array.isArray(parts)) {
    return false;
  }
  return parts.some((part: unknown) => {
    if (!part || typeof part !== "object") {
      return false;
    }
    const { type, audio } = part as { type?: unknown; audio?: unknown };
    return (type === "audio" || type === "output_audio") && carriesAudio(audio);
  });
}
1508
+
1509
/**
 * Routes a JSON request and returns the parsed response with routing metadata.
 *
 * The payload is cloned through a JSON round-trip before being handed to
 * `routeRequest`, and that clone is what the envelope reports as
 * `requestPayload`. The response body is read fully and parsed with
 * `parseJson`.
 *
 * @param paths - Storage paths forwarded to the router.
 * @param model - Model name to route.
 * @param requestPath - API path to call.
 * @param payload - Request body; left untouched by this function.
 * @param timeoutMs - Timeout for the abort signal passed to the router.
 * @param requirements - Endpoint type and capability requirements for routing.
 * @returns Status code, parsed payload, content type, request clone, route info, and pool metrics.
 */
async function requestJson(
  paths: StoragePaths,
  model: string,
  requestPath: string,
  payload: Record<string, unknown>,
  timeoutMs: number,
  requirements: BenchmarkModeRequirements
): Promise<JsonResponseEnvelope> {
  const requestPayload = JSON.parse(JSON.stringify(payload)) as Record<string, unknown>;

  const { attempt } = await routeRequest(
    paths,
    model,
    requestPath,
    requestPayload,
    {},
    AbortSignal.timeout(timeoutMs),
    {
      endpointType: requirements.preferredEndpointType,
      requiredInput: requirements.requiredInput,
      requiredOutput: requirements.requiredOutput,
    }
  );

  const body = await readBody(attempt.response.body, attempt.response.headers);
  const parsedPayload = parseJson(body.buffer);

  return {
    statusCode: attempt.response.statusCode,
    payload: parsedPayload,
    contentType: body.contentType,
    requestPayload,
    route: {
      endpointId: attempt.endpoint.id,
      endpointName: attempt.endpoint.name,
      upstreamModel: attempt.upstreamModel,
    },
    poolMetrics: attempt.pool,
  };
}
1547
+
1548
/**
 * Routes a request whose response is binary and returns the raw bytes.
 *
 * The payload is cloned through a JSON round-trip before being handed to
 * `routeRequest`, and that clone is what the envelope reports as
 * `requestPayload`. The response body is read fully into a buffer and
 * returned without parsing.
 *
 * @param paths - Storage paths forwarded to the router.
 * @param model - Model name to route.
 * @param requestPath - API path to call.
 * @param payload - Request body; left untouched by this function.
 * @param timeoutMs - Timeout for the abort signal passed to the router.
 * @param requirements - Endpoint type and capability requirements for routing.
 * @returns Status code, response buffer, content type, request clone, route info, and pool metrics.
 */
async function requestBinary(
  paths: StoragePaths,
  model: string,
  requestPath: string,
  payload: Record<string, unknown>,
  timeoutMs: number,
  requirements: BenchmarkModeRequirements
): Promise<BinaryResponseEnvelope> {
  const requestPayload = JSON.parse(JSON.stringify(payload)) as Record<string, unknown>;

  const { attempt } = await routeRequest(
    paths,
    model,
    requestPath,
    requestPayload,
    {},
    AbortSignal.timeout(timeoutMs),
    {
      endpointType: requirements.preferredEndpointType,
      requiredInput: requirements.requiredInput,
      requiredOutput: requirements.requiredOutput,
    }
  );

  const body = await readBody(attempt.response.body, attempt.response.headers);

  return {
    statusCode: attempt.response.statusCode,
    buffer: body.buffer,
    contentType: body.contentType,
    requestPayload,
    route: {
      endpointId: attempt.endpoint.id,
      endpointName: attempt.endpoint.name,
      upstreamModel: attempt.upstreamModel,
    },
    poolMetrics: attempt.pool,
  };
}
1585
+
1586
/**
 * Drains a readable stream into a single buffer and extracts the content type
 * from the accompanying headers.
 *
 * @param stream - Stream to consume; non-Buffer chunks are converted with `Buffer.from`.
 * @param headers - Header map; only the keys `content-type` and `Content-Type` are consulted.
 * @returns The concatenated bytes and the content type (`""` when the header is
 *   absent; multi-valued headers are joined with `", "`).
 */
async function readBody(
  stream: NodeJS.ReadableStream,
  headers: Record<string, string | string[]>
): Promise<{ buffer: Buffer; contentType: string }> {
  const pieces: Buffer[] = [];
  for await (const piece of stream) {
    if (Buffer.isBuffer(piece)) {
      pieces.push(piece);
    } else {
      pieces.push(Buffer.from(piece));
    }
  }

  const rawHeader = headers["content-type"] ?? headers["Content-Type"];
  let contentType: string;
  if (Array.isArray(rawHeader)) {
    contentType = rawHeader.join(", ");
  } else {
    contentType = rawHeader ?? "";
  }

  return { buffer: Buffer.concat(pieces), contentType };
}
1602
+
1603
/**
 * Decodes a UTF-8 buffer as JSON without ever throwing.
 *
 * @param buffer - Raw response bytes.
 * @returns The parsed value; `{}` when the trimmed text is empty; `{ raw: text }`
 *   when the text is not valid JSON.
 */
function parseJson(buffer: Buffer): unknown {
  const text = buffer.toString("utf8").trim();
  if (text.length === 0) {
    return {};
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    // Keep the unparseable body so it can still be inspected downstream.
    parsed = { raw: text };
  }
  return parsed;
}
1614
+
1615
/**
 * Returns the cached tools, optionally narrowed to a set of names.
 *
 * @param requestedNames - Tool names to keep; when omitted or empty, every
 *   cached tool is returned.
 */
function getSelectedTools(requestedNames?: string[]) {
  const available = getCachedTools();
  const wanted = new Set(requestedNames ?? []);
  // A non-empty name list always yields a non-empty set, so size 0 means "no filter".
  if (wanted.size === 0) {
    return available;
  }
  return available.filter((tool) => wanted.has(tool.name));
}
1623
+
1624
/**
 * Extracts the text of the first choice's message from a chat response.
 *
 * @param response - Chat response; missing choices/message/content are tolerated.
 * @returns Whatever `parseMessageContent` makes of the first choice's content.
 */
function parseAssistantContent(response: ChatResponse): string {
  const firstChoice = response.choices?.[0];
  return parseMessageContent(firstChoice?.message?.content);
}
1628
+
1629
/**
 * Resolves the sampling parameters for a scenario.
 *
 * For each parameter the scenario's own value wins, then the run-level value,
 * then the configured default. Entries that resolve to `undefined` are removed
 * by `compactObject`.
 *
 * @param scenario - Scenario whose per-scenario overrides take precedence.
 * @param effective - Effective config providing `run` and `defaults` fallbacks.
 */
function buildGenerationParams(
  scenario: BenchmarkScenario,
  effective: EffectiveBenchmarkConfig
): Record<string, unknown> {
  const { run, defaults } = effective;

  // Same result as `a ?? b ?? c`: first non-nullish value, else the last candidate.
  const resolve = <T>(fromScenario: T, fromRun: T, fromDefaults: T): T => {
    if (fromScenario != null) {
      return fromScenario;
    }
    return fromRun != null ? fromRun : fromDefaults;
  };

  return compactObject({
    temperature: resolve(scenario.temperature, run.temperature, defaults.temperature),
    top_p: resolve(scenario.top_p, run.top_p, defaults.top_p),
    max_tokens: resolve(scenario.max_tokens, run.max_tokens, defaults.max_tokens),
    presence_penalty: resolve(
      scenario.presence_penalty,
      run.presence_penalty,
      defaults.presence_penalty
    ),
    frequency_penalty: resolve(
      scenario.frequency_penalty,
      run.frequency_penalty,
      defaults.frequency_penalty
    ),
    seed: resolve(scenario.seed, run.seed, defaults.seed),
    stop: resolve(scenario.stop, run.stop, defaults.stop),
  });
}
1664
+
1665
/**
 * Flattens a responses-style `output` array into a single text block.
 *
 * `message` items contribute the text of each `output_text` part;
 * `function_call` items with a string name contribute `tool_call:<name>`.
 * Every other item is ignored.
 *
 * @param response - Object carrying the optional `output` array.
 * @returns The collected fragments joined with newlines and trimmed.
 */
function extractResponsesOutputText(response: {
  output?: Array<{
    type?: string;
    content?: Array<{ type?: string; text?: string }>;
    name?: string;
    arguments?: string;
  }>;
}): string {
  const fragments = (response.output ?? []).flatMap((item): string[] => {
    if (!item || typeof item !== "object") {
      return [];
    }
    switch (item.type) {
      case "message":
        return (item.content ?? []).flatMap((part) =>
          part?.type === "output_text" && typeof part.text === "string" ? [part.text] : []
        );
      case "function_call":
        return typeof item.name === "string" ? [`tool_call:${item.name}`] : [];
      default:
        return [];
    }
  });
  return fragments.join("\n").trim();
}
1692
+
1693
/**
 * Normalizes message content to plain text.
 *
 * @param content - A string, an array of parts, or anything else.
 * @returns The string itself; for arrays, the non-empty string parts and
 *   string `text` fields joined with newlines; `""` for every other input.
 */
function parseMessageContent(content: unknown): string {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return "";
  }

  const texts: string[] = [];
  for (const part of content) {
    let text = "";
    if (typeof part === "string") {
      text = part;
    } else if (part && typeof part === "object") {
      const candidate = (part as { text?: unknown }).text;
      if (typeof candidate === "string") {
        text = candidate;
      }
    }
    // Empty fragments are dropped so they do not produce blank lines.
    if (text.length > 0) {
      texts.push(text);
    }
  }
  return texts.join("\n");
}
1713
+
1714
/** Observed facts about one scenario run that `evaluateAssertions` checks. */
interface AssertionRuntime {
  /** Text matched against the contains / notContains style assertions. */
  output: string;
  /** Number of tool calls, compared with minToolCalls / maxToolCalls. */
  toolCalls: number;
  /** Tool names used, compared with requiredToolNames. */
  toolNames: string[];
  /** Measured latency in milliseconds, compared with maxLatencyMs. */
  latencyMs: number;
  /** Status code, compared for equality with the asserted status code. */
  statusCode: number;
  /** Embedding item count for minItems; treated as 0 when absent. */
  embeddingsItems?: number;
  /** Embedding vector length for minVectorLength; treated as 0 when absent. */
  embeddingsVectorLength?: number;
  /** Image count for minImages; treated as 0 when absent. */
  imagesCount?: number;
  /** Byte length for minBytes; treated as 0 when absent. */
  bytesLength?: number;
  /** Content type for the contentType assertion; treated as "" when absent. */
  contentType?: string;
}
1726
+
1727
/**
 * Checks one run against the scenario's assertions.
 *
 * Checks run in a fixed order and stop at the first violation.
 *
 * @param scenario - Scenario whose `assertions` are evaluated.
 * @param runtime - Observed values for the run.
 * @returns The first failure message, or `null` when every assertion holds.
 */
function evaluateAssertions(scenario: BenchmarkScenario, runtime: AssertionRuntime): string | null {
  const expected = scenario.assertions;
  const { output } = runtime;

  // Fails on the first needle whose presence in the output differs from `mustAppear`.
  const textCheck = (
    needles: readonly string[] | undefined,
    mustAppear: boolean,
    describe: (needle: string) => string
  ): string | null => {
    for (const needle of needles ?? []) {
      if (output.includes(needle) !== mustAppear) {
        return describe(needle);
      }
    }
    return null;
  };

  // Fails when a numeric minimum is configured and the observation (default 0) is below it.
  const minimumCheck = (
    minimum: unknown,
    observed: number | undefined,
    describe: (minimum: number, actual: number) => string
  ): string | null => {
    if (typeof minimum !== "number") {
      return null;
    }
    const actual = observed ?? 0;
    return actual < minimum ? describe(minimum, actual) : null;
  };

  // The order of this list is the order in which failures are reported.
  const checks: Array<() => string | null> = [
    () =>
      textCheck(expected.contains, true, (needle) => `Assertion failed: output must include '${needle}'.`),
    () =>
      textCheck(
        expected.notContains,
        false,
        (needle) => `Assertion failed: output must not include '${needle}'.`
      ),
    () => {
      for (const toolName of expected.requiredToolNames ?? []) {
        if (!runtime.toolNames.includes(toolName)) {
          return `Assertion failed: expected tool '${toolName}' to be used.`;
        }
      }
      return null;
    },
    () =>
      typeof expected.minToolCalls === "number" && runtime.toolCalls < expected.minToolCalls
        ? `Assertion failed: expected at least ${expected.minToolCalls} tool calls, got ${runtime.toolCalls}.`
        : null,
    () =>
      typeof expected.maxToolCalls === "number" && runtime.toolCalls > expected.maxToolCalls
        ? `Assertion failed: expected at most ${expected.maxToolCalls} tool calls, got ${runtime.toolCalls}.`
        : null,
    () =>
      typeof expected.maxLatencyMs === "number" && runtime.latencyMs > expected.maxLatencyMs
        ? `Assertion failed: latency ${runtime.latencyMs}ms exceeded ${expected.maxLatencyMs}ms.`
        : null,
    // The status code is compared unconditionally.
    () =>
      runtime.statusCode !== expected.statusCode
        ? `Assertion failed: expected status ${expected.statusCode}, got ${runtime.statusCode}.`
        : null,
    () =>
      minimumCheck(
        expected.minItems,
        runtime.embeddingsItems,
        (minimum, actual) => `Assertion failed: expected at least ${minimum} embeddings, got ${actual}.`
      ),
    () =>
      minimumCheck(
        expected.minVectorLength,
        runtime.embeddingsVectorLength,
        (minimum, actual) => `Assertion failed: expected vector length >= ${minimum}, got ${actual}.`
      ),
    () =>
      minimumCheck(
        expected.minImages,
        runtime.imagesCount,
        (minimum, actual) => `Assertion failed: expected at least ${minimum} images, got ${actual}.`
      ),
    () =>
      textCheck(
        expected.containsText,
        true,
        (needle) => `Assertion failed: transcription must include '${needle}'.`
      ),
    () =>
      textCheck(
        expected.notContainsText,
        false,
        (needle) => `Assertion failed: transcription must not include '${needle}'.`
      ),
    () =>
      minimumCheck(
        expected.minBytes,
        runtime.bytesLength,
        (minimum, actual) => `Assertion failed: expected at least ${minimum} bytes, got ${actual}.`
      ),
    () => {
      if (!expected.contentType) {
        return null;
      }
      const contentType = runtime.contentType ?? "";
      // Case-insensitive substring match, not strict equality.
      return contentType.toLowerCase().includes(expected.contentType.toLowerCase())
        ? null
        : `Assertion failed: expected content type to include '${expected.contentType}', got '${contentType}'.`;
    },
  ];

  for (const check of checks) {
    const failure = check();
    if (failure !== null) {
      return failure;
    }
  }
  return null;
}
1813
+
1814
/**
 * Builds the result record for a scenario that was not executed.
 *
 * The record has status `"skipped"`, `success: true`, a pass rate of 1 and
 * zeroed metrics; the reason is stored as both `skippedReason` and `verdict`.
 *
 * @param scenario - Scenario that was skipped.
 * @param reason - Explanation for the skip.
 */
function buildSkippedScenarioResult(
  scenario: BenchmarkScenario,
  reason: string
): ScenarioResult {
  const descriptor = {
    id: scenario.id,
    mode: scenario.mode,
    title: scenario.title,
    summary: scenario.summary,
    userVisibleGoal: scenario.userVisibleGoal,
    exampleSource: scenario.exampleSource,
    inputPreview: scenario.inputPreview ?? describeScenarioInput(scenario),
    successCriteria: scenario.successCriteria,
    expectedHighlights: scenario.expectedHighlights,
  };

  // Nothing ran, so every counter and latency figure is zero.
  const emptyMetrics = {
    passedRuns: 0,
    failedRuns: 0,
    avgLatencyMs: 0,
    p50LatencyMs: 0,
    p95LatencyMs: 0,
    p99LatencyMs: 0,
    totalTokens: 0,
    totalToolCalls: 0,
    avgThroughputTokensPerSec: 0,
    candidateAttempts: 0,
    failovers: 0,
    rateLimitSwitches: 0,
    distinctProviders: 0,
    distinctModels: 0,
  };

  return {
    ...descriptor,
    model: scenario.model ?? "unresolved",
    status: "skipped",
    success: true,
    skippedReason: reason,
    passRate: 1,
    ...emptyMetrics,
    errorReasons: [],
    usedToolNames: [],
    verdict: reason,
    outputPreview: "",
    audioOutputRuns: 0,
  };
}
1854
+
1855
/**
 * Aggregates the samples of one scenario into its result record.
 *
 * The scenario passes when the share of successful samples reaches
 * `minScenarioPassRate`. Provider/model diversity is the maximum seen in any
 * single sample; the other counters are summed across samples.
 *
 * @param scenario - Scenario the samples belong to.
 * @param model - Model reference recorded on the result.
 * @param samples - Individual run samples (may be empty).
 * @param minScenarioPassRate - Pass-rate threshold for `"passed"`.
 */
function buildScenarioResult(
  scenario: BenchmarkScenario,
  model: string,
  samples: ScenarioRunSample[],
  minScenarioPassRate: number
): ScenarioResult {
  const runCount = samples.length;
  const sumOver = (pick: (sample: ScenarioRunSample) => number): number =>
    samples.reduce((running, sample) => running + pick(sample), 0);
  const peakOver = (pick: (sample: ScenarioRunSample) => number): number =>
    samples.reduce((peak, sample) => Math.max(peak, pick(sample)), 0);

  const passedRuns = samples.filter((sample) => sample.success).length;
  const passRate = runCount > 0 ? passedRuns / runCount : 0;
  const status = passRate >= minScenarioPassRate ? "passed" : "failed";

  // Ascending order is required by percentile().
  const latencies = samples.map((sample) => sample.latencyMs).sort((a, b) => a - b);
  const latencyTotal = latencies.reduce((sum, value) => sum + value, 0);
  const avgLatencyMs = latencies.length > 0 ? Math.round(latencyTotal / latencies.length) : 0;
  const avgThroughputTokensPerSec =
    runCount > 0 ? sumOver((sample) => sample.throughputTokensPerSec) / runCount : 0;

  // Tally identical error messages; most frequent first (ties keep first-seen order).
  const errorTally = new Map<string, number>();
  for (const { error } of samples) {
    if (error) {
      errorTally.set(error, (errorTally.get(error) ?? 0) + 1);
    }
  }
  const errorReasons = Array.from(errorTally.entries())
    .sort(([, countA], [, countB]) => countB - countA)
    .map(([reason, count]) => `${reason} (${count})`);

  // Preview comes from the most recent sample that produced one.
  const latestWithPreview = samples
    .slice()
    .reverse()
    .find((sample) => sample.outputPreview);
  const outputPreview = latestWithPreview?.outputPreview ?? "";

  const verdict =
    status === "passed"
      ? "All assertions passed."
      : (samples.find((sample) => sample.error)?.error ?? errorReasons[0] ?? "Scenario failed.");

  return {
    id: scenario.id,
    mode: scenario.mode,
    title: scenario.title,
    summary: scenario.summary,
    userVisibleGoal: scenario.userVisibleGoal,
    exampleSource: scenario.exampleSource,
    inputPreview: scenario.inputPreview ?? describeScenarioInput(scenario),
    successCriteria: scenario.successCriteria,
    expectedHighlights: scenario.expectedHighlights,
    model,
    status,
    success: status === "passed",
    passRate: Number(passRate.toFixed(4)),
    passedRuns,
    failedRuns: runCount - passedRuns,
    avgLatencyMs,
    p50LatencyMs: percentile(latencies, 50),
    p95LatencyMs: percentile(latencies, 95),
    p99LatencyMs: percentile(latencies, 99),
    totalTokens: sumOver((sample) => sample.tokens),
    totalToolCalls: sumOver((sample) => sample.toolCalls),
    avgThroughputTokensPerSec: Number(avgThroughputTokensPerSec.toFixed(3)),
    candidateAttempts: sumOver((sample) => sample.candidateAttempts ?? 0),
    failovers: sumOver((sample) => sample.failovers ?? 0),
    rateLimitSwitches: sumOver((sample) => sample.rateLimitSwitches ?? 0),
    distinctProviders: peakOver((sample) => sample.distinctProviders ?? 0),
    distinctModels: peakOver((sample) => sample.distinctModels ?? 0),
    audioOutputRuns: sumOver((sample) => (sample.audioOutputPresent ? 1 : 0)),
    usedToolNames: uniqueToolNames(samples),
    verdict,
    errorReasons,
    outputPreview,
  };
}
1937
+
1938
/**
 * Collects every tool name used across the samples.
 *
 * @param samples - Run samples, each carrying `usedToolNames`.
 * @returns Distinct names in default (code-unit) sort order.
 */
function uniqueToolNames(samples: ScenarioRunSample[]): string[] {
  const everyName = samples.flatMap((sample) => sample.usedToolNames);
  return [...new Set<string>(everyName)].sort();
}
1947
+
1948
/**
 * Picks a human-readable description of what a scenario sends.
 *
 * Precedence: `inputPreview`, `prompt`, `inputText`, a string `input`
 * (returned even when empty), an array `input` joined with `" | "`, then
 * `audioFile`.
 *
 * @param scenario - Scenario to describe.
 * @returns The first applicable description, or `""` when none exists.
 */
function describeScenarioInput(scenario: BenchmarkScenario): string {
  const explicitText = scenario.inputPreview || scenario.prompt || scenario.inputText;
  if (explicitText) {
    return explicitText;
  }

  const { input } = scenario;
  if (typeof input === "string") {
    return input;
  }
  if (Array.isArray(input)) {
    return input.join(" | ");
  }

  return scenario.audioFile || "";
}
1969
+
1970
/**
 * Derives a per-model capability matrix from the executed scenarios.
 *
 * Only scenarios that declare a `capability` contribute. For each model and
 * capability one finding is kept (see `shouldReplaceFinding`); capabilities
 * with no finding are reported as `"unknown"` with confidence 0.
 *
 * @param effective - Effective config; supplies the TTL (`run.capTtlDays`,
 *   default 7), the suite and the profile used for the fingerprint.
 * @param executions - Scenario executions to classify.
 * @returns The matrix with models sorted by name, or `undefined` when no
 *   execution contributed.
 */
function buildCapabilityMatrix(
  effective: EffectiveBenchmarkConfig,
  executions: ScenarioExecution[]
): BenchmarkCapabilityMatrix | undefined {
  type CapabilityFinding = {
    status: BenchmarkCapabilityStatus;
    confidence: number;
    evidence: string;
    observedAt: string;
    scenarioId?: string;
    statusCode?: number;
  };
  type ModelRecord = {
    providerId: string;
    modelId: string;
    findings: Partial<Record<BenchmarkCapabilityKey, CapabilityFinding>>;
    lastVerifiedAt: string;
  };

  const isoNow = (): string => new Date().toISOString();
  const ttlDays = effective.run.capTtlDays ?? 7;
  const ttlMs = ttlDays * 24 * 60 * 60 * 1000;
  const byModel = new Map<string, ModelRecord>();

  // Pass 1: fold every capability-tagged execution into its model's record.
  for (const execution of executions) {
    const capability = execution.scenario.capability;
    if (!capability) {
      continue;
    }

    const { providerId, modelId } = splitModelRef(execution.result.model);
    const modelKey = `${providerId}/${modelId}`;
    let record = byModel.get(modelKey);
    if (record === undefined) {
      record = { providerId, modelId, findings: {}, lastVerifiedAt: isoNow() };
    }

    const status = classifyFromExecution(execution);
    const confidence = confidenceFromExecution(status, execution.result);
    const evidenceSource = execution.result.errorReasons[0] ?? execution.result.outputPreview;
    // First positive sample status wins; otherwise 0 for skipped runs and 200 for the rest.
    const sampledStatusCode = execution.samples.find((sample) => sample.statusCode > 0)?.statusCode;
    const statusCode = sampledStatusCode ?? (execution.result.status === "skipped" ? 0 : 200);

    const candidate: CapabilityFinding = {
      status,
      confidence,
      evidence: truncate(evidenceSource || "No explicit evidence", 220),
      observedAt: isoNow(),
      scenarioId: execution.scenario.id,
      statusCode: statusCode > 0 ? statusCode : undefined,
    };

    const current = record.findings[capability];
    const replace =
      !current ||
      shouldReplaceFinding(current.status, candidate.status, current.confidence, candidate.confidence);
    if (replace) {
      record.findings[capability] = candidate;
    }

    record.lastVerifiedAt = isoNow();
    byModel.set(modelKey, record);
  }

  if (byModel.size === 0) {
    return undefined;
  }

  // Pass 2: expand each record into a snapshot covering every known capability key.
  const models = Array.from(byModel, ([key, record]): BenchmarkModelCapabilitySnapshot => {
    const findings = Object.fromEntries(
      BENCHMARK_CAPABILITY_KEYS.map((capability) => {
        const observed = record.findings[capability];
        if (!observed) {
          return [
            capability,
            {
              capability,
              status: "unknown" as const,
              confidence: 0,
              evidence: "No probe evidence in this run.",
              observedAt: record.lastVerifiedAt,
            },
          ];
        }
        return [
          capability,
          {
            capability,
            status: observed.status,
            confidence: observed.confidence,
            evidence: observed.evidence,
            scenarioId: observed.scenarioId,
            statusCode: observed.statusCode,
            observedAt: observed.observedAt,
          },
        ];
      })
    ) as BenchmarkModelCapabilitySnapshot["findings"];

    // Model confidence is the mean over all capabilities, including the zero-confidence unknowns.
    const confidenceValues = Object.values(findings).map((finding) => finding.confidence);
    const confidenceTotal = confidenceValues.reduce((sum, value) => sum + value, 0);
    const avgConfidence = confidenceValues.length > 0 ? confidenceTotal / confidenceValues.length : 0;

    const expiresAt = new Date(Date.parse(record.lastVerifiedAt) + ttlMs).toISOString();

    return {
      model: key,
      providerId: record.providerId,
      modelId: record.modelId,
      configFingerprint: computeConfigFingerprint({
        suite: effective.run.suite,
        model: key,
        profile: effective.profile,
      }),
      confidence: Number(avgConfidence.toFixed(3)),
      lastVerifiedAt: record.lastVerifiedAt,
      expiresAt,
      freshness: Date.now() <= Date.parse(expiresAt) ? "fresh" : "stale",
      findings,
    };
  });

  models.sort((a, b) => a.model.localeCompare(b.model));

  return {
    generatedAt: isoNow(),
    ttlDays,
    models,
  };
}
2098
+
2099
/**
 * Maps a scenario execution to a capability status.
 *
 * Skipped scenarios are `"unknown"` and successful ones `"supported"`.
 * Failures are delegated to `classifyCapabilityStatus` using the first failing
 * sample (or the first sample when none is marked as failed).
 *
 * @param execution - Execution to classify.
 */
function classifyFromExecution(execution: ScenarioExecution): BenchmarkCapabilityStatus {
  const { result, samples } = execution;

  if (result.status === "skipped") {
    return "unknown";
  }
  if (result.success) {
    return "supported";
  }

  const representative = samples.find((item) => !item.success) ?? samples[0];
  const error = representative?.error ?? result.errorReasons[0];
  return classifyCapabilityStatus({
    success: false,
    statusCode: representative?.statusCode,
    error,
  });
}
2115
+
2116
/**
 * Assigns a confidence score to a capability classification.
 *
 * @param status - Classified capability status.
 * @param result - Scenario result; only `passRate` is read.
 * @returns The pass rate floored at 0.5 for `"supported"`, 0.9 for
 *   `"unsupported"` / `"misconfigured"`, and 0.4 for anything else.
 */
function confidenceFromExecution(status: BenchmarkCapabilityStatus, result: ScenarioResult): number {
  switch (status) {
    case "supported":
      return Math.max(0.5, result.passRate);
    case "unsupported":
    case "misconfigured":
      return 0.9;
    default:
      return 0.4;
  }
}
2125
+
2126
/**
 * Decides whether a new capability finding supersedes the stored one.
 *
 * Statuses are ranked supported > unsupported > misconfigured > unknown. A
 * higher rank always wins; on equal rank the new finding wins when its
 * confidence is at least the current confidence.
 *
 * @param currentStatus - Status of the stored finding.
 * @param nextStatus - Status of the candidate finding.
 * @param currentConfidence - Confidence of the stored finding.
 * @param nextConfidence - Confidence of the candidate finding.
 */
function shouldReplaceFinding(
  currentStatus: BenchmarkCapabilityStatus,
  nextStatus: BenchmarkCapabilityStatus,
  currentConfidence: number,
  nextConfidence: number
): boolean {
  const priority: Record<BenchmarkCapabilityStatus, number> = {
    supported: 4,
    unsupported: 3,
    misconfigured: 2,
    unknown: 1,
  };

  const currentRank = priority[currentStatus];
  const nextRank = priority[nextStatus];
  return nextRank === currentRank ? nextConfidence >= currentConfidence : nextRank > currentRank;
}
2149
+
2150
/**
 * Splits a `provider/model` reference at its first slash.
 *
 * @param model - Model reference; the model part may itself contain slashes.
 * @returns The two parts, or `{ providerId: "unknown", modelId: model }` when
 *   there is no slash or nothing precedes the first slash.
 */
function splitModelRef(model: string): { providerId: string; modelId: string } {
  const slashAt = model.indexOf("/");
  // -1: no separator at all; 0: empty provider segment.
  if (slashAt <= 0) {
    return { providerId: "unknown", modelId: model };
  }
  return {
    providerId: model.slice(0, slashAt),
    modelId: model.slice(slashAt + 1),
  };
}
2160
+
2161
/**
 * Assembles the benchmark report (everything except gate results).
 *
 * Success rate, token totals and tool-call totals are computed over executed
 * (non-skipped) scenarios only. Latency and throughput statistics are computed
 * over every sample of every execution.
 *
 * @param effective - Effective configuration for the run.
 * @param warnings - Warnings copied onto the report.
 * @param scenarioPath - Scenario file path, if any.
 * @param executions - Per-scenario results, samples and exchanges.
 * @param capabilityMatrix - Capability matrix to attach, if any.
 * @param reportId - Report id; a random UUID is generated when omitted.
 */
function buildReport(
  effective: EffectiveBenchmarkConfig,
  warnings: string[],
  scenarioPath: string | undefined,
  executions: ScenarioExecution[],
  capabilityMatrix: BenchmarkCapabilityMatrix | undefined,
  reportId?: string
): Omit<BenchmarkReport, "gateResults"> {
  const results = executions.map((item) => item.result);
  const allSamples = executions.flatMap((item) => item.samples);
  const countWithStatus = (status: string): number =>
    results.filter((result) => result.status === status).length;

  const skipped = countWithStatus("skipped");
  const succeeded = countWithStatus("passed");
  const failed = countWithStatus("failed");

  // Token and tool-call totals ignore skipped scenarios.
  const executedResults = results.filter((result) => result.status !== "skipped");
  let totalTokens = 0;
  let totalToolCalls = 0;
  for (const result of executedResults) {
    totalTokens += result.totalTokens;
    totalToolCalls += result.totalToolCalls;
  }
  const executed = executedResults.length;

  // Ascending order is required by percentile().
  const latencies = allSamples.map((sample) => sample.latencyMs).sort((a, b) => a - b);
  const latencyTotal = latencies.reduce((sum, value) => sum + value, 0);
  const avgLatencyMs = latencies.length > 0 ? Math.round(latencyTotal / latencies.length) : 0;

  const throughputTotal = allSamples.reduce((sum, sample) => sum + sample.throughputTokensPerSec, 0);
  const avgThroughputTokensPerSec = allSamples.length > 0 ? throughputTotal / allSamples.length : 0;

  const topFailureReasons = collectTopFailureReasons(allSamples);
  const modeSummary = summarizeByMode(results);

  const scenarioDetails = executions.map(({ result, example, exchanges }) => ({
    id: result.id,
    suite: effective.run.suite,
    example,
    model: result.model,
    status: result.status,
    verdict: result.verdict,
    exchanges,
    finalResponsePreview: result.outputPreview,
    usedToolNames: result.usedToolNames,
  }));
  const scenarioRuns = executions.map(({ result, samples }) => ({
    id: result.id,
    samples,
  }));

  return {
    id: reportId ?? randomUUID(),
    createdAt: new Date().toISOString(),
    profile: effective.profile,
    executionMode: effective.run.executionMode ?? "diagnostic",
    suite: effective.run.suite,
    exampleId: effective.run.exampleId,
    scenarioPath,
    modelOverride: effective.run.modelOverride,
    configSource: effective.configSource,
    total: results.length,
    executed,
    skipped,
    succeeded,
    failed,
    successRate: executed > 0 ? Number((succeeded / executed).toFixed(4)) : 0,
    totalTokens,
    totalToolCalls,
    avgLatencyMs,
    p50LatencyMs: percentile(latencies, 50),
    p95LatencyMs: percentile(latencies, 95),
    p99LatencyMs: percentile(latencies, 99),
    avgThroughputTokensPerSec: Number(avgThroughputTokensPerSec.toFixed(3)),
    modeSummary,
    effectiveConfig: {
      defaults: effective.defaults,
      profileSettings: effective.profileSettings,
      gates: effective.gates,
    },
    results,
    scenarioDetails,
    scenarioRuns,
    warnings,
    topFailureReasons,
    capabilityMatrix,
  };
}
2245
+
2246
/**
 * Builds the trace event for one request/response exchange.
 *
 * The event carries both a sanitized form (via `sanitizeForTrace`) and a
 * JSON-safe raw form (via `safeSerialize`) of each payload, plus previews
 * truncated to 220 characters.
 *
 * @param args - Scenario, routing details and the request/response payloads;
 *   `toolTrace` defaults to an empty list.
 */
function buildExchangeEvent(args: {
  scenario: BenchmarkScenario;
  mode: BenchmarkMode;
  model: string;
  requestPath: string;
  requestPayload: unknown;
  responsePayload: unknown;
  statusCode: number;
  contentType: string;
  endpointId?: string;
  endpointName?: string;
  upstreamModel?: string;
  toolTrace?: BenchmarkToolTraceStep[];
}): BenchmarkExchangeEvent {
  const { scenario, requestPayload, responsePayload } = args;
  const previewLimit = 220;

  const requestSanitized = sanitizeForTrace(requestPayload);
  const responseSanitized = sanitizeForTrace(responsePayload);

  return {
    scenarioInput: describeScenarioInput(scenario),
    // Previews are derived from the sanitized forms, never from the raw payloads.
    requestPreview: truncate(previewForTrace(requestSanitized), previewLimit),
    responsePreview: truncate(previewForTrace(responseSanitized), previewLimit),
    mode: args.mode,
    model: args.model,
    requestPath: args.requestPath,
    statusCode: args.statusCode,
    contentType: args.contentType,
    endpointId: args.endpointId,
    endpointName: args.endpointName,
    upstreamModel: args.upstreamModel,
    toolTrace: args.toolTrace ?? [],
    requestRaw: safeSerialize(requestPayload),
    requestSanitized,
    responseRaw: safeSerialize(responsePayload),
    responseSanitized,
  };
}
2281
+
2282
/**
 * Reduces an exchange event to its summary form.
 *
 * The summary gains a `timestamp` taken at call time and omits the event's
 * `scenarioInput`, `requestRaw` and `responseRaw` fields.
 *
 * @param event - Full exchange event.
 */
function toExchangeSummary(event: BenchmarkExchangeEvent): BenchmarkExchangeSummary {
  const {
    mode,
    model,
    requestPath,
    statusCode,
    contentType,
    requestSanitized,
    responseSanitized,
    requestPreview,
    responsePreview,
    endpointId,
    endpointName,
    upstreamModel,
    toolTrace,
  } = event;

  return {
    timestamp: new Date().toISOString(),
    mode,
    model,
    requestPath,
    statusCode,
    contentType,
    requestSanitized,
    responseSanitized,
    requestPreview,
    responsePreview,
    endpointId,
    endpointName,
    upstreamModel,
    toolTrace,
  };
}
2300
+
2301
/**
 * Produces the listing summary for a scenario, filling in display defaults.
 *
 * @param scenario - Scenario to summarize.
 * @param suite - Suite name; when omitted the suite is reported as `"custom"`
 *   and the default example source is `"file"` instead of `"builtin"`.
 */
function scenarioToSummary(
  scenario: BenchmarkScenario,
  suite?: string
): BenchmarkScenarioSummary {
  const fallbackSource = suite ? "builtin" : "file";
  const {
    id,
    mode,
    title = null,
    summary = null,
    userVisibleGoal = null,
    exampleSource = null,
    successCriteria = null,
    expectedHighlights = null,
  } = scenario;

  return {
    id,
    suite: suite ?? "custom",
    mode,
    title: title ?? id,
    summary: summary ?? "Benchmark scenario",
    userVisibleGoal: userVisibleGoal ?? "Inspect the exact request, response, and final verdict.",
    exampleSource: exampleSource ?? fallbackSource,
    inputPreview: describeScenarioInput(scenario),
    successCriteria: successCriteria ?? "All configured assertions pass.",
    expectedHighlights: expectedHighlights ?? [],
    // Only a literal `true` counts; any other value is reported as false.
    requiresAvailableTools: scenario.requiresAvailableTools === true,
    model: scenario.model,
  };
}
2321
+
2322
/**
 * Builds the tool trace for an exchange: all tool calls first, then all tool
 * results, each group in its input order.
 *
 * @param toolCalls - Tool calls; calls without a function name are skipped.
 * @param toolResults - Tool results; content is sanitized and rendered as text.
 */
function buildToolTrace(
  toolCalls: ChatToolCall[],
  toolResults: Array<{ name: string; toolCallId?: string; content: unknown }>
): BenchmarkToolTraceStep[] {
  const callSteps = toolCalls.flatMap((call): BenchmarkToolTraceStep[] => {
    const toolName = call.function?.name;
    if (!toolName) {
      return [];
    }
    return [
      {
        kind: "tool_call",
        toolName,
        toolCallId: call.id,
        argumentsText: call.function?.arguments,
      },
    ];
  });

  const resultSteps = toolResults.map(
    (result): BenchmarkToolTraceStep => ({
      kind: "tool_result",
      toolName: result.name,
      toolCallId: result.toolCallId,
      contentText: previewForTrace(sanitizeForTrace(result.content)),
    })
  );

  return [...callSteps, ...resultSteps];
}
2349
+
2350
/**
 * Produces a JSON-safe deep copy of a value via a stringify/parse round trip.
 *
 * @param value - Any value.
 * @returns The round-tripped copy, or `{ preview: String(value) }` when the
 *   round trip throws (for example on circular structures or `undefined`).
 */
function safeSerialize(value: unknown): unknown {
  let copy: unknown;
  try {
    const encoded = JSON.stringify(value);
    copy = JSON.parse(encoded);
  } catch {
    copy = { preview: String(value) };
  }
  return copy;
}
2357
+
2358
/**
 * Produces a trace-safe copy of a payload: secrets masked, bulky data summarized.
 *
 * Rules, applied recursively:
 * - nesting deeper than 6 levels becomes `"[truncated-depth]"`;
 * - strings are trimmed; base64-looking strings longer than 64 characters and
 *   `data:` URLs longer than 80 characters are replaced by a placeholder;
 *   other strings longer than 500 characters are cut and suffixed with `…`;
 * - arrays longer than 50 items become `{ summary, sample }` with the first 10 items;
 * - object values whose key matches api key / authorization / token / secret
 *   (case-insensitive) become `"***"`;
 * - an `embedding` array becomes `{ summary, sample }` with the first 8 entries.
 *
 * @param value - Payload to sanitize; it is not modified.
 * @param depth - Current recursion depth; callers normally omit it.
 * @returns The sanitized copy.
 */
function sanitizeForTrace(value: unknown, depth = 0): unknown {
  if (depth > 6) {
    return "[truncated-depth]";
  }
  if (value === null || value === undefined) {
    return value;
  }
  if (typeof value === "string") {
    const trimmed = value.trim();
    if (looksLikeBase64(trimmed) && trimmed.length > 64) {
      return `<base64 omitted len=${trimmed.length}>`;
    }
    if (trimmed.startsWith("data:") && trimmed.length > 80) {
      // The media type ends at the first ";" (parameters) or "," (start of data).
      // Looking only for ";" breaks on parameterless URLs such as
      // "data:text/plain,...": indexOf returns -1 and slice(5, -1) would copy
      // nearly the whole payload into the placeholder.
      const headerEnd = trimmed.search(/[;,]/);
      const mime = (headerEnd > 5 ? trimmed.slice(5, headerEnd) : "") || "unknown";
      return `<data-url ${mime} omitted len=${trimmed.length}>`;
    }
    if (trimmed.length > 500) {
      return `${trimmed.slice(0, 500)}…`;
    }
    return trimmed;
  }
  if (typeof value === "number" || typeof value === "boolean") {
    return value;
  }
  if (Array.isArray(value)) {
    if (value.length > 50) {
      return {
        summary: `array(${value.length})`,
        sample: value.slice(0, 10).map((item) => sanitizeForTrace(item, depth + 1)),
      };
    }
    return value.map((item) => sanitizeForTrace(item, depth + 1));
  }
  if (typeof value === "object") {
    const out: Record<string, unknown> = {};
    for (const [key, item] of Object.entries(value as Record<string, unknown>)) {
      // Substring match: "token" also masks keys such as "max_tokens" or "total_tokens".
      if (/(api[-_]?key|authorization|token|secret)/i.test(key)) {
        out[key] = "***";
        continue;
      }
      if (key === "embedding" && Array.isArray(item)) {
        out[key] = {
          summary: `vector(${item.length})`,
          sample: item.slice(0, 8),
        };
        continue;
      }
      out[key] = sanitizeForTrace(item, depth + 1);
    }
    return out;
  }
  // Remaining primitives (bigint, symbol) and functions are stringified.
  return String(value);
}
2411
+
2412
/**
 * Heuristic check for base64 text.
 *
 * @param value - Candidate string.
 * @returns `true` when the string is at least 32 characters long, its length
 *   is a multiple of 4, and it contains only `A-Z`, `a-z`, `0-9`, `+`, `/`, `=`.
 */
function looksLikeBase64(value: string): boolean {
  const longEnough = value.length >= 32;
  const wholeQuads = value.length % 4 === 0;
  return longEnough && wholeQuads && /^[A-Za-z0-9+/=]+$/.test(value);
}
2418
+
2419
/**
 * Renders a value as a single string for trace previews.
 *
 * @param value - Any value; strings are returned unchanged.
 * @returns The JSON text of the value, or `String(value)` when the value has no
 *   JSON representation or serialization throws.
 */
function previewForTrace(value: unknown): string {
  if (typeof value === "string") {
    return value;
  }
  try {
    // JSON.stringify yields undefined (not a string) for undefined, functions
    // and symbols; fall back so callers always receive a real string.
    const serialized: string | undefined = JSON.stringify(value);
    return serialized ?? String(value);
  } catch {
    return String(value);
  }
}
2429
+
2430
/**
 * Forwards a progress event to the run hooks, if a listener is registered.
 *
 * @param hooks - Optional hooks object; may be `undefined` or lack `onEvent`.
 * @param event - Event to deliver.
 */
function emitEvent(hooks: BenchmarkRunHooks | undefined, event: BenchmarkProgressEvent): void {
  if (hooks?.onEvent != null) {
    // Invoked as a method so the listener keeps `hooks` as its receiver.
    hooks.onEvent(event);
  }
}
2433
+
2434
/**
 * Counts scenario outcomes per benchmark mode.
 *
 * Every mode in `BENCHMARK_MODES` gets a row, including modes with no results.
 *
 * @param results - Scenario results to tally.
 * @returns Per-mode totals: total, executed, skipped, passed, failed.
 */
function summarizeByMode(
  results: ScenarioResult[]
): BenchmarkReport["modeSummary"] {
  const emptyRows = BENCHMARK_MODES.map((mode) => [
    mode,
    { total: 0, executed: 0, skipped: 0, passed: 0, failed: 0 },
  ]);
  const summary = Object.fromEntries(emptyRows) as BenchmarkReport["modeSummary"];

  for (const { mode, status } of results) {
    const row = summary[mode];
    row.total += 1;
    switch (status) {
      case "skipped":
        row.skipped += 1;
        break;
      case "passed":
        row.executed += 1;
        row.passed += 1;
        break;
      default:
        // Any executed scenario that did not pass counts as failed.
        row.executed += 1;
        row.failed += 1;
        break;
    }
  }

  return summary;
}
2461
+
2462
/**
 * Finds the five most frequent error messages among the samples.
 *
 * @param samples - Run samples; those without an `error` are ignored.
 * @returns Up to five `{ reason, count }` entries, most frequent first; ties
 *   keep the order in which the reasons were first seen.
 */
function collectTopFailureReasons(samples: ScenarioRunSample[]): Array<{ reason: string; count: number }> {
  const tally = samples.reduce((acc, { error }) => {
    if (error) {
      acc.set(error, (acc.get(error) ?? 0) + 1);
    }
    return acc;
  }, new Map<string, number>());

  const ranked = Array.from(tally, ([reason, count]) => ({ reason, count }));
  ranked.sort((a, b) => b.count - a.count);
  return ranked.slice(0, 5);
}
2476
+
2477
/**
 * Returns a shallow copy of an object without its `undefined`-valued entries.
 *
 * @param source - Object to copy; `null` and other falsy values are kept.
 */
function compactObject(source: Record<string, unknown>): Record<string, unknown> {
  const kept: Array<[string, unknown]> = [];
  for (const entry of Object.entries(source)) {
    if (entry[1] !== undefined) {
      kept.push(entry);
    }
  }
  return Object.fromEntries(kept);
}
2482
+
2483
/**
 * Nearest-rank percentile of an ascending-sorted list.
 *
 * @param sorted - Values sorted in ascending order.
 * @param p - Percentile; values at or below 0 select the smallest element and
 *   values above 100 select the largest.
 * @returns The selected element, or 0 when the list is empty or no element can
 *   be selected (for example when `p` is `NaN`).
 */
function percentile(sorted: number[], p: number): number {
  if (sorted.length === 0) {
    return 0;
  }
  const rank = Math.ceil((p / 100) * sorted.length) - 1;
  // Clamp at both ends: an unclamped p > 100 would index past the array and
  // return undefined despite the declared number return type.
  const idx = Math.min(sorted.length - 1, Math.max(0, rank));
  return sorted[idx] ?? 0;
}
2490
+
2491
/**
 * Shortens text to at most `maxLength` UTF-16 code units, ending with `…`
 * when anything was cut.
 *
 * @param text - Text to shorten.
 * @param maxLength - Maximum length of the result, ellipsis included.
 * @returns `text` unchanged when it already fits; `""` when `maxLength` is not
 *   positive; otherwise a prefix of `text` followed by `…`.
 */
function truncate(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }
  if (maxLength <= 0) {
    // A negative slice end would count from the end of the string and produce
    // output longer than the limit.
    return "";
  }
  let head = text.slice(0, maxLength - 1);
  // Do not leave a lone high surrogate when the cut lands inside a surrogate pair.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    head = head.slice(0, -1);
  }
  return `${head}…`;
}
2497
+
2498
/**
 * Converts a token count and a latency into tokens per second.
 *
 * @param tokens - Number of tokens.
 * @param latencyMs - Elapsed time in milliseconds.
 * @returns `tokens * 1000 / latencyMs`, or 0 when either input is zero or negative.
 */
function calculateThroughput(tokens: number, latencyMs: number): number {
  const unusable = tokens <= 0 || latencyMs <= 0;
  return unusable ? 0 : (tokens * 1000) / latencyMs;
}
2504
+
2505
/**
 * Races a promise against a timer.
 *
 * The timer is always cleared once the race settles. The wrapped promise is
 * not cancelled when the timer wins.
 *
 * @param promise - Work to wait for.
 * @param timeoutMs - Time budget in milliseconds.
 * @param message - Message of the `Error` used to reject on timeout.
 * @returns The value of `promise` when it settles first.
 * @throws Error with `message` when the timer fires first; otherwise any
 *   rejection of `promise` propagates unchanged.
 */
async function withTimeout<T>(promise: Promise<T>, timeoutMs: number, message: string): Promise<T> {
  let timer: NodeJS.Timeout | null = null;
  const deadline = new Promise<T>((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(message));
    }, timeoutMs);
  });

  try {
    return await Promise.race([promise, deadline]);
  } finally {
    if (timer) {
      clearTimeout(timer);
    }
  }
}