waypoi 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260) hide show
  1. package/.github/instructions/ui.instructions.md +42 -0
  2. package/.github/workflows/ci.yml +35 -0
  3. package/.github/workflows/publish.yml +71 -0
  4. package/.github/workflows/release.yml +48 -0
  5. package/.playwright-mcp/console-2026-04-04T01-41-10-746Z.log +2 -0
  6. package/.playwright-mcp/console-2026-04-04T01-41-28-799Z.log +3 -0
  7. package/.playwright-mcp/console-2026-04-05T02-26-51-909Z.log +76 -0
  8. package/.playwright-mcp/page-2026-04-04T01-41-10-816Z.yml +1 -0
  9. package/.playwright-mcp/page-2026-04-04T01-41-29-141Z.yml +77 -0
  10. package/.playwright-mcp/page-2026-04-04T01-41-42-633Z.yml +190 -0
  11. package/.playwright-mcp/page-2026-04-04T01-42-03-929Z.yml +262 -0
  12. package/.playwright-mcp/page-2026-04-04T02-12-54-813Z.yml +6 -0
  13. package/.playwright-mcp/page-2026-04-04T02-14-58-600Z.yml +190 -0
  14. package/.playwright-mcp/page-2026-04-04T02-15-03-923Z.yml +190 -0
  15. package/.playwright-mcp/page-2026-04-04T02-15-07-426Z.yml +190 -0
  16. package/.playwright-mcp/page-2026-04-04T02-15-25-729Z.yml +262 -0
  17. package/.playwright-mcp/page-2026-04-04T02-16-22-984Z.yml +262 -0
  18. package/.playwright-mcp/page-2026-04-04T02-17-00-599Z.yml +190 -0
  19. package/.playwright-mcp/page-2026-04-04T02-17-50-874Z.yml +190 -0
  20. package/.playwright-mcp/page-2026-04-05T02-26-55-570Z.yml +6 -0
  21. package/AGENTS.md +48 -0
  22. package/CHANGELOG.md +131 -0
  23. package/README.md +552 -0
  24. package/assets/agent-mode.png +0 -0
  25. package/assets/categorize.png +0 -0
  26. package/assets/dashboard.png +0 -0
  27. package/assets/endpoint-proxy.png +0 -0
  28. package/assets/icon.png +0 -0
  29. package/assets/mcp-generate-image.png +0 -0
  30. package/assets/mcp-understand-image.png +0 -0
  31. package/assets/peek-token-flow.png +0 -0
  32. package/assets/playground.png +0 -0
  33. package/assets/sankey.png +0 -0
  34. package/cli/index.ts +2805 -0
  35. package/cli/legacyRewrite.ts +108 -0
  36. package/cli/modelRef.ts +24 -0
  37. package/dist/cli/index.js +2536 -0
  38. package/dist/cli/legacyRewrite.js +92 -0
  39. package/dist/cli/modelRef.js +20 -0
  40. package/dist/src/benchmark/artifacts.js +131 -0
  41. package/dist/src/benchmark/capabilityClassifier.js +81 -0
  42. package/dist/src/benchmark/capabilityStore.js +144 -0
  43. package/dist/src/benchmark/config.js +238 -0
  44. package/dist/src/benchmark/gates.js +118 -0
  45. package/dist/src/benchmark/jobs.js +252 -0
  46. package/dist/src/benchmark/runner.js +1847 -0
  47. package/dist/src/benchmark/schema.js +353 -0
  48. package/dist/src/benchmark/suites.js +314 -0
  49. package/dist/src/benchmark/tinyQaDataset.js +422 -0
  50. package/dist/src/benchmark/types.js +25 -0
  51. package/dist/src/config.js +47 -0
  52. package/dist/src/index.js +178 -0
  53. package/dist/src/mcp/client.js +215 -0
  54. package/dist/src/mcp/discovery.js +226 -0
  55. package/dist/src/mcp/policy.js +65 -0
  56. package/dist/src/mcp/registry.js +129 -0
  57. package/dist/src/mcp/service.js +460 -0
  58. package/dist/src/middleware/auth.js +179 -0
  59. package/dist/src/middleware/requestCapture.js +192 -0
  60. package/dist/src/middleware/requestStats.js +118 -0
  61. package/dist/src/pools/builder.js +132 -0
  62. package/dist/src/pools/repository.js +69 -0
  63. package/dist/src/pools/scheduler.js +360 -0
  64. package/dist/src/pools/types.js +2 -0
  65. package/dist/src/protocols/adapters/dashscope.js +267 -0
  66. package/dist/src/protocols/adapters/inferenceV2.js +346 -0
  67. package/dist/src/protocols/adapters/openai.js +27 -0
  68. package/dist/src/protocols/registry.js +99 -0
  69. package/dist/src/protocols/types.js +2 -0
  70. package/dist/src/providers/health.js +153 -0
  71. package/dist/src/providers/importer.js +289 -0
  72. package/dist/src/providers/modelRegistry.js +313 -0
  73. package/dist/src/providers/repository.js +361 -0
  74. package/dist/src/providers/types.js +2 -0
  75. package/dist/src/routes/admin.js +531 -0
  76. package/dist/src/routes/audio.js +295 -0
  77. package/dist/src/routes/chat.js +240 -0
  78. package/dist/src/routes/embeddings.js +157 -0
  79. package/dist/src/routes/images.js +288 -0
  80. package/dist/src/routes/mcp.js +256 -0
  81. package/dist/src/routes/mcpService.js +100 -0
  82. package/dist/src/routes/models.js +48 -0
  83. package/dist/src/routes/responses.js +711 -0
  84. package/dist/src/routes/sessions.js +450 -0
  85. package/dist/src/routes/stats.js +270 -0
  86. package/dist/src/routes/ui.js +97 -0
  87. package/dist/src/routes/videos.js +107 -0
  88. package/dist/src/routing/router.js +338 -0
  89. package/dist/src/services/imageGeneration.js +280 -0
  90. package/dist/src/services/imageUnderstanding.js +352 -0
  91. package/dist/src/services/videoGeneration.js +79 -0
  92. package/dist/src/storage/captureRepository.js +1591 -0
  93. package/dist/src/storage/files.js +157 -0
  94. package/dist/src/storage/imageCache.js +346 -0
  95. package/dist/src/storage/repositories.js +388 -0
  96. package/dist/src/storage/sessionRepository.js +370 -0
  97. package/dist/src/storage/statsRepository.js +204 -0
  98. package/dist/src/transport/httpClient.js +126 -0
  99. package/dist/src/types.js +2 -0
  100. package/dist/src/utils/messageMedia.js +285 -0
  101. package/dist/src/utils/modelCapabilities.js +108 -0
  102. package/dist/src/utils/modelDiscovery.js +170 -0
  103. package/dist/src/version.js +5 -0
  104. package/dist/src/workers/captureRetention.js +25 -0
  105. package/dist/src/workers/configWatcher.js +91 -0
  106. package/dist/src/workers/healthChecker.js +21 -0
  107. package/dist/src/workers/statsRotation.js +41 -0
  108. package/docs/LLM/output_schema.md +312 -0
  109. package/docs/benchmark.md +208 -0
  110. package/docs/mcp-guidelines.md +125 -0
  111. package/docs/mcp-service.md +178 -0
  112. package/docs/opencode.md +86 -0
  113. package/docs/providers.md +79 -0
  114. package/examples/benchmark.config.yaml +28 -0
  115. package/examples/providers/alibaba-dashscope.yaml +88 -0
  116. package/examples/providers/alibaba-llm.yaml +64 -0
  117. package/examples/providers/alibaba-registry.yaml +7 -0
  118. package/examples/providers/inference-v2-ray.yaml +29 -0
  119. package/examples/scenarios/assets/omni-call-sample.wav +0 -0
  120. package/examples/scenarios/custom.jsonl +5 -0
  121. package/examples/scenarios/custom.yaml +40 -0
  122. package/model-form-v2.png +0 -0
  123. package/package.json +66 -0
  124. package/provider-form-v2.png +0 -0
  125. package/provider-form.png +0 -0
  126. package/scripts/manual-test.sh +11 -0
  127. package/scripts/version-from-git.js +23 -0
  128. package/src/benchmark/artifacts.ts +149 -0
  129. package/src/benchmark/capabilityClassifier.ts +99 -0
  130. package/src/benchmark/capabilityStore.ts +174 -0
  131. package/src/benchmark/config.ts +337 -0
  132. package/src/benchmark/gates.ts +164 -0
  133. package/src/benchmark/jobs.ts +312 -0
  134. package/src/benchmark/runner.ts +2519 -0
  135. package/src/benchmark/schema.ts +443 -0
  136. package/src/benchmark/suites.ts +323 -0
  137. package/src/benchmark/tinyQaDataset.ts +428 -0
  138. package/src/benchmark/types.ts +442 -0
  139. package/src/config.ts +44 -0
  140. package/src/index.ts +195 -0
  141. package/src/mcp/client.ts +305 -0
  142. package/src/mcp/discovery.ts +266 -0
  143. package/src/mcp/policy.ts +105 -0
  144. package/src/mcp/registry.ts +164 -0
  145. package/src/mcp/service.ts +611 -0
  146. package/src/middleware/auth.ts +251 -0
  147. package/src/middleware/requestCapture.ts +245 -0
  148. package/src/middleware/requestStats.ts +163 -0
  149. package/src/pools/builder.ts +159 -0
  150. package/src/pools/repository.ts +71 -0
  151. package/src/pools/scheduler.ts +425 -0
  152. package/src/pools/types.ts +117 -0
  153. package/src/protocols/adapters/dashscope.ts +335 -0
  154. package/src/protocols/adapters/inferenceV2.ts +428 -0
  155. package/src/protocols/adapters/openai.ts +32 -0
  156. package/src/protocols/registry.ts +117 -0
  157. package/src/protocols/types.ts +81 -0
  158. package/src/providers/health.ts +207 -0
  159. package/src/providers/importer.ts +402 -0
  160. package/src/providers/modelRegistry.ts +415 -0
  161. package/src/providers/repository.ts +439 -0
  162. package/src/providers/types.ts +113 -0
  163. package/src/routes/admin.ts +666 -0
  164. package/src/routes/audio.ts +372 -0
  165. package/src/routes/chat.ts +301 -0
  166. package/src/routes/embeddings.ts +197 -0
  167. package/src/routes/images.ts +356 -0
  168. package/src/routes/mcp.ts +320 -0
  169. package/src/routes/mcpService.ts +114 -0
  170. package/src/routes/models.ts +50 -0
  171. package/src/routes/responses.ts +872 -0
  172. package/src/routes/sessions.ts +558 -0
  173. package/src/routes/stats.ts +312 -0
  174. package/src/routes/ui.ts +96 -0
  175. package/src/routes/videos.ts +132 -0
  176. package/src/routing/router.ts +501 -0
  177. package/src/services/imageGeneration.ts +396 -0
  178. package/src/services/imageUnderstanding.ts +449 -0
  179. package/src/services/videoGeneration.ts +127 -0
  180. package/src/storage/captureRepository.ts +1835 -0
  181. package/src/storage/files.ts +178 -0
  182. package/src/storage/imageCache.ts +405 -0
  183. package/src/storage/repositories.ts +494 -0
  184. package/src/storage/sessionRepository.ts +419 -0
  185. package/src/storage/statsRepository.ts +238 -0
  186. package/src/transport/httpClient.ts +145 -0
  187. package/src/types.ts +322 -0
  188. package/src/utils/messageMedia.ts +293 -0
  189. package/src/utils/modelCapabilities.ts +161 -0
  190. package/src/utils/modelDiscovery.ts +203 -0
  191. package/src/workers/captureRetention.ts +25 -0
  192. package/src/workers/configWatcher.ts +115 -0
  193. package/src/workers/healthChecker.ts +22 -0
  194. package/src/workers/statsRotation.ts +49 -0
  195. package/tests/benchmarkAdminRoutes.test.ts +82 -0
  196. package/tests/benchmarkBasics.test.ts +116 -0
  197. package/tests/captureAdminRoutes.test.ts +420 -0
  198. package/tests/captureRepository.test.ts +797 -0
  199. package/tests/cliLegacyRewrite.test.ts +45 -0
  200. package/tests/imageGeneration.service.test.ts +107 -0
  201. package/tests/imageUnderstanding.service.test.ts +123 -0
  202. package/tests/mcpPolicy.test.ts +105 -0
  203. package/tests/mcpService.test.ts +1245 -0
  204. package/tests/modelRef.test.ts +23 -0
  205. package/tests/modelsRoutes.test.ts +154 -0
  206. package/tests/sessionMediaCache.test.ts +167 -0
  207. package/tests/statsRoutes.test.ts +323 -0
  208. package/tsconfig.json +15 -0
  209. package/ui/index.html +16 -0
  210. package/ui/package-lock.json +8521 -0
  211. package/ui/package.json +52 -0
  212. package/ui/postcss.config.js +6 -0
  213. package/ui/public/assets/apple-touch-icon.png +0 -0
  214. package/ui/public/assets/favicon-16.png +0 -0
  215. package/ui/public/assets/favicon-32.png +0 -0
  216. package/ui/public/assets/icon-192.png +0 -0
  217. package/ui/public/assets/icon-512.png +0 -0
  218. package/ui/src/App.tsx +27 -0
  219. package/ui/src/api/client.ts +1503 -0
  220. package/ui/src/components/EndpointUsageGuide.tsx +361 -0
  221. package/ui/src/components/Layout.tsx +124 -0
  222. package/ui/src/components/MessageContent.tsx +365 -0
  223. package/ui/src/components/ToolCallMessage.tsx +179 -0
  224. package/ui/src/components/ToolPicker.tsx +442 -0
  225. package/ui/src/components/messageContentParser.test.ts +41 -0
  226. package/ui/src/components/messageContentParser.ts +73 -0
  227. package/ui/src/components/thinkingPreview.test.ts +27 -0
  228. package/ui/src/components/thinkingPreview.ts +15 -0
  229. package/ui/src/components/toMermaidSankey.test.ts +78 -0
  230. package/ui/src/components/toMermaidSankey.ts +56 -0
  231. package/ui/src/components/ui/button.tsx +58 -0
  232. package/ui/src/components/ui/input.tsx +21 -0
  233. package/ui/src/components/ui/textarea.tsx +21 -0
  234. package/ui/src/lib/utils.ts +6 -0
  235. package/ui/src/main.tsx +9 -0
  236. package/ui/src/pages/AgentPlayground.tsx +2010 -0
  237. package/ui/src/pages/Benchmark.tsx +988 -0
  238. package/ui/src/pages/Dashboard.tsx +581 -0
  239. package/ui/src/pages/Peek.tsx +962 -0
  240. package/ui/src/pages/Settings.tsx +2013 -0
  241. package/ui/src/pages/agentPlaygroundPayload.test.ts +109 -0
  242. package/ui/src/pages/agentPlaygroundPayload.ts +97 -0
  243. package/ui/src/pages/agentThinkingContent.test.ts +50 -0
  244. package/ui/src/pages/agentThinkingContent.ts +57 -0
  245. package/ui/src/pages/dashboardTokenUsage.test.ts +66 -0
  246. package/ui/src/pages/dashboardTokenUsage.ts +36 -0
  247. package/ui/src/pages/imageUpload.test.ts +39 -0
  248. package/ui/src/pages/imageUpload.ts +71 -0
  249. package/ui/src/pages/peekFilters.test.ts +29 -0
  250. package/ui/src/pages/peekFilters.ts +13 -0
  251. package/ui/src/pages/peekMedia.test.ts +58 -0
  252. package/ui/src/pages/peekMedia.ts +148 -0
  253. package/ui/src/pages/sessionAutoTitle.test.ts +128 -0
  254. package/ui/src/pages/sessionAutoTitle.ts +106 -0
  255. package/ui/src/stores/settings.ts +58 -0
  256. package/ui/src/styles/globals.css +223 -0
  257. package/ui/src/vite-env.d.ts +8 -0
  258. package/ui/tailwind.config.js +106 -0
  259. package/ui/tsconfig.json +32 -0
  260. package/ui/vite.config.ts +37 -0
@@ -0,0 +1,337 @@
1
+ import { promises as fs } from "fs";
2
+ import path from "path";
3
+ import YAML from "yaml";
4
+ import { StoragePaths } from "../storage/files";
5
+ import {
6
+ BenchmarkCliOptions,
7
+ BenchmarkConfigFile,
8
+ BenchmarkDefaults,
9
+ BenchmarkExecutionMode,
10
+ BenchmarkGateConfig,
11
+ BenchmarkProfileSettings,
12
+ EffectiveBenchmarkConfig,
13
+ } from "./types";
14
+
15
/** Value used for the effective config `version` when the config file does not declare one. */
const DEFAULT_VERSION = 1;

/** Value used for `run.capTtlDays` when neither the CLI nor the config file sets it. */
const DEFAULT_CAP_TTL_DAYS = 7;

/** Built-in request defaults; keys under the config file's `defaults` section override them. */
const DEFAULTS: BenchmarkDefaults = {
  requestTimeoutMs: 120_000,
  toolTimeoutMs: 15_000,
  maxIterations: 6,
  temperature: 0,
  top_p: 1,
  max_tokens: 512,
  presence_penalty: 0,
  frequency_penalty: 0,
};

/** Built-in profiles; the config file may patch these or add new named profiles. */
const DEFAULT_PROFILES: Record<string, BenchmarkProfileSettings> = {
  local: { warmupRuns: 1, measuredRuns: 3, minScenarioPassRate: 1.0 },
  ci: { warmupRuns: 2, measuredRuns: 5, minScenarioPassRate: 1.0 },
};

/** Built-in gate thresholds; the config file's `gates.hard` / `gates.soft` sections override them. */
const DEFAULT_GATES: BenchmarkGateConfig = {
  hard: { smokeMinSuccessRate: 1.0 },
  soft: { maxP95RegressionPct: 20, maxThroughputDropPct: 20 },
};
51
+
52
/**
 * Builds the effective benchmark configuration.
 *
 * Values are layered in increasing priority: built-in defaults, the config
 * file (when one is found), and finally the CLI options. Every numeric and
 * stop-sequence field is validated before the result is returned.
 *
 * @param paths Storage paths; `baseDir` is where the default config file is looked up.
 * @param cli Options supplied on the command line.
 * @returns The merged and validated configuration, including the path of the
 *   config file that was used (`configSource`), if any.
 * @throws Error if the selected profile does not exist, if an explicitly
 *   requested config file is missing or cannot be parsed, or if any field
 *   fails validation.
 */
export async function resolveBenchmarkConfig(
  paths: StoragePaths,
  cli: BenchmarkCliOptions
): Promise<EffectiveBenchmarkConfig> {
  const { fileConfig, configSource } = await loadConfigFile(paths, cli.configPath);

  const mergedDefaults: BenchmarkDefaults = {
    ...DEFAULTS,
    ...(fileConfig?.defaults ?? {}),
  };

  // Profiles are keyed by user-supplied names. A Map keeps names such as
  // "toString" or "__proto__" from resolving to inherited Object members,
  // which would otherwise bypass the "unknown profile" check below.
  const mergedProfiles = new Map<string, BenchmarkProfileSettings>(
    Object.entries(DEFAULT_PROFILES)
  );
  for (const [profileName, profilePatch] of Object.entries(fileConfig?.profiles ?? {})) {
    mergedProfiles.set(profileName, {
      // New profiles start from the built-in "local" profile.
      ...(mergedProfiles.get(profileName) ?? DEFAULT_PROFILES.local),
      ...profilePatch,
    });
  }

  const selectedProfile =
    cli.profile ?? fileConfig?.run?.profile ?? "local";
  const profileSettings = mergedProfiles.get(selectedProfile);
  if (!profileSettings) {
    const names = Array.from(mergedProfiles.keys()).sort().join(", ");
    throw new Error(
      `Unknown benchmark profile '${selectedProfile}'. Available profiles: ${names}`
    );
  }

  const mergedGates: BenchmarkGateConfig = {
    hard: {
      ...DEFAULT_GATES.hard,
      ...(fileConfig?.gates?.hard ?? {}),
    },
    soft: {
      ...DEFAULT_GATES.soft,
      ...(fileConfig?.gates?.soft ?? {}),
    },
  };

  const resolved: EffectiveBenchmarkConfig = {
    version: fileConfig?.version ?? DEFAULT_VERSION,
    profile: selectedProfile,
    defaults: validateDefaults(mergedDefaults),
    profileSettings: validateProfileSettings(profileSettings, selectedProfile),
    gates: validateGates(mergedGates),
    run: {
      suite: cli.suite ?? fileConfig?.run?.suite ?? "showcase",
      exampleId: cli.exampleId ?? fileConfig?.run?.exampleId,
      scenarioPath: cli.scenarioPath ?? fileConfig?.run?.scenarioPath,
      modelOverride: cli.modelOverride ?? fileConfig?.run?.model,
      outPath: cli.outPath ?? fileConfig?.run?.outPath,
      baselinePath: cli.baselinePath ?? fileConfig?.run?.baselinePath,
      executionMode: resolveExecutionMode(cli, fileConfig),
      listExamples: cli.listExamples ?? fileConfig?.run?.listExamples ?? false,
      updateCapCache: cli.updateCapCache ?? fileConfig?.run?.updateCapCache ?? false,
      capTtlDays: intField(
        cli.capTtlDays ?? fileConfig?.run?.capTtlDays ?? DEFAULT_CAP_TTL_DAYS,
        "run.capTtlDays",
        1
      ),
      // Run-level sampling overrides are optional: they stay undefined unless
      // the CLI or the config file provides them.
      temperature: optionalNumberField(
        cli.temperature ?? fileConfig?.run?.temperature,
        "run.temperature"
      ),
      top_p: optionalBoundedField(
        cli.top_p ?? fileConfig?.run?.top_p,
        "run.top_p",
        0,
        1
      ),
      max_tokens: optionalIntField(
        cli.max_tokens ?? fileConfig?.run?.max_tokens,
        "run.max_tokens",
        1
      ),
      presence_penalty: optionalBoundedField(
        cli.presence_penalty ?? fileConfig?.run?.presence_penalty,
        "run.presence_penalty",
        -2,
        2
      ),
      frequency_penalty: optionalBoundedField(
        cli.frequency_penalty ?? fileConfig?.run?.frequency_penalty,
        "run.frequency_penalty",
        -2,
        2
      ),
      seed: optionalIntField(cli.seed ?? fileConfig?.run?.seed, "run.seed", 0),
      stop: optionalStopField(cli.stop ?? fileConfig?.run?.stop, "run.stop"),
    },
    configSource,
  };

  return resolved;
}
150
+
151
/**
 * Reads and parses the benchmark config file.
 *
 * When `explicitPath` is given it is resolved to an absolute path and must
 * exist. Otherwise `benchmark.config.yaml` inside `paths.baseDir` is tried,
 * and its absence is not an error.
 *
 * @param paths Storage paths providing `baseDir` for the default location.
 * @param explicitPath Config path requested by the caller, if any.
 * @returns The parsed config together with the path it came from, or an empty
 *   object when the default file does not exist.
 * @throws Error if an explicitly requested file is missing; any other read or
 *   parse failure is propagated.
 */
async function loadConfigFile(
  paths: StoragePaths,
  explicitPath?: string
): Promise<{ fileConfig?: BenchmarkConfigFile; configSource?: string }> {
  let candidatePath: string;
  if (explicitPath) {
    candidatePath = path.resolve(explicitPath);
  } else {
    candidatePath = path.join(paths.baseDir, "benchmark.config.yaml");
  }

  try {
    const contents = await fs.readFile(candidatePath, "utf8");
    return {
      fileConfig: parseConfigDocument(candidatePath, contents),
      configSource: candidatePath,
    };
  } catch (error) {
    const isMissingFile = (error as NodeJS.ErrnoException).code === "ENOENT";
    if (!isMissingFile) {
      throw error;
    }
    if (explicitPath) {
      throw new Error(`Benchmark config not found: ${candidatePath}`);
    }
    // The default config file is optional.
    return {};
  }
}
174
+
175
/**
 * Parses the raw text of a benchmark config file.
 *
 * Files whose extension is `.json` (case-insensitive) are parsed as JSON;
 * every other extension is parsed as YAML.
 *
 * @param filePath Path the text was read from; selects the parser and is
 *   quoted in error messages.
 * @param raw File contents.
 * @returns The parsed document. A document that parses to `null`/`undefined`
 *   (for example an empty YAML file) yields an empty config object.
 * @throws Error if the text cannot be parsed, or if the top level of the
 *   document is not a mapping (for example a bare string, number or list).
 */
function parseConfigDocument(filePath: string, raw: string): BenchmarkConfigFile {
  const ext = path.extname(filePath).toLowerCase();

  let parsed: unknown;
  try {
    parsed = ext === ".json" ? JSON.parse(raw) : YAML.parse(raw);
  } catch (error) {
    throw new Error(
      `Failed to parse benchmark config ${filePath}: ${(error as Error).message}`
    );
  }

  // An empty document carries no overrides.
  if (parsed === null || parsed === undefined) {
    return {} as BenchmarkConfigFile;
  }

  // Scalars and lists would otherwise be cast to the config type and then
  // silently ignored, hiding a malformed file from the user.
  if (typeof parsed !== "object" || Array.isArray(parsed)) {
    throw new Error(
      `Benchmark config ${filePath} must contain a mapping at the top level`
    );
  }

  return parsed as BenchmarkConfigFile;
}
188
+
189
/**
 * Validates the merged request defaults and returns a copy that contains only
 * the known fields.
 *
 * @param defaults Defaults after the config file has been layered over the built-ins.
 * @returns A new object with every field validated.
 * @throws Error naming the first field (as `defaults.<name>`) that is out of range.
 */
function validateDefaults(defaults: BenchmarkDefaults): BenchmarkDefaults {
  // Fields are checked in declaration order so the first invalid one is reported.
  const requestTimeoutMs = intField(defaults.requestTimeoutMs, "defaults.requestTimeoutMs", 1);
  const toolTimeoutMs = intField(defaults.toolTimeoutMs, "defaults.toolTimeoutMs", 1);
  const maxIterations = intField(defaults.maxIterations, "defaults.maxIterations", 1);
  const temperature = numberField(defaults.temperature, "defaults.temperature");
  const top_p = boundedField(defaults.top_p, "defaults.top_p", 0, 1);
  const max_tokens = intField(defaults.max_tokens, "defaults.max_tokens", 1);
  const presence_penalty = boundedField(
    defaults.presence_penalty,
    "defaults.presence_penalty",
    -2,
    2
  );
  const frequency_penalty = boundedField(
    defaults.frequency_penalty,
    "defaults.frequency_penalty",
    -2,
    2
  );
  const seed = optionalIntField(defaults.seed, "defaults.seed", 0);
  const stop = optionalStopField(defaults.stop, "defaults.stop");

  return {
    requestTimeoutMs,
    toolTimeoutMs,
    maxIterations,
    temperature,
    top_p,
    max_tokens,
    presence_penalty,
    frequency_penalty,
    seed,
    stop,
  };
}
213
+
214
/**
 * Determines the execution mode for a run.
 *
 * An explicit `"showcase"` or `"diagnostic"` mode (CLI first, then config
 * file) wins. Any other value is ignored and the mode is derived from the
 * suite: the `"showcase"` suite runs in showcase mode, every other suite in
 * diagnostic mode. The suite itself defaults to `"showcase"`.
 *
 * @param cli Options supplied on the command line.
 * @param fileConfig Parsed config file, if one was loaded.
 * @returns The execution mode to use.
 */
function resolveExecutionMode(
  cli: BenchmarkCliOptions,
  fileConfig?: BenchmarkConfigFile
): BenchmarkExecutionMode {
  const requested = cli.executionMode ?? fileConfig?.run?.executionMode;
  switch (requested) {
    case "showcase":
    case "diagnostic":
      return requested;
  }

  const suiteName = cli.suite ?? fileConfig?.run?.suite ?? "showcase";
  if (suiteName === "showcase") {
    return "showcase";
  }
  return "diagnostic";
}
225
+
226
/**
 * Validates the settings of the selected profile.
 *
 * @param profile Merged settings of the profile.
 * @param profileName Profile name, used to build field names in error messages.
 * @returns A new object holding only the validated known fields.
 * @throws Error if `warmupRuns` is not an integer >= 0, `measuredRuns` is not
 *   an integer >= 1, or `minScenarioPassRate` is outside 0..1.
 */
function validateProfileSettings(
  profile: BenchmarkProfileSettings,
  profileName: string
): BenchmarkProfileSettings {
  const prefix = `profiles.${profileName}`;

  const warmupRuns = intField(profile.warmupRuns, `${prefix}.warmupRuns`, 0);
  const measuredRuns = intField(profile.measuredRuns, `${prefix}.measuredRuns`, 1);
  const minScenarioPassRate = boundedField(
    profile.minScenarioPassRate,
    `${prefix}.minScenarioPassRate`,
    0,
    1
  );

  return { warmupRuns, measuredRuns, minScenarioPassRate };
}
241
+
242
/**
 * Validates the merged gate thresholds.
 *
 * @param gates Gate configuration after the config file has been layered over the built-ins.
 * @returns A new object holding only the validated known thresholds.
 * @throws Error if `hard.smokeMinSuccessRate` is outside 0..1 or a soft
 *   percentage threshold is not a finite number >= 0.
 */
function validateGates(gates: BenchmarkGateConfig): BenchmarkGateConfig {
  const { hard, soft } = gates;

  const smokeMinSuccessRate = boundedField(
    hard.smokeMinSuccessRate,
    "gates.hard.smokeMinSuccessRate",
    0,
    1
  );
  const maxP95RegressionPct = numberField(
    soft.maxP95RegressionPct,
    "gates.soft.maxP95RegressionPct",
    0
  );
  const maxThroughputDropPct = numberField(
    soft.maxThroughputDropPct,
    "gates.soft.maxThroughputDropPct",
    0
  );

  return {
    hard: { smokeMinSuccessRate },
    soft: { maxP95RegressionPct, maxThroughputDropPct },
  };
}
266
+
267
/**
 * Ensures `value` is an integer that is not below `min`.
 *
 * @param value Value to check.
 * @param field Field name used in the error message.
 * @param min Smallest accepted value (inclusive).
 * @returns `value` unchanged.
 * @throws Error if `value` is not an integer or is less than `min`.
 */
function intField(value: number, field: string, min: number): number {
  const isInteger = Number.isInteger(value);
  const belowMin = value < min;
  if (isInteger && !belowMin) {
    return value;
  }
  throw new Error(`${field} must be an integer >= ${min}`);
}
273
+
274
/**
 * Ensures `value` is a finite number and, when `min` is given, not below it.
 *
 * @param value Value to check.
 * @param field Field name used in the error message.
 * @param min Optional smallest accepted value (inclusive).
 * @returns `value` unchanged.
 * @throws Error if `value` is not finite, or is less than `min`.
 */
function numberField(value: number, field: string, min?: number): number {
  let problem: string | undefined;
  if (!Number.isFinite(value)) {
    problem = `${field} must be a finite number`;
  } else if (typeof min === "number" && value < min) {
    problem = `${field} must be >= ${min}`;
  }

  if (problem !== undefined) {
    throw new Error(problem);
  }
  return value;
}
283
+
284
/**
 * Ensures `value` is a finite number within the inclusive range `min`..`max`.
 *
 * @param value Value to check.
 * @param field Field name used in the error message.
 * @param min Lower bound (inclusive).
 * @param max Upper bound (inclusive).
 * @returns `value` unchanged.
 * @throws Error if `value` is not finite or lies outside the range.
 */
function boundedField(value: number, field: string, min: number, max: number): number {
  const withinRange = Number.isFinite(value) && !(value < min) && !(value > max);
  if (withinRange) {
    return value;
  }
  throw new Error(`${field} must be between ${min} and ${max}`);
}
290
+
291
/**
 * Optional variant of `intField`: `undefined` passes through untouched, any
 * other value must be an integer >= `min`.
 *
 * @param value Value to check, or `undefined` when the field is unset.
 * @param field Field name used in the error message.
 * @param min Smallest accepted value (inclusive).
 * @returns `undefined` or the validated value.
 * @throws Error if a defined value fails the integer/minimum check.
 */
function optionalIntField(value: number | undefined, field: string, min: number): number | undefined {
  return value === undefined ? undefined : intField(value, field, min);
}
295
+
296
/**
 * Optional variant of `numberField`: `undefined` passes through untouched,
 * any other value must be finite and, when `min` is given, not below it.
 *
 * @param value Value to check, or `undefined` when the field is unset.
 * @param field Field name used in the error message.
 * @param min Optional smallest accepted value (inclusive).
 * @returns `undefined` or the validated value.
 * @throws Error if a defined value fails validation.
 */
function optionalNumberField(value: number | undefined, field: string, min?: number): number | undefined {
  return value === undefined ? undefined : numberField(value, field, min);
}
300
+
301
/**
 * Optional variant of `boundedField`: `undefined` passes through untouched,
 * any other value must lie within the inclusive range `min`..`max`.
 *
 * @param value Value to check, or `undefined` when the field is unset.
 * @param field Field name used in the error message.
 * @param min Lower bound (inclusive).
 * @param max Upper bound (inclusive).
 * @returns `undefined` or the validated value.
 * @throws Error if a defined value is not finite or lies outside the range.
 */
function optionalBoundedField(
  value: number | undefined,
  field: string,
  min: number,
  max: number
): number | undefined {
  return value === undefined ? undefined : boundedField(value, field, min, max);
}
310
+
311
/**
 * Validates an optional stop-sequence setting.
 *
 * Stop sequences are kept exactly as given. Whitespace is significant in a
 * stop sequence (for example `"\n"` or `"\n\n"`), so values are not trimmed
 * and only zero-length strings are rejected.
 *
 * @param value A single stop sequence, a list of stop sequences, or
 *   `undefined` when the field is unset.
 * @param field Field name used in error messages.
 * @returns `undefined`, the string unchanged, or a new array with the same entries.
 * @throws Error if a sequence is an empty string, the list is empty, a list
 *   entry is not a string, or the value is neither a string nor an array.
 */
function optionalStopField(value: string | string[] | undefined, field: string): string | string[] | undefined {
  if (value === undefined) return undefined;

  if (typeof value === "string") {
    if (value.length === 0) {
      throw new Error(`${field} must not be empty`);
    }
    return value;
  }

  if (Array.isArray(value)) {
    if (value.length === 0) {
      throw new Error(`${field} must include at least one stop sequence`);
    }
    // Entries are re-checked at runtime because the value may come from an
    // untyped config document.
    return value.map((item, index) => {
      if (typeof item !== "string") {
        throw new Error(`${field}[${index}] must be a string`);
      }
      if (item.length === 0) {
        throw new Error(`${field}[${index}] must not be empty`);
      }
      return item;
    });
  }

  throw new Error(`${field} must be a string or string[]`);
}
@@ -0,0 +1,164 @@
1
+ import { promises as fs } from "fs";
2
+ import {
3
+ BenchmarkGateResults,
4
+ BenchmarkReport,
5
+ EffectiveBenchmarkConfig,
6
+ ScenarioResult,
7
+ } from "./types";
8
+
9
/**
 * Evaluates the hard and soft gates for a finished benchmark run.
 *
 * Hard gates:
 * - for the `"smoke"` suite, the overall success rate must reach
 *   `gates.hard.smokeMinSuccessRate` (checked only when at least one scenario
 *   was executed);
 * - every scenario that was not skipped must reach the profile's
 *   `minScenarioPassRate`.
 *
 * Soft gates are evaluated only when `run.baselinePath` is set. Each
 * non-skipped scenario that also appears in the baseline (matched by id) is
 * compared on p95 latency and average throughput. A baseline value that is
 * not greater than zero disables the corresponding comparison.
 *
 * @param report Benchmark report without its gate results.
 * @param effective Effective configuration supplying thresholds and the baseline path.
 * @returns Pass/fail state and messages for the hard and soft gates.
 * @throws Error if the baseline file cannot be read or is not valid.
 */
export async function evaluateGates(
  report: Omit<BenchmarkReport, "gateResults">,
  effective: EffectiveBenchmarkConfig
): Promise<BenchmarkGateResults> {
  const { gates, run, profileSettings } = effective;
  const hardFailures: string[] = [];
  const softFailures: string[] = [];

  // Hard gate: overall success rate of the smoke suite.
  if (run.suite === "smoke") {
    const smokeFloor = gates.hard.smokeMinSuccessRate;
    const belowFloor = report.executed > 0 && report.successRate < smokeFloor;
    if (belowFloor) {
      hardFailures.push(
        `Smoke suite success rate ${toPct(report.successRate)} is below required ${toPct(smokeFloor)}.`
      );
    }
  }

  // Hard gate: per-scenario pass rate.
  const requiredPassRate = profileSettings.minScenarioPassRate;
  for (const scenario of report.results) {
    if (scenario.status === "skipped") {
      continue;
    }
    if (scenario.passRate < requiredPassRate) {
      hardFailures.push(
        `Scenario '${scenario.id}' pass rate ${toPct(scenario.passRate)} is below required ${toPct(requiredPassRate)}.`
      );
    }
  }

  // Soft gates: regressions relative to the baseline report.
  if (run.baselinePath) {
    const baseline = await loadBaseline(run.baselinePath);

    // Later baseline entries with the same id replace earlier ones.
    const baselineById = new Map<string, BaselineScenario>();
    for (const entry of baseline.results) {
      const normalized = normalizeBaselineScenario(entry);
      if (normalized !== null) {
        baselineById.set(normalized.id, normalized);
      }
    }

    for (const current of report.results) {
      const ref = current.status === "skipped" ? undefined : baselineById.get(current.id);
      if (!ref) {
        continue;
      }

      const allowedP95Pct = gates.soft.maxP95RegressionPct;
      const p95Ceiling = ref.p95LatencyMs * (1 + allowedP95Pct / 100);
      if (ref.p95LatencyMs > 0 && current.p95LatencyMs > p95Ceiling) {
        softFailures.push(
          `Scenario '${current.id}' p95 latency regressed ${pctDelta(current.p95LatencyMs, ref.p95LatencyMs)} (${current.p95LatencyMs}ms vs baseline ${ref.p95LatencyMs}ms).`
        );
      }

      const allowedDropPct = gates.soft.maxThroughputDropPct;
      const throughputFloor = ref.avgThroughputTokensPerSec * (1 - allowedDropPct / 100);
      if (ref.avgThroughputTokensPerSec > 0 && current.avgThroughputTokensPerSec < throughputFloor) {
        softFailures.push(
          `Scenario '${current.id}' throughput dropped ${throughputDropPct(current.avgThroughputTokensPerSec, ref.avgThroughputTokensPerSec)} (${current.avgThroughputTokensPerSec.toFixed(2)} t/s vs baseline ${ref.avgThroughputTokensPerSec.toFixed(2)} t/s).`
        );
      }
    }
  }

  return {
    hard: { passed: hardFailures.length === 0, messages: hardFailures },
    soft: { passed: softFailures.length === 0, messages: softFailures },
  };
}
85
+
86
/** The subset of a baseline scenario result that the soft gates compare against. */
interface BaselineScenario {
  /** Scenario identifier used to match baseline entries with current results. */
  id: string;
  /** Baseline p95 latency; 0 when the baseline entry has no finite value. */
  p95LatencyMs: number;
  /** Baseline average throughput; 0 when the baseline entry has no finite value. */
  avgThroughputTokensPerSec: number;
}
91
+
92
/**
 * Reads a baseline report from disk and checks its basic shape.
 *
 * @param pathLike Path of the baseline JSON file.
 * @returns The parsed document, guaranteed to have a top-level `results` array.
 *   The array entries themselves are not validated here.
 * @throws Error if the file cannot be read, is not valid JSON, or lacks a
 *   top-level `results` array.
 */
async function loadBaseline(pathLike: string): Promise<{ results: unknown[] }> {
  const messageOf = (error: unknown): string => (error as Error).message;

  let text: string;
  try {
    text = await fs.readFile(pathLike, "utf8");
  } catch (readError) {
    throw new Error(
      `Failed to read baseline file '${pathLike}': ${messageOf(readError)}`
    );
  }

  let payload: unknown;
  try {
    payload = JSON.parse(text);
  } catch (parseError) {
    throw new Error(
      `Failed to parse baseline file '${pathLike}' as JSON: ${messageOf(parseError)}`
    );
  }

  const isObject = typeof payload === "object" && payload !== null;
  const results = isObject ? (payload as { results?: unknown }).results : undefined;
  if (!Array.isArray(results)) {
    throw new Error(
      `Baseline file '${pathLike}' must contain a top-level 'results' array.`
    );
  }

  return payload as { results: unknown[] };
}
119
+
120
/**
 * Converts one raw entry of a baseline `results` array into the fields the
 * soft gates need.
 *
 * @param raw Untrusted entry taken from the parsed baseline file.
 * @returns The normalized scenario, or `null` when `raw` is not an object or
 *   has no string `id`. Metrics that are missing or not finite numbers become 0.
 */
function normalizeBaselineScenario(raw: unknown): BaselineScenario | null {
  const finiteOrZero = (candidate: unknown): number =>
    typeof candidate === "number" && Number.isFinite(candidate) ? candidate : 0;

  if (typeof raw !== "object" || raw === null) {
    return null;
  }

  const { id, p95LatencyMs, avgThroughputTokensPerSec } = raw as Partial<ScenarioResult>;
  if (typeof id !== "string") {
    return null;
  }

  return {
    id,
    p95LatencyMs: finiteOrZero(p95LatencyMs),
    avgThroughputTokensPerSec: finiteOrZero(avgThroughputTokensPerSec),
  };
}
144
+
145
/**
 * Formats a ratio as a percentage with one decimal place.
 *
 * @param value Ratio to format, where 1 corresponds to 100%.
 * @returns The percentage text, e.g. `"50.0%"` for 0.5.
 */
function toPct(value: number): string {
  const scaled = value * 100;
  return scaled.toFixed(1) + "%";
}
148
+
149
/**
 * Formats the relative change from `baseline` to `current` as a signed
 * percentage with one decimal place.
 *
 * @param current Current measurement.
 * @param baseline Baseline measurement.
 * @returns Text such as `"+20.0%"` or `"-5.0%"`, or `"n/a"` when `baseline` is 0.
 */
function pctDelta(current: number, baseline: number): string {
  if (baseline === 0) {
    return "n/a";
  }

  const change = current - baseline;
  const delta = (change / baseline) * 100;

  // Negative values already carry their own sign from toFixed.
  let sign = "";
  if (delta >= 0) {
    sign = "+";
  }
  return `${sign}${delta.toFixed(1)}%`;
}
157
+
158
/**
 * Formats how far `current` throughput has fallen below `baseline`, as a
 * percentage of the baseline with one decimal place.
 *
 * @param current Current throughput.
 * @param baseline Baseline throughput.
 * @returns Text such as `"20.0%"`, or `"n/a"` when `baseline` is 0. The value
 *   is negative when `current` exceeds `baseline`.
 */
function throughputDropPct(current: number, baseline: number): string {
  if (baseline === 0) {
    return "n/a";
  }

  const shortfall = baseline - current;
  const delta = (shortfall / baseline) * 100;
  return delta.toFixed(1) + "%";
}