waypoi 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260) hide show
  1. package/.github/instructions/ui.instructions.md +42 -0
  2. package/.github/workflows/ci.yml +35 -0
  3. package/.github/workflows/publish.yml +71 -0
  4. package/.github/workflows/release.yml +48 -0
  5. package/.playwright-mcp/console-2026-04-04T01-41-10-746Z.log +2 -0
  6. package/.playwright-mcp/console-2026-04-04T01-41-28-799Z.log +3 -0
  7. package/.playwright-mcp/console-2026-04-05T02-26-51-909Z.log +76 -0
  8. package/.playwright-mcp/page-2026-04-04T01-41-10-816Z.yml +1 -0
  9. package/.playwright-mcp/page-2026-04-04T01-41-29-141Z.yml +77 -0
  10. package/.playwright-mcp/page-2026-04-04T01-41-42-633Z.yml +190 -0
  11. package/.playwright-mcp/page-2026-04-04T01-42-03-929Z.yml +262 -0
  12. package/.playwright-mcp/page-2026-04-04T02-12-54-813Z.yml +6 -0
  13. package/.playwright-mcp/page-2026-04-04T02-14-58-600Z.yml +190 -0
  14. package/.playwright-mcp/page-2026-04-04T02-15-03-923Z.yml +190 -0
  15. package/.playwright-mcp/page-2026-04-04T02-15-07-426Z.yml +190 -0
  16. package/.playwright-mcp/page-2026-04-04T02-15-25-729Z.yml +262 -0
  17. package/.playwright-mcp/page-2026-04-04T02-16-22-984Z.yml +262 -0
  18. package/.playwright-mcp/page-2026-04-04T02-17-00-599Z.yml +190 -0
  19. package/.playwright-mcp/page-2026-04-04T02-17-50-874Z.yml +190 -0
  20. package/.playwright-mcp/page-2026-04-05T02-26-55-570Z.yml +6 -0
  21. package/AGENTS.md +48 -0
  22. package/CHANGELOG.md +131 -0
  23. package/README.md +552 -0
  24. package/assets/agent-mode.png +0 -0
  25. package/assets/categorize.png +0 -0
  26. package/assets/dashboard.png +0 -0
  27. package/assets/endpoint-proxy.png +0 -0
  28. package/assets/icon.png +0 -0
  29. package/assets/mcp-generate-image.png +0 -0
  30. package/assets/mcp-understand-image.png +0 -0
  31. package/assets/peek-token-flow.png +0 -0
  32. package/assets/playground.png +0 -0
  33. package/assets/sankey.png +0 -0
  34. package/cli/index.ts +2805 -0
  35. package/cli/legacyRewrite.ts +108 -0
  36. package/cli/modelRef.ts +24 -0
  37. package/dist/cli/index.js +2536 -0
  38. package/dist/cli/legacyRewrite.js +92 -0
  39. package/dist/cli/modelRef.js +20 -0
  40. package/dist/src/benchmark/artifacts.js +131 -0
  41. package/dist/src/benchmark/capabilityClassifier.js +81 -0
  42. package/dist/src/benchmark/capabilityStore.js +144 -0
  43. package/dist/src/benchmark/config.js +238 -0
  44. package/dist/src/benchmark/gates.js +118 -0
  45. package/dist/src/benchmark/jobs.js +252 -0
  46. package/dist/src/benchmark/runner.js +1847 -0
  47. package/dist/src/benchmark/schema.js +353 -0
  48. package/dist/src/benchmark/suites.js +314 -0
  49. package/dist/src/benchmark/tinyQaDataset.js +422 -0
  50. package/dist/src/benchmark/types.js +25 -0
  51. package/dist/src/config.js +47 -0
  52. package/dist/src/index.js +178 -0
  53. package/dist/src/mcp/client.js +215 -0
  54. package/dist/src/mcp/discovery.js +226 -0
  55. package/dist/src/mcp/policy.js +65 -0
  56. package/dist/src/mcp/registry.js +129 -0
  57. package/dist/src/mcp/service.js +460 -0
  58. package/dist/src/middleware/auth.js +179 -0
  59. package/dist/src/middleware/requestCapture.js +192 -0
  60. package/dist/src/middleware/requestStats.js +118 -0
  61. package/dist/src/pools/builder.js +132 -0
  62. package/dist/src/pools/repository.js +69 -0
  63. package/dist/src/pools/scheduler.js +360 -0
  64. package/dist/src/pools/types.js +2 -0
  65. package/dist/src/protocols/adapters/dashscope.js +267 -0
  66. package/dist/src/protocols/adapters/inferenceV2.js +346 -0
  67. package/dist/src/protocols/adapters/openai.js +27 -0
  68. package/dist/src/protocols/registry.js +99 -0
  69. package/dist/src/protocols/types.js +2 -0
  70. package/dist/src/providers/health.js +153 -0
  71. package/dist/src/providers/importer.js +289 -0
  72. package/dist/src/providers/modelRegistry.js +313 -0
  73. package/dist/src/providers/repository.js +361 -0
  74. package/dist/src/providers/types.js +2 -0
  75. package/dist/src/routes/admin.js +531 -0
  76. package/dist/src/routes/audio.js +295 -0
  77. package/dist/src/routes/chat.js +240 -0
  78. package/dist/src/routes/embeddings.js +157 -0
  79. package/dist/src/routes/images.js +288 -0
  80. package/dist/src/routes/mcp.js +256 -0
  81. package/dist/src/routes/mcpService.js +100 -0
  82. package/dist/src/routes/models.js +48 -0
  83. package/dist/src/routes/responses.js +711 -0
  84. package/dist/src/routes/sessions.js +450 -0
  85. package/dist/src/routes/stats.js +270 -0
  86. package/dist/src/routes/ui.js +97 -0
  87. package/dist/src/routes/videos.js +107 -0
  88. package/dist/src/routing/router.js +338 -0
  89. package/dist/src/services/imageGeneration.js +280 -0
  90. package/dist/src/services/imageUnderstanding.js +352 -0
  91. package/dist/src/services/videoGeneration.js +79 -0
  92. package/dist/src/storage/captureRepository.js +1591 -0
  93. package/dist/src/storage/files.js +157 -0
  94. package/dist/src/storage/imageCache.js +346 -0
  95. package/dist/src/storage/repositories.js +388 -0
  96. package/dist/src/storage/sessionRepository.js +370 -0
  97. package/dist/src/storage/statsRepository.js +204 -0
  98. package/dist/src/transport/httpClient.js +126 -0
  99. package/dist/src/types.js +2 -0
  100. package/dist/src/utils/messageMedia.js +285 -0
  101. package/dist/src/utils/modelCapabilities.js +108 -0
  102. package/dist/src/utils/modelDiscovery.js +170 -0
  103. package/dist/src/version.js +5 -0
  104. package/dist/src/workers/captureRetention.js +25 -0
  105. package/dist/src/workers/configWatcher.js +91 -0
  106. package/dist/src/workers/healthChecker.js +21 -0
  107. package/dist/src/workers/statsRotation.js +41 -0
  108. package/docs/LLM/output_schema.md +312 -0
  109. package/docs/benchmark.md +208 -0
  110. package/docs/mcp-guidelines.md +125 -0
  111. package/docs/mcp-service.md +178 -0
  112. package/docs/opencode.md +86 -0
  113. package/docs/providers.md +79 -0
  114. package/examples/benchmark.config.yaml +28 -0
  115. package/examples/providers/alibaba-dashscope.yaml +88 -0
  116. package/examples/providers/alibaba-llm.yaml +64 -0
  117. package/examples/providers/alibaba-registry.yaml +7 -0
  118. package/examples/providers/inference-v2-ray.yaml +29 -0
  119. package/examples/scenarios/assets/omni-call-sample.wav +0 -0
  120. package/examples/scenarios/custom.jsonl +5 -0
  121. package/examples/scenarios/custom.yaml +40 -0
  122. package/model-form-v2.png +0 -0
  123. package/package.json +66 -0
  124. package/provider-form-v2.png +0 -0
  125. package/provider-form.png +0 -0
  126. package/scripts/manual-test.sh +11 -0
  127. package/scripts/version-from-git.js +23 -0
  128. package/src/benchmark/artifacts.ts +149 -0
  129. package/src/benchmark/capabilityClassifier.ts +99 -0
  130. package/src/benchmark/capabilityStore.ts +174 -0
  131. package/src/benchmark/config.ts +337 -0
  132. package/src/benchmark/gates.ts +164 -0
  133. package/src/benchmark/jobs.ts +312 -0
  134. package/src/benchmark/runner.ts +2519 -0
  135. package/src/benchmark/schema.ts +443 -0
  136. package/src/benchmark/suites.ts +323 -0
  137. package/src/benchmark/tinyQaDataset.ts +428 -0
  138. package/src/benchmark/types.ts +442 -0
  139. package/src/config.ts +44 -0
  140. package/src/index.ts +195 -0
  141. package/src/mcp/client.ts +305 -0
  142. package/src/mcp/discovery.ts +266 -0
  143. package/src/mcp/policy.ts +105 -0
  144. package/src/mcp/registry.ts +164 -0
  145. package/src/mcp/service.ts +611 -0
  146. package/src/middleware/auth.ts +251 -0
  147. package/src/middleware/requestCapture.ts +245 -0
  148. package/src/middleware/requestStats.ts +163 -0
  149. package/src/pools/builder.ts +159 -0
  150. package/src/pools/repository.ts +71 -0
  151. package/src/pools/scheduler.ts +425 -0
  152. package/src/pools/types.ts +117 -0
  153. package/src/protocols/adapters/dashscope.ts +335 -0
  154. package/src/protocols/adapters/inferenceV2.ts +428 -0
  155. package/src/protocols/adapters/openai.ts +32 -0
  156. package/src/protocols/registry.ts +117 -0
  157. package/src/protocols/types.ts +81 -0
  158. package/src/providers/health.ts +207 -0
  159. package/src/providers/importer.ts +402 -0
  160. package/src/providers/modelRegistry.ts +415 -0
  161. package/src/providers/repository.ts +439 -0
  162. package/src/providers/types.ts +113 -0
  163. package/src/routes/admin.ts +666 -0
  164. package/src/routes/audio.ts +372 -0
  165. package/src/routes/chat.ts +301 -0
  166. package/src/routes/embeddings.ts +197 -0
  167. package/src/routes/images.ts +356 -0
  168. package/src/routes/mcp.ts +320 -0
  169. package/src/routes/mcpService.ts +114 -0
  170. package/src/routes/models.ts +50 -0
  171. package/src/routes/responses.ts +872 -0
  172. package/src/routes/sessions.ts +558 -0
  173. package/src/routes/stats.ts +312 -0
  174. package/src/routes/ui.ts +96 -0
  175. package/src/routes/videos.ts +132 -0
  176. package/src/routing/router.ts +501 -0
  177. package/src/services/imageGeneration.ts +396 -0
  178. package/src/services/imageUnderstanding.ts +449 -0
  179. package/src/services/videoGeneration.ts +127 -0
  180. package/src/storage/captureRepository.ts +1835 -0
  181. package/src/storage/files.ts +178 -0
  182. package/src/storage/imageCache.ts +405 -0
  183. package/src/storage/repositories.ts +494 -0
  184. package/src/storage/sessionRepository.ts +419 -0
  185. package/src/storage/statsRepository.ts +238 -0
  186. package/src/transport/httpClient.ts +145 -0
  187. package/src/types.ts +322 -0
  188. package/src/utils/messageMedia.ts +293 -0
  189. package/src/utils/modelCapabilities.ts +161 -0
  190. package/src/utils/modelDiscovery.ts +203 -0
  191. package/src/workers/captureRetention.ts +25 -0
  192. package/src/workers/configWatcher.ts +115 -0
  193. package/src/workers/healthChecker.ts +22 -0
  194. package/src/workers/statsRotation.ts +49 -0
  195. package/tests/benchmarkAdminRoutes.test.ts +82 -0
  196. package/tests/benchmarkBasics.test.ts +116 -0
  197. package/tests/captureAdminRoutes.test.ts +420 -0
  198. package/tests/captureRepository.test.ts +797 -0
  199. package/tests/cliLegacyRewrite.test.ts +45 -0
  200. package/tests/imageGeneration.service.test.ts +107 -0
  201. package/tests/imageUnderstanding.service.test.ts +123 -0
  202. package/tests/mcpPolicy.test.ts +105 -0
  203. package/tests/mcpService.test.ts +1245 -0
  204. package/tests/modelRef.test.ts +23 -0
  205. package/tests/modelsRoutes.test.ts +154 -0
  206. package/tests/sessionMediaCache.test.ts +167 -0
  207. package/tests/statsRoutes.test.ts +323 -0
  208. package/tsconfig.json +15 -0
  209. package/ui/index.html +16 -0
  210. package/ui/package-lock.json +8521 -0
  211. package/ui/package.json +52 -0
  212. package/ui/postcss.config.js +6 -0
  213. package/ui/public/assets/apple-touch-icon.png +0 -0
  214. package/ui/public/assets/favicon-16.png +0 -0
  215. package/ui/public/assets/favicon-32.png +0 -0
  216. package/ui/public/assets/icon-192.png +0 -0
  217. package/ui/public/assets/icon-512.png +0 -0
  218. package/ui/src/App.tsx +27 -0
  219. package/ui/src/api/client.ts +1503 -0
  220. package/ui/src/components/EndpointUsageGuide.tsx +361 -0
  221. package/ui/src/components/Layout.tsx +124 -0
  222. package/ui/src/components/MessageContent.tsx +365 -0
  223. package/ui/src/components/ToolCallMessage.tsx +179 -0
  224. package/ui/src/components/ToolPicker.tsx +442 -0
  225. package/ui/src/components/messageContentParser.test.ts +41 -0
  226. package/ui/src/components/messageContentParser.ts +73 -0
  227. package/ui/src/components/thinkingPreview.test.ts +27 -0
  228. package/ui/src/components/thinkingPreview.ts +15 -0
  229. package/ui/src/components/toMermaidSankey.test.ts +78 -0
  230. package/ui/src/components/toMermaidSankey.ts +56 -0
  231. package/ui/src/components/ui/button.tsx +58 -0
  232. package/ui/src/components/ui/input.tsx +21 -0
  233. package/ui/src/components/ui/textarea.tsx +21 -0
  234. package/ui/src/lib/utils.ts +6 -0
  235. package/ui/src/main.tsx +9 -0
  236. package/ui/src/pages/AgentPlayground.tsx +2010 -0
  237. package/ui/src/pages/Benchmark.tsx +988 -0
  238. package/ui/src/pages/Dashboard.tsx +581 -0
  239. package/ui/src/pages/Peek.tsx +962 -0
  240. package/ui/src/pages/Settings.tsx +2013 -0
  241. package/ui/src/pages/agentPlaygroundPayload.test.ts +109 -0
  242. package/ui/src/pages/agentPlaygroundPayload.ts +97 -0
  243. package/ui/src/pages/agentThinkingContent.test.ts +50 -0
  244. package/ui/src/pages/agentThinkingContent.ts +57 -0
  245. package/ui/src/pages/dashboardTokenUsage.test.ts +66 -0
  246. package/ui/src/pages/dashboardTokenUsage.ts +36 -0
  247. package/ui/src/pages/imageUpload.test.ts +39 -0
  248. package/ui/src/pages/imageUpload.ts +71 -0
  249. package/ui/src/pages/peekFilters.test.ts +29 -0
  250. package/ui/src/pages/peekFilters.ts +13 -0
  251. package/ui/src/pages/peekMedia.test.ts +58 -0
  252. package/ui/src/pages/peekMedia.ts +148 -0
  253. package/ui/src/pages/sessionAutoTitle.test.ts +128 -0
  254. package/ui/src/pages/sessionAutoTitle.ts +106 -0
  255. package/ui/src/stores/settings.ts +58 -0
  256. package/ui/src/styles/globals.css +223 -0
  257. package/ui/src/vite-env.d.ts +8 -0
  258. package/ui/tailwind.config.js +106 -0
  259. package/ui/tsconfig.json +32 -0
  260. package/ui/vite.config.ts +37 -0
@@ -0,0 +1,21 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.startHealthChecker = startHealthChecker;
4
+ exports.stopHealthChecker = stopHealthChecker;
5
+ const health_1 = require("../providers/health");
6
+ let healthTimer = null;
7
/**
 * Starts the periodic provider health probe.
 *
 * One probe is kicked off immediately and then repeated every 30 seconds.
 * Calling this while a checker is already active is a no-op, so the stored
 * interval handle is never overwritten and stopHealthChecker() can always
 * clear the timer that is actually running.
 *
 * @param paths Value forwarded unchanged to probeProviderModels.
 */
function startHealthChecker(paths) {
    if (healthTimer) {
        return; // Already running
    }
    const intervalMs = 30_000;
    const run = async () => {
        try {
            await (0, health_1.probeProviderModels)(paths);
        }
        catch (error) {
            // Nothing awaits this promise (setInterval and `void run()` both
            // discard it), so a failed probe would otherwise surface as an
            // unhandled rejection. Log it and let the next tick retry.
            console.error("[health-checker] Error probing provider models:", error);
        }
    };
    healthTimer = setInterval(run, intervalMs);
    // Prevent timer from keeping process alive
    healthTimer.unref();
    void run();
}
16
/**
 * Stops the periodic health probe if one is active; otherwise does nothing.
 */
function stopHealthChecker() {
    if (!healthTimer) {
        return;
    }
    clearInterval(healthTimer);
    // Drop the handle so a later start/stop sees the checker as inactive.
    healthTimer = null;
}
@@ -0,0 +1,41 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.startStatsRotation = startStatsRotation;
4
+ exports.stopStatsRotation = stopStatsRotation;
5
+ const statsRepository_1 = require("../storage/statsRepository");
6
+ /**
7
+ * Stats Rotation Worker
8
+ *
9
+ * Periodically cleans up stats files older than retention period.
10
+ * Runs daily by default.
11
+ */
12
+ const ROTATION_INTERVAL_MS = 24 * 60 * 60 * 1000; // 24 hours
13
+ const DEFAULT_RETENTION_DAYS = 30;
14
+ let rotationTimer = null;
15
/**
 * Starts the stats rotation worker.
 *
 * Performs one rotation pass right away and schedules further passes every
 * ROTATION_INTERVAL_MS. Does nothing when a rotation timer already exists.
 *
 * @param paths Value forwarded unchanged to rotateStats.
 * @param retentionDays Retention period passed to rotateStats.
 */
function startStatsRotation(paths, retentionDays = DEFAULT_RETENTION_DAYS) {
    // Already running
    if (rotationTimer) {
        return;
    }
    const rotateOnce = async () => {
        try {
            const removedCount = await (0, statsRepository_1.rotateStats)(paths, retentionDays);
            if (removedCount > 0) {
                console.log(`[stats-rotation] Deleted ${removedCount} stats file(s) older than ${retentionDays} days`);
            }
        }
        catch (err) {
            console.error("[stats-rotation] Error rotating stats:", err);
        }
    };
    // First pass happens at startup; the interval covers every later pass.
    rotateOnce();
    rotationTimer = setInterval(rotateOnce, ROTATION_INTERVAL_MS);
    // The timer alone should not keep the process alive.
    rotationTimer.unref();
}
36
/**
 * Stops the stats rotation worker if it is running; otherwise does nothing.
 */
function stopStatsRotation() {
    if (!rotationTimer) {
        return;
    }
    clearInterval(rotationTimer);
    // Clear the handle so startStatsRotation() can schedule a new timer.
    rotationTimer = null;
}
@@ -0,0 +1,312 @@
1
+ # LLM Output Schema
2
+
3
+ This document describes the output format supported by Waypoi's UI for displaying LLM responses, including thinking/reasoning content.
4
+
5
+ ## Overview
6
+
7
+ Waypoi's Playground UI supports displaying the thinking process of LLMs that provide reasoning content. The system handles both:
8
+
9
+ 1. **Native reasoning fields** from LLM APIs (e.g., DeepSeek's `reasoning_content`)
10
+ 2. **HTML-style tags** embedded in the response text
11
+
12
+ ## Supported Thinking Formats
13
+
14
+ ### 1. Native API Fields
15
+
16
+ Some LLM providers include reasoning content in separate fields of the streaming response:
17
+
18
+ | Provider | Field Name | Example |
19
+ |----------|------------|---------|
20
+ | DeepSeek | `reasoning_content` | `choices[0].delta.reasoning_content` |
21
+ | Other providers | `reasoning` | `choices[0].delta.reasoning` |
22
+
23
+ The Waypoi backend automatically extracts these fields and the frontend wraps them in `<think>` tags for display.
24
+
25
+ ### 2. HTML-Style Tags
26
+
27
+ LLMs can also output thinking content wrapped in HTML-like tags:
28
+
29
+ ```
30
+ <think>
31
+ This is my thinking process...
32
+ Step 1: Analyze the problem
33
+ Step 2: Consider solutions
34
+ </think>
35
+
36
+ This is the final response based on my reasoning above.
37
+ ```
38
+
39
+ The UI recognizes these tags and renders the thinking content in a collapsible "Thinking process" block.
40
+
41
+ ## Streaming Response Format
42
+
43
+ When `stream: true` is enabled, the API returns Server-Sent Events (SSE) with the following structure:
44
+
45
+ ### Standard OpenAI Format
46
+
47
+ ```json
48
+ {
49
+ "id": "chatcmpl-xxx",
50
+ "object": "chat.completion.chunk",
51
+ "created": 1234567890,
52
+ "model": "gpt-4",
53
+ "choices": [
54
+ {
55
+ "index": 0,
56
+ "delta": {
57
+ "content": "Hello",
58
+ "reasoning_content": "I should greet the user warmly"
59
+ },
60
+ "finish_reason": null
61
+ }
62
+ ]
63
+ }
64
+ ```
65
+
66
+ ### Frontend Processing
67
+
68
+ The frontend processes each chunk:
69
+
70
+ 1. Extracts `content` (regular response text)
71
+ 2. Extracts `reasoning_content` or `reasoning` (thinking process)
72
+ 3. Wraps reasoning in `<think>` tags, closing the tag when transitioning from reasoning to content
73
+ 4. Combines both into the display message
74
+
75
+ Example flow:
76
+
77
+ ```
78
+ Chunk 1: { reasoning: "Let me think..." } → Display: "<think>Let me think..."
79
+ Chunk 2: { reasoning: "Step 1: Analyze" } → Display: "<think>Let me think...Step 1: Analyze"
80
+ Chunk 3: { content: "Based on my analysis" } → Display: "<think>Let me think...Step 1: Analyze</think>\n\nBased on my analysis"
81
+ ```
82
+
83
+ ## Display Behavior
84
+
85
+ ### Thinking Block UI
86
+
87
+ When the UI detects `<think>...</think>` content, it renders:
88
+
89
+ - A collapsible block with a "Thinking process" header
90
+ - Brain icon and expand/collapse chevron
91
+ - Monospace font for the thinking content
92
+ - Collapsed by default to focus on the main response
93
+
94
+ ### Parsing Logic
95
+
96
+ The `MessageContent` component handles three edge cases:
97
+
98
+ 1. **Standard format**: `<think>...content...</think>` - Properly tagged thinking
99
+ 2. **Missing opening tag**: Content before `</think>` is treated as thinking
100
+ 3. **Unclosed tag**: `<think>...` during streaming (tag will be closed when content arrives)
101
+
102
+ ## Supported Models
103
+
104
+ The following models are known to provide reasoning content:
105
+
106
+ | Model | Reasoning Field | Notes |
107
+ |-------|----------------|-------|
108
+ | DeepSeek-R1 | `reasoning_content` | Chain-of-thought reasoning |
109
+ | DeepSeek-V3 | `reasoning_content` | Extended thinking mode |
110
+ | Other reasoning models | `reasoning` | Generic field support |
111
+
112
+ Models that output `<think>` tags in their response (like some Qwen or Llama variants) will also have their thinking content displayed correctly.
113
+
114
+ ## Implementation Details
115
+
116
+ ### Backend (`src/routes/responses.ts`)
117
+
118
+ The Responses API shim handles reasoning content from Codex-formatted requests:
119
+
120
+ ```typescript
121
+ if (delta.reasoning_content || delta.reasoning) {
122
+ const reasoningDelta = delta.reasoning_content || delta.reasoning;
123
+ sendEvent("response.reasoning_text.delta", {
124
+ type: "response.reasoning_text.delta",
125
+ delta: reasoningDelta
126
+ });
127
+ }
128
+ ```
129
+
130
+ ### Frontend (`ui/src/api/client.ts`)
131
+
132
+ The streaming client extracts both content and reasoning:
133
+
134
+ ```typescript
135
+ const delta = parsed.choices?.[0]?.delta;
136
+ const content = delta?.content;
137
+ const reasoning = delta?.reasoning_content || delta?.reasoning;
138
+
139
+ if (content || reasoning) {
140
+ yield { content: content || '', reasoning: reasoning || undefined };
141
+ }
142
+ ```
143
+
144
+ ### Playground (`ui/src/pages/Playground.tsx`)
145
+
146
+ The Playground component tracks reasoning and content separately, then combines them:
147
+
148
+ ```typescript
149
+ if (chunk.reasoning) {
150
+ if (!hasReasoning) {
151
+ reasoningContent = '<think>' + chunk.reasoning;
152
+ } else {
153
+ reasoningContent += chunk.reasoning;
154
+ }
155
+ }
156
+
157
+ if (chunk.content && hasReasoning && !reasoningClosed) {
158
+ reasoningContent += '</think>';
159
+ reasoningClosed = true;
160
+ }
161
+ ```
162
+
163
+ ## Testing
164
+
165
+ To verify thinking content display:
166
+
167
+ 1. Use a model that supports reasoning (e.g., DeepSeek-R1)
168
+ 2. Send a complex question requiring multi-step reasoning
169
+ 3. Observe that the "Thinking process" collapsible block appears
170
+ 4. Expand to see the reasoning content
171
+ 5. Verify the final response follows the thinking block
172
+
173
+ Example test prompt:
174
+
175
+ ```
176
+ If a train travels 120 km in 2 hours, then stops for 30 minutes,
177
+ then continues at the same speed for another 90 km,
178
+ what is the total travel time?
179
+ ```
180
+
181
+ Models with reasoning capability will show their calculation steps in the thinking block before providing the final answer.
182
+
183
+ ## Future Enhancements
184
+
185
+ - Support for multiple thinking blocks in a single response
186
+ - Configurable thinking display (always show/hide by default)
187
+ - Token count display for reasoning vs. response content
188
+ - Export thinking content separately from the final response
189
+
190
+ ## Codex CLI Specific Schema
191
+
192
+ Codex CLI uses a custom event-based protocol rather than the standard OpenAI API format. The Waypoi proxy must transform standard API responses to match Codex's expectations.
193
+
194
+ ### Key Differences from Standard API
195
+
196
+ | Feature | Standard OpenAI | Codex CLI |
197
+ |---------|----------------|-----------|
198
+ | Reasoning Content | Embedded in `delta.content` or separate field | Dedicated `AgentReasoningDeltaEvent` events |
199
+ | Tool Calls | Standard `tool_calls` array | Custom `McpToolCallBeginEvent`/`McpToolCallEndEvent` |
200
+ | Command Execution | Not supported | Special `ExecCommandBeginEvent`/`ExecCommandEndEvent` |
201
+ | Model Requirements | Standard names | Specific names like `gpt-5.1-codex-mini` |
202
+
203
+ ### Codex-Specific Event Types
204
+
205
+ Codex CLI expects the following event types in the stream:
206
+
207
+ ```typescript
208
+ // Reasoning content
209
+ interface AgentReasoningDeltaEvent {
210
+ type: 'agent_reasoning_delta';
211
+ delta: string;
212
+ }
213
+
214
+ // Raw reasoning content (for internal processing)
215
+ interface AgentReasoningRawContentDeltaEvent {
216
+ type: 'agent_reasoning_raw_content_delta';
217
+ delta: string;
218
+ }
219
+
220
+ // Regular message content
221
+ interface AgentMessageDeltaEvent {
222
+ type: 'agent_message_delta';
223
+ delta: string;
224
+ }
225
+
226
+ // Tool calls
227
+ interface McpToolCallBeginEvent {
228
+ type: 'mcp_tool_call_begin';
229
+ name: string;
230
+ arguments: object;
231
+ }
232
+
233
+ interface McpToolCallEndEvent {
234
+ type: 'mcp_tool_call_end';
235
+ result: string;
236
+ }
237
+
238
+ // Command execution
239
+ interface ExecCommandBeginEvent {
240
+ type: 'exec_command_begin';
241
+ command: string;
242
+ source: 'user' | 'agent';
243
+ }
244
+
245
+ interface ExecCommandEndEvent {
246
+ type: 'exec_command_end';
247
+ exit_code: number;
248
+ output: string;
249
+ }
250
+ ```
251
+
252
+ ### Proxy Transformation Rules
253
+
254
+ The Waypoi proxy handles the translation between standard OpenAI format and Codex's custom protocol:
255
+
256
+ 1. **Reasoning Content Extraction**
257
+ - When `reasoning_content` or `reasoning` fields are detected, they're converted to `AgentReasoningDeltaEvent`
258
+ - Example: `{"delta": {"reasoning_content": "Thinking step..."}}` → `{"type": "agent_reasoning_delta", "delta": "Thinking step..."}`
259
+
260
+ 2. **Special Model Handling**
261
+ - Requests to `gpt-5.1-codex-mini` are routed to specific endpoints
262
+ - Other Codex-specific models are transformed to match backend requirements
263
+
264
+ 3. **Tool Call Conversion**
265
+ - Standard tool calls are converted to `McpToolCallBeginEvent`/`McpToolCallEndEvent` sequence
266
+ - Custom tool parameters are preserved
267
+
268
+ ### Implementation in Waypoi
269
+
270
+ The proxy implements these transformations in `src/routes/responses.ts`:
271
+
272
+ ```typescript
273
+ // Convert OpenAI tool calls to Codex MCP events
274
+ if (delta.tool_calls) {
275
+ delta.tool_calls.forEach(toolCall => {
276
+ sendEvent("mcp_tool_call_begin", {
277
+ type: "mcp_tool_call_begin",
278
+ name: toolCall.function.name,
279
+ arguments: JSON.parse(toolCall.function.arguments)
280
+ });
281
+ });
282
+ }
283
+
284
+ // Handle reasoning content from various sources
285
+ if (delta.reasoning_content || delta.reasoning) {
286
+ const reasoningDelta = delta.reasoning_content || delta.reasoning;
287
+ sendEvent("response.reasoning_text.delta", {
288
+ type: "response.reasoning_text.delta",
289
+ delta: reasoningDelta
290
+ });
291
+ }
292
+ ```
293
+
294
+ ### Testing with Codex CLI
295
+
296
+ To verify Codex CLI compatibility:
297
+
298
+ 1. Set the model to `gpt-5.1-codex-mini` in your settings
299
+ 2. Enable reasoning mode if available
300
+ 3. Send a complex prompt requiring multi-step reasoning
301
+ 4. Verify the thinking process appears in dedicated blocks
302
+ 5. Test tool calling with `@mcp` commands
303
+
304
+ Example Codex CLI prompt:
305
+
306
+ ```
307
+ @model gpt-5.1-codex-mini
308
+ @reasoning
309
+ Explain step by step how you would calculate the area of a triangle with sides 3, 4, and 5.
310
+ ```
311
+
312
+ The proxy should properly route this request and format the response to match Codex's event structure, with reasoning content separated from the final answer.
@@ -0,0 +1,208 @@
1
+ # Waypoi Benchmark
2
+
3
+ Waypoi benchmark now has two roles:
4
+
5
+ - `showcase`: a live, user-visible replay of curated examples
6
+ - `diagnostic`: the older internal smoke/capability/regression path
7
+
8
+ Default behavior is showcase-first.
9
+
10
+ ## Quick start
11
+
12
+ ```bash
13
+ # Default run: showcase suite, one visible replay per example
14
+ waypoi bench
15
+
16
+ # List showcase examples
17
+ waypoi bench --list-examples
18
+
19
+ # Run one example
20
+ waypoi bench --example showcase-tinyqa-001
21
+
22
+ # Pin a model for a showcase example
23
+ waypoi bench --suite showcase --example showcase-tinyqa-001 --model smart
24
+
25
+ # Run a diagnostic suite
26
+ waypoi bench --mode diagnostic --suite pool_smoke
27
+
28
+ # Add file-driven scenarios
29
+ waypoi bench --scenario ./examples/scenarios/custom.yaml
30
+
31
+ # Compare with a previous diagnostic run
32
+ waypoi bench --mode diagnostic --baseline ~/.config/waypoi/benchmarks/bench-2026-02-23T12-00-00-000Z.json
33
+ ```
34
+
35
+ ## CLI options
36
+
37
+ - `--suite <name>` built-in suite. Public default is `showcase`.
38
+ - `--example <id>` run one built-in example from the selected suite.
39
+ - `--list-examples` list built-in examples and exit.
40
+ - `--mode <name>` `showcase` or `diagnostic`.
41
+ - `--scenario <path>` scenario file (`.json`, `.jsonl`, `.yaml`, `.yml`).
42
+ - `--model <name>` force one model for all scenarios.
43
+ - `--out <path>` output file (`.json`/`.txt`) or output directory.
44
+ - `--config <path>` benchmark config file (YAML or JSON).
45
+ - `--profile <name>` config profile (default: `local`).
46
+ - `--baseline <path>` previous benchmark report for p95/throughput deltas.
47
+ - `--update-cap-cache` persist capability findings to `$WAYPOI_DIR/capabilities`.
48
+ - `--cap-ttl-days <n>` capability TTL override for freshness (default `7`).
49
+ - `--temperature <n>` run-level generation override for supported modes.
50
+ - `--top-p <n>` run-level generation override (`0..1`) for supported modes.
51
+ - `--max-tokens <n>` run-level generation override (`>=1`) for supported modes.
52
+ - `--presence-penalty <n>` run-level generation override (`-2..2`) for supported modes.
53
+ - `--frequency-penalty <n>` run-level generation override (`-2..2`) for supported modes.
54
+ - `--seed <n>` optional run-level deterministic seed (`>=0`) for supported modes.
55
+ - `--stop <value>` optional stop sequence (string) or comma-separated list in UI.
56
+
57
+ ## Showcase examples
58
+
59
+ The `showcase` suite is the release-facing path. It is built from Hugging Face
60
+ dataset `vincentkoc/tiny_qa_benchmark` (train split):
61
+
62
+ - 52 single-question QA prompts
63
+ - chat-mode single-turn runs
64
+ - per-question answer checks via `contains`
65
+ - category/difficulty metadata exposed as expected highlights
66
+
67
+ Showcase behavior:
68
+
69
+ - sequential only
70
+ - one visible replay per scenario
71
+ - request/response trace is the main artifact
72
+ - verdict explains what passed or failed
73
+ - raw payloads stay in the live event stream; persisted artifacts keep sanitized traces
74
+
75
+ ## Diagnostic suites
76
+
77
+ The older suites remain for engineering use:
78
+
79
+ - `smoke`
80
+ - `proxy`
81
+ - `agent`
82
+ - `pool_smoke`
83
+ - `omni_call_smoke`
84
+ - `capabilities`
85
+
86
+ Diagnostic behavior:
87
+
88
+ - profile-driven warmup and measured runs
89
+ - pass-rate and latency summaries
90
+ - optional baseline regression warnings
91
+ - optional capability cache updates
92
+
93
+ Concurrency is no longer part of the benchmark story.
94
+
95
+ ## Scenario schema
96
+
97
+ Required fields:
98
+
99
+ - `id: string`
100
+ - `mode: "chat" | "agent" | "responses" | "embeddings" | "image_generation" | "audio_transcription" | "audio_speech" | "omni_call"`
101
+
102
+ Mode-specific required fields:
103
+
104
+ - `chat | agent | responses | image_generation`: `prompt`
105
+ - `embeddings`: `input`
106
+ - `audio_transcription`: `audioFile`
107
+ - `audio_speech`: `inputText`, `voice`
108
+ - `omni_call`: `audioFile`
109
+
110
+ Useful showcase metadata:
111
+
112
+ - `title`
113
+ - `summary`
114
+ - `userVisibleGoal`
115
+ - `exampleSource`
116
+ - `inputPreview`
117
+ - `successCriteria`
118
+ - `expectedHighlights`
119
+ - `requiresAvailableTools`
120
+
121
+ Assertions:
122
+
123
+ - generic: `statusCode`, `maxLatencyMs`
124
+ - chat/agent/responses: `contains`, `notContains`
125
+ - agent: `minToolCalls`, `maxToolCalls`, `requiredToolNames`
126
+ - embeddings: `minItems`, `minVectorLength`
127
+ - image generation: `minImages`
128
+ - audio transcription: `containsText`, `notContainsText`
129
+ - audio speech: `minBytes`, `contentType`
130
+
131
+ Validation behavior:
132
+
133
+ - schema errors fail fast with `file + index + field`
134
+ - unknown fields become warnings
135
+
136
+ Generation parameter precedence for supported modes (`chat`, `agent`, `responses`, `omni_call`):
137
+
138
+ 1. scenario-level value
139
+ 2. run-level override
140
+ 3. config defaults
141
+ 4. built-in defaults
142
+
143
+ ### Example: showcase responses scenario
144
+
145
+ ```json
146
+ {
147
+ "id": "responses-demo",
148
+ "mode": "responses",
149
+ "title": "Responses Demo",
150
+ "userVisibleGoal": "Show Responses API compatibility.",
151
+ "prompt": "List two reasons to use a local AI gateway.",
152
+ "assertions": {
153
+ "statusCode": 200
154
+ }
155
+ }
156
+ ```
157
+
158
+ ### Example: showcase tool-calling scenario
159
+
160
+ ```json
161
+ {
162
+ "id": "agent-tool-demo",
163
+ "mode": "agent",
164
+ "title": "Tool Calling",
165
+ "prompt": "Use one available tool, then summarize what you learned.",
166
+ "requiresAvailableTools": true,
167
+ "assertions": {
168
+ "statusCode": 200,
169
+ "minToolCalls": 1
170
+ }
171
+ }
172
+ ```
173
+
174
+ ## Artifacts and UI behavior
175
+
176
+ Each run writes:
177
+
178
+ - `bench-<timestamp>.json`
179
+ - `bench-<timestamp>.txt`
180
+
181
+ Reports now include:
182
+
183
+ - run metadata and effective config
184
+ - per-scenario results
185
+ - sanitized scenario details for history
186
+ - live-show traces for each scenario
187
+ - verdict strings and tool usage summaries
188
+ - optional capability matrix
189
+
190
+ The Benchmark UI is optimized for:
191
+
192
+ - guided suite selection (showcase or diagnostic suite)
193
+ - selecting one showcase example when applicable
194
+ - tuning generation parameters (temperature, top_p, max_tokens, penalties, seed, stop)
195
+ - using an Advanced section for model override, scenario files, profile, and capability cache controls
196
+ - watching the live trace
197
+ - reading the exact scenario input
198
+ - inspecting tool calls and tool results
199
+ - seeing the final verdict clearly
200
+
201
+ ## Verification checklist
202
+
203
+ - `waypoi bench` defaults to showcase behavior.
204
+ - `waypoi bench --list-examples` lists human-readable examples.
205
+ - Benchmark UI loads showcase examples by default.
206
+ - A showcase run shows scenario input, wire request, response, and verdict.
207
+ - Tool-driven examples are skipped clearly when no MCP tools are available.
208
+ - Diagnostic suites still produce capability and regression information.
@@ -0,0 +1,125 @@
1
+ # MCP Tool Governance Guidelines
2
+
3
+ This document is the canonical policy for Waypoi built-in MCP tools (`/mcp`).
4
+
5
+ Scope:
6
+
7
+ - Applies to built-in tools registered in `src/mcp/service.ts`.
8
+ - Does not enforce behavior for external third-party MCP servers managed under `/admin/mcp/*`.
9
+
10
+ ## 1) Tool description standard
11
+
12
+ Every tool description should be concise and action-first:
13
+
14
+ 1. Sentence 1: capability summary (what the tool does).
15
+ 2. Sentence 2: required default behavior for the caller.
16
+ 3. Sentence 3: the biggest pitfall to avoid.
17
+
18
+ Binary-producing tools should explicitly mention file-first output behavior.
19
+
20
+ ## 2) Input schema conventions
21
+
22
+ - Use `snake_case` field names.
23
+ - Include explicit bounds/defaults where relevant.
24
+ - Represent incompatible options as mutually exclusive inputs and validate at runtime.
25
+ - Mark optional non-default behavior clearly (for example `include_data`).
26
+
27
+ ## 3) Output conventions
28
+
29
+ Top-level response shape:
30
+
31
+ - Success: `{ ok: true, ... }`
32
+ - Error: `{ ok: false, error: { type, message } }`
33
+
34
+ For binary-producing tools:
35
+
36
+ - Default to lightweight metadata in responses.
37
+ - Require file output when the tool is binary-producing.
38
+ - Return `file_path` values relative to the output root rather than absolute host paths.
39
+ - Make `file_path` / `file_paths` the canonical small-model result fields.
40
+ - Include raw `url` / `b64_json` only with explicit opt-in (`include_data=true`).
41
+ - Keep `content.text` compact and free of inline base64.
42
+
43
+ ## 4) Error taxonomy
44
+
45
+ Use stable typed errors:
46
+
47
+ - `invalid_request`: parameter validation and contract violations.
48
+ - `no_diffusion_model`: no suitable model available for image generation.
49
+ - `no_vision_model`: no suitable vision-capable text model available for image understanding.
50
+ - `no_video_model`: no suitable video generation model available.
51
+ - `no_video_output`: video generation completed but no video URL was returned.
52
+ - `upstream_error`: upstream/provider failures not attributable to caller input.
53
+ - `forbidden`: endpoint/policy access denied (for route-level guards).
54
+
55
+ Error messages should be deterministic and actionable.
56
+
57
+ ## 5) Operational behavior
58
+
59
+ - Tool handlers should define explicit timeout behavior (for example 60s for image generation, 300s for video generation).
60
+ - Do not silently degrade into inline-only success for binary tools.
61
+ - For binary file-output modes, tools MAY override upstream response format to a byte-bearing format to guarantee file materialization.
62
+ - Retry behavior should be explicit per tool. If no retries are implemented, fail deterministically.
63
+ - In multi-project environments, pin MCP output root via server env:
64
+ - `WAYPOI_MCP_OUTPUT_ROOT=<absolute path>` (default: `~/.config/waypoi`)
65
+ - `WAYPOI_MCP_OUTPUT_SUBDIR=work` (or another controlled relative subdir; default: `generated-images`)
66
+ - `WAYPOI_MCP_STRICT_OUTPUT_ROOT=true` for fail-fast misconfiguration handling.
67
+
68
+ ## 6) Agent behavior guidelines
69
+
70
+ For tool-calling agents:
71
+
72
+ 1. Prefer file output for binary-generating tools.
73
+ 2. Keep responses minimal unless inline data is explicitly needed downstream.
74
+ 3. Avoid repeated expensive calls with unchanged arguments.
75
+ 4. Use `include_data=true` only for explicit transport requirements.
76
+ 5. For image-generation editing, provide at most one source (`image_path` xor `image_url`).
77
+ 6. For image-to-text tools, provide exactly one image source (`image_path` xor `image_url`).
78
+
79
+ Output goes to `~/.config/waypoi/generated-images` by default. Set `WAYPOI_MCP_OUTPUT_ROOT` to redirect.
80
+
81
+ ### Safe-default example (`generate_image`)
82
+
83
+ ```json
84
+ {
85
+ "name": "generate_image",
86
+ "arguments": {
87
+ "prompt": "Minimal icon with clean geometric shape",
88
+ "include_data": false
89
+ }
90
+ }
91
+ ```
92
+
93
+ ### Image-edit example (`generate_image`)
94
+
95
+ ```json
96
+ {
97
+ "name": "generate_image",
98
+ "arguments": {
99
+ "prompt": "Replace the background with a clean studio backdrop",
100
+ "image_path": "./tmp/input.png",
101
+ "include_data": false
102
+ }
103
+ }
104
+ ```
105
+
106
+ ### Image-to-text defaults (`understand_image`)
107
+
108
+ - Exactly one image source is required (`image_path` xor `image_url`).
109
+ - Keep `instruction` concise and task-specific unless broad analysis is needed.
110
+ - Treat top-level `text` as the canonical answer field.
111
+ - For local image files, coordinate-sensitive answers should be expressed in original image pixels even when the upload is resized upstream.
112
+
113
+ ## 7) New MCP tool checklist
114
+
115
+ Before adding a new built-in MCP tool:
116
+
117
+ 1. Description follows the governance template and includes normative guidance.
118
+ 2. Input schema uses `snake_case`, bounds/defaults, and validates incompatible combinations.
119
+ 3. Output shape follows `{ ok: true|false, ... }`, compact `content.text`, and file-first policy for binary payloads.
120
+ 4. Typed errors are stable and mapped to taxonomy.
121
+ 5. Tests cover:
122
+ - policy validation rules,
123
+ - default payload behavior,
124
+ - error paths,
125
+ - tool listing/description visibility.