@mooncompany/uplink-chat 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of @mooncompany/uplink-chat might be problematic. Click here for more details.

Files changed (158) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +185 -0
  3. package/bin/uplink.js +279 -0
  4. package/middleware/error-handler.js +69 -0
  5. package/package.json +93 -0
  6. package/public/css/agents.36b98c0f.css +1469 -0
  7. package/public/css/agents.css +1469 -0
  8. package/public/css/app.a6a7f8f5.css +2731 -0
  9. package/public/css/app.css +2731 -0
  10. package/public/css/artifacts.css +444 -0
  11. package/public/css/commands.css +55 -0
  12. package/public/css/connection.css +131 -0
  13. package/public/css/dashboard.css +233 -0
  14. package/public/css/developer.css +328 -0
  15. package/public/css/files.css +123 -0
  16. package/public/css/markdown.css +156 -0
  17. package/public/css/message-actions.css +278 -0
  18. package/public/css/mobile.css +614 -0
  19. package/public/css/panels-unified.css +483 -0
  20. package/public/css/premium.css +415 -0
  21. package/public/css/realtime.css +189 -0
  22. package/public/css/satellites.css +401 -0
  23. package/public/css/shortcuts.css +185 -0
  24. package/public/css/split-view.4def0262.css +673 -0
  25. package/public/css/split-view.css +673 -0
  26. package/public/css/theme-generator.css +391 -0
  27. package/public/css/themes.css +387 -0
  28. package/public/css/timestamps.css +54 -0
  29. package/public/css/variables.css +78 -0
  30. package/public/dist/bundle.b55050c4.js +15757 -0
  31. package/public/favicon.svg +24 -0
  32. package/public/img/agents/ada.png +0 -0
  33. package/public/img/agents/clarice.png +0 -0
  34. package/public/img/agents/dennis-nedry.png +0 -0
  35. package/public/img/agents/elliot-alderson.png +0 -0
  36. package/public/img/agents/main.png +0 -0
  37. package/public/img/agents/scotty.png +0 -0
  38. package/public/img/agents/top-flight-security.png +0 -0
  39. package/public/index.html +1083 -0
  40. package/public/js/agents-data.js +234 -0
  41. package/public/js/agents-ui.js +72 -0
  42. package/public/js/agents.js +1525 -0
  43. package/public/js/app.js +79 -0
  44. package/public/js/appearance-settings.js +111 -0
  45. package/public/js/artifacts.js +432 -0
  46. package/public/js/audio-queue.js +168 -0
  47. package/public/js/bootstrap.js +54 -0
  48. package/public/js/chat.js +1211 -0
  49. package/public/js/commands.js +581 -0
  50. package/public/js/connection-api.js +121 -0
  51. package/public/js/connection.js +1231 -0
  52. package/public/js/context-tracker.js +271 -0
  53. package/public/js/core.js +172 -0
  54. package/public/js/dashboard.js +452 -0
  55. package/public/js/developer.js +432 -0
  56. package/public/js/encryption.js +124 -0
  57. package/public/js/errors.js +122 -0
  58. package/public/js/event-bus.js +77 -0
  59. package/public/js/fetch-utils.js +171 -0
  60. package/public/js/file-handler.js +229 -0
  61. package/public/js/files.js +352 -0
  62. package/public/js/gateway-chat.js +538 -0
  63. package/public/js/logger.js +112 -0
  64. package/public/js/markdown.js +190 -0
  65. package/public/js/message-actions.js +431 -0
  66. package/public/js/message-renderer.js +288 -0
  67. package/public/js/missed-messages.js +235 -0
  68. package/public/js/mobile-debug.js +95 -0
  69. package/public/js/notifications.js +367 -0
  70. package/public/js/offline-queue.js +178 -0
  71. package/public/js/onboarding.js +543 -0
  72. package/public/js/panels.js +156 -0
  73. package/public/js/premium.js +412 -0
  74. package/public/js/realtime-voice.js +844 -0
  75. package/public/js/satellite-sync.js +256 -0
  76. package/public/js/satellite-ui.js +175 -0
  77. package/public/js/satellites.js +1516 -0
  78. package/public/js/settings.js +1087 -0
  79. package/public/js/shortcuts.js +381 -0
  80. package/public/js/split-chat.js +1234 -0
  81. package/public/js/split-resize.js +211 -0
  82. package/public/js/splitview.js +340 -0
  83. package/public/js/storage.js +408 -0
  84. package/public/js/streaming-handler.js +324 -0
  85. package/public/js/stt-settings.js +316 -0
  86. package/public/js/theme-generator.js +661 -0
  87. package/public/js/themes.js +164 -0
  88. package/public/js/timestamps.js +198 -0
  89. package/public/js/tts-settings.js +575 -0
  90. package/public/js/ui.js +267 -0
  91. package/public/js/update-notifier.js +143 -0
  92. package/public/js/utils/constants.js +165 -0
  93. package/public/js/utils/sanitize.js +93 -0
  94. package/public/js/utils/sse-parser.js +195 -0
  95. package/public/js/voice.js +883 -0
  96. package/public/manifest.json +58 -0
  97. package/public/moon_texture.jpg +0 -0
  98. package/public/sw.js +221 -0
  99. package/public/three.min.js +6 -0
  100. package/server/channel.js +529 -0
  101. package/server/chat.js +270 -0
  102. package/server/config-store.js +362 -0
  103. package/server/config.js +159 -0
  104. package/server/context.js +131 -0
  105. package/server/gateway-commands.js +211 -0
  106. package/server/gateway-proxy.js +318 -0
  107. package/server/index.js +22 -0
  108. package/server/logger.js +89 -0
  109. package/server/middleware/auth.js +188 -0
  110. package/server/middleware.js +218 -0
  111. package/server/openclaw-discover.js +308 -0
  112. package/server/premium/index.js +156 -0
  113. package/server/premium/license.js +140 -0
  114. package/server/realtime/bridge.js +837 -0
  115. package/server/realtime/index.js +349 -0
  116. package/server/realtime/tts-stream.js +446 -0
  117. package/server/routes/agents.js +564 -0
  118. package/server/routes/artifacts.js +174 -0
  119. package/server/routes/chat.js +311 -0
  120. package/server/routes/config-settings.js +345 -0
  121. package/server/routes/config.js +603 -0
  122. package/server/routes/files.js +307 -0
  123. package/server/routes/index.js +18 -0
  124. package/server/routes/media.js +451 -0
  125. package/server/routes/missed-messages.js +107 -0
  126. package/server/routes/premium.js +75 -0
  127. package/server/routes/push.js +156 -0
  128. package/server/routes/satellite.js +406 -0
  129. package/server/routes/status.js +251 -0
  130. package/server/routes/stt.js +35 -0
  131. package/server/routes/voice.js +260 -0
  132. package/server/routes/webhooks.js +203 -0
  133. package/server/routes.js +206 -0
  134. package/server/runtime-config.js +336 -0
  135. package/server/share.js +305 -0
  136. package/server/stt/faster-whisper.js +72 -0
  137. package/server/stt/groq.js +51 -0
  138. package/server/stt/index.js +196 -0
  139. package/server/stt/openai.js +49 -0
  140. package/server/sync.js +244 -0
  141. package/server/tailscale-https.js +175 -0
  142. package/server/tts.js +646 -0
  143. package/server/update-checker.js +172 -0
  144. package/server/utils/filename.js +129 -0
  145. package/server/utils.js +147 -0
  146. package/server/watchdog.js +318 -0
  147. package/server/websocket/broadcast.js +359 -0
  148. package/server/websocket/connections.js +339 -0
  149. package/server/websocket/index.js +215 -0
  150. package/server/websocket/routing.js +277 -0
  151. package/server/websocket/sync.js +102 -0
  152. package/server.js +404 -0
  153. package/utils/detect-tool-usage.js +93 -0
  154. package/utils/errors.js +158 -0
  155. package/utils/html-escape.js +84 -0
  156. package/utils/id-sanitize.js +94 -0
  157. package/utils/response.js +130 -0
  158. package/utils/with-retry.js +105 -0
@@ -0,0 +1,837 @@
1
+ /**
2
+ * Agent Voice Bridge — Routes voice through OpenClaw Gateway
3
+ *
4
+ * Triangle architecture:
5
+ * Browser Mic → OpenAI Realtime API (ears: VAD + transcription only)
6
+ * ↓ transcript
7
+ * Uplink Bridge (this module)
8
+ * ↓ text message
9
+ * OpenClaw Gateway (brain)
10
+ * ↓ streaming text response
11
+ * TTS Engine (mouth)
12
+ * ↓ PCM audio chunks
13
+ * Browser Speaker (via WebSocket)
14
+ *
15
+ * Key difference from index.js (standalone relay):
16
+ * - OpenAI Realtime opened with `modalities: ['text']` — no audio output from OpenAI
17
+ * - Transcription events intercepted → routed to Gateway instead of letting OpenAI respond
18
+ * - Gateway response streamed → TTS → PCM audio sent back to client
19
+ * - Client plays audio directly via AudioContext (does NOT inject back into OpenAI)
20
+ *
21
+ * Custom events sent to client:
22
+ * { type: 'bridge.transcript', text } — user's transcribed speech
23
+ * { type: 'bridge.response.delta', text } — streaming text chunk from gateway
24
+ * { type: 'bridge.audio', audio: base64PCM } — TTS audio chunk
25
+ * { type: 'bridge.response.done' } — response complete
26
+ * { type: 'bridge.status', status } — 'thinking', 'speaking', etc.
27
+ * { type: 'bridge.error', error } — error message
28
+ *
29
+ * Usage (in server.js):
30
+ * import { setupAgentVoiceBridge } from './server/realtime/bridge.js';
31
+ * setupAgentVoiceBridge(httpServer);
32
+ */
33
+
34
+ import { WebSocketServer, WebSocket } from 'ws';
35
+ import { createLogger } from '../logger.js';
36
+ import { loadConfig } from '../runtime-config.js';
37
+ import { ALLOWED_ORIGINS, STREAM_READ_TIMEOUT_MS } from '../config.js';
38
+ import { streamTTS, splitSentences } from './tts-stream.js';
39
+
40
+ const log = createLogger('voice-bridge');
41
+
42
+ // ─── Constants ──────────────────────────────────────────────────────────────
43
+
44
+ const OPENAI_REALTIME_BASE = 'wss://api.openai.com/v1/realtime'; // base URL; `?model=…` is appended per session
45
+ const DEFAULT_MODEL = 'gpt-4o-mini-realtime-preview'; // used when config.realtimeModel is not set
46
+ const MAX_BRIDGE_CONNECTIONS = 5;
47
+
48
+ // Gateway streaming constants (timeouts for the gateway chat-completions stream)
49
+ const GATEWAY_FETCH_TIMEOUT_MS = 300_000; // 5 minutes for long responses. NOTE(review): not referenced in the visible part of this file — the gateway fetch is issued without a timeout; confirm
50
+ const GATEWAY_STREAM_TIMEOUT_MS = STREAM_READ_TIMEOUT_MS || 300_000; // max wait for one stream read; 5 minutes when STREAM_READ_TIMEOUT_MS is falsy
51
+
52
+ // Connection bookkeeping
53
+ const activeSessions = new Map(); // sessionId → { clientWs, openaiWs }, added on connect and removed in cleanup()
54
+
55
+ // ─── Helpers ────────────────────────────────────────────────────────────────
56
+
57
/**
 * Verify that the WebSocket upgrade origin is allowed.
 *
 * Accepted origins:
 *  - missing/empty Origin header (non-browser clients don't send one)
 *  - loopback hosts: localhost, 127.0.0.1 and the IPv6 loopback
 *  - any host ending in `.ts.net`
 *  - any host that matches the hostname of an entry in ALLOWED_ORIGINS
 *
 * Only the hostname is compared; scheme and port are ignored.
 *
 * @param {string|undefined|null} origin - Value of the `Origin` request header
 * @returns {boolean} true when the origin may open a bridge connection
 */
function verifyOrigin(origin) {
  if (!origin) return true; // Non-browser clients don't send Origin

  try {
    const host = new URL(origin).hostname;

    // URL#hostname keeps the brackets around IPv6 literals, so the IPv6
    // loopback is reported as "[::1]" — a bare "::1" can never match.
    const loopbackHosts = ['localhost', '127.0.0.1', '[::1]'];
    if (loopbackHosts.includes(host)) return true;

    // NOTE(review): this accepts every *.ts.net host, not only hosts of a
    // specific tailnet — confirm that is the intended trust boundary.
    if (host.endsWith('.ts.net')) return true;

    return ALLOWED_ORIGINS.some((allowed) => {
      try {
        return new URL(allowed).hostname === host;
      } catch {
        // A malformed allow-list entry simply never matches.
        return false;
      }
    });
  } catch {
    // Unparseable Origin header (or unusable allow-list) → reject.
    return false;
  }
}
74
+
75
/**
 * Deliver an event object to a WebSocket as a JSON text frame.
 *
 * Nothing is sent unless the socket is currently OPEN. Serialization or send
 * failures are logged and swallowed, so callers never need to guard this call.
 *
 * @param {WebSocket} ws - Target socket
 * @param {Object} event - Event payload to serialize
 */
function sendToClient(ws, event) {
  if (ws.readyState !== WebSocket.OPEN) return;

  try {
    const payload = JSON.stringify(event);
    ws.send(payload);
  } catch (err) {
    log.error('Failed to send to client:', err.message);
  }
}
87
+
88
/**
 * Build the `session.update` for OpenAI Realtime in bridge mode.
 * Key difference: modalities=['text'] only — no audio output from OpenAI.
 * We keep input_audio_format=pcm16 so mic audio flows through for transcription.
 *
 * @param {string} [voice] - Realtime voice name; defaults to 'marin'
 * @param {number|string} [vadSilenceDurationMs] - Requested VAD silence window in ms.
 *   Missing, zero or non-numeric values fall back to 400; the result is clamped to 200–1500.
 * @returns {string} JSON-encoded `session.update` event
 */
function buildBridgeSessionUpdate(voice, vadSilenceDurationMs) {
  const DEFAULT_SILENCE_MS = 400;
  const MIN_SILENCE_MS = 200;
  const MAX_SILENCE_MS = 1500;

  // A non-numeric setting would otherwise flow through Math.min/Math.max as NaN
  // and be serialized as `silence_duration_ms: null`.
  const requested = Number(vadSilenceDurationMs);
  const usable = Number.isNaN(requested) || requested === 0 ? DEFAULT_SILENCE_MS : requested;
  const silenceMs = Math.max(MIN_SILENCE_MS, Math.min(MAX_SILENCE_MS, usable));

  return JSON.stringify({
    type: 'session.update',
    session: {
      modalities: ['text'],
      instructions: 'Transcribe the user\'s speech accurately. Do not generate responses.',
      voice: voice || 'marin',
      input_audio_format: 'pcm16',
      input_audio_transcription: {
        model: 'whisper-1',
      },
      turn_detection: {
        type: 'server_vad',
        threshold: 0.65,
        prefix_padding_ms: 200,
        silence_duration_ms: silenceMs,
      },
    },
  });
}
114
+
115
/**
 * Derive the TTS engine settings for the voice bridge from the runtime config.
 *
 * The engine is the configured agent-voice engine; when none is set it is
 * 'openai' if an OpenAI API key exists and 'edge' otherwise. The agent voice
 * (default 'echo') is applied to the OpenAI voice field only when the engine
 * is 'openai'; otherwise the regular OpenAI voice setting (default 'nova') is kept.
 *
 * @param {Object} runtimeConfig - The loaded runtime config
 * @returns {Object} TTS config for streamTTS()
 */
function buildTTSConfig(runtimeConfig) {
  const {
    agentVoiceTtsEngine,
    agentVoiceTtsVoice,
    openaiApiKey,
    openaiTtsVoice,
    openaiTtsModel,
    edgeTtsVoice,
  } = runtimeConfig;

  let engine = agentVoiceTtsEngine;
  if (!engine) {
    engine = openaiApiKey ? 'openai' : 'edge';
  }

  let resolvedOpenaiVoice;
  if (engine === 'openai') {
    resolvedOpenaiVoice = agentVoiceTtsVoice || 'echo';
  } else {
    resolvedOpenaiVoice = openaiTtsVoice || 'nova';
  }

  return {
    engine,
    openaiApiKey,
    openaiTtsVoice: resolvedOpenaiVoice,
    openaiTtsModel: openaiTtsModel || 'tts-1',
    edgeTtsVoice: edgeTtsVoice || 'en-US-GuyNeural',
  };
}
136
+
137
/**
 * Read the next chunk from a stream reader, rejecting if nothing arrives in time.
 * The timer is cleared as soon as the race settles so completed reads do not
 * leave long-lived timeouts behind.
 *
 * @param {ReadableStreamDefaultReader} reader - Reader to pull from
 * @param {number} timeoutMs - Maximum time to wait for this read
 * @returns {Promise<{ done: boolean, value?: Uint8Array }>}
 */
function readChunkWithTimeout(reader, timeoutMs) {
  let timer;
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error('Gateway stream read timed out')), timeoutMs);
  });
  return Promise.race([reader.read(), deadline]).finally(() => clearTimeout(timer));
}

/**
 * Emit every complete sentence/clause found at the start of `text` and return
 * the unconsumed remainder.
 *
 * @param {string} text - Accumulated, not yet spoken text
 * @param {Function} emit - Called with each trimmed, non-empty clause
 * @returns {string} Text that does not yet end in a boundary
 */
function emitCompleteClauses(text, emit) {
  let rest = text;
  while (rest.length > 0) {
    // Prefer sentence boundaries: terminal punctuation followed by whitespace.
    let match = rest.match(/^(.*?[.!?])\s/s);
    if (!match && rest.length >= 20) {
      // Fall back to clause boundaries (after 15+ chars) for faster first audio.
      // Comma/semicolon/colon/hyphen only count when followed by whitespace, so
      // "state-of-the-art", "1,000" or "10:30" are not cut in the middle.
      match = rest.match(/^(.{15,}?(?:[,;:\-](?=\s)|[—\n]))\s*/s);
    }
    if (!match) break;

    rest = rest.slice(match[0].length);
    const clause = match[1].trim();
    if (clause) emit(clause);
  }
  return rest;
}

/**
 * Stream a message to the OpenClaw gateway and process the SSE response.
 * Returns the full assembled response text.
 *
 * Each text delta is forwarded to the client as `bridge.response.delta`; the
 * first detected tool call triggers a single `bridge.status: thinking` event.
 *
 * @param {Object} params
 * @param {string} params.message - User's transcribed message
 * @param {Object} params.config - Runtime config (gatewayUrl, gatewayToken, voiceModel)
 * @param {string} params.sessionId - Session ID for logging
 * @param {WebSocket} params.clientWs - Client WebSocket for streaming events
 * @param {AbortSignal} params.signal - Abort signal for cancellation
 * @param {Function} [params.onSentence] - Callback fired with each complete sentence during streaming
 * @returns {Promise<{ response: string, tools: string[] }>}
 * @throws {Error} When the gateway URL is missing, the gateway answers with a
 *   non-2xx status or no body, a stream read times out, or `onSentence` throws
 */
async function streamGatewayResponse({ message, config, sessionId, clientWs, signal, onSentence }) {
  const { gatewayUrl, gatewayToken } = config;

  if (!gatewayUrl) {
    throw new Error('Gateway URL not configured');
  }

  // Build session key — bridge shares the main session
  const sessionKey = 'agent:main:main';

  const url = `${gatewayUrl}/v1/chat/completions`;
  log.info(`[${sessionId}] Gateway request: ${url}, session=${sessionKey}`);

  const response = await fetch(url, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${gatewayToken}`,
      'x-openclaw-session-key': sessionKey,
    },
    body: JSON.stringify({
      model: config.voiceModel || 'openclaw',
      user: 'uplink-voice',
      stream: true,
      // NOTE(review): replies are capped at 100 tokens — confirm this is intended.
      max_tokens: 100,
      stream_options: { include_usage: true },
      messages: [
        {
          role: 'user',
          content: `[Voice chat via Agent Voice Bridge — keep response conversational and concise] ${message}`,
        },
      ],
    }),
    signal,
  });

  if (!response.ok) {
    const errText = await response.text().catch(() => 'Unknown error');
    throw new Error(`Gateway error ${response.status}: ${errText}`);
  }

  if (!response.body) {
    throw new Error('Gateway response has no body');
  }

  // Parse the SSE stream
  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  const detectedTools = [];
  let buffer = '';
  let fullResponse = '';
  let sentenceAccum = '';
  let sentThinking = false;

  // Handle one SSE line. Only JSON parsing is best-effort; errors raised by
  // `onSentence` propagate to the caller instead of being silently dropped.
  const handleLine = (line) => {
    if (!line.startsWith('data:')) return;
    const data = line.slice(5).trim(); // tolerates "data:" without a space and CRLF endings
    if (!data || data === '[DONE]') return;

    let parsed;
    try {
      parsed = JSON.parse(data);
    } catch {
      return; // Skip unparseable chunks
    }

    const delta = parsed?.choices?.[0]?.delta;
    if (!delta) return;

    // Detect tool calls — a single delta may carry several
    const toolCalls = Array.isArray(delta.tool_calls) ? delta.tool_calls : [];
    for (const call of toolCalls) {
      const toolName = call?.function?.name;
      if (!toolName || detectedTools.includes(toolName)) continue;

      detectedTools.push(toolName);
      if (!sentThinking) {
        sendToClient(clientWs, { type: 'bridge.status', status: 'thinking' });
        sentThinking = true;
      }
      log.info(`[${sessionId}] Tool detected: ${toolName}`);
    }

    // Stream text content
    if (delta.content) {
      fullResponse += delta.content;
      sendToClient(clientWs, {
        type: 'bridge.response.delta',
        text: delta.content,
      });

      // Real-time clause detection for pipelined TTS
      if (onSentence) {
        sentenceAccum = emitCompleteClauses(sentenceAccum + delta.content, onSentence);
      }
    }
  };

  try {
    while (!signal?.aborted) {
      const { done, value } = await readChunkWithTimeout(reader, GATEWAY_STREAM_TIMEOUT_MS);

      if (done) {
        // Flush the decoder and handle a final line that lacks a trailing newline.
        buffer += decoder.decode();
        if (buffer) handleLine(buffer);
        buffer = '';
        break;
      }

      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\n');
      buffer = lines.pop() || '';

      for (const line of lines) {
        handleLine(line);
      }
    }
  } finally {
    // Cancel before releasing so a timed-out or abandoned stream is torn down
    // rather than left open; cancelling a finished or errored stream is harmless.
    await reader.cancel().catch(() => {});
    reader.releaseLock();
  }

  // Flush any remaining text as a final sentence (nothing to speak after an abort)
  if (onSentence && !signal?.aborted && sentenceAccum.trim()) {
    onSentence(sentenceAccum.trim());
  }

  return { response: fullResponse, tools: detectedTools };
}
296
+
297
/**
 * Synthesize a finished response sentence by sentence and stream the audio
 * to the client as base64-encoded `bridge.audio` events.
 *
 * Emits `bridge.status: speaking` once before the first sentence. A TTS failure
 * on one sentence is reported as `bridge.error` and does not stop the rest.
 * Work stops early when the signal is aborted or the client socket is no longer open.
 *
 * @param {Object} params
 * @param {string} params.text - Full response text
 * @param {Object} params.ttsConfig - TTS engine configuration
 * @param {WebSocket} params.clientWs - Client WebSocket
 * @param {string} params.sessionId - Session ID for logging
 * @param {AbortSignal} params.signal - Abort signal
 */
async function processResponseTTS({ text, ttsConfig, clientWs, sessionId, signal }) {
  const sentences = splitSentences(text);

  if (sentences.length === 0) {
    log.warn(`[${sessionId}] No sentences to synthesize`);
    return;
  }

  log.info(`[${sessionId}] TTS: ${sentences.length} sentence(s), engine=${ttsConfig.engine}`);
  sendToClient(clientWs, { type: 'bridge.status', status: 'speaking' });

  // Cancelled by the caller, or nobody left to listen.
  const shouldStop = () => Boolean(signal?.aborted) || clientWs.readyState !== WebSocket.OPEN;

  for (const sentence of sentences) {
    if (shouldStop()) break;

    try {
      for await (const pcmChunk of streamTTS(sentence, ttsConfig)) {
        if (shouldStop()) break;

        // PCM travels to the client as base64 inside a JSON event
        const audio = pcmChunk.toString('base64');
        sendToClient(clientWs, { type: 'bridge.audio', audio });
      }
    } catch (err) {
      log.error(`[${sessionId}] TTS error for sentence:`, err.message);
      sendToClient(clientWs, {
        type: 'bridge.error',
        error: `TTS failed: ${err.message}`,
      });
      // Keep going — the client still has the text even if this audio failed
    }
  }
}
344
+
345
/**
 * Persist a user/assistant message pair to chat history.
 *
 * `saveMessageToSync` is imported lazily from '../routes.js' (avoids a circular
 * dependency at module load). The user message is written first, then the
 * assistant message; empty values are skipped. Failures are logged, never thrown.
 *
 * @param {string} userText - User's transcribed message
 * @param {string} assistantText - Agent's response
 * @param {string} sessionId - Session ID for logging
 */
async function saveToHistory(userText, assistantText, sessionId) {
  try {
    const { saveMessageToSync } = await import('../routes.js');

    const entries = [
      ['user', userText],
      ['assistant', assistantText],
    ];

    for (const [role, text] of entries) {
      if (!text) continue;
      await saveMessageToSync(role, text);
      log.debug(`[${sessionId}] Saved ${role} message to history`);
    }
  } catch (err) {
    log.error(`[${sessionId}] Failed to save to history:`, err.message);
  }
}
372
+
373
/**
 * Push a chat message to connected WebSocket clients for cross-device sync.
 *
 * The websocket module is imported lazily; a fresh message id is generated for
 * every broadcast and the message is sent on the 'main' channel argument.
 * Failures are logged as warnings and never thrown.
 *
 * @param {string} role - 'user' or 'assistant'
 * @param {string} text - Message text
 */
async function broadcastSync(role, text) {
  try {
    const { broadcastSyncMessage, generateMessageId } = await import('../websocket/index.js');
    broadcastSyncMessage(role, text, 'main', generateMessageId());
  } catch (err) {
    log.warn('Failed to broadcast sync message:', err.message);
  }
}
388
+
389
+ // ─── Main Bridge Logic ─────────────────────────────────────────────────────
390
+
391
+ /**
392
+ * Handle a single bridge session: client WS + OpenAI Realtime WS + Gateway.
393
+ *
394
+ * @param {WebSocket} clientWs - Client WebSocket connection
395
+ * @param {import('http').IncomingMessage} req - HTTP upgrade request
396
+ */
397
+ async function handleBridgeConnection(clientWs, req) {
398
+ const sessionId = `bridge-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
399
+ const clientIp = req.socket.remoteAddress;
400
+ log.info(`Bridge client connected: ${sessionId} from ${clientIp}`);
401
+
402
+ // ── Load runtime config ─────────────────────────────────────────────
403
+ let config;
404
+ try {
405
+ config = await loadConfig();
406
+ } catch (err) {
407
+ log.error('Failed to load runtime config:', err);
408
+ clientWs.close(1011, 'Server configuration error');
409
+ return;
410
+ }
411
+
412
+ const apiKey = config.openaiApiKey;
413
+ if (!apiKey) {
414
+ log.error('No OpenAI API key — cannot open realtime session');
415
+ sendToClient(clientWs, {
416
+ type: 'bridge.error',
417
+ error: 'No OpenAI API key configured. Add your key in Settings → Voice.',
418
+ });
419
+ clientWs.close(1008, 'Missing API key');
420
+ return;
421
+ }
422
+
423
+ if (!config.gatewayUrl || !config.gatewayToken) {
424
+ log.error('Gateway not configured — cannot use agent voice bridge');
425
+ sendToClient(clientWs, {
426
+ type: 'bridge.error',
427
+ error: 'OpenClaw Gateway not configured. Check Settings.',
428
+ });
429
+ clientWs.close(1008, 'Gateway not configured');
430
+ return;
431
+ }
432
+
433
+ const model = config.realtimeModel || DEFAULT_MODEL;
434
+ const voice = config.realtimeVoice || 'marin';
435
+ const vadSilenceDurationMs = config.vadSilenceDurationMs || 400;
436
+ const ttsConfig = buildTTSConfig(config);
437
+
438
+ // ── Open OpenAI Realtime WebSocket ──────────────────────────────────
439
+ const openaiUrl = `${OPENAI_REALTIME_BASE}?model=${encodeURIComponent(model)}`;
440
+ let openaiWs;
441
+ try {
442
+ openaiWs = new WebSocket(openaiUrl, {
443
+ headers: {
444
+ 'Authorization': `Bearer ${apiKey}`,
445
+ 'OpenAI-Beta': 'realtime=v1',
446
+ },
447
+ perMessageDeflate: true,
448
+ });
449
+ } catch (err) {
450
+ log.error('Failed to create OpenAI WebSocket:', err);
451
+ clientWs.close(1011, 'Failed to connect to OpenAI');
452
+ return;
453
+ }
454
+
455
+ // ── Session state ───────────────────────────────────────────────────
456
+ let clientClosed = false;
457
+ let openaiClosed = false;
458
+ let processingResponse = false;
459
+ let activeAbort = null; // AbortController for current gateway request
460
+
461
+ activeSessions.set(sessionId, { clientWs, openaiWs });
462
+
463
+ // Keepalive
464
+ const keepalive = setInterval(() => {
465
+ try {
466
+ if (clientWs.readyState === WebSocket.OPEN) clientWs.ping();
467
+ if (openaiWs.readyState === WebSocket.OPEN) openaiWs.ping();
468
+ } catch { /* swallow */ }
469
+ }, 15_000);
470
+
471
+ function cleanup() {
472
+ clearInterval(keepalive);
473
+ activeSessions.delete(sessionId);
474
+ if (activeAbort) {
475
+ activeAbort.abort();
476
+ activeAbort = null;
477
+ }
478
+ if (!clientClosed && clientWs.readyState === WebSocket.OPEN) {
479
+ clientWs.close();
480
+ }
481
+ if (!openaiClosed && openaiWs.readyState === WebSocket.OPEN) {
482
+ openaiWs.close();
483
+ }
484
+ }
485
+
486
+ // ── Process a user transcript through the gateway ───────────────────
487
+
488
+ const MIN_BARGE_IN_WORDS = 3; // Don't barge-in for short/ambiguous transcripts
489
+
490
+ async function processTranscript(transcript) {
491
+ if (!transcript || !transcript.trim()) return;
492
+
493
+ const trimmed = transcript.trim();
494
+ const wordCount = trimmed.split(/\s+/).filter(w => w.length > 0).length;
495
+
496
+ log.info(`[${sessionId}] Transcript: "${trimmed.substring(0, 200)}" (${wordCount} words)`);
497
+
498
+ // If a response is in progress, only barge-in for substantial speech
499
+ // Short fragments (1-2 words) during a response are likely noise/echo
500
+ if (activeAbort) {
501
+ if (wordCount < MIN_BARGE_IN_WORDS) {
502
+ log.info(`[${sessionId}] Skipping short transcript during active response: "${trimmed}"`);
503
+ return;
504
+ }
505
+ log.info(`[${sessionId}] Barge-in: aborting previous response`);
506
+ activeAbort.abort();
507
+ }
508
+
509
+ processingResponse = true;
510
+ activeAbort = new AbortController();
511
+ const { signal } = activeAbort;
512
+
513
+ // Send transcript to client
514
+ sendToClient(clientWs, { type: 'bridge.transcript', text: trimmed });
515
+
516
+ // Broadcast user message for cross-device sync
517
+ broadcastSync('user', trimmed);
518
+
519
+ try {
520
+ // Send to gateway and stream response with pipelined TTS
521
+ sendToClient(clientWs, { type: 'bridge.status', status: 'thinking' });
522
+
523
+ // Queue for sentences that arrive during streaming
524
+ const sentenceQueue = [];
525
+ let ttsRunning = false;
526
+ let streamDone = false;
527
+ let sentenceCount = 0;
528
+
529
+ // TTS consumer — processes sentences from queue as they arrive
530
+ const processTTSQueue = async () => {
531
+ if (ttsRunning) return;
532
+ ttsRunning = true;
533
+ while (sentenceQueue.length > 0) {
534
+ if (signal?.aborted) break;
535
+ if (clientWs.readyState !== WebSocket.OPEN) break;
536
+
537
+ const sentence = sentenceQueue.shift();
538
+ const clean = sentence.replace(/MEDIA:.+?(?:\n|$)/g, '').trim();
539
+ if (!clean) continue;
540
+
541
+ sentenceCount++;
542
+ if (sentenceCount === 1) {
543
+ sendToClient(clientWs, { type: 'bridge.status', status: 'speaking' });
544
+ }
545
+
546
+ try {
547
+ for await (const pcmChunk of streamTTS(clean, ttsConfig)) {
548
+ if (signal?.aborted) break;
549
+ if (clientWs.readyState !== WebSocket.OPEN) break;
550
+ sendToClient(clientWs, {
551
+ type: 'bridge.audio',
552
+ audio: pcmChunk.toString('base64'),
553
+ });
554
+ }
555
+ } catch (err) {
556
+ log.error(`[${sessionId}] TTS error:`, err.message);
557
+ }
558
+ }
559
+ ttsRunning = false;
560
+ };
561
+
562
+ // Stream gateway with sentence callback — TTS starts on first sentence
563
+ const { response: gatewayResponse } = await streamGatewayResponse({
564
+ message: trimmed,
565
+ config,
566
+ sessionId,
567
+ clientWs,
568
+ signal,
569
+ onSentence: (sentence) => {
570
+ sentenceQueue.push(sentence);
571
+ processTTSQueue(); // kick TTS if not already running
572
+ },
573
+ });
574
+
575
+ if (signal.aborted) {
576
+ log.info(`[${sessionId}] Response aborted (barge-in)`);
577
+ return;
578
+ }
579
+
580
+ // Wait for TTS queue to drain
581
+ while (sentenceQueue.length > 0 || ttsRunning) {
582
+ if (signal?.aborted) break;
583
+ await new Promise(r => setTimeout(r, 50));
584
+ }
585
+
586
+ const cleanResponse = (gatewayResponse || '').replace(/MEDIA:.+?(?:\n|$)/g, '').trim();
587
+ log.info(`[${sessionId}] Complete: ${cleanResponse.length} chars, ${sentenceCount} sentences TTS'd`);
588
+
589
+ // Signal response complete
590
+ sendToClient(clientWs, { type: 'bridge.response.done' });
591
+
592
+ // Save to chat history (async, don't block)
593
+ if (cleanResponse) {
594
+ saveToHistory(trimmed, cleanResponse, sessionId);
595
+ broadcastSync('assistant', cleanResponse);
596
+ }
597
+
598
+ } catch (err) {
599
+ if (err.name === 'AbortError' || signal.aborted) {
600
+ log.info(`[${sessionId}] Response processing aborted`);
601
+ } else {
602
+ log.error(`[${sessionId}] Error processing transcript:`, err.message);
603
+ sendToClient(clientWs, {
604
+ type: 'bridge.error',
605
+ error: `Processing failed: ${err.message}`,
606
+ });
607
+ sendToClient(clientWs, { type: 'bridge.response.done' });
608
+ }
609
+ } finally {
610
+ processingResponse = false;
611
+ if (activeAbort?.signal === signal) {
612
+ activeAbort = null;
613
+ }
614
+ }
615
+ }
616
+
617
+ // ── OpenAI Realtime WebSocket events ────────────────────────────────
618
+
619
+ openaiWs.on('open', () => {
620
+ log.info(`[${sessionId}] OpenAI Realtime connected (model=${model})`);
621
+
622
+ // Send bridge-mode session update (text-only modalities, no audio output)
623
+ try {
624
+ const sessionUpdate = buildBridgeSessionUpdate(voice, vadSilenceDurationMs);
625
+ log.debug(`[${sessionId}] Sending bridge session.update (VAD silence: ${vadSilenceDurationMs}ms)`);
626
+ openaiWs.send(sessionUpdate);
627
+ } catch (err) {
628
+ log.error(`[${sessionId}] Failed to send session.update:`, err);
629
+ }
630
+
631
+ // Notify client that bridge is ready
632
+ sendToClient(clientWs, { type: 'relay.ready' });
633
+ });
634
+
635
+ openaiWs.on('message', (data, isBinary) => {
636
+ if (isBinary) {
637
+ // In bridge mode we shouldn't get binary audio from OpenAI (modalities=['text']),
638
+ // but if we do, just drop it
639
+ return;
640
+ }
641
+
642
+ try {
643
+ const event = JSON.parse(data.toString());
644
+ const eventType = event.type;
645
+
646
+ // ── Intercept transcription events ──────────────────────────────
647
+ if (eventType === 'conversation.item.input_audio_transcription.completed') {
648
+ // This is the main event — user speech has been transcribed
649
+ const transcript = event.transcript;
650
+ log.info(`[${sessionId}] OpenAI transcription completed: "${(transcript || '').substring(0, 100)}"`);
651
+ // Process async — don't block the WebSocket message handler
652
+ processTranscript(transcript);
653
+ // Do NOT forward to client — we handle it ourselves
654
+ return;
655
+ }
656
+
657
+ // ── Forward select events to client (for UI state) ─────────────
658
+ // These events are useful for the client's VAD/speech indicators
659
+ switch (eventType) {
660
+ case 'session.created':
661
+ case 'session.updated':
662
+ // Let client know session is configured
663
+ sendToClient(clientWs, event);
664
+ break;
665
+
666
+ case 'input_audio_buffer.speech_started':
667
+ // User started speaking — useful for UI animation
668
+ sendToClient(clientWs, event);
669
+ // If agent is speaking, this is a barge-in
670
+ if (processingResponse && activeAbort) {
671
+ log.info(`[${sessionId}] Barge-in detected: user started speaking`);
672
+ activeAbort.abort();
673
+ }
674
+ break;
675
+
676
+ case 'input_audio_buffer.speech_stopped':
677
+ case 'input_audio_buffer.committed':
678
+ // User stopped speaking — forward for UI
679
+ sendToClient(clientWs, event);
680
+ break;
681
+
682
+ case 'conversation.item.input_audio_transcription.failed':
683
+ // Transcription failed
684
+ log.warn(`[${sessionId}] Transcription failed:`, JSON.stringify(event.error));
685
+ sendToClient(clientWs, {
686
+ type: 'bridge.error',
687
+ error: 'Speech transcription failed — please try again',
688
+ });
689
+ break;
690
+
691
+ case 'error':
692
+ // OpenAI error — forward to client
693
+ log.error(`[${sessionId}] OpenAI error:`, JSON.stringify(event));
694
+ sendToClient(clientWs, {
695
+ type: 'bridge.error',
696
+ error: event.error?.message || 'OpenAI Realtime error',
697
+ });
698
+ break;
699
+
700
+ case 'rate_limits.updated':
701
+ // Log rate limits but don't forward
702
+ log.debug(`[${sessionId}] Rate limits: ${JSON.stringify(event.rate_limits)}`);
703
+ break;
704
+
705
+ default:
706
+ // Drop all other events (response.*, conversation.item.created, etc.)
707
+ // In bridge mode, OpenAI shouldn't be generating responses
708
+ log.debug(`[${sessionId}] Dropping OpenAI event: ${eventType}`);
709
+ break;
710
+ }
711
+ } catch (err) {
712
+ log.error(`[${sessionId}] Error parsing OpenAI message:`, err.message);
713
+ }
714
+ });
715
+
716
+ openaiWs.on('close', (code, reason) => {
717
+ openaiClosed = true;
718
+ const reasonStr = reason?.toString() || '';
719
+ log.info(`[${sessionId}] OpenAI disconnected (code=${code}${reasonStr ? ', reason=' + reasonStr : ''})`);
720
+ sendToClient(clientWs, {
721
+ type: 'bridge.error',
722
+ error: 'Voice connection lost — please reconnect',
723
+ });
724
+ cleanup();
725
+ });
726
+
727
+ openaiWs.on('error', (err) => {
728
+ log.error(`[${sessionId}] OpenAI error:`, err.message);
729
+ sendToClient(clientWs, {
730
+ type: 'bridge.error',
731
+ error: `Voice connection error: ${err.message}`,
732
+ });
733
+ cleanup();
734
+ });
735
+
736
+ // ── Client WebSocket events ─────────────────────────────────────────
737
+
738
+ clientWs.on('message', (data, isBinary) => {
739
+ // Forward everything from client to OpenAI (mic audio, session updates, etc.)
740
+ if (openaiWs.readyState === WebSocket.OPEN) {
741
+ try {
742
+ if (isBinary) {
743
+ // Binary = mic audio — forward directly
744
+ openaiWs.send(data);
745
+ } else {
746
+ // Text = JSON commands from client
747
+ const msgStr = data.toString();
748
+ try {
749
+ const msg = JSON.parse(msgStr);
750
+ log.debug(`[${sessionId}] Client event: ${msg.type}`);
751
+
752
+ // Intercept bridge-specific commands
753
+ if (msg.type === 'bridge.cancel') {
754
+ // Client wants to cancel current response
755
+ if (activeAbort) {
756
+ log.info(`[${sessionId}] Client cancelled response`);
757
+ activeAbort.abort();
758
+ }
759
+ return;
760
+ }
761
+ } catch { /* not JSON, just forward */ }
762
+
763
+ openaiWs.send(msgStr);
764
+ }
765
+ } catch (err) {
766
+ log.error(`[${sessionId}] Error forwarding client → OpenAI:`, err.message);
767
+ }
768
+ }
769
+ });
770
+
771
+ clientWs.on('close', (code) => {
772
+ clientClosed = true;
773
+ log.info(`[${sessionId}] Client disconnected (code=${code})`);
774
+ cleanup();
775
+ });
776
+
777
+ clientWs.on('error', (err) => {
778
+ log.error(`[${sessionId}] Client error:`, err.message);
779
+ cleanup();
780
+ });
781
+ }
782
+
783
+ // ─── Public API ─────────────────────────────────────────────────────────────
784
+
785
/**
 * Set up the Agent Voice Bridge WebSocket endpoint on the given HTTP server.
 *
 * Registers an `upgrade` listener that claims requests for either
 * `/api/realtime/bridge` or `/api/realtime?mode=agent`, using noServer mode
 * (same pattern as setupRealtimeRelay in index.js) so it coexists with other
 * WebSocket servers. Requests for any other path are left untouched so that
 * other `upgrade` listeners can handle them.
 *
 * A claimed request is rejected with 403 when `verifyOrigin` refuses its
 * Origin header, and with 429 when `activeSessions` has already reached
 * `MAX_BRIDGE_CONNECTIONS`. Otherwise the upgrade is completed and the new
 * socket is handed to `handleBridgeConnection`.
 *
 * @param {import('http').Server} server - The HTTP server instance
 * @returns {WebSocketServer} The WebSocket server instance
 */
export function setupAgentVoiceBridge(server) {
  const wss = new WebSocketServer({ noServer: true, perMessageDeflate: false });

  // Write a bare HTTP status line to the raw socket and tear it down.
  const rejectUpgrade = (socket, statusLine) => {
    socket.write(`HTTP/1.1 ${statusLine}\r\n\r\n`);
    socket.destroy();
  };

  // ── Handle upgrade on /api/realtime/bridge or /api/realtime?mode=agent ──
  server.on('upgrade', (request, socket, head) => {
    // The request target and Host header are attacker-controlled. `new URL`
    // throws a TypeError on malformed input (e.g. a target of `//`), and an
    // exception escaping this listener would be an uncaught exception. Parse
    // once, guarded, and reuse the result for both pathname and query.
    let url;
    try {
      url = new URL(request.url, `http://${request.headers.host}`);
    } catch (err) {
      log.warn(`Ignoring upgrade with unparseable URL: ${err.message}`);
      return; // Not routable by us; leave the socket for other handlers
    }

    const { pathname, searchParams } = url;
    const isAgentMode = pathname === '/api/realtime' && searchParams.get('mode') === 'agent';
    const isBridgePath = pathname === '/api/realtime/bridge';
    if (!isAgentMode && !isBridgePath) return; // Let other handlers take it

    // Origin check
    const origin = request.headers.origin;
    if (!verifyOrigin(origin)) {
      log.warn(`Rejected upgrade from invalid origin: ${origin}`);
      rejectUpgrade(socket, '403 Forbidden');
      return;
    }

    // Connection limit
    if (activeSessions.size >= MAX_BRIDGE_CONNECTIONS) {
      log.warn(`Rejected upgrade — at capacity (${activeSessions.size}/${MAX_BRIDGE_CONNECTIONS})`);
      rejectUpgrade(socket, '429 Too Many Requests');
      return;
    }

    wss.handleUpgrade(request, socket, head, (ws) => {
      wss.emit('connection', ws, request);
    });
  });

  // ── New client connection ───────────────────────────────────────────
  wss.on('connection', (clientWs, req) => {
    handleBridgeConnection(clientWs, req);
  });

  log.info('Agent Voice Bridge ready at /api/realtime?mode=agent');
  return wss;
}