@mooncompany/uplink-chat 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of @mooncompany/uplink-chat might be problematic. Click here for more details.
- package/LICENSE +21 -0
- package/README.md +185 -0
- package/bin/uplink.js +279 -0
- package/middleware/error-handler.js +69 -0
- package/package.json +93 -0
- package/public/css/agents.36b98c0f.css +1469 -0
- package/public/css/agents.css +1469 -0
- package/public/css/app.a6a7f8f5.css +2731 -0
- package/public/css/app.css +2731 -0
- package/public/css/artifacts.css +444 -0
- package/public/css/commands.css +55 -0
- package/public/css/connection.css +131 -0
- package/public/css/dashboard.css +233 -0
- package/public/css/developer.css +328 -0
- package/public/css/files.css +123 -0
- package/public/css/markdown.css +156 -0
- package/public/css/message-actions.css +278 -0
- package/public/css/mobile.css +614 -0
- package/public/css/panels-unified.css +483 -0
- package/public/css/premium.css +415 -0
- package/public/css/realtime.css +189 -0
- package/public/css/satellites.css +401 -0
- package/public/css/shortcuts.css +185 -0
- package/public/css/split-view.4def0262.css +673 -0
- package/public/css/split-view.css +673 -0
- package/public/css/theme-generator.css +391 -0
- package/public/css/themes.css +387 -0
- package/public/css/timestamps.css +54 -0
- package/public/css/variables.css +78 -0
- package/public/dist/bundle.b55050c4.js +15757 -0
- package/public/favicon.svg +24 -0
- package/public/img/agents/ada.png +0 -0
- package/public/img/agents/clarice.png +0 -0
- package/public/img/agents/dennis-nedry.png +0 -0
- package/public/img/agents/elliot-alderson.png +0 -0
- package/public/img/agents/main.png +0 -0
- package/public/img/agents/scotty.png +0 -0
- package/public/img/agents/top-flight-security.png +0 -0
- package/public/index.html +1083 -0
- package/public/js/agents-data.js +234 -0
- package/public/js/agents-ui.js +72 -0
- package/public/js/agents.js +1525 -0
- package/public/js/app.js +79 -0
- package/public/js/appearance-settings.js +111 -0
- package/public/js/artifacts.js +432 -0
- package/public/js/audio-queue.js +168 -0
- package/public/js/bootstrap.js +54 -0
- package/public/js/chat.js +1211 -0
- package/public/js/commands.js +581 -0
- package/public/js/connection-api.js +121 -0
- package/public/js/connection.js +1231 -0
- package/public/js/context-tracker.js +271 -0
- package/public/js/core.js +172 -0
- package/public/js/dashboard.js +452 -0
- package/public/js/developer.js +432 -0
- package/public/js/encryption.js +124 -0
- package/public/js/errors.js +122 -0
- package/public/js/event-bus.js +77 -0
- package/public/js/fetch-utils.js +171 -0
- package/public/js/file-handler.js +229 -0
- package/public/js/files.js +352 -0
- package/public/js/gateway-chat.js +538 -0
- package/public/js/logger.js +112 -0
- package/public/js/markdown.js +190 -0
- package/public/js/message-actions.js +431 -0
- package/public/js/message-renderer.js +288 -0
- package/public/js/missed-messages.js +235 -0
- package/public/js/mobile-debug.js +95 -0
- package/public/js/notifications.js +367 -0
- package/public/js/offline-queue.js +178 -0
- package/public/js/onboarding.js +543 -0
- package/public/js/panels.js +156 -0
- package/public/js/premium.js +412 -0
- package/public/js/realtime-voice.js +844 -0
- package/public/js/satellite-sync.js +256 -0
- package/public/js/satellite-ui.js +175 -0
- package/public/js/satellites.js +1516 -0
- package/public/js/settings.js +1087 -0
- package/public/js/shortcuts.js +381 -0
- package/public/js/split-chat.js +1234 -0
- package/public/js/split-resize.js +211 -0
- package/public/js/splitview.js +340 -0
- package/public/js/storage.js +408 -0
- package/public/js/streaming-handler.js +324 -0
- package/public/js/stt-settings.js +316 -0
- package/public/js/theme-generator.js +661 -0
- package/public/js/themes.js +164 -0
- package/public/js/timestamps.js +198 -0
- package/public/js/tts-settings.js +575 -0
- package/public/js/ui.js +267 -0
- package/public/js/update-notifier.js +143 -0
- package/public/js/utils/constants.js +165 -0
- package/public/js/utils/sanitize.js +93 -0
- package/public/js/utils/sse-parser.js +195 -0
- package/public/js/voice.js +883 -0
- package/public/manifest.json +58 -0
- package/public/moon_texture.jpg +0 -0
- package/public/sw.js +221 -0
- package/public/three.min.js +6 -0
- package/server/channel.js +529 -0
- package/server/chat.js +270 -0
- package/server/config-store.js +362 -0
- package/server/config.js +159 -0
- package/server/context.js +131 -0
- package/server/gateway-commands.js +211 -0
- package/server/gateway-proxy.js +318 -0
- package/server/index.js +22 -0
- package/server/logger.js +89 -0
- package/server/middleware/auth.js +188 -0
- package/server/middleware.js +218 -0
- package/server/openclaw-discover.js +308 -0
- package/server/premium/index.js +156 -0
- package/server/premium/license.js +140 -0
- package/server/realtime/bridge.js +837 -0
- package/server/realtime/index.js +349 -0
- package/server/realtime/tts-stream.js +446 -0
- package/server/routes/agents.js +564 -0
- package/server/routes/artifacts.js +174 -0
- package/server/routes/chat.js +311 -0
- package/server/routes/config-settings.js +345 -0
- package/server/routes/config.js +603 -0
- package/server/routes/files.js +307 -0
- package/server/routes/index.js +18 -0
- package/server/routes/media.js +451 -0
- package/server/routes/missed-messages.js +107 -0
- package/server/routes/premium.js +75 -0
- package/server/routes/push.js +156 -0
- package/server/routes/satellite.js +406 -0
- package/server/routes/status.js +251 -0
- package/server/routes/stt.js +35 -0
- package/server/routes/voice.js +260 -0
- package/server/routes/webhooks.js +203 -0
- package/server/routes.js +206 -0
- package/server/runtime-config.js +336 -0
- package/server/share.js +305 -0
- package/server/stt/faster-whisper.js +72 -0
- package/server/stt/groq.js +51 -0
- package/server/stt/index.js +196 -0
- package/server/stt/openai.js +49 -0
- package/server/sync.js +244 -0
- package/server/tailscale-https.js +175 -0
- package/server/tts.js +646 -0
- package/server/update-checker.js +172 -0
- package/server/utils/filename.js +129 -0
- package/server/utils.js +147 -0
- package/server/watchdog.js +318 -0
- package/server/websocket/broadcast.js +359 -0
- package/server/websocket/connections.js +339 -0
- package/server/websocket/index.js +215 -0
- package/server/websocket/routing.js +277 -0
- package/server/websocket/sync.js +102 -0
- package/server.js +404 -0
- package/utils/detect-tool-usage.js +93 -0
- package/utils/errors.js +158 -0
- package/utils/html-escape.js +84 -0
- package/utils/id-sanitize.js +94 -0
- package/utils/response.js +130 -0
- package/utils/with-retry.js +105 -0
|
@@ -0,0 +1,837 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent Voice Bridge — Routes voice through OpenClaw Gateway
|
|
3
|
+
*
|
|
4
|
+
* Triangle architecture:
|
|
5
|
+
* Browser Mic → OpenAI Realtime API (ears: VAD + transcription only)
|
|
6
|
+
* ↓ transcript
|
|
7
|
+
* Uplink Bridge (this module)
|
|
8
|
+
* ↓ text message
|
|
9
|
+
* OpenClaw Gateway (brain)
|
|
10
|
+
* ↓ streaming text response
|
|
11
|
+
* TTS Engine (mouth)
|
|
12
|
+
* ↓ PCM audio chunks
|
|
13
|
+
* Browser Speaker (via WebSocket)
|
|
14
|
+
*
|
|
15
|
+
* Key difference from index.js (standalone relay):
|
|
16
|
+
* - OpenAI Realtime opened with `modalities: ['text']` — no audio output from OpenAI
|
|
17
|
+
* - Transcription events intercepted → routed to Gateway instead of letting OpenAI respond
|
|
18
|
+
* - Gateway response streamed → TTS → PCM audio sent back to client
|
|
19
|
+
* - Client plays audio directly via AudioContext (does NOT inject back into OpenAI)
|
|
20
|
+
*
|
|
21
|
+
* Custom events sent to client:
|
|
22
|
+
* { type: 'bridge.transcript', text } — user's transcribed speech
|
|
23
|
+
* { type: 'bridge.response.delta', text } — streaming text chunk from gateway
|
|
24
|
+
* { type: 'bridge.audio', audio: base64PCM } — TTS audio chunk
|
|
25
|
+
* { type: 'bridge.response.done' } — response complete
|
|
26
|
+
* { type: 'bridge.status', status } — 'thinking', 'speaking', etc.
|
|
27
|
+
* { type: 'bridge.error', error } — error message
|
|
28
|
+
*
|
|
29
|
+
* Usage (in server.js):
|
|
30
|
+
* import { setupAgentVoiceBridge } from './server/realtime/bridge.js';
|
|
31
|
+
* setupAgentVoiceBridge(httpServer);
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
import { WebSocketServer, WebSocket } from 'ws';
|
|
35
|
+
import { createLogger } from '../logger.js';
|
|
36
|
+
import { loadConfig } from '../runtime-config.js';
|
|
37
|
+
import { ALLOWED_ORIGINS, STREAM_READ_TIMEOUT_MS } from '../config.js';
|
|
38
|
+
import { streamTTS, splitSentences } from './tts-stream.js';
|
|
39
|
+
|
|
40
|
+
// Logger for this module, created under the 'voice-bridge' name.
const log = createLogger('voice-bridge');

// ─── Constants ──────────────────────────────────────────────────────────────

// OpenAI Realtime WebSocket endpoint, and the model used when config names none.
const OPENAI_REALTIME_BASE = 'wss://api.openai.com/v1/realtime';
const DEFAULT_MODEL = 'gpt-4o-mini-realtime-preview';

// NOTE(review): presumably the cap on concurrent bridge connections — the code
// that reads it is not visible in this part of the file.
const MAX_BRIDGE_CONNECTIONS = 5;

// Gateway streaming constants (milliseconds)
const GATEWAY_FETCH_TIMEOUT_MS = 5 * 60 * 1000; // 5 minutes for long responses
// Use the shared STREAM_READ_TIMEOUT_MS from config; fall back to 5 minutes when it is falsy.
const GATEWAY_STREAM_TIMEOUT_MS = STREAM_READ_TIMEOUT_MS ? STREAM_READ_TIMEOUT_MS : 5 * 60 * 1000;

// Connection bookkeeping
const activeSessions = new Map(); // sessionId → session state
|
|
54
|
+
|
|
55
|
+
// ─── Helpers ────────────────────────────────────────────────────────────────
|
|
56
|
+
|
|
57
|
+
/**
 * Decide whether the Origin header of a WebSocket upgrade request is acceptable.
 *
 * Accepted:
 *   - a missing/empty Origin (non-browser clients don't send one),
 *   - loopback hosts: localhost, 127.0.0.1 and the IPv6 loopback ::1,
 *   - any hostname ending in `.ts.net`,
 *   - any hostname equal to the hostname of an ALLOWED_ORIGINS entry
 *     (entries that are not parseable URLs are ignored).
 *
 * @param {string|undefined} origin - Raw Origin header value
 * @returns {boolean} true when the origin is allowed; false otherwise,
 *   including when `origin` is not a parseable URL
 */
function verifyOrigin(origin) {
  if (!origin) return true; // Non-browser clients don't send Origin

  try {
    const host = new URL(origin).hostname;

    // URL.hostname keeps the square brackets around IPv6 literals, so an
    // Origin of http://[::1]:3000 yields "[::1]" — the bare "::1" form alone
    // never matches a real header.
    const isLoopback =
      host === 'localhost' || host === '127.0.0.1' || host === '[::1]' || host === '::1';
    if (isLoopback) return true;

    if (host.endsWith('.ts.net')) return true;

    return ALLOWED_ORIGINS.some((allowed) => {
      try {
        return new URL(allowed).hostname === host;
      } catch {
        return false; // Malformed allow-list entry: skip it
      }
    });
  } catch {
    // Unparseable Origin (or unusable allow-list): refuse the connection
    return false;
  }
}
|
|
74
|
+
|
|
75
|
+
/**
 * Deliver a JSON-encoded event over `ws`, but only while the socket is OPEN.
 * Serialization or send failures are logged and never reach the caller.
 *
 * @param {WebSocket} ws - Destination socket
 * @param {Object} event - Payload passed to JSON.stringify
 */
function sendToClient(ws, event) {
  const socketIsOpen = ws.readyState === WebSocket.OPEN;
  if (!socketIsOpen) return;

  try {
    const payload = JSON.stringify(event);
    ws.send(payload);
  } catch (err) {
    log.error('Failed to send to client:', err.message);
  }
}
|
|
87
|
+
|
|
88
|
+
/**
 * Build the serialized `session.update` event sent to OpenAI Realtime in bridge mode.
 *
 * The session is text-only (modalities=['text']), so OpenAI produces no audio
 * output, while input_audio_format stays pcm16 so microphone audio can still be
 * transcribed.
 *
 * @param {string} [voice] - Voice name; 'marin' when falsy
 * @param {number|string} [vadSilenceDurationMs] - Requested server-VAD silence
 *   window in ms. Falsy or non-numeric values fall back to 400; the result is
 *   clamped to the range 200–1500.
 * @returns {string} JSON text of the `session.update` event
 */
function buildBridgeSessionUpdate(voice, vadSilenceDurationMs) {
  const defaultSilenceMs = 400;

  // A non-numeric value (e.g. 'fast') would otherwise turn into NaN inside
  // Math.min/Math.max and be serialized as `null`; use the default instead.
  const requested = Number(vadSilenceDurationMs || defaultSilenceMs);
  const usable = Number.isNaN(requested) ? defaultSilenceMs : requested;
  const silenceMs = Math.max(200, Math.min(1500, usable));

  return JSON.stringify({
    type: 'session.update',
    session: {
      modalities: ['text'],
      instructions: 'Transcribe the user\'s speech accurately. Do not generate responses.',
      voice: voice || 'marin',
      input_audio_format: 'pcm16',
      input_audio_transcription: {
        model: 'whisper-1',
      },
      turn_detection: {
        type: 'server_vad',
        threshold: 0.65,
        prefix_padding_ms: 200,
        silence_duration_ms: silenceMs,
      },
    },
  });
}
|
|
114
|
+
|
|
115
|
+
/**
 * Derive the options object handed to streamTTS() from the runtime config.
 *
 * Engine: `agentVoiceTtsEngine` when set; otherwise 'openai' if an OpenAI API
 * key is present, else 'edge'. The agent voice (`agentVoiceTtsVoice`, default
 * 'echo') is used as the OpenAI voice only when the engine is 'openai'; for any
 * other engine the OpenAI voice comes from `openaiTtsVoice` (default 'nova').
 *
 * @param {Object} runtimeConfig - The loaded runtime config
 * @returns {Object} TTS config for streamTTS()
 */
function buildTTSConfig(runtimeConfig) {
  const { openaiApiKey } = runtimeConfig;

  let engine = runtimeConfig.agentVoiceTtsEngine;
  if (!engine) {
    engine = openaiApiKey ? 'openai' : 'edge';
  }

  let openaiTtsVoice;
  if (engine === 'openai') {
    openaiTtsVoice = runtimeConfig.agentVoiceTtsVoice || 'echo';
  } else {
    openaiTtsVoice = runtimeConfig.openaiTtsVoice || 'nova';
  }

  const openaiTtsModel = runtimeConfig.openaiTtsModel || 'tts-1';
  const edgeTtsVoice = runtimeConfig.edgeTtsVoice || 'en-US-GuyNeural';

  return { engine, openaiApiKey, openaiTtsVoice, openaiTtsModel, edgeTtsVoice };
}
|
|
136
|
+
|
|
137
|
+
/**
 * Send one user message to the OpenClaw gateway's chat-completions endpoint and
 * consume the streamed SSE reply.
 *
 * While streaming it:
 *   - forwards every text delta to the client as `bridge.response.delta`,
 *   - sends a single `bridge.status: 'thinking'` the first time a tool call is seen,
 *   - hands speakable chunks to `onSentence` as soon as they are complete
 *     (sentence ends first, clause breaks as a fallback for faster first audio).
 *
 * @param {Object} params
 * @param {string} params.message - User's transcribed message
 * @param {Object} params.config - Runtime config (gatewayUrl, gatewayToken, voiceModel)
 * @param {string} params.sessionId - Session ID for logging
 * @param {WebSocket} params.clientWs - Client WebSocket for streaming events
 * @param {AbortSignal} params.signal - Abort signal for cancellation
 * @param {Function} [params.onSentence] - Callback fired with each complete sentence during streaming
 * @returns {Promise<{ response: string, tools: string[] }>} Full assembled
 *   response text and the distinct tool names seen in the stream
 * @throws {Error} When the gateway URL is missing, the gateway answers with a
 *   non-OK status or no body, or a single stream read exceeds
 *   GATEWAY_STREAM_TIMEOUT_MS
 */
async function streamGatewayResponse({ message, config, sessionId, clientWs, signal, onSentence }) {
  const gatewayUrl = config.gatewayUrl;
  const gatewayToken = config.gatewayToken;

  if (!gatewayUrl) {
    throw new Error('Gateway URL not configured');
  }

  // Build session key — bridge shares the main session
  const sessionKey = 'agent:main:main';

  const url = `${gatewayUrl}/v1/chat/completions`;
  log.info(`[${sessionId}] Gateway request: ${url}, session=${sessionKey}`);

  const response = await fetch(url, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${gatewayToken}`,
      'x-openclaw-session-key': sessionKey,
    },
    body: JSON.stringify({
      model: config.voiceModel || 'openclaw',
      user: 'uplink-voice',
      stream: true,
      max_tokens: 100,
      stream_options: { include_usage: true },
      messages: [
        {
          role: 'user',
          content: `[Voice chat via Agent Voice Bridge — keep response conversational and concise] ${message}`,
        },
      ],
    }),
    signal,
  });

  if (!response.ok) {
    const errText = await response.text().catch(() => 'Unknown error');
    throw new Error(`Gateway error ${response.status}: ${errText}`);
  }

  if (!response.body) {
    throw new Error('Gateway response has no body');
  }

  // Parse the SSE stream
  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let buffer = '';
  let fullResponse = '';
  let sentenceAccum = '';
  const detectedTools = [];
  let sentThinking = false;

  // Race one read against a per-read timeout. The timer is always cleared once
  // the race settles; otherwise every chunk would leave a pending timer behind
  // that keeps the event loop alive for the full timeout.
  const readWithTimeout = async () => {
    let timer;
    const timeout = new Promise((_, reject) => {
      timer = setTimeout(
        () => reject(new Error('Gateway stream read timed out')),
        GATEWAY_STREAM_TIMEOUT_MS,
      );
    });
    try {
      return await Promise.race([reader.read(), timeout]);
    } finally {
      clearTimeout(timer);
    }
  };

  try {
    while (true) {
      if (signal?.aborted) break;

      const { done, value } = await readWithTimeout();
      if (done) break;

      // Keep the trailing partial line in `buffer` until its newline arrives
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\n');
      buffer = lines.pop() || '';

      for (const line of lines) {
        if (!line.startsWith('data: ')) continue;
        const data = line.slice(6);
        if (data === '[DONE]') continue;

        try {
          const parsed = JSON.parse(data);
          const delta = parsed.choices?.[0]?.delta;

          // Detect tool calls (only the first entry of each delta is inspected)
          const toolCalls = delta?.tool_calls;
          if (toolCalls && toolCalls.length > 0) {
            const toolName = toolCalls[0]?.function?.name;
            if (toolName && !detectedTools.includes(toolName)) {
              detectedTools.push(toolName);
              if (!sentThinking) {
                sendToClient(clientWs, { type: 'bridge.status', status: 'thinking' });
                sentThinking = true;
              }
              log.info(`[${sessionId}] Tool detected: ${toolName}`);
            }
          }

          // Stream text content
          if (delta?.content) {
            fullResponse += delta.content;
            sendToClient(clientWs, {
              type: 'bridge.response.delta',
              text: delta.content,
            });

            // Real-time clause detection for pipelined TTS
            // Fire on sentence ends AND clause boundaries for faster first audio
            if (onSentence) {
              sentenceAccum += delta.content;
              // Clause breaks: comma, semicolon, colon, dash, newline
              let match;
              while (sentenceAccum.length > 0) {
                // Prefer sentence boundaries
                match = sentenceAccum.match(/^(.*?[.!?])\s/s);
                if (!match && sentenceAccum.length >= 20) {
                  // Fall back to clause boundaries (after 15+ chars) for faster streaming
                  match = sentenceAccum.match(/^(.{15,}?[,;:\-—\n])\s*/s);
                }
                if (match) {
                  const clause = match[1].trim();
                  sentenceAccum = sentenceAccum.slice(match[0].length);
                  if (clause) onSentence(clause);
                } else {
                  break;
                }
              }
            }
          }
        } catch {
          // Skip unparseable chunks
        }
      }
    }
  } finally {
    reader.releaseLock();
  }

  // Flush any remaining text as a final sentence
  if (onSentence && sentenceAccum.trim()) {
    onSentence(sentenceAccum.trim());
  }

  return { response: fullResponse, tools: detectedTools };
}
|
|
296
|
+
|
|
297
|
+
/**
 * Synthesize a complete response: split it into sentences, run each sentence
 * through streamTTS(), and forward every PCM chunk to the client as a
 * base64-encoded `bridge.audio` event.
 *
 * Work stops early once `signal` is aborted or the client socket is no longer
 * OPEN. A TTS failure on one sentence is logged and reported to the client as
 * `bridge.error`; the remaining sentences are still attempted.
 *
 * @param {Object} params
 * @param {string} params.text - Full response text
 * @param {Object} params.ttsConfig - TTS engine configuration
 * @param {WebSocket} params.clientWs - Client WebSocket
 * @param {string} params.sessionId - Session ID for logging
 * @param {AbortSignal} params.signal - Abort signal
 */
async function processResponseTTS({ text, ttsConfig, clientWs, sessionId, signal }) {
  const sentences = splitSentences(text);
  const total = sentences.length;

  if (total === 0) {
    log.warn(`[${sessionId}] No sentences to synthesize`);
    return;
  }

  log.info(`[${sessionId}] TTS: ${total} sentence(s), engine=${ttsConfig.engine}`);
  sendToClient(clientWs, { type: 'bridge.status', status: 'speaking' });

  // True once the caller aborted or the client can no longer receive audio
  const shouldStop = () => Boolean(signal?.aborted) || clientWs.readyState !== WebSocket.OPEN;

  for (const sentence of sentences) {
    if (shouldStop()) break;

    try {
      for await (const pcmChunk of streamTTS(sentence, ttsConfig)) {
        if (shouldStop()) break;

        // Send PCM chunk as base64 to client
        const audio = pcmChunk.toString('base64');
        sendToClient(clientWs, { type: 'bridge.audio', audio });
      }
    } catch (err) {
      // Keep going with the remaining sentences — client gets text even if audio fails
      log.error(`[${sessionId}] TTS error for sentence:`, err.message);
      sendToClient(clientWs, {
        type: 'bridge.error',
        error: `TTS failed: ${err.message}`,
      });
    }
  }
}
|
|
344
|
+
|
|
345
|
+
/**
 * Persist a user/assistant message pair to chat history through
 * `saveMessageToSync`, which is loaded lazily from '../routes.js'.
 * The user message is written first; empty texts are skipped. Failures are
 * logged and never thrown to the caller.
 *
 * @param {string} userText - User's transcribed message
 * @param {string} assistantText - Agent's response
 * @param {string} sessionId - Session ID for logging
 */
async function saveToHistory(userText, assistantText, sessionId) {
  try {
    // Lazy import to avoid circular dependencies
    const { saveMessageToSync } = await import('../routes.js');

    // Store one message (when non-empty), then emit its debug line
    const persist = async (role, body, note) => {
      if (!body) return;
      await saveMessageToSync(role, body);
      log.debug(note);
    };

    await persist('user', userText, `[${sessionId}] Saved user message to history`);
    await persist('assistant', assistantText, `[${sessionId}] Saved assistant message to history`);
  } catch (err) {
    log.error(`[${sessionId}] Failed to save to history:`, err.message);
  }
}
|
|
372
|
+
|
|
373
|
+
/**
 * Publish a chat message to connected WebSocket clients for cross-device sync.
 * The broadcast helpers are loaded lazily from '../websocket/index.js'; the
 * message is sent with the 'main' key and a freshly generated message id.
 * Failures are logged as warnings and never thrown to the caller.
 *
 * @param {string} role - 'user' or 'assistant'
 * @param {string} text - Message text
 */
async function broadcastSync(role, text) {
  try {
    const wsModule = await import('../websocket/index.js');
    const { generateMessageId: nextMessageId, broadcastSyncMessage: publish } = wsModule;
    publish(role, text, 'main', nextMessageId());
  } catch (err) {
    log.warn('Failed to broadcast sync message:', err.message);
  }
}
|
|
388
|
+
|
|
389
|
+
// ─── Main Bridge Logic ─────────────────────────────────────────────────────
|
|
390
|
+
|
|
391
|
+
/**
|
|
392
|
+
* Handle a single bridge session: client WS + OpenAI Realtime WS + Gateway.
|
|
393
|
+
*
|
|
394
|
+
* @param {WebSocket} clientWs - Client WebSocket connection
|
|
395
|
+
* @param {import('http').IncomingMessage} req - HTTP upgrade request
|
|
396
|
+
*/
|
|
397
|
+
async function handleBridgeConnection(clientWs, req) {
|
|
398
|
+
const sessionId = `bridge-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
|
|
399
|
+
const clientIp = req.socket.remoteAddress;
|
|
400
|
+
log.info(`Bridge client connected: ${sessionId} from ${clientIp}`);
|
|
401
|
+
|
|
402
|
+
// ── Load runtime config ─────────────────────────────────────────────
|
|
403
|
+
let config;
|
|
404
|
+
try {
|
|
405
|
+
config = await loadConfig();
|
|
406
|
+
} catch (err) {
|
|
407
|
+
log.error('Failed to load runtime config:', err);
|
|
408
|
+
clientWs.close(1011, 'Server configuration error');
|
|
409
|
+
return;
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
const apiKey = config.openaiApiKey;
|
|
413
|
+
if (!apiKey) {
|
|
414
|
+
log.error('No OpenAI API key — cannot open realtime session');
|
|
415
|
+
sendToClient(clientWs, {
|
|
416
|
+
type: 'bridge.error',
|
|
417
|
+
error: 'No OpenAI API key configured. Add your key in Settings → Voice.',
|
|
418
|
+
});
|
|
419
|
+
clientWs.close(1008, 'Missing API key');
|
|
420
|
+
return;
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
if (!config.gatewayUrl || !config.gatewayToken) {
|
|
424
|
+
log.error('Gateway not configured — cannot use agent voice bridge');
|
|
425
|
+
sendToClient(clientWs, {
|
|
426
|
+
type: 'bridge.error',
|
|
427
|
+
error: 'OpenClaw Gateway not configured. Check Settings.',
|
|
428
|
+
});
|
|
429
|
+
clientWs.close(1008, 'Gateway not configured');
|
|
430
|
+
return;
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
const model = config.realtimeModel || DEFAULT_MODEL;
|
|
434
|
+
const voice = config.realtimeVoice || 'marin';
|
|
435
|
+
const vadSilenceDurationMs = config.vadSilenceDurationMs || 400;
|
|
436
|
+
const ttsConfig = buildTTSConfig(config);
|
|
437
|
+
|
|
438
|
+
// ── Open OpenAI Realtime WebSocket ──────────────────────────────────
|
|
439
|
+
const openaiUrl = `${OPENAI_REALTIME_BASE}?model=${encodeURIComponent(model)}`;
|
|
440
|
+
let openaiWs;
|
|
441
|
+
try {
|
|
442
|
+
openaiWs = new WebSocket(openaiUrl, {
|
|
443
|
+
headers: {
|
|
444
|
+
'Authorization': `Bearer ${apiKey}`,
|
|
445
|
+
'OpenAI-Beta': 'realtime=v1',
|
|
446
|
+
},
|
|
447
|
+
perMessageDeflate: true,
|
|
448
|
+
});
|
|
449
|
+
} catch (err) {
|
|
450
|
+
log.error('Failed to create OpenAI WebSocket:', err);
|
|
451
|
+
clientWs.close(1011, 'Failed to connect to OpenAI');
|
|
452
|
+
return;
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
// ── Session state ───────────────────────────────────────────────────
|
|
456
|
+
let clientClosed = false;
|
|
457
|
+
let openaiClosed = false;
|
|
458
|
+
let processingResponse = false;
|
|
459
|
+
let activeAbort = null; // AbortController for current gateway request
|
|
460
|
+
|
|
461
|
+
activeSessions.set(sessionId, { clientWs, openaiWs });
|
|
462
|
+
|
|
463
|
+
// Keepalive
|
|
464
|
+
const keepalive = setInterval(() => {
|
|
465
|
+
try {
|
|
466
|
+
if (clientWs.readyState === WebSocket.OPEN) clientWs.ping();
|
|
467
|
+
if (openaiWs.readyState === WebSocket.OPEN) openaiWs.ping();
|
|
468
|
+
} catch { /* swallow */ }
|
|
469
|
+
}, 15_000);
|
|
470
|
+
|
|
471
|
+
function cleanup() {
|
|
472
|
+
clearInterval(keepalive);
|
|
473
|
+
activeSessions.delete(sessionId);
|
|
474
|
+
if (activeAbort) {
|
|
475
|
+
activeAbort.abort();
|
|
476
|
+
activeAbort = null;
|
|
477
|
+
}
|
|
478
|
+
if (!clientClosed && clientWs.readyState === WebSocket.OPEN) {
|
|
479
|
+
clientWs.close();
|
|
480
|
+
}
|
|
481
|
+
if (!openaiClosed && openaiWs.readyState === WebSocket.OPEN) {
|
|
482
|
+
openaiWs.close();
|
|
483
|
+
}
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
// ── Process a user transcript through the gateway ───────────────────
|
|
487
|
+
|
|
488
|
+
const MIN_BARGE_IN_WORDS = 3; // Don't barge-in for short/ambiguous transcripts
|
|
489
|
+
|
|
490
|
+
async function processTranscript(transcript) {
|
|
491
|
+
if (!transcript || !transcript.trim()) return;
|
|
492
|
+
|
|
493
|
+
const trimmed = transcript.trim();
|
|
494
|
+
const wordCount = trimmed.split(/\s+/).filter(w => w.length > 0).length;
|
|
495
|
+
|
|
496
|
+
log.info(`[${sessionId}] Transcript: "${trimmed.substring(0, 200)}" (${wordCount} words)`);
|
|
497
|
+
|
|
498
|
+
// If a response is in progress, only barge-in for substantial speech
|
|
499
|
+
// Short fragments (1-2 words) during a response are likely noise/echo
|
|
500
|
+
if (activeAbort) {
|
|
501
|
+
if (wordCount < MIN_BARGE_IN_WORDS) {
|
|
502
|
+
log.info(`[${sessionId}] Skipping short transcript during active response: "${trimmed}"`);
|
|
503
|
+
return;
|
|
504
|
+
}
|
|
505
|
+
log.info(`[${sessionId}] Barge-in: aborting previous response`);
|
|
506
|
+
activeAbort.abort();
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
processingResponse = true;
|
|
510
|
+
activeAbort = new AbortController();
|
|
511
|
+
const { signal } = activeAbort;
|
|
512
|
+
|
|
513
|
+
// Send transcript to client
|
|
514
|
+
sendToClient(clientWs, { type: 'bridge.transcript', text: trimmed });
|
|
515
|
+
|
|
516
|
+
// Broadcast user message for cross-device sync
|
|
517
|
+
broadcastSync('user', trimmed);
|
|
518
|
+
|
|
519
|
+
try {
|
|
520
|
+
// Send to gateway and stream response with pipelined TTS
|
|
521
|
+
sendToClient(clientWs, { type: 'bridge.status', status: 'thinking' });
|
|
522
|
+
|
|
523
|
+
// Queue for sentences that arrive during streaming
|
|
524
|
+
const sentenceQueue = [];
|
|
525
|
+
let ttsRunning = false;
|
|
526
|
+
let streamDone = false;
|
|
527
|
+
let sentenceCount = 0;
|
|
528
|
+
|
|
529
|
+
// TTS consumer — processes sentences from queue as they arrive
|
|
530
|
+
const processTTSQueue = async () => {
|
|
531
|
+
if (ttsRunning) return;
|
|
532
|
+
ttsRunning = true;
|
|
533
|
+
while (sentenceQueue.length > 0) {
|
|
534
|
+
if (signal?.aborted) break;
|
|
535
|
+
if (clientWs.readyState !== WebSocket.OPEN) break;
|
|
536
|
+
|
|
537
|
+
const sentence = sentenceQueue.shift();
|
|
538
|
+
const clean = sentence.replace(/MEDIA:.+?(?:\n|$)/g, '').trim();
|
|
539
|
+
if (!clean) continue;
|
|
540
|
+
|
|
541
|
+
sentenceCount++;
|
|
542
|
+
if (sentenceCount === 1) {
|
|
543
|
+
sendToClient(clientWs, { type: 'bridge.status', status: 'speaking' });
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
try {
|
|
547
|
+
for await (const pcmChunk of streamTTS(clean, ttsConfig)) {
|
|
548
|
+
if (signal?.aborted) break;
|
|
549
|
+
if (clientWs.readyState !== WebSocket.OPEN) break;
|
|
550
|
+
sendToClient(clientWs, {
|
|
551
|
+
type: 'bridge.audio',
|
|
552
|
+
audio: pcmChunk.toString('base64'),
|
|
553
|
+
});
|
|
554
|
+
}
|
|
555
|
+
} catch (err) {
|
|
556
|
+
log.error(`[${sessionId}] TTS error:`, err.message);
|
|
557
|
+
}
|
|
558
|
+
}
|
|
559
|
+
ttsRunning = false;
|
|
560
|
+
};
|
|
561
|
+
|
|
562
|
+
// Stream gateway with sentence callback — TTS starts on first sentence
|
|
563
|
+
const { response: gatewayResponse } = await streamGatewayResponse({
|
|
564
|
+
message: trimmed,
|
|
565
|
+
config,
|
|
566
|
+
sessionId,
|
|
567
|
+
clientWs,
|
|
568
|
+
signal,
|
|
569
|
+
onSentence: (sentence) => {
|
|
570
|
+
sentenceQueue.push(sentence);
|
|
571
|
+
processTTSQueue(); // kick TTS if not already running
|
|
572
|
+
},
|
|
573
|
+
});
|
|
574
|
+
|
|
575
|
+
if (signal.aborted) {
|
|
576
|
+
log.info(`[${sessionId}] Response aborted (barge-in)`);
|
|
577
|
+
return;
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
// Wait for TTS queue to drain
|
|
581
|
+
while (sentenceQueue.length > 0 || ttsRunning) {
|
|
582
|
+
if (signal?.aborted) break;
|
|
583
|
+
await new Promise(r => setTimeout(r, 50));
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
const cleanResponse = (gatewayResponse || '').replace(/MEDIA:.+?(?:\n|$)/g, '').trim();
|
|
587
|
+
log.info(`[${sessionId}] Complete: ${cleanResponse.length} chars, ${sentenceCount} sentences TTS'd`);
|
|
588
|
+
|
|
589
|
+
// Signal response complete
|
|
590
|
+
sendToClient(clientWs, { type: 'bridge.response.done' });
|
|
591
|
+
|
|
592
|
+
// Save to chat history (async, don't block)
|
|
593
|
+
if (cleanResponse) {
|
|
594
|
+
saveToHistory(trimmed, cleanResponse, sessionId);
|
|
595
|
+
broadcastSync('assistant', cleanResponse);
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
} catch (err) {
|
|
599
|
+
if (err.name === 'AbortError' || signal.aborted) {
|
|
600
|
+
log.info(`[${sessionId}] Response processing aborted`);
|
|
601
|
+
} else {
|
|
602
|
+
log.error(`[${sessionId}] Error processing transcript:`, err.message);
|
|
603
|
+
sendToClient(clientWs, {
|
|
604
|
+
type: 'bridge.error',
|
|
605
|
+
error: `Processing failed: ${err.message}`,
|
|
606
|
+
});
|
|
607
|
+
sendToClient(clientWs, { type: 'bridge.response.done' });
|
|
608
|
+
}
|
|
609
|
+
} finally {
|
|
610
|
+
processingResponse = false;
|
|
611
|
+
if (activeAbort?.signal === signal) {
|
|
612
|
+
activeAbort = null;
|
|
613
|
+
}
|
|
614
|
+
}
|
|
615
|
+
}
|
|
616
|
+
|
|
617
|
+
// ── OpenAI Realtime WebSocket events ────────────────────────────────
|
|
618
|
+
|
|
619
|
+
// Upstream (OpenAI Realtime) socket is open: configure the session for bridge
// mode, then tell the client the bridge can be used.
openaiWs.on('open', function onUpstreamOpen() {
  log.info(`[${sessionId}] OpenAI Realtime connected (model=${model})`);

  // Push the bridge-mode session.update built from the voice and VAD settings.
  // A failure here is logged only; the ready notification below is still sent.
  try {
    const updatePayload = buildBridgeSessionUpdate(voice, vadSilenceDurationMs);
    log.debug(`[${sessionId}] Sending bridge session.update (VAD silence: ${vadSilenceDurationMs}ms)`);
    openaiWs.send(updatePayload);
  } catch (sendErr) {
    log.error(`[${sessionId}] Failed to send session.update:`, sendErr);
  }

  // Let the client know the bridge is ready.
  const readyNotice = { type: 'relay.ready' };
  sendToClient(clientWs, readyNotice);
});
|
|
634
|
+
|
|
635
|
+
// Handle events arriving from the OpenAI Realtime socket.
//
// - Binary frames are dropped.
// - A completed input-audio transcription is handed to processTranscript()
//   and is NOT forwarded to the client.
// - A small set of session / VAD / error events is forwarded (or translated
//   into `bridge.error`) so the client can drive its UI.
// - Everything else is logged at debug level and dropped.
openaiWs.on('message', (data, isBinary) => {
  if (isBinary) {
    // In bridge mode we shouldn't get binary audio from OpenAI (modalities=['text']),
    // but if we do, just drop it
    return;
  }

  try {
    const event = JSON.parse(data.toString());
    const eventType = event.type;

    // ── Intercept transcription events ──────────────────────────────
    if (eventType === 'conversation.item.input_audio_transcription.completed') {
      // This is the main event — user speech has been transcribed
      const transcript = event.transcript;
      log.info(`[${sessionId}] OpenAI transcription completed: "${(transcript || '').substring(0, 100)}"`);
      // Process async — don't block the WebSocket message handler.
      // The promise is intentionally not awaited, so attach a rejection
      // handler: anything that escapes processTranscript's own error handling
      // would otherwise surface as an unhandled rejection.
      Promise.resolve(processTranscript(transcript)).catch((err) => {
        log.error(`[${sessionId}] Unhandled error processing transcript:`, err?.message || err);
      });
      // Do NOT forward to client — we handle it ourselves
      return;
    }

    // ── Forward select events to client (for UI state) ─────────────
    // These events are useful for the client's VAD/speech indicators
    switch (eventType) {
      case 'session.created':
      case 'session.updated':
        // Let client know session is configured
        sendToClient(clientWs, event);
        break;

      case 'input_audio_buffer.speech_started':
        // User started speaking — useful for UI animation
        sendToClient(clientWs, event);
        // If a response is currently being processed, treat this as a barge-in
        // and abort it.
        if (processingResponse && activeAbort) {
          log.info(`[${sessionId}] Barge-in detected: user started speaking`);
          activeAbort.abort();
        }
        break;

      case 'input_audio_buffer.speech_stopped':
      case 'input_audio_buffer.committed':
        // User stopped speaking — forward for UI
        sendToClient(clientWs, event);
        break;

      case 'conversation.item.input_audio_transcription.failed':
        // Transcription failed — log the details, send the client a generic message
        log.warn(`[${sessionId}] Transcription failed:`, JSON.stringify(event.error));
        sendToClient(clientWs, {
          type: 'bridge.error',
          error: 'Speech transcription failed — please try again',
        });
        break;

      case 'error':
        // OpenAI error — forward to client as a bridge.error
        log.error(`[${sessionId}] OpenAI error:`, JSON.stringify(event));
        sendToClient(clientWs, {
          type: 'bridge.error',
          error: event.error?.message || 'OpenAI Realtime error',
        });
        break;

      case 'rate_limits.updated':
        // Log rate limits but don't forward
        log.debug(`[${sessionId}] Rate limits: ${JSON.stringify(event.rate_limits)}`);
        break;

      default:
        // Drop all other events (response.*, conversation.item.created, etc.)
        // In bridge mode, OpenAI shouldn't be generating responses
        log.debug(`[${sessionId}] Dropping OpenAI event: ${eventType}`);
        break;
    }
  } catch (err) {
    // Covers both JSON.parse failures and synchronous errors thrown while
    // dispatching the event above.
    log.error(`[${sessionId}] Error parsing OpenAI message:`, err.message);
  }
});
|
|
715
|
+
|
|
716
|
+
// Upstream socket closed: flag it, log the close code (and reason when one was
// supplied), notify the client, then run cleanup().
openaiWs.on('close', (closeCode, closeReason) => {
  openaiClosed = true;

  const reasonText = closeReason?.toString() || '';
  const reasonSuffix = reasonText ? `, reason=${reasonText}` : '';
  log.info(`[${sessionId}] OpenAI disconnected (code=${closeCode}${reasonSuffix})`);

  const lostNotice = {
    type: 'bridge.error',
    error: 'Voice connection lost — please reconnect',
  };
  sendToClient(clientWs, lostNotice);

  cleanup();
});
|
|
726
|
+
|
|
727
|
+
// Upstream socket error: log it, relay the message to the client as a
// bridge.error, then run cleanup().
openaiWs.on('error', (upstreamErr) => {
  const detail = upstreamErr.message;
  log.error(`[${sessionId}] OpenAI error:`, detail);

  const errorNotice = {
    type: 'bridge.error',
    error: `Voice connection error: ${detail}`,
  };
  sendToClient(clientWs, errorNotice);

  cleanup();
});
|
|
735
|
+
|
|
736
|
+
// ── Client WebSocket events ─────────────────────────────────────────
|
|
737
|
+
|
|
738
|
+
// Relay client traffic to OpenAI (mic audio, session updates, etc.).
// Nothing is relayed or inspected unless the upstream socket is OPEN.
// The one message handled locally is `bridge.cancel`, which aborts the active
// response instead of being forwarded.
clientWs.on('message', (data, isBinary) => {
  if (openaiWs.readyState !== WebSocket.OPEN) {
    return;
  }

  try {
    // Binary frames are mic audio and go upstream untouched.
    if (isBinary) {
      openaiWs.send(data);
      return;
    }

    // Text frames are expected to be JSON commands from the client.
    const rawText = data.toString();
    try {
      const command = JSON.parse(rawText);
      log.debug(`[${sessionId}] Client event: ${command.type}`);

      // Bridge-specific command: cancel the current response, if any.
      if (command.type === 'bridge.cancel') {
        if (activeAbort) {
          log.info(`[${sessionId}] Client cancelled response`);
          activeAbort.abort();
        }
        return;
      }
    } catch {
      // Best effort: anything that fails above (e.g. non-JSON text) is
      // forwarded as-is.
    }

    openaiWs.send(rawText);
  } catch (forwardErr) {
    log.error(`[${sessionId}] Error forwarding client → OpenAI:`, forwardErr.message);
  }
});
|
|
770
|
+
|
|
771
|
+
// Client socket closed: flag it, log the close code, then run cleanup().
clientWs.on('close', function onClientClose(closeCode) {
  clientClosed = true;
  const closeNote = `[${sessionId}] Client disconnected (code=${closeCode})`;
  log.info(closeNote);
  cleanup();
});
|
|
776
|
+
|
|
777
|
+
// Client socket error: log the error message, then run cleanup().
clientWs.on('error', function onClientError(clientErr) {
  const detail = clientErr.message;
  log.error(`[${sessionId}] Client error:`, detail);
  cleanup();
});
|
|
781
|
+
}
|
|
782
|
+
|
|
783
|
+
// ─── Public API ─────────────────────────────────────────────────────────────
|
|
784
|
+
|
|
785
|
+
/**
 * Set up the Agent Voice Bridge WebSocket endpoint on the given HTTP server.
 *
 * Registers an `upgrade` listener that accepts requests for
 * `/api/realtime/bridge` or `/api/realtime?mode=agent`, using a `noServer`
 * WebSocketServer so it can coexist with other upgrade handlers on the same
 * HTTP server. Requests for any other path are left untouched.
 *
 * Matching requests are rejected with 403 when `verifyOrigin` refuses the
 * Origin header, and with 429 when `activeSessions` has reached
 * `MAX_BRIDGE_CONNECTIONS`. Accepted connections are passed to
 * `handleBridgeConnection`.
 *
 * @param {import('http').Server} server - The HTTP server instance
 * @returns {WebSocketServer} The WebSocket server instance
 */
export function setupAgentVoiceBridge(server) {
  const wss = new WebSocketServer({ noServer: true, perMessageDeflate: false });

  // ── Handle upgrade on /api/realtime?mode=agent OR /api/realtime/bridge ──
  server.on('upgrade', (request, socket, head) => {
    // Parse the request target once, against a fixed base. Only the path and
    // query are needed, so the client-controlled Host header is deliberately
    // not used: a malformed Host would make `new URL` throw. The try/catch
    // covers malformed request targets; an exception escaping this listener
    // would be uncaught and could take the process down.
    let target;
    try {
      target = new URL(request.url, 'http://localhost');
    } catch {
      log.debug('Ignoring upgrade with unparseable request URL');
      return; // Cannot be one of our paths — let other handlers take it
    }

    const { pathname, searchParams } = target;
    const isAgentMode = pathname === '/api/realtime' && searchParams.get('mode') === 'agent';
    const isBridgePath = pathname === '/api/realtime/bridge';
    if (!isAgentMode && !isBridgePath) return; // Let other handlers take it

    // Origin check
    const origin = request.headers.origin;
    if (!verifyOrigin(origin)) {
      log.warn(`Rejected upgrade from invalid origin: ${origin}`);
      socket.write('HTTP/1.1 403 Forbidden\r\n\r\n');
      socket.destroy();
      return;
    }

    // Connection limit
    if (activeSessions.size >= MAX_BRIDGE_CONNECTIONS) {
      log.warn(`Rejected upgrade — at capacity (${activeSessions.size}/${MAX_BRIDGE_CONNECTIONS})`);
      socket.write('HTTP/1.1 429 Too Many Requests\r\n\r\n');
      socket.destroy();
      return;
    }

    wss.handleUpgrade(request, socket, head, (ws) => {
      wss.emit('connection', ws, request);
    });
  });

  // ── New client connection ───────────────────────────────────────────
  wss.on('connection', (clientWs, req) => {
    handleBridgeConnection(clientWs, req);
  });

  log.info('Agent Voice Bridge ready at /api/realtime?mode=agent');
  return wss;
}
|