@juspay/neurolink 9.55.2 → 9.55.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,4 @@
1
1
  import WebSocket, { WebSocketServer } from "ws";
2
- import { Cobra } from "@picovoice/cobra-node";
3
2
  import { FrameBus } from "./frameBus.js";
4
3
  import { TurnManager, TurnState } from "./turnManager.js";
5
4
  import { CartesiaStream } from "../../adapters/tts/cartesiaHandler.js";
@@ -116,420 +115,443 @@ export function setupWebSocket(server) {
116
115
  }
117
116
  const neurolink = new NeuroLink();
118
117
  wss.on("connection", (clientWs) => {
119
- logger.info("[WS] Client connected");
120
- // --- Per-session Cobra instance ---
121
- let cobra = null;
122
- let FRAME_LENGTH = 512;
123
- let FRAME_BYTES = FRAME_LENGTH * 2;
124
- try {
125
- cobra = new Cobra(accessKey);
126
- FRAME_LENGTH = cobra.frameLength;
127
- FRAME_BYTES = FRAME_LENGTH * 2;
128
- logger.info(`[VAD] Cobra ready (frameLength=${FRAME_LENGTH})`);
129
- }
130
- catch (err) {
131
- logger.error("[VAD] Cobra init failed:", err);
132
- clientWs.close();
133
- return;
134
- }
135
- // --- Per-session state ---
136
- const bus = new FrameBus();
137
- const turnManager = new TurnManager(bus);
138
- let sonioxWs = null;
139
- let keepAliveTimer = null;
140
- let sessionClosed = false;
141
- let transcriptBuffer = "";
142
- let activeTTS = null;
143
- const conversation = [];
144
- let currentTurnId = 0;
145
- let activePipelineTurnId = null;
146
- // Safety fallback: if the client never sends playback_done (crash, network drop),
147
- // auto-reset the turn state after this many ms so the assistant isn't stuck.
148
- let playbackResetTimer = null;
149
- // Timestamp (ms) before which barge-in via Soniox is suppressed.
150
- // Set when TTS starts playing to prevent TTS echo from triggering immediate re-interrupt.
151
- // AEC on the browser needs ~300-400ms to characterise the echo signal before suppressing it.
152
- let bargeInLockedUntil = 0;
153
- // Cobra VAD state
154
- let isSpeaking = false;
155
- let silenceFrameCount = 0;
156
- let voiceFrameCount = 0;
157
- let frameRemainder = Buffer.alloc(0);
158
- /* ======= INTERRUPT ======= */
159
- function closeTts(stream, reason) {
160
- if (!stream) {
161
- return;
162
- }
118
+ void (async () => {
119
+ logger.info("[WS] Client connected");
120
+ // --- Per-session Cobra instance ---
121
+ let cobra = null;
122
+ let FRAME_LENGTH = 512;
123
+ let FRAME_BYTES = FRAME_LENGTH * 2;
163
124
  try {
164
- // Close the WS first so that any pending done/error/close listeners
165
- // in processTurn() can settle immediately, rather than hanging until
166
- // the withTimeout fires.
167
- stream.close();
168
- stream.removeAllListeners();
169
- }
170
- catch (error) {
171
- logger.warn(reason, error);
172
- }
173
- }
174
- function doInterrupt() {
175
- logger.info("[INTERRUPT] Cutting TTS");
176
- if (playbackResetTimer) {
177
- clearTimeout(playbackResetTimer);
178
- playbackResetTimer = null;
179
- }
180
- bargeInLockedUntil = 0;
181
- currentTurnId++;
182
- activePipelineTurnId = null;
183
- transcriptBuffer = "";
184
- isSpeaking = false;
185
- silenceFrameCount = 0;
186
- voiceFrameCount = 0;
187
- if (activeTTS) {
188
- closeTts(activeTTS, "[INTERRUPT] Failed to close active TTS stream");
189
- activeTTS = null;
190
- }
191
- turnManager.reset();
192
- if (clientWs.readyState === WebSocket.OPEN) {
193
- clientWs.send(JSON.stringify({ type: "interrupt" }));
194
- }
195
- }
196
- /* ======= SONIOX ======= */
197
- function connectSoniox() {
198
- const ws = new WebSocket(SONIOX_URL);
199
- sonioxWs = ws;
200
- ws.on("open", () => {
201
- logger.info("[SONIOX] Connected");
202
- ws.send(JSON.stringify({
203
- api_key: getSonioxApiKey(),
204
- model: "stt-rt-preview",
205
- audio_format: "auto",
206
- language_hints: ["en"],
207
- enable_endpoint_detection: true,
208
- }));
209
- ws.send(makeWavHeader(16000, 1));
210
- startKeepAlive();
211
- });
212
- ws.on("message", handleSonioxMessage);
213
- ws.on("close", (code, reason) => {
214
- logger.info(`[SONIOX] Closed: code=${code} reason=${reason.toString() || "(none)"}`);
215
- stopKeepAlive();
216
- if (!sessionClosed) {
217
- setTimeout(() => {
218
- connectSoniox();
219
- }, 500);
220
- }
221
- });
222
- ws.on("error", (err) => {
223
- logger.error("[SONIOX] Error:", err.message);
224
- });
225
- }
226
- function startKeepAlive() {
227
- keepAliveTimer = setInterval(() => {
228
- if (sonioxWs?.readyState === WebSocket.OPEN) {
229
- sonioxWs.send(JSON.stringify({ type: "keepalive" }));
125
+ let mod;
126
+ try {
127
+ mod = (await import(
128
+ /* @vite-ignore */ "@picovoice/cobra-node"));
230
129
  }
231
- }, 8000);
232
- }
233
- function stopKeepAlive() {
234
- if (keepAliveTimer) {
235
- clearInterval(keepAliveTimer);
236
- keepAliveTimer = null;
237
- }
238
- }
239
- /* ======= STT HANDLER ======= */
240
- async function handleSonioxMessage(msg) {
241
- const data = parseSonioxMessage(msg);
242
- if (!data) {
243
- return;
244
- }
245
- if (!Array.isArray(data.tokens)) {
246
- if (data.error || data.status || data.type) {
247
- if (logger.shouldLog("debug")) {
248
- logger.info("[SONIOX] msg:", JSON.stringify(data));
130
+ catch (err) {
131
+ const e = err instanceof Error ? err : null;
132
+ if (e?.code === "ERR_MODULE_NOT_FOUND" &&
133
+ e.message.includes("cobra-node")) {
134
+ throw new Error('Voice activity detection requires "@picovoice/cobra-node". Install it with:\n pnpm add @picovoice/cobra-node', { cause: err });
249
135
  }
136
+ throw err;
250
137
  }
138
+ cobra = new mod.Cobra(accessKey);
139
+ FRAME_LENGTH = cobra.frameLength;
140
+ FRAME_BYTES = FRAME_LENGTH * 2;
141
+ logger.info(`[VAD] Cobra ready (frameLength=${FRAME_LENGTH})`);
142
+ }
143
+ catch (err) {
144
+ logger.error("[VAD] Cobra init failed:", err);
145
+ clientWs.close();
251
146
  return;
252
147
  }
253
- const tokens = data.tokens;
254
- // Barge-in detection:
255
- // Soniox non-final tokens = real speech is being recognised right now.
256
- // Browser AEC (echo cancellation) suppresses TTS playback at the mic, so
257
- // non-final tokens can only come from the user's own voice — unlike raw
258
- // Cobra probability which can be fooled by speaker echo.
259
- // We only fire interrupt when the TurnManager confirms TTS is actually
260
- // playing (ASSISTANT_SPEAKING state set by processTurn).
261
- // bargeInLockedUntil suppresses the first ~400ms after TTS starts so that
262
- // TTS audio picked up by the mic (before AEC locks on) can't re-trigger.
263
- if (turnManager.state === TurnState.ASSISTANT_SPEAKING &&
264
- Date.now() > bargeInLockedUntil) {
265
- const speechPartials = tokens.filter((token) => !token.is_final && token.text && token.text.trim().length > 1);
266
- if (speechPartials.length > 0) {
267
- logger.info(`[BARGE-IN] Detected via Soniox: "${speechPartials.map((token) => token.text).join("")}"`);
268
- doInterrupt();
148
+ // --- Per-session state ---
149
+ const bus = new FrameBus();
150
+ const turnManager = new TurnManager(bus);
151
+ let sonioxWs = null;
152
+ let keepAliveTimer = null;
153
+ let sessionClosed = false;
154
+ let transcriptBuffer = "";
155
+ let activeTTS = null;
156
+ const conversation = [];
157
+ let currentTurnId = 0;
158
+ let activePipelineTurnId = null;
159
+ // Safety fallback: if the client never sends playback_done (crash, network drop),
160
+ // auto-reset the turn state after this many ms so the assistant isn't stuck.
161
+ let playbackResetTimer = null;
162
+ // Timestamp (ms) before which barge-in via Soniox is suppressed.
163
+ // Set when TTS starts playing to prevent TTS echo from triggering immediate re-interrupt.
164
+ // AEC on the browser needs ~300-400ms to characterise the echo signal before suppressing it.
165
+ let bargeInLockedUntil = 0;
166
+ // Cobra VAD state
167
+ let isSpeaking = false;
168
+ let silenceFrameCount = 0;
169
+ let voiceFrameCount = 0;
170
+ let frameRemainder = Buffer.alloc(0);
171
+ /* ======= INTERRUPT ======= */
172
+ function closeTts(stream, reason) {
173
+ if (!stream) {
269
174
  return;
270
175
  }
176
+ try {
177
+ // Close the WS first so that any pending done/error/close listeners
178
+ // in processTurn() can settle immediately, rather than hanging until
179
+ // the withTimeout fires.
180
+ stream.close();
181
+ stream.removeAllListeners();
182
+ }
183
+ catch (error) {
184
+ logger.warn(reason, error);
185
+ }
271
186
  }
272
- const finals = tokens.filter((token) => token.is_final && token.text);
273
- if (!finals.length) {
274
- return;
275
- }
276
- transcriptBuffer += finals.map((token) => token.text).join("");
277
- const hasEnd = finals.some((token) => token.text === "<end>");
278
- if (!hasEnd) {
279
- return;
280
- }
281
- const finalText = transcriptBuffer.replace("<end>", "").trim();
282
- transcriptBuffer = "";
283
- if (!finalText) {
284
- return;
187
+ function doInterrupt() {
188
+ logger.info("[INTERRUPT] Cutting TTS");
189
+ if (playbackResetTimer) {
190
+ clearTimeout(playbackResetTimer);
191
+ playbackResetTimer = null;
192
+ }
193
+ bargeInLockedUntil = 0;
194
+ currentTurnId++;
195
+ activePipelineTurnId = null;
196
+ transcriptBuffer = "";
197
+ isSpeaking = false;
198
+ silenceFrameCount = 0;
199
+ voiceFrameCount = 0;
200
+ if (activeTTS) {
201
+ closeTts(activeTTS, "[INTERRUPT] Failed to close active TTS stream");
202
+ activeTTS = null;
203
+ }
204
+ turnManager.reset();
205
+ if (clientWs.readyState === WebSocket.OPEN) {
206
+ clientWs.send(JSON.stringify({ type: "interrupt" }));
207
+ }
285
208
  }
286
- logger.info("[STT] Final ->", finalText);
287
- try {
288
- await processTurn(finalText);
209
+ /* ======= SONIOX ======= */
210
+ function connectSoniox() {
211
+ const ws = new WebSocket(SONIOX_URL);
212
+ sonioxWs = ws;
213
+ ws.on("open", () => {
214
+ logger.info("[SONIOX] Connected");
215
+ ws.send(JSON.stringify({
216
+ api_key: getSonioxApiKey(),
217
+ model: "stt-rt-preview",
218
+ audio_format: "auto",
219
+ language_hints: ["en"],
220
+ enable_endpoint_detection: true,
221
+ }));
222
+ ws.send(makeWavHeader(16000, 1));
223
+ startKeepAlive();
224
+ });
225
+ ws.on("message", handleSonioxMessage);
226
+ ws.on("close", (code, reason) => {
227
+ logger.info(`[SONIOX] Closed: code=${code} reason=${reason.toString() || "(none)"}`);
228
+ stopKeepAlive();
229
+ if (!sessionClosed) {
230
+ setTimeout(() => {
231
+ connectSoniox();
232
+ }, 500);
233
+ }
234
+ });
235
+ ws.on("error", (err) => {
236
+ logger.error("[SONIOX] Error:", err.message);
237
+ });
289
238
  }
290
- catch (err) {
291
- logger.error("[PIPELINE] Unhandled error in processTurn:", err.message);
292
- turnManager.reset();
239
+ function startKeepAlive() {
240
+ keepAliveTimer = setInterval(() => {
241
+ if (sonioxWs?.readyState === WebSocket.OPEN) {
242
+ sonioxWs.send(JSON.stringify({ type: "keepalive" }));
243
+ }
244
+ }, 8000);
293
245
  }
294
- }
295
- /* ======= TURN PROCESSOR ======= */
296
- async function processTurn(userText) {
297
- if (activePipelineTurnId !== null) {
298
- logger.info("[PIPELINE] Already running — discarding duplicate STT final");
299
- return;
246
+ function stopKeepAlive() {
247
+ if (keepAliveTimer) {
248
+ clearInterval(keepAliveTimer);
249
+ keepAliveTimer = null;
250
+ }
300
251
  }
301
- currentTurnId++;
302
- const myTurn = currentTurnId;
303
- activePipelineTurnId = myTurn;
304
- const tSttEnd = now();
305
- try {
306
- // Build context without mutating `conversation` — only commit on full completion.
307
- const stream = await streamAnswer(neurolink, [
308
- ...conversation,
309
- { role: "user", content: userText },
310
- ]);
311
- if (myTurn !== currentTurnId) {
252
+ /* ======= STT HANDLER ======= */
253
+ async function handleSonioxMessage(msg) {
254
+ const data = parseSonioxMessage(msg);
255
+ if (!data) {
312
256
  return;
313
257
  }
314
- const tts = new CartesiaStream(`turn-${Date.now()}`);
315
- activeTTS = tts;
316
- await tts.ready();
317
- if (myTurn !== currentTurnId) {
258
+ if (!Array.isArray(data.tokens)) {
259
+ if (data.error || data.status || data.type) {
260
+ if (logger.shouldLog("debug")) {
261
+ logger.info("[SONIOX] msg:", JSON.stringify(data));
262
+ }
263
+ }
318
264
  return;
319
265
  }
320
- // Register error handler immediately after ready() — before the LLM stream loop —
321
- // so Cartesia errors emitted mid-stream (during token sending) are captured.
322
- // Without this, errors during the for-await loop have no listener and are swallowed.
323
- let ttsError = null;
324
- tts.on("error", (err) => {
325
- ttsError = err;
326
- logger.error("[TTS] Mid-stream error:", err.message);
327
- });
328
- // Pre-lock barge-in BEFORE signaling assistant speaking.
329
- // Without this there is a ~700-1000ms gap where TurnState is ASSISTANT_SPEAKING
330
- // but bargeInLockedUntil=0, so Soniox residual tokens from the previous TTS echo
331
- // immediately trigger an interrupt before any audio has even been sent.
332
- bargeInLockedUntil = Date.now() + 1000;
333
- // Signal TurnManager that TTS is about to play — barge-in detection is now live.
334
- turnManager.assistantSpeaking();
335
- let firstAudioSent = false;
336
- let assistantReply = "";
337
- let tokenBuffer = "";
338
- // Sentence/phrase boundaries to flush on — avoids flooding Cartesia with
339
- // one tiny message per token, which causes "Service unavailable" errors on
340
- // long responses. We flush when we hit natural speech breaks or the buffer
341
- // grows large enough to produce a clean TTS chunk.
342
- const FLUSH_REGEX = /[.!?,;:]\s/;
343
- const FLUSH_MIN_LENGTH = 80;
344
- tts.on("audio", (audio) => {
345
- if (myTurn !== currentTurnId) {
266
+ const tokens = data.tokens;
267
+ // Barge-in detection:
268
+ // Soniox non-final tokens = real speech is being recognised right now.
269
+ // Browser AEC (echo cancellation) suppresses TTS playback at the mic, so
270
+ // non-final tokens can only come from the user's own voice — unlike raw
271
+ // Cobra probability which can be fooled by speaker echo.
272
+ // We only fire interrupt when the TurnManager confirms TTS is actually
273
+ // playing (ASSISTANT_SPEAKING state set by processTurn).
274
+ // bargeInLockedUntil suppresses the first ~400ms after TTS starts so that
275
+ // TTS audio picked up by the mic (before AEC locks on) can't re-trigger.
276
+ if (turnManager.state === TurnState.ASSISTANT_SPEAKING &&
277
+ Date.now() > bargeInLockedUntil) {
278
+ const speechPartials = tokens.filter((token) => !token.is_final && token.text && token.text.trim().length > 1);
279
+ if (speechPartials.length > 0) {
280
+ logger.info(`[BARGE-IN] Detected via Soniox: "${speechPartials.map((token) => token.text).join("")}"`);
281
+ doInterrupt();
346
282
  return;
347
283
  }
348
- if (!firstAudioSent) {
349
- firstAudioSent = true;
350
- // Refresh the lock from when audio ACTUALLY hits the client so it covers
351
- // the AEC lock-on window (~300-400ms for browser echo cancellation).
352
- // This extends the protection past the initial 1000ms pre-lock.
353
- bargeInLockedUntil = Date.now() + 400;
354
- logger.info(`[LATENCY] STT -> First Audio: ${(now() - tSttEnd).toFixed(0)}ms`);
355
- }
356
- if (clientWs.readyState === WebSocket.OPEN) {
357
- clientWs.send(audio);
358
- }
359
- });
360
- for await (const chunk of stream) {
361
- if (myTurn !== currentTurnId) {
362
- logger.info("[PIPELINE] Stale LLM stream — dropping");
363
- break;
364
- }
365
- // If Cartesia errored mid-stream, abort sending more tokens.
366
- if (ttsError) {
367
- logger.info("[PIPELINE] Aborting LLM stream — Cartesia error");
368
- break;
369
- }
370
- if (!chunk || typeof chunk !== "object" || !("content" in chunk)) {
371
- continue;
372
- }
373
- if (typeof chunk.content !== "string") {
374
- continue;
375
- }
376
- assistantReply += chunk.content;
377
- tokenBuffer += chunk.content;
378
- // Flush buffer to Cartesia at sentence/phrase boundaries or when it's
379
- // grown large enough. This batches tokens into meaningful speech chunks
380
- // instead of sending one WebSocket message per token.
381
- if (FLUSH_REGEX.test(tokenBuffer) ||
382
- tokenBuffer.length >= FLUSH_MIN_LENGTH) {
383
- tts.send(tokenBuffer, true);
384
- tokenBuffer = "";
385
- }
386
284
  }
387
- // Flush any remaining buffered tokens before the final flush().
388
- if (tokenBuffer) {
389
- tts.send(tokenBuffer, true);
390
- tokenBuffer = "";
285
+ const finals = tokens.filter((token) => token.is_final && token.text);
286
+ if (!finals.length) {
287
+ return;
391
288
  }
392
- // If Cartesia errored during the stream, reset and bail out now.
393
- if (ttsError) {
394
- logger.error("[TTS] Error during stream — resetting turn so user can retry:", String(ttsError));
395
- closeTts(tts, "[TTS] Failed to close stream after mid-stream error");
396
- turnManager.reset();
289
+ transcriptBuffer += finals.map((token) => token.text).join("");
290
+ const hasEnd = finals.some((token) => token.text === "<end>");
291
+ if (!hasEnd) {
397
292
  return;
398
293
  }
399
- if (myTurn !== currentTurnId) {
294
+ const finalText = transcriptBuffer.replace("<end>", "").trim();
295
+ transcriptBuffer = "";
296
+ if (!finalText) {
400
297
  return;
401
298
  }
402
- let ttsSucceeded = false;
299
+ logger.info("[STT] Final ->", finalText);
403
300
  try {
404
- await withTimeout(new Promise((resolve, reject) => {
405
- tts.once("done", () => {
406
- ttsSucceeded = true;
407
- resolve();
408
- });
409
- // Re-use the persistent error handler: if another error arrives during flush,
410
- // the existing "error" listener fires ttsError; reject via a one-time wrapper.
411
- tts.once("error", reject);
412
- // Reject if the socket closes without emitting done or error.
413
- tts.once("close", () => reject(new Error("Cartesia WS closed before flush completed")));
414
- tts.flush();
415
- }), 10000, "Cartesia flush timed out");
301
+ await processTurn(finalText);
416
302
  }
417
303
  catch (err) {
418
- // Cartesia failed (e.g. "Service unavailable"). The user heard nothing.
419
- // Reset state immediately so they can speak and retry — don't commit
420
- // the turn to conversation history since it was never heard.
421
- logger.error("[TTS] Error during flush — resetting turn so user can retry:", err.message);
422
- closeTts(tts, "[TTS] Failed to close stream after flush error");
304
+ logger.error("[PIPELINE] Unhandled error in processTurn:", err.message);
423
305
  turnManager.reset();
424
- return;
425
306
  }
426
- closeTts(tts, "[TTS] Failed to close stream after successful playback");
427
- if (!ttsSucceeded || myTurn !== currentTurnId) {
428
- return;
429
- }
430
- // Only commit conversation when the turn completed fully and was heard.
431
- conversation.push({ role: "user", content: userText });
432
- conversation.push({ role: "assistant", content: assistantReply });
433
- // Do NOT reset state here — the client is still playing buffered audio.
434
- // The client sends playback_done when the last audio chunk finishes playing,
435
- // which is the correct moment to return to IDLE and allow new user speech.
436
- // Safety fallback: if the client never sends playback_done (crash, disconnect),
437
- // auto-reset after 20 seconds so the assistant doesn't stay stuck.
438
- if (playbackResetTimer) {
439
- clearTimeout(playbackResetTimer);
440
- }
441
- playbackResetTimer = setTimeout(() => {
442
- playbackResetTimer = null;
443
- turnManager.reset();
444
- }, 20000);
445
307
  }
446
- finally {
447
- if (activePipelineTurnId === myTurn) {
448
- activePipelineTurnId = null;
308
+ /* ======= TURN PROCESSOR ======= */
309
+ async function processTurn(userText) {
310
+ if (activePipelineTurnId !== null) {
311
+ logger.info("[PIPELINE] Already running — discarding duplicate STT final");
312
+ return;
449
313
  }
450
- }
451
- }
452
- /* ======= CLIENT AUDIO + CONTROL ======= */
453
- clientWs.on("message", (data) => {
454
- if (typeof data === "string") {
455
- const msg = parseClientControlMessage(data);
456
- if (msg?.type === "playback_done") {
457
- // Client finished playing all audio — now it's safe to listen again.
314
+ currentTurnId++;
315
+ const myTurn = currentTurnId;
316
+ activePipelineTurnId = myTurn;
317
+ const tSttEnd = now();
318
+ try {
319
+ // Build context without mutating `conversation` — only commit on full completion.
320
+ const stream = await streamAnswer(neurolink, [
321
+ ...conversation,
322
+ { role: "user", content: userText },
323
+ ]);
324
+ if (myTurn !== currentTurnId) {
325
+ return;
326
+ }
327
+ const tts = new CartesiaStream(`turn-${Date.now()}`);
328
+ activeTTS = tts;
329
+ await tts.ready();
330
+ if (myTurn !== currentTurnId) {
331
+ return;
332
+ }
333
+ // Register error handler immediately after ready() — before the LLM stream loop —
334
+ // so Cartesia errors emitted mid-stream (during token sending) are captured.
335
+ // Without this, errors during the for-await loop have no listener and are swallowed.
336
+ let ttsError = null;
337
+ tts.on("error", (err) => {
338
+ ttsError = err;
339
+ logger.error("[TTS] Mid-stream error:", err.message);
340
+ });
341
+ // Pre-lock barge-in BEFORE signaling assistant speaking.
342
+ // Without this there is a ~700-1000ms gap where TurnState is ASSISTANT_SPEAKING
343
+ // but bargeInLockedUntil=0, so Soniox residual tokens from the previous TTS echo
344
+ // immediately trigger an interrupt before any audio has even been sent.
345
+ bargeInLockedUntil = Date.now() + 1000;
346
+ // Signal TurnManager that TTS is about to play — barge-in detection is now live.
347
+ turnManager.assistantSpeaking();
348
+ let firstAudioSent = false;
349
+ let assistantReply = "";
350
+ let tokenBuffer = "";
351
+ // Sentence/phrase boundaries to flush on — avoids flooding Cartesia with
352
+ // one tiny message per token, which causes "Service unavailable" errors on
353
+ // long responses. We flush when we hit natural speech breaks or the buffer
354
+ // grows large enough to produce a clean TTS chunk.
355
+ const FLUSH_REGEX = /[.!?,;:]\s/;
356
+ const FLUSH_MIN_LENGTH = 80;
357
+ tts.on("audio", (audio) => {
358
+ if (myTurn !== currentTurnId) {
359
+ return;
360
+ }
361
+ if (!firstAudioSent) {
362
+ firstAudioSent = true;
363
+ // Refresh the lock from when audio ACTUALLY hits the client so it covers
364
+ // the AEC lock-on window (~300-400ms for browser echo cancellation).
365
+ // This extends the protection past the initial 1000ms pre-lock.
366
+ bargeInLockedUntil = Date.now() + 400;
367
+ logger.info(`[LATENCY] STT -> First Audio: ${(now() - tSttEnd).toFixed(0)}ms`);
368
+ }
369
+ if (clientWs.readyState === WebSocket.OPEN) {
370
+ clientWs.send(audio);
371
+ }
372
+ });
373
+ for await (const chunk of stream) {
374
+ if (myTurn !== currentTurnId) {
375
+ logger.info("[PIPELINE] Stale LLM stream — dropping");
376
+ break;
377
+ }
378
+ // If Cartesia errored mid-stream, abort sending more tokens.
379
+ if (ttsError) {
380
+ logger.info("[PIPELINE] Aborting LLM stream — Cartesia error");
381
+ break;
382
+ }
383
+ if (!chunk || typeof chunk !== "object" || !("content" in chunk)) {
384
+ continue;
385
+ }
386
+ if (typeof chunk.content !== "string") {
387
+ continue;
388
+ }
389
+ assistantReply += chunk.content;
390
+ tokenBuffer += chunk.content;
391
+ // Flush buffer to Cartesia at sentence/phrase boundaries or when it's
392
+ // grown large enough. This batches tokens into meaningful speech chunks
393
+ // instead of sending one WebSocket message per token.
394
+ if (FLUSH_REGEX.test(tokenBuffer) ||
395
+ tokenBuffer.length >= FLUSH_MIN_LENGTH) {
396
+ tts.send(tokenBuffer, true);
397
+ tokenBuffer = "";
398
+ }
399
+ }
400
+ // Flush any remaining buffered tokens before the final flush().
401
+ if (tokenBuffer) {
402
+ tts.send(tokenBuffer, true);
403
+ tokenBuffer = "";
404
+ }
405
+ // If Cartesia errored during the stream, reset and bail out now.
406
+ if (ttsError) {
407
+ logger.error("[TTS] Error during stream — resetting turn so user can retry:", String(ttsError));
408
+ closeTts(tts, "[TTS] Failed to close stream after mid-stream error");
409
+ turnManager.reset();
410
+ return;
411
+ }
412
+ if (myTurn !== currentTurnId) {
413
+ return;
414
+ }
415
+ let ttsSucceeded = false;
416
+ try {
417
+ await withTimeout(new Promise((resolve, reject) => {
418
+ tts.once("done", () => {
419
+ ttsSucceeded = true;
420
+ resolve();
421
+ });
422
+ // Re-use the persistent error handler: if another error arrives during flush,
423
+ // the existing "error" listener fires ttsError; reject via a one-time wrapper.
424
+ tts.once("error", reject);
425
+ // Reject if the socket closes without emitting done or error.
426
+ tts.once("close", () => reject(new Error("Cartesia WS closed before flush completed")));
427
+ tts.flush();
428
+ }), 10000, "Cartesia flush timed out");
429
+ }
430
+ catch (err) {
431
+ // Cartesia failed (e.g. "Service unavailable"). The user heard nothing.
432
+ // Reset state immediately so they can speak and retry — don't commit
433
+ // the turn to conversation history since it was never heard.
434
+ logger.error("[TTS] Error during flush — resetting turn so user can retry:", err.message);
435
+ closeTts(tts, "[TTS] Failed to close stream after flush error");
436
+ turnManager.reset();
437
+ return;
438
+ }
439
+ closeTts(tts, "[TTS] Failed to close stream after successful playback");
440
+ if (!ttsSucceeded || myTurn !== currentTurnId) {
441
+ return;
442
+ }
443
+ // Only commit conversation when the turn completed fully and was heard.
444
+ conversation.push({ role: "user", content: userText });
445
+ conversation.push({ role: "assistant", content: assistantReply });
446
+ // Do NOT reset state here — the client is still playing buffered audio.
447
+ // The client sends playback_done when the last audio chunk finishes playing,
448
+ // which is the correct moment to return to IDLE and allow new user speech.
449
+ // Safety fallback: if the client never sends playback_done (crash, disconnect),
450
+ // auto-reset after 20 seconds so the assistant doesn't stay stuck.
458
451
  if (playbackResetTimer) {
459
452
  clearTimeout(playbackResetTimer);
453
+ }
454
+ playbackResetTimer = setTimeout(() => {
460
455
  playbackResetTimer = null;
456
+ turnManager.reset();
457
+ }, 20000);
458
+ }
459
+ finally {
460
+ if (activePipelineTurnId === myTurn) {
461
+ activePipelineTurnId = null;
461
462
  }
462
- turnManager.reset();
463
463
  }
464
- return;
465
- }
466
- if (!(data instanceof Buffer)) {
467
- return;
468
464
  }
469
- // Reassemble into exact FRAME_BYTES-sized Cobra frames.
470
- const combined = Buffer.concat([frameRemainder, data]);
471
- let pos = 0;
472
- while (pos + FRAME_BYTES <= combined.length) {
473
- const frame = new Int16Array(FRAME_LENGTH);
474
- for (let i = 0; i < FRAME_LENGTH; i++) {
475
- frame[i] = combined.readInt16LE(pos + i * 2);
476
- }
477
- pos += FRAME_BYTES;
478
- // Cobra VAD:
479
- // Cobra tracks when the user is speaking vs silent. Its output drives
480
- // TurnManager state (USER_SPEAKING / PROCESSING) but does NOT trigger
481
- // interrupt — that comes from Soniox non-final tokens so echo can't fool it.
482
- let voiceProb = 0;
483
- try {
484
- if (!cobra) {
485
- continue;
465
+ /* ======= CLIENT AUDIO + CONTROL ======= */
466
+ clientWs.on("message", (data) => {
467
+ if (typeof data === "string") {
468
+ const msg = parseClientControlMessage(data);
469
+ if (msg?.type === "playback_done") {
470
+ // Client finished playing all audio — now it's safe to listen again.
471
+ if (playbackResetTimer) {
472
+ clearTimeout(playbackResetTimer);
473
+ playbackResetTimer = null;
474
+ }
475
+ turnManager.reset();
486
476
  }
487
- voiceProb = cobra.process(frame);
477
+ return;
488
478
  }
489
- catch (err) {
490
- logger.error("[VAD] Cobra process error:", err);
479
+ if (!(data instanceof Buffer)) {
480
+ return;
491
481
  }
492
- const isVoice = voiceProb >= VOICE_THRESHOLD;
493
- if (isVoice) {
494
- voiceFrameCount++;
495
- silenceFrameCount = 0;
496
- if (!isSpeaking && voiceFrameCount >= VOICE_FRAMES_TO_START) {
497
- isSpeaking = true;
498
- logger.info(`[VAD] Speech start (prob=${voiceProb.toFixed(2)})`);
499
- bus.publish({ type: "vad_start" });
482
+ // Reassemble into exact FRAME_BYTES-sized Cobra frames.
483
+ const combined = Buffer.concat([frameRemainder, data]);
484
+ let pos = 0;
485
+ while (pos + FRAME_BYTES <= combined.length) {
486
+ const frame = new Int16Array(FRAME_LENGTH);
487
+ for (let i = 0; i < FRAME_LENGTH; i++) {
488
+ frame[i] = combined.readInt16LE(pos + i * 2);
500
489
  }
501
- }
502
- else {
503
- voiceFrameCount = 0;
504
- if (isSpeaking) {
505
- silenceFrameCount++;
506
- if (silenceFrameCount >= SILENCE_FRAMES_TO_STOP) {
507
- isSpeaking = false;
508
- silenceFrameCount = 0;
509
- logger.info("[VAD] Speech stop");
510
- bus.publish({ type: "vad_stop" });
490
+ pos += FRAME_BYTES;
491
+ // Cobra VAD:
492
+ // Cobra tracks when the user is speaking vs silent. Its output drives
493
+ // TurnManager state (USER_SPEAKING / PROCESSING) but does NOT trigger
494
+ // interrupt — that comes from Soniox non-final tokens so echo can't fool it.
495
+ let voiceProb = 0;
496
+ try {
497
+ if (!cobra) {
498
+ continue;
499
+ }
500
+ voiceProb = cobra.process(frame);
501
+ }
502
+ catch (err) {
503
+ logger.error("[VAD] Cobra process error:", err);
504
+ }
505
+ const isVoice = voiceProb >= VOICE_THRESHOLD;
506
+ if (isVoice) {
507
+ voiceFrameCount++;
508
+ silenceFrameCount = 0;
509
+ if (!isSpeaking && voiceFrameCount >= VOICE_FRAMES_TO_START) {
510
+ isSpeaking = true;
511
+ logger.info(`[VAD] Speech start (prob=${voiceProb.toFixed(2)})`);
512
+ bus.publish({ type: "vad_start" });
513
+ }
514
+ }
515
+ else {
516
+ voiceFrameCount = 0;
517
+ if (isSpeaking) {
518
+ silenceFrameCount++;
519
+ if (silenceFrameCount >= SILENCE_FRAMES_TO_STOP) {
520
+ isSpeaking = false;
521
+ silenceFrameCount = 0;
522
+ logger.info("[VAD] Speech stop");
523
+ bus.publish({ type: "vad_stop" });
524
+ }
511
525
  }
512
526
  }
527
+ // Always forward every frame to Soniox for continuous transcription.
528
+ if (sonioxWs?.readyState === WebSocket.OPEN) {
529
+ sonioxWs.send(Buffer.from(frame.buffer));
530
+ }
513
531
  }
514
- // Always forward every frame to Soniox for continuous transcription.
515
- if (sonioxWs?.readyState === WebSocket.OPEN) {
516
- sonioxWs.send(Buffer.from(frame.buffer));
532
+ frameRemainder = combined.subarray(pos);
533
+ });
534
+ clientWs.on("close", () => {
535
+ logger.info("[WS] Client disconnected");
536
+ sessionClosed = true;
537
+ if (cobra) {
538
+ cobra.release();
517
539
  }
540
+ closeTts(activeTTS, "[WS] Failed to close active TTS on disconnect");
541
+ stopKeepAlive();
542
+ if (sonioxWs) {
543
+ sonioxWs.close();
544
+ }
545
+ });
546
+ connectSoniox();
547
+ })().catch((err) => {
548
+ logger.error("[WS] Connection handler failed:", err);
549
+ try {
550
+ clientWs.close();
518
551
  }
519
- frameRemainder = combined.subarray(pos);
520
- });
521
- clientWs.on("close", () => {
522
- logger.info("[WS] Client disconnected");
523
- sessionClosed = true;
524
- if (cobra) {
525
- cobra.release();
526
- }
527
- closeTts(activeTTS, "[WS] Failed to close active TTS on disconnect");
528
- stopKeepAlive();
529
- if (sonioxWs) {
530
- sonioxWs.close();
552
+ catch {
553
+ /* already closed */
531
554
  }
532
555
  });
533
- connectSoniox();
534
556
  });
535
557
  }