@juspay/neurolink 9.61.1 → 9.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. package/CHANGELOG.md +12 -0
  2. package/README.md +23 -17
  3. package/dist/adapters/tts/googleTTSHandler.js +1 -1
  4. package/dist/browser/neurolink.min.js +382 -364
  5. package/dist/cli/commands/serve.js +9 -0
  6. package/dist/cli/commands/voiceServer.d.ts +7 -0
  7. package/dist/cli/commands/voiceServer.js +9 -1
  8. package/dist/cli/factories/commandFactory.js +136 -11
  9. package/dist/cli/loop/optionsSchema.d.ts +1 -1
  10. package/dist/cli/utils/audioFileUtils.d.ts +3 -3
  11. package/dist/cli/utils/audioFileUtils.js +5 -1
  12. package/dist/core/baseProvider.js +29 -6
  13. package/dist/factories/providerRegistry.d.ts +14 -0
  14. package/dist/factories/providerRegistry.js +141 -2
  15. package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
  16. package/dist/lib/core/baseProvider.js +29 -6
  17. package/dist/lib/factories/providerRegistry.d.ts +14 -0
  18. package/dist/lib/factories/providerRegistry.js +141 -2
  19. package/dist/lib/mcp/toolRegistry.js +7 -1
  20. package/dist/lib/neurolink.d.ts +19 -0
  21. package/dist/lib/neurolink.js +252 -14
  22. package/dist/lib/observability/exporters/laminarExporter.js +1 -0
  23. package/dist/lib/observability/exporters/posthogExporter.js +1 -0
  24. package/dist/lib/observability/utils/spanSerializer.js +1 -0
  25. package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
  26. package/dist/lib/server/voice/tokenCompare.js +23 -0
  27. package/dist/lib/server/voice/voiceServerApp.js +62 -3
  28. package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
  29. package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
  30. package/dist/lib/types/generate.d.ts +47 -0
  31. package/dist/lib/types/hitl.d.ts +3 -0
  32. package/dist/lib/types/index.d.ts +1 -1
  33. package/dist/lib/types/index.js +1 -1
  34. package/dist/lib/types/realtime.d.ts +243 -0
  35. package/dist/lib/types/realtime.js +70 -0
  36. package/dist/lib/types/server.d.ts +68 -0
  37. package/dist/lib/types/span.d.ts +2 -0
  38. package/dist/lib/types/span.js +2 -0
  39. package/dist/lib/types/stream.d.ts +36 -14
  40. package/dist/lib/types/stt.d.ts +585 -0
  41. package/dist/lib/types/stt.js +90 -0
  42. package/dist/lib/types/tools.d.ts +2 -0
  43. package/dist/lib/types/tts.d.ts +23 -11
  44. package/dist/lib/types/tts.js +7 -0
  45. package/dist/lib/types/voice.d.ts +272 -0
  46. package/dist/lib/types/voice.js +137 -0
  47. package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
  48. package/dist/lib/utils/audioFormatDetector.js +34 -0
  49. package/dist/lib/utils/errorHandling.js +4 -0
  50. package/dist/lib/utils/sttProcessor.d.ts +115 -0
  51. package/dist/lib/utils/sttProcessor.js +295 -0
  52. package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
  53. package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
  54. package/dist/lib/voice/audio-utils.d.ts +135 -0
  55. package/dist/lib/voice/audio-utils.js +435 -0
  56. package/dist/lib/voice/errors.d.ts +123 -0
  57. package/dist/lib/voice/errors.js +386 -0
  58. package/dist/lib/voice/index.d.ts +26 -0
  59. package/dist/lib/voice/index.js +55 -0
  60. package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
  61. package/dist/lib/voice/providers/AzureSTT.js +345 -0
  62. package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
  63. package/dist/lib/voice/providers/AzureTTS.js +349 -0
  64. package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
  65. package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
  66. package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
  67. package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
  68. package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
  69. package/dist/lib/voice/providers/GeminiLive.js +372 -0
  70. package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
  71. package/dist/lib/voice/providers/GoogleSTT.js +454 -0
  72. package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
  73. package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
  74. package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
  75. package/dist/lib/voice/providers/OpenAISTT.js +286 -0
  76. package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
  77. package/dist/lib/voice/providers/OpenAITTS.js +271 -0
  78. package/dist/lib/voice/stream-handler.d.ts +166 -0
  79. package/dist/lib/voice/stream-handler.js +514 -0
  80. package/dist/mcp/toolRegistry.js +7 -1
  81. package/dist/neurolink.d.ts +19 -0
  82. package/dist/neurolink.js +252 -14
  83. package/dist/observability/exporters/laminarExporter.js +1 -0
  84. package/dist/observability/exporters/posthogExporter.js +1 -0
  85. package/dist/observability/utils/spanSerializer.js +1 -0
  86. package/dist/server/voice/tokenCompare.d.ts +14 -0
  87. package/dist/server/voice/tokenCompare.js +22 -0
  88. package/dist/server/voice/voiceServerApp.js +62 -3
  89. package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
  90. package/dist/server/voice/voiceWebSocketHandler.js +555 -435
  91. package/dist/types/generate.d.ts +47 -0
  92. package/dist/types/hitl.d.ts +3 -0
  93. package/dist/types/index.d.ts +1 -1
  94. package/dist/types/index.js +1 -1
  95. package/dist/types/realtime.d.ts +243 -0
  96. package/dist/types/realtime.js +69 -0
  97. package/dist/types/server.d.ts +68 -0
  98. package/dist/types/span.d.ts +2 -0
  99. package/dist/types/span.js +2 -0
  100. package/dist/types/stream.d.ts +36 -14
  101. package/dist/types/stt.d.ts +585 -0
  102. package/dist/types/stt.js +89 -0
  103. package/dist/types/tools.d.ts +2 -0
  104. package/dist/types/tts.d.ts +23 -11
  105. package/dist/types/tts.js +7 -0
  106. package/dist/types/voice.d.ts +272 -0
  107. package/dist/types/voice.js +136 -0
  108. package/dist/utils/audioFormatDetector.d.ts +15 -0
  109. package/dist/utils/audioFormatDetector.js +33 -0
  110. package/dist/utils/errorHandling.js +4 -0
  111. package/dist/utils/sttProcessor.d.ts +115 -0
  112. package/dist/utils/sttProcessor.js +294 -0
  113. package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
  114. package/dist/voice/RealtimeVoiceAPI.js +438 -0
  115. package/dist/voice/audio-utils.d.ts +135 -0
  116. package/dist/voice/audio-utils.js +434 -0
  117. package/dist/voice/errors.d.ts +123 -0
  118. package/dist/voice/errors.js +385 -0
  119. package/dist/voice/index.d.ts +26 -0
  120. package/dist/voice/index.js +54 -0
  121. package/dist/voice/providers/AzureSTT.d.ts +47 -0
  122. package/dist/voice/providers/AzureSTT.js +344 -0
  123. package/dist/voice/providers/AzureTTS.d.ts +59 -0
  124. package/dist/voice/providers/AzureTTS.js +348 -0
  125. package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
  126. package/dist/voice/providers/DeepgramSTT.js +549 -0
  127. package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
  128. package/dist/voice/providers/ElevenLabsTTS.js +310 -0
  129. package/dist/voice/providers/GeminiLive.d.ts +52 -0
  130. package/dist/voice/providers/GeminiLive.js +371 -0
  131. package/dist/voice/providers/GoogleSTT.d.ts +60 -0
  132. package/dist/voice/providers/GoogleSTT.js +453 -0
  133. package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
  134. package/dist/voice/providers/OpenAIRealtime.js +411 -0
  135. package/dist/voice/providers/OpenAISTT.d.ts +41 -0
  136. package/dist/voice/providers/OpenAISTT.js +285 -0
  137. package/dist/voice/providers/OpenAITTS.d.ts +49 -0
  138. package/dist/voice/providers/OpenAITTS.js +270 -0
  139. package/dist/voice/stream-handler.d.ts +166 -0
  140. package/dist/voice/stream-handler.js +513 -0
  141. package/package.json +5 -2
@@ -1,10 +1,25 @@
1
1
  import WebSocket, { WebSocketServer } from "ws";
2
2
  import { FrameBus } from "./frameBus.js";
3
3
  import { TurnManager, TurnState } from "./turnManager.js";
4
+ import { timingSafeEqualString } from "./tokenCompare.js";
4
5
  import { CartesiaStream } from "../../adapters/tts/cartesiaHandler.js";
5
6
  import { NeuroLink } from "../../neurolink.js";
6
7
  import { logger } from "../../utils/logger.js";
7
8
  import { withTimeout } from "../../utils/async/withTimeout.js";
9
+ async function loadCobra(accessKey) {
10
+ try {
11
+ const mod = (await import(/* @vite-ignore */ "@picovoice/cobra-node"));
12
+ return new mod.Cobra(accessKey);
13
+ }
14
+ catch (err) {
15
+ const e = err instanceof Error ? err : null;
16
+ if (e?.code === "ERR_MODULE_NOT_FOUND" &&
17
+ e.message.includes("cobra-node")) {
18
+ throw new Error('Voice activity detection requires "@picovoice/cobra-node". Install it with:\n pnpm add @picovoice/cobra-node', { cause: err });
19
+ }
20
+ throw err;
21
+ }
22
+ }
8
23
  const SONIOX_URL = process.env.SONIOX_WS_URL ?? "wss://stt-rt.soniox.com/transcribe-websocket";
9
24
  function getRequiredEnv(name) {
10
25
  const value = process.env[name];
@@ -14,13 +29,23 @@ function getRequiredEnv(name) {
14
29
  return value;
15
30
  }
16
31
  /**
17
- * Call from the voice-server command handler BEFORE importing anything else
18
- * so the env change is scoped to voice mode only.
32
+ * Voice-server-mode environment configuration.
33
+ *
34
+ * @deprecated NEW12 — this used to mutate `process.env.NEUROLINK_DISABLE_MCP_TOOLS`
35
+ * which is process-wide. That broke any embedder that called this function in
36
+ * a process which ALSO used NeuroLink for non-voice work. The disable-tools
37
+ * intent is now passed explicitly via `disableTools: true` on every NeuroLink
38
+ * `generate()` / `stream()` call inside this server (see line ~167). Calling
39
+ * this function is now a no-op kept for backwards compatibility.
19
40
  */
20
41
  export function configureVoiceServerEnvironment() {
21
- // Disable MCP tools for the voice server tools add 5-7s of init latency
22
- // on every turn and are not needed for real-time voice interaction.
23
- process.env.NEUROLINK_DISABLE_MCP_TOOLS = "true";
42
+ // No-op. The disable-tools intent is plumbed through SDK options instead
43
+ // of via process.env mutation (NEW12).
44
+ // Issue 8 fix: surface a runtime deprecation signal so external callers
45
+ // know their call has no effect — silent no-ops are a footgun.
46
+ logger.warn("[deprecation] configureVoiceServerEnvironment() is a no-op as of NEW12. " +
47
+ "Pass `disableTools: true` via SDK options on each generate()/stream() " +
48
+ "call instead. This function will be removed in a future release.");
24
49
  }
25
50
  let _sonioxApiKey;
26
51
  function getSonioxApiKey() {
@@ -29,6 +54,17 @@ function getSonioxApiKey() {
29
54
  }
30
55
  return _sonioxApiKey;
31
56
  }
57
+ /**
58
+ * Returns a copy of an outbound Soniox payload with the API key redacted.
59
+ *
60
+ * Use this whenever debug logging the auth frame — never JSON.stringify the
61
+ * raw object. (C3 mitigation: prevents the Soniox API key from leaking into
62
+ * any aggregated log sink even if a future debug statement serialises the
63
+ * outbound payload.)
64
+ */
65
+ export function redactSonioxAuth(payload) {
66
+ return { ...payload, api_key: "[REDACTED]" };
67
+ }
32
68
  // How many consecutive silent Cobra frames (each 32ms) before declaring speech end.
33
69
  // 30 x 32ms = 960ms — long enough to distinguish a thinking pause from a real stop.
34
70
  const SILENCE_FRAMES_TO_STOP = 30;
@@ -107,452 +143,536 @@ async function streamAnswer(neurolink, messages, options) {
107
143
  });
108
144
  return result.stream;
109
145
  }
110
- export function setupWebSocket(server) {
111
- const wss = new WebSocketServer({ server });
112
- const accessKey = process.env.PICOVOICE_ACCESS_KEY;
113
- if (!accessKey) {
114
- throw new Error("PICOVOICE_ACCESS_KEY is not set in environment");
146
+ // CLAUDE.md Rule 2: ServerVoiceConnectionCtx + ServerVoiceSessionState live
147
+ // in src/lib/types/server.ts and are imported via the barrel above.
148
+ function createVerifyClient(authToken) {
149
+ return (info, cb) => {
150
+ if (!authToken) {
151
+ cb(true);
152
+ return;
153
+ }
154
+ const header = info.req.headers["authorization"];
155
+ const headerToken = typeof header === "string" && header.startsWith("Bearer ")
156
+ ? header.slice(7)
157
+ : undefined;
158
+ let urlToken;
159
+ try {
160
+ const url = new URL(info.req.url ?? "/", "http://localhost");
161
+ urlToken = url.searchParams.get("token") ?? undefined;
162
+ }
163
+ catch {
164
+ // Malformed URL — reject below.
165
+ }
166
+ const provided = headerToken ?? urlToken;
167
+ // Bug 2 mitigation: constant-time compare prevents the WS auth gate
168
+ // from leaking token length / prefix when the server is exposed via
169
+ // VOICE_SERVER_ALLOW_PUBLIC=1.
170
+ if (!provided || !timingSafeEqualString(provided, authToken)) {
171
+ cb(false, 401, "Unauthorized");
172
+ return;
173
+ }
174
+ cb(true);
175
+ };
176
+ }
177
+ function closeTts(stream, reason) {
178
+ if (!stream) {
179
+ return;
115
180
  }
116
- const neurolink = new NeuroLink();
117
- wss.on("connection", (clientWs) => {
118
- void (async () => {
119
- logger.info("[WS] Client connected");
120
- // --- Per-session Cobra instance ---
121
- let cobra = null;
122
- let FRAME_LENGTH = 512;
123
- let FRAME_BYTES = FRAME_LENGTH * 2;
124
- try {
125
- let mod;
126
- try {
127
- mod = (await import(
128
- /* @vite-ignore */ "@picovoice/cobra-node"));
129
- }
130
- catch (err) {
131
- const e = err instanceof Error ? err : null;
132
- if (e?.code === "ERR_MODULE_NOT_FOUND" &&
133
- e.message.includes("cobra-node")) {
134
- throw new Error('Voice activity detection requires "@picovoice/cobra-node". Install it with:\n pnpm add @picovoice/cobra-node', { cause: err });
135
- }
136
- throw err;
137
- }
138
- cobra = new mod.Cobra(accessKey);
139
- FRAME_LENGTH = cobra.frameLength;
140
- FRAME_BYTES = FRAME_LENGTH * 2;
141
- logger.info(`[VAD] Cobra ready (frameLength=${FRAME_LENGTH})`);
142
- }
143
- catch (err) {
144
- logger.error("[VAD] Cobra init failed:", err);
145
- clientWs.close();
181
+ try {
182
+ // Close the WS first so that any pending done/error/close listeners
183
+ // in processTurn() can settle immediately, rather than hanging until
184
+ // the withTimeout fires.
185
+ stream.close();
186
+ stream.removeAllListeners();
187
+ }
188
+ catch (error) {
189
+ logger.warn(reason, error);
190
+ }
191
+ }
192
+ async function processTurn(userText, clientWs, neurolink, s) {
193
+ if (s.activePipelineTurnId !== null) {
194
+ logger.info("[PIPELINE] Already running — discarding duplicate STT final");
195
+ return;
196
+ }
197
+ s.currentTurnId++;
198
+ const myTurn = s.currentTurnId;
199
+ s.activePipelineTurnId = myTurn;
200
+ // M4: register a per-turn abort flag. doInterrupt() flips it; every
201
+ // await suspension can short-circuit via `myAbort.aborted` instead
202
+ // of relying solely on the `myTurn !== currentTurnId` surrogate.
203
+ const myAbort = { aborted: false };
204
+ s.turnAborters.add(myAbort);
205
+ const tSttEnd = now();
206
+ try {
207
+ // Hard-cap conversation history to last N turns to prevent unbounded
208
+ // growth from exceeding the LLM context window in long voice sessions.
209
+ const MAX_HISTORY_TURNS = 20;
210
+ const trimmedHistory = s.conversation.slice(-MAX_HISTORY_TURNS * 2);
211
+ // Build context without mutating `conversation` — only commit on full completion.
212
+ const stream = await streamAnswer(neurolink, [
213
+ ...trimmedHistory,
214
+ { role: "user", content: userText },
215
+ ]);
216
+ if (myAbort.aborted || myTurn !== s.currentTurnId) {
217
+ return;
218
+ }
219
+ const tts = new CartesiaStream(`turn-${Date.now()}`);
220
+ // NEW3: register the error handler BEFORE `await tts.ready()` and
221
+ // BEFORE assigning `activeTTS = tts`. Otherwise a barge-in interrupt
222
+ // landing between `activeTTS = tts` and `await tts.ready()` would
223
+ // call `closeTts(activeTTS)` on a stream whose error events have
224
+ // no listener — emitting an unhandled error.
225
+ let ttsError = null;
226
+ tts.on("error", (err) => {
227
+ ttsError = err;
228
+ logger.error("[TTS] Mid-stream error:", err.message);
229
+ });
230
+ s.activeTTS = tts;
231
+ await tts.ready();
232
+ if (myAbort.aborted || myTurn !== s.currentTurnId) {
233
+ return;
234
+ }
235
+ // Pre-lock barge-in BEFORE signaling assistant speaking.
236
+ // Without this there is a ~700-1000ms gap where TurnState is ASSISTANT_SPEAKING
237
+ // but bargeInLockedUntil=0, so Soniox residual tokens from the previous TTS echo
238
+ // immediately trigger an interrupt before any audio has even been sent.
239
+ s.bargeInLockedUntil = Date.now() + 1000;
240
+ // Signal TurnManager that TTS is about to play — barge-in detection is now live.
241
+ s.turnManager.assistantSpeaking();
242
+ let firstAudioSent = false;
243
+ let assistantReply = "";
244
+ let tokenBuffer = "";
245
+ // Sentence/phrase boundaries to flush on — avoids flooding Cartesia with
246
+ // one tiny message per token, which causes "Service unavailable" errors on
247
+ // long responses. We flush when we hit natural speech breaks or the buffer
248
+ // grows large enough to produce a clean TTS chunk.
249
+ const FLUSH_REGEX = /[.!?,;:]\s/;
250
+ const FLUSH_MIN_LENGTH = 80;
251
+ tts.on("audio", (audio) => {
252
+ if (myAbort.aborted || myTurn !== s.currentTurnId) {
146
253
  return;
147
254
  }
148
- // --- Per-session state ---
149
- const bus = new FrameBus();
150
- const turnManager = new TurnManager(bus);
151
- let sonioxWs = null;
152
- let keepAliveTimer = null;
153
- let sessionClosed = false;
154
- let transcriptBuffer = "";
155
- let activeTTS = null;
156
- const conversation = [];
157
- let currentTurnId = 0;
158
- let activePipelineTurnId = null;
159
- // Safety fallback: if the client never sends playback_done (crash, network drop),
160
- // auto-reset the turn state after this many ms so the assistant isn't stuck.
161
- let playbackResetTimer = null;
162
- // Timestamp (ms) before which barge-in via Soniox is suppressed.
163
- // Set when TTS starts playing to prevent TTS echo from triggering immediate re-interrupt.
164
- // AEC on the browser needs ~300-400ms to characterise the echo signal before suppressing it.
165
- let bargeInLockedUntil = 0;
166
- // Cobra VAD state
167
- let isSpeaking = false;
168
- let silenceFrameCount = 0;
169
- let voiceFrameCount = 0;
170
- let frameRemainder = Buffer.alloc(0);
171
- /* ======= INTERRUPT ======= */
172
- function closeTts(stream, reason) {
173
- if (!stream) {
174
- return;
175
- }
176
- try {
177
- // Close the WS first so that any pending done/error/close listeners
178
- // in processTurn() can settle immediately, rather than hanging until
179
- // the withTimeout fires.
180
- stream.close();
181
- stream.removeAllListeners();
182
- }
183
- catch (error) {
184
- logger.warn(reason, error);
185
- }
255
+ if (!firstAudioSent) {
256
+ firstAudioSent = true;
257
+ // Refresh the lock from when audio ACTUALLY hits the client so it covers
258
+ // the AEC lock-on window (~300-400ms for browser echo cancellation).
259
+ // This extends the protection past the initial 1000ms pre-lock.
260
+ s.bargeInLockedUntil = Date.now() + 400;
261
+ logger.info(`[LATENCY] STT -> First Audio: ${(now() - tSttEnd).toFixed(0)}ms`);
186
262
  }
187
- function doInterrupt() {
188
- logger.info("[INTERRUPT] Cutting TTS");
189
- if (playbackResetTimer) {
190
- clearTimeout(playbackResetTimer);
191
- playbackResetTimer = null;
192
- }
193
- bargeInLockedUntil = 0;
194
- currentTurnId++;
195
- activePipelineTurnId = null;
196
- transcriptBuffer = "";
197
- isSpeaking = false;
198
- silenceFrameCount = 0;
199
- voiceFrameCount = 0;
200
- if (activeTTS) {
201
- closeTts(activeTTS, "[INTERRUPT] Failed to close active TTS stream");
202
- activeTTS = null;
203
- }
204
- turnManager.reset();
205
- if (clientWs.readyState === WebSocket.OPEN) {
206
- clientWs.send(JSON.stringify({ type: "interrupt" }));
207
- }
263
+ if (clientWs.readyState === WebSocket.OPEN) {
264
+ clientWs.send(audio);
208
265
  }
209
- /* ======= SONIOX ======= */
210
- function connectSoniox() {
211
- const ws = new WebSocket(SONIOX_URL);
212
- sonioxWs = ws;
213
- ws.on("open", () => {
214
- logger.info("[SONIOX] Connected");
215
- ws.send(JSON.stringify({
216
- api_key: getSonioxApiKey(),
217
- model: "stt-rt-preview",
218
- audio_format: "auto",
219
- language_hints: ["en"],
220
- enable_endpoint_detection: true,
221
- }));
222
- ws.send(makeWavHeader(16000, 1));
223
- startKeepAlive();
224
- });
225
- ws.on("message", handleSonioxMessage);
226
- ws.on("close", (code, reason) => {
227
- logger.info(`[SONIOX] Closed: code=${code} reason=${reason.toString() || "(none)"}`);
228
- stopKeepAlive();
229
- if (!sessionClosed) {
230
- setTimeout(() => {
231
- connectSoniox();
232
- }, 500);
233
- }
234
- });
235
- ws.on("error", (err) => {
236
- logger.error("[SONIOX] Error:", err.message);
266
+ });
267
+ for await (const chunk of stream) {
268
+ if (myAbort.aborted || myTurn !== s.currentTurnId) {
269
+ logger.info("[PIPELINE] Stale LLM stream — dropping");
270
+ break;
271
+ }
272
+ // If Cartesia errored mid-stream, abort sending more tokens.
273
+ if (ttsError) {
274
+ logger.info("[PIPELINE] Aborting LLM stream — Cartesia error");
275
+ break;
276
+ }
277
+ if (!chunk || typeof chunk !== "object" || !("content" in chunk)) {
278
+ continue;
279
+ }
280
+ if (typeof chunk.content !== "string") {
281
+ continue;
282
+ }
283
+ assistantReply += chunk.content;
284
+ tokenBuffer += chunk.content;
285
+ // Flush buffer to Cartesia at sentence/phrase boundaries or when it's
286
+ // grown large enough. This batches tokens into meaningful speech chunks
287
+ // instead of sending one WebSocket message per token.
288
+ if (FLUSH_REGEX.test(tokenBuffer) ||
289
+ tokenBuffer.length >= FLUSH_MIN_LENGTH) {
290
+ tts.send(tokenBuffer, true);
291
+ tokenBuffer = "";
292
+ }
293
+ }
294
+ // Flush any remaining buffered tokens before the final flush().
295
+ if (tokenBuffer) {
296
+ tts.send(tokenBuffer, true);
297
+ tokenBuffer = "";
298
+ }
299
+ // If Cartesia errored during the stream, reset and bail out now.
300
+ if (ttsError) {
301
+ logger.error("[TTS] Error during stream — resetting turn so user can retry:", String(ttsError));
302
+ closeTts(tts, "[TTS] Failed to close stream after mid-stream error");
303
+ s.turnManager.reset();
304
+ return;
305
+ }
306
+ if (myAbort.aborted || myTurn !== s.currentTurnId) {
307
+ return;
308
+ }
309
+ let ttsSucceeded = false;
310
+ try {
311
+ await withTimeout(new Promise((resolve, reject) => {
312
+ tts.once("done", () => {
313
+ ttsSucceeded = true;
314
+ resolve();
237
315
  });
316
+ // Re-use the persistent error handler: if another error arrives during flush,
317
+ // the existing "error" listener fires ttsError; reject via a one-time wrapper.
318
+ tts.once("error", reject);
319
+ // Reject if the socket closes without emitting done or error.
320
+ tts.once("close", () => reject(new Error("Cartesia WS closed before flush completed")));
321
+ tts.flush();
322
+ }), 10000, "Cartesia flush timed out");
323
+ }
324
+ catch (err) {
325
+ // Cartesia failed (e.g. "Service unavailable"). The user heard nothing.
326
+ // Reset state immediately so they can speak and retry — don't commit
327
+ // the turn to conversation history since it was never heard.
328
+ logger.error("[TTS] Error during flush — resetting turn so user can retry:", err.message);
329
+ closeTts(tts, "[TTS] Failed to close stream after flush error");
330
+ s.turnManager.reset();
331
+ return;
332
+ }
333
+ closeTts(tts, "[TTS] Failed to close stream after successful playback");
334
+ if (!ttsSucceeded || myTurn !== s.currentTurnId) {
335
+ return;
336
+ }
337
+ // Only commit conversation when the turn completed fully and was heard.
338
+ s.conversation.push({ role: "user", content: userText });
339
+ s.conversation.push({ role: "assistant", content: assistantReply });
340
+ // Do NOT reset state here — the client is still playing buffered audio.
341
+ // The client sends playback_done when the last audio chunk finishes playing,
342
+ // which is the correct moment to return to IDLE and allow new user speech.
343
+ // Safety fallback: if the client never sends playback_done (crash, disconnect),
344
+ // auto-reset after 20 seconds so the assistant doesn't stay stuck.
345
+ if (s.playbackResetTimer) {
346
+ clearTimeout(s.playbackResetTimer);
347
+ }
348
+ s.playbackResetTimer = setTimeout(() => {
349
+ s.playbackResetTimer = null;
350
+ s.turnManager.reset();
351
+ }, 20000);
352
+ }
353
+ finally {
354
+ if (s.activePipelineTurnId === myTurn) {
355
+ s.activePipelineTurnId = null;
356
+ }
357
+ // M4: always remove our abort flag from the registry, even on
358
+ // crash. doInterrupt() may have already cleared the set, in which
359
+ // case this is a no-op.
360
+ s.turnAborters.delete(myAbort);
361
+ }
362
+ }
363
+ function handleClientBinaryAudio(data, clientWs, s) {
364
+ const buf = Buffer.isBuffer(data) ? data : Buffer.from(data);
365
+ // Reassemble into exact FRAME_BYTES-sized Cobra frames.
366
+ const combined = Buffer.concat([s.frameRemainder, buf]);
367
+ let pos = 0;
368
+ while (pos + s.FRAME_BYTES <= combined.length) {
369
+ const frame = new Int16Array(s.FRAME_LENGTH);
370
+ for (let i = 0; i < s.FRAME_LENGTH; i++) {
371
+ frame[i] = combined.readInt16LE(pos + i * 2);
372
+ }
373
+ pos += s.FRAME_BYTES;
374
+ // Cobra VAD:
375
+ // Cobra tracks when the user is speaking vs silent. Its output drives
376
+ // TurnManager state (USER_SPEAKING / PROCESSING) but does NOT trigger
377
+ // interrupt — that comes from Soniox non-final tokens so echo can't fool it.
378
+ let voiceProb = 0;
379
+ try {
380
+ if (!s.cobra) {
381
+ continue;
238
382
  }
239
- function startKeepAlive() {
240
- keepAliveTimer = setInterval(() => {
241
- if (sonioxWs?.readyState === WebSocket.OPEN) {
242
- sonioxWs.send(JSON.stringify({ type: "keepalive" }));
243
- }
244
- }, 8000);
383
+ voiceProb = s.cobra.process(frame);
384
+ }
385
+ catch (err) {
386
+ logger.error("[VAD] Cobra process error:", err);
387
+ }
388
+ const isVoice = voiceProb >= VOICE_THRESHOLD;
389
+ if (isVoice) {
390
+ s.voiceFrameCount++;
391
+ s.silenceFrameCount = 0;
392
+ if (!s.isSpeaking && s.voiceFrameCount >= VOICE_FRAMES_TO_START) {
393
+ s.isSpeaking = true;
394
+ logger.info(`[VAD] Speech start (prob=${voiceProb.toFixed(2)})`);
395
+ s.bus.publish({ type: "vad_start" });
245
396
  }
246
- function stopKeepAlive() {
247
- if (keepAliveTimer) {
248
- clearInterval(keepAliveTimer);
249
- keepAliveTimer = null;
397
+ }
398
+ else {
399
+ s.voiceFrameCount = 0;
400
+ if (s.isSpeaking) {
401
+ s.silenceFrameCount++;
402
+ if (s.silenceFrameCount >= SILENCE_FRAMES_TO_STOP) {
403
+ s.isSpeaking = false;
404
+ s.silenceFrameCount = 0;
405
+ logger.info("[VAD] Speech stop");
406
+ s.bus.publish({ type: "vad_stop" });
250
407
  }
251
408
  }
252
- /* ======= STT HANDLER ======= */
253
- async function handleSonioxMessage(msg) {
254
- const data = parseSonioxMessage(msg);
255
- if (!data) {
256
- return;
257
- }
258
- if (!Array.isArray(data.tokens)) {
259
- if (data.error || data.status || data.type) {
260
- if (logger.shouldLog("debug")) {
261
- logger.info("[SONIOX] msg:", JSON.stringify(data));
262
- }
263
- }
264
- return;
265
- }
266
- const tokens = data.tokens;
267
- // Barge-in detection:
268
- // Soniox non-final tokens = real speech is being recognised right now.
269
- // Browser AEC (echo cancellation) suppresses TTS playback at the mic, so
270
- // non-final tokens can only come from the user's own voice — unlike raw
271
- // Cobra probability which can be fooled by speaker echo.
272
- // We only fire interrupt when the TurnManager confirms TTS is actually
273
- // playing (ASSISTANT_SPEAKING state set by processTurn).
274
- // bargeInLockedUntil suppresses the first ~400ms after TTS starts so that
275
- // TTS audio picked up by the mic (before AEC locks on) can't re-trigger.
276
- if (turnManager.state === TurnState.ASSISTANT_SPEAKING &&
277
- Date.now() > bargeInLockedUntil) {
278
- const speechPartials = tokens.filter((token) => !token.is_final && token.text && token.text.trim().length > 1);
279
- if (speechPartials.length > 0) {
280
- logger.info(`[BARGE-IN] Detected via Soniox: "${speechPartials.map((token) => token.text).join("")}"`);
281
- doInterrupt();
282
- return;
283
- }
284
- }
285
- const finals = tokens.filter((token) => token.is_final && token.text);
286
- if (!finals.length) {
287
- return;
288
- }
289
- transcriptBuffer += finals.map((token) => token.text).join("");
290
- const hasEnd = finals.some((token) => token.text === "<end>");
291
- if (!hasEnd) {
292
- return;
293
- }
294
- const finalText = transcriptBuffer.replace("<end>", "").trim();
295
- transcriptBuffer = "";
296
- if (!finalText) {
297
- return;
298
- }
299
- logger.info("[STT] Final ->", finalText);
300
- try {
301
- await processTurn(finalText);
302
- }
303
- catch (err) {
304
- logger.error("[PIPELINE] Unhandled error in processTurn:", err.message);
305
- turnManager.reset();
306
- }
409
+ }
410
+ // Always forward every frame to Soniox for continuous transcription.
411
+ if (s.sonioxWs?.readyState === WebSocket.OPEN) {
412
+ s.sonioxWs.send(Buffer.from(frame.buffer));
413
+ }
414
+ }
415
+ s.frameRemainder = combined.subarray(pos);
416
+ }
417
+ async function handleVoiceConnection(clientWs, ctx) {
418
+ const { neurolink, accessKey } = ctx;
419
+ logger.info("[WS] Client connected");
420
+ // --- Per-session Cobra instance ---
421
+ // Use definite-assignment via early return on catch — avoids dead initial
422
+ // values that ESLint flags as `no-useless-assignment` and matches the
423
+ // intent: if cobra init fails, the connection cannot proceed.
424
+ let cobra;
425
+ let FRAME_LENGTH;
426
+ try {
427
+ cobra = await loadCobra(accessKey);
428
+ FRAME_LENGTH = cobra.frameLength;
429
+ logger.info(`[VAD] Cobra ready (frameLength=${FRAME_LENGTH})`);
430
+ }
431
+ catch (err) {
432
+ logger.error("[VAD] Cobra init failed:", err);
433
+ clientWs.close();
434
+ return;
435
+ }
436
+ const FRAME_BYTES = FRAME_LENGTH * 2;
437
+ // --- Per-session state ---
438
+ const bus = new FrameBus();
439
+ const s = {
440
+ cobra,
441
+ FRAME_LENGTH,
442
+ FRAME_BYTES,
443
+ bus,
444
+ turnManager: new TurnManager(bus),
445
+ sonioxWs: null,
446
+ keepAliveTimer: null,
447
+ sonioxReconnectTimer: null,
448
+ sessionClosed: false,
449
+ transcriptBuffer: "",
450
+ activeTTS: null,
451
+ conversation: [],
452
+ currentTurnId: 0,
453
+ activePipelineTurnId: null,
454
+ // M4: per-turn abort flags. doInterrupt() flips every flag in this
455
+ // set so any concurrent processTurn invocation can detect abort at
456
+ // every await suspension — without relying on the `myTurn !==
457
+ // currentTurnId` surrogate (which had edge cases when a stale
458
+ // pipeline was mid-await on a closed TTS stream).
459
+ turnAborters: new Set(),
460
+ // Safety fallback: if the client never sends playback_done (crash, network drop),
461
+ // auto-reset the turn state after this many ms so the assistant isn't stuck.
462
+ playbackResetTimer: null,
463
+ // Timestamp (ms) before which barge-in via Soniox is suppressed.
464
+ // Set when TTS starts playing to prevent TTS echo from triggering immediate re-interrupt.
465
+ // AEC on the browser needs ~300-400ms to characterise the echo signal before suppressing it.
466
+ bargeInLockedUntil: 0,
467
+ // Cobra VAD state
468
+ isSpeaking: false,
469
+ silenceFrameCount: 0,
470
+ voiceFrameCount: 0,
471
+ frameRemainder: Buffer.alloc(0),
472
+ };
473
/* ======= INTERRUPT ======= */
// Tear down the current assistant turn: cancel timers, flag every in-flight
// pipeline as aborted, drop buffered STT/VAD state, close the active TTS
// stream, and tell the client to flush its locally buffered audio.
function doInterrupt() {
    logger.info("[INTERRUPT] Cutting TTS");
    // The playback-done fallback timer is moot once the turn is torn down.
    if (s.playbackResetTimer) {
        clearTimeout(s.playbackResetTimer);
        s.playbackResetTimer = null;
    }
    s.bargeInLockedUntil = 0;
    s.currentTurnId++;
    s.activePipelineTurnId = null;
    // M4: flip every in-flight processTurn's abort flag so each pipeline exits
    // at its next await checkpoint rather than unwinding through awaits on a
    // stream that is being closed just below.
    for (const aborter of s.turnAborters) {
        aborter.aborted = true;
    }
    s.turnAborters.clear();
    // Reset STT transcript and VAD counters for the next user utterance.
    s.transcriptBuffer = "";
    s.isSpeaking = false;
    s.silenceFrameCount = 0;
    s.voiceFrameCount = 0;
    if (s.activeTTS) {
        closeTts(s.activeTTS, "[INTERRUPT] Failed to close active TTS stream");
        s.activeTTS = null;
    }
    s.turnManager.reset();
    // Ask the client to discard whatever audio it still has queued.
    if (clientWs.readyState === WebSocket.OPEN) {
        clientWs.send(JSON.stringify({ type: "interrupt" }));
    }
}
503
/* ======= SONIOX ======= */
/**
 * Start the Soniox keep-alive ping. Soniox drops idle connections, so while
 * the session is open we send a `keepalive` frame every 8 seconds.
 *
 * Fix: clear any previously armed interval before arming a new one.
 * Overwriting `s.keepAliveTimer` without clearing would orphan the old
 * interval (it keeps firing forever with no handle to cancel it) if this is
 * ever called twice without an intervening stopKeepAlive() — e.g. overlapping
 * Soniox "open" events during a reconnect race.
 */
function startKeepAlive() {
    if (s.keepAliveTimer) {
        clearInterval(s.keepAliveTimer);
        s.keepAliveTimer = null;
    }
    s.keepAliveTimer = setInterval(() => {
        // Only ping while the upstream socket is actually open.
        if (s.sonioxWs?.readyState === WebSocket.OPEN) {
            s.sonioxWs.send(JSON.stringify({ type: "keepalive" }));
        }
    }, 8000);
}
511
// Cancel the Soniox keep-alive ping, if one is armed.
function stopKeepAlive() {
    if (!s.keepAliveTimer) {
        return;
    }
    clearInterval(s.keepAliveTimer);
    s.keepAliveTimer = null;
}
517
/**
 * Handle one inbound Soniox STT message.
 *
 * Responsibilities:
 *  - ignore unparseable frames; debug-log non-token frames (errors/status);
 *  - barge-in: while the assistant is speaking (and past the echo lock
 *    window), any non-final speech token interrupts TTS;
 *  - accumulate final tokens into `s.transcriptBuffer` until the "<end>"
 *    endpoint marker arrives, then dispatch the full utterance to
 *    processTurn().
 *
 * Fixes vs. previous revision:
 *  - `replace("<end>", "")` only stripped the FIRST marker; if multiple
 *    endpoint tokens accumulate in one batch, literal "<end>" text leaked
 *    into the transcript sent to the LLM. Use replaceAll().
 *  - `err.message` would itself throw if processTurn rejected with a
 *    non-Error value; log defensively.
 */
async function handleSonioxMessage(msg) {
    const data = parseSonioxMessage(msg);
    if (!data) {
        return;
    }
    if (!Array.isArray(data.tokens)) {
        // Non-token frames (errors, status, typed notices) are only useful
        // for debugging; never log token-less frames unconditionally.
        if (data.error || data.status || data.type) {
            if (logger.shouldLog("debug")) {
                logger.info("[SONIOX] msg:", JSON.stringify(data));
            }
        }
        return;
    }
    const tokens = data.tokens;
    // Barge-in detection:
    // Soniox non-final tokens mean real speech is being recognised right now.
    // Browser AEC suppresses TTS playback at the mic, so non-final tokens can
    // only come from the user's own voice — unlike raw Cobra probability,
    // which speaker echo can fool. We only fire the interrupt when the
    // TurnManager confirms TTS is actually playing (ASSISTANT_SPEAKING, set
    // by processTurn). bargeInLockedUntil suppresses the first ~400ms after
    // TTS starts so TTS audio picked up before AEC locks on can't re-trigger.
    if (s.turnManager.state === TurnState.ASSISTANT_SPEAKING &&
        Date.now() > s.bargeInLockedUntil) {
        const speechPartials = tokens.filter((token) => !token.is_final && token.text && token.text.trim().length > 1);
        if (speechPartials.length > 0) {
            logger.info(`[BARGE-IN] Detected via Soniox: "${speechPartials.map((token) => token.text).join("")}"`);
            doInterrupt();
            return;
        }
    }
    const finals = tokens.filter((token) => token.is_final && token.text);
    if (!finals.length) {
        return;
    }
    s.transcriptBuffer += finals.map((token) => token.text).join("");
    const hasEnd = finals.some((token) => token.text === "<end>");
    if (!hasEnd) {
        return;
    }
    // Strip EVERY endpoint marker — more than one "<end>" can be present when
    // multiple endpoint tokens arrived while the buffer accumulated.
    const finalText = s.transcriptBuffer.replaceAll("<end>", "").trim();
    s.transcriptBuffer = "";
    if (!finalText) {
        return;
    }
    logger.info("[STT] Final ->", finalText);
    try {
        await processTurn(finalText, clientWs, neurolink, s);
    }
    catch (err) {
        // processTurn may reject with a non-Error; don't crash the logger.
        logger.error("[PIPELINE] Unhandled error in processTurn:", err?.message ?? err);
        s.turnManager.reset();
    }
}
572
// Open (or re-open) the upstream Soniox STT WebSocket for this session.
function connectSoniox() {
    const ws = new WebSocket(SONIOX_URL);
    s.sonioxWs = ws;
    const handleOpen = () => {
        logger.info("[SONIOX] Connected");
        // C3: the auth frame carries the raw api_key. It is serialised only
        // inside ws.send() and must never reach a logger or telemetry sink.
        // If the outbound payload ever needs to be logged for debugging, use
        // the redacted clone via `redactSonioxAuth(payload)` defined below —
        // never JSON.stringify the raw object.
        const authPayload = {
            api_key: getSonioxApiKey(),
            model: "stt-rt-preview",
            audio_format: "auto",
            language_hints: ["en"],
            enable_endpoint_detection: true,
        };
        ws.send(JSON.stringify(authPayload));
        ws.send(makeWavHeader(16000, 1));
        startKeepAlive();
    };
    const handleClose = (code, reason) => {
        logger.info(`[SONIOX] Closed: code=${code} reason=${reason.toString() || "(none)"}`);
        stopKeepAlive();
        // Reconnect after a short backoff unless the client session ended.
        if (!s.sessionClosed) {
            s.sonioxReconnectTimer = setTimeout(() => {
                s.sonioxReconnectTimer = null;
                connectSoniox();
            }, 500);
        }
    };
    const handleError = (err) => {
        logger.error("[SONIOX] Error:", err.message);
    };
    ws.on("open", handleOpen);
    ws.on("message", handleSonioxMessage);
    ws.on("close", handleClose);
    ws.on("error", handleError);
}
609
+ /* ======= CLIENT AUDIO + CONTROL ======= */
610
+ clientWs.on("message", (data, isBinary) => {
611
+ if (!isBinary) {
612
+ // Text frame — parse as JSON control message
613
+ const msg = parseClientControlMessage(data.toString());
614
+ if (msg?.type === "playback_done") {
615
+ // Client finished playing all audio — now it's safe to listen again.
616
+ if (s.playbackResetTimer) {
617
+ clearTimeout(s.playbackResetTimer);
618
+ s.playbackResetTimer = null;
619
+ }
620
+ s.turnManager.reset();
621
+ }
622
+ return;
623
+ }
624
+ handleClientBinaryAudio(data, clientWs, s);
625
+ });
626
+ clientWs.on("close", () => {
627
+ logger.info("[WS] Client disconnected");
628
+ s.sessionClosed = true;
629
+ // Cancel any in-flight processTurn pipelines so LLM/TTS work doesn't
630
+ // keep running after the client is gone (otherwise the LLM stream keeps
631
+ // pulling and the Cartesia flush waits its full 10s window).
632
+ for (const a of s.turnAborters) {
633
+ a.aborted = true;
634
+ }
635
+ s.turnAborters.clear();
636
+ s.activePipelineTurnId = null;
637
+ // Cancel all pending timers to prevent callbacks on dead sessions
638
+ if (s.playbackResetTimer) {
639
+ clearTimeout(s.playbackResetTimer);
640
+ s.playbackResetTimer = null;
641
+ }
642
+ if (s.sonioxReconnectTimer) {
643
+ clearTimeout(s.sonioxReconnectTimer);
644
+ s.sonioxReconnectTimer = null;
645
+ }
646
+ if (s.cobra) {
647
+ s.cobra.release();
648
+ }
649
+ closeTts(s.activeTTS, "[WS] Failed to close active TTS on disconnect");
650
+ stopKeepAlive();
651
+ if (s.sonioxWs) {
652
+ s.sonioxWs.close();
653
+ }
654
+ });
655
+ connectSoniox();
656
+ }
657
/**
 * Attach the voice WebSocket endpoint to an HTTP server.
 *
 * @param {import('http').Server} server - HTTP server to upgrade from.
 * @param {{ maxPayload?: number, authToken?: string }} [options]
 * @throws {Error} when PICOVOICE_ACCESS_KEY is missing from the environment.
 */
export function setupWebSocket(server, options = {}) {
    // NEW11: maxPayload caps inbound frame size (protects against OOM on
    // giant frames); verifyClient authenticates the upgrade handshake before
    // any per-session resources are allocated.
    const wss = new WebSocketServer({
        server,
        maxPayload: options.maxPayload ?? 1_048_576,
        verifyClient: createVerifyClient(options.authToken),
    });
    const accessKey = process.env.PICOVOICE_ACCESS_KEY;
    if (!accessKey) {
        throw new Error("PICOVOICE_ACCESS_KEY is not set in environment");
    }
    const neurolink = new NeuroLink();
    wss.on("connection", (clientWs) => {
        // Fire-and-forget per-connection handler; failures close the socket.
        void handleVoiceConnection(clientWs, { neurolink, accessKey }).catch((err) => {
            logger.error("[WS] Connection handler failed:", err);
            clientWs.close();
        });
    });
}
558
678
  //# sourceMappingURL=voiceWebSocketHandler.js.map