@kodelyth/voice-call 2026.5.42 → 2026.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. package/package.json +18 -6
  2. package/api.ts +0 -16
  3. package/cli-metadata.ts +0 -10
  4. package/config-api.ts +0 -12
  5. package/index.test.ts +0 -1075
  6. package/index.ts +0 -863
  7. package/runtime-api.ts +0 -20
  8. package/runtime-entry.ts +0 -1
  9. package/setup-api.ts +0 -47
  10. package/src/allowlist.test.ts +0 -18
  11. package/src/allowlist.ts +0 -19
  12. package/src/cli.test.ts +0 -12
  13. package/src/cli.ts +0 -866
  14. package/src/config-compat.test.ts +0 -130
  15. package/src/config-compat.ts +0 -227
  16. package/src/config.test.ts +0 -542
  17. package/src/config.ts +0 -883
  18. package/src/core-bridge.ts +0 -14
  19. package/src/deep-merge.test.ts +0 -40
  20. package/src/deep-merge.ts +0 -23
  21. package/src/gateway-continue-operation.ts +0 -200
  22. package/src/http-headers.test.ts +0 -16
  23. package/src/http-headers.ts +0 -15
  24. package/src/manager/context.ts +0 -50
  25. package/src/manager/events.test.ts +0 -578
  26. package/src/manager/events.ts +0 -332
  27. package/src/manager/lifecycle.ts +0 -53
  28. package/src/manager/lookup.test.ts +0 -52
  29. package/src/manager/lookup.ts +0 -35
  30. package/src/manager/outbound.test.ts +0 -629
  31. package/src/manager/outbound.ts +0 -508
  32. package/src/manager/state.ts +0 -48
  33. package/src/manager/store.ts +0 -107
  34. package/src/manager/timers.test.ts +0 -127
  35. package/src/manager/timers.ts +0 -113
  36. package/src/manager/twiml.test.ts +0 -13
  37. package/src/manager/twiml.ts +0 -17
  38. package/src/manager.closed-loop.test.ts +0 -259
  39. package/src/manager.inbound-allowlist.test.ts +0 -183
  40. package/src/manager.notify.test.ts +0 -390
  41. package/src/manager.restore.test.ts +0 -310
  42. package/src/manager.test-harness.ts +0 -127
  43. package/src/manager.ts +0 -441
  44. package/src/media-stream.test.ts +0 -953
  45. package/src/media-stream.ts +0 -876
  46. package/src/providers/base.ts +0 -99
  47. package/src/providers/mock.test.ts +0 -86
  48. package/src/providers/mock.ts +0 -185
  49. package/src/providers/plivo.test.ts +0 -93
  50. package/src/providers/plivo.ts +0 -601
  51. package/src/providers/shared/call-status.test.ts +0 -24
  52. package/src/providers/shared/call-status.ts +0 -24
  53. package/src/providers/shared/guarded-json-api.test.ts +0 -127
  54. package/src/providers/shared/guarded-json-api.ts +0 -49
  55. package/src/providers/telnyx.test.ts +0 -489
  56. package/src/providers/telnyx.ts +0 -419
  57. package/src/providers/twilio/api.test.ts +0 -184
  58. package/src/providers/twilio/api.ts +0 -100
  59. package/src/providers/twilio/twiml-policy.test.ts +0 -84
  60. package/src/providers/twilio/twiml-policy.ts +0 -87
  61. package/src/providers/twilio/webhook.ts +0 -34
  62. package/src/providers/twilio.test.ts +0 -607
  63. package/src/providers/twilio.ts +0 -861
  64. package/src/providers/twilio.types.ts +0 -17
  65. package/src/realtime-agent-context.test.ts +0 -101
  66. package/src/realtime-agent-context.ts +0 -149
  67. package/src/realtime-defaults.ts +0 -3
  68. package/src/realtime-fast-context.test.ts +0 -74
  69. package/src/realtime-fast-context.ts +0 -27
  70. package/src/realtime-transcription.runtime.ts +0 -4
  71. package/src/realtime-voice.runtime.ts +0 -5
  72. package/src/response-generator.test.ts +0 -385
  73. package/src/response-generator.ts +0 -348
  74. package/src/response-model.test.ts +0 -71
  75. package/src/response-model.ts +0 -23
  76. package/src/runtime.test.ts +0 -625
  77. package/src/runtime.ts +0 -528
  78. package/src/telephony-audio.test.ts +0 -61
  79. package/src/telephony-audio.ts +0 -12
  80. package/src/telephony-tts.test.ts +0 -196
  81. package/src/telephony-tts.ts +0 -235
  82. package/src/test-fixtures.ts +0 -82
  83. package/src/tts-provider-voice.test.ts +0 -34
  84. package/src/tts-provider-voice.ts +0 -21
  85. package/src/tunnel.test.ts +0 -173
  86. package/src/tunnel.ts +0 -314
  87. package/src/types.ts +0 -311
  88. package/src/utils.test.ts +0 -17
  89. package/src/utils.ts +0 -14
  90. package/src/voice-mapping.test.ts +0 -32
  91. package/src/voice-mapping.ts +0 -65
  92. package/src/webhook/realtime-audio-pacer.test.ts +0 -146
  93. package/src/webhook/realtime-audio-pacer.ts +0 -204
  94. package/src/webhook/realtime-handler.test.ts +0 -1450
  95. package/src/webhook/realtime-handler.ts +0 -1382
  96. package/src/webhook/stale-call-reaper.test.ts +0 -89
  97. package/src/webhook/stale-call-reaper.ts +0 -38
  98. package/src/webhook/stream-frame-adapter.test.ts +0 -187
  99. package/src/webhook/stream-frame-adapter.ts +0 -219
  100. package/src/webhook/tailscale.test.ts +0 -216
  101. package/src/webhook/tailscale.ts +0 -129
  102. package/src/webhook-exposure.test.ts +0 -33
  103. package/src/webhook-exposure.ts +0 -84
  104. package/src/webhook-security.test.ts +0 -813
  105. package/src/webhook-security.ts +0 -982
  106. package/src/webhook.hangup-once.lifecycle.test.ts +0 -179
  107. package/src/webhook.test.ts +0 -1615
  108. package/src/webhook.ts +0 -933
  109. package/src/webhook.types.ts +0 -5
  110. package/src/websocket-test-support.ts +0 -72
  111. package/tsconfig.json +0 -16
@@ -1,876 +0,0 @@
1
- /**
2
- * Media Stream Handler
3
- *
4
- * Handles bidirectional audio streaming between Twilio and the AI services.
5
- * - Receives mu-law audio from Twilio via WebSocket
6
- * - Forwards to the selected realtime transcription provider
7
- * - Sends TTS audio back to Twilio
8
- */
9
-
10
- import type { IncomingMessage } from "node:http";
11
- import type { Duplex } from "node:stream";
12
- import type { KlawConfig } from "klaw/plugin-sdk/config-contracts";
13
- import type {
14
- RealtimeTranscriptionProviderConfig,
15
- RealtimeTranscriptionProviderPlugin,
16
- RealtimeTranscriptionSession,
17
- } from "klaw/plugin-sdk/realtime-transcription";
18
- import {
19
- createTalkSessionController,
20
- recordTalkObservabilityEvent,
21
- type TalkEvent,
22
- type TalkEventInput,
23
- type TalkSessionController,
24
- } from "klaw/plugin-sdk/realtime-voice";
25
- import { type RawData, WebSocket, WebSocketServer } from "ws";
26
-
27
/**
 * Options accepted by {@link MediaStreamHandler}.
 *
 * Only `transcriptionProvider` and `providerConfig` are required; every limit
 * falls back to a module-level default when omitted, and every callback is
 * optional.
 */
export interface MediaStreamConfig {
  /** Realtime transcription provider used to create one streaming STT session per media stream. */
  transcriptionProvider: RealtimeTranscriptionProviderPlugin;
  /** Provider-owned config blob handed unchanged to the transcription session. */
  providerConfig: RealtimeTranscriptionProviderConfig;
  /** Full runtime config, forwarded to the provider (used by providers that can resolve OAuth profiles). */
  cfg?: KlawConfig;
  /** How long (ms) a socket may stay open without an accepted `start` frame before it is closed. */
  preStartTimeoutMs?: number;
  /** Upper bound on sockets that are connected but have not yet started a stream. */
  maxPendingConnections?: number;
  /** Upper bound on not-yet-started sockets originating from one source IP. */
  maxPendingConnectionsPerIp?: number;
  /** Upper bound on all open sockets, counting both pending sockets and active sessions. */
  maxConnections?: number;
  /** Optional trusted resolver for the source IP used by the pending-connection guards. */
  resolveClientIp?: (request: IncomingMessage) => string | undefined;
  /** Decides whether a media stream for the given call should be accepted. */
  shouldAcceptStream?: (params: { callId: string; streamSid: string; token?: string }) => boolean;
  /** Invoked with each final transcript. */
  onTranscript?: (callId: string, transcript: string) => void;
  /** Invoked with each partial transcript (for streaming UIs). */
  onPartialTranscript?: (callId: string, partial: string) => void;
  /** Invoked once a stream has been accepted and registered. */
  onConnect?: (callId: string, streamSid: string) => void;
  /** Invoked once the realtime transcription session has connected for the stream. */
  onTranscriptionReady?: (callId: string, streamSid: string) => void;
  /** Invoked when the provider reports that speech started (barge-in). */
  onSpeechStart?: (callId: string) => void;
  /** Invoked when a stream is torn down. */
  onDisconnect?: (callId: string, streamSid: string) => void;
  /** Invoked for every common Talk event emitted by the telephony STT/TTS adapter. */
  onTalkEvent?: (callId: string, streamSid: string, event: TalkEvent) => void;
}
64
-
65
/**
 * State kept for one accepted media stream.
 */
interface StreamSession {
  /** Call identifier taken from the `start` frame's `callSid`. */
  callId: string;
  /** Stream identifier; also the key under which the session is stored. */
  streamSid: string;
  /** Socket the stream arrived on; outbound frames are written to it. */
  ws: WebSocket;
  /** Streaming speech-to-text session that receives the inbound audio. */
  sttSession: RealtimeTranscriptionSession;
  /** Controller used to emit Talk events and to track the active turn. */
  talk: TalkSessionController;
}
75
-
76
/** One queued TTS playback request together with the handles needed to settle or cancel it. */
type TtsQueueEntry = {
  /** Performs the playback; receives the signal that is aborted on barge-in or teardown. */
  playFn: (signal: AbortSignal) => Promise<void>;
  /** Owns the abort signal passed to `playFn`. */
  controller: AbortController;
  /** Settles the promise returned to the caller that queued this entry. */
  resolve: () => void;
  /** Rejects the promise returned to the caller that queued this entry. */
  reject: (error: unknown) => void;
};
82
-
83
/** Outcome of an attempt to write one frame to a stream's socket. */
type StreamSendResult = {
  /** `true` only when the frame was handed to the socket and the send buffer stayed within its cap. */
  sent: boolean;
  /** Socket ready state sampled before the attempt; absent when no session exists for the stream. */
  readyState?: number;
  /** Socket `bufferedAmount` sampled before the attempt (0 when no session exists). */
  bufferedBeforeBytes: number;
  /** Socket `bufferedAmount` sampled after the attempt (0 when no session exists). */
  bufferedAfterBytes: number;
};
89
-
90
/** Bookkeeping for a socket that is connected but has not yet had a `start` frame accepted. */
type PendingConnection = {
  /** Source IP the socket was counted against. */
  ip: string;
  /** Timer that closes the socket if no `start` frame is accepted in time. */
  timeout: ReturnType<typeof setTimeout>;
};
94
-
95
// Fallbacks applied when the matching MediaStreamConfig option is omitted.
/** Milliseconds a socket may wait before its `start` frame is accepted. */
const DEFAULT_PRE_START_TIMEOUT_MS = 5000;
/** Cap on sockets waiting for a `start` frame. */
const DEFAULT_MAX_PENDING_CONNECTIONS = 32;
/** Cap on sockets from one IP waiting for a `start` frame. */
const DEFAULT_MAX_PENDING_CONNECTIONS_PER_IP = 4;
/** Cap on all open sockets (pending + active). */
const DEFAULT_MAX_CONNECTIONS = 128;

// Fixed limits.
/** Largest inbound WebSocket payload accepted (64 KiB). */
const MAX_INBOUND_MESSAGE_BYTES = 64 * 1024;
/** Outbound send-buffer size (1 MiB) beyond which a socket is closed for backpressure. */
const MAX_WS_BUFFERED_BYTES = 1024 * 1024;
/** Longest close reason, in characters, written to the log before truncation. */
const CLOSE_REASON_LOG_MAX_CHARS = 120;
102
-
103
/**
 * Make an untrusted string safe to embed in a single log line.
 *
 * Control characters are replaced with spaces, runs of whitespace are
 * collapsed to one space, and the result is trimmed. Text longer than
 * `maxChars` UTF-16 code units is cut and suffixed with `...`.
 *
 * @param value - Raw text, e.g. a WebSocket close reason supplied by the peer.
 * @param maxChars - Maximum number of UTF-16 code units kept before the `...` suffix.
 *   Negative or non-finite limits are treated as 0.
 * @returns The sanitized, possibly truncated text.
 */
export function sanitizeLogText(value: string, maxChars: number): string {
  const sanitized = value
    .replace(/\p{Cc}/gu, " ")
    .replace(/\s+/g, " ")
    .trim();
  if (sanitized.length <= maxChars) {
    return sanitized;
  }

  // Clamp so a negative/NaN limit cannot make `slice` count from the end of the string.
  let end = Math.max(0, Math.trunc(maxChars) || 0);

  // Never cut between the two halves of a surrogate pair: a lone high surrogate
  // is not valid text and can corrupt downstream log encoders.
  if (end > 0 && end < sanitized.length) {
    const before = sanitized.charCodeAt(end - 1);
    const after = sanitized.charCodeAt(end);
    const splitsPair =
      before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) {
      end -= 1;
    }
  }

  return `${sanitized.slice(0, end)}...`;
}
113
-
114
/**
 * Coerce a raw WebSocket payload into a single `Buffer`.
 *
 * A `Buffer` is returned as-is, an array of chunks is concatenated, and any
 * other payload is wrapped with `Buffer.from`.
 */
function normalizeWsMessageData(data: RawData): Buffer {
  if (Array.isArray(data)) {
    return Buffer.concat(data);
  }
  return Buffer.isBuffer(data) ? data : Buffer.from(data);
}
123
-
124
/**
 * Decode one raw WebSocket payload into a Twilio media stream message.
 *
 * The payload is read as UTF-8 JSON. Only the top-level shape is validated
 * (it must be a JSON object); individual fields are not checked, so callers
 * must still treat every property as possibly missing.
 *
 * @param data - Raw payload as delivered by the `ws` `message` event.
 * @returns The parsed message.
 * @throws Error when the payload is not valid JSON (original error attached as
 *   `cause`) or when the JSON value is not an object.
 */
export function parseTwilioMediaMessage(data: RawData): TwilioMediaMessage {
  const text = normalizeWsMessageData(data).toString("utf8");

  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch (cause) {
    throw new Error("Twilio media stream message was malformed JSON", { cause });
  }

  // `null`, primitives and arrays are valid JSON but can never be a message;
  // reject them here instead of returning a value whose type is a lie.
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    throw new Error("Twilio media stream message was not a JSON object");
  }
  return parsed as TwilioMediaMessage;
}
132
-
133
/**
 * Manages WebSocket connections for Twilio media streams.
 *
 * The handler owns a lazily created `WebSocketServer` (in `noServer` mode), the
 * map of active stream sessions keyed by `streamSid`, admission-control
 * bookkeeping for sockets that have not yet had a `start` frame accepted, and a
 * per-stream queue that serializes TTS playback.
 */
export class MediaStreamHandler {
  private wss: WebSocketServer | null = null;
  /** Active sessions keyed by `streamSid`. */
  private sessions = new Map<string, StreamSession>();
  private config: MediaStreamConfig;
  /** Pending sockets that have upgraded but not yet sent an accepted `start` frame. */
  private pendingConnections = new Map<WebSocket, PendingConnection>();
  /** Pending socket count per remote IP for pre-auth throttling. */
  private pendingByIp = new Map<string, number>();
  private preStartTimeoutMs: number;
  private maxPendingConnections: number;
  private maxPendingConnectionsPerIp: number;
  private maxConnections: number;
  /** Upgrades handed to the WebSocket server whose completion callback has not run yet. */
  private inflightUpgrades = 0;
  /** TTS playback queues per stream (serialize audio to prevent overlap) */
  private ttsQueues = new Map<string, TtsQueueEntry[]>();
  /** Whether TTS is currently playing per stream */
  private ttsPlaying = new Map<string, boolean>();
  /** Active TTS playback controllers per stream */
  private ttsActiveControllers = new Map<string, AbortController>();

  /**
   * @param config - Provider, limits and callbacks; omitted limits fall back to
   *   the module-level defaults.
   */
  constructor(config: MediaStreamConfig) {
    this.config = config;
    this.preStartTimeoutMs = config.preStartTimeoutMs ?? DEFAULT_PRE_START_TIMEOUT_MS;
    this.maxPendingConnections = config.maxPendingConnections ?? DEFAULT_MAX_PENDING_CONNECTIONS;
    this.maxPendingConnectionsPerIp =
      config.maxPendingConnectionsPerIp ?? DEFAULT_MAX_PENDING_CONNECTIONS_PER_IP;
    this.maxConnections = config.maxConnections ?? DEFAULT_MAX_CONNECTIONS;
  }

  /**
   * Handle WebSocket upgrade for media stream connections.
   *
   * Rejects the upgrade with HTTP 503 when the total connection cap is reached.
   * Otherwise the upgrade is counted as in flight until the WebSocket server
   * completes it or the raw socket errors/closes first.
   *
   * @throws Re-throws any error raised synchronously by the WebSocket server's
   *   `handleUpgrade`, after releasing the in-flight reservation.
   */
  handleUpgrade(request: IncomingMessage, socket: Duplex, head: Buffer): void {
    if (!this.wss) {
      this.wss = new WebSocketServer({
        noServer: true,
        // Reject oversized frames before app-level parsing runs on unauthenticated sockets.
        maxPayload: MAX_INBOUND_MESSAGE_BYTES,
      });
      this.wss.on("connection", (ws, req) => this.handleConnection(ws, req));
    }

    const currentConnections = this.getCurrentConnectionCount();
    if (currentConnections >= this.maxConnections) {
      this.rejectUpgrade(socket, 503, "Too many media stream connections");
      return;
    }

    this.inflightUpgrades += 1;
    let released = false;
    // Idempotent: the reservation can be released from the success callback,
    // the abort listeners, or the catch block, whichever runs first.
    const releaseUpgradeReservation = () => {
      if (released) {
        return;
      }
      released = true;
      this.inflightUpgrades = Math.max(0, this.inflightUpgrades - 1);
    };
    const handleUpgradeAbort = () => {
      socket.removeListener("error", handleUpgradeAbort);
      socket.removeListener("close", handleUpgradeAbort);
      releaseUpgradeReservation();
    };
    socket.once("error", handleUpgradeAbort);
    socket.once("close", handleUpgradeAbort);

    try {
      this.wss.handleUpgrade(request, socket, head, (ws) => {
        socket.removeListener("error", handleUpgradeAbort);
        socket.removeListener("close", handleUpgradeAbort);
        releaseUpgradeReservation();
        this.wss?.emit("connection", ws, request);
      });
    } catch (error) {
      socket.removeListener("error", handleUpgradeAbort);
      socket.removeListener("close", handleUpgradeAbort);
      releaseUpgradeReservation();
      throw error;
    }
  }

  /**
   * Handle new WebSocket connection from Twilio.
   *
   * Registers the socket as pending (closing it with 1013 when the pending
   * limits are exceeded) and wires the message, close and error listeners.
   */
  private async handleConnection(ws: WebSocket, request: IncomingMessage): Promise<void> {
    let session: StreamSession | null = null;
    const streamToken = this.getStreamToken(request);
    const ip = this.getClientIp(request);

    if (!this.registerPendingConnection(ws, ip)) {
      ws.close(1013, "Too many pending media stream connections");
      return;
    }

    ws.on("message", (data: RawData) => {
      try {
        const message = parseTwilioMediaMessage(data);

        switch (message.event) {
          case "connected":
            console.log("[MediaStream] Twilio connected");
            break;

          case "start":
            if (session) {
              // A second `start` would replace the tracked session and orphan the
              // first one (its STT session and map entry would never be released).
              console.warn(
                `[MediaStream] Ignoring duplicate start frame for stream: ${session.streamSid}`,
              );
              break;
            }
            session = this.handleStart(ws, message, streamToken);
            if (session) {
              this.clearPendingConnection(ws);
            }
            break;

          case "media":
            if (session && message.media?.payload) {
              // Forward audio to STT
              const audioBuffer = Buffer.from(message.media.payload, "base64");
              const turnId = this.ensureActiveTurn(session);
              this.emitTalkEvent(session, {
                type: "input.audio.delta",
                turnId,
                payload: {
                  callId: session.callId,
                  streamSid: session.streamSid,
                  bytes: audioBuffer.byteLength,
                },
              });
              session.sttSession.sendAudio(audioBuffer);
            }
            break;

          case "stop":
            if (session) {
              this.handleStop(session);
              // Cleared so the socket's later `close` event does not stop it again.
              session = null;
            }
            break;

          case "clear":
          case "mark":
            break;
        }
      } catch (error) {
        console.error("[MediaStream] Error processing message:", error);
      }
    });

    ws.on("close", (code, reason) => {
      const rawReason = Buffer.isBuffer(reason) ? reason.toString("utf8") : String(reason || "");
      // The close reason is peer-controlled; sanitize before logging.
      const reasonText = sanitizeLogText(rawReason, CLOSE_REASON_LOG_MAX_CHARS);
      console.log(
        `[MediaStream] WebSocket closed (code: ${code}, reason: ${reasonText || "none"})`,
      );
      this.clearPendingConnection(ws);
      if (session) {
        this.handleStop(session);
      }
    });

    ws.on("error", (error) => {
      console.error("[MediaStream] WebSocket error:", error);
    });
  }

  /**
   * Handle stream start event.
   *
   * Validates the frame, creates the transcription session, registers the
   * stream session and kicks off the STT connection in the background.
   *
   * @returns The new session, or `null` when the stream was rejected (the
   *   socket is closed with 1008 in that case).
   */
  private handleStart(
    ws: WebSocket,
    message: TwilioMediaMessage,
    streamToken?: string,
  ): StreamSession | null {
    const streamSid = message.streamSid || "";
    const callSid = message.start?.callSid || "";

    // Prefer token from start message customParameters (set via TwiML <Parameter>),
    // falling back to query string token. Twilio strips query params from WebSocket
    // URLs but reliably delivers <Parameter> values in customParameters.
    const effectiveToken = message.start?.customParameters?.token ?? streamToken;

    console.log(`[MediaStream] Stream started: ${streamSid} (call: ${callSid})`);
    if (!callSid) {
      console.warn("[MediaStream] Missing callSid; closing stream");
      ws.close(1008, "Missing callSid");
      return null;
    }
    if (
      this.config.shouldAcceptStream &&
      !this.config.shouldAcceptStream({ callId: callSid, streamSid, token: effectiveToken })
    ) {
      console.warn(`[MediaStream] Rejecting stream for unknown call: ${callSid}`);
      ws.close(1008, "Unknown call");
      return null;
    }

    // The provider callbacks look the session up by streamSid on every call, so
    // they stop emitting Talk events once the session has been removed.
    const sttSession = this.config.transcriptionProvider.createSession({
      cfg: this.config.cfg,
      providerConfig: this.config.providerConfig,
      onPartial: (partial) => {
        const session = this.sessions.get(streamSid);
        if (session) {
          this.emitTalkEvent(session, {
            type: "transcript.delta",
            turnId: this.ensureActiveTurn(session),
            payload: { callId: callSid, streamSid, text: partial, role: "user" },
          });
        }
        this.config.onPartialTranscript?.(callSid, partial);
      },
      onTranscript: (transcript) => {
        const session = this.sessions.get(streamSid);
        if (session) {
          const turnId = this.ensureActiveTurn(session);
          this.emitTalkEvent(session, {
            type: "input.audio.committed",
            turnId,
            final: true,
            payload: { callId: callSid, streamSid },
          });
          this.emitTalkEvent(session, {
            type: "transcript.done",
            turnId,
            final: true,
            payload: { callId: callSid, streamSid, text: transcript, role: "user" },
          });
        }
        this.config.onTranscript?.(callSid, transcript);
      },
      onSpeechStart: () => {
        const session = this.sessions.get(streamSid);
        if (session) {
          this.ensureActiveTurn(session);
        }
        this.config.onSpeechStart?.(callSid);
      },
      onError: (error) => {
        console.warn("[MediaStream] Transcription session error:", error.message);
        const session = this.sessions.get(streamSid);
        if (session) {
          this.emitTalkEvent(session, {
            type: "session.error",
            final: true,
            payload: { callId: callSid, streamSid, error: error.message },
          });
        }
      },
    });

    const session: StreamSession = {
      callId: callSid,
      streamSid,
      ws,
      sttSession,
      talk: this.createTalkEvents(callSid, streamSid),
    };

    this.sessions.set(streamSid, session);
    this.config.onConnect?.(callSid, streamSid);
    this.emitTalkEvent(session, {
      type: "session.started",
      payload: { callId: callSid, streamSid, provider: this.config.transcriptionProvider.id },
    });
    void this.connectTranscriptionAndNotify(session);

    return session;
  }

  /**
   * Connect the session's STT backend and announce readiness.
   *
   * On failure a `session.error` Talk event is emitted and the socket is closed
   * with 1011 (or, if the session is no longer current or the socket is not
   * open, only the STT session is closed). On success nothing is announced if
   * the session was replaced or its socket closed while connecting.
   */
  private async connectTranscriptionAndNotify(session: StreamSession): Promise<void> {
    try {
      await session.sttSession.connect();
    } catch (error) {
      console.warn(
        "[MediaStream] STT connection failed; closing media stream:",
        error instanceof Error ? error.message : String(error),
      );
      this.emitTalkEvent(session, {
        type: "session.error",
        final: true,
        payload: {
          callId: session.callId,
          streamSid: session.streamSid,
          error: error instanceof Error ? error.message : String(error),
        },
      });
      if (
        this.sessions.get(session.streamSid) === session &&
        session.ws.readyState === WebSocket.OPEN
      ) {
        // Closing the socket triggers the `close` listener, which stops the session.
        session.ws.close(1011, "STT connection failed");
      } else {
        session.sttSession.close();
      }
      return;
    }

    if (
      this.sessions.get(session.streamSid) !== session ||
      session.ws.readyState !== WebSocket.OPEN
    ) {
      session.sttSession.close();
      return;
    }

    this.emitTalkEvent(session, {
      type: "session.ready",
      payload: { callId: session.callId, streamSid: session.streamSid },
    });
    this.config.onTranscriptionReady?.(session.callId, session.streamSid);
  }

  /**
   * Handle stream stop event.
   *
   * Closes the STT session, emits `session.closed` and notifies `onDisconnect`.
   * Shared per-stream state (TTS queue, session map entry) is only released when
   * this session still owns the `streamSid`.
   */
  private handleStop(session: StreamSession): void {
    console.log(`[MediaStream] Stream stopped: ${session.streamSid}`);

    // State is keyed by streamSid. If a newer session has since registered under
    // the same id, tearing that state down here would silently break the newer stream.
    if (this.sessions.get(session.streamSid) === session) {
      this.clearTtsState(session.streamSid);
      this.sessions.delete(session.streamSid);
    }
    session.sttSession.close();
    this.emitTalkEvent(session, {
      type: "session.closed",
      final: true,
      payload: { callId: session.callId, streamSid: session.streamSid },
    });
    this.config.onDisconnect?.(session.callId, session.streamSid);
  }

  /** Read the `token` query parameter from the upgrade request URL, if any. */
  private getStreamToken(request: IncomingMessage): string | undefined {
    if (!request.url || !request.headers.host) {
      return undefined;
    }
    try {
      const url = new URL(request.url, `http://${request.headers.host}`);
      return url.searchParams.get("token") ?? undefined;
    } catch {
      // Unparseable URL/host: treat as "no token" rather than failing the connection.
      return undefined;
    }
  }

  /**
   * Resolve the source IP used for per-IP throttling: the configured resolver's
   * non-empty result, else the socket's remote address, else `"unknown"`.
   */
  private getClientIp(request: IncomingMessage): string {
    const resolvedIp = this.config.resolveClientIp?.(request)?.trim();
    if (resolvedIp) {
      return resolvedIp;
    }
    return request.socket.remoteAddress || "unknown";
  }

  /** Open WebSocket clients plus upgrades that are still in flight. */
  private getCurrentConnectionCount(): number {
    return this.wss ? this.wss.clients.size + this.inflightUpgrades : this.inflightUpgrades;
  }

  /**
   * Count a new socket as pending and arm its pre-start timeout.
   *
   * @returns `false` (without registering anything) when the global or per-IP
   *   pending limit is already reached.
   */
  private registerPendingConnection(ws: WebSocket, ip: string): boolean {
    if (this.pendingConnections.size >= this.maxPendingConnections) {
      console.warn("[MediaStream] Rejecting connection: pending connection limit reached");
      return false;
    }

    const pendingForIp = this.pendingByIp.get(ip) ?? 0;
    if (pendingForIp >= this.maxPendingConnectionsPerIp) {
      console.warn(`[MediaStream] Rejecting connection: pending per-IP limit reached (${ip})`);
      return false;
    }

    const timeout = setTimeout(() => {
      if (!this.pendingConnections.has(ws)) {
        return;
      }
      console.warn(
        `[MediaStream] Closing pre-start idle connection after ${this.preStartTimeoutMs}ms (${ip})`,
      );
      ws.close(1008, "Start timeout");
    }, this.preStartTimeoutMs);

    // Do not let the guard timer keep the process alive on its own.
    timeout.unref?.();
    this.pendingConnections.set(ws, { ip, timeout });
    this.pendingByIp.set(ip, pendingForIp + 1);
    return true;
  }

  /** Remove a socket from the pending bookkeeping and cancel its pre-start timeout. */
  private clearPendingConnection(ws: WebSocket): void {
    const pending = this.pendingConnections.get(ws);
    if (!pending) {
      return;
    }

    clearTimeout(pending.timeout);
    this.pendingConnections.delete(ws);

    const current = this.pendingByIp.get(pending.ip) ?? 0;
    if (current <= 1) {
      this.pendingByIp.delete(pending.ip);
      return;
    }
    this.pendingByIp.set(pending.ip, current - 1);
  }

  /**
   * Answer an upgrade request with a plain-text HTTP error and drop the socket.
   */
  private rejectUpgrade(socket: Duplex, statusCode: 429 | 503, message: string): void {
    const statusText = statusCode === 429 ? "Too Many Requests" : "Service Unavailable";
    const body = `${message}\n`;
    const response =
      `HTTP/1.1 ${statusCode} ${statusText}\r\n` +
      "Connection: close\r\n" +
      "Content-Type: text/plain; charset=utf-8\r\n" +
      `Content-Length: ${Buffer.byteLength(body)}\r\n` +
      "\r\n" +
      body;

    // The peer may already be gone; an unhandled `error` event on the raw
    // socket would otherwise crash the process.
    socket.once("error", () => socket.destroy());
    // Destroy only after the response has been flushed. Calling destroy()
    // straight after write() can discard the still-buffered response.
    socket.once("finish", () => socket.destroy());
    socket.end(response);
  }

  /**
   * Get an active session with an open WebSocket, or undefined if unavailable.
   */
  private getOpenSession(streamSid: string): StreamSession | undefined {
    const session = this.sessions.get(streamSid);
    return session?.ws.readyState === WebSocket.OPEN ? session : undefined;
  }

  /**
   * Send a message to a stream's WebSocket if available.
   *
   * The frame is JSON-encoded. The socket is closed with 1013 when its send
   * buffer exceeds the cap either before or after the write; in both cases the
   * result reports `sent: false`.
   */
  private sendToStream(streamSid: string, message: unknown): StreamSendResult {
    const session = this.sessions.get(streamSid);
    if (!session) {
      return {
        sent: false,
        bufferedBeforeBytes: 0,
        bufferedAfterBytes: 0,
      };
    }

    const readyState = session.ws.readyState;
    const bufferedBeforeBytes = session.ws.bufferedAmount;
    if (readyState !== WebSocket.OPEN) {
      return {
        sent: false,
        readyState,
        bufferedBeforeBytes,
        bufferedAfterBytes: session.ws.bufferedAmount,
      };
    }
    if (bufferedBeforeBytes > MAX_WS_BUFFERED_BYTES) {
      try {
        session.ws.close(1013, "Backpressure: send buffer exceeded");
      } catch {
        // Best-effort close; caller still receives sent:false.
      }
      return {
        sent: false,
        readyState,
        bufferedBeforeBytes,
        bufferedAfterBytes: session.ws.bufferedAmount,
      };
    }

    try {
      session.ws.send(JSON.stringify(message));
      const bufferedAfterBytes = session.ws.bufferedAmount;
      if (bufferedAfterBytes > MAX_WS_BUFFERED_BYTES) {
        try {
          session.ws.close(1013, "Backpressure: send buffer exceeded");
        } catch {
          // Best-effort close; caller still receives sent:false.
        }
        return {
          sent: false,
          readyState,
          bufferedBeforeBytes,
          bufferedAfterBytes,
        };
      }
      return {
        sent: true,
        readyState,
        bufferedBeforeBytes,
        bufferedAfterBytes,
      };
    } catch {
      // A failed send is reported through the result, not thrown.
      return {
        sent: false,
        readyState,
        bufferedBeforeBytes,
        bufferedAfterBytes: session.ws.bufferedAmount,
      };
    }
  }

  /**
   * Send audio to a specific stream (for TTS playback).
   * Audio should be mu-law encoded at 8kHz mono.
   *
   * An `output.audio.delta` Talk event is emitted first when the stream has an
   * open session.
   */
  sendAudio(streamSid: string, muLawAudio: Buffer): StreamSendResult {
    const session = this.getOpenSession(streamSid);
    if (session) {
      this.emitTalkEvent(session, {
        type: "output.audio.delta",
        turnId: this.ensureActiveTurn(session),
        payload: { callId: session.callId, streamSid, bytes: muLawAudio.byteLength },
      });
    }
    return this.sendToStream(streamSid, {
      event: "media",
      streamSid,
      media: { payload: muLawAudio.toString("base64") },
    });
  }

  /**
   * Send a mark event to track audio playback position.
   */
  sendMark(streamSid: string, name: string): StreamSendResult {
    return this.sendToStream(streamSid, {
      event: "mark",
      streamSid,
      mark: { name },
    });
  }

  /**
   * Clear audio buffer (interrupt playback).
   */
  clearAudio(streamSid: string): StreamSendResult {
    return this.sendToStream(streamSid, { event: "clear", streamSid });
  }

  /**
   * Queue a TTS operation for sequential playback.
   * Only one TTS operation plays at a time per stream to prevent overlap.
   *
   * @returns A promise that resolves when the operation finished or was
   *   cancelled, and rejects when `playFn` failed without having been aborted.
   */
  async queueTts(streamSid: string, playFn: (signal: AbortSignal) => Promise<void>): Promise<void> {
    // The executor runs synchronously, so the entry is queued before this
    // method returns and no non-null assertions are needed.
    return new Promise<void>((resolve, reject) => {
      this.getTtsQueue(streamSid).push({
        playFn,
        controller: new AbortController(),
        resolve,
        reject,
      });

      if (!this.ttsPlaying.get(streamSid)) {
        void this.processQueue(streamSid);
      }
    });
  }

  /**
   * Clear TTS queue and interrupt current playback (barge-in).
   *
   * Queued entries are aborted and resolved, the active playback is aborted,
   * the active Talk turn (if any) is cancelled, and a `clear` frame is sent.
   */
  clearTtsQueue(streamSid: string, _reason = "unspecified"): void {
    // Look the queue up without creating one, so clearing an unknown stream
    // does not leave an empty queue behind.
    const queue = this.ttsQueues.get(streamSid);
    if (queue) {
      this.resolveQueuedTtsEntries(queue);
    }
    this.ttsActiveControllers.get(streamSid)?.abort();
    const session = this.sessions.get(streamSid);
    if (session?.talk.activeTurnId) {
      const cancelled = session.talk.cancelTurn({
        payload: { callId: session.callId, streamSid, reason: _reason },
      });
      if (cancelled.ok) {
        this.config.onTalkEvent?.(session.callId, session.streamSid, cancelled.event);
      }
    }
    this.clearAudio(streamSid);
  }

  /**
   * Get active session by call ID.
   */
  getSessionByCallId(callId: string): StreamSession | undefined {
    return [...this.sessions.values()].find((session) => session.callId === callId);
  }

  /**
   * Close all sessions.
   */
  closeAll(): void {
    for (const session of this.sessions.values()) {
      this.clearTtsState(session.streamSid);
      session.sttSession.close();
      session.ws.close();
    }
    this.sessions.clear();
  }

  /** Return the stream's TTS queue, creating an empty one on first use. */
  private getTtsQueue(streamSid: string): TtsQueueEntry[] {
    const existing = this.ttsQueues.get(streamSid);
    if (existing) {
      return existing;
    }
    const queue: TtsQueueEntry[] = [];
    this.ttsQueues.set(streamSid, queue);
    return queue;
  }

  /**
   * Process the TTS queue for a stream.
   * Uses iterative approach to avoid stack accumulation from recursion.
   */
  private async processQueue(streamSid: string): Promise<void> {
    this.ttsPlaying.set(streamSid, true);

    while (true) {
      const entry = this.ttsQueues.get(streamSid)?.shift();
      if (!entry) {
        // Drained (or torn down while playing): drop every per-stream record
        // instead of leaving `false`/empty entries behind for finished streams.
        this.ttsPlaying.delete(streamSid);
        this.ttsActiveControllers.delete(streamSid);
        this.ttsQueues.delete(streamSid);
        return;
      }

      this.ttsActiveControllers.set(streamSid, entry.controller);
      const session = this.sessions.get(streamSid);
      let playbackTurnId: string | undefined;

      try {
        if (session) {
          playbackTurnId = this.ensureActiveTurn(session);
          this.emitTalkEvent(session, {
            type: "output.audio.started",
            turnId: playbackTurnId,
            payload: { callId: session.callId, streamSid },
          });
        }
        await entry.playFn(entry.controller.signal);
        if (entry.controller.signal.aborted) {
          // Cancelled playback counts as settled; no `done` event is emitted.
          entry.resolve();
          continue;
        }
        if (session) {
          const turnId = playbackTurnId ?? this.ensureActiveTurn(session);
          this.emitTalkEvent(session, {
            type: "output.audio.done",
            turnId,
            final: true,
            payload: { callId: session.callId, streamSid },
          });
          if (session.talk.activeTurnId) {
            const ended = session.talk.endTurn({
              payload: { callId: session.callId, streamSid },
            });
            if (ended.ok) {
              this.config.onTalkEvent?.(session.callId, session.streamSid, ended.event);
            }
          }
        }
        entry.resolve();
      } catch (error) {
        if (entry.controller.signal.aborted) {
          // Errors caused by an abort are expected and not reported to the caller.
          entry.resolve();
        } else {
          console.error("[MediaStream] TTS playback error:", error);
          entry.reject(error);
        }
      } finally {
        if (this.ttsActiveControllers.get(streamSid) === entry.controller) {
          this.ttsActiveControllers.delete(streamSid);
        }
      }
    }
  }

  /** Create the Talk session controller that tracks turns and events for one stream. */
  private createTalkEvents(callId: string, streamSid: string): TalkSessionController {
    return createTalkSessionController(
      {
        sessionId: `voice-call:${callId}:${streamSid}`,
        mode: "stt-tts",
        transport: "gateway-relay",
        brain: "agent-consult",
        provider: this.config.transcriptionProvider.id,
        turnIdPrefix: `${streamSid}:turn`,
      },
      { onEvent: recordTalkObservabilityEvent },
    );
  }

  /** Emit a Talk event through the session's controller and forward it to `onTalkEvent`. */
  private emitTalkEvent(session: StreamSession, input: TalkEventInput): void {
    const event = session.talk.emit(input);
    this.config.onTalkEvent?.(session.callId, session.streamSid, event);
  }

  /**
   * Make sure the session has a turn and return its id, forwarding the event
   * returned by the controller (if any) to `onTalkEvent`.
   */
  private ensureActiveTurn(session: StreamSession): string {
    const turn = session.talk.ensureTurn({
      payload: { callId: session.callId, streamSid: session.streamSid },
    });
    if (turn.event) {
      this.config.onTalkEvent?.(session.callId, session.streamSid, turn.event);
    }
    return turn.turnId;
  }

  /** Abort and settle all TTS work for a stream and drop its TTS bookkeeping. */
  private clearTtsState(streamSid: string): void {
    const queue = this.ttsQueues.get(streamSid);
    if (queue) {
      this.resolveQueuedTtsEntries(queue);
    }
    this.ttsActiveControllers.get(streamSid)?.abort();
    this.ttsActiveControllers.delete(streamSid);
    this.ttsPlaying.delete(streamSid);
    this.ttsQueues.delete(streamSid);
  }

  /** Empty the queue in place, aborting and resolving every entry that had not started. */
  private resolveQueuedTtsEntries(queue: TtsQueueEntry[]): void {
    const pending = queue.splice(0);
    for (const entry of pending) {
      entry.controller.abort();
      entry.resolve();
    }
  }
}
847
-
848
/**
 * Twilio Media Stream message format.
 *
 * Every field except `event` is optional because each frame type carries a
 * different subset of them.
 */
interface TwilioMediaMessage {
  /** Frame type; the connection handler dispatches on this value. */
  event: "connected" | "start" | "media" | "stop" | "mark" | "clear";
  sequenceNumber?: string;
  /** Stream identifier; used as the session key when a stream starts. */
  streamSid?: string;
  /** Details carried by a `start` frame. */
  start?: {
    streamSid: string;
    accountSid: string;
    /** Call identifier; a `start` frame without it is rejected. */
    callSid: string;
    tracks: string[];
    /** Custom parameters; the handler reads an optional `token` entry from here. */
    customParameters?: Record<string, string>;
    mediaFormat: {
      encoding: string;
      sampleRate: number;
      channels: number;
    };
  };
  /** Details carried by a `media` frame. */
  media?: {
    track?: string;
    chunk?: string;
    timestamp?: string;
    /** Base64-encoded audio; decoded and forwarded to the transcription session. */
    payload?: string;
  };
  /** Details carried by a `mark` frame. */
  mark?: {
    name: string;
  };
}