@openclaw/voice-call 2026.3.13 → 2026.5.2-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +27 -5
  2. package/api.ts +16 -0
  3. package/cli-metadata.ts +10 -0
  4. package/config-api.ts +12 -0
  5. package/index.test.ts +943 -0
  6. package/index.ts +379 -149
  7. package/openclaw.plugin.json +384 -157
  8. package/package.json +35 -5
  9. package/runtime-api.ts +20 -0
  10. package/runtime-entry.ts +1 -0
  11. package/setup-api.ts +47 -0
  12. package/src/allowlist.test.ts +18 -0
  13. package/src/cli.ts +533 -68
  14. package/src/config-compat.test.ts +120 -0
  15. package/src/config-compat.ts +227 -0
  16. package/src/config.test.ts +273 -12
  17. package/src/config.ts +355 -72
  18. package/src/core-bridge.ts +2 -147
  19. package/src/deep-merge.test.ts +40 -0
  20. package/src/gateway-continue-operation.ts +200 -0
  21. package/src/http-headers.ts +6 -3
  22. package/src/manager/context.ts +6 -5
  23. package/src/manager/events.test.ts +243 -19
  24. package/src/manager/events.ts +61 -31
  25. package/src/manager/lifecycle.ts +53 -0
  26. package/src/manager/lookup.test.ts +52 -0
  27. package/src/manager/outbound.test.ts +528 -0
  28. package/src/manager/outbound.ts +163 -57
  29. package/src/manager/store.ts +18 -6
  30. package/src/manager/timers.test.ts +129 -0
  31. package/src/manager/timers.ts +4 -3
  32. package/src/manager/twiml.test.ts +13 -0
  33. package/src/manager/twiml.ts +8 -0
  34. package/src/manager.closed-loop.test.ts +30 -12
  35. package/src/manager.inbound-allowlist.test.ts +77 -10
  36. package/src/manager.notify.test.ts +344 -20
  37. package/src/manager.restore.test.ts +95 -8
  38. package/src/manager.test-harness.ts +8 -6
  39. package/src/manager.ts +79 -5
  40. package/src/media-stream.test.ts +578 -81
  41. package/src/media-stream.ts +235 -54
  42. package/src/providers/base.ts +19 -0
  43. package/src/providers/mock.ts +7 -1
  44. package/src/providers/plivo.test.ts +50 -6
  45. package/src/providers/plivo.ts +14 -6
  46. package/src/providers/shared/call-status.ts +2 -1
  47. package/src/providers/shared/guarded-json-api.test.ts +106 -0
  48. package/src/providers/shared/guarded-json-api.ts +1 -1
  49. package/src/providers/telnyx.test.ts +178 -6
  50. package/src/providers/telnyx.ts +40 -3
  51. package/src/providers/twilio/api.test.ts +145 -0
  52. package/src/providers/twilio/api.ts +67 -16
  53. package/src/providers/twilio/twiml-policy.ts +6 -10
  54. package/src/providers/twilio/webhook.ts +1 -1
  55. package/src/providers/twilio.test.ts +425 -25
  56. package/src/providers/twilio.ts +230 -77
  57. package/src/providers/twilio.types.ts +17 -0
  58. package/src/realtime-defaults.ts +3 -0
  59. package/src/realtime-fast-context.test.ts +88 -0
  60. package/src/realtime-fast-context.ts +165 -0
  61. package/src/realtime-transcription.runtime.ts +4 -0
  62. package/src/realtime-voice.runtime.ts +5 -0
  63. package/src/response-generator.test.ts +321 -0
  64. package/src/response-generator.ts +213 -53
  65. package/src/response-model.test.ts +71 -0
  66. package/src/response-model.ts +23 -0
  67. package/src/runtime.test.ts +429 -0
  68. package/src/runtime.ts +270 -24
  69. package/src/telephony-audio.test.ts +61 -0
  70. package/src/telephony-audio.ts +1 -79
  71. package/src/telephony-tts.test.ts +133 -12
  72. package/src/telephony-tts.ts +155 -2
  73. package/src/test-fixtures.ts +28 -7
  74. package/src/tts-provider-voice.test.ts +34 -0
  75. package/src/tts-provider-voice.ts +21 -0
  76. package/src/tunnel.test.ts +166 -0
  77. package/src/tunnel.ts +1 -1
  78. package/src/types.ts +24 -37
  79. package/src/utils.test.ts +17 -0
  80. package/src/voice-mapping.test.ts +34 -0
  81. package/src/voice-mapping.ts +3 -2
  82. package/src/webhook/realtime-handler.test.ts +598 -0
  83. package/src/webhook/realtime-handler.ts +485 -0
  84. package/src/webhook/stale-call-reaper.test.ts +88 -0
  85. package/src/webhook/stale-call-reaper.ts +5 -0
  86. package/src/webhook/tailscale.test.ts +214 -0
  87. package/src/webhook/tailscale.ts +19 -5
  88. package/src/webhook-exposure.test.ts +33 -0
  89. package/src/webhook-exposure.ts +84 -0
  90. package/src/webhook-security.test.ts +172 -21
  91. package/src/webhook-security.ts +43 -29
  92. package/src/webhook.hangup-once.lifecycle.test.ts +135 -0
  93. package/src/webhook.test.ts +1145 -27
  94. package/src/webhook.ts +523 -102
  95. package/src/webhook.types.ts +5 -0
  96. package/src/websocket-test-support.ts +72 -0
  97. package/tsconfig.json +16 -0
  98. package/CHANGELOG.md +0 -121
  99. package/src/providers/index.ts +0 -10
  100. package/src/providers/stt-openai-realtime.test.ts +0 -42
  101. package/src/providers/stt-openai-realtime.ts +0 -311
  102. package/src/providers/tts-openai.test.ts +0 -43
  103. package/src/providers/tts-openai.ts +0 -221
@@ -1,33 +1,52 @@
1
- import { once } from "node:events";
2
- import http from "node:http";
3
- import { describe, expect, it } from "vitest";
4
- import { WebSocket } from "ws";
5
- import { MediaStreamHandler } from "./media-stream.js";
1
+ import type { IncomingMessage } from "node:http";
2
+ import net from "node:net";
6
3
  import type {
7
- OpenAIRealtimeSTTProvider,
8
- RealtimeSTTSession,
9
- } from "./providers/stt-openai-realtime.js";
10
-
11
- const createStubSession = (): RealtimeSTTSession => ({
4
+ RealtimeTranscriptionProviderPlugin,
5
+ RealtimeTranscriptionSession,
6
+ } from "openclaw/plugin-sdk/realtime-transcription";
7
+ import { describe, expect, it, vi } from "vitest";
8
+ import { WebSocket } from "ws";
9
+ import { MediaStreamHandler, sanitizeLogText } from "./media-stream.js";
10
+ import {
11
+ connectWs,
12
+ startUpgradeWsServer,
13
+ waitForClose,
14
+ withTimeout,
15
+ } from "./websocket-test-support.js";
16
+
17
+ const createStubSession = (): RealtimeTranscriptionSession => ({
12
18
  connect: async () => {},
13
19
  sendAudio: () => {},
14
- waitForTranscript: async () => "",
15
- onPartial: () => {},
16
- onTranscript: () => {},
17
- onSpeechStart: () => {},
18
20
  close: () => {},
19
21
  isConnected: () => true,
20
22
  });
21
23
 
22
- const createStubSttProvider = (): OpenAIRealtimeSTTProvider =>
24
+ const createStubSttProvider = (): RealtimeTranscriptionProviderPlugin =>
23
25
  ({
24
26
  createSession: () => createStubSession(),
25
- }) as unknown as OpenAIRealtimeSTTProvider;
27
+ id: "openai",
28
+ label: "OpenAI",
29
+ isConfigured: () => true,
30
+ }) as unknown as RealtimeTranscriptionProviderPlugin;
26
31
 
27
32
  const flush = async (): Promise<void> => {
28
33
  await new Promise((resolve) => setTimeout(resolve, 0));
29
34
  };
30
35
 
36
+ const createDeferred = (): {
37
+ promise: Promise<void>;
38
+ resolve: () => void;
39
+ reject: (error: Error) => void;
40
+ } => {
41
+ let resolve!: () => void;
42
+ let reject!: (error: Error) => void;
43
+ const promise = new Promise<void>((resolvePromise, rejectPromise) => {
44
+ resolve = resolvePromise;
45
+ reject = rejectPromise;
46
+ });
47
+ return { promise, resolve, reject };
48
+ };
49
+
31
50
  const waitForAbort = (signal: AbortSignal): Promise<void> =>
32
51
  new Promise((resolve) => {
33
52
  if (signal.aborted) {
@@ -37,74 +56,24 @@ const waitForAbort = (signal: AbortSignal): Promise<void> =>
37
56
  signal.addEventListener("abort", () => resolve(), { once: true });
38
57
  });
39
58
 
40
- const withTimeout = async <T>(promise: Promise<T>, timeoutMs = 2000): Promise<T> => {
41
- let timer: ReturnType<typeof setTimeout> | null = null;
42
- const timeout = new Promise<never>((_, reject) => {
43
- timer = setTimeout(() => reject(new Error(`Timed out after ${timeoutMs}ms`)), timeoutMs);
44
- });
45
-
46
- try {
47
- return await Promise.race([promise, timeout]);
48
- } finally {
49
- if (timer) {
50
- clearTimeout(timer);
51
- }
52
- }
53
- };
54
-
55
59
  const startWsServer = async (
56
60
  handler: MediaStreamHandler,
57
61
  ): Promise<{
58
62
  url: string;
59
63
  close: () => Promise<void>;
60
- }> => {
61
- const server = http.createServer();
62
- server.on("upgrade", (request, socket, head) => {
63
- handler.handleUpgrade(request, socket, head);
64
- });
65
-
66
- await new Promise<void>((resolve) => {
67
- server.listen(0, "127.0.0.1", resolve);
68
- });
69
-
70
- const address = server.address();
71
- if (!address || typeof address === "string") {
72
- throw new Error("Failed to resolve test server address");
73
- }
74
-
75
- return {
76
- url: `ws://127.0.0.1:${address.port}/voice/stream`,
77
- close: async () => {
78
- await new Promise<void>((resolve, reject) => {
79
- server.close((err) => (err ? reject(err) : resolve()));
80
- });
64
+ }> =>
65
+ startUpgradeWsServer({
66
+ urlPath: "/voice/stream",
67
+ onUpgrade: (request, socket, head) => {
68
+ handler.handleUpgrade(request, socket, head);
81
69
  },
82
- };
83
- };
84
-
85
- const connectWs = async (url: string): Promise<WebSocket> => {
86
- const ws = new WebSocket(url);
87
- await withTimeout(once(ws, "open") as Promise<[unknown]>);
88
- return ws;
89
- };
90
-
91
- const waitForClose = async (
92
- ws: WebSocket,
93
- ): Promise<{
94
- code: number;
95
- reason: string;
96
- }> => {
97
- const [code, reason] = (await withTimeout(once(ws, "close") as Promise<[number, Buffer]>)) ?? [];
98
- return {
99
- code,
100
- reason: Buffer.isBuffer(reason) ? reason.toString() : String(reason || ""),
101
- };
102
- };
70
+ });
103
71
 
104
72
  describe("MediaStreamHandler TTS queue", () => {
105
73
  it("serializes TTS playback and resolves in order", async () => {
106
74
  const handler = new MediaStreamHandler({
107
- sttProvider: createStubSttProvider(),
75
+ transcriptionProvider: createStubSttProvider(),
76
+ providerConfig: {},
108
77
  });
109
78
  const started: number[] = [];
110
79
  const finished: number[] = [];
@@ -137,7 +106,8 @@ describe("MediaStreamHandler TTS queue", () => {
137
106
 
138
107
  it("cancels active playback and clears queued items", async () => {
139
108
  const handler = new MediaStreamHandler({
140
- sttProvider: createStubSttProvider(),
109
+ transcriptionProvider: createStubSttProvider(),
110
+ providerConfig: {},
141
111
  });
142
112
 
143
113
  let queuedRan = false;
@@ -147,7 +117,7 @@ describe("MediaStreamHandler TTS queue", () => {
147
117
  started.push("active");
148
118
  await waitForAbort(signal);
149
119
  });
150
- void handler.queueTts("stream-1", async () => {
120
+ const queued = handler.queueTts("stream-1", async () => {
151
121
  queuedRan = true;
152
122
  });
153
123
 
@@ -156,18 +126,134 @@ describe("MediaStreamHandler TTS queue", () => {
156
126
 
157
127
  handler.clearTtsQueue("stream-1");
158
128
  await active;
129
+ await withTimeout(queued);
159
130
  await flush();
160
131
 
161
132
  expect(queuedRan).toBe(false);
162
133
  });
134
+
135
+ it("resolves pending queued playback during stream teardown", async () => {
136
+ const handler = new MediaStreamHandler({
137
+ transcriptionProvider: createStubSttProvider(),
138
+ providerConfig: {},
139
+ });
140
+
141
+ let queuedRan = false;
142
+ const active = handler.queueTts("stream-1", async (signal) => {
143
+ await waitForAbort(signal);
144
+ });
145
+ const queued = handler.queueTts("stream-1", async () => {
146
+ queuedRan = true;
147
+ });
148
+
149
+ await flush();
150
+ (
151
+ handler as unknown as {
152
+ clearTtsState(streamSid: string): void;
153
+ }
154
+ ).clearTtsState("stream-1");
155
+
156
+ await withTimeout(active);
157
+ await withTimeout(queued);
158
+ expect(queuedRan).toBe(false);
159
+ });
163
160
  });
164
161
 
165
162
  describe("MediaStreamHandler security hardening", () => {
163
+ it("fails sends and closes stream when buffered bytes already exceed the cap", () => {
164
+ const handler = new MediaStreamHandler({
165
+ transcriptionProvider: createStubSttProvider(),
166
+ providerConfig: {},
167
+ });
168
+ const ws = {
169
+ readyState: WebSocket.OPEN,
170
+ bufferedAmount: 2 * 1024 * 1024,
171
+ send: vi.fn(),
172
+ close: vi.fn(),
173
+ } as unknown as WebSocket;
174
+ (
175
+ handler as unknown as {
176
+ sessions: Map<
177
+ string,
178
+ {
179
+ callId: string;
180
+ streamSid: string;
181
+ ws: WebSocket;
182
+ sttSession: RealtimeTranscriptionSession;
183
+ }
184
+ >;
185
+ }
186
+ ).sessions.set("MZ-backpressure", {
187
+ callId: "CA-backpressure",
188
+ streamSid: "MZ-backpressure",
189
+ ws,
190
+ sttSession: createStubSession(),
191
+ });
192
+
193
+ const result = handler.sendAudio("MZ-backpressure", Buffer.alloc(160, 0xff));
194
+
195
+ expect(result.sent).toBe(false);
196
+ expect(ws.send).not.toHaveBeenCalled();
197
+ expect(ws.close).toHaveBeenCalledWith(1013, "Backpressure: send buffer exceeded");
198
+ });
199
+
200
+ it("fails sends when buffered bytes exceed cap after enqueueing a frame", () => {
201
+ const handler = new MediaStreamHandler({
202
+ transcriptionProvider: createStubSttProvider(),
203
+ providerConfig: {},
204
+ });
205
+ const ws = {
206
+ readyState: WebSocket.OPEN,
207
+ bufferedAmount: 0,
208
+ send: vi.fn(() => {
209
+ (
210
+ ws as unknown as {
211
+ bufferedAmount: number;
212
+ }
213
+ ).bufferedAmount = 2 * 1024 * 1024;
214
+ }),
215
+ close: vi.fn(),
216
+ } as unknown as WebSocket;
217
+ (
218
+ handler as unknown as {
219
+ sessions: Map<
220
+ string,
221
+ {
222
+ callId: string;
223
+ streamSid: string;
224
+ ws: WebSocket;
225
+ sttSession: RealtimeTranscriptionSession;
226
+ }
227
+ >;
228
+ }
229
+ ).sessions.set("MZ-overflow", {
230
+ callId: "CA-overflow",
231
+ streamSid: "MZ-overflow",
232
+ ws,
233
+ sttSession: createStubSession(),
234
+ });
235
+
236
+ const result = handler.sendMark("MZ-overflow", "mark-1");
237
+
238
+ expect(ws.send).toHaveBeenCalledTimes(1);
239
+ expect(result.sent).toBe(false);
240
+ expect(ws.close).toHaveBeenCalledWith(1013, "Backpressure: send buffer exceeded");
241
+ });
242
+
243
+ it("sanitizes websocket close reason before logging", () => {
244
+ const reason = sanitizeLogText("forged\nline\r\tentry", 120);
245
+ expect(reason).not.toContain("\n");
246
+ expect(reason).not.toContain("\r");
247
+ expect(reason).not.toContain("\t");
248
+ expect(reason).toContain("forged line entry");
249
+ });
250
+
166
251
  it("closes idle pre-start connections after timeout", async () => {
167
252
  const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
168
253
  [];
169
254
  const handler = new MediaStreamHandler({
170
- sttProvider: createStubSttProvider(),
255
+ transcriptionProvider: createStubSttProvider(),
256
+ providerConfig: {},
171
257
  preStartTimeoutMs: 40,
172
258
  shouldAcceptStream: (params) => {
173
259
  shouldAcceptStreamCalls.push(params);
@@ -190,7 +276,8 @@ describe("MediaStreamHandler security hardening", () => {
190
276
 
191
277
  it("enforces pending connection limits", async () => {
192
278
  const handler = new MediaStreamHandler({
193
- sttProvider: createStubSttProvider(),
279
+ transcriptionProvider: createStubSttProvider(),
280
+ providerConfig: {},
194
281
  preStartTimeoutMs: 5_000,
195
282
  maxPendingConnections: 1,
196
283
  maxPendingConnectionsPerIp: 1,
@@ -213,9 +300,46 @@ describe("MediaStreamHandler security hardening", () => {
213
300
  }
214
301
  });
215
302
 
303
+ it("uses resolved client IPs for per-IP pending limits", async () => {
304
+ const handler = new MediaStreamHandler({
305
+ transcriptionProvider: createStubSttProvider(),
306
+ providerConfig: {},
307
+ preStartTimeoutMs: 5_000,
308
+ maxPendingConnections: 10,
309
+ maxPendingConnectionsPerIp: 1,
310
+ resolveClientIp: (request) => String(request.headers["x-forwarded-for"] ?? ""),
311
+ });
312
+ const server = await startWsServer(handler);
313
+
314
+ try {
315
+ const first = new WebSocket(server.url, {
316
+ headers: { "x-forwarded-for": "198.51.100.10" },
317
+ });
318
+ await withTimeout(new Promise((resolve) => first.once("open", resolve)));
319
+
320
+ const second = new WebSocket(server.url, {
321
+ headers: { "x-forwarded-for": "203.0.113.20" },
322
+ });
323
+ await withTimeout(new Promise((resolve) => second.once("open", resolve)));
324
+
325
+ expect(first.readyState).toBe(WebSocket.OPEN);
326
+ expect(second.readyState).toBe(WebSocket.OPEN);
327
+
328
+ const firstClosed = waitForClose(first);
329
+ const secondClosed = waitForClose(second);
330
+ first.close();
331
+ second.close();
332
+ await firstClosed;
333
+ await secondClosed;
334
+ } finally {
335
+ await server.close();
336
+ }
337
+ });
338
+
216
339
  it("rejects upgrades when max connection cap is reached", async () => {
217
340
  const handler = new MediaStreamHandler({
218
- sttProvider: createStubSttProvider(),
341
+ transcriptionProvider: createStubSttProvider(),
342
+ providerConfig: {},
219
343
  preStartTimeoutMs: 5_000,
220
344
  maxConnections: 1,
221
345
  maxPendingConnections: 10,
@@ -228,7 +352,7 @@ describe("MediaStreamHandler security hardening", () => {
228
352
  const secondError = await withTimeout(
229
353
  new Promise<Error>((resolve) => {
230
354
  const ws = new WebSocket(server.url);
231
- ws.once("error", (err) => resolve(err as Error));
355
+ ws.once("error", (err) => resolve(err));
232
356
  }),
233
357
  );
234
358
 
@@ -241,9 +365,132 @@ describe("MediaStreamHandler security hardening", () => {
241
365
  }
242
366
  });
243
367
 
368
+ it("counts in-flight upgrades against the max connection cap", () => {
369
+ const handler = new MediaStreamHandler({
370
+ transcriptionProvider: createStubSttProvider(),
371
+ providerConfig: {},
372
+ maxConnections: 2,
373
+ maxPendingConnections: 10,
374
+ maxPendingConnectionsPerIp: 10,
375
+ });
376
+
377
+ const fakeWss = {
378
+ clients: new Set([{}]),
379
+ handleUpgrade: vi.fn(),
380
+ emit: vi.fn(),
381
+ on: vi.fn(),
382
+ };
383
+ let upgradeCallback: ((ws: WebSocket) => void) | null = null;
384
+ fakeWss.handleUpgrade.mockImplementation(
385
+ (
386
+ _request: IncomingMessage,
387
+ _socket: unknown,
388
+ _head: Buffer,
389
+ callback: (ws: WebSocket) => void,
390
+ ) => {
391
+ upgradeCallback = callback;
392
+ },
393
+ );
394
+
395
+ (
396
+ handler as unknown as {
397
+ wss: typeof fakeWss;
398
+ }
399
+ ).wss = fakeWss;
400
+
401
+ const firstSocket = {
402
+ once: vi.fn(),
403
+ removeListener: vi.fn(),
404
+ write: vi.fn(),
405
+ destroy: vi.fn(),
406
+ };
407
+ handler.handleUpgrade(
408
+ { socket: { remoteAddress: "127.0.0.1" } } as IncomingMessage,
409
+ firstSocket as never,
410
+ Buffer.alloc(0),
411
+ );
412
+
413
+ const secondSocket = {
414
+ once: vi.fn(),
415
+ removeListener: vi.fn(),
416
+ write: vi.fn(),
417
+ destroy: vi.fn(),
418
+ };
419
+ handler.handleUpgrade(
420
+ { socket: { remoteAddress: "127.0.0.1" } } as IncomingMessage,
421
+ secondSocket as never,
422
+ Buffer.alloc(0),
423
+ );
424
+
425
+ expect(fakeWss.handleUpgrade).toHaveBeenCalledTimes(1);
426
+ expect(secondSocket.write).toHaveBeenCalledOnce();
427
+ expect(secondSocket.destroy).toHaveBeenCalledOnce();
428
+
429
+ expect(upgradeCallback).not.toBeNull();
430
+ const completeUpgrade = upgradeCallback as ((ws: WebSocket) => void) | null;
431
+ if (!completeUpgrade) {
432
+ throw new Error("Expected upgrade callback to be registered");
433
+ }
434
+ completeUpgrade({} as WebSocket);
435
+ expect(fakeWss.emit).toHaveBeenCalledWith(
436
+ "connection",
437
+ expect.anything(),
438
+ expect.objectContaining({ socket: { remoteAddress: "127.0.0.1" } }),
439
+ );
440
+ });
441
+
442
+ it("releases in-flight reservations when ws rejects a malformed upgrade before the callback", async () => {
443
+ const handler = new MediaStreamHandler({
444
+ transcriptionProvider: createStubSttProvider(),
445
+ providerConfig: {},
446
+ preStartTimeoutMs: 5_000,
447
+ maxConnections: 1,
448
+ maxPendingConnections: 10,
449
+ maxPendingConnectionsPerIp: 10,
450
+ });
451
+ const server = await startWsServer(handler);
452
+ const serverUrl = new URL(server.url);
453
+
454
+ try {
455
+ await withTimeout(
456
+ new Promise<void>((resolve, reject) => {
457
+ const socket = net.createConnection(
458
+ { host: serverUrl.hostname, port: Number(serverUrl.port) },
459
+ () => {
460
+ socket.write(
461
+ [
462
+ "GET /voice/stream HTTP/1.1",
463
+ `Host: ${serverUrl.host}`,
464
+ "Upgrade: websocket",
465
+ "Connection: Upgrade",
466
+ "Sec-WebSocket-Version: 13",
467
+ "",
468
+ "",
469
+ ].join("\r\n"),
470
+ );
471
+ },
472
+ );
473
+ socket.once("error", reject);
474
+ socket.once("data", () => {
475
+ socket.end();
476
+ });
477
+ socket.once("close", () => resolve());
478
+ }),
479
+ );
480
+
481
+ const ws = await connectWs(server.url);
482
+ expect(ws.readyState).toBe(WebSocket.OPEN);
483
+ ws.close();
484
+ await waitForClose(ws);
485
+ } finally {
486
+ await server.close();
487
+ }
488
+ });
489
+
244
490
  it("clears pending state after valid start", async () => {
245
491
  const handler = new MediaStreamHandler({
246
- sttProvider: createStubSttProvider(),
492
+ transcriptionProvider: createStubSttProvider(),
493
+ providerConfig: {},
247
494
  preStartTimeoutMs: 40,
248
495
  shouldAcceptStream: () => true,
249
496
  });
@@ -268,4 +515,254 @@ describe("MediaStreamHandler security hardening", () => {
268
515
  await server.close();
269
516
  }
270
517
  });
518
+
519
+ it("keeps accepted streams alive while STT readiness exceeds the pre-start timeout", async () => {
520
+ const sttReady = createDeferred();
521
+ const sttConnectStarted = createDeferred();
522
+ const transcriptionReady = createDeferred();
523
+ const events: string[] = [];
524
+
525
+ const session: RealtimeTranscriptionSession = {
526
+ connect: async () => {
527
+ events.push("stt-connect-start");
528
+ sttConnectStarted.resolve();
529
+ await sttReady.promise;
530
+ events.push("stt-connect-ready");
531
+ },
532
+ sendAudio: () => {},
533
+ close: () => {},
534
+ isConnected: () => false,
535
+ };
536
+
537
+ const handler = new MediaStreamHandler({
538
+ transcriptionProvider: {
539
+ createSession: () => session,
540
+ id: "openai",
541
+ label: "OpenAI",
542
+ isConfigured: () => true,
543
+ },
544
+ providerConfig: {},
545
+ preStartTimeoutMs: 40,
546
+ shouldAcceptStream: () => true,
547
+ onConnect: () => {
548
+ events.push("onConnect");
549
+ },
550
+ onTranscriptionReady: () => {
551
+ events.push("onTranscriptionReady");
552
+ transcriptionReady.resolve();
553
+ },
554
+ });
555
+ const server = await startWsServer(handler);
556
+
557
+ try {
558
+ const ws = await connectWs(server.url);
559
+ ws.send(
560
+ JSON.stringify({
561
+ event: "start",
562
+ streamSid: "MZ-slow-stt",
563
+ start: { callSid: "CA-slow-stt" },
564
+ }),
565
+ );
566
+
567
+ await withTimeout(sttConnectStarted.promise);
568
+ await new Promise((resolve) => setTimeout(resolve, 80));
569
+ expect(ws.readyState).toBe(WebSocket.OPEN);
570
+ expect(events).toEqual(["onConnect", "stt-connect-start"]);
571
+
572
+ sttReady.resolve();
573
+ await withTimeout(transcriptionReady.promise);
574
+ expect(events).toEqual([
575
+ "onConnect",
576
+ "stt-connect-start",
577
+ "stt-connect-ready",
578
+ "onTranscriptionReady",
579
+ ]);
580
+
581
+ ws.close();
582
+ await waitForClose(ws);
583
+ } finally {
584
+ await server.close();
585
+ }
586
+ });
587
+
588
+ it("forwards early Twilio media into the STT session before readiness", async () => {
589
+ const sttReady = createDeferred();
590
+ const sttConnectStarted = createDeferred();
591
+ const transcriptionReady = createDeferred();
592
+ const audioReceived = createDeferred();
593
+ const receivedAudio: Buffer[] = [];
594
+ let onConnectCalls = 0;
595
+ let onTranscriptionReadyCalls = 0;
596
+
597
+ const session: RealtimeTranscriptionSession = {
598
+ connect: async () => {
599
+ sttConnectStarted.resolve();
600
+ await sttReady.promise;
601
+ },
602
+ sendAudio: (audio) => {
603
+ receivedAudio.push(Buffer.from(audio));
604
+ audioReceived.resolve();
605
+ },
606
+ close: () => {},
607
+ isConnected: () => false,
608
+ };
609
+
610
+ const handler = new MediaStreamHandler({
611
+ transcriptionProvider: {
612
+ createSession: () => session,
613
+ id: "openai",
614
+ label: "OpenAI",
615
+ isConfigured: () => true,
616
+ },
617
+ providerConfig: {},
618
+ shouldAcceptStream: () => true,
619
+ onConnect: () => {
620
+ onConnectCalls += 1;
621
+ },
622
+ onTranscriptionReady: () => {
623
+ onTranscriptionReadyCalls += 1;
624
+ transcriptionReady.resolve();
625
+ },
626
+ });
627
+ const server = await startWsServer(handler);
628
+ let ws: WebSocket | undefined;
629
+
630
+ try {
631
+ ws = await connectWs(server.url);
632
+ ws.send(
633
+ JSON.stringify({
634
+ event: "start",
635
+ streamSid: "MZ-early-media",
636
+ start: { callSid: "CA-early-media" },
637
+ }),
638
+ );
639
+
640
+ await withTimeout(sttConnectStarted.promise);
641
+ ws.send(
642
+ JSON.stringify({
643
+ event: "media",
644
+ streamSid: "MZ-early-media",
645
+ media: { payload: Buffer.from("early").toString("base64") },
646
+ }),
647
+ );
648
+ await withTimeout(audioReceived.promise);
649
+
650
+ expect(Buffer.concat(receivedAudio).toString()).toBe("early");
651
+ expect(onConnectCalls).toBe(1);
652
+ expect(onTranscriptionReadyCalls).toBe(0);
653
+
654
+ sttReady.resolve();
655
+ await withTimeout(transcriptionReady.promise);
656
+ expect(onConnectCalls).toBe(1);
657
+ expect(onTranscriptionReadyCalls).toBe(1);
658
+ } finally {
659
+ sttReady.resolve();
660
+ if (ws) {
661
+ if (ws.readyState === WebSocket.OPEN) {
662
+ ws.close();
663
+ }
664
+ if (ws.readyState !== WebSocket.CLOSED) {
665
+ await waitForClose(ws).catch(() => {});
666
+ }
667
+ }
668
+ await server.close();
669
+ }
670
+ });
671
+
672
+ it("closes the media stream and disconnects once when STT readiness fails", async () => {
673
+ const sttConnectStarted = createDeferred();
674
+ const onDisconnectReady = createDeferred();
675
+ const onConnect = vi.fn();
676
+ const onTranscriptionReady = vi.fn();
677
+ const onDisconnect = vi.fn(() => {
678
+ onDisconnectReady.resolve();
679
+ });
680
+
681
+ const session: RealtimeTranscriptionSession = {
682
+ connect: async () => {
683
+ sttConnectStarted.resolve();
684
+ throw new Error("provider unavailable");
685
+ },
686
+ sendAudio: () => {},
687
+ close: vi.fn(),
688
+ isConnected: () => false,
689
+ };
690
+
691
+ const handler = new MediaStreamHandler({
692
+ transcriptionProvider: {
693
+ createSession: () => session,
694
+ id: "openai",
695
+ label: "OpenAI",
696
+ isConfigured: () => true,
697
+ },
698
+ providerConfig: {},
699
+ shouldAcceptStream: () => true,
700
+ onConnect,
701
+ onTranscriptionReady,
702
+ onDisconnect,
703
+ });
704
+ const server = await startWsServer(handler);
705
+
706
+ try {
707
+ const ws = await connectWs(server.url);
708
+ ws.send(
709
+ JSON.stringify({
710
+ event: "start",
711
+ streamSid: "MZ-stt-fail",
712
+ start: { callSid: "CA-stt-fail" },
713
+ }),
714
+ );
715
+
716
+ await withTimeout(sttConnectStarted.promise);
717
+ const closed = await waitForClose(ws);
718
+ await withTimeout(onDisconnectReady.promise);
719
+
720
+ expect(closed.code).toBe(1011);
721
+ expect(closed.reason).toBe("STT connection failed");
722
+ expect(onConnect).toHaveBeenCalledTimes(1);
723
+ expect(onConnect).toHaveBeenCalledWith("CA-stt-fail", "MZ-stt-fail");
724
+ expect(onTranscriptionReady).not.toHaveBeenCalled();
725
+ expect(onDisconnect).toHaveBeenCalledTimes(1);
726
+ expect(onDisconnect).toHaveBeenCalledWith("CA-stt-fail", "MZ-stt-fail");
727
+ expect(session.close).toHaveBeenCalledTimes(1);
728
+ } finally {
729
+ await server.close();
730
+ }
731
+ });
732
+
733
+ it("rejects oversized pre-start frames at the websocket maxPayload guard before validation runs", async () => {
734
+ const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
735
+ [];
736
+ const handler = new MediaStreamHandler({
737
+ transcriptionProvider: createStubSttProvider(),
738
+ providerConfig: {},
739
+ preStartTimeoutMs: 1_000,
740
+ shouldAcceptStream: (params) => {
741
+ shouldAcceptStreamCalls.push(params);
742
+ return true;
743
+ },
744
+ });
745
+ const server = await startWsServer(handler);
746
+
747
+ try {
748
+ const ws = await connectWs(server.url);
749
+ ws.send(
750
+ JSON.stringify({
751
+ event: "start",
752
+ streamSid: "MZ-oversized",
753
+ start: {
754
+ callSid: "CA-oversized",
755
+ customParameters: { token: "token-oversized", padding: "A".repeat(256 * 1024) },
756
+ },
757
+ }),
758
+ );
759
+
760
+ const closed = await waitForClose(ws);
761
+
762
+ expect(closed.code).toBe(1009);
763
+ expect(shouldAcceptStreamCalls).toEqual([]);
764
+ } finally {
765
+ await server.close();
766
+ }
767
+ });
271
768
  });