@openclaw/voice-call 2026.3.13 → 2026.5.2-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -5
- package/api.ts +16 -0
- package/cli-metadata.ts +10 -0
- package/config-api.ts +12 -0
- package/index.test.ts +943 -0
- package/index.ts +379 -149
- package/openclaw.plugin.json +384 -157
- package/package.json +35 -5
- package/runtime-api.ts +20 -0
- package/runtime-entry.ts +1 -0
- package/setup-api.ts +47 -0
- package/src/allowlist.test.ts +18 -0
- package/src/cli.ts +533 -68
- package/src/config-compat.test.ts +120 -0
- package/src/config-compat.ts +227 -0
- package/src/config.test.ts +273 -12
- package/src/config.ts +355 -72
- package/src/core-bridge.ts +2 -147
- package/src/deep-merge.test.ts +40 -0
- package/src/gateway-continue-operation.ts +200 -0
- package/src/http-headers.ts +6 -3
- package/src/manager/context.ts +6 -5
- package/src/manager/events.test.ts +243 -19
- package/src/manager/events.ts +61 -31
- package/src/manager/lifecycle.ts +53 -0
- package/src/manager/lookup.test.ts +52 -0
- package/src/manager/outbound.test.ts +528 -0
- package/src/manager/outbound.ts +163 -57
- package/src/manager/store.ts +18 -6
- package/src/manager/timers.test.ts +129 -0
- package/src/manager/timers.ts +4 -3
- package/src/manager/twiml.test.ts +13 -0
- package/src/manager/twiml.ts +8 -0
- package/src/manager.closed-loop.test.ts +30 -12
- package/src/manager.inbound-allowlist.test.ts +77 -10
- package/src/manager.notify.test.ts +344 -20
- package/src/manager.restore.test.ts +95 -8
- package/src/manager.test-harness.ts +8 -6
- package/src/manager.ts +79 -5
- package/src/media-stream.test.ts +578 -81
- package/src/media-stream.ts +235 -54
- package/src/providers/base.ts +19 -0
- package/src/providers/mock.ts +7 -1
- package/src/providers/plivo.test.ts +50 -6
- package/src/providers/plivo.ts +14 -6
- package/src/providers/shared/call-status.ts +2 -1
- package/src/providers/shared/guarded-json-api.test.ts +106 -0
- package/src/providers/shared/guarded-json-api.ts +1 -1
- package/src/providers/telnyx.test.ts +178 -6
- package/src/providers/telnyx.ts +40 -3
- package/src/providers/twilio/api.test.ts +145 -0
- package/src/providers/twilio/api.ts +67 -16
- package/src/providers/twilio/twiml-policy.ts +6 -10
- package/src/providers/twilio/webhook.ts +1 -1
- package/src/providers/twilio.test.ts +425 -25
- package/src/providers/twilio.ts +230 -77
- package/src/providers/twilio.types.ts +17 -0
- package/src/realtime-defaults.ts +3 -0
- package/src/realtime-fast-context.test.ts +88 -0
- package/src/realtime-fast-context.ts +165 -0
- package/src/realtime-transcription.runtime.ts +4 -0
- package/src/realtime-voice.runtime.ts +5 -0
- package/src/response-generator.test.ts +321 -0
- package/src/response-generator.ts +213 -53
- package/src/response-model.test.ts +71 -0
- package/src/response-model.ts +23 -0
- package/src/runtime.test.ts +429 -0
- package/src/runtime.ts +270 -24
- package/src/telephony-audio.test.ts +61 -0
- package/src/telephony-audio.ts +1 -79
- package/src/telephony-tts.test.ts +133 -12
- package/src/telephony-tts.ts +155 -2
- package/src/test-fixtures.ts +28 -7
- package/src/tts-provider-voice.test.ts +34 -0
- package/src/tts-provider-voice.ts +21 -0
- package/src/tunnel.test.ts +166 -0
- package/src/tunnel.ts +1 -1
- package/src/types.ts +24 -37
- package/src/utils.test.ts +17 -0
- package/src/voice-mapping.test.ts +34 -0
- package/src/voice-mapping.ts +3 -2
- package/src/webhook/realtime-handler.test.ts +598 -0
- package/src/webhook/realtime-handler.ts +485 -0
- package/src/webhook/stale-call-reaper.test.ts +88 -0
- package/src/webhook/stale-call-reaper.ts +5 -0
- package/src/webhook/tailscale.test.ts +214 -0
- package/src/webhook/tailscale.ts +19 -5
- package/src/webhook-exposure.test.ts +33 -0
- package/src/webhook-exposure.ts +84 -0
- package/src/webhook-security.test.ts +172 -21
- package/src/webhook-security.ts +43 -29
- package/src/webhook.hangup-once.lifecycle.test.ts +135 -0
- package/src/webhook.test.ts +1145 -27
- package/src/webhook.ts +523 -102
- package/src/webhook.types.ts +5 -0
- package/src/websocket-test-support.ts +72 -0
- package/tsconfig.json +16 -0
- package/CHANGELOG.md +0 -121
- package/src/providers/index.ts +0 -10
- package/src/providers/stt-openai-realtime.test.ts +0 -42
- package/src/providers/stt-openai-realtime.ts +0 -311
- package/src/providers/tts-openai.test.ts +0 -43
- package/src/providers/tts-openai.ts +0 -221
package/src/media-stream.test.ts
CHANGED
|
@@ -1,33 +1,52 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import
|
|
3
|
-
import { describe, expect, it } from "vitest";
|
|
4
|
-
import { WebSocket } from "ws";
|
|
5
|
-
import { MediaStreamHandler } from "./media-stream.js";
|
|
1
|
+
import type { IncomingMessage } from "node:http";
|
|
2
|
+
import net from "node:net";
|
|
6
3
|
import type {
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
} from "
|
|
10
|
-
|
|
11
|
-
|
|
4
|
+
RealtimeTranscriptionProviderPlugin,
|
|
5
|
+
RealtimeTranscriptionSession,
|
|
6
|
+
} from "openclaw/plugin-sdk/realtime-transcription";
|
|
7
|
+
import { describe, expect, it, vi } from "vitest";
|
|
8
|
+
import { WebSocket } from "ws";
|
|
9
|
+
import { MediaStreamHandler, sanitizeLogText } from "./media-stream.js";
|
|
10
|
+
import {
|
|
11
|
+
connectWs,
|
|
12
|
+
startUpgradeWsServer,
|
|
13
|
+
waitForClose,
|
|
14
|
+
withTimeout,
|
|
15
|
+
} from "./websocket-test-support.js";
|
|
16
|
+
|
|
17
|
+
const createStubSession = (): RealtimeTranscriptionSession => ({
|
|
12
18
|
connect: async () => {},
|
|
13
19
|
sendAudio: () => {},
|
|
14
|
-
waitForTranscript: async () => "",
|
|
15
|
-
onPartial: () => {},
|
|
16
|
-
onTranscript: () => {},
|
|
17
|
-
onSpeechStart: () => {},
|
|
18
20
|
close: () => {},
|
|
19
21
|
isConnected: () => true,
|
|
20
22
|
});
|
|
21
23
|
|
|
22
|
-
const createStubSttProvider = ():
|
|
24
|
+
const createStubSttProvider = (): RealtimeTranscriptionProviderPlugin =>
|
|
23
25
|
({
|
|
24
26
|
createSession: () => createStubSession(),
|
|
25
|
-
|
|
27
|
+
id: "openai",
|
|
28
|
+
label: "OpenAI",
|
|
29
|
+
isConfigured: () => true,
|
|
30
|
+
}) as unknown as RealtimeTranscriptionProviderPlugin;
|
|
26
31
|
|
|
27
32
|
const flush = async (): Promise<void> => {
|
|
28
33
|
await new Promise((resolve) => setTimeout(resolve, 0));
|
|
29
34
|
};
|
|
30
35
|
|
|
36
|
+
const createDeferred = (): {
|
|
37
|
+
promise: Promise<void>;
|
|
38
|
+
resolve: () => void;
|
|
39
|
+
reject: (error: Error) => void;
|
|
40
|
+
} => {
|
|
41
|
+
let resolve!: () => void;
|
|
42
|
+
let reject!: (error: Error) => void;
|
|
43
|
+
const promise = new Promise<void>((resolvePromise, rejectPromise) => {
|
|
44
|
+
resolve = resolvePromise;
|
|
45
|
+
reject = rejectPromise;
|
|
46
|
+
});
|
|
47
|
+
return { promise, resolve, reject };
|
|
48
|
+
};
|
|
49
|
+
|
|
31
50
|
const waitForAbort = (signal: AbortSignal): Promise<void> =>
|
|
32
51
|
new Promise((resolve) => {
|
|
33
52
|
if (signal.aborted) {
|
|
@@ -37,74 +56,24 @@ const waitForAbort = (signal: AbortSignal): Promise<void> =>
|
|
|
37
56
|
signal.addEventListener("abort", () => resolve(), { once: true });
|
|
38
57
|
});
|
|
39
58
|
|
|
40
|
-
const withTimeout = async <T>(promise: Promise<T>, timeoutMs = 2000): Promise<T> => {
|
|
41
|
-
let timer: ReturnType<typeof setTimeout> | null = null;
|
|
42
|
-
const timeout = new Promise<never>((_, reject) => {
|
|
43
|
-
timer = setTimeout(() => reject(new Error(`Timed out after ${timeoutMs}ms`)), timeoutMs);
|
|
44
|
-
});
|
|
45
|
-
|
|
46
|
-
try {
|
|
47
|
-
return await Promise.race([promise, timeout]);
|
|
48
|
-
} finally {
|
|
49
|
-
if (timer) {
|
|
50
|
-
clearTimeout(timer);
|
|
51
|
-
}
|
|
52
|
-
}
|
|
53
|
-
};
|
|
54
|
-
|
|
55
59
|
const startWsServer = async (
|
|
56
60
|
handler: MediaStreamHandler,
|
|
57
61
|
): Promise<{
|
|
58
62
|
url: string;
|
|
59
63
|
close: () => Promise<void>;
|
|
60
|
-
}> =>
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
await new Promise<void>((resolve) => {
|
|
67
|
-
server.listen(0, "127.0.0.1", resolve);
|
|
68
|
-
});
|
|
69
|
-
|
|
70
|
-
const address = server.address();
|
|
71
|
-
if (!address || typeof address === "string") {
|
|
72
|
-
throw new Error("Failed to resolve test server address");
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
return {
|
|
76
|
-
url: `ws://127.0.0.1:${address.port}/voice/stream`,
|
|
77
|
-
close: async () => {
|
|
78
|
-
await new Promise<void>((resolve, reject) => {
|
|
79
|
-
server.close((err) => (err ? reject(err) : resolve()));
|
|
80
|
-
});
|
|
64
|
+
}> =>
|
|
65
|
+
startUpgradeWsServer({
|
|
66
|
+
urlPath: "/voice/stream",
|
|
67
|
+
onUpgrade: (request, socket, head) => {
|
|
68
|
+
handler.handleUpgrade(request, socket, head);
|
|
81
69
|
},
|
|
82
|
-
};
|
|
83
|
-
};
|
|
84
|
-
|
|
85
|
-
const connectWs = async (url: string): Promise<WebSocket> => {
|
|
86
|
-
const ws = new WebSocket(url);
|
|
87
|
-
await withTimeout(once(ws, "open") as Promise<[unknown]>);
|
|
88
|
-
return ws;
|
|
89
|
-
};
|
|
90
|
-
|
|
91
|
-
const waitForClose = async (
|
|
92
|
-
ws: WebSocket,
|
|
93
|
-
): Promise<{
|
|
94
|
-
code: number;
|
|
95
|
-
reason: string;
|
|
96
|
-
}> => {
|
|
97
|
-
const [code, reason] = (await withTimeout(once(ws, "close") as Promise<[number, Buffer]>)) ?? [];
|
|
98
|
-
return {
|
|
99
|
-
code,
|
|
100
|
-
reason: Buffer.isBuffer(reason) ? reason.toString() : String(reason || ""),
|
|
101
|
-
};
|
|
102
|
-
};
|
|
70
|
+
});
|
|
103
71
|
|
|
104
72
|
describe("MediaStreamHandler TTS queue", () => {
|
|
105
73
|
it("serializes TTS playback and resolves in order", async () => {
|
|
106
74
|
const handler = new MediaStreamHandler({
|
|
107
|
-
|
|
75
|
+
transcriptionProvider: createStubSttProvider(),
|
|
76
|
+
providerConfig: {},
|
|
108
77
|
});
|
|
109
78
|
const started: number[] = [];
|
|
110
79
|
const finished: number[] = [];
|
|
@@ -137,7 +106,8 @@ describe("MediaStreamHandler TTS queue", () => {
|
|
|
137
106
|
|
|
138
107
|
it("cancels active playback and clears queued items", async () => {
|
|
139
108
|
const handler = new MediaStreamHandler({
|
|
140
|
-
|
|
109
|
+
transcriptionProvider: createStubSttProvider(),
|
|
110
|
+
providerConfig: {},
|
|
141
111
|
});
|
|
142
112
|
|
|
143
113
|
let queuedRan = false;
|
|
@@ -147,7 +117,7 @@ describe("MediaStreamHandler TTS queue", () => {
|
|
|
147
117
|
started.push("active");
|
|
148
118
|
await waitForAbort(signal);
|
|
149
119
|
});
|
|
150
|
-
|
|
120
|
+
const queued = handler.queueTts("stream-1", async () => {
|
|
151
121
|
queuedRan = true;
|
|
152
122
|
});
|
|
153
123
|
|
|
@@ -156,18 +126,134 @@ describe("MediaStreamHandler TTS queue", () => {
|
|
|
156
126
|
|
|
157
127
|
handler.clearTtsQueue("stream-1");
|
|
158
128
|
await active;
|
|
129
|
+
await withTimeout(queued);
|
|
159
130
|
await flush();
|
|
160
131
|
|
|
161
132
|
expect(queuedRan).toBe(false);
|
|
162
133
|
});
|
|
134
|
+
|
|
135
|
+
it("resolves pending queued playback during stream teardown", async () => {
|
|
136
|
+
const handler = new MediaStreamHandler({
|
|
137
|
+
transcriptionProvider: createStubSttProvider(),
|
|
138
|
+
providerConfig: {},
|
|
139
|
+
});
|
|
140
|
+
|
|
141
|
+
let queuedRan = false;
|
|
142
|
+
const active = handler.queueTts("stream-1", async (signal) => {
|
|
143
|
+
await waitForAbort(signal);
|
|
144
|
+
});
|
|
145
|
+
const queued = handler.queueTts("stream-1", async () => {
|
|
146
|
+
queuedRan = true;
|
|
147
|
+
});
|
|
148
|
+
|
|
149
|
+
await flush();
|
|
150
|
+
(
|
|
151
|
+
handler as unknown as {
|
|
152
|
+
clearTtsState(streamSid: string): void;
|
|
153
|
+
}
|
|
154
|
+
).clearTtsState("stream-1");
|
|
155
|
+
|
|
156
|
+
await withTimeout(active);
|
|
157
|
+
await withTimeout(queued);
|
|
158
|
+
expect(queuedRan).toBe(false);
|
|
159
|
+
});
|
|
163
160
|
});
|
|
164
161
|
|
|
165
162
|
describe("MediaStreamHandler security hardening", () => {
|
|
163
|
+
it("fails sends and closes stream when buffered bytes already exceed the cap", () => {
|
|
164
|
+
const handler = new MediaStreamHandler({
|
|
165
|
+
transcriptionProvider: createStubSttProvider(),
|
|
166
|
+
providerConfig: {},
|
|
167
|
+
});
|
|
168
|
+
const ws = {
|
|
169
|
+
readyState: WebSocket.OPEN,
|
|
170
|
+
bufferedAmount: 2 * 1024 * 1024,
|
|
171
|
+
send: vi.fn(),
|
|
172
|
+
close: vi.fn(),
|
|
173
|
+
} as unknown as WebSocket;
|
|
174
|
+
(
|
|
175
|
+
handler as unknown as {
|
|
176
|
+
sessions: Map<
|
|
177
|
+
string,
|
|
178
|
+
{
|
|
179
|
+
callId: string;
|
|
180
|
+
streamSid: string;
|
|
181
|
+
ws: WebSocket;
|
|
182
|
+
sttSession: RealtimeTranscriptionSession;
|
|
183
|
+
}
|
|
184
|
+
>;
|
|
185
|
+
}
|
|
186
|
+
).sessions.set("MZ-backpressure", {
|
|
187
|
+
callId: "CA-backpressure",
|
|
188
|
+
streamSid: "MZ-backpressure",
|
|
189
|
+
ws,
|
|
190
|
+
sttSession: createStubSession(),
|
|
191
|
+
});
|
|
192
|
+
|
|
193
|
+
const result = handler.sendAudio("MZ-backpressure", Buffer.alloc(160, 0xff));
|
|
194
|
+
|
|
195
|
+
expect(result.sent).toBe(false);
|
|
196
|
+
expect(ws.send).not.toHaveBeenCalled();
|
|
197
|
+
expect(ws.close).toHaveBeenCalledWith(1013, "Backpressure: send buffer exceeded");
|
|
198
|
+
});
|
|
199
|
+
|
|
200
|
+
it("fails sends when buffered bytes exceed cap after enqueueing a frame", () => {
|
|
201
|
+
const handler = new MediaStreamHandler({
|
|
202
|
+
transcriptionProvider: createStubSttProvider(),
|
|
203
|
+
providerConfig: {},
|
|
204
|
+
});
|
|
205
|
+
const ws = {
|
|
206
|
+
readyState: WebSocket.OPEN,
|
|
207
|
+
bufferedAmount: 0,
|
|
208
|
+
send: vi.fn(() => {
|
|
209
|
+
(
|
|
210
|
+
ws as unknown as {
|
|
211
|
+
bufferedAmount: number;
|
|
212
|
+
}
|
|
213
|
+
).bufferedAmount = 2 * 1024 * 1024;
|
|
214
|
+
}),
|
|
215
|
+
close: vi.fn(),
|
|
216
|
+
} as unknown as WebSocket;
|
|
217
|
+
(
|
|
218
|
+
handler as unknown as {
|
|
219
|
+
sessions: Map<
|
|
220
|
+
string,
|
|
221
|
+
{
|
|
222
|
+
callId: string;
|
|
223
|
+
streamSid: string;
|
|
224
|
+
ws: WebSocket;
|
|
225
|
+
sttSession: RealtimeTranscriptionSession;
|
|
226
|
+
}
|
|
227
|
+
>;
|
|
228
|
+
}
|
|
229
|
+
).sessions.set("MZ-overflow", {
|
|
230
|
+
callId: "CA-overflow",
|
|
231
|
+
streamSid: "MZ-overflow",
|
|
232
|
+
ws,
|
|
233
|
+
sttSession: createStubSession(),
|
|
234
|
+
});
|
|
235
|
+
|
|
236
|
+
const result = handler.sendMark("MZ-overflow", "mark-1");
|
|
237
|
+
|
|
238
|
+
expect(ws.send).toHaveBeenCalledTimes(1);
|
|
239
|
+
expect(result.sent).toBe(false);
|
|
240
|
+
expect(ws.close).toHaveBeenCalledWith(1013, "Backpressure: send buffer exceeded");
|
|
241
|
+
});
|
|
242
|
+
|
|
243
|
+
it("sanitizes websocket close reason before logging", () => {
|
|
244
|
+
const reason = sanitizeLogText("forged\nline\r\tentry", 120);
|
|
245
|
+
expect(reason).not.toContain("\n");
|
|
246
|
+
expect(reason).not.toContain("\r");
|
|
247
|
+
expect(reason).not.toContain("\t");
|
|
248
|
+
expect(reason).toContain("forged line entry");
|
|
249
|
+
});
|
|
250
|
+
|
|
166
251
|
it("closes idle pre-start connections after timeout", async () => {
|
|
167
252
|
const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
|
|
168
253
|
[];
|
|
169
254
|
const handler = new MediaStreamHandler({
|
|
170
|
-
|
|
255
|
+
transcriptionProvider: createStubSttProvider(),
|
|
256
|
+
providerConfig: {},
|
|
171
257
|
preStartTimeoutMs: 40,
|
|
172
258
|
shouldAcceptStream: (params) => {
|
|
173
259
|
shouldAcceptStreamCalls.push(params);
|
|
@@ -190,7 +276,8 @@ describe("MediaStreamHandler security hardening", () => {
|
|
|
190
276
|
|
|
191
277
|
it("enforces pending connection limits", async () => {
|
|
192
278
|
const handler = new MediaStreamHandler({
|
|
193
|
-
|
|
279
|
+
transcriptionProvider: createStubSttProvider(),
|
|
280
|
+
providerConfig: {},
|
|
194
281
|
preStartTimeoutMs: 5_000,
|
|
195
282
|
maxPendingConnections: 1,
|
|
196
283
|
maxPendingConnectionsPerIp: 1,
|
|
@@ -213,9 +300,46 @@ describe("MediaStreamHandler security hardening", () => {
|
|
|
213
300
|
}
|
|
214
301
|
});
|
|
215
302
|
|
|
303
|
+
it("uses resolved client IPs for per-IP pending limits", async () => {
|
|
304
|
+
const handler = new MediaStreamHandler({
|
|
305
|
+
transcriptionProvider: createStubSttProvider(),
|
|
306
|
+
providerConfig: {},
|
|
307
|
+
preStartTimeoutMs: 5_000,
|
|
308
|
+
maxPendingConnections: 10,
|
|
309
|
+
maxPendingConnectionsPerIp: 1,
|
|
310
|
+
resolveClientIp: (request) => String(request.headers["x-forwarded-for"] ?? ""),
|
|
311
|
+
});
|
|
312
|
+
const server = await startWsServer(handler);
|
|
313
|
+
|
|
314
|
+
try {
|
|
315
|
+
const first = new WebSocket(server.url, {
|
|
316
|
+
headers: { "x-forwarded-for": "198.51.100.10" },
|
|
317
|
+
});
|
|
318
|
+
await withTimeout(new Promise((resolve) => first.once("open", resolve)));
|
|
319
|
+
|
|
320
|
+
const second = new WebSocket(server.url, {
|
|
321
|
+
headers: { "x-forwarded-for": "203.0.113.20" },
|
|
322
|
+
});
|
|
323
|
+
await withTimeout(new Promise((resolve) => second.once("open", resolve)));
|
|
324
|
+
|
|
325
|
+
expect(first.readyState).toBe(WebSocket.OPEN);
|
|
326
|
+
expect(second.readyState).toBe(WebSocket.OPEN);
|
|
327
|
+
|
|
328
|
+
const firstClosed = waitForClose(first);
|
|
329
|
+
const secondClosed = waitForClose(second);
|
|
330
|
+
first.close();
|
|
331
|
+
second.close();
|
|
332
|
+
await firstClosed;
|
|
333
|
+
await secondClosed;
|
|
334
|
+
} finally {
|
|
335
|
+
await server.close();
|
|
336
|
+
}
|
|
337
|
+
});
|
|
338
|
+
|
|
216
339
|
it("rejects upgrades when max connection cap is reached", async () => {
|
|
217
340
|
const handler = new MediaStreamHandler({
|
|
218
|
-
|
|
341
|
+
transcriptionProvider: createStubSttProvider(),
|
|
342
|
+
providerConfig: {},
|
|
219
343
|
preStartTimeoutMs: 5_000,
|
|
220
344
|
maxConnections: 1,
|
|
221
345
|
maxPendingConnections: 10,
|
|
@@ -228,7 +352,7 @@ describe("MediaStreamHandler security hardening", () => {
|
|
|
228
352
|
const secondError = await withTimeout(
|
|
229
353
|
new Promise<Error>((resolve) => {
|
|
230
354
|
const ws = new WebSocket(server.url);
|
|
231
|
-
ws.once("error", (err) => resolve(err
|
|
355
|
+
ws.once("error", (err) => resolve(err));
|
|
232
356
|
}),
|
|
233
357
|
);
|
|
234
358
|
|
|
@@ -241,9 +365,132 @@ describe("MediaStreamHandler security hardening", () => {
|
|
|
241
365
|
}
|
|
242
366
|
});
|
|
243
367
|
|
|
368
|
+
it("counts in-flight upgrades against the max connection cap", () => {
|
|
369
|
+
const handler = new MediaStreamHandler({
|
|
370
|
+
transcriptionProvider: createStubSttProvider(),
|
|
371
|
+
providerConfig: {},
|
|
372
|
+
maxConnections: 2,
|
|
373
|
+
maxPendingConnections: 10,
|
|
374
|
+
maxPendingConnectionsPerIp: 10,
|
|
375
|
+
});
|
|
376
|
+
|
|
377
|
+
const fakeWss = {
|
|
378
|
+
clients: new Set([{}]),
|
|
379
|
+
handleUpgrade: vi.fn(),
|
|
380
|
+
emit: vi.fn(),
|
|
381
|
+
on: vi.fn(),
|
|
382
|
+
};
|
|
383
|
+
let upgradeCallback: ((ws: WebSocket) => void) | null = null;
|
|
384
|
+
fakeWss.handleUpgrade.mockImplementation(
|
|
385
|
+
(
|
|
386
|
+
_request: IncomingMessage,
|
|
387
|
+
_socket: unknown,
|
|
388
|
+
_head: Buffer,
|
|
389
|
+
callback: (ws: WebSocket) => void,
|
|
390
|
+
) => {
|
|
391
|
+
upgradeCallback = callback;
|
|
392
|
+
},
|
|
393
|
+
);
|
|
394
|
+
|
|
395
|
+
(
|
|
396
|
+
handler as unknown as {
|
|
397
|
+
wss: typeof fakeWss;
|
|
398
|
+
}
|
|
399
|
+
).wss = fakeWss;
|
|
400
|
+
|
|
401
|
+
const firstSocket = {
|
|
402
|
+
once: vi.fn(),
|
|
403
|
+
removeListener: vi.fn(),
|
|
404
|
+
write: vi.fn(),
|
|
405
|
+
destroy: vi.fn(),
|
|
406
|
+
};
|
|
407
|
+
handler.handleUpgrade(
|
|
408
|
+
{ socket: { remoteAddress: "127.0.0.1" } } as IncomingMessage,
|
|
409
|
+
firstSocket as never,
|
|
410
|
+
Buffer.alloc(0),
|
|
411
|
+
);
|
|
412
|
+
|
|
413
|
+
const secondSocket = {
|
|
414
|
+
once: vi.fn(),
|
|
415
|
+
removeListener: vi.fn(),
|
|
416
|
+
write: vi.fn(),
|
|
417
|
+
destroy: vi.fn(),
|
|
418
|
+
};
|
|
419
|
+
handler.handleUpgrade(
|
|
420
|
+
{ socket: { remoteAddress: "127.0.0.1" } } as IncomingMessage,
|
|
421
|
+
secondSocket as never,
|
|
422
|
+
Buffer.alloc(0),
|
|
423
|
+
);
|
|
424
|
+
|
|
425
|
+
expect(fakeWss.handleUpgrade).toHaveBeenCalledTimes(1);
|
|
426
|
+
expect(secondSocket.write).toHaveBeenCalledOnce();
|
|
427
|
+
expect(secondSocket.destroy).toHaveBeenCalledOnce();
|
|
428
|
+
|
|
429
|
+
expect(upgradeCallback).not.toBeNull();
|
|
430
|
+
const completeUpgrade = upgradeCallback as ((ws: WebSocket) => void) | null;
|
|
431
|
+
if (!completeUpgrade) {
|
|
432
|
+
throw new Error("Expected upgrade callback to be registered");
|
|
433
|
+
}
|
|
434
|
+
completeUpgrade({} as WebSocket);
|
|
435
|
+
expect(fakeWss.emit).toHaveBeenCalledWith(
|
|
436
|
+
"connection",
|
|
437
|
+
expect.anything(),
|
|
438
|
+
expect.objectContaining({ socket: { remoteAddress: "127.0.0.1" } }),
|
|
439
|
+
);
|
|
440
|
+
});
|
|
441
|
+
|
|
442
|
+
it("releases in-flight reservations when ws rejects a malformed upgrade before the callback", async () => {
|
|
443
|
+
const handler = new MediaStreamHandler({
|
|
444
|
+
transcriptionProvider: createStubSttProvider(),
|
|
445
|
+
providerConfig: {},
|
|
446
|
+
preStartTimeoutMs: 5_000,
|
|
447
|
+
maxConnections: 1,
|
|
448
|
+
maxPendingConnections: 10,
|
|
449
|
+
maxPendingConnectionsPerIp: 10,
|
|
450
|
+
});
|
|
451
|
+
const server = await startWsServer(handler);
|
|
452
|
+
const serverUrl = new URL(server.url);
|
|
453
|
+
|
|
454
|
+
try {
|
|
455
|
+
await withTimeout(
|
|
456
|
+
new Promise<void>((resolve, reject) => {
|
|
457
|
+
const socket = net.createConnection(
|
|
458
|
+
{ host: serverUrl.hostname, port: Number(serverUrl.port) },
|
|
459
|
+
() => {
|
|
460
|
+
socket.write(
|
|
461
|
+
[
|
|
462
|
+
"GET /voice/stream HTTP/1.1",
|
|
463
|
+
`Host: ${serverUrl.host}`,
|
|
464
|
+
"Upgrade: websocket",
|
|
465
|
+
"Connection: Upgrade",
|
|
466
|
+
"Sec-WebSocket-Version: 13",
|
|
467
|
+
"",
|
|
468
|
+
"",
|
|
469
|
+
].join("\r\n"),
|
|
470
|
+
);
|
|
471
|
+
},
|
|
472
|
+
);
|
|
473
|
+
socket.once("error", reject);
|
|
474
|
+
socket.once("data", () => {
|
|
475
|
+
socket.end();
|
|
476
|
+
});
|
|
477
|
+
socket.once("close", () => resolve());
|
|
478
|
+
}),
|
|
479
|
+
);
|
|
480
|
+
|
|
481
|
+
const ws = await connectWs(server.url);
|
|
482
|
+
expect(ws.readyState).toBe(WebSocket.OPEN);
|
|
483
|
+
ws.close();
|
|
484
|
+
await waitForClose(ws);
|
|
485
|
+
} finally {
|
|
486
|
+
await server.close();
|
|
487
|
+
}
|
|
488
|
+
});
|
|
489
|
+
|
|
244
490
|
it("clears pending state after valid start", async () => {
|
|
245
491
|
const handler = new MediaStreamHandler({
|
|
246
|
-
|
|
492
|
+
transcriptionProvider: createStubSttProvider(),
|
|
493
|
+
providerConfig: {},
|
|
247
494
|
preStartTimeoutMs: 40,
|
|
248
495
|
shouldAcceptStream: () => true,
|
|
249
496
|
});
|
|
@@ -268,4 +515,254 @@ describe("MediaStreamHandler security hardening", () => {
|
|
|
268
515
|
await server.close();
|
|
269
516
|
}
|
|
270
517
|
});
|
|
518
|
+
|
|
519
|
+
it("keeps accepted streams alive while STT readiness exceeds the pre-start timeout", async () => {
|
|
520
|
+
const sttReady = createDeferred();
|
|
521
|
+
const sttConnectStarted = createDeferred();
|
|
522
|
+
const transcriptionReady = createDeferred();
|
|
523
|
+
const events: string[] = [];
|
|
524
|
+
|
|
525
|
+
const session: RealtimeTranscriptionSession = {
|
|
526
|
+
connect: async () => {
|
|
527
|
+
events.push("stt-connect-start");
|
|
528
|
+
sttConnectStarted.resolve();
|
|
529
|
+
await sttReady.promise;
|
|
530
|
+
events.push("stt-connect-ready");
|
|
531
|
+
},
|
|
532
|
+
sendAudio: () => {},
|
|
533
|
+
close: () => {},
|
|
534
|
+
isConnected: () => false,
|
|
535
|
+
};
|
|
536
|
+
|
|
537
|
+
const handler = new MediaStreamHandler({
|
|
538
|
+
transcriptionProvider: {
|
|
539
|
+
createSession: () => session,
|
|
540
|
+
id: "openai",
|
|
541
|
+
label: "OpenAI",
|
|
542
|
+
isConfigured: () => true,
|
|
543
|
+
},
|
|
544
|
+
providerConfig: {},
|
|
545
|
+
preStartTimeoutMs: 40,
|
|
546
|
+
shouldAcceptStream: () => true,
|
|
547
|
+
onConnect: () => {
|
|
548
|
+
events.push("onConnect");
|
|
549
|
+
},
|
|
550
|
+
onTranscriptionReady: () => {
|
|
551
|
+
events.push("onTranscriptionReady");
|
|
552
|
+
transcriptionReady.resolve();
|
|
553
|
+
},
|
|
554
|
+
});
|
|
555
|
+
const server = await startWsServer(handler);
|
|
556
|
+
|
|
557
|
+
try {
|
|
558
|
+
const ws = await connectWs(server.url);
|
|
559
|
+
ws.send(
|
|
560
|
+
JSON.stringify({
|
|
561
|
+
event: "start",
|
|
562
|
+
streamSid: "MZ-slow-stt",
|
|
563
|
+
start: { callSid: "CA-slow-stt" },
|
|
564
|
+
}),
|
|
565
|
+
);
|
|
566
|
+
|
|
567
|
+
await withTimeout(sttConnectStarted.promise);
|
|
568
|
+
await new Promise((resolve) => setTimeout(resolve, 80));
|
|
569
|
+
expect(ws.readyState).toBe(WebSocket.OPEN);
|
|
570
|
+
expect(events).toEqual(["onConnect", "stt-connect-start"]);
|
|
571
|
+
|
|
572
|
+
sttReady.resolve();
|
|
573
|
+
await withTimeout(transcriptionReady.promise);
|
|
574
|
+
expect(events).toEqual([
|
|
575
|
+
"onConnect",
|
|
576
|
+
"stt-connect-start",
|
|
577
|
+
"stt-connect-ready",
|
|
578
|
+
"onTranscriptionReady",
|
|
579
|
+
]);
|
|
580
|
+
|
|
581
|
+
ws.close();
|
|
582
|
+
await waitForClose(ws);
|
|
583
|
+
} finally {
|
|
584
|
+
await server.close();
|
|
585
|
+
}
|
|
586
|
+
});
|
|
587
|
+
|
|
588
|
+
it("forwards early Twilio media into the STT session before readiness", async () => {
|
|
589
|
+
const sttReady = createDeferred();
|
|
590
|
+
const sttConnectStarted = createDeferred();
|
|
591
|
+
const transcriptionReady = createDeferred();
|
|
592
|
+
const audioReceived = createDeferred();
|
|
593
|
+
const receivedAudio: Buffer[] = [];
|
|
594
|
+
let onConnectCalls = 0;
|
|
595
|
+
let onTranscriptionReadyCalls = 0;
|
|
596
|
+
|
|
597
|
+
const session: RealtimeTranscriptionSession = {
|
|
598
|
+
connect: async () => {
|
|
599
|
+
sttConnectStarted.resolve();
|
|
600
|
+
await sttReady.promise;
|
|
601
|
+
},
|
|
602
|
+
sendAudio: (audio) => {
|
|
603
|
+
receivedAudio.push(Buffer.from(audio));
|
|
604
|
+
audioReceived.resolve();
|
|
605
|
+
},
|
|
606
|
+
close: () => {},
|
|
607
|
+
isConnected: () => false,
|
|
608
|
+
};
|
|
609
|
+
|
|
610
|
+
const handler = new MediaStreamHandler({
|
|
611
|
+
transcriptionProvider: {
|
|
612
|
+
createSession: () => session,
|
|
613
|
+
id: "openai",
|
|
614
|
+
label: "OpenAI",
|
|
615
|
+
isConfigured: () => true,
|
|
616
|
+
},
|
|
617
|
+
providerConfig: {},
|
|
618
|
+
shouldAcceptStream: () => true,
|
|
619
|
+
onConnect: () => {
|
|
620
|
+
onConnectCalls += 1;
|
|
621
|
+
},
|
|
622
|
+
onTranscriptionReady: () => {
|
|
623
|
+
onTranscriptionReadyCalls += 1;
|
|
624
|
+
transcriptionReady.resolve();
|
|
625
|
+
},
|
|
626
|
+
});
|
|
627
|
+
const server = await startWsServer(handler);
|
|
628
|
+
let ws: WebSocket | undefined;
|
|
629
|
+
|
|
630
|
+
try {
|
|
631
|
+
ws = await connectWs(server.url);
|
|
632
|
+
ws.send(
|
|
633
|
+
JSON.stringify({
|
|
634
|
+
event: "start",
|
|
635
|
+
streamSid: "MZ-early-media",
|
|
636
|
+
start: { callSid: "CA-early-media" },
|
|
637
|
+
}),
|
|
638
|
+
);
|
|
639
|
+
|
|
640
|
+
await withTimeout(sttConnectStarted.promise);
|
|
641
|
+
ws.send(
|
|
642
|
+
JSON.stringify({
|
|
643
|
+
event: "media",
|
|
644
|
+
streamSid: "MZ-early-media",
|
|
645
|
+
media: { payload: Buffer.from("early").toString("base64") },
|
|
646
|
+
}),
|
|
647
|
+
);
|
|
648
|
+
await withTimeout(audioReceived.promise);
|
|
649
|
+
|
|
650
|
+
expect(Buffer.concat(receivedAudio).toString()).toBe("early");
|
|
651
|
+
expect(onConnectCalls).toBe(1);
|
|
652
|
+
expect(onTranscriptionReadyCalls).toBe(0);
|
|
653
|
+
|
|
654
|
+
sttReady.resolve();
|
|
655
|
+
await withTimeout(transcriptionReady.promise);
|
|
656
|
+
expect(onConnectCalls).toBe(1);
|
|
657
|
+
expect(onTranscriptionReadyCalls).toBe(1);
|
|
658
|
+
} finally {
|
|
659
|
+
sttReady.resolve();
|
|
660
|
+
if (ws) {
|
|
661
|
+
if (ws.readyState === WebSocket.OPEN) {
|
|
662
|
+
ws.close();
|
|
663
|
+
}
|
|
664
|
+
if (ws.readyState !== WebSocket.CLOSED) {
|
|
665
|
+
await waitForClose(ws).catch(() => {});
|
|
666
|
+
}
|
|
667
|
+
}
|
|
668
|
+
await server.close();
|
|
669
|
+
}
|
|
670
|
+
});
|
|
671
|
+
|
|
672
|
+
it("closes the media stream and disconnects once when STT readiness fails", async () => {
|
|
673
|
+
const sttConnectStarted = createDeferred();
|
|
674
|
+
const onDisconnectReady = createDeferred();
|
|
675
|
+
const onConnect = vi.fn();
|
|
676
|
+
const onTranscriptionReady = vi.fn();
|
|
677
|
+
const onDisconnect = vi.fn(() => {
|
|
678
|
+
onDisconnectReady.resolve();
|
|
679
|
+
});
|
|
680
|
+
|
|
681
|
+
const session: RealtimeTranscriptionSession = {
|
|
682
|
+
connect: async () => {
|
|
683
|
+
sttConnectStarted.resolve();
|
|
684
|
+
throw new Error("provider unavailable");
|
|
685
|
+
},
|
|
686
|
+
sendAudio: () => {},
|
|
687
|
+
close: vi.fn(),
|
|
688
|
+
isConnected: () => false,
|
|
689
|
+
};
|
|
690
|
+
|
|
691
|
+
const handler = new MediaStreamHandler({
|
|
692
|
+
transcriptionProvider: {
|
|
693
|
+
createSession: () => session,
|
|
694
|
+
id: "openai",
|
|
695
|
+
label: "OpenAI",
|
|
696
|
+
isConfigured: () => true,
|
|
697
|
+
},
|
|
698
|
+
providerConfig: {},
|
|
699
|
+
shouldAcceptStream: () => true,
|
|
700
|
+
onConnect,
|
|
701
|
+
onTranscriptionReady,
|
|
702
|
+
onDisconnect,
|
|
703
|
+
});
|
|
704
|
+
const server = await startWsServer(handler);
|
|
705
|
+
|
|
706
|
+
try {
|
|
707
|
+
const ws = await connectWs(server.url);
|
|
708
|
+
ws.send(
|
|
709
|
+
JSON.stringify({
|
|
710
|
+
event: "start",
|
|
711
|
+
streamSid: "MZ-stt-fail",
|
|
712
|
+
start: { callSid: "CA-stt-fail" },
|
|
713
|
+
}),
|
|
714
|
+
);
|
|
715
|
+
|
|
716
|
+
await withTimeout(sttConnectStarted.promise);
|
|
717
|
+
const closed = await waitForClose(ws);
|
|
718
|
+
await withTimeout(onDisconnectReady.promise);
|
|
719
|
+
|
|
720
|
+
expect(closed.code).toBe(1011);
|
|
721
|
+
expect(closed.reason).toBe("STT connection failed");
|
|
722
|
+
expect(onConnect).toHaveBeenCalledTimes(1);
|
|
723
|
+
expect(onConnect).toHaveBeenCalledWith("CA-stt-fail", "MZ-stt-fail");
|
|
724
|
+
expect(onTranscriptionReady).not.toHaveBeenCalled();
|
|
725
|
+
expect(onDisconnect).toHaveBeenCalledTimes(1);
|
|
726
|
+
expect(onDisconnect).toHaveBeenCalledWith("CA-stt-fail", "MZ-stt-fail");
|
|
727
|
+
expect(session.close).toHaveBeenCalledTimes(1);
|
|
728
|
+
} finally {
|
|
729
|
+
await server.close();
|
|
730
|
+
}
|
|
731
|
+
});
|
|
732
|
+
|
|
733
|
+
it("rejects oversized pre-start frames at the websocket maxPayload guard before validation runs", async () => {
|
|
734
|
+
const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
|
|
735
|
+
[];
|
|
736
|
+
const handler = new MediaStreamHandler({
|
|
737
|
+
transcriptionProvider: createStubSttProvider(),
|
|
738
|
+
providerConfig: {},
|
|
739
|
+
preStartTimeoutMs: 1_000,
|
|
740
|
+
shouldAcceptStream: (params) => {
|
|
741
|
+
shouldAcceptStreamCalls.push(params);
|
|
742
|
+
return true;
|
|
743
|
+
},
|
|
744
|
+
});
|
|
745
|
+
const server = await startWsServer(handler);
|
|
746
|
+
|
|
747
|
+
try {
|
|
748
|
+
const ws = await connectWs(server.url);
|
|
749
|
+
ws.send(
|
|
750
|
+
JSON.stringify({
|
|
751
|
+
event: "start",
|
|
752
|
+
streamSid: "MZ-oversized",
|
|
753
|
+
start: {
|
|
754
|
+
callSid: "CA-oversized",
|
|
755
|
+
customParameters: { token: "token-oversized", padding: "A".repeat(256 * 1024) },
|
|
756
|
+
},
|
|
757
|
+
}),
|
|
758
|
+
);
|
|
759
|
+
|
|
760
|
+
const closed = await waitForClose(ws);
|
|
761
|
+
|
|
762
|
+
expect(closed.code).toBe(1009);
|
|
763
|
+
expect(shouldAcceptStreamCalls).toEqual([]);
|
|
764
|
+
} finally {
|
|
765
|
+
await server.close();
|
|
766
|
+
}
|
|
767
|
+
});
|
|
271
768
|
});
|