@openclaw/voice-call 2026.2.21 → 2026.2.22

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # Changelog
2
2
 
3
+ ## 2026.2.22
4
+
5
+ ### Changes
6
+
7
+ - Version alignment with core OpenClaw release numbers.
8
+
3
9
  ## 2026.1.26
4
10
 
5
11
  ### Changes
package/README.md CHANGED
@@ -76,6 +76,10 @@ Put under `plugins.entries.voice-call.config`:
76
76
  streaming: {
77
77
  enabled: true,
78
78
  streamPath: "/voice/stream",
79
+ preStartTimeoutMs: 5000,
80
+ maxPendingConnections: 32,
81
+ maxPendingConnectionsPerIp: 4,
82
+ maxConnections: 128,
79
83
  },
80
84
  }
81
85
  ```
@@ -87,6 +91,13 @@ Notes:
87
91
  - Telnyx requires `telnyx.publicKey` (or `TELNYX_PUBLIC_KEY`) unless `skipSignatureVerification` is true.
88
92
  - `tunnel.allowNgrokFreeTierLoopbackBypass: true` allows Twilio webhooks with invalid signatures **only** when `tunnel.provider="ngrok"` and `serve.bind` is loopback (ngrok local agent). Use for local dev only.
89
93
 
94
+ Streaming security defaults:
95
+
96
+ - `streaming.preStartTimeoutMs` closes sockets that never send a valid `start` frame.
97
+ - `streaming.maxPendingConnections` caps total unauthenticated pre-start sockets.
98
+ - `streaming.maxPendingConnectionsPerIp` caps unauthenticated pre-start sockets per source IP.
99
+ - `streaming.maxConnections` caps total open media stream sockets (pending + active).
100
+
90
101
  ## Stale call reaper
91
102
 
92
103
  Use `staleCallReaperSeconds` to end calls that never receive a terminal webhook
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@openclaw/voice-call",
3
- "version": "2026.2.21",
3
+ "version": "2026.2.22",
4
4
  "description": "OpenClaw voice-call plugin",
5
5
  "type": "module",
6
6
  "dependencies": {
@@ -30,6 +30,10 @@ function createBaseConfig(provider: "telnyx" | "twilio" | "plivo" | "mock"): Voi
30
30
  silenceDurationMs: 800,
31
31
  vadThreshold: 0.5,
32
32
  streamPath: "/voice/stream",
33
+ preStartTimeoutMs: 5000,
34
+ maxPendingConnections: 32,
35
+ maxPendingConnectionsPerIp: 4,
36
+ maxConnections: 128,
33
37
  },
34
38
  skipSignatureVerification: false,
35
39
  stt: { provider: "openai", model: "whisper-1" },
package/src/config.ts CHANGED
@@ -219,6 +219,17 @@ export const VoiceCallStreamingConfigSchema = z
219
219
  vadThreshold: z.number().min(0).max(1).default(0.5),
220
220
  /** WebSocket path for media stream connections */
221
221
  streamPath: z.string().min(1).default("/voice/stream"),
222
+ /**
223
+ * Close unauthenticated media stream sockets if no valid `start` frame arrives in time.
224
+ * Protects against pre-auth idle connection hold attacks.
225
+ */
226
+ preStartTimeoutMs: z.number().int().positive().default(5000),
227
+ /** Maximum number of concurrently pending (pre-start) media stream sockets. */
228
+ maxPendingConnections: z.number().int().positive().default(32),
229
+ /** Maximum pending media stream sockets per source IP. */
230
+ maxPendingConnectionsPerIp: z.number().int().positive().default(4),
231
+ /** Hard cap for all open media stream sockets (pending + active). */
232
+ maxConnections: z.number().int().positive().default(128),
222
233
  })
223
234
  .strict()
224
235
  .default({
@@ -228,6 +239,10 @@ export const VoiceCallStreamingConfigSchema = z
228
239
  silenceDurationMs: 800,
229
240
  vadThreshold: 0.5,
230
241
  streamPath: "/voice/stream",
242
+ preStartTimeoutMs: 5000,
243
+ maxPendingConnections: 32,
244
+ maxPendingConnectionsPerIp: 4,
245
+ maxConnections: 128,
231
246
  });
232
247
  export type VoiceCallStreamingConfig = z.infer<typeof VoiceCallStreamingConfigSchema>;
233
248
 
@@ -45,6 +45,32 @@ function createProvider(overrides: Partial<VoiceCallProvider> = {}): VoiceCallPr
45
45
  };
46
46
  }
47
47
 
48
+ function createInboundDisabledConfig() {
49
+ return VoiceCallConfigSchema.parse({
50
+ enabled: true,
51
+ provider: "plivo",
52
+ fromNumber: "+15550000000",
53
+ inboundPolicy: "disabled",
54
+ });
55
+ }
56
+
57
+ function createInboundInitiatedEvent(params: {
58
+ id: string;
59
+ providerCallId: string;
60
+ from: string;
61
+ }): NormalizedEvent {
62
+ return {
63
+ id: params.id,
64
+ type: "call.initiated",
65
+ callId: params.providerCallId,
66
+ providerCallId: params.providerCallId,
67
+ timestamp: Date.now(),
68
+ direction: "inbound",
69
+ from: params.from,
70
+ to: "+15550000000",
71
+ };
72
+ }
73
+
48
74
  describe("processEvent (functional)", () => {
49
75
  it("calls provider hangup when rejecting inbound call", () => {
50
76
  const hangupCalls: HangupCallInput[] = [];
@@ -55,24 +81,14 @@ describe("processEvent (functional)", () => {
55
81
  });
56
82
 
57
83
  const ctx = createContext({
58
- config: VoiceCallConfigSchema.parse({
59
- enabled: true,
60
- provider: "plivo",
61
- fromNumber: "+15550000000",
62
- inboundPolicy: "disabled",
63
- }),
84
+ config: createInboundDisabledConfig(),
64
85
  provider,
65
86
  });
66
- const event: NormalizedEvent = {
87
+ const event = createInboundInitiatedEvent({
67
88
  id: "evt-1",
68
- type: "call.initiated",
69
- callId: "prov-1",
70
89
  providerCallId: "prov-1",
71
- timestamp: Date.now(),
72
- direction: "inbound",
73
90
  from: "+15559999999",
74
- to: "+15550000000",
75
- };
91
+ });
76
92
 
77
93
  processEvent(ctx, event);
78
94
 
@@ -87,24 +103,14 @@ describe("processEvent (functional)", () => {
87
103
 
88
104
  it("does not call hangup when provider is null", () => {
89
105
  const ctx = createContext({
90
- config: VoiceCallConfigSchema.parse({
91
- enabled: true,
92
- provider: "plivo",
93
- fromNumber: "+15550000000",
94
- inboundPolicy: "disabled",
95
- }),
106
+ config: createInboundDisabledConfig(),
96
107
  provider: null,
97
108
  });
98
- const event: NormalizedEvent = {
109
+ const event = createInboundInitiatedEvent({
99
110
  id: "evt-2",
100
- type: "call.initiated",
101
- callId: "prov-2",
102
111
  providerCallId: "prov-2",
103
- timestamp: Date.now(),
104
- direction: "inbound",
105
112
  from: "+15551111111",
106
- to: "+15550000000",
107
- };
113
+ });
108
114
 
109
115
  processEvent(ctx, event);
110
116
 
@@ -119,24 +125,14 @@ describe("processEvent (functional)", () => {
119
125
  },
120
126
  });
121
127
  const ctx = createContext({
122
- config: VoiceCallConfigSchema.parse({
123
- enabled: true,
124
- provider: "plivo",
125
- fromNumber: "+15550000000",
126
- inboundPolicy: "disabled",
127
- }),
128
+ config: createInboundDisabledConfig(),
128
129
  provider,
129
130
  });
130
- const event1: NormalizedEvent = {
131
+ const event1 = createInboundInitiatedEvent({
131
132
  id: "evt-init",
132
- type: "call.initiated",
133
- callId: "prov-dup",
134
133
  providerCallId: "prov-dup",
135
- timestamp: Date.now(),
136
- direction: "inbound",
137
134
  from: "+15552222222",
138
- to: "+15550000000",
139
- };
135
+ });
140
136
  const event2: NormalizedEvent = {
141
137
  id: "evt-ring",
142
138
  type: "call.ringing",
@@ -228,24 +224,14 @@ describe("processEvent (functional)", () => {
228
224
  },
229
225
  });
230
226
  const ctx = createContext({
231
- config: VoiceCallConfigSchema.parse({
232
- enabled: true,
233
- provider: "plivo",
234
- fromNumber: "+15550000000",
235
- inboundPolicy: "disabled",
236
- }),
227
+ config: createInboundDisabledConfig(),
237
228
  provider,
238
229
  });
239
- const event: NormalizedEvent = {
230
+ const event = createInboundInitiatedEvent({
240
231
  id: "evt-fail",
241
- type: "call.initiated",
242
- callId: "prov-fail",
243
232
  providerCallId: "prov-fail",
244
- timestamp: Date.now(),
245
- direction: "inbound",
246
233
  from: "+15553333333",
247
- to: "+15550000000",
248
- };
234
+ });
249
235
 
250
236
  expect(() => processEvent(ctx, event)).not.toThrow();
251
237
  expect(ctx.activeCalls.size).toBe(0);
@@ -51,6 +51,32 @@ type EndCallContext = Pick<
51
51
  | "maxDurationTimers"
52
52
  >;
53
53
 
54
+ type ConnectedCallContext = Pick<CallManagerContext, "activeCalls" | "provider">;
55
+
56
+ type ConnectedCallLookup =
57
+ | { kind: "error"; error: string }
58
+ | { kind: "ended"; call: CallRecord }
59
+ | {
60
+ kind: "ok";
61
+ call: CallRecord;
62
+ providerCallId: string;
63
+ provider: NonNullable<ConnectedCallContext["provider"]>;
64
+ };
65
+
66
+ function lookupConnectedCall(ctx: ConnectedCallContext, callId: CallId): ConnectedCallLookup {
67
+ const call = ctx.activeCalls.get(callId);
68
+ if (!call) {
69
+ return { kind: "error", error: "Call not found" };
70
+ }
71
+ if (!ctx.provider || !call.providerCallId) {
72
+ return { kind: "error", error: "Call not connected" };
73
+ }
74
+ if (TerminalStates.has(call.state)) {
75
+ return { kind: "ended", call };
76
+ }
77
+ return { kind: "ok", call, providerCallId: call.providerCallId, provider: ctx.provider };
78
+ }
79
+
54
80
  export async function initiateCall(
55
81
  ctx: InitiateContext,
56
82
  to: string,
@@ -149,26 +175,25 @@ export async function speak(
149
175
  callId: CallId,
150
176
  text: string,
151
177
  ): Promise<{ success: boolean; error?: string }> {
152
- const call = ctx.activeCalls.get(callId);
153
- if (!call) {
154
- return { success: false, error: "Call not found" };
155
- }
156
- if (!ctx.provider || !call.providerCallId) {
157
- return { success: false, error: "Call not connected" };
178
+ const lookup = lookupConnectedCall(ctx, callId);
179
+ if (lookup.kind === "error") {
180
+ return { success: false, error: lookup.error };
158
181
  }
159
- if (TerminalStates.has(call.state)) {
182
+ if (lookup.kind === "ended") {
160
183
  return { success: false, error: "Call has ended" };
161
184
  }
185
+ const { call, providerCallId, provider } = lookup;
186
+
162
187
  try {
163
188
  transitionState(call, "speaking");
164
189
  persistCallRecord(ctx.storePath, call);
165
190
 
166
191
  addTranscriptEntry(call, "bot", text);
167
192
 
168
- const voice = ctx.provider?.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
169
- await ctx.provider.playTts({
193
+ const voice = provider.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
194
+ await provider.playTts({
170
195
  callId,
171
- providerCallId: call.providerCallId,
196
+ providerCallId,
172
197
  text,
173
198
  voice,
174
199
  });
@@ -232,16 +257,15 @@ export async function continueCall(
232
257
  callId: CallId,
233
258
  prompt: string,
234
259
  ): Promise<{ success: boolean; transcript?: string; error?: string }> {
235
- const call = ctx.activeCalls.get(callId);
236
- if (!call) {
237
- return { success: false, error: "Call not found" };
260
+ const lookup = lookupConnectedCall(ctx, callId);
261
+ if (lookup.kind === "error") {
262
+ return { success: false, error: lookup.error };
238
263
  }
239
- if (!ctx.provider || !call.providerCallId) {
240
- return { success: false, error: "Call not connected" };
241
- }
242
- if (TerminalStates.has(call.state)) {
264
+ if (lookup.kind === "ended") {
243
265
  return { success: false, error: "Call has ended" };
244
266
  }
267
+ const { call, providerCallId, provider } = lookup;
268
+
245
269
  if (ctx.activeTurnCalls.has(callId) || ctx.transcriptWaiters.has(callId)) {
246
270
  return { success: false, error: "Already waiting for transcript" };
247
271
  }
@@ -256,13 +280,13 @@ export async function continueCall(
256
280
  persistCallRecord(ctx.storePath, call);
257
281
 
258
282
  const listenStartedAt = Date.now();
259
- await ctx.provider.startListening({ callId, providerCallId: call.providerCallId });
283
+ await provider.startListening({ callId, providerCallId });
260
284
 
261
285
  const transcript = await waitForFinalTranscript(ctx, callId);
262
286
  const transcriptReceivedAt = Date.now();
263
287
 
264
288
  // Best-effort: stop listening after final transcript.
265
- await ctx.provider.stopListening({ callId, providerCallId: call.providerCallId });
289
+ await provider.stopListening({ callId, providerCallId });
266
290
 
267
291
  const lastTurnLatencyMs = transcriptReceivedAt - turnStartedAt;
268
292
  const lastTurnListenWaitMs = transcriptReceivedAt - listenStartedAt;
@@ -302,21 +326,19 @@ export async function endCall(
302
326
  ctx: EndCallContext,
303
327
  callId: CallId,
304
328
  ): Promise<{ success: boolean; error?: string }> {
305
- const call = ctx.activeCalls.get(callId);
306
- if (!call) {
307
- return { success: false, error: "Call not found" };
329
+ const lookup = lookupConnectedCall(ctx, callId);
330
+ if (lookup.kind === "error") {
331
+ return { success: false, error: lookup.error };
308
332
  }
309
- if (!ctx.provider || !call.providerCallId) {
310
- return { success: false, error: "Call not connected" };
311
- }
312
- if (TerminalStates.has(call.state)) {
333
+ if (lookup.kind === "ended") {
313
334
  return { success: true };
314
335
  }
336
+ const { call, providerCallId, provider } = lookup;
315
337
 
316
338
  try {
317
- await ctx.provider.hangupCall({
339
+ await provider.hangupCall({
318
340
  callId,
319
- providerCallId: call.providerCallId,
341
+ providerCallId,
320
342
  reason: "hangup-bot",
321
343
  });
322
344
 
@@ -329,9 +351,7 @@ export async function endCall(
329
351
  rejectTranscriptWaiter(ctx, callId, "Call ended: hangup-bot");
330
352
 
331
353
  ctx.activeCalls.delete(callId);
332
- if (call.providerCallId) {
333
- ctx.providerCallIdMap.delete(call.providerCallId);
334
- }
354
+ ctx.providerCallIdMap.delete(providerCallId);
335
355
 
336
356
  return { success: true };
337
357
  } catch (err) {
@@ -46,17 +46,44 @@ class FakeProvider implements VoiceCallProvider {
46
46
  }
47
47
  }
48
48
 
49
+ let storeSeq = 0;
50
+
51
+ function createTestStorePath(): string {
52
+ storeSeq += 1;
53
+ return path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}-${storeSeq}`);
54
+ }
55
+
56
+ function createManagerHarness(
57
+ configOverrides: Record<string, unknown> = {},
58
+ provider = new FakeProvider(),
59
+ ): {
60
+ manager: CallManager;
61
+ provider: FakeProvider;
62
+ } {
63
+ const config = VoiceCallConfigSchema.parse({
64
+ enabled: true,
65
+ provider: "plivo",
66
+ fromNumber: "+15550000000",
67
+ ...configOverrides,
68
+ });
69
+ const manager = new CallManager(config, createTestStorePath());
70
+ manager.initialize(provider, "https://example.com/voice/webhook");
71
+ return { manager, provider };
72
+ }
73
+
74
+ function markCallAnswered(manager: CallManager, callId: string, eventId: string): void {
75
+ manager.processEvent({
76
+ id: eventId,
77
+ type: "call.answered",
78
+ callId,
79
+ providerCallId: "request-uuid",
80
+ timestamp: Date.now(),
81
+ });
82
+ }
83
+
49
84
  describe("CallManager", () => {
50
85
  it("upgrades providerCallId mapping when provider ID changes", async () => {
51
- const config = VoiceCallConfigSchema.parse({
52
- enabled: true,
53
- provider: "plivo",
54
- fromNumber: "+15550000000",
55
- });
56
-
57
- const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
58
- const manager = new CallManager(config, storePath);
59
- manager.initialize(new FakeProvider(), "https://example.com/voice/webhook");
86
+ const { manager } = createManagerHarness();
60
87
 
61
88
  const { callId, success, error } = await manager.initiateCall("+15550000001");
62
89
  expect(success).toBe(true);
@@ -81,16 +108,7 @@ describe("CallManager", () => {
81
108
  });
82
109
 
83
110
  it("speaks initial message on answered for notify mode (non-Twilio)", async () => {
84
- const config = VoiceCallConfigSchema.parse({
85
- enabled: true,
86
- provider: "plivo",
87
- fromNumber: "+15550000000",
88
- });
89
-
90
- const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
91
- const provider = new FakeProvider();
92
- const manager = new CallManager(config, storePath);
93
- manager.initialize(provider, "https://example.com/voice/webhook");
111
+ const { manager, provider } = createManagerHarness();
94
112
 
95
113
  const { callId, success } = await manager.initiateCall("+15550000002", undefined, {
96
114
  message: "Hello there",
@@ -113,19 +131,11 @@ describe("CallManager", () => {
113
131
  });
114
132
 
115
133
  it("rejects inbound calls with missing caller ID when allowlist enabled", () => {
116
- const config = VoiceCallConfigSchema.parse({
117
- enabled: true,
118
- provider: "plivo",
119
- fromNumber: "+15550000000",
134
+ const { manager, provider } = createManagerHarness({
120
135
  inboundPolicy: "allowlist",
121
136
  allowFrom: ["+15550001234"],
122
137
  });
123
138
 
124
- const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
125
- const provider = new FakeProvider();
126
- const manager = new CallManager(config, storePath);
127
- manager.initialize(provider, "https://example.com/voice/webhook");
128
-
129
139
  manager.processEvent({
130
140
  id: "evt-allowlist-missing",
131
141
  type: "call.initiated",
@@ -142,19 +152,11 @@ describe("CallManager", () => {
142
152
  });
143
153
 
144
154
  it("rejects inbound calls with anonymous caller ID when allowlist enabled", () => {
145
- const config = VoiceCallConfigSchema.parse({
146
- enabled: true,
147
- provider: "plivo",
148
- fromNumber: "+15550000000",
155
+ const { manager, provider } = createManagerHarness({
149
156
  inboundPolicy: "allowlist",
150
157
  allowFrom: ["+15550001234"],
151
158
  });
152
159
 
153
- const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
154
- const provider = new FakeProvider();
155
- const manager = new CallManager(config, storePath);
156
- manager.initialize(provider, "https://example.com/voice/webhook");
157
-
158
160
  manager.processEvent({
159
161
  id: "evt-allowlist-anon",
160
162
  type: "call.initiated",
@@ -172,19 +174,11 @@ describe("CallManager", () => {
172
174
  });
173
175
 
174
176
  it("rejects inbound calls that only match allowlist suffixes", () => {
175
- const config = VoiceCallConfigSchema.parse({
176
- enabled: true,
177
- provider: "plivo",
178
- fromNumber: "+15550000000",
177
+ const { manager, provider } = createManagerHarness({
179
178
  inboundPolicy: "allowlist",
180
179
  allowFrom: ["+15550001234"],
181
180
  });
182
181
 
183
- const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
184
- const provider = new FakeProvider();
185
- const manager = new CallManager(config, storePath);
186
- manager.initialize(provider, "https://example.com/voice/webhook");
187
-
188
182
  manager.processEvent({
189
183
  id: "evt-allowlist-suffix",
190
184
  type: "call.initiated",
@@ -202,18 +196,10 @@ describe("CallManager", () => {
202
196
  });
203
197
 
204
198
  it("rejects duplicate inbound events with a single hangup call", () => {
205
- const config = VoiceCallConfigSchema.parse({
206
- enabled: true,
207
- provider: "plivo",
208
- fromNumber: "+15550000000",
199
+ const { manager, provider } = createManagerHarness({
209
200
  inboundPolicy: "disabled",
210
201
  });
211
202
 
212
- const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
213
- const provider = new FakeProvider();
214
- const manager = new CallManager(config, storePath);
215
- manager.initialize(provider, "https://example.com/voice/webhook");
216
-
217
203
  manager.processEvent({
218
204
  id: "evt-reject-init",
219
205
  type: "call.initiated",
@@ -242,18 +228,11 @@ describe("CallManager", () => {
242
228
  });
243
229
 
244
230
  it("accepts inbound calls that exactly match the allowlist", () => {
245
- const config = VoiceCallConfigSchema.parse({
246
- enabled: true,
247
- provider: "plivo",
248
- fromNumber: "+15550000000",
231
+ const { manager } = createManagerHarness({
249
232
  inboundPolicy: "allowlist",
250
233
  allowFrom: ["+15550001234"],
251
234
  });
252
235
 
253
- const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
254
- const manager = new CallManager(config, storePath);
255
- manager.initialize(new FakeProvider(), "https://example.com/voice/webhook");
256
-
257
236
  manager.processEvent({
258
237
  id: "evt-allowlist-exact",
259
238
  type: "call.initiated",
@@ -269,28 +248,14 @@ describe("CallManager", () => {
269
248
  });
270
249
 
271
250
  it("completes a closed-loop turn without live audio", async () => {
272
- const config = VoiceCallConfigSchema.parse({
273
- enabled: true,
274
- provider: "plivo",
275
- fromNumber: "+15550000000",
251
+ const { manager, provider } = createManagerHarness({
276
252
  transcriptTimeoutMs: 5000,
277
253
  });
278
254
 
279
- const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
280
- const provider = new FakeProvider();
281
- const manager = new CallManager(config, storePath);
282
- manager.initialize(provider, "https://example.com/voice/webhook");
283
-
284
255
  const started = await manager.initiateCall("+15550000003");
285
256
  expect(started.success).toBe(true);
286
257
 
287
- manager.processEvent({
288
- id: "evt-closed-loop-answered",
289
- type: "call.answered",
290
- callId: started.callId,
291
- providerCallId: "request-uuid",
292
- timestamp: Date.now(),
293
- });
258
+ markCallAnswered(manager, started.callId, "evt-closed-loop-answered");
294
259
 
295
260
  const turnPromise = manager.continueCall(started.callId, "How can I help?");
296
261
  await new Promise((resolve) => setTimeout(resolve, 0));
@@ -323,28 +288,14 @@ describe("CallManager", () => {
323
288
  });
324
289
 
325
290
  it("rejects overlapping continueCall requests for the same call", async () => {
326
- const config = VoiceCallConfigSchema.parse({
327
- enabled: true,
328
- provider: "plivo",
329
- fromNumber: "+15550000000",
291
+ const { manager, provider } = createManagerHarness({
330
292
  transcriptTimeoutMs: 5000,
331
293
  });
332
294
 
333
- const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
334
- const provider = new FakeProvider();
335
- const manager = new CallManager(config, storePath);
336
- manager.initialize(provider, "https://example.com/voice/webhook");
337
-
338
295
  const started = await manager.initiateCall("+15550000004");
339
296
  expect(started.success).toBe(true);
340
297
 
341
- manager.processEvent({
342
- id: "evt-overlap-answered",
343
- type: "call.answered",
344
- callId: started.callId,
345
- providerCallId: "request-uuid",
346
- timestamp: Date.now(),
347
- });
298
+ markCallAnswered(manager, started.callId, "evt-overlap-answered");
348
299
 
349
300
  const first = manager.continueCall(started.callId, "First prompt");
350
301
  const second = await manager.continueCall(started.callId, "Second prompt");
@@ -369,28 +320,14 @@ describe("CallManager", () => {
369
320
  });
370
321
 
371
322
  it("tracks latency metadata across multiple closed-loop turns", async () => {
372
- const config = VoiceCallConfigSchema.parse({
373
- enabled: true,
374
- provider: "plivo",
375
- fromNumber: "+15550000000",
323
+ const { manager, provider } = createManagerHarness({
376
324
  transcriptTimeoutMs: 5000,
377
325
  });
378
326
 
379
- const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
380
- const provider = new FakeProvider();
381
- const manager = new CallManager(config, storePath);
382
- manager.initialize(provider, "https://example.com/voice/webhook");
383
-
384
327
  const started = await manager.initiateCall("+15550000005");
385
328
  expect(started.success).toBe(true);
386
329
 
387
- manager.processEvent({
388
- id: "evt-multi-answered",
389
- type: "call.answered",
390
- callId: started.callId,
391
- providerCallId: "request-uuid",
392
- timestamp: Date.now(),
393
- });
330
+ markCallAnswered(manager, started.callId, "evt-multi-answered");
394
331
 
395
332
  const firstTurn = manager.continueCall(started.callId, "First question");
396
333
  await new Promise((resolve) => setTimeout(resolve, 0));
@@ -436,28 +373,14 @@ describe("CallManager", () => {
436
373
  });
437
374
 
438
375
  it("handles repeated closed-loop turns without waiter churn", async () => {
439
- const config = VoiceCallConfigSchema.parse({
440
- enabled: true,
441
- provider: "plivo",
442
- fromNumber: "+15550000000",
376
+ const { manager, provider } = createManagerHarness({
443
377
  transcriptTimeoutMs: 5000,
444
378
  });
445
379
 
446
- const storePath = path.join(os.tmpdir(), `openclaw-voice-call-test-${Date.now()}`);
447
- const provider = new FakeProvider();
448
- const manager = new CallManager(config, storePath);
449
- manager.initialize(provider, "https://example.com/voice/webhook");
450
-
451
380
  const started = await manager.initiateCall("+15550000006");
452
381
  expect(started.success).toBe(true);
453
382
 
454
- manager.processEvent({
455
- id: "evt-loop-answered",
456
- type: "call.answered",
457
- callId: started.callId,
458
- providerCallId: "request-uuid",
459
- timestamp: Date.now(),
460
- });
383
+ markCallAnswered(manager, started.callId, "evt-loop-answered");
461
384
 
462
385
  for (let i = 1; i <= 5; i++) {
463
386
  const turnPromise = manager.continueCall(started.callId, `Prompt ${i}`);
@@ -1,4 +1,7 @@
1
+ import { once } from "node:events";
2
+ import http from "node:http";
1
3
  import { describe, expect, it } from "vitest";
4
+ import { WebSocket } from "ws";
2
5
  import { MediaStreamHandler } from "./media-stream.js";
3
6
  import type {
4
7
  OpenAIRealtimeSTTProvider,
@@ -34,6 +37,70 @@ const waitForAbort = (signal: AbortSignal): Promise<void> =>
34
37
  signal.addEventListener("abort", () => resolve(), { once: true });
35
38
  });
36
39
 
40
+ const withTimeout = async <T>(promise: Promise<T>, timeoutMs = 2000): Promise<T> => {
41
+ let timer: ReturnType<typeof setTimeout> | null = null;
42
+ const timeout = new Promise<never>((_, reject) => {
43
+ timer = setTimeout(() => reject(new Error(`Timed out after ${timeoutMs}ms`)), timeoutMs);
44
+ });
45
+
46
+ try {
47
+ return await Promise.race([promise, timeout]);
48
+ } finally {
49
+ if (timer) {
50
+ clearTimeout(timer);
51
+ }
52
+ }
53
+ };
54
+
55
+ const startWsServer = async (
56
+ handler: MediaStreamHandler,
57
+ ): Promise<{
58
+ url: string;
59
+ close: () => Promise<void>;
60
+ }> => {
61
+ const server = http.createServer();
62
+ server.on("upgrade", (request, socket, head) => {
63
+ handler.handleUpgrade(request, socket, head);
64
+ });
65
+
66
+ await new Promise<void>((resolve) => {
67
+ server.listen(0, "127.0.0.1", resolve);
68
+ });
69
+
70
+ const address = server.address();
71
+ if (!address || typeof address === "string") {
72
+ throw new Error("Failed to resolve test server address");
73
+ }
74
+
75
+ return {
76
+ url: `ws://127.0.0.1:${address.port}/voice/stream`,
77
+ close: async () => {
78
+ await new Promise<void>((resolve, reject) => {
79
+ server.close((err) => (err ? reject(err) : resolve()));
80
+ });
81
+ },
82
+ };
83
+ };
84
+
85
+ const connectWs = async (url: string): Promise<WebSocket> => {
86
+ const ws = new WebSocket(url);
87
+ await withTimeout(once(ws, "open") as Promise<[unknown]>);
88
+ return ws;
89
+ };
90
+
91
+ const waitForClose = async (
92
+ ws: WebSocket,
93
+ ): Promise<{
94
+ code: number;
95
+ reason: string;
96
+ }> => {
97
+ const [code, reason] = (await withTimeout(once(ws, "close") as Promise<[number, Buffer]>)) ?? [];
98
+ return {
99
+ code,
100
+ reason: Buffer.isBuffer(reason) ? reason.toString() : String(reason || ""),
101
+ };
102
+ };
103
+
37
104
  describe("MediaStreamHandler TTS queue", () => {
38
105
  it("serializes TTS playback and resolves in order", async () => {
39
106
  const handler = new MediaStreamHandler({
@@ -94,3 +161,111 @@ describe("MediaStreamHandler TTS queue", () => {
94
161
  expect(queuedRan).toBe(false);
95
162
  });
96
163
  });
164
+
165
+ describe("MediaStreamHandler security hardening", () => {
166
+ it("closes idle pre-start connections after timeout", async () => {
167
+ const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
168
+ [];
169
+ const handler = new MediaStreamHandler({
170
+ sttProvider: createStubSttProvider(),
171
+ preStartTimeoutMs: 40,
172
+ shouldAcceptStream: (params) => {
173
+ shouldAcceptStreamCalls.push(params);
174
+ return true;
175
+ },
176
+ });
177
+ const server = await startWsServer(handler);
178
+
179
+ try {
180
+ const ws = await connectWs(server.url);
181
+ const closed = await waitForClose(ws);
182
+
183
+ expect(closed.code).toBe(1008);
184
+ expect(closed.reason).toBe("Start timeout");
185
+ expect(shouldAcceptStreamCalls).toEqual([]);
186
+ } finally {
187
+ await server.close();
188
+ }
189
+ });
190
+
191
+ it("enforces pending connection limits", async () => {
192
+ const handler = new MediaStreamHandler({
193
+ sttProvider: createStubSttProvider(),
194
+ preStartTimeoutMs: 5_000,
195
+ maxPendingConnections: 1,
196
+ maxPendingConnectionsPerIp: 1,
197
+ });
198
+ const server = await startWsServer(handler);
199
+
200
+ try {
201
+ const first = await connectWs(server.url);
202
+ const second = await connectWs(server.url);
203
+ const secondClosed = await waitForClose(second);
204
+
205
+ expect(secondClosed.code).toBe(1013);
206
+ expect(secondClosed.reason).toContain("Too many pending");
207
+ expect(first.readyState).toBe(WebSocket.OPEN);
208
+
209
+ first.close();
210
+ await waitForClose(first);
211
+ } finally {
212
+ await server.close();
213
+ }
214
+ });
215
+
216
+ it("rejects upgrades when max connection cap is reached", async () => {
217
+ const handler = new MediaStreamHandler({
218
+ sttProvider: createStubSttProvider(),
219
+ preStartTimeoutMs: 5_000,
220
+ maxConnections: 1,
221
+ maxPendingConnections: 10,
222
+ maxPendingConnectionsPerIp: 10,
223
+ });
224
+ const server = await startWsServer(handler);
225
+
226
+ try {
227
+ const first = await connectWs(server.url);
228
+ const secondError = await withTimeout(
229
+ new Promise<Error>((resolve) => {
230
+ const ws = new WebSocket(server.url);
231
+ ws.once("error", (err) => resolve(err as Error));
232
+ }),
233
+ );
234
+
235
+ expect(secondError.message).toContain("Unexpected server response: 503");
236
+
237
+ first.close();
238
+ await waitForClose(first);
239
+ } finally {
240
+ await server.close();
241
+ }
242
+ });
243
+
244
+ it("clears pending state after valid start", async () => {
245
+ const handler = new MediaStreamHandler({
246
+ sttProvider: createStubSttProvider(),
247
+ preStartTimeoutMs: 40,
248
+ shouldAcceptStream: () => true,
249
+ });
250
+ const server = await startWsServer(handler);
251
+
252
+ try {
253
+ const ws = await connectWs(server.url);
254
+ ws.send(
255
+ JSON.stringify({
256
+ event: "start",
257
+ streamSid: "MZ123",
258
+ start: { callSid: "CA123", customParameters: { token: "token-123" } },
259
+ }),
260
+ );
261
+
262
+ await new Promise((resolve) => setTimeout(resolve, 80));
263
+ expect(ws.readyState).toBe(WebSocket.OPEN);
264
+
265
+ ws.close();
266
+ await waitForClose(ws);
267
+ } finally {
268
+ await server.close();
269
+ }
270
+ });
271
+ });
@@ -21,6 +21,14 @@ import type {
21
21
  export interface MediaStreamConfig {
22
22
  /** STT provider for transcription */
23
23
  sttProvider: OpenAIRealtimeSTTProvider;
24
+ /** Close sockets that never send a valid `start` frame within this window. */
25
+ preStartTimeoutMs?: number;
26
+ /** Max concurrent pre-start sockets. */
27
+ maxPendingConnections?: number;
28
+ /** Max concurrent pre-start sockets from a single source IP. */
29
+ maxPendingConnectionsPerIp?: number;
30
+ /** Max total open sockets (pending + active sessions). */
31
+ maxConnections?: number;
24
32
  /** Validate whether to accept a media stream for the given call ID */
25
33
  shouldAcceptStream?: (params: { callId: string; streamSid: string; token?: string }) => boolean;
26
34
  /** Callback when transcript is received */
@@ -52,6 +60,16 @@ type TtsQueueEntry = {
52
60
  reject: (error: unknown) => void;
53
61
  };
54
62
 
63
+ type PendingConnection = {
64
+ ip: string;
65
+ timeout: ReturnType<typeof setTimeout>;
66
+ };
67
+
68
+ const DEFAULT_PRE_START_TIMEOUT_MS = 5000;
69
+ const DEFAULT_MAX_PENDING_CONNECTIONS = 32;
70
+ const DEFAULT_MAX_PENDING_CONNECTIONS_PER_IP = 4;
71
+ const DEFAULT_MAX_CONNECTIONS = 128;
72
+
55
73
  /**
56
74
  * Manages WebSocket connections for Twilio media streams.
57
75
  */
@@ -59,6 +77,14 @@ export class MediaStreamHandler {
59
77
  private wss: WebSocketServer | null = null;
60
78
  private sessions = new Map<string, StreamSession>();
61
79
  private config: MediaStreamConfig;
80
+ /** Pending sockets that have upgraded but not yet sent an accepted `start` frame. */
81
+ private pendingConnections = new Map<WebSocket, PendingConnection>();
82
+ /** Pending socket count per remote IP for pre-auth throttling. */
83
+ private pendingByIp = new Map<string, number>();
84
+ private preStartTimeoutMs: number;
85
+ private maxPendingConnections: number;
86
+ private maxPendingConnectionsPerIp: number;
87
+ private maxConnections: number;
62
88
  /** TTS playback queues per stream (serialize audio to prevent overlap) */
63
89
  private ttsQueues = new Map<string, TtsQueueEntry[]>();
64
90
  /** Whether TTS is currently playing per stream */
@@ -68,6 +94,11 @@ export class MediaStreamHandler {
68
94
 
69
95
  constructor(config: MediaStreamConfig) {
70
96
  this.config = config;
97
+ this.preStartTimeoutMs = config.preStartTimeoutMs ?? DEFAULT_PRE_START_TIMEOUT_MS;
98
+ this.maxPendingConnections = config.maxPendingConnections ?? DEFAULT_MAX_PENDING_CONNECTIONS;
99
+ this.maxPendingConnectionsPerIp =
100
+ config.maxPendingConnectionsPerIp ?? DEFAULT_MAX_PENDING_CONNECTIONS_PER_IP;
101
+ this.maxConnections = config.maxConnections ?? DEFAULT_MAX_CONNECTIONS;
71
102
  }
72
103
 
73
104
  /**
@@ -79,6 +110,12 @@ export class MediaStreamHandler {
79
110
  this.wss.on("connection", (ws, req) => this.handleConnection(ws, req));
80
111
  }
81
112
 
113
+ const currentConnections = this.wss.clients.size;
114
+ if (currentConnections >= this.maxConnections) {
115
+ this.rejectUpgrade(socket, 503, "Too many media stream connections");
116
+ return;
117
+ }
118
+
82
119
  this.wss.handleUpgrade(request, socket, head, (ws) => {
83
120
  this.wss?.emit("connection", ws, request);
84
121
  });
@@ -90,6 +127,12 @@ export class MediaStreamHandler {
90
127
  private async handleConnection(ws: WebSocket, _request: IncomingMessage): Promise<void> {
91
128
  let session: StreamSession | null = null;
92
129
  const streamToken = this.getStreamToken(_request);
130
+ const ip = this.getClientIp(_request);
131
+
132
+ if (!this.registerPendingConnection(ws, ip)) {
133
+ ws.close(1013, "Too many pending media stream connections");
134
+ return;
135
+ }
93
136
 
94
137
  ws.on("message", async (data: Buffer) => {
95
138
  try {
@@ -102,6 +145,9 @@ export class MediaStreamHandler {
102
145
 
103
146
  case "start":
104
147
  session = await this.handleStart(ws, message, streamToken);
148
+ if (session) {
149
+ this.clearPendingConnection(ws);
150
+ }
105
151
  break;
106
152
 
107
153
  case "media":
@@ -125,6 +171,7 @@ export class MediaStreamHandler {
125
171
  });
126
172
 
127
173
  ws.on("close", () => {
174
+ this.clearPendingConnection(ws);
128
175
  if (session) {
129
176
  this.handleStop(session);
130
177
  }
@@ -226,6 +273,69 @@ export class MediaStreamHandler {
226
273
  }
227
274
  }
228
275
 
276
+ private getClientIp(request: IncomingMessage): string {
277
+ return request.socket.remoteAddress || "unknown";
278
+ }
279
+
280
+ private registerPendingConnection(ws: WebSocket, ip: string): boolean {
281
+ if (this.pendingConnections.size >= this.maxPendingConnections) {
282
+ console.warn("[MediaStream] Rejecting connection: pending connection limit reached");
283
+ return false;
284
+ }
285
+
286
+ const pendingForIp = this.pendingByIp.get(ip) ?? 0;
287
+ if (pendingForIp >= this.maxPendingConnectionsPerIp) {
288
+ console.warn(`[MediaStream] Rejecting connection: pending per-IP limit reached (${ip})`);
289
+ return false;
290
+ }
291
+
292
+ const timeout = setTimeout(() => {
293
+ if (!this.pendingConnections.has(ws)) {
294
+ return;
295
+ }
296
+ console.warn(
297
+ `[MediaStream] Closing pre-start idle connection after ${this.preStartTimeoutMs}ms (${ip})`,
298
+ );
299
+ ws.close(1008, "Start timeout");
300
+ }, this.preStartTimeoutMs);
301
+
302
+ timeout.unref?.();
303
+ this.pendingConnections.set(ws, { ip, timeout });
304
+ this.pendingByIp.set(ip, pendingForIp + 1);
305
+ return true;
306
+ }
307
+
308
+ private clearPendingConnection(ws: WebSocket): void {
309
+ const pending = this.pendingConnections.get(ws);
310
+ if (!pending) {
311
+ return;
312
+ }
313
+
314
+ clearTimeout(pending.timeout);
315
+ this.pendingConnections.delete(ws);
316
+
317
+ const current = this.pendingByIp.get(pending.ip) ?? 0;
318
+ if (current <= 1) {
319
+ this.pendingByIp.delete(pending.ip);
320
+ return;
321
+ }
322
+ this.pendingByIp.set(pending.ip, current - 1);
323
+ }
324
+
325
+ private rejectUpgrade(socket: Duplex, statusCode: 429 | 503, message: string): void {
326
+ const statusText = statusCode === 429 ? "Too Many Requests" : "Service Unavailable";
327
+ const body = `${message}\n`;
328
+ socket.write(
329
+ `HTTP/1.1 ${statusCode} ${statusText}\r\n` +
330
+ "Connection: close\r\n" +
331
+ "Content-Type: text/plain; charset=utf-8\r\n" +
332
+ `Content-Length: ${Buffer.byteLength(body)}\r\n` +
333
+ "\r\n" +
334
+ body,
335
+ );
336
+ socket.destroy();
337
+ }
338
+
229
339
  /**
230
340
  * Get an active session with an open WebSocket, or undefined if unavailable.
231
341
  */
package/src/webhook.ts CHANGED
@@ -77,6 +77,10 @@ export class VoiceCallWebhookServer {
77
77
 
78
78
  const streamConfig: MediaStreamConfig = {
79
79
  sttProvider,
80
+ preStartTimeoutMs: this.config.streaming?.preStartTimeoutMs,
81
+ maxPendingConnections: this.config.streaming?.maxPendingConnections,
82
+ maxPendingConnectionsPerIp: this.config.streaming?.maxPendingConnectionsPerIp,
83
+ maxConnections: this.config.streaming?.maxConnections,
80
84
  shouldAcceptStream: ({ callId, token }) => {
81
85
  const call = this.manager.getCallByProviderCallId(callId);
82
86
  if (!call) {
@@ -192,9 +196,8 @@ export class VoiceCallWebhookServer {
192
196
  // Handle WebSocket upgrades for media streams
193
197
  if (this.mediaStreamHandler) {
194
198
  this.server.on("upgrade", (request, socket, head) => {
195
- const url = new URL(request.url || "/", `http://${request.headers.host}`);
196
-
197
- if (url.pathname === streamPath) {
199
+ const path = this.getUpgradePathname(request);
200
+ if (path === streamPath) {
198
201
  console.log("[voice-call] WebSocket upgrade for media stream");
199
202
  this.mediaStreamHandler?.handleUpgrade(request, socket, head);
200
203
  } else {
@@ -269,6 +272,15 @@ export class VoiceCallWebhookServer {
269
272
  });
270
273
  }
271
274
 
275
+ private getUpgradePathname(request: http.IncomingMessage): string | null {
276
+ try {
277
+ const host = request.headers.host || "localhost";
278
+ return new URL(request.url || "/", `http://${host}`).pathname;
279
+ } catch {
280
+ return null;
281
+ }
282
+ }
283
+
272
284
  /**
273
285
  * Handle incoming HTTP request.
274
286
  */