@openclaw/voice-call 2026.3.13 → 2026.5.2-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +27 -5
  2. package/api.ts +16 -0
  3. package/cli-metadata.ts +10 -0
  4. package/config-api.ts +12 -0
  5. package/index.test.ts +943 -0
  6. package/index.ts +379 -149
  7. package/openclaw.plugin.json +384 -157
  8. package/package.json +35 -5
  9. package/runtime-api.ts +20 -0
  10. package/runtime-entry.ts +1 -0
  11. package/setup-api.ts +47 -0
  12. package/src/allowlist.test.ts +18 -0
  13. package/src/cli.ts +533 -68
  14. package/src/config-compat.test.ts +120 -0
  15. package/src/config-compat.ts +227 -0
  16. package/src/config.test.ts +273 -12
  17. package/src/config.ts +355 -72
  18. package/src/core-bridge.ts +2 -147
  19. package/src/deep-merge.test.ts +40 -0
  20. package/src/gateway-continue-operation.ts +200 -0
  21. package/src/http-headers.ts +6 -3
  22. package/src/manager/context.ts +6 -5
  23. package/src/manager/events.test.ts +243 -19
  24. package/src/manager/events.ts +61 -31
  25. package/src/manager/lifecycle.ts +53 -0
  26. package/src/manager/lookup.test.ts +52 -0
  27. package/src/manager/outbound.test.ts +528 -0
  28. package/src/manager/outbound.ts +163 -57
  29. package/src/manager/store.ts +18 -6
  30. package/src/manager/timers.test.ts +129 -0
  31. package/src/manager/timers.ts +4 -3
  32. package/src/manager/twiml.test.ts +13 -0
  33. package/src/manager/twiml.ts +8 -0
  34. package/src/manager.closed-loop.test.ts +30 -12
  35. package/src/manager.inbound-allowlist.test.ts +77 -10
  36. package/src/manager.notify.test.ts +344 -20
  37. package/src/manager.restore.test.ts +95 -8
  38. package/src/manager.test-harness.ts +8 -6
  39. package/src/manager.ts +79 -5
  40. package/src/media-stream.test.ts +578 -81
  41. package/src/media-stream.ts +235 -54
  42. package/src/providers/base.ts +19 -0
  43. package/src/providers/mock.ts +7 -1
  44. package/src/providers/plivo.test.ts +50 -6
  45. package/src/providers/plivo.ts +14 -6
  46. package/src/providers/shared/call-status.ts +2 -1
  47. package/src/providers/shared/guarded-json-api.test.ts +106 -0
  48. package/src/providers/shared/guarded-json-api.ts +1 -1
  49. package/src/providers/telnyx.test.ts +178 -6
  50. package/src/providers/telnyx.ts +40 -3
  51. package/src/providers/twilio/api.test.ts +145 -0
  52. package/src/providers/twilio/api.ts +67 -16
  53. package/src/providers/twilio/twiml-policy.ts +6 -10
  54. package/src/providers/twilio/webhook.ts +1 -1
  55. package/src/providers/twilio.test.ts +425 -25
  56. package/src/providers/twilio.ts +230 -77
  57. package/src/providers/twilio.types.ts +17 -0
  58. package/src/realtime-defaults.ts +3 -0
  59. package/src/realtime-fast-context.test.ts +88 -0
  60. package/src/realtime-fast-context.ts +165 -0
  61. package/src/realtime-transcription.runtime.ts +4 -0
  62. package/src/realtime-voice.runtime.ts +5 -0
  63. package/src/response-generator.test.ts +321 -0
  64. package/src/response-generator.ts +213 -53
  65. package/src/response-model.test.ts +71 -0
  66. package/src/response-model.ts +23 -0
  67. package/src/runtime.test.ts +429 -0
  68. package/src/runtime.ts +270 -24
  69. package/src/telephony-audio.test.ts +61 -0
  70. package/src/telephony-audio.ts +1 -79
  71. package/src/telephony-tts.test.ts +133 -12
  72. package/src/telephony-tts.ts +155 -2
  73. package/src/test-fixtures.ts +28 -7
  74. package/src/tts-provider-voice.test.ts +34 -0
  75. package/src/tts-provider-voice.ts +21 -0
  76. package/src/tunnel.test.ts +166 -0
  77. package/src/tunnel.ts +1 -1
  78. package/src/types.ts +24 -37
  79. package/src/utils.test.ts +17 -0
  80. package/src/voice-mapping.test.ts +34 -0
  81. package/src/voice-mapping.ts +3 -2
  82. package/src/webhook/realtime-handler.test.ts +598 -0
  83. package/src/webhook/realtime-handler.ts +485 -0
  84. package/src/webhook/stale-call-reaper.test.ts +88 -0
  85. package/src/webhook/stale-call-reaper.ts +5 -0
  86. package/src/webhook/tailscale.test.ts +214 -0
  87. package/src/webhook/tailscale.ts +19 -5
  88. package/src/webhook-exposure.test.ts +33 -0
  89. package/src/webhook-exposure.ts +84 -0
  90. package/src/webhook-security.test.ts +172 -21
  91. package/src/webhook-security.ts +43 -29
  92. package/src/webhook.hangup-once.lifecycle.test.ts +135 -0
  93. package/src/webhook.test.ts +1145 -27
  94. package/src/webhook.ts +523 -102
  95. package/src/webhook.types.ts +5 -0
  96. package/src/websocket-test-support.ts +72 -0
  97. package/tsconfig.json +16 -0
  98. package/CHANGELOG.md +0 -121
  99. package/src/providers/index.ts +0 -10
  100. package/src/providers/stt-openai-realtime.test.ts +0 -42
  101. package/src/providers/stt-openai-realtime.ts +0 -311
  102. package/src/providers/tts-openai.test.ts +0 -43
  103. package/src/providers/tts-openai.ts +0 -221
@@ -3,24 +3,27 @@
3
3
  *
4
4
  * Handles bidirectional audio streaming between Twilio and the AI services.
5
5
  * - Receives mu-law audio from Twilio via WebSocket
6
- * - Forwards to OpenAI Realtime STT for transcription
6
+ * - Forwards to the selected realtime transcription provider
7
7
  * - Sends TTS audio back to Twilio
8
8
  */
9
9
 
10
10
  import type { IncomingMessage } from "node:http";
11
11
  import type { Duplex } from "node:stream";
12
- import { WebSocket, WebSocketServer } from "ws";
13
12
  import type {
14
- OpenAIRealtimeSTTProvider,
15
- RealtimeSTTSession,
16
- } from "./providers/stt-openai-realtime.js";
13
+ RealtimeTranscriptionProviderConfig,
14
+ RealtimeTranscriptionProviderPlugin,
15
+ RealtimeTranscriptionSession,
16
+ } from "openclaw/plugin-sdk/realtime-transcription";
17
+ import { type RawData, WebSocket, WebSocketServer } from "ws";
17
18
 
18
19
  /**
19
20
  * Configuration for the media stream handler.
20
21
  */
21
22
  export interface MediaStreamConfig {
22
- /** STT provider for transcription */
23
- sttProvider: OpenAIRealtimeSTTProvider;
23
+ /** Realtime transcription provider for streaming STT. */
24
+ transcriptionProvider: RealtimeTranscriptionProviderPlugin;
25
+ /** Provider-owned config blob passed into the transcription session. */
26
+ providerConfig: RealtimeTranscriptionProviderConfig;
24
27
  /** Close sockets that never send a valid `start` frame within this window. */
25
28
  preStartTimeoutMs?: number;
26
29
  /** Max concurrent pre-start sockets. */
@@ -29,6 +32,8 @@ export interface MediaStreamConfig {
29
32
  maxPendingConnectionsPerIp?: number;
30
33
  /** Max total open sockets (pending + active sessions). */
31
34
  maxConnections?: number;
35
+ /** Optional trusted resolver for the source IP used by pending-connection guards. */
36
+ resolveClientIp?: (request: IncomingMessage) => string | undefined;
32
37
  /** Validate whether to accept a media stream for the given call ID */
33
38
  shouldAcceptStream?: (params: { callId: string; streamSid: string; token?: string }) => boolean;
34
39
  /** Callback when transcript is received */
@@ -37,10 +42,12 @@ export interface MediaStreamConfig {
37
42
  onPartialTranscript?: (callId: string, partial: string) => void;
38
43
  /** Callback when stream connects */
39
44
  onConnect?: (callId: string, streamSid: string) => void;
45
+ /** Callback when realtime transcription is ready for the stream */
46
+ onTranscriptionReady?: (callId: string, streamSid: string) => void;
40
47
  /** Callback when speech starts (barge-in) */
41
48
  onSpeechStart?: (callId: string) => void;
42
49
  /** Callback when stream disconnects */
43
- onDisconnect?: (callId: string) => void;
50
+ onDisconnect?: (callId: string, streamSid: string) => void;
44
51
  }
45
52
 
46
53
  /**
@@ -50,7 +57,7 @@ interface StreamSession {
50
57
  callId: string;
51
58
  streamSid: string;
52
59
  ws: WebSocket;
53
- sttSession: RealtimeSTTSession;
60
+ sttSession: RealtimeTranscriptionSession;
54
61
  }
55
62
 
56
63
  type TtsQueueEntry = {
@@ -60,6 +67,13 @@ type TtsQueueEntry = {
60
67
  reject: (error: unknown) => void;
61
68
  };
62
69
 
70
+ type StreamSendResult = {
71
+ sent: boolean;
72
+ readyState?: number;
73
+ bufferedBeforeBytes: number;
74
+ bufferedAfterBytes: number;
75
+ };
76
+
63
77
  type PendingConnection = {
64
78
  ip: string;
65
79
  timeout: ReturnType<typeof setTimeout>;
@@ -69,6 +83,30 @@ const DEFAULT_PRE_START_TIMEOUT_MS = 5000;
69
83
  const DEFAULT_MAX_PENDING_CONNECTIONS = 32;
70
84
  const DEFAULT_MAX_PENDING_CONNECTIONS_PER_IP = 4;
71
85
  const DEFAULT_MAX_CONNECTIONS = 128;
86
+ const MAX_INBOUND_MESSAGE_BYTES = 64 * 1024;
87
+ const MAX_WS_BUFFERED_BYTES = 1024 * 1024;
88
+ const CLOSE_REASON_LOG_MAX_CHARS = 120;
89
+
90
+ export function sanitizeLogText(value: string, maxChars: number): string {
91
+ const sanitized = value
92
+ .replace(/\p{Cc}/gu, " ")
93
+ .replace(/\s+/g, " ")
94
+ .trim();
95
+ if (sanitized.length <= maxChars) {
96
+ return sanitized;
97
+ }
98
+ return `${sanitized.slice(0, maxChars)}...`;
99
+ }
100
+
101
+ function normalizeWsMessageData(data: RawData): Buffer {
102
+ if (Buffer.isBuffer(data)) {
103
+ return data;
104
+ }
105
+ if (Array.isArray(data)) {
106
+ return Buffer.concat(data);
107
+ }
108
+ return Buffer.from(data);
109
+ }
72
110
 
73
111
  /**
74
112
  * Manages WebSocket connections for Twilio media streams.
@@ -85,6 +123,7 @@ export class MediaStreamHandler {
85
123
  private maxPendingConnections: number;
86
124
  private maxPendingConnectionsPerIp: number;
87
125
  private maxConnections: number;
126
+ private inflightUpgrades = 0;
88
127
  /** TTS playback queues per stream (serialize audio to prevent overlap) */
89
128
  private ttsQueues = new Map<string, TtsQueueEntry[]>();
90
129
  /** Whether TTS is currently playing per stream */
@@ -106,19 +145,50 @@ export class MediaStreamHandler {
106
145
  */
107
146
  handleUpgrade(request: IncomingMessage, socket: Duplex, head: Buffer): void {
108
147
  if (!this.wss) {
109
- this.wss = new WebSocketServer({ noServer: true });
148
+ this.wss = new WebSocketServer({
149
+ noServer: true,
150
+ // Reject oversized frames before app-level parsing runs on unauthenticated sockets.
151
+ maxPayload: MAX_INBOUND_MESSAGE_BYTES,
152
+ });
110
153
  this.wss.on("connection", (ws, req) => this.handleConnection(ws, req));
111
154
  }
112
155
 
113
- const currentConnections = this.wss.clients.size;
156
+ const currentConnections = this.getCurrentConnectionCount();
114
157
  if (currentConnections >= this.maxConnections) {
115
158
  this.rejectUpgrade(socket, 503, "Too many media stream connections");
116
159
  return;
117
160
  }
118
161
 
119
- this.wss.handleUpgrade(request, socket, head, (ws) => {
120
- this.wss?.emit("connection", ws, request);
121
- });
162
+ this.inflightUpgrades += 1;
163
+ let released = false;
164
+ const releaseUpgradeReservation = () => {
165
+ if (released) {
166
+ return;
167
+ }
168
+ released = true;
169
+ this.inflightUpgrades = Math.max(0, this.inflightUpgrades - 1);
170
+ };
171
+ const handleUpgradeAbort = () => {
172
+ socket.removeListener("error", handleUpgradeAbort);
173
+ socket.removeListener("close", handleUpgradeAbort);
174
+ releaseUpgradeReservation();
175
+ };
176
+ socket.once("error", handleUpgradeAbort);
177
+ socket.once("close", handleUpgradeAbort);
178
+
179
+ try {
180
+ this.wss.handleUpgrade(request, socket, head, (ws) => {
181
+ socket.removeListener("error", handleUpgradeAbort);
182
+ socket.removeListener("close", handleUpgradeAbort);
183
+ releaseUpgradeReservation();
184
+ this.wss?.emit("connection", ws, request);
185
+ });
186
+ } catch (error) {
187
+ socket.removeListener("error", handleUpgradeAbort);
188
+ socket.removeListener("close", handleUpgradeAbort);
189
+ releaseUpgradeReservation();
190
+ throw error;
191
+ }
122
192
  }
123
193
 
124
194
  /**
@@ -134,9 +204,10 @@ export class MediaStreamHandler {
134
204
  return;
135
205
  }
136
206
 
137
- ws.on("message", async (data: Buffer) => {
207
+ ws.on("message", async (data: RawData) => {
138
208
  try {
139
- const message = JSON.parse(data.toString()) as TwilioMediaMessage;
209
+ const raw = normalizeWsMessageData(data);
210
+ const message = JSON.parse(raw.toString("utf8")) as TwilioMediaMessage;
140
211
 
141
212
  switch (message.event) {
142
213
  case "connected":
@@ -144,7 +215,7 @@ export class MediaStreamHandler {
144
215
  break;
145
216
 
146
217
  case "start":
147
- session = await this.handleStart(ws, message, streamToken);
218
+ session = this.handleStart(ws, message, streamToken);
148
219
  if (session) {
149
220
  this.clearPendingConnection(ws);
150
221
  }
@@ -164,13 +235,22 @@ export class MediaStreamHandler {
164
235
  session = null;
165
236
  }
166
237
  break;
238
+
239
+ case "clear":
240
+ case "mark":
241
+ break;
167
242
  }
168
243
  } catch (error) {
169
244
  console.error("[MediaStream] Error processing message:", error);
170
245
  }
171
246
  });
172
247
 
173
- ws.on("close", () => {
248
+ ws.on("close", (code, reason) => {
249
+ const rawReason = Buffer.isBuffer(reason) ? reason.toString("utf8") : String(reason || "");
250
+ const reasonText = sanitizeLogText(rawReason, CLOSE_REASON_LOG_MAX_CHARS);
251
+ console.log(
252
+ `[MediaStream] WebSocket closed (code: ${code}, reason: ${reasonText || "none"})`,
253
+ );
174
254
  this.clearPendingConnection(ws);
175
255
  if (session) {
176
256
  this.handleStop(session);
@@ -185,11 +265,11 @@ export class MediaStreamHandler {
185
265
  /**
186
266
  * Handle stream start event.
187
267
  */
188
- private async handleStart(
268
+ private handleStart(
189
269
  ws: WebSocket,
190
270
  message: TwilioMediaMessage,
191
271
  streamToken?: string,
192
- ): Promise<StreamSession | null> {
272
+ ): StreamSession | null {
193
273
  const streamSid = message.streamSid || "";
194
274
  const callSid = message.start?.callSid || "";
195
275
 
@@ -213,20 +293,20 @@ export class MediaStreamHandler {
213
293
  return null;
214
294
  }
215
295
 
216
- // Create STT session
217
- const sttSession = this.config.sttProvider.createSession();
218
-
219
- // Set up transcript callbacks
220
- sttSession.onPartial((partial) => {
221
- this.config.onPartialTranscript?.(callSid, partial);
222
- });
223
-
224
- sttSession.onTranscript((transcript) => {
225
- this.config.onTranscript?.(callSid, transcript);
226
- });
227
-
228
- sttSession.onSpeechStart(() => {
229
- this.config.onSpeechStart?.(callSid);
296
+ const sttSession = this.config.transcriptionProvider.createSession({
297
+ providerConfig: this.config.providerConfig,
298
+ onPartial: (partial) => {
299
+ this.config.onPartialTranscript?.(callSid, partial);
300
+ },
301
+ onTranscript: (transcript) => {
302
+ this.config.onTranscript?.(callSid, transcript);
303
+ },
304
+ onSpeechStart: () => {
305
+ this.config.onSpeechStart?.(callSid);
306
+ },
307
+ onError: (error) => {
308
+ console.warn("[MediaStream] Transcription session error:", error.message);
309
+ },
230
310
  });
231
311
 
232
312
  const session: StreamSession = {
@@ -237,18 +317,42 @@ export class MediaStreamHandler {
237
317
  };
238
318
 
239
319
  this.sessions.set(streamSid, session);
240
-
241
- // Notify connection BEFORE STT connect so TTS can work even if STT fails
242
320
  this.config.onConnect?.(callSid, streamSid);
243
-
244
- // Connect to OpenAI STT (non-blocking, log errors but don't fail the call)
245
- sttSession.connect().catch((err) => {
246
- console.warn(`[MediaStream] STT connection failed (TTS still works):`, err.message);
247
- });
321
+ void this.connectTranscriptionAndNotify(session);
248
322
 
249
323
  return session;
250
324
  }
251
325
 
326
+ private async connectTranscriptionAndNotify(session: StreamSession): Promise<void> {
327
+ try {
328
+ await session.sttSession.connect();
329
+ } catch (error) {
330
+ console.warn(
331
+ "[MediaStream] STT connection failed; closing media stream:",
332
+ error instanceof Error ? error.message : String(error),
333
+ );
334
+ if (
335
+ this.sessions.get(session.streamSid) === session &&
336
+ session.ws.readyState === WebSocket.OPEN
337
+ ) {
338
+ session.ws.close(1011, "STT connection failed");
339
+ } else {
340
+ session.sttSession.close();
341
+ }
342
+ return;
343
+ }
344
+
345
+ if (
346
+ this.sessions.get(session.streamSid) !== session ||
347
+ session.ws.readyState !== WebSocket.OPEN
348
+ ) {
349
+ session.sttSession.close();
350
+ return;
351
+ }
352
+
353
+ this.config.onTranscriptionReady?.(session.callId, session.streamSid);
354
+ }
355
+
252
356
  /**
253
357
  * Handle stream stop event.
254
358
  */
@@ -258,7 +362,7 @@ export class MediaStreamHandler {
258
362
  this.clearTtsState(session.streamSid);
259
363
  session.sttSession.close();
260
364
  this.sessions.delete(session.streamSid);
261
- this.config.onDisconnect?.(session.callId);
365
+ this.config.onDisconnect?.(session.callId, session.streamSid);
262
366
  }
263
367
 
264
368
  private getStreamToken(request: IncomingMessage): string | undefined {
@@ -274,9 +378,17 @@ export class MediaStreamHandler {
274
378
  }
275
379
 
276
380
  private getClientIp(request: IncomingMessage): string {
381
+ const resolvedIp = this.config.resolveClientIp?.(request)?.trim();
382
+ if (resolvedIp) {
383
+ return resolvedIp;
384
+ }
277
385
  return request.socket.remoteAddress || "unknown";
278
386
  }
279
387
 
388
+ private getCurrentConnectionCount(): number {
389
+ return this.wss ? this.wss.clients.size + this.inflightUpgrades : this.inflightUpgrades;
390
+ }
391
+
280
392
  private registerPendingConnection(ws: WebSocket, ip: string): boolean {
281
393
  if (this.pendingConnections.size >= this.maxPendingConnections) {
282
394
  console.warn("[MediaStream] Rejecting connection: pending connection limit reached");
@@ -347,17 +459,78 @@ export class MediaStreamHandler {
347
459
  /**
348
460
  * Send a message to a stream's WebSocket if available.
349
461
  */
350
- private sendToStream(streamSid: string, message: unknown): void {
351
- const session = this.getOpenSession(streamSid);
352
- session?.ws.send(JSON.stringify(message));
462
+ private sendToStream(streamSid: string, message: unknown): StreamSendResult {
463
+ const session = this.sessions.get(streamSid);
464
+ if (!session) {
465
+ return {
466
+ sent: false,
467
+ bufferedBeforeBytes: 0,
468
+ bufferedAfterBytes: 0,
469
+ };
470
+ }
471
+
472
+ const readyState = session.ws.readyState;
473
+ const bufferedBeforeBytes = session.ws.bufferedAmount;
474
+ if (readyState !== WebSocket.OPEN) {
475
+ return {
476
+ sent: false,
477
+ readyState,
478
+ bufferedBeforeBytes,
479
+ bufferedAfterBytes: session.ws.bufferedAmount,
480
+ };
481
+ }
482
+ if (bufferedBeforeBytes > MAX_WS_BUFFERED_BYTES) {
483
+ try {
484
+ session.ws.close(1013, "Backpressure: send buffer exceeded");
485
+ } catch {
486
+ // Best-effort close; caller still receives sent:false.
487
+ }
488
+ return {
489
+ sent: false,
490
+ readyState,
491
+ bufferedBeforeBytes,
492
+ bufferedAfterBytes: session.ws.bufferedAmount,
493
+ };
494
+ }
495
+
496
+ try {
497
+ session.ws.send(JSON.stringify(message));
498
+ const bufferedAfterBytes = session.ws.bufferedAmount;
499
+ if (bufferedAfterBytes > MAX_WS_BUFFERED_BYTES) {
500
+ try {
501
+ session.ws.close(1013, "Backpressure: send buffer exceeded");
502
+ } catch {
503
+ // Best-effort close; caller still receives sent:false.
504
+ }
505
+ return {
506
+ sent: false,
507
+ readyState,
508
+ bufferedBeforeBytes,
509
+ bufferedAfterBytes,
510
+ };
511
+ }
512
+ return {
513
+ sent: true,
514
+ readyState,
515
+ bufferedBeforeBytes,
516
+ bufferedAfterBytes,
517
+ };
518
+ } catch {
519
+ return {
520
+ sent: false,
521
+ readyState,
522
+ bufferedBeforeBytes,
523
+ bufferedAfterBytes: session.ws.bufferedAmount,
524
+ };
525
+ }
353
526
  }
354
527
 
355
528
  /**
356
529
  * Send audio to a specific stream (for TTS playback).
357
530
  * Audio should be mu-law encoded at 8kHz mono.
358
531
  */
359
- sendAudio(streamSid: string, muLawAudio: Buffer): void {
360
- this.sendToStream(streamSid, {
532
+ sendAudio(streamSid: string, muLawAudio: Buffer): StreamSendResult {
533
+ return this.sendToStream(streamSid, {
361
534
  event: "media",
362
535
  streamSid,
363
536
  media: { payload: muLawAudio.toString("base64") },
@@ -367,8 +540,8 @@ export class MediaStreamHandler {
367
540
  /**
368
541
  * Send a mark event to track audio playback position.
369
542
  */
370
- sendMark(streamSid: string, name: string): void {
371
- this.sendToStream(streamSid, {
543
+ sendMark(streamSid: string, name: string): StreamSendResult {
544
+ return this.sendToStream(streamSid, {
372
545
  event: "mark",
373
546
  streamSid,
374
547
  mark: { name },
@@ -378,8 +551,8 @@ export class MediaStreamHandler {
378
551
  /**
379
552
  * Clear audio buffer (interrupt playback).
380
553
  */
381
- clearAudio(streamSid: string): void {
382
- this.sendToStream(streamSid, { event: "clear", streamSid });
554
+ clearAudio(streamSid: string): StreamSendResult {
555
+ return this.sendToStream(streamSid, { event: "clear", streamSid });
383
556
  }
384
557
 
385
558
  /**
@@ -412,9 +585,9 @@ export class MediaStreamHandler {
412
585
  /**
413
586
  * Clear TTS queue and interrupt current playback (barge-in).
414
587
  */
415
- clearTtsQueue(streamSid: string): void {
588
+ clearTtsQueue(streamSid: string, _reason = "unspecified"): void {
416
589
  const queue = this.getTtsQueue(streamSid);
417
- queue.length = 0;
590
+ this.resolveQueuedTtsEntries(queue);
418
591
  this.ttsActiveControllers.get(streamSid)?.abort();
419
592
  this.clearAudio(streamSid);
420
593
  }
@@ -487,13 +660,21 @@ export class MediaStreamHandler {
487
660
  private clearTtsState(streamSid: string): void {
488
661
  const queue = this.ttsQueues.get(streamSid);
489
662
  if (queue) {
490
- queue.length = 0;
663
+ this.resolveQueuedTtsEntries(queue);
491
664
  }
492
665
  this.ttsActiveControllers.get(streamSid)?.abort();
493
666
  this.ttsActiveControllers.delete(streamSid);
494
667
  this.ttsPlaying.delete(streamSid);
495
668
  this.ttsQueues.delete(streamSid);
496
669
  }
670
+
671
+ private resolveQueuedTtsEntries(queue: TtsQueueEntry[]): void {
672
+ const pending = queue.splice(0);
673
+ for (const entry of pending) {
674
+ entry.controller.abort();
675
+ entry.resolve();
676
+ }
677
+ }
497
678
  }
498
679
 
499
680
  /**
@@ -1,4 +1,5 @@
1
1
  import type {
2
+ AnswerCallInput,
2
3
  GetCallStatusInput,
3
4
  GetCallStatusResult,
4
5
  HangupCallInput,
@@ -6,6 +7,7 @@ import type {
6
7
  InitiateCallResult,
7
8
  PlayTtsInput,
8
9
  ProviderName,
10
+ SendDtmfInput,
9
11
  WebhookParseOptions,
10
12
  ProviderWebhookParseResult,
11
13
  StartListeningInput,
@@ -41,12 +43,24 @@ export interface VoiceCallProvider {
41
43
  */
42
44
  parseWebhookEvent(ctx: WebhookContext, options?: WebhookParseOptions): ProviderWebhookParseResult;
43
45
 
46
+ /**
47
+ * Consume one-time TwiML that must be served before shortcut handlers such as
48
+ * realtime media streams take over the webhook response.
49
+ */
50
+ consumeInitialTwiML?: (ctx: WebhookContext) => string | null;
51
+
44
52
  /**
45
53
  * Initiate an outbound call.
46
54
  * @returns Provider call ID and status
47
55
  */
48
56
  initiateCall(input: InitiateCallInput): Promise<InitiateCallResult>;
49
57
 
58
+ /**
59
+ * Answer an accepted inbound call when the provider requires an explicit
60
+ * answer command after the initial webhook.
61
+ */
62
+ answerCall?: (input: AnswerCallInput) => Promise<void>;
63
+
50
64
  /**
51
65
  * Hang up an active call.
52
66
  */
@@ -58,6 +72,11 @@ export interface VoiceCallProvider {
58
72
  */
59
73
  playTts(input: PlayTtsInput): Promise<void>;
60
74
 
75
+ /**
76
+ * Send DTMF digits to an active call.
77
+ */
78
+ sendDtmf?: (input: SendDtmfInput) => Promise<void>;
79
+
61
80
  /**
62
81
  * Start listening for user speech (activate STT).
63
82
  */
@@ -1,4 +1,5 @@
1
1
  import crypto from "node:crypto";
2
+ import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
2
3
  import type {
3
4
  EndReason,
4
5
  GetCallStatusInput,
@@ -10,6 +11,7 @@ import type {
10
11
  PlayTtsInput,
11
12
  WebhookParseOptions,
12
13
  ProviderWebhookParseResult,
14
+ SendDtmfInput,
13
15
  StartListeningInput,
14
16
  StopListeningInput,
15
17
  WebhookContext,
@@ -161,6 +163,10 @@ export class MockProvider implements VoiceCallProvider {
161
163
  // No-op for mock
162
164
  }
163
165
 
166
+ async sendDtmf(_input: SendDtmfInput): Promise<void> {
167
+ // No-op for mock
168
+ }
169
+
164
170
  async startListening(_input: StartListeningInput): Promise<void> {
165
171
  // No-op for mock
166
172
  }
@@ -170,7 +176,7 @@ export class MockProvider implements VoiceCallProvider {
170
176
  }
171
177
 
172
178
  async getCallStatus(input: GetCallStatusInput): Promise<GetCallStatusResult> {
173
- const id = input.providerCallId.toLowerCase();
179
+ const id = normalizeLowercaseStringOrEmpty(input.providerCallId);
174
180
  if (id.includes("stale") || id.includes("ended") || id.includes("completed")) {
175
181
  return { status: "completed", isTerminal: true };
176
182
  }
@@ -1,6 +1,20 @@
1
1
  import { describe, expect, it } from "vitest";
2
2
  import { PlivoProvider } from "./plivo.js";
3
3
 
4
+ function requireEvent<T>(event: T | undefined, message: string): T {
5
+ if (!event) {
6
+ throw new Error(message);
7
+ }
8
+ return event;
9
+ }
10
+
11
+ function requireResponseBody(body: string | undefined): string {
12
+ if (!body) {
13
+ throw new Error("Plivo provider did not return a response body");
14
+ }
15
+ return body;
16
+ }
17
+
4
18
  describe("PlivoProvider", () => {
5
19
  it("parses answer callback into call.answered and returns keep-alive XML", () => {
6
20
  const provider = new PlivoProvider({
@@ -18,11 +32,13 @@ describe("PlivoProvider", () => {
18
32
  });
19
33
 
20
34
  expect(result.events).toHaveLength(1);
21
- expect(result.events[0]?.type).toBe("call.answered");
22
- expect(result.events[0]?.callId).toBe("internal-call-id");
23
- expect(result.events[0]?.providerCallId).toBe("call-uuid");
24
- expect(result.providerResponseBody).toContain("<Wait");
25
- expect(result.providerResponseBody).toContain('length="300"');
35
+ const event = requireEvent(result.events[0], "expected Plivo answer event");
36
+ expect(event.type).toBe("call.answered");
37
+ expect(event.callId).toBe("internal-call-id");
38
+ expect(event.providerCallId).toBe("call-uuid");
39
+ const responseBody = requireResponseBody(result.providerResponseBody);
40
+ expect(responseBody).toContain("<Wait");
41
+ expect(responseBody).toContain('length="300"');
26
42
  });
27
43
 
28
44
  it("uses verified request key when provided", () => {
@@ -44,6 +60,34 @@ describe("PlivoProvider", () => {
44
60
  );
45
61
 
46
62
  expect(result.events).toHaveLength(1);
47
- expect(result.events[0]?.dedupeKey).toBe("plivo:v3:verified");
63
+ expect(requireEvent(result.events[0], "expected verified Plivo event").dedupeKey).toBe(
64
+ "plivo:v3:verified",
65
+ );
66
+ });
67
+
68
+ it("pins stored callback bases to publicUrl instead of request Host", () => {
69
+ const provider = new PlivoProvider(
70
+ {
71
+ authId: "MA000000000000000000",
72
+ authToken: "test-token",
73
+ },
74
+ {
75
+ publicUrl: "https://voice.openclaw.ai/voice/webhook?provider=plivo",
76
+ },
77
+ );
78
+
79
+ provider.parseWebhookEvent({
80
+ headers: { host: "attacker.example" },
81
+ rawBody:
82
+ "CallUUID=call-uuid&CallStatus=in-progress&Direction=outbound&From=%2B15550000000&To=%2B15550000001&Event=StartApp",
83
+ url: "https://attacker.example/voice/webhook?provider=plivo&flow=answer&callId=internal-call-id",
84
+ method: "POST",
85
+ query: { provider: "plivo", flow: "answer", callId: "internal-call-id" },
86
+ });
87
+
88
+ const callbackMap = (provider as unknown as { callUuidToWebhookUrl: Map<string, string> })
89
+ .callUuidToWebhookUrl;
90
+
91
+ expect(callbackMap.get("call-uuid")).toBe("https://voice.openclaw.ai/voice/webhook");
48
92
  });
49
93
  });
@@ -1,4 +1,8 @@
1
1
  import crypto from "node:crypto";
2
+ import {
3
+ normalizeLowercaseStringOrEmpty,
4
+ normalizeOptionalString,
5
+ } from "openclaw/plugin-sdk/text-runtime";
2
6
  import type { PlivoConfig, WebhookSecurityConfig } from "../config.js";
3
7
  import { getHeader } from "../http-headers.js";
4
8
  import type {
@@ -129,7 +133,7 @@ export class PlivoProvider implements VoiceCallProvider {
129
133
  ctx: WebhookContext,
130
134
  options?: WebhookParseOptions,
131
135
  ): ProviderWebhookParseResult {
132
- const flow = typeof ctx.query?.flow === "string" ? ctx.query.flow.trim() : "";
136
+ const flow = normalizeOptionalString(ctx.query?.flow) ?? "";
133
137
 
134
138
  const parsed = this.parseBody(ctx.rawBody);
135
139
  if (!parsed) {
@@ -480,7 +484,7 @@ export class PlivoProvider implements VoiceCallProvider {
480
484
 
481
485
  private static normalizeNumber(numberOrSip: string): string {
482
486
  const trimmed = numberOrSip.trim();
483
- if (trimmed.toLowerCase().startsWith("sip:")) {
487
+ if (normalizeLowercaseStringOrEmpty(trimmed).startsWith("sip:")) {
484
488
  return trimmed;
485
489
  }
486
490
  return trimmed.replace(/[^\d+]/g, "");
@@ -517,10 +521,7 @@ export class PlivoProvider implements VoiceCallProvider {
517
521
  }
518
522
 
519
523
  private getCallIdFromQuery(ctx: WebhookContext): string | undefined {
520
- const callId =
521
- typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
522
- ? ctx.query.callId.trim()
523
- : undefined;
524
+ const callId = normalizeOptionalString(ctx.query?.callId);
524
525
  return callId || undefined;
525
526
  }
526
527
 
@@ -544,6 +545,13 @@ export class PlivoProvider implements VoiceCallProvider {
544
545
 
545
546
  private baseWebhookUrlFromCtx(ctx: WebhookContext): string | null {
546
547
  try {
548
+ if (this.options.publicUrl) {
549
+ const base = new URL(this.options.publicUrl);
550
+ const requestUrl = new URL(ctx.url);
551
+ base.pathname = requestUrl.pathname;
552
+ return `${base.origin}${base.pathname}`;
553
+ }
554
+
547
555
  const u = new URL(
548
556
  reconstructWebhookUrl(ctx, {
549
557
  allowedHosts: this.options.webhookSecurity?.allowedHosts,