@openclaw/voice-call 2026.3.13 → 2026.5.2-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +27 -5
  2. package/api.ts +16 -0
  3. package/cli-metadata.ts +10 -0
  4. package/config-api.ts +12 -0
  5. package/index.test.ts +943 -0
  6. package/index.ts +379 -149
  7. package/openclaw.plugin.json +384 -157
  8. package/package.json +35 -5
  9. package/runtime-api.ts +20 -0
  10. package/runtime-entry.ts +1 -0
  11. package/setup-api.ts +47 -0
  12. package/src/allowlist.test.ts +18 -0
  13. package/src/cli.ts +533 -68
  14. package/src/config-compat.test.ts +120 -0
  15. package/src/config-compat.ts +227 -0
  16. package/src/config.test.ts +273 -12
  17. package/src/config.ts +355 -72
  18. package/src/core-bridge.ts +2 -147
  19. package/src/deep-merge.test.ts +40 -0
  20. package/src/gateway-continue-operation.ts +200 -0
  21. package/src/http-headers.ts +6 -3
  22. package/src/manager/context.ts +6 -5
  23. package/src/manager/events.test.ts +243 -19
  24. package/src/manager/events.ts +61 -31
  25. package/src/manager/lifecycle.ts +53 -0
  26. package/src/manager/lookup.test.ts +52 -0
  27. package/src/manager/outbound.test.ts +528 -0
  28. package/src/manager/outbound.ts +163 -57
  29. package/src/manager/store.ts +18 -6
  30. package/src/manager/timers.test.ts +129 -0
  31. package/src/manager/timers.ts +4 -3
  32. package/src/manager/twiml.test.ts +13 -0
  33. package/src/manager/twiml.ts +8 -0
  34. package/src/manager.closed-loop.test.ts +30 -12
  35. package/src/manager.inbound-allowlist.test.ts +77 -10
  36. package/src/manager.notify.test.ts +344 -20
  37. package/src/manager.restore.test.ts +95 -8
  38. package/src/manager.test-harness.ts +8 -6
  39. package/src/manager.ts +79 -5
  40. package/src/media-stream.test.ts +578 -81
  41. package/src/media-stream.ts +235 -54
  42. package/src/providers/base.ts +19 -0
  43. package/src/providers/mock.ts +7 -1
  44. package/src/providers/plivo.test.ts +50 -6
  45. package/src/providers/plivo.ts +14 -6
  46. package/src/providers/shared/call-status.ts +2 -1
  47. package/src/providers/shared/guarded-json-api.test.ts +106 -0
  48. package/src/providers/shared/guarded-json-api.ts +1 -1
  49. package/src/providers/telnyx.test.ts +178 -6
  50. package/src/providers/telnyx.ts +40 -3
  51. package/src/providers/twilio/api.test.ts +145 -0
  52. package/src/providers/twilio/api.ts +67 -16
  53. package/src/providers/twilio/twiml-policy.ts +6 -10
  54. package/src/providers/twilio/webhook.ts +1 -1
  55. package/src/providers/twilio.test.ts +425 -25
  56. package/src/providers/twilio.ts +230 -77
  57. package/src/providers/twilio.types.ts +17 -0
  58. package/src/realtime-defaults.ts +3 -0
  59. package/src/realtime-fast-context.test.ts +88 -0
  60. package/src/realtime-fast-context.ts +165 -0
  61. package/src/realtime-transcription.runtime.ts +4 -0
  62. package/src/realtime-voice.runtime.ts +5 -0
  63. package/src/response-generator.test.ts +321 -0
  64. package/src/response-generator.ts +213 -53
  65. package/src/response-model.test.ts +71 -0
  66. package/src/response-model.ts +23 -0
  67. package/src/runtime.test.ts +429 -0
  68. package/src/runtime.ts +270 -24
  69. package/src/telephony-audio.test.ts +61 -0
  70. package/src/telephony-audio.ts +1 -79
  71. package/src/telephony-tts.test.ts +133 -12
  72. package/src/telephony-tts.ts +155 -2
  73. package/src/test-fixtures.ts +28 -7
  74. package/src/tts-provider-voice.test.ts +34 -0
  75. package/src/tts-provider-voice.ts +21 -0
  76. package/src/tunnel.test.ts +166 -0
  77. package/src/tunnel.ts +1 -1
  78. package/src/types.ts +24 -37
  79. package/src/utils.test.ts +17 -0
  80. package/src/voice-mapping.test.ts +34 -0
  81. package/src/voice-mapping.ts +3 -2
  82. package/src/webhook/realtime-handler.test.ts +598 -0
  83. package/src/webhook/realtime-handler.ts +485 -0
  84. package/src/webhook/stale-call-reaper.test.ts +88 -0
  85. package/src/webhook/stale-call-reaper.ts +5 -0
  86. package/src/webhook/tailscale.test.ts +214 -0
  87. package/src/webhook/tailscale.ts +19 -5
  88. package/src/webhook-exposure.test.ts +33 -0
  89. package/src/webhook-exposure.ts +84 -0
  90. package/src/webhook-security.test.ts +172 -21
  91. package/src/webhook-security.ts +43 -29
  92. package/src/webhook.hangup-once.lifecycle.test.ts +135 -0
  93. package/src/webhook.test.ts +1145 -27
  94. package/src/webhook.ts +523 -102
  95. package/src/webhook.types.ts +5 -0
  96. package/src/websocket-test-support.ts +72 -0
  97. package/tsconfig.json +16 -0
  98. package/CHANGELOG.md +0 -121
  99. package/src/providers/index.ts +0 -10
  100. package/src/providers/stt-openai-realtime.test.ts +0 -42
  101. package/src/providers/stt-openai-realtime.ts +0 -311
  102. package/src/providers/tts-openai.test.ts +0 -43
  103. package/src/providers/tts-openai.ts +0 -221
package/src/webhook.ts CHANGED
@@ -1,29 +1,82 @@
1
1
  import http from "node:http";
2
2
  import { URL } from "node:url";
3
+ import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
4
+ import { resolveConfiguredCapabilityProvider } from "openclaw/plugin-sdk/provider-selection-runtime";
5
+ import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
6
+ import {
7
+ createWebhookInFlightLimiter,
8
+ WEBHOOK_BODY_READ_DEFAULTS,
9
+ } from "openclaw/plugin-sdk/webhook-ingress";
3
10
  import {
4
11
  isRequestBodyLimitError,
5
12
  readRequestBodyWithLimit,
6
13
  requestBodyErrorToText,
7
- } from "openclaw/plugin-sdk/voice-call";
8
- import { normalizeVoiceCallConfig, type VoiceCallConfig } from "./config.js";
9
- import type { CoreConfig } from "./core-bridge.js";
14
+ } from "../api.js";
15
+ import { isAllowlistedCaller, normalizePhoneNumber } from "./allowlist.js";
16
+ import {
17
+ normalizeVoiceCallConfig,
18
+ resolveVoiceCallEffectiveConfig,
19
+ type VoiceCallConfig,
20
+ } from "./config.js";
21
+ import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js";
22
+ import { getHeader } from "./http-headers.js";
10
23
  import type { CallManager } from "./manager.js";
11
24
  import type { MediaStreamConfig } from "./media-stream.js";
12
25
  import { MediaStreamHandler } from "./media-stream.js";
13
26
  import type { VoiceCallProvider } from "./providers/base.js";
14
- import { OpenAIRealtimeSTTProvider } from "./providers/stt-openai-realtime.js";
27
+ import { isProviderStatusTerminal } from "./providers/shared/call-status.js";
15
28
  import type { TwilioProvider } from "./providers/twilio.js";
16
- import type { NormalizedEvent, WebhookContext } from "./types.js";
29
+ import type { CallRecord, NormalizedEvent, WebhookContext } from "./types.js";
30
+ import type { WebhookResponsePayload } from "./webhook.types.js";
31
+ import type { RealtimeCallHandler } from "./webhook/realtime-handler.js";
17
32
  import { startStaleCallReaper } from "./webhook/stale-call-reaper.js";
18
33
 
19
- const MAX_WEBHOOK_BODY_BYTES = 1024 * 1024;
20
-
21
- type WebhookResponsePayload = {
22
- statusCode: number;
23
- body: string;
24
- headers?: Record<string, string>;
34
+ const MAX_WEBHOOK_BODY_BYTES = WEBHOOK_BODY_READ_DEFAULTS.preAuth.maxBytes;
35
+ const WEBHOOK_BODY_TIMEOUT_MS = WEBHOOK_BODY_READ_DEFAULTS.preAuth.timeoutMs;
36
+ const MISSING_REMOTE_ADDRESS_IN_FLIGHT_KEY = "__voice_call_no_remote__";
37
+ const STREAM_DISCONNECT_HANGUP_GRACE_MS = 2000;
38
+ const TRANSCRIPT_LOG_MAX_CHARS = 200;
39
+
40
+ type RealtimeTranscriptionRuntime = typeof import("./realtime-transcription.runtime.js");
41
+ type ResponseGeneratorModule = typeof import("./response-generator.js");
42
+ type Logger = {
43
+ info: (message: string) => void;
44
+ warn: (message: string) => void;
45
+ error: (message: string) => void;
46
+ debug?: (message: string) => void;
25
47
  };
26
48
 
49
+ let realtimeTranscriptionRuntimePromise: Promise<RealtimeTranscriptionRuntime> | undefined;
50
+ let responseGeneratorModulePromise: Promise<ResponseGeneratorModule> | undefined;
51
+
52
+ function loadRealtimeTranscriptionRuntime(): Promise<RealtimeTranscriptionRuntime> {
53
+ realtimeTranscriptionRuntimePromise ??= import("./realtime-transcription.runtime.js");
54
+ return realtimeTranscriptionRuntimePromise;
55
+ }
56
+
57
+ function loadResponseGeneratorModule(): Promise<ResponseGeneratorModule> {
58
+ responseGeneratorModulePromise ??= import("./response-generator.js");
59
+ return responseGeneratorModulePromise;
60
+ }
61
+
62
+ type WebhookHeaderGateResult =
63
+ | { ok: true }
64
+ | {
65
+ ok: false;
66
+ reason: string;
67
+ };
68
+
69
+ function sanitizeTranscriptForLog(value: string): string {
70
+ const sanitized = value
71
+ .replace(/\p{Cc}/gu, " ")
72
+ .replace(/\s+/g, " ")
73
+ .trim();
74
+ if (sanitized.length <= TRANSCRIPT_LOG_MAX_CHARS) {
75
+ return sanitized;
76
+ }
77
+ return `${sanitized.slice(0, TRANSCRIPT_LOG_MAX_CHARS)}...`;
78
+ }
79
+
27
80
  function buildRequestUrl(
28
81
  requestUrl: string | undefined,
29
82
  requestHost: string | undefined,
@@ -32,6 +85,55 @@ function buildRequestUrl(
32
85
  return new URL(requestUrl ?? "/", `http://${requestHost ?? fallbackHost}`);
33
86
  }
34
87
 
88
+ function normalizeProxyIp(value: string | undefined): string | undefined {
89
+ const trimmed = value?.trim();
90
+ if (!trimmed) {
91
+ return undefined;
92
+ }
93
+ const unwrapped =
94
+ trimmed.startsWith("[") && trimmed.endsWith("]") ? trimmed.slice(1, -1) : trimmed;
95
+ const normalized = unwrapped.toLowerCase();
96
+ const mappedIpv4Prefix = "::ffff:";
97
+ if (normalized.startsWith(mappedIpv4Prefix)) {
98
+ const mappedIpv4 = normalized.slice(mappedIpv4Prefix.length);
99
+ if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(mappedIpv4)) {
100
+ return mappedIpv4;
101
+ }
102
+ }
103
+ return normalized;
104
+ }
105
+
106
+ function resolveForwardedClientIp(
107
+ request: http.IncomingMessage,
108
+ trustedProxyIPs: readonly string[],
109
+ ): string | undefined {
110
+ const normalizedTrustedProxyIps = new Set(
111
+ trustedProxyIPs.map((ip) => normalizeProxyIp(ip)).filter((ip): ip is string => Boolean(ip)),
112
+ );
113
+ const forwardedFor = getHeader(request.headers, "x-forwarded-for");
114
+ if (forwardedFor) {
115
+ const forwardedIps = forwardedFor
116
+ .split(",")
117
+ .map((part) => part.trim())
118
+ .filter(Boolean);
119
+ if (forwardedIps.length > 0) {
120
+ if (normalizedTrustedProxyIps.size === 0) {
121
+ return forwardedIps[0];
122
+ }
123
+ for (let index = forwardedIps.length - 1; index >= 0; index -= 1) {
124
+ const hop = forwardedIps[index];
125
+ if (!normalizedTrustedProxyIps.has(normalizeProxyIp(hop) ?? "")) {
126
+ return hop;
127
+ }
128
+ }
129
+ return forwardedIps[0];
130
+ }
131
+ }
132
+
133
+ const realIp = getHeader(request.headers, "x-real-ip")?.trim();
134
+ return realIp || undefined;
135
+ }
136
+
35
137
  function normalizeWebhookResponse(parsed: {
36
138
  statusCode?: number;
37
139
  providerResponseHeaders?: Record<string, string>;
@@ -44,6 +146,14 @@ function normalizeWebhookResponse(parsed: {
44
146
  };
45
147
  }
46
148
 
149
+ function buildRealtimeRejectedTwiML(): WebhookResponsePayload {
150
+ return {
151
+ statusCode: 200,
152
+ headers: { "Content-Type": "text/xml" },
153
+ body: '<?xml version="1.0" encoding="UTF-8"?><Response><Reject reason="rejected" /></Response>',
154
+ };
155
+ }
156
+
47
157
  /**
48
158
  * HTTP server for receiving voice call webhooks from providers.
49
159
  * Supports WebSocket upgrades for media streams when streaming is enabled.
@@ -51,30 +161,45 @@ function normalizeWebhookResponse(parsed: {
51
161
  export class VoiceCallWebhookServer {
52
162
  private server: http.Server | null = null;
53
163
  private listeningUrl: string | null = null;
164
+ private startPromise: Promise<string> | null = null;
54
165
  private config: VoiceCallConfig;
55
166
  private manager: CallManager;
56
167
  private provider: VoiceCallProvider;
57
168
  private coreConfig: CoreConfig | null;
169
+ private fullConfig: OpenClawConfig | null;
170
+ private agentRuntime: CoreAgentDeps | null;
171
+ private logger: Logger;
58
172
  private stopStaleCallReaper: (() => void) | null = null;
173
+ private readonly webhookInFlightLimiter = createWebhookInFlightLimiter();
59
174
 
60
175
  /** Media stream handler for bidirectional audio (when streaming enabled) */
61
176
  private mediaStreamHandler: MediaStreamHandler | null = null;
177
+ /** Delayed auto-hangup timers keyed by provider call ID after stream disconnect. */
178
+ private pendingDisconnectHangups = new Map<string, ReturnType<typeof setTimeout>>();
179
+ /** Realtime voice handler for duplex provider bridges. */
180
+ private realtimeHandler: RealtimeCallHandler | null = null;
62
181
 
63
182
  constructor(
64
183
  config: VoiceCallConfig,
65
184
  manager: CallManager,
66
185
  provider: VoiceCallProvider,
67
186
  coreConfig?: CoreConfig,
187
+ fullConfig?: OpenClawConfig,
188
+ agentRuntime?: CoreAgentDeps,
189
+ logger?: Logger,
68
190
  ) {
69
191
  this.config = normalizeVoiceCallConfig(config);
70
192
  this.manager = manager;
71
193
  this.provider = provider;
72
194
  this.coreConfig = coreConfig ?? null;
73
-
74
- // Initialize media stream handler if streaming is enabled
75
- if (this.config.streaming.enabled) {
76
- this.initializeMediaStreaming();
77
- }
195
+ this.fullConfig = fullConfig ?? null;
196
+ this.agentRuntime = agentRuntime ?? null;
197
+ this.logger = logger ?? {
198
+ info: console.log,
199
+ warn: console.warn,
200
+ error: console.error,
201
+ debug: console.debug,
202
+ };
78
203
  }
79
204
 
80
205
  /**
@@ -84,31 +209,125 @@ export class VoiceCallWebhookServer {
84
209
  return this.mediaStreamHandler;
85
210
  }
86
211
 
87
- /**
88
- * Initialize media streaming with OpenAI Realtime STT.
89
- */
90
- private initializeMediaStreaming(): void {
91
- const streaming = this.config.streaming;
92
- const apiKey = streaming.openaiApiKey ?? process.env.OPENAI_API_KEY;
212
+ getRealtimeHandler(): RealtimeCallHandler | null {
213
+ return this.realtimeHandler;
214
+ }
215
+
216
+ speakRealtime(callId: string, instructions: string): { success: boolean; error?: string } {
217
+ if (!this.realtimeHandler) {
218
+ return { success: false, error: "Realtime voice handler is not configured" };
219
+ }
220
+ return this.realtimeHandler.speak(callId, instructions);
221
+ }
93
222
 
94
- if (!apiKey) {
95
- console.warn("[voice-call] Streaming enabled but no OpenAI API key found");
223
+ setRealtimeHandler(handler: RealtimeCallHandler): void {
224
+ this.realtimeHandler = handler;
225
+ }
226
+
227
+ private clearPendingDisconnectHangup(providerCallId: string): void {
228
+ const existing = this.pendingDisconnectHangups.get(providerCallId);
229
+ if (!existing) {
96
230
  return;
97
231
  }
232
+ clearTimeout(existing);
233
+ this.pendingDisconnectHangups.delete(providerCallId);
234
+ }
98
235
 
99
- const sttProvider = new OpenAIRealtimeSTTProvider({
100
- apiKey,
101
- model: streaming.sttModel,
102
- silenceDurationMs: streaming.silenceDurationMs,
103
- vadThreshold: streaming.vadThreshold,
236
+ private resolveMediaStreamClientIp(request: http.IncomingMessage): string | undefined {
237
+ const remoteIp = request.socket.remoteAddress ?? undefined;
238
+ const trustedProxyIPs = this.config.webhookSecurity.trustedProxyIPs.filter(Boolean);
239
+ const normalizedTrustedProxyIps = new Set(
240
+ trustedProxyIPs.map((ip) => normalizeProxyIp(ip)).filter((ip): ip is string => Boolean(ip)),
241
+ );
242
+ const normalizedRemoteIp = normalizeProxyIp(remoteIp);
243
+ const fromTrustedProxy =
244
+ normalizedTrustedProxyIps.size > 0 &&
245
+ normalizedRemoteIp !== undefined &&
246
+ normalizedTrustedProxyIps.has(normalizedRemoteIp);
247
+ const shouldTrustForwardingHeaders =
248
+ this.config.webhookSecurity.trustForwardingHeaders && fromTrustedProxy;
249
+
250
+ if (shouldTrustForwardingHeaders) {
251
+ const forwardedIp = resolveForwardedClientIp(request, trustedProxyIPs);
252
+ if (forwardedIp) {
253
+ return forwardedIp;
254
+ }
255
+ }
256
+
257
+ return remoteIp;
258
+ }
259
+
260
+ private shouldSuppressBargeInForInitialMessage(call: CallRecord | undefined): boolean {
261
+ if (!call || call.direction !== "outbound") {
262
+ return false;
263
+ }
264
+
265
+ // Suppress only while the initial greeting is actively being played.
266
+ // If playback fails and the call leaves "speaking", do not block auto-response.
267
+ if (call.state !== "speaking") {
268
+ return false;
269
+ }
270
+
271
+ const mode = (call.metadata?.mode as string | undefined) ?? "conversation";
272
+ if (mode !== "conversation") {
273
+ return false;
274
+ }
275
+
276
+ const initialMessage = normalizeOptionalString(call.metadata?.initialMessage) ?? "";
277
+ return initialMessage.length > 0;
278
+ }
279
+
280
+ /**
281
+ * Initialize media streaming with the selected realtime transcription provider.
282
+ */
283
+ private async initializeMediaStreaming(): Promise<void> {
284
+ const streaming = this.config.streaming;
285
+ const pluginConfig =
286
+ this.fullConfig ?? (this.coreConfig as unknown as OpenClawConfig | undefined);
287
+ const { getRealtimeTranscriptionProvider, listRealtimeTranscriptionProviders } =
288
+ await loadRealtimeTranscriptionRuntime();
289
+ const resolution = resolveConfiguredCapabilityProvider({
290
+ configuredProviderId: streaming.provider,
291
+ providerConfigs: streaming.providers,
292
+ cfg: pluginConfig,
293
+ cfgForResolve: pluginConfig ?? ({} as OpenClawConfig),
294
+ getConfiguredProvider: (providerId) =>
295
+ getRealtimeTranscriptionProvider(providerId, pluginConfig),
296
+ listProviders: () => listRealtimeTranscriptionProviders(pluginConfig),
297
+ resolveProviderConfig: ({ provider, cfg, rawConfig }) =>
298
+ provider.resolveConfig?.({ cfg, rawConfig }) ?? rawConfig,
299
+ isProviderConfigured: ({ provider, cfg, providerConfig }) =>
300
+ provider.isConfigured({ cfg, providerConfig }),
104
301
  });
302
+ if (!resolution.ok && resolution.code === "missing-configured-provider") {
303
+ console.warn(
304
+ `[voice-call] Streaming enabled but realtime transcription provider "${resolution.configuredProviderId}" is not registered`,
305
+ );
306
+ return;
307
+ }
308
+ if (!resolution.ok && resolution.code === "no-registered-provider") {
309
+ console.warn(
310
+ "[voice-call] Streaming enabled but no realtime transcription provider is registered",
311
+ );
312
+ return;
313
+ }
314
+ if (!resolution.ok) {
315
+ console.warn(
316
+ `[voice-call] Streaming enabled but provider "${resolution.provider?.id}" is not configured`,
317
+ );
318
+ return;
319
+ }
320
+ const provider = resolution.provider;
321
+ const providerConfig = resolution.providerConfig;
105
322
 
106
323
  const streamConfig: MediaStreamConfig = {
107
- sttProvider,
324
+ transcriptionProvider: provider,
325
+ providerConfig,
108
326
  preStartTimeoutMs: streaming.preStartTimeoutMs,
109
327
  maxPendingConnections: streaming.maxPendingConnections,
110
328
  maxPendingConnectionsPerIp: streaming.maxPendingConnectionsPerIp,
111
329
  maxConnections: streaming.maxConnections,
330
+ resolveClientIp: (request) => this.resolveMediaStreamClientIp(request),
112
331
  shouldAcceptStream: ({ callId, token }) => {
113
332
  const call = this.manager.getCallByProviderCallId(callId);
114
333
  if (!call) {
@@ -124,19 +343,27 @@ export class VoiceCallWebhookServer {
124
343
  return true;
125
344
  },
126
345
  onTranscript: (providerCallId, transcript) => {
127
- console.log(`[voice-call] Transcript for ${providerCallId}: ${transcript}`);
128
-
129
- // Clear TTS queue on barge-in (user started speaking, interrupt current playback)
130
- if (this.provider.name === "twilio") {
131
- (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
132
- }
133
-
134
- // Look up our internal call ID from the provider call ID
346
+ const safeTranscript = sanitizeTranscriptForLog(transcript);
347
+ console.log(
348
+ `[voice-call] Transcript for ${providerCallId}: ${safeTranscript} (chars=${transcript.length})`,
349
+ );
135
350
  const call = this.manager.getCallByProviderCallId(providerCallId);
136
351
  if (!call) {
137
352
  console.warn(`[voice-call] No active call found for provider ID: ${providerCallId}`);
138
353
  return;
139
354
  }
355
+ const suppressBargeIn = this.shouldSuppressBargeInForInitialMessage(call);
356
+ if (suppressBargeIn) {
357
+ console.log(
358
+ `[voice-call] Ignoring barge transcript while initial message is still playing (${providerCallId})`,
359
+ );
360
+ return;
361
+ }
362
+
363
+ // Clear TTS queue on barge-in (user started speaking, interrupt current playback)
364
+ if (this.provider.name === "twilio") {
365
+ (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
366
+ }
140
367
 
141
368
  // Create a speech event and process it through the manager
142
369
  const event: NormalizedEvent = {
@@ -160,44 +387,63 @@ export class VoiceCallWebhookServer {
160
387
  }
161
388
  },
162
389
  onSpeechStart: (providerCallId) => {
163
- if (this.provider.name === "twilio") {
164
- (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
390
+ if (this.provider.name !== "twilio") {
391
+ return;
392
+ }
393
+ const call = this.manager.getCallByProviderCallId(providerCallId);
394
+ if (this.shouldSuppressBargeInForInitialMessage(call)) {
395
+ return;
165
396
  }
397
+ (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
166
398
  },
167
399
  onPartialTranscript: (callId, partial) => {
168
- console.log(`[voice-call] Partial for ${callId}: ${partial}`);
400
+ const safePartial = sanitizeTranscriptForLog(partial);
401
+ console.log(`[voice-call] Partial for ${callId}: ${safePartial} (chars=${partial.length})`);
169
402
  },
170
403
  onConnect: (callId, streamSid) => {
171
404
  console.log(`[voice-call] Media stream connected: ${callId} -> ${streamSid}`);
405
+ this.clearPendingDisconnectHangup(callId);
406
+
172
407
  // Register stream with provider for TTS routing
173
408
  if (this.provider.name === "twilio") {
174
409
  (this.provider as TwilioProvider).registerCallStream(callId, streamSid);
175
410
  }
176
-
177
- // Speak initial message if one was provided when call was initiated
178
- // Use setTimeout to allow stream setup to complete
179
- setTimeout(() => {
180
- this.manager.speakInitialMessage(callId).catch((err) => {
181
- console.warn(`[voice-call] Failed to speak initial message:`, err);
182
- });
183
- }, 500);
184
411
  },
185
- onDisconnect: (callId) => {
186
- console.log(`[voice-call] Media stream disconnected: ${callId}`);
187
- // Auto-end call when media stream disconnects to prevent stuck calls.
188
- // Without this, calls can remain active indefinitely after the stream closes.
189
- const disconnectedCall = this.manager.getCallByProviderCallId(callId);
190
- if (disconnectedCall) {
412
+ onTranscriptionReady: (callId) => {
413
+ this.manager.speakInitialMessage(callId).catch((err) => {
414
+ console.warn(`[voice-call] Failed to speak initial message:`, err);
415
+ });
416
+ },
417
+ onDisconnect: (callId, streamSid) => {
418
+ console.log(`[voice-call] Media stream disconnected: ${callId} (${streamSid})`);
419
+ if (this.provider.name === "twilio") {
420
+ (this.provider as TwilioProvider).unregisterCallStream(callId, streamSid);
421
+ }
422
+
423
+ this.clearPendingDisconnectHangup(callId);
424
+ const timer = setTimeout(() => {
425
+ this.pendingDisconnectHangups.delete(callId);
426
+ const disconnectedCall = this.manager.getCallByProviderCallId(callId);
427
+ if (!disconnectedCall) {
428
+ return;
429
+ }
430
+
431
+ if (this.provider.name === "twilio") {
432
+ const twilio = this.provider as TwilioProvider;
433
+ if (twilio.hasRegisteredStream(callId)) {
434
+ return;
435
+ }
436
+ }
437
+
191
438
  console.log(
192
- `[voice-call] Auto-ending call ${disconnectedCall.callId} on stream disconnect`,
439
+ `[voice-call] Auto-ending call ${disconnectedCall.callId} after stream disconnect grace`,
193
440
  );
194
441
  void this.manager.endCall(disconnectedCall.callId).catch((err) => {
195
442
  console.warn(`[voice-call] Failed to auto-end call ${disconnectedCall.callId}:`, err);
196
443
  });
197
- }
198
- if (this.provider.name === "twilio") {
199
- (this.provider as TwilioProvider).unregisterCallStream(callId);
200
- }
444
+ }, STREAM_DISCONNECT_HANGUP_GRACE_MS);
445
+ timer.unref?.();
446
+ this.pendingDisconnectHangups.set(callId, timer);
201
447
  },
202
448
  };
203
449
 
@@ -220,7 +466,15 @@ export class VoiceCallWebhookServer {
220
466
  return this.listeningUrl ?? this.resolveListeningUrl(bind, webhookPath);
221
467
  }
222
468
 
223
- return new Promise((resolve, reject) => {
469
+ if (this.config.streaming.enabled && !this.mediaStreamHandler) {
470
+ await this.initializeMediaStreaming();
471
+ }
472
+
473
+ if (this.startPromise) {
474
+ return this.startPromise;
475
+ }
476
+
477
+ this.startPromise = new Promise((resolve, reject) => {
224
478
  this.server = http.createServer((req, res) => {
225
479
  this.handleRequest(req, res, webhookPath).catch((err) => {
226
480
  console.error("[voice-call] Webhook error:", err);
@@ -229,12 +483,15 @@ export class VoiceCallWebhookServer {
229
483
  });
230
484
  });
231
485
 
232
- // Handle WebSocket upgrades for media streams
233
- if (this.mediaStreamHandler) {
486
+ // Handle WebSocket upgrades for realtime voice and media streams.
487
+ if (this.realtimeHandler || this.mediaStreamHandler) {
234
488
  this.server.on("upgrade", (request, socket, head) => {
489
+ if (this.realtimeHandler && this.isRealtimeWebSocketUpgrade(request)) {
490
+ this.realtimeHandler.handleWebSocketUpgrade(request, socket, head);
491
+ return;
492
+ }
235
493
  const path = this.getUpgradePathname(request);
236
- if (path === streamPath) {
237
- console.log("[voice-call] WebSocket upgrade for media stream");
494
+ if (path === streamPath && this.mediaStreamHandler) {
238
495
  this.mediaStreamHandler?.handleUpgrade(request, socket, head);
239
496
  } else {
240
497
  socket.destroy();
@@ -242,17 +499,23 @@ export class VoiceCallWebhookServer {
242
499
  });
243
500
  }
244
501
 
245
- this.server.on("error", reject);
502
+ this.server.on("error", (err) => {
503
+ this.server = null;
504
+ this.listeningUrl = null;
505
+ this.startPromise = null;
506
+ reject(err);
507
+ });
246
508
 
247
509
  this.server.listen(port, bind, () => {
248
510
  const url = this.resolveListeningUrl(bind, webhookPath);
249
511
  this.listeningUrl = url;
250
- console.log(`[voice-call] Webhook server listening on ${url}`);
512
+ this.startPromise = null;
513
+ this.logger.info(`[voice-call] Webhook server listening on ${url}`);
251
514
  if (this.mediaStreamHandler) {
252
515
  const address = this.server?.address();
253
516
  const actualPort =
254
517
  address && typeof address === "object" ? address.port : this.config.serve.port;
255
- console.log(
518
+ this.logger.info(
256
519
  `[voice-call] Media stream WebSocket on ws://${bind}:${actualPort}${streamPath}`,
257
520
  );
258
521
  }
@@ -265,12 +528,21 @@ export class VoiceCallWebhookServer {
265
528
  });
266
529
  });
267
530
  });
531
+
532
+ return this.startPromise;
268
533
  }
269
534
 
270
535
  /**
271
536
  * Stop the webhook server.
272
537
  */
273
538
  async stop(): Promise<void> {
539
+ for (const timer of this.pendingDisconnectHangups.values()) {
540
+ clearTimeout(timer);
541
+ }
542
+ this.pendingDisconnectHangups.clear();
543
+ this.webhookInFlightLimiter.clear();
544
+ this.startPromise = null;
545
+
274
546
  if (this.stopStaleCallReaper) {
275
547
  this.stopStaleCallReaper();
276
548
  this.stopStaleCallReaper = null;
@@ -364,49 +636,189 @@ export class VoiceCallWebhookServer {
364
636
  return { statusCode: 405, body: "Method Not Allowed" };
365
637
  }
366
638
 
367
- let body = "";
639
+ const headerGate = this.verifyPreAuthWebhookHeaders(req.headers);
640
+ if (!headerGate.ok) {
641
+ console.warn(`[voice-call] Webhook rejected before body read: ${headerGate.reason}`);
642
+ return { statusCode: 401, body: "Unauthorized" };
643
+ }
644
+
645
+ // createWebhookInFlightLimiter intentionally treats an empty key as fail-open.
646
+ // Missing socket metadata must still share one bucket instead of bypassing
647
+ // the pre-auth limiter entirely.
648
+ const remoteAddress = req.socket.remoteAddress;
649
+ if (!remoteAddress) {
650
+ console.warn(
651
+ `[voice-call] Webhook accepted with no remote address; using shared fallback in-flight key`,
652
+ );
653
+ }
654
+ const inFlightKey = remoteAddress || MISSING_REMOTE_ADDRESS_IN_FLIGHT_KEY;
655
+ if (!this.webhookInFlightLimiter.tryAcquire(inFlightKey)) {
656
+ console.warn(`[voice-call] Webhook rejected before body read: too many in-flight requests`);
657
+ return { statusCode: 429, body: "Too Many Requests" };
658
+ }
659
+
368
660
  try {
369
- body = await this.readBody(req, MAX_WEBHOOK_BODY_BYTES);
370
- } catch (err) {
371
- if (isRequestBodyLimitError(err, "PAYLOAD_TOO_LARGE")) {
372
- return { statusCode: 413, body: "Payload Too Large" };
661
+ let body = "";
662
+ try {
663
+ body = await this.readBody(req, MAX_WEBHOOK_BODY_BYTES, WEBHOOK_BODY_TIMEOUT_MS);
664
+ } catch (err) {
665
+ if (isRequestBodyLimitError(err, "PAYLOAD_TOO_LARGE")) {
666
+ return { statusCode: 413, body: "Payload Too Large" };
667
+ }
668
+ if (isRequestBodyLimitError(err, "REQUEST_BODY_TIMEOUT")) {
669
+ return { statusCode: 408, body: requestBodyErrorToText("REQUEST_BODY_TIMEOUT") };
670
+ }
671
+ throw err;
672
+ }
673
+
674
+ const ctx: WebhookContext = {
675
+ headers: req.headers as Record<string, string | string[] | undefined>,
676
+ rawBody: body,
677
+ url: url.toString(),
678
+ method: "POST",
679
+ query: Object.fromEntries(url.searchParams),
680
+ remoteAddress: req.socket.remoteAddress ?? undefined,
681
+ };
682
+
683
+ const verification = this.provider.verifyWebhook(ctx);
684
+ if (!verification.ok) {
685
+ console.warn(`[voice-call] Webhook verification failed: ${verification.reason}`);
686
+ return { statusCode: 401, body: "Unauthorized" };
373
687
  }
374
- if (isRequestBodyLimitError(err, "REQUEST_BODY_TIMEOUT")) {
375
- return { statusCode: 408, body: requestBodyErrorToText("REQUEST_BODY_TIMEOUT") };
688
+ if (!verification.verifiedRequestKey) {
689
+ console.warn("[voice-call] Webhook verification succeeded without request identity key");
690
+ return { statusCode: 401, body: "Unauthorized" };
376
691
  }
377
- throw err;
692
+
693
+ const initialTwiML = this.provider.consumeInitialTwiML?.(ctx);
694
+ if (initialTwiML !== undefined && initialTwiML !== null) {
695
+ const params = new URLSearchParams(ctx.rawBody);
696
+ console.log(
697
+ `[voice-call] Serving provider initial TwiML before realtime handling (callSid=${params.get("CallSid") ?? "unknown"}, direction=${params.get("Direction") ?? "unknown"})`,
698
+ );
699
+ return {
700
+ statusCode: 200,
701
+ headers: { "Content-Type": "application/xml" },
702
+ body: initialTwiML,
703
+ };
704
+ }
705
+
706
+ const realtimeParams = this.getRealtimeTwimlParams(ctx);
707
+ if (realtimeParams) {
708
+ const direction = realtimeParams.get("Direction");
709
+ const isInboundRealtimeRequest = !direction || direction === "inbound";
710
+ if (isInboundRealtimeRequest && !this.shouldAcceptRealtimeInboundRequest(realtimeParams)) {
711
+ console.log("[voice-call] Realtime inbound call rejected before stream setup");
712
+ return buildRealtimeRejectedTwiML();
713
+ }
714
+ console.log(
715
+ `[voice-call] Serving realtime TwiML for Twilio call ${realtimeParams.get("CallSid") ?? "unknown"} (direction=${direction ?? "unknown"})`,
716
+ );
717
+ return this.realtimeHandler!.buildTwiMLPayload(req, realtimeParams);
718
+ }
719
+
720
+ const parsed = this.provider.parseWebhookEvent(ctx, {
721
+ verifiedRequestKey: verification.verifiedRequestKey,
722
+ });
723
+
724
+ if (verification.isReplay) {
725
+ console.warn("[voice-call] Replay detected; skipping event side effects");
726
+ } else {
727
+ this.processParsedEvents(parsed.events);
728
+ }
729
+
730
+ return normalizeWebhookResponse(parsed);
731
+ } finally {
732
+ this.webhookInFlightLimiter.release(inFlightKey);
378
733
  }
734
+ }
379
735
 
380
- const ctx: WebhookContext = {
381
- headers: req.headers as Record<string, string | string[] | undefined>,
382
- rawBody: body,
383
- url: url.toString(),
384
- method: "POST",
385
- query: Object.fromEntries(url.searchParams),
386
- remoteAddress: req.socket.remoteAddress ?? undefined,
387
- };
736
+ private verifyPreAuthWebhookHeaders(headers: http.IncomingHttpHeaders): WebhookHeaderGateResult {
737
+ if (this.config.skipSignatureVerification) {
738
+ return { ok: true };
739
+ }
740
+ switch (this.provider.name) {
741
+ case "telnyx": {
742
+ const signature = getHeader(headers, "telnyx-signature-ed25519");
743
+ const timestamp = getHeader(headers, "telnyx-timestamp");
744
+ if (signature && timestamp) {
745
+ return { ok: true };
746
+ }
747
+ return { ok: false, reason: "missing Telnyx signature or timestamp header" };
748
+ }
749
+ case "twilio":
750
+ if (getHeader(headers, "x-twilio-signature")) {
751
+ return { ok: true };
752
+ }
753
+ return { ok: false, reason: "missing X-Twilio-Signature header" };
754
+ case "plivo": {
755
+ const hasV3 =
756
+ Boolean(getHeader(headers, "x-plivo-signature-v3")) &&
757
+ Boolean(getHeader(headers, "x-plivo-signature-v3-nonce"));
758
+ const hasV2 =
759
+ Boolean(getHeader(headers, "x-plivo-signature-v2")) &&
760
+ Boolean(getHeader(headers, "x-plivo-signature-v2-nonce"));
761
+ if (hasV3 || hasV2) {
762
+ return { ok: true };
763
+ }
764
+ return { ok: false, reason: "missing Plivo signature headers" };
765
+ }
766
+ default:
767
+ return { ok: true };
768
+ }
769
+ }
388
770
 
389
- const verification = this.provider.verifyWebhook(ctx);
390
- if (!verification.ok) {
391
- console.warn(`[voice-call] Webhook verification failed: ${verification.reason}`);
392
- return { statusCode: 401, body: "Unauthorized" };
771
+ private isRealtimeWebSocketUpgrade(req: http.IncomingMessage): boolean {
772
+ try {
773
+ const pathname = buildRequestUrl(req.url, req.headers.host).pathname;
774
+ const pattern = this.realtimeHandler?.getStreamPathPattern();
775
+ return Boolean(pattern && pathname.startsWith(pattern));
776
+ } catch {
777
+ return false;
393
778
  }
394
- if (!verification.verifiedRequestKey) {
395
- console.warn("[voice-call] Webhook verification succeeded without request identity key");
396
- return { statusCode: 401, body: "Unauthorized" };
779
+ }
780
+
781
+ private getRealtimeTwimlParams(ctx: WebhookContext): URLSearchParams | null {
782
+ if (!this.realtimeHandler || this.provider.name !== "twilio") {
783
+ return null;
397
784
  }
398
785
 
399
- const parsed = this.provider.parseWebhookEvent(ctx, {
400
- verifiedRequestKey: verification.verifiedRequestKey,
401
- });
786
+ const params = new URLSearchParams(ctx.rawBody);
787
+ const direction = params.get("Direction");
788
+ const isSupportedDirection =
789
+ !direction || direction === "inbound" || direction.startsWith("outbound");
790
+ if (!isSupportedDirection) {
791
+ return null;
792
+ }
402
793
 
403
- if (verification.isReplay) {
404
- console.warn("[voice-call] Replay detected; skipping event side effects");
405
- } else {
406
- this.processParsedEvents(parsed.events);
794
+ if (ctx.query?.type === "status") {
795
+ return null;
407
796
  }
408
797
 
409
- return normalizeWebhookResponse(parsed);
798
+ const callStatus = params.get("CallStatus");
799
+ if (callStatus && isProviderStatusTerminal(callStatus)) {
800
+ return null;
801
+ }
802
+
803
+ // Replays must return the same TwiML body so Twilio retries reconnect cleanly.
804
+ // The one-time token still changes, but the behavior stays identical.
805
+ return !params.get("SpeechResult") && !params.get("Digits") ? params : null;
806
+ }
807
+
808
+ private shouldAcceptRealtimeInboundRequest(params: URLSearchParams): boolean {
809
+ switch (this.config.inboundPolicy) {
810
+ case "open":
811
+ return true;
812
+ case "allowlist":
813
+ case "pairing":
814
+ return isAllowlistedCaller(
815
+ normalizePhoneNumber(params.get("From") ?? undefined),
816
+ this.config.allowFrom,
817
+ );
818
+ case "disabled":
819
+ default:
820
+ return false;
821
+ }
410
822
  }
411
823
 
412
824
  private processParsedEvents(events: NormalizedEvent[]): void {
@@ -435,7 +847,7 @@ export class VoiceCallWebhookServer {
435
847
  private readBody(
436
848
  req: http.IncomingMessage,
437
849
  maxBytes: number,
438
- timeoutMs = 30_000,
850
+ timeoutMs = WEBHOOK_BODY_TIMEOUT_MS,
439
851
  ): Promise<string> {
440
852
  return readRequestBodyWithLimit(req, { maxBytes, timeoutMs });
441
853
  }
@@ -458,14 +870,23 @@ export class VoiceCallWebhookServer {
458
870
  console.warn("[voice-call] Core config missing; skipping auto-response");
459
871
  return;
460
872
  }
873
+ if (!this.agentRuntime) {
874
+ console.warn("[voice-call] Agent runtime missing; skipping auto-response");
875
+ return;
876
+ }
461
877
 
462
878
  try {
463
- const { generateVoiceResponse } = await import("./response-generator.js");
879
+ const { generateVoiceResponse } = await loadResponseGeneratorModule();
880
+ const numberRouteKey =
881
+ typeof call.metadata?.numberRouteKey === "string" ? call.metadata.numberRouteKey : call.to;
882
+ const effectiveConfig = resolveVoiceCallEffectiveConfig(this.config, numberRouteKey).config;
464
883
 
465
884
  const result = await generateVoiceResponse({
466
- voiceConfig: this.config,
885
+ voiceConfig: effectiveConfig,
467
886
  coreConfig: this.coreConfig,
887
+ agentRuntime: this.agentRuntime,
468
888
  callId,
889
+ sessionKey: call.sessionKey,
469
890
  from: call.from,
470
891
  transcript: call.transcript,
471
892
  userMessage,