@openclaw/voice-call 2026.3.13 → 2026.5.1-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +25 -5
  2. package/api.ts +16 -0
  3. package/cli-metadata.ts +10 -0
  4. package/config-api.ts +12 -0
  5. package/index.test.ts +866 -0
  6. package/index.ts +353 -148
  7. package/openclaw.plugin.json +336 -157
  8. package/package.json +33 -5
  9. package/runtime-api.ts +20 -0
  10. package/runtime-entry.ts +1 -0
  11. package/setup-api.ts +47 -0
  12. package/src/allowlist.test.ts +18 -0
  13. package/src/cli.ts +533 -68
  14. package/src/config-compat.test.ts +120 -0
  15. package/src/config-compat.ts +227 -0
  16. package/src/config.test.ts +160 -12
  17. package/src/config.ts +243 -74
  18. package/src/core-bridge.ts +2 -147
  19. package/src/deep-merge.test.ts +40 -0
  20. package/src/gateway-continue-operation.ts +200 -0
  21. package/src/http-headers.ts +6 -3
  22. package/src/manager/context.ts +6 -5
  23. package/src/manager/events.test.ts +179 -19
  24. package/src/manager/events.ts +48 -30
  25. package/src/manager/lifecycle.ts +53 -0
  26. package/src/manager/lookup.test.ts +52 -0
  27. package/src/manager/outbound.test.ts +464 -0
  28. package/src/manager/outbound.ts +148 -55
  29. package/src/manager/store.ts +18 -6
  30. package/src/manager/timers.test.ts +129 -0
  31. package/src/manager/timers.ts +4 -3
  32. package/src/manager/twiml.test.ts +13 -0
  33. package/src/manager/twiml.ts +8 -0
  34. package/src/manager.closed-loop.test.ts +30 -12
  35. package/src/manager.inbound-allowlist.test.ts +77 -10
  36. package/src/manager.notify.test.ts +344 -20
  37. package/src/manager.restore.test.ts +95 -8
  38. package/src/manager.test-harness.ts +8 -6
  39. package/src/manager.ts +79 -5
  40. package/src/media-stream.test.ts +578 -81
  41. package/src/media-stream.ts +235 -54
  42. package/src/providers/base.ts +19 -0
  43. package/src/providers/mock.ts +7 -1
  44. package/src/providers/plivo.test.ts +50 -6
  45. package/src/providers/plivo.ts +14 -6
  46. package/src/providers/shared/call-status.ts +2 -1
  47. package/src/providers/shared/guarded-json-api.test.ts +106 -0
  48. package/src/providers/shared/guarded-json-api.ts +1 -1
  49. package/src/providers/telnyx.test.ts +178 -6
  50. package/src/providers/telnyx.ts +40 -3
  51. package/src/providers/twilio/api.test.ts +145 -0
  52. package/src/providers/twilio/api.ts +67 -16
  53. package/src/providers/twilio/twiml-policy.ts +6 -10
  54. package/src/providers/twilio/webhook.ts +1 -1
  55. package/src/providers/twilio.test.ts +425 -25
  56. package/src/providers/twilio.ts +230 -77
  57. package/src/providers/twilio.types.ts +17 -0
  58. package/src/realtime-defaults.ts +3 -0
  59. package/src/realtime-fast-context.test.ts +88 -0
  60. package/src/realtime-fast-context.ts +165 -0
  61. package/src/realtime-transcription.runtime.ts +4 -0
  62. package/src/realtime-voice.runtime.ts +5 -0
  63. package/src/response-generator.test.ts +277 -0
  64. package/src/response-generator.ts +186 -40
  65. package/src/response-model.test.ts +71 -0
  66. package/src/response-model.ts +23 -0
  67. package/src/runtime.test.ts +351 -0
  68. package/src/runtime.ts +254 -24
  69. package/src/telephony-audio.test.ts +61 -0
  70. package/src/telephony-audio.ts +1 -79
  71. package/src/telephony-tts.test.ts +133 -12
  72. package/src/telephony-tts.ts +155 -2
  73. package/src/test-fixtures.ts +26 -7
  74. package/src/tts-provider-voice.test.ts +34 -0
  75. package/src/tts-provider-voice.ts +21 -0
  76. package/src/tunnel.test.ts +166 -0
  77. package/src/tunnel.ts +1 -1
  78. package/src/types.ts +24 -37
  79. package/src/utils.test.ts +17 -0
  80. package/src/voice-mapping.test.ts +34 -0
  81. package/src/voice-mapping.ts +3 -2
  82. package/src/webhook/realtime-handler.test.ts +598 -0
  83. package/src/webhook/realtime-handler.ts +485 -0
  84. package/src/webhook/stale-call-reaper.test.ts +88 -0
  85. package/src/webhook/stale-call-reaper.ts +5 -0
  86. package/src/webhook/tailscale.test.ts +214 -0
  87. package/src/webhook/tailscale.ts +19 -5
  88. package/src/webhook-exposure.test.ts +33 -0
  89. package/src/webhook-exposure.ts +84 -0
  90. package/src/webhook-security.test.ts +172 -21
  91. package/src/webhook-security.ts +43 -29
  92. package/src/webhook.hangup-once.lifecycle.test.ts +135 -0
  93. package/src/webhook.test.ts +1145 -27
  94. package/src/webhook.ts +513 -100
  95. package/src/webhook.types.ts +5 -0
  96. package/src/websocket-test-support.ts +72 -0
  97. package/tsconfig.json +16 -0
  98. package/CHANGELOG.md +0 -121
  99. package/src/providers/index.ts +0 -10
  100. package/src/providers/stt-openai-realtime.test.ts +0 -42
  101. package/src/providers/stt-openai-realtime.ts +0 -311
  102. package/src/providers/tts-openai.test.ts +0 -43
  103. package/src/providers/tts-openai.ts +0 -221
package/src/webhook.ts CHANGED
@@ -1,29 +1,78 @@
1
1
  import http from "node:http";
2
2
  import { URL } from "node:url";
3
+ import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
4
+ import { resolveConfiguredCapabilityProvider } from "openclaw/plugin-sdk/provider-selection-runtime";
5
+ import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
6
+ import {
7
+ createWebhookInFlightLimiter,
8
+ WEBHOOK_BODY_READ_DEFAULTS,
9
+ } from "openclaw/plugin-sdk/webhook-ingress";
3
10
  import {
4
11
  isRequestBodyLimitError,
5
12
  readRequestBodyWithLimit,
6
13
  requestBodyErrorToText,
7
- } from "openclaw/plugin-sdk/voice-call";
14
+ } from "../api.js";
15
+ import { isAllowlistedCaller, normalizePhoneNumber } from "./allowlist.js";
8
16
  import { normalizeVoiceCallConfig, type VoiceCallConfig } from "./config.js";
9
- import type { CoreConfig } from "./core-bridge.js";
17
+ import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js";
18
+ import { getHeader } from "./http-headers.js";
10
19
  import type { CallManager } from "./manager.js";
11
20
  import type { MediaStreamConfig } from "./media-stream.js";
12
21
  import { MediaStreamHandler } from "./media-stream.js";
13
22
  import type { VoiceCallProvider } from "./providers/base.js";
14
- import { OpenAIRealtimeSTTProvider } from "./providers/stt-openai-realtime.js";
23
+ import { isProviderStatusTerminal } from "./providers/shared/call-status.js";
15
24
  import type { TwilioProvider } from "./providers/twilio.js";
16
- import type { NormalizedEvent, WebhookContext } from "./types.js";
25
+ import type { CallRecord, NormalizedEvent, WebhookContext } from "./types.js";
26
+ import type { WebhookResponsePayload } from "./webhook.types.js";
27
+ import type { RealtimeCallHandler } from "./webhook/realtime-handler.js";
17
28
  import { startStaleCallReaper } from "./webhook/stale-call-reaper.js";
18
29
 
19
- const MAX_WEBHOOK_BODY_BYTES = 1024 * 1024;
20
-
21
- type WebhookResponsePayload = {
22
- statusCode: number;
23
- body: string;
24
- headers?: Record<string, string>;
30
+ const MAX_WEBHOOK_BODY_BYTES = WEBHOOK_BODY_READ_DEFAULTS.preAuth.maxBytes;
31
+ const WEBHOOK_BODY_TIMEOUT_MS = WEBHOOK_BODY_READ_DEFAULTS.preAuth.timeoutMs;
32
+ const MISSING_REMOTE_ADDRESS_IN_FLIGHT_KEY = "__voice_call_no_remote__";
33
+ const STREAM_DISCONNECT_HANGUP_GRACE_MS = 2000;
34
+ const TRANSCRIPT_LOG_MAX_CHARS = 200;
35
+
36
+ type RealtimeTranscriptionRuntime = typeof import("./realtime-transcription.runtime.js");
37
+ type ResponseGeneratorModule = typeof import("./response-generator.js");
38
+ type Logger = {
39
+ info: (message: string) => void;
40
+ warn: (message: string) => void;
41
+ error: (message: string) => void;
42
+ debug?: (message: string) => void;
25
43
  };
26
44
 
45
+ let realtimeTranscriptionRuntimePromise: Promise<RealtimeTranscriptionRuntime> | undefined;
46
+ let responseGeneratorModulePromise: Promise<ResponseGeneratorModule> | undefined;
47
+
48
+ function loadRealtimeTranscriptionRuntime(): Promise<RealtimeTranscriptionRuntime> {
49
+ realtimeTranscriptionRuntimePromise ??= import("./realtime-transcription.runtime.js");
50
+ return realtimeTranscriptionRuntimePromise;
51
+ }
52
+
53
+ function loadResponseGeneratorModule(): Promise<ResponseGeneratorModule> {
54
+ responseGeneratorModulePromise ??= import("./response-generator.js");
55
+ return responseGeneratorModulePromise;
56
+ }
57
+
58
+ type WebhookHeaderGateResult =
59
+ | { ok: true }
60
+ | {
61
+ ok: false;
62
+ reason: string;
63
+ };
64
+
65
+ function sanitizeTranscriptForLog(value: string): string {
66
+ const sanitized = value
67
+ .replace(/\p{Cc}/gu, " ")
68
+ .replace(/\s+/g, " ")
69
+ .trim();
70
+ if (sanitized.length <= TRANSCRIPT_LOG_MAX_CHARS) {
71
+ return sanitized;
72
+ }
73
+ return `${sanitized.slice(0, TRANSCRIPT_LOG_MAX_CHARS)}...`;
74
+ }
75
+
27
76
  function buildRequestUrl(
28
77
  requestUrl: string | undefined,
29
78
  requestHost: string | undefined,
@@ -32,6 +81,55 @@ function buildRequestUrl(
32
81
  return new URL(requestUrl ?? "/", `http://${requestHost ?? fallbackHost}`);
33
82
  }
34
83
 
84
+ function normalizeProxyIp(value: string | undefined): string | undefined {
85
+ const trimmed = value?.trim();
86
+ if (!trimmed) {
87
+ return undefined;
88
+ }
89
+ const unwrapped =
90
+ trimmed.startsWith("[") && trimmed.endsWith("]") ? trimmed.slice(1, -1) : trimmed;
91
+ const normalized = unwrapped.toLowerCase();
92
+ const mappedIpv4Prefix = "::ffff:";
93
+ if (normalized.startsWith(mappedIpv4Prefix)) {
94
+ const mappedIpv4 = normalized.slice(mappedIpv4Prefix.length);
95
+ if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(mappedIpv4)) {
96
+ return mappedIpv4;
97
+ }
98
+ }
99
+ return normalized;
100
+ }
101
+
102
+ function resolveForwardedClientIp(
103
+ request: http.IncomingMessage,
104
+ trustedProxyIPs: readonly string[],
105
+ ): string | undefined {
106
+ const normalizedTrustedProxyIps = new Set(
107
+ trustedProxyIPs.map((ip) => normalizeProxyIp(ip)).filter((ip): ip is string => Boolean(ip)),
108
+ );
109
+ const forwardedFor = getHeader(request.headers, "x-forwarded-for");
110
+ if (forwardedFor) {
111
+ const forwardedIps = forwardedFor
112
+ .split(",")
113
+ .map((part) => part.trim())
114
+ .filter(Boolean);
115
+ if (forwardedIps.length > 0) {
116
+ if (normalizedTrustedProxyIps.size === 0) {
117
+ return forwardedIps[0];
118
+ }
119
+ for (let index = forwardedIps.length - 1; index >= 0; index -= 1) {
120
+ const hop = forwardedIps[index];
121
+ if (!normalizedTrustedProxyIps.has(normalizeProxyIp(hop) ?? "")) {
122
+ return hop;
123
+ }
124
+ }
125
+ return forwardedIps[0];
126
+ }
127
+ }
128
+
129
+ const realIp = getHeader(request.headers, "x-real-ip")?.trim();
130
+ return realIp || undefined;
131
+ }
132
+
35
133
  function normalizeWebhookResponse(parsed: {
36
134
  statusCode?: number;
37
135
  providerResponseHeaders?: Record<string, string>;
@@ -44,6 +142,14 @@ function normalizeWebhookResponse(parsed: {
44
142
  };
45
143
  }
46
144
 
145
+ function buildRealtimeRejectedTwiML(): WebhookResponsePayload {
146
+ return {
147
+ statusCode: 200,
148
+ headers: { "Content-Type": "text/xml" },
149
+ body: '<?xml version="1.0" encoding="UTF-8"?><Response><Reject reason="rejected" /></Response>',
150
+ };
151
+ }
152
+
47
153
  /**
48
154
  * HTTP server for receiving voice call webhooks from providers.
49
155
  * Supports WebSocket upgrades for media streams when streaming is enabled.
@@ -51,30 +157,45 @@ function normalizeWebhookResponse(parsed: {
51
157
  export class VoiceCallWebhookServer {
52
158
  private server: http.Server | null = null;
53
159
  private listeningUrl: string | null = null;
160
+ private startPromise: Promise<string> | null = null;
54
161
  private config: VoiceCallConfig;
55
162
  private manager: CallManager;
56
163
  private provider: VoiceCallProvider;
57
164
  private coreConfig: CoreConfig | null;
165
+ private fullConfig: OpenClawConfig | null;
166
+ private agentRuntime: CoreAgentDeps | null;
167
+ private logger: Logger;
58
168
  private stopStaleCallReaper: (() => void) | null = null;
169
+ private readonly webhookInFlightLimiter = createWebhookInFlightLimiter();
59
170
 
60
171
  /** Media stream handler for bidirectional audio (when streaming enabled) */
61
172
  private mediaStreamHandler: MediaStreamHandler | null = null;
173
+ /** Delayed auto-hangup timers keyed by provider call ID after stream disconnect. */
174
+ private pendingDisconnectHangups = new Map<string, ReturnType<typeof setTimeout>>();
175
+ /** Realtime voice handler for duplex provider bridges. */
176
+ private realtimeHandler: RealtimeCallHandler | null = null;
62
177
 
63
178
  constructor(
64
179
  config: VoiceCallConfig,
65
180
  manager: CallManager,
66
181
  provider: VoiceCallProvider,
67
182
  coreConfig?: CoreConfig,
183
+ fullConfig?: OpenClawConfig,
184
+ agentRuntime?: CoreAgentDeps,
185
+ logger?: Logger,
68
186
  ) {
69
187
  this.config = normalizeVoiceCallConfig(config);
70
188
  this.manager = manager;
71
189
  this.provider = provider;
72
190
  this.coreConfig = coreConfig ?? null;
73
-
74
- // Initialize media stream handler if streaming is enabled
75
- if (this.config.streaming.enabled) {
76
- this.initializeMediaStreaming();
77
- }
191
+ this.fullConfig = fullConfig ?? null;
192
+ this.agentRuntime = agentRuntime ?? null;
193
+ this.logger = logger ?? {
194
+ info: console.log,
195
+ warn: console.warn,
196
+ error: console.error,
197
+ debug: console.debug,
198
+ };
78
199
  }
79
200
 
80
201
  /**
@@ -84,31 +205,125 @@ export class VoiceCallWebhookServer {
84
205
  return this.mediaStreamHandler;
85
206
  }
86
207
 
87
- /**
88
- * Initialize media streaming with OpenAI Realtime STT.
89
- */
90
- private initializeMediaStreaming(): void {
91
- const streaming = this.config.streaming;
92
- const apiKey = streaming.openaiApiKey ?? process.env.OPENAI_API_KEY;
208
+ getRealtimeHandler(): RealtimeCallHandler | null {
209
+ return this.realtimeHandler;
210
+ }
211
+
212
+ speakRealtime(callId: string, instructions: string): { success: boolean; error?: string } {
213
+ if (!this.realtimeHandler) {
214
+ return { success: false, error: "Realtime voice handler is not configured" };
215
+ }
216
+ return this.realtimeHandler.speak(callId, instructions);
217
+ }
218
+
219
+ setRealtimeHandler(handler: RealtimeCallHandler): void {
220
+ this.realtimeHandler = handler;
221
+ }
93
222
 
94
- if (!apiKey) {
95
- console.warn("[voice-call] Streaming enabled but no OpenAI API key found");
223
+ private clearPendingDisconnectHangup(providerCallId: string): void {
224
+ const existing = this.pendingDisconnectHangups.get(providerCallId);
225
+ if (!existing) {
96
226
  return;
97
227
  }
228
+ clearTimeout(existing);
229
+ this.pendingDisconnectHangups.delete(providerCallId);
230
+ }
98
231
 
99
- const sttProvider = new OpenAIRealtimeSTTProvider({
100
- apiKey,
101
- model: streaming.sttModel,
102
- silenceDurationMs: streaming.silenceDurationMs,
103
- vadThreshold: streaming.vadThreshold,
232
+ private resolveMediaStreamClientIp(request: http.IncomingMessage): string | undefined {
233
+ const remoteIp = request.socket.remoteAddress ?? undefined;
234
+ const trustedProxyIPs = this.config.webhookSecurity.trustedProxyIPs.filter(Boolean);
235
+ const normalizedTrustedProxyIps = new Set(
236
+ trustedProxyIPs.map((ip) => normalizeProxyIp(ip)).filter((ip): ip is string => Boolean(ip)),
237
+ );
238
+ const normalizedRemoteIp = normalizeProxyIp(remoteIp);
239
+ const fromTrustedProxy =
240
+ normalizedTrustedProxyIps.size > 0 &&
241
+ normalizedRemoteIp !== undefined &&
242
+ normalizedTrustedProxyIps.has(normalizedRemoteIp);
243
+ const shouldTrustForwardingHeaders =
244
+ this.config.webhookSecurity.trustForwardingHeaders && fromTrustedProxy;
245
+
246
+ if (shouldTrustForwardingHeaders) {
247
+ const forwardedIp = resolveForwardedClientIp(request, trustedProxyIPs);
248
+ if (forwardedIp) {
249
+ return forwardedIp;
250
+ }
251
+ }
252
+
253
+ return remoteIp;
254
+ }
255
+
256
+ private shouldSuppressBargeInForInitialMessage(call: CallRecord | undefined): boolean {
257
+ if (!call || call.direction !== "outbound") {
258
+ return false;
259
+ }
260
+
261
+ // Suppress only while the initial greeting is actively being played.
262
+ // If playback fails and the call leaves "speaking", do not block auto-response.
263
+ if (call.state !== "speaking") {
264
+ return false;
265
+ }
266
+
267
+ const mode = (call.metadata?.mode as string | undefined) ?? "conversation";
268
+ if (mode !== "conversation") {
269
+ return false;
270
+ }
271
+
272
+ const initialMessage = normalizeOptionalString(call.metadata?.initialMessage) ?? "";
273
+ return initialMessage.length > 0;
274
+ }
275
+
276
+ /**
277
+ * Initialize media streaming with the selected realtime transcription provider.
278
+ */
279
+ private async initializeMediaStreaming(): Promise<void> {
280
+ const streaming = this.config.streaming;
281
+ const pluginConfig =
282
+ this.fullConfig ?? (this.coreConfig as unknown as OpenClawConfig | undefined);
283
+ const { getRealtimeTranscriptionProvider, listRealtimeTranscriptionProviders } =
284
+ await loadRealtimeTranscriptionRuntime();
285
+ const resolution = resolveConfiguredCapabilityProvider({
286
+ configuredProviderId: streaming.provider,
287
+ providerConfigs: streaming.providers,
288
+ cfg: pluginConfig,
289
+ cfgForResolve: pluginConfig ?? ({} as OpenClawConfig),
290
+ getConfiguredProvider: (providerId) =>
291
+ getRealtimeTranscriptionProvider(providerId, pluginConfig),
292
+ listProviders: () => listRealtimeTranscriptionProviders(pluginConfig),
293
+ resolveProviderConfig: ({ provider, cfg, rawConfig }) =>
294
+ provider.resolveConfig?.({ cfg, rawConfig }) ?? rawConfig,
295
+ isProviderConfigured: ({ provider, cfg, providerConfig }) =>
296
+ provider.isConfigured({ cfg, providerConfig }),
104
297
  });
298
+ if (!resolution.ok && resolution.code === "missing-configured-provider") {
299
+ console.warn(
300
+ `[voice-call] Streaming enabled but realtime transcription provider "${resolution.configuredProviderId}" is not registered`,
301
+ );
302
+ return;
303
+ }
304
+ if (!resolution.ok && resolution.code === "no-registered-provider") {
305
+ console.warn(
306
+ "[voice-call] Streaming enabled but no realtime transcription provider is registered",
307
+ );
308
+ return;
309
+ }
310
+ if (!resolution.ok) {
311
+ console.warn(
312
+ `[voice-call] Streaming enabled but provider "${resolution.provider?.id}" is not configured`,
313
+ );
314
+ return;
315
+ }
316
+ const provider = resolution.provider;
317
+ const providerConfig = resolution.providerConfig;
105
318
 
106
319
  const streamConfig: MediaStreamConfig = {
107
- sttProvider,
320
+ transcriptionProvider: provider,
321
+ providerConfig,
108
322
  preStartTimeoutMs: streaming.preStartTimeoutMs,
109
323
  maxPendingConnections: streaming.maxPendingConnections,
110
324
  maxPendingConnectionsPerIp: streaming.maxPendingConnectionsPerIp,
111
325
  maxConnections: streaming.maxConnections,
326
+ resolveClientIp: (request) => this.resolveMediaStreamClientIp(request),
112
327
  shouldAcceptStream: ({ callId, token }) => {
113
328
  const call = this.manager.getCallByProviderCallId(callId);
114
329
  if (!call) {
@@ -124,19 +339,27 @@ export class VoiceCallWebhookServer {
124
339
  return true;
125
340
  },
126
341
  onTranscript: (providerCallId, transcript) => {
127
- console.log(`[voice-call] Transcript for ${providerCallId}: ${transcript}`);
128
-
129
- // Clear TTS queue on barge-in (user started speaking, interrupt current playback)
130
- if (this.provider.name === "twilio") {
131
- (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
132
- }
133
-
134
- // Look up our internal call ID from the provider call ID
342
+ const safeTranscript = sanitizeTranscriptForLog(transcript);
343
+ console.log(
344
+ `[voice-call] Transcript for ${providerCallId}: ${safeTranscript} (chars=${transcript.length})`,
345
+ );
135
346
  const call = this.manager.getCallByProviderCallId(providerCallId);
136
347
  if (!call) {
137
348
  console.warn(`[voice-call] No active call found for provider ID: ${providerCallId}`);
138
349
  return;
139
350
  }
351
+ const suppressBargeIn = this.shouldSuppressBargeInForInitialMessage(call);
352
+ if (suppressBargeIn) {
353
+ console.log(
354
+ `[voice-call] Ignoring barge transcript while initial message is still playing (${providerCallId})`,
355
+ );
356
+ return;
357
+ }
358
+
359
+ // Clear TTS queue on barge-in (user started speaking, interrupt current playback)
360
+ if (this.provider.name === "twilio") {
361
+ (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
362
+ }
140
363
 
141
364
  // Create a speech event and process it through the manager
142
365
  const event: NormalizedEvent = {
@@ -160,44 +383,63 @@ export class VoiceCallWebhookServer {
160
383
  }
161
384
  },
162
385
  onSpeechStart: (providerCallId) => {
163
- if (this.provider.name === "twilio") {
164
- (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
386
+ if (this.provider.name !== "twilio") {
387
+ return;
165
388
  }
389
+ const call = this.manager.getCallByProviderCallId(providerCallId);
390
+ if (this.shouldSuppressBargeInForInitialMessage(call)) {
391
+ return;
392
+ }
393
+ (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
166
394
  },
167
395
  onPartialTranscript: (callId, partial) => {
168
- console.log(`[voice-call] Partial for ${callId}: ${partial}`);
396
+ const safePartial = sanitizeTranscriptForLog(partial);
397
+ console.log(`[voice-call] Partial for ${callId}: ${safePartial} (chars=${partial.length})`);
169
398
  },
170
399
  onConnect: (callId, streamSid) => {
171
400
  console.log(`[voice-call] Media stream connected: ${callId} -> ${streamSid}`);
401
+ this.clearPendingDisconnectHangup(callId);
402
+
172
403
  // Register stream with provider for TTS routing
173
404
  if (this.provider.name === "twilio") {
174
405
  (this.provider as TwilioProvider).registerCallStream(callId, streamSid);
175
406
  }
176
-
177
- // Speak initial message if one was provided when call was initiated
178
- // Use setTimeout to allow stream setup to complete
179
- setTimeout(() => {
180
- this.manager.speakInitialMessage(callId).catch((err) => {
181
- console.warn(`[voice-call] Failed to speak initial message:`, err);
182
- });
183
- }, 500);
184
407
  },
185
- onDisconnect: (callId) => {
186
- console.log(`[voice-call] Media stream disconnected: ${callId}`);
187
- // Auto-end call when media stream disconnects to prevent stuck calls.
188
- // Without this, calls can remain active indefinitely after the stream closes.
189
- const disconnectedCall = this.manager.getCallByProviderCallId(callId);
190
- if (disconnectedCall) {
408
+ onTranscriptionReady: (callId) => {
409
+ this.manager.speakInitialMessage(callId).catch((err) => {
410
+ console.warn(`[voice-call] Failed to speak initial message:`, err);
411
+ });
412
+ },
413
+ onDisconnect: (callId, streamSid) => {
414
+ console.log(`[voice-call] Media stream disconnected: ${callId} (${streamSid})`);
415
+ if (this.provider.name === "twilio") {
416
+ (this.provider as TwilioProvider).unregisterCallStream(callId, streamSid);
417
+ }
418
+
419
+ this.clearPendingDisconnectHangup(callId);
420
+ const timer = setTimeout(() => {
421
+ this.pendingDisconnectHangups.delete(callId);
422
+ const disconnectedCall = this.manager.getCallByProviderCallId(callId);
423
+ if (!disconnectedCall) {
424
+ return;
425
+ }
426
+
427
+ if (this.provider.name === "twilio") {
428
+ const twilio = this.provider as TwilioProvider;
429
+ if (twilio.hasRegisteredStream(callId)) {
430
+ return;
431
+ }
432
+ }
433
+
191
434
  console.log(
192
- `[voice-call] Auto-ending call ${disconnectedCall.callId} on stream disconnect`,
435
+ `[voice-call] Auto-ending call ${disconnectedCall.callId} after stream disconnect grace`,
193
436
  );
194
437
  void this.manager.endCall(disconnectedCall.callId).catch((err) => {
195
438
  console.warn(`[voice-call] Failed to auto-end call ${disconnectedCall.callId}:`, err);
196
439
  });
197
- }
198
- if (this.provider.name === "twilio") {
199
- (this.provider as TwilioProvider).unregisterCallStream(callId);
200
- }
440
+ }, STREAM_DISCONNECT_HANGUP_GRACE_MS);
441
+ timer.unref?.();
442
+ this.pendingDisconnectHangups.set(callId, timer);
201
443
  },
202
444
  };
203
445
 
@@ -220,7 +462,15 @@ export class VoiceCallWebhookServer {
220
462
  return this.listeningUrl ?? this.resolveListeningUrl(bind, webhookPath);
221
463
  }
222
464
 
223
- return new Promise((resolve, reject) => {
465
+ if (this.config.streaming.enabled && !this.mediaStreamHandler) {
466
+ await this.initializeMediaStreaming();
467
+ }
468
+
469
+ if (this.startPromise) {
470
+ return this.startPromise;
471
+ }
472
+
473
+ this.startPromise = new Promise((resolve, reject) => {
224
474
  this.server = http.createServer((req, res) => {
225
475
  this.handleRequest(req, res, webhookPath).catch((err) => {
226
476
  console.error("[voice-call] Webhook error:", err);
@@ -229,12 +479,15 @@ export class VoiceCallWebhookServer {
229
479
  });
230
480
  });
231
481
 
232
- // Handle WebSocket upgrades for media streams
233
- if (this.mediaStreamHandler) {
482
+ // Handle WebSocket upgrades for realtime voice and media streams.
483
+ if (this.realtimeHandler || this.mediaStreamHandler) {
234
484
  this.server.on("upgrade", (request, socket, head) => {
485
+ if (this.realtimeHandler && this.isRealtimeWebSocketUpgrade(request)) {
486
+ this.realtimeHandler.handleWebSocketUpgrade(request, socket, head);
487
+ return;
488
+ }
235
489
  const path = this.getUpgradePathname(request);
236
- if (path === streamPath) {
237
- console.log("[voice-call] WebSocket upgrade for media stream");
490
+ if (path === streamPath && this.mediaStreamHandler) {
238
491
  this.mediaStreamHandler?.handleUpgrade(request, socket, head);
239
492
  } else {
240
493
  socket.destroy();
@@ -242,17 +495,23 @@ export class VoiceCallWebhookServer {
242
495
  });
243
496
  }
244
497
 
245
- this.server.on("error", reject);
498
+ this.server.on("error", (err) => {
499
+ this.server = null;
500
+ this.listeningUrl = null;
501
+ this.startPromise = null;
502
+ reject(err);
503
+ });
246
504
 
247
505
  this.server.listen(port, bind, () => {
248
506
  const url = this.resolveListeningUrl(bind, webhookPath);
249
507
  this.listeningUrl = url;
250
- console.log(`[voice-call] Webhook server listening on ${url}`);
508
+ this.startPromise = null;
509
+ this.logger.info(`[voice-call] Webhook server listening on ${url}`);
251
510
  if (this.mediaStreamHandler) {
252
511
  const address = this.server?.address();
253
512
  const actualPort =
254
513
  address && typeof address === "object" ? address.port : this.config.serve.port;
255
- console.log(
514
+ this.logger.info(
256
515
  `[voice-call] Media stream WebSocket on ws://${bind}:${actualPort}${streamPath}`,
257
516
  );
258
517
  }
@@ -265,12 +524,21 @@ export class VoiceCallWebhookServer {
265
524
  });
266
525
  });
267
526
  });
527
+
528
+ return this.startPromise;
268
529
  }
269
530
 
270
531
  /**
271
532
  * Stop the webhook server.
272
533
  */
273
534
  async stop(): Promise<void> {
535
+ for (const timer of this.pendingDisconnectHangups.values()) {
536
+ clearTimeout(timer);
537
+ }
538
+ this.pendingDisconnectHangups.clear();
539
+ this.webhookInFlightLimiter.clear();
540
+ this.startPromise = null;
541
+
274
542
  if (this.stopStaleCallReaper) {
275
543
  this.stopStaleCallReaper();
276
544
  this.stopStaleCallReaper = null;
@@ -364,49 +632,189 @@ export class VoiceCallWebhookServer {
364
632
  return { statusCode: 405, body: "Method Not Allowed" };
365
633
  }
366
634
 
367
- let body = "";
635
+ const headerGate = this.verifyPreAuthWebhookHeaders(req.headers);
636
+ if (!headerGate.ok) {
637
+ console.warn(`[voice-call] Webhook rejected before body read: ${headerGate.reason}`);
638
+ return { statusCode: 401, body: "Unauthorized" };
639
+ }
640
+
641
+ // createWebhookInFlightLimiter intentionally treats an empty key as fail-open.
642
+ // Missing socket metadata must still share one bucket instead of bypassing
643
+ // the pre-auth limiter entirely.
644
+ const remoteAddress = req.socket.remoteAddress;
645
+ if (!remoteAddress) {
646
+ console.warn(
647
+ `[voice-call] Webhook accepted with no remote address; using shared fallback in-flight key`,
648
+ );
649
+ }
650
+ const inFlightKey = remoteAddress || MISSING_REMOTE_ADDRESS_IN_FLIGHT_KEY;
651
+ if (!this.webhookInFlightLimiter.tryAcquire(inFlightKey)) {
652
+ console.warn(`[voice-call] Webhook rejected before body read: too many in-flight requests`);
653
+ return { statusCode: 429, body: "Too Many Requests" };
654
+ }
655
+
368
656
  try {
369
- body = await this.readBody(req, MAX_WEBHOOK_BODY_BYTES);
370
- } catch (err) {
371
- if (isRequestBodyLimitError(err, "PAYLOAD_TOO_LARGE")) {
372
- return { statusCode: 413, body: "Payload Too Large" };
657
+ let body = "";
658
+ try {
659
+ body = await this.readBody(req, MAX_WEBHOOK_BODY_BYTES, WEBHOOK_BODY_TIMEOUT_MS);
660
+ } catch (err) {
661
+ if (isRequestBodyLimitError(err, "PAYLOAD_TOO_LARGE")) {
662
+ return { statusCode: 413, body: "Payload Too Large" };
663
+ }
664
+ if (isRequestBodyLimitError(err, "REQUEST_BODY_TIMEOUT")) {
665
+ return { statusCode: 408, body: requestBodyErrorToText("REQUEST_BODY_TIMEOUT") };
666
+ }
667
+ throw err;
668
+ }
669
+
670
+ const ctx: WebhookContext = {
671
+ headers: req.headers as Record<string, string | string[] | undefined>,
672
+ rawBody: body,
673
+ url: url.toString(),
674
+ method: "POST",
675
+ query: Object.fromEntries(url.searchParams),
676
+ remoteAddress: req.socket.remoteAddress ?? undefined,
677
+ };
678
+
679
+ const verification = this.provider.verifyWebhook(ctx);
680
+ if (!verification.ok) {
681
+ console.warn(`[voice-call] Webhook verification failed: ${verification.reason}`);
682
+ return { statusCode: 401, body: "Unauthorized" };
373
683
  }
374
- if (isRequestBodyLimitError(err, "REQUEST_BODY_TIMEOUT")) {
375
- return { statusCode: 408, body: requestBodyErrorToText("REQUEST_BODY_TIMEOUT") };
684
+ if (!verification.verifiedRequestKey) {
685
+ console.warn("[voice-call] Webhook verification succeeded without request identity key");
686
+ return { statusCode: 401, body: "Unauthorized" };
376
687
  }
377
- throw err;
688
+
689
+ const initialTwiML = this.provider.consumeInitialTwiML?.(ctx);
690
+ if (initialTwiML !== undefined && initialTwiML !== null) {
691
+ const params = new URLSearchParams(ctx.rawBody);
692
+ console.log(
693
+ `[voice-call] Serving provider initial TwiML before realtime handling (callSid=${params.get("CallSid") ?? "unknown"}, direction=${params.get("Direction") ?? "unknown"})`,
694
+ );
695
+ return {
696
+ statusCode: 200,
697
+ headers: { "Content-Type": "application/xml" },
698
+ body: initialTwiML,
699
+ };
700
+ }
701
+
702
+ const realtimeParams = this.getRealtimeTwimlParams(ctx);
703
+ if (realtimeParams) {
704
+ const direction = realtimeParams.get("Direction");
705
+ const isInboundRealtimeRequest = !direction || direction === "inbound";
706
+ if (isInboundRealtimeRequest && !this.shouldAcceptRealtimeInboundRequest(realtimeParams)) {
707
+ console.log("[voice-call] Realtime inbound call rejected before stream setup");
708
+ return buildRealtimeRejectedTwiML();
709
+ }
710
+ console.log(
711
+ `[voice-call] Serving realtime TwiML for Twilio call ${realtimeParams.get("CallSid") ?? "unknown"} (direction=${direction ?? "unknown"})`,
712
+ );
713
+ return this.realtimeHandler!.buildTwiMLPayload(req, realtimeParams);
714
+ }
715
+
716
+ const parsed = this.provider.parseWebhookEvent(ctx, {
717
+ verifiedRequestKey: verification.verifiedRequestKey,
718
+ });
719
+
720
+ if (verification.isReplay) {
721
+ console.warn("[voice-call] Replay detected; skipping event side effects");
722
+ } else {
723
+ this.processParsedEvents(parsed.events);
724
+ }
725
+
726
+ return normalizeWebhookResponse(parsed);
727
+ } finally {
728
+ this.webhookInFlightLimiter.release(inFlightKey);
378
729
  }
730
+ }
379
731
 
380
- const ctx: WebhookContext = {
381
- headers: req.headers as Record<string, string | string[] | undefined>,
382
- rawBody: body,
383
- url: url.toString(),
384
- method: "POST",
385
- query: Object.fromEntries(url.searchParams),
386
- remoteAddress: req.socket.remoteAddress ?? undefined,
387
- };
732
+ private verifyPreAuthWebhookHeaders(headers: http.IncomingHttpHeaders): WebhookHeaderGateResult {
733
+ if (this.config.skipSignatureVerification) {
734
+ return { ok: true };
735
+ }
736
+ switch (this.provider.name) {
737
+ case "telnyx": {
738
+ const signature = getHeader(headers, "telnyx-signature-ed25519");
739
+ const timestamp = getHeader(headers, "telnyx-timestamp");
740
+ if (signature && timestamp) {
741
+ return { ok: true };
742
+ }
743
+ return { ok: false, reason: "missing Telnyx signature or timestamp header" };
744
+ }
745
+ case "twilio":
746
+ if (getHeader(headers, "x-twilio-signature")) {
747
+ return { ok: true };
748
+ }
749
+ return { ok: false, reason: "missing X-Twilio-Signature header" };
750
+ case "plivo": {
751
+ const hasV3 =
752
+ Boolean(getHeader(headers, "x-plivo-signature-v3")) &&
753
+ Boolean(getHeader(headers, "x-plivo-signature-v3-nonce"));
754
+ const hasV2 =
755
+ Boolean(getHeader(headers, "x-plivo-signature-v2")) &&
756
+ Boolean(getHeader(headers, "x-plivo-signature-v2-nonce"));
757
+ if (hasV3 || hasV2) {
758
+ return { ok: true };
759
+ }
760
+ return { ok: false, reason: "missing Plivo signature headers" };
761
+ }
762
+ default:
763
+ return { ok: true };
764
+ }
765
+ }
388
766
 
389
- const verification = this.provider.verifyWebhook(ctx);
390
- if (!verification.ok) {
391
- console.warn(`[voice-call] Webhook verification failed: ${verification.reason}`);
392
- return { statusCode: 401, body: "Unauthorized" };
767
+ private isRealtimeWebSocketUpgrade(req: http.IncomingMessage): boolean {
768
+ try {
769
+ const pathname = buildRequestUrl(req.url, req.headers.host).pathname;
770
+ const pattern = this.realtimeHandler?.getStreamPathPattern();
771
+ return Boolean(pattern && pathname.startsWith(pattern));
772
+ } catch {
773
+ return false;
393
774
  }
394
- if (!verification.verifiedRequestKey) {
395
- console.warn("[voice-call] Webhook verification succeeded without request identity key");
396
- return { statusCode: 401, body: "Unauthorized" };
775
+ }
776
+
777
+ private getRealtimeTwimlParams(ctx: WebhookContext): URLSearchParams | null {
778
+ if (!this.realtimeHandler || this.provider.name !== "twilio") {
779
+ return null;
397
780
  }
398
781
 
399
- const parsed = this.provider.parseWebhookEvent(ctx, {
400
- verifiedRequestKey: verification.verifiedRequestKey,
401
- });
782
+ const params = new URLSearchParams(ctx.rawBody);
783
+ const direction = params.get("Direction");
784
+ const isSupportedDirection =
785
+ !direction || direction === "inbound" || direction.startsWith("outbound");
786
+ if (!isSupportedDirection) {
787
+ return null;
788
+ }
402
789
 
403
- if (verification.isReplay) {
404
- console.warn("[voice-call] Replay detected; skipping event side effects");
405
- } else {
406
- this.processParsedEvents(parsed.events);
790
+ if (ctx.query?.type === "status") {
791
+ return null;
407
792
  }
408
793
 
409
- return normalizeWebhookResponse(parsed);
794
+ const callStatus = params.get("CallStatus");
795
+ if (callStatus && isProviderStatusTerminal(callStatus)) {
796
+ return null;
797
+ }
798
+
799
+ // Replays must return the same TwiML body so Twilio retries reconnect cleanly.
800
+ // The one-time token still changes, but the behavior stays identical.
801
+ return !params.get("SpeechResult") && !params.get("Digits") ? params : null;
802
+ }
803
+
804
+ private shouldAcceptRealtimeInboundRequest(params: URLSearchParams): boolean {
805
+ switch (this.config.inboundPolicy) {
806
+ case "open":
807
+ return true;
808
+ case "allowlist":
809
+ case "pairing":
810
+ return isAllowlistedCaller(
811
+ normalizePhoneNumber(params.get("From") ?? undefined),
812
+ this.config.allowFrom,
813
+ );
814
+ case "disabled":
815
+ default:
816
+ return false;
817
+ }
410
818
  }
411
819
 
412
820
  private processParsedEvents(events: NormalizedEvent[]): void {
@@ -435,7 +843,7 @@ export class VoiceCallWebhookServer {
435
843
  private readBody(
436
844
  req: http.IncomingMessage,
437
845
  maxBytes: number,
438
- timeoutMs = 30_000,
846
+ timeoutMs = WEBHOOK_BODY_TIMEOUT_MS,
439
847
  ): Promise<string> {
440
848
  return readRequestBodyWithLimit(req, { maxBytes, timeoutMs });
441
849
  }
@@ -458,13 +866,18 @@ export class VoiceCallWebhookServer {
458
866
  console.warn("[voice-call] Core config missing; skipping auto-response");
459
867
  return;
460
868
  }
869
+ if (!this.agentRuntime) {
870
+ console.warn("[voice-call] Agent runtime missing; skipping auto-response");
871
+ return;
872
+ }
461
873
 
462
874
  try {
463
- const { generateVoiceResponse } = await import("./response-generator.js");
875
+ const { generateVoiceResponse } = await loadResponseGeneratorModule();
464
876
 
465
877
  const result = await generateVoiceResponse({
466
878
  voiceConfig: this.config,
467
879
  coreConfig: this.coreConfig,
880
+ agentRuntime: this.agentRuntime,
468
881
  callId,
469
882
  from: call.from,
470
883
  transcript: call.transcript,