@openclaw/voice-call 2026.3.13 → 2026.5.2-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +27 -5
  2. package/api.ts +16 -0
  3. package/cli-metadata.ts +10 -0
  4. package/config-api.ts +12 -0
  5. package/index.test.ts +943 -0
  6. package/index.ts +379 -149
  7. package/openclaw.plugin.json +384 -157
  8. package/package.json +35 -5
  9. package/runtime-api.ts +20 -0
  10. package/runtime-entry.ts +1 -0
  11. package/setup-api.ts +47 -0
  12. package/src/allowlist.test.ts +18 -0
  13. package/src/cli.ts +533 -68
  14. package/src/config-compat.test.ts +120 -0
  15. package/src/config-compat.ts +227 -0
  16. package/src/config.test.ts +273 -12
  17. package/src/config.ts +355 -72
  18. package/src/core-bridge.ts +2 -147
  19. package/src/deep-merge.test.ts +40 -0
  20. package/src/gateway-continue-operation.ts +200 -0
  21. package/src/http-headers.ts +6 -3
  22. package/src/manager/context.ts +6 -5
  23. package/src/manager/events.test.ts +243 -19
  24. package/src/manager/events.ts +61 -31
  25. package/src/manager/lifecycle.ts +53 -0
  26. package/src/manager/lookup.test.ts +52 -0
  27. package/src/manager/outbound.test.ts +528 -0
  28. package/src/manager/outbound.ts +163 -57
  29. package/src/manager/store.ts +18 -6
  30. package/src/manager/timers.test.ts +129 -0
  31. package/src/manager/timers.ts +4 -3
  32. package/src/manager/twiml.test.ts +13 -0
  33. package/src/manager/twiml.ts +8 -0
  34. package/src/manager.closed-loop.test.ts +30 -12
  35. package/src/manager.inbound-allowlist.test.ts +77 -10
  36. package/src/manager.notify.test.ts +344 -20
  37. package/src/manager.restore.test.ts +95 -8
  38. package/src/manager.test-harness.ts +8 -6
  39. package/src/manager.ts +79 -5
  40. package/src/media-stream.test.ts +578 -81
  41. package/src/media-stream.ts +235 -54
  42. package/src/providers/base.ts +19 -0
  43. package/src/providers/mock.ts +7 -1
  44. package/src/providers/plivo.test.ts +50 -6
  45. package/src/providers/plivo.ts +14 -6
  46. package/src/providers/shared/call-status.ts +2 -1
  47. package/src/providers/shared/guarded-json-api.test.ts +106 -0
  48. package/src/providers/shared/guarded-json-api.ts +1 -1
  49. package/src/providers/telnyx.test.ts +178 -6
  50. package/src/providers/telnyx.ts +40 -3
  51. package/src/providers/twilio/api.test.ts +145 -0
  52. package/src/providers/twilio/api.ts +67 -16
  53. package/src/providers/twilio/twiml-policy.ts +6 -10
  54. package/src/providers/twilio/webhook.ts +1 -1
  55. package/src/providers/twilio.test.ts +425 -25
  56. package/src/providers/twilio.ts +230 -77
  57. package/src/providers/twilio.types.ts +17 -0
  58. package/src/realtime-defaults.ts +3 -0
  59. package/src/realtime-fast-context.test.ts +88 -0
  60. package/src/realtime-fast-context.ts +165 -0
  61. package/src/realtime-transcription.runtime.ts +4 -0
  62. package/src/realtime-voice.runtime.ts +5 -0
  63. package/src/response-generator.test.ts +321 -0
  64. package/src/response-generator.ts +213 -53
  65. package/src/response-model.test.ts +71 -0
  66. package/src/response-model.ts +23 -0
  67. package/src/runtime.test.ts +429 -0
  68. package/src/runtime.ts +270 -24
  69. package/src/telephony-audio.test.ts +61 -0
  70. package/src/telephony-audio.ts +1 -79
  71. package/src/telephony-tts.test.ts +133 -12
  72. package/src/telephony-tts.ts +155 -2
  73. package/src/test-fixtures.ts +28 -7
  74. package/src/tts-provider-voice.test.ts +34 -0
  75. package/src/tts-provider-voice.ts +21 -0
  76. package/src/tunnel.test.ts +166 -0
  77. package/src/tunnel.ts +1 -1
  78. package/src/types.ts +24 -37
  79. package/src/utils.test.ts +17 -0
  80. package/src/voice-mapping.test.ts +34 -0
  81. package/src/voice-mapping.ts +3 -2
  82. package/src/webhook/realtime-handler.test.ts +598 -0
  83. package/src/webhook/realtime-handler.ts +485 -0
  84. package/src/webhook/stale-call-reaper.test.ts +88 -0
  85. package/src/webhook/stale-call-reaper.ts +5 -0
  86. package/src/webhook/tailscale.test.ts +214 -0
  87. package/src/webhook/tailscale.ts +19 -5
  88. package/src/webhook-exposure.test.ts +33 -0
  89. package/src/webhook-exposure.ts +84 -0
  90. package/src/webhook-security.test.ts +172 -21
  91. package/src/webhook-security.ts +43 -29
  92. package/src/webhook.hangup-once.lifecycle.test.ts +135 -0
  93. package/src/webhook.test.ts +1145 -27
  94. package/src/webhook.ts +523 -102
  95. package/src/webhook.types.ts +5 -0
  96. package/src/websocket-test-support.ts +72 -0
  97. package/tsconfig.json +16 -0
  98. package/CHANGELOG.md +0 -121
  99. package/src/providers/index.ts +0 -10
  100. package/src/providers/stt-openai-realtime.test.ts +0 -42
  101. package/src/providers/stt-openai-realtime.ts +0 -311
  102. package/src/providers/tts-openai.test.ts +0 -43
  103. package/src/providers/tts-openai.ts +0 -221
package/src/config.ts CHANGED
@@ -1,11 +1,14 @@
1
+ import { REALTIME_VOICE_AGENT_CONSULT_TOOL_POLICIES } from "openclaw/plugin-sdk/realtime-voice";
1
2
  import {
2
- TtsAutoSchema,
3
- TtsConfigSchema,
4
- TtsModeSchema,
5
- TtsProviderSchema,
6
- } from "openclaw/plugin-sdk/voice-call";
7
- import { z } from "zod";
3
+ buildSecretInputSchema,
4
+ hasConfiguredSecretInput,
5
+ normalizeResolvedSecretInputString,
6
+ type SecretInput,
7
+ } from "openclaw/plugin-sdk/secret-input";
8
+ import { z } from "openclaw/plugin-sdk/zod";
9
+ import { TtsConfigSchema } from "../api.js";
8
10
  import { deepMergeDefined } from "./deep-merge.js";
11
+ import { DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS } from "./realtime-defaults.js";
9
12
 
10
13
  // -----------------------------------------------------------------------------
11
14
  // Phone Number Validation
@@ -15,7 +18,7 @@ import { deepMergeDefined } from "./deep-merge.js";
15
18
  * E.164 phone number format: +[country code][number]
16
19
  * Examples use 555 prefix (reserved for fictional numbers)
17
20
  */
18
- export const E164Schema = z
21
+ const E164Schema = z
19
22
  .string()
20
23
  .regex(/^\+[1-9]\d{1,14}$/, "Expected E.164 format, e.g. +15550001234");
21
24
 
@@ -30,14 +33,15 @@ export const E164Schema = z
30
33
  * - "pairing": Unknown callers can request pairing (future)
31
34
  * - "open": Accept all inbound calls (dangerous!)
32
35
  */
33
- export const InboundPolicySchema = z.enum(["disabled", "allowlist", "pairing", "open"]);
34
- export type InboundPolicy = z.infer<typeof InboundPolicySchema>;
36
+ const InboundPolicySchema = z.enum(["disabled", "allowlist", "pairing", "open"]);
35
37
 
36
38
  // -----------------------------------------------------------------------------
37
39
  // Provider-Specific Configuration
38
40
  // -----------------------------------------------------------------------------
39
41
 
40
- export const TelnyxConfigSchema = z
42
+ const SecretInputSchema = buildSecretInputSchema();
43
+
44
+ const TelnyxConfigSchema = z
41
45
  .object({
42
46
  /** Telnyx API v2 key */
43
47
  apiKey: z.string().min(1).optional(),
@@ -49,17 +53,16 @@ export const TelnyxConfigSchema = z
49
53
  .strict();
50
54
  export type TelnyxConfig = z.infer<typeof TelnyxConfigSchema>;
51
55
 
52
- export const TwilioConfigSchema = z
56
+ const TwilioConfigSchema = z
53
57
  .object({
54
58
  /** Twilio Account SID */
55
59
  accountSid: z.string().min(1).optional(),
56
60
  /** Twilio Auth Token */
57
- authToken: z.string().min(1).optional(),
61
+ authToken: SecretInputSchema.optional(),
58
62
  })
59
63
  .strict();
60
- export type TwilioConfig = z.infer<typeof TwilioConfigSchema>;
61
64
 
62
- export const PlivoConfigSchema = z
65
+ const PlivoConfigSchema = z
63
66
  .object({
64
67
  /** Plivo Auth ID (starts with MA/SA) */
65
68
  authId: z.string().min(1).optional(),
@@ -69,29 +72,31 @@ export const PlivoConfigSchema = z
69
72
  .strict();
70
73
  export type PlivoConfig = z.infer<typeof PlivoConfigSchema>;
71
74
 
72
- // -----------------------------------------------------------------------------
73
- // STT/TTS Configuration
74
- // -----------------------------------------------------------------------------
75
+ export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
75
76
 
76
- export const SttConfigSchema = z
77
+ const VoiceCallNumberRouteConfigSchema = z
77
78
  .object({
78
- /** STT provider (currently only OpenAI supported) */
79
- provider: z.literal("openai").default("openai"),
80
- /** Whisper model to use */
81
- model: z.string().min(1).default("whisper-1"),
79
+ /** Greeting message for inbound calls to this number. */
80
+ inboundGreeting: z.string().optional(),
81
+ /** TTS override for inbound calls to this number. Deep-merges with global voice-call TTS. */
82
+ tts: TtsConfigSchema,
83
+ /** Agent ID to use for voice response generation for this number. */
84
+ agentId: z.string().min(1).optional(),
85
+ /** Optional model override for voice responses for this number. */
86
+ responseModel: z.string().optional(),
87
+ /** System prompt for voice responses for this number. */
88
+ responseSystemPrompt: z.string().optional(),
89
+ /** Timeout for response generation in ms for this number. */
90
+ responseTimeoutMs: z.number().int().positive().optional(),
82
91
  })
83
- .strict()
84
- .default({ provider: "openai", model: "whisper-1" });
85
- export type SttConfig = z.infer<typeof SttConfigSchema>;
86
-
87
- export { TtsAutoSchema, TtsConfigSchema, TtsModeSchema, TtsProviderSchema };
88
- export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
92
+ .strict();
93
+ export type VoiceCallNumberRouteConfig = z.infer<typeof VoiceCallNumberRouteConfigSchema>;
89
94
 
90
95
  // -----------------------------------------------------------------------------
91
96
  // Webhook Server Configuration
92
97
  // -----------------------------------------------------------------------------
93
98
 
94
- export const VoiceCallServeConfigSchema = z
99
+ const VoiceCallServeConfigSchema = z
95
100
  .object({
96
101
  /** Port to listen on */
97
102
  port: z.number().int().positive().default(3334),
@@ -102,9 +107,8 @@ export const VoiceCallServeConfigSchema = z
102
107
  })
103
108
  .strict()
104
109
  .default({ port: 3334, bind: "127.0.0.1", path: "/voice/webhook" });
105
- export type VoiceCallServeConfig = z.infer<typeof VoiceCallServeConfigSchema>;
106
110
 
107
- export const VoiceCallTailscaleConfigSchema = z
111
+ const VoiceCallTailscaleConfigSchema = z
108
112
  .object({
109
113
  /**
110
114
  * Tailscale exposure mode:
@@ -118,13 +122,12 @@ export const VoiceCallTailscaleConfigSchema = z
118
122
  })
119
123
  .strict()
120
124
  .default({ mode: "off", path: "/voice/webhook" });
121
- export type VoiceCallTailscaleConfig = z.infer<typeof VoiceCallTailscaleConfigSchema>;
122
125
 
123
126
  // -----------------------------------------------------------------------------
124
127
  // Tunnel Configuration (unified ngrok/tailscale)
125
128
  // -----------------------------------------------------------------------------
126
129
 
127
- export const VoiceCallTunnelConfigSchema = z
130
+ const VoiceCallTunnelConfigSchema = z
128
131
  .object({
129
132
  /**
130
133
  * Tunnel provider:
@@ -149,13 +152,12 @@ export const VoiceCallTunnelConfigSchema = z
149
152
  })
150
153
  .strict()
151
154
  .default({ provider: "none", allowNgrokFreeTierLoopbackBypass: false });
152
- export type VoiceCallTunnelConfig = z.infer<typeof VoiceCallTunnelConfigSchema>;
153
155
 
154
156
  // -----------------------------------------------------------------------------
155
157
  // Webhook Security Configuration
156
158
  // -----------------------------------------------------------------------------
157
159
 
158
- export const VoiceCallWebhookSecurityConfigSchema = z
160
+ const VoiceCallWebhookSecurityConfigSchema = z
159
161
  .object({
160
162
  /**
161
163
  * Allowed hostnames for webhook URL reconstruction.
@@ -186,10 +188,13 @@ export type WebhookSecurityConfig = z.infer<typeof VoiceCallWebhookSecurityConfi
186
188
  * - "notify": Deliver message and auto-hangup after delay (one-way notification)
187
189
  * - "conversation": Stay open for back-and-forth until explicit end or timeout
188
190
  */
189
- export const CallModeSchema = z.enum(["notify", "conversation"]);
191
+ const CallModeSchema = z.enum(["notify", "conversation"]);
190
192
  export type CallMode = z.infer<typeof CallModeSchema>;
191
193
 
192
- export const OutboundConfigSchema = z
194
+ const VoiceCallSessionScopeSchema = z.enum(["per-phone", "per-call"]);
195
+ export type VoiceCallSessionScope = z.infer<typeof VoiceCallSessionScopeSchema>;
196
+
197
+ const OutboundConfigSchema = z
193
198
  .object({
194
199
  /** Default call mode for outbound calls */
195
200
  defaultMode: CallModeSchema.default("notify"),
@@ -198,28 +203,115 @@ export const OutboundConfigSchema = z
198
203
  })
199
204
  .strict()
200
205
  .default({ defaultMode: "notify", notifyHangupDelaySec: 3 });
201
- export type OutboundConfig = z.infer<typeof OutboundConfigSchema>;
202
206
 
203
207
  // -----------------------------------------------------------------------------
204
- // Streaming Configuration (OpenAI Realtime STT)
208
+ // Realtime Voice Configuration
209
+ // -----------------------------------------------------------------------------
210
+
211
+ const RealtimeToolSchema = z
212
+ .object({
213
+ type: z.literal("function"),
214
+ name: z.string().min(1),
215
+ description: z.string(),
216
+ parameters: z.object({
217
+ type: z.literal("object"),
218
+ properties: z.record(z.string(), z.unknown()),
219
+ required: z.array(z.string()).optional(),
220
+ }),
221
+ })
222
+ .strict();
223
+ type RealtimeToolConfig = z.infer<typeof RealtimeToolSchema>;
224
+
225
+ const VoiceCallRealtimeProvidersConfigSchema = z
226
+ .record(z.string(), z.record(z.string(), z.unknown()))
227
+ .default({});
228
+
229
+ const VoiceCallRealtimeToolPolicySchema = z.enum(REALTIME_VOICE_AGENT_CONSULT_TOOL_POLICIES);
230
+
231
+ const VoiceCallRealtimeFastContextSourceSchema = z.enum(["memory", "sessions"]);
232
+
233
+ const VoiceCallRealtimeFastContextConfigSchema = z
234
+ .object({
235
+ /** Enable bounded memory/session lookup before the full consult agent. */
236
+ enabled: z.boolean().default(false),
237
+ /** Hard deadline for the fast context lookup. */
238
+ timeoutMs: z.number().int().positive().default(800),
239
+ /** Maximum memory/session hits to inject into the realtime tool result. */
240
+ maxResults: z.number().int().positive().default(3),
241
+ /** Indexed sources used by the fast context lookup. */
242
+ sources: z
243
+ .array(VoiceCallRealtimeFastContextSourceSchema)
244
+ .min(1)
245
+ .default(["memory", "sessions"]),
246
+ /** Fall back to the full agent consult when fast context has no answer. */
247
+ fallbackToConsult: z.boolean().default(false),
248
+ })
249
+ .strict()
250
+ .default({
251
+ enabled: false,
252
+ timeoutMs: 800,
253
+ maxResults: 3,
254
+ sources: ["memory", "sessions"],
255
+ fallbackToConsult: false,
256
+ });
257
+ export type VoiceCallRealtimeFastContextConfig = z.infer<
258
+ typeof VoiceCallRealtimeFastContextConfigSchema
259
+ >;
260
+
261
+ const VoiceCallStreamingProvidersConfigSchema = z
262
+ .record(z.string(), z.record(z.string(), z.unknown()))
263
+ .default({});
264
+
265
+ const VoiceCallRealtimeConfigSchema = z
266
+ .object({
267
+ /** Enable realtime voice-to-voice mode. */
268
+ enabled: z.boolean().default(false),
269
+ /** Provider id from registered realtime voice providers. */
270
+ provider: z.string().min(1).optional(),
271
+ /** Optional override for the local WebSocket route path. */
272
+ streamPath: z.string().min(1).optional(),
273
+ /** System instructions passed to the realtime provider. */
274
+ instructions: z.string().default(DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS),
275
+ /** Tool policy for the shared OpenClaw agent consult tool. */
276
+ toolPolicy: VoiceCallRealtimeToolPolicySchema.default("safe-read-only"),
277
+ /** Tool definitions exposed to the realtime provider. */
278
+ tools: z.array(RealtimeToolSchema).default([]),
279
+ /** Low-latency memory/session context for the consult tool. */
280
+ fastContext: VoiceCallRealtimeFastContextConfigSchema,
281
+ /** Provider-owned raw config blobs keyed by provider id. */
282
+ providers: VoiceCallRealtimeProvidersConfigSchema,
283
+ })
284
+ .strict()
285
+ .default({
286
+ enabled: false,
287
+ instructions: DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS,
288
+ toolPolicy: "safe-read-only",
289
+ tools: [],
290
+ fastContext: {
291
+ enabled: false,
292
+ timeoutMs: 800,
293
+ maxResults: 3,
294
+ sources: ["memory", "sessions"],
295
+ fallbackToConsult: false,
296
+ },
297
+ providers: {},
298
+ });
299
+ export type VoiceCallRealtimeConfig = z.infer<typeof VoiceCallRealtimeConfigSchema>;
300
+
301
+ // -----------------------------------------------------------------------------
302
+ // Streaming Configuration (Realtime Transcription)
205
303
  // -----------------------------------------------------------------------------
206
304
 
207
- export const VoiceCallStreamingConfigSchema = z
305
+ const VoiceCallStreamingConfigSchema = z
208
306
  .object({
209
307
  /** Enable real-time audio streaming (requires WebSocket support) */
210
308
  enabled: z.boolean().default(false),
211
- /** STT provider for real-time transcription */
212
- sttProvider: z.enum(["openai-realtime"]).default("openai-realtime"),
213
- /** OpenAI API key for Realtime API (uses OPENAI_API_KEY env if not set) */
214
- openaiApiKey: z.string().min(1).optional(),
215
- /** OpenAI transcription model (default: gpt-4o-transcribe) */
216
- sttModel: z.string().min(1).default("gpt-4o-transcribe"),
217
- /** VAD silence duration in ms before considering speech ended */
218
- silenceDurationMs: z.number().int().positive().default(800),
219
- /** VAD threshold 0-1 (higher = less sensitive) */
220
- vadThreshold: z.number().min(0).max(1).default(0.5),
309
+ /** Provider id from registered realtime transcription providers. */
310
+ provider: z.string().min(1).optional(),
221
311
  /** WebSocket path for media stream connections */
222
312
  streamPath: z.string().min(1).default("/voice/stream"),
313
+ /** Provider-owned raw config blobs keyed by provider id. */
314
+ providers: VoiceCallStreamingProvidersConfigSchema,
223
315
  /**
224
316
  * Close unauthenticated media stream sockets if no valid `start` frame arrives in time.
225
317
  * Protects against pre-auth idle connection hold attacks.
@@ -235,17 +327,13 @@ export const VoiceCallStreamingConfigSchema = z
235
327
  .strict()
236
328
  .default({
237
329
  enabled: false,
238
- sttProvider: "openai-realtime",
239
- sttModel: "gpt-4o-transcribe",
240
- silenceDurationMs: 800,
241
- vadThreshold: 0.5,
242
330
  streamPath: "/voice/stream",
331
+ providers: {},
243
332
  preStartTimeoutMs: 5000,
244
333
  maxPendingConnections: 32,
245
334
  maxPendingConnectionsPerIp: 4,
246
335
  maxConnections: 128,
247
336
  });
248
- export type VoiceCallStreamingConfig = z.infer<typeof VoiceCallStreamingConfigSchema>;
249
337
 
250
338
  // -----------------------------------------------------------------------------
251
339
  // Main Voice Call Configuration
@@ -283,6 +371,9 @@ export const VoiceCallConfigSchema = z
283
371
  /** Greeting message for inbound calls */
284
372
  inboundGreeting: z.string().optional(),
285
373
 
374
+ /** Per-dialed-number overrides for inbound calls. Keys are E.164 numbers. */
375
+ numbers: z.record(E164Schema, VoiceCallNumberRouteConfigSchema).default({}),
376
+
286
377
  /** Outbound call configuration */
287
378
  outbound: OutboundConfigSchema,
288
379
 
@@ -291,11 +382,10 @@ export const VoiceCallConfigSchema = z
291
382
 
292
383
  /**
293
384
  * Maximum age of a call in seconds before it is automatically reaped.
294
- * Catches calls stuck in unexpected states (e.g., notify-mode calls that
295
- * never receive a terminal webhook). Set to 0 to disable.
296
- * Default: 0 (disabled). Recommended: 120-300 for production.
385
+ * Catches calls stuck before answer (for example, local mock calls that
386
+ * never receive provider webhooks). Set to 0 to disable.
297
387
  */
298
- staleCallReaperSeconds: z.number().int().nonnegative().default(0),
388
+ staleCallReaperSeconds: z.number().int().nonnegative().default(120),
299
389
 
300
390
  /** Silence timeout for end-of-speech detection (ms) */
301
391
  silenceTimeoutMs: z.number().int().positive().default(800),
@@ -312,7 +402,7 @@ export const VoiceCallConfigSchema = z
312
402
  /** Webhook server configuration */
313
403
  serve: VoiceCallServeConfigSchema,
314
404
 
315
- /** Tailscale exposure configuration (legacy, prefer tunnel config) */
405
+ /** @deprecated Prefer tunnel config. */
316
406
  tailscale: VoiceCallTailscaleConfigSchema,
317
407
 
318
408
  /** Tunnel configuration (unified ngrok/tailscale) */
@@ -324,23 +414,29 @@ export const VoiceCallConfigSchema = z
324
414
  /** Real-time audio streaming configuration */
325
415
  streaming: VoiceCallStreamingConfigSchema,
326
416
 
417
+ /** Realtime voice-to-voice configuration */
418
+ realtime: VoiceCallRealtimeConfigSchema,
419
+
420
+ /** Session memory scope for voice conversations. */
421
+ sessionScope: VoiceCallSessionScopeSchema.default("per-phone"),
422
+
327
423
  /** Public webhook URL override (if set, bypasses tunnel auto-detection) */
328
424
  publicUrl: z.string().url().optional(),
329
425
 
330
426
  /** Skip webhook signature verification (development only, NOT for production) */
331
427
  skipSignatureVerification: z.boolean().default(false),
332
428
 
333
- /** STT configuration */
334
- stt: SttConfigSchema,
335
-
336
429
  /** TTS override (deep-merges with core messages.tts) */
337
430
  tts: TtsConfigSchema,
338
431
 
339
432
  /** Store path for call logs */
340
433
  store: z.string().optional(),
341
434
 
342
- /** Model for generating voice responses (e.g., "anthropic/claude-sonnet-4", "openai/gpt-4o") */
343
- responseModel: z.string().default("openai/gpt-4o-mini"),
435
+ /** Agent ID to use for voice response generation. Defaults to "main". */
436
+ agentId: z.string().min(1).optional(),
437
+
438
+ /** Optional model override for generating voice responses. */
439
+ responseModel: z.string().optional(),
344
440
 
345
441
  /** System prompt for voice responses */
346
442
  responseSystemPrompt: z.string().optional(),
@@ -351,13 +447,19 @@ export const VoiceCallConfigSchema = z
351
447
  .strict();
352
448
 
353
449
  export type VoiceCallConfig = z.infer<typeof VoiceCallConfigSchema>;
354
- type DeepPartial<T> =
355
- T extends Array<infer U>
450
+ export type VoiceCallEffectiveConfigResult = {
451
+ config: VoiceCallConfig;
452
+ numberRouteKey?: string;
453
+ };
454
+ type DeepPartial<T> = T extends SecretInput
455
+ ? T
456
+ : T extends Array<infer U>
356
457
  ? DeepPartial<U>[]
357
458
  : T extends object
358
459
  ? { [K in keyof T]?: DeepPartial<T[K]> }
359
460
  : T;
360
461
  export type VoiceCallConfigInput = DeepPartial<VoiceCallConfig>;
462
+ const TWILIO_AUTH_TOKEN_PATH = "plugins.entries.voice-call.config.twilio.authToken";
361
463
 
362
464
  // -----------------------------------------------------------------------------
363
465
  // Configuration Helpers
@@ -369,6 +471,29 @@ function cloneDefaultVoiceCallConfig(): VoiceCallConfig {
369
471
  return structuredClone(DEFAULT_VOICE_CALL_CONFIG);
370
472
  }
371
473
 
474
+ function normalizeWebhookLikePath(pathname: string): string {
475
+ const trimmed = pathname.trim();
476
+ if (!trimmed) {
477
+ return "/";
478
+ }
479
+ const prefixed = trimmed.startsWith("/") ? trimmed : `/${trimmed}`;
480
+ if (prefixed === "/") {
481
+ return prefixed;
482
+ }
483
+ return prefixed.endsWith("/") ? prefixed.slice(0, -1) : prefixed;
484
+ }
485
+
486
+ function defaultRealtimeStreamPathForServePath(servePath: string): string {
487
+ const normalized = normalizeWebhookLikePath(servePath);
488
+ if (normalized.endsWith("/webhook")) {
489
+ return `${normalized.slice(0, -"/webhook".length)}/stream/realtime`;
490
+ }
491
+ if (normalized === "/") {
492
+ return "/voice/stream/realtime";
493
+ }
494
+ return `${normalized}/stream/realtime`;
495
+ }
496
+
372
497
  function normalizeVoiceCallTtsConfig(
373
498
  defaults: VoiceCallTtsConfig,
374
499
  overrides: DeepPartial<NonNullable<VoiceCallTtsConfig>> | undefined,
@@ -380,14 +505,116 @@ function normalizeVoiceCallTtsConfig(
380
505
  return TtsConfigSchema.parse(deepMergeDefined(defaults ?? {}, overrides ?? {}));
381
506
  }
382
507
 
508
+ function normalizePhoneRouteKey(phone: string | undefined): string {
509
+ return phone?.replace(/\D/g, "") ?? "";
510
+ }
511
+
512
+ export function resolveVoiceCallNumberRouteKey(
513
+ config: Pick<VoiceCallConfig, "numbers">,
514
+ phone: string | undefined,
515
+ ): string | undefined {
516
+ const routes = config.numbers;
517
+ if (!routes) {
518
+ return undefined;
519
+ }
520
+ if (phone && Object.prototype.hasOwnProperty.call(routes, phone)) {
521
+ return phone;
522
+ }
523
+
524
+ const normalizedPhone = normalizePhoneRouteKey(phone);
525
+ if (!normalizedPhone) {
526
+ return undefined;
527
+ }
528
+ return Object.keys(routes).find(
529
+ (routeKey) => normalizePhoneRouteKey(routeKey) === normalizedPhone,
530
+ );
531
+ }
532
+
533
+ export function resolveVoiceCallEffectiveConfig(
534
+ config: VoiceCallConfig,
535
+ phoneOrRouteKey: string | undefined,
536
+ ): VoiceCallEffectiveConfigResult {
537
+ const numberRouteKey = resolveVoiceCallNumberRouteKey(config, phoneOrRouteKey);
538
+ if (!numberRouteKey) {
539
+ return { config };
540
+ }
541
+
542
+ const route = config.numbers[numberRouteKey];
543
+ if (!route) {
544
+ return { config };
545
+ }
546
+
547
+ return {
548
+ numberRouteKey,
549
+ config: {
550
+ ...config,
551
+ ...route,
552
+ tts: normalizeVoiceCallTtsConfig(config.tts, route.tts),
553
+ numbers: config.numbers,
554
+ },
555
+ };
556
+ }
557
+
558
+ function sanitizeVoiceCallProviderConfigs(
559
+ value: Record<string, Record<string, unknown> | undefined> | undefined,
560
+ ): Record<string, Record<string, unknown>> {
561
+ if (!value) {
562
+ return {};
563
+ }
564
+ return Object.fromEntries(
565
+ Object.entries(value).filter(
566
+ (entry): entry is [string, Record<string, unknown>] => entry[1] !== undefined,
567
+ ),
568
+ );
569
+ }
570
+
571
+ function sanitizeVoiceCallNumberRoutes(
572
+ value: Record<string, unknown> | undefined,
573
+ ): Record<string, VoiceCallNumberRouteConfig> {
574
+ if (!value) {
575
+ return {};
576
+ }
577
+ return Object.fromEntries(
578
+ Object.entries(value)
579
+ .filter((entry): entry is [string, unknown] => entry[1] !== undefined)
580
+ .map(([key, route]) => [key, VoiceCallNumberRouteConfigSchema.parse(route)]),
581
+ );
582
+ }
583
+
584
+ export function resolveTwilioAuthToken(
585
+ config: Pick<VoiceCallConfig, "twilio">,
586
+ ): string | undefined {
587
+ return normalizeResolvedSecretInputString({
588
+ value: config.twilio?.authToken,
589
+ path: TWILIO_AUTH_TOKEN_PATH,
590
+ });
591
+ }
592
+
383
593
  export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
384
594
  const defaults = cloneDefaultVoiceCallConfig();
595
+ const serve = { ...defaults.serve, ...config.serve };
596
+ const streamingProvider = config.streaming?.provider;
597
+ const streamingProviders = sanitizeVoiceCallProviderConfigs(
598
+ config.streaming?.providers ?? defaults.streaming.providers,
599
+ );
600
+ const realtimeProvider = config.realtime?.provider ?? defaults.realtime.provider;
601
+ const realtimeProviders = sanitizeVoiceCallProviderConfigs(
602
+ config.realtime?.providers ?? defaults.realtime.providers,
603
+ );
604
+ const realtimeFastContext = {
605
+ ...defaults.realtime.fastContext,
606
+ ...config.realtime?.fastContext,
607
+ sources: config.realtime?.fastContext?.sources ?? defaults.realtime.fastContext.sources,
608
+ };
385
609
  return {
386
610
  ...defaults,
387
611
  ...config,
388
612
  allowFrom: config.allowFrom ?? defaults.allowFrom,
613
+ numbers: sanitizeVoiceCallNumberRoutes(
614
+ (config.numbers ?? defaults.numbers) as Record<string, unknown>,
615
+ ),
389
616
  outbound: { ...defaults.outbound, ...config.outbound },
390
- serve: { ...defaults.serve, ...config.serve },
617
+ serve,
391
618
  tailscale: { ...defaults.tailscale, ...config.tailscale },
392
619
  tunnel: { ...defaults.tunnel, ...config.tunnel },
393
620
  webhookSecurity: {
@@ -397,12 +624,45 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal
397
624
  trustedProxyIPs:
398
625
  config.webhookSecurity?.trustedProxyIPs ?? defaults.webhookSecurity.trustedProxyIPs,
399
626
  },
400
- streaming: { ...defaults.streaming, ...config.streaming },
401
- stt: { ...defaults.stt, ...config.stt },
627
+ streaming: {
628
+ ...defaults.streaming,
629
+ ...config.streaming,
630
+ provider: streamingProvider,
631
+ providers: streamingProviders,
632
+ },
633
+ realtime: {
634
+ ...defaults.realtime,
635
+ ...config.realtime,
636
+ provider: realtimeProvider,
637
+ streamPath:
638
+ config.realtime?.streamPath ??
639
+ defaultRealtimeStreamPathForServePath(serve.path ?? defaults.serve.path),
640
+ tools:
641
+ (config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? defaults.realtime.tools,
642
+ fastContext: realtimeFastContext,
643
+ providers: realtimeProviders,
644
+ },
402
645
  tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts),
403
646
  };
404
647
  }
405
648
 
649
+ export function resolveVoiceCallSessionKey(params: {
650
+ config: Pick<VoiceCallConfig, "sessionScope">;
651
+ callId: string;
652
+ phone?: string;
653
+ explicitSessionKey?: string;
654
+ }): string {
655
+ const explicit = params.explicitSessionKey?.trim();
656
+ if (explicit) {
657
+ return explicit;
658
+ }
659
+ if (params.config.sessionScope === "per-call") {
660
+ return `voice:call:${params.callId}`;
661
+ }
662
+ const normalizedPhone = params.phone?.replace(/\D/g, "");
663
+ return normalizedPhone ? `voice:${normalizedPhone}` : `voice:${params.callId}`;
664
+ }
665
+
406
666
  /**
407
667
  * Resolves the configuration by merging environment variables into missing fields.
408
668
  * Returns a new configuration object with environment variables applied.
@@ -420,6 +680,7 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC
420
680
 
421
681
  // Twilio
422
682
  if (resolved.provider === "twilio") {
683
+ resolved.fromNumber = resolved.fromNumber ?? process.env.TWILIO_FROM_NUMBER;
423
684
  resolved.twilio = resolved.twilio ?? {};
424
685
  resolved.twilio.accountSid = resolved.twilio.accountSid ?? process.env.TWILIO_ACCOUNT_SID;
425
686
  resolved.twilio.authToken = resolved.twilio.authToken ?? process.env.TWILIO_AUTH_TOKEN;
@@ -474,7 +735,11 @@ export function validateProviderConfig(config: VoiceCallConfig): {
474
735
  }
475
736
 
476
737
  if (!config.fromNumber && config.provider !== "mock") {
477
- errors.push("plugins.entries.voice-call.config.fromNumber is required");
738
+ errors.push(
739
+ config.provider === "twilio"
740
+ ? "plugins.entries.voice-call.config.fromNumber is required (or set TWILIO_FROM_NUMBER env)"
741
+ : "plugins.entries.voice-call.config.fromNumber is required",
742
+ );
478
743
  }
479
744
 
480
745
  if (config.provider === "telnyx") {
@@ -501,7 +766,7 @@ export function validateProviderConfig(config: VoiceCallConfig): {
501
766
  "plugins.entries.voice-call.config.twilio.accountSid is required (or set TWILIO_ACCOUNT_SID env)",
502
767
  );
503
768
  }
504
- if (!config.twilio?.authToken) {
769
+ if (!hasConfiguredSecretInput(config.twilio?.authToken)) {
505
770
  errors.push(
506
771
  "plugins.entries.voice-call.config.twilio.authToken is required (or set TWILIO_AUTH_TOKEN env)",
507
772
  );
@@ -521,5 +786,23 @@ export function validateProviderConfig(config: VoiceCallConfig): {
521
786
  }
522
787
  }
523
788
 
789
+ if (config.realtime.enabled && config.inboundPolicy === "disabled") {
790
+ errors.push(
791
+ 'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true',
792
+ );
793
+ }
794
+
795
+ if (config.realtime.enabled && config.streaming.enabled) {
796
+ errors.push(
797
+ "plugins.entries.voice-call.config.realtime.enabled and plugins.entries.voice-call.config.streaming.enabled cannot both be true",
798
+ );
799
+ }
800
+
801
+ if (config.realtime.enabled && config.provider && config.provider !== "twilio") {
802
+ errors.push(
803
+ 'plugins.entries.voice-call.config.provider must be "twilio" when realtime.enabled is true',
804
+ );
805
+ }
806
+
524
807
  return { valid: errors.length === 0, errors };
525
808
  }