@openclaw/voice-call 2026.3.13 → 2026.5.1-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +25 -5
  2. package/api.ts +16 -0
  3. package/cli-metadata.ts +10 -0
  4. package/config-api.ts +12 -0
  5. package/index.test.ts +866 -0
  6. package/index.ts +353 -148
  7. package/openclaw.plugin.json +336 -157
  8. package/package.json +33 -5
  9. package/runtime-api.ts +20 -0
  10. package/runtime-entry.ts +1 -0
  11. package/setup-api.ts +47 -0
  12. package/src/allowlist.test.ts +18 -0
  13. package/src/cli.ts +533 -68
  14. package/src/config-compat.test.ts +120 -0
  15. package/src/config-compat.ts +227 -0
  16. package/src/config.test.ts +160 -12
  17. package/src/config.ts +243 -74
  18. package/src/core-bridge.ts +2 -147
  19. package/src/deep-merge.test.ts +40 -0
  20. package/src/gateway-continue-operation.ts +200 -0
  21. package/src/http-headers.ts +6 -3
  22. package/src/manager/context.ts +6 -5
  23. package/src/manager/events.test.ts +179 -19
  24. package/src/manager/events.ts +48 -30
  25. package/src/manager/lifecycle.ts +53 -0
  26. package/src/manager/lookup.test.ts +52 -0
  27. package/src/manager/outbound.test.ts +464 -0
  28. package/src/manager/outbound.ts +148 -55
  29. package/src/manager/store.ts +18 -6
  30. package/src/manager/timers.test.ts +129 -0
  31. package/src/manager/timers.ts +4 -3
  32. package/src/manager/twiml.test.ts +13 -0
  33. package/src/manager/twiml.ts +8 -0
  34. package/src/manager.closed-loop.test.ts +30 -12
  35. package/src/manager.inbound-allowlist.test.ts +77 -10
  36. package/src/manager.notify.test.ts +344 -20
  37. package/src/manager.restore.test.ts +95 -8
  38. package/src/manager.test-harness.ts +8 -6
  39. package/src/manager.ts +79 -5
  40. package/src/media-stream.test.ts +578 -81
  41. package/src/media-stream.ts +235 -54
  42. package/src/providers/base.ts +19 -0
  43. package/src/providers/mock.ts +7 -1
  44. package/src/providers/plivo.test.ts +50 -6
  45. package/src/providers/plivo.ts +14 -6
  46. package/src/providers/shared/call-status.ts +2 -1
  47. package/src/providers/shared/guarded-json-api.test.ts +106 -0
  48. package/src/providers/shared/guarded-json-api.ts +1 -1
  49. package/src/providers/telnyx.test.ts +178 -6
  50. package/src/providers/telnyx.ts +40 -3
  51. package/src/providers/twilio/api.test.ts +145 -0
  52. package/src/providers/twilio/api.ts +67 -16
  53. package/src/providers/twilio/twiml-policy.ts +6 -10
  54. package/src/providers/twilio/webhook.ts +1 -1
  55. package/src/providers/twilio.test.ts +425 -25
  56. package/src/providers/twilio.ts +230 -77
  57. package/src/providers/twilio.types.ts +17 -0
  58. package/src/realtime-defaults.ts +3 -0
  59. package/src/realtime-fast-context.test.ts +88 -0
  60. package/src/realtime-fast-context.ts +165 -0
  61. package/src/realtime-transcription.runtime.ts +4 -0
  62. package/src/realtime-voice.runtime.ts +5 -0
  63. package/src/response-generator.test.ts +277 -0
  64. package/src/response-generator.ts +186 -40
  65. package/src/response-model.test.ts +71 -0
  66. package/src/response-model.ts +23 -0
  67. package/src/runtime.test.ts +351 -0
  68. package/src/runtime.ts +254 -24
  69. package/src/telephony-audio.test.ts +61 -0
  70. package/src/telephony-audio.ts +1 -79
  71. package/src/telephony-tts.test.ts +133 -12
  72. package/src/telephony-tts.ts +155 -2
  73. package/src/test-fixtures.ts +26 -7
  74. package/src/tts-provider-voice.test.ts +34 -0
  75. package/src/tts-provider-voice.ts +21 -0
  76. package/src/tunnel.test.ts +166 -0
  77. package/src/tunnel.ts +1 -1
  78. package/src/types.ts +24 -37
  79. package/src/utils.test.ts +17 -0
  80. package/src/voice-mapping.test.ts +34 -0
  81. package/src/voice-mapping.ts +3 -2
  82. package/src/webhook/realtime-handler.test.ts +598 -0
  83. package/src/webhook/realtime-handler.ts +485 -0
  84. package/src/webhook/stale-call-reaper.test.ts +88 -0
  85. package/src/webhook/stale-call-reaper.ts +5 -0
  86. package/src/webhook/tailscale.test.ts +214 -0
  87. package/src/webhook/tailscale.ts +19 -5
  88. package/src/webhook-exposure.test.ts +33 -0
  89. package/src/webhook-exposure.ts +84 -0
  90. package/src/webhook-security.test.ts +172 -21
  91. package/src/webhook-security.ts +43 -29
  92. package/src/webhook.hangup-once.lifecycle.test.ts +135 -0
  93. package/src/webhook.test.ts +1145 -27
  94. package/src/webhook.ts +513 -100
  95. package/src/webhook.types.ts +5 -0
  96. package/src/websocket-test-support.ts +72 -0
  97. package/tsconfig.json +16 -0
  98. package/CHANGELOG.md +0 -121
  99. package/src/providers/index.ts +0 -10
  100. package/src/providers/stt-openai-realtime.test.ts +0 -42
  101. package/src/providers/stt-openai-realtime.ts +0 -311
  102. package/src/providers/tts-openai.test.ts +0 -43
  103. package/src/providers/tts-openai.ts +0 -221
package/src/config.ts CHANGED
@@ -1,11 +1,14 @@
1
+ import { REALTIME_VOICE_AGENT_CONSULT_TOOL_POLICIES } from "openclaw/plugin-sdk/realtime-voice";
1
2
  import {
2
- TtsAutoSchema,
3
- TtsConfigSchema,
4
- TtsModeSchema,
5
- TtsProviderSchema,
6
- } from "openclaw/plugin-sdk/voice-call";
7
- import { z } from "zod";
3
+ buildSecretInputSchema,
4
+ hasConfiguredSecretInput,
5
+ normalizeResolvedSecretInputString,
6
+ type SecretInput,
7
+ } from "openclaw/plugin-sdk/secret-input";
8
+ import { z } from "openclaw/plugin-sdk/zod";
9
+ import { TtsConfigSchema } from "../api.js";
8
10
  import { deepMergeDefined } from "./deep-merge.js";
11
+ import { DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS } from "./realtime-defaults.js";
9
12
 
10
13
  // -----------------------------------------------------------------------------
11
14
  // Phone Number Validation
@@ -15,7 +18,7 @@ import { deepMergeDefined } from "./deep-merge.js";
15
18
  * E.164 phone number format: +[country code][number]
16
19
  * Examples use 555 prefix (reserved for fictional numbers)
17
20
  */
18
- export const E164Schema = z
21
+ const E164Schema = z
19
22
  .string()
20
23
  .regex(/^\+[1-9]\d{1,14}$/, "Expected E.164 format, e.g. +15550001234");
21
24
 
@@ -30,14 +33,15 @@ export const E164Schema = z
30
33
  * - "pairing": Unknown callers can request pairing (future)
31
34
  * - "open": Accept all inbound calls (dangerous!)
32
35
  */
33
- export const InboundPolicySchema = z.enum(["disabled", "allowlist", "pairing", "open"]);
34
- export type InboundPolicy = z.infer<typeof InboundPolicySchema>;
36
+ const InboundPolicySchema = z.enum(["disabled", "allowlist", "pairing", "open"]);
35
37
 
36
38
  // -----------------------------------------------------------------------------
37
39
  // Provider-Specific Configuration
38
40
  // -----------------------------------------------------------------------------
39
41
 
40
- export const TelnyxConfigSchema = z
42
+ const SecretInputSchema = buildSecretInputSchema();
43
+
44
+ const TelnyxConfigSchema = z
41
45
  .object({
42
46
  /** Telnyx API v2 key */
43
47
  apiKey: z.string().min(1).optional(),
@@ -49,17 +53,16 @@ export const TelnyxConfigSchema = z
49
53
  .strict();
50
54
  export type TelnyxConfig = z.infer<typeof TelnyxConfigSchema>;
51
55
 
52
- export const TwilioConfigSchema = z
56
+ const TwilioConfigSchema = z
53
57
  .object({
54
58
  /** Twilio Account SID */
55
59
  accountSid: z.string().min(1).optional(),
56
60
  /** Twilio Auth Token */
57
- authToken: z.string().min(1).optional(),
61
+ authToken: SecretInputSchema.optional(),
58
62
  })
59
63
  .strict();
60
- export type TwilioConfig = z.infer<typeof TwilioConfigSchema>;
61
64
 
62
- export const PlivoConfigSchema = z
65
+ const PlivoConfigSchema = z
63
66
  .object({
64
67
  /** Plivo Auth ID (starts with MA/SA) */
65
68
  authId: z.string().min(1).optional(),
@@ -69,29 +72,13 @@ export const PlivoConfigSchema = z
69
72
  .strict();
70
73
  export type PlivoConfig = z.infer<typeof PlivoConfigSchema>;
71
74
 
72
- // -----------------------------------------------------------------------------
73
- // STT/TTS Configuration
74
- // -----------------------------------------------------------------------------
75
-
76
- export const SttConfigSchema = z
77
- .object({
78
- /** STT provider (currently only OpenAI supported) */
79
- provider: z.literal("openai").default("openai"),
80
- /** Whisper model to use */
81
- model: z.string().min(1).default("whisper-1"),
82
- })
83
- .strict()
84
- .default({ provider: "openai", model: "whisper-1" });
85
- export type SttConfig = z.infer<typeof SttConfigSchema>;
86
-
87
- export { TtsAutoSchema, TtsConfigSchema, TtsModeSchema, TtsProviderSchema };
88
75
  export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
89
76
 
90
77
  // -----------------------------------------------------------------------------
91
78
  // Webhook Server Configuration
92
79
  // -----------------------------------------------------------------------------
93
80
 
94
- export const VoiceCallServeConfigSchema = z
81
+ const VoiceCallServeConfigSchema = z
95
82
  .object({
96
83
  /** Port to listen on */
97
84
  port: z.number().int().positive().default(3334),
@@ -102,9 +89,8 @@ export const VoiceCallServeConfigSchema = z
102
89
  })
103
90
  .strict()
104
91
  .default({ port: 3334, bind: "127.0.0.1", path: "/voice/webhook" });
105
- export type VoiceCallServeConfig = z.infer<typeof VoiceCallServeConfigSchema>;
106
92
 
107
- export const VoiceCallTailscaleConfigSchema = z
93
+ const VoiceCallTailscaleConfigSchema = z
108
94
  .object({
109
95
  /**
110
96
  * Tailscale exposure mode:
@@ -118,13 +104,12 @@ export const VoiceCallTailscaleConfigSchema = z
118
104
  })
119
105
  .strict()
120
106
  .default({ mode: "off", path: "/voice/webhook" });
121
- export type VoiceCallTailscaleConfig = z.infer<typeof VoiceCallTailscaleConfigSchema>;
122
107
 
123
108
  // -----------------------------------------------------------------------------
124
109
  // Tunnel Configuration (unified ngrok/tailscale)
125
110
  // -----------------------------------------------------------------------------
126
111
 
127
- export const VoiceCallTunnelConfigSchema = z
112
+ const VoiceCallTunnelConfigSchema = z
128
113
  .object({
129
114
  /**
130
115
  * Tunnel provider:
@@ -149,13 +134,12 @@ export const VoiceCallTunnelConfigSchema = z
149
134
  })
150
135
  .strict()
151
136
  .default({ provider: "none", allowNgrokFreeTierLoopbackBypass: false });
152
- export type VoiceCallTunnelConfig = z.infer<typeof VoiceCallTunnelConfigSchema>;
153
137
 
154
138
  // -----------------------------------------------------------------------------
155
139
  // Webhook Security Configuration
156
140
  // -----------------------------------------------------------------------------
157
141
 
158
- export const VoiceCallWebhookSecurityConfigSchema = z
142
+ const VoiceCallWebhookSecurityConfigSchema = z
159
143
  .object({
160
144
  /**
161
145
  * Allowed hostnames for webhook URL reconstruction.
@@ -186,10 +170,10 @@ export type WebhookSecurityConfig = z.infer<typeof VoiceCallWebhookSecurityConfi
186
170
  * - "notify": Deliver message and auto-hangup after delay (one-way notification)
187
171
  * - "conversation": Stay open for back-and-forth until explicit end or timeout
188
172
  */
189
- export const CallModeSchema = z.enum(["notify", "conversation"]);
173
+ const CallModeSchema = z.enum(["notify", "conversation"]);
190
174
  export type CallMode = z.infer<typeof CallModeSchema>;
191
175
 
192
- export const OutboundConfigSchema = z
176
+ const OutboundConfigSchema = z
193
177
  .object({
194
178
  /** Default call mode for outbound calls */
195
179
  defaultMode: CallModeSchema.default("notify"),
@@ -198,28 +182,115 @@ export const OutboundConfigSchema = z
198
182
  })
199
183
  .strict()
200
184
  .default({ defaultMode: "notify", notifyHangupDelaySec: 3 });
201
- export type OutboundConfig = z.infer<typeof OutboundConfigSchema>;
202
185
 
203
186
  // -----------------------------------------------------------------------------
204
- // Streaming Configuration (OpenAI Realtime STT)
187
+ // Realtime Voice Configuration
205
188
  // -----------------------------------------------------------------------------
206
189
 
207
- export const VoiceCallStreamingConfigSchema = z
190
+ const RealtimeToolSchema = z
191
+ .object({
192
+ type: z.literal("function"),
193
+ name: z.string().min(1),
194
+ description: z.string(),
195
+ parameters: z.object({
196
+ type: z.literal("object"),
197
+ properties: z.record(z.string(), z.unknown()),
198
+ required: z.array(z.string()).optional(),
199
+ }),
200
+ })
201
+ .strict();
202
+ type RealtimeToolConfig = z.infer<typeof RealtimeToolSchema>;
203
+
204
+ const VoiceCallRealtimeProvidersConfigSchema = z
205
+ .record(z.string(), z.record(z.string(), z.unknown()))
206
+ .default({});
207
+
208
+ const VoiceCallRealtimeToolPolicySchema = z.enum(REALTIME_VOICE_AGENT_CONSULT_TOOL_POLICIES);
209
+
210
+ const VoiceCallRealtimeFastContextSourceSchema = z.enum(["memory", "sessions"]);
211
+
212
+ const VoiceCallRealtimeFastContextConfigSchema = z
213
+ .object({
214
+ /** Enable bounded memory/session lookup before the full consult agent. */
215
+ enabled: z.boolean().default(false),
216
+ /** Hard deadline for the fast context lookup. */
217
+ timeoutMs: z.number().int().positive().default(800),
218
+ /** Maximum memory/session hits to inject into the realtime tool result. */
219
+ maxResults: z.number().int().positive().default(3),
220
+ /** Indexed sources used by the fast context lookup. */
221
+ sources: z
222
+ .array(VoiceCallRealtimeFastContextSourceSchema)
223
+ .min(1)
224
+ .default(["memory", "sessions"]),
225
+ /** Fall back to the full agent consult when fast context has no answer. */
226
+ fallbackToConsult: z.boolean().default(false),
227
+ })
228
+ .strict()
229
+ .default({
230
+ enabled: false,
231
+ timeoutMs: 800,
232
+ maxResults: 3,
233
+ sources: ["memory", "sessions"],
234
+ fallbackToConsult: false,
235
+ });
236
+ export type VoiceCallRealtimeFastContextConfig = z.infer<
237
+ typeof VoiceCallRealtimeFastContextConfigSchema
238
+ >;
239
+
240
+ const VoiceCallStreamingProvidersConfigSchema = z
241
+ .record(z.string(), z.record(z.string(), z.unknown()))
242
+ .default({});
243
+
244
+ const VoiceCallRealtimeConfigSchema = z
245
+ .object({
246
+ /** Enable realtime voice-to-voice mode. */
247
+ enabled: z.boolean().default(false),
248
+ /** Provider id from registered realtime voice providers. */
249
+ provider: z.string().min(1).optional(),
250
+ /** Optional override for the local WebSocket route path. */
251
+ streamPath: z.string().min(1).optional(),
252
+ /** System instructions passed to the realtime provider. */
253
+ instructions: z.string().default(DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS),
254
+ /** Tool policy for the shared OpenClaw agent consult tool. */
255
+ toolPolicy: VoiceCallRealtimeToolPolicySchema.default("safe-read-only"),
256
+ /** Tool definitions exposed to the realtime provider. */
257
+ tools: z.array(RealtimeToolSchema).default([]),
258
+ /** Low-latency memory/session context for the consult tool. */
259
+ fastContext: VoiceCallRealtimeFastContextConfigSchema,
260
+ /** Provider-owned raw config blobs keyed by provider id. */
261
+ providers: VoiceCallRealtimeProvidersConfigSchema,
262
+ })
263
+ .strict()
264
+ .default({
265
+ enabled: false,
266
+ instructions: DEFAULT_VOICE_CALL_REALTIME_INSTRUCTIONS,
267
+ toolPolicy: "safe-read-only",
268
+ tools: [],
269
+ fastContext: {
270
+ enabled: false,
271
+ timeoutMs: 800,
272
+ maxResults: 3,
273
+ sources: ["memory", "sessions"],
274
+ fallbackToConsult: false,
275
+ },
276
+ providers: {},
277
+ });
278
+ export type VoiceCallRealtimeConfig = z.infer<typeof VoiceCallRealtimeConfigSchema>;
279
+
280
+ // -----------------------------------------------------------------------------
281
+ // Streaming Configuration (Realtime Transcription)
282
+ // -----------------------------------------------------------------------------
283
+
284
+ const VoiceCallStreamingConfigSchema = z
208
285
  .object({
209
286
  /** Enable real-time audio streaming (requires WebSocket support) */
210
287
  enabled: z.boolean().default(false),
211
- /** STT provider for real-time transcription */
212
- sttProvider: z.enum(["openai-realtime"]).default("openai-realtime"),
213
- /** OpenAI API key for Realtime API (uses OPENAI_API_KEY env if not set) */
214
- openaiApiKey: z.string().min(1).optional(),
215
- /** OpenAI transcription model (default: gpt-4o-transcribe) */
216
- sttModel: z.string().min(1).default("gpt-4o-transcribe"),
217
- /** VAD silence duration in ms before considering speech ended */
218
- silenceDurationMs: z.number().int().positive().default(800),
219
- /** VAD threshold 0-1 (higher = less sensitive) */
220
- vadThreshold: z.number().min(0).max(1).default(0.5),
288
+ /** Provider id from registered realtime transcription providers. */
289
+ provider: z.string().min(1).optional(),
221
290
  /** WebSocket path for media stream connections */
222
291
  streamPath: z.string().min(1).default("/voice/stream"),
292
+ /** Provider-owned raw config blobs keyed by provider id. */
293
+ providers: VoiceCallStreamingProvidersConfigSchema,
223
294
  /**
224
295
  * Close unauthenticated media stream sockets if no valid `start` frame arrives in time.
225
296
  * Protects against pre-auth idle connection hold attacks.
@@ -235,17 +306,13 @@ export const VoiceCallStreamingConfigSchema = z
235
306
  .strict()
236
307
  .default({
237
308
  enabled: false,
238
- sttProvider: "openai-realtime",
239
- sttModel: "gpt-4o-transcribe",
240
- silenceDurationMs: 800,
241
- vadThreshold: 0.5,
242
309
  streamPath: "/voice/stream",
310
+ providers: {},
243
311
  preStartTimeoutMs: 5000,
244
312
  maxPendingConnections: 32,
245
313
  maxPendingConnectionsPerIp: 4,
246
314
  maxConnections: 128,
247
315
  });
248
- export type VoiceCallStreamingConfig = z.infer<typeof VoiceCallStreamingConfigSchema>;
249
316
 
250
317
  // -----------------------------------------------------------------------------
251
318
  // Main Voice Call Configuration
@@ -291,11 +358,10 @@ export const VoiceCallConfigSchema = z
291
358
 
292
359
  /**
293
360
  * Maximum age of a call in seconds before it is automatically reaped.
294
- * Catches calls stuck in unexpected states (e.g., notify-mode calls that
295
- * never receive a terminal webhook). Set to 0 to disable.
296
- * Default: 0 (disabled). Recommended: 120-300 for production.
361
+ * Catches calls stuck before answer (for example, local mock calls that
362
+ * never receive provider webhooks). Set to 0 to disable.
297
363
  */
298
- staleCallReaperSeconds: z.number().int().nonnegative().default(0),
364
+ staleCallReaperSeconds: z.number().int().nonnegative().default(120),
299
365
 
300
366
  /** Silence timeout for end-of-speech detection (ms) */
301
367
  silenceTimeoutMs: z.number().int().positive().default(800),
@@ -312,7 +378,7 @@ export const VoiceCallConfigSchema = z
312
378
  /** Webhook server configuration */
313
379
  serve: VoiceCallServeConfigSchema,
314
380
 
315
- /** Tailscale exposure configuration (legacy, prefer tunnel config) */
381
+ /** @deprecated Prefer tunnel config. */
316
382
  tailscale: VoiceCallTailscaleConfigSchema,
317
383
 
318
384
  /** Tunnel configuration (unified ngrok/tailscale) */
@@ -324,23 +390,26 @@ export const VoiceCallConfigSchema = z
324
390
  /** Real-time audio streaming configuration */
325
391
  streaming: VoiceCallStreamingConfigSchema,
326
392
 
393
+ /** Realtime voice-to-voice configuration */
394
+ realtime: VoiceCallRealtimeConfigSchema,
395
+
327
396
  /** Public webhook URL override (if set, bypasses tunnel auto-detection) */
328
397
  publicUrl: z.string().url().optional(),
329
398
 
330
399
  /** Skip webhook signature verification (development only, NOT for production) */
331
400
  skipSignatureVerification: z.boolean().default(false),
332
401
 
333
- /** STT configuration */
334
- stt: SttConfigSchema,
335
-
336
402
  /** TTS override (deep-merges with core messages.tts) */
337
403
  tts: TtsConfigSchema,
338
404
 
339
405
  /** Store path for call logs */
340
406
  store: z.string().optional(),
341
407
 
342
- /** Model for generating voice responses (e.g., "anthropic/claude-sonnet-4", "openai/gpt-4o") */
343
- responseModel: z.string().default("openai/gpt-4o-mini"),
408
+ /** Agent ID to use for voice response generation. Defaults to "main". */
409
+ agentId: z.string().min(1).optional(),
410
+
411
+ /** Optional model override for generating voice responses. */
412
+ responseModel: z.string().optional(),
344
413
 
345
414
  /** System prompt for voice responses */
346
415
  responseSystemPrompt: z.string().optional(),
@@ -351,13 +420,15 @@ export const VoiceCallConfigSchema = z
351
420
  .strict();
352
421
 
353
422
  export type VoiceCallConfig = z.infer<typeof VoiceCallConfigSchema>;
354
- type DeepPartial<T> =
355
- T extends Array<infer U>
423
+ type DeepPartial<T> = T extends SecretInput
424
+ ? T
425
+ : T extends Array<infer U>
356
426
  ? DeepPartial<U>[]
357
427
  : T extends object
358
428
  ? { [K in keyof T]?: DeepPartial<T[K]> }
359
429
  : T;
360
430
  export type VoiceCallConfigInput = DeepPartial<VoiceCallConfig>;
431
+ const TWILIO_AUTH_TOKEN_PATH = "plugins.entries.voice-call.config.twilio.authToken";
361
432
 
362
433
  // -----------------------------------------------------------------------------
363
434
  // Configuration Helpers
@@ -369,6 +440,29 @@ function cloneDefaultVoiceCallConfig(): VoiceCallConfig {
369
440
  return structuredClone(DEFAULT_VOICE_CALL_CONFIG);
370
441
  }
371
442
 
443
+ function normalizeWebhookLikePath(pathname: string): string {
444
+ const trimmed = pathname.trim();
445
+ if (!trimmed) {
446
+ return "/";
447
+ }
448
+ const prefixed = trimmed.startsWith("/") ? trimmed : `/${trimmed}`;
449
+ if (prefixed === "/") {
450
+ return prefixed;
451
+ }
452
+ return prefixed.endsWith("/") ? prefixed.slice(0, -1) : prefixed;
453
+ }
454
+
455
+ function defaultRealtimeStreamPathForServePath(servePath: string): string {
456
+ const normalized = normalizeWebhookLikePath(servePath);
457
+ if (normalized.endsWith("/webhook")) {
458
+ return `${normalized.slice(0, -"/webhook".length)}/stream/realtime`;
459
+ }
460
+ if (normalized === "/") {
461
+ return "/voice/stream/realtime";
462
+ }
463
+ return `${normalized}/stream/realtime`;
464
+ }
465
+
372
466
  function normalizeVoiceCallTtsConfig(
373
467
  defaults: VoiceCallTtsConfig,
374
468
  overrides: DeepPartial<NonNullable<VoiceCallTtsConfig>> | undefined,
@@ -380,14 +474,50 @@ function normalizeVoiceCallTtsConfig(
380
474
  return TtsConfigSchema.parse(deepMergeDefined(defaults ?? {}, overrides ?? {}));
381
475
  }
382
476
 
477
+ function sanitizeVoiceCallProviderConfigs(
478
+ value: Record<string, Record<string, unknown> | undefined> | undefined,
479
+ ): Record<string, Record<string, unknown>> {
480
+ if (!value) {
481
+ return {};
482
+ }
483
+ return Object.fromEntries(
484
+ Object.entries(value).filter(
485
+ (entry): entry is [string, Record<string, unknown>] => entry[1] !== undefined,
486
+ ),
487
+ );
488
+ }
489
+
490
+ export function resolveTwilioAuthToken(
491
+ config: Pick<VoiceCallConfig, "twilio">,
492
+ ): string | undefined {
493
+ return normalizeResolvedSecretInputString({
494
+ value: config.twilio?.authToken,
495
+ path: TWILIO_AUTH_TOKEN_PATH,
496
+ });
497
+ }
498
+
383
499
  export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
384
500
  const defaults = cloneDefaultVoiceCallConfig();
501
+ const serve = { ...defaults.serve, ...config.serve };
502
+ const streamingProvider = config.streaming?.provider;
503
+ const streamingProviders = sanitizeVoiceCallProviderConfigs(
504
+ config.streaming?.providers ?? defaults.streaming.providers,
505
+ );
506
+ const realtimeProvider = config.realtime?.provider ?? defaults.realtime.provider;
507
+ const realtimeProviders = sanitizeVoiceCallProviderConfigs(
508
+ config.realtime?.providers ?? defaults.realtime.providers,
509
+ );
510
+ const realtimeFastContext = {
511
+ ...defaults.realtime.fastContext,
512
+ ...config.realtime?.fastContext,
513
+ sources: config.realtime?.fastContext?.sources ?? defaults.realtime.fastContext.sources,
514
+ };
385
515
  return {
386
516
  ...defaults,
387
517
  ...config,
388
518
  allowFrom: config.allowFrom ?? defaults.allowFrom,
389
519
  outbound: { ...defaults.outbound, ...config.outbound },
390
- serve: { ...defaults.serve, ...config.serve },
520
+ serve,
391
521
  tailscale: { ...defaults.tailscale, ...config.tailscale },
392
522
  tunnel: { ...defaults.tunnel, ...config.tunnel },
393
523
  webhookSecurity: {
@@ -397,8 +527,24 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal
397
527
  trustedProxyIPs:
398
528
  config.webhookSecurity?.trustedProxyIPs ?? defaults.webhookSecurity.trustedProxyIPs,
399
529
  },
400
- streaming: { ...defaults.streaming, ...config.streaming },
401
- stt: { ...defaults.stt, ...config.stt },
530
+ streaming: {
531
+ ...defaults.streaming,
532
+ ...config.streaming,
533
+ provider: streamingProvider,
534
+ providers: streamingProviders,
535
+ },
536
+ realtime: {
537
+ ...defaults.realtime,
538
+ ...config.realtime,
539
+ provider: realtimeProvider,
540
+ streamPath:
541
+ config.realtime?.streamPath ??
542
+ defaultRealtimeStreamPathForServePath(serve.path ?? defaults.serve.path),
543
+ tools:
544
+ (config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? defaults.realtime.tools,
545
+ fastContext: realtimeFastContext,
546
+ providers: realtimeProviders,
547
+ },
402
548
  tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts),
403
549
  };
404
550
  }
@@ -420,6 +566,7 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC
420
566
 
421
567
  // Twilio
422
568
  if (resolved.provider === "twilio") {
569
+ resolved.fromNumber = resolved.fromNumber ?? process.env.TWILIO_FROM_NUMBER;
423
570
  resolved.twilio = resolved.twilio ?? {};
424
571
  resolved.twilio.accountSid = resolved.twilio.accountSid ?? process.env.TWILIO_ACCOUNT_SID;
425
572
  resolved.twilio.authToken = resolved.twilio.authToken ?? process.env.TWILIO_AUTH_TOKEN;
@@ -474,7 +621,11 @@ export function validateProviderConfig(config: VoiceCallConfig): {
474
621
  }
475
622
 
476
623
  if (!config.fromNumber && config.provider !== "mock") {
477
- errors.push("plugins.entries.voice-call.config.fromNumber is required");
624
+ errors.push(
625
+ config.provider === "twilio"
626
+ ? "plugins.entries.voice-call.config.fromNumber is required (or set TWILIO_FROM_NUMBER env)"
627
+ : "plugins.entries.voice-call.config.fromNumber is required",
628
+ );
478
629
  }
479
630
 
480
631
  if (config.provider === "telnyx") {
@@ -501,7 +652,7 @@ export function validateProviderConfig(config: VoiceCallConfig): {
501
652
  "plugins.entries.voice-call.config.twilio.accountSid is required (or set TWILIO_ACCOUNT_SID env)",
502
653
  );
503
654
  }
504
- if (!config.twilio?.authToken) {
655
+ if (!hasConfiguredSecretInput(config.twilio?.authToken)) {
505
656
  errors.push(
506
657
  "plugins.entries.voice-call.config.twilio.authToken is required (or set TWILIO_AUTH_TOKEN env)",
507
658
  );
@@ -521,5 +672,23 @@ export function validateProviderConfig(config: VoiceCallConfig): {
521
672
  }
522
673
  }
523
674
 
675
+ if (config.realtime.enabled && config.inboundPolicy === "disabled") {
676
+ errors.push(
677
+ 'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true',
678
+ );
679
+ }
680
+
681
+ if (config.realtime.enabled && config.streaming.enabled) {
682
+ errors.push(
683
+ "plugins.entries.voice-call.config.realtime.enabled and plugins.entries.voice-call.config.streaming.enabled cannot both be true",
684
+ );
685
+ }
686
+
687
+ if (config.realtime.enabled && config.provider && config.provider !== "twilio") {
688
+ errors.push(
689
+ 'plugins.entries.voice-call.config.provider must be "twilio" when realtime.enabled is true',
690
+ );
691
+ }
692
+
524
693
  return { valid: errors.length === 0, errors };
525
694
  }
@@ -1,6 +1,4 @@
1
- import fs from "node:fs";
2
- import path from "node:path";
3
- import { fileURLToPath, pathToFileURL } from "node:url";
1
+ import type { OpenClawPluginApi } from "../api.js";
4
2
  import type { VoiceCallTtsConfig } from "./config.js";
5
3
 
6
4
  export type CoreConfig = {
@@ -13,147 +11,4 @@ export type CoreConfig = {
13
11
  [key: string]: unknown;
14
12
  };
15
13
 
16
- type CoreAgentDeps = {
17
- resolveAgentDir: (cfg: CoreConfig, agentId: string) => string;
18
- resolveAgentWorkspaceDir: (cfg: CoreConfig, agentId: string) => string;
19
- resolveAgentIdentity: (
20
- cfg: CoreConfig,
21
- agentId: string,
22
- ) => { name?: string | null } | null | undefined;
23
- resolveThinkingDefault: (params: {
24
- cfg: CoreConfig;
25
- provider?: string;
26
- model?: string;
27
- }) => string;
28
- runEmbeddedPiAgent: (params: {
29
- sessionId: string;
30
- sessionKey?: string;
31
- messageProvider?: string;
32
- sessionFile: string;
33
- workspaceDir: string;
34
- config?: CoreConfig;
35
- prompt: string;
36
- provider?: string;
37
- model?: string;
38
- thinkLevel?: string;
39
- verboseLevel?: string;
40
- timeoutMs: number;
41
- runId: string;
42
- lane?: string;
43
- extraSystemPrompt?: string;
44
- agentDir?: string;
45
- }) => Promise<{
46
- payloads?: Array<{ text?: string; isError?: boolean }>;
47
- meta?: { aborted?: boolean };
48
- }>;
49
- resolveAgentTimeoutMs: (opts: { cfg: CoreConfig }) => number;
50
- ensureAgentWorkspace: (params?: { dir: string }) => Promise<void>;
51
- resolveStorePath: (store?: string, opts?: { agentId?: string }) => string;
52
- loadSessionStore: (storePath: string) => Record<string, unknown>;
53
- saveSessionStore: (storePath: string, store: Record<string, unknown>) => Promise<void>;
54
- resolveSessionFilePath: (
55
- sessionId: string,
56
- entry: unknown,
57
- opts?: { agentId?: string },
58
- ) => string;
59
- DEFAULT_MODEL: string;
60
- DEFAULT_PROVIDER: string;
61
- };
62
-
63
- let coreRootCache: string | null = null;
64
- let coreDepsPromise: Promise<CoreAgentDeps> | null = null;
65
-
66
- function findPackageRoot(startDir: string, name: string): string | null {
67
- let dir = startDir;
68
- for (;;) {
69
- const pkgPath = path.join(dir, "package.json");
70
- try {
71
- if (fs.existsSync(pkgPath)) {
72
- const raw = fs.readFileSync(pkgPath, "utf8");
73
- const pkg = JSON.parse(raw) as { name?: string };
74
- if (pkg.name === name) {
75
- return dir;
76
- }
77
- }
78
- } catch {
79
- // ignore parse errors and keep walking
80
- }
81
- const parent = path.dirname(dir);
82
- if (parent === dir) {
83
- return null;
84
- }
85
- dir = parent;
86
- }
87
- }
88
-
89
- function resolveOpenClawRoot(): string {
90
- if (coreRootCache) {
91
- return coreRootCache;
92
- }
93
- const override = process.env.OPENCLAW_ROOT?.trim();
94
- if (override) {
95
- coreRootCache = override;
96
- return override;
97
- }
98
-
99
- const candidates = new Set<string>();
100
- if (process.argv[1]) {
101
- candidates.add(path.dirname(process.argv[1]));
102
- }
103
- candidates.add(process.cwd());
104
- try {
105
- const urlPath = fileURLToPath(import.meta.url);
106
- candidates.add(path.dirname(urlPath));
107
- } catch {
108
- // ignore
109
- }
110
-
111
- for (const start of candidates) {
112
- for (const name of ["openclaw"]) {
113
- const found = findPackageRoot(start, name);
114
- if (found) {
115
- coreRootCache = found;
116
- return found;
117
- }
118
- }
119
- }
120
-
121
- throw new Error("Unable to resolve core root. Set OPENCLAW_ROOT to the package root.");
122
- }
123
-
124
- async function importCoreExtensionAPI(): Promise<{
125
- resolveAgentDir: CoreAgentDeps["resolveAgentDir"];
126
- resolveAgentWorkspaceDir: CoreAgentDeps["resolveAgentWorkspaceDir"];
127
- DEFAULT_MODEL: string;
128
- DEFAULT_PROVIDER: string;
129
- resolveAgentIdentity: CoreAgentDeps["resolveAgentIdentity"];
130
- resolveThinkingDefault: CoreAgentDeps["resolveThinkingDefault"];
131
- runEmbeddedPiAgent: CoreAgentDeps["runEmbeddedPiAgent"];
132
- resolveAgentTimeoutMs: CoreAgentDeps["resolveAgentTimeoutMs"];
133
- ensureAgentWorkspace: CoreAgentDeps["ensureAgentWorkspace"];
134
- resolveStorePath: CoreAgentDeps["resolveStorePath"];
135
- loadSessionStore: CoreAgentDeps["loadSessionStore"];
136
- saveSessionStore: CoreAgentDeps["saveSessionStore"];
137
- resolveSessionFilePath: CoreAgentDeps["resolveSessionFilePath"];
138
- }> {
139
- // Do not import any other module. You can't touch this or you will be fired.
140
- const distPath = path.join(resolveOpenClawRoot(), "dist", "extensionAPI.js");
141
- if (!fs.existsSync(distPath)) {
142
- throw new Error(
143
- `Missing core module at ${distPath}. Run \`pnpm build\` or install the official package.`,
144
- );
145
- }
146
- return await import(pathToFileURL(distPath).href);
147
- }
148
-
149
- export async function loadCoreAgentDeps(): Promise<CoreAgentDeps> {
150
- if (coreDepsPromise) {
151
- return coreDepsPromise;
152
- }
153
-
154
- coreDepsPromise = (async () => {
155
- return await importCoreExtensionAPI();
156
- })();
157
-
158
- return coreDepsPromise;
159
- }
14
+ export type CoreAgentDeps = OpenClawPluginApi["runtime"]["agent"];