@openclaw/voice-call 2026.1.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/CHANGELOG.md +78 -0
  2. package/README.md +135 -0
  3. package/index.ts +497 -0
  4. package/openclaw.plugin.json +601 -0
  5. package/package.json +16 -0
  6. package/src/cli.ts +312 -0
  7. package/src/config.test.ts +204 -0
  8. package/src/config.ts +502 -0
  9. package/src/core-bridge.ts +198 -0
  10. package/src/manager/context.ts +21 -0
  11. package/src/manager/events.ts +177 -0
  12. package/src/manager/lookup.ts +33 -0
  13. package/src/manager/outbound.ts +248 -0
  14. package/src/manager/state.ts +50 -0
  15. package/src/manager/store.ts +88 -0
  16. package/src/manager/timers.ts +86 -0
  17. package/src/manager/twiml.ts +9 -0
  18. package/src/manager.test.ts +108 -0
  19. package/src/manager.ts +888 -0
  20. package/src/media-stream.test.ts +97 -0
  21. package/src/media-stream.ts +393 -0
  22. package/src/providers/base.ts +67 -0
  23. package/src/providers/index.ts +10 -0
  24. package/src/providers/mock.ts +168 -0
  25. package/src/providers/plivo.test.ts +28 -0
  26. package/src/providers/plivo.ts +504 -0
  27. package/src/providers/stt-openai-realtime.ts +311 -0
  28. package/src/providers/telnyx.ts +364 -0
  29. package/src/providers/tts-openai.ts +264 -0
  30. package/src/providers/twilio/api.ts +45 -0
  31. package/src/providers/twilio/webhook.ts +30 -0
  32. package/src/providers/twilio.test.ts +64 -0
  33. package/src/providers/twilio.ts +595 -0
  34. package/src/response-generator.ts +171 -0
  35. package/src/runtime.ts +217 -0
  36. package/src/telephony-audio.ts +88 -0
  37. package/src/telephony-tts.ts +95 -0
  38. package/src/tunnel.ts +331 -0
  39. package/src/types.ts +273 -0
  40. package/src/utils.ts +12 -0
  41. package/src/voice-mapping.ts +65 -0
  42. package/src/webhook-security.test.ts +260 -0
  43. package/src/webhook-security.ts +469 -0
  44. package/src/webhook.ts +491 -0
@@ -0,0 +1,171 @@
1
+ /**
2
+ * Voice call response generator - uses the embedded Pi agent for tool support.
3
+ * Routes voice responses through the same agent infrastructure as messaging.
4
+ */
5
+
6
+ import crypto from "node:crypto";
7
+
8
+ import { loadCoreAgentDeps, type CoreConfig } from "./core-bridge.js";
9
+
10
+ import type { VoiceCallConfig } from "./config.js";
11
+
12
/** Inputs for generating one conversational turn of a voice call. */
export type VoiceResponseParams = {
  /** Voice call config */
  voiceConfig: VoiceCallConfig;
  /** Core OpenClaw config */
  coreConfig: CoreConfig;
  /** Call ID for session tracking */
  callId: string;
  /** Caller's phone number */
  from: string;
  /** Conversation transcript (prior turns, oldest first) */
  transcript: Array<{ speaker: "user" | "bot"; text: string }>;
  /** Latest user message */
  userMessage: string;
};

/**
 * Outcome of a response attempt.
 * `text` is null on failure or abort; `error` carries the reason when known.
 */
export type VoiceResponseResult = {
  text: string | null;
  error?: string;
};

/** Record persisted in the session store, keyed by `voice:<digits-only phone>`. */
type SessionEntry = {
  // Stable UUID identifying the agent session for this caller.
  sessionId: string;
  // Epoch ms; NOTE(review): generateVoiceResponse only writes this at
  // creation, so reused entries keep their original timestamp — confirm intended.
  updatedAt: number;
};
36
+
37
/**
 * Generate a voice response using the embedded Pi agent with full tool support.
 * Uses the same agent infrastructure as messaging for consistent behavior.
 *
 * Flow: load agent deps -> resolve (or create) a per-caller session ->
 * build a system prompt carrying the call transcript -> run the embedded
 * agent -> join the non-error payload texts into one utterance.
 *
 * @param params - Call context plus the latest user message.
 * @returns `{ text }` on success; `{ text: null, error }` when deps fail to
 *   load, core config is missing, the run is aborted, or the agent throws.
 */
export async function generateVoiceResponse(
  params: VoiceResponseParams,
): Promise<VoiceResponseResult> {
  const { voiceConfig, callId, from, transcript, userMessage, coreConfig } =
    params;

  if (!coreConfig) {
    return { text: null, error: "Core config unavailable for voice response" };
  }

  // Lazily load the core agent bridge; failure here is reported to the
  // caller rather than thrown so the call itself can degrade gracefully.
  let deps: Awaited<ReturnType<typeof loadCoreAgentDeps>>;
  try {
    deps = await loadCoreAgentDeps();
  } catch (err) {
    return {
      text: null,
      error:
        err instanceof Error
          ? err.message
          : "Unable to load core agent dependencies",
    };
  }
  const cfg = coreConfig;

  // Build voice-specific session key based on phone number (digits only, so
  // formatting variants of the same number share one session)
  const normalizedPhone = from.replace(/\D/g, "");
  const sessionKey = `voice:${normalizedPhone}`;
  const agentId = "main";

  // Resolve paths for the session store, agent home, and workspace
  const storePath = deps.resolveStorePath(cfg.session?.store, { agentId });
  const agentDir = deps.resolveAgentDir(cfg, agentId);
  const workspaceDir = deps.resolveAgentWorkspaceDir(cfg, agentId);

  // Ensure workspace exists
  await deps.ensureAgentWorkspace({ dir: workspaceDir });

  // Load or create session entry. NOTE(review): the cast assumes the store's
  // value shape matches SessionEntry; updatedAt is only set at creation and
  // never refreshed for an existing entry — confirm that is intended.
  const sessionStore = deps.loadSessionStore(storePath);
  const now = Date.now();
  let sessionEntry = sessionStore[sessionKey] as SessionEntry | undefined;

  if (!sessionEntry) {
    // First call from this number: mint a session and persist it immediately.
    sessionEntry = {
      sessionId: crypto.randomUUID(),
      updatedAt: now,
    };
    sessionStore[sessionKey] = sessionEntry;
    await deps.saveSessionStore(storePath, sessionStore);
  }

  const sessionId = sessionEntry.sessionId;
  const sessionFile = deps.resolveSessionFilePath(sessionId, sessionEntry, {
    agentId,
  });

  // Resolve model from config. Ref format is "provider/model"; a bare value
  // with no slash is treated as a model under the default provider.
  const modelRef =
    voiceConfig.responseModel ||
    `${deps.DEFAULT_PROVIDER}/${deps.DEFAULT_MODEL}`;
  const slashIndex = modelRef.indexOf("/");
  const provider =
    slashIndex === -1 ? deps.DEFAULT_PROVIDER : modelRef.slice(0, slashIndex);
  const model = slashIndex === -1 ? modelRef : modelRef.slice(slashIndex + 1);

  // Resolve thinking level for this provider/model pair
  const thinkLevel = deps.resolveThinkingDefault({ cfg, provider, model });

  // Resolve agent identity for personalized prompt
  const identity = deps.resolveAgentIdentity(cfg, agentId);
  const agentName = identity?.name?.trim() || "assistant";

  // Build system prompt; config override wins, otherwise a voice-tuned default
  const basePrompt =
    voiceConfig.responseSystemPrompt ??
    `You are ${agentName}, a helpful voice assistant on a phone call. Keep responses brief and conversational (1-2 sentences max). Be natural and friendly. The caller's phone number is ${from}. You have access to tools - use them when helpful.`;

  // Append the running transcript so the agent sees prior turns of this call
  let extraSystemPrompt = basePrompt;
  if (transcript.length > 0) {
    const history = transcript
      .map(
        (entry) =>
          `${entry.speaker === "bot" ? "You" : "Caller"}: ${entry.text}`,
      )
      .join("\n");
    extraSystemPrompt = `${basePrompt}\n\nConversation so far:\n${history}`;
  }

  // Resolve timeout; per-call override falls back to the agent-wide default
  const timeoutMs =
    voiceConfig.responseTimeoutMs ?? deps.resolveAgentTimeoutMs({ cfg });
  const runId = `voice:${callId}:${Date.now()}`;

  try {
    const result = await deps.runEmbeddedPiAgent({
      sessionId,
      sessionKey,
      messageProvider: "voice",
      sessionFile,
      workspaceDir,
      config: cfg,
      prompt: userMessage,
      provider,
      model,
      thinkLevel,
      verboseLevel: "off",
      timeoutMs,
      runId,
      lane: "voice",
      extraSystemPrompt,
      agentDir,
    });

    // Extract text from payloads: drop error payloads, trim, and join the
    // remaining fragments into a single spoken utterance
    const texts = (result.payloads ?? [])
      .filter((p) => p.text && !p.isError)
      .map((p) => p.text?.trim())
      .filter(Boolean);

    const text = texts.join(" ") || null;

    // No usable text AND an aborted run is reported as an explicit error;
    // no text without abort falls through as a silent `{ text: null }`.
    if (!text && result.meta.aborted) {
      return { text: null, error: "Response generation was aborted" };
    }

    return { text };
  } catch (err) {
    console.error(`[voice-call] Response generation failed:`, err);
    return { text: null, error: String(err) };
  }
}
package/src/runtime.ts ADDED
@@ -0,0 +1,217 @@
1
+ import type { CoreConfig } from "./core-bridge.js";
2
+ import type { VoiceCallConfig } from "./config.js";
3
+ import { resolveVoiceCallConfig, validateProviderConfig } from "./config.js";
4
+ import { CallManager } from "./manager.js";
5
+ import type { VoiceCallProvider } from "./providers/base.js";
6
+ import { MockProvider } from "./providers/mock.js";
7
+ import { PlivoProvider } from "./providers/plivo.js";
8
+ import { TelnyxProvider } from "./providers/telnyx.js";
9
+ import { TwilioProvider } from "./providers/twilio.js";
10
+ import type { TelephonyTtsRuntime } from "./telephony-tts.js";
11
+ import { createTelephonyTtsProvider } from "./telephony-tts.js";
12
+ import { startTunnel, type TunnelResult } from "./tunnel.js";
13
+ import {
14
+ cleanupTailscaleExposure,
15
+ setupTailscaleExposure,
16
+ VoiceCallWebhookServer,
17
+ } from "./webhook.js";
18
+
19
/** Live handles for a running voice-call stack, returned by createVoiceCallRuntime. */
export type VoiceCallRuntime = {
  /** Fully resolved (defaulted/validated) voice-call config. */
  config: VoiceCallConfig;
  /** Active telephony provider (twilio / telnyx / plivo / mock). */
  provider: VoiceCallProvider;
  /** Call lifecycle manager, initialized against provider + webhookUrl. */
  manager: CallManager;
  /** HTTP server receiving provider webhooks. */
  webhookServer: VoiceCallWebhookServer;
  /** URL providers call back on: public URL when available, else the local one. */
  webhookUrl: string;
  /** Externally reachable URL (config.publicUrl, tunnel, or tailscale), if any. */
  publicUrl: string | null;
  /** Tears down tunnel, tailscale exposure, and the webhook server. */
  stop: () => Promise<void>;
};

/** Minimal logger contract; createVoiceCallRuntime defaults to console methods. */
type Logger = {
  info: (message: string) => void;
  warn: (message: string) => void;
  error: (message: string) => void;
  debug: (message: string) => void;
};
35
+
36
+ function isLoopbackBind(bind: string | undefined): boolean {
37
+ if (!bind) return false;
38
+ return bind === "127.0.0.1" || bind === "::1" || bind === "localhost";
39
+ }
40
+
41
+ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
42
+ const allowNgrokFreeTierLoopbackBypass =
43
+ config.tunnel?.provider === "ngrok" &&
44
+ isLoopbackBind(config.serve?.bind) &&
45
+ (config.tunnel?.allowNgrokFreeTierLoopbackBypass ||
46
+ config.tunnel?.allowNgrokFreeTier ||
47
+ false);
48
+
49
+ switch (config.provider) {
50
+ case "telnyx":
51
+ return new TelnyxProvider({
52
+ apiKey: config.telnyx?.apiKey,
53
+ connectionId: config.telnyx?.connectionId,
54
+ publicKey: config.telnyx?.publicKey,
55
+ });
56
+ case "twilio":
57
+ return new TwilioProvider(
58
+ {
59
+ accountSid: config.twilio?.accountSid,
60
+ authToken: config.twilio?.authToken,
61
+ },
62
+ {
63
+ allowNgrokFreeTierLoopbackBypass,
64
+ publicUrl: config.publicUrl,
65
+ skipVerification: config.skipSignatureVerification,
66
+ streamPath: config.streaming?.enabled
67
+ ? config.streaming.streamPath
68
+ : undefined,
69
+ },
70
+ );
71
+ case "plivo":
72
+ return new PlivoProvider(
73
+ {
74
+ authId: config.plivo?.authId,
75
+ authToken: config.plivo?.authToken,
76
+ },
77
+ {
78
+ publicUrl: config.publicUrl,
79
+ skipVerification: config.skipSignatureVerification,
80
+ ringTimeoutSec: Math.max(1, Math.floor(config.ringTimeoutMs / 1000)),
81
+ },
82
+ );
83
+ case "mock":
84
+ return new MockProvider();
85
+ default:
86
+ throw new Error(
87
+ `Unsupported voice-call provider: ${String(config.provider)}`,
88
+ );
89
+ }
90
+ }
91
+
92
/**
 * Boot the full voice-call stack: resolve and validate config, construct the
 * provider and call manager, start the webhook server, establish a public URL
 * (config.publicUrl > tunnel > legacy tailscale), wire optional Twilio
 * streaming TTS/media handling, and return handles plus a `stop` teardown.
 *
 * @param params.config - Raw voice-call config (resolved/validated here).
 * @param params.coreConfig - Core OpenClaw config, passed to webhook + TTS.
 * @param params.ttsRuntime - Optional TTS runtime; enables streaming TTS on Twilio.
 * @param params.logger - Optional logger; defaults to console methods.
 * @throws Error when the plugin is disabled or provider config is invalid.
 */
export async function createVoiceCallRuntime(params: {
  config: VoiceCallConfig;
  coreConfig: CoreConfig;
  ttsRuntime?: TelephonyTtsRuntime;
  logger?: Logger;
}): Promise<VoiceCallRuntime> {
  const { config: rawConfig, coreConfig, ttsRuntime, logger } = params;
  const log = logger ?? {
    info: console.log,
    warn: console.warn,
    error: console.error,
    debug: console.debug,
  };

  const config = resolveVoiceCallConfig(rawConfig);

  if (!config.enabled) {
    throw new Error(
      "Voice call disabled. Enable the plugin entry in config.",
    );
  }

  const validation = validateProviderConfig(config);
  if (!validation.valid) {
    throw new Error(`Invalid voice-call config: ${validation.errors.join("; ")}`);
  }

  const provider = resolveProvider(config);
  const manager = new CallManager(config);
  const webhookServer = new VoiceCallWebhookServer(
    config,
    manager,
    provider,
    coreConfig,
  );

  // Server must be listening before we expose it via tunnel/tailscale.
  const localUrl = await webhookServer.start();

  // Determine public URL - priority: config.publicUrl > tunnel > legacy tailscale
  let publicUrl: string | null = config.publicUrl ?? null;
  let tunnelResult: TunnelResult | null = null;

  if (!publicUrl && config.tunnel?.provider && config.tunnel.provider !== "none") {
    try {
      tunnelResult = await startTunnel({
        provider: config.tunnel.provider,
        port: config.serve.port,
        path: config.serve.path,
        ngrokAuthToken: config.tunnel.ngrokAuthToken,
        ngrokDomain: config.tunnel.ngrokDomain,
      });
      publicUrl = tunnelResult?.publicUrl ?? null;
    } catch (err) {
      // Tunnel failure is non-fatal: we fall through to tailscale or local URL.
      log.error(
        `[voice-call] Tunnel setup failed: ${
          err instanceof Error ? err.message : String(err)
        }`,
      );
    }
  }

  // NOTE(review): this branch also runs when tailscale.mode is undefined
  // (only an explicit "off" skips it) — confirm that default is intended.
  if (!publicUrl && config.tailscale?.mode !== "off") {
    publicUrl = await setupTailscaleExposure(config);
  }

  // Providers are pointed at the public URL when one exists, else local.
  const webhookUrl = publicUrl ?? localUrl;

  if (publicUrl && provider.name === "twilio") {
    (provider as TwilioProvider).setPublicUrl(publicUrl);
  }

  // Twilio-only: wire streaming TTS and the media-stream handler.
  if (provider.name === "twilio" && config.streaming?.enabled) {
    const twilioProvider = provider as TwilioProvider;
    if (ttsRuntime?.textToSpeechTelephony) {
      try {
        const ttsProvider = createTelephonyTtsProvider({
          coreConfig,
          ttsOverride: config.tts,
          runtime: ttsRuntime,
        });
        twilioProvider.setTTSProvider(ttsProvider);
        log.info("[voice-call] Telephony TTS provider configured");
      } catch (err) {
        // TTS init failure degrades streaming TTS but does not abort startup.
        log.warn(
          `[voice-call] Failed to initialize telephony TTS: ${
            err instanceof Error ? err.message : String(err)
          }`,
        );
      }
    } else {
      log.warn("[voice-call] Telephony TTS unavailable; streaming TTS disabled");
    }

    const mediaHandler = webhookServer.getMediaStreamHandler();
    if (mediaHandler) {
      twilioProvider.setMediaStreamHandler(mediaHandler);
      log.info("[voice-call] Media stream handler wired to provider");
    }
  }

  manager.initialize(provider, webhookUrl);

  // Teardown in reverse-ish order: tunnel, tailscale exposure, webhook server.
  // NOTE(review): the CallManager gets no explicit teardown here — confirm it
  // needs none (e.g. timers or in-flight calls).
  const stop = async () => {
    if (tunnelResult) {
      await tunnelResult.stop();
    }
    await cleanupTailscaleExposure(config);
    await webhookServer.stop();
  };

  log.info("[voice-call] Runtime initialized");
  log.info(`[voice-call] Webhook URL: ${webhookUrl}`);
  if (publicUrl) {
    log.info(`[voice-call] Public URL: ${publicUrl}`);
  }

  return {
    config,
    provider,
    manager,
    webhookServer,
    webhookUrl,
    publicUrl,
    stop,
  };
}
@@ -0,0 +1,88 @@
1
+ const TELEPHONY_SAMPLE_RATE = 8000;
2
+
3
+ function clamp16(value: number): number {
4
+ return Math.max(-32768, Math.min(32767, value));
5
+ }
6
+
7
+ /**
8
+ * Resample 16-bit PCM (little-endian mono) to 8kHz using linear interpolation.
9
+ */
10
+ export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
11
+ if (inputSampleRate === TELEPHONY_SAMPLE_RATE) return input;
12
+ const inputSamples = Math.floor(input.length / 2);
13
+ if (inputSamples === 0) return Buffer.alloc(0);
14
+
15
+ const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE;
16
+ const outputSamples = Math.floor(inputSamples / ratio);
17
+ const output = Buffer.alloc(outputSamples * 2);
18
+
19
+ for (let i = 0; i < outputSamples; i++) {
20
+ const srcPos = i * ratio;
21
+ const srcIndex = Math.floor(srcPos);
22
+ const frac = srcPos - srcIndex;
23
+
24
+ const s0 = input.readInt16LE(srcIndex * 2);
25
+ const s1Index = Math.min(srcIndex + 1, inputSamples - 1);
26
+ const s1 = input.readInt16LE(s1Index * 2);
27
+
28
+ const sample = Math.round(s0 + frac * (s1 - s0));
29
+ output.writeInt16LE(clamp16(sample), i * 2);
30
+ }
31
+
32
+ return output;
33
+ }
34
+
35
+ /**
36
+ * Convert 16-bit PCM to 8-bit mu-law (G.711).
37
+ */
38
+ export function pcmToMulaw(pcm: Buffer): Buffer {
39
+ const samples = Math.floor(pcm.length / 2);
40
+ const mulaw = Buffer.alloc(samples);
41
+
42
+ for (let i = 0; i < samples; i++) {
43
+ const sample = pcm.readInt16LE(i * 2);
44
+ mulaw[i] = linearToMulaw(sample);
45
+ }
46
+
47
+ return mulaw;
48
+ }
49
+
50
+ export function convertPcmToMulaw8k(
51
+ pcm: Buffer,
52
+ inputSampleRate: number,
53
+ ): Buffer {
54
+ const pcm8k = resamplePcmTo8k(pcm, inputSampleRate);
55
+ return pcmToMulaw(pcm8k);
56
+ }
57
+
58
+ /**
59
+ * Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law).
60
+ */
61
+ export function chunkAudio(
62
+ audio: Buffer,
63
+ chunkSize = 160,
64
+ ): Generator<Buffer, void, unknown> {
65
+ return (function* () {
66
+ for (let i = 0; i < audio.length; i += chunkSize) {
67
+ yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
68
+ }
69
+ })();
70
+ }
71
+
72
+ function linearToMulaw(sample: number): number {
73
+ const BIAS = 132;
74
+ const CLIP = 32635;
75
+
76
+ const sign = sample < 0 ? 0x80 : 0;
77
+ if (sample < 0) sample = -sample;
78
+ if (sample > CLIP) sample = CLIP;
79
+
80
+ sample += BIAS;
81
+ let exponent = 7;
82
+ for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) {
83
+ expMask >>= 1;
84
+ }
85
+
86
+ const mantissa = (sample >> (exponent + 3)) & 0x0f;
87
+ return ~(sign | (exponent << 4) | mantissa) & 0xff;
88
+ }
@@ -0,0 +1,95 @@
1
+ import type { CoreConfig } from "./core-bridge.js";
2
+ import type { VoiceCallTtsConfig } from "./config.js";
3
+ import { convertPcmToMulaw8k } from "./telephony-audio.js";
4
+
5
/**
 * Minimal surface of the host runtime needed for telephony TTS: a function
 * that turns text into raw PCM audio plus its sample rate.
 */
export type TelephonyTtsRuntime = {
  textToSpeechTelephony: (params: {
    /** Text to synthesize. */
    text: string;
    /** Core config (with any voice-call TTS overrides already merged in). */
    cfg: CoreConfig;
    prefsPath?: string;
  }) => Promise<{
    success: boolean;
    // Consumed as 16-bit little-endian mono PCM by convertPcmToMulaw8k.
    audioBuffer?: Buffer;
    sampleRate?: number;
    provider?: string;
    error?: string;
  }>;
};

/** Synthesizes text to telephony-ready audio (8kHz mu-law buffer). */
export type TelephonyTtsProvider = {
  synthesizeForTelephony: (text: string) => Promise<Buffer>;
};
22
+
23
+ export function createTelephonyTtsProvider(params: {
24
+ coreConfig: CoreConfig;
25
+ ttsOverride?: VoiceCallTtsConfig;
26
+ runtime: TelephonyTtsRuntime;
27
+ }): TelephonyTtsProvider {
28
+ const { coreConfig, ttsOverride, runtime } = params;
29
+ const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);
30
+
31
+ return {
32
+ synthesizeForTelephony: async (text: string) => {
33
+ const result = await runtime.textToSpeechTelephony({
34
+ text,
35
+ cfg: mergedConfig,
36
+ });
37
+
38
+ if (!result.success || !result.audioBuffer || !result.sampleRate) {
39
+ throw new Error(result.error ?? "TTS conversion failed");
40
+ }
41
+
42
+ return convertPcmToMulaw8k(result.audioBuffer, result.sampleRate);
43
+ },
44
+ };
45
+ }
46
+
47
+ function applyTtsOverride(
48
+ coreConfig: CoreConfig,
49
+ override?: VoiceCallTtsConfig,
50
+ ): CoreConfig {
51
+ if (!override) return coreConfig;
52
+
53
+ const base = coreConfig.messages?.tts;
54
+ const merged = mergeTtsConfig(base, override);
55
+ if (!merged) return coreConfig;
56
+
57
+ return {
58
+ ...coreConfig,
59
+ messages: {
60
+ ...(coreConfig.messages ?? {}),
61
+ tts: merged,
62
+ },
63
+ };
64
+ }
65
+
66
+ function mergeTtsConfig(
67
+ base?: VoiceCallTtsConfig,
68
+ override?: VoiceCallTtsConfig,
69
+ ): VoiceCallTtsConfig | undefined {
70
+ if (!base && !override) return undefined;
71
+ if (!override) return base;
72
+ if (!base) return override;
73
+ return deepMerge(base, override);
74
+ }
75
+
76
+ function deepMerge<T>(base: T, override: T): T {
77
+ if (!isPlainObject(base) || !isPlainObject(override)) {
78
+ return override;
79
+ }
80
+ const result: Record<string, unknown> = { ...base };
81
+ for (const [key, value] of Object.entries(override)) {
82
+ if (value === undefined) continue;
83
+ const existing = (base as Record<string, unknown>)[key];
84
+ if (isPlainObject(existing) && isPlainObject(value)) {
85
+ result[key] = deepMerge(existing, value);
86
+ } else {
87
+ result[key] = value;
88
+ }
89
+ }
90
+ return result as T;
91
+ }
92
+
93
+ function isPlainObject(value: unknown): value is Record<string, unknown> {
94
+ return Boolean(value) && typeof value === "object" && !Array.isArray(value);
95
+ }