@openclaw/voice-call 2026.3.2 → 2026.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # Changelog
2
2
 
3
+ ## 2026.3.7
4
+
5
+ ### Changes
6
+
7
+ - Version alignment with core OpenClaw release numbers.
8
+
9
+ ## 2026.3.3
10
+
11
+ ### Changes
12
+
13
+ - Version alignment with core OpenClaw release numbers.
14
+
3
15
  ## 2026.3.2
4
16
 
5
17
  ### Changes
package/index.ts CHANGED
@@ -1,5 +1,8 @@
1
1
  import { Type } from "@sinclair/typebox";
2
- import type { GatewayRequestHandlerOptions, OpenClawPluginApi } from "openclaw/plugin-sdk";
2
+ import type {
3
+ GatewayRequestHandlerOptions,
4
+ OpenClawPluginApi,
5
+ } from "openclaw/plugin-sdk/voice-call";
3
6
  import { registerVoiceCallCli } from "./src/cli.js";
4
7
  import {
5
8
  VoiceCallConfigSchema,
@@ -206,6 +209,23 @@ const voiceCallPlugin = {
206
209
  const rt = await ensureRuntime();
207
210
  return { rt, callId, message } as const;
208
211
  };
212
+ const initiateCallAndRespond = async (params: {
213
+ rt: VoiceCallRuntime;
214
+ respond: GatewayRequestHandlerOptions["respond"];
215
+ to: string;
216
+ message?: string;
217
+ mode?: "notify" | "conversation";
218
+ }) => {
219
+ const result = await params.rt.manager.initiateCall(params.to, undefined, {
220
+ message: params.message,
221
+ mode: params.mode,
222
+ });
223
+ if (!result.success) {
224
+ params.respond(false, { error: result.error || "initiate failed" });
225
+ return;
226
+ }
227
+ params.respond(true, { callId: result.callId, initiated: true });
228
+ };
209
229
 
210
230
  api.registerGatewayMethod(
211
231
  "voicecall.initiate",
@@ -227,15 +247,13 @@ const voiceCallPlugin = {
227
247
  }
228
248
  const mode =
229
249
  params?.mode === "notify" || params?.mode === "conversation" ? params.mode : undefined;
230
- const result = await rt.manager.initiateCall(to, undefined, {
250
+ await initiateCallAndRespond({
251
+ rt,
252
+ respond,
253
+ to,
231
254
  message,
232
255
  mode,
233
256
  });
234
- if (!result.success) {
235
- respond(false, { error: result.error || "initiate failed" });
236
- return;
237
- }
238
- respond(true, { callId: result.callId, initiated: true });
239
257
  } catch (err) {
240
258
  sendError(respond, err);
241
259
  }
@@ -344,14 +362,12 @@ const voiceCallPlugin = {
344
362
  return;
345
363
  }
346
364
  const rt = await ensureRuntime();
347
- const result = await rt.manager.initiateCall(to, undefined, {
365
+ await initiateCallAndRespond({
366
+ rt,
367
+ respond,
368
+ to,
348
369
  message: message || undefined,
349
370
  });
350
- if (!result.success) {
351
- respond(false, { error: result.error || "initiate failed" });
352
- return;
353
- }
354
- respond(true, { callId: result.callId, initiated: true });
355
371
  } catch (err) {
356
372
  sendError(respond, err);
357
373
  }
@@ -249,6 +249,10 @@
249
249
  "type": "integer",
250
250
  "minimum": 1
251
251
  },
252
+ "staleCallReaperSeconds": {
253
+ "type": "integer",
254
+ "minimum": 0
255
+ },
252
256
  "silenceTimeoutMs": {
253
257
  "type": "integer",
254
258
  "minimum": 1
@@ -313,6 +317,27 @@
313
317
  }
314
318
  }
315
319
  },
320
+ "webhookSecurity": {
321
+ "type": "object",
322
+ "additionalProperties": false,
323
+ "properties": {
324
+ "allowedHosts": {
325
+ "type": "array",
326
+ "items": {
327
+ "type": "string"
328
+ }
329
+ },
330
+ "trustForwardingHeaders": {
331
+ "type": "boolean"
332
+ },
333
+ "trustedProxyIPs": {
334
+ "type": "array",
335
+ "items": {
336
+ "type": "string"
337
+ }
338
+ }
339
+ }
340
+ },
316
341
  "streaming": {
317
342
  "type": "object",
318
343
  "additionalProperties": false,
@@ -341,6 +366,22 @@
341
366
  },
342
367
  "streamPath": {
343
368
  "type": "string"
369
+ },
370
+ "preStartTimeoutMs": {
371
+ "type": "integer",
372
+ "minimum": 1
373
+ },
374
+ "maxPendingConnections": {
375
+ "type": "integer",
376
+ "minimum": 1
377
+ },
378
+ "maxPendingConnectionsPerIp": {
379
+ "type": "integer",
380
+ "minimum": 1
381
+ },
382
+ "maxConnections": {
383
+ "type": "integer",
384
+ "minimum": 1
344
385
  }
345
386
  }
346
387
  },
package/package.json CHANGED
@@ -1,10 +1,11 @@
1
1
  {
2
2
  "name": "@openclaw/voice-call",
3
- "version": "2026.3.2",
3
+ "version": "2026.3.7",
4
4
  "description": "OpenClaw voice-call plugin",
5
5
  "type": "module",
6
6
  "dependencies": {
7
7
  "@sinclair/typebox": "0.34.48",
8
+ "commander": "^14.0.3",
8
9
  "ws": "^8.19.0",
9
10
  "zod": "^4.3.6"
10
11
  },
package/src/cli.ts CHANGED
@@ -2,7 +2,7 @@ import fs from "node:fs";
2
2
  import os from "node:os";
3
3
  import path from "node:path";
4
4
  import type { Command } from "commander";
5
- import { sleep } from "openclaw/plugin-sdk";
5
+ import { sleep } from "openclaw/plugin-sdk/voice-call";
6
6
  import type { VoiceCallConfig } from "./config.js";
7
7
  import type { VoiceCallRuntime } from "./runtime.js";
8
8
  import { resolveUserPath } from "./utils.js";
@@ -1,49 +1,14 @@
1
1
  import { afterEach, beforeEach, describe, expect, it } from "vitest";
2
- import { validateProviderConfig, resolveVoiceCallConfig, type VoiceCallConfig } from "./config.js";
2
+ import {
3
+ validateProviderConfig,
4
+ normalizeVoiceCallConfig,
5
+ resolveVoiceCallConfig,
6
+ type VoiceCallConfig,
7
+ } from "./config.js";
8
+ import { createVoiceCallBaseConfig } from "./test-fixtures.js";
3
9
 
4
10
  function createBaseConfig(provider: "telnyx" | "twilio" | "plivo" | "mock"): VoiceCallConfig {
5
- return {
6
- enabled: true,
7
- provider,
8
- fromNumber: "+15550001234",
9
- inboundPolicy: "disabled",
10
- allowFrom: [],
11
- outbound: { defaultMode: "notify", notifyHangupDelaySec: 3 },
12
- maxDurationSeconds: 300,
13
- staleCallReaperSeconds: 600,
14
- silenceTimeoutMs: 800,
15
- transcriptTimeoutMs: 180000,
16
- ringTimeoutMs: 30000,
17
- maxConcurrentCalls: 1,
18
- serve: { port: 3334, bind: "127.0.0.1", path: "/voice/webhook" },
19
- tailscale: { mode: "off", path: "/voice/webhook" },
20
- tunnel: { provider: "none", allowNgrokFreeTierLoopbackBypass: false },
21
- webhookSecurity: {
22
- allowedHosts: [],
23
- trustForwardingHeaders: false,
24
- trustedProxyIPs: [],
25
- },
26
- streaming: {
27
- enabled: false,
28
- sttProvider: "openai-realtime",
29
- sttModel: "gpt-4o-transcribe",
30
- silenceDurationMs: 800,
31
- vadThreshold: 0.5,
32
- streamPath: "/voice/stream",
33
- preStartTimeoutMs: 5000,
34
- maxPendingConnections: 32,
35
- maxPendingConnectionsPerIp: 4,
36
- maxConnections: 128,
37
- },
38
- skipSignatureVerification: false,
39
- stt: { provider: "openai", model: "whisper-1" },
40
- tts: {
41
- provider: "openai",
42
- openai: { model: "gpt-4o-mini-tts", voice: "coral" },
43
- },
44
- responseModel: "openai/gpt-4o-mini",
45
- responseTimeoutMs: 30000,
46
- };
11
+ return createVoiceCallBaseConfig({ provider });
47
12
  }
48
13
 
49
14
  describe("validateProviderConfig", () => {
@@ -206,3 +171,48 @@ describe("validateProviderConfig", () => {
206
171
  });
207
172
  });
208
173
  });
174
+
175
+ describe("normalizeVoiceCallConfig", () => {
176
+ it("fills nested runtime defaults from a partial config boundary", () => {
177
+ const normalized = normalizeVoiceCallConfig({
178
+ enabled: true,
179
+ provider: "mock",
180
+ streaming: {
181
+ enabled: true,
182
+ streamPath: "/custom-stream",
183
+ },
184
+ });
185
+
186
+ expect(normalized.serve.path).toBe("/voice/webhook");
187
+ expect(normalized.streaming.streamPath).toBe("/custom-stream");
188
+ expect(normalized.streaming.sttModel).toBe("gpt-4o-transcribe");
189
+ expect(normalized.tunnel.provider).toBe("none");
190
+ expect(normalized.webhookSecurity.allowedHosts).toEqual([]);
191
+ });
192
+
193
+ it("accepts partial nested TTS overrides and preserves nested objects", () => {
194
+ const normalized = normalizeVoiceCallConfig({
195
+ tts: {
196
+ provider: "elevenlabs",
197
+ elevenlabs: {
198
+ apiKey: {
199
+ source: "env",
200
+ provider: "elevenlabs",
201
+ id: "ELEVENLABS_API_KEY",
202
+ },
203
+ voiceSettings: {
204
+ speed: 1.1,
205
+ },
206
+ },
207
+ },
208
+ });
209
+
210
+ expect(normalized.tts?.provider).toBe("elevenlabs");
211
+ expect(normalized.tts?.elevenlabs?.apiKey).toEqual({
212
+ source: "env",
213
+ provider: "elevenlabs",
214
+ id: "ELEVENLABS_API_KEY",
215
+ });
216
+ expect(normalized.tts?.elevenlabs?.voiceSettings).toEqual({ speed: 1.1 });
217
+ });
218
+ });
package/src/config.ts CHANGED
@@ -3,8 +3,9 @@ import {
3
3
  TtsConfigSchema,
4
4
  TtsModeSchema,
5
5
  TtsProviderSchema,
6
- } from "openclaw/plugin-sdk";
6
+ } from "openclaw/plugin-sdk/voice-call";
7
7
  import { z } from "zod";
8
+ import { deepMergeDefined } from "./deep-merge.js";
8
9
 
9
10
  // -----------------------------------------------------------------------------
10
11
  // Phone Number Validation
@@ -350,17 +351,64 @@ export const VoiceCallConfigSchema = z
350
351
  .strict();
351
352
 
352
353
  export type VoiceCallConfig = z.infer<typeof VoiceCallConfigSchema>;
354
+ type DeepPartial<T> =
355
+ T extends Array<infer U>
356
+ ? DeepPartial<U>[]
357
+ : T extends object
358
+ ? { [K in keyof T]?: DeepPartial<T[K]> }
359
+ : T;
360
+ export type VoiceCallConfigInput = DeepPartial<VoiceCallConfig>;
353
361
 
354
362
  // -----------------------------------------------------------------------------
355
363
  // Configuration Helpers
356
364
  // -----------------------------------------------------------------------------
357
365
 
366
+ const DEFAULT_VOICE_CALL_CONFIG = VoiceCallConfigSchema.parse({});
367
+
368
+ function cloneDefaultVoiceCallConfig(): VoiceCallConfig {
369
+ return structuredClone(DEFAULT_VOICE_CALL_CONFIG);
370
+ }
371
+
372
+ function normalizeVoiceCallTtsConfig(
373
+ defaults: VoiceCallTtsConfig,
374
+ overrides: DeepPartial<NonNullable<VoiceCallTtsConfig>> | undefined,
375
+ ): VoiceCallTtsConfig {
376
+ if (!defaults && !overrides) {
377
+ return undefined;
378
+ }
379
+
380
+ return TtsConfigSchema.parse(deepMergeDefined(defaults ?? {}, overrides ?? {}));
381
+ }
382
+
383
+ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
384
+ const defaults = cloneDefaultVoiceCallConfig();
385
+ return {
386
+ ...defaults,
387
+ ...config,
388
+ allowFrom: config.allowFrom ?? defaults.allowFrom,
389
+ outbound: { ...defaults.outbound, ...config.outbound },
390
+ serve: { ...defaults.serve, ...config.serve },
391
+ tailscale: { ...defaults.tailscale, ...config.tailscale },
392
+ tunnel: { ...defaults.tunnel, ...config.tunnel },
393
+ webhookSecurity: {
394
+ ...defaults.webhookSecurity,
395
+ ...config.webhookSecurity,
396
+ allowedHosts: config.webhookSecurity?.allowedHosts ?? defaults.webhookSecurity.allowedHosts,
397
+ trustedProxyIPs:
398
+ config.webhookSecurity?.trustedProxyIPs ?? defaults.webhookSecurity.trustedProxyIPs,
399
+ },
400
+ streaming: { ...defaults.streaming, ...config.streaming },
401
+ stt: { ...defaults.stt, ...config.stt },
402
+ tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts),
403
+ };
404
+ }
405
+
358
406
  /**
359
407
  * Resolves the configuration by merging environment variables into missing fields.
360
408
  * Returns a new configuration object with environment variables applied.
361
409
  */
362
- export function resolveVoiceCallConfig(config: VoiceCallConfig): VoiceCallConfig {
363
- const resolved = JSON.parse(JSON.stringify(config)) as VoiceCallConfig;
410
+ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
411
+ const resolved = normalizeVoiceCallConfig(config);
364
412
 
365
413
  // Telnyx
366
414
  if (resolved.provider === "telnyx") {
@@ -405,7 +453,7 @@ export function resolveVoiceCallConfig(config: VoiceCallConfig): VoiceCallConfig
405
453
  resolved.webhookSecurity.trustForwardingHeaders ?? false;
406
454
  resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? [];
407
455
 
408
- return resolved;
456
+ return normalizeVoiceCallConfig(resolved);
409
457
  }
410
458
 
411
459
  /**
@@ -0,0 +1,23 @@
1
+ const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
2
+
3
+ export function deepMergeDefined(base: unknown, override: unknown): unknown {
4
+ if (!isPlainObject(base) || !isPlainObject(override)) {
5
+ return override === undefined ? base : override;
6
+ }
7
+
8
+ const result: Record<string, unknown> = { ...base };
9
+ for (const [key, value] of Object.entries(override)) {
10
+ if (BLOCKED_MERGE_KEYS.has(key) || value === undefined) {
11
+ continue;
12
+ }
13
+
14
+ const existing = result[key];
15
+ result[key] = key in result ? deepMergeDefined(existing, value) : value;
16
+ }
17
+
18
+ return result;
19
+ }
20
+
21
+ function isPlainObject(value: unknown): value is Record<string, unknown> {
22
+ return Boolean(value) && typeof value === "object" && !Array.isArray(value);
23
+ }
@@ -0,0 +1,78 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import type { WebhookContext } from "../types.js";
3
+ import { MockProvider } from "./mock.js";
4
+
5
+ function createWebhookContext(rawBody: string): WebhookContext {
6
+ return {
7
+ headers: {},
8
+ rawBody,
9
+ url: "http://localhost/voice/webhook",
10
+ method: "POST",
11
+ query: {},
12
+ };
13
+ }
14
+
15
+ describe("MockProvider", () => {
16
+ it("preserves explicit falsy event values", () => {
17
+ const provider = new MockProvider();
18
+ const result = provider.parseWebhookEvent(
19
+ createWebhookContext(
20
+ JSON.stringify({
21
+ events: [
22
+ {
23
+ id: "evt-error",
24
+ type: "call.error",
25
+ callId: "call-1",
26
+ timestamp: 0,
27
+ error: "",
28
+ retryable: false,
29
+ },
30
+ {
31
+ id: "evt-ended",
32
+ type: "call.ended",
33
+ callId: "call-2",
34
+ reason: "",
35
+ },
36
+ {
37
+ id: "evt-speech",
38
+ type: "call.speech",
39
+ callId: "call-3",
40
+ transcript: "",
41
+ isFinal: false,
42
+ },
43
+ ],
44
+ }),
45
+ ),
46
+ );
47
+
48
+ expect(result.events).toEqual([
49
+ {
50
+ id: "evt-error",
51
+ type: "call.error",
52
+ callId: "call-1",
53
+ providerCallId: undefined,
54
+ timestamp: 0,
55
+ error: "",
56
+ retryable: false,
57
+ },
58
+ {
59
+ id: "evt-ended",
60
+ type: "call.ended",
61
+ callId: "call-2",
62
+ providerCallId: undefined,
63
+ timestamp: expect.any(Number),
64
+ reason: "",
65
+ },
66
+ {
67
+ id: "evt-speech",
68
+ type: "call.speech",
69
+ callId: "call-3",
70
+ providerCallId: undefined,
71
+ timestamp: expect.any(Number),
72
+ transcript: "",
73
+ isFinal: false,
74
+ confidence: undefined,
75
+ },
76
+ ]);
77
+ });
78
+ });
@@ -65,10 +65,10 @@ export class MockProvider implements VoiceCallProvider {
65
65
  }
66
66
 
67
67
  const base = {
68
- id: evt.id || crypto.randomUUID(),
68
+ id: evt.id ?? crypto.randomUUID(),
69
69
  callId: evt.callId,
70
70
  providerCallId: evt.providerCallId,
71
- timestamp: evt.timestamp || Date.now(),
71
+ timestamp: evt.timestamp ?? Date.now(),
72
72
  };
73
73
 
74
74
  switch (evt.type) {
@@ -83,7 +83,7 @@ export class MockProvider implements VoiceCallProvider {
83
83
  return {
84
84
  ...base,
85
85
  type: evt.type,
86
- text: payload.text || "",
86
+ text: payload.text ?? "",
87
87
  };
88
88
  }
89
89
 
@@ -98,7 +98,7 @@ export class MockProvider implements VoiceCallProvider {
98
98
  return {
99
99
  ...base,
100
100
  type: evt.type,
101
- transcript: payload.transcript || "",
101
+ transcript: payload.transcript ?? "",
102
102
  isFinal: payload.isFinal ?? true,
103
103
  confidence: payload.confidence,
104
104
  };
@@ -109,7 +109,7 @@ export class MockProvider implements VoiceCallProvider {
109
109
  return {
110
110
  ...base,
111
111
  type: evt.type,
112
- durationMs: payload.durationMs || 0,
112
+ durationMs: payload.durationMs ?? 0,
113
113
  };
114
114
  }
115
115
 
@@ -118,7 +118,7 @@ export class MockProvider implements VoiceCallProvider {
118
118
  return {
119
119
  ...base,
120
120
  type: evt.type,
121
- digits: payload.digits || "",
121
+ digits: payload.digits ?? "",
122
122
  };
123
123
  }
124
124
 
@@ -127,7 +127,7 @@ export class MockProvider implements VoiceCallProvider {
127
127
  return {
128
128
  ...base,
129
129
  type: evt.type,
130
- reason: payload.reason || "completed",
130
+ reason: payload.reason ?? "completed",
131
131
  };
132
132
  }
133
133
 
@@ -136,7 +136,7 @@ export class MockProvider implements VoiceCallProvider {
136
136
  return {
137
137
  ...base,
138
138
  type: evt.type,
139
- error: payload.error || "unknown error",
139
+ error: payload.error ?? "unknown error",
140
140
  retryable: payload.retryable,
141
141
  };
142
142
  }
@@ -1,4 +1,4 @@
1
- import { fetchWithSsrFGuard } from "openclaw/plugin-sdk";
1
+ import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/voice-call";
2
2
 
3
3
  type GuardedJsonApiRequestParams = {
4
4
  url: string;
@@ -0,0 +1,42 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import type { RealtimeSTTConfig } from "./stt-openai-realtime.js";
3
+ import { OpenAIRealtimeSTTProvider } from "./stt-openai-realtime.js";
4
+
5
+ type ProviderInternals = {
6
+ vadThreshold: number;
7
+ silenceDurationMs: number;
8
+ };
9
+
10
+ function readProviderInternals(config: RealtimeSTTConfig): ProviderInternals {
11
+ const provider = new OpenAIRealtimeSTTProvider(config) as unknown as Record<string, unknown>;
12
+ return {
13
+ vadThreshold: provider["vadThreshold"] as number,
14
+ silenceDurationMs: provider["silenceDurationMs"] as number,
15
+ };
16
+ }
17
+
18
+ describe("OpenAIRealtimeSTTProvider constructor defaults", () => {
19
+ it("uses vadThreshold: 0 when explicitly configured (max sensitivity)", () => {
20
+ const provider = readProviderInternals({
21
+ apiKey: "sk-test", // pragma: allowlist secret
22
+ vadThreshold: 0,
23
+ });
24
+ expect(provider.vadThreshold).toBe(0);
25
+ });
26
+
27
+ it("uses silenceDurationMs: 0 when explicitly configured", () => {
28
+ const provider = readProviderInternals({
29
+ apiKey: "sk-test", // pragma: allowlist secret
30
+ silenceDurationMs: 0,
31
+ });
32
+ expect(provider.silenceDurationMs).toBe(0);
33
+ });
34
+
35
+ it("falls back to defaults when values are undefined", () => {
36
+ const provider = readProviderInternals({
37
+ apiKey: "sk-test", // pragma: allowlist secret
38
+ });
39
+ expect(provider.vadThreshold).toBe(0.5);
40
+ expect(provider.silenceDurationMs).toBe(800);
41
+ });
42
+ });
@@ -62,8 +62,8 @@ export class OpenAIRealtimeSTTProvider {
62
62
  }
63
63
  this.apiKey = config.apiKey;
64
64
  this.model = config.model || "gpt-4o-transcribe";
65
- this.silenceDurationMs = config.silenceDurationMs || 800;
66
- this.vadThreshold = config.vadThreshold || 0.5;
65
+ this.silenceDurationMs = config.silenceDurationMs ?? 800;
66
+ this.vadThreshold = config.vadThreshold ?? 0.5;
67
67
  }
68
68
 
69
69
  /**
@@ -0,0 +1,43 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import type { OpenAITTSConfig } from "./tts-openai.js";
3
+ import { OpenAITTSProvider } from "./tts-openai.js";
4
+
5
+ type ProviderInternals = {
6
+ model: string;
7
+ voice: string;
8
+ speed: number;
9
+ };
10
+
11
+ function readProviderInternals(config: OpenAITTSConfig): ProviderInternals {
12
+ return new OpenAITTSProvider(config) as unknown as ProviderInternals;
13
+ }
14
+
15
+ describe("OpenAITTSProvider constructor defaults", () => {
16
+ it("uses speed: 0 when explicitly configured", () => {
17
+ const provider = readProviderInternals({
18
+ apiKey: "sk-test", // pragma: allowlist secret
19
+ speed: 0,
20
+ });
21
+
22
+ expect(provider.speed).toBe(0);
23
+ });
24
+
25
+ it("falls back to speed default when undefined", () => {
26
+ const provider = readProviderInternals({
27
+ apiKey: "sk-test", // pragma: allowlist secret
28
+ });
29
+
30
+ expect(provider.speed).toBe(1.0);
31
+ });
32
+
33
+ it("treats blank model and voice overrides as unset", () => {
34
+ const provider = readProviderInternals({
35
+ apiKey: "sk-test", // pragma: allowlist secret
36
+ model: " ",
37
+ voice: "",
38
+ });
39
+
40
+ expect(provider.model).toBe("gpt-4o-mini-tts");
41
+ expect(provider.voice).toBe("coral");
42
+ });
43
+ });
@@ -1,3 +1,5 @@
1
+ import { pcmToMulaw } from "../telephony-audio.js";
2
+
1
3
  /**
2
4
  * OpenAI TTS Provider
3
5
  *
@@ -64,6 +66,11 @@ export const OPENAI_TTS_VOICES = [
64
66
 
65
67
  export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
66
68
 
69
+ function trimToUndefined(value: string | undefined): string | undefined {
70
+ const trimmed = value?.trim();
71
+ return trimmed ? trimmed : undefined;
72
+ }
73
+
67
74
  /**
68
75
  * OpenAI TTS Provider for generating speech audio.
69
76
  */
@@ -75,13 +82,14 @@ export class OpenAITTSProvider {
75
82
  private instructions?: string;
76
83
 
77
84
  constructor(config: OpenAITTSConfig = {}) {
78
- this.apiKey = config.apiKey || process.env.OPENAI_API_KEY || "";
85
+ this.apiKey =
86
+ trimToUndefined(config.apiKey) ?? trimToUndefined(process.env.OPENAI_API_KEY) ?? "";
79
87
  // Default to gpt-4o-mini-tts for intelligent realtime applications
80
- this.model = config.model || "gpt-4o-mini-tts";
88
+ this.model = trimToUndefined(config.model) ?? "gpt-4o-mini-tts";
81
89
  // Default to coral - good balance of quality and natural tone
82
- this.voice = (config.voice as OpenAITTSVoice) || "coral";
83
- this.speed = config.speed || 1.0;
84
- this.instructions = config.instructions;
90
+ this.voice = (trimToUndefined(config.voice) as OpenAITTSVoice | undefined) ?? "coral";
91
+ this.speed = config.speed ?? 1.0;
92
+ this.instructions = trimToUndefined(config.instructions);
85
93
 
86
94
  if (!this.apiKey) {
87
95
  throw new Error("OpenAI API key required (set OPENAI_API_KEY or pass apiKey)");
@@ -103,7 +111,7 @@ export class OpenAITTSProvider {
103
111
  };
104
112
 
105
113
  // Add instructions if using gpt-4o-mini-tts model
106
- const effectiveInstructions = instructions || this.instructions;
114
+ const effectiveInstructions = trimToUndefined(instructions) ?? this.instructions;
107
115
  if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
108
116
  body.instructions = effectiveInstructions;
109
117
  }
@@ -179,55 +187,6 @@ function clamp16(value: number): number {
179
187
  return Math.max(-32768, Math.min(32767, value));
180
188
  }
181
189
 
182
- /**
183
- * Convert 16-bit PCM to 8-bit mu-law.
184
- * Standard G.711 mu-law encoding for telephony.
185
- */
186
- function pcmToMulaw(pcm: Buffer): Buffer {
187
- const samples = pcm.length / 2;
188
- const mulaw = Buffer.alloc(samples);
189
-
190
- for (let i = 0; i < samples; i++) {
191
- const sample = pcm.readInt16LE(i * 2);
192
- mulaw[i] = linearToMulaw(sample);
193
- }
194
-
195
- return mulaw;
196
- }
197
-
198
- /**
199
- * Convert a single 16-bit linear sample to 8-bit mu-law.
200
- * Implements ITU-T G.711 mu-law encoding.
201
- */
202
- function linearToMulaw(sample: number): number {
203
- const BIAS = 132;
204
- const CLIP = 32635;
205
-
206
- // Get sign bit
207
- const sign = sample < 0 ? 0x80 : 0;
208
- if (sample < 0) {
209
- sample = -sample;
210
- }
211
-
212
- // Clip to prevent overflow
213
- if (sample > CLIP) {
214
- sample = CLIP;
215
- }
216
-
217
- // Add bias and find segment
218
- sample += BIAS;
219
- let exponent = 7;
220
- for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--, expMask >>= 1) {
221
- // Find the segment (exponent)
222
- }
223
-
224
- // Extract mantissa bits
225
- const mantissa = (sample >> (exponent + 3)) & 0x0f;
226
-
227
- // Combine into mu-law byte (inverted for transmission)
228
- return ~(sign | (exponent << 4) | mantissa) & 0xff;
229
- }
230
-
231
190
  /**
232
191
  * Convert 8-bit mu-law to 16-bit linear PCM.
233
192
  * Useful for decoding incoming audio.
@@ -1,6 +1,7 @@
1
1
  import { beforeEach, describe, expect, it, vi } from "vitest";
2
2
  import type { VoiceCallConfig } from "./config.js";
3
3
  import type { CoreConfig } from "./core-bridge.js";
4
+ import { createVoiceCallBaseConfig } from "./test-fixtures.js";
4
5
 
5
6
  const mocks = vi.hoisted(() => ({
6
7
  resolveVoiceCallConfig: vi.fn(),
@@ -45,48 +46,7 @@ vi.mock("./webhook/tailscale.js", () => ({
45
46
  import { createVoiceCallRuntime } from "./runtime.js";
46
47
 
47
48
  function createBaseConfig(): VoiceCallConfig {
48
- return {
49
- enabled: true,
50
- provider: "mock",
51
- fromNumber: "+15550001234",
52
- inboundPolicy: "disabled",
53
- allowFrom: [],
54
- outbound: { defaultMode: "notify", notifyHangupDelaySec: 3 },
55
- maxDurationSeconds: 300,
56
- staleCallReaperSeconds: 600,
57
- silenceTimeoutMs: 800,
58
- transcriptTimeoutMs: 180000,
59
- ringTimeoutMs: 30000,
60
- maxConcurrentCalls: 1,
61
- serve: { port: 3334, bind: "127.0.0.1", path: "/voice/webhook" },
62
- tailscale: { mode: "off", path: "/voice/webhook" },
63
- tunnel: { provider: "ngrok", allowNgrokFreeTierLoopbackBypass: false },
64
- webhookSecurity: {
65
- allowedHosts: [],
66
- trustForwardingHeaders: false,
67
- trustedProxyIPs: [],
68
- },
69
- streaming: {
70
- enabled: false,
71
- sttProvider: "openai-realtime",
72
- sttModel: "gpt-4o-transcribe",
73
- silenceDurationMs: 800,
74
- vadThreshold: 0.5,
75
- streamPath: "/voice/stream",
76
- preStartTimeoutMs: 5000,
77
- maxPendingConnections: 32,
78
- maxPendingConnectionsPerIp: 4,
79
- maxConnections: 128,
80
- },
81
- skipSignatureVerification: false,
82
- stt: { provider: "openai", model: "whisper-1" },
83
- tts: {
84
- provider: "openai",
85
- openai: { model: "gpt-4o-mini-tts", voice: "coral" },
86
- },
87
- responseModel: "openai/gpt-4o-mini",
88
- responseTimeoutMs: 30000,
89
- };
49
+ return createVoiceCallBaseConfig({ tunnelProvider: "ngrok" });
90
50
  }
91
51
 
92
52
  describe("createVoiceCallRuntime lifecycle", () => {
@@ -1,5 +1,6 @@
1
1
  import type { VoiceCallTtsConfig } from "./config.js";
2
2
  import type { CoreConfig } from "./core-bridge.js";
3
+ import { deepMergeDefined } from "./deep-merge.js";
3
4
  import { convertPcmToMulaw8k } from "./telephony-audio.js";
4
5
 
5
6
  export type TelephonyTtsRuntime = {
@@ -20,8 +21,6 @@ export type TelephonyTtsProvider = {
20
21
  synthesizeForTelephony: (text: string) => Promise<Buffer>;
21
22
  };
22
23
 
23
- const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
24
-
25
24
  export function createTelephonyTtsProvider(params: {
26
25
  coreConfig: CoreConfig;
27
26
  ttsOverride?: VoiceCallTtsConfig;
@@ -79,28 +78,5 @@ function mergeTtsConfig(
79
78
  if (!base) {
80
79
  return override;
81
80
  }
82
- return deepMerge(base, override);
83
- }
84
-
85
- function deepMerge<T>(base: T, override: T): T {
86
- if (!isPlainObject(base) || !isPlainObject(override)) {
87
- return override;
88
- }
89
- const result: Record<string, unknown> = { ...base };
90
- for (const [key, value] of Object.entries(override)) {
91
- if (BLOCKED_MERGE_KEYS.has(key) || value === undefined) {
92
- continue;
93
- }
94
- const existing = (base as Record<string, unknown>)[key];
95
- if (isPlainObject(existing) && isPlainObject(value)) {
96
- result[key] = deepMerge(existing, value);
97
- } else {
98
- result[key] = value;
99
- }
100
- }
101
- return result as T;
102
- }
103
-
104
- function isPlainObject(value: unknown): value is Record<string, unknown> {
105
- return Boolean(value) && typeof value === "object" && !Array.isArray(value);
81
+ return deepMergeDefined(base, override) as VoiceCallTtsConfig;
106
82
  }
@@ -0,0 +1,52 @@
1
+ import type { VoiceCallConfig } from "./config.js";
2
+
3
+ export function createVoiceCallBaseConfig(params?: {
4
+ provider?: "telnyx" | "twilio" | "plivo" | "mock";
5
+ tunnelProvider?: "none" | "ngrok";
6
+ }): VoiceCallConfig {
7
+ return {
8
+ enabled: true,
9
+ provider: params?.provider ?? "mock",
10
+ fromNumber: "+15550001234",
11
+ inboundPolicy: "disabled",
12
+ allowFrom: [],
13
+ outbound: { defaultMode: "notify", notifyHangupDelaySec: 3 },
14
+ maxDurationSeconds: 300,
15
+ staleCallReaperSeconds: 600,
16
+ silenceTimeoutMs: 800,
17
+ transcriptTimeoutMs: 180000,
18
+ ringTimeoutMs: 30000,
19
+ maxConcurrentCalls: 1,
20
+ serve: { port: 3334, bind: "127.0.0.1", path: "/voice/webhook" },
21
+ tailscale: { mode: "off", path: "/voice/webhook" },
22
+ tunnel: {
23
+ provider: params?.tunnelProvider ?? "none",
24
+ allowNgrokFreeTierLoopbackBypass: false,
25
+ },
26
+ webhookSecurity: {
27
+ allowedHosts: [],
28
+ trustForwardingHeaders: false,
29
+ trustedProxyIPs: [],
30
+ },
31
+ streaming: {
32
+ enabled: false,
33
+ sttProvider: "openai-realtime",
34
+ sttModel: "gpt-4o-transcribe",
35
+ silenceDurationMs: 800,
36
+ vadThreshold: 0.5,
37
+ streamPath: "/voice/stream",
38
+ preStartTimeoutMs: 5000,
39
+ maxPendingConnections: 32,
40
+ maxPendingConnectionsPerIp: 4,
41
+ maxConnections: 128,
42
+ },
43
+ skipSignatureVerification: false,
44
+ stt: { provider: "openai", model: "whisper-1" },
45
+ tts: {
46
+ provider: "openai",
47
+ openai: { model: "gpt-4o-mini-tts", voice: "coral" },
48
+ },
49
+ responseModel: "openai/gpt-4o-mini",
50
+ responseTimeoutMs: 30000,
51
+ };
52
+ }
@@ -274,6 +274,32 @@ describe("VoiceCallWebhookServer replay handling", () => {
274
274
  });
275
275
  });
276
276
 
277
+ describe("VoiceCallWebhookServer response normalization", () => {
278
+ it("preserves explicit empty provider response bodies", async () => {
279
+ const responseProvider: VoiceCallProvider = {
280
+ ...provider,
281
+ parseWebhookEvent: () => ({
282
+ events: [],
283
+ statusCode: 204,
284
+ providerResponseBody: "",
285
+ }),
286
+ };
287
+ const { manager } = createManager([]);
288
+ const config = createConfig({ serve: { port: 0, bind: "127.0.0.1", path: "/voice/webhook" } });
289
+ const server = new VoiceCallWebhookServer(config, manager, responseProvider);
290
+
291
+ try {
292
+ const baseUrl = await server.start();
293
+ const response = await postWebhookForm(server, baseUrl, "CallSid=CA123&SpeechResult=hello");
294
+
295
+ expect(response.status).toBe(204);
296
+ expect(await response.text()).toBe("");
297
+ } finally {
298
+ await server.stop();
299
+ }
300
+ });
301
+ });
302
+
277
303
  describe("VoiceCallWebhookServer start idempotency", () => {
278
304
  it("returns existing URL when start() is called twice without stop()", async () => {
279
305
  const { manager } = createManager([]);
package/src/webhook.ts CHANGED
@@ -4,8 +4,8 @@ import {
4
4
  isRequestBodyLimitError,
5
5
  readRequestBodyWithLimit,
6
6
  requestBodyErrorToText,
7
- } from "openclaw/plugin-sdk";
8
- import type { VoiceCallConfig } from "./config.js";
7
+ } from "openclaw/plugin-sdk/voice-call";
8
+ import { normalizeVoiceCallConfig, type VoiceCallConfig } from "./config.js";
9
9
  import type { CoreConfig } from "./core-bridge.js";
10
10
  import type { CallManager } from "./manager.js";
11
11
  import type { MediaStreamConfig } from "./media-stream.js";
@@ -24,6 +24,26 @@ type WebhookResponsePayload = {
24
24
  headers?: Record<string, string>;
25
25
  };
26
26
 
27
+ function buildRequestUrl(
28
+ requestUrl: string | undefined,
29
+ requestHost: string | undefined,
30
+ fallbackHost = "localhost",
31
+ ): URL {
32
+ return new URL(requestUrl ?? "/", `http://${requestHost ?? fallbackHost}`);
33
+ }
34
+
35
+ function normalizeWebhookResponse(parsed: {
36
+ statusCode?: number;
37
+ providerResponseHeaders?: Record<string, string>;
38
+ providerResponseBody?: string;
39
+ }): WebhookResponsePayload {
40
+ return {
41
+ statusCode: parsed.statusCode ?? 200,
42
+ headers: parsed.providerResponseHeaders,
43
+ body: parsed.providerResponseBody ?? "OK",
44
+ };
45
+ }
46
+
27
47
  /**
28
48
  * HTTP server for receiving voice call webhooks from providers.
29
49
  * Supports WebSocket upgrades for media streams when streaming is enabled.
@@ -46,13 +66,13 @@ export class VoiceCallWebhookServer {
46
66
  provider: VoiceCallProvider,
47
67
  coreConfig?: CoreConfig,
48
68
  ) {
49
- this.config = config;
69
+ this.config = normalizeVoiceCallConfig(config);
50
70
  this.manager = manager;
51
71
  this.provider = provider;
52
72
  this.coreConfig = coreConfig ?? null;
53
73
 
54
74
  // Initialize media stream handler if streaming is enabled
55
- if (config.streaming?.enabled) {
75
+ if (this.config.streaming.enabled) {
56
76
  this.initializeMediaStreaming();
57
77
  }
58
78
  }
@@ -68,7 +88,8 @@ export class VoiceCallWebhookServer {
68
88
  * Initialize media streaming with OpenAI Realtime STT.
69
89
  */
70
90
  private initializeMediaStreaming(): void {
71
- const apiKey = this.config.streaming?.openaiApiKey || process.env.OPENAI_API_KEY;
91
+ const streaming = this.config.streaming;
92
+ const apiKey = streaming.openaiApiKey ?? process.env.OPENAI_API_KEY;
72
93
 
73
94
  if (!apiKey) {
74
95
  console.warn("[voice-call] Streaming enabled but no OpenAI API key found");
@@ -77,17 +98,17 @@ export class VoiceCallWebhookServer {
77
98
 
78
99
  const sttProvider = new OpenAIRealtimeSTTProvider({
79
100
  apiKey,
80
- model: this.config.streaming?.sttModel,
81
- silenceDurationMs: this.config.streaming?.silenceDurationMs,
82
- vadThreshold: this.config.streaming?.vadThreshold,
101
+ model: streaming.sttModel,
102
+ silenceDurationMs: streaming.silenceDurationMs,
103
+ vadThreshold: streaming.vadThreshold,
83
104
  });
84
105
 
85
106
  const streamConfig: MediaStreamConfig = {
86
107
  sttProvider,
87
- preStartTimeoutMs: this.config.streaming?.preStartTimeoutMs,
88
- maxPendingConnections: this.config.streaming?.maxPendingConnections,
89
- maxPendingConnectionsPerIp: this.config.streaming?.maxPendingConnectionsPerIp,
90
- maxConnections: this.config.streaming?.maxConnections,
108
+ preStartTimeoutMs: streaming.preStartTimeoutMs,
109
+ maxPendingConnections: streaming.maxPendingConnections,
110
+ maxPendingConnectionsPerIp: streaming.maxPendingConnectionsPerIp,
111
+ maxConnections: streaming.maxConnections,
91
112
  shouldAcceptStream: ({ callId, token }) => {
92
113
  const call = this.manager.getCallByProviderCallId(callId);
93
114
  if (!call) {
@@ -190,7 +211,7 @@ export class VoiceCallWebhookServer {
190
211
  */
191
212
  async start(): Promise<string> {
192
213
  const { port, bind, path: webhookPath } = this.config.serve;
193
- const streamPath = this.config.streaming?.streamPath || "/voice/stream";
214
+ const streamPath = this.config.streaming.streamPath;
194
215
 
195
216
  // Guard: if a server is already listening, return the existing URL.
196
217
  // This prevents EADDRINUSE when start() is called more than once on the
@@ -280,8 +301,7 @@ export class VoiceCallWebhookServer {
280
301
 
281
302
  private getUpgradePathname(request: http.IncomingMessage): string | null {
282
303
  try {
283
- const host = request.headers.host || "localhost";
284
- return new URL(request.url || "/", `http://${host}`).pathname;
304
+ return buildRequestUrl(request.url, request.headers.host).pathname;
285
305
  } catch {
286
306
  return null;
287
307
  }
@@ -322,7 +342,7 @@ export class VoiceCallWebhookServer {
322
342
  req: http.IncomingMessage,
323
343
  webhookPath: string,
324
344
  ): Promise<WebhookResponsePayload> {
325
- const url = new URL(req.url || "/", `http://${req.headers.host}`);
345
+ const url = buildRequestUrl(req.url, req.headers.host);
326
346
 
327
347
  if (url.pathname === "/voice/hold-music") {
328
348
  return {
@@ -360,7 +380,7 @@ export class VoiceCallWebhookServer {
360
380
  const ctx: WebhookContext = {
361
381
  headers: req.headers as Record<string, string | string[] | undefined>,
362
382
  rawBody: body,
363
- url: `http://${req.headers.host}${req.url}`,
383
+ url: url.toString(),
364
384
  method: "POST",
365
385
  query: Object.fromEntries(url.searchParams),
366
386
  remoteAddress: req.socket.remoteAddress ?? undefined,
@@ -386,11 +406,7 @@ export class VoiceCallWebhookServer {
386
406
  this.processParsedEvents(parsed.events);
387
407
  }
388
408
 
389
- return {
390
- statusCode: parsed.statusCode || 200,
391
- headers: parsed.providerResponseHeaders,
392
- body: parsed.providerResponseBody || "OK",
393
- };
409
+ return normalizeWebhookResponse(parsed);
394
410
  }
395
411
 
396
412
  private processParsedEvents(events: NormalizedEvent[]): void {