@openclaw/voice-call 2026.3.2 → 2026.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/index.ts +29 -13
- package/openclaw.plugin.json +41 -0
- package/package.json +2 -1
- package/src/cli.ts +1 -1
- package/src/config.test.ts +53 -43
- package/src/config.ts +52 -4
- package/src/deep-merge.ts +23 -0
- package/src/providers/mock.test.ts +78 -0
- package/src/providers/mock.ts +8 -8
- package/src/providers/shared/guarded-json-api.ts +1 -1
- package/src/providers/stt-openai-realtime.test.ts +42 -0
- package/src/providers/stt-openai-realtime.ts +2 -2
- package/src/providers/tts-openai.test.ts +43 -0
- package/src/providers/tts-openai.ts +14 -55
- package/src/runtime.test.ts +2 -42
- package/src/telephony-tts.ts +2 -26
- package/src/test-fixtures.ts +52 -0
- package/src/webhook.test.ts +26 -0
- package/src/webhook.ts +38 -22
package/CHANGELOG.md
CHANGED
package/index.ts
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
import { Type } from "@sinclair/typebox";
|
|
2
|
-
import type {
|
|
2
|
+
import type {
|
|
3
|
+
GatewayRequestHandlerOptions,
|
|
4
|
+
OpenClawPluginApi,
|
|
5
|
+
} from "openclaw/plugin-sdk/voice-call";
|
|
3
6
|
import { registerVoiceCallCli } from "./src/cli.js";
|
|
4
7
|
import {
|
|
5
8
|
VoiceCallConfigSchema,
|
|
@@ -206,6 +209,23 @@ const voiceCallPlugin = {
|
|
|
206
209
|
const rt = await ensureRuntime();
|
|
207
210
|
return { rt, callId, message } as const;
|
|
208
211
|
};
|
|
212
|
+
const initiateCallAndRespond = async (params: {
|
|
213
|
+
rt: VoiceCallRuntime;
|
|
214
|
+
respond: GatewayRequestHandlerOptions["respond"];
|
|
215
|
+
to: string;
|
|
216
|
+
message?: string;
|
|
217
|
+
mode?: "notify" | "conversation";
|
|
218
|
+
}) => {
|
|
219
|
+
const result = await params.rt.manager.initiateCall(params.to, undefined, {
|
|
220
|
+
message: params.message,
|
|
221
|
+
mode: params.mode,
|
|
222
|
+
});
|
|
223
|
+
if (!result.success) {
|
|
224
|
+
params.respond(false, { error: result.error || "initiate failed" });
|
|
225
|
+
return;
|
|
226
|
+
}
|
|
227
|
+
params.respond(true, { callId: result.callId, initiated: true });
|
|
228
|
+
};
|
|
209
229
|
|
|
210
230
|
api.registerGatewayMethod(
|
|
211
231
|
"voicecall.initiate",
|
|
@@ -227,15 +247,13 @@ const voiceCallPlugin = {
|
|
|
227
247
|
}
|
|
228
248
|
const mode =
|
|
229
249
|
params?.mode === "notify" || params?.mode === "conversation" ? params.mode : undefined;
|
|
230
|
-
|
|
250
|
+
await initiateCallAndRespond({
|
|
251
|
+
rt,
|
|
252
|
+
respond,
|
|
253
|
+
to,
|
|
231
254
|
message,
|
|
232
255
|
mode,
|
|
233
256
|
});
|
|
234
|
-
if (!result.success) {
|
|
235
|
-
respond(false, { error: result.error || "initiate failed" });
|
|
236
|
-
return;
|
|
237
|
-
}
|
|
238
|
-
respond(true, { callId: result.callId, initiated: true });
|
|
239
257
|
} catch (err) {
|
|
240
258
|
sendError(respond, err);
|
|
241
259
|
}
|
|
@@ -344,14 +362,12 @@ const voiceCallPlugin = {
|
|
|
344
362
|
return;
|
|
345
363
|
}
|
|
346
364
|
const rt = await ensureRuntime();
|
|
347
|
-
|
|
365
|
+
await initiateCallAndRespond({
|
|
366
|
+
rt,
|
|
367
|
+
respond,
|
|
368
|
+
to,
|
|
348
369
|
message: message || undefined,
|
|
349
370
|
});
|
|
350
|
-
if (!result.success) {
|
|
351
|
-
respond(false, { error: result.error || "initiate failed" });
|
|
352
|
-
return;
|
|
353
|
-
}
|
|
354
|
-
respond(true, { callId: result.callId, initiated: true });
|
|
355
371
|
} catch (err) {
|
|
356
372
|
sendError(respond, err);
|
|
357
373
|
}
|
package/openclaw.plugin.json
CHANGED
|
@@ -249,6 +249,10 @@
|
|
|
249
249
|
"type": "integer",
|
|
250
250
|
"minimum": 1
|
|
251
251
|
},
|
|
252
|
+
"staleCallReaperSeconds": {
|
|
253
|
+
"type": "integer",
|
|
254
|
+
"minimum": 0
|
|
255
|
+
},
|
|
252
256
|
"silenceTimeoutMs": {
|
|
253
257
|
"type": "integer",
|
|
254
258
|
"minimum": 1
|
|
@@ -313,6 +317,27 @@
|
|
|
313
317
|
}
|
|
314
318
|
}
|
|
315
319
|
},
|
|
320
|
+
"webhookSecurity": {
|
|
321
|
+
"type": "object",
|
|
322
|
+
"additionalProperties": false,
|
|
323
|
+
"properties": {
|
|
324
|
+
"allowedHosts": {
|
|
325
|
+
"type": "array",
|
|
326
|
+
"items": {
|
|
327
|
+
"type": "string"
|
|
328
|
+
}
|
|
329
|
+
},
|
|
330
|
+
"trustForwardingHeaders": {
|
|
331
|
+
"type": "boolean"
|
|
332
|
+
},
|
|
333
|
+
"trustedProxyIPs": {
|
|
334
|
+
"type": "array",
|
|
335
|
+
"items": {
|
|
336
|
+
"type": "string"
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
}
|
|
340
|
+
},
|
|
316
341
|
"streaming": {
|
|
317
342
|
"type": "object",
|
|
318
343
|
"additionalProperties": false,
|
|
@@ -341,6 +366,22 @@
|
|
|
341
366
|
},
|
|
342
367
|
"streamPath": {
|
|
343
368
|
"type": "string"
|
|
369
|
+
},
|
|
370
|
+
"preStartTimeoutMs": {
|
|
371
|
+
"type": "integer",
|
|
372
|
+
"minimum": 1
|
|
373
|
+
},
|
|
374
|
+
"maxPendingConnections": {
|
|
375
|
+
"type": "integer",
|
|
376
|
+
"minimum": 1
|
|
377
|
+
},
|
|
378
|
+
"maxPendingConnectionsPerIp": {
|
|
379
|
+
"type": "integer",
|
|
380
|
+
"minimum": 1
|
|
381
|
+
},
|
|
382
|
+
"maxConnections": {
|
|
383
|
+
"type": "integer",
|
|
384
|
+
"minimum": 1
|
|
344
385
|
}
|
|
345
386
|
}
|
|
346
387
|
},
|
package/package.json
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@openclaw/voice-call",
|
|
3
|
-
"version": "2026.3.
|
|
3
|
+
"version": "2026.3.7",
|
|
4
4
|
"description": "OpenClaw voice-call plugin",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"dependencies": {
|
|
7
7
|
"@sinclair/typebox": "0.34.48",
|
|
8
|
+
"commander": "^14.0.3",
|
|
8
9
|
"ws": "^8.19.0",
|
|
9
10
|
"zod": "^4.3.6"
|
|
10
11
|
},
|
package/src/cli.ts
CHANGED
|
@@ -2,7 +2,7 @@ import fs from "node:fs";
|
|
|
2
2
|
import os from "node:os";
|
|
3
3
|
import path from "node:path";
|
|
4
4
|
import type { Command } from "commander";
|
|
5
|
-
import { sleep } from "openclaw/plugin-sdk";
|
|
5
|
+
import { sleep } from "openclaw/plugin-sdk/voice-call";
|
|
6
6
|
import type { VoiceCallConfig } from "./config.js";
|
|
7
7
|
import type { VoiceCallRuntime } from "./runtime.js";
|
|
8
8
|
import { resolveUserPath } from "./utils.js";
|
package/src/config.test.ts
CHANGED
|
@@ -1,49 +1,14 @@
|
|
|
1
1
|
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
|
2
|
-
import {
|
|
2
|
+
import {
|
|
3
|
+
validateProviderConfig,
|
|
4
|
+
normalizeVoiceCallConfig,
|
|
5
|
+
resolveVoiceCallConfig,
|
|
6
|
+
type VoiceCallConfig,
|
|
7
|
+
} from "./config.js";
|
|
8
|
+
import { createVoiceCallBaseConfig } from "./test-fixtures.js";
|
|
3
9
|
|
|
4
10
|
function createBaseConfig(provider: "telnyx" | "twilio" | "plivo" | "mock"): VoiceCallConfig {
|
|
5
|
-
return {
|
|
6
|
-
enabled: true,
|
|
7
|
-
provider,
|
|
8
|
-
fromNumber: "+15550001234",
|
|
9
|
-
inboundPolicy: "disabled",
|
|
10
|
-
allowFrom: [],
|
|
11
|
-
outbound: { defaultMode: "notify", notifyHangupDelaySec: 3 },
|
|
12
|
-
maxDurationSeconds: 300,
|
|
13
|
-
staleCallReaperSeconds: 600,
|
|
14
|
-
silenceTimeoutMs: 800,
|
|
15
|
-
transcriptTimeoutMs: 180000,
|
|
16
|
-
ringTimeoutMs: 30000,
|
|
17
|
-
maxConcurrentCalls: 1,
|
|
18
|
-
serve: { port: 3334, bind: "127.0.0.1", path: "/voice/webhook" },
|
|
19
|
-
tailscale: { mode: "off", path: "/voice/webhook" },
|
|
20
|
-
tunnel: { provider: "none", allowNgrokFreeTierLoopbackBypass: false },
|
|
21
|
-
webhookSecurity: {
|
|
22
|
-
allowedHosts: [],
|
|
23
|
-
trustForwardingHeaders: false,
|
|
24
|
-
trustedProxyIPs: [],
|
|
25
|
-
},
|
|
26
|
-
streaming: {
|
|
27
|
-
enabled: false,
|
|
28
|
-
sttProvider: "openai-realtime",
|
|
29
|
-
sttModel: "gpt-4o-transcribe",
|
|
30
|
-
silenceDurationMs: 800,
|
|
31
|
-
vadThreshold: 0.5,
|
|
32
|
-
streamPath: "/voice/stream",
|
|
33
|
-
preStartTimeoutMs: 5000,
|
|
34
|
-
maxPendingConnections: 32,
|
|
35
|
-
maxPendingConnectionsPerIp: 4,
|
|
36
|
-
maxConnections: 128,
|
|
37
|
-
},
|
|
38
|
-
skipSignatureVerification: false,
|
|
39
|
-
stt: { provider: "openai", model: "whisper-1" },
|
|
40
|
-
tts: {
|
|
41
|
-
provider: "openai",
|
|
42
|
-
openai: { model: "gpt-4o-mini-tts", voice: "coral" },
|
|
43
|
-
},
|
|
44
|
-
responseModel: "openai/gpt-4o-mini",
|
|
45
|
-
responseTimeoutMs: 30000,
|
|
46
|
-
};
|
|
11
|
+
return createVoiceCallBaseConfig({ provider });
|
|
47
12
|
}
|
|
48
13
|
|
|
49
14
|
describe("validateProviderConfig", () => {
|
|
@@ -206,3 +171,48 @@ describe("validateProviderConfig", () => {
|
|
|
206
171
|
});
|
|
207
172
|
});
|
|
208
173
|
});
|
|
174
|
+
|
|
175
|
+
describe("normalizeVoiceCallConfig", () => {
|
|
176
|
+
it("fills nested runtime defaults from a partial config boundary", () => {
|
|
177
|
+
const normalized = normalizeVoiceCallConfig({
|
|
178
|
+
enabled: true,
|
|
179
|
+
provider: "mock",
|
|
180
|
+
streaming: {
|
|
181
|
+
enabled: true,
|
|
182
|
+
streamPath: "/custom-stream",
|
|
183
|
+
},
|
|
184
|
+
});
|
|
185
|
+
|
|
186
|
+
expect(normalized.serve.path).toBe("/voice/webhook");
|
|
187
|
+
expect(normalized.streaming.streamPath).toBe("/custom-stream");
|
|
188
|
+
expect(normalized.streaming.sttModel).toBe("gpt-4o-transcribe");
|
|
189
|
+
expect(normalized.tunnel.provider).toBe("none");
|
|
190
|
+
expect(normalized.webhookSecurity.allowedHosts).toEqual([]);
|
|
191
|
+
});
|
|
192
|
+
|
|
193
|
+
it("accepts partial nested TTS overrides and preserves nested objects", () => {
|
|
194
|
+
const normalized = normalizeVoiceCallConfig({
|
|
195
|
+
tts: {
|
|
196
|
+
provider: "elevenlabs",
|
|
197
|
+
elevenlabs: {
|
|
198
|
+
apiKey: {
|
|
199
|
+
source: "env",
|
|
200
|
+
provider: "elevenlabs",
|
|
201
|
+
id: "ELEVENLABS_API_KEY",
|
|
202
|
+
},
|
|
203
|
+
voiceSettings: {
|
|
204
|
+
speed: 1.1,
|
|
205
|
+
},
|
|
206
|
+
},
|
|
207
|
+
},
|
|
208
|
+
});
|
|
209
|
+
|
|
210
|
+
expect(normalized.tts?.provider).toBe("elevenlabs");
|
|
211
|
+
expect(normalized.tts?.elevenlabs?.apiKey).toEqual({
|
|
212
|
+
source: "env",
|
|
213
|
+
provider: "elevenlabs",
|
|
214
|
+
id: "ELEVENLABS_API_KEY",
|
|
215
|
+
});
|
|
216
|
+
expect(normalized.tts?.elevenlabs?.voiceSettings).toEqual({ speed: 1.1 });
|
|
217
|
+
});
|
|
218
|
+
});
|
package/src/config.ts
CHANGED
|
@@ -3,8 +3,9 @@ import {
|
|
|
3
3
|
TtsConfigSchema,
|
|
4
4
|
TtsModeSchema,
|
|
5
5
|
TtsProviderSchema,
|
|
6
|
-
} from "openclaw/plugin-sdk";
|
|
6
|
+
} from "openclaw/plugin-sdk/voice-call";
|
|
7
7
|
import { z } from "zod";
|
|
8
|
+
import { deepMergeDefined } from "./deep-merge.js";
|
|
8
9
|
|
|
9
10
|
// -----------------------------------------------------------------------------
|
|
10
11
|
// Phone Number Validation
|
|
@@ -350,17 +351,64 @@ export const VoiceCallConfigSchema = z
|
|
|
350
351
|
.strict();
|
|
351
352
|
|
|
352
353
|
export type VoiceCallConfig = z.infer<typeof VoiceCallConfigSchema>;
|
|
354
|
+
type DeepPartial<T> =
|
|
355
|
+
T extends Array<infer U>
|
|
356
|
+
? DeepPartial<U>[]
|
|
357
|
+
: T extends object
|
|
358
|
+
? { [K in keyof T]?: DeepPartial<T[K]> }
|
|
359
|
+
: T;
|
|
360
|
+
export type VoiceCallConfigInput = DeepPartial<VoiceCallConfig>;
|
|
353
361
|
|
|
354
362
|
// -----------------------------------------------------------------------------
|
|
355
363
|
// Configuration Helpers
|
|
356
364
|
// -----------------------------------------------------------------------------
|
|
357
365
|
|
|
366
|
+
const DEFAULT_VOICE_CALL_CONFIG = VoiceCallConfigSchema.parse({});
|
|
367
|
+
|
|
368
|
+
function cloneDefaultVoiceCallConfig(): VoiceCallConfig {
|
|
369
|
+
return structuredClone(DEFAULT_VOICE_CALL_CONFIG);
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
function normalizeVoiceCallTtsConfig(
|
|
373
|
+
defaults: VoiceCallTtsConfig,
|
|
374
|
+
overrides: DeepPartial<NonNullable<VoiceCallTtsConfig>> | undefined,
|
|
375
|
+
): VoiceCallTtsConfig {
|
|
376
|
+
if (!defaults && !overrides) {
|
|
377
|
+
return undefined;
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
return TtsConfigSchema.parse(deepMergeDefined(defaults ?? {}, overrides ?? {}));
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
|
|
384
|
+
const defaults = cloneDefaultVoiceCallConfig();
|
|
385
|
+
return {
|
|
386
|
+
...defaults,
|
|
387
|
+
...config,
|
|
388
|
+
allowFrom: config.allowFrom ?? defaults.allowFrom,
|
|
389
|
+
outbound: { ...defaults.outbound, ...config.outbound },
|
|
390
|
+
serve: { ...defaults.serve, ...config.serve },
|
|
391
|
+
tailscale: { ...defaults.tailscale, ...config.tailscale },
|
|
392
|
+
tunnel: { ...defaults.tunnel, ...config.tunnel },
|
|
393
|
+
webhookSecurity: {
|
|
394
|
+
...defaults.webhookSecurity,
|
|
395
|
+
...config.webhookSecurity,
|
|
396
|
+
allowedHosts: config.webhookSecurity?.allowedHosts ?? defaults.webhookSecurity.allowedHosts,
|
|
397
|
+
trustedProxyIPs:
|
|
398
|
+
config.webhookSecurity?.trustedProxyIPs ?? defaults.webhookSecurity.trustedProxyIPs,
|
|
399
|
+
},
|
|
400
|
+
streaming: { ...defaults.streaming, ...config.streaming },
|
|
401
|
+
stt: { ...defaults.stt, ...config.stt },
|
|
402
|
+
tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts),
|
|
403
|
+
};
|
|
404
|
+
}
|
|
405
|
+
|
|
358
406
|
/**
|
|
359
407
|
* Resolves the configuration by merging environment variables into missing fields.
|
|
360
408
|
* Returns a new configuration object with environment variables applied.
|
|
361
409
|
*/
|
|
362
|
-
export function resolveVoiceCallConfig(config: VoiceCallConfig): VoiceCallConfig {
|
|
363
|
-
const resolved =
|
|
410
|
+
export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
|
|
411
|
+
const resolved = normalizeVoiceCallConfig(config);
|
|
364
412
|
|
|
365
413
|
// Telnyx
|
|
366
414
|
if (resolved.provider === "telnyx") {
|
|
@@ -405,7 +453,7 @@ export function resolveVoiceCallConfig(config: VoiceCallConfig): VoiceCallConfig
|
|
|
405
453
|
resolved.webhookSecurity.trustForwardingHeaders ?? false;
|
|
406
454
|
resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? [];
|
|
407
455
|
|
|
408
|
-
return resolved;
|
|
456
|
+
return normalizeVoiceCallConfig(resolved);
|
|
409
457
|
}
|
|
410
458
|
|
|
411
459
|
/**
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
|
|
2
|
+
|
|
3
|
+
export function deepMergeDefined(base: unknown, override: unknown): unknown {
|
|
4
|
+
if (!isPlainObject(base) || !isPlainObject(override)) {
|
|
5
|
+
return override === undefined ? base : override;
|
|
6
|
+
}
|
|
7
|
+
|
|
8
|
+
const result: Record<string, unknown> = { ...base };
|
|
9
|
+
for (const [key, value] of Object.entries(override)) {
|
|
10
|
+
if (BLOCKED_MERGE_KEYS.has(key) || value === undefined) {
|
|
11
|
+
continue;
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
const existing = result[key];
|
|
15
|
+
result[key] = key in result ? deepMergeDefined(existing, value) : value;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
return result;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
function isPlainObject(value: unknown): value is Record<string, unknown> {
|
|
22
|
+
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
|
|
23
|
+
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
import type { WebhookContext } from "../types.js";
|
|
3
|
+
import { MockProvider } from "./mock.js";
|
|
4
|
+
|
|
5
|
+
function createWebhookContext(rawBody: string): WebhookContext {
|
|
6
|
+
return {
|
|
7
|
+
headers: {},
|
|
8
|
+
rawBody,
|
|
9
|
+
url: "http://localhost/voice/webhook",
|
|
10
|
+
method: "POST",
|
|
11
|
+
query: {},
|
|
12
|
+
};
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
describe("MockProvider", () => {
|
|
16
|
+
it("preserves explicit falsy event values", () => {
|
|
17
|
+
const provider = new MockProvider();
|
|
18
|
+
const result = provider.parseWebhookEvent(
|
|
19
|
+
createWebhookContext(
|
|
20
|
+
JSON.stringify({
|
|
21
|
+
events: [
|
|
22
|
+
{
|
|
23
|
+
id: "evt-error",
|
|
24
|
+
type: "call.error",
|
|
25
|
+
callId: "call-1",
|
|
26
|
+
timestamp: 0,
|
|
27
|
+
error: "",
|
|
28
|
+
retryable: false,
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
id: "evt-ended",
|
|
32
|
+
type: "call.ended",
|
|
33
|
+
callId: "call-2",
|
|
34
|
+
reason: "",
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
id: "evt-speech",
|
|
38
|
+
type: "call.speech",
|
|
39
|
+
callId: "call-3",
|
|
40
|
+
transcript: "",
|
|
41
|
+
isFinal: false,
|
|
42
|
+
},
|
|
43
|
+
],
|
|
44
|
+
}),
|
|
45
|
+
),
|
|
46
|
+
);
|
|
47
|
+
|
|
48
|
+
expect(result.events).toEqual([
|
|
49
|
+
{
|
|
50
|
+
id: "evt-error",
|
|
51
|
+
type: "call.error",
|
|
52
|
+
callId: "call-1",
|
|
53
|
+
providerCallId: undefined,
|
|
54
|
+
timestamp: 0,
|
|
55
|
+
error: "",
|
|
56
|
+
retryable: false,
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
id: "evt-ended",
|
|
60
|
+
type: "call.ended",
|
|
61
|
+
callId: "call-2",
|
|
62
|
+
providerCallId: undefined,
|
|
63
|
+
timestamp: expect.any(Number),
|
|
64
|
+
reason: "",
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
id: "evt-speech",
|
|
68
|
+
type: "call.speech",
|
|
69
|
+
callId: "call-3",
|
|
70
|
+
providerCallId: undefined,
|
|
71
|
+
timestamp: expect.any(Number),
|
|
72
|
+
transcript: "",
|
|
73
|
+
isFinal: false,
|
|
74
|
+
confidence: undefined,
|
|
75
|
+
},
|
|
76
|
+
]);
|
|
77
|
+
});
|
|
78
|
+
});
|
package/src/providers/mock.ts
CHANGED
|
@@ -65,10 +65,10 @@ export class MockProvider implements VoiceCallProvider {
|
|
|
65
65
|
}
|
|
66
66
|
|
|
67
67
|
const base = {
|
|
68
|
-
id: evt.id
|
|
68
|
+
id: evt.id ?? crypto.randomUUID(),
|
|
69
69
|
callId: evt.callId,
|
|
70
70
|
providerCallId: evt.providerCallId,
|
|
71
|
-
timestamp: evt.timestamp
|
|
71
|
+
timestamp: evt.timestamp ?? Date.now(),
|
|
72
72
|
};
|
|
73
73
|
|
|
74
74
|
switch (evt.type) {
|
|
@@ -83,7 +83,7 @@ export class MockProvider implements VoiceCallProvider {
|
|
|
83
83
|
return {
|
|
84
84
|
...base,
|
|
85
85
|
type: evt.type,
|
|
86
|
-
text: payload.text
|
|
86
|
+
text: payload.text ?? "",
|
|
87
87
|
};
|
|
88
88
|
}
|
|
89
89
|
|
|
@@ -98,7 +98,7 @@ export class MockProvider implements VoiceCallProvider {
|
|
|
98
98
|
return {
|
|
99
99
|
...base,
|
|
100
100
|
type: evt.type,
|
|
101
|
-
transcript: payload.transcript
|
|
101
|
+
transcript: payload.transcript ?? "",
|
|
102
102
|
isFinal: payload.isFinal ?? true,
|
|
103
103
|
confidence: payload.confidence,
|
|
104
104
|
};
|
|
@@ -109,7 +109,7 @@ export class MockProvider implements VoiceCallProvider {
|
|
|
109
109
|
return {
|
|
110
110
|
...base,
|
|
111
111
|
type: evt.type,
|
|
112
|
-
durationMs: payload.durationMs
|
|
112
|
+
durationMs: payload.durationMs ?? 0,
|
|
113
113
|
};
|
|
114
114
|
}
|
|
115
115
|
|
|
@@ -118,7 +118,7 @@ export class MockProvider implements VoiceCallProvider {
|
|
|
118
118
|
return {
|
|
119
119
|
...base,
|
|
120
120
|
type: evt.type,
|
|
121
|
-
digits: payload.digits
|
|
121
|
+
digits: payload.digits ?? "",
|
|
122
122
|
};
|
|
123
123
|
}
|
|
124
124
|
|
|
@@ -127,7 +127,7 @@ export class MockProvider implements VoiceCallProvider {
|
|
|
127
127
|
return {
|
|
128
128
|
...base,
|
|
129
129
|
type: evt.type,
|
|
130
|
-
reason: payload.reason
|
|
130
|
+
reason: payload.reason ?? "completed",
|
|
131
131
|
};
|
|
132
132
|
}
|
|
133
133
|
|
|
@@ -136,7 +136,7 @@ export class MockProvider implements VoiceCallProvider {
|
|
|
136
136
|
return {
|
|
137
137
|
...base,
|
|
138
138
|
type: evt.type,
|
|
139
|
-
error: payload.error
|
|
139
|
+
error: payload.error ?? "unknown error",
|
|
140
140
|
retryable: payload.retryable,
|
|
141
141
|
};
|
|
142
142
|
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
import type { RealtimeSTTConfig } from "./stt-openai-realtime.js";
|
|
3
|
+
import { OpenAIRealtimeSTTProvider } from "./stt-openai-realtime.js";
|
|
4
|
+
|
|
5
|
+
type ProviderInternals = {
|
|
6
|
+
vadThreshold: number;
|
|
7
|
+
silenceDurationMs: number;
|
|
8
|
+
};
|
|
9
|
+
|
|
10
|
+
function readProviderInternals(config: RealtimeSTTConfig): ProviderInternals {
|
|
11
|
+
const provider = new OpenAIRealtimeSTTProvider(config) as unknown as Record<string, unknown>;
|
|
12
|
+
return {
|
|
13
|
+
vadThreshold: provider["vadThreshold"] as number,
|
|
14
|
+
silenceDurationMs: provider["silenceDurationMs"] as number,
|
|
15
|
+
};
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
describe("OpenAIRealtimeSTTProvider constructor defaults", () => {
|
|
19
|
+
it("uses vadThreshold: 0 when explicitly configured (max sensitivity)", () => {
|
|
20
|
+
const provider = readProviderInternals({
|
|
21
|
+
apiKey: "sk-test", // pragma: allowlist secret
|
|
22
|
+
vadThreshold: 0,
|
|
23
|
+
});
|
|
24
|
+
expect(provider.vadThreshold).toBe(0);
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
it("uses silenceDurationMs: 0 when explicitly configured", () => {
|
|
28
|
+
const provider = readProviderInternals({
|
|
29
|
+
apiKey: "sk-test", // pragma: allowlist secret
|
|
30
|
+
silenceDurationMs: 0,
|
|
31
|
+
});
|
|
32
|
+
expect(provider.silenceDurationMs).toBe(0);
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
it("falls back to defaults when values are undefined", () => {
|
|
36
|
+
const provider = readProviderInternals({
|
|
37
|
+
apiKey: "sk-test", // pragma: allowlist secret
|
|
38
|
+
});
|
|
39
|
+
expect(provider.vadThreshold).toBe(0.5);
|
|
40
|
+
expect(provider.silenceDurationMs).toBe(800);
|
|
41
|
+
});
|
|
42
|
+
});
|
|
@@ -62,8 +62,8 @@ export class OpenAIRealtimeSTTProvider {
|
|
|
62
62
|
}
|
|
63
63
|
this.apiKey = config.apiKey;
|
|
64
64
|
this.model = config.model || "gpt-4o-transcribe";
|
|
65
|
-
this.silenceDurationMs = config.silenceDurationMs
|
|
66
|
-
this.vadThreshold = config.vadThreshold
|
|
65
|
+
this.silenceDurationMs = config.silenceDurationMs ?? 800;
|
|
66
|
+
this.vadThreshold = config.vadThreshold ?? 0.5;
|
|
67
67
|
}
|
|
68
68
|
|
|
69
69
|
/**
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
import type { OpenAITTSConfig } from "./tts-openai.js";
|
|
3
|
+
import { OpenAITTSProvider } from "./tts-openai.js";
|
|
4
|
+
|
|
5
|
+
type ProviderInternals = {
|
|
6
|
+
model: string;
|
|
7
|
+
voice: string;
|
|
8
|
+
speed: number;
|
|
9
|
+
};
|
|
10
|
+
|
|
11
|
+
function readProviderInternals(config: OpenAITTSConfig): ProviderInternals {
|
|
12
|
+
return new OpenAITTSProvider(config) as unknown as ProviderInternals;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
describe("OpenAITTSProvider constructor defaults", () => {
|
|
16
|
+
it("uses speed: 0 when explicitly configured", () => {
|
|
17
|
+
const provider = readProviderInternals({
|
|
18
|
+
apiKey: "sk-test", // pragma: allowlist secret
|
|
19
|
+
speed: 0,
|
|
20
|
+
});
|
|
21
|
+
|
|
22
|
+
expect(provider.speed).toBe(0);
|
|
23
|
+
});
|
|
24
|
+
|
|
25
|
+
it("falls back to speed default when undefined", () => {
|
|
26
|
+
const provider = readProviderInternals({
|
|
27
|
+
apiKey: "sk-test", // pragma: allowlist secret
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
expect(provider.speed).toBe(1.0);
|
|
31
|
+
});
|
|
32
|
+
|
|
33
|
+
it("treats blank model and voice overrides as unset", () => {
|
|
34
|
+
const provider = readProviderInternals({
|
|
35
|
+
apiKey: "sk-test", // pragma: allowlist secret
|
|
36
|
+
model: " ",
|
|
37
|
+
voice: "",
|
|
38
|
+
});
|
|
39
|
+
|
|
40
|
+
expect(provider.model).toBe("gpt-4o-mini-tts");
|
|
41
|
+
expect(provider.voice).toBe("coral");
|
|
42
|
+
});
|
|
43
|
+
});
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { pcmToMulaw } from "../telephony-audio.js";
|
|
2
|
+
|
|
1
3
|
/**
|
|
2
4
|
* OpenAI TTS Provider
|
|
3
5
|
*
|
|
@@ -64,6 +66,11 @@ export const OPENAI_TTS_VOICES = [
|
|
|
64
66
|
|
|
65
67
|
export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
|
|
66
68
|
|
|
69
|
+
function trimToUndefined(value: string | undefined): string | undefined {
|
|
70
|
+
const trimmed = value?.trim();
|
|
71
|
+
return trimmed ? trimmed : undefined;
|
|
72
|
+
}
|
|
73
|
+
|
|
67
74
|
/**
|
|
68
75
|
* OpenAI TTS Provider for generating speech audio.
|
|
69
76
|
*/
|
|
@@ -75,13 +82,14 @@ export class OpenAITTSProvider {
|
|
|
75
82
|
private instructions?: string;
|
|
76
83
|
|
|
77
84
|
constructor(config: OpenAITTSConfig = {}) {
|
|
78
|
-
this.apiKey =
|
|
85
|
+
this.apiKey =
|
|
86
|
+
trimToUndefined(config.apiKey) ?? trimToUndefined(process.env.OPENAI_API_KEY) ?? "";
|
|
79
87
|
// Default to gpt-4o-mini-tts for intelligent realtime applications
|
|
80
|
-
this.model = config.model
|
|
88
|
+
this.model = trimToUndefined(config.model) ?? "gpt-4o-mini-tts";
|
|
81
89
|
// Default to coral - good balance of quality and natural tone
|
|
82
|
-
this.voice = (config.voice as OpenAITTSVoice)
|
|
83
|
-
this.speed = config.speed
|
|
84
|
-
this.instructions = config.instructions;
|
|
90
|
+
this.voice = (trimToUndefined(config.voice) as OpenAITTSVoice | undefined) ?? "coral";
|
|
91
|
+
this.speed = config.speed ?? 1.0;
|
|
92
|
+
this.instructions = trimToUndefined(config.instructions);
|
|
85
93
|
|
|
86
94
|
if (!this.apiKey) {
|
|
87
95
|
throw new Error("OpenAI API key required (set OPENAI_API_KEY or pass apiKey)");
|
|
@@ -103,7 +111,7 @@ export class OpenAITTSProvider {
|
|
|
103
111
|
};
|
|
104
112
|
|
|
105
113
|
// Add instructions if using gpt-4o-mini-tts model
|
|
106
|
-
const effectiveInstructions = instructions
|
|
114
|
+
const effectiveInstructions = trimToUndefined(instructions) ?? this.instructions;
|
|
107
115
|
if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
|
|
108
116
|
body.instructions = effectiveInstructions;
|
|
109
117
|
}
|
|
@@ -179,55 +187,6 @@ function clamp16(value: number): number {
|
|
|
179
187
|
return Math.max(-32768, Math.min(32767, value));
|
|
180
188
|
}
|
|
181
189
|
|
|
182
|
-
/**
|
|
183
|
-
* Convert 16-bit PCM to 8-bit mu-law.
|
|
184
|
-
* Standard G.711 mu-law encoding for telephony.
|
|
185
|
-
*/
|
|
186
|
-
function pcmToMulaw(pcm: Buffer): Buffer {
|
|
187
|
-
const samples = pcm.length / 2;
|
|
188
|
-
const mulaw = Buffer.alloc(samples);
|
|
189
|
-
|
|
190
|
-
for (let i = 0; i < samples; i++) {
|
|
191
|
-
const sample = pcm.readInt16LE(i * 2);
|
|
192
|
-
mulaw[i] = linearToMulaw(sample);
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
return mulaw;
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
/**
|
|
199
|
-
* Convert a single 16-bit linear sample to 8-bit mu-law.
|
|
200
|
-
* Implements ITU-T G.711 mu-law encoding.
|
|
201
|
-
*/
|
|
202
|
-
function linearToMulaw(sample: number): number {
|
|
203
|
-
const BIAS = 132;
|
|
204
|
-
const CLIP = 32635;
|
|
205
|
-
|
|
206
|
-
// Get sign bit
|
|
207
|
-
const sign = sample < 0 ? 0x80 : 0;
|
|
208
|
-
if (sample < 0) {
|
|
209
|
-
sample = -sample;
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
// Clip to prevent overflow
|
|
213
|
-
if (sample > CLIP) {
|
|
214
|
-
sample = CLIP;
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
// Add bias and find segment
|
|
218
|
-
sample += BIAS;
|
|
219
|
-
let exponent = 7;
|
|
220
|
-
for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--, expMask >>= 1) {
|
|
221
|
-
// Find the segment (exponent)
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
// Extract mantissa bits
|
|
225
|
-
const mantissa = (sample >> (exponent + 3)) & 0x0f;
|
|
226
|
-
|
|
227
|
-
// Combine into mu-law byte (inverted for transmission)
|
|
228
|
-
return ~(sign | (exponent << 4) | mantissa) & 0xff;
|
|
229
|
-
}
|
|
230
|
-
|
|
231
190
|
/**
|
|
232
191
|
* Convert 8-bit mu-law to 16-bit linear PCM.
|
|
233
192
|
* Useful for decoding incoming audio.
|
package/src/runtime.test.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { beforeEach, describe, expect, it, vi } from "vitest";
|
|
2
2
|
import type { VoiceCallConfig } from "./config.js";
|
|
3
3
|
import type { CoreConfig } from "./core-bridge.js";
|
|
4
|
+
import { createVoiceCallBaseConfig } from "./test-fixtures.js";
|
|
4
5
|
|
|
5
6
|
const mocks = vi.hoisted(() => ({
|
|
6
7
|
resolveVoiceCallConfig: vi.fn(),
|
|
@@ -45,48 +46,7 @@ vi.mock("./webhook/tailscale.js", () => ({
|
|
|
45
46
|
import { createVoiceCallRuntime } from "./runtime.js";
|
|
46
47
|
|
|
47
48
|
function createBaseConfig(): VoiceCallConfig {
|
|
48
|
-
return {
|
|
49
|
-
enabled: true,
|
|
50
|
-
provider: "mock",
|
|
51
|
-
fromNumber: "+15550001234",
|
|
52
|
-
inboundPolicy: "disabled",
|
|
53
|
-
allowFrom: [],
|
|
54
|
-
outbound: { defaultMode: "notify", notifyHangupDelaySec: 3 },
|
|
55
|
-
maxDurationSeconds: 300,
|
|
56
|
-
staleCallReaperSeconds: 600,
|
|
57
|
-
silenceTimeoutMs: 800,
|
|
58
|
-
transcriptTimeoutMs: 180000,
|
|
59
|
-
ringTimeoutMs: 30000,
|
|
60
|
-
maxConcurrentCalls: 1,
|
|
61
|
-
serve: { port: 3334, bind: "127.0.0.1", path: "/voice/webhook" },
|
|
62
|
-
tailscale: { mode: "off", path: "/voice/webhook" },
|
|
63
|
-
tunnel: { provider: "ngrok", allowNgrokFreeTierLoopbackBypass: false },
|
|
64
|
-
webhookSecurity: {
|
|
65
|
-
allowedHosts: [],
|
|
66
|
-
trustForwardingHeaders: false,
|
|
67
|
-
trustedProxyIPs: [],
|
|
68
|
-
},
|
|
69
|
-
streaming: {
|
|
70
|
-
enabled: false,
|
|
71
|
-
sttProvider: "openai-realtime",
|
|
72
|
-
sttModel: "gpt-4o-transcribe",
|
|
73
|
-
silenceDurationMs: 800,
|
|
74
|
-
vadThreshold: 0.5,
|
|
75
|
-
streamPath: "/voice/stream",
|
|
76
|
-
preStartTimeoutMs: 5000,
|
|
77
|
-
maxPendingConnections: 32,
|
|
78
|
-
maxPendingConnectionsPerIp: 4,
|
|
79
|
-
maxConnections: 128,
|
|
80
|
-
},
|
|
81
|
-
skipSignatureVerification: false,
|
|
82
|
-
stt: { provider: "openai", model: "whisper-1" },
|
|
83
|
-
tts: {
|
|
84
|
-
provider: "openai",
|
|
85
|
-
openai: { model: "gpt-4o-mini-tts", voice: "coral" },
|
|
86
|
-
},
|
|
87
|
-
responseModel: "openai/gpt-4o-mini",
|
|
88
|
-
responseTimeoutMs: 30000,
|
|
89
|
-
};
|
|
49
|
+
return createVoiceCallBaseConfig({ tunnelProvider: "ngrok" });
|
|
90
50
|
}
|
|
91
51
|
|
|
92
52
|
describe("createVoiceCallRuntime lifecycle", () => {
|
package/src/telephony-tts.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import type { VoiceCallTtsConfig } from "./config.js";
|
|
2
2
|
import type { CoreConfig } from "./core-bridge.js";
|
|
3
|
+
import { deepMergeDefined } from "./deep-merge.js";
|
|
3
4
|
import { convertPcmToMulaw8k } from "./telephony-audio.js";
|
|
4
5
|
|
|
5
6
|
export type TelephonyTtsRuntime = {
|
|
@@ -20,8 +21,6 @@ export type TelephonyTtsProvider = {
|
|
|
20
21
|
synthesizeForTelephony: (text: string) => Promise<Buffer>;
|
|
21
22
|
};
|
|
22
23
|
|
|
23
|
-
const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
|
|
24
|
-
|
|
25
24
|
export function createTelephonyTtsProvider(params: {
|
|
26
25
|
coreConfig: CoreConfig;
|
|
27
26
|
ttsOverride?: VoiceCallTtsConfig;
|
|
@@ -79,28 +78,5 @@ function mergeTtsConfig(
|
|
|
79
78
|
if (!base) {
|
|
80
79
|
return override;
|
|
81
80
|
}
|
|
82
|
-
return
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
function deepMerge<T>(base: T, override: T): T {
|
|
86
|
-
if (!isPlainObject(base) || !isPlainObject(override)) {
|
|
87
|
-
return override;
|
|
88
|
-
}
|
|
89
|
-
const result: Record<string, unknown> = { ...base };
|
|
90
|
-
for (const [key, value] of Object.entries(override)) {
|
|
91
|
-
if (BLOCKED_MERGE_KEYS.has(key) || value === undefined) {
|
|
92
|
-
continue;
|
|
93
|
-
}
|
|
94
|
-
const existing = (base as Record<string, unknown>)[key];
|
|
95
|
-
if (isPlainObject(existing) && isPlainObject(value)) {
|
|
96
|
-
result[key] = deepMerge(existing, value);
|
|
97
|
-
} else {
|
|
98
|
-
result[key] = value;
|
|
99
|
-
}
|
|
100
|
-
}
|
|
101
|
-
return result as T;
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
function isPlainObject(value: unknown): value is Record<string, unknown> {
|
|
105
|
-
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
|
|
81
|
+
return deepMergeDefined(base, override) as VoiceCallTtsConfig;
|
|
106
82
|
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import type { VoiceCallConfig } from "./config.js";
|
|
2
|
+
|
|
3
|
+
export function createVoiceCallBaseConfig(params?: {
|
|
4
|
+
provider?: "telnyx" | "twilio" | "plivo" | "mock";
|
|
5
|
+
tunnelProvider?: "none" | "ngrok";
|
|
6
|
+
}): VoiceCallConfig {
|
|
7
|
+
return {
|
|
8
|
+
enabled: true,
|
|
9
|
+
provider: params?.provider ?? "mock",
|
|
10
|
+
fromNumber: "+15550001234",
|
|
11
|
+
inboundPolicy: "disabled",
|
|
12
|
+
allowFrom: [],
|
|
13
|
+
outbound: { defaultMode: "notify", notifyHangupDelaySec: 3 },
|
|
14
|
+
maxDurationSeconds: 300,
|
|
15
|
+
staleCallReaperSeconds: 600,
|
|
16
|
+
silenceTimeoutMs: 800,
|
|
17
|
+
transcriptTimeoutMs: 180000,
|
|
18
|
+
ringTimeoutMs: 30000,
|
|
19
|
+
maxConcurrentCalls: 1,
|
|
20
|
+
serve: { port: 3334, bind: "127.0.0.1", path: "/voice/webhook" },
|
|
21
|
+
tailscale: { mode: "off", path: "/voice/webhook" },
|
|
22
|
+
tunnel: {
|
|
23
|
+
provider: params?.tunnelProvider ?? "none",
|
|
24
|
+
allowNgrokFreeTierLoopbackBypass: false,
|
|
25
|
+
},
|
|
26
|
+
webhookSecurity: {
|
|
27
|
+
allowedHosts: [],
|
|
28
|
+
trustForwardingHeaders: false,
|
|
29
|
+
trustedProxyIPs: [],
|
|
30
|
+
},
|
|
31
|
+
streaming: {
|
|
32
|
+
enabled: false,
|
|
33
|
+
sttProvider: "openai-realtime",
|
|
34
|
+
sttModel: "gpt-4o-transcribe",
|
|
35
|
+
silenceDurationMs: 800,
|
|
36
|
+
vadThreshold: 0.5,
|
|
37
|
+
streamPath: "/voice/stream",
|
|
38
|
+
preStartTimeoutMs: 5000,
|
|
39
|
+
maxPendingConnections: 32,
|
|
40
|
+
maxPendingConnectionsPerIp: 4,
|
|
41
|
+
maxConnections: 128,
|
|
42
|
+
},
|
|
43
|
+
skipSignatureVerification: false,
|
|
44
|
+
stt: { provider: "openai", model: "whisper-1" },
|
|
45
|
+
tts: {
|
|
46
|
+
provider: "openai",
|
|
47
|
+
openai: { model: "gpt-4o-mini-tts", voice: "coral" },
|
|
48
|
+
},
|
|
49
|
+
responseModel: "openai/gpt-4o-mini",
|
|
50
|
+
responseTimeoutMs: 30000,
|
|
51
|
+
};
|
|
52
|
+
}
|
package/src/webhook.test.ts
CHANGED
|
@@ -274,6 +274,32 @@ describe("VoiceCallWebhookServer replay handling", () => {
|
|
|
274
274
|
});
|
|
275
275
|
});
|
|
276
276
|
|
|
277
|
+
describe("VoiceCallWebhookServer response normalization", () => {
|
|
278
|
+
it("preserves explicit empty provider response bodies", async () => {
|
|
279
|
+
const responseProvider: VoiceCallProvider = {
|
|
280
|
+
...provider,
|
|
281
|
+
parseWebhookEvent: () => ({
|
|
282
|
+
events: [],
|
|
283
|
+
statusCode: 204,
|
|
284
|
+
providerResponseBody: "",
|
|
285
|
+
}),
|
|
286
|
+
};
|
|
287
|
+
const { manager } = createManager([]);
|
|
288
|
+
const config = createConfig({ serve: { port: 0, bind: "127.0.0.1", path: "/voice/webhook" } });
|
|
289
|
+
const server = new VoiceCallWebhookServer(config, manager, responseProvider);
|
|
290
|
+
|
|
291
|
+
try {
|
|
292
|
+
const baseUrl = await server.start();
|
|
293
|
+
const response = await postWebhookForm(server, baseUrl, "CallSid=CA123&SpeechResult=hello");
|
|
294
|
+
|
|
295
|
+
expect(response.status).toBe(204);
|
|
296
|
+
expect(await response.text()).toBe("");
|
|
297
|
+
} finally {
|
|
298
|
+
await server.stop();
|
|
299
|
+
}
|
|
300
|
+
});
|
|
301
|
+
});
|
|
302
|
+
|
|
277
303
|
describe("VoiceCallWebhookServer start idempotency", () => {
|
|
278
304
|
it("returns existing URL when start() is called twice without stop()", async () => {
|
|
279
305
|
const { manager } = createManager([]);
|
package/src/webhook.ts
CHANGED
|
@@ -4,8 +4,8 @@ import {
|
|
|
4
4
|
isRequestBodyLimitError,
|
|
5
5
|
readRequestBodyWithLimit,
|
|
6
6
|
requestBodyErrorToText,
|
|
7
|
-
} from "openclaw/plugin-sdk";
|
|
8
|
-
import type
|
|
7
|
+
} from "openclaw/plugin-sdk/voice-call";
|
|
8
|
+
import { normalizeVoiceCallConfig, type VoiceCallConfig } from "./config.js";
|
|
9
9
|
import type { CoreConfig } from "./core-bridge.js";
|
|
10
10
|
import type { CallManager } from "./manager.js";
|
|
11
11
|
import type { MediaStreamConfig } from "./media-stream.js";
|
|
@@ -24,6 +24,26 @@ type WebhookResponsePayload = {
|
|
|
24
24
|
headers?: Record<string, string>;
|
|
25
25
|
};
|
|
26
26
|
|
|
27
|
+
function buildRequestUrl(
|
|
28
|
+
requestUrl: string | undefined,
|
|
29
|
+
requestHost: string | undefined,
|
|
30
|
+
fallbackHost = "localhost",
|
|
31
|
+
): URL {
|
|
32
|
+
return new URL(requestUrl ?? "/", `http://${requestHost ?? fallbackHost}`);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
function normalizeWebhookResponse(parsed: {
|
|
36
|
+
statusCode?: number;
|
|
37
|
+
providerResponseHeaders?: Record<string, string>;
|
|
38
|
+
providerResponseBody?: string;
|
|
39
|
+
}): WebhookResponsePayload {
|
|
40
|
+
return {
|
|
41
|
+
statusCode: parsed.statusCode ?? 200,
|
|
42
|
+
headers: parsed.providerResponseHeaders,
|
|
43
|
+
body: parsed.providerResponseBody ?? "OK",
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
|
|
27
47
|
/**
|
|
28
48
|
* HTTP server for receiving voice call webhooks from providers.
|
|
29
49
|
* Supports WebSocket upgrades for media streams when streaming is enabled.
|
|
@@ -46,13 +66,13 @@ export class VoiceCallWebhookServer {
|
|
|
46
66
|
provider: VoiceCallProvider,
|
|
47
67
|
coreConfig?: CoreConfig,
|
|
48
68
|
) {
|
|
49
|
-
this.config = config;
|
|
69
|
+
this.config = normalizeVoiceCallConfig(config);
|
|
50
70
|
this.manager = manager;
|
|
51
71
|
this.provider = provider;
|
|
52
72
|
this.coreConfig = coreConfig ?? null;
|
|
53
73
|
|
|
54
74
|
// Initialize media stream handler if streaming is enabled
|
|
55
|
-
if (config.streaming
|
|
75
|
+
if (this.config.streaming.enabled) {
|
|
56
76
|
this.initializeMediaStreaming();
|
|
57
77
|
}
|
|
58
78
|
}
|
|
@@ -68,7 +88,8 @@ export class VoiceCallWebhookServer {
|
|
|
68
88
|
* Initialize media streaming with OpenAI Realtime STT.
|
|
69
89
|
*/
|
|
70
90
|
private initializeMediaStreaming(): void {
|
|
71
|
-
const
|
|
91
|
+
const streaming = this.config.streaming;
|
|
92
|
+
const apiKey = streaming.openaiApiKey ?? process.env.OPENAI_API_KEY;
|
|
72
93
|
|
|
73
94
|
if (!apiKey) {
|
|
74
95
|
console.warn("[voice-call] Streaming enabled but no OpenAI API key found");
|
|
@@ -77,17 +98,17 @@ export class VoiceCallWebhookServer {
|
|
|
77
98
|
|
|
78
99
|
const sttProvider = new OpenAIRealtimeSTTProvider({
|
|
79
100
|
apiKey,
|
|
80
|
-
model:
|
|
81
|
-
silenceDurationMs:
|
|
82
|
-
vadThreshold:
|
|
101
|
+
model: streaming.sttModel,
|
|
102
|
+
silenceDurationMs: streaming.silenceDurationMs,
|
|
103
|
+
vadThreshold: streaming.vadThreshold,
|
|
83
104
|
});
|
|
84
105
|
|
|
85
106
|
const streamConfig: MediaStreamConfig = {
|
|
86
107
|
sttProvider,
|
|
87
|
-
preStartTimeoutMs:
|
|
88
|
-
maxPendingConnections:
|
|
89
|
-
maxPendingConnectionsPerIp:
|
|
90
|
-
maxConnections:
|
|
108
|
+
preStartTimeoutMs: streaming.preStartTimeoutMs,
|
|
109
|
+
maxPendingConnections: streaming.maxPendingConnections,
|
|
110
|
+
maxPendingConnectionsPerIp: streaming.maxPendingConnectionsPerIp,
|
|
111
|
+
maxConnections: streaming.maxConnections,
|
|
91
112
|
shouldAcceptStream: ({ callId, token }) => {
|
|
92
113
|
const call = this.manager.getCallByProviderCallId(callId);
|
|
93
114
|
if (!call) {
|
|
@@ -190,7 +211,7 @@ export class VoiceCallWebhookServer {
|
|
|
190
211
|
*/
|
|
191
212
|
async start(): Promise<string> {
|
|
192
213
|
const { port, bind, path: webhookPath } = this.config.serve;
|
|
193
|
-
const streamPath = this.config.streaming
|
|
214
|
+
const streamPath = this.config.streaming.streamPath;
|
|
194
215
|
|
|
195
216
|
// Guard: if a server is already listening, return the existing URL.
|
|
196
217
|
// This prevents EADDRINUSE when start() is called more than once on the
|
|
@@ -280,8 +301,7 @@ export class VoiceCallWebhookServer {
|
|
|
280
301
|
|
|
281
302
|
private getUpgradePathname(request: http.IncomingMessage): string | null {
|
|
282
303
|
try {
|
|
283
|
-
|
|
284
|
-
return new URL(request.url || "/", `http://${host}`).pathname;
|
|
304
|
+
return buildRequestUrl(request.url, request.headers.host).pathname;
|
|
285
305
|
} catch {
|
|
286
306
|
return null;
|
|
287
307
|
}
|
|
@@ -322,7 +342,7 @@ export class VoiceCallWebhookServer {
|
|
|
322
342
|
req: http.IncomingMessage,
|
|
323
343
|
webhookPath: string,
|
|
324
344
|
): Promise<WebhookResponsePayload> {
|
|
325
|
-
const url =
|
|
345
|
+
const url = buildRequestUrl(req.url, req.headers.host);
|
|
326
346
|
|
|
327
347
|
if (url.pathname === "/voice/hold-music") {
|
|
328
348
|
return {
|
|
@@ -360,7 +380,7 @@ export class VoiceCallWebhookServer {
|
|
|
360
380
|
const ctx: WebhookContext = {
|
|
361
381
|
headers: req.headers as Record<string, string | string[] | undefined>,
|
|
362
382
|
rawBody: body,
|
|
363
|
-
url:
|
|
383
|
+
url: url.toString(),
|
|
364
384
|
method: "POST",
|
|
365
385
|
query: Object.fromEntries(url.searchParams),
|
|
366
386
|
remoteAddress: req.socket.remoteAddress ?? undefined,
|
|
@@ -386,11 +406,7 @@ export class VoiceCallWebhookServer {
|
|
|
386
406
|
this.processParsedEvents(parsed.events);
|
|
387
407
|
}
|
|
388
408
|
|
|
389
|
-
return
|
|
390
|
-
statusCode: parsed.statusCode || 200,
|
|
391
|
-
headers: parsed.providerResponseHeaders,
|
|
392
|
-
body: parsed.providerResponseBody || "OK",
|
|
393
|
-
};
|
|
409
|
+
return normalizeWebhookResponse(parsed);
|
|
394
410
|
}
|
|
395
411
|
|
|
396
412
|
private processParsedEvents(events: NormalizedEvent[]): void {
|