@clawdbot/voice-call 2026.1.21 → 2026.1.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,22 @@
1
1
  # Changelog
2
2
 
3
+ ## 2026.1.24
4
+
5
+ ### Changes
6
+ - Breaking: voice-call TTS now uses core `messages.tts` (plugin TTS config deep‑merges with core).
7
+ - Telephony TTS supports OpenAI + ElevenLabs; Edge TTS is ignored for calls.
8
+ - Removed legacy `tts.model`/`tts.voice`/`tts.instructions` plugin fields.
9
+
10
+ ## 2026.1.23
11
+
12
+ ### Changes
13
+ - Version alignment with core Clawdbot release numbers.
14
+
15
+ ## 2026.1.22
16
+
17
+ ### Changes
18
+ - Version alignment with core Clawdbot release numbers.
19
+
3
20
  ## 2026.1.21
4
21
 
5
22
  ### Changes
package/README.md CHANGED
@@ -75,6 +75,27 @@ Notes:
75
75
  - Twilio/Telnyx/Plivo require a **publicly reachable** webhook URL.
76
76
  - `mock` is a local dev provider (no network calls).
77
77
 
78
+ ## TTS for calls
79
+
80
+ Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for
81
+ streaming speech on calls. You can override it under the plugin config with the
82
+ same shape — overrides deep-merge with `messages.tts`.
83
+
84
+ ```json5
85
+ {
86
+ tts: {
87
+ provider: "openai",
88
+ openai: {
89
+ voice: "alloy"
90
+ }
91
+ }
92
+ }
93
+ ```
94
+
95
+ Notes:
96
+ - Edge TTS is ignored for voice calls (telephony audio needs PCM; Edge output is unreliable).
97
+ - Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to provider native voices.
98
+
78
99
  ## CLI
79
100
 
80
101
  ```bash
@@ -99,16 +99,39 @@
99
99
  "label": "Media Stream Path",
100
100
  "advanced": true
101
101
  },
102
- "tts.model": {
103
- "label": "TTS Model",
102
+ "tts.provider": {
103
+ "label": "TTS Provider Override",
104
+ "help": "Deep-merges with messages.tts (Edge is ignored for calls).",
104
105
  "advanced": true
105
106
  },
106
- "tts.voice": {
107
- "label": "TTS Voice",
107
+ "tts.openai.model": {
108
+ "label": "OpenAI TTS Model",
108
109
  "advanced": true
109
110
  },
110
- "tts.instructions": {
111
- "label": "TTS Instructions",
111
+ "tts.openai.voice": {
112
+ "label": "OpenAI TTS Voice",
113
+ "advanced": true
114
+ },
115
+ "tts.openai.apiKey": {
116
+ "label": "OpenAI API Key",
117
+ "sensitive": true,
118
+ "advanced": true
119
+ },
120
+ "tts.elevenlabs.modelId": {
121
+ "label": "ElevenLabs Model ID",
122
+ "advanced": true
123
+ },
124
+ "tts.elevenlabs.voiceId": {
125
+ "label": "ElevenLabs Voice ID",
126
+ "advanced": true
127
+ },
128
+ "tts.elevenlabs.apiKey": {
129
+ "label": "ElevenLabs API Key",
130
+ "sensitive": true,
131
+ "advanced": true
132
+ },
133
+ "tts.elevenlabs.baseUrl": {
134
+ "label": "ElevenLabs Base URL",
112
135
  "advanced": true
113
136
  },
114
137
  "publicUrl": {
@@ -370,20 +393,193 @@
370
393
  "type": "object",
371
394
  "additionalProperties": false,
372
395
  "properties": {
396
+ "auto": {
397
+ "type": "string",
398
+ "enum": [
399
+ "off",
400
+ "always",
401
+ "inbound",
402
+ "tagged"
403
+ ]
404
+ },
405
+ "enabled": {
406
+ "type": "boolean"
407
+ },
408
+ "mode": {
409
+ "type": "string",
410
+ "enum": [
411
+ "final",
412
+ "all"
413
+ ]
414
+ },
373
415
  "provider": {
374
416
  "type": "string",
375
417
  "enum": [
376
- "openai"
418
+ "openai",
419
+ "elevenlabs",
420
+ "edge"
377
421
  ]
378
422
  },
379
- "model": {
423
+ "summaryModel": {
380
424
  "type": "string"
381
425
  },
382
- "voice": {
383
- "type": "string"
426
+ "modelOverrides": {
427
+ "type": "object",
428
+ "additionalProperties": false,
429
+ "properties": {
430
+ "enabled": {
431
+ "type": "boolean"
432
+ },
433
+ "allowText": {
434
+ "type": "boolean"
435
+ },
436
+ "allowProvider": {
437
+ "type": "boolean"
438
+ },
439
+ "allowVoice": {
440
+ "type": "boolean"
441
+ },
442
+ "allowModelId": {
443
+ "type": "boolean"
444
+ },
445
+ "allowVoiceSettings": {
446
+ "type": "boolean"
447
+ },
448
+ "allowNormalization": {
449
+ "type": "boolean"
450
+ },
451
+ "allowSeed": {
452
+ "type": "boolean"
453
+ }
454
+ }
455
+ },
456
+ "elevenlabs": {
457
+ "type": "object",
458
+ "additionalProperties": false,
459
+ "properties": {
460
+ "apiKey": {
461
+ "type": "string"
462
+ },
463
+ "baseUrl": {
464
+ "type": "string"
465
+ },
466
+ "voiceId": {
467
+ "type": "string"
468
+ },
469
+ "modelId": {
470
+ "type": "string"
471
+ },
472
+ "seed": {
473
+ "type": "integer",
474
+ "minimum": 0,
475
+ "maximum": 4294967295
476
+ },
477
+ "applyTextNormalization": {
478
+ "type": "string",
479
+ "enum": [
480
+ "auto",
481
+ "on",
482
+ "off"
483
+ ]
484
+ },
485
+ "languageCode": {
486
+ "type": "string"
487
+ },
488
+ "voiceSettings": {
489
+ "type": "object",
490
+ "additionalProperties": false,
491
+ "properties": {
492
+ "stability": {
493
+ "type": "number",
494
+ "minimum": 0,
495
+ "maximum": 1
496
+ },
497
+ "similarityBoost": {
498
+ "type": "number",
499
+ "minimum": 0,
500
+ "maximum": 1
501
+ },
502
+ "style": {
503
+ "type": "number",
504
+ "minimum": 0,
505
+ "maximum": 1
506
+ },
507
+ "useSpeakerBoost": {
508
+ "type": "boolean"
509
+ },
510
+ "speed": {
511
+ "type": "number",
512
+ "minimum": 0.5,
513
+ "maximum": 2
514
+ }
515
+ }
516
+ }
517
+ }
518
+ },
519
+ "openai": {
520
+ "type": "object",
521
+ "additionalProperties": false,
522
+ "properties": {
523
+ "apiKey": {
524
+ "type": "string"
525
+ },
526
+ "model": {
527
+ "type": "string"
528
+ },
529
+ "voice": {
530
+ "type": "string"
531
+ }
532
+ }
384
533
  },
385
- "instructions": {
534
+ "edge": {
535
+ "type": "object",
536
+ "additionalProperties": false,
537
+ "properties": {
538
+ "enabled": {
539
+ "type": "boolean"
540
+ },
541
+ "voice": {
542
+ "type": "string"
543
+ },
544
+ "lang": {
545
+ "type": "string"
546
+ },
547
+ "outputFormat": {
548
+ "type": "string"
549
+ },
550
+ "pitch": {
551
+ "type": "string"
552
+ },
553
+ "rate": {
554
+ "type": "string"
555
+ },
556
+ "volume": {
557
+ "type": "string"
558
+ },
559
+ "saveSubtitles": {
560
+ "type": "boolean"
561
+ },
562
+ "proxy": {
563
+ "type": "string"
564
+ },
565
+ "timeoutMs": {
566
+ "type": "integer",
567
+ "minimum": 1000,
568
+ "maximum": 120000
569
+ }
570
+ }
571
+ },
572
+ "prefsPath": {
386
573
  "type": "string"
574
+ },
575
+ "maxTextLength": {
576
+ "type": "integer",
577
+ "minimum": 1
578
+ },
579
+ "timeoutMs": {
580
+ "type": "integer",
581
+ "minimum": 1000,
582
+ "maximum": 120000
387
583
  }
388
584
  }
389
585
  },
package/index.ts CHANGED
@@ -74,9 +74,26 @@ const voiceCallConfigSchema = {
74
74
  },
75
75
  "streaming.sttModel": { label: "Realtime STT Model", advanced: true },
76
76
  "streaming.streamPath": { label: "Media Stream Path", advanced: true },
77
- "tts.model": { label: "TTS Model", advanced: true },
78
- "tts.voice": { label: "TTS Voice", advanced: true },
79
- "tts.instructions": { label: "TTS Instructions", advanced: true },
77
+ "tts.provider": {
78
+ label: "TTS Provider Override",
79
+ help: "Deep-merges with messages.tts (Edge is ignored for calls).",
80
+ advanced: true,
81
+ },
82
+ "tts.openai.model": { label: "OpenAI TTS Model", advanced: true },
83
+ "tts.openai.voice": { label: "OpenAI TTS Voice", advanced: true },
84
+ "tts.openai.apiKey": {
85
+ label: "OpenAI API Key",
86
+ sensitive: true,
87
+ advanced: true,
88
+ },
89
+ "tts.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true },
90
+ "tts.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true },
91
+ "tts.elevenlabs.apiKey": {
92
+ label: "ElevenLabs API Key",
93
+ sensitive: true,
94
+ advanced: true,
95
+ },
96
+ "tts.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true },
80
97
  publicUrl: { label: "Public Webhook URL", advanced: true },
81
98
  skipSignatureVerification: {
82
99
  label: "Skip Signature Verification",
@@ -161,6 +178,7 @@ const voiceCallPlugin = {
161
178
  runtimePromise = createVoiceCallRuntime({
162
179
  config: cfg,
163
180
  coreConfig: api.config as CoreConfig,
181
+ ttsRuntime: api.runtime.tts,
164
182
  logger: api.logger,
165
183
  });
166
184
  }
package/package.json CHANGED
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "@clawdbot/voice-call",
3
- "version": "2026.1.21",
3
+ "version": "2026.1.24",
4
4
  "type": "module",
5
5
  "description": "Clawdbot voice-call plugin",
6
6
  "dependencies": {
7
7
  "@sinclair/typebox": "0.34.47",
8
8
  "ws": "^8.19.0",
9
- "zod": "^4.3.5"
9
+ "zod": "^4.3.6"
10
10
  },
11
11
  "clawdbot": {
12
12
  "extensions": [
package/src/config.ts CHANGED
@@ -82,31 +82,82 @@ export const SttConfigSchema = z
82
82
  .default({ provider: "openai", model: "whisper-1" });
83
83
  export type SttConfig = z.infer<typeof SttConfigSchema>;
84
84
 
85
+ export const TtsProviderSchema = z.enum(["openai", "elevenlabs", "edge"]);
86
+ export const TtsModeSchema = z.enum(["final", "all"]);
87
+ export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
88
+
85
89
  export const TtsConfigSchema = z
86
90
  .object({
87
- /** TTS provider (currently only OpenAI supported) */
88
- provider: z.literal("openai").default("openai"),
89
- /**
90
- * TTS model to use:
91
- * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
92
- * - tts-1: lower latency
93
- * - tts-1-hd: higher quality
94
- */
95
- model: z.string().min(1).default("gpt-4o-mini-tts"),
96
- /**
97
- * Voice ID. For best quality, use marin or cedar.
98
- * All voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
99
- */
100
- voice: z.string().min(1).default("coral"),
101
- /**
102
- * Instructions for speech style (only works with gpt-4o-mini-tts).
103
- * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
104
- */
105
- instructions: z.string().optional(),
91
+ auto: TtsAutoSchema.optional(),
92
+ enabled: z.boolean().optional(),
93
+ mode: TtsModeSchema.optional(),
94
+ provider: TtsProviderSchema.optional(),
95
+ summaryModel: z.string().optional(),
96
+ modelOverrides: z
97
+ .object({
98
+ enabled: z.boolean().optional(),
99
+ allowText: z.boolean().optional(),
100
+ allowProvider: z.boolean().optional(),
101
+ allowVoice: z.boolean().optional(),
102
+ allowModelId: z.boolean().optional(),
103
+ allowVoiceSettings: z.boolean().optional(),
104
+ allowNormalization: z.boolean().optional(),
105
+ allowSeed: z.boolean().optional(),
106
+ })
107
+ .strict()
108
+ .optional(),
109
+ elevenlabs: z
110
+ .object({
111
+ apiKey: z.string().optional(),
112
+ baseUrl: z.string().optional(),
113
+ voiceId: z.string().optional(),
114
+ modelId: z.string().optional(),
115
+ seed: z.number().int().min(0).max(4294967295).optional(),
116
+ applyTextNormalization: z.enum(["auto", "on", "off"]).optional(),
117
+ languageCode: z.string().optional(),
118
+ voiceSettings: z
119
+ .object({
120
+ stability: z.number().min(0).max(1).optional(),
121
+ similarityBoost: z.number().min(0).max(1).optional(),
122
+ style: z.number().min(0).max(1).optional(),
123
+ useSpeakerBoost: z.boolean().optional(),
124
+ speed: z.number().min(0.5).max(2).optional(),
125
+ })
126
+ .strict()
127
+ .optional(),
128
+ })
129
+ .strict()
130
+ .optional(),
131
+ openai: z
132
+ .object({
133
+ apiKey: z.string().optional(),
134
+ model: z.string().optional(),
135
+ voice: z.string().optional(),
136
+ })
137
+ .strict()
138
+ .optional(),
139
+ edge: z
140
+ .object({
141
+ enabled: z.boolean().optional(),
142
+ voice: z.string().optional(),
143
+ lang: z.string().optional(),
144
+ outputFormat: z.string().optional(),
145
+ pitch: z.string().optional(),
146
+ rate: z.string().optional(),
147
+ volume: z.string().optional(),
148
+ saveSubtitles: z.boolean().optional(),
149
+ proxy: z.string().optional(),
150
+ timeoutMs: z.number().int().min(1000).max(120000).optional(),
151
+ })
152
+ .strict()
153
+ .optional(),
154
+ prefsPath: z.string().optional(),
155
+ maxTextLength: z.number().int().min(1).optional(),
156
+ timeoutMs: z.number().int().min(1000).max(120000).optional(),
106
157
  })
107
158
  .strict()
108
- .default({ provider: "openai", model: "gpt-4o-mini-tts", voice: "coral" });
109
- export type TtsConfig = z.infer<typeof TtsConfigSchema>;
159
+ .optional();
160
+ export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
110
161
 
111
162
  // -----------------------------------------------------------------------------
112
163
  // Webhook Server Configuration
@@ -307,7 +358,7 @@ export const VoiceCallConfigSchema = z
307
358
  /** STT configuration */
308
359
  stt: SttConfigSchema,
309
360
 
310
- /** TTS configuration */
361
+ /** TTS override (deep-merges with core messages.tts) */
311
362
  tts: TtsConfigSchema,
312
363
 
313
364
  /** Store path for call logs */
@@ -2,10 +2,16 @@ import fs from "node:fs";
2
2
  import path from "node:path";
3
3
  import { fileURLToPath, pathToFileURL } from "node:url";
4
4
 
5
+ import type { VoiceCallTtsConfig } from "./config.js";
6
+
5
7
  export type CoreConfig = {
6
8
  session?: {
7
9
  store?: string;
8
10
  };
11
+ messages?: {
12
+ tts?: VoiceCallTtsConfig;
13
+ };
14
+ [key: string]: unknown;
9
15
  };
10
16
 
11
17
  type CoreAgentDeps = {
@@ -19,4 +19,3 @@ export type CallManagerContext = {
19
19
  transcriptWaiters: Map<CallId, TranscriptWaiter>;
20
20
  maxDurationTimers: Map<CallId, NodeJS.Timeout>;
21
21
  };
22
-
@@ -175,4 +175,3 @@ export function processEvent(ctx: CallManagerContext, event: NormalizedEvent): v
175
175
 
176
176
  persistCallRecord(ctx.storePath, call);
177
177
  }
178
-
@@ -31,4 +31,3 @@ export function findCall(params: {
31
31
  providerCallId: params.callIdOrProviderCallId,
32
32
  });
33
33
  }
34
-
@@ -68,7 +68,7 @@ export async function initiateCall(
68
68
  // For notify mode with a message, use inline TwiML with <Say>.
69
69
  let inlineTwiml: string | undefined;
70
70
  if (mode === "notify" && initialMessage) {
71
- const pollyVoice = mapVoiceToPolly(ctx.config.tts.voice);
71
+ const pollyVoice = mapVoiceToPolly(ctx.config.tts?.openai?.voice);
72
72
  inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
73
73
  console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
74
74
  }
@@ -120,11 +120,13 @@ export async function speak(
120
120
 
121
121
  addTranscriptEntry(call, "bot", text);
122
122
 
123
+ const voice =
124
+ ctx.provider?.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
123
125
  await ctx.provider.playTts({
124
126
  callId,
125
127
  providerCallId: call.providerCallId,
126
128
  text,
127
- voice: ctx.config.tts.voice,
129
+ voice,
128
130
  });
129
131
 
130
132
  return { success: true };
@@ -244,4 +246,3 @@ export async function endCall(
244
246
  return { success: false, error: err instanceof Error ? err.message : String(err) };
245
247
  }
246
248
  }
247
-
@@ -48,4 +48,3 @@ export function addTranscriptEntry(
48
48
  };
49
49
  call.transcript.push(entry);
50
50
  }
51
-
@@ -86,4 +86,3 @@ export async function getCallHistoryFromStore(
86
86
 
87
87
  return calls;
88
88
  }
89
-
@@ -84,4 +84,3 @@ export function waitForFinalTranscript(
84
84
  ctx.transcriptWaiters.set(callId, { resolve, reject, timeout });
85
85
  });
86
86
  }
87
-
@@ -7,4 +7,3 @@ export function generateNotifyTwiml(message: string, voice: string): string {
7
7
  <Hangup/>
8
8
  </Response>`;
9
9
  }
10
-
package/src/manager.ts CHANGED
@@ -143,7 +143,7 @@ export class CallManager {
143
143
  // For notify mode with a message, use inline TwiML with <Say>
144
144
  let inlineTwiml: string | undefined;
145
145
  if (mode === "notify" && initialMessage) {
146
- const pollyVoice = mapVoiceToPolly(this.config.tts.voice);
146
+ const pollyVoice = mapVoiceToPolly(this.config.tts?.openai?.voice);
147
147
  inlineTwiml = this.generateNotifyTwiml(initialMessage, pollyVoice);
148
148
  console.log(
149
149
  `[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`,
@@ -210,11 +210,13 @@ export class CallManager {
210
210
  this.addTranscriptEntry(call, "bot", text);
211
211
 
212
212
  // Play TTS
213
+ const voice =
214
+ this.provider?.name === "twilio" ? this.config.tts?.openai?.voice : undefined;
213
215
  await this.provider.playTts({
214
216
  callId,
215
217
  providerCallId: call.providerCallId,
216
218
  text,
217
- voice: this.config.tts.voice,
219
+ voice,
218
220
  });
219
221
 
220
222
  return { success: true };
@@ -0,0 +1,97 @@
1
+ import { describe, expect, it } from "vitest";
2
+
3
+ import type {
4
+ OpenAIRealtimeSTTProvider,
5
+ RealtimeSTTSession,
6
+ } from "./providers/stt-openai-realtime.js";
7
+ import { MediaStreamHandler } from "./media-stream.js";
8
+
9
+ const createStubSession = (): RealtimeSTTSession => ({
10
+ connect: async () => {},
11
+ sendAudio: () => {},
12
+ waitForTranscript: async () => "",
13
+ onPartial: () => {},
14
+ onTranscript: () => {},
15
+ onSpeechStart: () => {},
16
+ close: () => {},
17
+ isConnected: () => true,
18
+ });
19
+
20
+ const createStubSttProvider = (): OpenAIRealtimeSTTProvider =>
21
+ ({
22
+ createSession: () => createStubSession(),
23
+ }) as unknown as OpenAIRealtimeSTTProvider;
24
+
25
+ const flush = async (): Promise<void> => {
26
+ await new Promise((resolve) => setTimeout(resolve, 0));
27
+ };
28
+
29
+ const waitForAbort = (signal: AbortSignal): Promise<void> =>
30
+ new Promise((resolve) => {
31
+ if (signal.aborted) {
32
+ resolve();
33
+ return;
34
+ }
35
+ signal.addEventListener("abort", () => resolve(), { once: true });
36
+ });
37
+
38
+ describe("MediaStreamHandler TTS queue", () => {
39
+ it("serializes TTS playback and resolves in order", async () => {
40
+ const handler = new MediaStreamHandler({
41
+ sttProvider: createStubSttProvider(),
42
+ });
43
+ const started: number[] = [];
44
+ const finished: number[] = [];
45
+
46
+ let resolveFirst!: () => void;
47
+ const firstGate = new Promise<void>((resolve) => {
48
+ resolveFirst = resolve;
49
+ });
50
+
51
+ const first = handler.queueTts("stream-1", async () => {
52
+ started.push(1);
53
+ await firstGate;
54
+ finished.push(1);
55
+ });
56
+ const second = handler.queueTts("stream-1", async () => {
57
+ started.push(2);
58
+ finished.push(2);
59
+ });
60
+
61
+ await flush();
62
+ expect(started).toEqual([1]);
63
+
64
+ resolveFirst();
65
+ await first;
66
+ await second;
67
+
68
+ expect(started).toEqual([1, 2]);
69
+ expect(finished).toEqual([1, 2]);
70
+ });
71
+
72
+ it("cancels active playback and clears queued items", async () => {
73
+ const handler = new MediaStreamHandler({
74
+ sttProvider: createStubSttProvider(),
75
+ });
76
+
77
+ let queuedRan = false;
78
+ const started: string[] = [];
79
+
80
+ const active = handler.queueTts("stream-1", async (signal) => {
81
+ started.push("active");
82
+ await waitForAbort(signal);
83
+ });
84
+ void handler.queueTts("stream-1", async () => {
85
+ queuedRan = true;
86
+ });
87
+
88
+ await flush();
89
+ expect(started).toEqual(["active"]);
90
+
91
+ handler.clearTtsQueue("stream-1");
92
+ await active;
93
+ await flush();
94
+
95
+ expect(queuedRan).toBe(false);
96
+ });
97
+ });