npm - @clawdbot/voice-call - Versions diffs - 2026.1.21 → 2026.1.24 - Mend

@clawdbot/voice-call 2026.1.21 → 2026.1.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

package/CHANGELOG.md +17 -0
package/README.md +21 -0
package/clawdbot.plugin.json +207 -11
package/index.ts +21 -3
package/package.json +2 -2
package/src/config.ts +73 -22
package/src/core-bridge.ts +6 -0
package/src/manager/context.ts +0 -1
package/src/manager/events.ts +0 -1
package/src/manager/lookup.ts +0 -1
package/src/manager/outbound.ts +4 -3
package/src/manager/state.ts +0 -1
package/src/manager/store.ts +0 -1
package/src/manager/timers.ts +0 -1
package/src/manager/twiml.ts +0 -1
package/src/manager.ts +4 -2
package/src/media-stream.test.ts +97 -0
package/src/media-stream.ts +114 -0
package/src/providers/plivo.test.ts +0 -1
package/src/providers/stt-openai-realtime.ts +8 -0
package/src/providers/twilio/webhook.ts +0 -1
package/src/providers/twilio.test.ts +64 -0
package/src/providers/twilio.ts +51 -24
package/src/runtime.ts +12 -13
package/src/telephony-audio.ts +88 -0
package/src/telephony-tts.ts +95 -0
package/src/webhook.ts +10 -0

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,22 @@
 # Changelog
+## 2026.1.24
+### Changes
+- Breaking: voice-call TTS now uses core `messages.tts` (plugin TTS config deep‑merges with core).
+- Telephony TTS supports OpenAI + ElevenLabs; Edge TTS is ignored for calls.
+- Removed legacy `tts.model`/`tts.voice`/`tts.instructions` plugin fields.
+## 2026.1.23
+### Changes
+- Version alignment with core Clawdbot release numbers.
+## 2026.1.22
+### Changes
+- Version alignment with core Clawdbot release numbers.
 ## 2026.1.21
 ### Changes

package/README.md CHANGED Viewed

@@ -75,6 +75,27 @@ Notes:
 - Twilio/Telnyx/Plivo require a **publicly reachable** webhook URL.
 - `mock` is a local dev provider (no network calls).
+## TTS for calls
+Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for
+streaming speech on calls. You can override it under the plugin config with the
+same shape — overrides deep-merge with `messages.tts`.
+```json5
+{
+  tts: {
+    provider: "openai",
+    openai: {
+      voice: "alloy"
+    }
+  }
+}
+```
+Notes:
+- Edge TTS is ignored for voice calls (telephony audio needs PCM; Edge output is unreliable).
+- Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to provider native voices.
 ## CLI
 ```bash

package/clawdbot.plugin.json CHANGED Viewed

@@ -99,16 +99,39 @@
       "label": "Media Stream Path",
       "advanced": true
     },
-    "tts.model": {
-      "label": "TTS Model",
+    "tts.provider": {
+      "label": "TTS Provider Override",
+      "help": "Deep-merges with messages.tts (Edge is ignored for calls).",
       "advanced": true
     },
-    "tts.voice": {
-      "label": "TTS Voice",
+    "tts.openai.model": {
+      "label": "OpenAI TTS Model",
       "advanced": true
     },
-    "tts.instructions": {
-      "label": "TTS Instructions",
+    "tts.openai.voice": {
+      "label": "OpenAI TTS Voice",
+      "advanced": true
+    },
+    "tts.openai.apiKey": {
+      "label": "OpenAI API Key",
+      "sensitive": true,
+      "advanced": true
+    },
+    "tts.elevenlabs.modelId": {
+      "label": "ElevenLabs Model ID",
+      "advanced": true
+    },
+    "tts.elevenlabs.voiceId": {
+      "label": "ElevenLabs Voice ID",
+      "advanced": true
+    },
+    "tts.elevenlabs.apiKey": {
+      "label": "ElevenLabs API Key",
+      "sensitive": true,
+      "advanced": true
+    },
+    "tts.elevenlabs.baseUrl": {
+      "label": "ElevenLabs Base URL",
       "advanced": true
     },
     "publicUrl": {
@@ -370,20 +393,193 @@
         "type": "object",
         "additionalProperties": false,
         "properties": {
+          "auto": {
+            "type": "string",
+            "enum": [
+              "off",
+              "always",
+              "inbound",
+              "tagged"
+            ]
+          },
+          "enabled": {
+            "type": "boolean"
+          },
+          "mode": {
+            "type": "string",
+            "enum": [
+              "final",
+              "all"
+            ]
+          },
           "provider": {
             "type": "string",
             "enum": [
-              "openai"
+              "openai",
+              "elevenlabs",
+              "edge"
             ]
           },
-          "model": {
+          "summaryModel": {
             "type": "string"
           },
-          "voice": {
-            "type": "string"
+          "modelOverrides": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+              "enabled": {
+                "type": "boolean"
+              },
+              "allowText": {
+                "type": "boolean"
+              },
+              "allowProvider": {
+                "type": "boolean"
+              },
+              "allowVoice": {
+                "type": "boolean"
+              },
+              "allowModelId": {
+                "type": "boolean"
+              },
+              "allowVoiceSettings": {
+                "type": "boolean"
+              },
+              "allowNormalization": {
+                "type": "boolean"
+              },
+              "allowSeed": {
+                "type": "boolean"
+              }
+            }
+          },
+          "elevenlabs": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+              "apiKey": {
+                "type": "string"
+              },
+              "baseUrl": {
+                "type": "string"
+              },
+              "voiceId": {
+                "type": "string"
+              },
+              "modelId": {
+                "type": "string"
+              },
+              "seed": {
+                "type": "integer",
+                "minimum": 0,
+                "maximum": 4294967295
+              },
+              "applyTextNormalization": {
+                "type": "string",
+                "enum": [
+                  "auto",
+                  "on",
+                  "off"
+                ]
+              },
+              "languageCode": {
+                "type": "string"
+              },
+              "voiceSettings": {
+                "type": "object",
+                "additionalProperties": false,
+                "properties": {
+                  "stability": {
+                    "type": "number",
+                    "minimum": 0,
+                    "maximum": 1
+                  },
+                  "similarityBoost": {
+                    "type": "number",
+                    "minimum": 0,
+                    "maximum": 1
+                  },
+                  "style": {
+                    "type": "number",
+                    "minimum": 0,
+                    "maximum": 1
+                  },
+                  "useSpeakerBoost": {
+                    "type": "boolean"
+                  },
+                  "speed": {
+                    "type": "number",
+                    "minimum": 0.5,
+                    "maximum": 2
+                  }
+                }
+              }
+            }
+          },
+          "openai": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+              "apiKey": {
+                "type": "string"
+              },
+              "model": {
+                "type": "string"
+              },
+              "voice": {
+                "type": "string"
+              }
+            }
           },
-          "instructions": {
+          "edge": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+              "enabled": {
+                "type": "boolean"
+              },
+              "voice": {
+                "type": "string"
+              },
+              "lang": {
+                "type": "string"
+              },
+              "outputFormat": {
+                "type": "string"
+              },
+              "pitch": {
+                "type": "string"
+              },
+              "rate": {
+                "type": "string"
+              },
+              "volume": {
+                "type": "string"
+              },
+              "saveSubtitles": {
+                "type": "boolean"
+              },
+              "proxy": {
+                "type": "string"
+              },
+              "timeoutMs": {
+                "type": "integer",
+                "minimum": 1000,
+                "maximum": 120000
+              }
+            }
+          },
+          "prefsPath": {
             "type": "string"
+          },
+          "maxTextLength": {
+            "type": "integer",
+            "minimum": 1
+          },
+          "timeoutMs": {
+            "type": "integer",
+            "minimum": 1000,
+            "maximum": 120000
           }
         }
       },

package/index.ts CHANGED Viewed

@@ -74,9 +74,26 @@ const voiceCallConfigSchema = {
     },
     "streaming.sttModel": { label: "Realtime STT Model", advanced: true },
     "streaming.streamPath": { label: "Media Stream Path", advanced: true },
-    "tts.model": { label: "TTS Model", advanced: true },
-    "tts.voice": { label: "TTS Voice", advanced: true },
-    "tts.instructions": { label: "TTS Instructions", advanced: true },
+    "tts.provider": {
+      label: "TTS Provider Override",
+      help: "Deep-merges with messages.tts (Edge is ignored for calls).",
+      advanced: true,
+    },
+    "tts.openai.model": { label: "OpenAI TTS Model", advanced: true },
+    "tts.openai.voice": { label: "OpenAI TTS Voice", advanced: true },
+    "tts.openai.apiKey": {
+      label: "OpenAI API Key",
+      sensitive: true,
+      advanced: true,
+    },
+    "tts.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true },
+    "tts.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true },
+    "tts.elevenlabs.apiKey": {
+      label: "ElevenLabs API Key",
+      sensitive: true,
+      advanced: true,
+    },
+    "tts.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true },
     publicUrl: { label: "Public Webhook URL", advanced: true },
     skipSignatureVerification: {
       label: "Skip Signature Verification",
@@ -161,6 +178,7 @@ const voiceCallPlugin = {
         runtimePromise = createVoiceCallRuntime({
           config: cfg,
           coreConfig: api.config as CoreConfig,
+          ttsRuntime: api.runtime.tts,
           logger: api.logger,
         });
       }

package/package.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "@clawdbot/voice-call",
-  "version": "2026.1.21",
+  "version": "2026.1.24",
   "type": "module",
   "description": "Clawdbot voice-call plugin",
   "dependencies": {
     "@sinclair/typebox": "0.34.47",
     "ws": "^8.19.0",
-    "zod": "^4.3.5"
+    "zod": "^4.3.6"
   },
   "clawdbot": {
     "extensions": [

package/src/config.ts CHANGED Viewed

@@ -82,31 +82,82 @@ export const SttConfigSchema = z
   .default({ provider: "openai", model: "whisper-1" });
 export type SttConfig = z.infer<typeof SttConfigSchema>;
+export const TtsProviderSchema = z.enum(["openai", "elevenlabs", "edge"]);
+export const TtsModeSchema = z.enum(["final", "all"]);
+export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
 export const TtsConfigSchema = z
   .object({
-    /** TTS provider (currently only OpenAI supported) */
-    provider: z.literal("openai").default("openai"),
-    /**
-     * TTS model to use:
-     * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
-     * - tts-1: lower latency
-     * - tts-1-hd: higher quality
-     */
-    model: z.string().min(1).default("gpt-4o-mini-tts"),
-    /**
-     * Voice ID. For best quality, use marin or cedar.
-     * All voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
-     */
-    voice: z.string().min(1).default("coral"),
-    /**
-     * Instructions for speech style (only works with gpt-4o-mini-tts).
-     * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
-     */
-    instructions: z.string().optional(),
+    auto: TtsAutoSchema.optional(),
+    enabled: z.boolean().optional(),
+    mode: TtsModeSchema.optional(),
+    provider: TtsProviderSchema.optional(),
+    summaryModel: z.string().optional(),
+    modelOverrides: z
+      .object({
+        enabled: z.boolean().optional(),
+        allowText: z.boolean().optional(),
+        allowProvider: z.boolean().optional(),
+        allowVoice: z.boolean().optional(),
+        allowModelId: z.boolean().optional(),
+        allowVoiceSettings: z.boolean().optional(),
+        allowNormalization: z.boolean().optional(),
+        allowSeed: z.boolean().optional(),
+      })
+      .strict()
+      .optional(),
+    elevenlabs: z
+      .object({
+        apiKey: z.string().optional(),
+        baseUrl: z.string().optional(),
+        voiceId: z.string().optional(),
+        modelId: z.string().optional(),
+        seed: z.number().int().min(0).max(4294967295).optional(),
+        applyTextNormalization: z.enum(["auto", "on", "off"]).optional(),
+        languageCode: z.string().optional(),
+        voiceSettings: z
+          .object({
+            stability: z.number().min(0).max(1).optional(),
+            similarityBoost: z.number().min(0).max(1).optional(),
+            style: z.number().min(0).max(1).optional(),
+            useSpeakerBoost: z.boolean().optional(),
+            speed: z.number().min(0.5).max(2).optional(),
+          })
+          .strict()
+          .optional(),
+      })
+      .strict()
+      .optional(),
+    openai: z
+      .object({
+        apiKey: z.string().optional(),
+        model: z.string().optional(),
+        voice: z.string().optional(),
+      })
+      .strict()
+      .optional(),
+    edge: z
+      .object({
+        enabled: z.boolean().optional(),
+        voice: z.string().optional(),
+        lang: z.string().optional(),
+        outputFormat: z.string().optional(),
+        pitch: z.string().optional(),
+        rate: z.string().optional(),
+        volume: z.string().optional(),
+        saveSubtitles: z.boolean().optional(),
+        proxy: z.string().optional(),
+        timeoutMs: z.number().int().min(1000).max(120000).optional(),
+      })
+      .strict()
+      .optional(),
+    prefsPath: z.string().optional(),
+    maxTextLength: z.number().int().min(1).optional(),
+    timeoutMs: z.number().int().min(1000).max(120000).optional(),
   })
   .strict()
-  .default({ provider: "openai", model: "gpt-4o-mini-tts", voice: "coral" });
-export type TtsConfig = z.infer<typeof TtsConfigSchema>;
+  .optional();
+export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
 // -----------------------------------------------------------------------------
 // Webhook Server Configuration
@@ -307,7 +358,7 @@ export const VoiceCallConfigSchema = z
   /** STT configuration */
   stt: SttConfigSchema,
-  /** TTS configuration */
+  /** TTS override (deep-merges with core messages.tts) */
   tts: TtsConfigSchema,
   /** Store path for call logs */

package/src/core-bridge.ts CHANGED Viewed

@@ -2,10 +2,16 @@ import fs from "node:fs";
 import path from "node:path";
 import { fileURLToPath, pathToFileURL } from "node:url";
+import type { VoiceCallTtsConfig } from "./config.js";
 export type CoreConfig = {
   session?: {
     store?: string;
   };
+  messages?: {
+    tts?: VoiceCallTtsConfig;
+  };
+  [key: string]: unknown;
 };
 type CoreAgentDeps = {

package/src/manager/context.ts CHANGED Viewed

@@ -19,4 +19,3 @@ export type CallManagerContext = {
   transcriptWaiters: Map<CallId, TranscriptWaiter>;
   maxDurationTimers: Map<CallId, NodeJS.Timeout>;
 };

package/src/manager/events.ts CHANGED Viewed

@@ -175,4 +175,3 @@ export function processEvent(ctx: CallManagerContext, event: NormalizedEvent): v
   persistCallRecord(ctx.storePath, call);
 }

package/src/manager/lookup.ts CHANGED Viewed

@@ -31,4 +31,3 @@ export function findCall(params: {
     providerCallId: params.callIdOrProviderCallId,
   });
 }

package/src/manager/outbound.ts CHANGED Viewed

@@ -68,7 +68,7 @@ export async function initiateCall(
     // For notify mode with a message, use inline TwiML with <Say>.
     let inlineTwiml: string | undefined;
     if (mode === "notify" && initialMessage) {
-      const pollyVoice = mapVoiceToPolly(ctx.config.tts.voice);
+      const pollyVoice = mapVoiceToPolly(ctx.config.tts?.openai?.voice);
       inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
       console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
     }
@@ -120,11 +120,13 @@ export async function speak(
     addTranscriptEntry(call, "bot", text);
+    const voice =
+      ctx.provider?.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
     await ctx.provider.playTts({
       callId,
       providerCallId: call.providerCallId,
       text,
-      voice: ctx.config.tts.voice,
+      voice,
     });
     return { success: true };
@@ -244,4 +246,3 @@ export async function endCall(
     return { success: false, error: err instanceof Error ? err.message : String(err) };
   }
 }

package/src/manager/state.ts CHANGED Viewed

@@ -48,4 +48,3 @@ export function addTranscriptEntry(
   };
   call.transcript.push(entry);
 }

package/src/manager/store.ts CHANGED Viewed

@@ -86,4 +86,3 @@ export async function getCallHistoryFromStore(
   return calls;
 }

package/src/manager/timers.ts CHANGED Viewed

@@ -84,4 +84,3 @@ export function waitForFinalTranscript(
     ctx.transcriptWaiters.set(callId, { resolve, reject, timeout });
   });
 }

package/src/manager/twiml.ts CHANGED Viewed

@@ -7,4 +7,3 @@ export function generateNotifyTwiml(message: string, voice: string): string {
   <Hangup/>
 </Response>`;
 }

package/src/manager.ts CHANGED Viewed

@@ -143,7 +143,7 @@ export class CallManager {
       // For notify mode with a message, use inline TwiML with <Say>
       let inlineTwiml: string | undefined;
       if (mode === "notify" && initialMessage) {
-        const pollyVoice = mapVoiceToPolly(this.config.tts.voice);
+        const pollyVoice = mapVoiceToPolly(this.config.tts?.openai?.voice);
         inlineTwiml = this.generateNotifyTwiml(initialMessage, pollyVoice);
         console.log(
           `[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`,
@@ -210,11 +210,13 @@ export class CallManager {
       this.addTranscriptEntry(call, "bot", text);
       // Play TTS
+      const voice =
+        this.provider?.name === "twilio" ? this.config.tts?.openai?.voice : undefined;
       await this.provider.playTts({
         callId,
         providerCallId: call.providerCallId,
         text,
-        voice: this.config.tts.voice,
+        voice,
       });
       return { success: true };

package/src/media-stream.test.ts ADDED Viewed

@@ -0,0 +1,97 @@
+import { describe, expect, it } from "vitest";
+import type {
+  OpenAIRealtimeSTTProvider,
+  RealtimeSTTSession,
+} from "./providers/stt-openai-realtime.js";
+import { MediaStreamHandler } from "./media-stream.js";
+const createStubSession = (): RealtimeSTTSession => ({
+  connect: async () => {},
+  sendAudio: () => {},
+  waitForTranscript: async () => "",
+  onPartial: () => {},
+  onTranscript: () => {},
+  onSpeechStart: () => {},
+  close: () => {},
+  isConnected: () => true,
+});
+const createStubSttProvider = (): OpenAIRealtimeSTTProvider =>
+  ({
+    createSession: () => createStubSession(),
+  }) as unknown as OpenAIRealtimeSTTProvider;
+const flush = async (): Promise<void> => {
+  await new Promise((resolve) => setTimeout(resolve, 0));
+};
+const waitForAbort = (signal: AbortSignal): Promise<void> =>
+  new Promise((resolve) => {
+    if (signal.aborted) {
+      resolve();
+      return;
+    }
+    signal.addEventListener("abort", () => resolve(), { once: true });
+  });
+describe("MediaStreamHandler TTS queue", () => {
+  it("serializes TTS playback and resolves in order", async () => {
+    const handler = new MediaStreamHandler({
+      sttProvider: createStubSttProvider(),
+    });
+    const started: number[] = [];
+    const finished: number[] = [];
+    let resolveFirst!: () => void;
+    const firstGate = new Promise<void>((resolve) => {
+      resolveFirst = resolve;
+    });
+    const first = handler.queueTts("stream-1", async () => {
+      started.push(1);
+      await firstGate;
+      finished.push(1);
+    });
+    const second = handler.queueTts("stream-1", async () => {
+      started.push(2);
+      finished.push(2);
+    });
+    await flush();
+    expect(started).toEqual([1]);
+    resolveFirst();
+    await first;
+    await second;
+    expect(started).toEqual([1, 2]);
+    expect(finished).toEqual([1, 2]);
+  });
+  it("cancels active playback and clears queued items", async () => {
+    const handler = new MediaStreamHandler({
+      sttProvider: createStubSttProvider(),
+    });
+    let queuedRan = false;
+    const started: string[] = [];
+    const active = handler.queueTts("stream-1", async (signal) => {
+      started.push("active");
+      await waitForAbort(signal);
+    });
+    void handler.queueTts("stream-1", async () => {
+      queuedRan = true;
+    });
+    await flush();
+    expect(started).toEqual(["active"]);
+    handler.clearTtsQueue("stream-1");
+    await active;
+    await flush();
+    expect(queuedRan).toBe(false);
+  });
+});