npm - @clawdbot/voice-call - Versions diffs - 0.1.0 - Mend

@clawdbot/voice-call 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/CHANGELOG.md +20 -0
package/README.md +107 -0
package/index.ts +477 -0
package/package.json +14 -0
package/src/cli.ts +297 -0
package/src/config.ts +355 -0
package/src/core-bridge.ts +190 -0
package/src/manager.ts +846 -0
package/src/media-stream.ts +279 -0
package/src/providers/base.ts +67 -0
package/src/providers/index.ts +9 -0
package/src/providers/mock.ts +168 -0
package/src/providers/stt-openai-realtime.ts +303 -0
package/src/providers/telnyx.ts +364 -0
package/src/providers/tts-openai.ts +264 -0
package/src/providers/twilio.ts +537 -0
package/src/response-generator.ts +171 -0
package/src/runtime.ts +194 -0
package/src/tunnel.ts +330 -0
package/src/types.ts +272 -0
package/src/utils.ts +12 -0
package/src/voice-mapping.ts +65 -0
package/src/webhook-security.ts +197 -0
package/src/webhook.ts +480 -0

package/src/providers/twilio.ts ADDED Viewed

@@ -0,0 +1,537 @@
+import crypto from "node:crypto";
+import type { TwilioConfig } from "../config.js";
+import type { MediaStreamHandler } from "../media-stream.js";
+import type {
+  HangupCallInput,
+  InitiateCallInput,
+  InitiateCallResult,
+  NormalizedEvent,
+  PlayTtsInput,
+  ProviderWebhookParseResult,
+  StartListeningInput,
+  StopListeningInput,
+  WebhookContext,
+  WebhookVerificationResult,
+} from "../types.js";
+import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
+import { verifyTwilioWebhook } from "../webhook-security.js";
+import type { VoiceCallProvider } from "./base.js";
+import type { OpenAITTSProvider } from "./tts-openai.js";
+import { chunkAudio } from "./tts-openai.js";
+/**
+ * Options accepted by the Twilio Voice API provider implementation.
+ *
+ * The provider uses Twilio Programmable Voice API with Media Streams for
+ * real-time bidirectional audio streaming.
+ *
+ * @see https://www.twilio.com/docs/voice
+ * @see https://www.twilio.com/docs/voice/media-streams
+ */
+export interface TwilioProviderOptions {
+  /**
+   * Allow ngrok free tier compatibility mode (less secure).
+   * Forwarded to webhook signature verification; the provider treats an
+   * omitted value as `true`.
+   */
+  allowNgrokFreeTier?: boolean;
+  /**
+   * Override public URL for signature verification.
+   * Also seeds the provider's current public URL, whose origin is used to
+   * build the media stream WebSocket URL.
+   */
+  publicUrl?: string;
+  /**
+   * Path for media stream WebSocket (e.g., /voice/stream).
+   * A leading "/" is added by the provider when missing; when this option is
+   * not set the provider cannot build a stream URL.
+   */
+  streamPath?: string;
+  /** Skip webhook signature verification (development only) */
+  skipVerification?: boolean;
+}
+/**
+ * Twilio Voice API provider implementation.
+ *
+ * Drives calls through the Twilio REST Calls API and answers Twilio webhooks
+ * with TwiML. When a public URL and stream path are configured, answered calls
+ * are connected to a WebSocket media stream for bidirectional audio; otherwise
+ * the provider falls back to TwiML `<Say>` / `<Gather>`.
+ *
+ * @see https://www.twilio.com/docs/voice
+ * @see https://www.twilio.com/docs/voice/media-streams
+ */
+export class TwilioProvider implements VoiceCallProvider {
+  /** TwiML that gives Twilio no instructions. */
+  private static readonly EMPTY_TWIML =
+    '<?xml version="1.0" encoding="UTF-8"?><Response></Response>';
+  /** TwiML that keeps the call open with a 30 second pause. */
+  private static readonly PAUSE_TWIML = `<?xml version="1.0" encoding="UTF-8"?>
+<Response>
+  <Pause length="30"/>
+</Response>`;
+  /** Confidence reported for speech results when Twilio sends no usable value. */
+  private static readonly DEFAULT_SPEECH_CONFIDENCE = 0.9;
+  readonly name = "twilio" as const;
+  private readonly accountSid: string;
+  private readonly authToken: string;
+  private readonly baseUrl: string;
+  /** Map of call SID to the webhook URL (including callId query) used for that call */
+  private readonly callWebhookUrls = new Map<string, string>();
+  private readonly options: TwilioProviderOptions;
+  /** Current public webhook URL (set when tunnel starts or from config) */
+  private currentPublicUrl: string | null = null;
+  /** Optional OpenAI TTS provider for streaming TTS */
+  private ttsProvider: OpenAITTSProvider | null = null;
+  /** Optional media stream handler for sending audio */
+  private mediaStreamHandler: MediaStreamHandler | null = null;
+  /** Map of call SID to stream SID for media streams */
+  private readonly callStreamMap = new Map<string, string>();
+  /**
+   * @param config - Twilio credentials; both `accountSid` and `authToken` are required.
+   * @param options - Optional behaviour switches (public URL, stream path, verification).
+   * @throws Error when the account SID or auth token is missing.
+   */
+  constructor(config: TwilioConfig, options: TwilioProviderOptions = {}) {
+    if (!config.accountSid) {
+      throw new Error("Twilio Account SID is required");
+    }
+    if (!config.authToken) {
+      throw new Error("Twilio Auth Token is required");
+    }
+    this.accountSid = config.accountSid;
+    this.authToken = config.authToken;
+    this.baseUrl = `https://api.twilio.com/2010-04-01/Accounts/${this.accountSid}`;
+    this.options = options;
+    if (options.publicUrl) {
+      this.currentPublicUrl = options.publicUrl;
+    }
+  }
+  /**
+   * Set the current public webhook URL (called when tunnel starts).
+   */
+  setPublicUrl(url: string): void {
+    this.currentPublicUrl = url;
+  }
+  /**
+   * Get the current public webhook URL.
+   */
+  getPublicUrl(): string | null {
+    return this.currentPublicUrl;
+  }
+  /**
+   * Set the OpenAI TTS provider for streaming TTS.
+   * When set, playTts will use OpenAI audio via media streams.
+   */
+  setTTSProvider(provider: OpenAITTSProvider): void {
+    this.ttsProvider = provider;
+  }
+  /**
+   * Set the media stream handler for sending audio.
+   */
+  setMediaStreamHandler(handler: MediaStreamHandler): void {
+    this.mediaStreamHandler = handler;
+  }
+  /**
+   * Register a call's stream SID for audio routing.
+   */
+  registerCallStream(callSid: string, streamSid: string): void {
+    this.callStreamMap.set(callSid, streamSid);
+  }
+  /**
+   * Unregister a call's stream SID.
+   */
+  unregisterCallStream(callSid: string): void {
+    this.callStreamMap.delete(callSid);
+  }
+  /**
+   * Build the REST endpoint for a single call resource.
+   * The SID is percent-encoded so a malformed identifier cannot alter the path.
+   */
+  private static callEndpoint(providerCallId: string): string {
+    return `/Calls/${encodeURIComponent(providerCallId)}.json`;
+  }
+  /**
+   * Make an authenticated form-encoded POST request to the Twilio API.
+   *
+   * @param endpoint - Path appended to the account base URL (e.g. `/Calls.json`).
+   * @param params - Form fields; an array value is sent as one repeated
+   *   parameter per element.
+   * @param options - `allowNotFound` makes a 404 resolve to `undefined`
+   *   instead of throwing.
+   * @returns The parsed JSON body, or `undefined` for an empty body.
+   * @throws Error for any other non-2xx response (status and body text included).
+   */
+  private async apiRequest<T = unknown>(
+    endpoint: string,
+    params: Record<string, string | readonly string[]>,
+    options?: { allowNotFound?: boolean },
+  ): Promise<T> {
+    const body = new URLSearchParams();
+    for (const [key, value] of Object.entries(params)) {
+      if (typeof value === "string") {
+        body.append(key, value);
+      } else {
+        for (const item of value) {
+          body.append(key, item);
+        }
+      }
+    }
+    const response = await fetch(`${this.baseUrl}${endpoint}`, {
+      method: "POST",
+      headers: {
+        Authorization: `Basic ${Buffer.from(`${this.accountSid}:${this.authToken}`).toString("base64")}`,
+        "Content-Type": "application/x-www-form-urlencoded",
+      },
+      body,
+    });
+    if (!response.ok) {
+      if (options?.allowNotFound && response.status === 404) {
+        return undefined as T;
+      }
+      const errorText = await response.text();
+      throw new Error(`Twilio API error: ${response.status} ${errorText}`);
+    }
+    const text = await response.text();
+    return text ? (JSON.parse(text) as T) : (undefined as T);
+  }
+  /**
+   * Verify Twilio webhook signature using HMAC-SHA1.
+   *
+   * Handles reverse proxy scenarios (Tailscale, nginx, ngrok) by reconstructing
+   * the public URL from forwarding headers.
+   *
+   * Failures are logged with the reason (and the URL used for verification
+   * when available); only `ok` and `reason` are returned to the caller.
+   *
+   * @see https://www.twilio.com/docs/usage/webhooks/webhooks-security
+   */
+  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
+    const result = verifyTwilioWebhook(ctx, this.authToken, {
+      publicUrl: this.currentPublicUrl || undefined,
+      // NOTE(review): the ngrok free-tier compatibility mode is documented as
+      // less secure yet is enabled unless explicitly disabled. Kept for
+      // backward compatibility; consider defaulting to false.
+      allowNgrokFreeTier: this.options.allowNgrokFreeTier ?? true,
+      skipVerification: this.options.skipVerification,
+    });
+    if (!result.ok) {
+      console.warn(`[twilio] Webhook verification failed: ${result.reason}`);
+      if (result.verificationUrl) {
+        console.warn(`[twilio] Verification URL: ${result.verificationUrl}`);
+      }
+    }
+    return {
+      ok: result.ok,
+      reason: result.reason,
+    };
+  }
+  /**
+   * Parse Twilio webhook event into normalized format.
+   *
+   * @returns At most one normalized event plus the TwiML response body with
+   *   status 200, or an empty event list with status 400 when parsing throws.
+   */
+  parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
+    try {
+      const params = new URLSearchParams(ctx.rawBody);
+      // An internal call ID may be carried in the webhook URL query string.
+      const callIdFromQuery =
+        typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
+          ? ctx.query.callId.trim()
+          : undefined;
+      const event = this.normalizeEvent(params, callIdFromQuery);
+      // For Twilio, we must return TwiML. Most actions are driven by Calls API updates,
+      // so the webhook response is typically a pause to keep the call alive.
+      const twiml = this.generateTwimlResponse(ctx);
+      return {
+        events: event ? [event] : [],
+        providerResponseBody: twiml,
+        providerResponseHeaders: { "Content-Type": "application/xml" },
+        statusCode: 200,
+      };
+    } catch {
+      return { events: [], statusCode: 400 };
+    }
+  }
+  /**
+   * Parse Twilio direction to normalized format.
+   */
+  private static parseDirection(
+    direction: string | null,
+  ): "inbound" | "outbound" | undefined {
+    if (direction === "inbound") return "inbound";
+    if (direction === "outbound-api" || direction === "outbound-dial")
+      return "outbound";
+    return undefined;
+  }
+  /**
+   * Convert Twilio webhook params to normalized event format.
+   *
+   * Precedence: speech result, then DTMF digits, then call status. Returns
+   * `null` when none of them yields a recognised event.
+   *
+   * @param params - Form parameters from the webhook body.
+   * @param callIdOverride - Call ID to report instead of the Twilio CallSid.
+   */
+  private normalizeEvent(
+    params: URLSearchParams,
+    callIdOverride?: string,
+  ): NormalizedEvent | null {
+    const callSid = params.get("CallSid") || "";
+    const baseEvent = {
+      id: crypto.randomUUID(),
+      callId: callIdOverride || callSid,
+      providerCallId: callSid,
+      timestamp: Date.now(),
+      direction: TwilioProvider.parseDirection(params.get("Direction")),
+      from: params.get("From") || undefined,
+      to: params.get("To") || undefined,
+    };
+    // Handle speech result (from <Gather>)
+    const speechResult = params.get("SpeechResult");
+    if (speechResult) {
+      // A missing or non-numeric Confidence must not surface as NaN.
+      const parsedConfidence = Number.parseFloat(
+        params.get("Confidence") ?? "",
+      );
+      return {
+        ...baseEvent,
+        type: "call.speech",
+        transcript: speechResult,
+        isFinal: true,
+        confidence: Number.isFinite(parsedConfidence)
+          ? parsedConfidence
+          : TwilioProvider.DEFAULT_SPEECH_CONFIDENCE,
+      };
+    }
+    // Handle DTMF
+    const digits = params.get("Digits");
+    if (digits) {
+      return { ...baseEvent, type: "call.dtmf", digits };
+    }
+    // Handle call status changes
+    const callStatus = params.get("CallStatus");
+    switch (callStatus) {
+      case "initiated":
+        return { ...baseEvent, type: "call.initiated" };
+      case "ringing":
+        return { ...baseEvent, type: "call.ringing" };
+      case "in-progress":
+        return { ...baseEvent, type: "call.answered" };
+      case "completed":
+      case "busy":
+      case "no-answer":
+      case "failed":
+        return { ...baseEvent, type: "call.ended", reason: callStatus };
+      case "canceled":
+        return { ...baseEvent, type: "call.ended", reason: "hangup-bot" };
+      default:
+        return null;
+    }
+  }
+  /**
+   * Generate TwiML response for webhook.
+   * When a call is answered, connects to media stream for bidirectional audio.
+   *
+   * Inbound calls are connected immediately; any other direction only once the
+   * status is `in-progress` (empty TwiML before that). When no stream URL can
+   * be built, a pause is returned instead of a stream connection.
+   */
+  private generateTwimlResponse(ctx?: WebhookContext): string {
+    if (!ctx) return TwilioProvider.EMPTY_TWIML;
+    const params = new URLSearchParams(ctx.rawBody);
+    const callStatus = params.get("CallStatus");
+    const direction = params.get("Direction");
+    console.log(
+      `[voice-call] generateTwimlResponse: status=${callStatus} direction=${direction}`,
+    );
+    const readyForStream =
+      direction === "inbound" || callStatus === "in-progress";
+    if (!readyForStream) {
+      return TwilioProvider.EMPTY_TWIML;
+    }
+    const streamUrl = this.getStreamUrl();
+    return streamUrl
+      ? this.getStreamConnectXml(streamUrl)
+      : TwilioProvider.PAUSE_TWIML;
+  }
+  /**
+   * Get the WebSocket URL for media streaming.
+   * Derives from the public URL origin + stream path.
+   *
+   * @returns `null` when either the public URL or the stream path is not set.
+   */
+  private getStreamUrl(): string | null {
+    if (!this.currentPublicUrl || !this.options.streamPath) {
+      return null;
+    }
+    // Extract just the origin (host) from the public URL, ignoring any path
+    const url = new URL(this.currentPublicUrl);
+    const origin = url.origin;
+    // Convert https:// to wss:// for WebSocket
+    const wsOrigin = origin
+      .replace(/^https:\/\//, "wss://")
+      .replace(/^http:\/\//, "ws://");
+    // Append the stream path
+    const path = this.options.streamPath.startsWith("/")
+      ? this.options.streamPath
+      : `/${this.options.streamPath}`;
+    return `${wsOrigin}${path}`;
+  }
+  /**
+   * Generate TwiML to connect a call to a WebSocket media stream.
+   * This enables bidirectional audio streaming for real-time STT/TTS.
+   *
+   * @param streamUrl - WebSocket URL (wss://...) for the media stream
+   */
+  getStreamConnectXml(streamUrl: string): string {
+    return `<?xml version="1.0" encoding="UTF-8"?>
+<Response>
+  <Connect>
+    <Stream url="${escapeXml(streamUrl)}" />
+  </Connect>
+</Response>`;
+  }
+  /**
+   * Initiate an outbound call via Twilio API.
+   * If inlineTwiml is provided, uses that directly (for notify mode).
+   * Otherwise, uses webhook URL for dynamic TwiML.
+   *
+   * The webhook URL (with the internal call ID added as a `callId` query
+   * parameter) is remembered per Twilio call SID for later TwiML updates.
+   */
+  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
+    const url = new URL(input.webhookUrl);
+    url.searchParams.set("callId", input.callId);
+    const callbackUrl = url.toString();
+    // Build request params
+    const params: Record<string, string | string[]> = {
+      To: input.to,
+      From: input.from,
+      StatusCallback: callbackUrl,
+      // Twilio's REST API expects one StatusCallbackEvent parameter per event,
+      // not a single space-separated value.
+      StatusCallbackEvent: ["initiated", "ringing", "answered", "completed"],
+      Timeout: "30",
+    };
+    // Use inline TwiML for notify mode (simpler, no webhook needed)
+    if (input.inlineTwiml) {
+      params.Twiml = input.inlineTwiml;
+    } else {
+      params.Url = callbackUrl;
+    }
+    const result = await this.apiRequest<TwilioCallResponse>(
+      "/Calls.json",
+      params,
+    );
+    this.callWebhookUrls.set(result.sid, callbackUrl);
+    return {
+      providerCallId: result.sid,
+      status: result.status === "queued" ? "queued" : "initiated",
+    };
+  }
+  /**
+   * Hang up a call via Twilio API.
+   * A 404 from Twilio is tolerated rather than reported as an error.
+   */
+  async hangupCall(input: HangupCallInput): Promise<void> {
+    this.callWebhookUrls.delete(input.providerCallId);
+    await this.apiRequest(
+      TwilioProvider.callEndpoint(input.providerCallId),
+      { Status: "completed" },
+      { allowNotFound: true },
+    );
+  }
+  /**
+   * Play TTS audio via Twilio.
+   *
+   * Two modes:
+   * 1. OpenAI TTS + Media Streams: If TTS provider and media stream are available,
+   *    generates audio via OpenAI and streams it through WebSocket (preferred).
+   * 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
+   *    Note: This may not work on all Twilio accounts.
+   *
+   * @throws Error when the fallback is needed but no webhook URL is known for
+   *   the call, or when the Twilio API request fails.
+   */
+  async playTts(input: PlayTtsInput): Promise<void> {
+    // Try OpenAI TTS via media stream first (if configured)
+    const streamSid = this.callStreamMap.get(input.providerCallId);
+    if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
+      try {
+        await this.playTtsViaStream(input.text, streamSid);
+        return;
+      } catch (err) {
+        console.warn(
+          `[voice-call] OpenAI TTS failed, falling back to Twilio <Say>:`,
+          err instanceof Error ? err.message : err,
+        );
+        // Fall through to TwiML <Say> fallback
+      }
+    }
+    // Fall back to TwiML <Say> (may not work on all accounts)
+    const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
+    if (!webhookUrl) {
+      throw new Error(
+        "Missing webhook URL for this call (provider state not initialized)",
+      );
+    }
+    console.warn(
+      "[voice-call] Using TwiML <Say> fallback - OpenAI TTS not configured or media stream not active",
+    );
+    // Every interpolated value is XML-escaped, attributes included, so caller
+    // supplied voice/locale strings cannot break or extend the TwiML document.
+    const pollyVoice = escapeXml(mapVoiceToPolly(input.voice));
+    const sayLanguage = escapeXml(input.locale || "en-US");
+    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
+<Response>
+  <Say voice="${pollyVoice}" language="${sayLanguage}">${escapeXml(input.text)}</Say>
+  <Gather input="speech" speechTimeout="auto" action="${escapeXml(webhookUrl)}" method="POST">
+    <Say>.</Say>
+  </Gather>
+</Response>`;
+    await this.apiRequest(TwilioProvider.callEndpoint(input.providerCallId), {
+      Twiml: twiml,
+    });
+  }
+  /**
+   * Play TTS via OpenAI and Twilio Media Streams.
+   * Synthesizes audio with the TTS provider, splits it into fixed-size chunks
+   * and sends them over the media stream with a fixed delay between chunks,
+   * followed by a mark message.
+   *
+   * @throws Error when the TTS provider or media stream handler is not set.
+   */
+  private async playTtsViaStream(
+    text: string,
+    streamSid: string,
+  ): Promise<void> {
+    if (!this.ttsProvider || !this.mediaStreamHandler) {
+      throw new Error("TTS provider and media stream handler required");
+    }
+    // Generate audio with OpenAI TTS (returns mu-law at 8kHz)
+    const muLawAudio = await this.ttsProvider.synthesizeForTwilio(text);
+    // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
+    const CHUNK_SIZE = 160;
+    const CHUNK_DELAY_MS = 20;
+    for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
+      this.mediaStreamHandler.sendAudio(streamSid, chunk);
+      // Pace the audio to match real-time playback
+      await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
+    }
+    // Send a mark to track when audio finishes
+    this.mediaStreamHandler.sendMark(streamSid, `tts-${Date.now()}`);
+  }
+  /**
+   * Start listening for speech via Twilio <Gather>.
+   *
+   * @throws Error when no webhook URL is known for the call, or when the
+   *   Twilio API request fails.
+   */
+  async startListening(input: StartListeningInput): Promise<void> {
+    const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
+    if (!webhookUrl) {
+      throw new Error(
+        "Missing webhook URL for this call (provider state not initialized)",
+      );
+    }
+    // Escape the language attribute as well as the action URL.
+    const gatherLanguage = escapeXml(input.language || "en-US");
+    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
+<Response>
+  <Gather input="speech" speechTimeout="auto" language="${gatherLanguage}" action="${escapeXml(webhookUrl)}" method="POST">
+  </Gather>
+</Response>`;
+    await this.apiRequest(TwilioProvider.callEndpoint(input.providerCallId), {
+      Twiml: twiml,
+    });
+  }
+  /**
+   * Stop listening - for Twilio this is a no-op as <Gather> auto-ends.
+   */
+  async stopListening(_input: StopListeningInput): Promise<void> {
+    // Twilio's <Gather> automatically stops on speech end
+    // No explicit action needed
+  }
+}
+// -----------------------------------------------------------------------------
+// Twilio-specific types
+// -----------------------------------------------------------------------------
+/**
+ * Shape declared for the JSON body returned when creating a call through the
+ * Twilio Calls API. Within this file only `sid` and `status` are read.
+ */
+interface TwilioCallResponse {
+  /** Twilio call SID; used as the provider call ID. */
+  sid: string;
+  /** Call status reported by Twilio (compared against "queued" in this file). */
+  status: string;
+  direction: string;
+  from: string;
+  to: string;
+  uri: string;
+}

package/src/response-generator.ts ADDED Viewed

@@ -0,0 +1,171 @@
+/**
+ * Voice call response generator - uses the embedded Pi agent for tool support.
+ * Routes voice responses through the same agent infrastructure as messaging.
+ */
+import crypto from "node:crypto";
+import { loadCoreAgentDeps, type CoreConfig } from "./core-bridge.js";
+import type { VoiceCallConfig } from "./config.js";
+/** Input for {@link generateVoiceResponse}. */
+export type VoiceResponseParams = {
+  /** Voice call config (response model, system prompt and timeout are read from it) */
+  voiceConfig: VoiceCallConfig;
+  /** Core Clawdbot config; a falsy value makes response generation return an error */
+  coreConfig: CoreConfig;
+  /** Call ID for session tracking (embedded in the agent run ID) */
+  callId: string;
+  /** Caller's phone number; its digits form the voice session key */
+  from: string;
+  /** Conversation transcript, appended to the system prompt when non-empty */
+  transcript: Array<{ speaker: "user" | "bot"; text: string }>;
+  /** Latest user message, sent to the agent as the prompt */
+  userMessage: string;
+};
+/** Outcome of {@link generateVoiceResponse}. */
+export type VoiceResponseResult = {
+  /** Text to speak, or `null` when no text was produced */
+  text: string | null;
+  /** Description of the failure, when one occurred */
+  error?: string;
+};
+/** Entry kept in the session store under a voice session key. */
+type SessionEntry = {
+  /** Identifier of the agent session (a random UUID when created here) */
+  sessionId: string;
+  /** Epoch milliseconds recorded when the entry was written */
+  updatedAt: number;
+};
+/**
+ * Generate a voice response using the embedded Pi agent with full tool support.
+ * Uses the same agent infrastructure as messaging for consistent behavior.
+ *
+ * Sessions are keyed by the digits of the caller's number so repeat callers
+ * continue the same agent session. Callers whose identifier contains no
+ * digits get a per-call session instead of sharing one.
+ *
+ * @param params - Configs, call metadata, transcript so far and latest user message.
+ * @returns The joined, trimmed text of all non-error payloads, or `text: null`
+ *   with an `error` when config or dependencies are unavailable, the run was
+ *   aborted without text, or the agent threw. Never rejects for agent errors.
+ */
+export async function generateVoiceResponse(
+  params: VoiceResponseParams,
+): Promise<VoiceResponseResult> {
+  const { voiceConfig, callId, from, transcript, userMessage, coreConfig } =
+    params;
+  if (!coreConfig) {
+    return { text: null, error: "Core config unavailable for voice response" };
+  }
+  let deps: Awaited<ReturnType<typeof loadCoreAgentDeps>>;
+  try {
+    deps = await loadCoreAgentDeps();
+  } catch (err) {
+    return {
+      text: null,
+      error:
+        err instanceof Error
+          ? err.message
+          : "Unable to load core agent dependencies",
+    };
+  }
+  const cfg = coreConfig;
+  // Build voice-specific session key based on phone number.
+  // A caller ID without digits would otherwise collapse to the bare key
+  // "voice:" and make unrelated callers share one session, so fall back to a
+  // key scoped to this call.
+  const normalizedPhone = from.replace(/\D/g, "");
+  const sessionKey = normalizedPhone
+    ? `voice:${normalizedPhone}`
+    : `voice:call:${callId}`;
+  const agentId = "main";
+  // Resolve paths
+  const storePath = deps.resolveStorePath(cfg.session?.store, { agentId });
+  const agentDir = deps.resolveAgentDir(cfg, agentId);
+  const workspaceDir = deps.resolveAgentWorkspaceDir(cfg, agentId);
+  // Ensure workspace exists
+  await deps.ensureAgentWorkspace({ dir: workspaceDir });
+  // Load or create session entry
+  const sessionStore = deps.loadSessionStore(storePath);
+  const now = Date.now();
+  let sessionEntry = sessionStore[sessionKey] as SessionEntry | undefined;
+  if (!sessionEntry) {
+    sessionEntry = {
+      sessionId: crypto.randomUUID(),
+      updatedAt: now,
+    };
+    sessionStore[sessionKey] = sessionEntry;
+    await deps.saveSessionStore(storePath, sessionStore);
+  }
+  const sessionId = sessionEntry.sessionId;
+  const sessionFile = deps.resolveSessionFilePath(sessionId, sessionEntry, {
+    agentId,
+  });
+  // Resolve model from config ("provider/model", or a bare model name that
+  // uses the default provider)
+  const modelRef =
+    voiceConfig.responseModel ||
+    `${deps.DEFAULT_PROVIDER}/${deps.DEFAULT_MODEL}`;
+  const slashIndex = modelRef.indexOf("/");
+  const provider =
+    slashIndex === -1 ? deps.DEFAULT_PROVIDER : modelRef.slice(0, slashIndex);
+  const model = slashIndex === -1 ? modelRef : modelRef.slice(slashIndex + 1);
+  // Resolve thinking level
+  const thinkLevel = deps.resolveThinkingDefault({ cfg, provider, model });
+  // Resolve agent identity for personalized prompt
+  const identity = deps.resolveAgentIdentity(cfg, agentId);
+  const agentName = identity?.name?.trim() || "assistant";
+  // Build system prompt with conversation history
+  const basePrompt =
+    voiceConfig.responseSystemPrompt ??
+    `You are ${agentName}, a helpful voice assistant on a phone call. Keep responses brief and conversational (1-2 sentences max). Be natural and friendly. The caller's phone number is ${from}. You have access to tools - use them when helpful.`;
+  let extraSystemPrompt = basePrompt;
+  if (transcript.length > 0) {
+    const history = transcript
+      .map(
+        (entry) =>
+          `${entry.speaker === "bot" ? "You" : "Caller"}: ${entry.text}`,
+      )
+      .join("\n");
+    extraSystemPrompt = `${basePrompt}\n\nConversation so far:\n${history}`;
+  }
+  // Resolve timeout
+  const timeoutMs =
+    voiceConfig.responseTimeoutMs ?? deps.resolveAgentTimeoutMs({ cfg });
+  const runId = `voice:${callId}:${Date.now()}`;
+  try {
+    const result = await deps.runEmbeddedPiAgent({
+      sessionId,
+      sessionKey,
+      messageProvider: "voice",
+      sessionFile,
+      workspaceDir,
+      config: cfg,
+      prompt: userMessage,
+      provider,
+      model,
+      thinkLevel,
+      verboseLevel: "off",
+      timeoutMs,
+      runId,
+      lane: "voice",
+      extraSystemPrompt,
+      agentDir,
+    });
+    // Extract text from payloads, skipping error payloads and blank text
+    const texts = (result.payloads ?? [])
+      .filter((p) => p.text && !p.isError)
+      .map((p) => p.text?.trim())
+      .filter(Boolean);
+    const text = texts.join(" ") || null;
+    if (!text && result.meta.aborted) {
+      return { text: null, error: "Response generation was aborted" };
+    }
+    return { text };
+  } catch (err) {
+    console.error(`[voice-call] Response generation failed:`, err);
+    return { text: null, error: String(err) };
+  }
+}