npm - voicecc - Versions diffs - 1.2.6 → 1.2.8 - Mend

voicecc 1.2.6 → 1.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/README.md +11 -8
package/bin/voicecc.js +10 -2
package/dashboard/routes/agents.ts +41 -2
package/dashboard/server.ts +4 -39
package/package.json +1 -1
package/voice-server/claude_llm_service.py +2 -6
package/voice-server/heartbeat.py +1 -1
package/voice-server/initial_prompt_test.py +150 -0
package/voice-server/twilio_pipeline.py +43 -5
package/voice-server/voice_pipeline.py +16 -1

package/README.md CHANGED Viewed

@@ -9,10 +9,10 @@ A Voice Agent Platform running on Claude Code. Create, manage, and deploy conver
 ## Project Structure
 ```
-server/             Backend: voice pipeline + orchestration services
-  voice/            Real-time audio: STT, TTS, VAD, session management
-  services/         Orchestration: tunnel, Twilio, browser calls, agents
-  index.ts          Entry point (boots dashboard + auto-starts integrations)
+voice-server/       Python FastAPI: real-time audio pipeline (VAD, STT, TTS, Claude sessions)
+server/             Node.js orchestration: boots dashboard + voice server, manages integrations
+  services/         Tunnel, Twilio, browser calls, agents, device pairing
+  index.ts          Entry point (spawns voice-server + dashboard, auto-starts integrations)
 dashboard/          Web UI (Vite + React) + API routes (Hono)
 lander/             Static landing page
 init/               Default prompt templates for new agents
@@ -25,6 +25,7 @@ bin/                CLI entry point (voicecc command)
 - macOS or Linux
 - Node.js 18+
+- Python 3.12+ with `venv`
 - An ElevenLabs API key
 ### Terminal
@@ -41,11 +42,13 @@ voicecc
 ## How It Works
-1. **Mic capture**: Browser captures 16kHz mono PCM via WebRTC
+The platform runs two servers: a **Node.js orchestrator** (dashboard, integrations, CLI) and a **Python voice server** (real-time audio pipeline via Pipecat).
+1. **Mic capture**: Browser captures audio via WebRTC, connected to the Python voice server
 2. **Voice activity detection**: Silero VAD v5 detects speech segments
-3. **Speech-to-text**: ElevenLabs Scribe API transcribes audio
+3. **Speech-to-text**: ElevenLabs Scribe transcribes audio
 4. **Endpointing**: VAD silence-based turn detection
 5. **Claude inference**: Transcript sent to Claude Agent SDK session with streaming response
 6. **Narration**: Claude's response stripped of markdown and split into sentences
-7. **Text-to-speech**: ElevenLabs streaming TTS API generates audio
-8. **Speaker playback**: Audio output through browser at 24kHz
+7. **Text-to-speech**: ElevenLabs streaming TTS generates audio
+8. **Speaker playback**: Audio streamed back through WebRTC

package/bin/voicecc.js CHANGED Viewed

@@ -129,7 +129,11 @@ function ensurePython() {
   if (process.platform !== "linux") {
     console.error("ERROR: Python 3.12+ is required but not found.");
-    console.error("Install Python 3.12+ and run 'voicecc' again.");
+    if (process.platform === "darwin") {
+      console.error("Install it with Homebrew: brew install python@3.12");
+    } else {
+      console.error("Install Python 3.12+ and run 'voicecc' again.");
+    }
     process.exit(1);
   }
@@ -157,7 +161,11 @@ function ensureVenvModule(systemPython) {
   if (process.platform !== "linux") {
     console.error("ERROR: Python venv module is missing.");
-    console.error("Install it and run 'voicecc' again.");
+    if (process.platform === "darwin") {
+      console.error("Reinstall Python with Homebrew: brew install python@3.12");
+    } else {
+      console.error("Install the venv module and run 'voicecc' again.");
+    }
     process.exit(1);
   }

package/dashboard/routes/agents.ts CHANGED Viewed

@@ -11,6 +11,7 @@
  */
 import { Hono } from "hono";
+import twilioSdk from "twilio";
 import {
   listAgents,
   getAgent,
@@ -21,6 +22,8 @@ import {
   importAgent,
 } from "../../server/services/agent-store.js";
 import type { AgentConfig } from "../../server/services/agent-store.js";
+import { readEnv } from "../../server/services/env.js";
+import { getTunnelUrl } from "../../server/services/tunnel.js";
 /** Base URL for the Python voice server API */
 const VOICE_API_URL = process.env.VOICE_SERVER_URL ?? "http://localhost:7861";
@@ -155,11 +158,30 @@ export function agentsRoutes(): Hono {
   app.post("/:id/call", async (c) => {
     const id = c.req.param("id");
     try {
+      const envVars = await readEnv();
+      const accountSid = envVars.TWILIO_ACCOUNT_SID;
+      const authToken = envVars.TWILIO_AUTH_TOKEN;
+      const userPhone = envVars.USER_PHONE_NUMBER;
+      const tunnelUrl = getTunnelUrl();
+      if (!accountSid || !authToken) {
+        return c.json({ error: "Twilio credentials not configured" }, 400);
+      }
+      if (!userPhone) {
+        return c.json({ error: "User phone number not configured" }, 400);
+      }
+      if (!tunnelUrl) {
+        return c.json({ error: "Tunnel is not running" }, 400);
+      }
+      const token = crypto.randomUUID();
+      // Register the token with the Python voice server
       const response = await fetch(`${VOICE_API_URL}/register-call`, {
         method: "POST",
         headers: { "Content-Type": "application/json" },
         body: JSON.stringify({
-          token: crypto.randomUUID(),
+          token,
           agent_id: id,
           initial_prompt: "The user pressed the 'Call Me' button. Greet them and ask how you can help.",
         }),
@@ -168,7 +190,24 @@ export function agentsRoutes(): Hono {
         const data = await response.json();
         throw new Error(data.error ?? "Voice server error");
       }
-      return c.json({ success: true });
+      // Place the actual Twilio call
+      const client = twilioSdk(accountSid, authToken);
+      const numbers = await client.incomingPhoneNumbers.list({ limit: 1 });
+      if (numbers.length === 0) {
+        return c.json({ error: "No Twilio phone numbers found on this account" }, 400);
+      }
+      const tunnelHost = tunnelUrl.replace(/^https?:\/\//, "");
+      const twiml = `<Response><Connect><Stream url="wss://${tunnelHost}/media/${token}?agentId=${id}" /></Connect></Response>`;
+      const call = await client.calls.create({
+        to: userPhone,
+        from: numbers[0].phoneNumber,
+        twiml,
+      });
+      return c.json({ success: true, callSid: call.sid });
     } catch (err) {
       return c.json({ error: (err as Error).message }, 400);
     }

package/dashboard/server.ts CHANGED Viewed

@@ -16,10 +16,9 @@ import { readFileSync } from "fs";
 import { access } from "fs/promises";
 import { join } from "path";
 import { homedir } from "os";
-import { WebSocket as WsWebSocket, WebSocketServer } from "ws";
+import { attachMediaProxy } from "./ws-proxy.js";
-import type { IncomingMessage } from "http";
-import type { Duplex } from "stream";
+import type http from "http";
 import { claudeMdRoutes } from "./routes/claude-md.js";
 import { conversationRoutes } from "./routes/conversations.js";
@@ -142,42 +141,8 @@ export async function startDashboard(): Promise<number> {
         });
         server.on("error", reject);
-        // Proxy /media/:token WebSocket upgrades to the Python server
-        const wss = new WebSocketServer({ noServer: true });
-        server.on("upgrade", (req: IncomingMessage, socket: Duplex, head: Buffer) => {
-          const url = req.url ?? "";
-          const match = url.match(/^\/media\/([a-f0-9-]+)(?:\?.*)?$/);
-          if (!match) return; // Not a Twilio media WebSocket -- let it fall through
-          const targetWsUrl = VOICE_API_URL.replace(/^http/, "ws") + url;
-          const upstream = new WsWebSocket(targetWsUrl);
-          upstream.on("open", () => {
-            wss.handleUpgrade(req, socket, head, (clientWs) => {
-              // Bidirectional message proxy
-              clientWs.on("message", (data) => {
-                if (upstream.readyState === WsWebSocket.OPEN) {
-                  upstream.send(data);
-                }
-              });
-              upstream.on("message", (data) => {
-                if (clientWs.readyState === WsWebSocket.OPEN) {
-                  clientWs.send(data);
-                }
-              });
-              clientWs.on("close", () => upstream.close());
-              upstream.on("close", () => clientWs.close());
-              clientWs.on("error", () => upstream.close());
-              upstream.on("error", () => clientWs.close());
-            });
-          });
-          upstream.on("error", (err) => {
-            console.error(`[dashboard] Twilio WS proxy error: ${err.message}`);
-            socket.destroy();
-          });
-        });
+        // Proxy /media/:token WebSocket upgrades to the Python voice server
+        attachMediaProxy(server as unknown as http.Server, VOICE_API_URL);
       });
       setDashboardPort(port);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "voicecc",
-  "version": "1.2.6",
+  "version": "1.2.8",
   "description": "Voice Agent Platform running on Claude Code -- create and deploy conversational voice agents with ElevenLabs STT/TTS and VAD",
   "repository": {
     "type": "git",

package/voice-server/claude_llm_service.py CHANGED Viewed

@@ -108,12 +108,8 @@ class ClaudeLLMService(LLMService):
         self._settings.user_turn_completion_config = None
     async def start(self, frame: StartFrame):
-        """Handle pipeline start. Sends initial_prompt if configured."""
+        """Handle pipeline start."""
         await super().start(frame)
-        if self._config.initial_prompt and not self._initial_prompt_sent:
-            self._initial_prompt_sent = True
-            await self._ensure_client()
-            await self._send_to_claude(self._config.initial_prompt)
     async def stop(self, frame: EndFrame):
         """Handle pipeline stop. Disconnects the Claude session."""
@@ -237,7 +233,7 @@ class ClaudeLLMService(LLMService):
                 allowed_tools=self._config.allowed_tools or [],
                 permission_mode="bypassPermissions",
                 include_partial_messages=True,
-                max_thinking_tokens=10000,
+                max_thinking_tokens=0,
             )
             self._client = ClaudeSDKClient(options=options)

package/voice-server/heartbeat.py CHANGED Viewed

@@ -314,7 +314,7 @@ async def _run_heartbeat_session(
         allowed_tools=[],
         permission_mode="bypassPermissions",
         include_partial_messages=True,
-        max_thinking_tokens=10000,
+        max_thinking_tokens=0,
     )
     client = ClaudeSDKClient(options=options)
     await client.connect()

package/voice-server/initial_prompt_test.py ADDED Viewed

@@ -0,0 +1,150 @@
+"""Tests for agent-speaks-first behavior.
+Verifies that when a call starts with an initial_prompt configured,
+the agent produces a greeting (text output wrapped in response frames)
+without any user input.
+Run: cd voice-server && .venv/bin/python -m pytest initial_prompt_test.py -v
+"""
+import asyncio
+from unittest.mock import AsyncMock
+import pytest
+from claude_agent_sdk import AssistantMessage, ResultMessage, TextBlock
+from pipecat.frames.frames import (
+    LLMFullResponseEndFrame,
+    LLMFullResponseStartFrame,
+    LLMTextFrame,
+)
+from claude_llm_service import ClaudeLLMService, ClaudeLLMServiceConfig
+# ============================================================================
+# HELPERS
+# ============================================================================
+def _make_fake_client(response_text: str = "Hello! How can I help?"):
+    """Create a mock ClaudeSDKClient that returns a canned text response."""
+    client = AsyncMock()
+    client.connect = AsyncMock()
+    client.disconnect = AsyncMock()
+    client.query = AsyncMock()
+    async def fake_receive():
+        yield AssistantMessage(
+            content=[TextBlock(text=response_text)],
+            model="test",
+        )
+        yield ResultMessage(
+            subtype="success",
+            is_error=False,
+            duration_ms=0,
+            duration_api_ms=0,
+            num_turns=1,
+            session_id="test",
+        )
+    client.receive_response = fake_receive
+    return client
+def _collect_frames(service: ClaudeLLMService) -> list:
+    """Patch push_frame on a service to collect all output frames."""
+    frames = []
+    async def capture(frame, *args, **kwargs):
+        frames.append(frame)
+    service.push_frame = capture
+    return frames
+async def _trigger_initial_prompt(service: ClaudeLLMService, prompt: str):
+    """Reproduce what the pipeline's on_pipeline_started handler does."""
+    await service._ensure_client()
+    await service.push_frame(LLMFullResponseStartFrame())
+    await service._send_to_claude(prompt)
+    await service.push_frame(LLMFullResponseEndFrame())
+# ============================================================================
+# TESTS
+# ============================================================================
+@pytest.mark.asyncio
+async def test_agent_greets_user_on_call_start():
+    """When a call starts with an initial_prompt, the agent should produce
+    a spoken greeting — text frames wrapped in response start/end frames —
+    without any user input."""
+    client = _make_fake_client("Hey there! Welcome to the call.")
+    config = ClaudeLLMServiceConfig(
+        cwd="/tmp",
+        system_prompt="You are a test agent.",
+        initial_prompt="Greet the user briefly.",
+        existing_client=client,
+    )
+    service = ClaudeLLMService(config=config)
+    frames = _collect_frames(service)
+    await _trigger_initial_prompt(service, config.initial_prompt)
+    # The agent should have produced spoken output
+    text_frames = [f for f in frames if isinstance(f, LLMTextFrame)]
+    assert len(text_frames) >= 1, "Agent did not produce any spoken output"
+    full_text = " ".join(f.text for f in text_frames)
+    assert len(full_text) > 0, "Agent greeting was empty"
+    # The prompt should have been sent to Claude
+    client.query.assert_awaited_once_with("Greet the user briefly.")
+@pytest.mark.asyncio
+async def test_greeting_is_wrapped_for_tts():
+    """The greeting must be wrapped in response start/end frames so TTS
+    treats it as a single utterance (no gaps, no dropped last sentence)."""
+    config = ClaudeLLMServiceConfig(
+        cwd="/tmp",
+        system_prompt="You are a test agent.",
+        initial_prompt="Say hello.",
+        existing_client=_make_fake_client("Hi! Nice to meet you."),
+    )
+    service = ClaudeLLMService(config=config)
+    frames = _collect_frames(service)
+    await _trigger_initial_prompt(service, config.initial_prompt)
+    frame_types = [type(f) for f in frames]
+    # Must have: start, then text(s), then end
+    assert LLMFullResponseStartFrame in frame_types, "Missing response start"
+    assert LLMFullResponseEndFrame in frame_types, "Missing response end"
+    start_idx = frame_types.index(LLMFullResponseStartFrame)
+    end_idx = frame_types.index(LLMFullResponseEndFrame)
+    text_indices = [i for i, t in enumerate(frame_types) if t == LLMTextFrame]
+    assert text_indices, "No text frames between start and end"
+    assert all(start_idx < i < end_idx for i in text_indices), (
+        "Text frames must appear between start and end for TTS to work correctly"
+    )
+@pytest.mark.asyncio
+async def test_no_greeting_without_initial_prompt():
+    """Without an initial_prompt, the agent should stay silent on call start."""
+    config = ClaudeLLMServiceConfig(
+        cwd="/tmp",
+        system_prompt="You are a test agent.",
+        initial_prompt=None,
+        existing_client=_make_fake_client(),
+    )
+    service = ClaudeLLMService(config=config)
+    frames = _collect_frames(service)
+    # No trigger — the pipeline would not call _trigger_initial_prompt
+    # because initial_prompt is None. Verify that's the guard.
+    assert config.initial_prompt is None
+    assert len(frames) == 0, "Agent should stay silent without initial_prompt"

package/voice-server/twilio_pipeline.py CHANGED Viewed

@@ -21,10 +21,16 @@ import os
 import aiohttp
 from fastapi import WebSocket
+from pipecat.frames.frames import LLMFullResponseEndFrame, LLMFullResponseStartFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
-from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.processors.aggregators.llm_context import LLMContext
+from pipecat.processors.aggregators.llm_response_universal import (
+    LLMContextAggregatorPair,
+    LLMUserAggregatorParams,
+)
 from pipecat.serializers.twilio import TwilioFrameSerializer
 from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
 from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
@@ -72,7 +78,19 @@ async def handle_twilio_websocket(websocket: WebSocket, call_token: str) -> None
     try:
         # Read messages until we get the "start" event
         while True:
-            raw = await websocket.receive_text()
+            message = await websocket.receive()
+            if message.get("type") == "websocket.disconnect":
+                logger.warning("[twilio] WebSocket disconnected before start event")
+                return
+            # Twilio may send frames as text or binary
+            raw = message.get("text") or (
+                message.get("bytes", b"").decode("utf-8") if message.get("bytes") else None
+            )
+            if not raw:
+                continue
             msg = json.loads(raw)
             if msg.get("event") == "start":
@@ -173,7 +191,12 @@ async def _run_twilio_pipeline(
         llm_config: Claude LLM service configuration
         voice_id: ElevenLabs voice ID
     """
-    serializer = TwilioFrameSerializer(stream_sid=stream_sid, call_sid=call_sid)
+    serializer = TwilioFrameSerializer(
+        stream_sid=stream_sid,
+        call_sid=call_sid,
+        account_sid=config.twilio_account_sid,
+        auth_token=config.twilio_auth_token,
+    )
     transport = FastAPIWebsocketTransport(
         websocket=websocket,
@@ -211,8 +234,13 @@ async def _run_twilio_pipeline(
         narration = NarrationProcessor()
         # Context aggregator
-        context = OpenAILLMContext(messages=[], tools=[])
-        context_aggregator = claude_llm.create_context_aggregator(context)
+        context = LLMContext()
+        context_aggregator = LLMContextAggregatorPair(
+            context,
+            user_params=LLMUserAggregatorParams(
+                vad_analyzer=SileroVADAnalyzer(),
+            ),
+        )
         # Pipeline
         pipeline = Pipeline(
@@ -233,5 +261,15 @@ async def _run_twilio_pipeline(
             params=PipelineParams(allow_interruptions=True),
         )
+        # Send initial prompt once the pipeline is fully ready
+        @task.event_handler("on_pipeline_started")
+        async def on_pipeline_started(task_ref, *args):
+            if llm_config.initial_prompt and not claude_llm._initial_prompt_sent:
+                claude_llm._initial_prompt_sent = True
+                await claude_llm._ensure_client()
+                await claude_llm.push_frame(LLMFullResponseStartFrame())
+                await claude_llm._send_to_claude(llm_config.initial_prompt)
+                await claude_llm.push_frame(LLMFullResponseEndFrame())
         runner = PipelineRunner()
         await runner.run(task)

package/voice-server/voice_pipeline.py CHANGED Viewed

@@ -19,7 +19,11 @@ Responsibilities:
 import aiohttp
 import logging
-from pipecat.frames.frames import LLMMessagesFrame
+from pipecat.frames.frames import (
+    LLMFullResponseEndFrame,
+    LLMFullResponseStartFrame,
+    LLMMessagesFrame,
+)
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -97,6 +101,7 @@ async def bot(runner_args: SmallWebRTCRunnerArguments):
         claude_config = ClaudeLLMServiceConfig(
             cwd=config.default_cwd,
             system_prompt=system_prompt,
+            initial_prompt="The user just joined the call. Greet them briefly.",
         )
         claude_llm = ClaudeLLMService(config=claude_config)
@@ -135,6 +140,16 @@ async def bot(runner_args: SmallWebRTCRunnerArguments):
             params=PipelineParams(allow_interruptions=True),
         )
+        # Send initial prompt once the pipeline is fully ready
+        @task.event_handler("on_pipeline_started")
+        async def on_pipeline_started(task_ref, *args):
+            if claude_config.initial_prompt and not claude_llm._initial_prompt_sent:
+                claude_llm._initial_prompt_sent = True
+                await claude_llm._ensure_client()
+                await claude_llm.push_frame(LLMFullResponseStartFrame())
+                await claude_llm._send_to_claude(claude_config.initial_prompt)
+                await claude_llm.push_frame(LLMFullResponseEndFrame())
         runner = PipelineRunner(handle_sigint=False)
         await runner.run(task)