voicecc 1.2.5 → 1.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -8
- package/bin/voicecc.js +27 -2
- package/package.json +1 -1
- package/voice-server/claude_llm_service.py +2 -6
- package/voice-server/heartbeat.py +1 -1
- package/voice-server/initial_prompt_test.py +150 -0
- package/voice-server/twilio_pipeline.py +27 -3
- package/voice-server/voice_pipeline.py +16 -1
package/README.md
CHANGED
|
@@ -9,10 +9,10 @@ A Voice Agent Platform running on Claude Code. Create, manage, and deploy conver
|
|
|
9
9
|
## Project Structure
|
|
10
10
|
|
|
11
11
|
```
|
|
12
|
-
server/
|
|
13
|
-
|
|
14
|
-
services/
|
|
15
|
-
index.ts Entry point (
|
|
12
|
+
voice-server/ Python FastAPI: real-time audio pipeline (VAD, STT, TTS, Claude sessions)
|
|
13
|
+
server/ Node.js orchestration: boots dashboard + voice server, manages integrations
|
|
14
|
+
services/ Tunnel, Twilio, browser calls, agents, device pairing
|
|
15
|
+
index.ts Entry point (spawns voice-server + dashboard, auto-starts integrations)
|
|
16
16
|
dashboard/ Web UI (Vite + React) + API routes (Hono)
|
|
17
17
|
lander/ Static landing page
|
|
18
18
|
init/ Default prompt templates for new agents
|
|
@@ -25,6 +25,7 @@ bin/ CLI entry point (voicecc command)
|
|
|
25
25
|
|
|
26
26
|
- macOS or Linux
|
|
27
27
|
- Node.js 18+
|
|
28
|
+
- Python 3.11+ with `venv`
|
|
28
29
|
- An ElevenLabs API key
|
|
29
30
|
|
|
30
31
|
### Terminal
|
|
@@ -41,11 +42,13 @@ voicecc
|
|
|
41
42
|
|
|
42
43
|
## How It Works
|
|
43
44
|
|
|
44
|
-
|
|
45
|
+
The platform runs two servers: a **Node.js orchestrator** (dashboard, integrations, CLI) and a **Python voice server** (real-time audio pipeline via Pipecat).
|
|
46
|
+
|
|
47
|
+
1. **Mic capture**: Browser captures audio via WebRTC, connected to the Python voice server
|
|
45
48
|
2. **Voice activity detection**: Silero VAD v5 detects speech segments
|
|
46
|
-
3. **Speech-to-text**: ElevenLabs Scribe
|
|
49
|
+
3. **Speech-to-text**: ElevenLabs Scribe transcribes audio
|
|
47
50
|
4. **Endpointing**: VAD silence-based turn detection
|
|
48
51
|
5. **Claude inference**: Transcript sent to Claude Agent SDK session with streaming response
|
|
49
52
|
6. **Narration**: Claude's response stripped of markdown and split into sentences
|
|
50
|
-
7. **Text-to-speech**: ElevenLabs streaming TTS
|
|
51
|
-
8. **Speaker playback**: Audio
|
|
53
|
+
7. **Text-to-speech**: ElevenLabs streaming TTS generates audio
|
|
54
|
+
8. **Speaker playback**: Audio streamed back through WebRTC
|
package/bin/voicecc.js
CHANGED
|
@@ -129,7 +129,11 @@ function ensurePython() {
|
|
|
129
129
|
|
|
130
130
|
if (process.platform !== "linux") {
|
|
131
131
|
console.error("ERROR: Python 3.12+ is required but not found.");
|
|
132
|
-
|
|
132
|
+
if (process.platform === "darwin") {
|
|
133
|
+
console.error("Install it with Homebrew: brew install python@3.12");
|
|
134
|
+
} else {
|
|
135
|
+
console.error("Install Python 3.12+ and run 'voicecc' again.");
|
|
136
|
+
}
|
|
133
137
|
process.exit(1);
|
|
134
138
|
}
|
|
135
139
|
|
|
@@ -157,7 +161,11 @@ function ensureVenvModule(systemPython) {
|
|
|
157
161
|
|
|
158
162
|
if (process.platform !== "linux") {
|
|
159
163
|
console.error("ERROR: Python venv module is missing.");
|
|
160
|
-
|
|
164
|
+
if (process.platform === "darwin") {
|
|
165
|
+
console.error("Reinstall Python with Homebrew: brew install python@3.12");
|
|
166
|
+
} else {
|
|
167
|
+
console.error("Install the venv module and run 'voicecc' again.");
|
|
168
|
+
}
|
|
161
169
|
process.exit(1);
|
|
162
170
|
}
|
|
163
171
|
|
|
@@ -195,6 +203,23 @@ function ensurePythonVenv() {
|
|
|
195
203
|
// Step 2: Ensure venv module is available
|
|
196
204
|
ensureVenvModule(systemPython);
|
|
197
205
|
|
|
206
|
+
// Step 2.5: Ensure system libraries needed by Python packages (OpenCV, audio, WebRTC)
|
|
207
|
+
if (process.platform === "linux") {
|
|
208
|
+
const requiredLibs = ["libGL.so.1", "libSM.so.6", "libsndfile.so.1"];
|
|
209
|
+
const missing = requiredLibs.some((lib) => {
|
|
210
|
+
try { execSync(`ldconfig -p | grep ${lib}`, { encoding: "utf-8" }); return false; } catch { return true; }
|
|
211
|
+
});
|
|
212
|
+
if (missing) {
|
|
213
|
+
console.log("Installing system libraries required by Python packages...");
|
|
214
|
+
try {
|
|
215
|
+
linuxInstallPackage("libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 libsndfile1 libportaudio2");
|
|
216
|
+
} catch (err) {
|
|
217
|
+
console.error(`Failed to install system libraries: ${err.message}`);
|
|
218
|
+
process.exit(1);
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
|
|
198
223
|
// Step 3: Create venv if needed
|
|
199
224
|
if (!existsSync(venvPython)) {
|
|
200
225
|
console.log("Setting up Python environment for voice server...");
|
package/voice-server/claude_llm_service.py
CHANGED
|
@@ -108,12 +108,8 @@ class ClaudeLLMService(LLMService):
|
|
|
108
108
|
self._settings.user_turn_completion_config = None
|
|
109
109
|
|
|
110
110
|
async def start(self, frame: StartFrame):
|
|
111
|
-
"""Handle pipeline start.
|
|
111
|
+
"""Handle pipeline start."""
|
|
112
112
|
await super().start(frame)
|
|
113
|
-
if self._config.initial_prompt and not self._initial_prompt_sent:
|
|
114
|
-
self._initial_prompt_sent = True
|
|
115
|
-
await self._ensure_client()
|
|
116
|
-
await self._send_to_claude(self._config.initial_prompt)
|
|
117
113
|
|
|
118
114
|
async def stop(self, frame: EndFrame):
|
|
119
115
|
"""Handle pipeline stop. Disconnects the Claude session."""
|
|
@@ -237,7 +233,7 @@ class ClaudeLLMService(LLMService):
|
|
|
237
233
|
allowed_tools=self._config.allowed_tools or [],
|
|
238
234
|
permission_mode="bypassPermissions",
|
|
239
235
|
include_partial_messages=True,
|
|
240
|
-
max_thinking_tokens=
|
|
236
|
+
max_thinking_tokens=0,
|
|
241
237
|
)
|
|
242
238
|
self._client = ClaudeSDKClient(options=options)
|
|
243
239
|
|
|
package/voice-server/heartbeat.py
CHANGED
@@ -314,7 +314,7 @@ async def _run_heartbeat_session(
|
|
|
314
314
|
allowed_tools=[],
|
|
315
315
|
permission_mode="bypassPermissions",
|
|
316
316
|
include_partial_messages=True,
|
|
317
|
-
max_thinking_tokens=
|
|
317
|
+
max_thinking_tokens=0,
|
|
318
318
|
)
|
|
319
319
|
client = ClaudeSDKClient(options=options)
|
|
320
320
|
await client.connect()
|
|
package/voice-server/initial_prompt_test.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Tests for agent-speaks-first behavior.
|
|
2
|
+
|
|
3
|
+
Verifies that when a call starts with an initial_prompt configured,
|
|
4
|
+
the agent produces a greeting (text output wrapped in response frames)
|
|
5
|
+
without any user input.
|
|
6
|
+
|
|
7
|
+
Run: cd voice-server && .venv/bin/python -m pytest initial_prompt_test.py -v
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
from unittest.mock import AsyncMock
|
|
12
|
+
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
from claude_agent_sdk import AssistantMessage, ResultMessage, TextBlock
|
|
16
|
+
from pipecat.frames.frames import (
|
|
17
|
+
LLMFullResponseEndFrame,
|
|
18
|
+
LLMFullResponseStartFrame,
|
|
19
|
+
LLMTextFrame,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
from claude_llm_service import ClaudeLLMService, ClaudeLLMServiceConfig
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ============================================================================
|
|
26
|
+
# HELPERS
|
|
27
|
+
# ============================================================================
|
|
28
|
+
|
|
29
|
+
def _make_fake_client(response_text: str = "Hello! How can I help?"):
|
|
30
|
+
"""Create a mock ClaudeSDKClient that returns a canned text response."""
|
|
31
|
+
client = AsyncMock()
|
|
32
|
+
client.connect = AsyncMock()
|
|
33
|
+
client.disconnect = AsyncMock()
|
|
34
|
+
client.query = AsyncMock()
|
|
35
|
+
|
|
36
|
+
async def fake_receive():
|
|
37
|
+
yield AssistantMessage(
|
|
38
|
+
content=[TextBlock(text=response_text)],
|
|
39
|
+
model="test",
|
|
40
|
+
)
|
|
41
|
+
yield ResultMessage(
|
|
42
|
+
subtype="success",
|
|
43
|
+
is_error=False,
|
|
44
|
+
duration_ms=0,
|
|
45
|
+
duration_api_ms=0,
|
|
46
|
+
num_turns=1,
|
|
47
|
+
session_id="test",
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
client.receive_response = fake_receive
|
|
51
|
+
return client
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _collect_frames(service: ClaudeLLMService) -> list:
|
|
55
|
+
"""Patch push_frame on a service to collect all output frames."""
|
|
56
|
+
frames = []
|
|
57
|
+
|
|
58
|
+
async def capture(frame, *args, **kwargs):
|
|
59
|
+
frames.append(frame)
|
|
60
|
+
|
|
61
|
+
service.push_frame = capture
|
|
62
|
+
return frames
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
async def _trigger_initial_prompt(service: ClaudeLLMService, prompt: str):
|
|
66
|
+
"""Reproduce what the pipeline's on_pipeline_started handler does."""
|
|
67
|
+
await service._ensure_client()
|
|
68
|
+
await service.push_frame(LLMFullResponseStartFrame())
|
|
69
|
+
await service._send_to_claude(prompt)
|
|
70
|
+
await service.push_frame(LLMFullResponseEndFrame())
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ============================================================================
|
|
74
|
+
# TESTS
|
|
75
|
+
# ============================================================================
|
|
76
|
+
|
|
77
|
+
@pytest.mark.asyncio
|
|
78
|
+
async def test_agent_greets_user_on_call_start():
|
|
79
|
+
"""When a call starts with an initial_prompt, the agent should produce
|
|
80
|
+
a spoken greeting — text frames wrapped in response start/end frames —
|
|
81
|
+
without any user input."""
|
|
82
|
+
client = _make_fake_client("Hey there! Welcome to the call.")
|
|
83
|
+
config = ClaudeLLMServiceConfig(
|
|
84
|
+
cwd="/tmp",
|
|
85
|
+
system_prompt="You are a test agent.",
|
|
86
|
+
initial_prompt="Greet the user briefly.",
|
|
87
|
+
existing_client=client,
|
|
88
|
+
)
|
|
89
|
+
service = ClaudeLLMService(config=config)
|
|
90
|
+
frames = _collect_frames(service)
|
|
91
|
+
|
|
92
|
+
await _trigger_initial_prompt(service, config.initial_prompt)
|
|
93
|
+
|
|
94
|
+
# The agent should have produced spoken output
|
|
95
|
+
text_frames = [f for f in frames if isinstance(f, LLMTextFrame)]
|
|
96
|
+
assert len(text_frames) >= 1, "Agent did not produce any spoken output"
|
|
97
|
+
full_text = " ".join(f.text for f in text_frames)
|
|
98
|
+
assert len(full_text) > 0, "Agent greeting was empty"
|
|
99
|
+
|
|
100
|
+
# The prompt should have been sent to Claude
|
|
101
|
+
client.query.assert_awaited_once_with("Greet the user briefly.")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@pytest.mark.asyncio
|
|
105
|
+
async def test_greeting_is_wrapped_for_tts():
|
|
106
|
+
"""The greeting must be wrapped in response start/end frames so TTS
|
|
107
|
+
treats it as a single utterance (no gaps, no dropped last sentence)."""
|
|
108
|
+
config = ClaudeLLMServiceConfig(
|
|
109
|
+
cwd="/tmp",
|
|
110
|
+
system_prompt="You are a test agent.",
|
|
111
|
+
initial_prompt="Say hello.",
|
|
112
|
+
existing_client=_make_fake_client("Hi! Nice to meet you."),
|
|
113
|
+
)
|
|
114
|
+
service = ClaudeLLMService(config=config)
|
|
115
|
+
frames = _collect_frames(service)
|
|
116
|
+
|
|
117
|
+
await _trigger_initial_prompt(service, config.initial_prompt)
|
|
118
|
+
|
|
119
|
+
frame_types = [type(f) for f in frames]
|
|
120
|
+
|
|
121
|
+
# Must have: start, then text(s), then end
|
|
122
|
+
assert LLMFullResponseStartFrame in frame_types, "Missing response start"
|
|
123
|
+
assert LLMFullResponseEndFrame in frame_types, "Missing response end"
|
|
124
|
+
|
|
125
|
+
start_idx = frame_types.index(LLMFullResponseStartFrame)
|
|
126
|
+
end_idx = frame_types.index(LLMFullResponseEndFrame)
|
|
127
|
+
text_indices = [i for i, t in enumerate(frame_types) if t == LLMTextFrame]
|
|
128
|
+
|
|
129
|
+
assert text_indices, "No text frames between start and end"
|
|
130
|
+
assert all(start_idx < i < end_idx for i in text_indices), (
|
|
131
|
+
"Text frames must appear between start and end for TTS to work correctly"
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@pytest.mark.asyncio
|
|
136
|
+
async def test_no_greeting_without_initial_prompt():
|
|
137
|
+
"""Without an initial_prompt, the agent should stay silent on call start."""
|
|
138
|
+
config = ClaudeLLMServiceConfig(
|
|
139
|
+
cwd="/tmp",
|
|
140
|
+
system_prompt="You are a test agent.",
|
|
141
|
+
initial_prompt=None,
|
|
142
|
+
existing_client=_make_fake_client(),
|
|
143
|
+
)
|
|
144
|
+
service = ClaudeLLMService(config=config)
|
|
145
|
+
frames = _collect_frames(service)
|
|
146
|
+
|
|
147
|
+
# No trigger — the pipeline would not call _trigger_initial_prompt
|
|
148
|
+
# because initial_prompt is None. Verify that's the guard.
|
|
149
|
+
assert config.initial_prompt is None
|
|
150
|
+
assert len(frames) == 0, "Agent should stay silent without initial_prompt"
|
|
package/voice-server/twilio_pipeline.py
CHANGED
@@ -72,8 +72,16 @@ async def handle_twilio_websocket(websocket: WebSocket, call_token: str) -> None
|
|
|
72
72
|
try:
|
|
73
73
|
# Read messages until we get the "start" event
|
|
74
74
|
while True:
|
|
75
|
-
|
|
76
|
-
|
|
75
|
+
message = await websocket.receive()
|
|
76
|
+
|
|
77
|
+
# Skip binary frames (early audio before start)
|
|
78
|
+
if message.get("type") == "websocket.disconnect":
|
|
79
|
+
logger.warning("[twilio] WebSocket disconnected before start event")
|
|
80
|
+
return
|
|
81
|
+
if "text" not in message:
|
|
82
|
+
continue
|
|
83
|
+
|
|
84
|
+
msg = json.loads(message["text"])
|
|
77
85
|
|
|
78
86
|
if msg.get("event") == "start":
|
|
79
87
|
start_data = msg.get("start", {})
|
|
@@ -173,7 +181,12 @@ async def _run_twilio_pipeline(
|
|
|
173
181
|
llm_config: Claude LLM service configuration
|
|
174
182
|
voice_id: ElevenLabs voice ID
|
|
175
183
|
"""
|
|
176
|
-
serializer = TwilioFrameSerializer(
|
|
184
|
+
serializer = TwilioFrameSerializer(
|
|
185
|
+
stream_sid=stream_sid,
|
|
186
|
+
call_sid=call_sid,
|
|
187
|
+
account_sid=config.twilio_account_sid,
|
|
188
|
+
auth_token=config.twilio_auth_token,
|
|
189
|
+
)
|
|
177
190
|
|
|
178
191
|
transport = FastAPIWebsocketTransport(
|
|
179
192
|
websocket=websocket,
|
|
@@ -233,5 +246,16 @@ async def _run_twilio_pipeline(
|
|
|
233
246
|
params=PipelineParams(allow_interruptions=True),
|
|
234
247
|
)
|
|
235
248
|
|
|
249
|
+
# For Twilio, the WebSocket is already connected, so send the
|
|
250
|
+
# initial prompt shortly after the pipeline starts.
|
|
251
|
+
async def _send_initial_prompt():
|
|
252
|
+
await asyncio.sleep(1) # Let the pipeline fully initialize
|
|
253
|
+
if llm_config.initial_prompt and not claude_llm._initial_prompt_sent:
|
|
254
|
+
claude_llm._initial_prompt_sent = True
|
|
255
|
+
await claude_llm._ensure_client()
|
|
256
|
+
await claude_llm._send_to_claude(llm_config.initial_prompt)
|
|
257
|
+
|
|
258
|
+
asyncio.create_task(_send_initial_prompt())
|
|
259
|
+
|
|
236
260
|
runner = PipelineRunner()
|
|
237
261
|
await runner.run(task)
|
|
package/voice-server/voice_pipeline.py
CHANGED
@@ -19,7 +19,11 @@ Responsibilities:
|
|
|
19
19
|
import aiohttp
|
|
20
20
|
import logging
|
|
21
21
|
|
|
22
|
-
from pipecat.frames.frames import
|
|
22
|
+
from pipecat.frames.frames import (
|
|
23
|
+
LLMFullResponseEndFrame,
|
|
24
|
+
LLMFullResponseStartFrame,
|
|
25
|
+
LLMMessagesFrame,
|
|
26
|
+
)
|
|
23
27
|
from pipecat.pipeline.pipeline import Pipeline
|
|
24
28
|
from pipecat.pipeline.runner import PipelineRunner
|
|
25
29
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
|
@@ -97,6 +101,7 @@ async def bot(runner_args: SmallWebRTCRunnerArguments):
|
|
|
97
101
|
claude_config = ClaudeLLMServiceConfig(
|
|
98
102
|
cwd=config.default_cwd,
|
|
99
103
|
system_prompt=system_prompt,
|
|
104
|
+
initial_prompt="The user just joined the call. Greet them briefly.",
|
|
100
105
|
)
|
|
101
106
|
claude_llm = ClaudeLLMService(config=claude_config)
|
|
102
107
|
|
|
@@ -135,6 +140,16 @@ async def bot(runner_args: SmallWebRTCRunnerArguments):
|
|
|
135
140
|
params=PipelineParams(allow_interruptions=True),
|
|
136
141
|
)
|
|
137
142
|
|
|
143
|
+
# Send initial prompt once the pipeline is fully ready
|
|
144
|
+
@task.event_handler("on_pipeline_started")
|
|
145
|
+
async def on_pipeline_started(task_ref, *args):
|
|
146
|
+
if claude_config.initial_prompt and not claude_llm._initial_prompt_sent:
|
|
147
|
+
claude_llm._initial_prompt_sent = True
|
|
148
|
+
await claude_llm._ensure_client()
|
|
149
|
+
await claude_llm.push_frame(LLMFullResponseStartFrame())
|
|
150
|
+
await claude_llm._send_to_claude(claude_config.initial_prompt)
|
|
151
|
+
await claude_llm.push_frame(LLMFullResponseEndFrame())
|
|
152
|
+
|
|
138
153
|
runner = PipelineRunner(handle_sigint=False)
|
|
139
154
|
await runner.run(task)
|
|
140
155
|
|