voicecc 1.2.6 → 1.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,10 +9,10 @@ A Voice Agent Platform running on Claude Code. Create, manage, and deploy conver
9
9
  ## Project Structure
10
10
 
11
11
  ```
12
- server/ Backend: voice pipeline + orchestration services
13
- voice/ Real-time audio: STT, TTS, VAD, session management
14
- services/ Orchestration: tunnel, Twilio, browser calls, agents
15
- index.ts Entry point (boots dashboard + auto-starts integrations)
12
+ voice-server/ Python FastAPI: real-time audio pipeline (VAD, STT, TTS, Claude sessions)
13
+ server/ Node.js orchestration: boots dashboard + voice server, manages integrations
14
+ services/ Tunnel, Twilio, browser calls, agents, device pairing
15
+ index.ts Entry point (spawns voice-server + dashboard, auto-starts integrations)
16
16
  dashboard/ Web UI (Vite + React) + API routes (Hono)
17
17
  lander/ Static landing page
18
18
  init/ Default prompt templates for new agents
@@ -25,6 +25,7 @@ bin/ CLI entry point (voicecc command)
25
25
 
26
26
  - macOS or Linux
27
27
  - Node.js 18+
28
+ - Python 3.12+ with `venv`
28
29
  - An ElevenLabs API key
29
30
 
30
31
  ### Terminal
@@ -41,11 +42,13 @@ voicecc
41
42
 
42
43
  ## How It Works
43
44
 
44
- 1. **Mic capture**: Browser captures 16kHz mono PCM via WebRTC
45
+ The platform runs two servers: a **Node.js orchestrator** (dashboard, integrations, CLI) and a **Python voice server** (real-time audio pipeline via Pipecat).
46
+
47
+ 1. **Mic capture**: Browser captures audio and streams it over WebRTC to the Python voice server
45
48
  2. **Voice activity detection**: Silero VAD v5 detects speech segments
46
- 3. **Speech-to-text**: ElevenLabs Scribe API transcribes audio
49
+ 3. **Speech-to-text**: ElevenLabs Scribe transcribes audio
47
50
  4. **Endpointing**: VAD silence-based turn detection
48
51
  5. **Claude inference**: Transcript sent to Claude Agent SDK session with streaming response
49
52
  6. **Narration**: Claude's response stripped of markdown and split into sentences
50
- 7. **Text-to-speech**: ElevenLabs streaming TTS API generates audio
51
- 8. **Speaker playback**: Audio output through browser at 24kHz
53
+ 7. **Text-to-speech**: ElevenLabs streaming TTS generates audio
54
+ 8. **Speaker playback**: Audio streamed back through WebRTC
package/bin/voicecc.js CHANGED
@@ -129,7 +129,11 @@ function ensurePython() {
129
129
 
130
130
  if (process.platform !== "linux") {
131
131
  console.error("ERROR: Python 3.12+ is required but not found.");
132
- console.error("Install Python 3.12+ and run 'voicecc' again.");
132
+ if (process.platform === "darwin") {
133
+ console.error("Install it with Homebrew: brew install python@3.12");
134
+ } else {
135
+ console.error("Install Python 3.12+ and run 'voicecc' again.");
136
+ }
133
137
  process.exit(1);
134
138
  }
135
139
 
@@ -157,7 +161,11 @@ function ensureVenvModule(systemPython) {
157
161
 
158
162
  if (process.platform !== "linux") {
159
163
  console.error("ERROR: Python venv module is missing.");
160
- console.error("Install it and run 'voicecc' again.");
164
+ if (process.platform === "darwin") {
165
+ console.error("Reinstall Python with Homebrew: brew install python@3.12");
166
+ } else {
167
+ console.error("Install the venv module and run 'voicecc' again.");
168
+ }
161
169
  process.exit(1);
162
170
  }
163
171
 
@@ -11,6 +11,7 @@
11
11
  */
12
12
 
13
13
  import { Hono } from "hono";
14
+ import twilioSdk from "twilio";
14
15
  import {
15
16
  listAgents,
16
17
  getAgent,
@@ -21,6 +22,8 @@ import {
21
22
  importAgent,
22
23
  } from "../../server/services/agent-store.js";
23
24
  import type { AgentConfig } from "../../server/services/agent-store.js";
25
+ import { readEnv } from "../../server/services/env.js";
26
+ import { getTunnelUrl } from "../../server/services/tunnel.js";
24
27
 
25
28
  /** Base URL for the Python voice server API */
26
29
  const VOICE_API_URL = process.env.VOICE_SERVER_URL ?? "http://localhost:7861";
@@ -155,11 +158,30 @@ export function agentsRoutes(): Hono {
155
158
  app.post("/:id/call", async (c) => {
156
159
  const id = c.req.param("id");
157
160
  try {
161
+ const envVars = await readEnv();
162
+ const accountSid = envVars.TWILIO_ACCOUNT_SID;
163
+ const authToken = envVars.TWILIO_AUTH_TOKEN;
164
+ const userPhone = envVars.USER_PHONE_NUMBER;
165
+ const tunnelUrl = getTunnelUrl();
166
+
167
+ if (!accountSid || !authToken) {
168
+ return c.json({ error: "Twilio credentials not configured" }, 400);
169
+ }
170
+ if (!userPhone) {
171
+ return c.json({ error: "User phone number not configured" }, 400);
172
+ }
173
+ if (!tunnelUrl) {
174
+ return c.json({ error: "Tunnel is not running" }, 400);
175
+ }
176
+
177
+ const token = crypto.randomUUID();
178
+
179
+ // Register the token with the Python voice server
158
180
  const response = await fetch(`${VOICE_API_URL}/register-call`, {
159
181
  method: "POST",
160
182
  headers: { "Content-Type": "application/json" },
161
183
  body: JSON.stringify({
162
- token: crypto.randomUUID(),
184
+ token,
163
185
  agent_id: id,
164
186
  initial_prompt: "The user pressed the 'Call Me' button. Greet them and ask how you can help.",
165
187
  }),
@@ -168,7 +190,24 @@ export function agentsRoutes(): Hono {
168
190
  const data = await response.json();
169
191
  throw new Error(data.error ?? "Voice server error");
170
192
  }
171
- return c.json({ success: true });
193
+
194
+ // Place the actual Twilio call
195
+ const client = twilioSdk(accountSid, authToken);
196
+ const numbers = await client.incomingPhoneNumbers.list({ limit: 1 });
197
+ if (numbers.length === 0) {
198
+ return c.json({ error: "No Twilio phone numbers found on this account" }, 400);
199
+ }
200
+
201
+ const tunnelHost = tunnelUrl.replace(/^https?:\/\//, "");
202
+ const twiml = `<Response><Connect><Stream url="wss://${tunnelHost}/media/${token}?agentId=${id}" /></Connect></Response>`;
203
+
204
+ const call = await client.calls.create({
205
+ to: userPhone,
206
+ from: numbers[0].phoneNumber,
207
+ twiml,
208
+ });
209
+
210
+ return c.json({ success: true, callSid: call.sid });
172
211
  } catch (err) {
173
212
  return c.json({ error: (err as Error).message }, 400);
174
213
  }
@@ -16,10 +16,9 @@ import { readFileSync } from "fs";
16
16
  import { access } from "fs/promises";
17
17
  import { join } from "path";
18
18
  import { homedir } from "os";
19
- import { WebSocket as WsWebSocket, WebSocketServer } from "ws";
19
+ import { attachMediaProxy } from "./ws-proxy.js";
20
20
 
21
- import type { IncomingMessage } from "http";
22
- import type { Duplex } from "stream";
21
+ import type http from "http";
23
22
 
24
23
  import { claudeMdRoutes } from "./routes/claude-md.js";
25
24
  import { conversationRoutes } from "./routes/conversations.js";
@@ -142,42 +141,8 @@ export async function startDashboard(): Promise<number> {
142
141
  });
143
142
  server.on("error", reject);
144
143
 
145
- // Proxy /media/:token WebSocket upgrades to the Python server
146
- const wss = new WebSocketServer({ noServer: true });
147
- server.on("upgrade", (req: IncomingMessage, socket: Duplex, head: Buffer) => {
148
- const url = req.url ?? "";
149
- const match = url.match(/^\/media\/([a-f0-9-]+)(?:\?.*)?$/);
150
- if (!match) return; // Not a Twilio media WebSocket -- let it fall through
151
-
152
- const targetWsUrl = VOICE_API_URL.replace(/^http/, "ws") + url;
153
- const upstream = new WsWebSocket(targetWsUrl);
154
-
155
- upstream.on("open", () => {
156
- wss.handleUpgrade(req, socket, head, (clientWs) => {
157
- // Bidirectional message proxy
158
- clientWs.on("message", (data) => {
159
- if (upstream.readyState === WsWebSocket.OPEN) {
160
- upstream.send(data);
161
- }
162
- });
163
- upstream.on("message", (data) => {
164
- if (clientWs.readyState === WsWebSocket.OPEN) {
165
- clientWs.send(data);
166
- }
167
- });
168
-
169
- clientWs.on("close", () => upstream.close());
170
- upstream.on("close", () => clientWs.close());
171
- clientWs.on("error", () => upstream.close());
172
- upstream.on("error", () => clientWs.close());
173
- });
174
- });
175
-
176
- upstream.on("error", (err) => {
177
- console.error(`[dashboard] Twilio WS proxy error: ${err.message}`);
178
- socket.destroy();
179
- });
180
- });
144
+ // Proxy /media/:token WebSocket upgrades to the Python voice server
145
+ attachMediaProxy(server as unknown as http.Server, VOICE_API_URL);
181
146
  });
182
147
 
183
148
  setDashboardPort(port);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "voicecc",
3
- "version": "1.2.6",
3
+ "version": "1.2.8",
4
4
  "description": "Voice Agent Platform running on Claude Code -- create and deploy conversational voice agents with ElevenLabs STT/TTS and VAD",
5
5
  "repository": {
6
6
  "type": "git",
@@ -108,12 +108,8 @@ class ClaudeLLMService(LLMService):
108
108
  self._settings.user_turn_completion_config = None
109
109
 
110
110
  async def start(self, frame: StartFrame):
111
- """Handle pipeline start. Sends initial_prompt if configured."""
111
+ """Handle pipeline start."""
112
112
  await super().start(frame)
113
- if self._config.initial_prompt and not self._initial_prompt_sent:
114
- self._initial_prompt_sent = True
115
- await self._ensure_client()
116
- await self._send_to_claude(self._config.initial_prompt)
117
113
 
118
114
  async def stop(self, frame: EndFrame):
119
115
  """Handle pipeline stop. Disconnects the Claude session."""
@@ -237,7 +233,7 @@ class ClaudeLLMService(LLMService):
237
233
  allowed_tools=self._config.allowed_tools or [],
238
234
  permission_mode="bypassPermissions",
239
235
  include_partial_messages=True,
240
- max_thinking_tokens=10000,
236
+ max_thinking_tokens=0,
241
237
  )
242
238
  self._client = ClaudeSDKClient(options=options)
243
239
 
@@ -314,7 +314,7 @@ async def _run_heartbeat_session(
314
314
  allowed_tools=[],
315
315
  permission_mode="bypassPermissions",
316
316
  include_partial_messages=True,
317
- max_thinking_tokens=10000,
317
+ max_thinking_tokens=0,
318
318
  )
319
319
  client = ClaudeSDKClient(options=options)
320
320
  await client.connect()
@@ -0,0 +1,150 @@
1
+ """Tests for agent-speaks-first behavior.
2
+
3
+ Verifies that when a call starts with an initial_prompt configured,
4
+ the agent produces a greeting (text output wrapped in response frames)
5
+ without any user input.
6
+
7
+ Run: cd voice-server && .venv/bin/python -m pytest initial-prompt.test.py -v
8
+ """
9
+
10
+ import asyncio
11
+ from unittest.mock import AsyncMock
12
+
13
+ import pytest
14
+
15
+ from claude_agent_sdk import AssistantMessage, ResultMessage, TextBlock
16
+ from pipecat.frames.frames import (
17
+ LLMFullResponseEndFrame,
18
+ LLMFullResponseStartFrame,
19
+ LLMTextFrame,
20
+ )
21
+
22
+ from claude_llm_service import ClaudeLLMService, ClaudeLLMServiceConfig
23
+
24
+
25
+ # ============================================================================
26
+ # HELPERS
27
+ # ============================================================================
28
+
29
+ def _make_fake_client(response_text: str = "Hello! How can I help?"):
30
+ """Create a mock ClaudeSDKClient that returns a canned text response."""
31
+ client = AsyncMock()
32
+ client.connect = AsyncMock()
33
+ client.disconnect = AsyncMock()
34
+ client.query = AsyncMock()
35
+
36
+ async def fake_receive():
37
+ yield AssistantMessage(
38
+ content=[TextBlock(text=response_text)],
39
+ model="test",
40
+ )
41
+ yield ResultMessage(
42
+ subtype="success",
43
+ is_error=False,
44
+ duration_ms=0,
45
+ duration_api_ms=0,
46
+ num_turns=1,
47
+ session_id="test",
48
+ )
49
+
50
+ client.receive_response = fake_receive
51
+ return client
52
+
53
+
54
+ def _collect_frames(service: ClaudeLLMService) -> list:
55
+ """Patch push_frame on a service to collect all output frames."""
56
+ frames = []
57
+
58
+ async def capture(frame, *args, **kwargs):
59
+ frames.append(frame)
60
+
61
+ service.push_frame = capture
62
+ return frames
63
+
64
+
65
+ async def _trigger_initial_prompt(service: ClaudeLLMService, prompt: str):
66
+ """Reproduce what the pipeline's on_pipeline_started handler does."""
67
+ await service._ensure_client()
68
+ await service.push_frame(LLMFullResponseStartFrame())
69
+ await service._send_to_claude(prompt)
70
+ await service.push_frame(LLMFullResponseEndFrame())
71
+
72
+
73
+ # ============================================================================
74
+ # TESTS
75
+ # ============================================================================
76
+
77
+ @pytest.mark.asyncio
78
+ async def test_agent_greets_user_on_call_start():
79
+ """When a call starts with an initial_prompt, the agent should produce
80
+ a spoken greeting — text frames wrapped in response start/end frames —
81
+ without any user input."""
82
+ client = _make_fake_client("Hey there! Welcome to the call.")
83
+ config = ClaudeLLMServiceConfig(
84
+ cwd="/tmp",
85
+ system_prompt="You are a test agent.",
86
+ initial_prompt="Greet the user briefly.",
87
+ existing_client=client,
88
+ )
89
+ service = ClaudeLLMService(config=config)
90
+ frames = _collect_frames(service)
91
+
92
+ await _trigger_initial_prompt(service, config.initial_prompt)
93
+
94
+ # The agent should have produced spoken output
95
+ text_frames = [f for f in frames if isinstance(f, LLMTextFrame)]
96
+ assert len(text_frames) >= 1, "Agent did not produce any spoken output"
97
+ full_text = " ".join(f.text for f in text_frames)
98
+ assert len(full_text) > 0, "Agent greeting was empty"
99
+
100
+ # The prompt should have been sent to Claude
101
+ client.query.assert_awaited_once_with("Greet the user briefly.")
102
+
103
+
104
+ @pytest.mark.asyncio
105
+ async def test_greeting_is_wrapped_for_tts():
106
+ """The greeting must be wrapped in response start/end frames so TTS
107
+ treats it as a single utterance (no gaps, no dropped last sentence)."""
108
+ config = ClaudeLLMServiceConfig(
109
+ cwd="/tmp",
110
+ system_prompt="You are a test agent.",
111
+ initial_prompt="Say hello.",
112
+ existing_client=_make_fake_client("Hi! Nice to meet you."),
113
+ )
114
+ service = ClaudeLLMService(config=config)
115
+ frames = _collect_frames(service)
116
+
117
+ await _trigger_initial_prompt(service, config.initial_prompt)
118
+
119
+ frame_types = [type(f) for f in frames]
120
+
121
+ # Must have: start, then text(s), then end
122
+ assert LLMFullResponseStartFrame in frame_types, "Missing response start"
123
+ assert LLMFullResponseEndFrame in frame_types, "Missing response end"
124
+
125
+ start_idx = frame_types.index(LLMFullResponseStartFrame)
126
+ end_idx = frame_types.index(LLMFullResponseEndFrame)
127
+ text_indices = [i for i, t in enumerate(frame_types) if t == LLMTextFrame]
128
+
129
+ assert text_indices, "No text frames between start and end"
130
+ assert all(start_idx < i < end_idx for i in text_indices), (
131
+ "Text frames must appear between start and end for TTS to work correctly"
132
+ )
133
+
134
+
135
+ @pytest.mark.asyncio
136
+ async def test_no_greeting_without_initial_prompt():
137
+ """Without an initial_prompt, the agent should stay silent on call start."""
138
+ config = ClaudeLLMServiceConfig(
139
+ cwd="/tmp",
140
+ system_prompt="You are a test agent.",
141
+ initial_prompt=None,
142
+ existing_client=_make_fake_client(),
143
+ )
144
+ service = ClaudeLLMService(config=config)
145
+ frames = _collect_frames(service)
146
+
147
+ # No trigger — the pipeline would not call _trigger_initial_prompt
148
+ # because initial_prompt is None. Verify that's the guard.
149
+ assert config.initial_prompt is None
150
+ assert len(frames) == 0, "Agent should stay silent without initial_prompt"
@@ -21,10 +21,16 @@ import os
21
21
  import aiohttp
22
22
  from fastapi import WebSocket
23
23
 
24
+ from pipecat.frames.frames import LLMFullResponseEndFrame, LLMFullResponseStartFrame
24
25
  from pipecat.pipeline.pipeline import Pipeline
25
26
  from pipecat.pipeline.runner import PipelineRunner
26
27
  from pipecat.pipeline.task import PipelineParams, PipelineTask
27
- from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
28
+ from pipecat.audio.vad.silero import SileroVADAnalyzer
29
+ from pipecat.processors.aggregators.llm_context import LLMContext
30
+ from pipecat.processors.aggregators.llm_response_universal import (
31
+ LLMContextAggregatorPair,
32
+ LLMUserAggregatorParams,
33
+ )
28
34
  from pipecat.serializers.twilio import TwilioFrameSerializer
29
35
  from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
30
36
  from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
@@ -72,7 +78,19 @@ async def handle_twilio_websocket(websocket: WebSocket, call_token: str) -> None
72
78
  try:
73
79
  # Read messages until we get the "start" event
74
80
  while True:
75
- raw = await websocket.receive_text()
81
+ message = await websocket.receive()
82
+
83
+ if message.get("type") == "websocket.disconnect":
84
+ logger.warning("[twilio] WebSocket disconnected before start event")
85
+ return
86
+
87
+ # Twilio may send frames as text or binary
88
+ raw = message.get("text") or (
89
+ message.get("bytes", b"").decode("utf-8") if message.get("bytes") else None
90
+ )
91
+ if not raw:
92
+ continue
93
+
76
94
  msg = json.loads(raw)
77
95
 
78
96
  if msg.get("event") == "start":
@@ -173,7 +191,12 @@ async def _run_twilio_pipeline(
173
191
  llm_config: Claude LLM service configuration
174
192
  voice_id: ElevenLabs voice ID
175
193
  """
176
- serializer = TwilioFrameSerializer(stream_sid=stream_sid, call_sid=call_sid)
194
+ serializer = TwilioFrameSerializer(
195
+ stream_sid=stream_sid,
196
+ call_sid=call_sid,
197
+ account_sid=config.twilio_account_sid,
198
+ auth_token=config.twilio_auth_token,
199
+ )
177
200
 
178
201
  transport = FastAPIWebsocketTransport(
179
202
  websocket=websocket,
@@ -211,8 +234,13 @@ async def _run_twilio_pipeline(
211
234
  narration = NarrationProcessor()
212
235
 
213
236
  # Context aggregator
214
- context = OpenAILLMContext(messages=[], tools=[])
215
- context_aggregator = claude_llm.create_context_aggregator(context)
237
+ context = LLMContext()
238
+ context_aggregator = LLMContextAggregatorPair(
239
+ context,
240
+ user_params=LLMUserAggregatorParams(
241
+ vad_analyzer=SileroVADAnalyzer(),
242
+ ),
243
+ )
216
244
 
217
245
  # Pipeline
218
246
  pipeline = Pipeline(
@@ -233,5 +261,15 @@ async def _run_twilio_pipeline(
233
261
  params=PipelineParams(allow_interruptions=True),
234
262
  )
235
263
 
264
+ # Send initial prompt once the pipeline is fully ready
265
+ @task.event_handler("on_pipeline_started")
266
+ async def on_pipeline_started(task_ref, *args):
267
+ if llm_config.initial_prompt and not claude_llm._initial_prompt_sent:
268
+ claude_llm._initial_prompt_sent = True
269
+ await claude_llm._ensure_client()
270
+ await claude_llm.push_frame(LLMFullResponseStartFrame())
271
+ await claude_llm._send_to_claude(llm_config.initial_prompt)
272
+ await claude_llm.push_frame(LLMFullResponseEndFrame())
273
+
236
274
  runner = PipelineRunner()
237
275
  await runner.run(task)
@@ -19,7 +19,11 @@ Responsibilities:
19
19
  import aiohttp
20
20
  import logging
21
21
 
22
- from pipecat.frames.frames import LLMMessagesFrame
22
+ from pipecat.frames.frames import (
23
+ LLMFullResponseEndFrame,
24
+ LLMFullResponseStartFrame,
25
+ LLMMessagesFrame,
26
+ )
23
27
  from pipecat.pipeline.pipeline import Pipeline
24
28
  from pipecat.pipeline.runner import PipelineRunner
25
29
  from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -97,6 +101,7 @@ async def bot(runner_args: SmallWebRTCRunnerArguments):
97
101
  claude_config = ClaudeLLMServiceConfig(
98
102
  cwd=config.default_cwd,
99
103
  system_prompt=system_prompt,
104
+ initial_prompt="The user just joined the call. Greet them briefly.",
100
105
  )
101
106
  claude_llm = ClaudeLLMService(config=claude_config)
102
107
 
@@ -135,6 +140,16 @@ async def bot(runner_args: SmallWebRTCRunnerArguments):
135
140
  params=PipelineParams(allow_interruptions=True),
136
141
  )
137
142
 
143
+ # Send initial prompt once the pipeline is fully ready
144
+ @task.event_handler("on_pipeline_started")
145
+ async def on_pipeline_started(task_ref, *args):
146
+ if claude_config.initial_prompt and not claude_llm._initial_prompt_sent:
147
+ claude_llm._initial_prompt_sent = True
148
+ await claude_llm._ensure_client()
149
+ await claude_llm.push_frame(LLMFullResponseStartFrame())
150
+ await claude_llm._send_to_claude(claude_config.initial_prompt)
151
+ await claude_llm.push_frame(LLMFullResponseEndFrame())
152
+
138
153
  runner = PipelineRunner(handle_sigint=False)
139
154
  await runner.run(task)
140
155