voicecc 1.2.5 → 1.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -8
- package/bin/voicecc.js +27 -2
- package/package.json +1 -1
- package/voice-server/claude_llm_service.py +2 -6
- package/voice-server/heartbeat.py +1 -1
- package/voice-server/initial_prompt_test.py +150 -0
- package/voice-server/twilio_pipeline.py +27 -3
- package/voice-server/voice_pipeline.py +16 -1
package/README.md
CHANGED
|
@@ -9,10 +9,10 @@ A Voice Agent Platform running on Claude Code. Create, manage, and deploy conver
|
|
|
9
9
|
## Project Structure
|
|
10
10
|
|
|
11
11
|
```
|
|
12
|
-
server/
|
|
13
|
-
|
|
14
|
-
services/
|
|
15
|
-
index.ts Entry point (
|
|
12
|
+
voice-server/ Python FastAPI: real-time audio pipeline (VAD, STT, TTS, Claude sessions)
|
|
13
|
+
server/ Node.js orchestration: boots dashboard + voice server, manages integrations
|
|
14
|
+
services/ Tunnel, Twilio, browser calls, agents, device pairing
|
|
15
|
+
index.ts Entry point (spawns voice-server + dashboard, auto-starts integrations)
|
|
16
16
|
dashboard/ Web UI (Vite + React) + API routes (Hono)
|
|
17
17
|
lander/ Static landing page
|
|
18
18
|
init/ Default prompt templates for new agents
|
|
@@ -25,6 +25,7 @@ bin/ CLI entry point (voicecc command)
|
|
|
25
25
|
|
|
26
26
|
- macOS or Linux
|
|
27
27
|
- Node.js 18+
|
|
28
|
+
- Python 3.11+ with `venv`
|
|
28
29
|
- An ElevenLabs API key
|
|
29
30
|
|
|
30
31
|
### Terminal
|
|
@@ -41,11 +42,13 @@ voicecc
|
|
|
41
42
|
|
|
42
43
|
## How It Works
|
|
43
44
|
|
|
44
|
-
|
|
45
|
+
The platform runs two servers: a **Node.js orchestrator** (dashboard, integrations, CLI) and a **Python voice server** (real-time audio pipeline via Pipecat).
|
|
46
|
+
|
|
47
|
+
1. **Mic capture**: Browser captures audio via WebRTC, connected to the Python voice server
|
|
45
48
|
2. **Voice activity detection**: Silero VAD v5 detects speech segments
|
|
46
|
-
3. **Speech-to-text**: ElevenLabs Scribe
|
|
49
|
+
3. **Speech-to-text**: ElevenLabs Scribe transcribes audio
|
|
47
50
|
4. **Endpointing**: VAD silence-based turn detection
|
|
48
51
|
5. **Claude inference**: Transcript sent to Claude Agent SDK session with streaming response
|
|
49
52
|
6. **Narration**: Claude's response stripped of markdown and split into sentences
|
|
50
|
-
7. **Text-to-speech**: ElevenLabs streaming TTS
|
|
51
|
-
8. **Speaker playback**: Audio
|
|
53
|
+
7. **Text-to-speech**: ElevenLabs streaming TTS generates audio
|
|
54
|
+
8. **Speaker playback**: Audio streamed back through WebRTC
|
package/bin/voicecc.js
CHANGED
|
@@ -129,7 +129,11 @@ function ensurePython() {
|
|
|
129
129
|
|
|
130
130
|
if (process.platform !== "linux") {
|
|
131
131
|
console.error("ERROR: Python 3.12+ is required but not found.");
|
|
132
|
-
|
|
132
|
+
if (process.platform === "darwin") {
|
|
133
|
+
console.error("Install it with Homebrew: brew install python@3.12");
|
|
134
|
+
} else {
|
|
135
|
+
console.error("Install Python 3.12+ and run 'voicecc' again.");
|
|
136
|
+
}
|
|
133
137
|
process.exit(1);
|
|
134
138
|
}
|
|
135
139
|
|
|
@@ -157,7 +161,11 @@ function ensureVenvModule(systemPython) {
|
|
|
157
161
|
|
|
158
162
|
if (process.platform !== "linux") {
|
|
159
163
|
console.error("ERROR: Python venv module is missing.");
|
|
160
|
-
|
|
164
|
+
if (process.platform === "darwin") {
|
|
165
|
+
console.error("Reinstall Python with Homebrew: brew install python@3.12");
|
|
166
|
+
} else {
|
|
167
|
+
console.error("Install the venv module and run 'voicecc' again.");
|
|
168
|
+
}
|
|
161
169
|
process.exit(1);
|
|
162
170
|
}
|
|
163
171
|
|
|
@@ -195,6 +203,23 @@ function ensurePythonVenv() {
|
|
|
195
203
|
// Step 2: Ensure venv module is available
|
|
196
204
|
ensureVenvModule(systemPython);
|
|
197
205
|
|
|
206
|
+
// Step 2.5: Ensure system libraries needed by Python packages (OpenCV, audio, WebRTC)
|
|
207
|
+
if (process.platform === "linux") {
|
|
208
|
+
const requiredLibs = ["libGL.so.1", "libSM.so.6", "libsndfile.so.1"];
|
|
209
|
+
const missing = requiredLibs.some((lib) => {
|
|
210
|
+
try { execSync(`ldconfig -p | grep ${lib}`, { encoding: "utf-8" }); return false; } catch { return true; }
|
|
211
|
+
});
|
|
212
|
+
if (missing) {
|
|
213
|
+
console.log("Installing system libraries required by Python packages...");
|
|
214
|
+
try {
|
|
215
|
+
linuxInstallPackage("libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 libsndfile1 libportaudio2");
|
|
216
|
+
} catch (err) {
|
|
217
|
+
console.error(`Failed to install system libraries: ${err.message}`);
|
|
218
|
+
process.exit(1);
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
|
|
198
223
|
// Step 3: Create venv if needed
|
|
199
224
|
if (!existsSync(venvPython)) {
|
|
200
225
|
console.log("Setting up Python environment for voice server...");
|
package/voice-server/claude_llm_service.py
CHANGED
|
@@ -108,12 +108,8 @@ class ClaudeLLMService(LLMService):
|
|
|
108
108
|
self._settings.user_turn_completion_config = None
|
|
109
109
|
|
|
110
110
|
async def start(self, frame: StartFrame):
|
|
111
|
-
"""Handle pipeline start.
|
|
111
|
+
"""Handle pipeline start."""
|
|
112
112
|
await super().start(frame)
|
|
113
|
-
if self._config.initial_prompt and not self._initial_prompt_sent:
|
|
114
|
-
self._initial_prompt_sent = True
|
|
115
|
-
await self._ensure_client()
|
|
116
|
-
await self._send_to_claude(self._config.initial_prompt)
|
|
117
113
|
|
|
118
114
|
async def stop(self, frame: EndFrame):
|
|
119
115
|
"""Handle pipeline stop. Disconnects the Claude session."""
|
|
@@ -237,7 +233,7 @@ class ClaudeLLMService(LLMService):
|
|
|
237
233
|
allowed_tools=self._config.allowed_tools or [],
|
|
238
234
|
permission_mode="bypassPermissions",
|
|
239
235
|
include_partial_messages=True,
|
|
240
|
-
max_thinking_tokens=
|
|
236
|
+
max_thinking_tokens=0,
|
|
241
237
|
)
|
|
242
238
|
self._client = ClaudeSDKClient(options=options)
|
|
243
239
|
|
|
package/voice-server/heartbeat.py
CHANGED
@@ -314,7 +314,7 @@ async def _run_heartbeat_session(
|
|
|
314
314
|
allowed_tools=[],
|
|
315
315
|
permission_mode="bypassPermissions",
|
|
316
316
|
include_partial_messages=True,
|
|
317
|
-
max_thinking_tokens=
|
|
317
|
+
max_thinking_tokens=0,
|
|
318
318
|
)
|
|
319
319
|
client = ClaudeSDKClient(options=options)
|
|
320
320
|
await client.connect()
|
|
package/voice-server/initial_prompt_test.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Tests for agent-speaks-first behavior.
|
|
2
|
+
|
|
3
|
+
Verifies that when a call starts with an initial_prompt configured,
|
|
4
|
+
the agent produces a greeting (text output wrapped in response frames)
|
|
5
|
+
without any user input.
|
|
6
|
+
|
|
7
|
+
Run: cd voice-server && .venv/bin/python -m pytest initial_prompt_test.py -v
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
from unittest.mock import AsyncMock
|
|
12
|
+
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
from claude_agent_sdk import AssistantMessage, ResultMessage, TextBlock
|
|
16
|
+
from pipecat.frames.frames import (
|
|
17
|
+
LLMFullResponseEndFrame,
|
|
18
|
+
LLMFullResponseStartFrame,
|
|
19
|
+
LLMTextFrame,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
from claude_llm_service import ClaudeLLMService, ClaudeLLMServiceConfig
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ============================================================================
|
|
26
|
+
# HELPERS
|
|
27
|
+
# ============================================================================
|
|
28
|
+
|
|
29
|
+
def _make_fake_client(response_text: str = "Hello! How can I help?"):
|
|
30
|
+
"""Create a mock ClaudeSDKClient that returns a canned text response."""
|
|
31
|
+
client = AsyncMock()
|
|
32
|
+
client.connect = AsyncMock()
|
|
33
|
+
client.disconnect = AsyncMock()
|
|
34
|
+
client.query = AsyncMock()
|
|
35
|
+
|
|
36
|
+
async def fake_receive():
|
|
37
|
+
yield AssistantMessage(
|
|
38
|
+
content=[TextBlock(text=response_text)],
|
|
39
|
+
model="test",
|
|
40
|
+
)
|
|
41
|
+
yield ResultMessage(
|
|
42
|
+
subtype="success",
|
|
43
|
+
is_error=False,
|
|
44
|
+
duration_ms=0,
|
|
45
|
+
duration_api_ms=0,
|
|
46
|
+
num_turns=1,
|
|
47
|
+
session_id="test",
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
client.receive_response = fake_receive
|
|
51
|
+
return client
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _collect_frames(service: ClaudeLLMService) -> list:
|
|
55
|
+
"""Patch push_frame on a service to collect all output frames."""
|
|
56
|
+
frames = []
|
|
57
|
+
|
|
58
|
+
async def capture(frame, *args, **kwargs):
|
|
59
|
+
frames.append(frame)
|
|
60
|
+
|
|
61
|
+
service.push_frame = capture
|
|
62
|
+
return frames
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
async def _trigger_initial_prompt(service: ClaudeLLMService, prompt: str):
|
|
66
|
+
"""Reproduce what the pipeline's on_pipeline_started handler does."""
|
|
67
|
+
await service._ensure_client()
|
|
68
|
+
await service.push_frame(LLMFullResponseStartFrame())
|
|
69
|
+
await service._send_to_claude(prompt)
|
|
70
|
+
await service.push_frame(LLMFullResponseEndFrame())
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ============================================================================
|
|
74
|
+
# TESTS
|
|
75
|
+
# ============================================================================
|
|
76
|
+
|
|
77
|
+
@pytest.mark.asyncio
|
|
78
|
+
async def test_agent_greets_user_on_call_start():
|
|
79
|
+
"""When a call starts with an initial_prompt, the agent should produce
|
|
80
|
+
a spoken greeting — text frames wrapped in response start/end frames —
|
|
81
|
+
without any user input."""
|
|
82
|
+
client = _make_fake_client("Hey there! Welcome to the call.")
|
|
83
|
+
config = ClaudeLLMServiceConfig(
|
|
84
|
+
cwd="/tmp",
|
|
85
|
+
system_prompt="You are a test agent.",
|
|
86
|
+
initial_prompt="Greet the user briefly.",
|
|
87
|
+
existing_client=client,
|
|
88
|
+
)
|
|
89
|
+
service = ClaudeLLMService(config=config)
|
|
90
|
+
frames = _collect_frames(service)
|
|
91
|
+
|
|
92
|
+
await _trigger_initial_prompt(service, config.initial_prompt)
|
|
93
|
+
|
|
94
|
+
# The agent should have produced spoken output
|
|
95
|
+
text_frames = [f for f in frames if isinstance(f, LLMTextFrame)]
|
|
96
|
+
assert len(text_frames) >= 1, "Agent did not produce any spoken output"
|
|
97
|
+
full_text = " ".join(f.text for f in text_frames)
|
|
98
|
+
assert len(full_text) > 0, "Agent greeting was empty"
|
|
99
|
+
|
|
100
|
+
# The prompt should have been sent to Claude
|
|
101
|
+
client.query.assert_awaited_once_with("Greet the user briefly.")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@pytest.mark.asyncio
|
|
105
|
+
async def test_greeting_is_wrapped_for_tts():
|
|
106
|
+
"""The greeting must be wrapped in response start/end frames so TTS
|
|
107
|
+
treats it as a single utterance (no gaps, no dropped last sentence)."""
|
|
108
|
+
config = ClaudeLLMServiceConfig(
|
|
109
|
+
cwd="/tmp",
|
|
110
|
+
system_prompt="You are a test agent.",
|
|
111
|
+
initial_prompt="Say hello.",
|
|
112
|
+
existing_client=_make_fake_client("Hi! Nice to meet you."),
|
|
113
|
+
)
|
|
114
|
+
service = ClaudeLLMService(config=config)
|
|
115
|
+
frames = _collect_frames(service)
|
|
116
|
+
|
|
117
|
+
await _trigger_initial_prompt(service, config.initial_prompt)
|
|
118
|
+
|
|
119
|
+
frame_types = [type(f) for f in frames]
|
|
120
|
+
|
|
121
|
+
# Must have: start, then text(s), then end
|
|
122
|
+
assert LLMFullResponseStartFrame in frame_types, "Missing response start"
|
|
123
|
+
assert LLMFullResponseEndFrame in frame_types, "Missing response end"
|
|
124
|
+
|
|
125
|
+
start_idx = frame_types.index(LLMFullResponseStartFrame)
|
|
126
|
+
end_idx = frame_types.index(LLMFullResponseEndFrame)
|
|
127
|
+
text_indices = [i for i, t in enumerate(frame_types) if t == LLMTextFrame]
|
|
128
|
+
|
|
129
|
+
assert text_indices, "No text frames between start and end"
|
|
130
|
+
assert all(start_idx < i < end_idx for i in text_indices), (
|
|
131
|
+
"Text frames must appear between start and end for TTS to work correctly"
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@pytest.mark.asyncio
|
|
136
|
+
async def test_no_greeting_without_initial_prompt():
|
|
137
|
+
"""Without an initial_prompt, the agent should stay silent on call start."""
|
|
138
|
+
config = ClaudeLLMServiceConfig(
|
|
139
|
+
cwd="/tmp",
|
|
140
|
+
system_prompt="You are a test agent.",
|
|
141
|
+
initial_prompt=None,
|
|
142
|
+
existing_client=_make_fake_client(),
|
|
143
|
+
)
|
|
144
|
+
service = ClaudeLLMService(config=config)
|
|
145
|
+
frames = _collect_frames(service)
|
|
146
|
+
|
|
147
|
+
# No trigger — the pipeline would not call _trigger_initial_prompt
|
|
148
|
+
# because initial_prompt is None. Verify that's the guard.
|
|
149
|
+
assert config.initial_prompt is None
|
|
150
|
+
assert len(frames) == 0, "Agent should stay silent without initial_prompt"
|
|
package/voice-server/twilio_pipeline.py
CHANGED
@@ -72,8 +72,16 @@ async def handle_twilio_websocket(websocket: WebSocket, call_token: str) -> None
|
|
|
72
72
|
try:
|
|
73
73
|
# Read messages until we get the "start" event
|
|
74
74
|
while True:
|
|
75
|
-
|
|
76
|
-
|
|
75
|
+
message = await websocket.receive()
|
|
76
|
+
|
|
77
|
+
# Skip binary frames (early audio before start)
|
|
78
|
+
if message.get("type") == "websocket.disconnect":
|
|
79
|
+
logger.warning("[twilio] WebSocket disconnected before start event")
|
|
80
|
+
return
|
|
81
|
+
if "text" not in message:
|
|
82
|
+
continue
|
|
83
|
+
|
|
84
|
+
msg = json.loads(message["text"])
|
|
77
85
|
|
|
78
86
|
if msg.get("event") == "start":
|
|
79
87
|
start_data = msg.get("start", {})
|
|
@@ -173,7 +181,12 @@ async def _run_twilio_pipeline(
|
|
|
173
181
|
llm_config: Claude LLM service configuration
|
|
174
182
|
voice_id: ElevenLabs voice ID
|
|
175
183
|
"""
|
|
176
|
-
serializer = TwilioFrameSerializer(
|
|
184
|
+
serializer = TwilioFrameSerializer(
|
|
185
|
+
stream_sid=stream_sid,
|
|
186
|
+
call_sid=call_sid,
|
|
187
|
+
account_sid=config.twilio_account_sid,
|
|
188
|
+
auth_token=config.twilio_auth_token,
|
|
189
|
+
)
|
|
177
190
|
|
|
178
191
|
transport = FastAPIWebsocketTransport(
|
|
179
192
|
websocket=websocket,
|
|
@@ -233,5 +246,16 @@ async def _run_twilio_pipeline(
|
|
|
233
246
|
params=PipelineParams(allow_interruptions=True),
|
|
234
247
|
)
|
|
235
248
|
|
|
249
|
+
# For Twilio, the WebSocket is already connected, so send the
|
|
250
|
+
# initial prompt shortly after the pipeline starts.
|
|
251
|
+
async def _send_initial_prompt():
|
|
252
|
+
await asyncio.sleep(1) # Let the pipeline fully initialize
|
|
253
|
+
if llm_config.initial_prompt and not claude_llm._initial_prompt_sent:
|
|
254
|
+
claude_llm._initial_prompt_sent = True
|
|
255
|
+
await claude_llm._ensure_client()
|
|
256
|
+
await claude_llm._send_to_claude(llm_config.initial_prompt)
|
|
257
|
+
|
|
258
|
+
asyncio.create_task(_send_initial_prompt())
|
|
259
|
+
|
|
236
260
|
runner = PipelineRunner()
|
|
237
261
|
await runner.run(task)
|
|
package/voice-server/voice_pipeline.py
CHANGED
@@ -19,7 +19,11 @@ Responsibilities:
|
|
|
19
19
|
import aiohttp
|
|
20
20
|
import logging
|
|
21
21
|
|
|
22
|
-
from pipecat.frames.frames import
|
|
22
|
+
from pipecat.frames.frames import (
|
|
23
|
+
LLMFullResponseEndFrame,
|
|
24
|
+
LLMFullResponseStartFrame,
|
|
25
|
+
LLMMessagesFrame,
|
|
26
|
+
)
|
|
23
27
|
from pipecat.pipeline.pipeline import Pipeline
|
|
24
28
|
from pipecat.pipeline.runner import PipelineRunner
|
|
25
29
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
|
@@ -97,6 +101,7 @@ async def bot(runner_args: SmallWebRTCRunnerArguments):
|
|
|
97
101
|
claude_config = ClaudeLLMServiceConfig(
|
|
98
102
|
cwd=config.default_cwd,
|
|
99
103
|
system_prompt=system_prompt,
|
|
104
|
+
initial_prompt="The user just joined the call. Greet them briefly.",
|
|
100
105
|
)
|
|
101
106
|
claude_llm = ClaudeLLMService(config=claude_config)
|
|
102
107
|
|
|
@@ -135,6 +140,16 @@ async def bot(runner_args: SmallWebRTCRunnerArguments):
|
|
|
135
140
|
params=PipelineParams(allow_interruptions=True),
|
|
136
141
|
)
|
|
137
142
|
|
|
143
|
+
# Send initial prompt once the pipeline is fully ready
|
|
144
|
+
@task.event_handler("on_pipeline_started")
|
|
145
|
+
async def on_pipeline_started(task_ref, *args):
|
|
146
|
+
if claude_config.initial_prompt and not claude_llm._initial_prompt_sent:
|
|
147
|
+
claude_llm._initial_prompt_sent = True
|
|
148
|
+
await claude_llm._ensure_client()
|
|
149
|
+
await claude_llm.push_frame(LLMFullResponseStartFrame())
|
|
150
|
+
await claude_llm._send_to_claude(claude_config.initial_prompt)
|
|
151
|
+
await claude_llm.push_frame(LLMFullResponseEndFrame())
|
|
152
|
+
|
|
138
153
|
runner = PipelineRunner(handle_sigint=False)
|
|
139
154
|
await runner.run(task)
|
|
140
155
|
|