voicecc 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/voicecc.js +92 -68
- package/package.json +2 -1
- package/voice-server/.python-version +1 -0
- package/voice-server/claude_llm_service.py +333 -0
- package/voice-server/claude_session.py +312 -0
- package/voice-server/config.py +340 -0
- package/voice-server/dev-server-start.sh +128 -0
- package/voice-server/heartbeat.py +505 -0
- package/voice-server/narration_processor.py +140 -0
- package/voice-server/requirements.txt +8 -0
- package/voice-server/server.py +335 -0
- package/voice-server/stop_phrase_processor.py +50 -0
- package/voice-server/twilio_pipeline.py +237 -0
- package/voice-server/voice_pipeline.py +147 -0
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Twilio voice pipeline using FastAPIWebsocketTransport with TwilioFrameSerializer.
|
|
3
|
+
|
|
4
|
+
Handles inbound and outbound Twilio phone calls by wiring Pipecat components
|
|
5
|
+
for mulaw audio over WebSocket. Supports heartbeat session handoff where a
|
|
6
|
+
pre-existing Claude session is passed through to preserve context.
|
|
7
|
+
|
|
8
|
+
Responsibilities:
|
|
9
|
+
- Create a Pipecat pipeline with TwilioFrameSerializer for mulaw 8kHz audio
|
|
10
|
+
- Handle FastAPI WebSocket connections from Twilio media streams
|
|
11
|
+
- Extract Twilio metadata (stream_sid, call_sid) from the WebSocket "start" event
|
|
12
|
+
- Look up pending calls to retrieve pre-existing ClaudeSDKClient sessions
|
|
13
|
+
- Wire STT -> LLM -> TTS pipeline identical to browser pipeline
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import asyncio
|
|
17
|
+
import json
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
|
|
21
|
+
import aiohttp
|
|
22
|
+
from fastapi import WebSocket
|
|
23
|
+
|
|
24
|
+
from pipecat.pipeline.pipeline import Pipeline
|
|
25
|
+
from pipecat.pipeline.runner import PipelineRunner
|
|
26
|
+
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
|
27
|
+
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
|
28
|
+
from pipecat.serializers.twilio import TwilioFrameSerializer
|
|
29
|
+
from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
|
|
30
|
+
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
|
|
31
|
+
from pipecat.transports.websocket.fastapi import (
|
|
32
|
+
FastAPIWebsocketParams,
|
|
33
|
+
FastAPIWebsocketTransport,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
from claude_llm_service import ClaudeLLMService, ClaudeLLMServiceConfig
|
|
37
|
+
from config import (
|
|
38
|
+
DEFAULT_AGENTS_DIR,
|
|
39
|
+
build_system_prompt,
|
|
40
|
+
get_agent_voice_id,
|
|
41
|
+
load_config,
|
|
42
|
+
)
|
|
43
|
+
from heartbeat import get_pending_client
|
|
44
|
+
from narration_processor import NarrationProcessor
|
|
45
|
+
from stop_phrase_processor import StopPhraseProcessor
|
|
46
|
+
|
|
47
|
+
logger = logging.getLogger(__name__)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ============================================================================
|
|
51
|
+
# MAIN HANDLERS
|
|
52
|
+
# ============================================================================
|
|
53
|
+
|
|
54
|
+
async def handle_twilio_websocket(websocket: WebSocket, call_token: str) -> None:
    """Handle a Twilio media stream WebSocket connection.

    Accepts the WebSocket, waits for the Twilio "start" event to extract
    stream metadata, looks up any pending call config (heartbeat handoff or
    API-initiated calls), then creates and runs the voice pipeline.

    Args:
        websocket: FastAPI WebSocket connection from Twilio
        call_token: Per-call UUID token from the URL path
    """
    await websocket.accept()

    config = load_config()

    # Wait for the Twilio "start" event to get stream metadata.
    stream_sid = None
    call_sid = None

    try:
        # Read messages until we get the "start" event.
        while True:
            raw = await websocket.receive_text()
            msg = json.loads(raw)

            if msg.get("event") == "start":
                start_data = msg.get("start", {})
                stream_sid = start_data.get("streamSid")
                call_sid = start_data.get("callSid")
                # Lazy %-style args: no string formatting when INFO is disabled.
                logger.info(
                    "[twilio] Stream started -- callSid: %s, streamSid: %s",
                    call_sid,
                    stream_sid,
                )
                break

            if msg.get("event") == "connected":
                # Initial connected event -- keep waiting for start.
                continue

            # Unexpected event before start -- keep reading rather than abort.
            logger.warning(
                "[twilio] Unexpected event before start: %s", msg.get("event")
            )

    except Exception:
        # receive_text raises on client disconnect; logger.exception keeps
        # the traceback (logger.error(f"...{e}") would discard it).
        logger.exception("[twilio] Error waiting for start event")
        await websocket.close()
        return

    if not stream_sid:
        logger.error("[twilio] No stream_sid in start event")
        await websocket.close()
        return

    # Look up pending call for heartbeat handoff or API-initiated calls
    pending = get_pending_client(call_token)
    agent_id = None
    existing_client = None
    initial_prompt = None

    if pending:
        agent_id = pending.agent_id
        existing_client = pending.client  # May be None for API calls
        initial_prompt = pending.initial_prompt
        logger.info(
            '[twilio] Using pending call for agent "%s", has_client=%s',
            agent_id,
            existing_client is not None,
        )

    # Build LLM config
    system_prompt = build_system_prompt(agent_id, "voice")
    cwd = os.path.join(DEFAULT_AGENTS_DIR, agent_id) if agent_id else config.default_cwd
    voice_id = get_agent_voice_id(agent_id)

    llm_config = ClaudeLLMServiceConfig(
        cwd=cwd,
        system_prompt=system_prompt,
        existing_client=existing_client,
        initial_prompt=initial_prompt,
    )

    # Create and run the pipeline
    try:
        await _run_twilio_pipeline(
            websocket=websocket,
            stream_sid=stream_sid,
            call_sid=call_sid or "",
            config=config,
            llm_config=llm_config,
            voice_id=voice_id,
        )
    except Exception:
        # Log with traceback; a pipeline failure must not crash the server.
        logger.exception("[twilio] Pipeline error")
    finally:
        # The transport may already have closed the socket; double-close is benign.
        try:
            await websocket.close()
        except Exception:
            pass
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# ============================================================================
|
|
152
|
+
# HELPER FUNCTIONS
|
|
153
|
+
# ============================================================================
|
|
154
|
+
|
|
155
|
+
async def _run_twilio_pipeline(
    websocket: WebSocket,
    stream_sid: str,
    call_sid: str,
    config,
    llm_config: ClaudeLLMServiceConfig,
    voice_id: str,
) -> None:
    """Create and run the Twilio voice pipeline.

    Assembles: transport.input -> STT -> stop_phrase -> user_aggregator
    -> claude_llm -> narration -> TTS -> transport.output

    Args:
        websocket: Active FastAPI WebSocket connection
        stream_sid: Twilio stream identifier
        call_sid: Twilio call SID
        config: Voice server configuration
        llm_config: Claude LLM service configuration
        voice_id: ElevenLabs voice ID
    """
    # Twilio media streams carry 8 kHz audio over the socket; the
    # TwilioFrameSerializer handles the framing for this stream/call.
    transport_params = FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        audio_in_sample_rate=8000,
        audio_out_sample_rate=8000,
        vad_enabled=True,
        vad_audio_passthrough=True,
        serializer=TwilioFrameSerializer(stream_sid=stream_sid, call_sid=call_sid),
    )
    transport = FastAPIWebsocketTransport(websocket=websocket, params=transport_params)

    async with aiohttp.ClientSession() as http_session:
        # Speech-to-text and text-to-speech share one HTTP session.
        stt = ElevenLabsSTTService(
            api_key=config.elevenlabs_api_key,
            aiohttp_session=http_session,
            model=config.elevenlabs_stt_model,
        )
        tts = ElevenLabsTTSService(
            api_key=config.elevenlabs_api_key,
            voice_id=voice_id,
            model=config.elevenlabs_tts_model,
        )

        # Claude LLM plus the custom frame processors.
        claude_llm = ClaudeLLMService(config=llm_config)
        stop_phrase = StopPhraseProcessor()
        narration = NarrationProcessor()

        # Context aggregator delivers collected user speech to the LLM.
        context = OpenAILLMContext(messages=[], tools=[])
        context_aggregator = claude_llm.create_context_aggregator(context)

        # Wire the stages in processing order.
        stages = [
            transport.input(),
            stt,
            stop_phrase,
            context_aggregator.user(),
            claude_llm,
            narration,
            tts,
            transport.output(),
        ]

        task = PipelineTask(
            Pipeline(stages),
            params=PipelineParams(allow_interruptions=True),
        )

        await PipelineRunner().run(task)
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Browser voice pipeline entry point for Pipecat runner.
|
|
3
|
+
|
|
4
|
+
Assembles the voice pipeline: WebRTC transport -> ElevenLabs STT -> stop phrase
|
|
5
|
+
detection -> user context aggregation -> Claude LLM -> narration -> ElevenLabs TTS
|
|
6
|
+
-> WebRTC output.
|
|
7
|
+
|
|
8
|
+
Can be run standalone via `python voice_pipeline.py` or imported from server.py
|
|
9
|
+
which starts it alongside the FastAPI server.
|
|
10
|
+
|
|
11
|
+
Responsibilities:
|
|
12
|
+
- Create SmallWebRTCTransport with audio I/O
|
|
13
|
+
- Wire STT -> LLM -> TTS pipeline with narration and stop phrase processors
|
|
14
|
+
- Load config and build system prompt
|
|
15
|
+
- Serve as the entry point for `pipecat.runner.run.main()`
|
|
16
|
+
- Expose `main` for import by server.py
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import aiohttp
|
|
20
|
+
import logging
|
|
21
|
+
|
|
22
|
+
from pipecat.frames.frames import LLMMessagesFrame
|
|
23
|
+
from pipecat.pipeline.pipeline import Pipeline
|
|
24
|
+
from pipecat.pipeline.runner import PipelineRunner
|
|
25
|
+
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
|
26
|
+
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
|
27
|
+
from pipecat.processors.aggregators.llm_context import LLMContext
|
|
28
|
+
from pipecat.processors.aggregators.llm_response_universal import (
|
|
29
|
+
LLMContextAggregatorPair,
|
|
30
|
+
LLMUserAggregatorParams,
|
|
31
|
+
)
|
|
32
|
+
from pipecat.runner.types import SmallWebRTCRunnerArguments
|
|
33
|
+
from pipecat.runner.run import main
|
|
34
|
+
from pipecat.services.elevenlabs.stt import ElevenLabsSTTService, ElevenLabsSTTSettings
|
|
35
|
+
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService, ElevenLabsTTSSettings
|
|
36
|
+
from pipecat.transports.base_transport import TransportParams
|
|
37
|
+
from pipecat.transports.smallwebrtc.transport import SmallWebRTCTransport
|
|
38
|
+
|
|
39
|
+
from claude_llm_service import ClaudeLLMService, ClaudeLLMServiceConfig
|
|
40
|
+
from config import build_system_prompt, get_agent_voice_id, load_config
|
|
41
|
+
from narration_processor import NarrationProcessor
|
|
42
|
+
from stop_phrase_processor import StopPhraseProcessor
|
|
43
|
+
|
|
44
|
+
logger = logging.getLogger(__name__)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ============================================================================
|
|
48
|
+
# MAIN HANDLERS
|
|
49
|
+
# ============================================================================
|
|
50
|
+
|
|
51
|
+
async def bot(runner_args: SmallWebRTCRunnerArguments):
    """Entry point for the Pipecat runner.

    Builds the full browser voice pipeline and runs it until the session
    ends. Invoked automatically by Pipecat's runner when a WebRTC client
    connects.

    Args:
        runner_args: Runner arguments containing the WebRTC connection
    """
    config = load_config()

    # TODO: Accept agent_id from WebRTC signaling query params
    agent_id = None

    system_prompt = build_system_prompt(agent_id, "voice")
    voice_id = get_agent_voice_id(agent_id)

    # WebRTC transport: 16 kHz microphone input, 24 kHz synthesized output.
    transport = SmallWebRTCTransport(
        webrtc_connection=runner_args.webrtc_connection,
        params=TransportParams(
            audio_in_enabled=True,
            audio_out_enabled=True,
            audio_in_sample_rate=16000,
            audio_out_sample_rate=24000,
        ),
    )

    async with aiohttp.ClientSession() as http_session:
        # Speech-to-text
        stt = ElevenLabsSTTService(
            api_key=config.elevenlabs_api_key,
            aiohttp_session=http_session,
            settings=ElevenLabsSTTSettings(model=config.elevenlabs_stt_model),
        )

        # Text-to-speech
        tts = ElevenLabsTTSService(
            api_key=config.elevenlabs_api_key,
            settings=ElevenLabsTTSSettings(
                voice=voice_id,
                model=config.elevenlabs_tts_model,
            ),
        )

        # Claude LLM
        claude_llm = ClaudeLLMService(
            config=ClaudeLLMServiceConfig(
                cwd=config.default_cwd,
                system_prompt=system_prompt,
            )
        )

        # Custom frame processors
        stop_phrase = StopPhraseProcessor()
        narration = NarrationProcessor()

        # Context aggregator -- Pipecat needs this to collect user speech into
        # LLM context frames. The Claude SDK maintains its own history, so the
        # aggregators only need to deliver user text to process_frame.
        context_aggregator = LLMContextAggregatorPair(
            LLMContext(),
            user_params=LLMUserAggregatorParams(
                vad_analyzer=SileroVADAnalyzer(),
            ),
        )

        # transport.input -> STT -> stop_phrase -> user_aggregator
        # -> LLM -> narration -> TTS -> transport.output
        pipeline = Pipeline(
            [
                transport.input(),
                stt,
                stop_phrase,
                context_aggregator.user(),
                claude_llm,
                narration,
                tts,
                transport.output(),
            ]
        )

        task = PipelineTask(
            pipeline,
            params=PipelineParams(allow_interruptions=True),
        )

        # handle_sigint=False: the enclosing server owns signal handling.
        runner = PipelineRunner(handle_sigint=False)
        await runner.run(task)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# ============================================================================
|
|
143
|
+
# ENTRY POINT
|
|
144
|
+
# ============================================================================
|
|
145
|
+
|
|
146
|
+
# Standalone entry point: `python voice_pipeline.py` hands control to
# Pipecat's runner (`main` imported above), which manages the event loop.
if __name__ == "__main__":
    main()
|