voicecc 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,237 @@
1
+ """
2
+ Twilio voice pipeline using FastAPIWebsocketTransport with TwilioFrameSerializer.
3
+
4
+ Handles inbound and outbound Twilio phone calls by wiring Pipecat components
5
+ for mulaw audio over WebSocket. Supports heartbeat session handoff where a
6
+ pre-existing Claude session is passed through to preserve context.
7
+
8
+ Responsibilities:
9
+ - Create a Pipecat pipeline with TwilioFrameSerializer for mulaw 8kHz audio
10
+ - Handle FastAPI WebSocket connections from Twilio media streams
11
+ - Extract Twilio metadata (stream_sid, call_sid) from the WebSocket "start" event
12
+ - Look up pending calls to retrieve pre-existing ClaudeSDKClient sessions
13
+ - Wire STT -> LLM -> TTS pipeline identical to browser pipeline
14
+ """
15
+
16
+ import asyncio
17
+ import json
18
+ import logging
19
+ import os
20
+
21
+ import aiohttp
22
+ from fastapi import WebSocket
23
+
24
+ from pipecat.pipeline.pipeline import Pipeline
25
+ from pipecat.pipeline.runner import PipelineRunner
26
+ from pipecat.pipeline.task import PipelineParams, PipelineTask
27
+ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
28
+ from pipecat.serializers.twilio import TwilioFrameSerializer
29
+ from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
30
+ from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
31
+ from pipecat.transports.websocket.fastapi import (
32
+ FastAPIWebsocketParams,
33
+ FastAPIWebsocketTransport,
34
+ )
35
+
36
+ from claude_llm_service import ClaudeLLMService, ClaudeLLMServiceConfig
37
+ from config import (
38
+ DEFAULT_AGENTS_DIR,
39
+ build_system_prompt,
40
+ get_agent_voice_id,
41
+ load_config,
42
+ )
43
+ from heartbeat import get_pending_client
44
+ from narration_processor import NarrationProcessor
45
+ from stop_phrase_processor import StopPhraseProcessor
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+
50
+ # ============================================================================
51
+ # MAIN HANDLERS
52
+ # ============================================================================
53
+
54
async def handle_twilio_websocket(websocket: WebSocket, call_token: str) -> None:
    """Handle a Twilio media stream WebSocket connection.

    Accepts the WebSocket, waits for Twilio's "start" event to extract stream
    metadata, looks up any pending call registered under this token (heartbeat
    handoff or API-initiated calls), then builds the LLM config and runs the
    voice pipeline until the call ends.

    Args:
        websocket: FastAPI WebSocket connection from Twilio
        call_token: Per-call UUID token from the URL path
    """
    await websocket.accept()

    config = load_config()

    # Twilio sends a "connected" event first, then "start" carrying the
    # stream metadata we need for the frame serializer.
    stream_sid = None
    call_sid = None

    try:
        while True:
            msg = json.loads(await websocket.receive_text())
            event = msg.get("event")

            if event == "start":
                start_data = msg.get("start", {})
                stream_sid = start_data.get("streamSid")
                call_sid = start_data.get("callSid")
                logger.info(
                    f"[twilio] Stream started -- callSid: {call_sid}, "
                    f"streamSid: {stream_sid}"
                )
                break

            if event == "connected":
                # Initial handshake event -- keep waiting for "start".
                continue

            logger.warning(f"[twilio] Unexpected event before start: {event}")

    except Exception as e:
        logger.error(f"[twilio] Error waiting for start event: {e}")
        await websocket.close()
        return

    if not stream_sid:
        logger.error("[twilio] No stream_sid in start event")
        await websocket.close()
        return

    # A pending call (if any) supplies the agent and possibly a pre-existing
    # Claude session whose context should be preserved across the handoff.
    agent_id = None
    existing_client = None
    initial_prompt = None

    pending = get_pending_client(call_token)
    if pending:
        agent_id = pending.agent_id
        existing_client = pending.client  # May be None for API calls
        initial_prompt = pending.initial_prompt
        logger.info(
            f'[twilio] Using pending call for agent "{agent_id}", '
            f'has_client={existing_client is not None}'
        )

    # Assemble the Claude LLM configuration for this call.
    system_prompt = build_system_prompt(agent_id, "voice")
    if agent_id:
        cwd = os.path.join(DEFAULT_AGENTS_DIR, agent_id)
    else:
        cwd = config.default_cwd
    voice_id = get_agent_voice_id(agent_id)

    llm_config = ClaudeLLMServiceConfig(
        cwd=cwd,
        system_prompt=system_prompt,
        existing_client=existing_client,
        initial_prompt=initial_prompt,
    )

    # Run the pipeline; always attempt to close the socket afterwards.
    try:
        await _run_twilio_pipeline(
            websocket=websocket,
            stream_sid=stream_sid,
            call_sid=call_sid or "",
            config=config,
            llm_config=llm_config,
            voice_id=voice_id,
        )
    except Exception as e:
        logger.error(f"[twilio] Pipeline error: {e}")
    finally:
        try:
            await websocket.close()
        except Exception:
            pass
149
+
150
+
151
+ # ============================================================================
152
+ # HELPER FUNCTIONS
153
+ # ============================================================================
154
+
155
async def _run_twilio_pipeline(
    websocket: WebSocket,
    stream_sid: str,
    call_sid: str,
    config,
    llm_config: ClaudeLLMServiceConfig,
    voice_id: str,
) -> None:
    """Create and run the Twilio voice pipeline.

    Assembles: transport.input -> STT -> stop_phrase -> user_aggregator
    -> claude_llm -> narration -> TTS -> transport.output

    Args:
        websocket: Active FastAPI WebSocket connection
        stream_sid: Twilio stream identifier
        call_sid: Twilio call SID
        config: Voice server configuration
        llm_config: Claude LLM service configuration
        voice_id: ElevenLabs voice ID
    """
    serializer = TwilioFrameSerializer(stream_sid=stream_sid, call_sid=call_sid)

    # Twilio media streams carry mulaw audio at 8 kHz in both directions.
    transport = FastAPIWebsocketTransport(
        websocket=websocket,
        params=FastAPIWebsocketParams(
            audio_in_enabled=True,
            audio_out_enabled=True,
            audio_in_sample_rate=8000,
            audio_out_sample_rate=8000,
            vad_enabled=True,
            vad_audio_passthrough=True,
            serializer=serializer,
        ),
    )

    # The HTTP session lives for the duration of the call; STT streams
    # through it.
    async with aiohttp.ClientSession() as http:
        stt = ElevenLabsSTTService(
            api_key=config.elevenlabs_api_key,
            aiohttp_session=http,
            model=config.elevenlabs_stt_model,
        )

        tts = ElevenLabsTTSService(
            api_key=config.elevenlabs_api_key,
            voice_id=voice_id,
            model=config.elevenlabs_tts_model,
        )

        claude_llm = ClaudeLLMService(config=llm_config)

        # Aggregator pair delivers finalized user speech to the LLM service;
        # the context itself starts empty.
        context = OpenAILLMContext(messages=[], tools=[])
        context_aggregator = claude_llm.create_context_aggregator(context)

        stages = [
            transport.input(),
            stt,
            StopPhraseProcessor(),
            context_aggregator.user(),
            claude_llm,
            NarrationProcessor(),
            tts,
            transport.output(),
        ]

        task = PipelineTask(
            Pipeline(stages),
            params=PipelineParams(allow_interruptions=True),
        )

        await PipelineRunner().run(task)
@@ -0,0 +1,147 @@
1
+ """
2
+ Browser voice pipeline entry point for Pipecat runner.
3
+
4
+ Assembles the voice pipeline: WebRTC transport -> ElevenLabs STT -> stop phrase
5
+ detection -> user context aggregation -> Claude LLM -> narration -> ElevenLabs TTS
6
+ -> WebRTC output.
7
+
8
+ Can be run standalone via `python voice_pipeline.py` or imported from server.py
9
+ which starts it alongside the FastAPI server.
10
+
11
+ Responsibilities:
12
+ - Create SmallWebRTCTransport with audio I/O
13
+ - Wire STT -> LLM -> TTS pipeline with narration and stop phrase processors
14
+ - Load config and build system prompt
15
+ - Serve as the entry point for `pipecat.runner.run.main()`
16
+ - Expose `main` for import by server.py
17
+ """
18
+
19
+ import aiohttp
20
+ import logging
21
+
22
+ from pipecat.frames.frames import LLMMessagesFrame
23
+ from pipecat.pipeline.pipeline import Pipeline
24
+ from pipecat.pipeline.runner import PipelineRunner
25
+ from pipecat.pipeline.task import PipelineParams, PipelineTask
26
+ from pipecat.audio.vad.silero import SileroVADAnalyzer
27
+ from pipecat.processors.aggregators.llm_context import LLMContext
28
+ from pipecat.processors.aggregators.llm_response_universal import (
29
+ LLMContextAggregatorPair,
30
+ LLMUserAggregatorParams,
31
+ )
32
+ from pipecat.runner.types import SmallWebRTCRunnerArguments
33
+ from pipecat.runner.run import main
34
+ from pipecat.services.elevenlabs.stt import ElevenLabsSTTService, ElevenLabsSTTSettings
35
+ from pipecat.services.elevenlabs.tts import ElevenLabsTTSService, ElevenLabsTTSSettings
36
+ from pipecat.transports.base_transport import TransportParams
37
+ from pipecat.transports.smallwebrtc.transport import SmallWebRTCTransport
38
+
39
+ from claude_llm_service import ClaudeLLMService, ClaudeLLMServiceConfig
40
+ from config import build_system_prompt, get_agent_voice_id, load_config
41
+ from narration_processor import NarrationProcessor
42
+ from stop_phrase_processor import StopPhraseProcessor
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+
47
+ # ============================================================================
48
+ # MAIN HANDLERS
49
+ # ============================================================================
50
+
51
async def bot(runner_args: SmallWebRTCRunnerArguments):
    """Entry point for the Pipecat runner.

    Builds the full browser voice pipeline and runs it until the client
    disconnects. Called automatically by Pipecat's runner when a WebRTC
    client connects.

    Args:
        runner_args: Runner arguments containing the WebRTC connection
    """
    config = load_config()

    # TODO: Accept agent_id from WebRTC signaling query params
    agent_id = None

    system_prompt = build_system_prompt(agent_id, "voice")
    voice_id = get_agent_voice_id(agent_id)

    # WebRTC transport: 16 kHz microphone input, 24 kHz playback.
    transport = SmallWebRTCTransport(
        webrtc_connection=runner_args.webrtc_connection,
        params=TransportParams(
            audio_in_enabled=True,
            audio_out_enabled=True,
            audio_in_sample_rate=16000,
            audio_out_sample_rate=24000,
        ),
    )

    # HTTP session scoped to the call; STT streams through it.
    async with aiohttp.ClientSession() as http:
        stt = ElevenLabsSTTService(
            api_key=config.elevenlabs_api_key,
            aiohttp_session=http,
            settings=ElevenLabsSTTSettings(model=config.elevenlabs_stt_model),
        )

        tts = ElevenLabsTTSService(
            api_key=config.elevenlabs_api_key,
            settings=ElevenLabsTTSSettings(
                voice=voice_id,
                model=config.elevenlabs_tts_model,
            ),
        )

        claude_llm = ClaudeLLMService(
            config=ClaudeLLMServiceConfig(
                cwd=config.default_cwd,
                system_prompt=system_prompt,
            )
        )

        # Context aggregator -- Pipecat needs this to collect user speech
        # into LLM context frames. The Claude SDK maintains its own history,
        # so the aggregators only deliver user text to process_frame.
        context_aggregator = LLMContextAggregatorPair(
            LLMContext(),
            user_params=LLMUserAggregatorParams(
                vad_analyzer=SileroVADAnalyzer(),
            ),
        )

        # transport.input -> STT -> stop_phrase -> user_aggregator -> LLM
        # -> narration -> TTS -> transport.output
        stages = [
            transport.input(),
            stt,
            StopPhraseProcessor(),
            context_aggregator.user(),
            claude_llm,
            NarrationProcessor(),
            tts,
            transport.output(),
        ]

        task = PipelineTask(
            Pipeline(stages),
            params=PipelineParams(allow_interruptions=True),
        )

        # handle_sigint=False: don't install a SIGINT handler here -- the
        # hosting process (server.py) manages shutdown when this is imported.
        await PipelineRunner(handle_sigint=False).run(task)
140
+
141
+
142
+ # ============================================================================
143
+ # ENTRY POINT
144
+ # ============================================================================
145
+
146
+ if __name__ == "__main__":
147
+ main()