voicecc 1.2.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,335 @@
1
+ """
2
+ FastAPI server for the Python voice server.
3
+
4
+ Hosts text chat, Twilio media WebSocket, call registration, heartbeat status,
5
+ health check, and config update endpoints. Runs alongside Pipecat's SmallWebRTC
6
+ server (port 7860) on a separate port (7861).
7
+
8
+ Responsibilities:
9
+ - Health check endpoint
10
+ - Text chat SSE streaming (proxied from dashboard)
11
+ - Chat stop/close endpoints
12
+ - Twilio incoming-call webhook (returns TwiML)
13
+ - Twilio media WebSocket handler
14
+ - Call registration for outbound calls
15
+ - Heartbeat status endpoint
16
+ - Heartbeat start/stop on server lifecycle
17
+ - Tunnel URL config update (called by dashboard after tunnel starts)
18
+ - Start both SmallWebRTC and FastAPI on server run
19
+ """
20
+
21
+ import asyncio
22
+ import json
23
+ import logging
24
+
25
+ import uvicorn
26
+ from fastapi import FastAPI, Request, WebSocket
27
+ from fastapi.responses import JSONResponse, PlainTextResponse, StreamingResponse
28
+
29
+ from claude_session import (
30
+ close_session,
31
+ get_or_create_session,
32
+ interrupt_session,
33
+ start_cleanup_timer,
34
+ stream_message,
35
+ )
36
+ from config import VoiceServerConfig, load_config
37
+ from heartbeat import (
38
+ get_heartbeat_status,
39
+ get_pending_client,
40
+ register_pending_call,
41
+ start_heartbeat,
42
+ stop_heartbeat,
43
+ )
44
+ from twilio_pipeline import handle_twilio_websocket
45
+
46
+ logger = logging.getLogger(__name__)
47
+
48
+ # ============================================================================
49
+ # STATE
50
+ # ============================================================================
51
+
52
+ # Mutable tunnel URL, updated by dashboard via POST /config/tunnel-url
53
+ _tunnel_url: str | None = None
54
+
55
+
56
def get_tunnel_url() -> str | None:
    """Return the module-level tunnel URL, or None when it has not been set."""
    current_url = _tunnel_url
    return current_url
59
+
60
+
61
+ # ============================================================================
62
+ # MAIN HANDLERS
63
+ # ============================================================================
64
+
65
+ app = FastAPI(title="VoiceCC Python Server")
66
+
67
+
68
@app.get("/health")
async def health():
    """Liveness probe: always answers with a fixed OK payload."""
    payload = {"status": "ok"}
    return payload
72
+
73
+
74
@app.post("/chat/send")
async def chat_send(request: Request):
    """Send a message and stream Claude's response as SSE.

    Body: { session_key: str, agent_id?: str, text: str }
    Returns: SSE stream of ChatSseEvent on success, otherwise a JSON error:
        400 - body is not a JSON object, or a required field is missing/empty
        503 - the chat session could not be created
        409 - the session is already streaming a response
        500 - any other RuntimeError raised while starting the stream
    """
    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
    # Valid JSON that is not an object (list, string, number) has no .get().
    if not isinstance(body, dict):
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)

    session_key = body.get("session_key")
    raw_text = body.get("text")
    agent_id = body.get("agent_id")

    if not session_key or not isinstance(session_key, str):
        return JSONResponse({"error": "Missing 'session_key' field"}, status_code=400)
    # A non-string "text" (number, null, ...) is treated as missing rather
    # than crashing on .strip().
    text = raw_text.strip() if isinstance(raw_text, str) else ""
    if not text:
        return JSONResponse({"error": "Missing or empty 'text' field"}, status_code=400)

    # Get or create the chat session
    try:
        await get_or_create_session(session_key, agent_id)
    except RuntimeError as e:
        logger.error(f"[server] Failed to create chat session: {e}")
        return JSONResponse({"error": str(e)}, status_code=503)

    # Start the stream and pull the first event BEFORE returning the response.
    # The events are produced lazily, so an error raised when the stream
    # starts can only be turned into a JSON status code here; once the
    # StreamingResponse has been handed back it is iterated outside this
    # handler and this except clause can no longer see the error.
    # Trade-off: response headers are sent once the first event is available.
    try:
        events = aiter(stream_message(session_key, text))
        try:
            first_event = await anext(events)
            has_first = True
        except StopAsyncIteration:
            # Stream finished without producing anything: send an empty SSE body.
            first_event = None
            has_first = False
    except RuntimeError as e:
        if "ALREADY_STREAMING" in str(e):
            return JSONResponse(
                {"error": "Already streaming a response. Wait for it to complete."},
                status_code=409,
            )
        return JSONResponse({"error": str(e)}, status_code=500)

    async def event_generator():
        """Yield the already-fetched first event, then the rest, as SSE frames."""
        if not has_first:
            return
        yield f"data: {json.dumps(first_event.to_dict())}\n\n"
        async for event in events:
            data = json.dumps(event.to_dict())
            yield f"data: {data}\n\n"

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
126
+
127
+
128
@app.post("/chat/stop")
async def chat_stop(request: Request):
    """Interrupt the current streaming response.

    Body: { session_key: str }
    Returns: { ok: true, interrupted: bool }, or a 400 JSON error when the
        body is not a JSON object or 'session_key' is missing / not a string.
    """
    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
    # Valid JSON that is not an object (list, string, number) has no .get().
    if not isinstance(body, dict):
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)

    session_key = body.get("session_key")
    if not session_key or not isinstance(session_key, str):
        return JSONResponse({"error": "Missing 'session_key' field"}, status_code=400)

    interrupted = await interrupt_session(session_key)
    return {"ok": True, "interrupted": interrupted}
146
+
147
+
148
@app.post("/chat/close")
async def chat_close(request: Request):
    """Close a chat session.

    Body: { session_key: str }
    Returns: { ok: true }, or a 400 JSON error when the body is not a JSON
        object or 'session_key' is missing / not a string.
    """
    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
    # Valid JSON that is not an object (list, string, number) has no .get().
    if not isinstance(body, dict):
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)

    session_key = body.get("session_key")
    if not session_key or not isinstance(session_key, str):
        return JSONResponse({"error": "Missing 'session_key' field"}, status_code=400)

    await close_session(session_key)
    return {"ok": True}
166
+
167
+
168
@app.post("/register-call")
async def register_call(request: Request):
    """Register a pending outbound call.

    Body: { token: str, agent_id: str, initial_prompt?: str }
    Returns: { ok: true }, or a 400 JSON error when the body is not a JSON
        object or 'token' / 'agent_id' is missing or not a string.
    """
    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
    # Valid JSON that is not an object (list, string, number) has no .get().
    if not isinstance(body, dict):
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)

    token = body.get("token")
    agent_id = body.get("agent_id")

    if not token or not isinstance(token, str):
        return JSONResponse({"error": "Missing 'token' field"}, status_code=400)
    if not agent_id or not isinstance(agent_id, str):
        return JSONResponse({"error": "Missing 'agent_id' field"}, status_code=400)

    # initial_prompt is optional and forwarded as-is (None when absent).
    initial_prompt = body.get("initial_prompt")
    register_pending_call(token, agent_id, initial_prompt)

    # NOTE(review): the token is also the path segment of the /media/{token}
    # WebSocket URL -- confirm that writing it to the log is acceptable.
    logger.info(f"[server] Registered outbound call token: {token}, agentId: {agent_id}")
    return {"ok": True}
193
+
194
+
195
@app.post("/twilio/incoming-call")
async def twilio_incoming_call(request: Request):
    """Handle Twilio incoming call webhook. Returns TwiML for media stream.

    The TwiML tells Twilio to connect a media stream WebSocket to our
    /media/{token} endpoint via the tunnel URL. A fresh UUID token is
    generated and registered as a pending call (empty agent_id, no initial
    prompt) for every accepted call.

    Returns: TwiML XML response, or a 500 plain-text response when no usable
        tunnel URL is configured.
    """
    from uuid import uuid4

    # Reduce the configured URL to a bare host: drop the scheme, surrounding
    # whitespace and trailing slashes. Without this, a URL stored as
    # "https://host/" would produce "wss://host//media/<token>".
    tunnel_url = get_tunnel_url()
    tunnel_host = ""
    if tunnel_url:
        tunnel_host = tunnel_url.strip()
        for scheme in ("https://", "http://"):
            tunnel_host = tunnel_host.removeprefix(scheme)
        tunnel_host = tunnel_host.rstrip("/")

    # Covers both "never configured" and "configured but empty after cleanup".
    if not tunnel_host:
        logger.error("[server] Rejected incoming call: no tunnel URL available")
        return PlainTextResponse("Server misconfigured", status_code=500)

    # Generate a token for this call and register it
    token = str(uuid4())
    register_pending_call(token, agent_id="", initial_prompt=None)

    logger.info(f"[server] Incoming call accepted, token: {token}")

    # Respond with TwiML to connect a media stream
    twiml = (
        '<?xml version="1.0" encoding="UTF-8"?>'
        "<Response>"
        "  <Connect>"
        f'    <Stream url="wss://{tunnel_host}/media/{token}" />'
        "  </Connect>"
        "</Response>"
    )

    return PlainTextResponse(twiml, media_type="text/xml")
230
+
231
+
232
@app.websocket("/media/{token}")
async def media_websocket(websocket: WebSocket, token: str):
    """Entry point for a Twilio media stream WebSocket.

    Logs the connection and hands the socket to handle_twilio_websocket,
    which is awaited until it returns.

    Args:
        websocket: FastAPI WebSocket connection
        token: Per-call token taken from the URL path
    """
    connect_message = f"[server] Twilio media WebSocket connected, token: {token}"
    logger.info(connect_message)
    await handle_twilio_websocket(websocket, token)
244
+
245
+
246
@app.get("/heartbeat/status")
async def heartbeat_status():
    """Expose whatever get_heartbeat_status() currently reports.

    Returns: Dict of agent_id -> HeartbeatResult
    """
    status = get_heartbeat_status()
    return status
253
+
254
+
255
@app.post("/config/tunnel-url")
async def config_tunnel_url(request: Request):
    """Update the tunnel URL (called by dashboard after tunnel starts).

    Body: { url: str }
    Returns: { ok: true }, or a 400 JSON error when the body is not a JSON
        object or 'url' is missing / not a string.
    """
    global _tunnel_url

    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
    # Valid JSON that is not an object (list, string, number) has no .get().
    if not isinstance(body, dict):
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)

    url = body.get("url")
    if not url or not isinstance(url, str):
        return JSONResponse({"error": "Missing 'url' field"}, status_code=400)

    # Replace the module-level value that get_tunnel_url() returns.
    _tunnel_url = url
    logger.info(f"[server] Tunnel URL updated: {url}")
    return {"ok": True}
276
+
277
+
278
+ # ============================================================================
279
+ # ENTRY POINT
280
+ # ============================================================================
281
+
282
@app.on_event("startup")
async def on_startup():
    """FastAPI startup hook: launch the session cleanup timer, then the heartbeat.

    The heartbeat receives the get_tunnel_url function itself (not its current
    value), so it can read the URL again after the dashboard updates it.
    """
    # NOTE(review): recent FastAPI releases favour lifespan handlers over
    # on_event -- confirm against the pinned FastAPI version before migrating.
    start_cleanup_timer()
    start_heartbeat(load_config(), get_tunnel_url)
289
+
290
+
291
@app.on_event("shutdown")
async def on_shutdown():
    """FastAPI shutdown hook: stop the heartbeat started in on_startup."""
    stop_heartbeat()
    return None
295
+
296
+
297
async def start_fastapi(config: VoiceServerConfig) -> None:
    """Serve the FastAPI app with uvicorn until the server exits.

    The server binds to the loopback interface only (127.0.0.1) on
    config.api_port.

    Args:
        config: Voice server configuration
    """
    server = uvicorn.Server(
        uvicorn.Config(
            app,
            host="127.0.0.1",
            port=config.api_port,
            log_level="info",
        )
    )
    await server.serve()
311
+
312
+
313
async def start_all() -> None:
    """Run the FastAPI server and the SmallWebRTC server side by side.

    Returns once both have finished.
    """
    config = load_config()

    # Deferred import: importing at module level would be circular.
    from voice_pipeline import main as start_webrtc

    logger.info(
        f"[server] Starting SmallWebRTC on :{config.webrtc_port}, "
        f"FastAPI on :{config.api_port}"
    )

    fastapi_server = start_fastapi(config)
    # SmallWebRTC's main() blocks while running its own server, so it is
    # pushed onto a worker thread to keep the event loop free.
    webrtc_server = asyncio.to_thread(start_webrtc)

    await asyncio.gather(fastapi_server, webrtc_server)
331
+
332
+
333
if __name__ == "__main__":
    # Script entry point: INFO-level logging, then run both servers until exit.
    logging.basicConfig(level=logging.INFO)
    asyncio.run(start_all())
@@ -0,0 +1,50 @@
1
+ """
2
+ FrameProcessor that detects "stop listening" in transcriptions and ends the pipeline.
3
+
4
+ Listens for TranscriptionFrame events. If the transcribed text contains
5
+ "stop listening" (case-insensitive), pushes an EndFrame to terminate the session.
6
+ Otherwise, passes the frame through unchanged.
7
+
8
+ Responsibilities:
9
+ - Detect "stop listening" phrase in user transcriptions
10
+ - Push EndFrame to cleanly shut down the pipeline
11
+ - Pass all other frames through unchanged
12
+ """
13
+
14
+ import logging
15
+
16
+ from pipecat.frames.frames import EndFrame, Frame, TranscriptionFrame
17
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ STOP_PHRASE = "stop listening"
22
+
23
+
24
+ # ============================================================================
25
+ # MAIN HANDLERS
26
+ # ============================================================================
27
+
28
class StopPhraseProcessor(FrameProcessor):
    """Ends the pipeline when a transcription contains the stop phrase."""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Forward frames, replacing a stop-phrase transcription with an EndFrame.

        A TranscriptionFrame whose lowercased text contains STOP_PHRASE is not
        forwarded; an EndFrame is pushed in its place. Every other frame is
        pushed on unchanged in the direction it arrived.

        Args:
            frame: The incoming frame
            direction: Frame direction
        """
        await super().process_frame(frame, direction)

        heard_stop = (
            isinstance(frame, TranscriptionFrame)
            and STOP_PHRASE in frame.text.lower().strip()
        )
        if not heard_stop:
            await self.push_frame(frame, direction)
            return

        logger.info("[stop-phrase] 'stop listening' detected, ending pipeline")
        await self.push_frame(EndFrame())
@@ -0,0 +1,237 @@
1
+ """
2
+ Twilio voice pipeline using FastAPIWebsocketTransport with TwilioFrameSerializer.
3
+
4
+ Handles inbound and outbound Twilio phone calls by wiring Pipecat components
5
+ for mulaw audio over WebSocket. Supports heartbeat session handoff where a
6
+ pre-existing Claude session is passed through to preserve context.
7
+
8
+ Responsibilities:
9
+ - Create a Pipecat pipeline with TwilioFrameSerializer for mulaw 8kHz audio
10
+ - Handle FastAPI WebSocket connections from Twilio media streams
11
+ - Extract Twilio metadata (stream_sid, call_sid) from the WebSocket "start" event
12
+ - Look up pending calls to retrieve pre-existing ClaudeSDKClient sessions
13
+ - Wire STT -> LLM -> TTS pipeline identical to browser pipeline
14
+ """
15
+
16
+ import asyncio
17
+ import json
18
+ import logging
19
+ import os
20
+
21
+ import aiohttp
22
+ from fastapi import WebSocket
23
+
24
+ from pipecat.pipeline.pipeline import Pipeline
25
+ from pipecat.pipeline.runner import PipelineRunner
26
+ from pipecat.pipeline.task import PipelineParams, PipelineTask
27
+ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
28
+ from pipecat.serializers.twilio import TwilioFrameSerializer
29
+ from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
30
+ from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
31
+ from pipecat.transports.websocket.fastapi import (
32
+ FastAPIWebsocketParams,
33
+ FastAPIWebsocketTransport,
34
+ )
35
+
36
+ from claude_llm_service import ClaudeLLMService, ClaudeLLMServiceConfig
37
+ from config import (
38
+ DEFAULT_AGENTS_DIR,
39
+ build_system_prompt,
40
+ get_agent_voice_id,
41
+ load_config,
42
+ )
43
+ from heartbeat import get_pending_client
44
+ from narration_processor import NarrationProcessor
45
+ from stop_phrase_processor import StopPhraseProcessor
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+
50
+ # ============================================================================
51
+ # MAIN HANDLERS
52
+ # ============================================================================
53
+
54
async def handle_twilio_websocket(websocket: WebSocket, call_token: str) -> None:
    """Drive one Twilio media stream WebSocket from accept to close.

    Steps:
        1. Accept the socket and load the server config.
        2. Read messages until the "start" event arrives, taking streamSid and
           callSid from it. "connected" events are skipped silently; any other
           event is logged as a warning and skipped.
        3. Look up a pending call for call_token; when one exists, its
           agent_id, client and initial_prompt are passed to the LLM config.
        4. Run the voice pipeline, then close the socket.

    The socket is closed and the function returns early when reading the start
    event fails or the start event has no streamSid.

    Args:
        websocket: FastAPI WebSocket connection from Twilio
        call_token: Per-call UUID token from the URL path
    """
    await websocket.accept()

    config = load_config()

    stream_sid = None
    call_sid = None

    try:
        while True:
            msg = json.loads(await websocket.receive_text())
            event = msg.get("event")

            if event == "connected":
                # Initial handshake event -- the start event is still to come.
                continue

            if event != "start":
                logger.warning(f"[twilio] Unexpected event before start: {msg.get('event')}")
                continue

            start_data = msg.get("start", {})
            stream_sid = start_data.get("streamSid")
            call_sid = start_data.get("callSid")
            logger.info(
                f"[twilio] Stream started -- callSid: {call_sid}, "
                f"streamSid: {stream_sid}"
            )
            break
    except Exception as e:
        logger.error(f"[twilio] Error waiting for start event: {e}")
        await websocket.close()
        return

    if not stream_sid:
        logger.error("[twilio] No stream_sid in start event")
        await websocket.close()
        return

    # Pending calls carry heartbeat handoffs and API-initiated call settings.
    # NOTE(review): a token with no pending entry still gets a pipeline with
    # default settings -- confirm unknown tokens are meant to be accepted.
    agent_id = None
    existing_client = None
    initial_prompt = None

    pending = get_pending_client(call_token)
    if pending:
        agent_id = pending.agent_id
        existing_client = pending.client  # May be None for API calls
        initial_prompt = pending.initial_prompt
        logger.info(
            f'[twilio] Using pending call for agent "{agent_id}", '
            f'has_client={existing_client is not None}'
        )

    # Build LLM config
    system_prompt = build_system_prompt(agent_id, "voice")
    if agent_id:
        cwd = os.path.join(DEFAULT_AGENTS_DIR, agent_id)
    else:
        cwd = config.default_cwd
    voice_id = get_agent_voice_id(agent_id)

    llm_config = ClaudeLLMServiceConfig(
        cwd=cwd,
        system_prompt=system_prompt,
        existing_client=existing_client,
        initial_prompt=initial_prompt,
    )

    # Run the pipeline; whatever happens, try to close the socket afterwards.
    try:
        await _run_twilio_pipeline(
            websocket=websocket,
            stream_sid=stream_sid,
            call_sid=call_sid or "",
            config=config,
            llm_config=llm_config,
            voice_id=voice_id,
        )
    except Exception as e:
        logger.error(f"[twilio] Pipeline error: {e}")
    finally:
        try:
            await websocket.close()
        except Exception:
            # Best effort: an error raised by close() is deliberately ignored.
            pass
149
+
150
+
151
+ # ============================================================================
152
+ # HELPER FUNCTIONS
153
+ # ============================================================================
154
+
155
async def _run_twilio_pipeline(
    websocket: WebSocket,
    stream_sid: str,
    call_sid: str,
    config,
    llm_config: ClaudeLLMServiceConfig,
    voice_id: str,
) -> None:
    """Build the Twilio voice pipeline and run it until the runner returns.

    Stage order: transport.input -> STT -> stop_phrase -> user_aggregator
        -> claude_llm -> narration -> TTS -> transport.output

    The aiohttp session handed to the STT service stays open for the whole
    run and is closed when this function exits.

    Args:
        websocket: Active FastAPI WebSocket connection
        stream_sid: Twilio stream identifier
        call_sid: Twilio call SID
        config: Voice server configuration
        llm_config: Claude LLM service configuration
        voice_id: ElevenLabs voice ID
    """
    # Audio is configured at 8000 Hz in both directions.
    transport_params = FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        audio_in_sample_rate=8000,
        audio_out_sample_rate=8000,
        vad_enabled=True,
        vad_audio_passthrough=True,
        serializer=TwilioFrameSerializer(stream_sid=stream_sid, call_sid=call_sid),
    )
    transport = FastAPIWebsocketTransport(websocket=websocket, params=transport_params)

    async with aiohttp.ClientSession() as http_session:
        # Speech-to-text and text-to-speech share the same ElevenLabs API key.
        stt = ElevenLabsSTTService(
            api_key=config.elevenlabs_api_key,
            aiohttp_session=http_session,
            model=config.elevenlabs_stt_model,
        )
        tts = ElevenLabsTTSService(
            api_key=config.elevenlabs_api_key,
            voice_id=voice_id,
            model=config.elevenlabs_tts_model,
        )

        claude_llm = ClaudeLLMService(config=llm_config)

        stop_phrase = StopPhraseProcessor()
        narration = NarrationProcessor()

        # The context starts with no messages and no tools.
        context_aggregator = claude_llm.create_context_aggregator(
            OpenAILLMContext(messages=[], tools=[])
        )

        stages = [
            transport.input(),
            stt,
            stop_phrase,
            context_aggregator.user(),
            claude_llm,
            narration,
            tts,
            transport.output(),
        ]

        task = PipelineTask(
            Pipeline(stages),
            params=PipelineParams(allow_interruptions=True),
        )

        await PipelineRunner().run(task)