voicecc 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ """
2
+ FrameProcessor that emits spoken updates during tool use and strips markdown from text.
3
+
4
+ Watches for tool_start markers in TextFrames (emitted by ClaudeLLMService as
5
+ "__tool_start:<name>") and announces them as "Running <tool>..." messages.
6
+ Emits periodic "Still working..." messages for long-running tools. Strips
7
+ markdown syntax from regular text so it reads naturally when spoken.
8
+
9
+ Responsibilities:
10
+ - Detect tool_start markers and emit spoken announcements
11
+ - Emit periodic "Still working..." for long-running tools (12s interval)
12
+ - Strip markdown syntax (bold, headings, code blocks, links) from text
13
+ - Pass all non-text frames through unchanged
14
+ """
15
+
16
+ import asyncio
17
+ import logging
18
+ import re
19
+
20
+ from pipecat.frames.frames import Frame, LLMTextFrame, TextFrame
21
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # ============================================================================
26
+ # CONSTANTS
27
+ # ============================================================================
28
+
29
+ TOOL_START_PREFIX = "__tool_start:"
30
+ SUMMARY_INTERVAL_SECONDS = 12.0
31
+
32
+
33
+ # ============================================================================
34
+ # MAIN HANDLERS
35
+ # ============================================================================
36
+
37
class NarrationProcessor(FrameProcessor):
    """Emits spoken updates during tool use and cleans markdown from text."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Name of the tool currently running (None when idle).
        self._current_tool_name: str | None = None
        # Background task that emits periodic "Still working..." updates.
        self._summary_task: asyncio.Task | None = None
        # True from a tool_start marker until the next LLM text frame arrives.
        self._in_long_task = False

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Process text frames for tool markers and markdown stripping.

        Args:
            frame: The incoming frame
            direction: Frame direction
        """
        await super().process_frame(frame, direction)

        # Check for tool_start markers from ClaudeLLMService. This branch must
        # run before the LLMTextFrame branch below: the marker may arrive as a
        # TextFrame subclass and must never reach TTS verbatim.
        if isinstance(frame, TextFrame) and isinstance(frame.text, str):
            if frame.text.startswith(TOOL_START_PREFIX):
                tool_name = frame.text[len(TOOL_START_PREFIX):]
                await self._handle_tool_start(tool_name)
                return

        # Strip markdown from LLM text frames
        if isinstance(frame, LLMTextFrame):
            # Text arriving means Claude is responding -- exit long-task mode
            if self._in_long_task:
                self._cancel_summary_timer()
                self._in_long_task = False
                self._current_tool_name = None

            clean = strip_markdown(frame.text)
            if clean:
                # FIX: propagate the original frame's direction. Previously the
                # cleaned frame was always pushed in the default direction,
                # which would misroute an upstream-directed text frame.
                await self.push_frame(LLMTextFrame(clean), direction)
            return

        await self.push_frame(frame, direction)

    async def cleanup(self):
        """Cancel any running summary timer on cleanup."""
        self._cancel_summary_timer()
        await super().cleanup()

    # ========================================================================
    # HELPER FUNCTIONS
    # ========================================================================

    async def _handle_tool_start(self, tool_name: str) -> None:
        """Handle a tool_start event: announce it and start the summary timer.

        Args:
            tool_name: Name of the tool being executed
        """
        self._current_tool_name = tool_name
        self._in_long_task = True

        # Restart the timer so the interval counts from this tool's start.
        self._cancel_summary_timer()
        self._start_summary_timer()

        # Emit spoken announcement
        await self.push_frame(LLMTextFrame(f"Running {tool_name}..."))
        logger.info(f"[narration] Tool started: {tool_name}")

    def _start_summary_timer(self) -> None:
        """Start periodic 'Still working...' announcements."""

        async def _emit_summaries():
            while True:
                await asyncio.sleep(SUMMARY_INTERVAL_SECONDS)
                name = self._current_tool_name or "the task"
                try:
                    await self.push_frame(LLMTextFrame(f"Still working on {name}..."))
                except Exception:
                    # Pipeline may already be torn down; stop quietly.
                    # (CancelledError is not caught here, so cancel() still works.)
                    break

        self._summary_task = asyncio.create_task(_emit_summaries())

    def _cancel_summary_timer(self) -> None:
        """Cancel the summary timer if active."""
        if self._summary_task and not self._summary_task.done():
            self._summary_task.cancel()
        self._summary_task = None
120
+
121
+
122
def strip_markdown(text: str) -> str:
    """Strip markdown syntax so text reads naturally when spoken.

    Removes bold/italic asterisks, heading markers, code fences,
    inline code backticks, markdown links, and list markers.

    Args:
        text: Raw markdown text

    Returns:
        Cleaned text suitable for TTS
    """
    # Drop code fences together with their language tag (e.g. "```python")
    # BEFORE the generic backtick pass, so the tag is not spoken aloud.
    text = re.sub(r"```[A-Za-z0-9_+-]*", "", text)
    text = re.sub(r"\*+", "", text)  # bold/italic asterisks
    # Heading markers only at line start. The previous unanchored "#+\s*"
    # deleted every "#" in the text, mangling content like "issue #42".
    text = re.sub(r"^#{1,6}\s*", "", text, flags=re.MULTILINE)
    text = re.sub(r"`+", "", text)  # inline code backticks
    text = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", text)  # [text](url) -> text
    text = re.sub(r"^-\s+", "", text, flags=re.MULTILINE)  # unordered list markers
    text = re.sub(r"^\d+\.\s+", "", text, flags=re.MULTILINE)  # ordered list markers
    return text
@@ -0,0 +1,8 @@
1
+ pipecat-ai[webrtc,elevenlabs,silero]
2
+ pipecat-ai-small-webrtc-prebuilt
3
+ claude-agent-sdk
4
+ python-dotenv
5
+ fastapi
6
+ uvicorn
7
+ twilio
8
+ pyright
@@ -0,0 +1,335 @@
1
+ """
2
+ FastAPI server for the Python voice server.
3
+
4
+ Hosts text chat, Twilio media WebSocket, call registration, heartbeat status,
5
+ health check, and config update endpoints. Runs alongside Pipecat's SmallWebRTC
6
+ server (port 7860) on a separate port (7861).
7
+
8
+ Responsibilities:
9
+ - Health check endpoint
10
+ - Text chat SSE streaming (proxied from dashboard)
11
+ - Chat stop/close endpoints
12
+ - Twilio incoming-call webhook (returns TwiML)
13
+ - Twilio media WebSocket handler
14
+ - Call registration for outbound calls
15
+ - Heartbeat status endpoint
16
+ - Heartbeat start/stop on server lifecycle
17
+ - Tunnel URL config update (called by dashboard after tunnel starts)
18
+ - Start both SmallWebRTC and FastAPI on server run
19
+ """
20
+
21
+ import asyncio
22
+ import json
23
+ import logging
24
+
25
+ import uvicorn
26
+ from fastapi import FastAPI, Request, WebSocket
27
+ from fastapi.responses import JSONResponse, PlainTextResponse, StreamingResponse
28
+
29
+ from claude_session import (
30
+ close_session,
31
+ get_or_create_session,
32
+ interrupt_session,
33
+ start_cleanup_timer,
34
+ stream_message,
35
+ )
36
+ from config import VoiceServerConfig, load_config
37
+ from heartbeat import (
38
+ get_heartbeat_status,
39
+ get_pending_client,
40
+ register_pending_call,
41
+ start_heartbeat,
42
+ stop_heartbeat,
43
+ )
44
+ from twilio_pipeline import handle_twilio_websocket
45
+
46
+ logger = logging.getLogger(__name__)
47
+
48
+ # ============================================================================
49
+ # STATE
50
+ # ============================================================================
51
+
52
+ # Mutable tunnel URL, updated by dashboard via POST /config/tunnel-url
53
+ _tunnel_url: str | None = None
54
+
55
+
56
def get_tunnel_url() -> str | None:
    """Get the current tunnel URL.

    Returns:
        The most recent URL posted to POST /config/tunnel-url, or None if
        the dashboard has not registered a tunnel yet.
    """
    # Accessor exists (rather than reading _tunnel_url directly) so callers
    # like the heartbeat can sample the latest value lazily.
    return _tunnel_url
59
+
60
+
61
+ # ============================================================================
62
+ # MAIN HANDLERS
63
+ # ============================================================================
64
+
65
+ app = FastAPI(title="VoiceCC Python Server")
66
+
67
+
68
+ @app.get("/health")
69
+ async def health():
70
+ """Health check endpoint."""
71
+ return {"status": "ok"}
72
+
73
+
74
+ @app.post("/chat/send")
75
+ async def chat_send(request: Request):
76
+ """Send a message and stream Claude's response as SSE.
77
+
78
+ Body: { session_key: str, agent_id?: str, text: str }
79
+ Returns: SSE stream of ChatSseEvent
80
+ """
81
+ try:
82
+ body = await request.json()
83
+ except Exception:
84
+ return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
85
+
86
+ session_key = body.get("session_key")
87
+ text = body.get("text", "").strip()
88
+ agent_id = body.get("agent_id")
89
+
90
+ if not session_key or not isinstance(session_key, str):
91
+ return JSONResponse({"error": "Missing 'session_key' field"}, status_code=400)
92
+ if not text:
93
+ return JSONResponse({"error": "Missing or empty 'text' field"}, status_code=400)
94
+
95
+ # Get or create the chat session
96
+ try:
97
+ await get_or_create_session(session_key, agent_id)
98
+ except RuntimeError as e:
99
+ logger.error(f"[server] Failed to create chat session: {e}")
100
+ return JSONResponse({"error": str(e)}, status_code=503)
101
+
102
+ # Stream response as SSE
103
+ try:
104
+
105
+ async def event_generator():
106
+ async for event in stream_message(session_key, text):
107
+ data = json.dumps(event.to_dict())
108
+ yield f"data: {data}\n\n"
109
+
110
+ return StreamingResponse(
111
+ event_generator(),
112
+ media_type="text/event-stream",
113
+ headers={
114
+ "Cache-Control": "no-cache",
115
+ "Connection": "keep-alive",
116
+ "X-Accel-Buffering": "no",
117
+ },
118
+ )
119
+ except RuntimeError as e:
120
+ if "ALREADY_STREAMING" in str(e):
121
+ return JSONResponse(
122
+ {"error": "Already streaming a response. Wait for it to complete."},
123
+ status_code=409,
124
+ )
125
+ return JSONResponse({"error": str(e)}, status_code=500)
126
+
127
+
128
+ @app.post("/chat/stop")
129
+ async def chat_stop(request: Request):
130
+ """Interrupt the current streaming response.
131
+
132
+ Body: { session_key: str }
133
+ Returns: { ok: true, interrupted: bool }
134
+ """
135
+ try:
136
+ body = await request.json()
137
+ except Exception:
138
+ return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
139
+
140
+ session_key = body.get("session_key")
141
+ if not session_key or not isinstance(session_key, str):
142
+ return JSONResponse({"error": "Missing 'session_key' field"}, status_code=400)
143
+
144
+ interrupted = await interrupt_session(session_key)
145
+ return {"ok": True, "interrupted": interrupted}
146
+
147
+
148
+ @app.post("/chat/close")
149
+ async def chat_close(request: Request):
150
+ """Close a chat session.
151
+
152
+ Body: { session_key: str }
153
+ Returns: { ok: true }
154
+ """
155
+ try:
156
+ body = await request.json()
157
+ except Exception:
158
+ return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
159
+
160
+ session_key = body.get("session_key")
161
+ if not session_key or not isinstance(session_key, str):
162
+ return JSONResponse({"error": "Missing 'session_key' field"}, status_code=400)
163
+
164
+ await close_session(session_key)
165
+ return {"ok": True}
166
+
167
+
168
+ @app.post("/register-call")
169
+ async def register_call(request: Request):
170
+ """Register a pending outbound call.
171
+
172
+ Body: { token: str, agent_id: str, initial_prompt?: str }
173
+ Returns: { ok: true }
174
+ """
175
+ try:
176
+ body = await request.json()
177
+ except Exception:
178
+ return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
179
+
180
+ token = body.get("token")
181
+ agent_id = body.get("agent_id")
182
+
183
+ if not token or not isinstance(token, str):
184
+ return JSONResponse({"error": "Missing 'token' field"}, status_code=400)
185
+ if not agent_id or not isinstance(agent_id, str):
186
+ return JSONResponse({"error": "Missing 'agent_id' field"}, status_code=400)
187
+
188
+ initial_prompt = body.get("initial_prompt")
189
+ register_pending_call(token, agent_id, initial_prompt)
190
+
191
+ logger.info(f"[server] Registered outbound call token: {token}, agentId: {agent_id}")
192
+ return {"ok": True}
193
+
194
+
195
+ @app.post("/twilio/incoming-call")
196
+ async def twilio_incoming_call(request: Request):
197
+ """Handle Twilio incoming call webhook. Returns TwiML for media stream.
198
+
199
+ The TwiML tells Twilio to connect a media stream WebSocket to our
200
+ /media/{token} endpoint via the tunnel URL.
201
+
202
+ Returns: TwiML XML response
203
+ """
204
+ tunnel_url = get_tunnel_url()
205
+ if not tunnel_url:
206
+ logger.error("[server] Rejected incoming call: no tunnel URL available")
207
+ return PlainTextResponse("Server misconfigured", status_code=500)
208
+
209
+ tunnel_host = tunnel_url.replace("https://", "").replace("http://", "")
210
+
211
+ # Generate a token for this call and register it
212
+ from uuid import uuid4
213
+
214
+ token = str(uuid4())
215
+ register_pending_call(token, agent_id="", initial_prompt=None)
216
+
217
+ logger.info(f"[server] Incoming call accepted, token: {token}")
218
+
219
+ # Respond with TwiML to connect a media stream
220
+ twiml = (
221
+ '<?xml version="1.0" encoding="UTF-8"?>'
222
+ "<Response>"
223
+ " <Connect>"
224
+ f' <Stream url="wss://{tunnel_host}/media/{token}" />'
225
+ " </Connect>"
226
+ "</Response>"
227
+ )
228
+
229
+ return PlainTextResponse(twiml, media_type="text/xml")
230
+
231
+
232
+ @app.websocket("/media/{token}")
233
+ async def media_websocket(websocket: WebSocket, token: str):
234
+ """Handle Twilio media stream WebSocket connection.
235
+
236
+ Delegates to handle_twilio_websocket which manages the full pipeline lifecycle.
237
+
238
+ Args:
239
+ websocket: FastAPI WebSocket connection
240
+ token: Per-call UUID token from the URL path
241
+ """
242
+ logger.info(f"[server] Twilio media WebSocket connected, token: {token}")
243
+ await handle_twilio_websocket(websocket, token)
244
+
245
+
246
+ @app.get("/heartbeat/status")
247
+ async def heartbeat_status():
248
+ """Get the last heartbeat results per agent.
249
+
250
+ Returns: Dict of agent_id -> HeartbeatResult
251
+ """
252
+ return get_heartbeat_status()
253
+
254
+
255
+ @app.post("/config/tunnel-url")
256
+ async def config_tunnel_url(request: Request):
257
+ """Update the tunnel URL (called by dashboard after tunnel starts).
258
+
259
+ Body: { url: str }
260
+ Returns: { ok: true }
261
+ """
262
+ global _tunnel_url
263
+
264
+ try:
265
+ body = await request.json()
266
+ except Exception:
267
+ return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
268
+
269
+ url = body.get("url")
270
+ if not url or not isinstance(url, str):
271
+ return JSONResponse({"error": "Missing 'url' field"}, status_code=400)
272
+
273
+ _tunnel_url = url
274
+ logger.info(f"[server] Tunnel URL updated: {url}")
275
+ return {"ok": True}
276
+
277
+
278
+ # ============================================================================
279
+ # ENTRY POINT
280
+ # ============================================================================
281
+
282
+ @app.on_event("startup")
283
+ async def on_startup():
284
+ """Start cleanup timer and heartbeat on FastAPI startup."""
285
+ start_cleanup_timer()
286
+
287
+ config = load_config()
288
+ start_heartbeat(config, get_tunnel_url)
289
+
290
+
291
+ @app.on_event("shutdown")
292
+ async def on_shutdown():
293
+ """Stop heartbeat on FastAPI shutdown."""
294
+ stop_heartbeat()
295
+
296
+
297
async def start_fastapi(config: VoiceServerConfig) -> None:
    """Run the FastAPI app with uvicorn on the configured API port.

    Binds to loopback only; external traffic arrives via the tunnel.

    Args:
        config: Voice server configuration
    """
    uv = uvicorn.Server(
        uvicorn.Config(
            app,
            host="127.0.0.1",
            port=config.api_port,
            log_level="info",
        )
    )
    await uv.serve()
311
+
312
+
313
async def start_all() -> None:
    """Start both the SmallWebRTC server and FastAPI server concurrently."""
    config = load_config()

    # Import here to avoid circular imports
    from voice_pipeline import main as start_webrtc

    logger.info(
        f"[server] Starting SmallWebRTC on :{config.webrtc_port}, "
        f"FastAPI on :{config.api_port}"
    )

    # Run both servers concurrently
    await asyncio.gather(
        start_fastapi(config),
        # SmallWebRTC's main() is a blocking call that starts its own server
        # NOTE(review): presumably it runs its own event loop inside the
        # worker thread -- confirm voice_pipeline.main is safe off the main
        # thread (e.g. installs no signal handlers).
        asyncio.to_thread(start_webrtc),
    )
331
+
332
+
333
+ if __name__ == "__main__":
334
+ logging.basicConfig(level=logging.INFO)
335
+ asyncio.run(start_all())
@@ -0,0 +1,50 @@
1
+ """
2
+ FrameProcessor that detects "stop listening" in transcriptions and ends the pipeline.
3
+
4
+ Listens for TranscriptionFrame events. If the transcribed text contains
5
+ "stop listening" (case-insensitive), pushes an EndFrame to terminate the session.
6
+ Otherwise, passes the frame through unchanged.
7
+
8
+ Responsibilities:
9
+ - Detect "stop listening" phrase in user transcriptions
10
+ - Push EndFrame to cleanly shut down the pipeline
11
+ - Pass all other frames through unchanged
12
+ """
13
+
14
+ import logging
15
+
16
+ from pipecat.frames.frames import EndFrame, Frame, TranscriptionFrame
17
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ STOP_PHRASE = "stop listening"
22
+
23
+
24
+ # ============================================================================
25
+ # MAIN HANDLERS
26
+ # ============================================================================
27
+
28
class StopPhraseProcessor(FrameProcessor):
    """Watches user transcriptions and terminates the pipeline on 'stop listening'."""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Check transcription frames for the stop phrase.

        A TranscriptionFrame whose (lowercased) text contains the stop
        phrase triggers an EndFrame; every other frame is forwarded as-is.

        Args:
            frame: The incoming frame
            direction: Frame direction
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, TranscriptionFrame):
            normalized = frame.text.lower().strip()
            if STOP_PHRASE in normalized:
                logger.info("[stop-phrase] 'stop listening' detected, ending pipeline")
                await self.push_frame(EndFrame())
                return

        await self.push_frame(frame, direction)