npm - voicecc - Versions diffs - 1.2.2 → 1.2.3 - Mend

voicecc 1.2.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/bin/voicecc.js +9 -0
package/package.json +2 -1
package/voice-server/.python-version +1 -0
package/voice-server/claude_llm_service.py +333 -0
package/voice-server/claude_session.py +312 -0
package/voice-server/config.py +340 -0
package/voice-server/dev-server-start.sh +128 -0
package/voice-server/heartbeat.py +505 -0
package/voice-server/narration_processor.py +140 -0
package/voice-server/requirements.txt +8 -0
package/voice-server/server.py +335 -0
package/voice-server/stop_phrase_processor.py +50 -0
package/voice-server/twilio_pipeline.py +237 -0
package/voice-server/voice_pipeline.py +147 -0

package/voice-server/server.py ADDED Viewed

@@ -0,0 +1,335 @@
+"""
+FastAPI server for the Python voice server.
+Hosts text chat, Twilio media WebSocket, call registration, heartbeat status,
+health check, and config update endpoints. Runs alongside Pipecat's SmallWebRTC
+server (port 7860) on a separate port (7861).
+Responsibilities:
+- Health check endpoint
+- Text chat SSE streaming (proxied from dashboard)
+- Chat stop/close endpoints
+- Twilio incoming-call webhook (returns TwiML)
+- Twilio media WebSocket handler
+- Call registration for outbound calls
+- Heartbeat status endpoint
+- Heartbeat start/stop on server lifecycle
+- Tunnel URL config update (called by dashboard after tunnel starts)
+- Start both SmallWebRTC and FastAPI on server run
+"""
+import asyncio
+import json
+import logging
+import uvicorn
+from fastapi import FastAPI, Request, WebSocket
+from fastapi.responses import JSONResponse, PlainTextResponse, StreamingResponse
+from claude_session import (
+    close_session,
+    get_or_create_session,
+    interrupt_session,
+    start_cleanup_timer,
+    stream_message,
+)
+from config import VoiceServerConfig, load_config
+from heartbeat import (
+    get_heartbeat_status,
+    get_pending_client,
+    register_pending_call,
+    start_heartbeat,
+    stop_heartbeat,
+)
+from twilio_pipeline import handle_twilio_websocket
+logger = logging.getLogger(__name__)
+# ============================================================================
+# STATE
+# ============================================================================
+# Mutable tunnel URL, updated by dashboard via POST /config/tunnel-url.
+# Starts as None and keeps that value until the endpoint stores a URL.
+_tunnel_url: str | None = None
+def get_tunnel_url() -> str | None:
+    """Get the current tunnel URL.
+    Returns:
+        The module-level _tunnel_url value, or None if no URL has been stored yet.
+    """
+    return _tunnel_url
+# ============================================================================
+# MAIN HANDLERS
+# ============================================================================
+app = FastAPI(title="VoiceCC Python Server")
+@app.get("/health")
+async def health():
+    """Health check endpoint.
+    Returns: { status: "ok" } -- a constant payload; nothing else is checked here.
+    """
+    return {"status": "ok"}
+@app.post("/chat/send")
+async def chat_send(request: Request):
+    """Send a message and stream Claude's response as SSE.
+    Body: { session_key: str, agent_id?: str, text: str }
+    Returns: SSE stream of ChatSseEvent
+    """
+    try:
+        body = await request.json()
+    except Exception:
+        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
+    session_key = body.get("session_key")
+    text = body.get("text", "").strip()
+    agent_id = body.get("agent_id")
+    if not session_key or not isinstance(session_key, str):
+        return JSONResponse({"error": "Missing 'session_key' field"}, status_code=400)
+    if not text:
+        return JSONResponse({"error": "Missing or empty 'text' field"}, status_code=400)
+    # Get or create the chat session
+    try:
+        await get_or_create_session(session_key, agent_id)
+    except RuntimeError as e:
+        logger.error(f"[server] Failed to create chat session: {e}")
+        return JSONResponse({"error": str(e)}, status_code=503)
+    # Stream response as SSE
+    try:
+        async def event_generator():
+            async for event in stream_message(session_key, text):
+                data = json.dumps(event.to_dict())
+                yield f"data: {data}\n\n"
+        return StreamingResponse(
+            event_generator(),
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+                "X-Accel-Buffering": "no",
+            },
+        )
+    except RuntimeError as e:
+        if "ALREADY_STREAMING" in str(e):
+            return JSONResponse(
+                {"error": "Already streaming a response. Wait for it to complete."},
+                status_code=409,
+            )
+        return JSONResponse({"error": str(e)}, status_code=500)
+@app.post("/chat/stop")
+async def chat_stop(request: Request):
+    """Interrupt the current streaming response.
+    Body: { session_key: str }
+    Returns: { ok: true, interrupted: bool }
+    """
+    try:
+        body = await request.json()
+    except Exception:
+        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
+    session_key = body.get("session_key")
+    if not session_key or not isinstance(session_key, str):
+        return JSONResponse({"error": "Missing 'session_key' field"}, status_code=400)
+    interrupted = await interrupt_session(session_key)
+    return {"ok": True, "interrupted": interrupted}
+@app.post("/chat/close")
+async def chat_close(request: Request):
+    """Close a chat session.
+    Body: { session_key: str }
+    Returns: { ok: true }
+    """
+    try:
+        body = await request.json()
+    except Exception:
+        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
+    session_key = body.get("session_key")
+    if not session_key or not isinstance(session_key, str):
+        return JSONResponse({"error": "Missing 'session_key' field"}, status_code=400)
+    await close_session(session_key)
+    return {"ok": True}
+@app.post("/register-call")
+async def register_call(request: Request):
+    """Register a pending outbound call.
+    Body: { token: str, agent_id: str, initial_prompt?: str }
+    Returns: { ok: true }
+    """
+    try:
+        body = await request.json()
+    except Exception:
+        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
+    token = body.get("token")
+    agent_id = body.get("agent_id")
+    if not token or not isinstance(token, str):
+        return JSONResponse({"error": "Missing 'token' field"}, status_code=400)
+    if not agent_id or not isinstance(agent_id, str):
+        return JSONResponse({"error": "Missing 'agent_id' field"}, status_code=400)
+    initial_prompt = body.get("initial_prompt")
+    register_pending_call(token, agent_id, initial_prompt)
+    logger.info(f"[server] Registered outbound call token: {token}, agentId: {agent_id}")
+    return {"ok": True}
+@app.post("/twilio/incoming-call")
+async def twilio_incoming_call(request: Request):
+    """Handle Twilio incoming call webhook. Returns TwiML for media stream.
+    The TwiML tells Twilio to connect a media stream WebSocket to our
+    /media/{token} endpoint via the tunnel URL.
+    Returns: TwiML XML response
+    """
+    tunnel_url = get_tunnel_url()
+    if not tunnel_url:
+        logger.error("[server] Rejected incoming call: no tunnel URL available")
+        return PlainTextResponse("Server misconfigured", status_code=500)
+    tunnel_host = tunnel_url.replace("https://", "").replace("http://", "")
+    # Generate a token for this call and register it
+    from uuid import uuid4
+    token = str(uuid4())
+    register_pending_call(token, agent_id="", initial_prompt=None)
+    logger.info(f"[server] Incoming call accepted, token: {token}")
+    # Respond with TwiML to connect a media stream
+    twiml = (
+        '<?xml version="1.0" encoding="UTF-8"?>'
+        "<Response>"
+        "  <Connect>"
+        f'    <Stream url="wss://{tunnel_host}/media/{token}" />'
+        "  </Connect>"
+        "</Response>"
+    )
+    return PlainTextResponse(twiml, media_type="text/xml")
+@app.websocket("/media/{token}")
+async def media_websocket(websocket: WebSocket, token: str):
+    """Handle Twilio media stream WebSocket connection.
+    Logs the connection and delegates to handle_twilio_websocket, passing the
+    socket and the path token through unchanged. The socket is not accepted
+    here; that is left to the delegate.
+    Args:
+        websocket: FastAPI WebSocket connection
+        token: Per-call UUID token from the URL path
+    """
+    logger.info(f"[server] Twilio media WebSocket connected, token: {token}")
+    await handle_twilio_websocket(websocket, token)
+@app.get("/heartbeat/status")
+async def heartbeat_status():
+    """Get the last heartbeat results per agent.
+    Returns whatever get_heartbeat_status() yields, unmodified.
+    Returns: Dict of agent_id -> HeartbeatResult
+    """
+    return get_heartbeat_status()
+@app.post("/config/tunnel-url")
+async def config_tunnel_url(request: Request):
+    """Update the tunnel URL (called by dashboard after tunnel starts).
+    Body: { url: str }
+    Returns: { ok: true }
+    """
+    global _tunnel_url
+    try:
+        body = await request.json()
+    except Exception:
+        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
+    url = body.get("url")
+    if not url or not isinstance(url, str):
+        return JSONResponse({"error": "Missing 'url' field"}, status_code=400)
+    _tunnel_url = url
+    logger.info(f"[server] Tunnel URL updated: {url}")
+    return {"ok": True}
+# ============================================================================
+# ENTRY POINT
+# ============================================================================
+@app.on_event("startup")
+async def on_startup():
+    """Start cleanup timer and heartbeat on FastAPI startup."""
+    start_cleanup_timer()
+    config = load_config()
+    start_heartbeat(config, get_tunnel_url)
+@app.on_event("shutdown")
+async def on_shutdown():
+    """Stop heartbeat on FastAPI shutdown.
+    Only the heartbeat is stopped here; nothing else is torn down in this hook.
+    """
+    stop_heartbeat()
+async def start_fastapi(config: VoiceServerConfig) -> None:
+    """Start the FastAPI server on the configured API port.
+    Args:
+        config: Voice server configuration
+    """
+    server_config = uvicorn.Config(
+        app,
+        host="127.0.0.1",
+        port=config.api_port,
+        log_level="info",
+    )
+    server = uvicorn.Server(server_config)
+    await server.serve()
+async def start_all() -> None:
+    """Start both the SmallWebRTC server and FastAPI server concurrently."""
+    config = load_config()
+    # Import here to avoid circular imports
+    from voice_pipeline import main as start_webrtc
+    logger.info(
+        f"[server] Starting SmallWebRTC on :{config.webrtc_port}, "
+        f"FastAPI on :{config.api_port}"
+    )
+    # Run both servers concurrently
+    await asyncio.gather(
+        start_fastapi(config),
+        # SmallWebRTC's main() is a blocking call that starts its own server
+        asyncio.to_thread(start_webrtc),
+    )
+# Script entry point: configure INFO-level logging, then run both servers
+# until start_all() returns.
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    asyncio.run(start_all())

package/voice-server/stop_phrase_processor.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""
+FrameProcessor that detects "stop listening" in transcriptions and ends the pipeline.
+Listens for TranscriptionFrame events. If the transcribed text contains
+"stop listening" (case-insensitive), pushes an EndFrame to terminate the session.
+Otherwise, passes the frame through unchanged.
+Responsibilities:
+- Detect "stop listening" phrase in user transcriptions
+- Push EndFrame to cleanly shut down the pipeline
+- Pass all other frames through unchanged
+"""
+import logging
+from pipecat.frames.frames import EndFrame, Frame, TranscriptionFrame
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+logger = logging.getLogger(__name__)
+STOP_PHRASE = "stop listening"
+# ============================================================================
+# MAIN HANDLERS
+# ============================================================================
+class StopPhraseProcessor(FrameProcessor):
+    """Detects 'stop listening' in transcriptions and ends the pipeline."""
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Check transcription frames for the stop phrase.
+        If detected, pushes an EndFrame to terminate the pipeline.
+        Otherwise, passes the frame through.
+        Args:
+            frame: The incoming frame
+            direction: Frame direction
+        """
+        await super().process_frame(frame, direction)
+        if isinstance(frame, TranscriptionFrame):
+            text = frame.text.lower().strip()
+            if STOP_PHRASE in text:
+                logger.info("[stop-phrase] 'stop listening' detected, ending pipeline")
+                await self.push_frame(EndFrame())
+                return
+        await self.push_frame(frame, direction)

package/voice-server/twilio_pipeline.py ADDED Viewed

@@ -0,0 +1,237 @@
+"""
+Twilio voice pipeline using FastAPIWebsocketTransport with TwilioFrameSerializer.
+Handles inbound and outbound Twilio phone calls by wiring Pipecat components
+for mulaw audio over WebSocket. Supports heartbeat session handoff where a
+pre-existing Claude session is passed through to preserve context.
+Responsibilities:
+- Create a Pipecat pipeline with TwilioFrameSerializer for mulaw 8kHz audio
+- Handle FastAPI WebSocket connections from Twilio media streams
+- Extract Twilio metadata (stream_sid, call_sid) from the WebSocket "start" event
+- Look up pending calls to retrieve pre-existing ClaudeSDKClient sessions
+- Wire STT -> LLM -> TTS pipeline identical to browser pipeline
+"""
+import asyncio
+import json
+import logging
+import os
+import aiohttp
+from fastapi import WebSocket
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.serializers.twilio import TwilioFrameSerializer
+from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
+from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
+from pipecat.transports.websocket.fastapi import (
+    FastAPIWebsocketParams,
+    FastAPIWebsocketTransport,
+)
+from claude_llm_service import ClaudeLLMService, ClaudeLLMServiceConfig
+from config import (
+    DEFAULT_AGENTS_DIR,
+    build_system_prompt,
+    get_agent_voice_id,
+    load_config,
+)
+from heartbeat import get_pending_client
+from narration_processor import NarrationProcessor
+from stop_phrase_processor import StopPhraseProcessor
+logger = logging.getLogger(__name__)
+# ============================================================================
+# MAIN HANDLERS
+# ============================================================================
+async def handle_twilio_websocket(websocket: WebSocket, call_token: str) -> None:
+    """Handle a Twilio media stream WebSocket connection.
+    Accepts the WebSocket, waits for the Twilio "start" event to extract metadata,
+    looks up any pending call config, then creates and runs the voice pipeline.
+    Args:
+        websocket: FastAPI WebSocket connection from Twilio
+        call_token: Per-call UUID token from the URL path
+    """
+    await websocket.accept()
+    config = load_config()
+    # Wait for the Twilio "start" event to get stream metadata
+    stream_sid = None
+    call_sid = None
+    try:
+        # Read messages until we get the "start" event
+        while True:
+            raw = await websocket.receive_text()
+            msg = json.loads(raw)
+            if msg.get("event") == "start":
+                start_data = msg.get("start", {})
+                stream_sid = start_data.get("streamSid")
+                call_sid = start_data.get("callSid")
+                logger.info(
+                    f"[twilio] Stream started -- callSid: {call_sid}, "
+                    f"streamSid: {stream_sid}"
+                )
+                break
+            if msg.get("event") == "connected":
+                # Initial connected event -- keep waiting for start
+                continue
+            # Unexpected event before start
+            logger.warning(f"[twilio] Unexpected event before start: {msg.get('event')}")
+    except Exception as e:
+        logger.error(f"[twilio] Error waiting for start event: {e}")
+        await websocket.close()
+        return
+    if not stream_sid:
+        logger.error("[twilio] No stream_sid in start event")
+        await websocket.close()
+        return
+    # Look up pending call for heartbeat handoff or API-initiated calls
+    pending = get_pending_client(call_token)
+    agent_id = None
+    existing_client = None
+    initial_prompt = None
+    if pending:
+        agent_id = pending.agent_id
+        existing_client = pending.client  # May be None for API calls
+        initial_prompt = pending.initial_prompt
+        logger.info(
+            f'[twilio] Using pending call for agent "{agent_id}", '
+            f'has_client={existing_client is not None}'
+        )
+    # Build LLM config
+    system_prompt = build_system_prompt(agent_id, "voice")
+    cwd = os.path.join(DEFAULT_AGENTS_DIR, agent_id) if agent_id else config.default_cwd
+    voice_id = get_agent_voice_id(agent_id)
+    llm_config = ClaudeLLMServiceConfig(
+        cwd=cwd,
+        system_prompt=system_prompt,
+        existing_client=existing_client,
+        initial_prompt=initial_prompt,
+    )
+    # Create and run the pipeline
+    try:
+        await _run_twilio_pipeline(
+            websocket=websocket,
+            stream_sid=stream_sid,
+            call_sid=call_sid or "",
+            config=config,
+            llm_config=llm_config,
+            voice_id=voice_id,
+        )
+    except Exception as e:
+        logger.error(f"[twilio] Pipeline error: {e}")
+    finally:
+        try:
+            await websocket.close()
+        except Exception:
+            pass
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
+async def _run_twilio_pipeline(
+    websocket: WebSocket,
+    stream_sid: str,
+    call_sid: str,
+    config,
+    llm_config: ClaudeLLMServiceConfig,
+    voice_id: str,
+) -> None:
+    """Create and run the Twilio voice pipeline.
+    Assembles: transport.input -> STT -> stop_phrase -> user_aggregator
+    -> claude_llm -> narration -> TTS -> transport.output
+    Args:
+        websocket: Active FastAPI WebSocket connection
+        stream_sid: Twilio stream identifier
+        call_sid: Twilio call SID
+        config: Voice server configuration
+        llm_config: Claude LLM service configuration
+        voice_id: ElevenLabs voice ID
+    """
+    serializer = TwilioFrameSerializer(stream_sid=stream_sid, call_sid=call_sid)
+    transport = FastAPIWebsocketTransport(
+        websocket=websocket,
+        params=FastAPIWebsocketParams(
+            audio_in_enabled=True,
+            audio_out_enabled=True,
+            audio_in_sample_rate=8000,
+            audio_out_sample_rate=8000,
+            vad_enabled=True,
+            vad_audio_passthrough=True,
+            serializer=serializer,
+        ),
+    )
+    async with aiohttp.ClientSession() as session:
+        # STT
+        stt = ElevenLabsSTTService(
+            api_key=config.elevenlabs_api_key,
+            aiohttp_session=session,
+            model=config.elevenlabs_stt_model,
+        )
+        # TTS
+        tts = ElevenLabsTTSService(
+            api_key=config.elevenlabs_api_key,
+            voice_id=voice_id,
+            model=config.elevenlabs_tts_model,
+        )
+        # Claude LLM
+        claude_llm = ClaudeLLMService(config=llm_config)
+        # Processors
+        stop_phrase = StopPhraseProcessor()
+        narration = NarrationProcessor()
+        # Context aggregator
+        context = OpenAILLMContext(messages=[], tools=[])
+        context_aggregator = claude_llm.create_context_aggregator(context)
+        # Pipeline
+        pipeline = Pipeline(
+            [
+                transport.input(),
+                stt,
+                stop_phrase,
+                context_aggregator.user(),
+                claude_llm,
+                narration,
+                tts,
+                transport.output(),
+            ]
+        )
+        task = PipelineTask(
+            pipeline,
+            params=PipelineParams(allow_interruptions=True),
+        )
+        runner = PipelineRunner()
+        await runner.run(task)