npm - voicecc - Versions diffs - 1.2.2 → 1.2.4 - Mend

voicecc 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/bin/voicecc.js +92 -68
package/package.json +2 -1
package/voice-server/.python-version +1 -0
package/voice-server/claude_llm_service.py +333 -0
package/voice-server/claude_session.py +312 -0
package/voice-server/config.py +340 -0
package/voice-server/dev-server-start.sh +128 -0
package/voice-server/heartbeat.py +505 -0
package/voice-server/narration_processor.py +140 -0
package/voice-server/requirements.txt +8 -0
package/voice-server/server.py +335 -0
package/voice-server/stop_phrase_processor.py +50 -0
package/voice-server/twilio_pipeline.py +237 -0
package/voice-server/voice_pipeline.py +147 -0

package/voice-server/claude_session.py ADDED Viewed

@@ -0,0 +1,312 @@
+"""
+Text chat session manager for the Python voice server.
+Port of chat-server.ts + claude-session.ts. Manages ClaudeSDKClient lifecycle
+for text chat: lazy creation on first message, multi-turn reuse, inactivity
+cleanup after 10 minutes.
+Responsibilities:
+- Create and reuse ClaudeSDKClient sessions keyed by device token
+- Stream Claude responses as ChatSseEvent async generators
+- Enforce max concurrent sessions
+- Auto-cleanup inactive sessions on a 60-second timer
+"""
+import asyncio
+import logging
+import time
+from dataclasses import dataclass, field
+from claude_agent_sdk import (
+    AssistantMessage,
+    ClaudeAgentOptions,
+    ClaudeSDKClient,
+    ResultMessage,
+    TextBlock,
+    ToolUseBlock,
+)
+from config import build_system_prompt, load_config, DEFAULT_AGENTS_DIR
+logger = logging.getLogger(__name__)
+# ============================================================================
+# CONSTANTS
+# ============================================================================
+# Idle time (seconds) after which cleanup_inactive() closes a session.
+INACTIVITY_TIMEOUT_SECONDS = 600  # 10 minutes
+# Pause (seconds) between cleanup passes in _cleanup_loop().
+CLEANUP_INTERVAL_SECONDS = 60
+# ============================================================================
+# TYPES
+# ============================================================================
+@dataclass
+class ChatSseEvent:
+    """SSE event sent to the client during text chat streaming.
+    Attributes:
+        type: Event type ("text_delta", "tool_start", "tool_end", "result", "error")
+        content: Text content or error message
+        tool_name: Tool name (only for tool_start events)
+    """
+    type: str
+    content: str
+    tool_name: str | None = None
+    def to_dict(self) -> dict:
+        """Serialize to a JSON-safe dict, omitting None fields."""
+        d: dict = {"type": self.type, "content": self.content}
+        if self.tool_name is not None:
+            d["toolName"] = self.tool_name
+        return d
+@dataclass
+class ChatSession:
+    """Tracks an active text chat session held in the module-level registry.
+    Attributes:
+        session_key: Device token used as the session key
+        client: Persistent ClaudeSDKClient reused across turns of the chat
+        agent_id: Optional agent identifier for agent-specific prompts
+        streaming: Whether the session is currently streaming a response
+            (set by stream_message; cleared when it finishes or by interrupt_session)
+        last_activity: Unix timestamp of last activity, compared against the
+            inactivity timeout by cleanup_inactive; defaults to time.time() at creation
+    """
+    session_key: str
+    client: ClaudeSDKClient
+    agent_id: str | None = None
+    streaming: bool = False
+    last_activity: float = field(default_factory=time.time)
+# ============================================================================
+# STATE
+# ============================================================================
+# Registry of live chat sessions, keyed by session key (device token).
+_active_sessions: dict[str, ChatSession] = {}
+# Handle of the background cleanup task; None while the timer is not running.
+_cleanup_task: asyncio.Task | None = None
+# ============================================================================
+# MAIN HANDLERS
+# ============================================================================
+async def get_or_create_session(session_key: str, agent_id: str | None = None) -> ChatSession:
+    """Get an existing chat session or create a new one.
+    On first call for a session_key, creates a ClaudeSDKClient with the
+    appropriate system prompt. Subsequent calls return the existing session.
+    Enforces max concurrent sessions from config.
+    Args:
+        session_key: Device token to key the session on
+        agent_id: Optional agent ID for agent-specific prompts
+    Returns:
+        The active ChatSession
+    Raises:
+        RuntimeError: If max concurrent sessions exceeded
+    """
+    existing = _active_sessions.get(session_key)
+    if existing:
+        existing.last_activity = time.time()
+        return existing
+    config = load_config()
+    if len(_active_sessions) >= config.max_concurrent_sessions:
+        raise RuntimeError(
+            f"Max concurrent sessions ({config.max_concurrent_sessions}) reached"
+        )
+    system_prompt = build_system_prompt(agent_id, "text")
+    # Determine working directory
+    import os
+    cwd = config.default_cwd
+    if agent_id:
+        agent_dir = os.path.join(DEFAULT_AGENTS_DIR, agent_id)
+        if os.path.isdir(agent_dir):
+            cwd = agent_dir
+    options = ClaudeAgentOptions(
+        system_prompt=system_prompt,
+        cwd=cwd,
+        allowed_tools=[],
+        permission_mode="bypassPermissions",
+        include_partial_messages=True,
+        max_thinking_tokens=10000,
+    )
+    client = ClaudeSDKClient(options=options)
+    await client.connect()
+    session = ChatSession(
+        session_key=session_key,
+        client=client,
+        agent_id=agent_id,
+    )
+    _active_sessions[session_key] = session
+    logger.info(f"[chat] Session created, key: {session_key}")
+    return session
+async def stream_message(session_key: str, text: str):
+    """Send a user message and yield SSE events from Claude's response.
+    Guards against concurrent streaming on the same session. Yields
+    ChatSseEvent objects for each streaming event from Claude.
+    Args:
+        session_key: Device token identifying the session
+        text: User message text
+    Yields:
+        ChatSseEvent objects for each streaming event
+    Raises:
+        RuntimeError: If no active session or already streaming
+    """
+    session = _active_sessions.get(session_key)
+    if not session:
+        raise RuntimeError("No active session")
+    if session.streaming:
+        raise RuntimeError("ALREADY_STREAMING")
+    session.last_activity = time.time()
+    session.streaming = True
+    try:
+        await session.client.query(text)
+        async for msg in session.client.receive_response():
+            if isinstance(msg, AssistantMessage):
+                for block in msg.content:
+                    if isinstance(block, TextBlock) and block.text:
+                        yield ChatSseEvent(type="text_delta", content=block.text)
+                    elif isinstance(block, ToolUseBlock):
+                        yield ChatSseEvent(
+                            type="tool_start", content="", tool_name=block.name
+                        )
+            elif isinstance(msg, ResultMessage):
+                if msg.is_error:
+                    yield ChatSseEvent(
+                        type="error", content=msg.subtype or "Unknown error"
+                    )
+                break
+        yield ChatSseEvent(type="result", content="")
+    except Exception as e:
+        logger.error(f"[chat] Stream error for {session_key}: {e}")
+        yield ChatSseEvent(type="error", content=str(e))
+    finally:
+        session.streaming = False
+        session.last_activity = time.time()
+async def close_session(session_key: str) -> None:
+    """Close a chat session, disconnecting the Claude client.
+    Args:
+        session_key: Device token identifying the session
+    """
+    session = _active_sessions.pop(session_key, None)
+    if not session:
+        return
+    try:
+        await session.client.disconnect()
+    except Exception as e:
+        logger.warning(f"[chat] Error disconnecting session {session_key}: {e}")
+    logger.info(f"[chat] Session closed, key: {session_key}")
+async def interrupt_session(session_key: str) -> bool:
+    """Interrupt the current streaming response for a session.
+    Args:
+        session_key: Device token identifying the session
+    Returns:
+        True if a streaming session was interrupted, False otherwise
+    """
+    session = _active_sessions.get(session_key)
+    if not session or not session.streaming:
+        return False
+    try:
+        await session.client.interrupt()
+    except Exception as e:
+        logger.warning(f"[chat] Interrupt error for {session_key}: {e}")
+    session.streaming = False
+    session.last_activity = time.time()
+    logger.info(f"[chat] Session interrupted, key: {session_key}")
+    return True
+def has_session(session_key: str) -> bool:
+    """Check if a session exists for the given key.
+    Args:
+        session_key: Device token to check
+    Returns:
+        True if a session exists
+    """
+    return session_key in _active_sessions
+async def cleanup_inactive() -> None:
+    """Close sessions that have been inactive for 10+ minutes.
+    Called on a periodic timer. Safe to call concurrently.
+    """
+    now = time.time()
+    stale_keys = [
+        key
+        for key, session in _active_sessions.items()
+        if now - session.last_activity > INACTIVITY_TIMEOUT_SECONDS
+    ]
+    for key in stale_keys:
+        logger.info(f"[chat] Session timed out due to inactivity, key: {key}")
+        await close_session(key)
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
+async def _cleanup_loop() -> None:
+    """Background loop that runs cleanup_inactive every 60 seconds."""
+    while True:
+        await asyncio.sleep(CLEANUP_INTERVAL_SECONDS)
+        try:
+            await cleanup_inactive()
+        except Exception as e:
+            logger.error(f"[chat] Cleanup error: {e}")
+def start_cleanup_timer() -> None:
+    """Start the background cleanup timer. Call once at server startup."""
+    global _cleanup_task
+    if _cleanup_task is None:
+        _cleanup_task = asyncio.create_task(_cleanup_loop())
+        logger.info("[chat] Inactivity cleanup timer started")
+def stop_cleanup_timer() -> None:
+    """Stop the background cleanup timer."""
+    global _cleanup_task
+    if _cleanup_task is not None:
+        _cleanup_task.cancel()
+        _cleanup_task = None

package/voice-server/config.py ADDED Viewed

@@ -0,0 +1,340 @@
+"""
+Configuration, environment loading, prompt builder, and agent loader for the voice server.
+Ports the TypeScript env.ts + prompt-builder.ts + agent-store.ts patterns to Python.
+Responsibilities:
+- Load environment variables from ~/.voicecc/.env
+- Build system prompts with mode overlays and agent files
+- Load agent config from ~/.claude-voice-agents/<agentId>/
+- Provide typed VoiceServerConfig dataclass
+"""
+import json
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from dotenv import load_dotenv
+# ============================================================================
+# CONSTANTS
+# ============================================================================
+# Directory whose .env file load_config() reads (overridable via VOICECC_DIR).
+DEFAULT_VOICECC_DIR = os.path.join(os.path.expanduser("~"), ".voicecc")
+# Directory holding one subdirectory per agent.
+DEFAULT_AGENTS_DIR = os.path.join(os.path.expanduser("~"), ".claude-voice-agents")
+# Voice returned by get_agent_voice_id() for an agent without an ElevenLabs voice.
+DEFAULT_AGENT_VOICE_ID = "IKne3meq5aSn9XLyUdCD"  # Charlie
+# Voice used when no agent is given, and the default for ELEVENLABS_VOICE_ID.
+DEFAULT_NON_AGENT_VOICE_ID = "WrjxnKxK0m1uiaH0uteU"
+# Defaults for ELEVENLABS_MODEL_ID and ELEVENLABS_STT_MODEL_ID.
+DEFAULT_TTS_MODEL = "eleven_turbo_v2_5"
+DEFAULT_STT_MODEL = "scribe_v1"
+# Defaults for WEBRTC_PORT and API_PORT.
+DEFAULT_WEBRTC_PORT = 7860
+DEFAULT_API_PORT = 7861
+# Not read by load_config() in this file.
+DEFAULT_TWILIO_PORT = 8080
+# Default for MAX_CONCURRENT_SESSIONS.
+DEFAULT_MAX_CONCURRENT_SESSIONS = 2
+# Project root is the parent of voice-server/
+PROJECT_ROOT = str(Path(__file__).resolve().parent.parent)
+# Directory the prompt templates (system.md and the overlay files) are read from.
+DEFAULTS_DIR = os.path.join(PROJECT_ROOT, "init", "defaults")
+# ============================================================================
+# TYPES
+# ============================================================================
+@dataclass
+class VoicePreference:
+    """Voice preference for a TTS provider.
+    Attributes:
+        id: Provider voice identifier (config.json key "id")
+        name: Voice name (config.json key "name")
+    """
+    id: str
+    name: str
+@dataclass
+class AgentVoiceConfig:
+    """Per-provider voice preferences.
+    Attributes:
+        elevenlabs: Voice for ElevenLabs, or None when not configured
+        local: Voice for the local provider, or None; the config parser in
+            this file never sets it
+    """
+    elevenlabs: VoicePreference | None = None
+    local: VoicePreference | None = None
+@dataclass
+class AgentConfig:
+    """Configuration stored in config.json for each agent.
+    Attributes:
+        heartbeat_interval_minutes: JSON key "heartbeatIntervalMinutes" (default 10)
+        heartbeat_timeout_minutes: JSON key "heartbeatTimeoutMinutes" (default None)
+        enabled: JSON key "enabled" (default True); list_agents() returns only enabled agents
+        voice: Parsed "voice" object, or None when the key is absent
+    """
+    heartbeat_interval_minutes: int = 10
+    heartbeat_timeout_minutes: int | None = None
+    enabled: bool = True
+    voice: AgentVoiceConfig | None = None
+@dataclass
+class Agent:
+    """Full agent data including all file contents.
+    Attributes:
+        id: Agent identifier (name of the agent's directory)
+        soul_md: Contents of SOUL.md (empty string if the file is missing)
+        memory_md: Contents of MEMORY.md (empty string if the file is missing)
+        heartbeat_md: Contents of HEARTBEAT.md (empty string if the file is missing)
+        config: Parsed config.json
+    """
+    id: str
+    soul_md: str
+    memory_md: str
+    heartbeat_md: str
+    config: AgentConfig
+@dataclass
+class VoiceServerConfig:
+    """Typed configuration for the voice server.
+    Built by load_config(); the environment variable behind each field is shown
+    in parentheses.
+    Attributes:
+        webrtc_port: (WEBRTC_PORT)
+        api_port: (API_PORT)
+        tunnel_url: (TUNNEL_URL) None when unset
+        elevenlabs_api_key: (ELEVENLABS_API_KEY) required
+        elevenlabs_voice_id: (ELEVENLABS_VOICE_ID)
+        elevenlabs_tts_model: (ELEVENLABS_MODEL_ID)
+        elevenlabs_stt_model: (ELEVENLABS_STT_MODEL_ID)
+        agents_dir: (AGENTS_DIR)
+        default_cwd: (DEFAULT_CWD) defaults to the user's home directory
+        project_root: Always PROJECT_ROOT; not read from the environment
+        twilio_account_sid: (TWILIO_ACCOUNT_SID) empty string when unset
+        twilio_auth_token: (TWILIO_AUTH_TOKEN) empty string when unset
+        user_phone_number: (USER_PHONE_NUMBER) empty string when unset
+        max_concurrent_sessions: (MAX_CONCURRENT_SESSIONS)
+    """
+    webrtc_port: int
+    api_port: int
+    tunnel_url: str | None
+    elevenlabs_api_key: str
+    elevenlabs_voice_id: str
+    elevenlabs_tts_model: str
+    elevenlabs_stt_model: str
+    agents_dir: str
+    default_cwd: str
+    project_root: str
+    twilio_account_sid: str
+    twilio_auth_token: str
+    user_phone_number: str
+    max_concurrent_sessions: int
+# ============================================================================
+# MAIN HANDLERS
+# ============================================================================
+def load_config() -> VoiceServerConfig:
+    """Load environment variables from ~/.voicecc/.env and return a typed config.
+    Reads .env using python-dotenv, then extracts all required values.
+    Fails fast if ELEVENLABS_API_KEY is missing.
+    Returns:
+        VoiceServerConfig with all settings populated
+    """
+    voicecc_dir = os.environ.get("VOICECC_DIR", DEFAULT_VOICECC_DIR)
+    env_path = os.path.join(voicecc_dir, ".env")
+    load_dotenv(env_path)
+    api_key = os.environ.get("ELEVENLABS_API_KEY", "")
+    if not api_key:
+        raise ValueError("ELEVENLABS_API_KEY is required in ~/.voicecc/.env")
+    return VoiceServerConfig(
+        webrtc_port=int(os.environ.get("WEBRTC_PORT", str(DEFAULT_WEBRTC_PORT))),
+        api_port=int(os.environ.get("API_PORT", str(DEFAULT_API_PORT))),
+        tunnel_url=os.environ.get("TUNNEL_URL"),
+        elevenlabs_api_key=api_key,
+        elevenlabs_voice_id=os.environ.get("ELEVENLABS_VOICE_ID", DEFAULT_NON_AGENT_VOICE_ID),
+        elevenlabs_tts_model=os.environ.get("ELEVENLABS_MODEL_ID", DEFAULT_TTS_MODEL),
+        elevenlabs_stt_model=os.environ.get("ELEVENLABS_STT_MODEL_ID", DEFAULT_STT_MODEL),
+        agents_dir=os.environ.get("AGENTS_DIR", DEFAULT_AGENTS_DIR),
+        default_cwd=os.environ.get("DEFAULT_CWD", os.path.expanduser("~")),
+        project_root=PROJECT_ROOT,
+        twilio_account_sid=os.environ.get("TWILIO_ACCOUNT_SID", ""),
+        twilio_auth_token=os.environ.get("TWILIO_AUTH_TOKEN", ""),
+        user_phone_number=os.environ.get("USER_PHONE_NUMBER", ""),
+        max_concurrent_sessions=int(
+            os.environ.get("MAX_CONCURRENT_SESSIONS") or DEFAULT_MAX_CONCURRENT_SESSIONS
+        ),
+    )
+def build_system_prompt(agent_id: str | None, overlay: str) -> str:
+    """Build a complete system prompt with mode overlay and optional agent files.
+    Reads the base system.md template, replaces <<MODE_OVERLAY>> with the
+    given overlay, and if agent_id is provided, injects SOUL/MEMORY/HEARTBEAT
+    files and the agent directory path.
+    Args:
+        agent_id: Agent identifier, or None for default prompt
+        overlay: "voice" or "text" -- selects the overlay file
+    Returns:
+        Complete system prompt string
+    """
+    base_template = _read_template("system.md")
+    overlay_content = _read_overlay(overlay)
+    prompt = base_template.replace("<<MODE_OVERLAY>>", overlay_content)
+    if agent_id:
+        agent = load_agent(agent_id)
+        agent_dir = os.path.join(DEFAULT_AGENTS_DIR, agent_id)
+        agent_files = "\n\n".join([
+            f"<SOUL.md>\n{agent.soul_md}\n</SOUL.md>",
+            f"<HEARTBEAT.md>\n{agent.heartbeat_md}\n</HEARTBEAT.md>",
+            f"<MEMORY.md>\n{agent.memory_md}\n</MEMORY.md>",
+        ])
+        prompt = prompt.replace("<<AGENT_DIR>>", agent_dir)
+        prompt = prompt.replace("<<AGENT_FILES>>", agent_files)
+    return prompt
+def load_agent(agent_id: str) -> Agent:
+    """Read agent data from ~/.claude-voice-agents/<agentId>/.
+    Reads SOUL.md, MEMORY.md, HEARTBEAT.md, and config.json.
+    Fails fast if the agent directory does not exist.
+    Args:
+        agent_id: Agent identifier
+    Returns:
+        Agent with all file contents loaded
+    """
+    agent_dir = os.path.join(DEFAULT_AGENTS_DIR, agent_id)
+    if not os.path.isdir(agent_dir):
+        raise FileNotFoundError(f'Agent "{agent_id}" not found at {agent_dir}')
+    soul_md = _read_file(os.path.join(agent_dir, "SOUL.md"))
+    memory_md = _read_file(os.path.join(agent_dir, "MEMORY.md"))
+    heartbeat_md = _read_file(os.path.join(agent_dir, "HEARTBEAT.md"))
+    config = _read_agent_config(os.path.join(agent_dir, "config.json"))
+    return Agent(
+        id=agent_id,
+        soul_md=soul_md,
+        memory_md=memory_md,
+        heartbeat_md=heartbeat_md,
+        config=config,
+    )
+def list_agents(agents_dir: str | None = None) -> list[Agent]:
+    """List all agents that have heartbeat enabled.
+    Scans the agents directory for subdirectories with config.json,
+    returns only those with enabled=True.
+    Args:
+        agents_dir: Override agents directory path (defaults to DEFAULT_AGENTS_DIR)
+    Returns:
+        List of Agent objects with enabled=True
+    """
+    dir_path = agents_dir or DEFAULT_AGENTS_DIR
+    if not os.path.isdir(dir_path):
+        return []
+    agents: list[Agent] = []
+    for entry in os.listdir(dir_path):
+        entry_path = os.path.join(dir_path, entry)
+        if not os.path.isdir(entry_path):
+            continue
+        config_path = os.path.join(entry_path, "config.json")
+        if not os.path.isfile(config_path):
+            continue
+        try:
+            agent = load_agent(entry)
+            if agent.config.enabled:
+                agents.append(agent)
+        except Exception as e:
+            print(f"[config] Skipping agent {entry}: {e}")
+    return agents
+def get_agent_voice_id(agent_id: str | None) -> str:
+    """Get the ElevenLabs voice ID for an agent, falling back to defaults.
+    Args:
+        agent_id: Agent identifier, or None
+    Returns:
+        ElevenLabs voice ID string
+    """
+    if not agent_id:
+        return DEFAULT_NON_AGENT_VOICE_ID
+    try:
+        agent = load_agent(agent_id)
+        if agent.config.voice and agent.config.voice.elevenlabs:
+            return agent.config.voice.elevenlabs.id
+    except FileNotFoundError:
+        pass
+    return DEFAULT_AGENT_VOICE_ID
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
+def _read_file(path: str) -> str:
+    """Read a file and return its contents as a string.
+    Args:
+        path: Absolute path to the file
+    Returns:
+        File contents, or empty string if file does not exist
+    """
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            return f.read().strip()
+    except FileNotFoundError:
+        return ""
+def _read_template(filename: str) -> str:
+    """Read a template file from init/defaults/.
+    Args:
+        filename: Name of the template file
+    Returns:
+        Template contents
+    """
+    path = os.path.join(DEFAULTS_DIR, filename)
+    content = _read_file(path)
+    if not content:
+        raise FileNotFoundError(f"Template not found: {path}")
+    return content
+def _read_overlay(overlay: str) -> str:
+    """Read a mode overlay file (voice or text).
+    Args:
+        overlay: "voice" or "text"
+    Returns:
+        Overlay file contents
+    """
+    filename_map = {
+        "voice": "system-voice-overlay.md",
+        "text": "system-text-overlay.md",
+    }
+    filename = filename_map.get(overlay)
+    if not filename:
+        raise ValueError(f'Unknown overlay mode: "{overlay}". Expected "voice" or "text".')
+    return _read_template(filename)
+def _read_agent_config(config_path: str) -> AgentConfig:
+    """Parse an agent's config.json into an AgentConfig dataclass.
+    Args:
+        config_path: Path to config.json
+    Returns:
+        Parsed AgentConfig
+    """
+    with open(config_path, "r", encoding="utf-8") as f:
+        raw = json.load(f)
+    voice_config = None
+    if "voice" in raw:
+        voice_raw = raw["voice"]
+        elevenlabs = None
+        if "elevenlabs" in voice_raw:
+            el = voice_raw["elevenlabs"]
+            elevenlabs = VoicePreference(id=el["id"], name=el["name"])
+        voice_config = AgentVoiceConfig(elevenlabs=elevenlabs)
+    return AgentConfig(
+        heartbeat_interval_minutes=raw.get("heartbeatIntervalMinutes", 10),
+        heartbeat_timeout_minutes=raw.get("heartbeatTimeoutMinutes"),
+        enabled=raw.get("enabled", True),
+        voice=voice_config,
+    )