atom-audio-engine 0.1.1-py3-none-any.whl → 0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {atom_audio_engine-0.1.1.dist-info → atom_audio_engine-0.1.2.dist-info}/METADATA +1 -1
- atom_audio_engine-0.1.2.dist-info/RECORD +57 -0
- audio_engine/asr/__init__.py +45 -0
- audio_engine/asr/base.py +89 -0
- audio_engine/asr/cartesia.py +356 -0
- audio_engine/asr/deepgram.py +196 -0
- audio_engine/core/__init__.py +13 -0
- audio_engine/core/config.py +162 -0
- audio_engine/core/pipeline.py +282 -0
- audio_engine/core/types.py +87 -0
- audio_engine/examples/__init__.py +1 -0
- audio_engine/examples/basic_stt_llm_tts.py +200 -0
- audio_engine/examples/geneface_animation.py +99 -0
- audio_engine/examples/personaplex_pipeline.py +116 -0
- audio_engine/examples/websocket_server.py +86 -0
- audio_engine/integrations/__init__.py +5 -0
- audio_engine/integrations/geneface.py +297 -0
- audio_engine/llm/__init__.py +38 -0
- audio_engine/llm/base.py +108 -0
- audio_engine/llm/groq.py +210 -0
- audio_engine/pipelines/__init__.py +1 -0
- audio_engine/pipelines/personaplex/__init__.py +41 -0
- audio_engine/pipelines/personaplex/client.py +259 -0
- audio_engine/pipelines/personaplex/config.py +69 -0
- audio_engine/pipelines/personaplex/pipeline.py +301 -0
- audio_engine/pipelines/personaplex/types.py +173 -0
- audio_engine/pipelines/personaplex/utils.py +192 -0
- audio_engine/scripts/debug_pipeline.py +79 -0
- audio_engine/scripts/debug_tts.py +162 -0
- audio_engine/scripts/test_cartesia_connect.py +57 -0
- audio_engine/streaming/__init__.py +5 -0
- audio_engine/streaming/websocket_server.py +341 -0
- audio_engine/tests/__init__.py +1 -0
- audio_engine/tests/test_personaplex/__init__.py +1 -0
- audio_engine/tests/test_personaplex/test_personaplex.py +10 -0
- audio_engine/tests/test_personaplex/test_personaplex_client.py +259 -0
- audio_engine/tests/test_personaplex/test_personaplex_config.py +71 -0
- audio_engine/tests/test_personaplex/test_personaplex_message.py +80 -0
- audio_engine/tests/test_personaplex/test_personaplex_pipeline.py +226 -0
- audio_engine/tests/test_personaplex/test_personaplex_session.py +184 -0
- audio_engine/tests/test_personaplex/test_personaplex_transcript.py +184 -0
- audio_engine/tests/test_traditional_pipeline/__init__.py +1 -0
- audio_engine/tests/test_traditional_pipeline/test_cartesia_asr.py +474 -0
- audio_engine/tests/test_traditional_pipeline/test_config_env.py +97 -0
- audio_engine/tests/test_traditional_pipeline/test_conversation_context.py +115 -0
- audio_engine/tests/test_traditional_pipeline/test_pipeline_creation.py +64 -0
- audio_engine/tests/test_traditional_pipeline/test_pipeline_with_mocks.py +173 -0
- audio_engine/tests/test_traditional_pipeline/test_provider_factories.py +61 -0
- audio_engine/tests/test_traditional_pipeline/test_websocket_server.py +58 -0
- audio_engine/tts/__init__.py +37 -0
- audio_engine/tts/base.py +155 -0
- audio_engine/tts/cartesia.py +392 -0
- audio_engine/utils/__init__.py +15 -0
- audio_engine/utils/audio.py +220 -0
- atom_audio_engine-0.1.1.dist-info/RECORD +0 -5
- {atom_audio_engine-0.1.1.dist-info → atom_audio_engine-0.1.2.dist-info}/WHEEL +0 -0
- {atom_audio_engine-0.1.1.dist-info → atom_audio_engine-0.1.2.dist-info}/top_level.txt +0 -0

audio_engine/pipelines/personaplex/client.py
@@ -0,0 +1,259 @@
+"""Low-level WebSocket client for PersonaPlex."""
+
+import asyncio
+import logging
+from typing import AsyncIterator, Optional
+
+import websockets
+from websockets.asyncio.client import ClientConnection
+
+from .config import PersonaPlexConfig
+from .types import MessageType, PersonaPlexMessage, AudioChunk, TextChunk
+
+logger = logging.getLogger(__name__)
+
+
+class PersonaPlexClient:
+    """
+    WebSocket client for PersonaPlex speech-to-speech model.
+
+    Handles binary message encoding/decoding, Opus audio streaming,
+    and bidirectional text/audio communication.
+
+    Approach:
+    - Connect to WebSocket URL with query parameters (text_prompt, voice_prompt, etc.)
+    - Send Opus audio chunks with 0x01 prefix
+    - Receive Opus audio and text tokens asynchronously
+    - Handle connection lifecycle: connect, send, receive, disconnect
+    """
+
+    def __init__(self, config: PersonaPlexConfig):
+        """
+        Initialize PersonaPlex WebSocket client.
+
+        Args:
+            config: PersonaPlexConfig with server URL and model parameters
+        """
+        self.config = config
+        self.connection: Optional[ClientConnection] = None
+        self._is_connected = False
+
+    def _build_url(self, system_prompt: str) -> str:
+        """
+        Build WebSocket URL with query parameters.
+
+        Args:
+            system_prompt: Text prompt for controlling persona/behavior
+
+        Returns:
+            Full WebSocket URL with encoded parameters
+        """
+        url = self.config.server_url
+        params = {
+            "text_prompt": system_prompt,
+            "voice_prompt": self.config.voice_prompt,
+            "text_temperature": str(self.config.text_temperature),
+            "audio_temperature": str(self.config.audio_temperature),
+            "text_topk": str(self.config.text_topk),
+            "audio_topk": str(self.config.audio_topk),
+        }
+
+        # URL-encode and append parameters
+        param_str = "&".join(f"{k}={v}" for k, v in params.items())
+        return f"{url}?{param_str}"
+
+    async def connect(self, system_prompt: str) -> None:
+        """
+        Connect to PersonaPlex WebSocket server.
+
+        Args:
+            system_prompt: System prompt for persona control
+
+        Raises:
+            ConnectionError: If connection fails
+        """
+        if self._is_connected:
+            logger.warning("Already connected, skipping reconnect")
+            return
+
+        try:
+            url = self._build_url(system_prompt)
+            logger.debug(f"Connecting to PersonaPlex at {url}")
+
+            self.connection = await websockets.connect(
+                url,
+                ping_interval=30,  # Send ping every 30s to keep connection alive
+                ping_timeout=10,
+            )
+            self._is_connected = True
+            logger.info("Connected to PersonaPlex server")
+
+        except Exception as e:
+            logger.error(f"Failed to connect to PersonaPlex: {e}")
+            raise ConnectionError(f"PersonaPlex connection failed: {e}") from e
+
+    async def disconnect(self) -> None:
+        """Close WebSocket connection."""
+        if self.connection and self._is_connected:
+            try:
+                await self.connection.close()
+                logger.info("Disconnected from PersonaPlex server")
+            except Exception as e:
+                logger.error(f"Error closing connection: {e}")
+            finally:
+                self.connection = None
+                self._is_connected = False
+
+    async def send_audio(self, audio_chunk: bytes) -> None:
+        """
+        Send Opus-encoded audio chunk to server.
+
+        Args:
+            audio_chunk: Raw Opus-encoded audio bytes
+
+        Raises:
+            RuntimeError: If not connected
+        """
+        if not self._is_connected or not self.connection:
+            raise RuntimeError("Not connected to PersonaPlex server")
+
+        try:
+            # Message format: 0x01 (audio type) + Opus bytes
+            message = MessageType.AUDIO.value.to_bytes(1, "big") + audio_chunk
+            await self.connection.send(message)
+        except Exception as e:
+            logger.error(f"Failed to send audio: {e}")
+            raise
+
+    async def receive_audio(self) -> Optional[AudioChunk]:
+        """
+        Receive next audio chunk from server.
+
+        Returns:
+            AudioChunk with Opus data, or None if disconnected
+
+        Raises:
+            RuntimeError: If not connected
+        """
+        if not self._is_connected or not self.connection:
+            raise RuntimeError("Not connected to PersonaPlex server")
+
+        try:
+            message = await asyncio.wait_for(
+                self.connection.recv(),
+                timeout=self.config.session_timeout_seconds,
+            )
+
+            if isinstance(message, bytes):
+                parsed = PersonaPlexMessage.decode(message)
+                if parsed.type == MessageType.AUDIO:
+                    return AudioChunk(
+                        data=parsed.data,  # type: ignore
+                        sample_rate=self.config.sample_rate,
+                    )
+                elif parsed.type == MessageType.ERROR:
+                    error_msg = (
+                        parsed.data.decode("utf-8")
+                        if isinstance(parsed.data, bytes)
+                        else parsed.data
+                    )
+                    logger.error(f"Server error: {error_msg}")
+                    return None
+        except asyncio.TimeoutError:
+            logger.warning("Timeout waiting for audio from server")
+            return None
+        except Exception as e:
+            logger.error(f"Error receiving audio: {e}")
+            raise
+
+        return None
+
+    async def receive_text(self) -> Optional[TextChunk]:
+        """
+        Receive next text token from server.
+
+        Returns:
+            TextChunk with text data, or None if no text available
+
+        Raises:
+            RuntimeError: If not connected
+        """
+        if not self._is_connected or not self.connection:
+            raise RuntimeError("Not connected to PersonaPlex server")
+
+        try:
+            message = await asyncio.wait_for(
+                self.connection.recv(),
+                timeout=self.config.session_timeout_seconds,
+            )
+
+            if isinstance(message, bytes):
+                parsed = PersonaPlexMessage.decode(message)
+                if parsed.type == MessageType.TEXT:
+                    return TextChunk(text=parsed.data)  # type: ignore
+        except asyncio.TimeoutError:
+            logger.warning("Timeout waiting for text from server")
+            return None
+        except Exception as e:
+            logger.error(f"Error receiving text: {e}")
+            raise
+
+        return None
+
+    async def receive_any(self) -> Optional[PersonaPlexMessage]:
+        """
+        Receive next message of any type from server.
+
+        Returns:
+            PersonaPlexMessage, or None on timeout/error
+        """
+        if not self._is_connected or not self.connection:
+            raise RuntimeError("Not connected to PersonaPlex server")
+
+        try:
+            message = await asyncio.wait_for(
+                self.connection.recv(),
+                timeout=self.config.session_timeout_seconds,
+            )
+
+            if isinstance(message, bytes):
+                return PersonaPlexMessage.decode(message)
+        except asyncio.TimeoutError:
+            return None
+        except Exception as e:
+            logger.error(f"Error receiving message: {e}")
+            raise
+
+        return None
+
+    async def stream_messages(self) -> AsyncIterator[PersonaPlexMessage]:
+        """
+        Stream all messages from server until disconnection.
+
+        Yields:
+            PersonaPlexMessage objects as they arrive
+        """
+        if not self._is_connected or not self.connection:
+            raise RuntimeError("Not connected to PersonaPlex server")
+
+        try:
+            async for message in self.connection:
+                if isinstance(message, bytes):
+                    parsed = PersonaPlexMessage.decode(message)
+                    yield parsed
+        except Exception as e:
+            logger.error(f"Error in message stream: {e}")
+            raise
+
+    @property
+    def is_connected(self) -> bool:
+        """Check if currently connected."""
+        return self._is_connected and self.connection is not None
+
+    async def __aenter__(self) -> "PersonaPlexClient":
+        """Async context manager entry."""
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self.disconnect()
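
For reference, a minimal usage sketch of the client above, assuming a reachable PersonaPlex server and an already Opus-encoded frame (`opus_frame` is a placeholder, not something the package provides):

```python
import asyncio

from audio_engine.pipelines.personaplex.client import PersonaPlexClient
from audio_engine.pipelines.personaplex.config import PersonaPlexConfig


async def main() -> None:
    config = PersonaPlexConfig()        # defaults: NATF0.pt voice, 48 kHz Opus
    client = PersonaPlexClient(config)

    await client.connect(system_prompt="You are a concise assistant.")
    try:
        opus_frame = b"..."                    # placeholder: one Opus-encoded audio frame
        await client.send_audio(opus_frame)    # framed as 0x01 prefix + Opus bytes
        reply = await client.receive_audio()   # next AudioChunk, or None on timeout/error
        if reply is not None:
            print(f"received {len(reply.data)} bytes at {reply.sample_rate} Hz")
    finally:
        await client.disconnect()


asyncio.run(main())
```

Note that `_build_url` joins query-parameter values verbatim; prompts containing `&`, `=`, or spaces may need to be percent-encoded by the caller (for example with `urllib.parse.quote_plus`) before being passed to `connect()`.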

audio_engine/pipelines/personaplex/config.py
@@ -0,0 +1,69 @@
+"""Configuration for PersonaPlex speech-to-speech pipeline."""
+
+from dataclasses import dataclass, field
+from typing import Optional
+from pathlib import Path
+
+
+@dataclass
+class PersonaPlexConfig:
+    """
+    Configuration for PersonaPlex full-duplex speech-to-speech model.
+
+    PersonaPlex is a real-time, full-duplex conversational speech model
+    that handles audio input/output simultaneously with optional text streaming.
+
+    Attributes:
+        server_url: WebSocket URL to PersonaPlex server (default: official RunPod deployment)
+        voice_prompt: Voice preset name (e.g., "NATF0.pt", "NATM1.pt")
+            See: https://github.com/NVIDIA/personaplex#voices
+        text_prompt: System prompt for controlling persona/behavior
+        text_temperature: LLM temperature for text generation (0.0-2.0)
+        audio_temperature: Audio codec temperature for naturalness (0.0-2.0)
+        text_topk: Top-K sampling for text tokens
+        audio_topk: Top-K sampling for audio tokens
+        sample_rate: Audio sample rate in Hz (Opus default: 48000)
+        save_transcripts: Whether to save session transcripts to disk
+        transcript_path: Directory to save transcripts
+        session_timeout_seconds: Max seconds to wait before closing idle connection
+    """
+
+    server_url: str = "wss://cl9unux255nnzf-8998.proxy.runpod.net"
+    voice_prompt: str = "NATF0.pt"
+    text_prompt: str = "You are a helpful AI assistant. Have a natural conversation."
+    text_temperature: float = 0.7
+    audio_temperature: float = 0.8
+    text_topk: int = 25
+    audio_topk: int = 250
+    sample_rate: int = 48000
+    save_transcripts: bool = True
+    transcript_path: str = "./transcripts/"
+    session_timeout_seconds: float = 300.0
+    extra: dict = field(default_factory=dict)
+
+    def __post_init__(self):
+        """Validate configuration after initialization."""
+        if self.text_temperature < 0.0 or self.text_temperature > 2.0:
+            raise ValueError("text_temperature must be between 0.0 and 2.0")
+        if self.audio_temperature < 0.0 or self.audio_temperature > 2.0:
+            raise ValueError("audio_temperature must be between 0.0 and 2.0")
+        if self.text_topk < 1:
+            raise ValueError("text_topk must be >= 1")
+        if self.audio_topk < 1:
+            raise ValueError("audio_topk must be >= 1")
+        if self.sample_rate not in (48000, 24000, 16000):
+            raise ValueError("sample_rate must be 48000, 24000, or 16000")
+
+        # Create transcript directory if save_transcripts is enabled
+        if self.save_transcripts:
+            Path(self.transcript_path).mkdir(parents=True, exist_ok=True)
+
+    @classmethod
+    def default(cls) -> "PersonaPlexConfig":
+        """Get default configuration."""
+        return cls()
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "PersonaPlexConfig":
+        """Create config from dictionary."""
+        return cls(**data)
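
A short sketch of overriding the defaults above from a plain dict; the values are illustrative only:

```python
from audio_engine.pipelines.personaplex.config import PersonaPlexConfig

# __post_init__ validates ranges and, when save_transcripts is True,
# creates the transcript directory.
settings = {
    "voice_prompt": "NATM1.pt",   # preset named in the docstring above
    "text_temperature": 0.5,
    "save_transcripts": False,
}
config = PersonaPlexConfig.from_dict(settings)

# Out-of-range values are rejected during construction.
try:
    PersonaPlexConfig(text_temperature=3.0)
except ValueError as err:
    print(err)
```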

audio_engine/pipelines/personaplex/pipeline.py
@@ -0,0 +1,301 @@
+"""Main PersonaPlex pipeline orchestrator."""
+
+import asyncio
+import logging
+from typing import AsyncIterator, Optional, Tuple
+
+from .config import PersonaPlexConfig
+from .client import PersonaPlexClient
+from .types import MessageType, AudioChunk, TextChunk, SessionData
+from .utils import generate_session_id, get_timestamp_iso, save_transcript
+
+logger = logging.getLogger(__name__)
+
+
+class PersonaPlexPipeline:
+    """
+    Full-duplex speech-to-speech pipeline using PersonaPlex.
+
+    This pipeline handles real-time bidirectional communication:
+    - Sends user audio to PersonaPlex
+    - Receives assistant audio and text streaming from PersonaPlex
+    - Maintains conversation transcript
+    - Optionally saves transcripts to disk
+
+    Unlike the audio-engine's sequential ASR→LLM→TTS pipeline, PersonaPlex
+    is truly full-duplex: user can speak while assistant responds simultaneously.
+
+    Approach:
+    1. Create session with UUID and timestamp
+    2. Connect client with system prompt
+    3. Launch concurrent receive task to handle server messages
+    4. Caller sends user audio; pipeline yields received audio/text chunks
+    5. On stop, save transcript and disconnect
+
+    Example:
+        ```python
+        pipeline = PersonaPlexPipeline(
+            system_prompt="You are a helpful AI.",
+            save_transcripts=True
+        )
+        await pipeline.start()
+
+        # Send user audio, receive assistant response
+        async for audio_chunk, text_chunk in pipeline.stream(user_audio_stream):
+            if audio_chunk:
+                play_audio(audio_chunk)
+            if text_chunk:
+                print(text_chunk.text, end="", flush=True)
+
+        transcript = await pipeline.stop()
+        ```
+    """
+
+    def __init__(
+        self,
+        config: Optional[PersonaPlexConfig] = None,
+        system_prompt: str = "You are a helpful AI assistant.",
+        save_transcripts: bool = True,
+        debug: bool = False,
+    ):
+        """
+        Initialize PersonaPlex pipeline.
+
+        Args:
+            config: PersonaPlexConfig (uses defaults if None)
+            system_prompt: System prompt for persona control
+            save_transcripts: Whether to save transcript after session
+            debug: Enable debug logging
+        """
+        self.config = config or PersonaPlexConfig()
+        self.config.text_prompt = system_prompt
+        self.config.save_transcripts = save_transcripts
+
+        self.system_prompt = system_prompt
+        self.client = PersonaPlexClient(self.config)
+
+        # Session state
+        self.session_id = generate_session_id()
+        self.session_data = SessionData(
+            session_id=self.session_id,
+            timestamp=get_timestamp_iso(),
+            system_prompt=system_prompt,
+            voice_prompt=self.config.voice_prompt,
+        )
+
+        self._is_running = False
+        self._receive_task: Optional[asyncio.Task] = None
+        self._audio_queue: asyncio.Queue[Optional[AudioChunk]] = asyncio.Queue()
+        self._text_queue: asyncio.Queue[Optional[TextChunk]] = asyncio.Queue()
+
+        if debug:
+            logging.basicConfig(level=logging.DEBUG)
+
+        logger.info(f"PersonaPlexPipeline initialized (session: {self.session_id})")
+
+    async def start(self) -> None:
+        """
+        Connect to PersonaPlex server and start listening for messages.
+
+        Raises:
+            ConnectionError: If connection fails
+        """
+        if self._is_running:
+            logger.warning("Pipeline already running")
+            return
+
+        try:
+            await self.client.connect(self.system_prompt)
+            self._is_running = True
+
+            # Start background task to receive messages
+            self._receive_task = asyncio.create_task(self._receive_loop())
+            logger.info("PersonaPlex pipeline started")
+
+        except Exception as e:
+            logger.error(f"Failed to start pipeline: {e}")
+            raise
+
+    async def stop(self) -> Optional[SessionData]:
+        """
+        Stop the pipeline, close connection, and optionally save transcript.
+
+        Returns:
+            SessionData with transcript if save_transcripts=True, else None
+        """
+        if not self._is_running:
+            logger.warning("Pipeline not running")
+            return None
+
+        try:
+            self._is_running = False
+
+            # Cancel receive task
+            if self._receive_task:
+                self._receive_task.cancel()
+                try:
+                    await self._receive_task
+                except asyncio.CancelledError:
+                    pass
+
+            # Disconnect from server
+            await self.client.disconnect()
+
+            # Save transcript if enabled
+            if self.config.save_transcripts:
+                transcript_path = save_transcript(
+                    self.session_data,
+                    self.config.transcript_path,
+                )
+                logger.info(f"Transcript saved: {transcript_path}")
+
+            logger.info("PersonaPlex pipeline stopped")
+            return self.session_data
+
+        except Exception as e:
+            logger.error(f"Error stopping pipeline: {e}")
+            raise
+
+    async def _receive_loop(self) -> None:
+        """
+        Background task: continuously receive messages from server.
+
+        Puts audio/text chunks into respective queues.
+        """
+        try:
+            async for message in self.client.stream_messages():
+                if not self._is_running:
+                    break
+
+                if message.type == MessageType.AUDIO:
+                    chunk = AudioChunk(
+                        data=message.data,  # type: ignore
+                        sample_rate=self.config.sample_rate,
+                    )
+                    await self._audio_queue.put(chunk)
+
+                elif message.type == MessageType.TEXT:
+                    text = (
+                        message.data.decode("utf-8")
+                        if isinstance(message.data, bytes)
+                        else message.data
+                    )
+                    chunk = TextChunk(text=text)
+                    # Track in transcript
+                    if text and text.strip():
+                        self.session_data.add_message("assistant", text)
+                    await self._text_queue.put(chunk)
+
+                elif message.type == MessageType.ERROR:
+                    error_msg = (
+                        message.data.decode("utf-8")
+                        if isinstance(message.data, bytes)
+                        else str(message.data)
+                    )
+                    logger.error(f"Server error: {error_msg}")
+
+        except asyncio.CancelledError:
+            logger.debug("Receive loop cancelled")
+        except Exception as e:
+            logger.error(f"Error in receive loop: {e}")
+
+    async def send_audio(self, audio_chunk: bytes) -> None:
+        """
+        Send audio chunk to PersonaPlex server.
+
+        Args:
+            audio_chunk: Raw Opus-encoded audio bytes
+        """
+        if not self._is_running:
+            raise RuntimeError("Pipeline not running")
+
+        try:
+            await self.client.send_audio(audio_chunk)
+            # Track in transcript (user audio sent)
+            # Note: We don't transcribe user audio; PersonaPlex returns text
+        except Exception as e:
+            logger.error(f"Failed to send audio: {e}")
+            raise
+
+    async def stream(
+        self,
+        audio_stream: Optional[AsyncIterator[bytes]] = None,
+    ) -> AsyncIterator[Tuple[Optional[AudioChunk], Optional[TextChunk]]]:
+        """
+        Stream bidirectional audio/text from PersonaPlex.
+
+        This is a generator that yields (audio_chunk, text_chunk) tuples.
+        If audio_stream is provided, sends user audio concurrently.
+
+        Approach:
+        - If audio_stream provided: spawn task to continuously send user audio
+        - Concurrently receive audio and text from server
+        - Yield (audio, text) tuples as they arrive (either can be None)
+
+        Args:
+            audio_stream: Optional async iterator of audio bytes to send
+
+        Yields:
+            Tuple of (AudioChunk or None, TextChunk or None)
+        """
+        if not self._is_running:
+            raise RuntimeError("Pipeline not running")
+
+        # Optional task to send user audio
+        send_task: Optional[asyncio.Task] = None
+
+        if audio_stream:
+
+            async def send_user_audio():
+                """Background task: send audio from user stream."""
+                try:
+                    async for audio_chunk in audio_stream:
+                        if not self._is_running:
+                            break
+                        await self.send_audio(audio_chunk)
+                except asyncio.CancelledError:
+                    logger.debug("Send task cancelled")
+                except Exception as e:
+                    logger.error(f"Error sending audio: {e}")
+
+            send_task = asyncio.create_task(send_user_audio())
+
+        try:
+            while self._is_running:
+                # Wait for either audio or text (non-blocking)
+                try:
+                    # Try to get audio (non-blocking)
+                    audio_chunk = self._audio_queue.get_nowait()
+                except asyncio.QueueEmpty:
+                    audio_chunk = None
+
+                try:
+                    # Try to get text (non-blocking)
+                    text_chunk = self._text_queue.get_nowait()
+                except asyncio.QueueEmpty:
+                    text_chunk = None
+
+                # If we got something, yield it
+                if audio_chunk or text_chunk:
+                    yield (audio_chunk, text_chunk)
+                else:
+                    # Nothing available, wait a bit before polling again
+                    await asyncio.sleep(0.01)
+
+        finally:
+            # Clean up send task
+            if send_task:
+                send_task.cancel()
+                try:
+                    await send_task
+                except asyncio.CancelledError:
+                    pass
+
+    async def __aenter__(self) -> "PersonaPlexPipeline":
+        """Async context manager entry."""
+        await self.start()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self.stop()
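
Because the pipeline implements `__aenter__`/`__aexit__`, it can also be driven as an async context manager. A sketch under the assumption that `mic_opus_frames` is a hypothetical capture coroutine (not part of the package) yielding Opus-encoded frames:

```python
import asyncio
from typing import AsyncIterator

from audio_engine.pipelines.personaplex.pipeline import PersonaPlexPipeline


async def mic_opus_frames() -> AsyncIterator[bytes]:
    """Hypothetical microphone loop yielding Opus-encoded frames."""
    for _ in range(500):
        yield b"..."               # placeholder frame from an Opus encoder
        await asyncio.sleep(0.02)  # roughly one 20 ms frame per iteration


async def main() -> None:
    async with PersonaPlexPipeline(system_prompt="You are a helpful AI.") as pipeline:
        # start() runs on entry; stop() (disconnect + transcript save) runs on exit.
        received = 0
        async for audio_chunk, text_chunk in pipeline.stream(mic_opus_frames()):
            if text_chunk:
                print(text_chunk.text, end="", flush=True)
            if audio_chunk:
                received += 1      # hand audio_chunk.data to an Opus decoder / playback sink
            if received >= 100:    # bound the demo loop; stream() itself runs until stopped
                break


asyncio.run(main())
```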