atom-audio-engine 0.1.1-py3-none-any.whl → 0.1.2-py3-none-any.whl
This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- {atom_audio_engine-0.1.1.dist-info → atom_audio_engine-0.1.2.dist-info}/METADATA +1 -1
- atom_audio_engine-0.1.2.dist-info/RECORD +57 -0
- audio_engine/asr/__init__.py +45 -0
- audio_engine/asr/base.py +89 -0
- audio_engine/asr/cartesia.py +356 -0
- audio_engine/asr/deepgram.py +196 -0
- audio_engine/core/__init__.py +13 -0
- audio_engine/core/config.py +162 -0
- audio_engine/core/pipeline.py +282 -0
- audio_engine/core/types.py +87 -0
- audio_engine/examples/__init__.py +1 -0
- audio_engine/examples/basic_stt_llm_tts.py +200 -0
- audio_engine/examples/geneface_animation.py +99 -0
- audio_engine/examples/personaplex_pipeline.py +116 -0
- audio_engine/examples/websocket_server.py +86 -0
- audio_engine/integrations/__init__.py +5 -0
- audio_engine/integrations/geneface.py +297 -0
- audio_engine/llm/__init__.py +38 -0
- audio_engine/llm/base.py +108 -0
- audio_engine/llm/groq.py +210 -0
- audio_engine/pipelines/__init__.py +1 -0
- audio_engine/pipelines/personaplex/__init__.py +41 -0
- audio_engine/pipelines/personaplex/client.py +259 -0
- audio_engine/pipelines/personaplex/config.py +69 -0
- audio_engine/pipelines/personaplex/pipeline.py +301 -0
- audio_engine/pipelines/personaplex/types.py +173 -0
- audio_engine/pipelines/personaplex/utils.py +192 -0
- audio_engine/scripts/debug_pipeline.py +79 -0
- audio_engine/scripts/debug_tts.py +162 -0
- audio_engine/scripts/test_cartesia_connect.py +57 -0
- audio_engine/streaming/__init__.py +5 -0
- audio_engine/streaming/websocket_server.py +341 -0
- audio_engine/tests/__init__.py +1 -0
- audio_engine/tests/test_personaplex/__init__.py +1 -0
- audio_engine/tests/test_personaplex/test_personaplex.py +10 -0
- audio_engine/tests/test_personaplex/test_personaplex_client.py +259 -0
- audio_engine/tests/test_personaplex/test_personaplex_config.py +71 -0
- audio_engine/tests/test_personaplex/test_personaplex_message.py +80 -0
- audio_engine/tests/test_personaplex/test_personaplex_pipeline.py +226 -0
- audio_engine/tests/test_personaplex/test_personaplex_session.py +184 -0
- audio_engine/tests/test_personaplex/test_personaplex_transcript.py +184 -0
- audio_engine/tests/test_traditional_pipeline/__init__.py +1 -0
- audio_engine/tests/test_traditional_pipeline/test_cartesia_asr.py +474 -0
- audio_engine/tests/test_traditional_pipeline/test_config_env.py +97 -0
- audio_engine/tests/test_traditional_pipeline/test_conversation_context.py +115 -0
- audio_engine/tests/test_traditional_pipeline/test_pipeline_creation.py +64 -0
- audio_engine/tests/test_traditional_pipeline/test_pipeline_with_mocks.py +173 -0
- audio_engine/tests/test_traditional_pipeline/test_provider_factories.py +61 -0
- audio_engine/tests/test_traditional_pipeline/test_websocket_server.py +58 -0
- audio_engine/tts/__init__.py +37 -0
- audio_engine/tts/base.py +155 -0
- audio_engine/tts/cartesia.py +392 -0
- audio_engine/utils/__init__.py +15 -0
- audio_engine/utils/audio.py +220 -0
- atom_audio_engine-0.1.1.dist-info/RECORD +0 -5
- {atom_audio_engine-0.1.1.dist-info → atom_audio_engine-0.1.2.dist-info}/WHEEL +0 -0
- {atom_audio_engine-0.1.1.dist-info → atom_audio_engine-0.1.2.dist-info}/top_level.txt +0 -0
audio_engine/tts/cartesia.py (new in 0.1.2)
@@ -0,0 +1,392 @@
+"""Cartesia API implementation for TTS (Text-to-Speech)."""
+
+import asyncio
+import base64
+import json
+import logging
+from typing import AsyncIterator, Optional
+
+try:
+    import websockets
+except ImportError:  # availability is checked before use in synthesize_stream_text
+    websockets = None
+
+from ..core.types import AudioChunk, AudioFormat
+from .base import BaseTTS
+
+logger = logging.getLogger(__name__)
+
+
+class CartesiaTTS(BaseTTS):
+    """
+    Cartesia API client for text-to-speech synthesis.
+
+    Supports streaming synthesis with per-chunk latency < 200ms.
+    Uses WebSocket connections for real-time streaming with continuations.
+    Outputs 16kHz PCM by default (can be configured).
+
+    Example:
+        tts = CartesiaTTS(api_key="...", voice_id="sonic")
+
+        # Streaming text input (from LLM)
+        async for chunk in tts.synthesize_stream_text(llm_text_stream):
+            play_audio(chunk)
+    """
+
+    CARTESIA_VERSION = "2025-04-16"
+    DEFAULT_VOICE_ID = "c8605446-247c-4d39-acd4-8f4c28aa363c"  # Edith voice
+    WS_URL = "wss://api.cartesia.ai/tts/websocket"
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        voice_id: Optional[str] = None,
+        model: Optional[str] = "sonic-3",
+        speed: float = 1.0,
+        output_format: AudioFormat = AudioFormat.PCM_16K,
+        sample_rate: int = 16000,
+        max_buffer_delay_ms: int = 1500,
+        **kwargs,
+    ):
+        """
+        Initialize Cartesia TTS provider.
+
+        Args:
+            api_key: Cartesia API key (or None to use CARTESIA_API_KEY env var)
+            voice_id: Voice identifier (UUID; defaults to the Edith voice)
+            model: Model to use (default: sonic-3)
+            speed: Speech speed multiplier (1.0 = normal)
+            output_format: Desired audio output format (default 16kHz PCM)
+            sample_rate: Output sample rate in Hz (default: 16000)
+            max_buffer_delay_ms: Buffering delay for streaming (0-5000ms)
+            **kwargs: Additional config
+        """
+        # Fall back to the environment variable if no key is provided
+        if not api_key:
+            import os
+
+            api_key = os.getenv("CARTESIA_API_KEY")
+
+        super().__init__(
+            api_key=api_key,
+            voice_id=voice_id or self.DEFAULT_VOICE_ID,
+            model=model,
+            speed=speed,
+            output_format=output_format,
+            **kwargs,
+        )
+        self._sample_rate = sample_rate
+        self.max_buffer_delay_ms = max_buffer_delay_ms
+
+    @property
+    def name(self) -> str:
+        """Return provider name."""
+        return "cartesia"
+
+    @property
+    def sample_rate(self) -> int:
+        """Return the sample rate for this provider's output."""
+        return self._sample_rate
+
+    async def connect(self):
+        """Cartesia uses per-session WebSocket connections - no persistent client needed."""
+        pass
+
+    async def disconnect(self):
+        """Cartesia uses per-session WebSocket connections - no persistent client needed."""
+        pass
+
+    async def synthesize(self, text: str) -> bytes:
+        """
+        Synthesize complete audio from text (non-streaming).
+
+        Args:
+            text: Text to convert to speech
+
+        Returns:
+            Complete audio as bytes (PCM)
+        """
+        audio_data = bytearray()
+        async for chunk in self.synthesize_stream_text(self._text_to_async_iter(text)):
+            if chunk.data and not chunk.is_final:
+                audio_data.extend(chunk.data)
+        return bytes(audio_data)
+
+    async def synthesize_stream(self, text: str) -> AsyncIterator[AudioChunk]:
+        """
+        Synthesize streaming audio from text.
+
+        Args:
+            text: Text to convert to speech
+
+        Yields:
+            AudioChunk objects with audio data
+        """
+        async for chunk in self.synthesize_stream_text(self._text_to_async_iter(text)):
+            yield chunk
+
+    async def synthesize_stream_text(
+        self, text_stream: AsyncIterator[str]
+    ) -> AsyncIterator[AudioChunk]:
+        """
+        Synthesize streaming audio from streaming text input via WebSocket.
+
+        Uses continuations to maintain natural prosody across streamed text chunks.
+
+        Args:
+            text_stream: Async iterator yielding text tokens
+
+        Yields:
+            AudioChunk objects with audio data
+        """
+        if websockets is None:
+            raise ImportError(
+                "websockets package required. Install: pip install websockets"
+            )
+
+        if not self.api_key:
+            raise ValueError("api_key required for Cartesia TTS")
+
+        # Use a unique context ID for this synthesis session
+        import uuid
+
+        context_id = str(uuid.uuid4())
+
+        ws_url = (
+            f"{self.WS_URL}"
+            f"?api_key={self.api_key}"
+            f"&cartesia_version={self.CARTESIA_VERSION}"
+        )
+
+        try:
+            async with websockets.connect(ws_url) as websocket:
+                logger.debug(
+                    f"Cartesia TTS WebSocket connected | Context: {context_id}"
+                )
+
+                # Task to receive audio from WebSocket
+                async def receive_audio():
+                    """Receive audio chunks from TTS WebSocket."""
+                    logger.debug("Cartesia: receive_audio started")
+                    try:
+                        async for message in websocket:
+                            if isinstance(message, str):
+                                try:
+                                    response = json.loads(message)
+                                    logger.debug(
+                                        f"Cartesia: received response type={response.get('type')}"
+                                    )
+                                    # Handle audio chunk (base64 in "data" field)
+                                    if response.get("type") == "chunk" and response.get(
+                                        "data"
+                                    ):
+                                        audio_bytes = base64.b64decode(response["data"])
+                                        yield audio_bytes
+                                        logger.debug(
+                                            f"Cartesia: received audio chunk {len(audio_bytes)} bytes"
+                                        )
+                                    # Handle buffer flush
+                                    elif response.get("type") == "flush_done":
+                                        logger.debug("Cartesia: buffer flushed")
+                                    # Handle completion
+                                    elif response.get("type") == "done":
+                                        logger.info("Cartesia: TTS generation complete")
+                                        break
+                                    # Handle error
+                                    elif response.get("type") == "error":
+                                        error_msg = (
+                                            response.get("error")
+                                            or response.get("error_message")
+                                            or response.get("message")
+                                            or str(response)
+                                        )
+                                        logger.error(f"Cartesia TTS error: {error_msg}")
+                                        raise RuntimeError(
+                                            f"Cartesia API error: {error_msg}"
+                                        )
+                                    else:
+                                        logger.debug(
+                                            f"Cartesia: response type {response.get('type')}"
+                                        )
+                                except json.JSONDecodeError:
+                                    logger.warning(
+                                        f"Failed to parse Cartesia response: {message}"
+                                    )
+                    except Exception as e:
+                        logger.error(f"Cartesia receive error: {e}", exc_info=True)
+                        raise
+
+                # Task to send text to WebSocket
+                async def send_text():
+                    """Send text tokens to TTS WebSocket."""
+                    logger.debug("Cartesia: send_text started")
+                    accumulated_text = ""
+                    first_token_timeout = 30.0
+                    subsequent_token_timeout = 2.0
+                    first_token_received = False
+
+                    try:
+                        while True:
+                            try:
+                                # Wait for a token, allowing the first one more time
+                                timeout = (
+                                    first_token_timeout
+                                    if not first_token_received
+                                    else subsequent_token_timeout
+                                )
+                                token = await asyncio.wait_for(
+                                    self._get_next_token(text_stream),
+                                    timeout=timeout,
+                                )
+                                first_token_received = True
+                            except asyncio.TimeoutError:
+                                logger.debug(
+                                    f"Cartesia: token timeout (first_token={first_token_received})"
+                                )
+                                # Flush accumulated text even on timeout
+                                if accumulated_text.strip():
+                                    request = {
+                                        "model_id": self.model,
+                                        "transcript": accumulated_text,
+                                        "context_id": context_id,
+                                        "continue": True,
+                                        "max_buffer_delay_ms": self.max_buffer_delay_ms,
+                                        "voice": {
+                                            "mode": "id",
+                                            "id": self.voice_id,
+                                        },
+                                        "output_format": {
+                                            "container": "raw",
+                                            "encoding": "pcm_s16le",
+                                            "sample_rate": self.sample_rate,
+                                        },
+                                    }
+                                    await websocket.send(json.dumps(request))
+                                    logger.debug(
+                                        "Cartesia: sent text on timeout (continue=true)"
+                                    )
+                                    accumulated_text = ""
+                                continue
+
+                            # None signals end of text stream
+                            if token is None:
+                                # Send remaining text with continue=false
+                                if accumulated_text.strip():
+                                    request = {
+                                        "model_id": self.model,
+                                        "transcript": accumulated_text,
+                                        "context_id": context_id,
+                                        "continue": False,
+                                        "max_buffer_delay_ms": self.max_buffer_delay_ms,
+                                        "voice": {
+                                            "mode": "id",
+                                            "id": self.voice_id,
+                                        },
+                                        "output_format": {
+                                            "container": "raw",
+                                            "encoding": "pcm_s16le",
+                                            "sample_rate": self.sample_rate,
+                                        },
+                                    }
+                                    await websocket.send(json.dumps(request))
+                                    logger.debug(
+                                        "Cartesia: sent final text (continue=false)"
+                                    )
+                                else:
+                                    # Send empty transcript to signal end
+                                    request = {
+                                        "model_id": self.model,
+                                        "transcript": "",
+                                        "context_id": context_id,
+                                        "continue": False,
+                                        "max_buffer_delay_ms": self.max_buffer_delay_ms,
+                                        "voice": {
+                                            "mode": "id",
+                                            "id": self.voice_id,
+                                        },
+                                        "output_format": {
+                                            "container": "raw",
+                                            "encoding": "pcm_s16le",
+                                            "sample_rate": self.sample_rate,
+                                        },
+                                    }
+                                    await websocket.send(json.dumps(request))
+                                    logger.debug(
+                                        "Cartesia: sent empty transcript to signal end"
+                                    )
+                                logger.info("Cartesia: all text sent")
+                                break
+
+                            # Accumulate token
+                            accumulated_text += token
+                            logger.debug(
+                                f"Cartesia: buffered token {len(accumulated_text)} chars total"
+                            )
+
+                            # Send when buffer is large enough or ends with punctuation
+                            if len(accumulated_text) > 30 or token.endswith(
+                                (".", "!", "?")
+                            ):
+                                request = {
+                                    "model_id": self.model,
+                                    "transcript": accumulated_text,
+                                    "context_id": context_id,
+                                    "continue": True,
+                                    "max_buffer_delay_ms": self.max_buffer_delay_ms,
+                                    "voice": {
+                                        "mode": "id",
+                                        "id": self.voice_id,
+                                    },
+                                    "output_format": {
+                                        "container": "raw",
+                                        "encoding": "pcm_s16le",
+                                        "sample_rate": self.sample_rate,
+                                    },
+                                }
+                                await websocket.send(json.dumps(request))
+                                logger.debug(
+                                    "Cartesia: sent buffered text (continue=true)"
+                                )
+                                accumulated_text = ""
+
+                    except Exception as e:
+                        logger.error(f"Cartesia send error: {e}")
+
+                # Run send and receive concurrently
+                send_task = asyncio.create_task(send_text())
+
+                async for audio_bytes in receive_audio():
+                    yield AudioChunk(
+                        data=audio_bytes,
+                        sample_rate=self.sample_rate,
+                        channels=1,
+                        format=self.output_format,
+                        is_final=False,
+                    )
+
+                # Wait for send task to complete
+                await send_task
+
+                # Yield final marker
+                yield AudioChunk(
+                    data=b"",
+                    sample_rate=self.sample_rate,
+                    channels=1,
+                    format=self.output_format,
+                    is_final=True,
+                )
+
+                logger.info("Cartesia: stream complete")
+
+        except Exception as e:
+            logger.error(f"Cartesia streaming text error: {e}")
+            raise
+
+    async def _get_next_token(self, text_stream: AsyncIterator[str]) -> Optional[str]:
+        """Get next token from async iterator."""
+        try:
+            return await text_stream.__anext__()
+        except StopAsyncIteration:
+            return None
+
+    async def _text_to_async_iter(self, text: str) -> AsyncIterator[str]:
+        """Convert plain text to async iterator."""
+        yield text
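For readers evaluating the new streaming path, here is a minimal sketch of how synthesize_stream_text is driven end to end. It is not part of the package: the fake token generator stands in for an LLM stream, and it assumes the wheel is installed (so audio_engine is importable) and that CARTESIA_API_KEY is set.

import asyncio
import os

from audio_engine.tts.cartesia import CartesiaTTS


async def fake_llm_tokens():
    # Stand-in for an LLM token stream; each yield is one text fragment.
    for token in ["Hello ", "from ", "the ", "streaming ", "pipeline."]:
        yield token


async def main():
    tts = CartesiaTTS(api_key=os.getenv("CARTESIA_API_KEY"))
    pcm = bytearray()
    async for chunk in tts.synthesize_stream_text(fake_llm_tokens()):
        if chunk.data and not chunk.is_final:
            pcm.extend(chunk.data)  # 16-bit mono PCM at tts.sample_rate
    print(f"received {len(pcm)} bytes of audio")


asyncio.run(main())

Note that the non-streaming synthesize() and synthesize_stream() entry points reduce to this same path: they wrap the plain string in a one-shot async iterator via _text_to_async_iter.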
audio_engine/utils/audio.py (new in 0.1.2)
@@ -0,0 +1,220 @@
+"""Audio utility functions."""
+
+import struct
+
+
+def resample_audio(
+    audio: bytes,
+    from_rate: int,
+    to_rate: int,
+    channels: int = 1,
+    sample_width: int = 2,
+) -> bytes:
+    """
+    Resample audio to a different sample rate.
+
+    Uses scipy's FFT-based resampling when available, falling back to
+    simple linear interpolation. For higher quality, consider librosa.
+
+    Args:
+        audio: Input audio bytes (PCM format)
+        from_rate: Original sample rate
+        to_rate: Target sample rate
+        channels: Number of audio channels (currently unused; mono is assumed)
+        sample_width: Bytes per sample (2 for 16-bit)
+
+    Returns:
+        Resampled audio bytes
+    """
+    if from_rate == to_rate:
+        return audio
+
+    try:
+        import numpy as np
+        from scipy import signal
+
+        # Convert bytes to numpy array
+        dtype = np.int16 if sample_width == 2 else np.int32
+        samples = np.frombuffer(audio, dtype=dtype)
+
+        # Resample using scipy
+        num_samples = int(len(samples) * to_rate / from_rate)
+        resampled = signal.resample(samples, num_samples)
+
+        return resampled.astype(dtype).tobytes()
+
+    except ImportError:
+        # Fall back to simple linear interpolation
+        return _simple_resample(audio, from_rate, to_rate, sample_width)
+
+
+def _simple_resample(
+    audio: bytes,
+    from_rate: int,
+    to_rate: int,
+    sample_width: int = 2,
+) -> bytes:
+    """Simple linear interpolation resampling."""
+    if sample_width == 2:
+        fmt = "<h"
+        samples = [
+            struct.unpack(fmt, audio[i : i + 2])[0] for i in range(0, len(audio), 2)
+        ]
+    else:
+        raise ValueError(f"Unsupported sample width: {sample_width}")
+
+    ratio = from_rate / to_rate
+    new_length = int(len(samples) / ratio)
+    resampled = []
+
+    for i in range(new_length):
+        pos = i * ratio
+        idx = int(pos)
+        frac = pos - idx
+
+        if idx + 1 < len(samples):
+            sample = int(samples[idx] * (1 - frac) + samples[idx + 1] * frac)
+        else:
+            sample = samples[idx]
+
+        resampled.append(sample)
+
+    return struct.pack(f"<{len(resampled)}h", *resampled)
+
+
+def pcm_to_wav(
+    pcm_data: bytes,
+    sample_rate: int = 16000,
+    channels: int = 1,
+    bits_per_sample: int = 16,
+) -> bytes:
+    """
+    Convert raw PCM data to WAV format.
+
+    Args:
+        pcm_data: Raw PCM audio bytes
+        sample_rate: Sample rate in Hz
+        channels: Number of audio channels
+        bits_per_sample: Bits per sample (typically 16)
+
+    Returns:
+        WAV file as bytes
+    """
+    byte_rate = sample_rate * channels * bits_per_sample // 8
+    block_align = channels * bits_per_sample // 8
+    data_size = len(pcm_data)
+
+    header = struct.pack(
+        "<4sI4s4sIHHIIHH4sI",
+        b"RIFF",
+        36 + data_size,
+        b"WAVE",
+        b"fmt ",
+        16,  # fmt chunk size
+        1,  # audio format (PCM)
+        channels,
+        sample_rate,
+        byte_rate,
+        block_align,
+        bits_per_sample,
+        b"data",
+        data_size,
+    )
+
+    return header + pcm_data
+
+
+def wav_to_pcm(wav_data: bytes) -> tuple[bytes, int, int, int]:
+    """
+    Extract raw PCM data from WAV format.
+
+    Args:
+        wav_data: WAV file as bytes
+
+    Returns:
+        Tuple of (pcm_data, sample_rate, channels, bits_per_sample)
+    """
+    # Parse RIFF header
+    if wav_data[:4] != b"RIFF" or wav_data[8:12] != b"WAVE":
+        raise ValueError("Invalid WAV file")
+
+    # Walk the chunks to find fmt and data
+    pos = 12
+    sample_rate = 0
+    channels = 0
+    bits_per_sample = 0
+
+    while pos < len(wav_data):
+        chunk_id = wav_data[pos : pos + 4]
+        chunk_size = struct.unpack("<I", wav_data[pos + 4 : pos + 8])[0]
+
+        if chunk_id == b"fmt ":
+            _, channels, sample_rate, _, _, bits_per_sample = struct.unpack(
+                "<HHIIHH", wav_data[pos + 8 : pos + 24]
+            )
+        elif chunk_id == b"data":
+            pcm_data = wav_data[pos + 8 : pos + 8 + chunk_size]
+            return pcm_data, sample_rate, channels, bits_per_sample
+
+        pos += 8 + chunk_size
+
+    raise ValueError("No data chunk found in WAV file")
+
+
+def get_audio_duration(
+    audio: bytes,
+    sample_rate: int,
+    channels: int = 1,
+    bits_per_sample: int = 16,
+) -> float:
+    """
+    Calculate duration of PCM audio in seconds.
+
+    Args:
+        audio: PCM audio bytes
+        sample_rate: Sample rate in Hz
+        channels: Number of audio channels
+        bits_per_sample: Bits per sample
+
+    Returns:
+        Duration in seconds
+    """
+    bytes_per_sample = bits_per_sample // 8
+    total_samples = len(audio) // (bytes_per_sample * channels)
+    return total_samples / sample_rate
+
+
+def normalize_audio(audio: bytes, target_db: float = -20.0) -> bytes:
+    """
+    Normalize audio to a target dB level.
+
+    Args:
+        audio: PCM audio bytes (16-bit)
+        target_db: Target dB level
+
+    Returns:
+        Normalized audio bytes
+    """
+    try:
+        import numpy as np
+
+        samples = np.frombuffer(audio, dtype=np.int16).astype(np.float32)
+
+        # Calculate current RMS
+        rms = np.sqrt(np.mean(samples**2))
+        if rms == 0:
+            return audio
+
+        # Calculate target RMS relative to full scale
+        target_rms = 32768 * (10 ** (target_db / 20))
+
+        # Scale and clip to the 16-bit range
+        gain = target_rms / rms
+        normalized = np.clip(samples * gain, -32768, 32767).astype(np.int16)
+
+        return normalized.tobytes()
+
+    except ImportError:
+        # Return unchanged if numpy is not available
+        return audio
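A quick, self-contained exercise of the new utility module, useful for sanity-checking the resampling and WAV round-trip behavior. The sine-wave input is illustrative only; everything else uses the functions exactly as defined above.

import math
import struct

from audio_engine.utils.audio import (
    get_audio_duration,
    pcm_to_wav,
    resample_audio,
    wav_to_pcm,
)

# Illustrative input: one second of a 440 Hz sine at 48 kHz, 16-bit mono.
rate_in = 48000
samples = [
    int(12000 * math.sin(2 * math.pi * 440 * n / rate_in)) for n in range(rate_in)
]
pcm_48k = struct.pack(f"<{len(samples)}h", *samples)

# Downsample to the 16 kHz rate the TTS pipeline emits by default.
pcm_16k = resample_audio(pcm_48k, from_rate=48000, to_rate=16000)
print(get_audio_duration(pcm_16k, sample_rate=16000))  # ~1.0 seconds

# Round-trip through the WAV helpers.
wav = pcm_to_wav(pcm_16k, sample_rate=16000)
pcm_back, sr, ch, bits = wav_to_pcm(wav)
assert (pcm_back, sr, ch, bits) == (pcm_16k, 16000, 1, 16)

The assert holds whether or not scipy is installed: resample_audio silently falls back to the pure-Python linear interpolation, and the WAV helpers never modify the sample data they wrap.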
atom_audio_engine-0.1.1.dist-info/RECORD (removed)
@@ -1,5 +0,0 @@
-audio_engine/__init__.py,sha256=AQ0uto-Jn3cNqW35MMtSyX5mhXJMFv9AQhjcAkqZ7L4,1499
-atom_audio_engine-0.1.1.dist-info/METADATA,sha256=Apv8YTxoTYqqptLeY6ofsjyk82X9LRxtqEJDiAV14Bg,6690
-atom_audio_engine-0.1.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-atom_audio_engine-0.1.1.dist-info/top_level.txt,sha256=IyumwgFrsDL7nlZlBijX-0shiSVhhBCFPUNBRNKzWP4,13
-atom_audio_engine-0.1.1.dist-info/RECORD,,

{atom_audio_engine-0.1.1.dist-info → atom_audio_engine-0.1.2.dist-info}/WHEEL: file without changes
{atom_audio_engine-0.1.1.dist-info → atom_audio_engine-0.1.2.dist-info}/top_level.txt: file without changes