atom-audio-engine 0.1.4-py3-none-any.whl → 0.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {atom_audio_engine-0.1.4.dist-info → atom_audio_engine-0.1.6.dist-info}/METADATA +1 -1
  2. atom_audio_engine-0.1.6.dist-info/RECORD +32 -0
  3. audio_engine/__init__.py +6 -2
  4. audio_engine/asr/__init__.py +48 -0
  5. audio_engine/asr/base.py +89 -0
  6. audio_engine/asr/cartesia.py +350 -0
  7. audio_engine/asr/deepgram.py +196 -0
  8. audio_engine/core/__init__.py +13 -0
  9. audio_engine/core/config.py +162 -0
  10. audio_engine/core/pipeline.py +278 -0
  11. audio_engine/core/types.py +87 -0
  12. audio_engine/integrations/__init__.py +5 -0
  13. audio_engine/integrations/geneface.py +297 -0
  14. audio_engine/llm/__init__.py +40 -0
  15. audio_engine/llm/base.py +106 -0
  16. audio_engine/llm/groq.py +208 -0
  17. audio_engine/pipelines/__init__.py +1 -0
  18. audio_engine/pipelines/personaplex/__init__.py +41 -0
  19. audio_engine/pipelines/personaplex/client.py +259 -0
  20. audio_engine/pipelines/personaplex/config.py +69 -0
  21. audio_engine/pipelines/personaplex/pipeline.py +301 -0
  22. audio_engine/pipelines/personaplex/types.py +173 -0
  23. audio_engine/pipelines/personaplex/utils.py +192 -0
  24. audio_engine/streaming/__init__.py +5 -0
  25. audio_engine/streaming/websocket_server.py +333 -0
  26. audio_engine/tts/__init__.py +35 -0
  27. audio_engine/tts/base.py +153 -0
  28. audio_engine/tts/cartesia.py +370 -0
  29. audio_engine/utils/__init__.py +15 -0
  30. audio_engine/utils/audio.py +218 -0
  31. atom_audio_engine-0.1.4.dist-info/RECORD +0 -5
  32. {atom_audio_engine-0.1.4.dist-info → atom_audio_engine-0.1.6.dist-info}/WHEEL +0 -0
  33. {atom_audio_engine-0.1.4.dist-info → atom_audio_engine-0.1.6.dist-info}/top_level.txt +0 -0
audio_engine/core/types.py
@@ -0,0 +1,87 @@
+"""Shared types and data structures for the audio engine."""
+
+from dataclasses import dataclass, field
+from typing import Optional
+from enum import Enum
+
+
+class AudioFormat(Enum):
+    """Supported audio formats."""
+
+    PCM_16K = "pcm_16k"  # 16-bit PCM at 16kHz
+    PCM_24K = "pcm_24k"  # 16-bit PCM at 24kHz
+    PCM_44K = "pcm_44k"  # 16-bit PCM at 44.1kHz
+    WAV = "wav"
+    MP3 = "mp3"
+    OGG = "ogg"
+
+
+@dataclass
+class AudioChunk:
+    """A chunk of audio data."""
+
+    data: bytes
+    sample_rate: int = 16000
+    channels: int = 1
+    format: AudioFormat = AudioFormat.PCM_16K
+    timestamp_ms: Optional[int] = None
+    is_final: bool = False
+
+
+@dataclass
+class TranscriptChunk:
+    """A chunk of transcribed text from ASR."""
+
+    text: str
+    is_final: bool = False
+    confidence: Optional[float] = None
+    timestamp_ms: Optional[int] = None
+
+
+@dataclass
+class ResponseChunk:
+    """A chunk of LLM response text."""
+
+    text: str
+    is_final: bool = False
+    timestamp_ms: Optional[int] = None
+
+
+@dataclass
+class ConversationMessage:
+    """A message in the conversation history."""
+
+    role: str  # "user" or "assistant"
+    content: str
+    timestamp_ms: Optional[int] = None
+
+
+@dataclass
+class ConversationContext:
+    """Maintains conversation state and history."""
+
+    messages: list[ConversationMessage] = field(default_factory=list)
+    system_prompt: Optional[str] = None
+    max_history: int = 20
+
+    def add_message(self, role: str, content: str, timestamp_ms: Optional[int] = None):
+        """Add a message to the conversation history."""
+        self.messages.append(
+            ConversationMessage(role=role, content=content, timestamp_ms=timestamp_ms)
+        )
+        # Trim history if needed
+        if len(self.messages) > self.max_history:
+            self.messages = self.messages[-self.max_history :]
+
+    def get_messages_for_llm(self) -> list[dict]:
+        """Get messages formatted for LLM API calls."""
+        result = []
+        if self.system_prompt:
+            result.append({"role": "system", "content": self.system_prompt})
+        for msg in self.messages:
+            result.append({"role": msg.role, "content": msg.content})
+        return result
+
+    def clear(self):
+        """Clear conversation history."""
+        self.messages = []
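One detail of `ConversationContext` worth noting: the system prompt is stored outside `messages`, so `add_message` trimming can never drop it. A minimal sketch of the class in use (prompt strings are placeholders):

```python
from audio_engine.core.types import ConversationContext

ctx = ConversationContext(
    system_prompt="You are a concise assistant.",  # placeholder prompt
    max_history=4,
)

for i in range(3):
    ctx.add_message("user", f"question {i}")
    ctx.add_message("assistant", f"answer {i}")

# Only the 4 most recent messages survive the trim.
assert len(ctx.messages) == 4
assert ctx.messages[0].content == "question 1"

# The LLM payload is the system prompt plus the surviving history.
payload = ctx.get_messages_for_llm()
assert payload[0] == {"role": "system", "content": "You are a concise assistant."}
assert len(payload) == 5
```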
audio_engine/integrations/__init__.py
@@ -0,0 +1,5 @@
+"""External system integrations."""
+
+from .geneface import GeneFaceIntegration
+
+__all__ = ["GeneFaceIntegration"]
audio_engine/integrations/geneface.py
@@ -0,0 +1,297 @@
+"""GeneFace++ integration for face animation from audio."""
+
+import asyncio
+import logging
+import tempfile
+import os
+from pathlib import Path
+from typing import Optional, AsyncIterator
+from dataclasses import dataclass
+
+from ..core.types import AudioChunk
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class GeneFaceConfig:
+    """Configuration for GeneFace++ integration."""
+
+    geneface_path: str  # Path to ai-geneface-realtime directory
+    checkpoint_path: Optional[str] = None  # Path to trained model
+    output_resolution: tuple[int, int] = (512, 512)
+    fps: int = 25
+    device: str = "cuda"
+
+
+class GeneFaceIntegration:
+    """
+    Integration with GeneFace++ for generating animated face videos from audio.
+
+    This wraps the GeneFace++ inference system to generate talking face videos
+    from the audio output of the conversation pipeline.
+
+    Example:
+        ```python
+        geneface = GeneFaceIntegration(
+            config=GeneFaceConfig(
+                geneface_path="/path/to/ai-geneface-realtime",
+                checkpoint_path="/path/to/model.ckpt"
+            )
+        )
+
+        # Generate video from audio
+        video_path = await geneface.generate_video(audio_bytes)
+        ```
+    """
+
+    def __init__(self, config: GeneFaceConfig):
+        """
+        Initialize GeneFace++ integration.
+
+        Args:
+            config: GeneFace configuration
+        """
+        self.config = config
+        self._infer = None
+        self._initialized = False
+
+    async def initialize(self):
+        """
+        Initialize the GeneFace++ inference system.
+
+        This loads the models and prepares for inference.
+        """
+        if self._initialized:
+            return
+
+        # Add GeneFace path to Python path
+        import sys
+
+        geneface_path = Path(self.config.geneface_path)
+        if str(geneface_path) not in sys.path:
+            sys.path.insert(0, str(geneface_path))
+
+        try:
+            # Import GeneFace modules
+            from inference.genefacepp_infer import GeneFace2Infer
+
+            # Initialize inference object
+            # Note: This will load models which takes time
+            self._infer = GeneFace2Infer(
+                audio2secc_dir=self.config.checkpoint_path,
+                device=self.config.device,
+            )
+
+            self._initialized = True
+            logger.info("GeneFace++ integration initialized")
+
+        except ImportError as e:
+            logger.error(f"Failed to import GeneFace++: {e}")
+            raise ImportError(
+                f"Could not import GeneFace++. Ensure it's installed at {self.config.geneface_path}"
+            ) from e
+
+    async def generate_video(
+        self,
+        audio: bytes,
+        sample_rate: int = 16000,
+        output_path: Optional[str] = None,
+    ) -> str:
+        """
+        Generate a talking face video from audio.
+
+        Args:
+            audio: Audio bytes (PCM format)
+            sample_rate: Sample rate of the audio
+            output_path: Optional output video path. If not provided,
+                a temporary file will be created.
+
+        Returns:
+            Path to the generated video file
+        """
+        if not self._initialized:
+            await self.initialize()
+
+        # Save audio to temporary file
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            temp_audio_path = f.name
+            # Write WAV header and data
+            self._write_wav(f, audio, sample_rate)
+
+        try:
+            # Determine output path
+            if output_path is None:
+                output_path = tempfile.mktemp(suffix=".mp4")
+
+            # Run GeneFace++ inference in executor to not block
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(
+                None,
+                self._run_inference,
+                temp_audio_path,
+                output_path,
+            )
+
+            logger.info(f"Generated video at: {output_path}")
+            return output_path
+
+        finally:
+            # Cleanup temp audio file
+            if os.path.exists(temp_audio_path):
+                os.unlink(temp_audio_path)
+
+    def _run_inference(self, audio_path: str, output_path: str):
+        """Run GeneFace++ inference (blocking)."""
+        self._infer.infer_once(
+            inp={
+                "drv_audio": audio_path,
+                "out_name": output_path,
+            }
+        )
+
+    def _write_wav(self, file, audio: bytes, sample_rate: int):
+        """Write audio bytes as a WAV file."""
+        import struct
+
+        # WAV header
+        channels = 1
+        bits_per_sample = 16
+        byte_rate = sample_rate * channels * bits_per_sample // 8
+        block_align = channels * bits_per_sample // 8
+        data_size = len(audio)
+
+        # Write RIFF header
+        file.write(b"RIFF")
+        file.write(struct.pack("<I", 36 + data_size))
+        file.write(b"WAVE")
+
+        # Write fmt chunk
+        file.write(b"fmt ")
+        file.write(struct.pack("<I", 16))  # Chunk size
+        file.write(struct.pack("<H", 1))  # Audio format (PCM)
+        file.write(struct.pack("<H", channels))
+        file.write(struct.pack("<I", sample_rate))
+        file.write(struct.pack("<I", byte_rate))
+        file.write(struct.pack("<H", block_align))
+        file.write(struct.pack("<H", bits_per_sample))
+
+        # Write data chunk
+        file.write(b"data")
+        file.write(struct.pack("<I", data_size))
+        file.write(audio)
+
+    async def generate_video_stream(
+        self,
+        audio_stream: AsyncIterator[AudioChunk],
+        output_path: Optional[str] = None,
+    ) -> str:
+        """
+        Generate video from streaming audio.
+
+        Buffers audio chunks until the stream completes, then generates the
+        video. True real-time video streaming would require additional work
+        to chunk the inference.
+
+        Args:
+            audio_stream: Async iterator of audio chunks
+            output_path: Optional output video path
+
+        Returns:
+            Path to the generated video file
+        """
+        # Buffer all audio chunks
+        audio_buffer = bytearray()
+        sample_rate = 16000
+
+        async for chunk in audio_stream:
+            audio_buffer.extend(chunk.data)
+            sample_rate = chunk.sample_rate
+
+        return await self.generate_video(
+            bytes(audio_buffer),
+            sample_rate,
+            output_path,
+        )
+
+
+class GeneFacePipelineWrapper:
+    """
+    Wrapper that adds face animation to an audio pipeline.
+
+    Example:
+        ```python
+        from audio_engine import Pipeline
+        from audio_engine.integrations.geneface import GeneFacePipelineWrapper
+
+        # Create base pipeline
+        pipeline = Pipeline(asr=..., llm=..., tts=...)
+
+        # Wrap with face animation
+        wrapped = GeneFacePipelineWrapper(
+            pipeline=pipeline,
+            geneface_config=GeneFaceConfig(...)
+        )
+
+        # Now returns both audio and video
+        audio, video_path = await wrapped.process_with_video(input_audio)
+        ```
+    """
+
+    def __init__(self, pipeline, geneface_config: GeneFaceConfig):
+        """
+        Initialize the wrapper.
+
+        Args:
+            pipeline: Pipeline instance
+            geneface_config: GeneFace configuration
+        """
+        self.pipeline = pipeline
+        self.geneface = GeneFaceIntegration(geneface_config)
+
+    async def connect(self):
+        """Initialize all components."""
+        await asyncio.gather(
+            self.pipeline.connect(),
+            self.geneface.initialize(),
+        )
+
+    async def disconnect(self):
+        """Clean up all components."""
+        await self.pipeline.disconnect()
+
+    async def __aenter__(self):
+        await self.connect()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.disconnect()
+
+    async def process_with_video(
+        self,
+        audio: bytes,
+        sample_rate: int = 16000,
+        video_output_path: Optional[str] = None,
+    ) -> tuple[bytes, str]:
+        """
+        Process audio and generate both response audio and face video.
+
+        Args:
+            audio: Input audio bytes
+            sample_rate: Sample rate of input
+            video_output_path: Optional path for output video
+
+        Returns:
+            Tuple of (response_audio_bytes, video_path)
+        """
+        # Get audio response from pipeline
+        response_audio = await self.pipeline.process(audio, sample_rate)
+
+        # Generate face animation video
+        video_path = await self.geneface.generate_video(
+            response_audio,
+            self.pipeline.tts.sample_rate,
+            video_output_path,
+        )
+
+        return response_audio, video_path
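Because `generate_video_stream` buffers the whole stream before running one blocking inference pass, any async iterator of `AudioChunk` will do as input. A hypothetical sketch, assuming a local GeneFace++ checkout at the placeholder paths below; the silence chunks stand in for real TTS output:

```python
import asyncio

from audio_engine.core.types import AudioChunk
from audio_engine.integrations.geneface import GeneFaceConfig, GeneFaceIntegration


async def pcm_chunks():
    # Placeholder source: ten chunks of 100 ms of 16 kHz mono silence
    # (1600 samples x 2 bytes per 16-bit sample).
    for _ in range(10):
        yield AudioChunk(data=b"\x00\x00" * 1600, sample_rate=16000)


async def main():
    geneface = GeneFaceIntegration(
        GeneFaceConfig(
            geneface_path="/path/to/ai-geneface-realtime",  # placeholder path
            checkpoint_path="/path/to/model.ckpt",  # placeholder path
        )
    )
    # Buffers the stream, then runs infer_once in the default executor.
    video_path = await geneface.generate_video_stream(pcm_chunks())
    print(video_path)


asyncio.run(main())
```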
audio_engine/llm/__init__.py
@@ -0,0 +1,40 @@
+"""LLM (Large Language Model) providers."""
+
+from ..core.config import LLMConfig
+
+from .base import BaseLLM
+
+try:
+    from .groq import GroqLLM
+except ImportError:
+    pass
+
+__all__ = ["BaseLLM", "GroqLLM", "get_llm_from_config"]
+
+
+def get_llm_from_config(config: LLMConfig) -> BaseLLM:
+    """
+    Instantiate LLM provider from config.
+
+    Args:
+        config: LLMConfig object with provider name and settings
+
+    Returns:
+        Initialized BaseLLM provider instance
+
+    Raises:
+        ValueError: If provider name is not recognized
+    """
+    provider_name = config.provider.lower()
+
+    if provider_name == "groq":
+        return GroqLLM(
+            api_key=config.api_key,
+            model=config.model or "llama-3.1-8b-instant",
+            temperature=config.temperature,
+            max_tokens=config.max_tokens,
+            system_prompt=config.system_prompt,
+            **config.extra,
+        )
+    else:
+        raise ValueError(f"Unknown LLM provider: {config.provider}. " f"Supported: groq")
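For reference, a sketch of the factory in use. `LLMConfig` lives in `audio_engine/core/config.py`, which this diff lists but does not show, so the keyword arguments below are inferred from the attributes the factory reads and should be treated as assumptions:

```python
import os

from audio_engine.core.config import LLMConfig
from audio_engine.llm import get_llm_from_config

# Field names assumed from get_llm_from_config: provider, api_key, model,
# temperature, max_tokens, system_prompt (plus an `extra` dict of
# provider-specific kwargs, left at its default here).
config = LLMConfig(
    provider="groq",
    api_key=os.environ["GROQ_API_KEY"],
    model=None,  # falls back to "llama-3.1-8b-instant"
    temperature=0.7,
    max_tokens=1024,
    system_prompt="Answer in one sentence.",  # placeholder prompt
)

llm = get_llm_from_config(config)  # returns a GroqLLM instance

# Any other provider string raises:
#   ValueError: Unknown LLM provider: openai. Supported: groq
```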
audio_engine/llm/base.py
@@ -0,0 +1,106 @@
+"""Abstract base class for LLM providers."""
+
+from abc import ABC, abstractmethod
+from typing import AsyncIterator, Optional
+
+from ..core.types import ResponseChunk, ConversationContext
+
+
+class BaseLLM(ABC):
+    """
+    Abstract base class for Large Language Model providers.
+
+    All LLM implementations must inherit from this class and implement
+    the required methods for both batch and streaming text generation.
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        model: str = "gpt-4o",
+        temperature: float = 0.7,
+        max_tokens: int = 1024,
+        system_prompt: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Initialize the LLM provider.
+
+        Args:
+            api_key: API key for the provider
+            model: Model identifier to use
+            temperature: Sampling temperature (0.0-2.0)
+            max_tokens: Maximum tokens in response
+            system_prompt: Default system prompt
+            **kwargs: Additional provider-specific configuration
+        """
+        self.api_key = api_key
+        self.model = model
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.system_prompt = system_prompt
+        self.config = kwargs
+
+    @abstractmethod
+    async def generate(self, prompt: str, context: Optional[ConversationContext] = None) -> str:
+        """
+        Generate a complete response to a prompt.
+
+        Args:
+            prompt: User's input text
+            context: Optional conversation history
+
+        Returns:
+            Complete response text
+        """
+        pass
+
+    @abstractmethod
+    async def generate_stream(
+        self, prompt: str, context: Optional[ConversationContext] = None
+    ) -> AsyncIterator[ResponseChunk]:
+        """
+        Generate a streaming response to a prompt.
+
+        Args:
+            prompt: User's input text
+            context: Optional conversation history
+
+        Yields:
+            ResponseChunk objects with partial text
+        """
+        pass
+
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self.connect()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self.disconnect()
+
+    async def connect(self):
+        """
+        Initialize the LLM client.
+        Override in subclasses if needed.
+        """
+        pass
+
+    async def disconnect(self):
+        """
+        Clean up the LLM client.
+        Override in subclasses if needed.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Return the name of this LLM provider."""
+        pass
+
+    @property
+    def supports_streaming(self) -> bool:
+        """Whether this provider supports streaming responses."""
+        return True
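To make the contract concrete, here is a minimal hypothetical provider that satisfies the ABC: it implements the two abstract methods plus the `name` property, and inherits the no-op `connect`/`disconnect` and the context-manager plumbing.

```python
import asyncio
from typing import AsyncIterator, Optional

from audio_engine.core.types import ConversationContext, ResponseChunk
from audio_engine.llm.base import BaseLLM


class EchoLLM(BaseLLM):
    """Toy provider for tests: repeats the prompt back, word by word."""

    @property
    def name(self) -> str:
        return "echo"

    async def generate(self, prompt: str, context: Optional[ConversationContext] = None) -> str:
        # A real provider would send context.get_messages_for_llm() to its API.
        return prompt

    async def generate_stream(
        self, prompt: str, context: Optional[ConversationContext] = None
    ) -> AsyncIterator[ResponseChunk]:
        words = prompt.split()
        for i, word in enumerate(words):
            # is_final marks the last chunk, matching ResponseChunk's contract.
            yield ResponseChunk(text=word + " ", is_final=(i == len(words) - 1))


async def demo():
    # __aenter__/__aexit__ from BaseLLM call the no-op connect/disconnect.
    async with EchoLLM() as llm:
        print(await llm.generate("hello world"))
        async for chunk in llm.generate_stream("hello world"):
            print(chunk.text, chunk.is_final)


asyncio.run(demo())
```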