atom-audio-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,247 @@
1
+ Metadata-Version: 2.4
2
+ Name: atom-audio-engine
3
+ Version: 0.1.0
4
+ Summary: A pluggable, async-first Python framework for real-time audio-to-audio conversational AI
5
+ Author-email: ATOM Group <info@atomgroup.ng>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/ATOM-GROUP-NG/audio-engine
8
+ Project-URL: Repository, https://github.com/ATOM-GROUP-NG/audio-engine.git
9
+ Project-URL: Issues, https://github.com/ATOM-GROUP-NG/audio-engine/issues
10
+ Keywords: audio,speech-to-text,text-to-speech,llm,conversational-ai,real-time,streaming,websocket
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Multimedia :: Sound/Audio
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: websockets>=12.0
23
+ Requires-Dist: aiohttp>=3.9.0
24
+ Requires-Dist: python-dotenv>=1.0.0
25
+ Requires-Dist: numpy>=1.24.0
26
+ Requires-Dist: scipy>=1.10.0
27
+ Provides-Extra: asr
28
+ Requires-Dist: openai>=1.0.0; extra == "asr"
29
+ Requires-Dist: deepgram-sdk>=3.0.0; extra == "asr"
30
+ Requires-Dist: assemblyai>=0.20.0; extra == "asr"
31
+ Requires-Dist: cartesia>=1.0.0; extra == "asr"
32
+ Provides-Extra: llm
33
+ Requires-Dist: anthropic>=0.18.0; extra == "llm"
34
+ Requires-Dist: groq>=0.4.0; extra == "llm"
35
+ Provides-Extra: tts
36
+ Requires-Dist: cartesia>=1.0.0; extra == "tts"
37
+ Requires-Dist: elevenlabs>=1.0.0; extra == "tts"
38
+ Provides-Extra: all
39
+ Requires-Dist: openai>=1.0.0; extra == "all"
40
+ Requires-Dist: deepgram-sdk>=3.0.0; extra == "all"
41
+ Requires-Dist: assemblyai>=0.20.0; extra == "all"
42
+ Requires-Dist: cartesia>=1.0.0; extra == "all"
43
+ Requires-Dist: anthropic>=0.18.0; extra == "all"
44
+ Requires-Dist: groq>=0.4.0; extra == "all"
45
+ Requires-Dist: elevenlabs>=1.0.0; extra == "all"
46
+ Provides-Extra: dev
47
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
48
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
49
+ Requires-Dist: black>=23.0.0; extra == "dev"
50
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
51
+
52
+ # Audio Engine
53
+
54
+ A pluggable audio-to-audio conversational engine with real-time streaming support.
55
+
56
+ ## Features
57
+
58
+ - **Pluggable Architecture**: Swap ASR, LLM, and TTS providers easily
59
+ - **Real-time Streaming**: WebSocket server for low-latency conversations
60
+ - **GeneFace++ Integration**: Optional face animation from audio
61
+ - **Simple API**: Get started with just a few lines of code
62
+
63
+ ## Installation
64
+
65
+ ```bash
66
+ pip install atom-audio-engine
67
+ # optional provider extras: pip install "atom-audio-engine[all]"
68
+ ```
69
+
70
+ ## Quick Start
71
+
72
+ ### Basic Usage
73
+
74
+ ```python
75
+ from audio_engine import Pipeline
76
+ from audio_engine.asr import WhisperASR
77
+ from audio_engine.llm import AnthropicLLM
78
+ from audio_engine.tts import CartesiaTTS
79
+
80
+ # Create pipeline with your providers
81
+ pipeline = Pipeline(
82
+ asr=WhisperASR(api_key="your-openai-key"),
83
+ llm=AnthropicLLM(api_key="your-anthropic-key", model="claude-sonnet-4-20250514"),
84
+ tts=CartesiaTTS(api_key="your-cartesia-key", voice_id="your-voice-id"),
85
+ system_prompt="You are a helpful assistant.",
86
+ )
87
+
88
+ async with pipeline:
89
+ # Simple: process complete audio
90
+ response_audio = await pipeline.process(input_audio_bytes)
91
+
92
+ # Streaming: lower latency
93
+ async for chunk in pipeline.stream(audio_stream):
94
+ play_audio(chunk)
95
+ ```
96
+
97
+ ### WebSocket Server
98
+
99
+ ```python
100
+ from audio_engine import Pipeline
101
+ from audio_engine.streaming import WebSocketServer
102
+
103
+ pipeline = Pipeline(asr=..., llm=..., tts=...)
104
+ server = WebSocketServer(pipeline, host="0.0.0.0", port=8765)
105
+
106
+ await server.start()
107
+ ```
108
+
109
+ ### With GeneFace++ Face Animation
110
+
111
+ ```python
112
+ from audio_engine.integrations.geneface import GeneFacePipelineWrapper, GeneFaceConfig
113
+
114
+ wrapped = GeneFacePipelineWrapper(
115
+ pipeline=pipeline,
116
+ geneface_config=GeneFaceConfig(
117
+ geneface_path="/path/to/ai-geneface-realtime"
118
+ )
119
+ )
120
+
121
+ audio, video_path = await wrapped.process_with_video(input_audio)
122
+ ```
123
+
124
+ ## Architecture
125
+
126
+ ```
127
+ User Audio → ASR → LLM → TTS → Response Audio
128
+ ↓
129
+ GeneFace++ (optional)
130
+ ↓
131
+ Animated Face Video
132
+ ```
133
+
134
+ ## Directory Structure
135
+
136
+ ```
137
+ audio_engine/
138
+ ├── core/ # Pipeline and configuration
139
+ ├── asr/ # Speech-to-Text providers
140
+ ├── llm/ # LLM providers
141
+ ├── tts/ # Text-to-Speech providers
142
+ ├── streaming/ # WebSocket server
143
+ ├── integrations/ # GeneFace++ integration
144
+ ├── utils/ # Audio utilities
145
+ └── examples/ # Example scripts
146
+ ```
147
+
148
+ ## Implementing a Provider
149
+
150
+ ### Custom ASR
151
+
152
+ ```python
153
+ from audio_engine.asr.base import BaseASR
154
+
155
+ class MyASR(BaseASR):
156
+ @property
157
+ def name(self) -> str:
158
+ return "my-asr"
159
+
160
+ async def transcribe(self, audio: bytes, sample_rate: int = 16000) -> str:
161
+ # Your implementation
162
+ pass
163
+
164
+ async def transcribe_stream(self, audio_stream):
165
+ # Your streaming implementation
166
+ pass
167
+ ```
168
+
169
+ ### Custom LLM
170
+
171
+ ```python
172
+ from audio_engine.llm.base import BaseLLM
173
+
174
+ class MyLLM(BaseLLM):
175
+ @property
176
+ def name(self) -> str:
177
+ return "my-llm"
178
+
179
+ async def generate(self, prompt: str, context=None) -> str:
180
+ # Your implementation
181
+ pass
182
+
183
+ async def generate_stream(self, prompt: str, context=None):
184
+ # Your streaming implementation
185
+ pass
186
+ ```
187
+
188
+ ### Custom TTS
189
+
190
+ ```python
191
+ from audio_engine.tts.base import BaseTTS
192
+
193
+ class MyTTS(BaseTTS):
194
+ @property
195
+ def name(self) -> str:
196
+ return "my-tts"
197
+
198
+ async def synthesize(self, text: str) -> bytes:
199
+ # Your implementation
200
+ pass
201
+
202
+ async def synthesize_stream(self, text: str):
203
+ # Your streaming implementation
204
+ pass
205
+ ```
206
+
207
+ ## WebSocket Protocol
208
+
209
+ ### Client → Server
210
+
211
+ - **Binary**: Raw audio chunks (PCM 16-bit, 16kHz mono)
212
+ - **JSON**: `{"type": "end_of_speech"}` or `{"type": "reset"}`
213
+
214
+ ### Server → Client
215
+
216
+ - **Binary**: Response audio chunks
217
+ - **JSON Events**:
218
+ - `{"type": "connected", "client_id": "..."}`
219
+ - `{"type": "transcript", "text": "..."}`
220
+ - `{"type": "response_text", "text": "..."}`
221
+ - `{"type": "response_start"}`
222
+ - `{"type": "response_end"}`
223
+
224
+ ## Environment Variables
225
+
226
+ ```bash
227
+ # ASR
228
+ ASR_PROVIDER=whisper
229
+ ASR_API_KEY=your-key
230
+
231
+ # LLM
232
+ LLM_PROVIDER=anthropic
233
+ LLM_API_KEY=your-key
234
+ LLM_MODEL=claude-sonnet-4-20250514
235
+
236
+ # TTS
237
+ TTS_PROVIDER=cartesia
238
+ TTS_API_KEY=your-key
239
+ TTS_VOICE_ID=your-voice-id
240
+
241
+ # Debug
242
+ DEBUG=true
243
+ ```
244
+
245
+ ## License
246
+
247
+ MIT
@@ -0,0 +1,25 @@
1
+ asr/__init__.py,sha256=w0t2ahxgApZbZjSc748tN3tmKDeXzasfBh51ZjPF9uc,1203
2
+ asr/base.py,sha256=MFC_7HmyEDnhDwUn62CWZsiF9_-mBVVsUK-Yppiq4Vk,2378
3
+ asr/cartesia.py,sha256=BXnvscO9VaR3LsfEGn7lJ66udzUjz44JzZTmSizZqIg,13321
4
+ asr/deepgram.py,sha256=M59lgrVFMS6-3YQcYaUY7cUdt2-MBptt_VExdfnSXr0,6429
5
+ core/__init__.py,sha256=7naTEkqDjrPsejviXk662OR86xVCyckU7eMKVpjwYys,301
6
+ core/config.py,sha256=EF98O2Gt8q29FX3T6UeDwWNIbm77bni99SThiJKl5Tk,5203
7
+ core/pipeline.py,sha256=jX9jAlIfwU6V8GjqjivyK8Y7P41S-QS8xKYv5c9_qG0,8850
8
+ core/types.py,sha256=iFQPajgeS1YgMWXJvubA8sWbxLI1Z8nF-z1uucrgNm4,2295
9
+ integrations/__init__.py,sha256=1y4CTaqybOwmfk_xxkWANYkc-A7PgH0JFMZCTq33fe4,126
10
+ integrations/geneface.py,sha256=2oeVZazp2R9gN-YmQhzzrZb87CBpEiAyKA8hHUxUZJk,8788
11
+ llm/__init__.py,sha256=mwr0C1E1Wf5589fVt7emOFMA2fHoXxQ5t-3dOxkXQEI,997
12
+ llm/base.py,sha256=C-ZNOab0Ca-vlxWgnPzB8uZXFNYbPgAYfQLNvaal2KU,2873
13
+ llm/groq.py,sha256=oGSjJBW0TiCmOzzl1HTE8zUhPC78I3ywhAYFq7Te2IA,6694
14
+ pipelines/__init__.py,sha256=Q1iZjX38TigrZPBaFgv_5AXw21wBN1Z-4nfXPjV-xDI,49
15
+ streaming/__init__.py,sha256=Pd_ICcYeW75DXMsFpMrJnn9N-RU5s1_Wb3WZ3YbOTC4,136
16
+ streaming/websocket_server.py,sha256=miqHoVkUjznpmpQQrgkyaURR6DsDJLzkP_OGrBFOBYk,10994
17
+ tts/__init__.py,sha256=85XrpIkxFrRvOn19mWphkeBjTaEcsrFECYK_ZoGv1dQ,987
18
+ tts/base.py,sha256=vo0MSiep9QJQtpdCmDJWN-okK-ERYRA6Sk_g6IXCYZk,4475
19
+ tts/cartesia.py,sha256=bxhkNbWpQmlPTZ8RWcVCQzG_Q2mYr3t1aAd9OonSSWQ,17011
20
+ utils/__init__.py,sha256=WIeVykg3MqyOoCYEWsuzGyVniP8SIl9FE881ieR7WuE,250
21
+ utils/audio.py,sha256=Z7avyNqhzZ2fnBxZ_d0qUglOCCvHSffBveg5CQWTCM0,5529
22
+ atom_audio_engine-0.1.0.dist-info/METADATA,sha256=XX0wqawBJIB4MqOrjFwKOXaTUqEb7wp2CXYGhnJh5QY,6651
23
+ atom_audio_engine-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
24
+ atom_audio_engine-0.1.0.dist-info/top_level.txt,sha256=AH3Jl4o8vsxs7yvHGt0CZt3yI4xM7g5eBG9f1T4V4WE,56
25
+ atom_audio_engine-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,8 @@
1
+ asr
2
+ core
3
+ integrations
4
+ llm
5
+ pipelines
6
+ streaming
7
+ tts
8
+ utils
core/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ """Core pipeline and configuration."""
2
+
3
+ from core.pipeline import Pipeline
4
+ from core.config import AudioEngineConfig
5
+ from core.types import AudioChunk, TranscriptChunk, ResponseChunk
6
+
7
+ __all__ = [
8
+ "Pipeline",
9
+ "AudioEngineConfig",
10
+ "AudioChunk",
11
+ "TranscriptChunk",
12
+ "ResponseChunk",
13
+ ]
core/config.py ADDED
@@ -0,0 +1,162 @@
1
+ """Configuration management for the audio engine."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Optional, Any
5
+
6
# Provider defaults.
# Used as the dataclass defaults below and by AudioEngineConfig.from_env()
# whenever the corresponding *_PROVIDER environment variable is unset.
DEFAULT_ASR_PROVIDER = "cartesia"
DEFAULT_LLM_PROVIDER = "groq"
DEFAULT_TTS_PROVIDER = "cartesia"
10
+
11
+
12
@dataclass
class ASRConfig:
    """Configuration for ASR (Speech-to-Text) provider.

    Built directly, by AudioEngineConfig.from_env(), or by
    AudioEngineConfig.from_dict().
    """

    # Provider name, e.g. "cartesia" or "deepgram".
    provider: str = DEFAULT_ASR_PROVIDER  # deepgram, etc.
    # API key; from_env() falls back to CARTESIA_API_KEY, then
    # DEEPGRAM_API_KEY, when ASR_API_KEY is unset.
    api_key: Optional[str] = None
    # Provider-specific model identifier; None lets the provider pick its default.
    model: Optional[str] = None
    # Transcription language code (default English).
    language: str = "en"
    # Free-form provider-specific options -- consumer not visible here; confirm usage.
    extra: dict[str, Any] = field(default_factory=dict)
21
+
22
+
23
@dataclass
class LLMConfig:
    """Configuration for LLM provider.

    Built directly, by AudioEngineConfig.from_env(), or by
    AudioEngineConfig.from_dict().
    """

    # Provider name, e.g. "groq".
    provider: str = DEFAULT_LLM_PROVIDER  # groq, etc.
    # API key; from_env() falls back to GROQ_API_KEY when LLM_API_KEY is unset.
    api_key: Optional[str] = None
    # Model identifier; matches from_env()'s LLM_MODEL fallback default.
    model: str = "llama-3.1-8b-instant"
    # Sampling temperature.
    temperature: float = 0.7
    # Maximum tokens to generate per response.
    max_tokens: int = 1024
    # Optional system prompt; AudioEngineConfig.create_pipeline() uses this
    # unless an explicit override is passed.
    system_prompt: Optional[str] = None
    # Free-form provider-specific options -- consumer not visible here; confirm usage.
    extra: dict[str, Any] = field(default_factory=dict)
34
+
35
+
36
@dataclass
class TTSConfig:
    """Configuration for TTS (Text-to-Speech) provider.

    Built directly, by AudioEngineConfig.from_env(), or by
    AudioEngineConfig.from_dict().
    """

    # Provider name, e.g. "cartesia".
    provider: str = DEFAULT_TTS_PROVIDER  # cartesia, etc.
    # API key; from_env() falls back to CARTESIA_API_KEY when TTS_API_KEY is unset.
    api_key: Optional[str] = None
    # Provider voice identifier; from_env() reads TTS_VOICE_ID.
    voice_id: Optional[str] = None
    # Provider-specific model identifier; None lets the provider pick its default.
    model: Optional[str] = None
    # Speech rate multiplier (1.0 = normal speed -- presumably; confirm per provider).
    speed: float = 1.0
    # Free-form provider-specific options -- consumer not visible here; confirm usage.
    extra: dict[str, Any] = field(default_factory=dict)
46
+
47
+
48
@dataclass
class StreamingConfig:
    """Configuration for streaming/WebSocket server.

    NOTE(review): from_env() does not populate this section; it always keeps
    these defaults unless set directly or via from_dict().
    """

    # Bind address; 0.0.0.0 listens on all interfaces.
    host: str = "0.0.0.0"
    # WebSocket server port.
    port: int = 8765
    chunk_size_ms: int = 100  # Audio chunk size in milliseconds
    # Buffer size -- units (bytes vs. frames) not shown here; confirm in websocket_server.
    buffer_size: int = 4096
    # Timeout in seconds -- applies to client connections, presumably; confirm in server.
    timeout_seconds: float = 30.0
57
+
58
+
59
@dataclass
class GeneFaceConfig:
    """Configuration for GeneFace++ integration.

    NOTE(review): the README constructs a GeneFaceConfig with a
    ``geneface_path`` argument (from integrations.geneface), which this class
    does not define -- confirm which config class callers should use.
    """

    # Face-animation integration is off by default.
    enabled: bool = False
    # Filesystem path to the GeneFace++ model -- TODO confirm expected layout.
    model_path: Optional[str] = None
    # Output video resolution in pixels -- (width, height) ordering assumed; confirm.
    output_resolution: tuple[int, int] = (512, 512)
    # Output video frame rate.
    fps: int = 25
67
+
68
+
69
@dataclass
class AudioEngineConfig:
    """Main configuration for the audio engine.

    Aggregates the per-component configs (ASR, LLM, TTS, streaming,
    GeneFace++) plus global settings. Build one directly, from environment
    variables (from_env), or from a plain dict (from_dict), then call
    create_pipeline() to instantiate the providers.
    """

    asr: ASRConfig = field(default_factory=ASRConfig)
    llm: LLMConfig = field(default_factory=LLMConfig)
    tts: TTSConfig = field(default_factory=TTSConfig)
    streaming: StreamingConfig = field(default_factory=StreamingConfig)
    geneface: GeneFaceConfig = field(default_factory=GeneFaceConfig)

    # Global settings
    debug: bool = False
    log_level: str = "INFO"

    @classmethod
    def from_env(cls) -> "AudioEngineConfig":
        """
        Create config from environment variables.

        Supported environment variables:
        - ASR_PROVIDER: ASR provider name (default: cartesia)
        - ASR_API_KEY: ASR API key (fallbacks: CARTESIA_API_KEY, then DEEPGRAM_API_KEY)
        - LLM_PROVIDER: LLM provider name (default: groq)
        - LLM_API_KEY: LLM API key (fallback: GROQ_API_KEY)
        - LLM_MODEL: LLM model name (default: llama-3.1-8b-instant)
        - TTS_PROVIDER: TTS provider name (default: cartesia)
        - TTS_API_KEY: TTS API key (fallback: CARTESIA_API_KEY)
        - TTS_VOICE_ID: TTS voice identifier
        - DEBUG: Enable debug mode (default: false)

        Streaming and GeneFace++ settings are not read from the environment;
        they keep their dataclass defaults.
        """
        import os

        return cls(
            asr=ASRConfig(
                provider=os.getenv("ASR_PROVIDER", DEFAULT_ASR_PROVIDER),
                # Explicit ASR_API_KEY wins; otherwise try the default
                # provider's key (Cartesia) before Deepgram's.
                api_key=os.getenv("ASR_API_KEY")
                or os.getenv("CARTESIA_API_KEY")
                or os.getenv("DEEPGRAM_API_KEY"),
            ),
            llm=LLMConfig(
                provider=os.getenv("LLM_PROVIDER", DEFAULT_LLM_PROVIDER),
                api_key=os.getenv("LLM_API_KEY") or os.getenv("GROQ_API_KEY"),
                model=os.getenv("LLM_MODEL", "llama-3.1-8b-instant"),
            ),
            tts=TTSConfig(
                provider=os.getenv("TTS_PROVIDER", DEFAULT_TTS_PROVIDER),
                api_key=os.getenv("TTS_API_KEY") or os.getenv("CARTESIA_API_KEY"),
                voice_id=os.getenv("TTS_VOICE_ID"),
            ),
            debug=os.getenv("DEBUG", "false").lower() == "true",
        )

    @classmethod
    def from_dict(cls, data: dict) -> "AudioEngineConfig":
        """Create config from a dictionary.

        Each top-level key ("asr", "llm", "tts", "streaming", "geneface")
        supplies keyword arguments for the corresponding config dataclass;
        missing sections fall back to defaults. Unknown keys inside a
        section raise TypeError (dataclass __init__ rejects them).
        """
        return cls(
            asr=ASRConfig(**data.get("asr", {})),
            llm=LLMConfig(**data.get("llm", {})),
            tts=TTSConfig(**data.get("tts", {})),
            streaming=StreamingConfig(**data.get("streaming", {})),
            geneface=GeneFaceConfig(**data.get("geneface", {})),
            debug=data.get("debug", False),
            log_level=data.get("log_level", "INFO"),
        )

    def create_pipeline(self, system_prompt: Optional[str] = None) -> "Pipeline":
        """
        Create a Pipeline instance from this config.

        Args:
            system_prompt: Optional system prompt override; falls back to
                self.llm.system_prompt when omitted.

        Returns:
            Initialized Pipeline with providers

        Raises:
            ValueError: If provider initialization fails
        """
        # Imported lazily so importing the config module does not pull in
        # every provider SDK (and to avoid a circular import with
        # core.pipeline).
        from asr import get_asr_from_config
        from llm import get_llm_from_config
        from tts import get_tts_from_config
        from core.pipeline import Pipeline

        asr = get_asr_from_config(self.asr)
        llm = get_llm_from_config(self.llm)
        tts = get_tts_from_config(self.tts)

        return Pipeline(
            asr=asr,
            llm=llm,
            tts=tts,
            system_prompt=system_prompt or self.llm.system_prompt,
            debug=self.debug,
        )