atom-audio-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,247 @@
1
+ Metadata-Version: 2.4
2
+ Name: atom-audio-engine
3
+ Version: 0.1.0
4
+ Summary: A pluggable, async-first Python framework for real-time audio-to-audio conversational AI
5
+ Author-email: ATOM Group <info@atomgroup.ng>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/ATOM-GROUP-NG/audio-engine
8
+ Project-URL: Repository, https://github.com/ATOM-GROUP-NG/audio-engine.git
9
+ Project-URL: Issues, https://github.com/ATOM-GROUP-NG/audio-engine/issues
10
+ Keywords: audio,speech-to-text,text-to-speech,llm,conversational-ai,real-time,streaming,websocket
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Multimedia :: Sound/Audio
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: websockets>=12.0
23
+ Requires-Dist: aiohttp>=3.9.0
24
+ Requires-Dist: python-dotenv>=1.0.0
25
+ Requires-Dist: numpy>=1.24.0
26
+ Requires-Dist: scipy>=1.10.0
27
+ Provides-Extra: asr
28
+ Requires-Dist: openai>=1.0.0; extra == "asr"
29
+ Requires-Dist: deepgram-sdk>=3.0.0; extra == "asr"
30
+ Requires-Dist: assemblyai>=0.20.0; extra == "asr"
31
+ Requires-Dist: cartesia>=1.0.0; extra == "asr"
32
+ Provides-Extra: llm
33
+ Requires-Dist: anthropic>=0.18.0; extra == "llm"
34
+ Requires-Dist: groq>=0.4.0; extra == "llm"
35
+ Provides-Extra: tts
36
+ Requires-Dist: cartesia>=1.0.0; extra == "tts"
37
+ Requires-Dist: elevenlabs>=1.0.0; extra == "tts"
38
+ Provides-Extra: all
39
+ Requires-Dist: openai>=1.0.0; extra == "all"
40
+ Requires-Dist: deepgram-sdk>=3.0.0; extra == "all"
41
+ Requires-Dist: assemblyai>=0.20.0; extra == "all"
42
+ Requires-Dist: cartesia>=1.0.0; extra == "all"
43
+ Requires-Dist: anthropic>=0.18.0; extra == "all"
44
+ Requires-Dist: groq>=0.4.0; extra == "all"
45
+ Requires-Dist: elevenlabs>=1.0.0; extra == "all"
46
+ Provides-Extra: dev
47
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
48
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
49
+ Requires-Dist: black>=23.0.0; extra == "dev"
50
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
51
+
52
+ # Audio Engine
53
+
54
+ A pluggable audio-to-audio conversational engine with real-time streaming support.
55
+
56
+ ## Features
57
+
58
+ - **Pluggable Architecture**: Swap ASR, LLM, and TTS providers easily
59
+ - **Real-time Streaming**: WebSocket server for low-latency conversations
60
+ - **GeneFace++ Integration**: Optional face animation from audio
61
+ - **Simple API**: Get started with just a few lines of code
62
+
63
+ ## Installation
64
+
65
+ ```bash
66
+ pip install atom-audio-engine
67
+ # optional provider extras: pip install "atom-audio-engine[all]"
68
+ ```
69
+
70
+ ## Quick Start
71
+
72
+ ### Basic Usage
73
+
74
+ ```python
75
+ from audio_engine import Pipeline
76
+ from audio_engine.asr import WhisperASR
77
+ from audio_engine.llm import AnthropicLLM
78
+ from audio_engine.tts import CartesiaTTS
79
+
80
+ # Create pipeline with your providers
81
+ pipeline = Pipeline(
82
+ asr=WhisperASR(api_key="your-openai-key"),
83
+ llm=AnthropicLLM(api_key="your-anthropic-key", model="claude-sonnet-4-20250514"),
84
+ tts=CartesiaTTS(api_key="your-cartesia-key", voice_id="your-voice-id"),
85
+ system_prompt="You are a helpful assistant.",
86
+ )
87
+
88
+ async with pipeline:
89
+ # Simple: process complete audio
90
+ response_audio = await pipeline.process(input_audio_bytes)
91
+
92
+ # Streaming: lower latency
93
+ async for chunk in pipeline.stream(audio_stream):
94
+ play_audio(chunk)
95
+ ```
96
+
97
+ ### WebSocket Server
98
+
99
+ ```python
100
+ from audio_engine import Pipeline
101
+ from audio_engine.streaming import WebSocketServer
102
+
103
+ pipeline = Pipeline(asr=..., llm=..., tts=...)
104
+ server = WebSocketServer(pipeline, host="0.0.0.0", port=8765)
105
+
106
+ await server.start()
107
+ ```
108
+
109
+ ### With GeneFace++ Face Animation
110
+
111
+ ```python
112
+ from audio_engine.integrations.geneface import GeneFacePipelineWrapper, GeneFaceConfig
113
+
114
+ wrapped = GeneFacePipelineWrapper(
115
+ pipeline=pipeline,
116
+ geneface_config=GeneFaceConfig(
117
+ geneface_path="/path/to/ai-geneface-realtime"
118
+ )
119
+ )
120
+
121
+ audio, video_path = await wrapped.process_with_video(input_audio)
122
+ ```
123
+
124
+ ## Architecture
125
+
126
+ ```
127
+ User Audio → ASR → LLM → TTS → Response Audio
128
+ ↓
129
+ GeneFace++ (optional)
130
+ ↓
131
+ Animated Face Video
132
+ ```
133
+
134
+ ## Directory Structure
135
+
136
+ ```
137
+ audio_engine/
138
+ ├── core/ # Pipeline and configuration
139
+ ├── asr/ # Speech-to-Text providers
140
+ ├── llm/ # LLM providers
141
+ ├── tts/ # Text-to-Speech providers
142
+ ├── streaming/ # WebSocket server
143
+ ├── integrations/ # GeneFace++ integration
144
+ ├── utils/ # Audio utilities
145
+ └── examples/ # Example scripts
146
+ ```
147
+
148
+ ## Implementing a Provider
149
+
150
+ ### Custom ASR
151
+
152
+ ```python
153
+ from audio_engine.asr.base import BaseASR
154
+
155
+ class MyASR(BaseASR):
156
+ @property
157
+ def name(self) -> str:
158
+ return "my-asr"
159
+
160
+ async def transcribe(self, audio: bytes, sample_rate: int = 16000) -> str:
161
+ # Your implementation
162
+ pass
163
+
164
+ async def transcribe_stream(self, audio_stream):
165
+ # Your streaming implementation
166
+ pass
167
+ ```
168
+
169
+ ### Custom LLM
170
+
171
+ ```python
172
+ from audio_engine.llm.base import BaseLLM
173
+
174
+ class MyLLM(BaseLLM):
175
+ @property
176
+ def name(self) -> str:
177
+ return "my-llm"
178
+
179
+ async def generate(self, prompt: str, context=None) -> str:
180
+ # Your implementation
181
+ pass
182
+
183
+ async def generate_stream(self, prompt: str, context=None):
184
+ # Your streaming implementation
185
+ pass
186
+ ```
187
+
188
+ ### Custom TTS
189
+
190
+ ```python
191
+ from audio_engine.tts.base import BaseTTS
192
+
193
+ class MyTTS(BaseTTS):
194
+ @property
195
+ def name(self) -> str:
196
+ return "my-tts"
197
+
198
+ async def synthesize(self, text: str) -> bytes:
199
+ # Your implementation
200
+ pass
201
+
202
+ async def synthesize_stream(self, text: str):
203
+ # Your streaming implementation
204
+ pass
205
+ ```
206
+
207
+ ## WebSocket Protocol
208
+
209
+ ### Client → Server
210
+
211
+ - **Binary**: Raw audio chunks (PCM 16-bit, 16kHz mono)
212
+ - **JSON**: `{"type": "end_of_speech"}` or `{"type": "reset"}`
213
+
214
+ ### Server → Client
215
+
216
+ - **Binary**: Response audio chunks
217
+ - **JSON Events**:
218
+ - `{"type": "connected", "client_id": "..."}`
219
+ - `{"type": "transcript", "text": "..."}`
220
+ - `{"type": "response_text", "text": "..."}`
221
+ - `{"type": "response_start"}`
222
+ - `{"type": "response_end"}`
223
+
224
+ ## Environment Variables
225
+
226
+ ```bash
227
+ # ASR
228
+ ASR_PROVIDER=whisper
229
+ ASR_API_KEY=your-key
230
+
231
+ # LLM
232
+ LLM_PROVIDER=anthropic
233
+ LLM_API_KEY=your-key
234
+ LLM_MODEL=claude-sonnet-4-20250514
235
+
236
+ # TTS
237
+ TTS_PROVIDER=cartesia
238
+ TTS_API_KEY=your-key
239
+ TTS_VOICE_ID=your-voice-id
240
+
241
+ # Debug
242
+ DEBUG=true
243
+ ```
244
+
245
+ ## License
246
+
247
+ MIT
@@ -0,0 +1,25 @@
1
+ asr/__init__.py,sha256=w0t2ahxgApZbZjSc748tN3tmKDeXzasfBh51ZjPF9uc,1203
2
+ asr/base.py,sha256=MFC_7HmyEDnhDwUn62CWZsiF9_-mBVVsUK-Yppiq4Vk,2378
3
+ asr/cartesia.py,sha256=BXnvscO9VaR3LsfEGn7lJ66udzUjz44JzZTmSizZqIg,13321
4
+ asr/deepgram.py,sha256=M59lgrVFMS6-3YQcYaUY7cUdt2-MBptt_VExdfnSXr0,6429
5
+ core/__init__.py,sha256=7naTEkqDjrPsejviXk662OR86xVCyckU7eMKVpjwYys,301
6
+ core/config.py,sha256=EF98O2Gt8q29FX3T6UeDwWNIbm77bni99SThiJKl5Tk,5203
7
+ core/pipeline.py,sha256=jX9jAlIfwU6V8GjqjivyK8Y7P41S-QS8xKYv5c9_qG0,8850
8
+ core/types.py,sha256=iFQPajgeS1YgMWXJvubA8sWbxLI1Z8nF-z1uucrgNm4,2295
9
+ integrations/__init__.py,sha256=1y4CTaqybOwmfk_xxkWANYkc-A7PgH0JFMZCTq33fe4,126
10
+ integrations/geneface.py,sha256=2oeVZazp2R9gN-YmQhzzrZb87CBpEiAyKA8hHUxUZJk,8788
11
+ llm/__init__.py,sha256=mwr0C1E1Wf5589fVt7emOFMA2fHoXxQ5t-3dOxkXQEI,997
12
+ llm/base.py,sha256=C-ZNOab0Ca-vlxWgnPzB8uZXFNYbPgAYfQLNvaal2KU,2873
13
+ llm/groq.py,sha256=oGSjJBW0TiCmOzzl1HTE8zUhPC78I3ywhAYFq7Te2IA,6694
14
+ pipelines/__init__.py,sha256=Q1iZjX38TigrZPBaFgv_5AXw21wBN1Z-4nfXPjV-xDI,49
15
+ streaming/__init__.py,sha256=Pd_ICcYeW75DXMsFpMrJnn9N-RU5s1_Wb3WZ3YbOTC4,136
16
+ streaming/websocket_server.py,sha256=miqHoVkUjznpmpQQrgkyaURR6DsDJLzkP_OGrBFOBYk,10994
17
+ tts/__init__.py,sha256=85XrpIkxFrRvOn19mWphkeBjTaEcsrFECYK_ZoGv1dQ,987
18
+ tts/base.py,sha256=vo0MSiep9QJQtpdCmDJWN-okK-ERYRA6Sk_g6IXCYZk,4475
19
+ tts/cartesia.py,sha256=bxhkNbWpQmlPTZ8RWcVCQzG_Q2mYr3t1aAd9OonSSWQ,17011
20
+ utils/__init__.py,sha256=WIeVykg3MqyOoCYEWsuzGyVniP8SIl9FE881ieR7WuE,250
21
+ utils/audio.py,sha256=Z7avyNqhzZ2fnBxZ_d0qUglOCCvHSffBveg5CQWTCM0,5529
22
+ atom_audio_engine-0.1.0.dist-info/METADATA,sha256=XX0wqawBJIB4MqOrjFwKOXaTUqEb7wp2CXYGhnJh5QY,6651
23
+ atom_audio_engine-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
24
+ atom_audio_engine-0.1.0.dist-info/top_level.txt,sha256=AH3Jl4o8vsxs7yvHGt0CZt3yI4xM7g5eBG9f1T4V4WE,56
25
+ atom_audio_engine-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,8 @@
1
+ asr
2
+ core
3
+ integrations
4
+ llm
5
+ pipelines
6
+ streaming
7
+ tts
8
+ utils
core/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ """Core pipeline and configuration."""
2
+
3
+ from core.pipeline import Pipeline
4
+ from core.config import AudioEngineConfig
5
+ from core.types import AudioChunk, TranscriptChunk, ResponseChunk
6
+
7
+ __all__ = [
8
+ "Pipeline",
9
+ "AudioEngineConfig",
10
+ "AudioChunk",
11
+ "TranscriptChunk",
12
+ "ResponseChunk",
13
+ ]
core/config.py ADDED
@@ -0,0 +1,162 @@
1
+ """Configuration management for the audio engine."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Optional, Any
5
+
6
# Provider defaults.
# Used as the dataclass defaults below and by AudioEngineConfig.from_env()
# whenever the corresponding *_PROVIDER environment variable is unset.
DEFAULT_ASR_PROVIDER = "cartesia"
DEFAULT_LLM_PROVIDER = "groq"
DEFAULT_TTS_PROVIDER = "cartesia"
10
+
11
+
12
@dataclass
class ASRConfig:
    """Configuration for ASR (Speech-to-Text) provider.

    Built directly, by AudioEngineConfig.from_env(), or by
    AudioEngineConfig.from_dict().
    """

    # Provider name, e.g. "cartesia" or "deepgram".
    provider: str = DEFAULT_ASR_PROVIDER  # deepgram, etc.
    # API key; from_env() falls back to CARTESIA_API_KEY, then
    # DEEPGRAM_API_KEY, when ASR_API_KEY is unset.
    api_key: Optional[str] = None
    # Provider-specific model identifier; None lets the provider pick its default.
    model: Optional[str] = None
    # Transcription language code (default English).
    language: str = "en"
    # Free-form provider-specific options -- consumer not visible here; confirm usage.
    extra: dict[str, Any] = field(default_factory=dict)
21
+
22
+
23
@dataclass
class LLMConfig:
    """Configuration for LLM provider.

    Built directly, by AudioEngineConfig.from_env(), or by
    AudioEngineConfig.from_dict().
    """

    # Provider name, e.g. "groq".
    provider: str = DEFAULT_LLM_PROVIDER  # groq, etc.
    # API key; from_env() falls back to GROQ_API_KEY when LLM_API_KEY is unset.
    api_key: Optional[str] = None
    # Model identifier; matches from_env()'s LLM_MODEL fallback default.
    model: str = "llama-3.1-8b-instant"
    # Sampling temperature.
    temperature: float = 0.7
    # Maximum tokens to generate per response.
    max_tokens: int = 1024
    # Optional system prompt; AudioEngineConfig.create_pipeline() uses this
    # unless an explicit override is passed.
    system_prompt: Optional[str] = None
    # Free-form provider-specific options -- consumer not visible here; confirm usage.
    extra: dict[str, Any] = field(default_factory=dict)
34
+
35
+
36
@dataclass
class TTSConfig:
    """Configuration for TTS (Text-to-Speech) provider.

    Built directly, by AudioEngineConfig.from_env(), or by
    AudioEngineConfig.from_dict().
    """

    # Provider name, e.g. "cartesia".
    provider: str = DEFAULT_TTS_PROVIDER  # cartesia, etc.
    # API key; from_env() falls back to CARTESIA_API_KEY when TTS_API_KEY is unset.
    api_key: Optional[str] = None
    # Provider voice identifier; from_env() reads TTS_VOICE_ID.
    voice_id: Optional[str] = None
    # Provider-specific model identifier; None lets the provider pick its default.
    model: Optional[str] = None
    # Speech rate multiplier (1.0 = normal speed -- presumably; confirm per provider).
    speed: float = 1.0
    # Free-form provider-specific options -- consumer not visible here; confirm usage.
    extra: dict[str, Any] = field(default_factory=dict)
46
+
47
+
48
@dataclass
class StreamingConfig:
    """Configuration for streaming/WebSocket server.

    NOTE(review): from_env() does not populate this section; it always keeps
    these defaults unless set directly or via from_dict().
    """

    # Bind address; 0.0.0.0 listens on all interfaces.
    host: str = "0.0.0.0"
    # WebSocket server port.
    port: int = 8765
    chunk_size_ms: int = 100  # Audio chunk size in milliseconds
    # Buffer size -- units (bytes vs. frames) not shown here; confirm in websocket_server.
    buffer_size: int = 4096
    # Timeout in seconds -- applies to client connections, presumably; confirm in server.
    timeout_seconds: float = 30.0
57
+
58
+
59
@dataclass
class GeneFaceConfig:
    """Configuration for GeneFace++ integration.

    NOTE(review): the README constructs a GeneFaceConfig with a
    ``geneface_path`` argument (from integrations.geneface), which this class
    does not define -- confirm which config class callers should use.
    """

    # Face-animation integration is off by default.
    enabled: bool = False
    # Filesystem path to the GeneFace++ model -- TODO confirm expected layout.
    model_path: Optional[str] = None
    # Output video resolution in pixels -- (width, height) ordering assumed; confirm.
    output_resolution: tuple[int, int] = (512, 512)
    # Output video frame rate.
    fps: int = 25
67
+
68
+
69
@dataclass
class AudioEngineConfig:
    """Main configuration for the audio engine.

    Aggregates the per-component configs (ASR, LLM, TTS, streaming,
    GeneFace++) plus global settings. Build one directly, from environment
    variables (from_env), or from a plain dict (from_dict), then call
    create_pipeline() to instantiate the providers.
    """

    asr: ASRConfig = field(default_factory=ASRConfig)
    llm: LLMConfig = field(default_factory=LLMConfig)
    tts: TTSConfig = field(default_factory=TTSConfig)
    streaming: StreamingConfig = field(default_factory=StreamingConfig)
    geneface: GeneFaceConfig = field(default_factory=GeneFaceConfig)

    # Global settings
    debug: bool = False
    log_level: str = "INFO"

    @classmethod
    def from_env(cls) -> "AudioEngineConfig":
        """
        Create config from environment variables.

        Supported environment variables:
        - ASR_PROVIDER: ASR provider name (default: cartesia)
        - ASR_API_KEY: ASR API key (fallbacks: CARTESIA_API_KEY, then DEEPGRAM_API_KEY)
        - LLM_PROVIDER: LLM provider name (default: groq)
        - LLM_API_KEY: LLM API key (fallback: GROQ_API_KEY)
        - LLM_MODEL: LLM model name (default: llama-3.1-8b-instant)
        - TTS_PROVIDER: TTS provider name (default: cartesia)
        - TTS_API_KEY: TTS API key (fallback: CARTESIA_API_KEY)
        - TTS_VOICE_ID: TTS voice identifier
        - DEBUG: Enable debug mode (default: false)

        Streaming and GeneFace++ settings are not read from the environment;
        they keep their dataclass defaults.
        """
        import os

        return cls(
            asr=ASRConfig(
                provider=os.getenv("ASR_PROVIDER", DEFAULT_ASR_PROVIDER),
                # Explicit ASR_API_KEY wins; otherwise try the default
                # provider's key (Cartesia) before Deepgram's.
                api_key=os.getenv("ASR_API_KEY")
                or os.getenv("CARTESIA_API_KEY")
                or os.getenv("DEEPGRAM_API_KEY"),
            ),
            llm=LLMConfig(
                provider=os.getenv("LLM_PROVIDER", DEFAULT_LLM_PROVIDER),
                api_key=os.getenv("LLM_API_KEY") or os.getenv("GROQ_API_KEY"),
                model=os.getenv("LLM_MODEL", "llama-3.1-8b-instant"),
            ),
            tts=TTSConfig(
                provider=os.getenv("TTS_PROVIDER", DEFAULT_TTS_PROVIDER),
                api_key=os.getenv("TTS_API_KEY") or os.getenv("CARTESIA_API_KEY"),
                voice_id=os.getenv("TTS_VOICE_ID"),
            ),
            debug=os.getenv("DEBUG", "false").lower() == "true",
        )

    @classmethod
    def from_dict(cls, data: dict) -> "AudioEngineConfig":
        """Create config from a dictionary.

        Each top-level key ("asr", "llm", "tts", "streaming", "geneface")
        supplies keyword arguments for the corresponding config dataclass;
        missing sections fall back to defaults. Unknown keys inside a
        section raise TypeError (dataclass __init__ rejects them).
        """
        return cls(
            asr=ASRConfig(**data.get("asr", {})),
            llm=LLMConfig(**data.get("llm", {})),
            tts=TTSConfig(**data.get("tts", {})),
            streaming=StreamingConfig(**data.get("streaming", {})),
            geneface=GeneFaceConfig(**data.get("geneface", {})),
            debug=data.get("debug", False),
            log_level=data.get("log_level", "INFO"),
        )

    def create_pipeline(self, system_prompt: Optional[str] = None) -> "Pipeline":
        """
        Create a Pipeline instance from this config.

        Args:
            system_prompt: Optional system prompt override; falls back to
                self.llm.system_prompt when omitted.

        Returns:
            Initialized Pipeline with providers

        Raises:
            ValueError: If provider initialization fails
        """
        # Imported lazily so importing the config module does not pull in
        # every provider SDK (and to avoid a circular import with
        # core.pipeline).
        from asr import get_asr_from_config
        from llm import get_llm_from_config
        from tts import get_tts_from_config
        from core.pipeline import Pipeline

        asr = get_asr_from_config(self.asr)
        llm = get_llm_from_config(self.llm)
        tts = get_tts_from_config(self.tts)

        return Pipeline(
            asr=asr,
            llm=llm,
            tts=tts,
            system_prompt=system_prompt or self.llm.system_prompt,
            debug=self.debug,
        )