atom-audio-engine 0.1.1 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atom_audio_engine-0.1.1/PKG-INFO +252 -0
- atom_audio_engine-0.1.1/README.md +201 -0
- atom_audio_engine-0.1.1/atom_audio_engine.egg-info/PKG-INFO +252 -0
- atom_audio_engine-0.1.1/atom_audio_engine.egg-info/SOURCES.txt +9 -0
- atom_audio_engine-0.1.1/atom_audio_engine.egg-info/dependency_links.txt +1 -0
- atom_audio_engine-0.1.1/atom_audio_engine.egg-info/requires.txt +34 -0
- atom_audio_engine-0.1.1/atom_audio_engine.egg-info/top_level.txt +1 -0
- atom_audio_engine-0.1.1/audio_engine/__init__.py +80 -0
- atom_audio_engine-0.1.1/pyproject.toml +123 -0
- atom_audio_engine-0.1.1/setup.cfg +4 -0
- atom_audio_engine-0.1.1/setup.py +6 -0
@@ -0,0 +1,252 @@
Metadata-Version: 2.4
Name: atom-audio-engine
Version: 0.1.1
Summary: A pluggable, async-first Python framework for real-time audio-to-audio conversational AI
Author-email: ATOM Group <info@atomgroup.ng>
License-Expression: MIT
Project-URL: Homepage, https://github.com/ATOM-GROUP-NG/audio-engine
Project-URL: Repository, https://github.com/ATOM-GROUP-NG/audio-engine.git
Project-URL: Issues, https://github.com/ATOM-GROUP-NG/audio-engine/issues
Keywords: audio,speech-to-text,text-to-speech,llm,conversational-ai,real-time,streaming,websocket
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Multimedia :: Sound/Audio
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.9
Description-Content-Type: text/markdown
Requires-Dist: websockets>=12.0
Requires-Dist: aiohttp>=3.9.0
Requires-Dist: python-dotenv>=1.0.0
Requires-Dist: numpy>=1.24.0
Requires-Dist: scipy>=1.10.0
Provides-Extra: asr
Requires-Dist: openai>=1.0.0; extra == "asr"
Requires-Dist: deepgram-sdk>=3.0.0; extra == "asr"
Requires-Dist: assemblyai>=0.20.0; extra == "asr"
Requires-Dist: cartesia>=1.0.0; extra == "asr"
Provides-Extra: llm
Requires-Dist: anthropic>=0.18.0; extra == "llm"
Requires-Dist: groq>=0.4.0; extra == "llm"
Provides-Extra: tts
Requires-Dist: cartesia>=1.0.0; extra == "tts"
Requires-Dist: elevenlabs>=1.0.0; extra == "tts"
Provides-Extra: all
Requires-Dist: openai>=1.0.0; extra == "all"
Requires-Dist: deepgram-sdk>=3.0.0; extra == "all"
Requires-Dist: assemblyai>=0.20.0; extra == "all"
Requires-Dist: cartesia>=1.0.0; extra == "all"
Requires-Dist: anthropic>=0.18.0; extra == "all"
Requires-Dist: groq>=0.4.0; extra == "all"
Requires-Dist: elevenlabs>=1.0.0; extra == "all"
Provides-Extra: dev
Requires-Dist: pytest>=7.0.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
Requires-Dist: black>=23.0.0; extra == "dev"
Requires-Dist: ruff>=0.1.0; extra == "dev"

# Audio Engine

A pluggable audio-to-audio conversational engine with real-time streaming support.

## Features

- **Pluggable Architecture**: Swap ASR, LLM, and TTS providers easily
- **Real-time Streaming**: WebSocket server for low-latency conversations
- **GeneFace++ Integration**: Optional face animation from audio
- **Simple API**: Get started with just a few lines of code

## Installation

```bash
pip install atom-audio-engine
```

For development with all optional dependencies:

```bash
pip install "atom-audio-engine[all,dev]"
```

## Quick Start

### Basic Usage

```python
from audio_engine import Pipeline
from audio_engine.asr import CartesiaASR
from audio_engine.llm import GroqLLM
from audio_engine.tts import CartesiaTTS

# Create pipeline with your providers
pipeline = Pipeline(
    asr=CartesiaASR(api_key="your-cartesia-key"),
    llm=GroqLLM(api_key="your-groq-key", model="mixtral-8x7b-32768"),
    tts=CartesiaTTS(api_key="your-cartesia-key", voice_id="your-voice-id"),
    system_prompt="You are a helpful assistant.",
)

async with pipeline:
    # Simple: process complete audio
    response_audio = await pipeline.process(input_audio_bytes)

    # Streaming: lower latency
    async for chunk in pipeline.stream(audio_stream):
        play_audio(chunk)
```

### WebSocket Server

```python
import asyncio

from audio_engine import Pipeline
from audio_engine.streaming import WebSocketServer

async def main():
    pipeline = Pipeline(asr=..., llm=..., tts=...)
    server = WebSocketServer(pipeline, host="0.0.0.0", port=8765)
    await server.start()

asyncio.run(main())
```

### With GeneFace++ Face Animation

```python
from audio_engine.integrations.geneface import GeneFacePipelineWrapper, GeneFaceConfig

wrapped = GeneFacePipelineWrapper(
    pipeline=pipeline,
    geneface_config=GeneFaceConfig(
        geneface_path="/path/to/ai-geneface-realtime"
    )
)

audio, video_path = await wrapped.process_with_video(input_audio)
```

## Architecture

```
User Audio → ASR → LLM → TTS → Response Audio
                                      ↓
                            GeneFace++ (optional)
                                      ↓
                             Animated Face Video
```

## Directory Structure

```
audio_engine/
├── core/           # Pipeline and configuration
├── asr/            # Speech-to-Text providers
├── llm/            # LLM providers
├── tts/            # Text-to-Speech providers
├── streaming/      # WebSocket server
├── integrations/   # GeneFace++ integration
├── utils/          # Audio utilities
└── examples/       # Example scripts
```

## Implementing a Provider

### Custom ASR

```python
from audio_engine.asr.base import BaseASR

class MyASR(BaseASR):
    @property
    def name(self) -> str:
        return "my-asr"

    async def transcribe(self, audio: bytes, sample_rate: int = 16000) -> str:
        # Your implementation
        pass

    async def transcribe_stream(self, audio_stream):
        # Your streaming implementation
        pass
```

### Custom LLM

```python
from audio_engine.llm.base import BaseLLM

class MyLLM(BaseLLM):
    @property
    def name(self) -> str:
        return "my-llm"

    async def generate(self, prompt: str, context=None) -> str:
        # Your implementation
        pass

    async def generate_stream(self, prompt: str, context=None):
        # Your streaming implementation
        pass
```

### Custom TTS

```python
from audio_engine.tts.base import BaseTTS

class MyTTS(BaseTTS):
    @property
    def name(self) -> str:
        return "my-tts"

    async def synthesize(self, text: str) -> bytes:
        # Your implementation
        pass

    async def synthesize_stream(self, text: str):
        # Your streaming implementation
        pass
```

## WebSocket Protocol

### Client → Server

- **Binary**: Raw audio chunks (PCM 16-bit, 16kHz mono)
- **JSON**: `{"type": "end_of_speech"}` or `{"type": "reset"}`

### Server → Client

- **Binary**: Response audio chunks
- **JSON Events**:
  - `{"type": "connected", "client_id": "..."}`
  - `{"type": "transcript", "text": "..."}`
  - `{"type": "response_text", "text": "..."}`
  - `{"type": "response_start"}`
  - `{"type": "response_end"}`

## Environment Variables

```bash
# ASR
ASR_PROVIDER=whisper
ASR_API_KEY=your-key

# LLM
LLM_PROVIDER=anthropic
LLM_API_KEY=your-key
LLM_MODEL=claude-sonnet-4-20250514

# TTS
TTS_PROVIDER=cartesia
TTS_API_KEY=your-key
TTS_VOICE_ID=your-voice-id

# Debug
DEBUG=true
```

## License

MIT

@@ -0,0 +1,201 @@
# Audio Engine

A pluggable audio-to-audio conversational engine with real-time streaming support.

## Features

- **Pluggable Architecture**: Swap ASR, LLM, and TTS providers easily
- **Real-time Streaming**: WebSocket server for low-latency conversations
- **GeneFace++ Integration**: Optional face animation from audio
- **Simple API**: Get started with just a few lines of code

## Installation

```bash
pip install atom-audio-engine
```

For development with all optional dependencies:

```bash
pip install "atom-audio-engine[all,dev]"
```

## Quick Start

### Basic Usage

```python
from audio_engine import Pipeline
from audio_engine.asr import CartesiaASR
from audio_engine.llm import GroqLLM
from audio_engine.tts import CartesiaTTS

# Create pipeline with your providers
pipeline = Pipeline(
    asr=CartesiaASR(api_key="your-cartesia-key"),
    llm=GroqLLM(api_key="your-groq-key", model="mixtral-8x7b-32768"),
    tts=CartesiaTTS(api_key="your-cartesia-key", voice_id="your-voice-id"),
    system_prompt="You are a helpful assistant.",
)

async with pipeline:
    # Simple: process complete audio
    response_audio = await pipeline.process(input_audio_bytes)

    # Streaming: lower latency
    async for chunk in pipeline.stream(audio_stream):
        play_audio(chunk)
```
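
`pipeline.stream(...)` consumes an async iterator of raw audio chunks. A minimal sketch of one, reading 16 kHz 16-bit mono PCM from a file (the ~100 ms chunk size is an illustrative assumption, not a documented requirement):

```python
import asyncio

async def file_chunks(path: str, chunk_bytes: int = 3200):
    # 3200 bytes ≈ 100 ms of 16 kHz 16-bit mono PCM
    with open(path, "rb") as f:
        while chunk := f.read(chunk_bytes):
            yield chunk
            await asyncio.sleep(0)  # hand control back to the event loop

# usage: async for chunk in pipeline.stream(file_chunks("utterance.pcm")): ...
```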

### WebSocket Server

```python
import asyncio

from audio_engine import Pipeline
from audio_engine.streaming import WebSocketServer

async def main():
    pipeline = Pipeline(asr=..., llm=..., tts=...)
    server = WebSocketServer(pipeline, host="0.0.0.0", port=8765)
    await server.start()

asyncio.run(main())
```

### With GeneFace++ Face Animation

```python
from audio_engine.integrations.geneface import GeneFacePipelineWrapper, GeneFaceConfig

wrapped = GeneFacePipelineWrapper(
    pipeline=pipeline,
    geneface_config=GeneFaceConfig(
        geneface_path="/path/to/ai-geneface-realtime"
    )
)

audio, video_path = await wrapped.process_with_video(input_audio)
```

## Architecture

```
User Audio → ASR → LLM → TTS → Response Audio
                                      ↓
                            GeneFace++ (optional)
                                      ↓
                             Animated Face Video
```

## Directory Structure

```
audio_engine/
├── core/           # Pipeline and configuration
├── asr/            # Speech-to-Text providers
├── llm/            # LLM providers
├── tts/            # Text-to-Speech providers
├── streaming/      # WebSocket server
├── integrations/   # GeneFace++ integration
├── utils/          # Audio utilities
└── examples/       # Example scripts
```

## Implementing a Provider

### Custom ASR

```python
from audio_engine.asr.base import BaseASR

class MyASR(BaseASR):
    @property
    def name(self) -> str:
        return "my-asr"

    async def transcribe(self, audio: bytes, sample_rate: int = 16000) -> str:
        # Your implementation
        pass

    async def transcribe_stream(self, audio_stream):
        # Your streaming implementation
        pass
```

### Custom LLM

```python
from audio_engine.llm.base import BaseLLM

class MyLLM(BaseLLM):
    @property
    def name(self) -> str:
        return "my-llm"

    async def generate(self, prompt: str, context=None) -> str:
        # Your implementation
        pass

    async def generate_stream(self, prompt: str, context=None):
        # Your streaming implementation
        pass
```

### Custom TTS

```python
from audio_engine.tts.base import BaseTTS

class MyTTS(BaseTTS):
    @property
    def name(self) -> str:
        return "my-tts"

    async def synthesize(self, text: str) -> bytes:
        # Your implementation
        pass

    async def synthesize_stream(self, text: str):
        # Your streaming implementation
        pass
```
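
Each skeleton above plugs straight into `Pipeline`. One detail the stubs gloss over: the `*_stream` methods are consumed with `async for`, so they must be async generators, i.e. they need a `yield`. A toy sketch (the silence payload is purely illustrative):

```python
from audio_engine.tts.base import BaseTTS

class SilentTTS(BaseTTS):
    @property
    def name(self) -> str:
        return "silent-tts"

    async def synthesize(self, text: str) -> bytes:
        # ~0.1 s of 16-bit PCM silence per character (illustrative only)
        return b"\x00\x00" * (1600 * len(text))

    async def synthesize_stream(self, text: str):
        for word in text.split():
            yield await self.synthesize(word)  # the yield makes this an async generator
```

Custom ASR and LLM providers stream the same way.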

## WebSocket Protocol

### Client → Server

- **Binary**: Raw audio chunks (PCM 16-bit, 16kHz mono)
- **JSON**: `{"type": "end_of_speech"}` or `{"type": "reset"}`

### Server → Client

- **Binary**: Response audio chunks
- **JSON Events**:
  - `{"type": "connected", "client_id": "..."}`
  - `{"type": "transcript", "text": "..."}`
  - `{"type": "response_text", "text": "..."}`
  - `{"type": "response_start"}`
  - `{"type": "response_end"}`
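
A minimal Python client sketch for this protocol (the server address, chunking, and playback handling are assumptions for illustration):

```python
import asyncio
import json

import websockets

async def talk(pcm_bytes: bytes) -> None:
    async with websockets.connect("ws://localhost:8765") as ws:
        # Send audio in ~100 ms chunks (16000 Hz * 2 bytes * 0.1 s = 3200 bytes).
        for i in range(0, len(pcm_bytes), 3200):
            await ws.send(pcm_bytes[i:i + 3200])
        await ws.send(json.dumps({"type": "end_of_speech"}))

        # Read audio and events until the server signals the end of the response.
        while True:
            msg = await ws.recv()
            if isinstance(msg, bytes):
                continue  # response audio chunk: play or buffer it
            event = json.loads(msg)
            if event["type"] == "response_end":
                break

# asyncio.run(talk(open("utterance.pcm", "rb").read()))
```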

## Environment Variables

```bash
# ASR
ASR_PROVIDER=whisper
ASR_API_KEY=your-key

# LLM
LLM_PROVIDER=anthropic
LLM_API_KEY=your-key
LLM_MODEL=claude-sonnet-4-20250514

# TTS
TTS_PROVIDER=cartesia
TTS_API_KEY=your-key
TTS_VOICE_ID=your-voice-id

# Debug
DEBUG=true
```
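
Since `python-dotenv` is a core dependency, these can live in a `.env` file. How the engine maps them onto provider configuration internally is not shown here, but reading them yourself is straightforward:

```python
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory

tts_key = os.environ["TTS_API_KEY"]       # required: fails loudly if unset
voice_id = os.getenv("TTS_VOICE_ID", "")  # optional, with a default
```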

## License

MIT

@@ -0,0 +1,252 @@
Metadata-Version: 2.4
Name: atom-audio-engine
Version: 0.1.1
Summary: A pluggable, async-first Python framework for real-time audio-to-audio conversational AI
Author-email: ATOM Group <info@atomgroup.ng>
License-Expression: MIT
Project-URL: Homepage, https://github.com/ATOM-GROUP-NG/audio-engine
Project-URL: Repository, https://github.com/ATOM-GROUP-NG/audio-engine.git
Project-URL: Issues, https://github.com/ATOM-GROUP-NG/audio-engine/issues
Keywords: audio,speech-to-text,text-to-speech,llm,conversational-ai,real-time,streaming,websocket
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Multimedia :: Sound/Audio
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.9
Description-Content-Type: text/markdown
Requires-Dist: websockets>=12.0
Requires-Dist: aiohttp>=3.9.0
Requires-Dist: python-dotenv>=1.0.0
Requires-Dist: numpy>=1.24.0
Requires-Dist: scipy>=1.10.0
Provides-Extra: asr
Requires-Dist: openai>=1.0.0; extra == "asr"
Requires-Dist: deepgram-sdk>=3.0.0; extra == "asr"
Requires-Dist: assemblyai>=0.20.0; extra == "asr"
Requires-Dist: cartesia>=1.0.0; extra == "asr"
Provides-Extra: llm
Requires-Dist: anthropic>=0.18.0; extra == "llm"
Requires-Dist: groq>=0.4.0; extra == "llm"
Provides-Extra: tts
Requires-Dist: cartesia>=1.0.0; extra == "tts"
Requires-Dist: elevenlabs>=1.0.0; extra == "tts"
Provides-Extra: all
Requires-Dist: openai>=1.0.0; extra == "all"
Requires-Dist: deepgram-sdk>=3.0.0; extra == "all"
Requires-Dist: assemblyai>=0.20.0; extra == "all"
Requires-Dist: cartesia>=1.0.0; extra == "all"
Requires-Dist: anthropic>=0.18.0; extra == "all"
Requires-Dist: groq>=0.4.0; extra == "all"
Requires-Dist: elevenlabs>=1.0.0; extra == "all"
Provides-Extra: dev
Requires-Dist: pytest>=7.0.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
Requires-Dist: black>=23.0.0; extra == "dev"
Requires-Dist: ruff>=0.1.0; extra == "dev"

# Audio Engine

A pluggable audio-to-audio conversational engine with real-time streaming support.

## Features

- **Pluggable Architecture**: Swap ASR, LLM, and TTS providers easily
- **Real-time Streaming**: WebSocket server for low-latency conversations
- **GeneFace++ Integration**: Optional face animation from audio
- **Simple API**: Get started with just a few lines of code

## Installation

```bash
pip install atom-audio-engine
```

For development with all optional dependencies:

```bash
pip install "atom-audio-engine[all,dev]"
```

## Quick Start

### Basic Usage

```python
from audio_engine import Pipeline
from audio_engine.asr import CartesiaASR
from audio_engine.llm import GroqLLM
from audio_engine.tts import CartesiaTTS

# Create pipeline with your providers
pipeline = Pipeline(
    asr=CartesiaASR(api_key="your-cartesia-key"),
    llm=GroqLLM(api_key="your-groq-key", model="mixtral-8x7b-32768"),
    tts=CartesiaTTS(api_key="your-cartesia-key", voice_id="your-voice-id"),
    system_prompt="You are a helpful assistant.",
)

async with pipeline:
    # Simple: process complete audio
    response_audio = await pipeline.process(input_audio_bytes)

    # Streaming: lower latency
    async for chunk in pipeline.stream(audio_stream):
        play_audio(chunk)
```

### WebSocket Server

```python
import asyncio

from audio_engine import Pipeline
from audio_engine.streaming import WebSocketServer

async def main():
    pipeline = Pipeline(asr=..., llm=..., tts=...)
    server = WebSocketServer(pipeline, host="0.0.0.0", port=8765)
    await server.start()

asyncio.run(main())
```

### With GeneFace++ Face Animation

```python
from audio_engine.integrations.geneface import GeneFacePipelineWrapper, GeneFaceConfig

wrapped = GeneFacePipelineWrapper(
    pipeline=pipeline,
    geneface_config=GeneFaceConfig(
        geneface_path="/path/to/ai-geneface-realtime"
    )
)

audio, video_path = await wrapped.process_with_video(input_audio)
```

## Architecture

```
User Audio → ASR → LLM → TTS → Response Audio
                                      ↓
                            GeneFace++ (optional)
                                      ↓
                             Animated Face Video
```

## Directory Structure

```
audio_engine/
├── core/           # Pipeline and configuration
├── asr/            # Speech-to-Text providers
├── llm/            # LLM providers
├── tts/            # Text-to-Speech providers
├── streaming/      # WebSocket server
├── integrations/   # GeneFace++ integration
├── utils/          # Audio utilities
└── examples/       # Example scripts
```

## Implementing a Provider

### Custom ASR

```python
from audio_engine.asr.base import BaseASR

class MyASR(BaseASR):
    @property
    def name(self) -> str:
        return "my-asr"

    async def transcribe(self, audio: bytes, sample_rate: int = 16000) -> str:
        # Your implementation
        pass

    async def transcribe_stream(self, audio_stream):
        # Your streaming implementation
        pass
```

### Custom LLM

```python
from audio_engine.llm.base import BaseLLM

class MyLLM(BaseLLM):
    @property
    def name(self) -> str:
        return "my-llm"

    async def generate(self, prompt: str, context=None) -> str:
        # Your implementation
        pass

    async def generate_stream(self, prompt: str, context=None):
        # Your streaming implementation
        pass
```

### Custom TTS

```python
from audio_engine.tts.base import BaseTTS

class MyTTS(BaseTTS):
    @property
    def name(self) -> str:
        return "my-tts"

    async def synthesize(self, text: str) -> bytes:
        # Your implementation
        pass

    async def synthesize_stream(self, text: str):
        # Your streaming implementation
        pass
```

## WebSocket Protocol

### Client → Server

- **Binary**: Raw audio chunks (PCM 16-bit, 16kHz mono)
- **JSON**: `{"type": "end_of_speech"}` or `{"type": "reset"}`

### Server → Client

- **Binary**: Response audio chunks
- **JSON Events**:
  - `{"type": "connected", "client_id": "..."}`
  - `{"type": "transcript", "text": "..."}`
  - `{"type": "response_text", "text": "..."}`
  - `{"type": "response_start"}`
  - `{"type": "response_end"}`

## Environment Variables

```bash
# ASR
ASR_PROVIDER=whisper
ASR_API_KEY=your-key

# LLM
LLM_PROVIDER=anthropic
LLM_API_KEY=your-key
LLM_MODEL=claude-sonnet-4-20250514

# TTS
TTS_PROVIDER=cartesia
TTS_API_KEY=your-key
TTS_VOICE_ID=your-voice-id

# Debug
DEBUG=true
```

## License

MIT

@@ -0,0 +1,9 @@
README.md
pyproject.toml
setup.py
atom_audio_engine.egg-info/PKG-INFO
atom_audio_engine.egg-info/SOURCES.txt
atom_audio_engine.egg-info/dependency_links.txt
atom_audio_engine.egg-info/requires.txt
atom_audio_engine.egg-info/top_level.txt
audio_engine/__init__.py

@@ -0,0 +1 @@

@@ -0,0 +1,34 @@
websockets>=12.0
aiohttp>=3.9.0
python-dotenv>=1.0.0
numpy>=1.24.0
scipy>=1.10.0

[all]
openai>=1.0.0
deepgram-sdk>=3.0.0
assemblyai>=0.20.0
cartesia>=1.0.0
anthropic>=0.18.0
groq>=0.4.0
elevenlabs>=1.0.0

[asr]
openai>=1.0.0
deepgram-sdk>=3.0.0
assemblyai>=0.20.0
cartesia>=1.0.0

[dev]
pytest>=7.0.0
pytest-asyncio>=0.21.0
black>=23.0.0
ruff>=0.1.0

[llm]
anthropic>=0.18.0
groq>=0.4.0

[tts]
cartesia>=1.0.0
elevenlabs>=1.0.0
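
These extras mirror the `[project.optional-dependencies]` tables in `pyproject.toml` below; for example, `pip install "atom-audio-engine[asr]"` pulls in the four ASR SDKs listed under `[asr]`, and `[all]` is the union of the provider extras.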

@@ -0,0 +1 @@
audio_engine

@@ -0,0 +1,80 @@
"""
Audio Engine - Pluggable audio-to-audio conversational AI framework.

Orchestrates ASR → LLM → TTS pipeline with real-time streaming support.
"""

__version__ = "0.1.1"

# Core exports
from .core.pipeline import Pipeline
from .core.config import (
    AudioEngineConfig,
    ASRConfig,
    LLMConfig,
    TTSConfig,
    StreamingConfig,
)
from .core.types import (
    AudioChunk,
    TranscriptChunk,
    ResponseChunk,
    ConversationContext,
)

# ASR Providers
from .asr.base import BaseASR
from .asr.cartesia import CartesiaASR

try:
    from .asr.deepgram import DeepgramASR
except ImportError:
    pass

# LLM Providers
from .llm.base import BaseLLM
from .llm.groq import GroqLLM

# TTS Providers
from .tts.base import BaseTTS
from .tts.cartesia import CartesiaTTS

# Streaming
from .streaming.websocket_server import WebSocketServer

# Integrations
try:
    from .integrations.geneface import GeneFacePipelineWrapper, GeneFaceConfig
except ImportError:
    pass

__all__ = [
    # Version
    "__version__",
    # Core
    "Pipeline",
    "AudioEngineConfig",
    "ASRConfig",
    "LLMConfig",
    "TTSConfig",
    "StreamingConfig",
    "AudioChunk",
    "TranscriptChunk",
    "ResponseChunk",
    "ConversationContext",
    # ASR
    "BaseASR",
    "CartesiaASR",
    "DeepgramASR",
    # LLM
    "BaseLLM",
    "GroqLLM",
    # TTS
    "BaseTTS",
    "CartesiaTTS",
    # Streaming
    "WebSocketServer",
    # Integrations
    "GeneFacePipelineWrapper",
    "GeneFaceConfig",
]
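
Because the optional providers are imported under `try/except ImportError`, names such as `DeepgramASR`, `GeneFacePipelineWrapper`, and `GeneFaceConfig` exist on the package only when the matching extra is installed, even though they appear in `__all__`. A defensive import sketch for downstream code:

```python
try:
    from audio_engine import DeepgramASR
except ImportError:  # deepgram-sdk (the "asr" extra) is not installed
    DeepgramASR = None

if DeepgramASR is None:
    raise SystemExit('install with: pip install "atom-audio-engine[asr]"')
```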

@@ -0,0 +1,123 @@
[build-system]
requires = ["setuptools>=65.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "atom-audio-engine"
version = "0.1.1"
description = "A pluggable, async-first Python framework for real-time audio-to-audio conversational AI"
readme = "README.md"
requires-python = ">=3.9"
license = "MIT"
authors = [
    {name = "ATOM Group", email = "info@atomgroup.ng"}
]
keywords = [
    "audio",
    "speech-to-text",
    "text-to-speech",
    "llm",
    "conversational-ai",
    "real-time",
    "streaming",
    "websocket"
]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Multimedia :: Sound/Audio",
    "Topic :: Software Development :: Libraries :: Python Modules",
]

dependencies = [
    "websockets>=12.0",
    "aiohttp>=3.9.0",
    "python-dotenv>=1.0.0",
    "numpy>=1.24.0",
    "scipy>=1.10.0",
]

[project.optional-dependencies]
asr = [
    "openai>=1.0.0",        # For Whisper
    "deepgram-sdk>=3.0.0",  # For Deepgram
    "assemblyai>=0.20.0",   # For AssemblyAI
    "cartesia>=1.0.0",      # For Cartesia
]
llm = [
    "anthropic>=0.18.0",  # For Claude
    "groq>=0.4.0",        # For Groq
]
tts = [
    "cartesia>=1.0.0",    # For Cartesia
    "elevenlabs>=1.0.0",  # For ElevenLabs
]
all = [
    "openai>=1.0.0",
    "deepgram-sdk>=3.0.0",
    "assemblyai>=0.20.0",
    "cartesia>=1.0.0",
    "anthropic>=0.18.0",
    "groq>=0.4.0",
    "elevenlabs>=1.0.0",
]
dev = [
    "pytest>=7.0.0",
    "pytest-asyncio>=0.21.0",
    "black>=23.0.0",
    "ruff>=0.1.0",
]

[project.urls]
Homepage = "https://github.com/ATOM-GROUP-NG/audio-engine"
Repository = "https://github.com/ATOM-GROUP-NG/audio-engine.git"
Issues = "https://github.com/ATOM-GROUP-NG/audio-engine/issues"

[tool.setuptools.packages.find]
# pick up audio_engine and its subpackages (core, asr, llm, tts, streaming, ...)
include = ["audio_engine*"]

[tool.setuptools.package-data]
audio_engine = ["py.typed"]

[tool.black]
line-length = 100
target-version = ["py39"]
include = '\.pyi?$'
extend-exclude = '''
/(
  # directories
  \.eggs
  | \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | build
  | dist
)/
'''

[tool.ruff]
line-length = 100
target-version = "py39"
select = [
    "E",  # pycodestyle errors
    "W",  # pycodestyle warnings
    "F",  # pyflakes
    "I",  # isort
    "C",  # flake8-comprehensions
    "B",  # flake8-bugbear
]
ignore = [
    "E501",  # line too long, handled by black
    "B008",  # do not perform function calls in argument defaults
]

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"