atom-audio-engine 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,252 @@
1
+ Metadata-Version: 2.4
2
+ Name: atom-audio-engine
3
+ Version: 0.1.1
4
+ Summary: A pluggable, async-first Python framework for real-time audio-to-audio conversational AI
5
+ Author-email: ATOM Group <info@atomgroup.ng>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/ATOM-GROUP-NG/audio-engine
8
+ Project-URL: Repository, https://github.com/ATOM-GROUP-NG/audio-engine.git
9
+ Project-URL: Issues, https://github.com/ATOM-GROUP-NG/audio-engine/issues
10
+ Keywords: audio,speech-to-text,text-to-speech,llm,conversational-ai,real-time,streaming,websocket
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Multimedia :: Sound/Audio
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: websockets>=12.0
23
+ Requires-Dist: aiohttp>=3.9.0
24
+ Requires-Dist: python-dotenv>=1.0.0
25
+ Requires-Dist: numpy>=1.24.0
26
+ Requires-Dist: scipy>=1.10.0
27
+ Provides-Extra: asr
28
+ Requires-Dist: openai>=1.0.0; extra == "asr"
29
+ Requires-Dist: deepgram-sdk>=3.0.0; extra == "asr"
30
+ Requires-Dist: assemblyai>=0.20.0; extra == "asr"
31
+ Requires-Dist: cartesia>=1.0.0; extra == "asr"
32
+ Provides-Extra: llm
33
+ Requires-Dist: anthropic>=0.18.0; extra == "llm"
34
+ Requires-Dist: groq>=0.4.0; extra == "llm"
35
+ Provides-Extra: tts
36
+ Requires-Dist: cartesia>=1.0.0; extra == "tts"
37
+ Requires-Dist: elevenlabs>=1.0.0; extra == "tts"
38
+ Provides-Extra: all
39
+ Requires-Dist: openai>=1.0.0; extra == "all"
40
+ Requires-Dist: deepgram-sdk>=3.0.0; extra == "all"
41
+ Requires-Dist: assemblyai>=0.20.0; extra == "all"
42
+ Requires-Dist: cartesia>=1.0.0; extra == "all"
43
+ Requires-Dist: anthropic>=0.18.0; extra == "all"
44
+ Requires-Dist: groq>=0.4.0; extra == "all"
45
+ Requires-Dist: elevenlabs>=1.0.0; extra == "all"
46
+ Provides-Extra: dev
47
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
48
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
49
+ Requires-Dist: black>=23.0.0; extra == "dev"
50
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
51
+
52
+ # Audio Engine
53
+
54
+ A pluggable audio-to-audio conversational engine with real-time streaming support.
55
+
56
+ ## Features
57
+
58
+ - **Pluggable Architecture**: Swap ASR, LLM, and TTS providers easily
59
+ - **Real-time Streaming**: WebSocket server for low-latency conversations
60
+ - **GeneFace++ Integration**: Optional face animation from audio
61
+ - **Simple API**: Get started with just a few lines of code
62
+
63
+ ## Installation
64
+
65
+ ```bash
66
+ pip install atom-audio-engine
67
+ ```
68
+
69
+ For development with all optional dependencies:
70
+
71
+ ```bash
72
+ pip install atom-audio-engine[all,dev]
73
+ ```
74
+
75
+ ## Quick Start
76
+
77
+ ### Basic Usage
78
+
79
+ ```python
80
+ from audio_engine import Pipeline
81
+ from audio_engine.asr import CartesiaASR
82
+ from audio_engine.llm import GroqLLM
83
+ from audio_engine.tts import CartesiaTTS
84
+
85
+ # Create pipeline with your providers
86
+ pipeline = Pipeline(
87
+ asr=CartesiaASR(api_key="your-cartesia-key"),
88
+ llm=GroqLLM(api_key="your-groq-key", model="mixtral-8x7b-32768"),
89
+ tts=CartesiaTTS(api_key="your-cartesia-key", voice_id="your-voice-id"),
90
+ system_prompt="You are a helpful assistant.",
91
+ )
92
+
93
+ async with pipeline:
94
+ # Simple: process complete audio
95
+ response_audio = await pipeline.process(input_audio_bytes)
96
+
97
+ # Streaming: lower latency
98
+ async for chunk in pipeline.stream(audio_stream):
99
+ play_audio(chunk)
100
+ ```
101
+
102
+ ### WebSocket Server
103
+
104
+ ```python
105
+ from audio_engine import Pipeline
106
+ from audio_engine.streaming import WebSocketServer
107
+
108
+ pipeline = Pipeline(asr=..., llm=..., tts=...)
109
+ server = WebSocketServer(pipeline, host="0.0.0.0", port=8765)
110
+
111
+ await server.start()
112
+ ```
113
+
114
+ ### With GeneFace++ Face Animation
115
+
116
+ ```python
117
+ from audio_engine.integrations.geneface import GeneFacePipelineWrapper, GeneFaceConfig
118
+
119
+ wrapped = GeneFacePipelineWrapper(
120
+ pipeline=pipeline,
121
+ geneface_config=GeneFaceConfig(
122
+ geneface_path="/path/to/ai-geneface-realtime"
123
+ )
124
+ )
125
+
126
+ audio, video_path = await wrapped.process_with_video(input_audio)
127
+ ```
128
+
129
+ ## Architecture
130
+
131
+ ```
132
+ User Audio → ASR → LLM → TTS → Response Audio
133
+
134
+ GeneFace++ (optional)
135
+
136
+ Animated Face Video
137
+ ```
138
+
139
+ ## Directory Structure
140
+
141
+ ```
142
+ audio_engine/
143
+ ├── core/ # Pipeline and configuration
144
+ ├── asr/ # Speech-to-Text providers
145
+ ├── llm/ # LLM providers
146
+ ├── tts/ # Text-to-Speech providers
147
+ ├── streaming/ # WebSocket server
148
+ ├── integrations/ # GeneFace++ integration
149
+ ├── utils/ # Audio utilities
150
+ └── examples/ # Example scripts
151
+ ```
152
+
153
+ ## Implementing a Provider
154
+
155
+ ### Custom ASR
156
+
157
+ ```python
158
+ from audio_engine.asr.base import BaseASR
159
+
160
+ class MyASR(BaseASR):
161
+ @property
162
+ def name(self) -> str:
163
+ return "my-asr"
164
+
165
+ async def transcribe(self, audio: bytes, sample_rate: int = 16000) -> str:
166
+ # Your implementation
167
+ pass
168
+
169
+ async def transcribe_stream(self, audio_stream):
170
+ # Your streaming implementation
171
+ pass
172
+ ```
173
+
174
+ ### Custom LLM
175
+
176
+ ```python
177
+ from audio_engine.llm.base import BaseLLM
178
+
179
+ class MyLLM(BaseLLM):
180
+ @property
181
+ def name(self) -> str:
182
+ return "my-llm"
183
+
184
+ async def generate(self, prompt: str, context=None) -> str:
185
+ # Your implementation
186
+ pass
187
+
188
+ async def generate_stream(self, prompt: str, context=None):
189
+ # Your streaming implementation
190
+ pass
191
+ ```
192
+
193
+ ### Custom TTS
194
+
195
+ ```python
196
+ from audio_engine.tts.base import BaseTTS
197
+
198
+ class MyTTS(BaseTTS):
199
+ @property
200
+ def name(self) -> str:
201
+ return "my-tts"
202
+
203
+ async def synthesize(self, text: str) -> bytes:
204
+ # Your implementation
205
+ pass
206
+
207
+ async def synthesize_stream(self, text: str):
208
+ # Your streaming implementation
209
+ pass
210
+ ```
211
+
212
+ ## WebSocket Protocol
213
+
214
+ ### Client → Server
215
+
216
+ - **Binary**: Raw audio chunks (PCM 16-bit, 16kHz mono)
217
+ - **JSON**: `{"type": "end_of_speech"}` or `{"type": "reset"}`
218
+
219
+ ### Server → Client
220
+
221
+ - **Binary**: Response audio chunks
222
+ - **JSON Events**:
223
+ - `{"type": "connected", "client_id": "..."}`
224
+ - `{"type": "transcript", "text": "..."}`
225
+ - `{"type": "response_text", "text": "..."}`
226
+ - `{"type": "response_start"}`
227
+ - `{"type": "response_end"}`
228
+
229
+ ## Environment Variables
230
+
231
+ ```bash
232
+ # ASR
233
+ ASR_PROVIDER=whisper
234
+ ASR_API_KEY=your-key
235
+
236
+ # LLM
237
+ LLM_PROVIDER=anthropic
238
+ LLM_API_KEY=your-key
239
+ LLM_MODEL=claude-sonnet-4-20250514
240
+
241
+ # TTS
242
+ TTS_PROVIDER=cartesia
243
+ TTS_API_KEY=your-key
244
+ TTS_VOICE_ID=your-voice-id
245
+
246
+ # Debug
247
+ DEBUG=true
248
+ ```
249
+
250
+ ## License
251
+
252
+ MIT
@@ -0,0 +1,201 @@
1
+ # Audio Engine
2
+
3
+ A pluggable audio-to-audio conversational engine with real-time streaming support.
4
+
5
+ ## Features
6
+
7
+ - **Pluggable Architecture**: Swap ASR, LLM, and TTS providers easily
8
+ - **Real-time Streaming**: WebSocket server for low-latency conversations
9
+ - **GeneFace++ Integration**: Optional face animation from audio
10
+ - **Simple API**: Get started with just a few lines of code
11
+
12
+ ## Installation
13
+
14
+ ```bash
15
+ pip install atom-audio-engine
16
+ ```
17
+
18
+ For development with all optional dependencies:
19
+
20
+ ```bash
21
+ pip install atom-audio-engine[all,dev]
22
+ ```
23
+
24
+ ## Quick Start
25
+
26
+ ### Basic Usage
27
+
28
+ ```python
29
+ from audio_engine import Pipeline
30
+ from audio_engine.asr import CartesiaASR
31
+ from audio_engine.llm import GroqLLM
32
+ from audio_engine.tts import CartesiaTTS
33
+
34
+ # Create pipeline with your providers
35
+ pipeline = Pipeline(
36
+ asr=CartesiaASR(api_key="your-cartesia-key"),
37
+ llm=GroqLLM(api_key="your-groq-key", model="mixtral-8x7b-32768"),
38
+ tts=CartesiaTTS(api_key="your-cartesia-key", voice_id="your-voice-id"),
39
+ system_prompt="You are a helpful assistant.",
40
+ )
41
+
42
+ async with pipeline:
43
+ # Simple: process complete audio
44
+ response_audio = await pipeline.process(input_audio_bytes)
45
+
46
+ # Streaming: lower latency
47
+ async for chunk in pipeline.stream(audio_stream):
48
+ play_audio(chunk)
49
+ ```
50
+
51
+ ### WebSocket Server
52
+
53
+ ```python
54
+ from audio_engine import Pipeline
55
+ from audio_engine.streaming import WebSocketServer
56
+
57
+ pipeline = Pipeline(asr=..., llm=..., tts=...)
58
+ server = WebSocketServer(pipeline, host="0.0.0.0", port=8765)
59
+
60
+ await server.start()
61
+ ```
62
+
63
+ ### With GeneFace++ Face Animation
64
+
65
+ ```python
66
+ from audio_engine.integrations.geneface import GeneFacePipelineWrapper, GeneFaceConfig
67
+
68
+ wrapped = GeneFacePipelineWrapper(
69
+ pipeline=pipeline,
70
+ geneface_config=GeneFaceConfig(
71
+ geneface_path="/path/to/ai-geneface-realtime"
72
+ )
73
+ )
74
+
75
+ audio, video_path = await wrapped.process_with_video(input_audio)
76
+ ```
77
+
78
+ ## Architecture
79
+
80
+ ```
81
+ User Audio → ASR → LLM → TTS → Response Audio
82
+
83
+ GeneFace++ (optional)
84
+
85
+ Animated Face Video
86
+ ```
87
+
88
+ ## Directory Structure
89
+
90
+ ```
91
+ audio_engine/
92
+ ├── core/ # Pipeline and configuration
93
+ ├── asr/ # Speech-to-Text providers
94
+ ├── llm/ # LLM providers
95
+ ├── tts/ # Text-to-Speech providers
96
+ ├── streaming/ # WebSocket server
97
+ ├── integrations/ # GeneFace++ integration
98
+ ├── utils/ # Audio utilities
99
+ └── examples/ # Example scripts
100
+ ```
101
+
102
+ ## Implementing a Provider
103
+
104
+ ### Custom ASR
105
+
106
+ ```python
107
+ from audio_engine.asr.base import BaseASR
108
+
109
+ class MyASR(BaseASR):
110
+ @property
111
+ def name(self) -> str:
112
+ return "my-asr"
113
+
114
+ async def transcribe(self, audio: bytes, sample_rate: int = 16000) -> str:
115
+ # Your implementation
116
+ pass
117
+
118
+ async def transcribe_stream(self, audio_stream):
119
+ # Your streaming implementation
120
+ pass
121
+ ```
122
+
123
+ ### Custom LLM
124
+
125
+ ```python
126
+ from audio_engine.llm.base import BaseLLM
127
+
128
+ class MyLLM(BaseLLM):
129
+ @property
130
+ def name(self) -> str:
131
+ return "my-llm"
132
+
133
+ async def generate(self, prompt: str, context=None) -> str:
134
+ # Your implementation
135
+ pass
136
+
137
+ async def generate_stream(self, prompt: str, context=None):
138
+ # Your streaming implementation
139
+ pass
140
+ ```
141
+
142
+ ### Custom TTS
143
+
144
+ ```python
145
+ from audio_engine.tts.base import BaseTTS
146
+
147
+ class MyTTS(BaseTTS):
148
+ @property
149
+ def name(self) -> str:
150
+ return "my-tts"
151
+
152
+ async def synthesize(self, text: str) -> bytes:
153
+ # Your implementation
154
+ pass
155
+
156
+ async def synthesize_stream(self, text: str):
157
+ # Your streaming implementation
158
+ pass
159
+ ```
160
+
161
+ ## WebSocket Protocol
162
+
163
+ ### Client → Server
164
+
165
+ - **Binary**: Raw audio chunks (PCM 16-bit, 16kHz mono)
166
+ - **JSON**: `{"type": "end_of_speech"}` or `{"type": "reset"}`
167
+
168
+ ### Server → Client
169
+
170
+ - **Binary**: Response audio chunks
171
+ - **JSON Events**:
172
+ - `{"type": "connected", "client_id": "..."}`
173
+ - `{"type": "transcript", "text": "..."}`
174
+ - `{"type": "response_text", "text": "..."}`
175
+ - `{"type": "response_start"}`
176
+ - `{"type": "response_end"}`
177
+
178
+ ## Environment Variables
179
+
180
+ ```bash
181
+ # ASR
182
+ ASR_PROVIDER=whisper
183
+ ASR_API_KEY=your-key
184
+
185
+ # LLM
186
+ LLM_PROVIDER=anthropic
187
+ LLM_API_KEY=your-key
188
+ LLM_MODEL=claude-sonnet-4-20250514
189
+
190
+ # TTS
191
+ TTS_PROVIDER=cartesia
192
+ TTS_API_KEY=your-key
193
+ TTS_VOICE_ID=your-voice-id
194
+
195
+ # Debug
196
+ DEBUG=true
197
+ ```
198
+
199
+ ## License
200
+
201
+ MIT
@@ -0,0 +1,252 @@
1
+ Metadata-Version: 2.4
2
+ Name: atom-audio-engine
3
+ Version: 0.1.1
4
+ Summary: A pluggable, async-first Python framework for real-time audio-to-audio conversational AI
5
+ Author-email: ATOM Group <info@atomgroup.ng>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/ATOM-GROUP-NG/audio-engine
8
+ Project-URL: Repository, https://github.com/ATOM-GROUP-NG/audio-engine.git
9
+ Project-URL: Issues, https://github.com/ATOM-GROUP-NG/audio-engine/issues
10
+ Keywords: audio,speech-to-text,text-to-speech,llm,conversational-ai,real-time,streaming,websocket
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Multimedia :: Sound/Audio
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: websockets>=12.0
23
+ Requires-Dist: aiohttp>=3.9.0
24
+ Requires-Dist: python-dotenv>=1.0.0
25
+ Requires-Dist: numpy>=1.24.0
26
+ Requires-Dist: scipy>=1.10.0
27
+ Provides-Extra: asr
28
+ Requires-Dist: openai>=1.0.0; extra == "asr"
29
+ Requires-Dist: deepgram-sdk>=3.0.0; extra == "asr"
30
+ Requires-Dist: assemblyai>=0.20.0; extra == "asr"
31
+ Requires-Dist: cartesia>=1.0.0; extra == "asr"
32
+ Provides-Extra: llm
33
+ Requires-Dist: anthropic>=0.18.0; extra == "llm"
34
+ Requires-Dist: groq>=0.4.0; extra == "llm"
35
+ Provides-Extra: tts
36
+ Requires-Dist: cartesia>=1.0.0; extra == "tts"
37
+ Requires-Dist: elevenlabs>=1.0.0; extra == "tts"
38
+ Provides-Extra: all
39
+ Requires-Dist: openai>=1.0.0; extra == "all"
40
+ Requires-Dist: deepgram-sdk>=3.0.0; extra == "all"
41
+ Requires-Dist: assemblyai>=0.20.0; extra == "all"
42
+ Requires-Dist: cartesia>=1.0.0; extra == "all"
43
+ Requires-Dist: anthropic>=0.18.0; extra == "all"
44
+ Requires-Dist: groq>=0.4.0; extra == "all"
45
+ Requires-Dist: elevenlabs>=1.0.0; extra == "all"
46
+ Provides-Extra: dev
47
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
48
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
49
+ Requires-Dist: black>=23.0.0; extra == "dev"
50
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
51
+
52
+ # Audio Engine
53
+
54
+ A pluggable audio-to-audio conversational engine with real-time streaming support.
55
+
56
+ ## Features
57
+
58
+ - **Pluggable Architecture**: Swap ASR, LLM, and TTS providers easily
59
+ - **Real-time Streaming**: WebSocket server for low-latency conversations
60
+ - **GeneFace++ Integration**: Optional face animation from audio
61
+ - **Simple API**: Get started with just a few lines of code
62
+
63
+ ## Installation
64
+
65
+ ```bash
66
+ pip install atom-audio-engine
67
+ ```
68
+
69
+ For development with all optional dependencies:
70
+
71
+ ```bash
72
+ pip install atom-audio-engine[all,dev]
73
+ ```
74
+
75
+ ## Quick Start
76
+
77
+ ### Basic Usage
78
+
79
+ ```python
80
+ from audio_engine import Pipeline
81
+ from audio_engine.asr import CartesiaASR
82
+ from audio_engine.llm import GroqLLM
83
+ from audio_engine.tts import CartesiaTTS
84
+
85
+ # Create pipeline with your providers
86
+ pipeline = Pipeline(
87
+ asr=CartesiaASR(api_key="your-cartesia-key"),
88
+ llm=GroqLLM(api_key="your-groq-key", model="mixtral-8x7b-32768"),
89
+ tts=CartesiaTTS(api_key="your-cartesia-key", voice_id="your-voice-id"),
90
+ system_prompt="You are a helpful assistant.",
91
+ )
92
+
93
+ async with pipeline:
94
+ # Simple: process complete audio
95
+ response_audio = await pipeline.process(input_audio_bytes)
96
+
97
+ # Streaming: lower latency
98
+ async for chunk in pipeline.stream(audio_stream):
99
+ play_audio(chunk)
100
+ ```
101
+
102
+ ### WebSocket Server
103
+
104
+ ```python
105
+ from audio_engine import Pipeline
106
+ from audio_engine.streaming import WebSocketServer
107
+
108
+ pipeline = Pipeline(asr=..., llm=..., tts=...)
109
+ server = WebSocketServer(pipeline, host="0.0.0.0", port=8765)
110
+
111
+ await server.start()
112
+ ```
113
+
114
+ ### With GeneFace++ Face Animation
115
+
116
+ ```python
117
+ from audio_engine.integrations.geneface import GeneFacePipelineWrapper, GeneFaceConfig
118
+
119
+ wrapped = GeneFacePipelineWrapper(
120
+ pipeline=pipeline,
121
+ geneface_config=GeneFaceConfig(
122
+ geneface_path="/path/to/ai-geneface-realtime"
123
+ )
124
+ )
125
+
126
+ audio, video_path = await wrapped.process_with_video(input_audio)
127
+ ```
128
+
129
+ ## Architecture
130
+
131
+ ```
132
+ User Audio → ASR → LLM → TTS → Response Audio
133
+
134
+ GeneFace++ (optional)
135
+
136
+ Animated Face Video
137
+ ```
138
+
139
+ ## Directory Structure
140
+
141
+ ```
142
+ audio_engine/
143
+ ├── core/ # Pipeline and configuration
144
+ ├── asr/ # Speech-to-Text providers
145
+ ├── llm/ # LLM providers
146
+ ├── tts/ # Text-to-Speech providers
147
+ ├── streaming/ # WebSocket server
148
+ ├── integrations/ # GeneFace++ integration
149
+ ├── utils/ # Audio utilities
150
+ └── examples/ # Example scripts
151
+ ```
152
+
153
+ ## Implementing a Provider
154
+
155
+ ### Custom ASR
156
+
157
+ ```python
158
+ from audio_engine.asr.base import BaseASR
159
+
160
+ class MyASR(BaseASR):
161
+ @property
162
+ def name(self) -> str:
163
+ return "my-asr"
164
+
165
+ async def transcribe(self, audio: bytes, sample_rate: int = 16000) -> str:
166
+ # Your implementation
167
+ pass
168
+
169
+ async def transcribe_stream(self, audio_stream):
170
+ # Your streaming implementation
171
+ pass
172
+ ```
173
+
174
+ ### Custom LLM
175
+
176
+ ```python
177
+ from audio_engine.llm.base import BaseLLM
178
+
179
+ class MyLLM(BaseLLM):
180
+ @property
181
+ def name(self) -> str:
182
+ return "my-llm"
183
+
184
+ async def generate(self, prompt: str, context=None) -> str:
185
+ # Your implementation
186
+ pass
187
+
188
+ async def generate_stream(self, prompt: str, context=None):
189
+ # Your streaming implementation
190
+ pass
191
+ ```
192
+
193
+ ### Custom TTS
194
+
195
+ ```python
196
+ from audio_engine.tts.base import BaseTTS
197
+
198
+ class MyTTS(BaseTTS):
199
+ @property
200
+ def name(self) -> str:
201
+ return "my-tts"
202
+
203
+ async def synthesize(self, text: str) -> bytes:
204
+ # Your implementation
205
+ pass
206
+
207
+ async def synthesize_stream(self, text: str):
208
+ # Your streaming implementation
209
+ pass
210
+ ```
211
+
212
+ ## WebSocket Protocol
213
+
214
+ ### Client → Server
215
+
216
+ - **Binary**: Raw audio chunks (PCM 16-bit, 16kHz mono)
217
+ - **JSON**: `{"type": "end_of_speech"}` or `{"type": "reset"}`
218
+
219
+ ### Server → Client
220
+
221
+ - **Binary**: Response audio chunks
222
+ - **JSON Events**:
223
+ - `{"type": "connected", "client_id": "..."}`
224
+ - `{"type": "transcript", "text": "..."}`
225
+ - `{"type": "response_text", "text": "..."}`
226
+ - `{"type": "response_start"}`
227
+ - `{"type": "response_end"}`
228
+
229
+ ## Environment Variables
230
+
231
+ ```bash
232
+ # ASR
233
+ ASR_PROVIDER=whisper
234
+ ASR_API_KEY=your-key
235
+
236
+ # LLM
237
+ LLM_PROVIDER=anthropic
238
+ LLM_API_KEY=your-key
239
+ LLM_MODEL=claude-sonnet-4-20250514
240
+
241
+ # TTS
242
+ TTS_PROVIDER=cartesia
243
+ TTS_API_KEY=your-key
244
+ TTS_VOICE_ID=your-voice-id
245
+
246
+ # Debug
247
+ DEBUG=true
248
+ ```
249
+
250
+ ## License
251
+
252
+ MIT
@@ -0,0 +1,9 @@
1
+ README.md
2
+ pyproject.toml
3
+ setup.py
4
+ atom_audio_engine.egg-info/PKG-INFO
5
+ atom_audio_engine.egg-info/SOURCES.txt
6
+ atom_audio_engine.egg-info/dependency_links.txt
7
+ atom_audio_engine.egg-info/requires.txt
8
+ atom_audio_engine.egg-info/top_level.txt
9
+ audio_engine/__init__.py
@@ -0,0 +1,34 @@
1
+ websockets>=12.0
2
+ aiohttp>=3.9.0
3
+ python-dotenv>=1.0.0
4
+ numpy>=1.24.0
5
+ scipy>=1.10.0
6
+
7
+ [all]
8
+ openai>=1.0.0
9
+ deepgram-sdk>=3.0.0
10
+ assemblyai>=0.20.0
11
+ cartesia>=1.0.0
12
+ anthropic>=0.18.0
13
+ groq>=0.4.0
14
+ elevenlabs>=1.0.0
15
+
16
+ [asr]
17
+ openai>=1.0.0
18
+ deepgram-sdk>=3.0.0
19
+ assemblyai>=0.20.0
20
+ cartesia>=1.0.0
21
+
22
+ [dev]
23
+ pytest>=7.0.0
24
+ pytest-asyncio>=0.21.0
25
+ black>=23.0.0
26
+ ruff>=0.1.0
27
+
28
+ [llm]
29
+ anthropic>=0.18.0
30
+ groq>=0.4.0
31
+
32
+ [tts]
33
+ cartesia>=1.0.0
34
+ elevenlabs>=1.0.0
@@ -0,0 +1,80 @@
1
+ """
2
+ Audio Engine - Pluggable audio-to-audio conversational AI framework.
3
+
4
+ Orchestrates ASR → LLM → TTS pipeline with real-time streaming support.
5
+ """
6
+
7
+ __version__ = "0.1.1"
8
+
9
+ # Core exports
10
+ from .core.pipeline import Pipeline
11
+ from .core.config import (
12
+ AudioEngineConfig,
13
+ ASRConfig,
14
+ LLMConfig,
15
+ TTSConfig,
16
+ StreamingConfig,
17
+ )
18
+ from .core.types import (
19
+ AudioChunk,
20
+ TranscriptChunk,
21
+ ResponseChunk,
22
+ ConversationContext,
23
+ )
24
+
25
+ # ASR Providers
26
+ from .asr.base import BaseASR
27
+ from .asr.cartesia import CartesiaASR
28
+
29
+ try:
30
+ from .asr.deepgram import DeepgramASR
31
+ except ImportError:
32
+ pass
33
+
34
+ # LLM Providers
35
+ from .llm.base import BaseLLM
36
+ from .llm.groq import GroqLLM
37
+
38
+ # TTS Providers
39
+ from .tts.base import BaseTTS
40
+ from .tts.cartesia import CartesiaTTS
41
+
42
+ # Streaming
43
+ from .streaming.websocket_server import WebSocketServer
44
+
45
+ # Integrations
46
+ try:
47
+ from .integrations.geneface import GeneFacePipelineWrapper, GeneFaceConfig
48
+ except ImportError:
49
+ pass
50
+
51
+ __all__ = [
52
+ # Version
53
+ "__version__",
54
+ # Core
55
+ "Pipeline",
56
+ "AudioEngineConfig",
57
+ "ASRConfig",
58
+ "LLMConfig",
59
+ "TTSConfig",
60
+ "StreamingConfig",
61
+ "AudioChunk",
62
+ "TranscriptChunk",
63
+ "ResponseChunk",
64
+ "ConversationContext",
65
+ # ASR
66
+ "BaseASR",
67
+ "CartesiaASR",
68
+ "DeepgramASR",
69
+ # LLM
70
+ "BaseLLM",
71
+ "GroqLLM",
72
+ # TTS
73
+ "BaseTTS",
74
+ "CartesiaTTS",
75
+ # Streaming
76
+ "WebSocketServer",
77
+ # Integrations
78
+ "GeneFacePipelineWrapper",
79
+ "GeneFaceConfig",
80
+ ]
@@ -0,0 +1,123 @@
1
+ [build-system]
2
+ requires = ["setuptools>=65.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "atom-audio-engine"
7
+ version = "0.1.1"
8
+ description = "A pluggable, async-first Python framework for real-time audio-to-audio conversational AI"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = "MIT"
12
+ authors = [
13
+ {name = "ATOM Group", email = "info@atomgroup.ng"}
14
+ ]
15
+ keywords = [
16
+ "audio",
17
+ "speech-to-text",
18
+ "text-to-speech",
19
+ "llm",
20
+ "conversational-ai",
21
+ "real-time",
22
+ "streaming",
23
+ "websocket"
24
+ ]
25
+ classifiers = [
26
+ "Development Status :: 4 - Beta",
27
+ "Intended Audience :: Developers",
28
+ "Programming Language :: Python :: 3",
29
+ "Programming Language :: Python :: 3.9",
30
+ "Programming Language :: Python :: 3.10",
31
+ "Programming Language :: Python :: 3.11",
32
+ "Programming Language :: Python :: 3.12",
33
+ "Topic :: Multimedia :: Sound/Audio",
34
+ "Topic :: Software Development :: Libraries :: Python Modules",
35
+ ]
36
+
37
+ dependencies = [
38
+ "websockets>=12.0",
39
+ "aiohttp>=3.9.0",
40
+ "python-dotenv>=1.0.0",
41
+ "numpy>=1.24.0",
42
+ "scipy>=1.10.0",
43
+ ]
44
+
45
+ [project.optional-dependencies]
46
+ asr = [
47
+ "openai>=1.0.0", # For Whisper
48
+ "deepgram-sdk>=3.0.0", # For Deepgram
49
+ "assemblyai>=0.20.0", # For AssemblyAI
50
+ "cartesia>=1.0.0", # For Cartesia
51
+ ]
52
+ llm = [
53
+ "anthropic>=0.18.0", # For Claude
54
+ "groq>=0.4.0", # For Groq
55
+ ]
56
+ tts = [
57
+ "cartesia>=1.0.0", # For Cartesia
58
+ "elevenlabs>=1.0.0", # For ElevenLabs
59
+ ]
60
+ all = [
61
+ "openai>=1.0.0",
62
+ "deepgram-sdk>=3.0.0",
63
+ "assemblyai>=0.20.0",
64
+ "cartesia>=1.0.0",
65
+ "anthropic>=0.18.0",
66
+ "groq>=0.4.0",
67
+ "elevenlabs>=1.0.0",
68
+ ]
69
+ dev = [
70
+ "pytest>=7.0.0",
71
+ "pytest-asyncio>=0.21.0",
72
+ "black>=23.0.0",
73
+ "ruff>=0.1.0",
74
+ ]
75
+
76
+ [project.urls]
77
+ Homepage = "https://github.com/ATOM-GROUP-NG/audio-engine"
78
+ Repository = "https://github.com/ATOM-GROUP-NG/audio-engine.git"
79
+ Issues = "https://github.com/ATOM-GROUP-NG/audio-engine/issues"
80
+
81
+ [tool.setuptools]
82
+ packages = ["audio_engine"]
83
+
84
+ [tool.setuptools.package-data]
85
+ audio_engine = ["py.typed"]
86
+
87
+ [tool.black]
88
+ line-length = 100
89
+ target-version = ["py39"]
90
+ include = '\.pyi?$'
91
+ extend-exclude = '''
92
+ /(
93
+ # directories
94
+ \.eggs
95
+ | \.git
96
+ | \.hg
97
+ | \.mypy_cache
98
+ | \.tox
99
+ | \.venv
100
+ | build
101
+ | dist
102
+ )/
103
+ '''
104
+
105
+ [tool.ruff]
106
+ line-length = 100
107
+ target-version = "py39"
108
+ select = [
109
+ "E", # pycodestyle errors
110
+ "W", # pycodestyle warnings
111
+ "F", # pyflakes
112
+ "I", # isort
113
+ "C", # flake8-comprehensions
114
+ "B", # flake8-bugbear
115
+ ]
116
+ ignore = [
117
+ "E501", # line too long, handled by black
118
+ "B008", # do not perform function calls in argument defaults
119
+ ]
120
+
121
+ [tool.pytest.ini_options]
122
+ testpaths = ["tests"]
123
+ asyncio_mode = "auto"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env python
2
+ """Setup script for audio-engine package."""
3
+
4
+ from setuptools import setup, find_packages
5
+
6
+ setup()