atom-audio-engine 0.1.4__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. {atom_audio_engine-0.1.4 → atom_audio_engine-0.1.5}/PKG-INFO +1 -1
  2. {atom_audio_engine-0.1.4 → atom_audio_engine-0.1.5}/atom_audio_engine.egg-info/PKG-INFO +1 -1
  3. atom_audio_engine-0.1.5/atom_audio_engine.egg-info/SOURCES.txt +36 -0
  4. {atom_audio_engine-0.1.4 → atom_audio_engine-0.1.5}/audio_engine/__init__.py +1 -1
  5. atom_audio_engine-0.1.5/audio_engine/asr/__init__.py +44 -0
  6. atom_audio_engine-0.1.5/audio_engine/asr/base.py +89 -0
  7. atom_audio_engine-0.1.5/audio_engine/asr/cartesia.py +350 -0
  8. atom_audio_engine-0.1.5/audio_engine/asr/deepgram.py +196 -0
  9. atom_audio_engine-0.1.5/audio_engine/core/__init__.py +13 -0
  10. atom_audio_engine-0.1.5/audio_engine/core/config.py +162 -0
  11. atom_audio_engine-0.1.5/audio_engine/core/pipeline.py +278 -0
  12. atom_audio_engine-0.1.5/audio_engine/core/types.py +87 -0
  13. atom_audio_engine-0.1.5/audio_engine/integrations/__init__.py +5 -0
  14. atom_audio_engine-0.1.5/audio_engine/integrations/geneface.py +297 -0
  15. atom_audio_engine-0.1.5/audio_engine/llm/__init__.py +36 -0
  16. atom_audio_engine-0.1.5/audio_engine/llm/base.py +106 -0
  17. atom_audio_engine-0.1.5/audio_engine/llm/groq.py +208 -0
  18. atom_audio_engine-0.1.5/audio_engine/pipelines/__init__.py +1 -0
  19. atom_audio_engine-0.1.5/audio_engine/pipelines/personaplex/__init__.py +41 -0
  20. atom_audio_engine-0.1.5/audio_engine/pipelines/personaplex/client.py +259 -0
  21. atom_audio_engine-0.1.5/audio_engine/pipelines/personaplex/config.py +69 -0
  22. atom_audio_engine-0.1.5/audio_engine/pipelines/personaplex/pipeline.py +301 -0
  23. atom_audio_engine-0.1.5/audio_engine/pipelines/personaplex/types.py +173 -0
  24. atom_audio_engine-0.1.5/audio_engine/pipelines/personaplex/utils.py +192 -0
  25. atom_audio_engine-0.1.5/audio_engine/streaming/__init__.py +5 -0
  26. atom_audio_engine-0.1.5/audio_engine/streaming/websocket_server.py +333 -0
  27. atom_audio_engine-0.1.5/audio_engine/tts/__init__.py +35 -0
  28. atom_audio_engine-0.1.5/audio_engine/tts/base.py +153 -0
  29. atom_audio_engine-0.1.5/audio_engine/tts/cartesia.py +370 -0
  30. atom_audio_engine-0.1.5/audio_engine/utils/__init__.py +15 -0
  31. atom_audio_engine-0.1.5/audio_engine/utils/audio.py +218 -0
  32. {atom_audio_engine-0.1.4 → atom_audio_engine-0.1.5}/pyproject.toml +13 -2
  33. atom_audio_engine-0.1.4/atom_audio_engine.egg-info/SOURCES.txt +0 -9
  34. {atom_audio_engine-0.1.4 → atom_audio_engine-0.1.5}/README.md +0 -0
  35. {atom_audio_engine-0.1.4 → atom_audio_engine-0.1.5}/atom_audio_engine.egg-info/dependency_links.txt +0 -0
  36. {atom_audio_engine-0.1.4 → atom_audio_engine-0.1.5}/atom_audio_engine.egg-info/requires.txt +0 -0
  37. {atom_audio_engine-0.1.4 → atom_audio_engine-0.1.5}/atom_audio_engine.egg-info/top_level.txt +0 -0
  38. {atom_audio_engine-0.1.4 → atom_audio_engine-0.1.5}/setup.cfg +0 -0
  39. {atom_audio_engine-0.1.4 → atom_audio_engine-0.1.5}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: atom-audio-engine
3
- Version: 0.1.4
3
+ Version: 0.1.5
4
4
  Summary: A pluggable, async-first Python framework for real-time audio-to-audio conversational AI
5
5
  Author-email: ATOM Group <info@atomgroup.ng>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: atom-audio-engine
3
- Version: 0.1.4
3
+ Version: 0.1.5
4
4
  Summary: A pluggable, async-first Python framework for real-time audio-to-audio conversational AI
5
5
  Author-email: ATOM Group <info@atomgroup.ng>
6
6
  License-Expression: MIT
@@ -0,0 +1,36 @@
1
+ README.md
2
+ pyproject.toml
3
+ setup.py
4
+ atom_audio_engine.egg-info/PKG-INFO
5
+ atom_audio_engine.egg-info/SOURCES.txt
6
+ atom_audio_engine.egg-info/dependency_links.txt
7
+ atom_audio_engine.egg-info/requires.txt
8
+ atom_audio_engine.egg-info/top_level.txt
9
+ audio_engine/__init__.py
10
+ audio_engine/asr/__init__.py
11
+ audio_engine/asr/base.py
12
+ audio_engine/asr/cartesia.py
13
+ audio_engine/asr/deepgram.py
14
+ audio_engine/core/__init__.py
15
+ audio_engine/core/config.py
16
+ audio_engine/core/pipeline.py
17
+ audio_engine/core/types.py
18
+ audio_engine/integrations/__init__.py
19
+ audio_engine/integrations/geneface.py
20
+ audio_engine/llm/__init__.py
21
+ audio_engine/llm/base.py
22
+ audio_engine/llm/groq.py
23
+ audio_engine/pipelines/__init__.py
24
+ audio_engine/pipelines/personaplex/__init__.py
25
+ audio_engine/pipelines/personaplex/client.py
26
+ audio_engine/pipelines/personaplex/config.py
27
+ audio_engine/pipelines/personaplex/pipeline.py
28
+ audio_engine/pipelines/personaplex/types.py
29
+ audio_engine/pipelines/personaplex/utils.py
30
+ audio_engine/streaming/__init__.py
31
+ audio_engine/streaming/websocket_server.py
32
+ audio_engine/tts/__init__.py
33
+ audio_engine/tts/base.py
34
+ audio_engine/tts/cartesia.py
35
+ audio_engine/utils/__init__.py
36
+ audio_engine/utils/audio.py
@@ -4,7 +4,7 @@ Audio Engine - Pluggable audio-to-audio conversational AI framework.
4
4
  Orchestrates ASR → LLM → TTS pipeline with real-time streaming support.
5
5
  """
6
6
 
7
- __version__ = "0.1.4"
7
+ __version__ = "0.1.5"
8
8
 
9
9
  # Core exports
10
10
  from .core.pipeline import Pipeline
@@ -0,0 +1,44 @@
"""ASR (Speech-to-Text) providers."""

from ..core.config import ASRConfig

from .base import BaseASR
from .deepgram import DeepgramASR
from .cartesia import CartesiaASR

__all__ = ["BaseASR", "DeepgramASR", "CartesiaASR", "get_asr_from_config"]

# Default model used when the config does not name one, keyed by provider.
_DEFAULT_MODELS = {"deepgram": "nova-2", "cartesia": "ink-whisper"}


def get_asr_from_config(config: ASRConfig) -> BaseASR:
    """
    Build and return an ASR provider instance described by *config*.

    Args:
        config: ASRConfig carrying the provider name, optional model,
            language, API key, and any provider-specific extras.

    Returns:
        A ready-to-use BaseASR implementation.

    Raises:
        ValueError: If ``config.provider`` names an unsupported provider.
    """
    provider_name = config.provider.lower()

    if provider_name == "deepgram":
        return DeepgramASR(
            api_key=config.api_key,
            model=config.model or _DEFAULT_MODELS["deepgram"],
            language=config.language,
            **config.extra,
        )

    if provider_name == "cartesia":
        return CartesiaASR(
            api_key=config.api_key,
            model=config.model or _DEFAULT_MODELS["cartesia"],
            language=config.language,
            **config.extra,
        )

    raise ValueError(
        f"Unknown ASR provider: {config.provider}. " f"Supported: deepgram, cartesia"
    )
@@ -0,0 +1,89 @@
"""Abstract base class for ASR (Speech-to-Text) providers."""

from abc import ABC, abstractmethod
from typing import AsyncIterator, Optional

from ..core.types import AudioChunk, TranscriptChunk


class BaseASR(ABC):
    """
    Contract every Speech-to-Text provider must satisfy.

    Concrete subclasses implement batch transcription (``transcribe``) and
    streaming transcription (``transcribe_stream``); connection management
    hooks (``connect``/``disconnect``) are optional no-ops by default, and
    the class doubles as an async context manager around them.
    """

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        """
        Store credentials and provider-specific settings.

        Args:
            api_key: API key for the provider (if required)
            **kwargs: Additional provider-specific configuration
        """
        self.api_key = api_key
        # Arbitrary provider options; subclasses read what they need.
        self.config = kwargs

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the name of this ASR provider."""
        pass

    @property
    def supports_streaming(self) -> bool:
        """Whether this provider supports real-time streaming."""
        return True

    @abstractmethod
    async def transcribe(self, audio: bytes, sample_rate: int = 16000) -> str:
        """
        Convert a complete audio buffer into text.

        Args:
            audio: Raw audio bytes (PCM format expected)
            sample_rate: Sample rate of the audio in Hz

        Returns:
            Transcribed text string
        """
        pass

    @abstractmethod
    async def transcribe_stream(
        self, audio_stream: AsyncIterator[AudioChunk]
    ) -> AsyncIterator[TranscriptChunk]:
        """
        Convert streaming audio into transcript chunks in real time.

        Args:
            audio_stream: Async iterator yielding AudioChunk objects

        Yields:
            TranscriptChunk objects with partial and final transcriptions
        """
        pass

    async def connect(self):
        """
        Establish connection to the ASR service.
        Override in subclasses if needed.
        """
        pass

    async def disconnect(self):
        """
        Close connection to the ASR service.
        Override in subclasses if needed.
        """
        pass

    async def __aenter__(self):
        """Async context manager entry: open the provider connection."""
        await self.connect()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: release the provider connection."""
        await self.disconnect()
@@ -0,0 +1,350 @@
"""Cartesia API implementation for ASR (Speech-to-Text) via WebSocket."""

import asyncio
import json
import logging
import os
from typing import AsyncIterator, Optional
from urllib.parse import quote

import websockets

from ..core.types import AudioChunk, TranscriptChunk
from .base import BaseASR

logger = logging.getLogger(__name__)

# Cartesia API version (required header)
CARTESIA_VERSION = "2025-04-16"


class CartesiaASR(BaseASR):
    """
    Cartesia API client for speech-to-text transcription via WebSocket.

    Supports both batch transcription and real-time streaming.
    Uses Cartesia's Whisper model (ink-whisper) for high-accuracy transcription.

    Approach:
        1. Batch mode: collect audio, send via WebSocket, wait for final result
        2. Streaming mode: send audio chunks as they arrive, yield results immediately
        3. Background receive task queues responses from server
        4. VAD (Voice Activity Detection) configurable via min_volume and max_silence_duration_secs

    Example:
        asr = CartesiaASR(api_key="sk_...")

        # Batch transcription
        text = await asr.transcribe(audio_bytes)

        # Streaming transcription
        async for chunk in asr.transcribe_stream(audio_stream):
            print(chunk.text, end="", flush=True)
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "ink-whisper",
        language: str = "en",
        encoding: str = "pcm_s16le",
        sample_rate: int = 16000,
        min_volume: float = 0.0,
        max_silence_duration_secs: float = 30.0,
        **kwargs,
    ):
        """
        Initialize Cartesia ASR provider.

        Args:
            api_key: Cartesia API key
            model: Model to use (default: ink-whisper)
            language: Language code in ISO-639-1 format (default: en)
            encoding: Audio encoding format (default: pcm_s16le)
            sample_rate: Sample rate in Hz (default: 16000)
            min_volume: VAD threshold 0.0-1.0, higher = more aggressive (default: 0.0)
            max_silence_duration_secs: Max silence before endpointing (default: 30.0)
            **kwargs: Additional config (stored in self.config)
        """
        super().__init__(api_key=api_key, **kwargs)
        self.model = model
        self.language = language
        self.encoding = encoding
        self.sample_rate = sample_rate
        self.min_volume = min_volume
        self.max_silence_duration_secs = max_silence_duration_secs

        # Connection state; populated by connect(), cleared by disconnect().
        self.websocket = None
        self._receive_task: Optional[asyncio.Task] = None
        self._response_queue: asyncio.Queue = asyncio.Queue()

    @property
    def name(self) -> str:
        """Return provider name."""
        return "cartesia"

    async def connect(self):
        """
        Initialize WebSocket connection to Cartesia STT endpoint.

        Approach:
            1. Construct WebSocket URL with parameters (model, language, encoding, sample_rate, VAD)
            2. Connect to wss://api.cartesia.ai/stt/websocket
            3. Launch background receive task to collect server responses
            4. Log initialization status

        Rationale: Lazy connection on first transcription; background task ensures
        responses are queued even if caller is temporarily blocked.

        Raises:
            ValueError: If no API key is provided or found in the environment.
            TimeoutError: If the WebSocket handshake does not complete in 30s.
        """
        if self.websocket:
            return

        try:
            if not self.api_key:
                # Fall back to environment variables when no key was passed in.
                self.api_key = os.getenv("CARTESIA_API_KEY") or os.getenv("ASR_API_KEY")

            if not self.api_key:
                raise ValueError("Cartesia API key not provided")

            # Construct WebSocket URL with properly encoded parameters
            # API key must be URL-encoded to handle special characters
            url = (
                f"wss://api.cartesia.ai/stt/websocket?"
                f"model={quote(str(self.model))}"
                f"&language={quote(str(self.language))}"
                f"&encoding={quote(str(self.encoding))}"
                f"&sample_rate={quote(str(self.sample_rate))}"
                f"&min_volume={quote(str(self.min_volume))}"
                f"&max_silence_duration_secs={quote(str(self.max_silence_duration_secs))}"
                f"&api_key={quote(str(self.api_key))}"
            )

            # SECURITY FIX: the URL embeds the API key as a query parameter.
            # Redact it before any logging so credentials never reach log files.
            safe_url = url.replace(quote(str(self.api_key)), "***")
            logger.debug(f"Cartesia WebSocket URL: {safe_url}")

            # Connect to WebSocket with required Cartesia-Version header
            try:
                self.websocket = await asyncio.wait_for(
                    websockets.connect(
                        url, additional_headers=[("Cartesia-Version", CARTESIA_VERSION)]
                    ),
                    timeout=30.0,  # Increase timeout to 30s for initial connection
                )
                logger.debug("Cartesia WebSocket connected")
            except asyncio.TimeoutError:
                logger.error(f"WebSocket connection timeout to {safe_url}")
                raise TimeoutError("Failed to connect to Cartesia WebSocket within 30s timeout")

            # BUGFIX: use a fresh queue per session so stale responses from a
            # previous connection cannot bleed into this one.
            self._response_queue = asyncio.Queue()

            # Start background receive task
            self._receive_task = asyncio.create_task(self._receive_loop())

        except Exception as e:
            logger.error(f"Failed to initialize Cartesia WebSocket: {e}")
            raise

    async def disconnect(self):
        """Close WebSocket connection and cleanup."""
        try:
            if self._receive_task:
                self._receive_task.cancel()
                try:
                    await self._receive_task
                except asyncio.CancelledError:
                    pass

            if self.websocket:
                await self.websocket.close()
                logger.debug("Cartesia WebSocket closed")

        except Exception as e:
            logger.error(f"Error disconnecting Cartesia: {e}")
        finally:
            # BUGFIX: reset state so a later connect() re-establishes the
            # session instead of returning early on a closed socket.
            self._receive_task = None
            self.websocket = None

    async def _receive_loop(self):
        """
        Background task: continuously receive messages from WebSocket.

        Parses JSON responses and queues them for retrieval by transcribe methods.
        Handles: transcript, flush_done, done, error message types.
        """
        try:
            if not self.websocket:
                return

            async for message in self.websocket:
                try:
                    # Parse JSON response
                    response = json.loads(message)
                    await self._response_queue.put(response)

                except json.JSONDecodeError as e:
                    logger.error(f"Failed to parse Cartesia response: {e}")
                except Exception as e:
                    logger.error(f"Error in receive loop: {e}")

        except asyncio.CancelledError:
            logger.debug("Receive loop cancelled")
        except Exception as e:
            logger.error(f"Unexpected error in receive loop: {e}")

    async def transcribe(self, audio: bytes, sample_rate: int = 16000) -> str:
        """
        Transcribe complete audio buffer to text.

        Approach:
            1. Initialize WebSocket if needed
            2. Send audio in chunks (100ms intervals)
            3. Send 'done' command to finalize
            4. Collect all responses until 'done' received
            5. Extract and return transcript text

        Rationale: Batch mode for complete audio files; simple sequential flow.

        Args:
            audio: Raw PCM audio bytes
            sample_rate: Sample rate in Hz (default 16000)

        Returns:
            Transcribed text
        """
        if not self.websocket:
            await self.connect()

        try:
            logger.debug(f"Transcribing {len(audio)} bytes at {sample_rate}Hz")

            # Send audio in chunks (100ms intervals at 16kHz = 3200 bytes)
            chunk_size = int(self.sample_rate * 0.1 * 2)  # 100ms in bytes
            offset = 0

            while offset < len(audio):
                chunk = audio[offset : offset + chunk_size]
                await self.websocket.send(chunk)
                offset += chunk_size

            # Send 'done' command to finalize
            await self.websocket.send("done")

            # Collect responses until 'done' received
            transcript_parts = []
            while True:
                try:
                    response = await asyncio.wait_for(self._response_queue.get(), timeout=10.0)

                    if response.get("type") == "transcript":
                        text = response.get("text", "")
                        if text:
                            transcript_parts.append(text)

                    elif response.get("type") == "done":
                        break

                    elif response.get("type") == "error":
                        error_msg = response.get("error", "Unknown error")
                        raise RuntimeError(f"Cartesia error: {error_msg}")

                except asyncio.TimeoutError:
                    # Best-effort: return whatever was collected so far.
                    logger.warning("Timeout waiting for Cartesia response")
                    break

            return "".join(transcript_parts)

        except Exception as e:
            logger.error(f"Cartesia transcription error: {e}")
            raise

    async def transcribe_stream(
        self, audio_stream: AsyncIterator[AudioChunk]
    ) -> AsyncIterator[TranscriptChunk]:
        """
        Transcribe streaming audio in real-time.

        Approach:
            1. Initialize WebSocket if needed
            2. For each audio chunk from stream:
               - Send binary audio via WebSocket
               - Check response queue for server responses (non-blocking)
               - Yield TranscriptChunk for each response
            3. On final audio chunk (is_final=True), send 'done' command
            4. Continue yielding responses until 'done' received
            5. Signal stream end

        Rationale: Streaming yields results immediately; low latency;
        background task queues responses so we don't block on receives.

        Args:
            audio_stream: Async iterator yielding AudioChunk objects

        Yields:
            TranscriptChunk objects with partial and final transcriptions
        """
        if not self.websocket:
            await self.connect()

        try:
            done_sent = False

            async for audio_chunk in audio_stream:
                # Send audio via WebSocket
                await self.websocket.send(audio_chunk.data)

                # Try to get responses (non-blocking)
                while not self._response_queue.empty():
                    response = self._response_queue.get_nowait()

                    if response.get("type") == "transcript":
                        text = response.get("text", "")
                        is_final = response.get("is_final", False)
                        if text:
                            yield TranscriptChunk(
                                text=text,
                                confidence=None,  # Cartesia doesn't return confidence
                                is_final=is_final,
                            )

                    elif response.get("type") == "error":
                        error_msg = response.get("error", "Unknown error")
                        logger.error(f"Cartesia error: {error_msg}")

                # If this is the final audio chunk, send 'done' command
                if audio_chunk.is_final and not done_sent:
                    await self.websocket.send("done")
                    done_sent = True

            # Continue collecting responses until 'done' received
            if done_sent:
                while True:
                    try:
                        response = await asyncio.wait_for(self._response_queue.get(), timeout=5.0)

                        if response.get("type") == "transcript":
                            text = response.get("text", "")
                            is_final = response.get("is_final", False)
                            if text:
                                yield TranscriptChunk(
                                    text=text,
                                    confidence=None,
                                    is_final=is_final,
                                )

                        elif response.get("type") == "done":
                            # Yield final chunk to signal stream end
                            yield TranscriptChunk(
                                text="",
                                confidence=None,
                                is_final=True,
                            )
                            break

                        elif response.get("type") == "error":
                            error_msg = response.get("error", "Unknown error")
                            logger.error(f"Cartesia error: {error_msg}")

                    except asyncio.TimeoutError:
                        logger.warning("Timeout waiting for Cartesia final response")
                        break

        except Exception as e:
            logger.error(f"Cartesia streaming transcription error: {e}")
            raise