atom-audio-engine 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {atom_audio_engine-0.1.4.dist-info → atom_audio_engine-0.1.6.dist-info}/METADATA +1 -1
  2. atom_audio_engine-0.1.6.dist-info/RECORD +32 -0
  3. audio_engine/__init__.py +6 -2
  4. audio_engine/asr/__init__.py +48 -0
  5. audio_engine/asr/base.py +89 -0
  6. audio_engine/asr/cartesia.py +350 -0
  7. audio_engine/asr/deepgram.py +196 -0
  8. audio_engine/core/__init__.py +13 -0
  9. audio_engine/core/config.py +162 -0
  10. audio_engine/core/pipeline.py +278 -0
  11. audio_engine/core/types.py +87 -0
  12. audio_engine/integrations/__init__.py +5 -0
  13. audio_engine/integrations/geneface.py +297 -0
  14. audio_engine/llm/__init__.py +40 -0
  15. audio_engine/llm/base.py +106 -0
  16. audio_engine/llm/groq.py +208 -0
  17. audio_engine/pipelines/__init__.py +1 -0
  18. audio_engine/pipelines/personaplex/__init__.py +41 -0
  19. audio_engine/pipelines/personaplex/client.py +259 -0
  20. audio_engine/pipelines/personaplex/config.py +69 -0
  21. audio_engine/pipelines/personaplex/pipeline.py +301 -0
  22. audio_engine/pipelines/personaplex/types.py +173 -0
  23. audio_engine/pipelines/personaplex/utils.py +192 -0
  24. audio_engine/streaming/__init__.py +5 -0
  25. audio_engine/streaming/websocket_server.py +333 -0
  26. audio_engine/tts/__init__.py +35 -0
  27. audio_engine/tts/base.py +153 -0
  28. audio_engine/tts/cartesia.py +370 -0
  29. audio_engine/utils/__init__.py +15 -0
  30. audio_engine/utils/audio.py +218 -0
  31. atom_audio_engine-0.1.4.dist-info/RECORD +0 -5
  32. {atom_audio_engine-0.1.4.dist-info → atom_audio_engine-0.1.6.dist-info}/WHEEL +0 -0
  33. {atom_audio_engine-0.1.4.dist-info → atom_audio_engine-0.1.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,370 @@
1
+ """Cartesia API implementation for TTS (Text-to-Speech)."""
2
+
3
+ import asyncio
4
+ import base64
5
+ import json
6
+ import logging
7
+ from typing import AsyncIterator, Optional
8
+
9
+ import websockets
10
+
11
+
12
+ from ..core.types import AudioChunk, AudioFormat
13
+ from .base import BaseTTS
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class CartesiaTTS(BaseTTS):
    """
    Cartesia API client for text-to-speech synthesis.

    Supports streaming synthesis with per-chunk latency < 200ms.
    Uses WebSocket connections for real-time streaming with continuations.
    Outputs 16kHz PCM by default (can be configured).

    Example:
        tts = CartesiaTTS(api_key="...", voice_id="sonic")

        # Streaming text input (from LLM)
        async for chunk in tts.synthesize_stream_text(llm_text_stream):
            play_audio(chunk)
    """

    CARTESIA_VERSION = "2025-04-16"
    DEFAULT_VOICE_ID = "c8605446-247c-4d39-acd4-8f4c28aa363c"  # Edith voice
    WS_URL = "wss://api.cartesia.ai/tts/websocket"

    def __init__(
        self,
        api_key: Optional[str] = None,
        voice_id: Optional[str] = None,
        model: Optional[str] = "sonic-3",
        speed: float = 1.0,
        output_format: AudioFormat = AudioFormat.PCM_16K,
        sample_rate: int = 16000,
        max_buffer_delay_ms: int = 1500,
        **kwargs,
    ):
        """
        Initialize Cartesia TTS provider.

        Args:
            api_key: Cartesia API key (or None to use CARTESIA_API_KEY env var)
            voice_id: Voice identifier (UUID or default Edith)
            model: Model to use (default: sonic-3)
            speed: Speech speed multiplier (1.0 = normal)
            output_format: Desired audio output format (default 16kHz PCM)
            sample_rate: Output sample rate in Hz (default: 16000)
            max_buffer_delay_ms: Buffering delay for streaming (0-5000ms)
            **kwargs: Additional config
        """
        # Fallback to environment variable if not provided
        if not api_key:
            import os

            api_key = os.getenv("CARTESIA_API_KEY")

        super().__init__(
            api_key=api_key,
            voice_id=voice_id or self.DEFAULT_VOICE_ID,
            model=model,
            speed=speed,
            output_format=output_format,
            **kwargs,
        )
        self._sample_rate = sample_rate
        self.max_buffer_delay_ms = max_buffer_delay_ms

    @property
    def name(self) -> str:
        """Return provider name."""
        return "cartesia"

    @property
    def sample_rate(self) -> int:
        """Return the sample rate for this provider's output."""
        return self._sample_rate

    async def connect(self):
        """Cartesia uses WebSocket connections - no persistent client needed."""
        pass

    async def disconnect(self):
        """Cartesia uses WebSocket connections - no persistent client needed."""
        pass

    def _build_ws_request(self, transcript: str, context_id: str, continue_: bool) -> dict:
        """
        Build one Cartesia WebSocket synthesis request payload.

        Args:
            transcript: Text to synthesize (may be "" to signal end of input)
            context_id: Continuation context identifier for this session
            continue_: True if more text will follow in the same context

        Returns:
            JSON-serializable request dict.
        """
        return {
            "model_id": self.model,
            "transcript": transcript,
            "context_id": context_id,
            "continue": continue_,
            "max_buffer_delay_ms": self.max_buffer_delay_ms,
            "voice": {
                "mode": "id",
                "id": self.voice_id,
            },
            "output_format": {
                "container": "raw",
                "encoding": "pcm_s16le",
                "sample_rate": self.sample_rate,
            },
        }

    async def synthesize(self, text: str) -> bytes:
        """
        Synthesize complete audio from text (non-streaming).

        Args:
            text: Text to convert to speech

        Returns:
            Complete audio as bytes (PCM)
        """
        audio_data = bytearray()
        async for chunk in self.synthesize_stream_text(self._text_to_async_iter(text)):
            if chunk.data and not chunk.is_final:
                audio_data.extend(chunk.data)
        return bytes(audio_data)

    async def synthesize_stream(self, text: str) -> AsyncIterator[AudioChunk]:
        """
        Synthesize streaming audio from text.

        Args:
            text: Text to convert to speech

        Yields:
            AudioChunk objects with audio data
        """
        async for chunk in self.synthesize_stream_text(self._text_to_async_iter(text)):
            yield chunk

    async def synthesize_stream_text(
        self, text_stream: AsyncIterator[str]
    ) -> AsyncIterator[AudioChunk]:
        """
        Synthesize streaming audio from streaming text input via WebSocket.

        Uses continuations to maintain natural prosody across streamed text chunks.

        Args:
            text_stream: Async iterator yielding text tokens

        Yields:
            AudioChunk objects with audio data (a final empty chunk carries
            is_final=True)

        Raises:
            ImportError: If the websockets package is not installed.
            ValueError: If no API key is configured.
        """
        if websockets is None:
            raise ImportError("websockets package required. Install: pip install websockets")

        if not self.api_key:
            raise ValueError("api_key required for Cartesia TTS")

        # Use unique context ID for this synthesis session
        import uuid

        context_id = str(uuid.uuid4())

        ws_url = (
            f"{self.WS_URL}" f"?api_key={self.api_key}" f"&cartesia_version={self.CARTESIA_VERSION}"
        )

        try:
            async with websockets.connect(ws_url) as websocket:
                logger.debug(f"Cartesia TTS WebSocket connected | Context: {context_id}")

                # Task to receive audio from WebSocket
                async def receive_audio():
                    """Receive audio chunks from TTS WebSocket."""
                    logger.debug("Cartesia: receive_audio started")
                    try:
                        async for message in websocket:
                            if isinstance(message, str):
                                try:
                                    response = json.loads(message)
                                    logger.debug(
                                        f"Cartesia: received response type={response.get('type')}"
                                    )
                                    # Handle audio chunk (base64 in "data" field)
                                    if response.get("type") == "chunk" and response.get("data"):
                                        audio_bytes = base64.b64decode(response["data"])
                                        yield audio_bytes
                                        logger.debug(
                                            f"Cartesia: received audio chunk {len(audio_bytes)} bytes"
                                        )
                                    # Handle buffer flush
                                    elif response.get("type") == "flush_done":
                                        logger.debug("Cartesia: buffer flushed")
                                    # Handle completion
                                    elif response.get("type") == "done":
                                        logger.info("Cartesia: TTS generation complete")
                                        break
                                    # Handle error
                                    elif response.get("type") == "error":
                                        error_msg = (
                                            response.get("error")
                                            or response.get("error_message")
                                            or response.get("message")
                                            or str(response)
                                        )
                                        logger.error(f"Cartesia TTS error: {error_msg}")
                                        raise RuntimeError(f"Cartesia API error: {error_msg}")
                                    else:
                                        logger.debug(
                                            f"Cartesia: response type {response.get('type')}"
                                        )
                                except json.JSONDecodeError:
                                    logger.warning(f"Failed to parse Cartesia response: {message}")
                    except Exception as e:
                        logger.error(f"Cartesia receive error: {e}", exc_info=True)
                        raise

                # Task to send text to WebSocket
                async def send_text():
                    """Send text tokens to TTS WebSocket."""
                    logger.debug("Cartesia: send_text started")
                    accumulated_text = ""
                    first_token_timeout = 30.0
                    subsequent_token_timeout = 2.0
                    first_token_received = False

                    try:
                        while True:
                            try:
                                # Wait for token with appropriate timeout
                                timeout = (
                                    first_token_timeout
                                    if not first_token_received
                                    else subsequent_token_timeout
                                )
                                # NOTE(review): wait_for cancels the pending
                                # __anext__ on timeout; if the producer was
                                # mid-yield a token could be lost — confirm the
                                # upstream iterator tolerates cancellation.
                                token = await asyncio.wait_for(
                                    self._get_next_token(text_stream),
                                    timeout=timeout,
                                )
                                first_token_received = True
                            except asyncio.TimeoutError:
                                logger.debug(
                                    f"Cartesia: token timeout (first_token={first_token_received})"
                                )
                                # Send accumulated text even on timeout
                                if accumulated_text.strip():
                                    await websocket.send(
                                        json.dumps(
                                            self._build_ws_request(
                                                accumulated_text, context_id, True
                                            )
                                        )
                                    )
                                    logger.debug("Cartesia: sent text on timeout (continue=true)")
                                    accumulated_text = ""
                                continue

                            # None signals end of text stream
                            if token is None:
                                if accumulated_text.strip():
                                    # Send remaining text with continue=false
                                    await websocket.send(
                                        json.dumps(
                                            self._build_ws_request(
                                                accumulated_text, context_id, False
                                            )
                                        )
                                    )
                                    logger.debug("Cartesia: sent final text (continue=false)")
                                else:
                                    # Send empty transcript to signal end
                                    await websocket.send(
                                        json.dumps(self._build_ws_request("", context_id, False))
                                    )
                                    logger.debug("Cartesia: sent empty transcript to signal end")
                                logger.info("Cartesia: all text sent")
                                break

                            # Accumulate token
                            accumulated_text += token
                            logger.debug(
                                f"Cartesia: buffered token {len(accumulated_text)} chars total"
                            )

                            # Send when buffer is large enough or ends with punctuation
                            if len(accumulated_text) > 30 or token.endswith((".", "!", "?")):
                                await websocket.send(
                                    json.dumps(
                                        self._build_ws_request(accumulated_text, context_id, True)
                                    )
                                )
                                logger.debug("Cartesia: sent buffered text (continue=true)")
                                accumulated_text = ""

                    except Exception as e:
                        logger.error(f"Cartesia send error: {e}")

                # Run send and receive concurrently
                send_task = asyncio.create_task(send_text())

                try:
                    async for audio_bytes in receive_audio():
                        yield AudioChunk(
                            data=audio_bytes,
                            sample_rate=self.sample_rate,
                            channels=1,
                            format=self.output_format,
                            is_final=False,
                        )

                    # Wait for send task to complete
                    await send_task
                finally:
                    # Don't leak a pending sender if the receive loop raised or
                    # our consumer stopped iterating early.
                    if not send_task.done():
                        send_task.cancel()

                # Yield final marker
                yield AudioChunk(
                    data=b"",
                    sample_rate=self.sample_rate,
                    channels=1,
                    format=self.output_format,
                    is_final=True,
                )

                logger.info("Cartesia: stream complete")

        except Exception as e:
            logger.error(f"Cartesia streaming text error: {e}")
            raise

    async def _get_next_token(self, text_stream: AsyncIterator[str]) -> Optional[str]:
        """Get next token from async iterator (None when exhausted)."""
        try:
            return await text_stream.__anext__()
        except StopAsyncIteration:
            return None

    async def _text_to_async_iter(self, text: str) -> AsyncIterator[str]:
        """Convert plain text to a single-item async iterator."""
        yield text
@@ -0,0 +1,15 @@
1
+ """Utility functions for the audio engine."""
2
+
3
+ from .audio import (
4
+ resample_audio,
5
+ pcm_to_wav,
6
+ wav_to_pcm,
7
+ get_audio_duration,
8
+ )
9
+
10
+ __all__ = [
11
+ "resample_audio",
12
+ "pcm_to_wav",
13
+ "wav_to_pcm",
14
+ "get_audio_duration",
15
+ ]
@@ -0,0 +1,218 @@
1
+ """Audio utility functions."""
2
+
3
+ import struct
4
+ from typing import Optional
5
+
6
+
7
def resample_audio(
    audio: bytes,
    from_rate: int,
    to_rate: int,
    channels: int = 1,
    sample_width: int = 2,
) -> bytes:
    """
    Resample audio to a different sample rate.

    Uses scipy's FFT-based resampling when available, falling back to
    linear interpolation otherwise.

    Args:
        audio: Input audio bytes (interleaved PCM)
        from_rate: Original sample rate
        to_rate: Target sample rate
        channels: Number of audio channels
        sample_width: Bytes per sample (2 for 16-bit)

    Returns:
        Resampled audio bytes (interleaved, same channel count)
    """
    if from_rate == to_rate:
        return audio

    try:
        import numpy as np
        from scipy import signal

        # Convert bytes to numpy array
        dtype = np.int16 if sample_width == 2 else np.int32
        samples = np.frombuffer(audio, dtype=dtype)

        if channels > 1:
            # Deinterleave so each channel is resampled independently;
            # resampling the raw interleaved stream would mix channels.
            frames = samples.reshape(-1, channels)
            num_frames = int(frames.shape[0] * to_rate / from_rate)
            resampled = signal.resample(frames, num_frames, axis=0)
        else:
            num_samples = int(len(samples) * to_rate / from_rate)
            resampled = signal.resample(samples, num_samples)

        return resampled.astype(dtype).tobytes()

    except ImportError:
        # Fallback to simple linear interpolation.
        # NOTE(review): the fallback treats the stream as mono 16-bit;
        # multichannel input on this path is resampled interleaved.
        return _simple_resample(audio, from_rate, to_rate, sample_width)
50
+
51
+
52
def _simple_resample(
    audio: bytes,
    from_rate: int,
    to_rate: int,
    sample_width: int = 2,
) -> bytes:
    """
    Simple linear-interpolation resampling (16-bit little-endian PCM).

    Args:
        audio: Input PCM bytes
        from_rate: Original sample rate
        to_rate: Target sample rate
        sample_width: Bytes per sample (only 2 is supported)

    Returns:
        Resampled PCM bytes

    Raises:
        ValueError: If sample_width is not 2.
    """
    if sample_width != 2:
        raise ValueError(f"Unsupported sample width: {sample_width}")

    # Unpack every sample in one C-level call instead of one
    # struct.unpack per sample.
    samples = struct.unpack(f"<{len(audio) // 2}h", audio)

    ratio = from_rate / to_rate
    new_length = int(len(samples) / ratio)
    resampled = []

    for i in range(new_length):
        pos = i * ratio
        idx = int(pos)
        frac = pos - idx

        if idx + 1 < len(samples):
            # Linear blend between the two neighboring samples.
            value = int(samples[idx] * (1 - frac) + samples[idx + 1] * frac)
        else:
            value = samples[idx]

        resampled.append(value)

    return struct.pack(f"<{len(resampled)}h", *resampled)
82
+
83
+
84
def pcm_to_wav(
    pcm_data: bytes,
    sample_rate: int = 16000,
    channels: int = 1,
    bits_per_sample: int = 16,
) -> bytes:
    """
    Wrap raw PCM samples in a standard 44-byte WAV (RIFF) header.

    Args:
        pcm_data: Raw PCM audio bytes
        sample_rate: Sample rate in Hz
        channels: Number of audio channels
        bits_per_sample: Bits per sample (typically 16)

    Returns:
        Complete WAV file contents as bytes
    """
    data_size = len(pcm_data)
    byte_rate = sample_rate * channels * bits_per_sample // 8
    block_align = channels * bits_per_sample // 8

    # RIFF container header: total size excludes the first 8 bytes.
    riff_hdr = struct.pack("<4sI4s", b"RIFF", 36 + data_size, b"WAVE")

    # "fmt " chunk: 16-byte payload describing uncompressed PCM.
    fmt_chunk = struct.pack(
        "<4sIHHIIHH",
        b"fmt ",
        16,  # fmt chunk size
        1,  # audio format (PCM)
        channels,
        sample_rate,
        byte_rate,
        block_align,
        bits_per_sample,
    )

    # "data" chunk header immediately precedes the samples.
    data_hdr = struct.pack("<4sI", b"data", data_size)

    return b"".join((riff_hdr, fmt_chunk, data_hdr, pcm_data))
124
+
125
+
126
def wav_to_pcm(wav_data: bytes) -> tuple[bytes, int, int, int]:
    """
    Extract raw PCM data from WAV format.

    Walks the RIFF chunk list until the ``data`` chunk is found, picking
    up format parameters from the ``fmt `` chunk on the way.

    Args:
        wav_data: WAV file as bytes

    Returns:
        Tuple of (pcm_data, sample_rate, channels, bits_per_sample)

    Raises:
        ValueError: If the RIFF/WAVE header is malformed or no data
            chunk is present.
    """
    # Parse RIFF header
    if wav_data[:4] != b"RIFF" or wav_data[8:12] != b"WAVE":
        raise ValueError("Invalid WAV file")

    pos = 12
    sample_rate = 0
    channels = 0
    bits_per_sample = 0

    # Stop when there isn't room for another 8-byte chunk header, so a
    # truncated file raises the ValueError below, not struct.error.
    while pos + 8 <= len(wav_data):
        chunk_id = wav_data[pos : pos + 4]
        chunk_size = struct.unpack("<I", wav_data[pos + 4 : pos + 8])[0]

        if chunk_id == b"fmt ":
            _, channels, sample_rate, _, _, bits_per_sample = struct.unpack(
                "<HHIIHH", wav_data[pos + 8 : pos + 24]
            )
        elif chunk_id == b"data":
            pcm_data = wav_data[pos + 8 : pos + 8 + chunk_size]
            return pcm_data, sample_rate, channels, bits_per_sample

        # RIFF chunks are word-aligned: an odd-sized chunk is followed by
        # one pad byte that is not counted in chunk_size.
        pos += 8 + chunk_size + (chunk_size & 1)

    raise ValueError("No data chunk found in WAV file")
161
+
162
+
163
def get_audio_duration(
    audio: bytes,
    sample_rate: int,
    channels: int = 1,
    bits_per_sample: int = 16,
) -> float:
    """
    Compute the playback duration of raw PCM audio, in seconds.

    Args:
        audio: PCM audio bytes
        sample_rate: Sample rate in Hz
        channels: Number of audio channels
        bits_per_sample: Bits per sample

    Returns:
        Duration in seconds
    """
    # One frame = one sample per channel.
    frame_size = channels * (bits_per_sample // 8)
    frame_count = len(audio) // frame_size
    return frame_count / sample_rate
184
+
185
+
186
def normalize_audio(audio: bytes, target_db: float = -20.0) -> bytes:
    """
    Normalize 16-bit PCM audio to a target RMS level in dB.

    Best effort: empty input and pure silence are returned unchanged,
    and if numpy is unavailable the audio is passed through untouched.

    Args:
        audio: PCM audio bytes (16-bit)
        target_db: Target dB level (relative to int16 full scale)

    Returns:
        Normalized audio bytes (or the input unchanged; see above)
    """
    if not audio:
        # Avoid numpy's mean-of-empty NaN/RuntimeWarning path.
        return audio

    try:
        import numpy as np

        samples = np.frombuffer(audio, dtype=np.int16).astype(np.float32)

        # Calculate current RMS; silence cannot be scaled meaningfully.
        rms = np.sqrt(np.mean(samples**2))
        if rms == 0:
            return audio

        # Target RMS as a linear amplitude relative to int16 full scale.
        target_rms = 32768 * (10 ** (target_db / 20))

        # Scale and clamp back into the int16 range.
        gain = target_rms / rms
        normalized = np.clip(samples * gain, -32768, 32767).astype(np.int16)

        return normalized.tobytes()

    except ImportError:
        # Return unchanged if numpy not available
        return audio
@@ -1,5 +0,0 @@
1
- audio_engine/__init__.py,sha256=16WB-DwacHIa6_8ejkm6I5NGri4mHbM6FVaY-NM0C64,1499
2
- atom_audio_engine-0.1.4.dist-info/METADATA,sha256=buKMpasiEx1sGryefQhEh70c3kiY-SFPUdryVNsV4FQ,6690
3
- atom_audio_engine-0.1.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
4
- atom_audio_engine-0.1.4.dist-info/top_level.txt,sha256=IyumwgFrsDL7nlZlBijX-0shiSVhhBCFPUNBRNKzWP4,13
5
- atom_audio_engine-0.1.4.dist-info/RECORD,,