atom-audio-engine 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tts/base.py DELETED
@@ -1,155 +0,0 @@
1
- """Abstract base class for TTS (Text-to-Speech) providers."""
2
-
3
- from abc import ABC, abstractmethod
4
- from typing import AsyncIterator, Optional
5
-
6
- from core.types import AudioChunk, AudioFormat
7
-
8
-
9
class BaseTTS(ABC):
    """
    Abstract base class for Text-to-Speech providers.

    All TTS implementations must inherit from this class and implement
    the required methods for both batch and streaming audio synthesis.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        voice_id: Optional[str] = None,
        model: Optional[str] = None,
        speed: float = 1.0,
        output_format: AudioFormat = AudioFormat.PCM_24K,
        **kwargs
    ):
        """
        Initialize the TTS provider.

        Args:
            api_key: API key for the provider
            voice_id: Voice identifier to use
            model: Model identifier (if applicable)
            speed: Speech speed multiplier (1.0 = normal)
            output_format: Desired audio output format
            **kwargs: Additional provider-specific configuration
        """
        self.api_key = api_key
        self.voice_id = voice_id
        self.model = model
        self.speed = speed
        self.output_format = output_format
        self.config = kwargs

    @abstractmethod
    async def synthesize(self, text: str) -> bytes:
        """
        Synthesize complete audio from text.

        Args:
            text: Text to convert to speech

        Returns:
            Complete audio as bytes
        """
        pass

    @abstractmethod
    async def synthesize_stream(self, text: str) -> AsyncIterator[AudioChunk]:
        """
        Synthesize streaming audio from text.

        Args:
            text: Text to convert to speech

        Yields:
            AudioChunk objects with audio data
        """
        pass

    async def synthesize_stream_text(
        self, text_stream: AsyncIterator[str]
    ) -> AsyncIterator[AudioChunk]:
        """
        Synthesize streaming audio from streaming text input.

        This enables sentence-by-sentence TTS as the LLM generates text.
        Default implementation buffers until punctuation. Override for
        providers with native text streaming support.

        Args:
            text_stream: Async iterator yielding text chunks

        Yields:
            AudioChunk objects with audio data
        """
        buffer = ""
        sentence_enders = ".!?;"

        async for text_chunk in text_stream:
            buffer += text_chunk

            # Emit EVERY complete sentence currently buffered, splitting at
            # the earliest sentence ender by position. (The previous version
            # split at whichever ender came first in the string ".!?;" --
            # which could cut at a later boundary than the real one -- and
            # emitted at most one sentence per incoming chunk.)
            while True:
                positions = [
                    buffer.index(e) for e in sentence_enders if e in buffer
                ]
                if not positions:
                    break
                split_at = min(positions) + 1
                sentence, buffer = buffer[:split_at], buffer[split_at:]

                if sentence.strip():
                    async for audio_chunk in self.synthesize_stream(
                        sentence.strip()
                    ):
                        yield audio_chunk

        # Handle remaining text that never hit a sentence ender.
        if buffer.strip():
            async for audio_chunk in self.synthesize_stream(buffer.strip()):
                yield audio_chunk

    async def __aenter__(self):
        """Async context manager entry."""
        await self.connect()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self.disconnect()

    async def connect(self):
        """
        Establish connection to the TTS service.
        Override in subclasses if needed.
        """
        pass

    async def disconnect(self):
        """
        Close connection to the TTS service.
        Override in subclasses if needed.
        """
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the name of this TTS provider."""
        pass

    @property
    def supports_streaming(self) -> bool:
        """Whether this provider supports streaming audio output."""
        return True

    @property
    def sample_rate(self) -> int:
        """Return the sample rate for this provider's output."""
        # Unknown formats fall back to 24 kHz, matching the PCM_24K default.
        format_rates = {
            AudioFormat.PCM_16K: 16000,
            AudioFormat.PCM_24K: 24000,
            AudioFormat.PCM_44K: 44100,
        }
        return format_rates.get(self.output_format, 24000)
tts/cartesia.py DELETED
@@ -1,392 +0,0 @@
1
- """Cartesia API implementation for TTS (Text-to-Speech)."""
2
-
3
- import asyncio
4
- import base64
5
- import json
6
- import logging
7
- from typing import AsyncIterator, Optional
8
-
9
- import websockets
10
-
11
-
12
- from core.types import AudioChunk, AudioFormat
13
- from .base import BaseTTS
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
class CartesiaTTS(BaseTTS):
    """
    Cartesia API client for text-to-speech synthesis.

    Supports streaming synthesis with per-chunk latency < 200ms.
    Uses WebSocket connections for real-time streaming with continuations.
    Outputs 16kHz PCM by default (can be configured).

    Example:
        tts = CartesiaTTS(api_key="...", voice_id="sonic")

        # Streaming text input (from LLM)
        async for chunk in tts.synthesize_stream_text(llm_text_stream):
            play_audio(chunk)
    """

    CARTESIA_VERSION = "2025-04-16"
    DEFAULT_VOICE_ID = "c8605446-247c-4d39-acd4-8f4c28aa363c"  # Edith voice
    WS_URL = "wss://api.cartesia.ai/tts/websocket"

    def __init__(
        self,
        api_key: Optional[str] = None,
        voice_id: Optional[str] = None,
        model: Optional[str] = "sonic-3",
        speed: float = 1.0,
        output_format: AudioFormat = AudioFormat.PCM_16K,
        sample_rate: int = 16000,
        max_buffer_delay_ms: int = 1500,
        **kwargs,
    ):
        """
        Initialize Cartesia TTS provider.

        Args:
            api_key: Cartesia API key (or None to use CARTESIA_API_KEY env var)
            voice_id: Voice identifier (UUID or default Edith)
            model: Model to use (default: sonic-3)
            speed: Speech speed multiplier (1.0 = normal)
            output_format: Desired audio output format (default 16kHz PCM)
            sample_rate: Output sample rate in Hz (default: 16000)
            max_buffer_delay_ms: Buffering delay for streaming (0-5000ms)
            **kwargs: Additional config
        """
        # Fallback to environment variable if not provided
        if not api_key:
            import os

            api_key = os.getenv("CARTESIA_API_KEY")

        super().__init__(
            api_key=api_key,
            voice_id=voice_id or self.DEFAULT_VOICE_ID,
            model=model,
            speed=speed,
            output_format=output_format,
            **kwargs,
        )
        self._sample_rate = sample_rate
        self.max_buffer_delay_ms = max_buffer_delay_ms

    @property
    def name(self) -> str:
        """Return provider name."""
        return "cartesia"

    @property
    def sample_rate(self) -> int:
        """Return the sample rate for this provider's output."""
        return self._sample_rate

    async def connect(self):
        """Cartesia uses WebSocket connections - no persistent client needed."""
        pass

    async def disconnect(self):
        """Cartesia uses WebSocket connections - no persistent client needed."""
        pass

    def _build_request(
        self, transcript: str, context_id: str, is_continuation: bool
    ) -> dict:
        """
        Build a single Cartesia TTS WebSocket request payload.

        Args:
            transcript: Text to synthesize (may be "" to close the context)
            context_id: Continuation context shared across this session
            is_continuation: True to keep the context open (more text follows),
                False to finalize it

        Returns:
            JSON-serializable request dict.
        """
        return {
            "model_id": self.model,
            "transcript": transcript,
            "context_id": context_id,
            "continue": is_continuation,
            "max_buffer_delay_ms": self.max_buffer_delay_ms,
            "voice": {
                "mode": "id",
                "id": self.voice_id,
            },
            "output_format": {
                "container": "raw",
                "encoding": "pcm_s16le",
                "sample_rate": self.sample_rate,
            },
        }

    async def synthesize(self, text: str) -> bytes:
        """
        Synthesize complete audio from text (non-streaming).

        Args:
            text: Text to convert to speech

        Returns:
            Complete audio as bytes (PCM)
        """
        audio_data = bytearray()
        async for chunk in self.synthesize_stream_text(self._text_to_async_iter(text)):
            if chunk.data and not chunk.is_final:
                audio_data.extend(chunk.data)
        return bytes(audio_data)

    async def synthesize_stream(self, text: str) -> AsyncIterator[AudioChunk]:
        """
        Synthesize streaming audio from text.

        Args:
            text: Text to convert to speech

        Yields:
            AudioChunk objects with audio data
        """
        async for chunk in self.synthesize_stream_text(self._text_to_async_iter(text)):
            yield chunk

    async def synthesize_stream_text(
        self, text_stream: AsyncIterator[str]
    ) -> AsyncIterator[AudioChunk]:
        """
        Synthesize streaming audio from streaming text input via WebSocket.

        Uses continuations to maintain natural prosody across streamed text chunks.

        Args:
            text_stream: Async iterator yielding text tokens

        Yields:
            AudioChunk objects with audio data

        Raises:
            ImportError: If the websockets package is unavailable
            ValueError: If no API key is configured
            RuntimeError: If the Cartesia API reports an error
        """
        # NOTE(review): the module imports websockets unconditionally, so this
        # guard is likely unreachable; kept for safety if the import is ever
        # made optional.
        if websockets is None:
            raise ImportError(
                "websockets package required. Install: pip install websockets"
            )

        if not self.api_key:
            raise ValueError("api_key required for Cartesia TTS")

        # Use unique context ID for this synthesis session
        import uuid

        context_id = str(uuid.uuid4())

        ws_url = (
            f"{self.WS_URL}"
            f"?api_key={self.api_key}"
            f"&cartesia_version={self.CARTESIA_VERSION}"
        )

        try:
            async with websockets.connect(ws_url) as websocket:
                logger.debug(
                    f"Cartesia TTS WebSocket connected | Context: {context_id}"
                )

                # Task to receive audio from WebSocket
                async def receive_audio():
                    """Receive audio chunks from TTS WebSocket."""
                    logger.debug("Cartesia: receive_audio started")
                    try:
                        async for message in websocket:
                            if isinstance(message, str):
                                try:
                                    response = json.loads(message)
                                    logger.debug(
                                        f"Cartesia: received response type={response.get('type')}"
                                    )
                                    # Handle audio chunk (base64 in "data" field)
                                    if response.get("type") == "chunk" and response.get(
                                        "data"
                                    ):
                                        audio_bytes = base64.b64decode(response["data"])
                                        yield audio_bytes
                                        logger.debug(
                                            f"Cartesia: received audio chunk {len(audio_bytes)} bytes"
                                        )
                                    # Handle buffer flush
                                    elif response.get("type") == "flush_done":
                                        logger.debug("Cartesia: buffer flushed")
                                    # Handle completion
                                    elif response.get("type") == "done":
                                        logger.info("Cartesia: TTS generation complete")
                                        break
                                    # Handle error
                                    elif response.get("type") == "error":
                                        error_msg = (
                                            response.get("error")
                                            or response.get("error_message")
                                            or response.get("message")
                                            or str(response)
                                        )
                                        logger.error(f"Cartesia TTS error: {error_msg}")
                                        raise RuntimeError(
                                            f"Cartesia API error: {error_msg}"
                                        )
                                    else:
                                        logger.debug(
                                            f"Cartesia: response type {response.get('type')}"
                                        )
                                except json.JSONDecodeError:
                                    logger.warning(
                                        f"Failed to parse Cartesia response: {message}"
                                    )
                    except Exception as e:
                        logger.error(f"Cartesia receive error: {e}", exc_info=True)
                        raise

                # Task to send text to WebSocket
                async def send_text():
                    """Send text tokens to TTS WebSocket, batching into phrases."""
                    logger.debug("Cartesia: send_text started")
                    accumulated_text = ""
                    first_token_timeout = 30.0
                    subsequent_token_timeout = 2.0
                    first_token_received = False

                    try:
                        while True:
                            try:
                                # Wait for token with appropriate timeout
                                timeout = (
                                    first_token_timeout
                                    if not first_token_received
                                    else subsequent_token_timeout
                                )
                                token = await asyncio.wait_for(
                                    self._get_next_token(text_stream),
                                    timeout=timeout,
                                )
                                first_token_received = True
                            except asyncio.TimeoutError:
                                logger.debug(
                                    f"Cartesia: token timeout (first_token={first_token_received})"
                                )
                                # Flush buffered text on timeout so playback
                                # is not starved by a slow text producer.
                                if accumulated_text.strip():
                                    await websocket.send(
                                        json.dumps(
                                            self._build_request(
                                                accumulated_text, context_id, True
                                            )
                                        )
                                    )
                                    logger.debug(
                                        "Cartesia: sent text on timeout (continue=true)"
                                    )
                                    accumulated_text = ""
                                continue

                            # None signals end of text stream
                            if token is None:
                                if accumulated_text.strip():
                                    # Send remaining text with continue=false
                                    await websocket.send(
                                        json.dumps(
                                            self._build_request(
                                                accumulated_text, context_id, False
                                            )
                                        )
                                    )
                                    logger.debug(
                                        "Cartesia: sent final text (continue=false)"
                                    )
                                else:
                                    # Send empty transcript to signal end
                                    await websocket.send(
                                        json.dumps(
                                            self._build_request("", context_id, False)
                                        )
                                    )
                                    logger.debug(
                                        "Cartesia: sent empty transcript to signal end"
                                    )
                                logger.info("Cartesia: all text sent")
                                break

                            # Accumulate token
                            accumulated_text += token
                            logger.debug(
                                f"Cartesia: buffered token {len(accumulated_text)} chars total"
                            )

                            # Send when buffer is large enough or ends with punctuation
                            if len(accumulated_text) > 30 or token.endswith(
                                (".", "!", "?")
                            ):
                                await websocket.send(
                                    json.dumps(
                                        self._build_request(
                                            accumulated_text, context_id, True
                                        )
                                    )
                                )
                                logger.debug(
                                    "Cartesia: sent buffered text (continue=true)"
                                )
                                accumulated_text = ""

                    except Exception as e:
                        # Re-raise so the failure surfaces at `await send_task`
                        # instead of leaving the receiver waiting for a "done"
                        # that will never arrive.
                        logger.error(f"Cartesia send error: {e}")
                        raise

                # Run send and receive concurrently
                send_task = asyncio.create_task(send_text())

                try:
                    async for audio_bytes in receive_audio():
                        yield AudioChunk(
                            data=audio_bytes,
                            sample_rate=self.sample_rate,
                            channels=1,
                            format=self.output_format,
                            is_final=False,
                        )

                    # Wait for send task to complete (propagates send errors)
                    await send_task
                finally:
                    # Never orphan the sender if the receive loop fails or
                    # the consumer stops iterating early.
                    if not send_task.done():
                        send_task.cancel()
                        try:
                            await send_task
                        except asyncio.CancelledError:
                            pass

                # Yield final marker
                yield AudioChunk(
                    data=b"",
                    sample_rate=self.sample_rate,
                    channels=1,
                    format=self.output_format,
                    is_final=True,
                )

                logger.info("Cartesia: stream complete")

        except Exception as e:
            logger.error(f"Cartesia streaming text error: {e}")
            raise

    async def _get_next_token(self, text_stream: AsyncIterator[str]) -> Optional[str]:
        """Get next token from async iterator, or None when exhausted."""
        try:
            return await text_stream.__anext__()
        except StopAsyncIteration:
            return None

    async def _text_to_async_iter(self, text: str) -> AsyncIterator[str]:
        """Convert plain text to a single-item async iterator."""
        yield text
utils/__init__.py DELETED
@@ -1,15 +0,0 @@
1
- """Utility functions for the audio engine."""
2
-
3
- from utils.audio import (
4
- resample_audio,
5
- pcm_to_wav,
6
- wav_to_pcm,
7
- get_audio_duration,
8
- )
9
-
10
- __all__ = [
11
- "resample_audio",
12
- "pcm_to_wav",
13
- "wav_to_pcm",
14
- "get_audio_duration",
15
- ]