atom-audio-engine 0.1.0-py3-none-any.whl → 0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {atom_audio_engine-0.1.0.dist-info → atom_audio_engine-0.1.1.dist-info}/METADATA +10 -5
- atom_audio_engine-0.1.1.dist-info/RECORD +5 -0
- atom_audio_engine-0.1.1.dist-info/top_level.txt +1 -0
- audio_engine/__init__.py +80 -0
- asr/__init__.py +0 -45
- asr/base.py +0 -89
- asr/cartesia.py +0 -356
- asr/deepgram.py +0 -196
- atom_audio_engine-0.1.0.dist-info/RECORD +0 -25
- atom_audio_engine-0.1.0.dist-info/top_level.txt +0 -8
- core/__init__.py +0 -13
- core/config.py +0 -162
- core/pipeline.py +0 -282
- core/types.py +0 -87
- integrations/__init__.py +0 -5
- integrations/geneface.py +0 -297
- llm/__init__.py +0 -38
- llm/base.py +0 -108
- llm/groq.py +0 -210
- pipelines/__init__.py +0 -1
- streaming/__init__.py +0 -5
- streaming/websocket_server.py +0 -341
- tts/__init__.py +0 -37
- tts/base.py +0 -155
- tts/cartesia.py +0 -392
- utils/__init__.py +0 -15
- utils/audio.py +0 -220
- {atom_audio_engine-0.1.0.dist-info → atom_audio_engine-0.1.1.dist-info}/WHEEL +0 -0
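Note on the restructure: per the two top_level.txt changes above, 0.1.0 installed eight bare top-level packages (asr, core, integrations, llm, pipelines, streaming, tts, utils), all deleted in this release, while 0.1.1 ships a single audio_engine package. A hypothetical sketch of what this means for imports; the 0.1.0 path is taken from the deleted files below, while the 0.1.1 path is an assumption, since this diff does not show the new package contents:

    # 0.1.0: modules sat at the top level, so generic names like "core" or
    # "utils" could collide with other installed packages
    from tts.cartesia import CartesiaTTS

    # 0.1.1: presumably namespaced under the single top-level package
    # (hypothetical import path, not confirmed by this diff)
    # from audio_engine import CartesiaTTS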
tts/base.py
DELETED
@@ -1,155 +0,0 @@
-"""Abstract base class for TTS (Text-to-Speech) providers."""
-
-from abc import ABC, abstractmethod
-from typing import AsyncIterator, Optional
-
-from core.types import AudioChunk, AudioFormat
-
-
-class BaseTTS(ABC):
-    """
-    Abstract base class for Text-to-Speech providers.
-
-    All TTS implementations must inherit from this class and implement
-    the required methods for both batch and streaming audio synthesis.
-    """
-
-    def __init__(
-        self,
-        api_key: Optional[str] = None,
-        voice_id: Optional[str] = None,
-        model: Optional[str] = None,
-        speed: float = 1.0,
-        output_format: AudioFormat = AudioFormat.PCM_24K,
-        **kwargs
-    ):
-        """
-        Initialize the TTS provider.
-
-        Args:
-            api_key: API key for the provider
-            voice_id: Voice identifier to use
-            model: Model identifier (if applicable)
-            speed: Speech speed multiplier (1.0 = normal)
-            output_format: Desired audio output format
-            **kwargs: Additional provider-specific configuration
-        """
-        self.api_key = api_key
-        self.voice_id = voice_id
-        self.model = model
-        self.speed = speed
-        self.output_format = output_format
-        self.config = kwargs
-
-    @abstractmethod
-    async def synthesize(self, text: str) -> bytes:
-        """
-        Synthesize complete audio from text.
-
-        Args:
-            text: Text to convert to speech
-
-        Returns:
-            Complete audio as bytes
-        """
-        pass
-
-    @abstractmethod
-    async def synthesize_stream(self, text: str) -> AsyncIterator[AudioChunk]:
-        """
-        Synthesize streaming audio from text.
-
-        Args:
-            text: Text to convert to speech
-
-        Yields:
-            AudioChunk objects with audio data
-        """
-        pass
-
-    async def synthesize_stream_text(
-        self, text_stream: AsyncIterator[str]
-    ) -> AsyncIterator[AudioChunk]:
-        """
-        Synthesize streaming audio from streaming text input.
-
-        This enables sentence-by-sentence TTS as the LLM generates text.
-        Default implementation buffers until punctuation. Override for
-        providers with native text streaming support.
-
-        Args:
-            text_stream: Async iterator yielding text chunks
-
-        Yields:
-            AudioChunk objects with audio data
-        """
-        buffer = ""
-        sentence_enders = ".!?;"
-
-        async for text_chunk in text_stream:
-            buffer += text_chunk
-
-            # Check if we have a complete sentence
-            for ender in sentence_enders:
-                if ender in buffer:
-                    # Split at the sentence boundary
-                    parts = buffer.split(ender, 1)
-                    sentence = parts[0] + ender
-
-                    if sentence.strip():
-                        async for audio_chunk in self.synthesize_stream(
-                            sentence.strip()
-                        ):
-                            yield audio_chunk
-
-                    buffer = parts[1] if len(parts) > 1 else ""
-                    break
-
-        # Handle remaining text
-        if buffer.strip():
-            async for audio_chunk in self.synthesize_stream(buffer.strip()):
-                yield audio_chunk
-
-    async def __aenter__(self):
-        """Async context manager entry."""
-        await self.connect()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Async context manager exit."""
-        await self.disconnect()
-
-    async def connect(self):
-        """
-        Establish connection to the TTS service.
-        Override in subclasses if needed.
-        """
-        pass
-
-    async def disconnect(self):
-        """
-        Close connection to the TTS service.
-        Override in subclasses if needed.
-        """
-        pass
-
-    @property
-    @abstractmethod
-    def name(self) -> str:
-        """Return the name of this TTS provider."""
-        pass
-
-    @property
-    def supports_streaming(self) -> bool:
-        """Whether this provider supports streaming audio output."""
-        return True
-
-    @property
-    def sample_rate(self) -> int:
-        """Return the sample rate for this provider's output."""
-        format_rates = {
-            AudioFormat.PCM_16K: 16000,
-            AudioFormat.PCM_24K: 24000,
-            AudioFormat.PCM_44K: 44100,
-        }
-        return format_rates.get(self.output_format, 24000)
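Example (editorial addition, not part of the published diff): a minimal concrete provider sketched against the deleted 0.1.0 interface above. DummyTTS and its silence output are hypothetical and only illustrate the required overrides (name, synthesize, synthesize_stream); the inherited synthesize_stream_text, context-manager, and sample_rate behavior come from BaseTTS as-is.

    from typing import AsyncIterator

    from core.types import AudioChunk
    from tts.base import BaseTTS


    class DummyTTS(BaseTTS):
        """Hypothetical provider emitting silence, for illustration only."""

        @property
        def name(self) -> str:
            return "dummy"

        async def synthesize(self, text: str) -> bytes:
            # 100 ms of 16-bit mono silence per character, at the rate
            # implied by output_format (24 kHz with the default PCM_24K)
            return b"\x00\x00" * (int(self.sample_rate * 0.1) * len(text))

        async def synthesize_stream(self, text: str) -> AsyncIterator[AudioChunk]:
            # Single-chunk stream; a real provider would yield incrementally
            yield AudioChunk(
                data=await self.synthesize(text),
                sample_rate=self.sample_rate,
                channels=1,
                format=self.output_format,
                is_final=True,
            )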
tts/cartesia.py
DELETED
@@ -1,392 +0,0 @@
-"""Cartesia API implementation for TTS (Text-to-Speech)."""
-
-import asyncio
-import base64
-import json
-import logging
-from typing import AsyncIterator, Optional
-
-import websockets
-
-
-from core.types import AudioChunk, AudioFormat
-from .base import BaseTTS
-
-logger = logging.getLogger(__name__)
-
-
-class CartesiaTTS(BaseTTS):
-    """
-    Cartesia API client for text-to-speech synthesis.
-
-    Supports streaming synthesis with per-chunk latency < 200ms.
-    Uses WebSocket connections for real-time streaming with continuations.
-    Outputs 16kHz PCM by default (can be configured).
-
-    Example:
-        tts = CartesiaTTS(api_key="...", voice_id="sonic")
-
-        # Streaming text input (from LLM)
-        async for chunk in tts.synthesize_stream_text(llm_text_stream):
-            play_audio(chunk)
-    """
-
-    CARTESIA_VERSION = "2025-04-16"
-    DEFAULT_VOICE_ID = "c8605446-247c-4d39-acd4-8f4c28aa363c"  # Edith voice
-    WS_URL = "wss://api.cartesia.ai/tts/websocket"
-
-    def __init__(
-        self,
-        api_key: Optional[str] = None,
-        voice_id: Optional[str] = None,
-        model: Optional[str] = "sonic-3",
-        speed: float = 1.0,
-        output_format: AudioFormat = AudioFormat.PCM_16K,
-        sample_rate: int = 16000,
-        max_buffer_delay_ms: int = 1500,
-        **kwargs,
-    ):
-        """
-        Initialize Cartesia TTS provider.
-
-        Args:
-            api_key: Cartesia API key (or None to use CARTESIA_API_KEY env var)
-            voice_id: Voice identifier (UUID or default Edith)
-            model: Model to use (default: sonic-3)
-            speed: Speech speed multiplier (1.0 = normal)
-            output_format: Desired audio output format (default 16kHz PCM)
-            sample_rate: Output sample rate in Hz (default: 16000)
-            max_buffer_delay_ms: Buffering delay for streaming (0-5000ms)
-            **kwargs: Additional config
-        """
-        # Fallback to environment variable if not provided
-        if not api_key:
-            import os
-
-            api_key = os.getenv("CARTESIA_API_KEY")
-
-        super().__init__(
-            api_key=api_key,
-            voice_id=voice_id or self.DEFAULT_VOICE_ID,
-            model=model,
-            speed=speed,
-            output_format=output_format,
-            **kwargs,
-        )
-        self._sample_rate = sample_rate
-        self.max_buffer_delay_ms = max_buffer_delay_ms
-
-    @property
-    def name(self) -> str:
-        """Return provider name."""
-        return "cartesia"
-
-    @property
-    def sample_rate(self) -> int:
-        """Return the sample rate for this provider's output."""
-        return self._sample_rate
-
-    async def connect(self):
-        """Cartesia uses WebSocket connections - no persistent client needed."""
-        pass
-
-    async def disconnect(self):
-        """Cartesia uses WebSocket connections - no persistent client needed."""
-        pass
-
-    async def synthesize(self, text: str) -> bytes:
-        """
-        Synthesize complete audio from text (non-streaming).
-
-        Args:
-            text: Text to convert to speech
-
-        Returns:
-            Complete audio as bytes (PCM)
-        """
-        audio_data = bytearray()
-        async for chunk in self.synthesize_stream_text(self._text_to_async_iter(text)):
-            if chunk.data and not chunk.is_final:
-                audio_data.extend(chunk.data)
-        return bytes(audio_data)
-
-    async def synthesize_stream(self, text: str) -> AsyncIterator[AudioChunk]:
-        """
-        Synthesize streaming audio from text.
-
-        Args:
-            text: Text to convert to speech
-
-        Yields:
-            AudioChunk objects with audio data
-        """
-        async for chunk in self.synthesize_stream_text(self._text_to_async_iter(text)):
-            yield chunk
-
-    async def synthesize_stream_text(
-        self, text_stream: AsyncIterator[str]
-    ) -> AsyncIterator[AudioChunk]:
-        """
-        Synthesize streaming audio from streaming text input via WebSocket.
-
-        Uses continuations to maintain natural prosody across streamed text chunks.
-
-        Args:
-            text_stream: Async iterator yielding text tokens
-
-        Yields:
-            AudioChunk objects with audio data
-        """
-        if websockets is None:
-            raise ImportError(
-                "websockets package required. Install: pip install websockets"
-            )
-
-        if not self.api_key:
-            raise ValueError("api_key required for Cartesia TTS")
-
-        # Use unique context ID for this synthesis session
-        import uuid
-
-        context_id = str(uuid.uuid4())
-
-        ws_url = (
-            f"{self.WS_URL}"
-            f"?api_key={self.api_key}"
-            f"&cartesia_version={self.CARTESIA_VERSION}"
-        )
-
-        try:
-            async with websockets.connect(ws_url) as websocket:
-                logger.debug(
-                    f"Cartesia TTS WebSocket connected | Context: {context_id}"
-                )
-
-                # Task to receive audio from WebSocket
-                async def receive_audio():
-                    """Receive audio chunks from TTS WebSocket."""
-                    logger.debug("Cartesia: receive_audio started")
-                    try:
-                        async for message in websocket:
-                            if isinstance(message, str):
-                                try:
-                                    response = json.loads(message)
-                                    logger.debug(
-                                        f"Cartesia: received response type={response.get('type')}"
-                                    )
-                                    # Handle audio chunk (base64 in "data" field)
-                                    if response.get("type") == "chunk" and response.get(
-                                        "data"
-                                    ):
-                                        audio_bytes = base64.b64decode(response["data"])
-                                        yield audio_bytes
-                                        logger.debug(
-                                            f"Cartesia: received audio chunk {len(audio_bytes)} bytes"
-                                        )
-                                    # Handle buffer flush
-                                    elif response.get("type") == "flush_done":
-                                        logger.debug("Cartesia: buffer flushed")
-                                    # Handle completion
-                                    elif response.get("type") == "done":
-                                        logger.info("Cartesia: TTS generation complete")
-                                        break
-                                    # Handle error
-                                    elif response.get("type") == "error":
-                                        error_msg = (
-                                            response.get("error")
-                                            or response.get("error_message")
-                                            or response.get("message")
-                                            or str(response)
-                                        )
-                                        logger.error(f"Cartesia TTS error: {error_msg}")
-                                        raise RuntimeError(
-                                            f"Cartesia API error: {error_msg}"
-                                        )
-                                    else:
-                                        logger.debug(
-                                            f"Cartesia: response type {response.get('type')}"
-                                        )
-                                except json.JSONDecodeError:
-                                    logger.warning(
-                                        f"Failed to parse Cartesia response: {message}"
-                                    )
-                    except Exception as e:
-                        logger.error(f"Cartesia receive error: {e}", exc_info=True)
-                        raise
-
-                # Task to send text to WebSocket
-                async def send_text():
-                    """Send text tokens to TTS WebSocket."""
-                    logger.debug("Cartesia: send_text started")
-                    accumulated_text = ""
-                    first_token_timeout = 30.0
-                    subsequent_token_timeout = 2.0
-                    first_token_received = False
-
-                    try:
-                        while True:
-                            try:
-                                # Wait for token with appropriate timeout
-                                timeout = (
-                                    first_token_timeout
-                                    if not first_token_received
-                                    else subsequent_token_timeout
-                                )
-                                token = await asyncio.wait_for(
-                                    self._get_next_token(text_stream),
-                                    timeout=timeout,
-                                )
-                                first_token_received = True
-                            except asyncio.TimeoutError:
-                                logger.debug(
-                                    f"Cartesia: token timeout (first_token={first_token_received})"
-                                )
-                                # Send accumulated text even on timeout
-                                if accumulated_text.strip():
-                                    request = {
-                                        "model_id": self.model,
-                                        "transcript": accumulated_text,
-                                        "context_id": context_id,
-                                        "continue": True,
-                                        "max_buffer_delay_ms": self.max_buffer_delay_ms,
-                                        "voice": {
-                                            "mode": "id",
-                                            "id": self.voice_id,
-                                        },
-                                        "output_format": {
-                                            "container": "raw",
-                                            "encoding": "pcm_s16le",
-                                            "sample_rate": self.sample_rate,
-                                        },
-                                    }
-                                    await websocket.send(json.dumps(request))
-                                    logger.debug(
-                                        f"Cartesia: sent text on timeout (continue=true)"
-                                    )
-                                    accumulated_text = ""
-                                continue
-
-                            # None signals end of text stream
-                            if token is None:
-                                # Send remaining text with continue=false
-                                if accumulated_text.strip():
-                                    request = {
-                                        "model_id": self.model,
-                                        "transcript": accumulated_text,
-                                        "context_id": context_id,
-                                        "continue": False,
-                                        "max_buffer_delay_ms": self.max_buffer_delay_ms,
-                                        "voice": {
-                                            "mode": "id",
-                                            "id": self.voice_id,
-                                        },
-                                        "output_format": {
-                                            "container": "raw",
-                                            "encoding": "pcm_s16le",
-                                            "sample_rate": self.sample_rate,
-                                        },
-                                    }
-                                    await websocket.send(json.dumps(request))
-                                    logger.debug(
-                                        f"Cartesia: sent final text (continue=false)"
-                                    )
-                                else:
-                                    # Send empty transcript to signal end
-                                    request = {
-                                        "model_id": self.model,
-                                        "transcript": "",
-                                        "context_id": context_id,
-                                        "continue": False,
-                                        "max_buffer_delay_ms": self.max_buffer_delay_ms,
-                                        "voice": {
-                                            "mode": "id",
-                                            "id": self.voice_id,
-                                        },
-                                        "output_format": {
-                                            "container": "raw",
-                                            "encoding": "pcm_s16le",
-                                            "sample_rate": self.sample_rate,
-                                        },
-                                    }
-                                    await websocket.send(json.dumps(request))
-                                    logger.debug(
-                                        "Cartesia: sent empty transcript to signal end"
-                                    )
-                                logger.info("Cartesia: all text sent")
-                                break
-
-                            # Accumulate token
-                            accumulated_text += token
-                            logger.debug(
-                                f"Cartesia: buffered token {len(accumulated_text)} chars total"
-                            )
-
-                            # Send when buffer is large enough or ends with punctuation
-                            if len(accumulated_text) > 30 or token.endswith(
-                                (".", "!", "?")
-                            ):
-                                request = {
-                                    "model_id": self.model,
-                                    "transcript": accumulated_text,
-                                    "context_id": context_id,
-                                    "continue": True,
-                                    "max_buffer_delay_ms": self.max_buffer_delay_ms,
-                                    "voice": {
-                                        "mode": "id",
-                                        "id": self.voice_id,
-                                    },
-                                    "output_format": {
-                                        "container": "raw",
-                                        "encoding": "pcm_s16le",
-                                        "sample_rate": self.sample_rate,
-                                    },
-                                }
-                                await websocket.send(json.dumps(request))
-                                logger.debug(
-                                    f"Cartesia: sent buffered text (continue=true)"
-                                )
-                                accumulated_text = ""
-
-                    except Exception as e:
-                        logger.error(f"Cartesia send error: {e}")
-
-                # Run send and receive concurrently
-                send_task = asyncio.create_task(send_text())
-
-                async for audio_bytes in receive_audio():
-                    yield AudioChunk(
-                        data=audio_bytes,
-                        sample_rate=self.sample_rate,
-                        channels=1,
-                        format=self.output_format,
-                        is_final=False,
-                    )
-
-                # Wait for send task to complete
-                await send_task
-
-                # Yield final marker
-                yield AudioChunk(
-                    data=b"",
-                    sample_rate=self.sample_rate,
-                    channels=1,
-                    format=self.output_format,
-                    is_final=True,
-                )
-
-                logger.info("Cartesia: stream complete")
-
-        except Exception as e:
-            logger.error(f"Cartesia streaming text error: {e}")
-            raise
-
-    async def _get_next_token(self, text_stream: AsyncIterator[str]) -> Optional[str]:
-        """Get next token from async iterator."""
-        try:
-            return await text_stream.__anext__()
-        except StopAsyncIteration:
-            return None
-
-    async def _text_to_async_iter(self, text: str) -> AsyncIterator[str]:
-        """Convert plain text to async iterator."""
-        yield text
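Example (editorial addition, not part of the published diff): driving the deleted 0.1.0 CartesiaTTS from a token stream. Each buffered transcript above goes out with continue=true under a single context_id and the stream closes with continue=false, so the caller only ever sees AudioChunk objects. fake_llm_stream is a stand-in for a real LLM token iterator, and running this requires a valid CARTESIA_API_KEY and network access.

    import asyncio

    from tts.cartesia import CartesiaTTS  # 0.1.0 import path


    async def fake_llm_stream():
        # Stand-in for an LLM token iterator
        for token in ["Hello ", "from ", "the ", "audio ", "engine."]:
            yield token


    async def main():
        tts = CartesiaTTS(api_key=None)  # falls back to the CARTESIA_API_KEY env var
        pcm = bytearray()
        async with tts:  # BaseTTS.__aenter__/__aexit__ call connect()/disconnect()
            async for chunk in tts.synthesize_stream_text(fake_llm_stream()):
                if not chunk.is_final:
                    pcm.extend(chunk.data)  # raw pcm_s16le, 16 kHz mono by default
        print(f"received {len(pcm)} bytes of audio")


    asyncio.run(main())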
utils/__init__.py
DELETED