livekit-plugins-cartesia 0.2.0.dev7__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,13 +19,12 @@ __all__ = ["TTS", "ChunkedStream", "__version__"]
19
19
 
20
20
  from livekit.agents import Plugin
21
21
 
22
+ from .log import logger
23
+
22
24
 
23
25
  class CartesiaPlugin(Plugin):
24
26
  def __init__(self):
25
- super().__init__(__name__, __version__, __package__)
26
-
27
- def download_files(self):
28
- pass
27
+ super().__init__(__name__, __version__, __package__, logger)
29
28
 
30
29
 
31
30
  Plugin.register_plugin(CartesiaPlugin())
@@ -11,4 +11,4 @@ TTSEncoding = Literal[
11
11
 
12
12
  TTSModels = Literal["sonic-english", "sonic-multilingual"]
13
13
  TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
14
- TTSDefaultVoiceId = "248be419-c632-4f23-adf1-5324ed7dbf1d"
14
+ TTSDefaultVoiceId = "c2ac25f9-ecc4-4f56-9095-651354df60c0"
@@ -14,11 +14,15 @@
14
14
 
15
15
  from __future__ import annotations
16
16
 
17
+ import asyncio
18
+ import base64
19
+ import json
17
20
  import os
18
21
  from dataclasses import dataclass
22
+ from typing import Any
19
23
 
20
24
  import aiohttp
21
- from livekit.agents import tts, utils
25
+ from livekit.agents import tokenize, tts, utils
22
26
 
23
27
  from .log import logger
24
28
  from .models import TTSDefaultVoiceId, TTSEncoding, TTSModels
@@ -27,6 +31,9 @@ API_AUTH_HEADER = "X-API-Key"
27
31
  API_VERSION_HEADER = "Cartesia-Version"
28
32
  API_VERSION = "2024-06-10"
29
33
 
34
+ NUM_CHANNELS = 1
35
+ BUFFERED_WORDS_COUNT = 8
36
+
30
37
 
31
38
  @dataclass
32
39
  class _TTSOptions:
@@ -51,9 +58,9 @@ class TTS(tts.TTS):
51
58
  http_session: aiohttp.ClientSession | None = None,
52
59
  ) -> None:
53
60
  super().__init__(
54
- capabilities=tts.TTSCapabilities(streaming=False),
61
+ capabilities=tts.TTSCapabilities(streaming=True),
55
62
  sample_rate=sample_rate,
56
- num_channels=1,
63
+ num_channels=NUM_CHANNELS,
57
64
  )
58
65
 
59
66
  api_key = api_key or os.environ.get("CARTESIA_API_KEY")
@@ -79,8 +86,13 @@ class TTS(tts.TTS):
79
86
  def synthesize(self, text: str) -> "ChunkedStream":
80
87
  return ChunkedStream(text, self._opts, self._ensure_session())
81
88
 
89
+ def stream(self) -> "SynthesizeStream":
90
+ return SynthesizeStream(self._opts, self._ensure_session())
91
+
82
92
 
83
93
  class ChunkedStream(tts.ChunkedStream):
94
+ """Synthesize chunked text using the bytes endpoint"""
95
+
84
96
  def __init__(
85
97
  self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
86
98
  ) -> None:
@@ -90,35 +102,17 @@ class ChunkedStream(tts.ChunkedStream):
90
102
  @utils.log_exceptions(logger=logger)
91
103
  async def _main_task(self):
92
104
  bstream = utils.audio.AudioByteStream(
93
- sample_rate=self._opts.sample_rate, num_channels=1
105
+ sample_rate=self._opts.sample_rate, num_channels=NUM_CHANNELS
94
106
  )
95
- request_id = utils.shortuuid()
96
- segment_id = utils.shortuuid()
97
-
98
- voice = {}
99
- if isinstance(self._opts.voice, str):
100
- voice["mode"] = "id"
101
- voice["id"] = self._opts.voice
102
- else:
103
- voice["mode"] = "embedding"
104
- voice["embedding"] = self._opts.voice
105
-
106
- data = {
107
- "model_id": self._opts.model,
108
- "transcript": self._text,
109
- "voice": voice,
110
- "output_format": {
111
- "container": "raw",
112
- "encoding": self._opts.encoding,
113
- "sample_rate": self._opts.sample_rate,
114
- },
115
- "language": self._opts.language,
116
- }
107
+ request_id, segment_id = utils.shortuuid(), utils.shortuuid()
108
+
109
+ data = _to_cartesia_options(self._opts)
110
+ data["transcript"] = self._text
117
111
 
118
112
  async with self._session.post(
119
113
  "https://api.cartesia.ai/tts/bytes",
120
114
  headers={
121
- API_AUTH_HEADER: f"{self._opts.api_key}",
115
+ API_AUTH_HEADER: self._opts.api_key,
122
116
  API_VERSION_HEADER: API_VERSION,
123
117
  },
124
118
  json=data,
@@ -137,3 +131,150 @@ class ChunkedStream(tts.ChunkedStream):
137
131
  request_id=request_id, segment_id=segment_id, frame=frame
138
132
  )
139
133
  )
134
+
135
+
136
+ class SynthesizeStream(tts.SynthesizeStream):
137
+ def __init__(
138
+ self,
139
+ opts: _TTSOptions,
140
+ session: aiohttp.ClientSession,
141
+ ):
142
+ super().__init__()
143
+ self._opts, self._session = opts, session
144
+ self._sent_tokenizer_stream = tokenize.basic.SentenceTokenizer(
145
+ min_sentence_len=BUFFERED_WORDS_COUNT
146
+ ).stream()
147
+
148
+ @utils.log_exceptions(logger=logger)
149
+ async def _main_task(self) -> None:
150
+ retry_count = 0
151
+ max_retry = 3
152
+ while self._input_ch.qsize() or not self._input_ch.closed:
153
+ try:
154
+ url = f"wss://api.cartesia.ai/tts/websocket?api_key={self._opts.api_key}&cartesia_version={API_VERSION}"
155
+ ws = await self._session.ws_connect(url)
156
+ retry_count = 0 # connected successfully, reset the retry_count
157
+
158
+ await self._run_ws(ws)
159
+ except Exception as e:
160
+ if retry_count >= max_retry:
161
+ logger.exception(
162
+ f"failed to connect to Cartesia after {max_retry} tries"
163
+ )
164
+ break
165
+
166
+ retry_delay = min(retry_count * 2, 10) # max 10s
167
+ retry_count += 1
168
+
169
+ logger.warning(
170
+ f"Cartesia connection failed, retrying in {retry_delay}s",
171
+ exc_info=e,
172
+ )
173
+ await asyncio.sleep(retry_delay)
174
+
175
+ async def _run_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
176
+ request_id = utils.shortuuid()
177
+
178
+ async def sentence_stream_task():
179
+ base_pkt = _to_cartesia_options(self._opts)
180
+ async for ev in self._sent_tokenizer_stream:
181
+ token_pkt = base_pkt.copy()
182
+ token_pkt["context_id"] = request_id
183
+ token_pkt["transcript"] = ev.token + " "
184
+ token_pkt["continue"] = True
185
+ await ws.send_str(json.dumps(token_pkt))
186
+
187
+ end_pkt = base_pkt.copy()
188
+ end_pkt["context_id"] = request_id
189
+ end_pkt["transcript"] = " "
190
+ end_pkt["continue"] = False
191
+ await ws.send_str(json.dumps(end_pkt))
192
+
193
+ async def input_task():
194
+ async for data in self._input_ch:
195
+ if isinstance(data, self._FlushSentinel):
196
+ self._sent_tokenizer_stream.flush()
197
+ continue
198
+ self._sent_tokenizer_stream.push_text(data)
199
+ self._sent_tokenizer_stream.end_input()
200
+
201
+ async def recv_task():
202
+ audio_bstream = utils.audio.AudioByteStream(
203
+ sample_rate=self._opts.sample_rate,
204
+ num_channels=NUM_CHANNELS,
205
+ )
206
+
207
+ while True:
208
+ msg = await ws.receive()
209
+ if msg.type in (
210
+ aiohttp.WSMsgType.CLOSED,
211
+ aiohttp.WSMsgType.CLOSE,
212
+ aiohttp.WSMsgType.CLOSING,
213
+ ):
214
+ raise Exception("Cartesia connection closed unexpectedly")
215
+
216
+ if msg.type != aiohttp.WSMsgType.TEXT:
217
+ logger.warning("unexpected Cartesia message type %s", msg.type)
218
+ continue
219
+
220
+ data = json.loads(msg.data)
221
+ segment_id = data.get("context_id")
222
+ # Once we receive audio for a segment, we can start a new segment
223
+ if data.get("data"):
224
+ b64data = base64.b64decode(data["data"])
225
+ for frame in audio_bstream.write(b64data):
226
+ self._event_ch.send_nowait(
227
+ tts.SynthesizedAudio(
228
+ request_id=request_id,
229
+ segment_id=segment_id,
230
+ frame=frame,
231
+ )
232
+ )
233
+ elif data.get("done"):
234
+ for frame in audio_bstream.flush():
235
+ self._event_ch.send_nowait(
236
+ tts.SynthesizedAudio(
237
+ request_id=request_id,
238
+ segment_id=segment_id,
239
+ frame=frame,
240
+ )
241
+ )
242
+
243
+ if segment_id == request_id:
244
+ # we're not going to receive more frames, close the connection
245
+ await ws.close()
246
+ break
247
+ else:
248
+ logger.error("unexpected Cartesia message %s", data)
249
+
250
+ tasks = [
251
+ asyncio.create_task(input_task()),
252
+ asyncio.create_task(sentence_stream_task()),
253
+ asyncio.create_task(recv_task()),
254
+ ]
255
+
256
+ try:
257
+ await asyncio.gather(*tasks)
258
+ finally:
259
+ await utils.aio.gracefully_cancel(*tasks)
260
+
261
+
262
+ def _to_cartesia_options(opts: _TTSOptions) -> dict[str, Any]:
263
+ voice: dict[str, Any] = {}
264
+ if isinstance(opts.voice, str):
265
+ voice["mode"] = "id"
266
+ voice["id"] = opts.voice
267
+ else:
268
+ voice["mode"] = "embedding"
269
+ voice["embedding"] = opts.voice
270
+
271
+ return {
272
+ "model_id": opts.model,
273
+ "voice": voice,
274
+ "output_format": {
275
+ "container": "raw",
276
+ "encoding": opts.encoding,
277
+ "sample_rate": opts.sample_rate,
278
+ },
279
+ "language": opts.language,
280
+ }
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.2.0-dev.7"
15
+ __version__ = "0.4.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-cartesia
3
- Version: 0.2.0.dev7
3
+ Version: 0.4.0
4
4
  Summary: LiveKit Agents Plugin for Cartesia
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -0,0 +1,10 @@
1
+ livekit/plugins/cartesia/__init__.py,sha256=BUfWY_evL5dUHn9hBDQVor6ssctDKQfbQfZy5SWndN8,926
2
+ livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
3
+ livekit/plugins/cartesia/models.py,sha256=ZoSyV2ap_LqAIgvBvkmukkPxQR9DfKb3Z3oHtWxMiVg,335
4
+ livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/cartesia/tts.py,sha256=sdiiWinOZR5EBkQFwa3GZAGrkgzXY1-aSRiDZ34K8ww,9527
6
+ livekit/plugins/cartesia/version.py,sha256=yelanl1wEXtgUH0CzoNVXfi2yTc2hElSzuAhULFzANc,600
7
+ livekit_plugins_cartesia-0.4.0.dist-info/METADATA,sha256=BGgicrqKsylOpTbUcRG0B4DZF2qnaERI9q7qwIRLN7s,1252
8
+ livekit_plugins_cartesia-0.4.0.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
9
+ livekit_plugins_cartesia-0.4.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_cartesia-0.4.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (71.1.0)
2
+ Generator: setuptools (72.2.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,10 +0,0 @@
1
- livekit/plugins/cartesia/__init__.py,sha256=_a8u7qqya1pjZTV19gNOpMKTO7ccAVZAeCukiDKAG-U,937
2
- livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
3
- livekit/plugins/cartesia/models.py,sha256=06S-Z-M90kB-kEOQsQk70xfQUD-TztU4ZIU_AfAyUMc,335
4
- livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/cartesia/tts.py,sha256=S5BMSVtsbNI_c2PpgyFK6wvleudmJZLTUt3ZmGNKlRI,4319
6
- livekit/plugins/cartesia/version.py,sha256=cmcEldnzcpX8ulbgiuN3mLrZUz2H7FyBpXlMqC7B5K4,606
7
- livekit_plugins_cartesia-0.2.0.dev7.dist-info/METADATA,sha256=5tWoZTOirxenLlJEp8H0ZMMKK4TKNqkwG-kRizkQ1Mg,1257
8
- livekit_plugins_cartesia-0.2.0.dev7.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
9
- livekit_plugins_cartesia-0.2.0.dev7.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_cartesia-0.2.0.dev7.dist-info/RECORD,,