livekit-plugins-cartesia 0.4.2__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,3 +28,12 @@ class CartesiaPlugin(Plugin):
28
28
 
29
29
 
30
30
  Plugin.register_plugin(CartesiaPlugin())
31
+
# Cleanup docs of unexported modules: every module-level name that is not
# listed in __all__ is mapped to False in __pdoc__.
# dir() is snapshotted first, so the helper names defined below are not part
# of the snapshot themselves.
_module = dir()
NOT_IN_ALL = [m for m in _module if m not in __all__]

# dict.fromkeys builds the same {name: False} mapping as a manual loop, without
# leaving a stray loop variable behind in the package namespace.
__pdoc__ = dict.fromkeys(NOT_IN_ALL, False)
@@ -22,7 +22,17 @@ from dataclasses import dataclass
22
22
  from typing import Any
23
23
 
24
24
  import aiohttp
25
- from livekit.agents import tokenize, tts, utils
25
+ from livekit import rtc
26
+ from livekit.agents import (
27
+ DEFAULT_API_CONNECT_OPTIONS,
28
+ APIConnectionError,
29
+ APIConnectOptions,
30
+ APIStatusError,
31
+ APITimeoutError,
32
+ tokenize,
33
+ tts,
34
+ utils,
35
+ )
26
36
 
27
37
  from .log import logger
28
38
  from .models import (
@@ -43,7 +53,7 @@ BUFFERED_WORDS_COUNT = 8
43
53
 
44
54
  @dataclass
45
55
  class _TTSOptions:
46
- model: TTSModels
56
+ model: TTSModels | str
47
57
  encoding: TTSEncoding
48
58
  sample_rate: int
49
59
  voice: str | list[float]
@@ -57,7 +67,7 @@ class TTS(tts.TTS):
57
67
  def __init__(
58
68
  self,
59
69
  *,
60
- model: TTSModels = "sonic-english",
70
+ model: TTSModels | str = "sonic-english",
61
71
  language: str = "en",
62
72
  encoding: TTSEncoding = "pcm_s16le",
63
73
  voice: str | list[float] = TTSDefaultVoiceId,
@@ -112,99 +122,145 @@ class TTS(tts.TTS):
112
122
 
113
123
  return self._session
114
124
 
115
- def synthesize(self, text: str) -> "ChunkedStream":
116
- return ChunkedStream(text, self._opts, self._ensure_session())
def update_options(
    self,
    *,
    model: TTSModels | str | None = None,
    language: str | None = None,
    voice: str | list[float] | None = None,
    speed: TTSVoiceSpeed | float | None = None,
    emotion: list[TTSVoiceEmotion | str] | None = None,
) -> None:
    """
    Update the Text-to-Speech (TTS) configuration options.

    Only the arguments that are provided are applied; every other setting keeps
    its current value.

    Args:
        model (TTSModels | str, optional): The Cartesia TTS model to use.
        language (str, optional): The language code for synthesis.
        voice (str | list[float], optional): The voice ID or embedding array.
        speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
        emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
    """
    # For these three, an empty string / empty list is treated the same as
    # "not provided" and the current value is kept.
    self._opts.model = model or self._opts.model
    self._opts.language = language or self._opts.language
    self._opts.voice = voice or self._opts.voice

    # A numeric speed of 0.0 is falsy, so a truthiness test would silently drop
    # it; compare against None instead (same rule as `emotion` below).
    if speed is not None:
        self._opts.speed = speed
    if emotion is not None:
        self._opts.emotion = emotion
153
+
def synthesize(
    self,
    text: str,
    *,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> ChunkedStream:
    """Return a ChunkedStream that synthesizes ``text``.

    The stream is given this instance, its current options object, the
    supplied connection options and the HTTP session returned by
    ``_ensure_session()``.
    """
    current_opts = self._opts
    http_session = self._ensure_session()
    return ChunkedStream(
        tts=self,
        input_text=text,
        conn_options=conn_options,
        opts=current_opts,
        session=http_session,
    )
117
167
 
118
- def stream(self) -> "SynthesizeStream":
119
- return SynthesizeStream(self._opts, self._ensure_session())
def stream(
    self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> "SynthesizeStream":
    """Return a SynthesizeStream for incremental synthesis.

    The stream is given this instance, its current options object, the
    supplied connection options and the HTTP session returned by
    ``_ensure_session()``.
    """
    current_opts = self._opts
    http_session = self._ensure_session()
    return SynthesizeStream(
        tts=self,
        conn_options=conn_options,
        opts=current_opts,
        session=http_session,
    )
120
177
 
121
178
 
class ChunkedStream(tts.ChunkedStream):
    """Synthesize a complete text in one request using the bytes endpoint."""

    def __init__(
        self,
        *,
        tts: TTS,
        input_text: str,
        conn_options: APIConnectOptions,
        opts: _TTSOptions,
        session: aiohttp.ClientSession,
    ) -> None:
        """Keep the options and HTTP session; everything else goes to the base class."""
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._opts = opts
        self._session = session

    async def _run(self) -> None:
        """POST the input text to the bytes endpoint and emit the audio as frames.

        Raises:
            APITimeoutError: when the request raises ``asyncio.TimeoutError``.
            APIStatusError: when aiohttp raises ``ClientResponseError``
                (e.g. from ``raise_for_status()``).
            APIConnectionError: for any other ``Exception``.
        """
        request_id = utils.shortuuid()
        frame_buffer = utils.audio.AudioByteStream(
            sample_rate=self._opts.sample_rate, num_channels=NUM_CHANNELS
        )

        # Named `payload` so the `json` module used elsewhere in this file is
        # not shadowed inside this method.
        payload = _to_cartesia_options(self._opts)
        payload["transcript"] = self._input_text

        request_headers = {
            API_AUTH_HEADER: self._opts.api_key,
            API_VERSION_HEADER: API_VERSION,
        }

        def _emit(frame) -> None:
            # Every frame of this request is tagged with the same request_id.
            self._event_ch.send_nowait(
                tts.SynthesizedAudio(request_id=request_id, frame=frame)
            )

        try:
            # Built inside the try so that a failure here is mapped like any
            # other request error.
            request_timeout = aiohttp.ClientTimeout(
                total=30,
                sock_connect=self._conn_options.timeout,
            )
            async with self._session.post(
                "https://api.cartesia.ai/tts/bytes",
                headers=request_headers,
                json=payload,
                timeout=request_timeout,
            ) as resp:
                resp.raise_for_status()
                async for chunk, _ in resp.content.iter_chunks():
                    for frame in frame_buffer.write(chunk):
                        _emit(frame)

                # Emit whatever is still buffered once the body is exhausted.
                for frame in frame_buffer.flush():
                    _emit(frame)
        except asyncio.TimeoutError as e:
            raise APITimeoutError() from e
        except aiohttp.ClientResponseError as e:
            raise APIStatusError(
                message=e.message,
                status_code=e.status,
                request_id=None,
                body=None,
            ) from e
        except Exception as e:
            raise APIConnectionError() from e
163
243
 
164
244
 
165
245
  class SynthesizeStream(tts.SynthesizeStream):
def __init__(
    self,
    *,
    tts: TTS,
    conn_options: APIConnectOptions,
    opts: _TTSOptions,
    session: aiohttp.ClientSession,
):
    """Keep the options and HTTP session and open a sentence-tokenizer stream.

    ``tts`` and ``conn_options`` are forwarded to the base class. The
    tokenizer is created with ``min_sentence_len=BUFFERED_WORDS_COUNT``.
    """
    super().__init__(tts=tts, conn_options=conn_options)
    self._opts = opts
    self._session = session
    sentence_tokenizer = tokenize.basic.SentenceTokenizer(
        min_sentence_len=BUFFERED_WORDS_COUNT
    )
    self._sent_tokenizer_stream = sentence_tokenizer.stream()
176
259
 
177
- @utils.log_exceptions(logger=logger)
178
- async def _main_task(self) -> None:
179
- retry_count = 0
180
- max_retry = 3
181
- while self._input_ch.qsize() or not self._input_ch.closed:
182
- try:
183
- url = f"wss://api.cartesia.ai/tts/websocket?api_key={self._opts.api_key}&cartesia_version={API_VERSION}"
184
- ws = await self._session.ws_connect(url)
185
- retry_count = 0 # connected successfully, reset the retry_count
186
-
187
- await self._run_ws(ws)
188
- except Exception as e:
189
- if retry_count >= max_retry:
190
- logger.exception(
191
- f"failed to connect to Cartesia after {max_retry} tries"
192
- )
193
- break
194
-
195
- retry_delay = min(retry_count * 2, 10) # max 10s
196
- retry_count += 1
197
-
198
- logger.warning(
199
- f"Cartesia connection failed, retrying in {retry_delay}s",
200
- exc_info=e,
201
- )
202
- await asyncio.sleep(retry_delay)
203
-
204
- async def _run_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
260
+ async def _run(self) -> None:
205
261
  request_id = utils.shortuuid()
206
262
 
207
- async def sentence_stream_task():
263
+ async def _sentence_stream_task(ws: aiohttp.ClientWebSocketResponse):
208
264
  base_pkt = _to_cartesia_options(self._opts)
209
265
  async for ev in self._sent_tokenizer_stream:
210
266
  token_pkt = base_pkt.copy()
@@ -219,7 +275,7 @@ class SynthesizeStream(tts.SynthesizeStream):
219
275
  end_pkt["continue"] = False
220
276
  await ws.send_str(json.dumps(end_pkt))
221
277
 
222
- async def input_task():
278
+ async def _input_task():
223
279
  async for data in self._input_ch:
224
280
  if isinstance(data, self._FlushSentinel):
225
281
  self._sent_tokenizer_stream.flush()
@@ -227,12 +283,28 @@ class SynthesizeStream(tts.SynthesizeStream):
227
283
  self._sent_tokenizer_stream.push_text(data)
228
284
  self._sent_tokenizer_stream.end_input()
229
285
 
230
- async def recv_task():
286
+ async def _recv_task(ws: aiohttp.ClientWebSocketResponse):
231
287
  audio_bstream = utils.audio.AudioByteStream(
232
288
  sample_rate=self._opts.sample_rate,
233
289
  num_channels=NUM_CHANNELS,
234
290
  )
235
291
 
292
+ last_frame: rtc.AudioFrame | None = None
293
+
294
+ def _send_last_frame(*, segment_id: str, is_final: bool) -> None:
295
+ nonlocal last_frame
296
+ if last_frame is not None:
297
+ self._event_ch.send_nowait(
298
+ tts.SynthesizedAudio(
299
+ request_id=request_id,
300
+ segment_id=segment_id,
301
+ frame=last_frame,
302
+ is_final=is_final,
303
+ )
304
+ )
305
+
306
+ last_frame = None
307
+
236
308
  while True:
237
309
  msg = await ws.receive()
238
310
  if msg.type in (
@@ -248,26 +320,18 @@ class SynthesizeStream(tts.SynthesizeStream):
248
320
 
249
321
  data = json.loads(msg.data)
250
322
  segment_id = data.get("context_id")
251
- # Once we receive audio for a segment, we can start a new segment
323
+
252
324
  if data.get("data"):
253
325
  b64data = base64.b64decode(data["data"])
254
326
  for frame in audio_bstream.write(b64data):
255
- self._event_ch.send_nowait(
256
- tts.SynthesizedAudio(
257
- request_id=request_id,
258
- segment_id=segment_id,
259
- frame=frame,
260
- )
261
- )
327
+ _send_last_frame(segment_id=segment_id, is_final=False)
328
+ last_frame = frame
262
329
  elif data.get("done"):
263
330
  for frame in audio_bstream.flush():
264
- self._event_ch.send_nowait(
265
- tts.SynthesizedAudio(
266
- request_id=request_id,
267
- segment_id=segment_id,
268
- frame=frame,
269
- )
270
- )
331
+ _send_last_frame(segment_id=segment_id, is_final=False)
332
+ last_frame = frame
333
+
334
+ _send_last_frame(segment_id=segment_id, is_final=True)
271
335
 
272
336
  if segment_id == request_id:
273
337
  # we're not going to receive more frames, close the connection
@@ -276,16 +340,28 @@ class SynthesizeStream(tts.SynthesizeStream):
276
340
  else:
277
341
  logger.error("unexpected Cartesia message %s", data)
278
342
 
279
- tasks = [
280
- asyncio.create_task(input_task()),
281
- asyncio.create_task(sentence_stream_task()),
282
- asyncio.create_task(recv_task()),
283
- ]
343
+ url = f"wss://api.cartesia.ai/tts/websocket?api_key={self._opts.api_key}&cartesia_version={API_VERSION}"
344
+
345
+ ws: aiohttp.ClientWebSocketResponse | None = None
284
346
 
285
347
  try:
286
- await asyncio.gather(*tasks)
348
+ ws = await asyncio.wait_for(
349
+ self._session.ws_connect(url), self._conn_options.timeout
350
+ )
351
+
352
+ tasks = [
353
+ asyncio.create_task(_input_task()),
354
+ asyncio.create_task(_sentence_stream_task(ws)),
355
+ asyncio.create_task(_recv_task(ws)),
356
+ ]
357
+
358
+ try:
359
+ await asyncio.gather(*tasks)
360
+ finally:
361
+ await utils.aio.gracefully_cancel(*tasks)
287
362
  finally:
288
- await utils.aio.gracefully_cancel(*tasks)
363
+ if ws is not None:
364
+ await ws.close()
289
365
 
290
366
 
291
367
  def _to_cartesia_options(opts: _TTSOptions) -> dict[str, Any]:
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.4.2"
15
+ __version__ = "0.4.5"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-cartesia
3
- Version: 0.4.2
3
+ Version: 0.4.5
4
4
  Summary: LiveKit Agents Plugin for Cartesia
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: livekit-agents >=0.8.0.dev0
22
+ Requires-Dist: livekit-agents>=0.11
23
23
 
24
24
  # LiveKit Plugins Cartesia
25
25
 
@@ -0,0 +1,10 @@
1
+ livekit/plugins/cartesia/__init__.py,sha256=UTa6Q7IxhRBCwPftowHEUDvmBg99J_UjGS_yxTzKD7g,1095
2
+ livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
3
+ livekit/plugins/cartesia/models.py,sha256=fOO276Vzw3OkDUWUVcw7PH95ctFy38rj3q9I6_mYQ7M,950
4
+ livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/cartesia/tts.py,sha256=SZH1tYHxKDgZ5PbBHkC86vATPkxu81UGnU44FCEzasI,13778
6
+ livekit/plugins/cartesia/version.py,sha256=NVa5L7bU73cSrgbGChyGQDqP6rLxpFdXF6hoIrBpXM8,600
7
+ livekit_plugins_cartesia-0.4.5.dist-info/METADATA,sha256=HRCHZl35yVnXjG5yvSYCilcJeefHsve7-xKt1bOkGsE,1245
8
+ livekit_plugins_cartesia-0.4.5.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
9
+ livekit_plugins_cartesia-0.4.5.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_cartesia-0.4.5.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (74.1.2)
2
+ Generator: setuptools (75.6.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,10 +0,0 @@
1
- livekit/plugins/cartesia/__init__.py,sha256=BUfWY_evL5dUHn9hBDQVor6ssctDKQfbQfZy5SWndN8,926
2
- livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
3
- livekit/plugins/cartesia/models.py,sha256=fOO276Vzw3OkDUWUVcw7PH95ctFy38rj3q9I6_mYQ7M,950
4
- livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/cartesia/tts.py,sha256=kUGIhsmHqIK2m_FV44_nwjHp0c7Zb2H7UG9VayNIae8,11341
6
- livekit/plugins/cartesia/version.py,sha256=jabhjXzHcov1Cy2z9FGgyHFpSQ3hFKqu3vly20WQeTs,600
7
- livekit_plugins_cartesia-0.4.2.dist-info/METADATA,sha256=w9ZGYOicE_fUFVTnhgvewGgWgwmaInoG9w6BGTiOu-8,1252
8
- livekit_plugins_cartesia-0.4.2.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
9
- livekit_plugins_cartesia-0.4.2.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_cartesia-0.4.2.dist-info/RECORD,,