livekit-plugins-cartesia 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,3 +28,12 @@ class CartesiaPlugin(Plugin):
28
28
 
29
29
 
30
30
  Plugin.register_plugin(CartesiaPlugin())
31
+
32
+ # Cleanup docs of unexported modules
33
+ _module = dir()
34
+ NOT_IN_ALL = [m for m in _module if m not in __all__]
35
+
36
+ __pdoc__ = {}
37
+
38
+ for n in NOT_IN_ALL:
39
+ __pdoc__[n] = False
@@ -8,7 +8,34 @@ TTSEncoding = Literal[
8
8
  # "pcm_alaw",
9
9
  ]
10
10
 
11
-
12
11
  TTSModels = Literal["sonic-english", "sonic-multilingual"]
13
12
  TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
14
13
  TTSDefaultVoiceId = "c2ac25f9-ecc4-4f56-9095-651354df60c0"
14
+ TTSVoiceSpeed = Literal["fastest", "fast", "normal", "slow", "slowest"]
15
+ TTSVoiceEmotion = Literal[
16
+ "anger:lowest",
17
+ "anger:low",
18
+ "anger",
19
+ "anger:high",
20
+ "anger:highest",
21
+ "positivity:lowest",
22
+ "positivity:low",
23
+ "positivity",
24
+ "positivity:high",
25
+ "positivity:highest",
26
+ "surprise:lowest",
27
+ "surprise:low",
28
+ "surprise",
29
+ "surprise:high",
30
+ "surprise:highest",
31
+ "sadness:lowest",
32
+ "sadness:low",
33
+ "sadness",
34
+ "sadness:high",
35
+ "sadness:highest",
36
+ "curiosity:lowest",
37
+ "curiosity:low",
38
+ "curiosity",
39
+ "curiosity:high",
40
+ "curiosity:highest",
41
+ ]
@@ -22,10 +22,24 @@ from dataclasses import dataclass
22
22
  from typing import Any
23
23
 
24
24
  import aiohttp
25
- from livekit.agents import tokenize, tts, utils
25
+ from livekit import rtc
26
+ from livekit.agents import (
27
+ APIConnectionError,
28
+ APIStatusError,
29
+ APITimeoutError,
30
+ tokenize,
31
+ tts,
32
+ utils,
33
+ )
26
34
 
27
35
  from .log import logger
28
- from .models import TTSDefaultVoiceId, TTSEncoding, TTSModels
36
+ from .models import (
37
+ TTSDefaultVoiceId,
38
+ TTSEncoding,
39
+ TTSModels,
40
+ TTSVoiceEmotion,
41
+ TTSVoiceSpeed,
42
+ )
29
43
 
30
44
  API_AUTH_HEADER = "X-API-Key"
31
45
  API_VERSION_HEADER = "Cartesia-Version"
@@ -37,10 +51,12 @@ BUFFERED_WORDS_COUNT = 8
37
51
 
38
52
  @dataclass
39
53
  class _TTSOptions:
40
- model: TTSModels
54
+ model: TTSModels | str
41
55
  encoding: TTSEncoding
42
56
  sample_rate: int
43
57
  voice: str | list[float]
58
+ speed: TTSVoiceSpeed | float | None
59
+ emotion: list[TTSVoiceEmotion | str] | None
44
60
  api_key: str
45
61
  language: str
46
62
 
@@ -49,14 +65,33 @@ class TTS(tts.TTS):
49
65
  def __init__(
50
66
  self,
51
67
  *,
52
- model: TTSModels = "sonic-english",
68
+ model: TTSModels | str = "sonic-english",
53
69
  language: str = "en",
54
70
  encoding: TTSEncoding = "pcm_s16le",
55
71
  voice: str | list[float] = TTSDefaultVoiceId,
72
+ speed: TTSVoiceSpeed | float | None = None,
73
+ emotion: list[TTSVoiceEmotion | str] | None = None,
56
74
  sample_rate: int = 24000,
57
75
  api_key: str | None = None,
58
76
  http_session: aiohttp.ClientSession | None = None,
59
77
  ) -> None:
78
+ """
79
+ Create a new instance of Cartesia TTS.
80
+
81
+ See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the Cartesia API.
82
+
83
+ Args:
84
+ model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-english".
85
+ language (str, optional): The language code for synthesis. Defaults to "en".
86
+ encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
87
+ voice (str | list[float], optional): The voice ID or embedding array.
88
+ speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
89
+ emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
90
+ sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
91
+ api_key (str, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
92
+ http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
93
+ """
94
+
60
95
  super().__init__(
61
96
  capabilities=tts.TTSCapabilities(streaming=True),
62
97
  sample_rate=sample_rate,
@@ -73,6 +108,8 @@ class TTS(tts.TTS):
73
108
  encoding=encoding,
74
109
  sample_rate=sample_rate,
75
110
  voice=voice,
111
+ speed=speed,
112
+ emotion=emotion,
76
113
  api_key=api_key,
77
114
  )
78
115
  self._session = http_session
@@ -83,63 +120,106 @@ class TTS(tts.TTS):
83
120
 
84
121
  return self._session
85
122
 
123
+ def update_options(
124
+ self,
125
+ *,
126
+ model: TTSModels | None = None,
127
+ language: str | None = None,
128
+ voice: str | list[float] | None = None,
129
+ speed: TTSVoiceSpeed | float | None = None,
130
+ emotion: list[TTSVoiceEmotion | str] | None = None,
131
+ ) -> None:
132
+ """
133
+ Update the Text-to-Speech (TTS) configuration options.
134
+
135
+ This method allows updating the TTS settings, including model type, language, voice, speed,
136
+ and emotion. If any parameter is not provided, the existing value will be retained.
137
+
138
+ Args:
139
+ model (TTSModels, optional): The Cartesia TTS model to use. If not provided, the current model is kept.
140
+ language (str, optional): The language code for synthesis. If not provided, the current language is kept.
141
+ voice (str | list[float], optional): The voice ID or embedding array.
142
+ speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
143
+ emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
144
+ """
145
+ self._opts.model = model or self._opts.model
146
+ self._opts.language = language or self._opts.language
147
+ self._opts.voice = voice or self._opts.voice
148
+ self._opts.speed = speed or self._opts.speed
149
+ if emotion is not None:
150
+ self._opts.emotion = emotion
151
+
86
152
  def synthesize(self, text: str) -> "ChunkedStream":
87
- return ChunkedStream(text, self._opts, self._ensure_session())
153
+ return ChunkedStream(self, text, self._opts, self._ensure_session())
88
154
 
89
155
  def stream(self) -> "SynthesizeStream":
90
- return SynthesizeStream(self._opts, self._ensure_session())
156
+ return SynthesizeStream(self, self._opts, self._ensure_session())
91
157
 
92
158
 
93
159
  class ChunkedStream(tts.ChunkedStream):
94
160
  """Synthesize chunked text using the bytes endpoint"""
95
161
 
96
162
  def __init__(
97
- self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
163
+ self, tts: TTS, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
98
164
  ) -> None:
99
- super().__init__()
100
- self._text, self._opts, self._session = text, opts, session
165
+ super().__init__(tts, text)
166
+ self._opts, self._session = opts, session
101
167
 
102
- @utils.log_exceptions(logger=logger)
103
- async def _main_task(self):
168
+ async def _main_task(self) -> None:
169
+ request_id = utils.shortuuid()
104
170
  bstream = utils.audio.AudioByteStream(
105
171
  sample_rate=self._opts.sample_rate, num_channels=NUM_CHANNELS
106
172
  )
107
- request_id, segment_id = utils.shortuuid(), utils.shortuuid()
108
-
109
- data = _to_cartesia_options(self._opts)
110
- data["transcript"] = self._text
111
-
112
- async with self._session.post(
113
- "https://api.cartesia.ai/tts/bytes",
114
- headers={
115
- API_AUTH_HEADER: self._opts.api_key,
116
- API_VERSION_HEADER: API_VERSION,
117
- },
118
- json=data,
119
- ) as resp:
120
- async for data, _ in resp.content.iter_chunks():
121
- for frame in bstream.write(data):
122
- self._event_ch.send_nowait(
123
- tts.SynthesizedAudio(
124
- request_id=request_id, segment_id=segment_id, frame=frame
173
+
174
+ json = _to_cartesia_options(self._opts)
175
+ json["transcript"] = self._input_text
176
+
177
+ headers = {
178
+ API_AUTH_HEADER: self._opts.api_key,
179
+ API_VERSION_HEADER: API_VERSION,
180
+ }
181
+
182
+ try:
183
+ async with self._session.post(
184
+ "https://api.cartesia.ai/tts/bytes",
185
+ headers=headers,
186
+ json=json,
187
+ ) as resp:
188
+ resp.raise_for_status()
189
+ async for data, _ in resp.content.iter_chunks():
190
+ for frame in bstream.write(data):
191
+ self._event_ch.send_nowait(
192
+ tts.SynthesizedAudio(
193
+ request_id=request_id,
194
+ frame=frame,
195
+ )
125
196
  )
126
- )
127
197
 
128
- for frame in bstream.flush():
129
- self._event_ch.send_nowait(
130
- tts.SynthesizedAudio(
131
- request_id=request_id, segment_id=segment_id, frame=frame
198
+ for frame in bstream.flush():
199
+ self._event_ch.send_nowait(
200
+ tts.SynthesizedAudio(request_id=request_id, frame=frame)
132
201
  )
133
- )
202
+ except asyncio.TimeoutError as e:
203
+ raise APITimeoutError() from e
204
+ except aiohttp.ClientResponseError as e:
205
+ raise APIStatusError(
206
+ message=e.message,
207
+ status_code=e.status,
208
+ request_id=None,
209
+ body=None,
210
+ ) from e
211
+ except Exception as e:
212
+ raise APIConnectionError() from e
134
213
 
135
214
 
136
215
  class SynthesizeStream(tts.SynthesizeStream):
137
216
  def __init__(
138
217
  self,
218
+ tts: TTS,
139
219
  opts: _TTSOptions,
140
220
  session: aiohttp.ClientSession,
141
221
  ):
142
- super().__init__()
222
+ super().__init__(tts)
143
223
  self._opts, self._session = opts, session
144
224
  self._sent_tokenizer_stream = tokenize.basic.SentenceTokenizer(
145
225
  min_sentence_len=BUFFERED_WORDS_COUNT
@@ -204,6 +284,22 @@ class SynthesizeStream(tts.SynthesizeStream):
204
284
  num_channels=NUM_CHANNELS,
205
285
  )
206
286
 
287
+ last_frame: rtc.AudioFrame | None = None
288
+
289
+ def _send_last_frame(*, segment_id: str, is_final: bool) -> None:
290
+ nonlocal last_frame
291
+ if last_frame is not None:
292
+ self._event_ch.send_nowait(
293
+ tts.SynthesizedAudio(
294
+ request_id=request_id,
295
+ segment_id=segment_id,
296
+ frame=last_frame,
297
+ is_final=is_final,
298
+ )
299
+ )
300
+
301
+ last_frame = None
302
+
207
303
  while True:
208
304
  msg = await ws.receive()
209
305
  if msg.type in (
@@ -219,26 +315,18 @@ class SynthesizeStream(tts.SynthesizeStream):
219
315
 
220
316
  data = json.loads(msg.data)
221
317
  segment_id = data.get("context_id")
222
- # Once we receive audio for a segment, we can start a new segment
318
+
223
319
  if data.get("data"):
224
320
  b64data = base64.b64decode(data["data"])
225
321
  for frame in audio_bstream.write(b64data):
226
- self._event_ch.send_nowait(
227
- tts.SynthesizedAudio(
228
- request_id=request_id,
229
- segment_id=segment_id,
230
- frame=frame,
231
- )
232
- )
322
+ _send_last_frame(segment_id=segment_id, is_final=False)
323
+ last_frame = frame
233
324
  elif data.get("done"):
234
325
  for frame in audio_bstream.flush():
235
- self._event_ch.send_nowait(
236
- tts.SynthesizedAudio(
237
- request_id=request_id,
238
- segment_id=segment_id,
239
- frame=frame,
240
- )
241
- )
326
+ _send_last_frame(segment_id=segment_id, is_final=False)
327
+ last_frame = frame
328
+
329
+ _send_last_frame(segment_id=segment_id, is_final=True)
242
330
 
243
331
  if segment_id == request_id:
244
332
  # we're not going to receive more frames, close the connection
@@ -268,6 +356,15 @@ def _to_cartesia_options(opts: _TTSOptions) -> dict[str, Any]:
268
356
  voice["mode"] = "embedding"
269
357
  voice["embedding"] = opts.voice
270
358
 
359
+ voice_controls: dict = {}
360
+ if opts.speed is not None:
361
+ voice_controls["speed"] = opts.speed
362
+ if opts.emotion is not None:
363
+ voice_controls["emotion"] = opts.emotion
364
+
365
+ if voice_controls:
366
+ voice["__experimental_controls"] = voice_controls
367
+
271
368
  return {
272
369
  "model_id": opts.model,
273
370
  "voice": voice,
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.4.1"
15
+ __version__ = "0.4.3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-cartesia
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: LiveKit Agents Plugin for Cartesia
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: livekit-agents >=0.8.0.dev0
22
+ Requires-Dist: livekit-agents >=0.11
23
23
 
24
24
  # LiveKit Plugins Cartesia
25
25
 
@@ -0,0 +1,10 @@
1
+ livekit/plugins/cartesia/__init__.py,sha256=UTa6Q7IxhRBCwPftowHEUDvmBg99J_UjGS_yxTzKD7g,1095
2
+ livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
3
+ livekit/plugins/cartesia/models.py,sha256=fOO276Vzw3OkDUWUVcw7PH95ctFy38rj3q9I6_mYQ7M,950
4
+ livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/cartesia/tts.py,sha256=2xwWOIjwLDOF4TbHlDibrZpUju9If8WrNpHQ2JMuBC0,13533
6
+ livekit/plugins/cartesia/version.py,sha256=u7PSD5TBbPRIhE8vJkBVJzq_eGqYfg6RP5c3VKNlKGk,600
7
+ livekit_plugins_cartesia-0.4.3.dist-info/METADATA,sha256=w5q0oz6rdHDL5cxAyT5hWbHqhZnOPnZYGl3aUKsr3z4,1246
8
+ livekit_plugins_cartesia-0.4.3.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
9
+ livekit_plugins_cartesia-0.4.3.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_cartesia-0.4.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (72.2.0)
2
+ Generator: setuptools (75.3.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,10 +0,0 @@
1
- livekit/plugins/cartesia/__init__.py,sha256=BUfWY_evL5dUHn9hBDQVor6ssctDKQfbQfZy5SWndN8,926
2
- livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
3
- livekit/plugins/cartesia/models.py,sha256=ZoSyV2ap_LqAIgvBvkmukkPxQR9DfKb3Z3oHtWxMiVg,335
4
- livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/cartesia/tts.py,sha256=sdiiWinOZR5EBkQFwa3GZAGrkgzXY1-aSRiDZ34K8ww,9527
6
- livekit/plugins/cartesia/version.py,sha256=GSGiYNpxiJSu-Mwsw7PqdHsxkwAqS-5ceh44QLp4ovU,600
7
- livekit_plugins_cartesia-0.4.1.dist-info/METADATA,sha256=B1B0c8a2ik7feImTl3nQHul9bqMhKebkmO880BCwF7Y,1252
8
- livekit_plugins_cartesia-0.4.1.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
9
- livekit_plugins_cartesia-0.4.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_cartesia-0.4.1.dist-info/RECORD,,