livekit-plugins-cartesia 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/cartesia/__init__.py +9 -0
- livekit/plugins/cartesia/models.py +28 -1
- livekit/plugins/cartesia/tts.py +148 -51
- livekit/plugins/cartesia/version.py +1 -1
- {livekit_plugins_cartesia-0.4.1.dist-info → livekit_plugins_cartesia-0.4.3.dist-info}/METADATA +2 -2
- livekit_plugins_cartesia-0.4.3.dist-info/RECORD +10 -0
- {livekit_plugins_cartesia-0.4.1.dist-info → livekit_plugins_cartesia-0.4.3.dist-info}/WHEEL +1 -1
- livekit_plugins_cartesia-0.4.1.dist-info/RECORD +0 -10
- {livekit_plugins_cartesia-0.4.1.dist-info → livekit_plugins_cartesia-0.4.3.dist-info}/top_level.txt +0 -0
@@ -28,3 +28,12 @@ class CartesiaPlugin(Plugin):
|
|
28
28
|
|
29
29
|
|
30
30
|
Plugin.register_plugin(CartesiaPlugin())
|
31
|
+
|
32
|
+
# Cleanup docs of unexported modules
|
33
|
+
_module = dir()
|
34
|
+
NOT_IN_ALL = [m for m in _module if m not in __all__]
|
35
|
+
|
36
|
+
__pdoc__ = {}
|
37
|
+
|
38
|
+
for n in NOT_IN_ALL:
|
39
|
+
__pdoc__[n] = False
|
@@ -8,7 +8,34 @@ TTSEncoding = Literal[
|
|
8
8
|
# "pcm_alaw",
|
9
9
|
]
|
10
10
|
|
11
|
-
|
12
11
|
TTSModels = Literal["sonic-english", "sonic-multilingual"]
|
13
12
|
TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
|
14
13
|
TTSDefaultVoiceId = "c2ac25f9-ecc4-4f56-9095-651354df60c0"
|
14
|
+
TTSVoiceSpeed = Literal["fastest", "fast", "normal", "slow", "slowest"]
|
15
|
+
TTSVoiceEmotion = Literal[
|
16
|
+
"anger:lowest",
|
17
|
+
"anger:low",
|
18
|
+
"anger",
|
19
|
+
"anger:high",
|
20
|
+
"anger:highest",
|
21
|
+
"positivity:lowest",
|
22
|
+
"positivity:low",
|
23
|
+
"positivity",
|
24
|
+
"positivity:high",
|
25
|
+
"positivity:highest",
|
26
|
+
"surprise:lowest",
|
27
|
+
"surprise:low",
|
28
|
+
"surprise",
|
29
|
+
"surprise:high",
|
30
|
+
"surprise:highest",
|
31
|
+
"sadness:lowest",
|
32
|
+
"sadness:low",
|
33
|
+
"sadness",
|
34
|
+
"sadness:high",
|
35
|
+
"sadness:highest",
|
36
|
+
"curiosity:lowest",
|
37
|
+
"curiosity:low",
|
38
|
+
"curiosity",
|
39
|
+
"curiosity:high",
|
40
|
+
"curiosity:highest",
|
41
|
+
]
|
livekit/plugins/cartesia/tts.py
CHANGED
@@ -22,10 +22,24 @@ from dataclasses import dataclass
|
|
22
22
|
from typing import Any
|
23
23
|
|
24
24
|
import aiohttp
|
25
|
-
from livekit
|
25
|
+
from livekit import rtc
|
26
|
+
from livekit.agents import (
|
27
|
+
APIConnectionError,
|
28
|
+
APIStatusError,
|
29
|
+
APITimeoutError,
|
30
|
+
tokenize,
|
31
|
+
tts,
|
32
|
+
utils,
|
33
|
+
)
|
26
34
|
|
27
35
|
from .log import logger
|
28
|
-
from .models import
|
36
|
+
from .models import (
|
37
|
+
TTSDefaultVoiceId,
|
38
|
+
TTSEncoding,
|
39
|
+
TTSModels,
|
40
|
+
TTSVoiceEmotion,
|
41
|
+
TTSVoiceSpeed,
|
42
|
+
)
|
29
43
|
|
30
44
|
API_AUTH_HEADER = "X-API-Key"
|
31
45
|
API_VERSION_HEADER = "Cartesia-Version"
|
@@ -37,10 +51,12 @@ BUFFERED_WORDS_COUNT = 8
|
|
37
51
|
|
38
52
|
@dataclass
|
39
53
|
class _TTSOptions:
|
40
|
-
model: TTSModels
|
54
|
+
model: TTSModels | str
|
41
55
|
encoding: TTSEncoding
|
42
56
|
sample_rate: int
|
43
57
|
voice: str | list[float]
|
58
|
+
speed: TTSVoiceSpeed | float | None
|
59
|
+
emotion: list[TTSVoiceEmotion | str] | None
|
44
60
|
api_key: str
|
45
61
|
language: str
|
46
62
|
|
@@ -49,14 +65,33 @@ class TTS(tts.TTS):
|
|
49
65
|
def __init__(
|
50
66
|
self,
|
51
67
|
*,
|
52
|
-
model: TTSModels = "sonic-english",
|
68
|
+
model: TTSModels | str = "sonic-english",
|
53
69
|
language: str = "en",
|
54
70
|
encoding: TTSEncoding = "pcm_s16le",
|
55
71
|
voice: str | list[float] = TTSDefaultVoiceId,
|
72
|
+
speed: TTSVoiceSpeed | float | None = None,
|
73
|
+
emotion: list[TTSVoiceEmotion | str] | None = None,
|
56
74
|
sample_rate: int = 24000,
|
57
75
|
api_key: str | None = None,
|
58
76
|
http_session: aiohttp.ClientSession | None = None,
|
59
77
|
) -> None:
|
78
|
+
"""
|
79
|
+
Create a new instance of Cartesia TTS.
|
80
|
+
|
81
|
+
See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the Cartesia API.
|
82
|
+
|
83
|
+
Args:
|
84
|
+
model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-english".
|
85
|
+
language (str, optional): The language code for synthesis. Defaults to "en".
|
86
|
+
encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
|
87
|
+
voice (str | list[float], optional): The voice ID or embedding array.
|
88
|
+
speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
|
89
|
+
emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
|
90
|
+
sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
|
91
|
+
api_key (str, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
|
92
|
+
http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
|
93
|
+
"""
|
94
|
+
|
60
95
|
super().__init__(
|
61
96
|
capabilities=tts.TTSCapabilities(streaming=True),
|
62
97
|
sample_rate=sample_rate,
|
@@ -73,6 +108,8 @@ class TTS(tts.TTS):
|
|
73
108
|
encoding=encoding,
|
74
109
|
sample_rate=sample_rate,
|
75
110
|
voice=voice,
|
111
|
+
speed=speed,
|
112
|
+
emotion=emotion,
|
76
113
|
api_key=api_key,
|
77
114
|
)
|
78
115
|
self._session = http_session
|
@@ -83,63 +120,106 @@ class TTS(tts.TTS):
|
|
83
120
|
|
84
121
|
return self._session
|
85
122
|
|
123
|
+
def update_options(
|
124
|
+
self,
|
125
|
+
*,
|
126
|
+
model: TTSModels | None = None,
|
127
|
+
language: str | None = None,
|
128
|
+
voice: str | list[float] | None = None,
|
129
|
+
speed: TTSVoiceSpeed | float | None = None,
|
130
|
+
emotion: list[TTSVoiceEmotion | str] | None = None,
|
131
|
+
) -> None:
|
132
|
+
"""
|
133
|
+
Update the Text-to-Speech (TTS) configuration options.
|
134
|
+
|
135
|
+
This method allows updating the TTS settings, including model type, language, voice, speed,
|
136
|
+
and emotion. If any parameter is not provided, the existing value will be retained.
|
137
|
+
|
138
|
+
Args:
|
139
|
+
model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-english".
|
140
|
+
language (str, optional): The language code for synthesis. Defaults to "en".
|
141
|
+
voice (str | list[float], optional): The voice ID or embedding array.
|
142
|
+
speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
|
143
|
+
emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
|
144
|
+
"""
|
145
|
+
self._opts.model = model or self._opts.model
|
146
|
+
self._opts.language = language or self._opts.language
|
147
|
+
self._opts.voice = voice or self._opts.voice
|
148
|
+
self._opts.speed = speed or self._opts.speed
|
149
|
+
if emotion is not None:
|
150
|
+
self._opts.emotion = emotion
|
151
|
+
|
86
152
|
def synthesize(self, text: str) -> "ChunkedStream":
|
87
|
-
return ChunkedStream(text, self._opts, self._ensure_session())
|
153
|
+
return ChunkedStream(self, text, self._opts, self._ensure_session())
|
88
154
|
|
89
155
|
def stream(self) -> "SynthesizeStream":
|
90
|
-
return SynthesizeStream(self._opts, self._ensure_session())
|
156
|
+
return SynthesizeStream(self, self._opts, self._ensure_session())
|
91
157
|
|
92
158
|
|
93
159
|
class ChunkedStream(tts.ChunkedStream):
|
94
160
|
"""Synthesize chunked text using the bytes endpoint"""
|
95
161
|
|
96
162
|
def __init__(
|
97
|
-
self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
|
163
|
+
self, tts: TTS, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
|
98
164
|
) -> None:
|
99
|
-
super().__init__()
|
100
|
-
self.
|
165
|
+
super().__init__(tts, text)
|
166
|
+
self._opts, self._session = opts, session
|
101
167
|
|
102
|
-
|
103
|
-
|
168
|
+
async def _main_task(self) -> None:
|
169
|
+
request_id = utils.shortuuid()
|
104
170
|
bstream = utils.audio.AudioByteStream(
|
105
171
|
sample_rate=self._opts.sample_rate, num_channels=NUM_CHANNELS
|
106
172
|
)
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
173
|
+
|
174
|
+
json = _to_cartesia_options(self._opts)
|
175
|
+
json["transcript"] = self._input_text
|
176
|
+
|
177
|
+
headers = {
|
178
|
+
API_AUTH_HEADER: self._opts.api_key,
|
179
|
+
API_VERSION_HEADER: API_VERSION,
|
180
|
+
}
|
181
|
+
|
182
|
+
try:
|
183
|
+
async with self._session.post(
|
184
|
+
"https://api.cartesia.ai/tts/bytes",
|
185
|
+
headers=headers,
|
186
|
+
json=json,
|
187
|
+
) as resp:
|
188
|
+
resp.raise_for_status()
|
189
|
+
async for data, _ in resp.content.iter_chunks():
|
190
|
+
for frame in bstream.write(data):
|
191
|
+
self._event_ch.send_nowait(
|
192
|
+
tts.SynthesizedAudio(
|
193
|
+
request_id=request_id,
|
194
|
+
frame=frame,
|
195
|
+
)
|
125
196
|
)
|
126
|
-
)
|
127
197
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
request_id=request_id, segment_id=segment_id, frame=frame
|
198
|
+
for frame in bstream.flush():
|
199
|
+
self._event_ch.send_nowait(
|
200
|
+
tts.SynthesizedAudio(request_id=request_id, frame=frame)
|
132
201
|
)
|
133
|
-
|
202
|
+
except asyncio.TimeoutError as e:
|
203
|
+
raise APITimeoutError() from e
|
204
|
+
except aiohttp.ClientResponseError as e:
|
205
|
+
raise APIStatusError(
|
206
|
+
message=e.message,
|
207
|
+
status_code=e.status,
|
208
|
+
request_id=None,
|
209
|
+
body=None,
|
210
|
+
) from e
|
211
|
+
except Exception as e:
|
212
|
+
raise APIConnectionError() from e
|
134
213
|
|
135
214
|
|
136
215
|
class SynthesizeStream(tts.SynthesizeStream):
|
137
216
|
def __init__(
|
138
217
|
self,
|
218
|
+
tts: TTS,
|
139
219
|
opts: _TTSOptions,
|
140
220
|
session: aiohttp.ClientSession,
|
141
221
|
):
|
142
|
-
super().__init__()
|
222
|
+
super().__init__(tts)
|
143
223
|
self._opts, self._session = opts, session
|
144
224
|
self._sent_tokenizer_stream = tokenize.basic.SentenceTokenizer(
|
145
225
|
min_sentence_len=BUFFERED_WORDS_COUNT
|
@@ -204,6 +284,22 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
204
284
|
num_channels=NUM_CHANNELS,
|
205
285
|
)
|
206
286
|
|
287
|
+
last_frame: rtc.AudioFrame | None = None
|
288
|
+
|
289
|
+
def _send_last_frame(*, segment_id: str, is_final: bool) -> None:
|
290
|
+
nonlocal last_frame
|
291
|
+
if last_frame is not None:
|
292
|
+
self._event_ch.send_nowait(
|
293
|
+
tts.SynthesizedAudio(
|
294
|
+
request_id=request_id,
|
295
|
+
segment_id=segment_id,
|
296
|
+
frame=last_frame,
|
297
|
+
is_final=is_final,
|
298
|
+
)
|
299
|
+
)
|
300
|
+
|
301
|
+
last_frame = None
|
302
|
+
|
207
303
|
while True:
|
208
304
|
msg = await ws.receive()
|
209
305
|
if msg.type in (
|
@@ -219,26 +315,18 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
219
315
|
|
220
316
|
data = json.loads(msg.data)
|
221
317
|
segment_id = data.get("context_id")
|
222
|
-
|
318
|
+
|
223
319
|
if data.get("data"):
|
224
320
|
b64data = base64.b64decode(data["data"])
|
225
321
|
for frame in audio_bstream.write(b64data):
|
226
|
-
|
227
|
-
|
228
|
-
request_id=request_id,
|
229
|
-
segment_id=segment_id,
|
230
|
-
frame=frame,
|
231
|
-
)
|
232
|
-
)
|
322
|
+
_send_last_frame(segment_id=segment_id, is_final=False)
|
323
|
+
last_frame = frame
|
233
324
|
elif data.get("done"):
|
234
325
|
for frame in audio_bstream.flush():
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
frame=frame,
|
240
|
-
)
|
241
|
-
)
|
326
|
+
_send_last_frame(segment_id=segment_id, is_final=False)
|
327
|
+
last_frame = frame
|
328
|
+
|
329
|
+
_send_last_frame(segment_id=segment_id, is_final=True)
|
242
330
|
|
243
331
|
if segment_id == request_id:
|
244
332
|
# we're not going to receive more frames, close the connection
|
@@ -268,6 +356,15 @@ def _to_cartesia_options(opts: _TTSOptions) -> dict[str, Any]:
|
|
268
356
|
voice["mode"] = "embedding"
|
269
357
|
voice["embedding"] = opts.voice
|
270
358
|
|
359
|
+
voice_controls: dict = {}
|
360
|
+
if opts.speed is not None:
|
361
|
+
voice_controls["speed"] = opts.speed
|
362
|
+
if opts.emotion is not None:
|
363
|
+
voice_controls["emotion"] = opts.emotion
|
364
|
+
|
365
|
+
if voice_controls:
|
366
|
+
voice["__experimental_controls"] = voice_controls
|
367
|
+
|
271
368
|
return {
|
272
369
|
"model_id": opts.model,
|
273
370
|
"voice": voice,
|
{livekit_plugins_cartesia-0.4.1.dist-info → livekit_plugins_cartesia-0.4.3.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: livekit-plugins-cartesia
|
3
|
-
Version: 0.4.1
|
3
|
+
Version: 0.4.3
|
4
4
|
Summary: LiveKit Agents Plugin for Cartesia
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
19
19
|
Classifier: Programming Language :: Python :: 3 :: Only
|
20
20
|
Requires-Python: >=3.9.0
|
21
21
|
Description-Content-Type: text/markdown
|
22
|
-
Requires-Dist: livekit-agents >=0.
|
22
|
+
Requires-Dist: livekit-agents >=0.11
|
23
23
|
|
24
24
|
# LiveKit Plugins Cartesia
|
25
25
|
|
@@ -0,0 +1,10 @@
|
|
1
|
+
livekit/plugins/cartesia/__init__.py,sha256=UTa6Q7IxhRBCwPftowHEUDvmBg99J_UjGS_yxTzKD7g,1095
|
2
|
+
livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
|
3
|
+
livekit/plugins/cartesia/models.py,sha256=fOO276Vzw3OkDUWUVcw7PH95ctFy38rj3q9I6_mYQ7M,950
|
4
|
+
livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
livekit/plugins/cartesia/tts.py,sha256=2xwWOIjwLDOF4TbHlDibrZpUju9If8WrNpHQ2JMuBC0,13533
|
6
|
+
livekit/plugins/cartesia/version.py,sha256=u7PSD5TBbPRIhE8vJkBVJzq_eGqYfg6RP5c3VKNlKGk,600
|
7
|
+
livekit_plugins_cartesia-0.4.3.dist-info/METADATA,sha256=w5q0oz6rdHDL5cxAyT5hWbHqhZnOPnZYGl3aUKsr3z4,1246
|
8
|
+
livekit_plugins_cartesia-0.4.3.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
9
|
+
livekit_plugins_cartesia-0.4.3.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
+
livekit_plugins_cartesia-0.4.3.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
livekit/plugins/cartesia/__init__.py,sha256=BUfWY_evL5dUHn9hBDQVor6ssctDKQfbQfZy5SWndN8,926
|
2
|
-
livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
|
3
|
-
livekit/plugins/cartesia/models.py,sha256=ZoSyV2ap_LqAIgvBvkmukkPxQR9DfKb3Z3oHtWxMiVg,335
|
4
|
-
livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/cartesia/tts.py,sha256=sdiiWinOZR5EBkQFwa3GZAGrkgzXY1-aSRiDZ34K8ww,9527
|
6
|
-
livekit/plugins/cartesia/version.py,sha256=GSGiYNpxiJSu-Mwsw7PqdHsxkwAqS-5ceh44QLp4ovU,600
|
7
|
-
livekit_plugins_cartesia-0.4.1.dist-info/METADATA,sha256=B1B0c8a2ik7feImTl3nQHul9bqMhKebkmO880BCwF7Y,1252
|
8
|
-
livekit_plugins_cartesia-0.4.1.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
|
9
|
-
livekit_plugins_cartesia-0.4.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
-
livekit_plugins_cartesia-0.4.1.dist-info/RECORD,,
|
{livekit_plugins_cartesia-0.4.1.dist-info → livekit_plugins_cartesia-0.4.3.dist-info}/top_level.txt
RENAMED
File without changes
|