livekit-plugins-cartesia 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- livekit/plugins/cartesia/models.py +3 -36
- livekit/plugins/cartesia/tts.py +48 -89
- livekit/plugins/cartesia/version.py +1 -1
- {livekit_plugins_cartesia-0.1.0.dist-info → livekit_plugins_cartesia-0.2.0.dist-info}/METADATA +3 -5
- livekit_plugins_cartesia-0.2.0.dist-info/RECORD +10 -0
- {livekit_plugins_cartesia-0.1.0.dist-info → livekit_plugins_cartesia-0.2.0.dist-info}/WHEEL +1 -1
- livekit_plugins_cartesia-0.1.0.dist-info/RECORD +0 -10
- {livekit_plugins_cartesia-0.1.0.dist-info → livekit_plugins_cartesia-0.2.0.dist-info}/top_level.txt +0 -0
@@ -9,39 +9,6 @@ TTSEncoding = Literal[
|
|
9
9
|
]
|
10
10
|
|
11
11
|
|
12
|
-
TTSModels = Literal["
|
13
|
-
|
14
|
-
|
15
|
-
# fmt: off
|
16
|
-
# Barbershop Man in upbeat-moon
|
17
|
-
TTSDefaultVoiceEmbedding: list[float] = [
|
18
|
-
-0.033633083, 0.072083704, -0.01807767, -0.083488315, -0.04407617, 0.0022592682, 0.070505895,
|
19
|
-
0.023946615, -0.04788024, -0.06388413, -0.0716355, -0.0022612812, -0.0053448505, -0.07848381,
|
20
|
-
0.0348162, -0.053745482, -0.092399485, -0.02950225, 0.028591828, -0.10556894, 0.023313355,
|
21
|
-
0.06224387, 0.0362463, 0.029258432, 0.10769641, 0.043595582, -0.058543224, -0.080402784,
|
22
|
-
-0.0953816, -0.008988032, -0.0028981369, -0.004752721, -0.20742874, 0.058907595, 0.08813939,
|
23
|
-
-0.06192675, 0.099082634, -0.09661578, -0.0077761724, -0.013982456, -0.025798267, 0.04467142,
|
24
|
-
0.026222011, 0.023023574, 0.011227064, -0.17462021, -0.09880612, -0.1521035, -0.060464993,
|
25
|
-
-0.04735665, -0.09725187, -0.006127679, 0.15818526, -0.039493002, -0.067719474, 0.0066190436,
|
26
|
-
-0.10636633, 0.17073768, -0.051717706, 0.03186961, -0.020547207, -0.02244247, 0.013196935,
|
27
|
-
-0.06431055, -0.115360335, 0.016918058, -0.033195216, 0.11255181, 0.020366343, -0.041032124,
|
28
|
-
0.08780918, -0.040567942, 0.057276532, 0.05848221, -0.077479474, -0.073524915, -0.01913317,
|
29
|
-
-0.029291833, 0.11210393, -0.09859328, 0.2152541, -0.022976823, 0.028627992, -0.039598297,
|
30
|
-
0.041829932, -0.05593181, -0.06444655, -0.018057477, -0.008098263, 0.05994528, 0.10430693,
|
31
|
-
-0.13121894, -0.06512868, -0.026126215, 0.046727825, -0.17180993, -0.10577226, -0.08610466,
|
32
|
-
0.008862588, 0.09547498, -0.010965332, -0.061217085, -0.038954042, 0.019930292, -0.017192135,
|
33
|
-
0.007296275, 0.03273872, 0.04389937, -0.056483064, 0.003420891, -0.10319067, -0.015706042,
|
34
|
-
0.1308774, -0.0018035866, -0.03582506, 0.077131025, 0.013398928, 0.003188886, 0.12039741,
|
35
|
-
-0.033974767, 0.06899378, -0.059775922, -0.026934423, 0.028482193, 0.100996524, 0.004498743,
|
36
|
-
-0.02291186, 0.078752205, -0.0063796206, 0.04206536, 0.05721349, 0.06290694, 0.06130212,
|
37
|
-
0.096969016, -0.057664312, -0.16727506, -0.035220966, 0.090760484, 0.010039947, 0.06513242,
|
38
|
-
0.011055657, -0.004258431, -0.08316792, -0.15650468, -0.076931365, 0.11385587, -0.038372636,
|
39
|
-
0.015648656, -0.12029895, -0.06604956, 0.009441591, -0.11912808, 0.013378132, 0.029525978,
|
40
|
-
-0.0056742397, -0.0075976513, 0.019999338, -0.05521377, -0.07650746, -0.017710293, -0.033986397,
|
41
|
-
-0.047768556, 0.13857274, 0.099290825, 0.11736938, 0.017834296, -0.07140237, -0.052047748,
|
42
|
-
-0.06398965, -0.037033975, -0.061061256, -0.03330076, -0.024472248, -0.059656, 0.05359946,
|
43
|
-
-0.043915518, -0.086325996, 0.14189173, 0.021086395, 0.02945159, 0.1029604, 0.018490415,
|
44
|
-
-0.028736332, -0.025272416, -0.06082937, -0.031339463, -0.0007249595, 0.025595888, 0.007144545,
|
45
|
-
-0.16938712, -0.1160664, -0.0654145,
|
46
|
-
]
|
47
|
-
# fmt: on
|
12
|
+
TTSModels = Literal["sonic-english", "sonic-multilingual"]
|
13
|
+
TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
|
14
|
+
TTSDefaultVoiceId = "248be419-c632-4f23-adf1-5324ed7dbf1d"
|
livekit/plugins/cartesia/tts.py
CHANGED
@@ -14,18 +14,14 @@
|
|
14
14
|
|
15
15
|
from __future__ import annotations
|
16
16
|
|
17
|
-
import asyncio
|
18
|
-
import contextlib
|
19
17
|
import os
|
20
18
|
from dataclasses import dataclass
|
21
|
-
from typing import Optional
|
22
19
|
|
23
20
|
import aiohttp
|
24
|
-
from livekit import rtc
|
25
21
|
from livekit.agents import tts, utils
|
26
22
|
|
27
23
|
from .log import logger
|
28
|
-
from .models import
|
24
|
+
from .models import TTSDefaultVoiceId, TTSEncoding, TTSModels
|
29
25
|
|
30
26
|
API_AUTH_HEADER = "X-API-Key"
|
31
27
|
API_VERSION_HEADER = "Cartesia-Version"
|
@@ -39,21 +35,23 @@ class _TTSOptions:
|
|
39
35
|
sample_rate: int
|
40
36
|
voice: str | list[float]
|
41
37
|
api_key: str
|
38
|
+
language: str
|
42
39
|
|
43
40
|
|
44
41
|
class TTS(tts.TTS):
|
45
42
|
def __init__(
|
46
43
|
self,
|
47
44
|
*,
|
48
|
-
model: TTSModels = "
|
45
|
+
model: TTSModels = "sonic-english",
|
46
|
+
language: str = "en",
|
49
47
|
encoding: TTSEncoding = "pcm_s16le",
|
50
|
-
voice: str | list[float] =
|
48
|
+
voice: str | list[float] = TTSDefaultVoiceId,
|
51
49
|
sample_rate: int = 24000,
|
52
50
|
api_key: str | None = None,
|
53
51
|
http_session: aiohttp.ClientSession | None = None,
|
54
52
|
) -> None:
|
55
53
|
super().__init__(
|
56
|
-
|
54
|
+
capabilities=tts.TTSCapabilities(streaming=False),
|
57
55
|
sample_rate=sample_rate,
|
58
56
|
num_channels=1,
|
59
57
|
)
|
@@ -64,6 +62,7 @@ class TTS(tts.TTS):
|
|
64
62
|
|
65
63
|
self._opts = _TTSOptions(
|
66
64
|
model=model,
|
65
|
+
language=language,
|
67
66
|
encoding=encoding,
|
68
67
|
sample_rate=sample_rate,
|
69
68
|
voice=voice,
|
@@ -73,14 +72,11 @@ class TTS(tts.TTS):
|
|
73
72
|
|
74
73
|
def _ensure_session(self) -> aiohttp.ClientSession:
|
75
74
|
if not self._session:
|
76
|
-
self._session = utils.http_session()
|
75
|
+
self._session = utils.http_context.http_session()
|
77
76
|
|
78
77
|
return self._session
|
79
78
|
|
80
|
-
def synthesize(
|
81
|
-
self,
|
82
|
-
text: str,
|
83
|
-
) -> "ChunkedStream":
|
79
|
+
def synthesize(self, text: str) -> "ChunkedStream":
|
84
80
|
return ChunkedStream(text, self._opts, self._ensure_session())
|
85
81
|
|
86
82
|
|
@@ -88,14 +84,17 @@ class ChunkedStream(tts.ChunkedStream):
|
|
88
84
|
def __init__(
|
89
85
|
self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
|
90
86
|
) -> None:
|
91
|
-
|
92
|
-
self._text = text
|
93
|
-
self._session = session
|
94
|
-
self._main_task: asyncio.Task | None = None
|
95
|
-
self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
|
87
|
+
super().__init__()
|
88
|
+
self._text, self._opts, self._session = text, opts, session
|
96
89
|
|
97
90
|
@utils.log_exceptions(logger=logger)
|
98
|
-
async def
|
91
|
+
async def _main_task(self):
|
92
|
+
bstream = utils.audio.AudioByteStream(
|
93
|
+
sample_rate=self._opts.sample_rate, num_channels=1
|
94
|
+
)
|
95
|
+
request_id = utils.shortuuid()
|
96
|
+
segment_id = utils.shortuuid()
|
97
|
+
|
99
98
|
voice = {}
|
100
99
|
if isinstance(self._opts.voice, str):
|
101
100
|
voice["mode"] = "id"
|
@@ -104,77 +103,37 @@ class ChunkedStream(tts.ChunkedStream):
|
|
104
103
|
voice["mode"] = "embedding"
|
105
104
|
voice["embedding"] = self._opts.voice
|
106
105
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
while len(buf) >= bytes_per_frame:
|
132
|
-
frame_data = buf[:bytes_per_frame]
|
133
|
-
buf = buf[bytes_per_frame:]
|
134
|
-
|
135
|
-
self._queue.put_nowait(
|
136
|
-
tts.SynthesizedAudio(
|
137
|
-
text=self._text,
|
138
|
-
data=rtc.AudioFrame(
|
139
|
-
data=frame_data,
|
140
|
-
sample_rate=self._opts.sample_rate,
|
141
|
-
num_channels=1,
|
142
|
-
samples_per_channel=len(frame_data) // 2,
|
143
|
-
),
|
144
|
-
)
|
145
|
-
)
|
146
|
-
|
147
|
-
# send any remaining data
|
148
|
-
if len(buf) > 0:
|
149
|
-
self._queue.put_nowait(
|
106
|
+
data = {
|
107
|
+
"model_id": self._opts.model,
|
108
|
+
"transcript": self._text,
|
109
|
+
"voice": voice,
|
110
|
+
"output_format": {
|
111
|
+
"container": "raw",
|
112
|
+
"encoding": self._opts.encoding,
|
113
|
+
"sample_rate": self._opts.sample_rate,
|
114
|
+
},
|
115
|
+
"language": self._opts.language,
|
116
|
+
}
|
117
|
+
|
118
|
+
async with self._session.post(
|
119
|
+
"https://api.cartesia.ai/tts/bytes",
|
120
|
+
headers={
|
121
|
+
API_AUTH_HEADER: f"{self._opts.api_key}",
|
122
|
+
API_VERSION_HEADER: API_VERSION,
|
123
|
+
},
|
124
|
+
json=data,
|
125
|
+
) as resp:
|
126
|
+
async for data, _ in resp.content.iter_chunks():
|
127
|
+
for frame in bstream.write(data):
|
128
|
+
self._event_ch.send_nowait(
|
150
129
|
tts.SynthesizedAudio(
|
151
|
-
|
152
|
-
data=rtc.AudioFrame(
|
153
|
-
data=buf,
|
154
|
-
sample_rate=self._opts.sample_rate,
|
155
|
-
num_channels=1,
|
156
|
-
samples_per_channel=len(buf) // 2,
|
157
|
-
),
|
130
|
+
request_id=request_id, segment_id=segment_id, frame=frame
|
158
131
|
)
|
159
132
|
)
|
160
133
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
frame = await self._queue.get()
|
169
|
-
if frame is None:
|
170
|
-
raise StopAsyncIteration
|
171
|
-
|
172
|
-
return frame
|
173
|
-
|
174
|
-
async def aclose(self) -> None:
|
175
|
-
if not self._main_task:
|
176
|
-
return
|
177
|
-
|
178
|
-
self._main_task.cancel()
|
179
|
-
with contextlib.suppress(asyncio.CancelledError):
|
180
|
-
await self._main_task
|
134
|
+
for frame in bstream.flush():
|
135
|
+
self._event_ch.send_nowait(
|
136
|
+
tts.SynthesizedAudio(
|
137
|
+
request_id=request_id, segment_id=segment_id, frame=frame
|
138
|
+
)
|
139
|
+
)
|
{livekit_plugins_cartesia-0.1.0.dist-info → livekit_plugins_cartesia-0.2.0.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: livekit-plugins-cartesia
|
3
|
-
Version: 0.1.0
|
3
|
+
Version: 0.2.0
|
4
4
|
Summary: LiveKit Agents Plugin for Cartesia
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -14,14 +14,12 @@ Classifier: Topic :: Multimedia :: Sound/Audio
|
|
14
14
|
Classifier: Topic :: Multimedia :: Video
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
16
16
|
Classifier: Programming Language :: Python :: 3
|
17
|
-
Classifier: Programming Language :: Python :: 3.8
|
18
17
|
Classifier: Programming Language :: Python :: 3.9
|
19
18
|
Classifier: Programming Language :: Python :: 3.10
|
20
19
|
Classifier: Programming Language :: Python :: 3 :: Only
|
21
|
-
Requires-Python: >=3.8.0
|
20
|
+
Requires-Python: >=3.9.0
|
22
21
|
Description-Content-Type: text/markdown
|
23
|
-
Requires-Dist: livekit
|
24
|
-
Requires-Dist: livekit-agents ~=0.7
|
22
|
+
Requires-Dist: livekit-agents >=0.8.0.dev0
|
25
23
|
|
26
24
|
# LiveKit Plugins Cartesia
|
27
25
|
|
@@ -0,0 +1,10 @@
|
|
1
|
+
livekit/plugins/cartesia/__init__.py,sha256=_a8u7qqya1pjZTV19gNOpMKTO7ccAVZAeCukiDKAG-U,937
|
2
|
+
livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
|
3
|
+
livekit/plugins/cartesia/models.py,sha256=06S-Z-M90kB-kEOQsQk70xfQUD-TztU4ZIU_AfAyUMc,335
|
4
|
+
livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
livekit/plugins/cartesia/tts.py,sha256=S5BMSVtsbNI_c2PpgyFK6wvleudmJZLTUt3ZmGNKlRI,4319
|
6
|
+
livekit/plugins/cartesia/version.py,sha256=cLFCdnm5S21CiJ5UJBcqfRvvFkCQ8p6M5fFUJVJkEiM,600
|
7
|
+
livekit_plugins_cartesia-0.2.0.dist-info/METADATA,sha256=z0oS7uWDrLjMFWqYxGNJD9SENd5W5sdNwogpe-tTeA4,1252
|
8
|
+
livekit_plugins_cartesia-0.2.0.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
|
9
|
+
livekit_plugins_cartesia-0.2.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
+
livekit_plugins_cartesia-0.2.0.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
livekit/plugins/cartesia/__init__.py,sha256=_a8u7qqya1pjZTV19gNOpMKTO7ccAVZAeCukiDKAG-U,937
|
2
|
-
livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
|
3
|
-
livekit/plugins/cartesia/models.py,sha256=Qhl51ZScuB61bEzN1tBlHMuHO_kCXSzuVOicYa16EL8,2922
|
4
|
-
livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/cartesia/tts.py,sha256=16BneZFQQsS-lB9Ug1HYj4QW7-VnNdpTJ0CW5A1b9EU,5725
|
6
|
-
livekit/plugins/cartesia/version.py,sha256=vQH9cItKAVYAmrLbOntkbLqmxrUZrPiKb1TjkZ8jRKQ,600
|
7
|
-
livekit_plugins_cartesia-0.1.0.dist-info/METADATA,sha256=lpj60lrnFHBzooiT3f3Dtt74J0vq9ZvHfxqCb7YCBmg,1325
|
8
|
-
livekit_plugins_cartesia-0.1.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
9
|
-
livekit_plugins_cartesia-0.1.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
-
livekit_plugins_cartesia-0.1.0.dist-info/RECORD,,
|
{livekit_plugins_cartesia-0.1.0.dist-info → livekit_plugins_cartesia-0.2.0.dist-info}/top_level.txt
RENAMED
File without changes
|