livekit-plugins-cartesia 0.1.1__py3-none-any.whl → 0.2.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/cartesia/models.py +3 -36
- livekit/plugins/cartesia/tts.py +48 -89
- livekit/plugins/cartesia/version.py +1 -1
- {livekit_plugins_cartesia-0.1.1.dist-info → livekit_plugins_cartesia-0.2.0.dev1.dist-info}/METADATA +1 -1
- livekit_plugins_cartesia-0.2.0.dev1.dist-info/RECORD +10 -0
- {livekit_plugins_cartesia-0.1.1.dist-info → livekit_plugins_cartesia-0.2.0.dev1.dist-info}/WHEEL +1 -1
- livekit_plugins_cartesia-0.1.1.dist-info/RECORD +0 -10
- {livekit_plugins_cartesia-0.1.1.dist-info → livekit_plugins_cartesia-0.2.0.dev1.dist-info}/top_level.txt +0 -0
@@ -9,39 +9,6 @@ TTSEncoding = Literal[
|
|
9
9
|
]
|
10
10
|
|
11
11
|
|
12
|
-
TTSModels = Literal["
|
13
|
-
|
14
|
-
|
15
|
-
# fmt: off
|
16
|
-
# Barbershop Man in upbeat-moon
|
17
|
-
TTSDefaultVoiceEmbedding: list[float] = [
|
18
|
-
-0.033633083, 0.072083704, -0.01807767, -0.083488315, -0.04407617, 0.0022592682, 0.070505895,
|
19
|
-
0.023946615, -0.04788024, -0.06388413, -0.0716355, -0.0022612812, -0.0053448505, -0.07848381,
|
20
|
-
0.0348162, -0.053745482, -0.092399485, -0.02950225, 0.028591828, -0.10556894, 0.023313355,
|
21
|
-
0.06224387, 0.0362463, 0.029258432, 0.10769641, 0.043595582, -0.058543224, -0.080402784,
|
22
|
-
-0.0953816, -0.008988032, -0.0028981369, -0.004752721, -0.20742874, 0.058907595, 0.08813939,
|
23
|
-
-0.06192675, 0.099082634, -0.09661578, -0.0077761724, -0.013982456, -0.025798267, 0.04467142,
|
24
|
-
0.026222011, 0.023023574, 0.011227064, -0.17462021, -0.09880612, -0.1521035, -0.060464993,
|
25
|
-
-0.04735665, -0.09725187, -0.006127679, 0.15818526, -0.039493002, -0.067719474, 0.0066190436,
|
26
|
-
-0.10636633, 0.17073768, -0.051717706, 0.03186961, -0.020547207, -0.02244247, 0.013196935,
|
27
|
-
-0.06431055, -0.115360335, 0.016918058, -0.033195216, 0.11255181, 0.020366343, -0.041032124,
|
28
|
-
0.08780918, -0.040567942, 0.057276532, 0.05848221, -0.077479474, -0.073524915, -0.01913317,
|
29
|
-
-0.029291833, 0.11210393, -0.09859328, 0.2152541, -0.022976823, 0.028627992, -0.039598297,
|
30
|
-
0.041829932, -0.05593181, -0.06444655, -0.018057477, -0.008098263, 0.05994528, 0.10430693,
|
31
|
-
-0.13121894, -0.06512868, -0.026126215, 0.046727825, -0.17180993, -0.10577226, -0.08610466,
|
32
|
-
0.008862588, 0.09547498, -0.010965332, -0.061217085, -0.038954042, 0.019930292, -0.017192135,
|
33
|
-
0.007296275, 0.03273872, 0.04389937, -0.056483064, 0.003420891, -0.10319067, -0.015706042,
|
34
|
-
0.1308774, -0.0018035866, -0.03582506, 0.077131025, 0.013398928, 0.003188886, 0.12039741,
|
35
|
-
-0.033974767, 0.06899378, -0.059775922, -0.026934423, 0.028482193, 0.100996524, 0.004498743,
|
36
|
-
-0.02291186, 0.078752205, -0.0063796206, 0.04206536, 0.05721349, 0.06290694, 0.06130212,
|
37
|
-
0.096969016, -0.057664312, -0.16727506, -0.035220966, 0.090760484, 0.010039947, 0.06513242,
|
38
|
-
0.011055657, -0.004258431, -0.08316792, -0.15650468, -0.076931365, 0.11385587, -0.038372636,
|
39
|
-
0.015648656, -0.12029895, -0.06604956, 0.009441591, -0.11912808, 0.013378132, 0.029525978,
|
40
|
-
-0.0056742397, -0.0075976513, 0.019999338, -0.05521377, -0.07650746, -0.017710293, -0.033986397,
|
41
|
-
-0.047768556, 0.13857274, 0.099290825, 0.11736938, 0.017834296, -0.07140237, -0.052047748,
|
42
|
-
-0.06398965, -0.037033975, -0.061061256, -0.03330076, -0.024472248, -0.059656, 0.05359946,
|
43
|
-
-0.043915518, -0.086325996, 0.14189173, 0.021086395, 0.02945159, 0.1029604, 0.018490415,
|
44
|
-
-0.028736332, -0.025272416, -0.06082937, -0.031339463, -0.0007249595, 0.025595888, 0.007144545,
|
45
|
-
-0.16938712, -0.1160664, -0.0654145,
|
46
|
-
]
|
47
|
-
# fmt: on
|
12
|
+
TTSModels = Literal["sonic-english", "sonic-multilingual"]
|
13
|
+
TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
|
14
|
+
TTSDefaultVoiceId = "248be419-c632-4f23-adf1-5324ed7dbf1d"
|
livekit/plugins/cartesia/tts.py
CHANGED
@@ -14,18 +14,14 @@
|
|
14
14
|
|
15
15
|
from __future__ import annotations
|
16
16
|
|
17
|
-
import asyncio
|
18
|
-
import contextlib
|
19
17
|
import os
|
20
18
|
from dataclasses import dataclass
|
21
|
-
from typing import Optional
|
22
19
|
|
23
20
|
import aiohttp
|
24
|
-
from livekit import rtc
|
25
21
|
from livekit.agents import tts, utils
|
26
22
|
|
27
23
|
from .log import logger
|
28
|
-
from .models import
|
24
|
+
from .models import TTSDefaultVoiceId, TTSEncoding, TTSModels
|
29
25
|
|
30
26
|
API_AUTH_HEADER = "X-API-Key"
|
31
27
|
API_VERSION_HEADER = "Cartesia-Version"
|
@@ -39,21 +35,23 @@ class _TTSOptions:
|
|
39
35
|
sample_rate: int
|
40
36
|
voice: str | list[float]
|
41
37
|
api_key: str
|
38
|
+
language: str
|
42
39
|
|
43
40
|
|
44
41
|
class TTS(tts.TTS):
|
45
42
|
def __init__(
|
46
43
|
self,
|
47
44
|
*,
|
48
|
-
model: TTSModels = "
|
45
|
+
model: TTSModels = "sonic-english",
|
46
|
+
language: str = "en",
|
49
47
|
encoding: TTSEncoding = "pcm_s16le",
|
50
|
-
voice: str | list[float] =
|
48
|
+
voice: str | list[float] = TTSDefaultVoiceId,
|
51
49
|
sample_rate: int = 24000,
|
52
50
|
api_key: str | None = None,
|
53
51
|
http_session: aiohttp.ClientSession | None = None,
|
54
52
|
) -> None:
|
55
53
|
super().__init__(
|
56
|
-
|
54
|
+
capabilities=tts.TTSCapabilities(streaming=False),
|
57
55
|
sample_rate=sample_rate,
|
58
56
|
num_channels=1,
|
59
57
|
)
|
@@ -64,6 +62,7 @@ class TTS(tts.TTS):
|
|
64
62
|
|
65
63
|
self._opts = _TTSOptions(
|
66
64
|
model=model,
|
65
|
+
language=language,
|
67
66
|
encoding=encoding,
|
68
67
|
sample_rate=sample_rate,
|
69
68
|
voice=voice,
|
@@ -73,14 +72,11 @@ class TTS(tts.TTS):
|
|
73
72
|
|
74
73
|
def _ensure_session(self) -> aiohttp.ClientSession:
|
75
74
|
if not self._session:
|
76
|
-
self._session = utils.http_session()
|
75
|
+
self._session = utils.http_context.http_session()
|
77
76
|
|
78
77
|
return self._session
|
79
78
|
|
80
|
-
def synthesize(
|
81
|
-
self,
|
82
|
-
text: str,
|
83
|
-
) -> "ChunkedStream":
|
79
|
+
def synthesize(self, text: str) -> "ChunkedStream":
|
84
80
|
return ChunkedStream(text, self._opts, self._ensure_session())
|
85
81
|
|
86
82
|
|
@@ -88,14 +84,17 @@ class ChunkedStream(tts.ChunkedStream):
|
|
88
84
|
def __init__(
|
89
85
|
self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
|
90
86
|
) -> None:
|
91
|
-
|
92
|
-
self._text = text
|
93
|
-
self._session = session
|
94
|
-
self._main_task: asyncio.Task | None = None
|
95
|
-
self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
|
87
|
+
super().__init__()
|
88
|
+
self._text, self._opts, self._session = text, opts, session
|
96
89
|
|
97
90
|
@utils.log_exceptions(logger=logger)
|
98
|
-
async def
|
91
|
+
async def _main_task(self):
|
92
|
+
bstream = utils.audio.AudioByteStream(
|
93
|
+
sample_rate=self._opts.sample_rate, num_channels=1
|
94
|
+
)
|
95
|
+
request_id = utils.shortuuid()
|
96
|
+
segment_id = utils.shortuuid()
|
97
|
+
|
99
98
|
voice = {}
|
100
99
|
if isinstance(self._opts.voice, str):
|
101
100
|
voice["mode"] = "id"
|
@@ -104,77 +103,37 @@ class ChunkedStream(tts.ChunkedStream):
|
|
104
103
|
voice["mode"] = "embedding"
|
105
104
|
voice["embedding"] = self._opts.voice
|
106
105
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
while len(buf) >= bytes_per_frame:
|
132
|
-
frame_data = buf[:bytes_per_frame]
|
133
|
-
buf = buf[bytes_per_frame:]
|
134
|
-
|
135
|
-
self._queue.put_nowait(
|
136
|
-
tts.SynthesizedAudio(
|
137
|
-
text=self._text,
|
138
|
-
data=rtc.AudioFrame(
|
139
|
-
data=frame_data,
|
140
|
-
sample_rate=self._opts.sample_rate,
|
141
|
-
num_channels=1,
|
142
|
-
samples_per_channel=len(frame_data) // 2,
|
143
|
-
),
|
144
|
-
)
|
145
|
-
)
|
146
|
-
|
147
|
-
# send any remaining data
|
148
|
-
if len(buf) > 0:
|
149
|
-
self._queue.put_nowait(
|
106
|
+
data = {
|
107
|
+
"model_id": self._opts.model,
|
108
|
+
"transcript": self._text,
|
109
|
+
"voice": voice,
|
110
|
+
"output_format": {
|
111
|
+
"container": "raw",
|
112
|
+
"encoding": self._opts.encoding,
|
113
|
+
"sample_rate": self._opts.sample_rate,
|
114
|
+
},
|
115
|
+
"language": self._opts.language,
|
116
|
+
}
|
117
|
+
|
118
|
+
async with self._session.post(
|
119
|
+
"https://api.cartesia.ai/tts/bytes",
|
120
|
+
headers={
|
121
|
+
API_AUTH_HEADER: f"{self._opts.api_key}",
|
122
|
+
API_VERSION_HEADER: API_VERSION,
|
123
|
+
},
|
124
|
+
json=data,
|
125
|
+
) as resp:
|
126
|
+
async for data, _ in resp.content.iter_chunks():
|
127
|
+
for frame in bstream.write(data):
|
128
|
+
self._event_ch.send_nowait(
|
150
129
|
tts.SynthesizedAudio(
|
151
|
-
|
152
|
-
data=rtc.AudioFrame(
|
153
|
-
data=buf,
|
154
|
-
sample_rate=self._opts.sample_rate,
|
155
|
-
num_channels=1,
|
156
|
-
samples_per_channel=len(buf) // 2,
|
157
|
-
),
|
130
|
+
request_id=request_id, segment_id=segment_id, frame=frame
|
158
131
|
)
|
159
132
|
)
|
160
133
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
frame = await self._queue.get()
|
169
|
-
if frame is None:
|
170
|
-
raise StopAsyncIteration
|
171
|
-
|
172
|
-
return frame
|
173
|
-
|
174
|
-
async def aclose(self) -> None:
|
175
|
-
if not self._main_task:
|
176
|
-
return
|
177
|
-
|
178
|
-
self._main_task.cancel()
|
179
|
-
with contextlib.suppress(asyncio.CancelledError):
|
180
|
-
await self._main_task
|
134
|
+
for frame in bstream.flush():
|
135
|
+
self._event_ch.send_nowait(
|
136
|
+
tts.SynthesizedAudio(
|
137
|
+
request_id=request_id, segment_id=segment_id, frame=frame
|
138
|
+
)
|
139
|
+
)
|
@@ -0,0 +1,10 @@
|
|
1
|
+
livekit/plugins/cartesia/__init__.py,sha256=_a8u7qqya1pjZTV19gNOpMKTO7ccAVZAeCukiDKAG-U,937
|
2
|
+
livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
|
3
|
+
livekit/plugins/cartesia/models.py,sha256=06S-Z-M90kB-kEOQsQk70xfQUD-TztU4ZIU_AfAyUMc,335
|
4
|
+
livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
livekit/plugins/cartesia/tts.py,sha256=S5BMSVtsbNI_c2PpgyFK6wvleudmJZLTUt3ZmGNKlRI,4319
|
6
|
+
livekit/plugins/cartesia/version.py,sha256=ypu6ttoYyC198vzZ_HCF0aB8kPNeygXXxDGxbrCf9s4,606
|
7
|
+
livekit_plugins_cartesia-0.2.0.dev1.dist-info/METADATA,sha256=fCqrA_MFJSMweSuHmtD29iZoSVYzFD1TROXmay8AWcE,1250
|
8
|
+
livekit_plugins_cartesia-0.2.0.dev1.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
|
9
|
+
livekit_plugins_cartesia-0.2.0.dev1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
+
livekit_plugins_cartesia-0.2.0.dev1.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
livekit/plugins/cartesia/__init__.py,sha256=_a8u7qqya1pjZTV19gNOpMKTO7ccAVZAeCukiDKAG-U,937
|
2
|
-
livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
|
3
|
-
livekit/plugins/cartesia/models.py,sha256=Qhl51ZScuB61bEzN1tBlHMuHO_kCXSzuVOicYa16EL8,2922
|
4
|
-
livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/cartesia/tts.py,sha256=16BneZFQQsS-lB9Ug1HYj4QW7-VnNdpTJ0CW5A1b9EU,5725
|
6
|
-
livekit/plugins/cartesia/version.py,sha256=3-nEcobvIJfZdV4yNIRuYpAGQ3svREnYIv2ivxoIZcQ,600
|
7
|
-
livekit_plugins_cartesia-0.1.1.dist-info/METADATA,sha256=MfqyeBD4BF8NE4A8O9hIboC0WMmQ5EKo8RPzkGc8-a8,1245
|
8
|
-
livekit_plugins_cartesia-0.1.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
9
|
-
livekit_plugins_cartesia-0.1.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
-
livekit_plugins_cartesia-0.1.1.dist-info/RECORD,,
|
File without changes
|