livekit-plugins-cartesia 0.2.0.dev7__tar.gz → 0.3.0__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (17)
  1. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/PKG-INFO +1 -1
  2. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/livekit/plugins/cartesia/__init__.py +3 -4
  3. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/livekit/plugins/cartesia/models.py +1 -1
  4. livekit_plugins_cartesia-0.3.0/livekit/plugins/cartesia/tts.py +303 -0
  5. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/livekit/plugins/cartesia/version.py +1 -1
  6. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/livekit_plugins_cartesia.egg-info/PKG-INFO +1 -1
  7. livekit_plugins_cartesia-0.2.0.dev7/livekit/plugins/cartesia/tts.py +0 -139
  8. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/README.md +0 -0
  9. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/livekit/plugins/cartesia/log.py +0 -0
  10. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/livekit/plugins/cartesia/py.typed +0 -0
  11. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/livekit_plugins_cartesia.egg-info/SOURCES.txt +0 -0
  12. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/livekit_plugins_cartesia.egg-info/dependency_links.txt +0 -0
  13. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/livekit_plugins_cartesia.egg-info/requires.txt +0 -0
  14. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/livekit_plugins_cartesia.egg-info/top_level.txt +0 -0
  15. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/pyproject.toml +0 -0
  16. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/setup.cfg +0 -0
  17. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.3.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-cartesia
3
- Version: 0.2.0.dev7
3
+ Version: 0.3.0
4
4
  Summary: LiveKit Agents Plugin for Cartesia
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -19,13 +19,12 @@ __all__ = ["TTS", "ChunkedStream", "__version__"]
19
19
 
20
20
  from livekit.agents import Plugin
21
21
 
22
+ from .log import logger
23
+
22
24
 
23
25
  class CartesiaPlugin(Plugin):
24
26
  def __init__(self):
25
- super().__init__(__name__, __version__, __package__)
26
-
27
- def download_files(self):
28
- pass
27
+ super().__init__(__name__, __version__, __package__, logger)
29
28
 
30
29
 
31
30
  Plugin.register_plugin(CartesiaPlugin())
@@ -11,4 +11,4 @@ TTSEncoding = Literal[
11
11
 
12
12
  TTSModels = Literal["sonic-english", "sonic-multilingual"]
13
13
  TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
14
- TTSDefaultVoiceId = "248be419-c632-4f23-adf1-5324ed7dbf1d"
14
+ TTSDefaultVoiceId = "b7d50908-b17c-442d-ad8d-810c63997ed9"
@@ -0,0 +1,303 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import base64
19
+ import json
20
+ import os
21
+ from dataclasses import dataclass
22
+
23
+ import aiohttp
24
+ from livekit.agents import tokenize, tts, utils
25
+
26
+ from .log import logger
27
+ from .models import TTSDefaultVoiceId, TTSEncoding, TTSModels
28
+
29
+ API_AUTH_HEADER = "X-API-Key"
30
+ API_VERSION_HEADER = "Cartesia-Version"
31
+ API_VERSION = "2024-06-10"
32
+
33
+ NUM_CHANNELS = 1
34
+ BUFFERED_WORDS_COUNT = 8
35
+
36
+
37
+ @dataclass
38
+ class _TTSOptions:
39
+ model: TTSModels
40
+ encoding: TTSEncoding
41
+ sample_rate: int
42
+ voice: str | list[float]
43
+ api_key: str
44
+ language: str
45
+ word_tokenizer: tokenize.WordTokenizer
46
+
47
+
48
+ class TTS(tts.TTS):
49
+ def __init__(
50
+ self,
51
+ *,
52
+ model: TTSModels = "sonic-english",
53
+ language: str = "en",
54
+ encoding: TTSEncoding = "pcm_s16le",
55
+ voice: str | list[float] = TTSDefaultVoiceId,
56
+ sample_rate: int = 24000,
57
+ api_key: str | None = None,
58
+ http_session: aiohttp.ClientSession | None = None,
59
+ word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
60
+ ignore_punctuation=False
61
+ ),
62
+ ) -> None:
63
+ super().__init__(
64
+ capabilities=tts.TTSCapabilities(streaming=True),
65
+ sample_rate=sample_rate,
66
+ num_channels=NUM_CHANNELS,
67
+ )
68
+
69
+ api_key = api_key or os.environ.get("CARTESIA_API_KEY")
70
+ if not api_key:
71
+ raise ValueError("CARTESIA_API_KEY must be set")
72
+
73
+ self._opts = _TTSOptions(
74
+ model=model,
75
+ language=language,
76
+ encoding=encoding,
77
+ sample_rate=sample_rate,
78
+ voice=voice,
79
+ api_key=api_key,
80
+ word_tokenizer=word_tokenizer,
81
+ )
82
+ self._session = http_session
83
+
84
+ def _ensure_session(self) -> aiohttp.ClientSession:
85
+ if not self._session:
86
+ self._session = utils.http_context.http_session()
87
+
88
+ return self._session
89
+
90
+ def synthesize(self, text: str) -> "ChunkedStream":
91
+ return ChunkedStream(text, self._opts, self._ensure_session())
92
+
93
+ def stream(self) -> "SynthesizeStream":
94
+ return SynthesizeStream(self._opts, self._ensure_session())
95
+
96
+
97
+ class ChunkedStream(tts.ChunkedStream):
98
+ """Synthesize chunked text using the bytes endpoint"""
99
+
100
+ def __init__(
101
+ self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
102
+ ) -> None:
103
+ super().__init__()
104
+ self._text, self._opts, self._session = text, opts, session
105
+
106
+ @utils.log_exceptions(logger=logger)
107
+ async def _main_task(self):
108
+ bstream = utils.audio.AudioByteStream(
109
+ sample_rate=self._opts.sample_rate, num_channels=NUM_CHANNELS
110
+ )
111
+ request_id, segment_id = utils.shortuuid(), utils.shortuuid()
112
+
113
+ data = _to_cartesia_options(self._opts)
114
+ data["transcript"] = self._text
115
+
116
+ async with self._session.post(
117
+ "https://api.cartesia.ai/tts/bytes",
118
+ headers={
119
+ API_AUTH_HEADER: self._opts.api_key,
120
+ API_VERSION_HEADER: API_VERSION,
121
+ },
122
+ json=data,
123
+ ) as resp:
124
+ async for data, _ in resp.content.iter_chunks():
125
+ for frame in bstream.write(data):
126
+ self._event_ch.send_nowait(
127
+ tts.SynthesizedAudio(
128
+ request_id=request_id, segment_id=segment_id, frame=frame
129
+ )
130
+ )
131
+
132
+ for frame in bstream.flush():
133
+ self._event_ch.send_nowait(
134
+ tts.SynthesizedAudio(
135
+ request_id=request_id, segment_id=segment_id, frame=frame
136
+ )
137
+ )
138
+
139
+
140
+ class SynthesizeStream(tts.SynthesizeStream):
141
+ def __init__(
142
+ self,
143
+ opts: _TTSOptions,
144
+ session: aiohttp.ClientSession,
145
+ ):
146
+ super().__init__()
147
+ self._opts, self._session = opts, session
148
+ self._buf = ""
149
+
150
+ @utils.log_exceptions(logger=logger)
151
+ async def _main_task(self) -> None:
152
+ retry_count = 0
153
+ max_retry = 3
154
+ while self._input_ch.qsize() or not self._input_ch.closed:
155
+ try:
156
+ url = f"wss://api.cartesia.ai/tts/websocket?api_key={self._opts.api_key}&cartesia_version={API_VERSION}"
157
+ ws = await self._session.ws_connect(url)
158
+ retry_count = 0 # connected successfully, reset the retry_count
159
+
160
+ await self._run_ws(ws)
161
+ except Exception as e:
162
+ if retry_count >= max_retry:
163
+ logger.exception(
164
+ f"failed to connect to Cartesia after {max_retry} tries"
165
+ )
166
+ break
167
+
168
+ retry_delay = min(retry_count * 2, 10) # max 10s
169
+ retry_count += 1
170
+
171
+ logger.warning(
172
+ f"Cartesia connection failed, retrying in {retry_delay}s",
173
+ exc_info=e,
174
+ )
175
+ await asyncio.sleep(retry_delay)
176
+
177
+ async def _run_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
178
+ request_id = utils.shortuuid()
179
+ pending_segments = []
180
+
181
+ async def send_task():
182
+ base_pkt = _to_cartesia_options(self._opts)
183
+
184
+ def _new_segment():
185
+ segment_id = utils.shortuuid()
186
+ pending_segments.append(segment_id)
187
+ return segment_id
188
+
189
+ current_segment_id: str | None = _new_segment()
190
+
191
+ async for data in self._input_ch:
192
+ if isinstance(data, self._FlushSentinel):
193
+ if current_segment_id is None:
194
+ continue
195
+
196
+ end_pkt = base_pkt.copy()
197
+ end_pkt["context_id"] = current_segment_id
198
+ end_pkt["transcript"] = self._buf + " "
199
+ end_pkt["continue"] = False
200
+ await ws.send_str(json.dumps(end_pkt))
201
+
202
+ current_segment_id = None
203
+ self._buf = ""
204
+ elif data:
205
+ if current_segment_id is None:
206
+ current_segment_id = _new_segment()
207
+
208
+ self._buf += data
209
+ words = self._opts.word_tokenizer.tokenize(text=self._buf)
210
+ if len(words) < BUFFERED_WORDS_COUNT + 1:
211
+ continue
212
+
213
+ data = self._opts.word_tokenizer.format_words(words[:-1]) + " "
214
+ self._buf = words[-1]
215
+
216
+ token_pkt = base_pkt.copy()
217
+ token_pkt["context_id"] = current_segment_id
218
+ token_pkt["transcript"] = data
219
+ token_pkt["continue"] = True
220
+ await ws.send_str(json.dumps(token_pkt))
221
+
222
+ if len(pending_segments) == 0:
223
+ await ws.close()
224
+
225
+ async def recv_task():
226
+ audio_bstream = utils.audio.AudioByteStream(
227
+ sample_rate=self._opts.sample_rate,
228
+ num_channels=NUM_CHANNELS,
229
+ )
230
+
231
+ while True:
232
+ msg = await ws.receive()
233
+ if msg.type in (
234
+ aiohttp.WSMsgType.CLOSED,
235
+ aiohttp.WSMsgType.CLOSE,
236
+ aiohttp.WSMsgType.CLOSING,
237
+ ):
238
+ raise Exception("Cartesia connection closed unexpectedly")
239
+
240
+ if msg.type != aiohttp.WSMsgType.TEXT:
241
+ logger.warning("unexpected Cartesia message type %s", msg.type)
242
+ continue
243
+
244
+ data = json.loads(msg.data)
245
+ segment_id = data.get("context_id")
246
+ if data.get("data"):
247
+ b64data = base64.b64decode(data["data"])
248
+ for frame in audio_bstream.write(b64data):
249
+ self._event_ch.send_nowait(
250
+ tts.SynthesizedAudio(
251
+ request_id=request_id,
252
+ segment_id=segment_id,
253
+ frame=frame,
254
+ )
255
+ )
256
+ elif data.get("done"):
257
+ for frame in audio_bstream.flush():
258
+ self._event_ch.send_nowait(
259
+ tts.SynthesizedAudio(
260
+ request_id=request_id,
261
+ segment_id=segment_id,
262
+ frame=frame,
263
+ )
264
+ )
265
+
266
+ pending_segments.remove(segment_id)
267
+ if len(pending_segments) == 0 and self._input_ch.closed:
268
+ # we're not going to receive more frames, close the connection
269
+ await ws.close()
270
+ break
271
+ else:
272
+ logger.error("unexpected Cartesia message %s", data)
273
+
274
+ tasks = [
275
+ asyncio.create_task(send_task()),
276
+ asyncio.create_task(recv_task()),
277
+ ]
278
+
279
+ try:
280
+ await asyncio.gather(*tasks)
281
+ finally:
282
+ await utils.aio.gracefully_cancel(*tasks)
283
+
284
+
285
+ def _to_cartesia_options(opts: _TTSOptions) -> dict:
286
+ voice: dict = {}
287
+ if isinstance(opts.voice, str):
288
+ voice["mode"] = "id"
289
+ voice["id"] = opts.voice
290
+ else:
291
+ voice["mode"] = "embedding"
292
+ voice["embedding"] = opts.voice
293
+
294
+ return {
295
+ "model_id": opts.model,
296
+ "voice": voice,
297
+ "output_format": {
298
+ "container": "raw",
299
+ "encoding": opts.encoding,
300
+ "sample_rate": opts.sample_rate,
301
+ },
302
+ "language": opts.language,
303
+ }
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.2.0-dev.7"
15
+ __version__ = "0.3.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-cartesia
3
- Version: 0.2.0.dev7
3
+ Version: 0.3.0
4
4
  Summary: LiveKit Agents Plugin for Cartesia
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -1,139 +0,0 @@
1
- # Copyright 2023 LiveKit, Inc.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from __future__ import annotations
16
-
17
- import os
18
- from dataclasses import dataclass
19
-
20
- import aiohttp
21
- from livekit.agents import tts, utils
22
-
23
- from .log import logger
24
- from .models import TTSDefaultVoiceId, TTSEncoding, TTSModels
25
-
26
- API_AUTH_HEADER = "X-API-Key"
27
- API_VERSION_HEADER = "Cartesia-Version"
28
- API_VERSION = "2024-06-10"
29
-
30
-
31
- @dataclass
32
- class _TTSOptions:
33
- model: TTSModels
34
- encoding: TTSEncoding
35
- sample_rate: int
36
- voice: str | list[float]
37
- api_key: str
38
- language: str
39
-
40
-
41
- class TTS(tts.TTS):
42
- def __init__(
43
- self,
44
- *,
45
- model: TTSModels = "sonic-english",
46
- language: str = "en",
47
- encoding: TTSEncoding = "pcm_s16le",
48
- voice: str | list[float] = TTSDefaultVoiceId,
49
- sample_rate: int = 24000,
50
- api_key: str | None = None,
51
- http_session: aiohttp.ClientSession | None = None,
52
- ) -> None:
53
- super().__init__(
54
- capabilities=tts.TTSCapabilities(streaming=False),
55
- sample_rate=sample_rate,
56
- num_channels=1,
57
- )
58
-
59
- api_key = api_key or os.environ.get("CARTESIA_API_KEY")
60
- if not api_key:
61
- raise ValueError("CARTESIA_API_KEY must be set")
62
-
63
- self._opts = _TTSOptions(
64
- model=model,
65
- language=language,
66
- encoding=encoding,
67
- sample_rate=sample_rate,
68
- voice=voice,
69
- api_key=api_key,
70
- )
71
- self._session = http_session
72
-
73
- def _ensure_session(self) -> aiohttp.ClientSession:
74
- if not self._session:
75
- self._session = utils.http_context.http_session()
76
-
77
- return self._session
78
-
79
- def synthesize(self, text: str) -> "ChunkedStream":
80
- return ChunkedStream(text, self._opts, self._ensure_session())
81
-
82
-
83
- class ChunkedStream(tts.ChunkedStream):
84
- def __init__(
85
- self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
86
- ) -> None:
87
- super().__init__()
88
- self._text, self._opts, self._session = text, opts, session
89
-
90
- @utils.log_exceptions(logger=logger)
91
- async def _main_task(self):
92
- bstream = utils.audio.AudioByteStream(
93
- sample_rate=self._opts.sample_rate, num_channels=1
94
- )
95
- request_id = utils.shortuuid()
96
- segment_id = utils.shortuuid()
97
-
98
- voice = {}
99
- if isinstance(self._opts.voice, str):
100
- voice["mode"] = "id"
101
- voice["id"] = self._opts.voice
102
- else:
103
- voice["mode"] = "embedding"
104
- voice["embedding"] = self._opts.voice
105
-
106
- data = {
107
- "model_id": self._opts.model,
108
- "transcript": self._text,
109
- "voice": voice,
110
- "output_format": {
111
- "container": "raw",
112
- "encoding": self._opts.encoding,
113
- "sample_rate": self._opts.sample_rate,
114
- },
115
- "language": self._opts.language,
116
- }
117
-
118
- async with self._session.post(
119
- "https://api.cartesia.ai/tts/bytes",
120
- headers={
121
- API_AUTH_HEADER: f"{self._opts.api_key}",
122
- API_VERSION_HEADER: API_VERSION,
123
- },
124
- json=data,
125
- ) as resp:
126
- async for data, _ in resp.content.iter_chunks():
127
- for frame in bstream.write(data):
128
- self._event_ch.send_nowait(
129
- tts.SynthesizedAudio(
130
- request_id=request_id, segment_id=segment_id, frame=frame
131
- )
132
- )
133
-
134
- for frame in bstream.flush():
135
- self._event_ch.send_nowait(
136
- tts.SynthesizedAudio(
137
- request_id=request_id, segment_id=segment_id, frame=frame
138
- )
139
- )