livekit-plugins-cartesia 0.2.0.dev7__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/PKG-INFO +1 -1
  2. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/livekit/plugins/cartesia/__init__.py +3 -4
  3. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/livekit/plugins/cartesia/models.py +1 -1
  4. livekit_plugins_cartesia-0.4.0/livekit/plugins/cartesia/tts.py +280 -0
  5. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/livekit/plugins/cartesia/version.py +1 -1
  6. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/livekit_plugins_cartesia.egg-info/PKG-INFO +1 -1
  7. livekit_plugins_cartesia-0.2.0.dev7/livekit/plugins/cartesia/tts.py +0 -139
  8. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/README.md +0 -0
  9. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/livekit/plugins/cartesia/log.py +0 -0
  10. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/livekit/plugins/cartesia/py.typed +0 -0
  11. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/livekit_plugins_cartesia.egg-info/SOURCES.txt +0 -0
  12. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/livekit_plugins_cartesia.egg-info/dependency_links.txt +0 -0
  13. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/livekit_plugins_cartesia.egg-info/requires.txt +0 -0
  14. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/livekit_plugins_cartesia.egg-info/top_level.txt +0 -0
  15. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/pyproject.toml +0 -0
  16. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/setup.cfg +0 -0
  17. {livekit_plugins_cartesia-0.2.0.dev7 → livekit_plugins_cartesia-0.4.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-cartesia
3
- Version: 0.2.0.dev7
3
+ Version: 0.4.0
4
4
  Summary: LiveKit Agents Plugin for Cartesia
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -19,13 +19,12 @@ __all__ = ["TTS", "ChunkedStream", "__version__"]
19
19
 
20
20
  from livekit.agents import Plugin
21
21
 
22
+ from .log import logger
23
+
22
24
 
23
25
  class CartesiaPlugin(Plugin):
24
26
  def __init__(self):
25
- super().__init__(__name__, __version__, __package__)
26
-
27
- def download_files(self):
28
- pass
27
+ super().__init__(__name__, __version__, __package__, logger)
29
28
 
30
29
 
31
30
  Plugin.register_plugin(CartesiaPlugin())
@@ -11,4 +11,4 @@ TTSEncoding = Literal[
11
11
 
12
12
  TTSModels = Literal["sonic-english", "sonic-multilingual"]
13
13
  TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
14
- TTSDefaultVoiceId = "248be419-c632-4f23-adf1-5324ed7dbf1d"
14
+ TTSDefaultVoiceId = "c2ac25f9-ecc4-4f56-9095-651354df60c0"
@@ -0,0 +1,280 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import base64
19
+ import json
20
+ import os
21
+ from dataclasses import dataclass
22
+ from typing import Any
23
+
24
+ import aiohttp
25
+ from livekit.agents import tokenize, tts, utils
26
+
27
+ from .log import logger
28
+ from .models import TTSDefaultVoiceId, TTSEncoding, TTSModels
29
+
30
+ API_AUTH_HEADER = "X-API-Key"
31
+ API_VERSION_HEADER = "Cartesia-Version"
32
+ API_VERSION = "2024-06-10"
33
+
34
+ NUM_CHANNELS = 1
35
+ BUFFERED_WORDS_COUNT = 8
36
+
37
+
38
# Synthesis settings captured once by TTS.__init__ and shared by both the
# chunked (HTTP) and streaming (websocket) code paths.
@dataclass
class _TTSOptions:
    # Sent to Cartesia as "model_id".
    model: TTSModels
    # Sent as "output_format.encoding" (the container is always "raw").
    encoding: TTSEncoding
    # Sent as "output_format.sample_rate" and also used to size the local
    # AudioByteStream that re-frames the returned audio.
    sample_rate: int
    # A string is sent as a voice id (mode "id"); anything else is sent as a
    # voice embedding (mode "embedding").
    voice: str | list[float]
    # Used for the X-API-Key header and the websocket "api_key" query param.
    api_key: str
    # Sent as "language".
    language: str
46
+
47
+
48
class TTS(tts.TTS):
    """Cartesia text-to-speech with one-shot and streaming synthesis."""

    def __init__(
        self,
        *,
        model: TTSModels = "sonic-english",
        language: str = "en",
        encoding: TTSEncoding = "pcm_s16le",
        voice: str | list[float] = TTSDefaultVoiceId,
        sample_rate: int = 24000,
        api_key: str | None = None,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """Create a Cartesia TTS instance.

        Args:
            model: Cartesia model id.
            language: Language code sent with every request.
            encoding: Raw audio encoding requested from Cartesia.
            voice: Either a voice id (str) or a voice embedding (list of floats).
            sample_rate: Sample rate requested from Cartesia and reported to
                the base class.
            api_key: Cartesia API key; falls back to the ``CARTESIA_API_KEY``
                environment variable when not given.
            http_session: Session to use for requests. When omitted, one is
                obtained lazily from ``utils.http_context`` on first use.

        Raises:
            ValueError: If no API key is passed and ``CARTESIA_API_KEY`` is unset
                or empty.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True),
            sample_rate=sample_rate,
            num_channels=NUM_CHANNELS,
        )

        resolved_key = api_key or os.environ.get("CARTESIA_API_KEY")
        if not resolved_key:
            raise ValueError("CARTESIA_API_KEY must be set")

        self._session = http_session
        self._opts = _TTSOptions(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            voice=voice,
            api_key=resolved_key,
        )

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, fetching the shared one on first use."""
        if self._session:
            return self._session

        self._session = utils.http_context.http_session()
        return self._session

    def synthesize(self, text: str) -> "ChunkedStream":
        """Synthesize ``text`` in a single request to the bytes endpoint."""
        session = self._ensure_session()
        return ChunkedStream(text, self._opts, session)

    def stream(self) -> "SynthesizeStream":
        """Open a streaming synthesis session backed by the websocket endpoint."""
        session = self._ensure_session()
        return SynthesizeStream(self._opts, session)
91
+
92
+
93
class ChunkedStream(tts.ChunkedStream):
    """Synthesize chunked text using the bytes endpoint"""

    def __init__(
        self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
    ) -> None:
        """Store the text to synthesize, the options and the HTTP session."""
        super().__init__()
        self._text, self._opts, self._session = text, opts, session

    @utils.log_exceptions(logger=logger)
    async def _main_task(self):
        """POST the transcript and forward the streamed audio as frames.

        The response body is re-framed through an ``AudioByteStream`` and each
        frame is pushed to ``self._event_ch`` as a ``tts.SynthesizedAudio``.

        Raises:
            aiohttp.ClientResponseError: If Cartesia answers with an HTTP
                error status (400 or above).
        """
        bstream = utils.audio.AudioByteStream(
            sample_rate=self._opts.sample_rate, num_channels=NUM_CHANNELS
        )
        request_id, segment_id = utils.shortuuid(), utils.shortuuid()

        payload = _to_cartesia_options(self._opts)
        payload["transcript"] = self._text

        async with self._session.post(
            "https://api.cartesia.ai/tts/bytes",
            headers={
                API_AUTH_HEADER: self._opts.api_key,
                API_VERSION_HEADER: API_VERSION,
            },
            json=payload,
        ) as resp:
            # Without this check an error response body would be written into
            # the byte stream below and emitted to the caller as audio.
            resp.raise_for_status()

            async for chunk, _ in resp.content.iter_chunks():
                for frame in bstream.write(chunk):
                    self._event_ch.send_nowait(
                        tts.SynthesizedAudio(
                            request_id=request_id, segment_id=segment_id, frame=frame
                        )
                    )

            # Emit whatever is still buffered once the body is exhausted.
            for frame in bstream.flush():
                self._event_ch.send_nowait(
                    tts.SynthesizedAudio(
                        request_id=request_id, segment_id=segment_id, frame=frame
                    )
                )
134
+
135
+
136
class SynthesizeStream(tts.SynthesizeStream):
    """Streamed synthesis over the Cartesia websocket endpoint.

    Pushed text is split by a sentence tokenizer, each sentence is sent to
    Cartesia under a single context id, and the returned base64 audio is
    re-framed and pushed to ``self._event_ch``.
    """

    def __init__(
        self,
        opts: _TTSOptions,
        session: aiohttp.ClientSession,
    ):
        """Store the options/session and create the sentence tokenizer stream."""
        super().__init__()
        self._opts, self._session = opts, session
        self._sent_tokenizer_stream = tokenize.basic.SentenceTokenizer(
            min_sentence_len=BUFFERED_WORDS_COUNT
        ).stream()

    @utils.log_exceptions(logger=logger)
    async def _main_task(self) -> None:
        """Connect to Cartesia and run the stream, reconnecting on failure.

        Loops while the input channel is open or still holds queued items.
        After ``max_retry`` consecutive failed attempts the error is logged
        and the loop stops.
        """
        retry_count = 0
        max_retry = 3
        # The URL only depends on the (fixed) options, so build it once.
        url = f"wss://api.cartesia.ai/tts/websocket?api_key={self._opts.api_key}&cartesia_version={API_VERSION}"
        while self._input_ch.qsize() or not self._input_ch.closed:
            try:
                ws = await self._session.ws_connect(url)
                retry_count = 0  # connected successfully, reset the retry_count

                try:
                    await self._run_ws(ws)
                finally:
                    # Always release the connection: if _run_ws raises we are
                    # about to reconnect and would otherwise leak this socket.
                    # Closing a websocket that recv_task already closed is
                    # harmless.
                    await ws.close()
            except Exception as e:
                # NOTE(review): retry_count is reset after every successful
                # connect, so failures that happen *after* connecting are
                # retried without limit (and with a 0s delay) for as long as
                # the loop condition holds; confirm this is intended.
                if retry_count >= max_retry:
                    logger.exception(
                        f"failed to connect to Cartesia after {max_retry} tries"
                    )
                    break

                retry_delay = min(retry_count * 2, 10)  # max 10s
                retry_count += 1

                logger.warning(
                    f"Cartesia connection failed, retrying in {retry_delay}s",
                    exc_info=e,
                )
                await asyncio.sleep(retry_delay)

    async def _run_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
        """Drive one websocket session until Cartesia reports completion.

        Runs three concurrent tasks (input forwarding, sentence sending and
        audio receiving) and cancels whichever are still pending when any of
        them fails or all of them finish.

        Args:
            ws: An open websocket connected to the Cartesia TTS endpoint.

        Raises:
            Exception: If the websocket is closed before the final "done"
                message for this request is received, or if any task fails.
        """
        request_id = utils.shortuuid()

        async def sentence_stream_task():
            # Send every tokenized sentence under the same context id, then a
            # final packet with "continue": False to signal end of input.
            base_pkt = _to_cartesia_options(self._opts)
            async for ev in self._sent_tokenizer_stream:
                token_pkt = base_pkt.copy()
                token_pkt["context_id"] = request_id
                token_pkt["transcript"] = ev.token + " "
                token_pkt["continue"] = True
                await ws.send_str(json.dumps(token_pkt))

            end_pkt = base_pkt.copy()
            end_pkt["context_id"] = request_id
            end_pkt["transcript"] = " "
            end_pkt["continue"] = False
            await ws.send_str(json.dumps(end_pkt))

        async def input_task():
            # Feed pushed text into the tokenizer; a flush sentinel forces the
            # tokenizer to emit what it has buffered so far.
            async for data in self._input_ch:
                if isinstance(data, self._FlushSentinel):
                    self._sent_tokenizer_stream.flush()
                    continue
                self._sent_tokenizer_stream.push_text(data)
            self._sent_tokenizer_stream.end_input()

        async def recv_task():
            audio_bstream = utils.audio.AudioByteStream(
                sample_rate=self._opts.sample_rate,
                num_channels=NUM_CHANNELS,
            )

            while True:
                msg = await ws.receive()
                if msg.type in (
                    aiohttp.WSMsgType.CLOSED,
                    aiohttp.WSMsgType.CLOSE,
                    aiohttp.WSMsgType.CLOSING,
                ):
                    raise Exception("Cartesia connection closed unexpectedly")

                if msg.type != aiohttp.WSMsgType.TEXT:
                    logger.warning("unexpected Cartesia message type %s", msg.type)
                    continue

                data = json.loads(msg.data)
                segment_id = data.get("context_id")
                # Once we receive audio for a segment, we can start a new segment
                if data.get("data"):
                    b64data = base64.b64decode(data["data"])
                    for frame in audio_bstream.write(b64data):
                        self._event_ch.send_nowait(
                            tts.SynthesizedAudio(
                                request_id=request_id,
                                segment_id=segment_id,
                                frame=frame,
                            )
                        )
                elif data.get("done"):
                    for frame in audio_bstream.flush():
                        self._event_ch.send_nowait(
                            tts.SynthesizedAudio(
                                request_id=request_id,
                                segment_id=segment_id,
                                frame=frame,
                            )
                        )

                    if segment_id == request_id:
                        # we're not going to receive more frames, close the connection
                        await ws.close()
                        break
                else:
                    logger.error("unexpected Cartesia message %s", data)

        tasks = [
            asyncio.create_task(input_task()),
            asyncio.create_task(sentence_stream_task()),
            asyncio.create_task(recv_task()),
        ]

        try:
            await asyncio.gather(*tasks)
        finally:
            await utils.aio.gracefully_cancel(*tasks)
260
+
261
+
262
def _to_cartesia_options(opts: _TTSOptions) -> dict[str, Any]:
    """Build the request fields shared by the bytes and websocket endpoints.

    Args:
        opts: Synthesis options. A string ``voice`` is sent as a voice id;
            any other value is sent as a voice embedding.

    Returns:
        A new dict with ``model_id``, ``voice``, ``output_format`` and
        ``language``. Callers add request-specific keys such as
        ``transcript`` on top of it.
    """
    voice_spec: dict[str, Any] = (
        {"mode": "id", "id": opts.voice}
        if isinstance(opts.voice, str)
        else {"mode": "embedding", "embedding": opts.voice}
    )
    output_format = {
        "container": "raw",
        "encoding": opts.encoding,
        "sample_rate": opts.sample_rate,
    }

    return {
        "model_id": opts.model,
        "voice": voice_spec,
        "output_format": output_format,
        "language": opts.language,
    }
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.2.0-dev.7"
15
+ __version__ = "0.4.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-cartesia
3
- Version: 0.2.0.dev7
3
+ Version: 0.4.0
4
4
  Summary: LiveKit Agents Plugin for Cartesia
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -1,139 +0,0 @@
1
- # Copyright 2023 LiveKit, Inc.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from __future__ import annotations
16
-
17
- import os
18
- from dataclasses import dataclass
19
-
20
- import aiohttp
21
- from livekit.agents import tts, utils
22
-
23
- from .log import logger
24
- from .models import TTSDefaultVoiceId, TTSEncoding, TTSModels
25
-
26
- API_AUTH_HEADER = "X-API-Key"
27
- API_VERSION_HEADER = "Cartesia-Version"
28
- API_VERSION = "2024-06-10"
29
-
30
-
31
- @dataclass
32
- class _TTSOptions:
33
- model: TTSModels
34
- encoding: TTSEncoding
35
- sample_rate: int
36
- voice: str | list[float]
37
- api_key: str
38
- language: str
39
-
40
-
41
- class TTS(tts.TTS):
42
- def __init__(
43
- self,
44
- *,
45
- model: TTSModels = "sonic-english",
46
- language: str = "en",
47
- encoding: TTSEncoding = "pcm_s16le",
48
- voice: str | list[float] = TTSDefaultVoiceId,
49
- sample_rate: int = 24000,
50
- api_key: str | None = None,
51
- http_session: aiohttp.ClientSession | None = None,
52
- ) -> None:
53
- super().__init__(
54
- capabilities=tts.TTSCapabilities(streaming=False),
55
- sample_rate=sample_rate,
56
- num_channels=1,
57
- )
58
-
59
- api_key = api_key or os.environ.get("CARTESIA_API_KEY")
60
- if not api_key:
61
- raise ValueError("CARTESIA_API_KEY must be set")
62
-
63
- self._opts = _TTSOptions(
64
- model=model,
65
- language=language,
66
- encoding=encoding,
67
- sample_rate=sample_rate,
68
- voice=voice,
69
- api_key=api_key,
70
- )
71
- self._session = http_session
72
-
73
- def _ensure_session(self) -> aiohttp.ClientSession:
74
- if not self._session:
75
- self._session = utils.http_context.http_session()
76
-
77
- return self._session
78
-
79
- def synthesize(self, text: str) -> "ChunkedStream":
80
- return ChunkedStream(text, self._opts, self._ensure_session())
81
-
82
-
83
- class ChunkedStream(tts.ChunkedStream):
84
- def __init__(
85
- self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
86
- ) -> None:
87
- super().__init__()
88
- self._text, self._opts, self._session = text, opts, session
89
-
90
- @utils.log_exceptions(logger=logger)
91
- async def _main_task(self):
92
- bstream = utils.audio.AudioByteStream(
93
- sample_rate=self._opts.sample_rate, num_channels=1
94
- )
95
- request_id = utils.shortuuid()
96
- segment_id = utils.shortuuid()
97
-
98
- voice = {}
99
- if isinstance(self._opts.voice, str):
100
- voice["mode"] = "id"
101
- voice["id"] = self._opts.voice
102
- else:
103
- voice["mode"] = "embedding"
104
- voice["embedding"] = self._opts.voice
105
-
106
- data = {
107
- "model_id": self._opts.model,
108
- "transcript": self._text,
109
- "voice": voice,
110
- "output_format": {
111
- "container": "raw",
112
- "encoding": self._opts.encoding,
113
- "sample_rate": self._opts.sample_rate,
114
- },
115
- "language": self._opts.language,
116
- }
117
-
118
- async with self._session.post(
119
- "https://api.cartesia.ai/tts/bytes",
120
- headers={
121
- API_AUTH_HEADER: f"{self._opts.api_key}",
122
- API_VERSION_HEADER: API_VERSION,
123
- },
124
- json=data,
125
- ) as resp:
126
- async for data, _ in resp.content.iter_chunks():
127
- for frame in bstream.write(data):
128
- self._event_ch.send_nowait(
129
- tts.SynthesizedAudio(
130
- request_id=request_id, segment_id=segment_id, frame=frame
131
- )
132
- )
133
-
134
- for frame in bstream.flush():
135
- self._event_ch.send_nowait(
136
- tts.SynthesizedAudio(
137
- request_id=request_id, segment_id=segment_id, frame=frame
138
- )
139
- )