livekit-plugins-elevenlabs 0.6.dev0__tar.gz → 0.7.0.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18) hide show
  1. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/PKG-INFO +2 -4
  2. livekit_plugins_elevenlabs-0.7.0.dev1/livekit/plugins/elevenlabs/tts.py +398 -0
  3. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/livekit/plugins/elevenlabs/version.py +1 -1
  4. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/livekit_plugins_elevenlabs.egg-info/PKG-INFO +2 -4
  5. livekit_plugins_elevenlabs-0.7.0.dev1/livekit_plugins_elevenlabs.egg-info/requires.txt +1 -0
  6. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/setup.py +2 -8
  7. livekit_plugins_elevenlabs-0.6.dev0/livekit/plugins/elevenlabs/tts.py +0 -528
  8. livekit_plugins_elevenlabs-0.6.dev0/livekit_plugins_elevenlabs.egg-info/requires.txt +0 -3
  9. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/README.md +0 -0
  10. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/livekit/plugins/elevenlabs/__init__.py +0 -0
  11. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/livekit/plugins/elevenlabs/log.py +0 -0
  12. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/livekit/plugins/elevenlabs/models.py +0 -0
  13. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/livekit/plugins/elevenlabs/py.typed +0 -0
  14. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/livekit_plugins_elevenlabs.egg-info/SOURCES.txt +0 -0
  15. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/livekit_plugins_elevenlabs.egg-info/dependency_links.txt +0 -0
  16. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/livekit_plugins_elevenlabs.egg-info/top_level.txt +0 -0
  17. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/pyproject.toml +0 -0
  18. {livekit_plugins_elevenlabs-0.6.dev0 → livekit_plugins_elevenlabs-0.7.0.dev1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.6.dev0
3
+ Version: 0.7.0.dev1
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -19,9 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: livekit~=0.11
23
- Requires-Dist: livekit-agents[codecs]~=0.8.dev0
24
- Requires-Dist: aiohttp>=3.8.5
22
+ Requires-Dist: livekit-agents[codecs]>=0.7.2
25
23
 
26
24
  # LiveKit Plugins Elevenlabs
27
25
 
@@ -0,0 +1,398 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import base64
19
+ import dataclasses
20
+ import json
21
+ import os
22
+ from dataclasses import dataclass
23
+ from typing import Any, List, Literal
24
+
25
+ import aiohttp
26
+ from livekit import rtc
27
+ from livekit.agents import tokenize, tts, utils
28
+
29
+ from .log import logger
30
+ from .models import TTSEncoding, TTSModels
31
+
32
+ _Encoding = Literal["mp3", "pcm"]
33
+
34
+
35
+ def _sample_rate_from_format(output_format: TTSEncoding) -> int:
36
+ split = output_format.split("_") # e.g: mp3_22050_32
37
+ return int(split[1])
38
+
39
+
40
+ def _encoding_from_format(output_format: TTSEncoding) -> _Encoding:
41
+ if output_format.startswith("mp3"):
42
+ return "mp3"
43
+ elif output_format.startswith("pcm"):
44
+ return "pcm"
45
+
46
+ raise ValueError(f"Unknown format: {output_format}")
47
+
48
+
49
@dataclass
class VoiceSettings:
    """Per-voice synthesis tuning parameters forwarded to the ElevenLabs API."""

    stability: float  # [0.0 - 1.0]
    similarity_boost: float  # [0.0 - 1.0]
    style: float | None = None  # [0.0 - 1.0]
    use_speaker_boost: bool | None = False
55
+
56
+
57
@dataclass
class Voice:
    """An ElevenLabs voice, as returned by the voices endpoint."""

    id: str  # ElevenLabs voice_id
    name: str  # human-readable display name
    category: str  # e.g. "premade"
    settings: VoiceSettings | None = None  # None -> let the API use its defaults
63
+
64
+
65
# Voice used when the caller does not provide one ("Bella", a premade voice).
DEFAULT_VOICE = Voice(
    id="EXAVITQu4vr4xnSDxMaL",
    name="Bella",
    category="premade",
    settings=VoiceSettings(
        stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True
    ),
)

# Base endpoint for the ElevenLabs v1 API (REST and websocket paths hang off it).
API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
# Header name ElevenLabs expects the API key in.
AUTHORIZATION_HEADER = "xi-api-key"
76
+
77
+
78
@dataclass
class _TTSOptions:
    """Resolved, internal configuration shared by the stream implementations."""

    api_key: str
    voice: Voice
    model_id: TTSModels
    base_url: str
    encoding: TTSEncoding  # e.g. "mp3_22050_32"
    sample_rate: int  # derived from `encoding` (see _sample_rate_from_format)
    streaming_latency: int  # ElevenLabs optimize_streaming_latency setting
    word_tokenizer: tokenize.WordTokenizer
    chunk_length_schedule: list[int]  # generation_config chunk schedule
89
+
90
+
91
class TTS(tts.TTS):
    """ElevenLabs text-to-speech client.

    Supports one-shot synthesis via the chunked HTTP endpoint (`synthesize`)
    and incremental synthesis over a websocket (`stream`).
    """

    def __init__(
        self,
        *,
        voice: Voice = DEFAULT_VOICE,
        model_id: TTSModels = "eleven_turbo_v2",
        api_key: str | None = None,
        base_url: str | None = None,
        encoding: TTSEncoding = "mp3_22050_32",
        streaming_latency: int = 3,
        word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
            ignore_punctuation=False  # punctuation can help for intonation
        ),
        chunk_length_schedule: list[int] | None = None,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """
        Args:
            voice: voice to synthesize with (defaults to "Bella").
            model_id: ElevenLabs model id.
            api_key: ElevenLabs API key; falls back to the ELEVEN_API_KEY env var.
            base_url: override for the API base URL (defaults to the public v1 endpoint).
            encoding: output format string; the sample rate is derived from it.
            streaming_latency: ElevenLabs optimize_streaming_latency setting.
            word_tokenizer: tokenizer used to split streamed text into words.
            chunk_length_schedule: generation chunk schedule. Defaults to
                [80, 120, 200, 260] — 11labs' own default is [120, 160, 250, 290],
                but we want faster responses by default (range is 50-500).
            http_session: optional shared aiohttp session.

        Raises:
            ValueError: if no API key is provided or found in the environment.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(
                streaming=True,
            ),
            sample_rate=_sample_rate_from_format(encoding),
            num_channels=1,
        )
        api_key = api_key or os.environ.get("ELEVEN_API_KEY")
        if not api_key:
            raise ValueError("ELEVEN_API_KEY must be set")

        # Resolve the default here rather than using a mutable default argument,
        # which would be shared (and mutable) across every TTS instance.
        if chunk_length_schedule is None:
            chunk_length_schedule = [80, 120, 200, 260]

        self._opts = _TTSOptions(
            voice=voice,
            model_id=model_id,
            api_key=api_key,
            base_url=base_url or API_BASE_URL_V1,
            encoding=encoding,
            sample_rate=self.sample_rate,
            streaming_latency=streaming_latency,
            word_tokenizer=word_tokenizer,
            chunk_length_schedule=chunk_length_schedule,
        )
        self._session = http_session

    def _ensure_session(self) -> aiohttp.ClientSession:
        # Lazily fall back to the plugin-wide shared session.
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

    async def list_voices(self) -> List[Voice]:
        """Fetch the voices available to this API key."""
        async with self._ensure_session().get(
            f"{self._opts.base_url}/voices",
            headers={AUTHORIZATION_HEADER: self._opts.api_key},
        ) as resp:
            return _dict_to_voices_list(await resp.json())

    def synthesize(self, text: str) -> "ChunkedStream":
        """Synthesize `text` in a single request using the chunked endpoint."""
        return ChunkedStream(text, self._opts, self._ensure_session())

    def stream(self) -> "SynthesizeStream":
        """Open an incremental websocket synthesis stream."""
        return SynthesizeStream(self._ensure_session(), self._opts)
151
+
152
+
153
class ChunkedStream(tts.ChunkedStream):
    """Synthesize using the chunked api endpoint"""

    def __init__(
        self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
    ) -> None:
        super().__init__()
        self._text, self._opts, self._session = text, opts, session

    @utils.log_exceptions(logger=logger)
    async def _main_task(self) -> None:
        # Buffers raw bytes and slices them into fixed-size audio frames.
        # NOTE(review): _synthesize_url always requests pcm_<rate> output, so
        # treating the HTTP body as raw PCM here is consistent with the request.
        bstream = utils.audio.AudioByteStream(
            sample_rate=self._opts.sample_rate, num_channels=1
        )
        request_id = utils.shortuuid()
        segment_id = utils.shortuuid()

        voice_settings = (
            dataclasses.asdict(self._opts.voice.settings)
            if self._opts.voice.settings
            else None
        )
        data = {
            "text": self._text,
            "model_id": self._opts.model_id,
            "voice_settings": voice_settings,
        }

        async with self._session.post(
            _synthesize_url(self._opts),
            headers={AUTHORIZATION_HEADER: self._opts.api_key},
            json=data,
        ) as resp:
            # NOTE: the loop variable shadows the request payload dict `data`
            # above; the payload is no longer needed at this point, so this is
            # harmless (if confusing).
            async for data, _ in resp.content.iter_chunks():
                for frame in bstream.write(data):
                    self._event_ch.send_nowait(
                        tts.SynthesizedAudio(
                            request_id=request_id, segment_id=segment_id, frame=frame
                        )
                    )

            # emit whatever remains in the byte buffer as a final frame
            for frame in bstream.flush():
                self._event_ch.send_nowait(
                    tts.SynthesizedAudio(
                        request_id=request_id, segment_id=segment_id, frame=frame
                    )
                )
200
+
201
+
202
class SynthesizeStream(tts.SynthesizeStream):
    """Streamed API using websockets"""

    def __init__(
        self,
        session: aiohttp.ClientSession,
        opts: _TTSOptions,
    ):
        super().__init__()
        self._opts = opts
        self._session = session
        # mp3 payloads arriving over the websocket are decoded incrementally
        self._mp3_decoder = utils.codecs.Mp3StreamDecoder()

    @utils.log_exceptions(logger=logger)
    async def _main_task(self) -> None:
        self._segments_ch = utils.aio.Chan[tokenize.WordStream]()

        @utils.log_exceptions(logger=logger)
        async def _tokenize_input():
            """tokenize text from the input_ch to words"""
            word_stream = None
            async for input in self._input_ch:
                if isinstance(input, str):
                    if not word_stream:
                        # new segment: open a word stream and hand it to _run
                        word_stream = self._opts.word_tokenizer.stream()
                        self._segments_ch.send_nowait(word_stream)

                    word_stream.push_text(input)
                elif isinstance(input, self._FlushSentinel):
                    # a flush may arrive before any text was pushed, in which
                    # case there is no word stream to close
                    if word_stream is not None:
                        word_stream.end_input()
                    word_stream = None

            self._segments_ch.close()

        async def _run():
            # one websocket connection per segment, in order
            async for word_stream in self._segments_ch:
                await self._run_ws(word_stream)

        await asyncio.gather(_tokenize_input(), _run(), return_exceptions=True)

    async def _run_ws(
        self,
        word_stream: tokenize.WordStream,
        max_retry: int = 1,
    ) -> None:
        """Synthesize one segment over a websocket connection.

        Raises:
            Exception: if the connection cannot be established after
                `max_retry` attempts, or if 11labs closes it before all
                tokens were consumed.
        """
        request_id = utils.shortuuid()
        segment_id = utils.shortuuid()

        ws_conn: aiohttp.ClientWebSocketResponse | None = None
        for try_i in range(max_retry):
            retry_delay = 5
            try:
                if try_i > 0:
                    await asyncio.sleep(retry_delay)

                ws_conn = await self._session.ws_connect(
                    _stream_url(self._opts),
                    headers={AUTHORIZATION_HEADER: self._opts.api_key},
                )
                break
            except Exception as e:
                logger.warning(
                    f"failed to connect to 11labs, retrying in {retry_delay}s",
                    exc_info=e,
                )

        if ws_conn is None:
            raise Exception(f"failed to connect to 11labs after {max_retry} retries")

        # 11labs protocol: the first packet carries the voice settings and
        # generation config; text must be a single space to "open" the stream
        init_pkt = dict(
            text=" ",
            try_trigger_generation=True,
            voice_settings=dataclasses.asdict(self._opts.voice.settings)
            if self._opts.voice.settings
            else None,
            generation_config=dict(
                chunk_length_schedule=self._opts.chunk_length_schedule
            ),
        )
        await ws_conn.send_str(json.dumps(init_pkt))
        eos_sent = False

        async def send_task():
            nonlocal eos_sent

            async for data in word_stream:
                # try_trigger_generation=True is a bad practice, we expose
                # chunk_length_schedule instead
                data_pkt = dict(
                    text=f"{data.token} ",  # must always end with a space
                    try_trigger_generation=False,
                )
                await ws_conn.send_str(json.dumps(data_pkt))

            # no more token, mark eos
            eos_pkt = dict(text="")
            await ws_conn.send_str(json.dumps(eos_pkt))
            eos_sent = True

        async def recv_task():
            while True:
                msg = await ws_conn.receive()
                if msg.type in (
                    aiohttp.WSMsgType.CLOSED,
                    aiohttp.WSMsgType.CLOSE,
                    aiohttp.WSMsgType.CLOSING,
                ):
                    # a close after eos is the expected end of the stream
                    if not eos_sent:
                        raise Exception(
                            "11labs connection closed unexpectedly, not all tokens have been consumed"
                        )
                    return

                if msg.type != aiohttp.WSMsgType.TEXT:
                    logger.warning("unexpected 11labs message type %s", msg.type)
                    continue

                self._process_stream_event(
                    data=json.loads(msg.data),
                    request_id=request_id,
                    segment_id=segment_id,
                )

        await asyncio.gather(send_task(), recv_task())

    def _process_stream_event(
        self, *, data: dict, request_id: str, segment_id: str
    ) -> None:
        """Decode one websocket message and forward any audio to the event channel."""
        encoding = _encoding_from_format(self._opts.encoding)
        if data.get("audio"):
            # audio payloads are base64-encoded in the 11labs protocol
            b64data = base64.b64decode(data["audio"])
            if encoding == "mp3":
                for frame in self._mp3_decoder.decode_chunk(b64data):
                    self._event_ch.send_nowait(
                        tts.SynthesizedAudio(
                            request_id=request_id,
                            segment_id=segment_id,
                            frame=frame,
                        )
                    )
            else:
                # raw 16-bit PCM: 2 bytes per sample
                chunk_frame = rtc.AudioFrame(
                    data=b64data,
                    sample_rate=self._opts.sample_rate,
                    num_channels=1,
                    samples_per_channel=len(b64data) // 2,
                )
                self._event_ch.send_nowait(
                    tts.SynthesizedAudio(
                        request_id=request_id,
                        segment_id=segment_id,
                        frame=chunk_frame,
                    )
                )
        elif data.get("error"):
            logger.error("11labs reported an error: %s", data["error"])
        elif not data.get("isFinal"):
            logger.error("unexpected 11labs message %s", data)
361
+
362
+
363
def _dict_to_voices_list(data: dict[str, Any]):
    """Convert a /voices API response payload into a list of Voice objects."""
    return [
        Voice(
            id=info["voice_id"],
            name=info["name"],
            category=info["category"],
            settings=None,
        )
        for info in data["voices"]
    ]
375
+
376
+
377
def _synthesize_url(opts: _TTSOptions) -> str:
    """Build the chunked (HTTP) synthesis URL for the given options.

    The chunked endpoint is always asked for raw PCM at the configured sample
    rate, independently of the websocket output format.
    """
    sample_rate = _sample_rate_from_format(opts.encoding)
    return (
        f"{opts.base_url}/text-to-speech/{opts.voice.id}/stream?"
        f"model_id={opts.model_id}&output_format=pcm_{sample_rate}"
        f"&optimize_streaming_latency={opts.streaming_latency}"
    )
387
+
388
+
389
+ def _stream_url(opts: _TTSOptions) -> str:
390
+ base_url = opts.base_url
391
+ voice_id = opts.voice.id
392
+ model_id = opts.model_id
393
+ output_format = opts.encoding
394
+ latency = opts.streaming_latency
395
+ return (
396
+ f"{base_url}/text-to-speech/{voice_id}/stream-input?"
397
+ f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
398
+ )
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.6.dev0"
15
+ __version__ = "0.7.0.dev1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.6.dev0
3
+ Version: 0.7.0.dev1
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -19,9 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: livekit~=0.11
23
- Requires-Dist: livekit-agents[codecs]~=0.8.dev0
24
- Requires-Dist: aiohttp>=3.8.5
22
+ Requires-Dist: livekit-agents[codecs]>=0.7.2
25
23
 
26
24
  # LiveKit Plugins Elevenlabs
27
25
 
@@ -0,0 +1 @@
1
+ livekit-agents[codecs]>=0.7.2
@@ -49,14 +49,8 @@ setuptools.setup(
49
49
  license="Apache-2.0",
50
50
  packages=setuptools.find_namespace_packages(include=["livekit.*"]),
51
51
  python_requires=">=3.9.0",
52
- install_requires=[
53
- "livekit ~= 0.11",
54
- "livekit-agents[codecs]~=0.8.dev0",
55
- "aiohttp >= 3.8.5",
56
- ],
57
- package_data={
58
- "livekit.plugins.elevenlabs": ["py.typed"],
59
- },
52
+ install_requires=["livekit-agents[codecs]>=0.7.2"],
53
+ package_data={"livekit.plugins.elevenlabs": ["py.typed"]},
60
54
  project_urls={
61
55
  "Documentation": "https://docs.livekit.io",
62
56
  "Website": "https://livekit.io/",
@@ -1,528 +0,0 @@
1
- # Copyright 2023 LiveKit, Inc.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from __future__ import annotations
16
-
17
- import asyncio
18
- import base64
19
- import contextlib
20
- import dataclasses
21
- import json
22
- import os
23
- from dataclasses import dataclass
24
- from typing import List, Literal, Optional
25
-
26
- import aiohttp
27
- from livekit import rtc
28
- from livekit.agents import aio, codecs, tokenize, tts, utils
29
-
30
- from .log import logger
31
- from .models import (
32
- TTSEncoding,
33
- TTSModels,
34
- )
35
-
36
- _Encoding = Literal[
37
- "mp3",
38
- "pcm",
39
- ]
40
-
41
-
42
- def _sample_rate_from_format(output_format: TTSEncoding) -> int:
43
- split = output_format.split("_") # e.g: mp3_22050_32
44
- return int(split[1])
45
-
46
-
47
- def _encoding_from_format(output_format: TTSEncoding) -> _Encoding:
48
- if output_format.startswith("mp3"):
49
- return "mp3"
50
- elif output_format.startswith("pcm"):
51
- return "pcm"
52
-
53
- raise ValueError(f"Unknown format: {output_format}")
54
-
55
-
56
- @dataclass
57
- class VoiceSettings:
58
- stability: float # [0.0 - 1.0]
59
- similarity_boost: float # [0.0 - 1.0]
60
- style: float | None = None # [0.0 - 1.0]
61
- use_speaker_boost: bool | None = False
62
-
63
-
64
- @dataclass
65
- class Voice:
66
- id: str
67
- name: str
68
- category: str
69
- settings: VoiceSettings | None = None
70
-
71
-
72
- DEFAULT_VOICE = Voice(
73
- id="EXAVITQu4vr4xnSDxMaL",
74
- name="Bella",
75
- category="premade",
76
- settings=VoiceSettings(
77
- stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True
78
- ),
79
- )
80
-
81
- API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
82
- AUTHORIZATION_HEADER = "xi-api-key"
83
-
84
-
85
- @dataclass
86
- class _TTSOptions:
87
- api_key: str
88
- voice: Voice
89
- model_id: TTSModels
90
- base_url: str
91
- encoding: TTSEncoding
92
- sample_rate: int
93
- streaming_latency: int
94
- word_tokenizer: tokenize.WordTokenizer
95
- chunk_length_schedule: list[int]
96
-
97
-
98
- class TTS(tts.TTS):
99
- def __init__(
100
- self,
101
- *,
102
- voice: Voice = DEFAULT_VOICE,
103
- model_id: TTSModels = "eleven_turbo_v2",
104
- api_key: str | None = None,
105
- base_url: str | None = None,
106
- encoding: TTSEncoding = "mp3_22050_32",
107
- streaming_latency: int = 3,
108
- word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
109
- ignore_punctuation=False # punctuation can help for intonation
110
- ),
111
- # default value of 11labs is [120, 160, 250, 290], but we want faster responses by default
112
- # (range is 50-500)
113
- chunk_length_schedule: list[int] = [80, 120, 200, 260],
114
- http_session: aiohttp.ClientSession | None = None,
115
- ) -> None:
116
- super().__init__(
117
- streaming_supported=True,
118
- sample_rate=_sample_rate_from_format(encoding),
119
- num_channels=1,
120
- )
121
- api_key = api_key or os.environ.get("ELEVEN_API_KEY")
122
- if not api_key:
123
- raise ValueError("ELEVEN_API_KEY must be set")
124
-
125
- self._opts = _TTSOptions(
126
- voice=voice,
127
- model_id=model_id,
128
- api_key=api_key,
129
- base_url=base_url or API_BASE_URL_V1,
130
- encoding=encoding,
131
- sample_rate=self.sample_rate,
132
- streaming_latency=streaming_latency,
133
- word_tokenizer=word_tokenizer,
134
- chunk_length_schedule=chunk_length_schedule,
135
- )
136
- self._session = http_session
137
-
138
- def _ensure_session(self) -> aiohttp.ClientSession:
139
- if not self._session:
140
- self._session = utils.http_session()
141
-
142
- return self._session
143
-
144
- async def list_voices(self) -> List[Voice]:
145
- async with self._ensure_session().get(
146
- f"{self._opts.base_url}/voices",
147
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
148
- ) as resp:
149
- return _dict_to_voices_list(await resp.json())
150
-
151
- def synthesize(
152
- self,
153
- text: str,
154
- ) -> "ChunkedStream":
155
- return ChunkedStream(text, self._opts, self._ensure_session())
156
-
157
- def stream(
158
- self,
159
- ) -> "SynthesizeStream":
160
- return SynthesizeStream(self._ensure_session(), self._opts)
161
-
162
-
163
- class ChunkedStream(tts.ChunkedStream):
164
- """Synthesize using the chunked api endpoint"""
165
-
166
- def __init__(
167
- self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
168
- ) -> None:
169
- self._opts = opts
170
- self._text = text
171
- self._session = session
172
- self._task: asyncio.Task | None = None
173
- self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
174
-
175
- def _synthesize_url(self) -> str:
176
- base_url = self._opts.base_url
177
- voice_id = self._opts.voice.id
178
- model_id = self._opts.model_id
179
- sample_rate = _sample_rate_from_format(self._opts.encoding)
180
- latency = self._opts.streaming_latency
181
- url = (
182
- f"{base_url}/text-to-speech/{voice_id}/stream?"
183
- f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
184
- )
185
- return url
186
-
187
- async def _main_task(self):
188
- try:
189
- await self._run()
190
- except Exception:
191
- logger.exception("11labs main task failed in chunked stream")
192
- finally:
193
- self._queue.put_nowait(None)
194
-
195
- async def _run(self) -> None:
196
- async with self._session.post(
197
- self._synthesize_url(),
198
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
199
- json=dict(
200
- text=self._text,
201
- model_id=self._opts.model_id,
202
- voice_settings=(
203
- dataclasses.asdict(self._opts.voice.settings)
204
- if self._opts.voice.settings
205
- else None
206
- ),
207
- ),
208
- ) as resp:
209
- # avoid very small frames. chunk by 10ms 16bits
210
- bytes_per_frame = (self._opts.sample_rate // 100) * 2
211
- buf = bytearray()
212
- async for data, _ in resp.content.iter_chunks():
213
- buf.extend(data)
214
-
215
- while len(buf) >= bytes_per_frame:
216
- frame_data = buf[:bytes_per_frame]
217
- buf = buf[bytes_per_frame:]
218
-
219
- self._queue.put_nowait(
220
- tts.SynthesizedAudio(
221
- text=self._text,
222
- data=rtc.AudioFrame(
223
- data=frame_data,
224
- sample_rate=self._opts.sample_rate,
225
- num_channels=1,
226
- samples_per_channel=len(frame_data) // 2,
227
- ),
228
- )
229
- )
230
-
231
- # send any remaining data
232
- if len(buf) > 0:
233
- self._queue.put_nowait(
234
- tts.SynthesizedAudio(
235
- text=self._text,
236
- data=rtc.AudioFrame(
237
- data=buf,
238
- sample_rate=self._opts.sample_rate,
239
- num_channels=1,
240
- samples_per_channel=len(buf) // 2,
241
- ),
242
- )
243
- )
244
-
245
- async def __anext__(self) -> tts.SynthesizedAudio:
246
- if not self._task:
247
- self._task = asyncio.create_task(self._main_task())
248
-
249
- frame = await self._queue.get()
250
- if frame is None:
251
- raise StopAsyncIteration
252
-
253
- return frame
254
-
255
- async def aclose(self) -> None:
256
- if not self._task:
257
- return
258
-
259
- self._task.cancel()
260
- with contextlib.suppress(asyncio.CancelledError):
261
- await self._task
262
-
263
-
264
- class SynthesizeStream(tts.SynthesizeStream):
265
- """Streamed API using websockets"""
266
-
267
- @dataclass
268
- class _SegmentConnection:
269
- audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
270
- task: asyncio.Task
271
-
272
- def __init__(
273
- self,
274
- session: aiohttp.ClientSession,
275
- opts: _TTSOptions,
276
- max_retry_per_segment: int = 3,
277
- ):
278
- self._opts = opts
279
- self._session = session
280
- self._main_task = asyncio.create_task(self._run(max_retry_per_segment))
281
- self._event_queue = asyncio.Queue[Optional[tts.SynthesisEvent]]()
282
- self._closed = False
283
- self._word_stream = opts.word_tokenizer.stream()
284
-
285
- def _stream_url(self) -> str:
286
- base_url = self._opts.base_url
287
- voice_id = self._opts.voice.id
288
- model_id = self._opts.model_id
289
- output_format = self._opts.encoding
290
- latency = self._opts.streaming_latency
291
- url = (
292
- f"{base_url}/text-to-speech/{voice_id}/stream-input?"
293
- f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
294
- )
295
-
296
- return url
297
-
298
- def push_text(self, token: str | None) -> None:
299
- if self._closed:
300
- raise ValueError("cannot push to a closed stream")
301
-
302
- if token is None:
303
- self._word_stream.mark_segment_end()
304
- return
305
-
306
- self._word_stream.push_text(token)
307
-
308
- async def aclose(self, *, wait: bool = True) -> None:
309
- self._closed = True
310
- await self._word_stream.aclose()
311
-
312
- if not wait:
313
- self._main_task.cancel()
314
-
315
- with contextlib.suppress(asyncio.CancelledError):
316
- await self._main_task
317
-
318
- async def _run(self, max_retry_per_segment: int) -> None:
319
- conns_q = asyncio.Queue[Optional[SynthesizeStream._SegmentConnection]]()
320
-
321
- async def _forward_events() -> None:
322
- """forward events from the ws connections to the event queue.
323
- This is used to keep the right order."""
324
- while True:
325
- c = await conns_q.get()
326
- if c is None:
327
- break # no more segment, stream closed
328
-
329
- self._event_queue.put_nowait(
330
- tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
331
- )
332
-
333
- async for frame in c.audio_rx:
334
- self._event_queue.put_nowait(
335
- tts.SynthesisEvent(
336
- type=tts.SynthesisEventType.AUDIO, audio=frame
337
- )
338
- )
339
-
340
- self._event_queue.put_nowait(
341
- tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
342
- )
343
-
344
- async def _read_tokens() -> None:
345
- """read tokens from the word stream and create connections for each segment,
346
- (this also allows concurrent connections to 11labs)"""
347
-
348
- cur_segment: SynthesizeStream._SegmentConnection | None = None
349
- token_tx: aio.ChanSender[str] | None = None
350
- async for ev in self._word_stream:
351
- if ev.type == tokenize.TokenEventType.STARTED:
352
- token_tx, token_rx = aio.channel()
353
- audio_tx: aio.ChanSender[tts.SynthesizedAudio]
354
- audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
355
- audio_tx, audio_rx = aio.channel()
356
- task = asyncio.create_task(
357
- self._run_ws(max_retry_per_segment, audio_tx, token_rx)
358
- )
359
- cur_segment = SynthesizeStream._SegmentConnection(audio_rx, task)
360
- conns_q.put_nowait(cur_segment)
361
- elif ev.type == tokenize.TokenEventType.TOKEN:
362
- assert token_tx is not None
363
- token_tx.send_nowait(ev.token)
364
- elif ev.type == tokenize.TokenEventType.FINISHED:
365
- assert token_tx is not None
366
- token_tx.close()
367
- cur_segment = token_tx = None
368
-
369
- conns_q.put_nowait(None)
370
-
371
- try:
372
- await asyncio.gather(_forward_events(), _read_tokens())
373
- except Exception:
374
- logger.exception("11labs task failed")
375
-
376
- self._event_queue.put_nowait(None)
377
-
378
- async def _run_ws(
379
- self,
380
- max_retry: int,
381
- audio_tx: aio.ChanSender[tts.SynthesizedAudio],
382
- token_rx: aio.ChanReceiver[str],
383
- ) -> None:
384
- # try to connect to 11labs
385
- ws_conn: aiohttp.ClientWebSocketResponse | None = None
386
- for try_i in range(max_retry):
387
- try:
388
- ws_conn = await self._session.ws_connect(
389
- self._stream_url(),
390
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
391
- )
392
-
393
- voice_settings = None
394
- if self._opts.voice.settings is not None:
395
- voice_settings = dataclasses.asdict(self._opts.voice.settings)
396
-
397
- init_pkt = dict(
398
- text=" ",
399
- try_trigger_generation=True,
400
- voice_settings=voice_settings,
401
- generation_config=dict(
402
- chunk_length_schedule=self._opts.chunk_length_schedule,
403
- ),
404
- )
405
- await ws_conn.send_str(json.dumps(init_pkt))
406
- except Exception:
407
- if try_i + 1 == max_retry:
408
- logger.exception(
409
- f"failed to connect to 11labs after {max_retry} retries"
410
- )
411
- return
412
-
413
- retry_delay = min(try_i * 5, 5) # max 5s
414
- logger.warning(
415
- f"failed to connect to 11labs, retrying in {retry_delay}s"
416
- )
417
- await asyncio.sleep(retry_delay)
418
-
419
- assert ws_conn is not None
420
-
421
- all_tokens_consumed = False
422
-
423
- async def send_task():
424
- async for token in token_rx:
425
- if token == "":
426
- continue # empty token is closing the stream in 11labs protocol
427
-
428
- # try_trigger_generation=True is a bad practice, we expose
429
- # chunk_length_schedule instead
430
- data_pkt = dict(
431
- text=f"{token} ", # must always end with a space
432
- try_trigger_generation=False,
433
- )
434
- await ws_conn.send_str(json.dumps(data_pkt))
435
-
436
- # no more token, mark eos
437
- flush_pkt = dict(
438
- text="",
439
- )
440
- await ws_conn.send_str(json.dumps(flush_pkt))
441
-
442
- nonlocal all_tokens_consumed
443
- all_tokens_consumed = True
444
-
445
- async def recv_task():
446
- encoding = _encoding_from_format(self._opts.encoding)
447
- mp3_decoder = codecs.Mp3StreamDecoder()
448
- while True:
449
- msg = await ws_conn.receive()
450
- if msg.type in (
451
- aiohttp.WSMsgType.CLOSED,
452
- aiohttp.WSMsgType.CLOSE,
453
- aiohttp.WSMsgType.CLOSING,
454
- ):
455
- if all_tokens_consumed:
456
- return # close is expected
457
-
458
- raise Exception(
459
- "11labs connection closed unexpectedly, not all tokens have been consumed"
460
- )
461
-
462
- if msg.type != aiohttp.WSMsgType.TEXT:
463
- # audio frames are serialized in base64..
464
- logger.warning("unexpected 11labs message type %s", msg.type)
465
- continue
466
-
467
- data: dict = json.loads(msg.data)
468
- audio = data.get("audio")
469
-
470
- if data.get("error"):
471
- logger.error("11labs error %s", data)
472
- return
473
- elif audio is not None:
474
- if audio == "":
475
- # 11labs sometimes sends empty audio, ignore
476
- continue
477
-
478
- b64data = base64.b64decode(audio)
479
- frame: rtc.AudioFrame
480
- if encoding == "mp3":
481
- frames = mp3_decoder.decode_chunk(b64data)
482
- frame = utils.merge_frames(frames)
483
- else:
484
- frame = rtc.AudioFrame(
485
- data=b64data,
486
- sample_rate=self._opts.sample_rate,
487
- num_channels=1,
488
- samples_per_channel=len(b64data) // 2,
489
- )
490
-
491
- text = ""
492
- if data.get("alignment"):
493
- text = "".join(data["alignment"].get("chars", ""))
494
-
495
- audio_tx.send_nowait(tts.SynthesizedAudio(text=text, data=frame))
496
- continue
497
- elif data.get("isFinal"):
498
- return # last message
499
-
500
- logger.error("unexpected 11labs message %s", data)
501
-
502
- try:
503
- await asyncio.gather(send_task(), recv_task())
504
- except Exception:
505
- logger.exception("11labs ws connection failed")
506
- finally:
507
- audio_tx.close()
508
-
509
- async def __anext__(self) -> tts.SynthesisEvent:
510
- evt = await self._event_queue.get()
511
- if evt is None:
512
- raise StopAsyncIteration
513
-
514
- return evt
515
-
516
-
517
- def _dict_to_voices_list(data: dict) -> List[Voice]:
518
- voices = []
519
- for voice in data["voices"]:
520
- voices.append(
521
- Voice(
522
- id=voice["voice_id"],
523
- name=voice["name"],
524
- category=voice["category"],
525
- settings=None,
526
- )
527
- )
528
- return voices
@@ -1,3 +0,0 @@
1
- livekit~=0.11
2
- livekit-agents[codecs]~=0.8.dev0
3
- aiohttp>=3.8.5