livekit-plugins-elevenlabs 0.5.dev0__py3-none-any.whl → 0.6.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/elevenlabs/__init__.py +10 -1
- livekit/plugins/elevenlabs/models.py +12 -0
- livekit/plugins/elevenlabs/tts.py +60 -19
- livekit/plugins/elevenlabs/version.py +1 -1
- {livekit_plugins_elevenlabs-0.5.dev0.dist-info → livekit_plugins_elevenlabs-0.6.dev0.dist-info}/METADATA +2 -2
- livekit_plugins_elevenlabs-0.6.dev0.dist-info/RECORD +10 -0
- livekit_plugins_elevenlabs-0.5.dev0.dist-info/RECORD +0 -10
- {livekit_plugins_elevenlabs-0.5.dev0.dist-info → livekit_plugins_elevenlabs-0.6.dev0.dist-info}/WHEEL +0 -0
- {livekit_plugins_elevenlabs-0.5.dev0.dist-info → livekit_plugins_elevenlabs-0.6.dev0.dist-info}/top_level.txt +0 -0
@@ -12,10 +12,19 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
+
from .models import TTSEncoding, TTSModels
|
15
16
|
from .tts import DEFAULT_VOICE, TTS, Voice, VoiceSettings
|
16
17
|
from .version import __version__
|
17
18
|
|
18
|
-
__all__ = [
|
19
|
+
__all__ = [
|
20
|
+
"TTS",
|
21
|
+
"Voice",
|
22
|
+
"VoiceSettings",
|
23
|
+
"TTSEncoding",
|
24
|
+
"TTSModels",
|
25
|
+
"DEFAULT_VOICE",
|
26
|
+
"__version__",
|
27
|
+
]
|
19
28
|
|
20
29
|
from livekit.agents import Plugin
|
21
30
|
|
@@ -6,3 +6,15 @@ TTSModels = Literal[
|
|
6
6
|
"eleven_multilingual_v2",
|
7
7
|
"eleven_turbo_v2",
|
8
8
|
]
|
9
|
+
|
10
|
+
TTSEncoding = Literal[
|
11
|
+
"mp3_22050_32",
|
12
|
+
"mp3_44100_32",
|
13
|
+
"mp3_44100_64",
|
14
|
+
"mp3_44100_96",
|
15
|
+
"mp3_44100_128",
|
16
|
+
"mp3_44100_192",
|
17
|
+
"pcm_16000",
|
18
|
+
"pcm_22050",
|
19
|
+
"pcm_44100",
|
20
|
+
]
|
@@ -21,14 +21,36 @@ import dataclasses
|
|
21
21
|
import json
|
22
22
|
import os
|
23
23
|
from dataclasses import dataclass
|
24
|
-
from typing import List, Optional
|
24
|
+
from typing import List, Literal, Optional
|
25
25
|
|
26
26
|
import aiohttp
|
27
27
|
from livekit import rtc
|
28
|
-
from livekit.agents import aio, tokenize, tts, utils
|
28
|
+
from livekit.agents import aio, codecs, tokenize, tts, utils
|
29
29
|
|
30
30
|
from .log import logger
|
31
|
-
from .models import TTSModels
|
31
|
+
from .models import (
|
32
|
+
TTSEncoding,
|
33
|
+
TTSModels,
|
34
|
+
)
|
35
|
+
|
36
|
+
_Encoding = Literal[
|
37
|
+
"mp3",
|
38
|
+
"pcm",
|
39
|
+
]
|
40
|
+
|
41
|
+
|
42
|
+
def _sample_rate_from_format(output_format: TTSEncoding) -> int:
|
43
|
+
split = output_format.split("_") # e.g: mp3_22050_32
|
44
|
+
return int(split[1])
|
45
|
+
|
46
|
+
|
47
|
+
def _encoding_from_format(output_format: TTSEncoding) -> _Encoding:
|
48
|
+
if output_format.startswith("mp3"):
|
49
|
+
return "mp3"
|
50
|
+
elif output_format.startswith("pcm"):
|
51
|
+
return "pcm"
|
52
|
+
|
53
|
+
raise ValueError(f"Unknown format: {output_format}")
|
32
54
|
|
33
55
|
|
34
56
|
@dataclass
|
@@ -66,6 +88,7 @@ class _TTSOptions:
|
|
66
88
|
voice: Voice
|
67
89
|
model_id: TTSModels
|
68
90
|
base_url: str
|
91
|
+
encoding: TTSEncoding
|
69
92
|
sample_rate: int
|
70
93
|
streaming_latency: int
|
71
94
|
word_tokenizer: tokenize.WordTokenizer
|
@@ -80,7 +103,7 @@ class TTS(tts.TTS):
|
|
80
103
|
model_id: TTSModels = "eleven_turbo_v2",
|
81
104
|
api_key: str | None = None,
|
82
105
|
base_url: str | None = None,
|
83
|
-
|
106
|
+
encoding: TTSEncoding = "mp3_22050_32",
|
84
107
|
streaming_latency: int = 3,
|
85
108
|
word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
|
86
109
|
ignore_punctuation=False # punctuation can help for intonation
|
@@ -91,7 +114,9 @@ class TTS(tts.TTS):
|
|
91
114
|
http_session: aiohttp.ClientSession | None = None,
|
92
115
|
) -> None:
|
93
116
|
super().__init__(
|
94
|
-
streaming_supported=True,
|
117
|
+
streaming_supported=True,
|
118
|
+
sample_rate=_sample_rate_from_format(encoding),
|
119
|
+
num_channels=1,
|
95
120
|
)
|
96
121
|
api_key = api_key or os.environ.get("ELEVEN_API_KEY")
|
97
122
|
if not api_key:
|
@@ -102,7 +127,8 @@ class TTS(tts.TTS):
|
|
102
127
|
model_id=model_id,
|
103
128
|
api_key=api_key,
|
104
129
|
base_url=base_url or API_BASE_URL_V1,
|
105
|
-
|
130
|
+
encoding=encoding,
|
131
|
+
sample_rate=self.sample_rate,
|
106
132
|
streaming_latency=streaming_latency,
|
107
133
|
word_tokenizer=word_tokenizer,
|
108
134
|
chunk_length_schedule=chunk_length_schedule,
|
@@ -150,7 +176,7 @@ class ChunkedStream(tts.ChunkedStream):
|
|
150
176
|
base_url = self._opts.base_url
|
151
177
|
voice_id = self._opts.voice.id
|
152
178
|
model_id = self._opts.model_id
|
153
|
-
sample_rate = self._opts.sample_rate
|
179
|
+
sample_rate = _sample_rate_from_format(self._opts.encoding)
|
154
180
|
latency = self._opts.streaming_latency
|
155
181
|
url = (
|
156
182
|
f"{base_url}/text-to-speech/{voice_id}/stream?"
|
@@ -260,11 +286,11 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
260
286
|
base_url = self._opts.base_url
|
261
287
|
voice_id = self._opts.voice.id
|
262
288
|
model_id = self._opts.model_id
|
263
|
-
|
289
|
+
output_format = self._opts.encoding
|
264
290
|
latency = self._opts.streaming_latency
|
265
291
|
url = (
|
266
292
|
f"{base_url}/text-to-speech/{voice_id}/stream-input?"
|
267
|
-
f"model_id={model_id}&output_format=
|
293
|
+
f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
|
268
294
|
)
|
269
295
|
|
270
296
|
return url
|
@@ -417,6 +443,8 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
417
443
|
all_tokens_consumed = True
|
418
444
|
|
419
445
|
async def recv_task():
|
446
|
+
encoding = _encoding_from_format(self._opts.encoding)
|
447
|
+
mp3_decoder = codecs.Mp3StreamDecoder()
|
420
448
|
while True:
|
421
449
|
msg = await ws_conn.receive()
|
422
450
|
if msg.type in (
|
@@ -437,19 +465,32 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
437
465
|
continue
|
438
466
|
|
439
467
|
data: dict = json.loads(msg.data)
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
468
|
+
audio = data.get("audio")
|
469
|
+
|
470
|
+
if data.get("error"):
|
471
|
+
logger.error("11labs error %s", data)
|
472
|
+
return
|
473
|
+
elif audio is not None:
|
474
|
+
if audio == "":
|
475
|
+
# 11labs sometimes sends empty audio, ignore
|
476
|
+
continue
|
477
|
+
|
478
|
+
b64data = base64.b64decode(audio)
|
479
|
+
frame: rtc.AudioFrame
|
480
|
+
if encoding == "mp3":
|
481
|
+
frames = mp3_decoder.decode_chunk(b64data)
|
482
|
+
frame = utils.merge_frames(frames)
|
483
|
+
else:
|
484
|
+
frame = rtc.AudioFrame(
|
485
|
+
data=b64data,
|
486
|
+
sample_rate=self._opts.sample_rate,
|
487
|
+
num_channels=1,
|
488
|
+
samples_per_channel=len(b64data) // 2,
|
489
|
+
)
|
449
490
|
|
450
491
|
text = ""
|
451
492
|
if data.get("alignment"):
|
452
|
-
text = data["alignment"].get("chars", "")
|
493
|
+
text = "".join(data["alignment"].get("chars", ""))
|
453
494
|
|
454
495
|
audio_tx.send_nowait(tts.SynthesizedAudio(text=text, data=frame))
|
455
496
|
continue
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: livekit-plugins-elevenlabs
|
3
|
-
Version: 0.5.dev0
|
3
|
+
Version: 0.6.dev0
|
4
4
|
Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
20
|
Requires-Python: >=3.9.0
|
21
21
|
Description-Content-Type: text/markdown
|
22
22
|
Requires-Dist: livekit ~=0.11
|
23
|
-
Requires-Dist: livekit-agents ~=0.
|
23
|
+
Requires-Dist: livekit-agents[codecs] ~=0.8.dev0
|
24
24
|
Requires-Dist: aiohttp >=3.8.5
|
25
25
|
|
26
26
|
# LiveKit Plugins Elevenlabs
|
@@ -0,0 +1,10 @@
|
|
1
|
+
livekit/plugins/elevenlabs/__init__.py,sha256=ez1ybDPt7GfKAKgPkxZFRB7Vyd-_i-0hfUMI79GQ5w4,1091
|
2
|
+
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
+
livekit/plugins/elevenlabs/models.py,sha256=8jTchztgpiTokHEaWUK8PPxWWfvm5SMrOGsJpzxbYAw,362
|
4
|
+
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
livekit/plugins/elevenlabs/tts.py,sha256=GTcyQwBVVPzCYLgsnw9q5oFOq9cV3hIKndDaBPSFMr4,17738
|
6
|
+
livekit/plugins/elevenlabs/version.py,sha256=yB6WnbnD5MFhQDT5ItJ02XWVsNanlDYiOezzwv0IdcM,603
|
7
|
+
livekit_plugins_elevenlabs-0.6.dev0.dist-info/METADATA,sha256=kfWET-iNGQYX7TGoo87CiMIoMINIwE28YT4-hbp8NDY,1373
|
8
|
+
livekit_plugins_elevenlabs-0.6.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
9
|
+
livekit_plugins_elevenlabs-0.6.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
+
livekit_plugins_elevenlabs-0.6.dev0.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
livekit/plugins/elevenlabs/__init__.py,sha256=_IMIfE4YA7d3NxrN-iCrdfQ19mwh93SY676RJGEA57c,989
|
2
|
-
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
-
livekit/plugins/elevenlabs/models.py,sha256=g46mCMMHP3x3qtHmybHHMcid1UwmjKCcF0T4IWjMjWE,163
|
4
|
-
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/elevenlabs/tts.py,sha256=p7mEiUDR6gbqEUrLp1lgTkJ3ounN6rhnenYoYqWNF2k,16418
|
6
|
-
livekit/plugins/elevenlabs/version.py,sha256=h2gCxcJSMvCrVP7h14ON6HaghqLCkbl3--HZKEopR_8,603
|
7
|
-
livekit_plugins_elevenlabs-0.5.dev0.dist-info/METADATA,sha256=5uCb2q4zTTGaCSSN448GLqhj9-41bg0jjR2CSeov8ms,1365
|
8
|
-
livekit_plugins_elevenlabs-0.5.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
9
|
-
livekit_plugins_elevenlabs-0.5.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
-
livekit_plugins_elevenlabs-0.5.dev0.dist-info/RECORD,,
|
File without changes
|
File without changes
|