livekit-plugins-elevenlabs 0.5.0__py3-none-any.whl → 0.5.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/elevenlabs/__init__.py +1 -10
- livekit/plugins/elevenlabs/models.py +0 -12
- livekit/plugins/elevenlabs/tts.py +19 -60
- livekit/plugins/elevenlabs/version.py +1 -1
- {livekit_plugins_elevenlabs-0.5.0.dist-info → livekit_plugins_elevenlabs-0.5.dev0.dist-info}/METADATA +2 -2
- livekit_plugins_elevenlabs-0.5.dev0.dist-info/RECORD +10 -0
- livekit_plugins_elevenlabs-0.5.0.dist-info/RECORD +0 -10
- {livekit_plugins_elevenlabs-0.5.0.dist-info → livekit_plugins_elevenlabs-0.5.dev0.dist-info}/WHEEL +0 -0
- {livekit_plugins_elevenlabs-0.5.0.dist-info → livekit_plugins_elevenlabs-0.5.dev0.dist-info}/top_level.txt +0 -0
@@ -12,19 +12,10 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
-
from .models import TTSEncoding, TTSModels
|
16
15
|
from .tts import DEFAULT_VOICE, TTS, Voice, VoiceSettings
|
17
16
|
from .version import __version__
|
18
17
|
|
19
|
-
__all__ = [
|
20
|
-
"TTS",
|
21
|
-
"Voice",
|
22
|
-
"VoiceSettings",
|
23
|
-
"TTSEncoding",
|
24
|
-
"TTSModels",
|
25
|
-
"DEFAULT_VOICE",
|
26
|
-
"__version__",
|
27
|
-
]
|
18
|
+
__all__ = ["TTS", "Voice", "VoiceSettings", "DEFAULT_VOICE", "__version__"]
|
28
19
|
|
29
20
|
from livekit.agents import Plugin
|
30
21
|
|
@@ -6,15 +6,3 @@ TTSModels = Literal[
|
|
6
6
|
"eleven_multilingual_v2",
|
7
7
|
"eleven_turbo_v2",
|
8
8
|
]
|
9
|
-
|
10
|
-
TTSEncoding = Literal[
|
11
|
-
"mp3_22050_32",
|
12
|
-
"mp3_44100_32",
|
13
|
-
"mp3_44100_64",
|
14
|
-
"mp3_44100_96",
|
15
|
-
"mp3_44100_128",
|
16
|
-
"mp3_44100_192",
|
17
|
-
"pcm_16000",
|
18
|
-
"pcm_22050",
|
19
|
-
"pcm_44100",
|
20
|
-
]
|
@@ -21,36 +21,14 @@ import dataclasses
|
|
21
21
|
import json
|
22
22
|
import os
|
23
23
|
from dataclasses import dataclass
|
24
|
-
from typing import List, Literal, Optional
|
24
|
+
from typing import List, Optional
|
25
25
|
|
26
26
|
import aiohttp
|
27
27
|
from livekit import rtc
|
28
|
-
from livekit.agents import aio, codecs, tokenize, tts, utils
|
28
|
+
from livekit.agents import aio, tokenize, tts, utils
|
29
29
|
|
30
30
|
from .log import logger
|
31
|
-
from .models import (
|
32
|
-
TTSEncoding,
|
33
|
-
TTSModels,
|
34
|
-
)
|
35
|
-
|
36
|
-
_Encoding = Literal[
|
37
|
-
"mp3",
|
38
|
-
"pcm",
|
39
|
-
]
|
40
|
-
|
41
|
-
|
42
|
-
def _sample_rate_from_format(output_format: TTSEncoding) -> int:
|
43
|
-
split = output_format.split("_") # e.g: mp3_22050_32
|
44
|
-
return int(split[1])
|
45
|
-
|
46
|
-
|
47
|
-
def _encoding_from_format(output_format: TTSEncoding) -> _Encoding:
|
48
|
-
if output_format.startswith("mp3"):
|
49
|
-
return "mp3"
|
50
|
-
elif output_format.startswith("pcm"):
|
51
|
-
return "pcm"
|
52
|
-
|
53
|
-
raise ValueError(f"Unknown format: {output_format}")
|
31
|
+
from .models import TTSModels
|
54
32
|
|
55
33
|
|
56
34
|
@dataclass
|
@@ -88,7 +66,6 @@ class _TTSOptions:
|
|
88
66
|
voice: Voice
|
89
67
|
model_id: TTSModels
|
90
68
|
base_url: str
|
91
|
-
encoding: TTSEncoding
|
92
69
|
sample_rate: int
|
93
70
|
streaming_latency: int
|
94
71
|
word_tokenizer: tokenize.WordTokenizer
|
@@ -103,7 +80,7 @@ class TTS(tts.TTS):
|
|
103
80
|
model_id: TTSModels = "eleven_turbo_v2",
|
104
81
|
api_key: str | None = None,
|
105
82
|
base_url: str | None = None,
|
106
|
-
|
83
|
+
sample_rate: int = 24000,
|
107
84
|
streaming_latency: int = 3,
|
108
85
|
word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
|
109
86
|
ignore_punctuation=False # punctuation can help for intonation
|
@@ -114,9 +91,7 @@ class TTS(tts.TTS):
|
|
114
91
|
http_session: aiohttp.ClientSession | None = None,
|
115
92
|
) -> None:
|
116
93
|
super().__init__(
|
117
|
-
streaming_supported=True,
|
118
|
-
sample_rate=_sample_rate_from_format(encoding),
|
119
|
-
num_channels=1,
|
94
|
+
streaming_supported=True, sample_rate=sample_rate, num_channels=1
|
120
95
|
)
|
121
96
|
api_key = api_key or os.environ.get("ELEVEN_API_KEY")
|
122
97
|
if not api_key:
|
@@ -127,8 +102,7 @@ class TTS(tts.TTS):
|
|
127
102
|
model_id=model_id,
|
128
103
|
api_key=api_key,
|
129
104
|
base_url=base_url or API_BASE_URL_V1,
|
130
|
-
|
131
|
-
sample_rate=self.sample_rate,
|
105
|
+
sample_rate=sample_rate,
|
132
106
|
streaming_latency=streaming_latency,
|
133
107
|
word_tokenizer=word_tokenizer,
|
134
108
|
chunk_length_schedule=chunk_length_schedule,
|
@@ -176,7 +150,7 @@ class ChunkedStream(tts.ChunkedStream):
|
|
176
150
|
base_url = self._opts.base_url
|
177
151
|
voice_id = self._opts.voice.id
|
178
152
|
model_id = self._opts.model_id
|
179
|
-
sample_rate =
|
153
|
+
sample_rate = self._opts.sample_rate
|
180
154
|
latency = self._opts.streaming_latency
|
181
155
|
url = (
|
182
156
|
f"{base_url}/text-to-speech/{voice_id}/stream?"
|
@@ -286,11 +260,11 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
286
260
|
base_url = self._opts.base_url
|
287
261
|
voice_id = self._opts.voice.id
|
288
262
|
model_id = self._opts.model_id
|
289
|
-
|
263
|
+
sample_rate = self._opts.sample_rate
|
290
264
|
latency = self._opts.streaming_latency
|
291
265
|
url = (
|
292
266
|
f"{base_url}/text-to-speech/{voice_id}/stream-input?"
|
293
|
-
f"model_id={model_id}&output_format={
|
267
|
+
f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
|
294
268
|
)
|
295
269
|
|
296
270
|
return url
|
@@ -443,8 +417,6 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
443
417
|
all_tokens_consumed = True
|
444
418
|
|
445
419
|
async def recv_task():
|
446
|
-
encoding = _encoding_from_format(self._opts.encoding)
|
447
|
-
mp3_decoder = codecs.Mp3StreamDecoder()
|
448
420
|
while True:
|
449
421
|
msg = await ws_conn.receive()
|
450
422
|
if msg.type in (
|
@@ -465,32 +437,19 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
465
437
|
continue
|
466
438
|
|
467
439
|
data: dict = json.loads(msg.data)
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
b64data = base64.b64decode(audio)
|
479
|
-
frame: rtc.AudioFrame
|
480
|
-
if encoding == "mp3":
|
481
|
-
frames = mp3_decoder.decode_chunk(b64data)
|
482
|
-
frame = utils.merge_frames(frames)
|
483
|
-
else:
|
484
|
-
frame = rtc.AudioFrame(
|
485
|
-
data=b64data,
|
486
|
-
sample_rate=self._opts.sample_rate,
|
487
|
-
num_channels=1,
|
488
|
-
samples_per_channel=len(b64data) // 2,
|
489
|
-
)
|
440
|
+
if data.get("audio"):
|
441
|
+
b64data = base64.b64decode(data["audio"])
|
442
|
+
|
443
|
+
frame = rtc.AudioFrame(
|
444
|
+
data=b64data,
|
445
|
+
sample_rate=self._opts.sample_rate,
|
446
|
+
num_channels=1,
|
447
|
+
samples_per_channel=len(b64data) // 2,
|
448
|
+
)
|
490
449
|
|
491
450
|
text = ""
|
492
451
|
if data.get("alignment"):
|
493
|
-
text =
|
452
|
+
text = data["alignment"].get("chars", "")
|
494
453
|
|
495
454
|
audio_tx.send_nowait(tts.SynthesizedAudio(text=text, data=frame))
|
496
455
|
continue
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: livekit-plugins-elevenlabs
|
3
|
-
Version: 0.5.0
|
3
|
+
Version: 0.5.dev0
|
4
4
|
Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
20
|
Requires-Python: >=3.9.0
|
21
21
|
Description-Content-Type: text/markdown
|
22
22
|
Requires-Dist: livekit ~=0.11
|
23
|
-
Requires-Dist: livekit-agents
|
23
|
+
Requires-Dist: livekit-agents ~=0.7.dev0
|
24
24
|
Requires-Dist: aiohttp >=3.8.5
|
25
25
|
|
26
26
|
# LiveKit Plugins Elevenlabs
|
@@ -0,0 +1,10 @@
|
|
1
|
+
livekit/plugins/elevenlabs/__init__.py,sha256=_IMIfE4YA7d3NxrN-iCrdfQ19mwh93SY676RJGEA57c,989
|
2
|
+
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
+
livekit/plugins/elevenlabs/models.py,sha256=g46mCMMHP3x3qtHmybHHMcid1UwmjKCcF0T4IWjMjWE,163
|
4
|
+
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
livekit/plugins/elevenlabs/tts.py,sha256=p7mEiUDR6gbqEUrLp1lgTkJ3ounN6rhnenYoYqWNF2k,16418
|
6
|
+
livekit/plugins/elevenlabs/version.py,sha256=h2gCxcJSMvCrVP7h14ON6HaghqLCkbl3--HZKEopR_8,603
|
7
|
+
livekit_plugins_elevenlabs-0.5.dev0.dist-info/METADATA,sha256=5uCb2q4zTTGaCSSN448GLqhj9-41bg0jjR2CSeov8ms,1365
|
8
|
+
livekit_plugins_elevenlabs-0.5.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
9
|
+
livekit_plugins_elevenlabs-0.5.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
+
livekit_plugins_elevenlabs-0.5.dev0.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
livekit/plugins/elevenlabs/__init__.py,sha256=ez1ybDPt7GfKAKgPkxZFRB7Vyd-_i-0hfUMI79GQ5w4,1091
|
2
|
-
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
-
livekit/plugins/elevenlabs/models.py,sha256=8jTchztgpiTokHEaWUK8PPxWWfvm5SMrOGsJpzxbYAw,362
|
4
|
-
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/elevenlabs/tts.py,sha256=GTcyQwBVVPzCYLgsnw9q5oFOq9cV3hIKndDaBPSFMr4,17738
|
6
|
-
livekit/plugins/elevenlabs/version.py,sha256=pZ7bgeWLjw4VCWymU1ntHaHorKRusUkm56y6tZe5gmQ,600
|
7
|
-
livekit_plugins_elevenlabs-0.5.0.dist-info/METADATA,sha256=nmaTaWHwzuzT9nBjaLsJlzTAanMsxl7lv8wH5Sq7boI,1367
|
8
|
-
livekit_plugins_elevenlabs-0.5.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
9
|
-
livekit_plugins_elevenlabs-0.5.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
-
livekit_plugins_elevenlabs-0.5.0.dist-info/RECORD,,
|
{livekit_plugins_elevenlabs-0.5.0.dist-info → livekit_plugins_elevenlabs-0.5.dev0.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|