livekit-plugins-hume 1.0.22__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of livekit-plugins-hume might be problematic. Click here for more details.
- {livekit_plugins_hume-1.0.22 → livekit_plugins_hume-1.1.0}/.gitignore +3 -0
- {livekit_plugins_hume-1.0.22 → livekit_plugins_hume-1.1.0}/PKG-INFO +2 -3
- {livekit_plugins_hume-1.0.22 → livekit_plugins_hume-1.1.0}/livekit/plugins/hume/__init__.py +3 -25
- livekit_plugins_hume-1.1.0/livekit/plugins/hume/tts.py +180 -0
- {livekit_plugins_hume-1.0.22 → livekit_plugins_hume-1.1.0}/livekit/plugins/hume/version.py +1 -1
- {livekit_plugins_hume-1.0.22 → livekit_plugins_hume-1.1.0}/pyproject.toml +1 -2
- livekit_plugins_hume-1.0.22/livekit/plugins/hume/models.py +0 -0
- livekit_plugins_hume-1.0.22/livekit/plugins/hume/tts.py +0 -283
- {livekit_plugins_hume-1.0.22 → livekit_plugins_hume-1.1.0}/README.md +0 -0
- {livekit_plugins_hume-1.0.22 → livekit_plugins_hume-1.1.0}/livekit/plugins/hume/log.py +0 -0
- {livekit_plugins_hume-1.0.22 → livekit_plugins_hume-1.1.0}/livekit/plugins/hume/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: livekit-plugins-hume
|
|
3
|
-
Version: 1.0.22
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Hume TTS plugin for LiveKit agents
|
|
5
5
|
Project-URL: Documentation, https://docs.livekit.io
|
|
6
6
|
Project-URL: Website, https://livekit.io/
|
|
@@ -17,8 +17,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
|
|
|
17
17
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
18
|
Requires-Python: >=3.9.0
|
|
19
19
|
Requires-Dist: aiohttp>=3.8.0
|
|
20
|
-
Requires-Dist:
|
|
21
|
-
Requires-Dist: livekit-agents>=1.0.22
|
|
20
|
+
Requires-Dist: livekit-agents>=1.1.0
|
|
22
21
|
Description-Content-Type: text/markdown
|
|
23
22
|
|
|
24
23
|
# Hume AI TTS plugin for LiveKit Agents
|
|
@@ -19,25 +19,12 @@ See https://docs.livekit.io/agents/integrations/tts/hume/ for more information.
|
|
|
19
19
|
|
|
20
20
|
from __future__ import annotations
|
|
21
21
|
|
|
22
|
-
__version__ = "1.0.0"
|
|
23
|
-
|
|
24
|
-
# make imports available
|
|
25
|
-
from hume.tts import (
|
|
26
|
-
Format,
|
|
27
|
-
PostedContext,
|
|
28
|
-
PostedUtteranceVoice,
|
|
29
|
-
)
|
|
30
22
|
from livekit.agents import Plugin
|
|
31
23
|
|
|
32
|
-
from .tts import TTS
|
|
24
|
+
from .tts import TTS, PostedContext, PostedUtterance
|
|
25
|
+
from .version import __version__
|
|
33
26
|
|
|
34
|
-
|
|
35
|
-
__all__ = [
|
|
36
|
-
"TTS",
|
|
37
|
-
"Format",
|
|
38
|
-
"PostedContext",
|
|
39
|
-
"PostedUtteranceVoice",
|
|
40
|
-
]
|
|
27
|
+
__all__ = ["TTS", "PostedContext", "PostedUtterance"]
|
|
41
28
|
|
|
42
29
|
|
|
43
30
|
class HumeAIPlugin(Plugin):
|
|
@@ -55,12 +42,3 @@ __pdoc__ = {}
|
|
|
55
42
|
|
|
56
43
|
for n in NOT_IN_ALL:
|
|
57
44
|
__pdoc__[n] = False
|
|
58
|
-
|
|
59
|
-
# Cleanup docs of unexported modules
|
|
60
|
-
_module = dir()
|
|
61
|
-
NOT_IN_ALL = [m for m in _module if m not in __all__]
|
|
62
|
-
|
|
63
|
-
__pdoc__ = {}
|
|
64
|
-
|
|
65
|
-
for n in NOT_IN_ALL:
|
|
66
|
-
__pdoc__[n] = False
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# Copyright 2023 LiveKit, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import base64
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
from dataclasses import dataclass, replace
|
|
22
|
+
from typing import Any, TypedDict
|
|
23
|
+
|
|
24
|
+
import aiohttp
|
|
25
|
+
|
|
26
|
+
from livekit.agents import (
    APIConnectionError,
    APIConnectOptions,
    APIStatusError,
    APITimeoutError,
    tts,
    utils,
)
|
|
27
|
+
from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
|
|
28
|
+
from livekit.agents.utils import is_given
|
|
29
|
+
|
|
30
|
+
API_AUTH_HEADER = "X-Hume-Api-Key"
|
|
31
|
+
STREAM_PATH = "/v0/tts/stream/json"
|
|
32
|
+
DEFAULT_BASE_URL = "https://api.hume.ai"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class PostedUtterance(TypedDict, total=False):
    """Fields of one utterance in a Hume TTS request.

    ``total=False`` makes every key optional, so callers supply only the
    fields they want to override.
    """

    # Text to be spoken.
    text: str
    # Natural-language instructions describing how the speech should sound.
    description: str
    # Voice selection, kept as a loosely typed mapping.
    voice: dict[str, Any]
    # Relative speaking rate.
    speed: float
    # NOTE(review): presumably seconds of silence appended after the
    # utterance; confirm against the Hume API reference.
    trailing_silence: float
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class PostedContext(TypedDict, total=False):
    """Context sent alongside a request.

    Holds utterances used as context so that speech style and prosody stay
    consistent across requests. The single key is optional (``total=False``).
    """

    # Utterances that serve as context for the generation.
    utterances: list[PostedUtterance]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
|
|
48
|
+
class _TTSOptions:
|
|
49
|
+
api_key: str
|
|
50
|
+
utterance_options: PostedUtterance
|
|
51
|
+
context: PostedContext | None
|
|
52
|
+
sample_rate: int
|
|
53
|
+
split_utterances: bool
|
|
54
|
+
instant_mode: bool
|
|
55
|
+
base_url: str
|
|
56
|
+
|
|
57
|
+
def http_url(self, path: str) -> str:
|
|
58
|
+
return f"{self.base_url}{path}"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class TTS(tts.TTS):
    """Hume text-to-speech using the JSON streaming HTTP endpoint."""

    def __init__(
        self,
        *,
        api_key: str | None = None,
        utterance_options: NotGivenOr[PostedUtterance] = NOT_GIVEN,
        split_utterances: bool = True,
        instant_mode: bool = True,
        sample_rate: int = 24000,
        base_url: str = DEFAULT_BASE_URL,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """Create a Hume TTS instance.

        Args:
            api_key: Hume API key. Falls back to the ``HUME_API_KEY``
                environment variable when omitted or empty.
            utterance_options: Utterance fields merged over the defaults
                (``speed=1.0``, ``trailing_silence=0.35``) and sent with
                every request.
            split_utterances: Sent as ``split_utterances`` in the payload.
            instant_mode: Sent as ``instant_mode`` in the payload.
            sample_rate: Sample rate passed to the base class and to the
                audio emitter.
            base_url: Prefix used to build the endpoint URL.
            http_session: Session used for requests. When omitted, one is
                obtained from ``utils.http_context.http_session()`` on first
                use.

        Raises:
            ValueError: If no API key is given and ``HUME_API_KEY`` is unset.
        """
        super().__init__(
            # This class implements synthesize() only and has no stream()
            # override, so native streaming must not be advertised.
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=sample_rate,
            num_channels=1,
        )
        key = api_key or os.environ.get("HUME_API_KEY")
        if not key:
            raise ValueError("Hume API key is required via api_key or HUME_API_KEY env var")

        # Start from the defaults, then let caller-supplied fields win.
        default_utterance: PostedUtterance = {
            "speed": 1.0,
            "trailing_silence": 0.35,
        }
        if is_given(utterance_options):
            default_utterance.update(utterance_options)

        self._opts = _TTSOptions(
            api_key=key,
            utterance_options=default_utterance,
            context=None,
            sample_rate=sample_rate,
            split_utterances=split_utterances,
            instant_mode=instant_mode,
            base_url=base_url,
        )
        self._session = http_session

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, obtaining one lazily if none was given."""
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

    def update_options(
        self,
        *,
        utterance_options: NotGivenOr[PostedUtterance] = NOT_GIVEN,
        context: NotGivenOr[PostedContext] = NOT_GIVEN,
        split_utterances: NotGivenOr[bool] = NOT_GIVEN,
        instant_mode: NotGivenOr[bool] = NOT_GIVEN,
    ) -> None:
        """Change options used by subsequently created streams.

        Only the arguments actually given are applied. ``utterance_options``
        replaces the stored mapping wholesale; it is not merged with the
        defaults set by the constructor.
        """
        if is_given(utterance_options):
            self._opts.utterance_options = utterance_options
        if is_given(context):
            self._opts.context = context
        if is_given(split_utterances):
            self._opts.split_utterances = split_utterances
        if is_given(instant_mode):
            self._opts.instant_mode = instant_mode

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> tts.ChunkedStream:
        """Return a chunked stream that synthesizes ``text``."""
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class ChunkedStream(tts.ChunkedStream):
    """Synthesizes one input text through Hume's JSON streaming endpoint."""

    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
        """Bind the stream to ``tts`` and snapshot its current options."""
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TTS = tts
        # Shallow copy: later TTS.update_options() calls rebind fields on the
        # TTS's own options object and therefore do not affect this request.
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """POST the request and push each decoded audio chunk to the emitter.

        Raises:
            APITimeoutError: If the request times out.
            APIStatusError: If the server answers with an HTTP error status.
            APIConnectionError: For any other failure.
        """
        # Apply the input text last so that a "text" key left in the
        # configured utterance options cannot replace what should be spoken.
        utterance: PostedUtterance = self._opts.utterance_options.copy()
        utterance["text"] = self._input_text

        payload: dict[str, Any] = {
            "utterances": [utterance],
            "split_utterances": self._opts.split_utterances,
            "strip_headers": True,
            "instant_mode": self._opts.instant_mode,
            "format": {"type": "mp3"},
        }
        if self._opts.context:
            payload["context"] = self._opts.context

        try:
            async with self._tts._ensure_session().post(
                self._opts.http_url(STREAM_PATH),
                headers={API_AUTH_HEADER: self._opts.api_key},
                json=payload,
                timeout=aiohttp.ClientTimeout(total=None, sock_connect=self._conn_options.timeout),
                # large read_bufsize to avoid `ValueError: Chunk too big`
                read_bufsize=10 * 1024 * 1024,
            ) as resp:
                resp.raise_for_status()
                output_emitter.initialize(
                    request_id=utils.shortuuid(),
                    sample_rate=self._opts.sample_rate,
                    num_channels=self._tts.num_channels,
                    mime_type="audio/mp3",
                )

                # The body is read line by line; each non-blank line is
                # parsed as a JSON object.
                async for raw_line in resp.content:
                    line = raw_line.strip()
                    if not line:
                        continue

                    data = json.loads(line.decode())
                    audio_b64 = data.get("audio")
                    if audio_b64:
                        output_emitter.push(base64.b64decode(audio_b64))

                output_emitter.flush()
        except asyncio.TimeoutError:
            raise APITimeoutError() from None
        except aiohttp.ClientResponseError as e:
            # Surface HTTP status failures (e.g. a rejected API key) with
            # their status code instead of a generic connection error.
            raise APIStatusError(
                message=e.message, status_code=e.status, request_id=None, body=None
            ) from None
        except Exception as e:
            raise APIConnectionError() from e
|
|
File without changes
|
|
@@ -1,283 +0,0 @@
|
|
|
1
|
-
# Copyright 2023 LiveKit, Inc.
|
|
2
|
-
#
|
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
-
# you may not use this file except in compliance with the License.
|
|
5
|
-
# You may obtain a copy of the License at
|
|
6
|
-
#
|
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
-
#
|
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
-
# See the License for the specific language governing permissions and
|
|
13
|
-
# limitations under the License.
|
|
14
|
-
|
|
15
|
-
from __future__ import annotations
|
|
16
|
-
|
|
17
|
-
import asyncio
|
|
18
|
-
import base64
|
|
19
|
-
import os
|
|
20
|
-
from dataclasses import dataclass
|
|
21
|
-
|
|
22
|
-
import aiohttp
|
|
23
|
-
|
|
24
|
-
from hume import AsyncHumeClient
|
|
25
|
-
from hume.tts import Format, FormatWav, PostedContext, PostedUtterance, PostedUtteranceVoice
|
|
26
|
-
from livekit.agents import (
|
|
27
|
-
APIConnectionError,
|
|
28
|
-
APIConnectOptions,
|
|
29
|
-
APITimeoutError,
|
|
30
|
-
tokenize,
|
|
31
|
-
tts,
|
|
32
|
-
utils,
|
|
33
|
-
)
|
|
34
|
-
from livekit.agents.types import (
|
|
35
|
-
DEFAULT_API_CONNECT_OPTIONS,
|
|
36
|
-
NOT_GIVEN,
|
|
37
|
-
NotGivenOr,
|
|
38
|
-
)
|
|
39
|
-
from livekit.agents.utils import is_given
|
|
40
|
-
|
|
41
|
-
# Default audio settings
|
|
42
|
-
DEFAULT_SAMPLE_RATE = 48000
|
|
43
|
-
DEFAULT_NUM_CHANNELS = 1
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
@dataclass
|
|
47
|
-
class _TTSOptions:
|
|
48
|
-
"""TTS options for Hume API"""
|
|
49
|
-
|
|
50
|
-
api_key: str
|
|
51
|
-
voice: PostedUtteranceVoice | None
|
|
52
|
-
description: str | None
|
|
53
|
-
speed: float | None
|
|
54
|
-
context: PostedContext | None
|
|
55
|
-
format: Format
|
|
56
|
-
strip_headers: bool
|
|
57
|
-
instant_mode: bool
|
|
58
|
-
word_tokenizer: tokenize.WordTokenizer
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
class TTS(tts.TTS):
|
|
62
|
-
def __init__(
|
|
63
|
-
self,
|
|
64
|
-
*,
|
|
65
|
-
voice: NotGivenOr[PostedUtteranceVoice] = NOT_GIVEN,
|
|
66
|
-
description: NotGivenOr[str] = NOT_GIVEN,
|
|
67
|
-
speed: NotGivenOr[float] = NOT_GIVEN,
|
|
68
|
-
context: NotGivenOr[PostedContext] = NOT_GIVEN,
|
|
69
|
-
format: NotGivenOr[Format] = NOT_GIVEN,
|
|
70
|
-
instant_mode: bool = False,
|
|
71
|
-
strip_headers: bool = True,
|
|
72
|
-
api_key: NotGivenOr[str] = NOT_GIVEN,
|
|
73
|
-
word_tokenizer: tokenize.WordTokenizer | None = None,
|
|
74
|
-
http_session: aiohttp.ClientSession | None = None,
|
|
75
|
-
) -> None:
|
|
76
|
-
"""Initialize the Hume TTS client.
|
|
77
|
-
|
|
78
|
-
See https://dev.hume.ai/reference/text-to-speech-tts/synthesize-json-streaming for API doc
|
|
79
|
-
|
|
80
|
-
Args:
|
|
81
|
-
voice (NotGivenOr[PostedUtteranceVoice]): The voice, specified by name or id, to be
|
|
82
|
-
used. When no voice is specified, a novel voice will be generated based on the
|
|
83
|
-
text and optionally provided description.
|
|
84
|
-
description (NotGivenOr[str]): Natural language instructions describing how the
|
|
85
|
-
synthesized speech should sound, including but not limited to tone, intonation,
|
|
86
|
-
pacing, and accent. If a Voice is specified in the request, this description
|
|
87
|
-
serves as acting instructions. If no Voice is specified, a new voice is generated
|
|
88
|
-
based on this description.
|
|
89
|
-
speed: (NotGivenOr[float]): Adjusts the relative speaking rate on a non-linear scale
|
|
90
|
-
from 0.25 (much slower) to 3.0 (much faster), where 1.0 represents normal speaking
|
|
91
|
-
pace.
|
|
92
|
-
context (NotGivenOr[PostedContext]): Utterances to use as context for generating
|
|
93
|
-
consistent speech style and prosody across multiple requests.
|
|
94
|
-
format (NotGivenOr[Format]): Specifies the output audio file format (WAV, MP3 or PCM).
|
|
95
|
-
Defaults to WAV format.
|
|
96
|
-
instant_mode (bool): Enables ultra-low latency streaming, reducing time to first chunk.
|
|
97
|
-
Recommended for real-time applications. Only for streaming endpoints.
|
|
98
|
-
With this enabled, requests incur 10% higher cost. Defaults to False.
|
|
99
|
-
strip_headers (bool): If enabled, the audio for all the chunks of a generation.
|
|
100
|
-
Once concatenated together, will constitute a single audio file.
|
|
101
|
-
If disabled, each chunk’s audio will be its own audio file, each with its headers.
|
|
102
|
-
api_key (NotGivenOr[str]): Hume API key for authentication. If not provided,
|
|
103
|
-
will attempt to read from HUME_API_KEY environment variable.
|
|
104
|
-
word_tokenizer (tokenize.WordTokenizer | None): Custom word tokenizer to use for text.
|
|
105
|
-
If None, a basic word tokenizer will be used.
|
|
106
|
-
http_session (aiohttp.ClientSession | None): Optional HTTP session for API requests.
|
|
107
|
-
If None, a new session will be created.
|
|
108
|
-
"""
|
|
109
|
-
|
|
110
|
-
super().__init__(
|
|
111
|
-
capabilities=tts.TTSCapabilities(
|
|
112
|
-
streaming=False,
|
|
113
|
-
),
|
|
114
|
-
sample_rate=DEFAULT_SAMPLE_RATE,
|
|
115
|
-
num_channels=DEFAULT_NUM_CHANNELS,
|
|
116
|
-
)
|
|
117
|
-
|
|
118
|
-
self._api_key = api_key if is_given(api_key) else os.environ.get("HUME_API_KEY")
|
|
119
|
-
if not self._api_key:
|
|
120
|
-
raise ValueError(
|
|
121
|
-
"Hume API key is required, either as argument or set HUME_API_KEY env variable"
|
|
122
|
-
)
|
|
123
|
-
|
|
124
|
-
if not word_tokenizer:
|
|
125
|
-
word_tokenizer = tokenize.basic.WordTokenizer(ignore_punctuation=False)
|
|
126
|
-
|
|
127
|
-
self._opts = _TTSOptions(
|
|
128
|
-
voice=voice if is_given(voice) else None,
|
|
129
|
-
description=description if is_given(description) else None,
|
|
130
|
-
speed=speed if is_given(speed) else None,
|
|
131
|
-
context=context if is_given(context) else None,
|
|
132
|
-
format=format if is_given(format) else FormatWav(),
|
|
133
|
-
api_key=self._api_key,
|
|
134
|
-
strip_headers=strip_headers,
|
|
135
|
-
instant_mode=instant_mode,
|
|
136
|
-
word_tokenizer=word_tokenizer,
|
|
137
|
-
)
|
|
138
|
-
|
|
139
|
-
self._client = AsyncHumeClient(api_key=self._api_key)
|
|
140
|
-
self._session = http_session
|
|
141
|
-
|
|
142
|
-
def _ensure_session(self) -> aiohttp.ClientSession:
|
|
143
|
-
if not self._session:
|
|
144
|
-
self._session = utils.http_context.http_session()
|
|
145
|
-
return self._session
|
|
146
|
-
|
|
147
|
-
def update_options(
|
|
148
|
-
self,
|
|
149
|
-
*,
|
|
150
|
-
voice: NotGivenOr[PostedUtteranceVoice] = NOT_GIVEN,
|
|
151
|
-
description: NotGivenOr[str] = NOT_GIVEN,
|
|
152
|
-
speed: NotGivenOr[float] = NOT_GIVEN,
|
|
153
|
-
context: NotGivenOr[PostedContext] = NOT_GIVEN,
|
|
154
|
-
format: NotGivenOr[Format] = NOT_GIVEN,
|
|
155
|
-
instant_mode: NotGivenOr[bool] = NOT_GIVEN,
|
|
156
|
-
strip_headers: NotGivenOr[bool] = NOT_GIVEN,
|
|
157
|
-
) -> None:
|
|
158
|
-
"""Update TTS options for synthesizing speech.
|
|
159
|
-
|
|
160
|
-
Args:
|
|
161
|
-
voice (NotGivenOr[PostedUtteranceVoice]): The voice, specified by name or id, to be
|
|
162
|
-
used. When no voice is specified, a novel voice will be generated based on the
|
|
163
|
-
text and optionally provided description.
|
|
164
|
-
description (NotGivenOr[str]): Natural language instructions describing how the
|
|
165
|
-
synthesized speech should sound, including but not limited to tone, intonation,
|
|
166
|
-
pacing, and accent. If a Voice is specified in the request, this description
|
|
167
|
-
serves as acting instructions. If no Voice is specified, a new voice is generated
|
|
168
|
-
based on this description.
|
|
169
|
-
speed: (NotGivenOr[float]): Adjusts the relative speaking rate on a non-linear scale
|
|
170
|
-
from 0.25 (much slower) to 3.0 (much faster), where 1.0 represents normal speaking
|
|
171
|
-
pace.
|
|
172
|
-
context (Optional[PostedContext]): Utterances to use as context for generating
|
|
173
|
-
consistent speech style and prosody across multiple requests.
|
|
174
|
-
format (NotGivenOr[Format]): Specifies the output audio file format (WAV, MP3 or PCM).
|
|
175
|
-
instant_mode (NotGivenOr[bool]): Enables ultra-low latency streaming.
|
|
176
|
-
Reduces time to first audio chunk, recommended for real-time applications.
|
|
177
|
-
Note: Incurs 10% higher cost when enabled.
|
|
178
|
-
strip_headers (NotGivenOr[bool]): If enabled, the audio for the chunks of a generation.
|
|
179
|
-
Once concatenated together, will constitute a single audio file.
|
|
180
|
-
If disabled, each chunk’s audio will be its own audio file, each with its headers.
|
|
181
|
-
"""
|
|
182
|
-
|
|
183
|
-
if is_given(voice):
|
|
184
|
-
self._opts.voice = voice
|
|
185
|
-
if is_given(description):
|
|
186
|
-
self._opts.description = description
|
|
187
|
-
if is_given(speed):
|
|
188
|
-
self._opts.speed = speed
|
|
189
|
-
if is_given(format):
|
|
190
|
-
self._opts.format = format
|
|
191
|
-
if is_given(context):
|
|
192
|
-
self._opts.context = context
|
|
193
|
-
if is_given(instant_mode):
|
|
194
|
-
self._opts.instant_mode = instant_mode
|
|
195
|
-
if is_given(strip_headers):
|
|
196
|
-
self._opts.strip_headers = strip_headers
|
|
197
|
-
|
|
198
|
-
def synthesize(
|
|
199
|
-
self,
|
|
200
|
-
text: str,
|
|
201
|
-
*,
|
|
202
|
-
conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
|
|
203
|
-
) -> ChunkedStream:
|
|
204
|
-
return ChunkedStream(
|
|
205
|
-
tts=self,
|
|
206
|
-
input_text=text,
|
|
207
|
-
conn_options=conn_options,
|
|
208
|
-
opts=self._opts,
|
|
209
|
-
)
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
class ChunkedStream(tts.ChunkedStream):
|
|
213
|
-
"""Stream for Hume TTS JSON streaming API."""
|
|
214
|
-
|
|
215
|
-
def __init__(
|
|
216
|
-
self,
|
|
217
|
-
*,
|
|
218
|
-
tts: TTS,
|
|
219
|
-
input_text: str,
|
|
220
|
-
opts: _TTSOptions,
|
|
221
|
-
conn_options: APIConnectOptions,
|
|
222
|
-
) -> None:
|
|
223
|
-
super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
|
|
224
|
-
self._opts = opts
|
|
225
|
-
self._client = tts._client
|
|
226
|
-
|
|
227
|
-
async def _run(self) -> None:
|
|
228
|
-
request_id = utils.shortuuid()
|
|
229
|
-
|
|
230
|
-
decoder = utils.codecs.AudioStreamDecoder(
|
|
231
|
-
sample_rate=DEFAULT_SAMPLE_RATE,
|
|
232
|
-
num_channels=DEFAULT_NUM_CHANNELS,
|
|
233
|
-
)
|
|
234
|
-
|
|
235
|
-
decode_task: asyncio.Task | None = None
|
|
236
|
-
|
|
237
|
-
try:
|
|
238
|
-
|
|
239
|
-
async def _decode_loop():
|
|
240
|
-
utterance_options = {
|
|
241
|
-
"voice": self._opts.voice,
|
|
242
|
-
"description": self._opts.description,
|
|
243
|
-
"speed": self._opts.speed,
|
|
244
|
-
}
|
|
245
|
-
|
|
246
|
-
utterance_kwargs = {
|
|
247
|
-
"text": self._input_text,
|
|
248
|
-
**{k: v for k, v in utterance_options.items() if v is not None},
|
|
249
|
-
}
|
|
250
|
-
|
|
251
|
-
try:
|
|
252
|
-
utterance = PostedUtterance(**utterance_kwargs)
|
|
253
|
-
|
|
254
|
-
async for chunk in self._client.tts.synthesize_json_streaming(
|
|
255
|
-
utterances=[utterance],
|
|
256
|
-
context=self._opts.context,
|
|
257
|
-
format=self._opts.format,
|
|
258
|
-
instant_mode=self._opts.instant_mode,
|
|
259
|
-
strip_headers=self._opts.strip_headers,
|
|
260
|
-
):
|
|
261
|
-
decoder.push(base64.b64decode(chunk.audio))
|
|
262
|
-
|
|
263
|
-
finally:
|
|
264
|
-
decoder.end_input()
|
|
265
|
-
|
|
266
|
-
decode_task = asyncio.create_task(_decode_loop())
|
|
267
|
-
emitter = tts.SynthesizedAudioEmitter(
|
|
268
|
-
event_ch=self._event_ch,
|
|
269
|
-
request_id=request_id,
|
|
270
|
-
)
|
|
271
|
-
async for frame in decoder:
|
|
272
|
-
emitter.push(frame)
|
|
273
|
-
|
|
274
|
-
emitter.flush()
|
|
275
|
-
|
|
276
|
-
except asyncio.TimeoutError:
|
|
277
|
-
raise APITimeoutError() from None
|
|
278
|
-
except Exception as e:
|
|
279
|
-
raise APIConnectionError() from e
|
|
280
|
-
finally:
|
|
281
|
-
if decode_task:
|
|
282
|
-
await utils.aio.gracefully_cancel(decode_task)
|
|
283
|
-
await decoder.aclose()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|