livekit-plugins-hume 1.0.22__tar.gz → 1.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -146,6 +146,9 @@ venv.bak/
146
146
  .dmypy.json
147
147
  dmypy.json
148
148
 
149
+ # trunk
150
+ .trunk/
151
+
149
152
  # Pyre type checker
150
153
  .pyre/
151
154
 
@@ -166,4 +169,11 @@ node_modules
166
169
 
167
170
  credentials.json
168
171
  pyrightconfig.json
169
- docs/
172
+ docs/
173
+
174
+ # Database files
175
+ *.db
176
+
177
+
178
+ # Examples for development
179
+ examples/dev/*
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-hume
3
- Version: 1.0.22
3
+ Version: 1.3.3
4
4
  Summary: Hume TTS plugin for LiveKit agents
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
7
7
  Project-URL: Source, https://github.com/livekit/agents
8
8
  Author-email: LiveKit <info@livekit.io>
9
9
  License-Expression: Apache-2.0
10
- Keywords: Hume,HumeAI,Octave,audio,livekit,realtime,webrtc
10
+ Keywords: Hume,HumeAI,Octave,ai,audio,livekit,realtime,video,voice
11
11
  Classifier: Intended Audience :: Developers
12
12
  Classifier: Programming Language :: Python :: 3
13
13
  Classifier: Programming Language :: Python :: 3 :: Only
@@ -17,8 +17,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
17
17
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
18
  Requires-Python: >=3.9.0
19
19
  Requires-Dist: aiohttp>=3.8.0
20
- Requires-Dist: hume>=0.8.3
21
- Requires-Dist: livekit-agents>=1.0.22
20
+ Requires-Dist: livekit-agents>=1.3.3
22
21
  Description-Content-Type: text/markdown
23
22
 
24
23
  # Hume AI TTS plugin for LiveKit Agents
@@ -19,24 +19,26 @@ See https://docs.livekit.io/agents/integrations/tts/hume/ for more information.
19
19
 
20
20
  from __future__ import annotations
21
21
 
22
- __version__ = "1.0.0"
23
-
24
- # make imports available
25
- from hume.tts import (
26
- Format,
27
- PostedContext,
28
- PostedUtteranceVoice,
29
- )
30
22
  from livekit.agents import Plugin
31
23
 
32
- from .tts import TTS
24
# Every name listed in ``__all__`` must be bound in this module; ``ModelVersion``
# is exported there, so it has to be imported alongside the other public names
# (otherwise ``from livekit.plugins.hume import *`` raises AttributeError).
from .tts import (
    TTS,
    AudioFormat,
    ModelVersion,
    Utterance,
    VoiceById,
    VoiceByName,
    VoiceProvider,
)
32
+ from .version import __version__
33
33
 
34
- # all exports
35
34
  __all__ = [
36
35
  "TTS",
37
- "Format",
38
- "PostedContext",
39
- "PostedUtteranceVoice",
36
+ "AudioFormat",
37
+ "ModelVersion",
38
+ "VoiceById",
39
+ "VoiceByName",
40
+ "VoiceProvider",
41
+ "Utterance",
40
42
  ]
41
43
 
42
44
 
@@ -55,12 +57,3 @@ __pdoc__ = {}
55
57
 
56
58
  for n in NOT_IN_ALL:
57
59
  __pdoc__[n] = False
58
-
59
- # Cleanup docs of unexported modules
60
- _module = dir()
61
- NOT_IN_ALL = [m for m in _module if m not in __all__]
62
-
63
- __pdoc__ = {}
64
-
65
- for n in NOT_IN_ALL:
66
- __pdoc__[n] = False
@@ -0,0 +1,304 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import base64
19
+ import json
20
+ import os
21
+ from dataclasses import dataclass, replace
22
+ from enum import Enum
23
+ from typing import Any, Literal, TypedDict
24
+
25
+ import aiohttp
26
+
27
+ from livekit.agents import (
28
+ APIConnectionError,
29
+ APIConnectOptions,
30
+ APIError,
31
+ APITimeoutError,
32
+ tts,
33
+ utils,
34
+ )
35
+ from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
36
+ from livekit.agents.utils import is_given
37
+
38
+ from .version import __version__
39
+
40
+
41
class _VoiceByIdRequired(TypedDict):
    """Keys every ``VoiceById`` must carry."""

    id: str


class VoiceById(_VoiceByIdRequired, total=False):
    """Reference to a voice-library voice by its id.

    ``id`` is required; ``provider`` may be omitted. The required/optional
    split is expressed through inheritance (rather than ``typing.Required``)
    so it keeps working on Python 3.9. At runtime instances are plain dicts,
    so existing callers are unaffected.
    """

    # Which voice library the id belongs to (see ``VoiceProvider``).
    provider: VoiceProvider | None
44
+
45
+
46
class _VoiceByNameRequired(TypedDict):
    """Keys every ``VoiceByName`` must carry."""

    name: str


class VoiceByName(_VoiceByNameRequired, total=False):
    """Reference to a voice-library voice by its name.

    ``name`` is required; ``provider`` may be omitted. The required/optional
    split is expressed through inheritance (rather than ``typing.Required``)
    so it keeps working on Python 3.9. At runtime instances are plain dicts,
    so existing callers are unaffected.
    """

    # Which voice library the name belongs to (see ``VoiceProvider``).
    provider: VoiceProvider | None
49
+
50
+
51
class _UtteranceRequired(TypedDict):
    """Keys every ``Utterance`` must carry."""

    text: str


class Utterance(_UtteranceRequired, total=False):
    """Utterance for TTS synthesis.

    ``text`` is required; every other key is optional and is simply left out
    of the request when not set. The required/optional split is expressed
    through inheritance (rather than ``typing.Required``) so it keeps working
    on Python 3.9. At runtime instances are plain dicts.
    """

    # Natural-language instructions for how the speech should sound.
    description: str | None
    # Speed multiplier for the synthesized speech.
    speed: float | None
    # Voice-library voice to speak with, referenced by id or by name.
    voice: VoiceById | VoiceByName | None
    # Seconds of silence appended after the utterance.
    trailing_silence: float | None
59
+
60
+
61
class VoiceProvider(str, Enum):
    """Voice library that a voice reference points into.

    Members are ``str`` instances, so they serialize to their wire value
    when placed in a JSON payload.
    """

    hume = "HUME_AI"
    custom = "CUSTOM_VOICE"
66
+
67
+
68
class AudioFormat(str, Enum):
    """Container/encoding requested for the synthesized audio.

    The member value is used both as the request's ``format.type`` and as
    the ``audio/<value>`` MIME subtype handed to the audio emitter.
    """

    mp3 = "mp3"
    wav = "wav"
    pcm = "pcm"
74
+
75
+
76
# Accepted values for the request's "version" field (Octave model version).
ModelVersion = Literal["1", "2"]


# --- HTTP endpoint -----------------------------------------------------------
DEFAULT_BASE_URL = "https://api.hume.ai"
STREAM_PATH = "/v0/tts/stream/json"

# --- HTTP headers ------------------------------------------------------------
# Name of the header that carries the API key.
API_AUTH_HEADER = "X-Hume-Api-Key"
# Client-identification headers attached to every request.
DEFAULT_HEADERS = {
    "X-Hume-Client-Name": "livekit",
    "X-Hume-Client-Version": __version__,
}

# --- Audio / voice defaults --------------------------------------------------
# Sample rate declared to LiveKit for both the TTS and the audio emitter.
SUPPORTED_SAMPLE_RATE = 48000
# Voice used when the caller does not pass one to ``TTS``.
DEFAULT_VOICE = VoiceByName(name="Male English Actor", provider=VoiceProvider.hume)
88
+
89
+
90
@dataclass
class _TTSOptions:
    """Resolved synthesis settings shared by ``TTS`` and its streams."""

    # Credentials / endpoint
    api_key: str
    base_url: str
    # Per-utterance settings (omitted from the request when unset)
    voice: VoiceById | VoiceByName | None
    model_version: ModelVersion | None
    description: str | None
    speed: float | None
    trailing_silence: float | None
    # Either a generation id (str) or a list of context utterances
    context: str | list[Utterance] | None
    instant_mode: bool | None
    audio_format: AudioFormat

    def http_url(self, path: str) -> str:
        """Return the absolute URL for ``path`` under ``base_url``.

        Slashes at the seam are normalized so that a ``base_url`` configured
        with a trailing slash (or a ``path`` without a leading one) does not
        yield ``//`` or a missing separator in the final URL.
        """
        return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}"
105
+
106
+
107
class TTS(tts.TTS):
    """Hume AI text-to-speech for LiveKit Agents (non-streaming, chunked output)."""

    def __init__(
        self,
        *,
        api_key: str | None = None,
        voice: VoiceById | VoiceByName | None = DEFAULT_VOICE,
        model_version: ModelVersion | None = "1",
        description: str | None = None,
        speed: float | None = None,
        trailing_silence: float | None = None,
        context: str | list[Utterance] | None = None,
        instant_mode: NotGivenOr[bool] = NOT_GIVEN,
        audio_format: AudioFormat = AudioFormat.mp3,
        base_url: str = DEFAULT_BASE_URL,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """Initialize the Hume AI TTS client. Options will be used for all future synthesis
        (until updated with update_options).

        Args:
            api_key: Hume AI API key. If not provided, will look for HUME_API_KEY environment
                variable.
            voice: A voice from the voice library specified by name or id.
            model_version: Specifies which version of Octave to use. See Hume's documentation for
                details on model version differences: https://dev.hume.ai/docs/text-to-speech-tts/overview.
            description: Natural language instructions describing how the synthesized speech
                should sound (≤1000 characters).
            speed: Speed multiplier for the synthesized speech (≥0.5, ≤2.0, default: 1.0).
            trailing_silence: Duration of trailing silence (in seconds) to add to each utterance
                (≥0, ≤5.0, default: 0.35).
            context: Optional context for synthesis, either as text or list of utterances.
            instant_mode: Whether to use instant mode. Defaults to True if voice specified,
                False otherwise. Requires a voice to be specified when enabled.
            audio_format: Output audio format (mp3, wav, or pcm). Defaults to mp3.
            base_url: Base URL for Hume AI API. Defaults to https://api.hume.ai
            http_session: Optional aiohttp ClientSession to use for requests.

        Raises:
            ValueError: If no API key is available, or if ``instant_mode`` is enabled
                without a voice.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=SUPPORTED_SAMPLE_RATE,
            num_channels=1,
        )
        key = api_key or os.environ.get("HUME_API_KEY")
        if not key:
            raise ValueError("Hume API key is required via api_key or HUME_API_KEY env var")

        # Truthiness (not ``is not None``): the request only includes the voice
        # when it is truthy, so an empty mapping must count as "no voice" here too.
        has_voice = bool(voice)

        # Default instant_mode is True if a voice is specified, otherwise False
        # (Hume API requires a voice for instant mode)
        if not is_given(instant_mode):
            resolved_instant_mode = has_voice
        elif instant_mode and not has_voice:
            raise ValueError("Hume TTS: instant_mode cannot be enabled without specifying a voice")
        else:
            resolved_instant_mode = instant_mode

        self._opts = _TTSOptions(
            api_key=key,
            voice=voice,
            model_version=model_version,
            description=description,
            speed=speed,
            trailing_silence=trailing_silence,
            context=context,
            instant_mode=resolved_instant_mode,
            audio_format=audio_format,
            base_url=base_url,
        )
        self._session = http_session

    @property
    def model(self) -> str:
        """Name of the model family reported for this TTS."""
        return "Octave"

    @property
    def provider(self) -> str:
        """Name of the provider reported for this TTS."""
        return "Hume"

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, lazily taking the shared one from the http context."""
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

    def update_options(
        self,
        *,
        description: NotGivenOr[str | None] = NOT_GIVEN,
        speed: NotGivenOr[float | None] = NOT_GIVEN,
        voice: NotGivenOr[VoiceById | VoiceByName | None] = NOT_GIVEN,
        trailing_silence: NotGivenOr[float | None] = NOT_GIVEN,
        context: NotGivenOr[str | list[Utterance] | None] = NOT_GIVEN,
        instant_mode: NotGivenOr[bool] = NOT_GIVEN,
        audio_format: NotGivenOr[AudioFormat] = NOT_GIVEN,
    ) -> None:
        """Update TTS options used for all future synthesis (until updated again)

        The voice / instant-mode invariant enforced by the constructor is kept here:
        explicitly enabling ``instant_mode`` while no voice will be set raises, and
        clearing the voice without passing ``instant_mode`` switches instant mode off.
        Validation happens before any option is written, so a rejected call leaves
        the current options untouched.

        Args:
            voice: A voice from the voice library specified by name or id.
            description: Natural language instructions describing how the synthesized speech
                should sound (≤1000 characters).
            speed: Speed multiplier for the synthesized speech (≥0.5, ≤2.0, default: 1.0).
            trailing_silence: Duration of trailing silence (in seconds) to add to each utterance.
            context: Optional context for synthesis, either as text or list of utterances.
            instant_mode: Whether to use instant mode.
            audio_format: Output audio format (mp3, wav, or pcm).

        Raises:
            ValueError: If ``instant_mode`` is enabled and no voice would be configured.
        """
        # Work out the voice that will be in effect after this call.
        effective_voice = voice if is_given(voice) else self._opts.voice
        voice_present = bool(effective_voice)

        if is_given(instant_mode):
            if instant_mode and not voice_present:
                raise ValueError(
                    "Hume TTS: instant_mode cannot be enabled without specifying a voice"
                )
            effective_instant_mode = instant_mode
        elif is_given(voice) and not voice_present:
            # Voice was just cleared: instant mode cannot stay on without one.
            effective_instant_mode = False
        else:
            effective_instant_mode = self._opts.instant_mode

        if is_given(description):
            self._opts.description = description
        if is_given(speed):
            self._opts.speed = speed
        if is_given(voice):
            self._opts.voice = voice  # type: ignore
        if is_given(trailing_silence):
            self._opts.trailing_silence = trailing_silence
        if is_given(context):
            self._opts.context = context  # type: ignore
        if is_given(audio_format):
            self._opts.audio_format = audio_format
        self._opts.instant_mode = effective_instant_mode

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> tts.ChunkedStream:
        """Create a chunked stream that synthesizes ``text`` with the current options."""
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
234
+
235
+
236
class ChunkedStream(tts.ChunkedStream):
    """One synthesis request against Hume's streaming-JSON TTS endpoint."""

    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TTS = tts
        # Shallow snapshot so later ``update_options`` calls on the TTS do not
        # change the options of a request that has already been created.
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """POST the utterance and push each decoded audio chunk to ``output_emitter``.

        Raises:
            APITimeoutError: On ``asyncio.TimeoutError``.
            APIError: When the stream contains an ``"error"`` message; it is
                propagated unchanged so its message is not lost.
            APIConnectionError: On any other failure (HTTP status, transport, parsing).
        """
        opts = self._opts

        utterance: Utterance = {
            "text": self._input_text,
        }

        if opts.voice:
            utterance["voice"] = opts.voice
        if opts.description:
            utterance["description"] = opts.description
        # 0 is outside the documented speed range, so a falsy check only drops unset values.
        if opts.speed:
            utterance["speed"] = opts.speed
        # 0.0 is a valid trailing silence, so test against None rather than truthiness;
        # otherwise an explicit "no silence" would silently fall back to the API default.
        if opts.trailing_silence is not None:
            utterance["trailing_silence"] = opts.trailing_silence

        payload: dict[str, Any] = {
            "utterances": [utterance],
            "version": opts.model_version,
            "strip_headers": True,
            "instant_mode": opts.instant_mode,
            "format": {"type": opts.audio_format.value},
        }
        # A string context is sent as a generation id, a list as context utterances.
        if isinstance(opts.context, str):
            payload["context"] = {"generation_id": opts.context}
        elif isinstance(opts.context, list):
            payload["context"] = {"utterances": opts.context}

        try:
            async with self._tts._ensure_session().post(
                opts.http_url(STREAM_PATH),
                headers={**DEFAULT_HEADERS, API_AUTH_HEADER: opts.api_key},
                json=payload,
                timeout=aiohttp.ClientTimeout(total=None, sock_connect=self._conn_options.timeout),
                # large read_bufsize to avoid `ValueError: Chunk too big`
                read_bufsize=10 * 1024 * 1024,
            ) as resp:
                resp.raise_for_status()

                output_emitter.initialize(
                    request_id=utils.shortuuid(),
                    sample_rate=SUPPORTED_SAMPLE_RATE,
                    num_channels=self._tts.num_channels,
                    mime_type=f"audio/{opts.audio_format.value}",
                )

                # The body is consumed line by line; each non-empty line is one JSON message.
                async for raw_line in resp.content:
                    line = raw_line.strip()
                    if not line:
                        continue

                    data = json.loads(line.decode())
                    if data.get("type") == "error":
                        raise APIError(message=str(data))

                    audio_b64 = data.get("audio")
                    if audio_b64:
                        output_emitter.push(base64.b64decode(audio_b64))

                output_emitter.flush()

        except asyncio.TimeoutError:
            raise APITimeoutError() from None
        except APIError:
            # Already a LiveKit API error (e.g. the in-stream error above): re-raise
            # as-is instead of rewrapping it into a message-less APIConnectionError.
            raise
        except Exception as e:
            raise APIConnectionError() from e
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.0.22"
15
+ __version__ = "1.3.3"
@@ -12,7 +12,7 @@ requires-python = ">=3.9.0"
12
12
  authors = [
13
13
  {name = "LiveKit", email = "info@livekit.io"}
14
14
  ]
15
- keywords = ["webrtc", "realtime", "audio", "livekit", "HumeAI", "Hume", "Octave"]
15
+ keywords = ["voice", "ai", "realtime", "audio", "video", "livekit", "HumeAI", "Hume", "Octave"]
16
16
  classifiers = [
17
17
  "Intended Audience :: Developers",
18
18
  "Topic :: Multimedia :: Sound/Audio",
@@ -24,8 +24,7 @@ classifiers = [
24
24
  ]
25
25
  dependencies = [
26
26
  "aiohttp>=3.8.0",
27
- "livekit-agents>=1.0.22",
28
- "hume>=0.8.3"
27
+ "livekit-agents>=1.3.3",
29
28
  ]
30
29
 
31
30
  [project.urls]
@@ -1,283 +0,0 @@
1
- # Copyright 2023 LiveKit, Inc.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from __future__ import annotations
16
-
17
- import asyncio
18
- import base64
19
- import os
20
- from dataclasses import dataclass
21
-
22
- import aiohttp
23
-
24
- from hume import AsyncHumeClient
25
- from hume.tts import Format, FormatWav, PostedContext, PostedUtterance, PostedUtteranceVoice
26
- from livekit.agents import (
27
- APIConnectionError,
28
- APIConnectOptions,
29
- APITimeoutError,
30
- tokenize,
31
- tts,
32
- utils,
33
- )
34
- from livekit.agents.types import (
35
- DEFAULT_API_CONNECT_OPTIONS,
36
- NOT_GIVEN,
37
- NotGivenOr,
38
- )
39
- from livekit.agents.utils import is_given
40
-
41
- # Default audio settings
42
- DEFAULT_SAMPLE_RATE = 48000
43
- DEFAULT_NUM_CHANNELS = 1
44
-
45
-
46
- @dataclass
47
- class _TTSOptions:
48
- """TTS options for Hume API"""
49
-
50
- api_key: str
51
- voice: PostedUtteranceVoice | None
52
- description: str | None
53
- speed: float | None
54
- context: PostedContext | None
55
- format: Format
56
- strip_headers: bool
57
- instant_mode: bool
58
- word_tokenizer: tokenize.WordTokenizer
59
-
60
-
61
- class TTS(tts.TTS):
62
- def __init__(
63
- self,
64
- *,
65
- voice: NotGivenOr[PostedUtteranceVoice] = NOT_GIVEN,
66
- description: NotGivenOr[str] = NOT_GIVEN,
67
- speed: NotGivenOr[float] = NOT_GIVEN,
68
- context: NotGivenOr[PostedContext] = NOT_GIVEN,
69
- format: NotGivenOr[Format] = NOT_GIVEN,
70
- instant_mode: bool = False,
71
- strip_headers: bool = True,
72
- api_key: NotGivenOr[str] = NOT_GIVEN,
73
- word_tokenizer: tokenize.WordTokenizer | None = None,
74
- http_session: aiohttp.ClientSession | None = None,
75
- ) -> None:
76
- """Initialize the Hume TTS client.
77
-
78
- See https://dev.hume.ai/reference/text-to-speech-tts/synthesize-json-streaming for API doc
79
-
80
- Args:
81
- voice (NotGivenOr[PostedUtteranceVoice]): The voice, specified by name or id, to be
82
- used. When no voice is specified, a novel voice will be generated based on the
83
- text and optionally provided description.
84
- description (NotGivenOr[str]): Natural language instructions describing how the
85
- synthesized speech should sound, including but not limited to tone, intonation,
86
- pacing, and accent. If a Voice is specified in the request, this description
87
- serves as acting instructions. If no Voice is specified, a new voice is generated
88
- based on this description.
89
- speed: (NotGivenOr[float]): Adjusts the relative speaking rate on a non-linear scale
90
- from 0.25 (much slower) to 3.0 (much faster), where 1.0 represents normal speaking
91
- pace.
92
- context (NotGivenOr[PostedContext]): Utterances to use as context for generating
93
- consistent speech style and prosody across multiple requests.
94
- format (NotGivenOr[Format]): Specifies the output audio file format (WAV, MP3 or PCM).
95
- Defaults to WAV format.
96
- instant_mode (bool): Enables ultra-low latency streaming, reducing time to first chunk.
97
- Recommended for real-time applications. Only for streaming endpoints.
98
- With this enabled, requests incur 10% higher cost. Defaults to False.
99
- strip_headers (bool): If enabled, the audio for all the chunks of a generation.
100
- Once concatenated together, will constitute a single audio file.
101
- If disabled, each chunk’s audio will be its own audio file, each with its headers.
102
- api_key (NotGivenOr[str]): Hume API key for authentication. If not provided,
103
- will attempt to read from HUME_API_KEY environment variable.
104
- word_tokenizer (tokenize.WordTokenizer | None): Custom word tokenizer to use for text.
105
- If None, a basic word tokenizer will be used.
106
- http_session (aiohttp.ClientSession | None): Optional HTTP session for API requests.
107
- If None, a new session will be created.
108
- """
109
-
110
- super().__init__(
111
- capabilities=tts.TTSCapabilities(
112
- streaming=False,
113
- ),
114
- sample_rate=DEFAULT_SAMPLE_RATE,
115
- num_channels=DEFAULT_NUM_CHANNELS,
116
- )
117
-
118
- self._api_key = api_key if is_given(api_key) else os.environ.get("HUME_API_KEY")
119
- if not self._api_key:
120
- raise ValueError(
121
- "Hume API key is required, either as argument or set HUME_API_KEY env variable"
122
- )
123
-
124
- if not word_tokenizer:
125
- word_tokenizer = tokenize.basic.WordTokenizer(ignore_punctuation=False)
126
-
127
- self._opts = _TTSOptions(
128
- voice=voice if is_given(voice) else None,
129
- description=description if is_given(description) else None,
130
- speed=speed if is_given(speed) else None,
131
- context=context if is_given(context) else None,
132
- format=format if is_given(format) else FormatWav(),
133
- api_key=self._api_key,
134
- strip_headers=strip_headers,
135
- instant_mode=instant_mode,
136
- word_tokenizer=word_tokenizer,
137
- )
138
-
139
- self._client = AsyncHumeClient(api_key=self._api_key)
140
- self._session = http_session
141
-
142
- def _ensure_session(self) -> aiohttp.ClientSession:
143
- if not self._session:
144
- self._session = utils.http_context.http_session()
145
- return self._session
146
-
147
- def update_options(
148
- self,
149
- *,
150
- voice: NotGivenOr[PostedUtteranceVoice] = NOT_GIVEN,
151
- description: NotGivenOr[str] = NOT_GIVEN,
152
- speed: NotGivenOr[float] = NOT_GIVEN,
153
- context: NotGivenOr[PostedContext] = NOT_GIVEN,
154
- format: NotGivenOr[Format] = NOT_GIVEN,
155
- instant_mode: NotGivenOr[bool] = NOT_GIVEN,
156
- strip_headers: NotGivenOr[bool] = NOT_GIVEN,
157
- ) -> None:
158
- """Update TTS options for synthesizing speech.
159
-
160
- Args:
161
- voice (NotGivenOr[PostedUtteranceVoice]): The voice, specified by name or id, to be
162
- used. When no voice is specified, a novel voice will be generated based on the
163
- text and optionally provided description.
164
- description (NotGivenOr[str]): Natural language instructions describing how the
165
- synthesized speech should sound, including but not limited to tone, intonation,
166
- pacing, and accent. If a Voice is specified in the request, this description
167
- serves as acting instructions. If no Voice is specified, a new voice is generated
168
- based on this description.
169
- speed: (NotGivenOr[float]): Adjusts the relative speaking rate on a non-linear scale
170
- from 0.25 (much slower) to 3.0 (much faster), where 1.0 represents normal speaking
171
- pace.
172
- context (Optional[PostedContext]): Utterances to use as context for generating
173
- consistent speech style and prosody across multiple requests.
174
- format (NotGivenOr[Format]): Specifies the output audio file format (WAV, MP3 or PCM).
175
- instant_mode (NotGivenOr[bool]): Enables ultra-low latency streaming.
176
- Reduces time to first audio chunk, recommended for real-time applications.
177
- Note: Incurs 10% higher cost when enabled.
178
- strip_headers (NotGivenOr[bool]): If enabled, the audio for the chunks of a generation.
179
- Once concatenated together, will constitute a single audio file.
180
- If disabled, each chunk’s audio will be its own audio file, each with its headers.
181
- """
182
-
183
- if is_given(voice):
184
- self._opts.voice = voice
185
- if is_given(description):
186
- self._opts.description = description
187
- if is_given(speed):
188
- self._opts.speed = speed
189
- if is_given(format):
190
- self._opts.format = format
191
- if is_given(context):
192
- self._opts.context = context
193
- if is_given(instant_mode):
194
- self._opts.instant_mode = instant_mode
195
- if is_given(strip_headers):
196
- self._opts.strip_headers = strip_headers
197
-
198
- def synthesize(
199
- self,
200
- text: str,
201
- *,
202
- conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
203
- ) -> ChunkedStream:
204
- return ChunkedStream(
205
- tts=self,
206
- input_text=text,
207
- conn_options=conn_options,
208
- opts=self._opts,
209
- )
210
-
211
-
212
- class ChunkedStream(tts.ChunkedStream):
213
- """Stream for Hume TTS JSON streaming API."""
214
-
215
- def __init__(
216
- self,
217
- *,
218
- tts: TTS,
219
- input_text: str,
220
- opts: _TTSOptions,
221
- conn_options: APIConnectOptions,
222
- ) -> None:
223
- super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
224
- self._opts = opts
225
- self._client = tts._client
226
-
227
- async def _run(self) -> None:
228
- request_id = utils.shortuuid()
229
-
230
- decoder = utils.codecs.AudioStreamDecoder(
231
- sample_rate=DEFAULT_SAMPLE_RATE,
232
- num_channels=DEFAULT_NUM_CHANNELS,
233
- )
234
-
235
- decode_task: asyncio.Task | None = None
236
-
237
- try:
238
-
239
- async def _decode_loop():
240
- utterance_options = {
241
- "voice": self._opts.voice,
242
- "description": self._opts.description,
243
- "speed": self._opts.speed,
244
- }
245
-
246
- utterance_kwargs = {
247
- "text": self._input_text,
248
- **{k: v for k, v in utterance_options.items() if v is not None},
249
- }
250
-
251
- try:
252
- utterance = PostedUtterance(**utterance_kwargs)
253
-
254
- async for chunk in self._client.tts.synthesize_json_streaming(
255
- utterances=[utterance],
256
- context=self._opts.context,
257
- format=self._opts.format,
258
- instant_mode=self._opts.instant_mode,
259
- strip_headers=self._opts.strip_headers,
260
- ):
261
- decoder.push(base64.b64decode(chunk.audio))
262
-
263
- finally:
264
- decoder.end_input()
265
-
266
- decode_task = asyncio.create_task(_decode_loop())
267
- emitter = tts.SynthesizedAudioEmitter(
268
- event_ch=self._event_ch,
269
- request_id=request_id,
270
- )
271
- async for frame in decoder:
272
- emitter.push(frame)
273
-
274
- emitter.flush()
275
-
276
- except asyncio.TimeoutError:
277
- raise APITimeoutError() from None
278
- except Exception as e:
279
- raise APIConnectionError() from e
280
- finally:
281
- if decode_task:
282
- await utils.aio.gracefully_cancel(decode_task)
283
- await decoder.aclose()