livekit-plugins-hume 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Note: this release of livekit-plugins-hume is flagged as potentially problematic.

--- a/livekit/plugins/hume/__init__.py
+++ b/livekit/plugins/hume/__init__.py
@@ -21,10 +21,24 @@ from __future__ import annotations
 
 from livekit.agents import Plugin
 
-from .tts import TTS, PostedContext, PostedUtterance
+from .tts import (
+    TTS,
+    AudioFormat,
+    Utterance,
+    VoiceById,
+    VoiceByName,
+    VoiceProvider,
+)
 from .version import __version__
 
-__all__ = ["TTS", "PostedContext", "PostedUtterance"]
+__all__ = [
+    "TTS",
+    "AudioFormat",
+    "VoiceById",
+    "VoiceByName",
+    "VoiceProvider",
+    "Utterance",
+]
 
 
 class HumeAIPlugin(Plugin):
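The hunk above changes the public exports: PostedUtterance and PostedContext disappear from __all__, replaced by Utterance, VoiceById, VoiceByName, VoiceProvider, and AudioFormat. A minimal import-migration sketch for downstream code, assuming the 1.1.2 wheel is installed (usage around the imports is illustrative only):

```python
# 1.1.0 exposed dict-based option types that no longer exist in 1.1.2:
# from livekit.plugins.hume import PostedContext, PostedUtterance  # removed

# 1.1.2 exports, per the new __all__ above.
from livekit.plugins.hume import (
    TTS,
    AudioFormat,
    Utterance,
    VoiceById,
    VoiceByName,
    VoiceProvider,
)
```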
--- a/livekit/plugins/hume/tts.py
+++ b/livekit/plugins/hume/tts.py
@@ -19,6 +19,7 @@ import base64
 import json
 import os
 from dataclasses import dataclass, replace
+from enum import Enum
 from typing import Any, TypedDict
 
 import aiohttp
@@ -27,32 +28,66 @@ from livekit.agents import APIConnectionError, APIConnectOptions, APITimeoutError
 from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
 from livekit.agents.utils import is_given
 
-API_AUTH_HEADER = "X-Hume-Api-Key"
-STREAM_PATH = "/v0/tts/stream/json"
-DEFAULT_BASE_URL = "https://api.hume.ai"
+from .version import __version__
+
+
+class VoiceById(TypedDict, total=False):
+    id: str
+    provider: VoiceProvider | None
+
+
+class VoiceByName(TypedDict, total=False):
+    name: str
+    provider: VoiceProvider | None
 
 
-class PostedUtterance(TypedDict, total=False):
+class Utterance(TypedDict, total=False):
+    """Utterance for TTS synthesis."""
+
     text: str
-    description: str
-    voice: dict[str, Any]
-    speed: float
-    trailing_silence: float
+    description: str | None
+    speed: float | None
+    voice: VoiceById | VoiceByName | None
+    trailing_silence: float | None
+
+
+class VoiceProvider(str, Enum):
+    """Voice provider for the voice library."""
+
+    hume = "HUME_AI"
+    custom = "CUSTOM_VOICE"
+
+
+class AudioFormat(str, Enum):
+    """Audio format for the synthesized speech."""
 
+    mp3 = "mp3"
+    wav = "wav"
+    pcm = "pcm"
 
-class PostedContext(TypedDict, total=False):
-    utterances: list[PostedUtterance]
+
+DEFAULT_HEADERS = {
+    "X-Hume-Client-Name": "livekit",
+    "X-Hume-Client-Version": __version__,
+}
+API_AUTH_HEADER = "X-Hume-Api-Key"
+STREAM_PATH = "/v0/tts/stream/json"
+DEFAULT_BASE_URL = "https://api.hume.ai"
+SUPPORTED_SAMPLE_RATE = 48000
+DEFAULT_VOICE = VoiceByName(name="Male English Actor", provider=VoiceProvider.hume)
 
 
 @dataclass
 class _TTSOptions:
     api_key: str
-    utterance_options: PostedUtterance
-    context: PostedContext | None
-    sample_rate: int
-    split_utterances: bool
-    instant_mode: bool
     base_url: str
+    voice: VoiceById | VoiceByName | None
+    description: str | None
+    speed: float | None
+    trailing_silence: float | None
+    context: str | list[Utterance] | None
+    instant_mode: bool | None
+    audio_format: AudioFormat
 
     def http_url(self, path: str) -> str:
         return f"{self.base_url}{path}"
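The hunk above introduces the typed helpers that replace the old PostedUtterance/PostedContext dicts. A short sketch of how these TypedDict and Enum values fit together, based only on the definitions shown (the voice id and the text are made up for illustration; all keys are optional at the type level because the TypedDicts use total=False):

```python
from livekit.plugins.hume import AudioFormat, Utterance, VoiceById, VoiceByName, VoiceProvider

# A voice can be referenced by name or by id; the provider enum serializes to
# "HUME_AI" or "CUSTOM_VOICE" per the VoiceProvider definition above.
by_name: VoiceByName = {"name": "Male English Actor", "provider": VoiceProvider.hume}
by_id: VoiceById = {"id": "00000000-0000-0000-0000-000000000000", "provider": VoiceProvider.custom}  # hypothetical id

# An Utterance is a plain dict carrying the per-utterance synthesis options.
utt: Utterance = {
    "text": "Hello from LiveKit!",  # illustrative text
    "voice": by_name,
    "speed": 1.0,
    "trailing_silence": 0.35,
}

# AudioFormat values map to the strings the API expects.
assert AudioFormat.pcm.value == "pcm"
```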
@@ -63,36 +98,64 @@ class TTS(tts.TTS):
         self,
         *,
         api_key: str | None = None,
-        utterance_options: NotGivenOr[PostedUtterance] = NOT_GIVEN,
-        split_utterances: bool = True,
-        instant_mode: bool = True,
-        sample_rate: int = 24000,
+        voice: VoiceById | VoiceByName | None = DEFAULT_VOICE,
+        description: str | None = None,
+        speed: float | None = None,
+        trailing_silence: float | None = None,
+        context: str | list[Utterance] | None = None,
+        instant_mode: NotGivenOr[bool] = NOT_GIVEN,
+        audio_format: AudioFormat = AudioFormat.mp3,
         base_url: str = DEFAULT_BASE_URL,
         http_session: aiohttp.ClientSession | None = None,
     ):
+        """Initialize the Hume AI TTS client. Options will be used for all future synthesis
+        (until updated with update_options).
+
+        Args:
+            api_key: Hume AI API key. If not provided, will look for HUME_API_KEY environment
+                variable.
+            voice: A voice from the voice library specifed by name or id.
+            description: Natural language instructions describing how the synthesized speech
+                should sound (≤1000 characters).
+            speed: Speed multiplier for the synthesized speech (≥0.25, ≤3.0, default: 1.0).
+            trailing_silence: Duration of trailing silence (in seconds) to add to each utterance
+                (≥0, ≤5.0, default: 0.35).
+            context: Optional context for synthesis, either as text or list of utterances.
+            instant_mode: Whether to use instant mode. Defaults to True if voice specified,
+                False otherwise. Requires a voice to be specified when enabled.
+            audio_format: Output audio format (mp3, wav, or pcm). Defaults to mp3.
+            base_url: Base URL for Hume AI API. Defaults to https://api.hume.ai
+            http_session: Optional aiohttp ClientSession to use for requests.
+        """
         super().__init__(
-            capabilities=tts.TTSCapabilities(streaming=True),
-            sample_rate=sample_rate,
+            capabilities=tts.TTSCapabilities(streaming=False),
+            sample_rate=SUPPORTED_SAMPLE_RATE,
             num_channels=1,
         )
         key = api_key or os.environ.get("HUME_API_KEY")
         if not key:
             raise ValueError("Hume API key is required via api_key or HUME_API_KEY env var")
 
-        default_utterance: PostedUtterance = {
-            "speed": 1.0,
-            "trailing_silence": 0.35,
-        }
-        if is_given(utterance_options):
-            default_utterance.update(utterance_options)
+        has_voice = voice is not None
+
+        # Default instant_mode is True if a voice is specified, otherwise False
+        # (Hume API requires a voice for instant mode)
+        if not is_given(instant_mode):
+            resolved_instant_mode = has_voice
+        elif instant_mode and not has_voice:
+            raise ValueError("Hume TTS: instant_mode cannot be enabled without specifying a voice")
+        else:
+            resolved_instant_mode = instant_mode
 
         self._opts = _TTSOptions(
             api_key=key,
-            utterance_options=default_utterance,
-            context=None,
-            sample_rate=sample_rate,
-            split_utterances=split_utterances,
-            instant_mode=instant_mode,
+            voice=voice,
+            description=description,
+            speed=speed,
+            trailing_silence=trailing_silence,
+            context=context,
+            instant_mode=resolved_instant_mode,
+            audio_format=audio_format,
             base_url=base_url,
         )
         self._session = http_session
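The constructor hunk above replaces utterance_options, split_utterances, and sample_rate with flat keyword arguments, pins the output sample rate to SUPPORTED_SAMPLE_RATE (48000), and derives instant_mode from whether a voice is given. A hedged usage sketch based on the new signature (the API key is a placeholder; values are illustrative):

```python
from livekit.plugins.hume import TTS, AudioFormat, VoiceByName, VoiceProvider

# instant_mode is left unset here, so per the hunk above it resolves to True because a
# voice is provided; passing instant_mode=True without a voice raises ValueError.
tts = TTS(
    api_key="YOUR_HUME_API_KEY",  # placeholder; alternatively set the HUME_API_KEY env var
    voice=VoiceByName(name="Male English Actor", provider=VoiceProvider.hume),
    speed=1.0,
    trailing_silence=0.35,
    audio_format=AudioFormat.wav,
)
```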
@@ -106,19 +169,40 @@ class TTS(tts.TTS):
     def update_options(
         self,
         *,
-        utterance_options: NotGivenOr[PostedUtterance] = NOT_GIVEN,
-        context: NotGivenOr[PostedContext] = NOT_GIVEN,
-        split_utterances: NotGivenOr[bool] = NOT_GIVEN,
+        description: NotGivenOr[str | None] = NOT_GIVEN,
+        speed: NotGivenOr[float | None] = NOT_GIVEN,
+        voice: NotGivenOr[VoiceById | VoiceByName | None] = NOT_GIVEN,
+        trailing_silence: NotGivenOr[float | None] = NOT_GIVEN,
+        context: NotGivenOr[str | list[Utterance] | None] = NOT_GIVEN,
         instant_mode: NotGivenOr[bool] = NOT_GIVEN,
+        audio_format: NotGivenOr[AudioFormat] = NOT_GIVEN,
     ) -> None:
-        if is_given(utterance_options):
-            self._opts.utterance_options = utterance_options
-        if is_given(context):  #
-            self._opts.context = context
-        if is_given(split_utterances):
-            self._opts.split_utterances = split_utterances
+        """Update TTS options used for all future synthesis (until updated again)
+
+        Args:
+            voice: A voice from the voice library specifed by name or id.
+            description: Natural language instructions describing how the synthesized speech
+                should sound (≤1000 characters).
+            speed: Speed multiplier for the synthesized speech (≥0.25, ≤3.0, default: 1.0).
+            trailing_silence: Duration of trailing silence (in seconds) to add to each utterance.
+            context: Optional context for synthesis, either as text or list of utterances.
+            instant_mode: Whether to use instant mode.
+            audio_format: Output audio format (mp3, wav, or pcm).
+        """
+        if is_given(description):
+            self._opts.description = description
+        if is_given(speed):
+            self._opts.speed = speed
+        if is_given(voice):
+            self._opts.voice = voice  # type: ignore
+        if is_given(trailing_silence):
+            self._opts.trailing_silence = trailing_silence
+        if is_given(context):
+            self._opts.context = context  # type: ignore
         if is_given(instant_mode):
             self._opts.instant_mode = instant_mode
+        if is_given(audio_format):
+            self._opts.audio_format = audio_format
 
     def synthesize(
         self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
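update_options now mirrors the constructor keywords and only touches fields that are explicitly passed; the NOT_GIVEN sentinel leaves everything else unchanged. A brief sketch, continuing from the tts instance constructed in the sketch above:

```python
from livekit.plugins.hume import AudioFormat

# Only the fields passed here change; the voice and other options keep their current values.
tts.update_options(
    speed=1.25,
    trailing_silence=0.2,
    audio_format=AudioFormat.mp3,
)

# Per the NotGivenOr[str | None] annotation above, explicitly passing None should clear
# an optional field such as description (None is distinct from NOT_GIVEN).
tts.update_options(description=None)
```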
@@ -133,34 +217,46 @@ class ChunkedStream(tts.ChunkedStream):
         self._opts = replace(tts._opts)
 
     async def _run(self, output_emitter: tts.AudioEmitter) -> None:
-        utterance: PostedUtterance = {"text": self._input_text}
-        utterance.update(self._opts.utterance_options)
+        utterance: Utterance = {
+            "text": self._input_text,
+        }
+
+        if self._opts.voice:
+            utterance["voice"] = self._opts.voice
+        if self._opts.description:
+            utterance["description"] = self._opts.description
+        if self._opts.speed:
+            utterance["speed"] = self._opts.speed
+        if self._opts.trailing_silence:
+            utterance["trailing_silence"] = self._opts.trailing_silence
 
         payload: dict[str, Any] = {
             "utterances": [utterance],
-            "split_utterances": self._opts.split_utterances,
             "strip_headers": True,
             "instant_mode": self._opts.instant_mode,
-            "format": {"type": "mp3"},
+            "format": {"type": self._opts.audio_format.value},
         }
-        if self._opts.context:
-            payload["context"] = self._opts.context
+        if isinstance(self._opts.context, str):
+            payload["context"] = {"generation_id": self._opts.context}
+        elif isinstance(self._opts.context, list):
+            payload["context"] = {"utterances": self._opts.context}
 
         try:
             async with self._tts._ensure_session().post(
                 self._opts.http_url(STREAM_PATH),
-                headers={API_AUTH_HEADER: self._opts.api_key},
+                headers={**DEFAULT_HEADERS, API_AUTH_HEADER: self._opts.api_key},
                 json=payload,
                 timeout=aiohttp.ClientTimeout(total=None, sock_connect=self._conn_options.timeout),
                 # large read_bufsize to avoid `ValueError: Chunk too big`
                 read_bufsize=10 * 1024 * 1024,
             ) as resp:
                 resp.raise_for_status()
+
                 output_emitter.initialize(
                     request_id=utils.shortuuid(),
-                    sample_rate=self._opts.sample_rate,
+                    sample_rate=SUPPORTED_SAMPLE_RATE,
                     num_channels=self._tts.num_channels,
-                    mime_type="audio/mp3",
+                    mime_type=f"audio/{self._opts.audio_format.value}",
                 )
 
                 async for raw_line in resp.content:
@@ -174,6 +270,7 @@ class ChunkedStream(tts.ChunkedStream):
                     output_emitter.push(base64.b64decode(audio_b64))
 
                 output_emitter.flush()
+
         except asyncio.TimeoutError:
             raise APITimeoutError() from None
         except Exception as e:
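For reference, the _run hunk above now builds the request body from the flat options instead of merging a PostedUtterance dict, and a string context becomes a generation_id reference while a list becomes inline utterances. A self-contained sketch of the JSON payload shape POSTed to STREAM_PATH (/v0/tts/stream/json), mirroring that logic with illustrative values (the generation id and text are made up; the actual request also carries the X-Hume-Api-Key and X-Hume-Client-* headers shown in the hunk):

```python
import json

audio_format = "pcm"  # one of "mp3", "wav", "pcm", per AudioFormat
context = "prev-generation-id"  # hypothetical; a str maps to generation_id, a list to utterances

payload = {
    "utterances": [
        {
            "text": "Hello from LiveKit!",  # illustrative text
            "voice": {"name": "Male English Actor", "provider": "HUME_AI"},
            "speed": 1.0,
            "trailing_silence": 0.35,
        }
    ],
    "strip_headers": True,
    "instant_mode": True,
    "format": {"type": audio_format},
}
if isinstance(context, str):
    payload["context"] = {"generation_id": context}
elif isinstance(context, list):
    payload["context"] = {"utterances": context}

print(json.dumps(payload, indent=2))
```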
--- a/livekit/plugins/hume/version.py
+++ b/livekit/plugins/hume/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.1.0"
+__version__ = "1.1.2"
--- a/livekit_plugins_hume-1.1.0.dist-info/METADATA
+++ b/livekit_plugins_hume-1.1.2.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: livekit-plugins-hume
-Version: 1.1.0
+Version: 1.1.2
 Summary: Hume TTS plugin for LiveKit agents
 Project-URL: Documentation, https://docs.livekit.io
 Project-URL: Website, https://livekit.io/
@@ -17,7 +17,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0
 Requires-Dist: aiohttp>=3.8.0
-Requires-Dist: livekit-agents>=1.1.0
+Requires-Dist: livekit-agents>=1.1.2
 Description-Content-Type: text/markdown
 
 # Hume AI TTS plugin for LiveKit Agents
--- /dev/null
+++ b/livekit_plugins_hume-1.1.2.dist-info/RECORD
@@ -0,0 +1,8 @@
+livekit/plugins/hume/__init__.py,sha256=yYTwSJaYq5ufZ_EnoSuLa2FfSsnOZu-swAzYjNQAhhw,1374
+livekit/plugins/hume/log.py,sha256=TwpK1FOwgD6Jb0A2nl-9nIgi0q5qWo9HGDrDuV_2g0g,67
+livekit/plugins/hume/py.typed,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
+livekit/plugins/hume/tts.py,sha256=VYduFRxndfE0R-3A_Pt16pvcLd80VWnUJIda4iQBgPo,10301
+livekit/plugins/hume/version.py,sha256=tjKfbKrDsLPqL_l2Ydi4f1M2Vj8ALS0HobeZCqpZ6aQ,600
+livekit_plugins_hume-1.1.2.dist-info/METADATA,sha256=A3PtGt325UlL5vS1o5y9-m-jjx74OlGdGR3MLpADYV8,1354
+livekit_plugins_hume-1.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+livekit_plugins_hume-1.1.2.dist-info/RECORD,,
--- a/livekit_plugins_hume-1.1.0.dist-info/RECORD
+++ /dev/null
@@ -1,8 +0,0 @@
-livekit/plugins/hume/__init__.py,sha256=--F5e6CdoZM8eyw5ca-H-khoKdDJxdflwvrMCSwAHws,1250
-livekit/plugins/hume/log.py,sha256=TwpK1FOwgD6Jb0A2nl-9nIgi0q5qWo9HGDrDuV_2g0g,67
-livekit/plugins/hume/py.typed,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
-livekit/plugins/hume/tts.py,sha256=ZnVqxzzs75OpHe_YDMr5X_BgZlZRlQSiCYs0z1Yq5gg,6128
-livekit/plugins/hume/version.py,sha256=Pl1D4Jol4f5vcHwFlr83NvnRouDwUNzW3Vxxi0E2uEA,600
-livekit_plugins_hume-1.1.0.dist-info/METADATA,sha256=v2jQuMBnbp9uUwF9Tmwd8GCOXpc1DliK-0hYOe8Cbwk,1354
-livekit_plugins_hume-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-livekit_plugins_hume-1.1.0.dist-info/RECORD,,