livekit-plugins-hume 1.1.1__tar.gz → 1.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of livekit-plugins-hume might be problematic; see the registry's advisory page for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-hume
3
- Version: 1.1.1
3
+ Version: 1.1.2
4
4
  Summary: Hume TTS plugin for LiveKit agents
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
@@ -17,7 +17,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
17
17
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
18
  Requires-Python: >=3.9.0
19
19
  Requires-Dist: aiohttp>=3.8.0
20
- Requires-Dist: livekit-agents>=1.1.1
20
+ Requires-Dist: livekit-agents>=1.1.2
21
21
  Description-Content-Type: text/markdown
22
22
 
23
23
  # Hume AI TTS plugin for LiveKit Agents
@@ -21,10 +21,24 @@ from __future__ import annotations
21
21
 
22
22
  from livekit.agents import Plugin
23
23
 
24
- from .tts import TTS, PostedContext, PostedUtterance
24
+ from .tts import (
25
+ TTS,
26
+ AudioFormat,
27
+ Utterance,
28
+ VoiceById,
29
+ VoiceByName,
30
+ VoiceProvider,
31
+ )
25
32
  from .version import __version__
26
33
 
27
- __all__ = ["TTS", "PostedContext", "PostedUtterance"]
34
+ __all__ = [
35
+ "TTS",
36
+ "AudioFormat",
37
+ "VoiceById",
38
+ "VoiceByName",
39
+ "VoiceProvider",
40
+ "Utterance",
41
+ ]
28
42
 
29
43
 
30
44
  class HumeAIPlugin(Plugin):
@@ -0,0 +1,277 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import base64
19
+ import json
20
+ import os
21
+ from dataclasses import dataclass, replace
22
+ from enum import Enum
23
+ from typing import Any, TypedDict
24
+
25
+ import aiohttp
26
+
27
+ from livekit.agents import APIConnectionError, APIConnectOptions, APITimeoutError, tts, utils
28
+ from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
29
+ from livekit.agents.utils import is_given
30
+
31
+ from .version import __version__
32
+
33
+
34
class VoiceById(TypedDict, total=False):
    """Reference to a voice-library voice, selected by its unique id."""

    # Unique identifier of the voice in the Hume voice library.
    id: str
    # Library the voice belongs to (HUME_AI or CUSTOM_VOICE); None lets the API decide.
    provider: VoiceProvider | None
37
+
38
+
39
class VoiceByName(TypedDict, total=False):
    """Reference to a voice-library voice, selected by its display name."""

    # Name of the voice in the Hume voice library.
    name: str
    # Library the voice belongs to (HUME_AI or CUSTOM_VOICE); None lets the API decide.
    provider: VoiceProvider | None
42
+
43
+
44
class Utterance(TypedDict, total=False):
    """Utterance for TTS synthesis."""

    # Text to synthesize.
    text: str
    # Natural-language instructions for how the speech should sound (<=1000 characters).
    description: str | None
    # Speed multiplier for the synthesized speech (>=0.25, <=3.0).
    speed: float | None
    # Voice to use, referenced by id or by name.
    voice: VoiceById | VoiceByName | None
    # Trailing silence in seconds appended to the utterance (>=0, <=5.0).
    trailing_silence: float | None
52
+
53
+
54
class VoiceProvider(str, Enum):
    """Voice provider for the voice library."""

    # Sent to the API as the "HUME_AI" provider value (Hume-supplied voices).
    hume = "HUME_AI"
    # Sent to the API as the "CUSTOM_VOICE" provider value.
    custom = "CUSTOM_VOICE"
59
+
60
+
61
class AudioFormat(str, Enum):
    """Audio format for the synthesized speech.

    The enum value is sent as ``format.type`` in the request payload and also
    used to build the emitted ``audio/<format>`` MIME type.
    """

    mp3 = "mp3"
    wav = "wav"
    pcm = "pcm"
67
+
68
+
69
# Client-identification headers attached to every request.
DEFAULT_HEADERS = {
    "X-Hume-Client-Name": "livekit",
    "X-Hume-Client-Version": __version__,
}
# Request header carrying the Hume API key.
API_AUTH_HEADER = "X-Hume-Api-Key"
# Streaming JSON synthesis endpoint, appended to the base URL.
STREAM_PATH = "/v0/tts/stream/json"
DEFAULT_BASE_URL = "https://api.hume.ai"
# Sample rate passed to the audio emitter (assumed output rate of the endpoint — TODO confirm).
SUPPORTED_SAMPLE_RATE = 48000
# Voice used when the caller does not specify one.
DEFAULT_VOICE = VoiceByName(name="Male English Actor", provider=VoiceProvider.hume)
78
+
79
+
80
@dataclass
class _TTSOptions:
    """Internal snapshot of the options used for synthesis requests.

    Copied per-request via ``dataclasses.replace`` so that a later
    ``update_options`` call does not affect an in-flight synthesis.
    """

    api_key: str
    base_url: str
    voice: VoiceById | VoiceByName | None
    description: str | None
    speed: float | None
    trailing_silence: float | None
    context: str | list[Utterance] | None
    instant_mode: bool | None
    audio_format: AudioFormat

    def http_url(self, path: str) -> str:
        """Join *path* onto the configured base URL."""
        return f"{self.base_url}{path}"
94
+
95
+
96
class TTS(tts.TTS):
    def __init__(
        self,
        *,
        api_key: str | None = None,
        voice: VoiceById | VoiceByName | None = DEFAULT_VOICE,
        description: str | None = None,
        speed: float | None = None,
        trailing_silence: float | None = None,
        context: str | list[Utterance] | None = None,
        instant_mode: NotGivenOr[bool] = NOT_GIVEN,
        audio_format: AudioFormat = AudioFormat.mp3,
        base_url: str = DEFAULT_BASE_URL,
        http_session: aiohttp.ClientSession | None = None,
    ):
        """Initialize the Hume AI TTS client. Options will be used for all future synthesis
        (until updated with update_options).

        Args:
            api_key: Hume AI API key. If not provided, will look for HUME_API_KEY environment
                variable.
            voice: A voice from the voice library specified by name or id.
            description: Natural language instructions describing how the synthesized speech
                should sound (≤1000 characters).
            speed: Speed multiplier for the synthesized speech (≥0.25, ≤3.0, default: 1.0).
            trailing_silence: Duration of trailing silence (in seconds) to add to each utterance
                (≥0, ≤5.0, default: 0.35).
            context: Optional context for synthesis, either as text or list of utterances.
            instant_mode: Whether to use instant mode. Defaults to True if voice specified,
                False otherwise. Requires a voice to be specified when enabled.
            audio_format: Output audio format (mp3, wav, or pcm). Defaults to mp3.
            base_url: Base URL for Hume AI API. Defaults to https://api.hume.ai
            http_session: Optional aiohttp ClientSession to use for requests.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=SUPPORTED_SAMPLE_RATE,
            num_channels=1,
        )
        key = api_key or os.environ.get("HUME_API_KEY")
        if not key:
            raise ValueError("Hume API key is required via api_key or HUME_API_KEY env var")

        has_voice = voice is not None

        # Default instant_mode is True if a voice is specified, otherwise False
        # (Hume API requires a voice for instant mode)
        if not is_given(instant_mode):
            resolved_instant_mode = has_voice
        elif instant_mode and not has_voice:
            raise ValueError("Hume TTS: instant_mode cannot be enabled without specifying a voice")
        else:
            resolved_instant_mode = instant_mode

        self._opts = _TTSOptions(
            api_key=key,
            voice=voice,
            description=description,
            speed=speed,
            trailing_silence=trailing_silence,
            context=context,
            instant_mode=resolved_instant_mode,
            audio_format=audio_format,
            base_url=base_url,
        )
        self._session = http_session

    def _ensure_session(self) -> aiohttp.ClientSession:
        # Lazily fall back to the shared http_context session when none was injected.
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

    def update_options(
        self,
        *,
        description: NotGivenOr[str | None] = NOT_GIVEN,
        speed: NotGivenOr[float | None] = NOT_GIVEN,
        voice: NotGivenOr[VoiceById | VoiceByName | None] = NOT_GIVEN,
        trailing_silence: NotGivenOr[float | None] = NOT_GIVEN,
        context: NotGivenOr[str | list[Utterance] | None] = NOT_GIVEN,
        instant_mode: NotGivenOr[bool] = NOT_GIVEN,
        audio_format: NotGivenOr[AudioFormat] = NOT_GIVEN,
    ) -> None:
        """Update TTS options used for all future synthesis (until updated again)

        Args:
            voice: A voice from the voice library specified by name or id.
            description: Natural language instructions describing how the synthesized speech
                should sound (≤1000 characters).
            speed: Speed multiplier for the synthesized speech (≥0.25, ≤3.0, default: 1.0).
            trailing_silence: Duration of trailing silence (in seconds) to add to each utterance.
            context: Optional context for synthesis, either as text or list of utterances.
            instant_mode: Whether to use instant mode.
            audio_format: Output audio format (mp3, wav, or pcm).
        """
        if is_given(description):
            self._opts.description = description
        if is_given(speed):
            self._opts.speed = speed
        if is_given(voice):
            self._opts.voice = voice  # type: ignore
        if is_given(trailing_silence):
            self._opts.trailing_silence = trailing_silence
        if is_given(context):
            self._opts.context = context  # type: ignore
        if is_given(instant_mode):
            self._opts.instant_mode = instant_mode
        if is_given(audio_format):
            self._opts.audio_format = audio_format

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> tts.ChunkedStream:
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
211
+
212
+
213
class ChunkedStream(tts.ChunkedStream):
    """Non-streaming synthesis: posts the full text and consumes the JSON event stream."""

    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TTS = tts
        # Snapshot the options so a concurrent update_options() does not affect this request.
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        utterance: Utterance = {
            "text": self._input_text,
        }

        if self._opts.voice:
            utterance["voice"] = self._opts.voice
        if self._opts.description:
            utterance["description"] = self._opts.description
        # Compare against None rather than truthiness: an explicit 0.0 is meaningful here.
        # trailing_silence=0.0 is documented as valid (>=0) and must not silently fall back
        # to the API default (0.35) by being dropped from the payload.
        if self._opts.speed is not None:
            utterance["speed"] = self._opts.speed
        if self._opts.trailing_silence is not None:
            utterance["trailing_silence"] = self._opts.trailing_silence

        payload: dict[str, Any] = {
            "utterances": [utterance],
            "strip_headers": True,
            "instant_mode": self._opts.instant_mode,
            "format": {"type": self._opts.audio_format.value},
        }
        if isinstance(self._opts.context, str):
            # A string context is a generation id from a previous request.
            payload["context"] = {"generation_id": self._opts.context}
        elif isinstance(self._opts.context, list):
            payload["context"] = {"utterances": self._opts.context}

        try:
            async with self._tts._ensure_session().post(
                self._opts.http_url(STREAM_PATH),
                headers={**DEFAULT_HEADERS, API_AUTH_HEADER: self._opts.api_key},
                json=payload,
                timeout=aiohttp.ClientTimeout(total=None, sock_connect=self._conn_options.timeout),
                # large read_bufsize to avoid `ValueError: Chunk too big`
                read_bufsize=10 * 1024 * 1024,
            ) as resp:
                resp.raise_for_status()

                output_emitter.initialize(
                    request_id=utils.shortuuid(),
                    sample_rate=SUPPORTED_SAMPLE_RATE,
                    num_channels=self._tts.num_channels,
                    mime_type=f"audio/{self._opts.audio_format.value}",
                )

                # Each non-empty line of the body is a JSON event; audio chunks are base64.
                async for raw_line in resp.content:
                    line = raw_line.strip()
                    if not line:
                        continue

                    data = json.loads(line.decode())
                    audio_b64 = data.get("audio")
                    if audio_b64:
                        output_emitter.push(base64.b64decode(audio_b64))

                output_emitter.flush()

        except asyncio.TimeoutError:
            raise APITimeoutError() from None
        except Exception as e:
            raise APIConnectionError() from e
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.1.1"
15
+ __version__ = "1.1.2"
@@ -24,7 +24,7 @@ classifiers = [
24
24
  ]
25
25
  dependencies = [
26
26
  "aiohttp>=3.8.0",
27
- "livekit-agents>=1.1.1",
27
+ "livekit-agents>=1.1.2",
28
28
  ]
29
29
 
30
30
  [project.urls]
@@ -1,180 +0,0 @@
1
- # Copyright 2023 LiveKit, Inc.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from __future__ import annotations
16
-
17
- import asyncio
18
- import base64
19
- import json
20
- import os
21
- from dataclasses import dataclass, replace
22
- from typing import Any, TypedDict
23
-
24
- import aiohttp
25
-
26
- from livekit.agents import APIConnectionError, APIConnectOptions, APITimeoutError, tts, utils
27
- from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
28
- from livekit.agents.utils import is_given
29
-
30
- API_AUTH_HEADER = "X-Hume-Api-Key"
31
- STREAM_PATH = "/v0/tts/stream/json"
32
- DEFAULT_BASE_URL = "https://api.hume.ai"
33
-
34
-
35
- class PostedUtterance(TypedDict, total=False):
36
- text: str
37
- description: str
38
- voice: dict[str, Any]
39
- speed: float
40
- trailing_silence: float
41
-
42
-
43
- class PostedContext(TypedDict, total=False):
44
- utterances: list[PostedUtterance]
45
-
46
-
47
- @dataclass
48
- class _TTSOptions:
49
- api_key: str
50
- utterance_options: PostedUtterance
51
- context: PostedContext | None
52
- sample_rate: int
53
- split_utterances: bool
54
- instant_mode: bool
55
- base_url: str
56
-
57
- def http_url(self, path: str) -> str:
58
- return f"{self.base_url}{path}"
59
-
60
-
61
- class TTS(tts.TTS):
62
- def __init__(
63
- self,
64
- *,
65
- api_key: str | None = None,
66
- utterance_options: NotGivenOr[PostedUtterance] = NOT_GIVEN,
67
- split_utterances: bool = True,
68
- instant_mode: bool = True,
69
- sample_rate: int = 24000,
70
- base_url: str = DEFAULT_BASE_URL,
71
- http_session: aiohttp.ClientSession | None = None,
72
- ):
73
- super().__init__(
74
- capabilities=tts.TTSCapabilities(streaming=True),
75
- sample_rate=sample_rate,
76
- num_channels=1,
77
- )
78
- key = api_key or os.environ.get("HUME_API_KEY")
79
- if not key:
80
- raise ValueError("Hume API key is required via api_key or HUME_API_KEY env var")
81
-
82
- default_utterance: PostedUtterance = {
83
- "speed": 1.0,
84
- "trailing_silence": 0.35,
85
- }
86
- if is_given(utterance_options):
87
- default_utterance.update(utterance_options)
88
-
89
- self._opts = _TTSOptions(
90
- api_key=key,
91
- utterance_options=default_utterance,
92
- context=None,
93
- sample_rate=sample_rate,
94
- split_utterances=split_utterances,
95
- instant_mode=instant_mode,
96
- base_url=base_url,
97
- )
98
- self._session = http_session
99
-
100
- def _ensure_session(self) -> aiohttp.ClientSession:
101
- if not self._session:
102
- self._session = utils.http_context.http_session()
103
-
104
- return self._session
105
-
106
- def update_options(
107
- self,
108
- *,
109
- utterance_options: NotGivenOr[PostedUtterance] = NOT_GIVEN,
110
- context: NotGivenOr[PostedContext] = NOT_GIVEN,
111
- split_utterances: NotGivenOr[bool] = NOT_GIVEN,
112
- instant_mode: NotGivenOr[bool] = NOT_GIVEN,
113
- ) -> None:
114
- if is_given(utterance_options):
115
- self._opts.utterance_options = utterance_options
116
- if is_given(context): #
117
- self._opts.context = context
118
- if is_given(split_utterances):
119
- self._opts.split_utterances = split_utterances
120
- if is_given(instant_mode):
121
- self._opts.instant_mode = instant_mode
122
-
123
- def synthesize(
124
- self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
125
- ) -> tts.ChunkedStream:
126
- return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
127
-
128
-
129
- class ChunkedStream(tts.ChunkedStream):
130
- def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
131
- super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
132
- self._tts: TTS = tts
133
- self._opts = replace(tts._opts)
134
-
135
- async def _run(self, output_emitter: tts.AudioEmitter) -> None:
136
- utterance: PostedUtterance = {"text": self._input_text}
137
- utterance.update(self._opts.utterance_options)
138
-
139
- payload: dict[str, Any] = {
140
- "utterances": [utterance],
141
- "split_utterances": self._opts.split_utterances,
142
- "strip_headers": True,
143
- "instant_mode": self._opts.instant_mode,
144
- "format": {"type": "mp3"},
145
- }
146
- if self._opts.context:
147
- payload["context"] = self._opts.context
148
-
149
- try:
150
- async with self._tts._ensure_session().post(
151
- self._opts.http_url(STREAM_PATH),
152
- headers={API_AUTH_HEADER: self._opts.api_key},
153
- json=payload,
154
- timeout=aiohttp.ClientTimeout(total=None, sock_connect=self._conn_options.timeout),
155
- # large read_bufsize to avoid `ValueError: Chunk too big`
156
- read_bufsize=10 * 1024 * 1024,
157
- ) as resp:
158
- resp.raise_for_status()
159
- output_emitter.initialize(
160
- request_id=utils.shortuuid(),
161
- sample_rate=self._opts.sample_rate,
162
- num_channels=self._tts.num_channels,
163
- mime_type="audio/mp3",
164
- )
165
-
166
- async for raw_line in resp.content:
167
- line = raw_line.strip()
168
- if not line:
169
- continue
170
-
171
- data = json.loads(line.decode())
172
- audio_b64 = data.get("audio")
173
- if audio_b64:
174
- output_emitter.push(base64.b64decode(audio_b64))
175
-
176
- output_emitter.flush()
177
- except asyncio.TimeoutError:
178
- raise APITimeoutError() from None
179
- except Exception as e:
180
- raise APIConnectionError() from e