livekit-plugins-hume 1.0.23__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

--- a/livekit/plugins/hume/__init__.py
+++ b/livekit/plugins/hume/__init__.py
@@ -19,25 +19,12 @@ See https://docs.livekit.io/agents/integrations/tts/hume/ for more information.
 
 from __future__ import annotations
 
-__version__ = "1.0.0"
-
-# make imports available
-from hume.tts import (
-    Format,
-    PostedContext,
-    PostedUtteranceVoice,
-)
 from livekit.agents import Plugin
 
-from .tts import TTS
+from .tts import TTS, PostedContext, PostedUtterance
+from .version import __version__
 
-# all exports
-__all__ = [
-    "TTS",
-    "Format",
-    "PostedContext",
-    "PostedUtteranceVoice",
-]
+__all__ = ["TTS", "PostedContext", "PostedUtterance"]
 
 
 class HumeAIPlugin(Plugin):
@@ -55,12 +42,3 @@ __pdoc__ = {}
 
 for n in NOT_IN_ALL:
     __pdoc__[n] = False
-
-# Cleanup docs of unexported modules
-_module = dir()
-NOT_IN_ALL = [m for m in _module if m not in __all__]
-
-__pdoc__ = {}
-
-for n in NOT_IN_ALL:
-    __pdoc__[n] = False
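The net effect for downstream imports: `Format` and `PostedUtteranceVoice` are no longer re-exported, and `PostedContext`/`PostedUtterance` now come from the plugin's own `tts` module as `TypedDict`s rather than from the `hume` SDK (see the tts.py hunk below). A minimal sketch of what a 1.1.0 import site looks like, assuming the new wheel is installed:

```python
# livekit-plugins-hume 1.1.0: the types are exported by the plugin itself
from livekit.plugins.hume import PostedContext, PostedUtterance

# PostedUtterance / PostedContext are plain TypedDicts, so they are built as dicts
utterance: PostedUtterance = {"description": "calm, measured narrator", "speed": 1.0}
context: PostedContext = {"utterances": [{"text": "Previously synthesized line."}]}
```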
--- a/livekit/plugins/hume/tts.py
+++ b/livekit/plugins/hume/tts.py
@@ -16,268 +16,165 @@ from __future__ import annotations
 
 import asyncio
 import base64
+import json
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
+from typing import Any, TypedDict
 
 import aiohttp
 
-from hume import AsyncHumeClient
-from hume.tts import Format, FormatWav, PostedContext, PostedUtterance, PostedUtteranceVoice
-from livekit.agents import (
-    APIConnectionError,
-    APIConnectOptions,
-    APITimeoutError,
-    tokenize,
-    tts,
-    utils,
-)
-from livekit.agents.types import (
-    DEFAULT_API_CONNECT_OPTIONS,
-    NOT_GIVEN,
-    NotGivenOr,
-)
+from livekit.agents import APIConnectionError, APIConnectOptions, APITimeoutError, tts, utils
+from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
 from livekit.agents.utils import is_given
 
-# Default audio settings
-DEFAULT_SAMPLE_RATE = 48000
-DEFAULT_NUM_CHANNELS = 1
+API_AUTH_HEADER = "X-Hume-Api-Key"
+STREAM_PATH = "/v0/tts/stream/json"
+DEFAULT_BASE_URL = "https://api.hume.ai"
+
+
+class PostedUtterance(TypedDict, total=False):
+    text: str
+    description: str
+    voice: dict[str, Any]
+    speed: float
+    trailing_silence: float
+
+
+class PostedContext(TypedDict, total=False):
+    utterances: list[PostedUtterance]
 
 
 @dataclass
 class _TTSOptions:
-    """TTS options for Hume API"""
-
     api_key: str
-    voice: PostedUtteranceVoice | None
-    description: str | None
-    speed: float | None
+    utterance_options: PostedUtterance
     context: PostedContext | None
-    format: Format
-    strip_headers: bool
+    sample_rate: int
+    split_utterances: bool
     instant_mode: bool
-    word_tokenizer: tokenize.WordTokenizer
+    base_url: str
+
+    def http_url(self, path: str) -> str:
+        return f"{self.base_url}{path}"
 
 
 class TTS(tts.TTS):
     def __init__(
         self,
         *,
-        voice: NotGivenOr[PostedUtteranceVoice] = NOT_GIVEN,
-        description: NotGivenOr[str] = NOT_GIVEN,
-        speed: NotGivenOr[float] = NOT_GIVEN,
-        context: NotGivenOr[PostedContext] = NOT_GIVEN,
-        format: NotGivenOr[Format] = NOT_GIVEN,
-        instant_mode: bool = False,
-        strip_headers: bool = True,
-        api_key: NotGivenOr[str] = NOT_GIVEN,
-        word_tokenizer: tokenize.WordTokenizer | None = None,
+        api_key: str | None = None,
+        utterance_options: NotGivenOr[PostedUtterance] = NOT_GIVEN,
+        split_utterances: bool = True,
+        instant_mode: bool = True,
+        sample_rate: int = 24000,
+        base_url: str = DEFAULT_BASE_URL,
         http_session: aiohttp.ClientSession | None = None,
-    ) -> None:
-        """Initialize the Hume TTS client.
-
-        See https://dev.hume.ai/reference/text-to-speech-tts/synthesize-json-streaming for API doc
-
-        Args:
-            voice (NotGivenOr[PostedUtteranceVoice]): The voice, specified by name or id, to be
-                used. When no voice is specified, a novel voice will be generated based on the
-                text and optionally provided description.
-            description (NotGivenOr[str]): Natural language instructions describing how the
-                synthesized speech should sound, including but not limited to tone, intonation,
-                pacing, and accent. If a Voice is specified in the request, this description
-                serves as acting instructions. If no Voice is specified, a new voice is generated
-                based on this description.
-            speed: (NotGivenOr[float]): Adjusts the relative speaking rate on a non-linear scale
-                from 0.25 (much slower) to 3.0 (much faster), where 1.0 represents normal speaking
-                pace.
-            context (NotGivenOr[PostedContext]): Utterances to use as context for generating
-                consistent speech style and prosody across multiple requests.
-            format (NotGivenOr[Format]): Specifies the output audio file format (WAV, MP3 or PCM).
-                Defaults to WAV format.
-            instant_mode (bool): Enables ultra-low latency streaming, reducing time to first chunk.
-                Recommended for real-time applications. Only for streaming endpoints.
-                With this enabled, requests incur 10% higher cost. Defaults to False.
-            strip_headers (bool): If enabled, the audio for all the chunks of a generation.
-                Once concatenated together, will constitute a single audio file.
-                If disabled, each chunk’s audio will be its own audio file, each with its headers.
-            api_key (NotGivenOr[str]): Hume API key for authentication. If not provided,
-                will attempt to read from HUME_API_KEY environment variable.
-            word_tokenizer (tokenize.WordTokenizer | None): Custom word tokenizer to use for text.
-                If None, a basic word tokenizer will be used.
-            http_session (aiohttp.ClientSession | None): Optional HTTP session for API requests.
-                If None, a new session will be created.
-        """
-
+    ):
         super().__init__(
-            capabilities=tts.TTSCapabilities(
-                streaming=False,
-            ),
-            sample_rate=DEFAULT_SAMPLE_RATE,
-            num_channels=DEFAULT_NUM_CHANNELS,
+            capabilities=tts.TTSCapabilities(streaming=True),
+            sample_rate=sample_rate,
+            num_channels=1,
         )
+        key = api_key or os.environ.get("HUME_API_KEY")
+        if not key:
+            raise ValueError("Hume API key is required via api_key or HUME_API_KEY env var")
 
-        self._api_key = api_key if is_given(api_key) else os.environ.get("HUME_API_KEY")
-        if not self._api_key:
-            raise ValueError(
-                "Hume API key is required, either as argument or set HUME_API_KEY env variable"
-            )
-
-        if not word_tokenizer:
-            word_tokenizer = tokenize.basic.WordTokenizer(ignore_punctuation=False)
+        default_utterance: PostedUtterance = {
+            "speed": 1.0,
+            "trailing_silence": 0.35,
+        }
+        if is_given(utterance_options):
+            default_utterance.update(utterance_options)
 
         self._opts = _TTSOptions(
-            voice=voice if is_given(voice) else None,
-            description=description if is_given(description) else None,
-            speed=speed if is_given(speed) else None,
-            context=context if is_given(context) else None,
-            format=format if is_given(format) else FormatWav(),
-            api_key=self._api_key,
-            strip_headers=strip_headers,
+            api_key=key,
+            utterance_options=default_utterance,
+            context=None,
+            sample_rate=sample_rate,
+            split_utterances=split_utterances,
             instant_mode=instant_mode,
-            word_tokenizer=word_tokenizer,
+            base_url=base_url,
         )
-
-        self._client = AsyncHumeClient(api_key=self._api_key)
         self._session = http_session
 
     def _ensure_session(self) -> aiohttp.ClientSession:
         if not self._session:
             self._session = utils.http_context.http_session()
+
         return self._session
 
     def update_options(
         self,
         *,
-        voice: NotGivenOr[PostedUtteranceVoice] = NOT_GIVEN,
-        description: NotGivenOr[str] = NOT_GIVEN,
-        speed: NotGivenOr[float] = NOT_GIVEN,
+        utterance_options: NotGivenOr[PostedUtterance] = NOT_GIVEN,
         context: NotGivenOr[PostedContext] = NOT_GIVEN,
-        format: NotGivenOr[Format] = NOT_GIVEN,
+        split_utterances: NotGivenOr[bool] = NOT_GIVEN,
         instant_mode: NotGivenOr[bool] = NOT_GIVEN,
-        strip_headers: NotGivenOr[bool] = NOT_GIVEN,
     ) -> None:
-        """Update TTS options for synthesizing speech.
-
-        Args:
-            voice (NotGivenOr[PostedUtteranceVoice]): The voice, specified by name or id, to be
-                used. When no voice is specified, a novel voice will be generated based on the
-                text and optionally provided description.
-            description (NotGivenOr[str]): Natural language instructions describing how the
-                synthesized speech should sound, including but not limited to tone, intonation,
-                pacing, and accent. If a Voice is specified in the request, this description
-                serves as acting instructions. If no Voice is specified, a new voice is generated
-                based on this description.
-            speed: (NotGivenOr[float]): Adjusts the relative speaking rate on a non-linear scale
-                from 0.25 (much slower) to 3.0 (much faster), where 1.0 represents normal speaking
-                pace.
-            context (Optional[PostedContext]): Utterances to use as context for generating
-                consistent speech style and prosody across multiple requests.
-            format (NotGivenOr[Format]): Specifies the output audio file format (WAV, MP3 or PCM).
-            instant_mode (NotGivenOr[bool]): Enables ultra-low latency streaming.
-                Reduces time to first audio chunk, recommended for real-time applications.
-                Note: Incurs 10% higher cost when enabled.
-            strip_headers (NotGivenOr[bool]): If enabled, the audio for the chunks of a generation.
-                Once concatenated together, will constitute a single audio file.
-                If disabled, each chunk’s audio will be its own audio file, each with its headers.
-        """
-
-        if is_given(voice):
-            self._opts.voice = voice
-        if is_given(description):
-            self._opts.description = description
-        if is_given(speed):
-            self._opts.speed = speed
-        if is_given(format):
-            self._opts.format = format
-        if is_given(context):
+        if is_given(utterance_options):
+            self._opts.utterance_options = utterance_options
+        if is_given(context): #
             self._opts.context = context
+        if is_given(split_utterances):
+            self._opts.split_utterances = split_utterances
         if is_given(instant_mode):
             self._opts.instant_mode = instant_mode
-        if is_given(strip_headers):
-            self._opts.strip_headers = strip_headers
 
     def synthesize(
-        self,
-        text: str,
-        *,
-        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
-    ) -> ChunkedStream:
-        return ChunkedStream(
-            tts=self,
-            input_text=text,
-            conn_options=conn_options,
-            opts=self._opts,
-        )
+        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
+    ) -> tts.ChunkedStream:
+        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
 
 
 class ChunkedStream(tts.ChunkedStream):
-    """Stream for Hume TTS JSON streaming API."""
-
-    def __init__(
-        self,
-        *,
-        tts: TTS,
-        input_text: str,
-        opts: _TTSOptions,
-        conn_options: APIConnectOptions,
-    ) -> None:
+    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
         super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
-        self._opts = opts
-        self._client = tts._client
-
-    async def _run(self) -> None:
-        request_id = utils.shortuuid()
-
-        decoder = utils.codecs.AudioStreamDecoder(
-            sample_rate=DEFAULT_SAMPLE_RATE,
-            num_channels=DEFAULT_NUM_CHANNELS,
-        )
-
-        decode_task: asyncio.Task | None = None
+        self._tts: TTS = tts
+        self._opts = replace(tts._opts)
+
+    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
+        utterance: PostedUtterance = {"text": self._input_text}
+        utterance.update(self._opts.utterance_options)
+
+        payload: dict[str, Any] = {
+            "utterances": [utterance],
+            "split_utterances": self._opts.split_utterances,
+            "strip_headers": True,
+            "instant_mode": self._opts.instant_mode,
+            "format": {"type": "mp3"},
+        }
+        if self._opts.context:
+            payload["context"] = self._opts.context
 
         try:
-
-            async def _decode_loop():
-                utterance_options = {
-                    "voice": self._opts.voice,
-                    "description": self._opts.description,
-                    "speed": self._opts.speed,
-                }
-
-                utterance_kwargs = {
-                    "text": self._input_text,
-                    **{k: v for k, v in utterance_options.items() if v is not None},
-                }
-
-                try:
-                    utterance = PostedUtterance(**utterance_kwargs)
-
-                    async for chunk in self._client.tts.synthesize_json_streaming(
-                        utterances=[utterance],
-                        context=self._opts.context,
-                        format=self._opts.format,
-                        instant_mode=self._opts.instant_mode,
-                        strip_headers=self._opts.strip_headers,
-                    ):
-                        decoder.push(base64.b64decode(chunk.audio))
-
-                finally:
-                    decoder.end_input()
-
-            decode_task = asyncio.create_task(_decode_loop())
-            emitter = tts.SynthesizedAudioEmitter(
-                event_ch=self._event_ch,
-                request_id=request_id,
-            )
-            async for frame in decoder:
-                emitter.push(frame)
-
-            emitter.flush()
-
+            async with self._tts._ensure_session().post(
+                self._opts.http_url(STREAM_PATH),
+                headers={API_AUTH_HEADER: self._opts.api_key},
+                json=payload,
+                timeout=aiohttp.ClientTimeout(total=None, sock_connect=self._conn_options.timeout),
+                # large read_bufsize to avoid `ValueError: Chunk too big`
+                read_bufsize=10 * 1024 * 1024,
+            ) as resp:
+                resp.raise_for_status()
+                output_emitter.initialize(
+                    request_id=utils.shortuuid(),
+                    sample_rate=self._opts.sample_rate,
+                    num_channels=self._tts.num_channels,
+                    mime_type="audio/mp3",
+                )
+
+                async for raw_line in resp.content:
+                    line = raw_line.strip()
+                    if not line:
+                        continue
+
+                    data = json.loads(line.decode())
+                    audio_b64 = data.get("audio")
+                    if audio_b64:
+                        output_emitter.push(base64.b64decode(audio_b64))
+
+                output_emitter.flush()
         except asyncio.TimeoutError:
             raise APITimeoutError() from None
        except Exception as e:
             raise APIConnectionError() from e
-        finally:
-            if decode_task:
-                await utils.aio.gracefully_cancel(decode_task)
-            await decoder.aclose()
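Taken together, the constructor and `update_options` changes replace the old per-field arguments (`voice`, `description`, `speed`, `format`, `strip_headers`, `word_tokenizer`) with a single `utterance_options` dict plus `split_utterances`, and switch `instant_mode` on by default. A rough usage sketch against the 1.1.0 surface; the voice payload here is illustrative, since the plugin forwards it to Hume's REST API unmodified:

```python
import os

from livekit.plugins.hume import TTS, PostedUtterance

# voice and delivery settings now travel together in one PostedUtterance dict
opts: PostedUtterance = {
    "voice": {"name": "Example Narrator"},  # hypothetical voice; passed through as-is
    "speed": 1.1,
    "trailing_silence": 0.2,
}

hume_tts = TTS(
    api_key=os.environ.get("HUME_API_KEY"),  # falls back to the env var if omitted
    utterance_options=opts,
    instant_mode=True,   # now the default (1.0.x defaulted to False)
    sample_rate=24000,   # MP3 output; 1.0.x produced 48 kHz WAV
)

# later adjustments assign the whole utterance_options dict rather than merging it
hume_tts.update_options(utterance_options={"voice": opts["voice"], "speed": 0.9})
```

Note that `update_options(utterance_options=...)` replaces the dict wholesale, so any field you still want (here the voice) has to be repeated.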
--- a/livekit/plugins/hume/version.py
+++ b/livekit/plugins/hume/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.0.23"
+__version__ = "1.1.0"
--- a/livekit_plugins_hume-1.0.23.dist-info/METADATA
+++ b/livekit_plugins_hume-1.1.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: livekit-plugins-hume
-Version: 1.0.23
+Version: 1.1.0
 Summary: Hume TTS plugin for LiveKit agents
 Project-URL: Documentation, https://docs.livekit.io
 Project-URL: Website, https://livekit.io/
@@ -17,8 +17,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0
 Requires-Dist: aiohttp>=3.8.0
-Requires-Dist: hume>=0.8.3
-Requires-Dist: livekit-agents>=1.0.23
+Requires-Dist: livekit-agents>=1.1.0
 Description-Content-Type: text/markdown
 
 # Hume AI TTS plugin for LiveKit Agents
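Dropping `Requires-Dist: hume>=0.8.3` lines up with the tts.py rewrite above: instead of going through `AsyncHumeClient`, the plugin now POSTs directly to Hume's JSON streaming endpoint over aiohttp and decodes the base64 audio from each JSON line itself. A standalone sketch of that request/response shape, mirroring the constants and payload in the new tts.py (error handling omitted):

```python
import base64
import json
import os

import aiohttp


async def hume_tts_stream(text: str) -> bytes:
    """Collect the MP3 audio for `text` by reading Hume's line-delimited JSON stream."""
    payload = {
        "utterances": [{"text": text, "speed": 1.0, "trailing_silence": 0.35}],
        "split_utterances": True,
        "strip_headers": True,
        "instant_mode": True,
        "format": {"type": "mp3"},
    }
    audio = b""
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "https://api.hume.ai/v0/tts/stream/json",
            headers={"X-Hume-Api-Key": os.environ["HUME_API_KEY"]},
            json=payload,
            read_bufsize=10 * 1024 * 1024,  # same oversized buffer the plugin uses
        ) as resp:
            resp.raise_for_status()
            async for raw_line in resp.content:  # one JSON object per line
                line = raw_line.strip()
                if not line:
                    continue
                chunk = json.loads(line.decode())
                if chunk.get("audio"):
                    audio += base64.b64decode(chunk["audio"])
    return audio
```

Run it with `asyncio.run(hume_tts_stream("Hello"))`; in the plugin, the decoded bytes are pushed to the `AudioEmitter` instead of being concatenated.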
--- /dev/null
+++ b/livekit_plugins_hume-1.1.0.dist-info/RECORD
@@ -0,0 +1,8 @@
+livekit/plugins/hume/__init__.py,sha256=--F5e6CdoZM8eyw5ca-H-khoKdDJxdflwvrMCSwAHws,1250
+livekit/plugins/hume/log.py,sha256=TwpK1FOwgD6Jb0A2nl-9nIgi0q5qWo9HGDrDuV_2g0g,67
+livekit/plugins/hume/py.typed,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
+livekit/plugins/hume/tts.py,sha256=ZnVqxzzs75OpHe_YDMr5X_BgZlZRlQSiCYs0z1Yq5gg,6128
+livekit/plugins/hume/version.py,sha256=Pl1D4Jol4f5vcHwFlr83NvnRouDwUNzW3Vxxi0E2uEA,600
+livekit_plugins_hume-1.1.0.dist-info/METADATA,sha256=v2jQuMBnbp9uUwF9Tmwd8GCOXpc1DliK-0hYOe8Cbwk,1354
+livekit_plugins_hume-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+livekit_plugins_hume-1.1.0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- livekit/plugins/hume/__init__.py,sha256=uiMWP6s61RLTHIfjtXt1R70PmVgFFs3_Pc65soDI-Ts,1532
2
- livekit/plugins/hume/log.py,sha256=TwpK1FOwgD6Jb0A2nl-9nIgi0q5qWo9HGDrDuV_2g0g,67
3
- livekit/plugins/hume/models.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- livekit/plugins/hume/py.typed,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
5
- livekit/plugins/hume/tts.py,sha256=JGUDNeVzPttrErCSP783bayByGBZraAHAZf23RgEz9A,11251
6
- livekit/plugins/hume/version.py,sha256=RWLIYs1l9IicWuoeuZpELTqDyou3coUv7IHCj4188tc,601
7
- livekit_plugins_hume-1.0.23.dist-info/METADATA,sha256=gM36eJhQ77vDH64C6qygbt-0fPwJumlWheP9uV0BvvI,1383
8
- livekit_plugins_hume-1.0.23.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
9
- livekit_plugins_hume-1.0.23.dist-info/RECORD,,