livekit-plugins-hume 1.0.23__tar.gz → 1.1.0__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.

Potentially problematic release.


This version of livekit-plugins-hume might be problematic.

.gitignore
@@ -146,6 +146,9 @@ venv.bak/
 .dmypy.json
 dmypy.json
 
+# trunk
+.trunk/
+
 # Pyre type checker
 .pyre/
 
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: livekit-plugins-hume
-Version: 1.0.23
+Version: 1.1.0
 Summary: Hume TTS plugin for LiveKit agents
 Project-URL: Documentation, https://docs.livekit.io
 Project-URL: Website, https://livekit.io/
@@ -17,8 +17,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0
 Requires-Dist: aiohttp>=3.8.0
-Requires-Dist: hume>=0.8.3
-Requires-Dist: livekit-agents>=1.0.23
+Requires-Dist: livekit-agents>=1.1.0
 Description-Content-Type: text/markdown
 
 # Hume AI TTS plugin for LiveKit Agents
livekit/plugins/hume/__init__.py
@@ -19,25 +19,12 @@ See https://docs.livekit.io/agents/integrations/tts/hume/ for more information.
 
 from __future__ import annotations
 
-__version__ = "1.0.0"
-
-# make imports available
-from hume.tts import (
-    Format,
-    PostedContext,
-    PostedUtteranceVoice,
-)
 from livekit.agents import Plugin
 
-from .tts import TTS
+from .tts import TTS, PostedContext, PostedUtterance
+from .version import __version__
 
-# all exports
-__all__ = [
-    "TTS",
-    "Format",
-    "PostedContext",
-    "PostedUtteranceVoice",
-]
+__all__ = ["TTS", "PostedContext", "PostedUtterance"]
 
 
 class HumeAIPlugin(Plugin):
@@ -55,12 +42,3 @@ __pdoc__ = {}
 
 for n in NOT_IN_ALL:
     __pdoc__[n] = False
-
-# Cleanup docs of unexported modules
-_module = dir()
-NOT_IN_ALL = [m for m in _module if m not in __all__]
-
-__pdoc__ = {}
-
-for n in NOT_IN_ALL:
-    __pdoc__[n] = False
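
With this change the plugin's public exports no longer depend on the hume SDK: PostedContext and PostedUtterance are re-exported from the plugin's own tts module, Format and PostedUtteranceVoice are dropped, and __version__ is read from version.py. A minimal sketch of the new import surface (the livekit.plugins.hume path is assumed from LiveKit's plugin naming convention, not shown in this diff):

from livekit.plugins import hume  # assumed import path

# the re-exported types are plain TypedDicts in 1.1.0, so no hume SDK import is needed
utt: hume.PostedUtterance = {"text": "Hello!", "speed": 1.0}
ctx: hume.PostedContext = {"utterances": [utt]}

print(hume.__version__)  # "1.1.0", now sourced from version.py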
livekit/plugins/hume/tts.py (new in 1.1.0)
@@ -0,0 +1,180 @@
+# Copyright 2023 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import asyncio
+import base64
+import json
+import os
+from dataclasses import dataclass, replace
+from typing import Any, TypedDict
+
+import aiohttp
+
+from livekit.agents import APIConnectionError, APIConnectOptions, APITimeoutError, tts, utils
+from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
+from livekit.agents.utils import is_given
+
+API_AUTH_HEADER = "X-Hume-Api-Key"
+STREAM_PATH = "/v0/tts/stream/json"
+DEFAULT_BASE_URL = "https://api.hume.ai"
+
+
+class PostedUtterance(TypedDict, total=False):
+    text: str
+    description: str
+    voice: dict[str, Any]
+    speed: float
+    trailing_silence: float
+
+
+class PostedContext(TypedDict, total=False):
+    utterances: list[PostedUtterance]
+
+
+@dataclass
+class _TTSOptions:
+    api_key: str
+    utterance_options: PostedUtterance
+    context: PostedContext | None
+    sample_rate: int
+    split_utterances: bool
+    instant_mode: bool
+    base_url: str
+
+    def http_url(self, path: str) -> str:
+        return f"{self.base_url}{path}"
+
+
+class TTS(tts.TTS):
+    def __init__(
+        self,
+        *,
+        api_key: str | None = None,
+        utterance_options: NotGivenOr[PostedUtterance] = NOT_GIVEN,
+        split_utterances: bool = True,
+        instant_mode: bool = True,
+        sample_rate: int = 24000,
+        base_url: str = DEFAULT_BASE_URL,
+        http_session: aiohttp.ClientSession | None = None,
+    ):
+        super().__init__(
+            capabilities=tts.TTSCapabilities(streaming=True),
+            sample_rate=sample_rate,
+            num_channels=1,
+        )
+        key = api_key or os.environ.get("HUME_API_KEY")
+        if not key:
+            raise ValueError("Hume API key is required via api_key or HUME_API_KEY env var")
+
+        default_utterance: PostedUtterance = {
+            "speed": 1.0,
+            "trailing_silence": 0.35,
+        }
+        if is_given(utterance_options):
+            default_utterance.update(utterance_options)
+
+        self._opts = _TTSOptions(
+            api_key=key,
+            utterance_options=default_utterance,
+            context=None,
+            sample_rate=sample_rate,
+            split_utterances=split_utterances,
+            instant_mode=instant_mode,
+            base_url=base_url,
+        )
+        self._session = http_session
+
+    def _ensure_session(self) -> aiohttp.ClientSession:
+        if not self._session:
+            self._session = utils.http_context.http_session()
+
+        return self._session
+
+    def update_options(
+        self,
+        *,
+        utterance_options: NotGivenOr[PostedUtterance] = NOT_GIVEN,
+        context: NotGivenOr[PostedContext] = NOT_GIVEN,
+        split_utterances: NotGivenOr[bool] = NOT_GIVEN,
+        instant_mode: NotGivenOr[bool] = NOT_GIVEN,
+    ) -> None:
+        if is_given(utterance_options):
+            self._opts.utterance_options = utterance_options
+        if is_given(context):
+            self._opts.context = context
+        if is_given(split_utterances):
+            self._opts.split_utterances = split_utterances
+        if is_given(instant_mode):
+            self._opts.instant_mode = instant_mode
+
+    def synthesize(
+        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
+    ) -> tts.ChunkedStream:
+        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
+
+
+class ChunkedStream(tts.ChunkedStream):
+    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
+        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
+        self._tts: TTS = tts
+        self._opts = replace(tts._opts)
+
+    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
+        utterance: PostedUtterance = {"text": self._input_text}
+        utterance.update(self._opts.utterance_options)
+
+        payload: dict[str, Any] = {
+            "utterances": [utterance],
+            "split_utterances": self._opts.split_utterances,
+            "strip_headers": True,
+            "instant_mode": self._opts.instant_mode,
+            "format": {"type": "mp3"},
+        }
+        if self._opts.context:
+            payload["context"] = self._opts.context
+
+        try:
+            async with self._tts._ensure_session().post(
+                self._opts.http_url(STREAM_PATH),
+                headers={API_AUTH_HEADER: self._opts.api_key},
+                json=payload,
+                timeout=aiohttp.ClientTimeout(total=None, sock_connect=self._conn_options.timeout),
+                # large read_bufsize to avoid `ValueError: Chunk too big`
+                read_bufsize=10 * 1024 * 1024,
+            ) as resp:
+                resp.raise_for_status()
+                output_emitter.initialize(
+                    request_id=utils.shortuuid(),
+                    sample_rate=self._opts.sample_rate,
+                    num_channels=self._tts.num_channels,
+                    mime_type="audio/mp3",
+                )
+
+                async for raw_line in resp.content:
+                    line = raw_line.strip()
+                    if not line:
+                        continue
+
+                    data = json.loads(line.decode())
+                    audio_b64 = data.get("audio")
+                    if audio_b64:
+                        output_emitter.push(base64.b64decode(audio_b64))
+
+                output_emitter.flush()
+        except asyncio.TimeoutError:
+            raise APITimeoutError() from None
+        except Exception as e:
+            raise APIConnectionError() from e
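
The new module posts to Hume's JSON streaming endpoint (/v0/tts/stream/json) directly over aiohttp, pushes the base64 audio field of each JSON line into the MP3 emitter, and keeps every per-utterance setting in a single PostedUtterance dict. A hedged configuration sketch (the livekit.plugins.hume import path and the option values are assumptions, not part of this diff):

from livekit.plugins import hume  # assumed import path

tts = hume.TTS(
    # api_key falls back to the HUME_API_KEY environment variable
    utterance_options={"description": "calm, warm narrator"},  # merged over the speed/trailing_silence defaults
    instant_mode=True,  # lower time-to-first-chunk; per Hume, billed ~10% higher
)

# steer prosody across later requests without recreating the client
tts.update_options(
    context={"utterances": [{"text": "Previously we discussed the weather."}]},
    split_utterances=False,
)

Each synthesize() call copies the current options via dataclasses.replace, so updates made mid-session do not affect requests already in flight.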
livekit/plugins/hume/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.0.23"
+__version__ = "1.1.0"
pyproject.toml
@@ -24,8 +24,7 @@ classifiers = [
 ]
 dependencies = [
     "aiohttp>=3.8.0",
-    "livekit-agents>=1.0.23",
-    "hume>=0.8.3"
+    "livekit-agents>=1.1.0",
 ]
 
 [project.urls]
livekit/plugins/hume/tts.py (removed 1.0.23 implementation)
@@ -1,283 +0,0 @@
-# Copyright 2023 LiveKit, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import asyncio
-import base64
-import os
-from dataclasses import dataclass
-
-import aiohttp
-
-from hume import AsyncHumeClient
-from hume.tts import Format, FormatWav, PostedContext, PostedUtterance, PostedUtteranceVoice
-from livekit.agents import (
-    APIConnectionError,
-    APIConnectOptions,
-    APITimeoutError,
-    tokenize,
-    tts,
-    utils,
-)
-from livekit.agents.types import (
-    DEFAULT_API_CONNECT_OPTIONS,
-    NOT_GIVEN,
-    NotGivenOr,
-)
-from livekit.agents.utils import is_given
-
-# Default audio settings
-DEFAULT_SAMPLE_RATE = 48000
-DEFAULT_NUM_CHANNELS = 1
-
-
-@dataclass
-class _TTSOptions:
-    """TTS options for Hume API"""
-
-    api_key: str
-    voice: PostedUtteranceVoice | None
-    description: str | None
-    speed: float | None
-    context: PostedContext | None
-    format: Format
-    strip_headers: bool
-    instant_mode: bool
-    word_tokenizer: tokenize.WordTokenizer
-
-
-class TTS(tts.TTS):
-    def __init__(
-        self,
-        *,
-        voice: NotGivenOr[PostedUtteranceVoice] = NOT_GIVEN,
-        description: NotGivenOr[str] = NOT_GIVEN,
-        speed: NotGivenOr[float] = NOT_GIVEN,
-        context: NotGivenOr[PostedContext] = NOT_GIVEN,
-        format: NotGivenOr[Format] = NOT_GIVEN,
-        instant_mode: bool = False,
-        strip_headers: bool = True,
-        api_key: NotGivenOr[str] = NOT_GIVEN,
-        word_tokenizer: tokenize.WordTokenizer | None = None,
-        http_session: aiohttp.ClientSession | None = None,
-    ) -> None:
-        """Initialize the Hume TTS client.
-
-        See https://dev.hume.ai/reference/text-to-speech-tts/synthesize-json-streaming for API doc
-
-        Args:
-            voice (NotGivenOr[PostedUtteranceVoice]): The voice, specified by name or id, to be
-                used. When no voice is specified, a novel voice will be generated based on the
-                text and optionally provided description.
-            description (NotGivenOr[str]): Natural language instructions describing how the
-                synthesized speech should sound, including but not limited to tone, intonation,
-                pacing, and accent. If a Voice is specified in the request, this description
-                serves as acting instructions. If no Voice is specified, a new voice is generated
-                based on this description.
-            speed (NotGivenOr[float]): Adjusts the relative speaking rate on a non-linear scale
-                from 0.25 (much slower) to 3.0 (much faster), where 1.0 represents normal speaking
-                pace.
-            context (NotGivenOr[PostedContext]): Utterances to use as context for generating
-                consistent speech style and prosody across multiple requests.
-            format (NotGivenOr[Format]): Specifies the output audio file format (WAV, MP3 or PCM).
-                Defaults to WAV format.
-            instant_mode (bool): Enables ultra-low latency streaming, reducing time to first chunk.
-                Recommended for real-time applications. Only for streaming endpoints.
-                With this enabled, requests incur 10% higher cost. Defaults to False.
-            strip_headers (bool): If enabled, the audio for all the chunks of a generation,
-                once concatenated together, will constitute a single audio file.
-                If disabled, each chunk's audio will be its own audio file, each with its headers.
-            api_key (NotGivenOr[str]): Hume API key for authentication. If not provided,
-                will attempt to read from HUME_API_KEY environment variable.
-            word_tokenizer (tokenize.WordTokenizer | None): Custom word tokenizer to use for text.
-                If None, a basic word tokenizer will be used.
-            http_session (aiohttp.ClientSession | None): Optional HTTP session for API requests.
-                If None, a new session will be created.
-        """
-
-        super().__init__(
-            capabilities=tts.TTSCapabilities(
-                streaming=False,
-            ),
-            sample_rate=DEFAULT_SAMPLE_RATE,
-            num_channels=DEFAULT_NUM_CHANNELS,
-        )
-
-        self._api_key = api_key if is_given(api_key) else os.environ.get("HUME_API_KEY")
-        if not self._api_key:
-            raise ValueError(
-                "Hume API key is required, either as argument or set HUME_API_KEY env variable"
-            )
-
-        if not word_tokenizer:
-            word_tokenizer = tokenize.basic.WordTokenizer(ignore_punctuation=False)
-
-        self._opts = _TTSOptions(
-            voice=voice if is_given(voice) else None,
-            description=description if is_given(description) else None,
-            speed=speed if is_given(speed) else None,
-            context=context if is_given(context) else None,
-            format=format if is_given(format) else FormatWav(),
-            api_key=self._api_key,
-            strip_headers=strip_headers,
-            instant_mode=instant_mode,
-            word_tokenizer=word_tokenizer,
-        )
-
-        self._client = AsyncHumeClient(api_key=self._api_key)
-        self._session = http_session
-
-    def _ensure_session(self) -> aiohttp.ClientSession:
-        if not self._session:
-            self._session = utils.http_context.http_session()
-        return self._session
-
-    def update_options(
-        self,
-        *,
-        voice: NotGivenOr[PostedUtteranceVoice] = NOT_GIVEN,
-        description: NotGivenOr[str] = NOT_GIVEN,
-        speed: NotGivenOr[float] = NOT_GIVEN,
-        context: NotGivenOr[PostedContext] = NOT_GIVEN,
-        format: NotGivenOr[Format] = NOT_GIVEN,
-        instant_mode: NotGivenOr[bool] = NOT_GIVEN,
-        strip_headers: NotGivenOr[bool] = NOT_GIVEN,
-    ) -> None:
-        """Update TTS options for synthesizing speech.
-
-        Args:
-            voice (NotGivenOr[PostedUtteranceVoice]): The voice, specified by name or id, to be
-                used. When no voice is specified, a novel voice will be generated based on the
-                text and optionally provided description.
-            description (NotGivenOr[str]): Natural language instructions describing how the
-                synthesized speech should sound, including but not limited to tone, intonation,
-                pacing, and accent. If a Voice is specified in the request, this description
-                serves as acting instructions. If no Voice is specified, a new voice is generated
-                based on this description.
-            speed (NotGivenOr[float]): Adjusts the relative speaking rate on a non-linear scale
-                from 0.25 (much slower) to 3.0 (much faster), where 1.0 represents normal speaking
-                pace.
-            context (NotGivenOr[PostedContext]): Utterances to use as context for generating
-                consistent speech style and prosody across multiple requests.
-            format (NotGivenOr[Format]): Specifies the output audio file format (WAV, MP3 or PCM).
-            instant_mode (NotGivenOr[bool]): Enables ultra-low latency streaming.
-                Reduces time to first audio chunk, recommended for real-time applications.
-                Note: Incurs 10% higher cost when enabled.
-            strip_headers (NotGivenOr[bool]): If enabled, the audio for the chunks of a generation,
-                once concatenated together, will constitute a single audio file.
-                If disabled, each chunk's audio will be its own audio file, each with its headers.
-        """
-
-        if is_given(voice):
-            self._opts.voice = voice
-        if is_given(description):
-            self._opts.description = description
-        if is_given(speed):
-            self._opts.speed = speed
-        if is_given(format):
-            self._opts.format = format
-        if is_given(context):
-            self._opts.context = context
-        if is_given(instant_mode):
-            self._opts.instant_mode = instant_mode
-        if is_given(strip_headers):
-            self._opts.strip_headers = strip_headers
-
-    def synthesize(
-        self,
-        text: str,
-        *,
-        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
-    ) -> ChunkedStream:
-        return ChunkedStream(
-            tts=self,
-            input_text=text,
-            conn_options=conn_options,
-            opts=self._opts,
-        )
-
-
-class ChunkedStream(tts.ChunkedStream):
-    """Stream for Hume TTS JSON streaming API."""
-
-    def __init__(
-        self,
-        *,
-        tts: TTS,
-        input_text: str,
-        opts: _TTSOptions,
-        conn_options: APIConnectOptions,
-    ) -> None:
-        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
-        self._opts = opts
-        self._client = tts._client
-
-    async def _run(self) -> None:
-        request_id = utils.shortuuid()
-
-        decoder = utils.codecs.AudioStreamDecoder(
-            sample_rate=DEFAULT_SAMPLE_RATE,
-            num_channels=DEFAULT_NUM_CHANNELS,
-        )
-
-        decode_task: asyncio.Task | None = None
-
-        try:
-
-            async def _decode_loop():
-                utterance_options = {
-                    "voice": self._opts.voice,
-                    "description": self._opts.description,
-                    "speed": self._opts.speed,
-                }
-
-                utterance_kwargs = {
-                    "text": self._input_text,
-                    **{k: v for k, v in utterance_options.items() if v is not None},
-                }
-
-                try:
-                    utterance = PostedUtterance(**utterance_kwargs)
-
-                    async for chunk in self._client.tts.synthesize_json_streaming(
-                        utterances=[utterance],
-                        context=self._opts.context,
-                        format=self._opts.format,
-                        instant_mode=self._opts.instant_mode,
-                        strip_headers=self._opts.strip_headers,
-                    ):
-                        decoder.push(base64.b64decode(chunk.audio))
-
-                finally:
-                    decoder.end_input()
-
-            decode_task = asyncio.create_task(_decode_loop())
-            emitter = tts.SynthesizedAudioEmitter(
-                event_ch=self._event_ch,
-                request_id=request_id,
-            )
-            async for frame in decoder:
-                emitter.push(frame)
-
-            emitter.flush()
-
-        except asyncio.TimeoutError:
-            raise APITimeoutError() from None
-        except Exception as e:
-            raise APIConnectionError() from e
-        finally:
-            if decode_task:
-                await utils.aio.gracefully_cancel(decode_task)
-            await decoder.aclose()
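
For reference, the removed 1.0.23 module wrapped the official hume SDK (AsyncHumeClient.tts.synthesize_json_streaming), produced 48 kHz WAV decoded through AudioStreamDecoder, and declared streaming=False; the 1.1.0 rewrite above drops the SDK, declares streaming=True, and streams 24 kHz MP3. Callers that passed voice, description, or speed as separate keyword arguments now group them into utterance_options. A hedged before/after sketch (the voice dict fields are assumptions based on Hume's REST API, not this diff):

from livekit.plugins import hume  # assumed import path

# 1.0.23 (removed above): separate kwargs typed against the hume SDK
#   tts = hume.TTS(voice=PostedUtteranceVoice(...), description="calm", speed=1.1)

# 1.1.0: one PostedUtterance dict, merged over the plugin's defaults
tts = hume.TTS(
    utterance_options={
        "voice": {"name": "Ava Song"},  # hypothetical voice name
        "description": "calm",
        "speed": 1.1,
    },
)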