livekit-plugins-hume 1.0.17__tar.gz → 1.0.19__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of livekit-plugins-hume might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-hume
3
- Version: 1.0.17
3
+ Version: 1.0.19
4
4
  Summary: Hume TTS plugin for LiveKit agents
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
@@ -17,8 +17,8 @@ Classifier: Topic :: Multimedia :: Sound/Audio
17
17
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
18
  Requires-Python: >=3.9.0
19
19
  Requires-Dist: aiohttp>=3.8.0
20
- Requires-Dist: hume
21
- Requires-Dist: livekit-agents>=1.0.17
20
+ Requires-Dist: hume>=0.8.3
21
+ Requires-Dist: livekit-agents>=1.0.19
22
22
  Description-Content-Type: text/markdown
23
23
 
24
24
  # LiveKit Plugins Hume AI TTS
@@ -20,9 +20,7 @@ __version__ = "1.0.0"
20
20
  from hume.tts import (
21
21
  Format,
22
22
  PostedContext,
23
- PostedUtterance,
24
- PostedUtteranceVoiceWithId,
25
- PostedUtteranceVoiceWithName,
23
+ PostedUtteranceVoice,
26
24
  )
27
25
  from livekit.agents import Plugin
28
26
 
@@ -32,10 +30,8 @@ from .tts import TTS
32
30
  __all__ = [
33
31
  "TTS",
34
32
  "Format",
35
- "PostedUtterance",
36
33
  "PostedContext",
37
- "PostedUtteranceVoiceWithName",
38
- "PostedUtteranceVoiceWithId",
34
+ "PostedUtteranceVoice",
39
35
  ]
40
36
 
41
37
 
@@ -22,7 +22,7 @@ from dataclasses import dataclass
22
22
  import aiohttp
23
23
 
24
24
  from hume import AsyncHumeClient
25
- from hume.tts import Format, FormatWav, PostedContext, PostedUtterance, PostedUtteranceVoiceWithName
25
+ from hume.tts import Format, FormatWav, PostedContext, PostedUtterance, PostedUtteranceVoice
26
26
  from livekit.agents import (
27
27
  APIConnectionError,
28
28
  APIConnectOptions,
@@ -39,31 +39,21 @@ from livekit.agents.types import (
39
39
  from livekit.agents.utils import is_given
40
40
 
41
41
  # Default audio settings
42
- DEFAULT_SAMPLE_RATE = 24000
42
+ DEFAULT_SAMPLE_RATE = 48000
43
43
  DEFAULT_NUM_CHANNELS = 1
44
44
 
45
- # Default TTS settings
46
- DEFAULT_VOICE = PostedUtteranceVoiceWithName(name="Colton Rivers", provider="HUME_AI")
47
-
48
- # text is required in PostedUtterance but it is declared as an empty string
49
- # it will be overwritten when input tokens are received
50
- DEFAULT_UTTERANCE = PostedUtterance(
51
- voice=DEFAULT_VOICE, speed=1, trailing_silence=0.35, description="", text=""
52
- )
53
-
54
45
 
55
46
  @dataclass
56
47
  class _TTSOptions:
57
48
  """TTS options for Hume API"""
58
49
 
59
50
  api_key: str
60
- utterance_options: PostedUtterance
51
+ voice: PostedUtteranceVoice | None
52
+ description: str | None
53
+ speed: float | None
61
54
  context: PostedContext | None
62
55
  format: Format
63
- sample_rate: int
64
- split_utterances: bool
65
56
  strip_headers: bool
66
- num_generations: int
67
57
  instant_mode: bool
68
58
  word_tokenizer: tokenize.WordTokenizer
69
59
 
@@ -72,35 +62,37 @@ class TTS(tts.TTS):
72
62
  def __init__(
73
63
  self,
74
64
  *,
75
- utterance_options: NotGivenOr[PostedUtterance] = NOT_GIVEN,
65
+ voice: NotGivenOr[PostedUtteranceVoice] = NOT_GIVEN,
66
+ description: NotGivenOr[str] = NOT_GIVEN,
67
+ speed: NotGivenOr[float] = NOT_GIVEN,
76
68
  context: NotGivenOr[PostedContext] = NOT_GIVEN,
77
69
  format: NotGivenOr[Format] = NOT_GIVEN,
78
- split_utterances: bool = False,
79
- num_generations: int = 1,
80
70
  instant_mode: bool = False,
81
71
  strip_headers: bool = True,
82
72
  api_key: NotGivenOr[str] = NOT_GIVEN,
83
73
  word_tokenizer: tokenize.WordTokenizer | None = None,
84
74
  http_session: aiohttp.ClientSession | None = None,
85
- sample_rate: int = 24000,
86
75
  ) -> None:
87
76
  """Initialize the Hume TTS client.
88
77
 
89
78
  See https://dev.hume.ai/reference/text-to-speech-tts/synthesize-json-streaming for API doc
90
79
 
91
80
  Args:
92
- utterance_options (NotGivenOr[PostedUtterance]): Default options for utterances,
93
- including description, voice, and delivery controls.
81
+ voice (NotGivenOr[PostedUtteranceVoice]): The voice, specified by name or id, to be
82
+ used. When no voice is specified, a novel voice will be generated based on the
83
+ text and optionally provided description.
84
+ description (NotGivenOr[str]): Natural language instructions describing how the
85
+ synthesized speech should sound, including but not limited to tone, intonation,
86
+ pacing, and accent. If a Voice is specified in the request, this description
87
+ serves as acting instructions. If no Voice is specified, a new voice is generated
88
+ based on this description.
89
+ speed (NotGivenOr[float]): Adjusts the relative speaking rate on a non-linear scale
90
+ from 0.25 (much slower) to 3.0 (much faster), where 1.0 represents normal speaking
91
+ pace.
94
92
  context (NotGivenOr[PostedContext]): Utterances to use as context for generating
95
93
  consistent speech style and prosody across multiple requests.
96
94
  format (NotGivenOr[Format]): Specifies the output audio file format (WAV, MP3 or PCM).
97
95
  Defaults to WAV format.
98
- split_utterances (bool): Controls how audio output is segmented in the response.
99
- When enabled (True), input utterances are split into natural-sounding segments.
100
- When disabled (False), maintains one-to-one mapping between input and output.
101
- Defaults to False.
102
- num_generations (int): Number of generations of the audio to produce.
103
- Must be between 1 and 5. Defaults to 1.
104
96
  instant_mode (bool): Enables ultra-low latency streaming, reducing time to first chunk.
105
97
  Recommended for real-time applications. Only for streaming endpoints.
106
98
  With this enabled, requests incur 10% higher cost. Defaults to False.
@@ -113,14 +105,13 @@ class TTS(tts.TTS):
113
105
  If None, a basic word tokenizer will be used.
114
106
  http_session (aiohttp.ClientSession | None): Optional HTTP session for API requests.
115
107
  If None, a new session will be created.
116
- sample_rate (int): Audio sample rate in Hz. Defaults to 24000.
117
108
  """
118
109
 
119
110
  super().__init__(
120
111
  capabilities=tts.TTSCapabilities(
121
112
  streaming=False,
122
113
  ),
123
- sample_rate=sample_rate,
114
+ sample_rate=DEFAULT_SAMPLE_RATE,
124
115
  num_channels=DEFAULT_NUM_CHANNELS,
125
116
  )
126
117
 
@@ -134,15 +125,12 @@ class TTS(tts.TTS):
134
125
  word_tokenizer = tokenize.basic.WordTokenizer(ignore_punctuation=False)
135
126
 
136
127
  self._opts = _TTSOptions(
137
- utterance_options=utterance_options
138
- if is_given(utterance_options)
139
- else DEFAULT_UTTERANCE,
128
+ voice=voice if is_given(voice) else None,
129
+ description=description if is_given(description) else None,
130
+ speed=speed if is_given(speed) else None,
140
131
  context=context if is_given(context) else None,
141
132
  format=format if is_given(format) else FormatWav(),
142
133
  api_key=self._api_key,
143
- sample_rate=self.sample_rate,
144
- split_utterances=split_utterances,
145
- num_generations=num_generations,
146
134
  strip_headers=strip_headers,
147
135
  instant_mode=instant_mode,
148
136
  word_tokenizer=word_tokenizer,
@@ -159,26 +147,31 @@ class TTS(tts.TTS):
159
147
  def update_options(
160
148
  self,
161
149
  *,
162
- utterance_options: NotGivenOr[PostedUtterance] = NOT_GIVEN,
150
+ voice: NotGivenOr[PostedUtteranceVoice] = NOT_GIVEN,
151
+ description: NotGivenOr[str] = NOT_GIVEN,
152
+ speed: NotGivenOr[float] = NOT_GIVEN,
163
153
  context: NotGivenOr[PostedContext] = NOT_GIVEN,
164
154
  format: NotGivenOr[Format] = NOT_GIVEN,
165
- split_utterances: NotGivenOr[bool] = NOT_GIVEN,
166
- num_generations: NotGivenOr[int] = NOT_GIVEN,
167
155
  instant_mode: NotGivenOr[bool] = NOT_GIVEN,
168
156
  strip_headers: NotGivenOr[bool] = NOT_GIVEN,
169
157
  ) -> None:
170
158
  """Update TTS options for synthesizing speech.
171
159
 
172
160
  Args:
173
- utterance_options (NotGivenOr[PostedUtterance]): Options for utterances,
174
- including text, description, voice, and additional controls.
161
+ voice (NotGivenOr[PostedUtteranceVoice]): The voice, specified by name or id, to be
162
+ used. When no voice is specified, a novel voice will be generated based on the
163
+ text and optionally provided description.
164
+ description (NotGivenOr[str]): Natural language instructions describing how the
165
+ synthesized speech should sound, including but not limited to tone, intonation,
166
+ pacing, and accent. If a Voice is specified in the request, this description
167
+ serves as acting instructions. If no Voice is specified, a new voice is generated
168
+ based on this description.
169
+ speed (NotGivenOr[float]): Adjusts the relative speaking rate on a non-linear scale
170
+ from 0.25 (much slower) to 3.0 (much faster), where 1.0 represents normal speaking
171
+ pace.
175
172
  context (Optional[PostedContext]): Utterances to use as context for generating
176
173
  consistent speech style and prosody across multiple requests.
177
174
  format (NotGivenOr[Format]): Specifies the output audio file format (WAV, MP3 or PCM).
178
- split_utterances (NotGivenOr[bool]): Controls how audio output is segmented.
179
- When True, utterances are split into natural-sounding segments.
180
- When False, maintains one-to-one mapping between input and output.
181
- num_generations (NotGivenOr[int]): Number of speech generations to produce (1-5).
182
175
  instant_mode (NotGivenOr[bool]): Enables ultra-low latency streaming.
183
176
  Reduces time to first audio chunk, recommended for real-time applications.
184
177
  Note: Incurs 10% higher cost when enabled.
@@ -187,26 +180,16 @@ class TTS(tts.TTS):
187
180
  If disabled, each chunk’s audio will be its own audio file, each with its headers.
188
181
  """
189
182
 
190
- if is_given(utterance_options):
191
- # text is required in PostedUtterance but it is declared as an empty string
192
- # it will be overwritten when input tokens are received
193
- self._opts.utterance_options = PostedUtterance(
194
- description=utterance_options.description if utterance_options.description else "",
195
- voice=utterance_options.voice if utterance_options.voice else DEFAULT_VOICE,
196
- speed=utterance_options.speed if utterance_options.speed else 1,
197
- trailing_silence=utterance_options.trailing_silence
198
- if utterance_options.trailing_silence
199
- else 0.35,
200
- text="",
201
- )
183
+ if is_given(voice):
184
+ self._opts.voice = voice
185
+ if is_given(description):
186
+ self._opts.description = description
187
+ if is_given(speed):
188
+ self._opts.speed = speed
202
189
  if is_given(format):
203
190
  self._opts.format = format
204
191
  if is_given(context):
205
192
  self._opts.context = context
206
- if is_given(split_utterances):
207
- self._opts.split_utterances = split_utterances
208
- if is_given(num_generations):
209
- self._opts.num_generations = num_generations
210
193
  if is_given(instant_mode):
211
194
  self._opts.instant_mode = instant_mode
212
195
  if is_given(strip_headers):
@@ -245,7 +228,7 @@ class ChunkedStream(tts.ChunkedStream):
245
228
  request_id = utils.shortuuid()
246
229
 
247
230
  decoder = utils.codecs.AudioStreamDecoder(
248
- sample_rate=self._opts.sample_rate,
231
+ sample_rate=DEFAULT_SAMPLE_RATE,
249
232
  num_channels=DEFAULT_NUM_CHANNELS,
250
233
  )
251
234
 
@@ -254,21 +237,24 @@ class ChunkedStream(tts.ChunkedStream):
254
237
  try:
255
238
 
256
239
  async def _decode_loop():
240
+ utterance_options = {
241
+ "voice": self._opts.voice,
242
+ "description": self._opts.description,
243
+ "speed": self._opts.speed,
244
+ }
245
+
246
+ utterance_kwargs = {
247
+ "text": self._input_text,
248
+ **{k: v for k, v in utterance_options.items() if v is not None},
249
+ }
250
+
257
251
  try:
252
+ utterance = PostedUtterance(**utterance_kwargs)
253
+
258
254
  async for chunk in self._client.tts.synthesize_json_streaming(
259
- utterances=[
260
- PostedUtterance(
261
- text=self._input_text,
262
- description=self._opts.utterance_options.description,
263
- voice=self._opts.utterance_options.voice,
264
- speed=self._opts.utterance_options.speed,
265
- trailing_silence=self._opts.utterance_options.trailing_silence,
266
- )
267
- ],
255
+ utterances=[utterance],
268
256
  context=self._opts.context,
269
257
  format=self._opts.format,
270
- num_generations=self._opts.num_generations,
271
- split_utterances=self._opts.split_utterances,
272
258
  instant_mode=self._opts.instant_mode,
273
259
  strip_headers=self._opts.strip_headers,
274
260
  ):
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.0.17"
15
+ __version__ = "1.0.19"
@@ -24,8 +24,8 @@ classifiers = [
24
24
  ]
25
25
  dependencies = [
26
26
  "aiohttp>=3.8.0",
27
- "livekit-agents>=1.0.17",
28
- "hume"
27
+ "livekit-agents>=1.0.19",
28
+ "hume>=0.8.3"
29
29
  ]
30
30
 
31
31
  [project.urls]