hume 0.13.3 → 0.13.4
- package/.mock/definition/empathic-voice/__package__.yml +25 -27
- package/.mock/definition/empathic-voice/chat.yml +10 -10
- package/.mock/definition/empathic-voice/configs.yml +1 -11
- package/.mock/definition/tts/__package__.yml +169 -120
- package/.mock/definition/tts/streamInput.yml +56 -0
- package/.mock/fern.config.json +1 -1
- package/api/resources/empathicVoice/resources/configs/client/requests/PostedConfig.d.ts +1 -5
- package/api/resources/empathicVoice/types/AssistantInput.d.ts +1 -1
- package/api/resources/empathicVoice/types/AssistantMessage.d.ts +1 -1
- package/api/resources/empathicVoice/types/ChatMetadata.d.ts +2 -2
- package/api/resources/empathicVoice/types/PauseAssistantMessage.d.ts +1 -1
- package/api/resources/empathicVoice/types/SessionSettings.d.ts +7 -7
- package/api/resources/empathicVoice/types/ToolCallMessage.d.ts +1 -1
- package/api/resources/empathicVoice/types/ToolErrorMessage.d.ts +2 -2
- package/api/resources/empathicVoice/types/ToolResponseMessage.d.ts +3 -3
- package/api/resources/empathicVoice/types/UserMessage.d.ts +3 -3
- package/api/resources/tts/types/PublishTts.d.ts +23 -0
- package/api/resources/tts/types/PublishTts.js +5 -0
- package/api/resources/tts/types/SnippetAudioChunk.d.ts +1 -1
- package/api/resources/tts/types/index.d.ts +7 -6
- package/api/resources/tts/types/index.js +7 -6
- package/dist/api/resources/empathicVoice/resources/configs/client/requests/PostedConfig.d.ts +1 -5
- package/dist/api/resources/empathicVoice/types/AssistantInput.d.ts +1 -1
- package/dist/api/resources/empathicVoice/types/AssistantMessage.d.ts +1 -1
- package/dist/api/resources/empathicVoice/types/ChatMetadata.d.ts +2 -2
- package/dist/api/resources/empathicVoice/types/PauseAssistantMessage.d.ts +1 -1
- package/dist/api/resources/empathicVoice/types/SessionSettings.d.ts +7 -7
- package/dist/api/resources/empathicVoice/types/ToolCallMessage.d.ts +1 -1
- package/dist/api/resources/empathicVoice/types/ToolErrorMessage.d.ts +2 -2
- package/dist/api/resources/empathicVoice/types/ToolResponseMessage.d.ts +3 -3
- package/dist/api/resources/empathicVoice/types/UserMessage.d.ts +3 -3
- package/dist/api/resources/tts/types/PublishTts.d.ts +23 -0
- package/dist/api/resources/tts/types/PublishTts.js +5 -0
- package/dist/api/resources/tts/types/SnippetAudioChunk.d.ts +1 -1
- package/dist/api/resources/tts/types/index.d.ts +7 -6
- package/dist/api/resources/tts/types/index.js +7 -6
- package/dist/serialization/resources/tts/types/PublishTts.d.ts +19 -0
- package/dist/serialization/resources/tts/types/PublishTts.js +50 -0
- package/dist/serialization/resources/tts/types/SnippetAudioChunk.d.ts +1 -1
- package/dist/serialization/resources/tts/types/SnippetAudioChunk.js +1 -1
- package/dist/serialization/resources/tts/types/index.d.ts +7 -6
- package/dist/serialization/resources/tts/types/index.js +7 -6
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/dist/wrapper/EVIWebAudioPlayer.d.ts +6 -7
- package/dist/wrapper/EVIWebAudioPlayer.js +237 -73
- package/dist/wrapper/convertFrequencyScale.d.ts +1 -0
- package/dist/wrapper/convertFrequencyScale.js +28 -0
- package/dist/wrapper/generateEmptyFft.d.ts +1 -0
- package/dist/wrapper/generateEmptyFft.js +6 -0
- package/package.json +2 -1
- package/serialization/resources/tts/types/PublishTts.d.ts +19 -0
- package/serialization/resources/tts/types/PublishTts.js +50 -0
- package/serialization/resources/tts/types/SnippetAudioChunk.d.ts +1 -1
- package/serialization/resources/tts/types/SnippetAudioChunk.js +1 -1
- package/serialization/resources/tts/types/index.d.ts +7 -6
- package/serialization/resources/tts/types/index.js +7 -6
- package/version.d.ts +1 -1
- package/version.js +1 -1
- package/wrapper/EVIWebAudioPlayer.d.ts +6 -7
- package/wrapper/EVIWebAudioPlayer.js +237 -73
- package/wrapper/convertFrequencyScale.d.ts +1 -0
- package/wrapper/convertFrequencyScale.js +28 -0
- package/wrapper/generateEmptyFft.d.ts +1 -0
- package/wrapper/generateEmptyFft.js +6 -0
package/.mock/definition/empathic-voice/__package__.yml
CHANGED

@@ -32,7 +32,7 @@ types:
           inflections and tones to the text based on the user's expressions and
           the context of the conversation. The synthesized audio is streamed
           back to the user as an [Assistant
-          Message](/reference/
+          Message](/reference/speech-to-speech-evi/chat#receive.AssistantMessage).
     source:
       openapi: evi-asyncapi.json
   AudioConfiguration:
@@ -165,7 +165,7 @@ types:

       Once this message is sent, EVI will not respond until a [Resume
       Assistant
-      message](/reference/
+      message](/reference/speech-to-speech-evi/chat#send.ResumeAssistantMessage)
       is sent. When paused, EVI won't respond, but transcriptions of your
       audio inputs will still be recorded.
       custom_session_id:
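A minimal TypeScript sketch of the pause/resume flow these doc strings describe, assuming the SDK's chat-socket wrapper methods (`sendPauseAssistantMessage`/`sendResumeAssistantMessage`) follow the generated naming — names not shown in this diff are assumptions:

```typescript
import { HumeClient } from "hume";

const hume = new HumeClient({ apiKey: process.env.HUME_API_KEY! });
const socket = hume.empathicVoice.chat.connect({ configId: "<config-id>" });

socket.on("open", () => {
  // EVI stops responding but keeps transcribing incoming audio...
  socket.sendPauseAssistantMessage({});
  // ...until a resume_assistant_message is sent.
  socket.sendResumeAssistantMessage({});
});
```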
@@ -228,7 +228,7 @@ types:


       For more information, please refer to the [Session Settings
-      guide](/docs/
+      guide](/docs/speech-to-speech-evi/configuration/session-settings).
       custom_session_id:
         type: optional<string>
         docs: >-
@@ -245,8 +245,8 @@ types:
           It is recommended to pass a `custom_session_id` if you are using a
           Custom Language Model. Please see our guide to [using a custom
           language
-          model](/docs/
-
+          model](/docs/speech-to-speech-evi/guides/custom-language-model) with
+          EVI to learn more.
       system_prompt:
         type: optional<string>
         docs: >-
@@ -268,7 +268,7 @@ types:


           For help writing a system prompt, see our [Prompting
-          Guide](/docs/
+          Guide](/docs/speech-to-speech-evi/guides/prompting).
       context:
         type: optional<Context>
         docs: >-
@@ -294,7 +294,7 @@ types:
           in PCM Linear 16 (16-bit, little-endian, signed PCM WAV data). For
           detailed instructions on how to configure session settings for PCM
           Linear 16 audio, please refer to the [Session Settings
-          guide](/docs/
+          guide](/docs/speech-to-speech-evi/configuration/session-settings).
       language_model_api_key:
         type: optional<string>
         docs: >-
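To make the PCM Linear 16 requirement concrete, here is a hedged sketch of the `session_settings` payload over a raw WebSocket; the endpoint URL, `api_key` query auth, and the `encoding`/`sample_rate`/`channels` field names are taken from the Session Settings guide rather than this diff, so treat them as assumptions:

```typescript
import WebSocket from "ws";

// Assumed production endpoint and auth style for the EVI chat channel.
const ws = new WebSocket(
  `wss://api.hume.ai/v0/evi/chat?api_key=${process.env.HUME_API_KEY}`,
);

ws.on("open", () => {
  // Declare raw PCM Linear 16 input before streaming audio_input messages.
  ws.send(
    JSON.stringify({
      type: "session_settings",
      audio: { encoding: "linear16", sample_rate: 48000, channels: 1 },
    }),
  );
});
```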
@@ -314,7 +314,7 @@ types:
           searching the web or calling external APIs. Built-in tools, like web
           search, are natively integrated, while user-defined tools are created
           and invoked by the user. To learn more, see our [Tool Use
-          Guide](/docs/
+          Guide](/docs/speech-to-speech-evi/features/tool-use).
       builtin_tools:
         type: optional<list<BuiltinToolConfig>>
         docs: >-
@@ -325,7 +325,7 @@ types:
           searching the web or calling external APIs. Built-in tools, like web
           search, are natively integrated, while user-defined tools are created
           and invoked by the user. To learn more, see our [Tool Use
-          Guide](/docs/
+          Guide](/docs/speech-to-speech-evi/features/tool-use).


          Currently, the only built-in tool Hume provides is **Web Search**.
@@ -350,7 +350,7 @@ types:
           Using this field, you can personalize responses based on
           session-specific details. For more guidance, see our [guide on using
           dynamic
-          variables](/docs/
+          variables](/docs/speech-to-speech-evi/features/dynamic-variables).
     source:
       openapi: evi-asyncapi.json
   Tool:
@@ -395,7 +395,7 @@ types:


       Upon receiving a [Tool Call
-      message](/reference/
+      message](/reference/speech-to-speech-evi/chat#receive.ToolCallMessage)
       and failing to invoke the function, this message is sent to notify EVI
       of the tool's failure.
       custom_session_id:
@@ -418,7 +418,7 @@ types:
           invocation, ensuring that the Tool Error message is linked to the
           appropriate tool call request. The specified `tool_call_id` must match
           the one received in the [Tool Call
-          message](/reference/
+          message](/reference/speech-to-speech-evi/chat#receive.ToolCallMessage).
       content:
         type: optional<string>
         docs: >-
@@ -449,7 +449,7 @@ types:


       Upon receiving a [Tool Call
-      message](/reference/
+      message](/reference/speech-to-speech-evi/chat#receive.ToolCallMessage)
       and successfully invoking the function, this message is sent to convey
       the result of the function call back to EVI.
       custom_session_id:
@@ -467,7 +467,7 @@ types:
           invocation, ensuring that the correct response is linked to the
           appropriate request. The specified `tool_call_id` must match the one
           received in the [Tool Call
-          message](/reference/
+          message](/reference/speech-to-speech-evi/chat#receive.ToolCallMessage.tool_call_id).
       content:
         type: string
         docs: >-
@@ -482,7 +482,7 @@ types:
           Include this optional field to help the supplemental LLM identify
           which tool generated the response. The specified `tool_name` must
           match the one received in the [Tool Call
-          message](/reference/
+          message](/reference/speech-to-speech-evi/chat#receive.ToolCallMessage).
       tool_type:
         type: optional<ToolType>
         docs: >-
@@ -584,7 +584,7 @@ types:
         docs: >-
           Indicates if this message was inserted into the conversation as text
           from an [Assistant Input
-          message](/reference/
+          message](/reference/speech-to-speech-evi/chat#send.AssistantInput.text).
     source:
       openapi: evi-asyncapi.json
   AssistantProsody:
@@ -693,14 +693,14 @@ types:


           Used to resume a Chat when passed in the
-          [resumed_chat_group_id](/reference/
+          [resumed_chat_group_id](/reference/speech-to-speech-evi/chat#request.query.resumed_chat_group_id)
           query parameter of a subsequent connection request. This allows EVI to
           continue the conversation from where it left off within the Chat
           Group.


           Learn more about [supporting chat
-          resumability](/docs/
+          resumability](/docs/speech-to-speech-evi/faq#does-evi-support-chat-resumability)
           from the EVI FAQ.
       chat_id:
         type: string
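A sketch of chat resumability in practice, assuming `resumedChatGroupId` is the generated camelCase of the `resumed_chat_group_id` query parameter and that message payloads are camelized by the SDK:

```typescript
import { HumeClient } from "hume";

const hume = new HumeClient({ apiKey: process.env.HUME_API_KEY! });

// Resume a previous conversation: pass the chat_group_id captured from an
// earlier chat_metadata message into the connection request.
const socket = hume.empathicVoice.chat.connect({
  configId: "<config-id>",
  resumedChatGroupId: "<chat-group-id>",
});

socket.on("message", (msg) => {
  if (msg.type === "chat_metadata") {
    // A fresh chat_id, but the same chat_group_id as the resumed session.
    console.log(msg.chatGroupId, msg.chatId);
  }
});
```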
@@ -897,9 +897,9 @@ types:
         docs: >-
           Indicates whether a response to the tool call is required from the
           developer, either in the form of a [Tool Response
-          message](/reference/
+          message](/reference/speech-to-speech-evi/chat#send.ToolResponseMessage)
           or a [Tool Error
-          message](/reference/
+          message](/reference/speech-to-speech-evi/chat#send.ToolErrorMessage).
     source:
       openapi: evi-asyncapi.json
   UserInterruption:
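The tool-call round trip described above, as a hedged sketch; `executeTool` is a hypothetical local dispatcher, and the wrapper method names are assumed from the generated socket API:

```typescript
import { HumeClient } from "hume";

const hume = new HumeClient({ apiKey: process.env.HUME_API_KEY! });
const socket = hume.empathicVoice.chat.connect({ configId: "<config-id>" });

// Hypothetical dispatcher that runs your user-defined tool.
declare function executeTool(name: string, args: unknown): Promise<unknown>;

socket.on("message", async (msg) => {
  if (msg.type !== "tool_call" || !msg.responseRequired) return;
  try {
    // Tool parameters arrive as stringified JSON.
    const result = await executeTool(msg.name, JSON.parse(msg.parameters));
    // Echo the same tool_call_id so EVI links the response to the request.
    socket.sendToolResponseMessage({
      toolCallId: msg.toolCallId,
      content: JSON.stringify(result),
    });
  } catch (err) {
    socket.sendToolErrorMessage({
      toolCallId: msg.toolCallId,
      error: "tool invocation failed",
      content: String(err),
    });
  }
});
```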
@@ -939,12 +939,10 @@ types:

       This message contains both a transcript of the user's input and the
       expression measurement predictions if the input was sent as an [Audio
-      Input
-      message](/reference/empathic-voice-interface-evi/chat/chat#send.AudioInput.type).
+      Input message](/reference/speech-to-speech-evi/chat#send.AudioInput).
       Expression measurement predictions are not provided for a [User Input
-      message](/reference/
-
-      alone.
+      message](/reference/speech-to-speech-evi/chat#send.UserInput), as the
+      prosody model relies on audio input and cannot process text alone.
       custom_session_id:
         type: optional<string>
         docs: >-
@@ -964,7 +962,7 @@ types:
         docs: >-
           Indicates if this message was inserted into the conversation as text
           from a [User
-          Input](/reference/
+          Input](/reference/speech-to-speech-evi/chat#send.UserInput.text)
           message.
       interim:
         type: boolean
@@ -981,7 +979,7 @@ types:


           Interim transcripts are only sent when the
-          [`verbose_transcription`](/reference/
+          [`verbose_transcription`](/reference/speech-to-speech-evi/chat#request.query.verbose_transcription)
           query parameter is set to `true` in the initial handshake.
     source:
       openapi: evi-asyncapi.json
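A short sketch of consuming interim transcripts; `verboseTranscription` is the assumed camelCase of the query parameter documented above:

```typescript
import { HumeClient } from "hume";

const hume = new HumeClient({ apiKey: process.env.HUME_API_KEY! });

const socket = hume.empathicVoice.chat.connect({
  configId: "<config-id>",
  verboseTranscription: true, // opt in to interim user_message events
});

socket.on("message", (msg) => {
  if (msg.type !== "user_message") return;
  if (msg.interim) return; // unfinalized; a final transcript will follow
  console.log("final transcript:", msg.message.content);
});
```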
package/.mock/definition/empathic-voice/chat.yml
CHANGED

@@ -13,11 +13,11 @@ channel:
         Include this ID in your connection request to equip EVI with the Prompt,
         Language Model, Voice, and Tools associated with the specified
         configuration. If omitted, EVI will apply [default configuration
-        settings](/docs/
+        settings](/docs/speech-to-speech-evi/configuration/build-a-configuration#default-configuration).


         For help obtaining this ID, see our [Configuration
-        Guide](/docs/
+        Guide](/docs/speech-to-speech-evi/configuration).
     config_version:
       type: optional<integer>
       docs: >-
@@ -57,7 +57,7 @@ channel:


     - [Chat
-    Metadata](/reference/
+    Metadata](/reference/speech-to-speech-evi/chat#receive.ChatMetadata):
     Upon establishing a WebSocket connection with EVI, the user receives a
     Chat Metadata message. This message contains a `chat_group_id`, which
     can be used to resume conversations within this chat group in future
@@ -65,14 +65,14 @@ channel:


     - [List Chats
-    endpoint](/reference/
-
-
-
+    endpoint](/reference/speech-to-speech-evi/chats/list-chats): Use the GET
+    `/v0/evi/chats` endpoint to obtain the Chat Group ID of individual Chat
+    sessions. This endpoint lists all available Chat sessions and their
+    associated Chat Group ID.


     - [List Chat Groups
-    endpoint](/reference/
+    endpoint](/reference/speech-to-speech-evi/chat-groups/list-chat-groups):
     Use the GET `/v0/evi/chat_groups` endpoint to obtain the Chat Group IDs
     of all Chat Groups associated with an API key. This endpoint returns a
     list of all available chat groups.
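For reference, fetching Chat Group IDs through the SDK might look like the following; the method name and pagination shape are assumptions based on the generated client, not shown in this diff:

```typescript
import { HumeClient } from "hume";

const hume = new HumeClient({ apiKey: process.env.HUME_API_KEY! });

// List recorded Chats and read off each one's Chat Group ID.
const chats = await hume.empathicVoice.chats.listChats({
  pageNumber: 0,
  pageSize: 10,
});
for await (const chat of chats) {
  console.log(chat.id, chat.chatGroupId);
}
```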
@@ -89,9 +89,9 @@ channel:
       A flag to enable verbose transcription. Set this query parameter to
       `true` to have unfinalized user transcripts be sent to the client as
       interim UserMessage messages. The
-      [interim](/reference/
+      [interim](/reference/speech-to-speech-evi/chat#receive.UserMessage.interim)
       field on a
-      [UserMessage](/reference/
+      [UserMessage](/reference/speech-to-speech-evi/chat#receive.UserMessage)
      denotes whether the message is "interim" or "final."
     event_limit:
       type: optional<integer>
package/.mock/definition/empathic-voice/configs.yml
CHANGED

@@ -140,17 +140,7 @@ service:
       properties:
         evi_version:
           type: string
-          docs:
-            Specifies the EVI version to use. See our [EVI Version
-            Guide](/docs/speech-to-speech-evi/configuration/evi-version) for
-            differences between versions.
-
-
-            **We're officially sunsetting EVI versions 1 and 2 on August 30,
-            2025**. To keep things running smoothly, be sure to [migrate to
-            EVI
-            3](/docs/speech-to-speech-evi/configuration/evi-version#migrating-to-evi-3)
-            before then.
+          docs: EVI version to use. Only version `3` is supported.
         name:
           type: string
           docs: Name applied to all versions of a particular Config.
package/.mock/definition/tts/__package__.yml
CHANGED

@@ -189,6 +189,175 @@ service:
   source:
     openapi: tts-openapi.json
 types:
+  PublishTts:
+    docs: Input message type for the TTS stream.
+    properties:
+      text:
+        type: optional<string>
+        docs: The input text to be converted to speech output.
+        default: ''
+        validation:
+          maxLength: 5000
+      description:
+        type: optional<string>
+        docs: >-
+          Natural language instructions describing how the text should be spoken
+          by the model (e.g., `"a soft, gentle voice with a strong British
+          accent"`).
+        validation:
+          maxLength: 1000
+      voice:
+        type: optional<PostedUtteranceVoice>
+        docs: >-
+          The name or ID of the voice from the `Voice Library` to be used as the
+          speaker for this and all subsequent utterances, until the `"voice"`
+          field is updated again.
+      speed:
+        type: optional<double>
+        docs: A relative measure of how fast this utterance should be spoken.
+        default: 1
+        validation:
+          min: 0.25
+          max: 3
+      trailing_silence:
+        type: optional<double>
+        docs: Duration of trailing silence (in seconds) to add to this utterance
+        default: 0
+        validation:
+          min: 0
+          max: 5
+      flush:
+        type: optional<boolean>
+        docs: >-
+          Force the generation of audio regardless of how much text has been
+          supplied.
+        default: false
+      close:
+        type: optional<boolean>
+        docs: Force the generation of audio and close the stream.
+        default: false
+    source:
+      openapi: tts-asyncapi.json
+  PostedUtteranceVoiceWithId:
+    properties:
+      id:
+        type: string
+        docs: The unique ID associated with the **Voice**.
+      provider:
+        type: optional<VoiceProvider>
+        docs: >-
+          Specifies the source provider associated with the chosen voice.
+
+
+          - **`HUME_AI`**: Select voices from Hume's [Voice
+          Library](https://platform.hume.ai/tts/voice-library), containing a
+          variety of preset, shared voices.
+
+          - **`CUSTOM_VOICE`**: Select from voices you've personally generated
+          and saved in your account.
+
+
+          If no provider is explicitly set, the default provider is
+          `CUSTOM_VOICE`. When using voices from Hume's **Voice Library**, you
+          must explicitly set the provider to `HUME_AI`.
+
+
+          Preset voices from Hume's **Voice Library** are accessible by all
+          users. In contrast, your custom voices are private and accessible only
+          via requests authenticated with your API key.
+    source:
+      openapi: tts-openapi.json
+  PostedUtteranceVoiceWithName:
+    properties:
+      name:
+        type: string
+        docs: The name of a **Voice**.
+      provider:
+        type: optional<VoiceProvider>
+        docs: >-
+          Specifies the source provider associated with the chosen voice.
+
+
+          - **`HUME_AI`**: Select voices from Hume's [Voice
+          Library](https://platform.hume.ai/tts/voice-library), containing a
+          variety of preset, shared voices.
+
+          - **`CUSTOM_VOICE`**: Select from voices you've personally generated
+          and saved in your account.
+
+
+          If no provider is explicitly set, the default provider is
+          `CUSTOM_VOICE`. When using voices from Hume's **Voice Library**, you
+          must explicitly set the provider to `HUME_AI`.
+
+
+          Preset voices from Hume's **Voice Library** are accessible by all
+          users. In contrast, your custom voices are private and accessible only
+          via requests authenticated with your API key.
+    source:
+      openapi: tts-openapi.json
+  VoiceProvider:
+    enum:
+      - HUME_AI
+      - CUSTOM_VOICE
+    source:
+      openapi: tts-openapi.json
+  PostedUtteranceVoice:
+    discriminated: false
+    union:
+      - type: PostedUtteranceVoiceWithId
+      - type: PostedUtteranceVoiceWithName
+    source:
+      openapi: tts-openapi.json
+  AudioFormatType:
+    enum:
+      - mp3
+      - pcm
+      - wav
+    source:
+      openapi: tts-openapi.json
+  SnippetAudioChunk:
+    properties:
+      generation_id:
+        type: string
+        docs: >-
+          The generation ID of the parent snippet that this chunk corresponds
+          to.
+      snippet_id:
+        type: string
+        docs: The ID of the parent snippet that this chunk corresponds to.
+      text:
+        type: string
+        docs: The text of the parent snippet that this chunk corresponds to.
+      transcribed_text:
+        type: optional<string>
+        docs: >-
+          The transcribed text of the generated audio of the parent snippet that
+          this chunk corresponds to. It is only present if `instant_mode` is set
+          to `false`.
+      chunk_index:
+        type: integer
+        docs: The index of the audio chunk in the snippet.
+      audio:
+        type: string
+        docs: The generated audio output chunk in the requested format.
+      audio_format:
+        type: AudioFormatType
+        docs: The generated audio output format.
+      is_last_chunk:
+        type: boolean
+        docs: >-
+          Whether or not this is the last chunk streamed back from the decoder
+          for one input snippet.
+      utterance_index:
+        type: optional<integer>
+        docs: >-
+          The index of the utterance in the request that the parent snippet of
+          this chunk corresponds to.
+      snippet:
+        type: optional<Snippet>
+    source:
+      openapi: tts-openapi.json
   PostedContextWithGenerationId:
     properties:
       generation_id:
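The new `PublishTts` and `SnippetAudioChunk` types drive the streaming channel defined in `streamInput.yml` below. A raw-WebSocket sketch of the publish/subscribe flow; the production URL and `api_key` query auth are assumptions (the channel declares `auth: false`, so authentication is handled outside the generated client):

```typescript
import WebSocket from "ws";

// Assumed wss URL built from the /stream/input channel path.
const ws = new WebSocket(
  `wss://api.hume.ai/v0/tts/stream/input?format_type=mp3&api_key=${process.env.HUME_API_KEY}`,
);

ws.on("open", () => {
  // The voice persists for subsequent utterances until "voice" is sent again.
  ws.send(
    JSON.stringify({
      voice: { name: "<voice-name>", provider: "HUME_AI" },
      description: "a calm, measured narrator",
      text: "Hello from the TTS input stream.",
    }),
  );
  // Force generation of any buffered text, then end the stream.
  ws.send(JSON.stringify({ flush: true }));
  ws.send(JSON.stringify({ close: true }));
});

ws.on("message", (data, isBinary) => {
  if (isBinary) return; // binary frames carry raw audio unless no_binary=true
  const chunk = JSON.parse(data.toString()); // a SnippetAudioChunk
  const audio = Buffer.from(chunk.audio, "base64");
  // ...pipe `audio` to a player or file...
  if (chunk.is_last_chunk) console.log("snippet done:", chunk.snippet_id);
});
```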
@@ -220,13 +389,6 @@ types:
           is `48000 Hz`.
     source:
       openapi: tts-openapi.json
-  AudioFormatType:
-    enum:
-      - mp3
-      - pcm
-      - wav
-    source:
-      openapi: tts-openapi.json
   ReturnGeneration:
     properties:
       generation_id:
@@ -435,48 +597,6 @@ types:
           base64 string.
     source:
       openapi: tts-openapi.json
-  SnippetAudioChunk:
-    properties:
-      generation_id:
-        type: string
-        docs: >-
-          The generation ID of the parent snippet that this chunk corresponds
-          to.
-      snippet_id:
-        type: string
-        docs: The ID of the parent snippet that this chunk corresponds to.
-      text:
-        type: string
-        docs: The text of the parent snippet that this chunk corresponds to.
-      transcribed_text:
-        type: optional<string>
-        docs: >-
-          The transcribed text of the generated audio of the parent snippet that
-          this chunk corresponds to. It is only present if `instant_mode` is set
-          to `false`.
-      chunk_index:
-        type: integer
-        docs: The index of the audio chunk in the snippet.
-      audio:
-        type: string
-        docs: The generated audio output chunk in the requested format.
-      audio_format:
-        type: AudioFormatType
-        docs: The generated audio output format.
-      is_last_chunk:
-        type: boolean
-        docs: >-
-          Whether or not this is the last chunk streamed back from the decoder
-          for one input snippet.
-      utterance_index:
-        type: optional<integer>
-        docs: >-
-          The index of the utterance in the request that the parent snippet of
-          this chunk corresponds to.
-      snippet:
-        type: Snippet
-    source:
-      openapi: tts-openapi.json
   PostedUtterance:
     properties:
       text:
@@ -547,77 +667,6 @@ types:
         type: string
     source:
       openapi: tts-openapi.json
-  PostedUtteranceVoiceWithId:
-    properties:
-      id:
-        type: string
-        docs: The unique ID associated with the **Voice**.
-      provider:
-        type: optional<VoiceProvider>
-        docs: >-
-          Specifies the source provider associated with the chosen voice.
-
-
-          - **`HUME_AI`**: Select voices from Hume's [Voice
-          Library](https://platform.hume.ai/tts/voice-library), containing a
-          variety of preset, shared voices.
-
-          - **`CUSTOM_VOICE`**: Select from voices you've personally generated
-          and saved in your account.
-
-
-          If no provider is explicitly set, the default provider is
-          `CUSTOM_VOICE`. When using voices from Hume's **Voice Library**, you
-          must explicitly set the provider to `HUME_AI`.
-
-
-          Preset voices from Hume's **Voice Library** are accessible by all
-          users. In contrast, your custom voices are private and accessible only
-          via requests authenticated with your API key.
-    source:
-      openapi: tts-openapi.json
-  PostedUtteranceVoiceWithName:
-    properties:
-      name:
-        type: string
-        docs: The name of a **Voice**.
-      provider:
-        type: optional<VoiceProvider>
-        docs: >-
-          Specifies the source provider associated with the chosen voice.
-
-
-          - **`HUME_AI`**: Select voices from Hume's [Voice
-          Library](https://platform.hume.ai/tts/voice-library), containing a
-          variety of preset, shared voices.
-
-          - **`CUSTOM_VOICE`**: Select from voices you've personally generated
-          and saved in your account.
-
-
-          If no provider is explicitly set, the default provider is
-          `CUSTOM_VOICE`. When using voices from Hume's **Voice Library**, you
-          must explicitly set the provider to `HUME_AI`.
-
-
-          Preset voices from Hume's **Voice Library** are accessible by all
-          users. In contrast, your custom voices are private and accessible only
-          via requests authenticated with your API key.
-    source:
-      openapi: tts-openapi.json
-  VoiceProvider:
-    enum:
-      - HUME_AI
-      - CUSTOM_VOICE
-    source:
-      openapi: tts-openapi.json
-  PostedUtteranceVoice:
-    discriminated: false
-    union:
-      - type: PostedUtteranceVoiceWithId
-      - type: PostedUtteranceVoiceWithName
-    source:
-      openapi: tts-openapi.json
   FormatWav:
     properties: {}
     source:
package/.mock/definition/tts/streamInput.yml
ADDED

@@ -0,0 +1,56 @@
+imports:
+  root: __package__.yml
+channel:
+  path: /stream/input
+  url: prod
+  auth: false
+  docs: Generate emotionally expressive speech.
+  query-parameters:
+    context_generation_id:
+      type: optional<string>
+      docs: >-
+        The ID of a prior TTS generation to use as context for generating
+        consistent speech style and prosody across multiple requests. Including
+        context may increase audio generation times.
+    format_type: root.AudioFormatType
+    strip_headers:
+      type: optional<boolean>
+      default: false
+      docs: >-
+        If enabled, the audio for all the chunks of a generation, once
+        concatenated together, will constitute a single audio file. Otherwise,
+        if disabled, each chunk's audio will be its own audio file, each with
+        its own headers (if applicable).
+    instant_mode:
+      type: optional<boolean>
+      default: true
+      docs: >-
+        Accelerates processing to reduce streaming latency.Incurs approximately
+        10% additional cost while preserving full voice quality.
+    no_binary:
+      type: optional<boolean>
+      default: false
+      docs: If enabled, no binary websocket messages will be sent to the client.
+  messages:
+    publish:
+      origin: client
+      body:
+        type: root.PublishTts
+    subscribe:
+      origin: server
+      body:
+        type: root.SnippetAudioChunk
+  examples:
+    - messages:
+        - type: publish
+          body: {}
+        - type: subscribe
+          body:
+            request_id: request_id
+            generation_id: generation_id
+            snippet_id: snippet_id
+            text: text
+            chunk_index: 1
+            audio: audio
+            audio_format: mp3
+            is_last_chunk: true
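A sketch of the `strip_headers` semantics from the query parameters above: with `strip_headers=true`, the chunk payloads of a generation concatenate into one valid audio file, and `instant_mode=false` additionally yields `transcribed_text` on each chunk. Base URL and `api_key` query auth are assumptions:

```typescript
import WebSocket from "ws";

// Collect one continuous WAV by concatenating base64 chunk payloads.
function collectWholeWav(apiKey: string): Promise<Buffer> {
  const params = new URLSearchParams({
    format_type: "wav",
    strip_headers: "true",
    instant_mode: "false",
    no_binary: "true", // JSON only; audio arrives base64-encoded
    api_key: apiKey,
  });
  const ws = new WebSocket(`wss://api.hume.ai/v0/tts/stream/input?${params}`);
  const parts: Buffer[] = [];
  return new Promise((resolve, reject) => {
    ws.on("open", () => {
      ws.send(JSON.stringify({ text: "One continuous file, please." }));
      ws.send(JSON.stringify({ close: true })); // flush and end the stream
    });
    ws.on("message", (data) => {
      const chunk = JSON.parse(data.toString());
      if (chunk.audio) parts.push(Buffer.from(chunk.audio, "base64"));
    });
    ws.on("close", () => resolve(Buffer.concat(parts)));
    ws.on("error", reject);
  });
}
```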
package/.mock/fern.config.json
CHANGED
package/api/resources/empathicVoice/resources/configs/client/requests/PostedConfig.d.ts
CHANGED

@@ -37,11 +37,7 @@ import * as Hume from "../../../../../../index";
  * }
  */
 export interface PostedConfig {
-    /**
-     * Specifies the EVI version to use. See our [EVI Version Guide](/docs/speech-to-speech-evi/configuration/evi-version) for differences between versions.
-     *
-     * **We're officially sunsetting EVI versions 1 and 2 on August 30, 2025**. To keep things running smoothly, be sure to [migrate to EVI 3](/docs/speech-to-speech-evi/configuration/evi-version#migrating-to-evi-3) before then.
-     */
+    /** EVI version to use. Only version `3` is supported. */
     eviVersion: string;
     /** Name applied to all versions of a particular Config. */
     name: string;
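The trimmed doc comment reflects a hard requirement: new configs must target EVI 3 now that versions 1 and 2 are sunset. A sketch, assuming the generated configs client keeps its `createConfig` method:

```typescript
import { HumeClient } from "hume";

const hume = new HumeClient({ apiKey: process.env.HUME_API_KEY! });

// Only eviVersion "3" is accepted.
const config = await hume.empathicVoice.configs.createConfig({
  eviVersion: "3",
  name: "weather-assistant-config",
});
console.log(config.id, config.version);
```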
package/api/resources/empathicVoice/types/AssistantInput.d.ts
CHANGED

@@ -12,7 +12,7 @@ export interface AssistantInput {
     /**
      * Assistant text to synthesize into spoken audio and insert into the conversation.
      *
-     * EVI uses this text to generate spoken audio using our proprietary expressive text-to-speech model. Our model adds appropriate emotional inflections and tones to the text based on the user's expressions and the context of the conversation. The synthesized audio is streamed back to the user as an [Assistant Message](/reference/
+     * EVI uses this text to generate spoken audio using our proprietary expressive text-to-speech model. Our model adds appropriate emotional inflections and tones to the text based on the user's expressions and the context of the conversation. The synthesized audio is streamed back to the user as an [Assistant Message](/reference/speech-to-speech-evi/chat#receive.AssistantMessage).
      */
     text: string;
 }
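Finally, a minimal sketch of the assistant-input flow this doc string describes, assuming the SDK socket exposes a `sendAssistantInput` wrapper:

```typescript
import { HumeClient } from "hume";

const hume = new HumeClient({ apiKey: process.env.HUME_API_KEY! });
const socket = hume.empathicVoice.chat.connect({ configId: "<config-id>" });

socket.on("open", () => {
  // EVI synthesizes this text with its expressive TTS model and streams it
  // back as an assistant_message, per the updated link above.
  socket.sendAssistantInput({ text: "Here's a quick recap of your options." });
});
```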