hume 0.13.3 → 0.13.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/api/resources/empathicVoice/resources/configs/client/requests/PostedConfig.d.ts +1 -5
- package/api/resources/empathicVoice/types/AssistantInput.d.ts +1 -1
- package/api/resources/empathicVoice/types/AssistantMessage.d.ts +1 -1
- package/api/resources/empathicVoice/types/ChatMetadata.d.ts +2 -2
- package/api/resources/empathicVoice/types/PauseAssistantMessage.d.ts +1 -1
- package/api/resources/empathicVoice/types/SessionSettings.d.ts +9 -7
- package/api/resources/empathicVoice/types/ToolCallMessage.d.ts +1 -1
- package/api/resources/empathicVoice/types/ToolErrorMessage.d.ts +2 -2
- package/api/resources/empathicVoice/types/ToolResponseMessage.d.ts +3 -3
- package/api/resources/empathicVoice/types/UserMessage.d.ts +3 -3
- package/api/resources/tts/types/PublishTts.d.ts +23 -0
- package/api/resources/tts/types/PublishTts.js +5 -0
- package/api/resources/tts/types/SnippetAudioChunk.d.ts +6 -1
- package/api/resources/tts/types/index.d.ts +7 -6
- package/api/resources/tts/types/index.js +7 -6
- package/dist/api/resources/empathicVoice/resources/configs/client/requests/PostedConfig.d.ts +1 -5
- package/dist/api/resources/empathicVoice/types/AssistantInput.d.ts +1 -1
- package/dist/api/resources/empathicVoice/types/AssistantMessage.d.ts +1 -1
- package/dist/api/resources/empathicVoice/types/ChatMetadata.d.ts +2 -2
- package/dist/api/resources/empathicVoice/types/PauseAssistantMessage.d.ts +1 -1
- package/dist/api/resources/empathicVoice/types/SessionSettings.d.ts +9 -7
- package/dist/api/resources/empathicVoice/types/ToolCallMessage.d.ts +1 -1
- package/dist/api/resources/empathicVoice/types/ToolErrorMessage.d.ts +2 -2
- package/dist/api/resources/empathicVoice/types/ToolResponseMessage.d.ts +3 -3
- package/dist/api/resources/empathicVoice/types/UserMessage.d.ts +3 -3
- package/dist/api/resources/tts/types/PublishTts.d.ts +23 -0
- package/dist/api/resources/tts/types/PublishTts.js +5 -0
- package/dist/api/resources/tts/types/SnippetAudioChunk.d.ts +6 -1
- package/dist/api/resources/tts/types/index.d.ts +7 -6
- package/dist/api/resources/tts/types/index.js +7 -6
- package/dist/serialization/resources/empathicVoice/types/SessionSettings.d.ts +1 -0
- package/dist/serialization/resources/empathicVoice/types/SessionSettings.js +1 -0
- package/dist/serialization/resources/tts/types/PublishTts.d.ts +19 -0
- package/dist/serialization/resources/tts/types/PublishTts.js +50 -0
- package/dist/serialization/resources/tts/types/SnippetAudioChunk.d.ts +2 -1
- package/dist/serialization/resources/tts/types/SnippetAudioChunk.js +2 -1
- package/dist/serialization/resources/tts/types/index.d.ts +7 -6
- package/dist/serialization/resources/tts/types/index.js +7 -6
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/dist/wrapper/EVIWebAudioPlayer.d.ts +6 -7
- package/dist/wrapper/EVIWebAudioPlayer.js +237 -73
- package/dist/wrapper/SilenceFiller.d.ts +85 -0
- package/dist/wrapper/SilenceFiller.js +203 -0
- package/dist/wrapper/collate.d.ts +36 -0
- package/dist/wrapper/collate.js +126 -0
- package/dist/wrapper/convertFrequencyScale.d.ts +1 -0
- package/dist/wrapper/convertFrequencyScale.js +28 -0
- package/dist/wrapper/generateEmptyFft.d.ts +1 -0
- package/dist/wrapper/generateEmptyFft.js +6 -0
- package/dist/wrapper/index.d.ts +2 -0
- package/dist/wrapper/index.js +5 -1
- package/package.json +2 -1
- package/serialization/resources/empathicVoice/types/SessionSettings.d.ts +1 -0
- package/serialization/resources/empathicVoice/types/SessionSettings.js +1 -0
- package/serialization/resources/tts/types/PublishTts.d.ts +19 -0
- package/serialization/resources/tts/types/PublishTts.js +50 -0
- package/serialization/resources/tts/types/SnippetAudioChunk.d.ts +2 -1
- package/serialization/resources/tts/types/SnippetAudioChunk.js +2 -1
- package/serialization/resources/tts/types/index.d.ts +7 -6
- package/serialization/resources/tts/types/index.js +7 -6
- package/version.d.ts +1 -1
- package/version.js +1 -1
- package/wrapper/EVIWebAudioPlayer.d.ts +6 -7
- package/wrapper/EVIWebAudioPlayer.js +237 -73
- package/wrapper/SilenceFiller.d.ts +85 -0
- package/wrapper/SilenceFiller.js +203 -0
- package/wrapper/collate.d.ts +36 -0
- package/wrapper/collate.js +126 -0
- package/wrapper/convertFrequencyScale.d.ts +1 -0
- package/wrapper/convertFrequencyScale.js +28 -0
- package/wrapper/generateEmptyFft.d.ts +1 -0
- package/wrapper/generateEmptyFft.js +6 -0
- package/wrapper/index.d.ts +2 -0
- package/wrapper/index.js +5 -1
- package/.mock/definition/api.yml +0 -12
- package/.mock/definition/empathic-voice/__package__.yml +0 -2973
- package/.mock/definition/empathic-voice/chat.yml +0 -175
- package/.mock/definition/empathic-voice/chatGroups.yml +0 -627
- package/.mock/definition/empathic-voice/chatWebhooks.yml +0 -30
- package/.mock/definition/empathic-voice/chats.yml +0 -506
- package/.mock/definition/empathic-voice/configs.yml +0 -852
- package/.mock/definition/empathic-voice/prompts.yml +0 -558
- package/.mock/definition/empathic-voice/tools.yml +0 -626
- package/.mock/definition/expression-measurement/__package__.yml +0 -1
- package/.mock/definition/expression-measurement/batch/__package__.yml +0 -1803
- package/.mock/definition/expression-measurement/stream/__package__.yml +0 -113
- package/.mock/definition/expression-measurement/stream/stream.yml +0 -438
- package/.mock/definition/tts/__package__.yml +0 -660
- package/.mock/definition/tts/voices.yml +0 -143
- package/.mock/fern.config.json +0 -4
|
@@ -1,660 +0,0 @@
|
|
|
1
|
-
errors:
|
|
2
|
-
UnprocessableEntityError:
|
|
3
|
-
status-code: 422
|
|
4
|
-
type: HTTPValidationError
|
|
5
|
-
docs: Validation Error
|
|
6
|
-
examples:
|
|
7
|
-
- value: {}
|
|
8
|
-
BadRequestError:
|
|
9
|
-
status-code: 400
|
|
10
|
-
type: ErrorResponse
|
|
11
|
-
docs: Bad Request
|
|
12
|
-
examples:
|
|
13
|
-
- value: {}
|
|
14
|
-
service:
|
|
15
|
-
auth: false
|
|
16
|
-
base-path: ''
|
|
17
|
-
endpoints:
|
|
18
|
-
synthesize-json:
|
|
19
|
-
path: /v0/tts
|
|
20
|
-
method: POST
|
|
21
|
-
auth: true
|
|
22
|
-
docs: >-
|
|
23
|
-
Synthesizes one or more input texts into speech using the specified
|
|
24
|
-
voice. If no voice is provided, a novel voice will be generated
|
|
25
|
-
dynamically. Optionally, additional context can be included to influence
|
|
26
|
-
the speech's style and prosody.
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
The response includes the base64-encoded audio and metadata in JSON
|
|
30
|
-
format.
|
|
31
|
-
source:
|
|
32
|
-
openapi: tts-openapi.json
|
|
33
|
-
display-name: Text-to-speech (Json)
|
|
34
|
-
request:
|
|
35
|
-
body:
|
|
36
|
-
type: PostedTts
|
|
37
|
-
content-type: application/json
|
|
38
|
-
response:
|
|
39
|
-
docs: Successful Response
|
|
40
|
-
type: ReturnTts
|
|
41
|
-
status-code: 200
|
|
42
|
-
errors:
|
|
43
|
-
- UnprocessableEntityError
|
|
44
|
-
examples:
|
|
45
|
-
- request:
|
|
46
|
-
context:
|
|
47
|
-
utterances:
|
|
48
|
-
- text: How can people see beauty so differently?
|
|
49
|
-
description: >-
|
|
50
|
-
A curious student with a clear and respectful tone, seeking
|
|
51
|
-
clarification on Hume's ideas with a straightforward
|
|
52
|
-
question.
|
|
53
|
-
format:
|
|
54
|
-
type: mp3
|
|
55
|
-
num_generations: 1
|
|
56
|
-
utterances:
|
|
57
|
-
- text: >-
|
|
58
|
-
Beauty is no quality in things themselves: It exists merely in
|
|
59
|
-
the mind which contemplates them.
|
|
60
|
-
description: >-
|
|
61
|
-
Middle-aged masculine voice with a clear, rhythmic Scots lilt,
|
|
62
|
-
rounded vowels, and a warm, steady tone with an articulate,
|
|
63
|
-
academic quality.
|
|
64
|
-
response:
|
|
65
|
-
body:
|
|
66
|
-
generations:
|
|
67
|
-
- audio: //PExAA0DDYRvkpNfhv3JI5JZ...etc.
|
|
68
|
-
duration: 7.44225
|
|
69
|
-
encoding:
|
|
70
|
-
format: mp3
|
|
71
|
-
sample_rate: 48000
|
|
72
|
-
file_size: 120192
|
|
73
|
-
generation_id: 795c949a-1510-4a80-9646-7d0863b023ab
|
|
74
|
-
snippets:
|
|
75
|
-
- - audio: //PExAA0DDYRvkpNfhv3JI5JZ...etc.
|
|
76
|
-
generation_id: 795c949a-1510-4a80-9646-7d0863b023ab
|
|
77
|
-
id: 37b1b1b1-1b1b-1b1b-1b1b-1b1b1b1b1b1b
|
|
78
|
-
text: >-
|
|
79
|
-
Beauty is no quality in things themselves: It exists
|
|
80
|
-
merely in the mind which contemplates them.
|
|
81
|
-
utterance_index: 0
|
|
82
|
-
request_id: 66e01f90-4501-4aa0-bbaf-74f45dc15aa725906
|
|
83
|
-
synthesize-file:
|
|
84
|
-
path: /v0/tts/file
|
|
85
|
-
method: POST
|
|
86
|
-
auth: true
|
|
87
|
-
docs: >-
|
|
88
|
-
Synthesizes one or more input texts into speech using the specified
|
|
89
|
-
voice. If no voice is provided, a novel voice will be generated
|
|
90
|
-
dynamically. Optionally, additional context can be included to influence
|
|
91
|
-
the speech's style and prosody.
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
The response contains the generated audio file in the requested format.
|
|
95
|
-
source:
|
|
96
|
-
openapi: tts-openapi.json
|
|
97
|
-
display-name: Text-to-speech (File)
|
|
98
|
-
request:
|
|
99
|
-
body:
|
|
100
|
-
type: PostedTts
|
|
101
|
-
content-type: application/json
|
|
102
|
-
response:
|
|
103
|
-
docs: OK
|
|
104
|
-
type: file
|
|
105
|
-
status-code: 200
|
|
106
|
-
errors:
|
|
107
|
-
- UnprocessableEntityError
|
|
108
|
-
examples:
|
|
109
|
-
- request:
|
|
110
|
-
context:
|
|
111
|
-
generation_id: 09ad914d-8e7f-40f8-a279-e34f07f7dab2
|
|
112
|
-
format:
|
|
113
|
-
type: mp3
|
|
114
|
-
num_generations: 1
|
|
115
|
-
utterances:
|
|
116
|
-
- text: >-
|
|
117
|
-
Beauty is no quality in things themselves: It exists merely in
|
|
118
|
-
the mind which contemplates them.
|
|
119
|
-
description: >-
|
|
120
|
-
Middle-aged masculine voice with a clear, rhythmic Scots lilt,
|
|
121
|
-
rounded vowels, and a warm, steady tone with an articulate,
|
|
122
|
-
academic quality.
|
|
123
|
-
synthesize-json-streaming:
|
|
124
|
-
path: /v0/tts/stream/json
|
|
125
|
-
method: POST
|
|
126
|
-
auth: true
|
|
127
|
-
docs: >-
|
|
128
|
-
Streams synthesized speech using the specified voice. If no voice is
|
|
129
|
-
provided, a novel voice will be generated dynamically. Optionally,
|
|
130
|
-
additional context can be included to influence the speech's style and
|
|
131
|
-
prosody.
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
The response is a stream of JSON objects including audio encoded in
|
|
135
|
-
base64.
|
|
136
|
-
source:
|
|
137
|
-
openapi: tts-openapi.json
|
|
138
|
-
display-name: Text-to-speech (Streamed JSON)
|
|
139
|
-
request:
|
|
140
|
-
body:
|
|
141
|
-
type: PostedTts
|
|
142
|
-
content-type: application/json
|
|
143
|
-
response-stream:
|
|
144
|
-
docs: Successful Response
|
|
145
|
-
type: SnippetAudioChunk
|
|
146
|
-
format: json
|
|
147
|
-
errors:
|
|
148
|
-
- UnprocessableEntityError
|
|
149
|
-
examples:
|
|
150
|
-
- request:
|
|
151
|
-
utterances:
|
|
152
|
-
- text: >-
|
|
153
|
-
Beauty is no quality in things themselves: It exists merely in
|
|
154
|
-
the mind which contemplates them.
|
|
155
|
-
voice:
|
|
156
|
-
name: Male English Actor
|
|
157
|
-
provider: HUME_AI
|
|
158
|
-
synthesize-file-streaming:
|
|
159
|
-
path: /v0/tts/stream/file
|
|
160
|
-
method: POST
|
|
161
|
-
auth: true
|
|
162
|
-
docs: >-
|
|
163
|
-
Streams synthesized speech using the specified voice. If no voice is
|
|
164
|
-
provided, a novel voice will be generated dynamically. Optionally,
|
|
165
|
-
additional context can be included to influence the speech's style and
|
|
166
|
-
prosody.
|
|
167
|
-
source:
|
|
168
|
-
openapi: tts-openapi.json
|
|
169
|
-
display-name: Text-to-speech (Streamed File)
|
|
170
|
-
request:
|
|
171
|
-
body:
|
|
172
|
-
type: PostedTts
|
|
173
|
-
content-type: application/json
|
|
174
|
-
response:
|
|
175
|
-
docs: OK
|
|
176
|
-
type: file
|
|
177
|
-
status-code: 200
|
|
178
|
-
errors:
|
|
179
|
-
- UnprocessableEntityError
|
|
180
|
-
examples:
|
|
181
|
-
- request:
|
|
182
|
-
utterances:
|
|
183
|
-
- text: >-
|
|
184
|
-
Beauty is no quality in things themselves: It exists merely in
|
|
185
|
-
the mind which contemplates them.
|
|
186
|
-
voice:
|
|
187
|
-
name: Male English Actor
|
|
188
|
-
provider: HUME_AI
|
|
189
|
-
source:
|
|
190
|
-
openapi: tts-openapi.json
|
|
191
|
-
types:
|
|
192
|
-
PostedContextWithGenerationId:
|
|
193
|
-
properties:
|
|
194
|
-
generation_id:
|
|
195
|
-
type: string
|
|
196
|
-
docs: >-
|
|
197
|
-
The ID of a prior TTS generation to use as context for generating
|
|
198
|
-
consistent speech style and prosody across multiple requests.
|
|
199
|
-
Including context may increase audio generation times.
|
|
200
|
-
source:
|
|
201
|
-
openapi: tts-openapi.json
|
|
202
|
-
PostedContextWithUtterances:
|
|
203
|
-
properties:
|
|
204
|
-
utterances:
|
|
205
|
-
type: list<PostedUtterance>
|
|
206
|
-
source:
|
|
207
|
-
openapi: tts-openapi.json
|
|
208
|
-
AudioEncoding:
|
|
209
|
-
docs: >-
|
|
210
|
-
Encoding information about the generated audio, including the `format` and
|
|
211
|
-
`sample_rate`.
|
|
212
|
-
properties:
|
|
213
|
-
format:
|
|
214
|
-
type: AudioFormatType
|
|
215
|
-
docs: Format for the output audio.
|
|
216
|
-
sample_rate:
|
|
217
|
-
type: integer
|
|
218
|
-
docs: >-
|
|
219
|
-
The sample rate (`Hz`) of the generated audio. The default sample rate
|
|
220
|
-
is `48000 Hz`.
|
|
221
|
-
source:
|
|
222
|
-
openapi: tts-openapi.json
|
|
223
|
-
AudioFormatType:
|
|
224
|
-
enum:
|
|
225
|
-
- mp3
|
|
226
|
-
- pcm
|
|
227
|
-
- wav
|
|
228
|
-
source:
|
|
229
|
-
openapi: tts-openapi.json
|
|
230
|
-
ReturnGeneration:
|
|
231
|
-
properties:
|
|
232
|
-
generation_id:
|
|
233
|
-
type: string
|
|
234
|
-
docs: >-
|
|
235
|
-
A unique ID associated with this TTS generation that can be used as
|
|
236
|
-
context for generating consistent speech style and prosody across
|
|
237
|
-
multiple requests.
|
|
238
|
-
duration:
|
|
239
|
-
type: double
|
|
240
|
-
docs: Duration of the generated audio in seconds.
|
|
241
|
-
file_size:
|
|
242
|
-
type: integer
|
|
243
|
-
docs: Size of the generated audio in bytes.
|
|
244
|
-
encoding:
|
|
245
|
-
type: AudioEncoding
|
|
246
|
-
audio:
|
|
247
|
-
type: string
|
|
248
|
-
docs: >-
|
|
249
|
-
The generated audio output in the requested format, encoded as a
|
|
250
|
-
base64 string.
|
|
251
|
-
snippets:
|
|
252
|
-
docs: >-
|
|
253
|
-
A list of snippet groups where each group corresponds to an utterance
|
|
254
|
-
in the request. Each group contains segmented snippets that represent
|
|
255
|
-
the original utterance divided into more natural-sounding units
|
|
256
|
-
optimized for speech delivery.
|
|
257
|
-
type: list<list<Snippet>>
|
|
258
|
-
source:
|
|
259
|
-
openapi: tts-openapi.json
|
|
260
|
-
HTTPValidationError:
|
|
261
|
-
properties:
|
|
262
|
-
detail:
|
|
263
|
-
type: optional<list<ValidationError>>
|
|
264
|
-
source:
|
|
265
|
-
openapi: tts-openapi.json
|
|
266
|
-
FormatMp3:
|
|
267
|
-
properties: {}
|
|
268
|
-
source:
|
|
269
|
-
openapi: tts-openapi.json
|
|
270
|
-
PostedContext:
|
|
271
|
-
discriminated: false
|
|
272
|
-
docs: >-
|
|
273
|
-
Utterances to use as context for generating consistent speech style and
|
|
274
|
-
prosody across multiple requests. These will not be converted to speech
|
|
275
|
-
output.
|
|
276
|
-
union:
|
|
277
|
-
- type: PostedContextWithGenerationId
|
|
278
|
-
- type: PostedContextWithUtterances
|
|
279
|
-
source:
|
|
280
|
-
openapi: tts-openapi.json
|
|
281
|
-
inline: true
|
|
282
|
-
Format:
|
|
283
|
-
discriminant: type
|
|
284
|
-
base-properties: {}
|
|
285
|
-
docs: Specifies the output audio file format.
|
|
286
|
-
union:
|
|
287
|
-
mp3:
|
|
288
|
-
type: FormatMp3
|
|
289
|
-
pcm:
|
|
290
|
-
type: FormatPcm
|
|
291
|
-
wav:
|
|
292
|
-
type: FormatWav
|
|
293
|
-
source:
|
|
294
|
-
openapi: tts-openapi.json
|
|
295
|
-
PostedTts:
|
|
296
|
-
properties:
|
|
297
|
-
context:
|
|
298
|
-
type: optional<PostedContext>
|
|
299
|
-
docs: >-
|
|
300
|
-
Utterances to use as context for generating consistent speech style
|
|
301
|
-
and prosody across multiple requests. These will not be converted to
|
|
302
|
-
speech output.
|
|
303
|
-
utterances:
|
|
304
|
-
docs: >-
|
|
305
|
-
A list of **Utterances** to be converted to speech output.
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
An **Utterance** is a unit of input for
|
|
309
|
-
[Octave](/docs/text-to-speech-tts/overview), and includes input
|
|
310
|
-
`text`, an optional `description` to serve as the prompt for how the
|
|
311
|
-
speech should be delivered, an optional `voice` specification, and
|
|
312
|
-
additional controls to guide delivery for `speed` and
|
|
313
|
-
`trailing_silence`.
|
|
314
|
-
type: list<PostedUtterance>
|
|
315
|
-
num_generations:
|
|
316
|
-
type: optional<integer>
|
|
317
|
-
docs: Number of generations of the audio to produce.
|
|
318
|
-
default: 1
|
|
319
|
-
validation:
|
|
320
|
-
min: 1
|
|
321
|
-
max: 5
|
|
322
|
-
format:
|
|
323
|
-
type: optional<Format>
|
|
324
|
-
docs: Specifies the output audio file format.
|
|
325
|
-
split_utterances:
|
|
326
|
-
type: optional<boolean>
|
|
327
|
-
docs: >-
|
|
328
|
-
Controls how audio output is segmented in the response.
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
- When **enabled** (`true`), input utterances are automatically split
|
|
332
|
-
into natural-sounding speech segments.
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
- When **disabled** (`false`), the response maintains a strict
|
|
336
|
-
one-to-one mapping between input utterances and output snippets.
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
This setting affects how the `snippets` array is structured in the
|
|
340
|
-
response, which may be important for applications that need to track
|
|
341
|
-
the relationship between input text and generated audio segments. When
|
|
342
|
-
setting to `false`, avoid including utterances with long `text`, as
|
|
343
|
-
this can result in distorted output.
|
|
344
|
-
default: true
|
|
345
|
-
strip_headers:
|
|
346
|
-
type: optional<boolean>
|
|
347
|
-
docs: >-
|
|
348
|
-
If enabled, the audio for all the chunks of a generation, once
|
|
349
|
-
concatenated together, will constitute a single audio file. Otherwise,
|
|
350
|
-
if disabled, each chunk's audio will be its own audio file, each with
|
|
351
|
-
its own headers (if applicable).
|
|
352
|
-
default: false
|
|
353
|
-
instant_mode:
|
|
354
|
-
type: optional<boolean>
|
|
355
|
-
docs: >-
|
|
356
|
-
Enables ultra-low latency streaming, significantly reducing the time
|
|
357
|
-
until the first audio chunk is received. Recommended for real-time
|
|
358
|
-
applications requiring immediate audio playback. For further details,
|
|
359
|
-
see our documentation on [instant
|
|
360
|
-
mode](/docs/text-to-speech-tts/overview#ultra-low-latency-streaming-instant-mode).
|
|
361
|
-
|
|
362
|
-
- A
|
|
363
|
-
[voice](/reference/text-to-speech-tts/synthesize-json-streaming#request.body.utterances.voice)
|
|
364
|
-
must be specified when instant mode is enabled. Dynamic voice
|
|
365
|
-
generation is not supported with this mode.
|
|
366
|
-
|
|
367
|
-
- Instant mode is only supported for streaming endpoints (e.g.,
|
|
368
|
-
[/v0/tts/stream/json](/reference/text-to-speech-tts/synthesize-json-streaming),
|
|
369
|
-
[/v0/tts/stream/file](/reference/text-to-speech-tts/synthesize-file-streaming)).
|
|
370
|
-
|
|
371
|
-
- Ensure only a single generation is requested
|
|
372
|
-
([num_generations](/reference/text-to-speech-tts/synthesize-json-streaming#request.body.num_generations)
|
|
373
|
-
must be `1` or omitted).
|
|
374
|
-
default: true
|
|
375
|
-
source:
|
|
376
|
-
openapi: tts-openapi.json
|
|
377
|
-
ReturnTts:
|
|
378
|
-
properties:
|
|
379
|
-
request_id:
|
|
380
|
-
type: optional<string>
|
|
381
|
-
docs: >-
|
|
382
|
-
A unique ID associated with this request for tracking and
|
|
383
|
-
troubleshooting. Use this ID when contacting [support](/support) for
|
|
384
|
-
troubleshooting assistance.
|
|
385
|
-
generations:
|
|
386
|
-
type: list<ReturnGeneration>
|
|
387
|
-
source:
|
|
388
|
-
openapi: tts-openapi.json
|
|
389
|
-
ReturnVoice:
|
|
390
|
-
docs: An Octave voice available for text-to-speech
|
|
391
|
-
properties:
|
|
392
|
-
id:
|
|
393
|
-
type: optional<string>
|
|
394
|
-
docs: ID of the voice in the `Voice Library`.
|
|
395
|
-
name:
|
|
396
|
-
type: optional<string>
|
|
397
|
-
docs: Name of the voice in the `Voice Library`.
|
|
398
|
-
provider:
|
|
399
|
-
type: optional<VoiceProvider>
|
|
400
|
-
docs: >-
|
|
401
|
-
The provider associated with the created voice.
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
Voices created through this endpoint will always have the provider set
|
|
405
|
-
to `CUSTOM_VOICE`, indicating a custom voice stored in your account.
|
|
406
|
-
source:
|
|
407
|
-
openapi: tts-openapi.json
|
|
408
|
-
FormatPcm:
|
|
409
|
-
properties: {}
|
|
410
|
-
source:
|
|
411
|
-
openapi: tts-openapi.json
|
|
412
|
-
Snippet:
|
|
413
|
-
properties:
|
|
414
|
-
id:
|
|
415
|
-
type: string
|
|
416
|
-
docs: A unique ID associated with this **Snippet**.
|
|
417
|
-
text:
|
|
418
|
-
type: string
|
|
419
|
-
docs: The text for this **Snippet**.
|
|
420
|
-
generation_id:
|
|
421
|
-
type: string
|
|
422
|
-
docs: The generation ID this snippet corresponds to.
|
|
423
|
-
utterance_index:
|
|
424
|
-
type: optional<integer>
|
|
425
|
-
docs: The index of the utterance in the request this snippet corresponds to.
|
|
426
|
-
transcribed_text:
|
|
427
|
-
type: optional<string>
|
|
428
|
-
docs: >-
|
|
429
|
-
The transcribed text of the generated audio. It is only present if
|
|
430
|
-
`instant_mode` is set to `false`.
|
|
431
|
-
audio:
|
|
432
|
-
type: string
|
|
433
|
-
docs: >-
|
|
434
|
-
The segmented audio output in the requested format, encoded as a
|
|
435
|
-
base64 string.
|
|
436
|
-
source:
|
|
437
|
-
openapi: tts-openapi.json
|
|
438
|
-
SnippetAudioChunk:
|
|
439
|
-
properties:
|
|
440
|
-
generation_id:
|
|
441
|
-
type: string
|
|
442
|
-
docs: >-
|
|
443
|
-
The generation ID of the parent snippet that this chunk corresponds
|
|
444
|
-
to.
|
|
445
|
-
snippet_id:
|
|
446
|
-
type: string
|
|
447
|
-
docs: The ID of the parent snippet that this chunk corresponds to.
|
|
448
|
-
text:
|
|
449
|
-
type: string
|
|
450
|
-
docs: The text of the parent snippet that this chunk corresponds to.
|
|
451
|
-
transcribed_text:
|
|
452
|
-
type: optional<string>
|
|
453
|
-
docs: >-
|
|
454
|
-
The transcribed text of the generated audio of the parent snippet that
|
|
455
|
-
this chunk corresponds to. It is only present if `instant_mode` is set
|
|
456
|
-
to `false`.
|
|
457
|
-
chunk_index:
|
|
458
|
-
type: integer
|
|
459
|
-
docs: The index of the audio chunk in the snippet.
|
|
460
|
-
audio:
|
|
461
|
-
type: string
|
|
462
|
-
docs: The generated audio output chunk in the requested format.
|
|
463
|
-
audio_format:
|
|
464
|
-
type: AudioFormatType
|
|
465
|
-
docs: The generated audio output format.
|
|
466
|
-
is_last_chunk:
|
|
467
|
-
type: boolean
|
|
468
|
-
docs: >-
|
|
469
|
-
Whether or not this is the last chunk streamed back from the decoder
|
|
470
|
-
for one input snippet.
|
|
471
|
-
utterance_index:
|
|
472
|
-
type: optional<integer>
|
|
473
|
-
docs: >-
|
|
474
|
-
The index of the utterance in the request that the parent snippet of
|
|
475
|
-
this chunk corresponds to.
|
|
476
|
-
snippet:
|
|
477
|
-
type: Snippet
|
|
478
|
-
source:
|
|
479
|
-
openapi: tts-openapi.json
|
|
480
|
-
PostedUtterance:
|
|
481
|
-
properties:
|
|
482
|
-
text:
|
|
483
|
-
type: string
|
|
484
|
-
docs: The input text to be synthesized into speech.
|
|
485
|
-
validation:
|
|
486
|
-
maxLength: 5000
|
|
487
|
-
description:
|
|
488
|
-
type: optional<string>
|
|
489
|
-
docs: >-
|
|
490
|
-
Natural language instructions describing how the synthesized speech
|
|
491
|
-
should sound, including but not limited to tone, intonation, pacing,
|
|
492
|
-
and accent.
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
**This field behaves differently depending on whether a voice is
|
|
496
|
-
specified**:
|
|
497
|
-
|
|
498
|
-
- **Voice specified**: the description will serve as acting directions
|
|
499
|
-
for delivery. Keep directions concise—100 characters or fewer—for best
|
|
500
|
-
results. See our guide on [acting
|
|
501
|
-
instructions](/docs/text-to-speech-tts/acting-instructions).
|
|
502
|
-
|
|
503
|
-
- **Voice not specified**: the description will serve as a voice
|
|
504
|
-
prompt for generating a voice. See our [prompting
|
|
505
|
-
guide](/docs/text-to-speech-tts/prompting) for design tips.
|
|
506
|
-
validation:
|
|
507
|
-
maxLength: 1000
|
|
508
|
-
voice:
|
|
509
|
-
type: optional<PostedUtteranceVoice>
|
|
510
|
-
docs: >-
|
|
511
|
-
The `name` or `id` associated with a **Voice** from the **Voice
|
|
512
|
-
Library** to be used as the speaker for this and all subsequent
|
|
513
|
-
`utterances`, until the `voice` field is updated again.
|
|
514
|
-
|
|
515
|
-
See our [voices guide](/docs/text-to-speech-tts/voices) for more details on generating and specifying **Voices**.
|
|
516
|
-
speed:
|
|
517
|
-
type: optional<double>
|
|
518
|
-
docs: >-
|
|
519
|
-
Speed multiplier for the synthesized speech. Extreme values below 0.75
|
|
520
|
-
and above 1.5 may sometimes cause instability in the generated output.
|
|
521
|
-
default: 1
|
|
522
|
-
validation:
|
|
523
|
-
min: 0.5
|
|
524
|
-
max: 2
|
|
525
|
-
trailing_silence:
|
|
526
|
-
type: optional<double>
|
|
527
|
-
docs: Duration of trailing silence (in seconds) to add to this utterance
|
|
528
|
-
default: 0
|
|
529
|
-
validation:
|
|
530
|
-
min: 0
|
|
531
|
-
max: 5
|
|
532
|
-
source:
|
|
533
|
-
openapi: tts-openapi.json
|
|
534
|
-
ValidationErrorLocItem:
|
|
535
|
-
discriminated: false
|
|
536
|
-
union:
|
|
537
|
-
- string
|
|
538
|
-
- integer
|
|
539
|
-
source:
|
|
540
|
-
openapi: tts-openapi.json
|
|
541
|
-
inline: true
|
|
542
|
-
ValidationError:
|
|
543
|
-
properties:
|
|
544
|
-
loc:
|
|
545
|
-
type: list<ValidationErrorLocItem>
|
|
546
|
-
msg: string
|
|
547
|
-
type: string
|
|
548
|
-
source:
|
|
549
|
-
openapi: tts-openapi.json
|
|
550
|
-
PostedUtteranceVoiceWithId:
|
|
551
|
-
properties:
|
|
552
|
-
id:
|
|
553
|
-
type: string
|
|
554
|
-
docs: The unique ID associated with the **Voice**.
|
|
555
|
-
provider:
|
|
556
|
-
type: optional<VoiceProvider>
|
|
557
|
-
docs: >-
|
|
558
|
-
Specifies the source provider associated with the chosen voice.
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
- **`HUME_AI`**: Select voices from Hume's [Voice
|
|
562
|
-
Library](https://platform.hume.ai/tts/voice-library), containing a
|
|
563
|
-
variety of preset, shared voices.
|
|
564
|
-
|
|
565
|
-
- **`CUSTOM_VOICE`**: Select from voices you've personally generated
|
|
566
|
-
and saved in your account.
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
If no provider is explicitly set, the default provider is
|
|
570
|
-
`CUSTOM_VOICE`. When using voices from Hume's **Voice Library**, you
|
|
571
|
-
must explicitly set the provider to `HUME_AI`.
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
Preset voices from Hume's **Voice Library** are accessible by all
|
|
575
|
-
users. In contrast, your custom voices are private and accessible only
|
|
576
|
-
via requests authenticated with your API key.
|
|
577
|
-
source:
|
|
578
|
-
openapi: tts-openapi.json
|
|
579
|
-
PostedUtteranceVoiceWithName:
|
|
580
|
-
properties:
|
|
581
|
-
name:
|
|
582
|
-
type: string
|
|
583
|
-
docs: The name of a **Voice**.
|
|
584
|
-
provider:
|
|
585
|
-
type: optional<VoiceProvider>
|
|
586
|
-
docs: >-
|
|
587
|
-
Specifies the source provider associated with the chosen voice.
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
- **`HUME_AI`**: Select voices from Hume's [Voice
|
|
591
|
-
Library](https://platform.hume.ai/tts/voice-library), containing a
|
|
592
|
-
variety of preset, shared voices.
|
|
593
|
-
|
|
594
|
-
- **`CUSTOM_VOICE`**: Select from voices you've personally generated
|
|
595
|
-
and saved in your account.
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
If no provider is explicitly set, the default provider is
|
|
599
|
-
`CUSTOM_VOICE`. When using voices from Hume's **Voice Library**, you
|
|
600
|
-
must explicitly set the provider to `HUME_AI`.
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
Preset voices from Hume's **Voice Library** are accessible by all
|
|
604
|
-
users. In contrast, your custom voices are private and accessible only
|
|
605
|
-
via requests authenticated with your API key.
|
|
606
|
-
source:
|
|
607
|
-
openapi: tts-openapi.json
|
|
608
|
-
VoiceProvider:
|
|
609
|
-
enum:
|
|
610
|
-
- HUME_AI
|
|
611
|
-
- CUSTOM_VOICE
|
|
612
|
-
source:
|
|
613
|
-
openapi: tts-openapi.json
|
|
614
|
-
PostedUtteranceVoice:
|
|
615
|
-
discriminated: false
|
|
616
|
-
union:
|
|
617
|
-
- type: PostedUtteranceVoiceWithId
|
|
618
|
-
- type: PostedUtteranceVoiceWithName
|
|
619
|
-
source:
|
|
620
|
-
openapi: tts-openapi.json
|
|
621
|
-
FormatWav:
|
|
622
|
-
properties: {}
|
|
623
|
-
source:
|
|
624
|
-
openapi: tts-openapi.json
|
|
625
|
-
ErrorResponse:
|
|
626
|
-
properties:
|
|
627
|
-
error: optional<string>
|
|
628
|
-
message: optional<string>
|
|
629
|
-
code: optional<string>
|
|
630
|
-
source:
|
|
631
|
-
openapi: tts-openapi.json
|
|
632
|
-
ReturnPagedVoices:
|
|
633
|
-
docs: A paginated list of Octave voices available for text-to-speech
|
|
634
|
-
properties:
|
|
635
|
-
page_number:
|
|
636
|
-
type: optional<integer>
|
|
637
|
-
docs: >-
|
|
638
|
-
The page number of the returned list.
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
This value corresponds to the `page_number` parameter specified in the
|
|
642
|
-
request. Pagination uses zero-based indexing.
|
|
643
|
-
page_size:
|
|
644
|
-
type: optional<integer>
|
|
645
|
-
docs: >-
|
|
646
|
-
The maximum number of items returned per page.
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
This value corresponds to the `page_size` parameter specified in the
|
|
650
|
-
request.
|
|
651
|
-
total_pages:
|
|
652
|
-
type: optional<integer>
|
|
653
|
-
docs: The total number of pages in the collection.
|
|
654
|
-
voices_page:
|
|
655
|
-
type: optional<list<ReturnVoice>>
|
|
656
|
-
docs: >-
|
|
657
|
-
List of voices returned for the specified `page_number` and
|
|
658
|
-
`page_size`.
|
|
659
|
-
source:
|
|
660
|
-
openapi: tts-openapi.json
|