@cartesia/cartesia-js 3.0.0-b3 → 3.0.0-b4

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. package/CHANGELOG.md +9 -0
  2. package/README.md +21 -21
  3. package/client.d.mts +2 -2
  4. package/client.d.mts.map +1 -1
  5. package/client.d.ts +2 -2
  6. package/client.d.ts.map +1 -1
  7. package/client.js.map +1 -1
  8. package/client.mjs.map +1 -1
  9. package/package.json +1 -1
  10. package/resources/access-token.d.mts +5 -0
  11. package/resources/access-token.d.mts.map +1 -1
  12. package/resources/access-token.d.ts +5 -0
  13. package/resources/access-token.d.ts.map +1 -1
  14. package/resources/agents/agents.js +1 -1
  15. package/resources/agents/agents.js.map +1 -1
  16. package/resources/agents/agents.mjs +1 -1
  17. package/resources/agents/agents.mjs.map +1 -1
  18. package/resources/agents/metrics/index.d.mts +1 -1
  19. package/resources/agents/metrics/index.d.mts.map +1 -1
  20. package/resources/agents/metrics/index.d.ts +1 -1
  21. package/resources/agents/metrics/index.d.ts.map +1 -1
  22. package/resources/agents/metrics/index.js.map +1 -1
  23. package/resources/agents/metrics/index.mjs.map +1 -1
  24. package/resources/agents/metrics/metrics.d.mts +2 -2
  25. package/resources/agents/metrics/metrics.d.mts.map +1 -1
  26. package/resources/agents/metrics/metrics.d.ts +2 -2
  27. package/resources/agents/metrics/metrics.d.ts.map +1 -1
  28. package/resources/agents/metrics/metrics.js.map +1 -1
  29. package/resources/agents/metrics/metrics.mjs.map +1 -1
  30. package/resources/agents/metrics/results.d.mts +23 -22
  31. package/resources/agents/metrics/results.d.mts.map +1 -1
  32. package/resources/agents/metrics/results.d.ts +23 -22
  33. package/resources/agents/metrics/results.d.ts.map +1 -1
  34. package/resources/agents/metrics/results.js +4 -5
  35. package/resources/agents/metrics/results.js.map +1 -1
  36. package/resources/agents/metrics/results.mjs +4 -5
  37. package/resources/agents/metrics/results.mjs.map +1 -1
  38. package/resources/index.d.mts +1 -1
  39. package/resources/index.d.mts.map +1 -1
  40. package/resources/index.d.ts +1 -1
  41. package/resources/index.d.ts.map +1 -1
  42. package/resources/index.js.map +1 -1
  43. package/resources/index.mjs.map +1 -1
  44. package/resources/infill.d.mts +3 -4
  45. package/resources/infill.d.mts.map +1 -1
  46. package/resources/infill.d.ts +3 -4
  47. package/resources/infill.d.ts.map +1 -1
  48. package/resources/infill.js +0 -2
  49. package/resources/infill.js.map +1 -1
  50. package/resources/infill.mjs +0 -2
  51. package/resources/infill.mjs.map +1 -1
  52. package/resources/pronunciation-dicts.d.mts +0 -8
  53. package/resources/pronunciation-dicts.d.mts.map +1 -1
  54. package/resources/pronunciation-dicts.d.ts +0 -8
  55. package/resources/pronunciation-dicts.d.ts.map +1 -1
  56. package/resources/pronunciation-dicts.js +0 -18
  57. package/resources/pronunciation-dicts.js.map +1 -1
  58. package/resources/pronunciation-dicts.mjs +0 -18
  59. package/resources/pronunciation-dicts.mjs.map +1 -1
  60. package/resources/stt.d.mts +1 -104
  61. package/resources/stt.d.mts.map +1 -1
  62. package/resources/stt.d.ts +1 -104
  63. package/resources/stt.d.ts.map +1 -1
  64. package/resources/tts/tts.d.mts +79 -115
  65. package/resources/tts/tts.d.mts.map +1 -1
  66. package/resources/tts/tts.d.ts +79 -115
  67. package/resources/tts/tts.d.ts.map +1 -1
  68. package/resources/voice-changer.d.mts +2 -2
  69. package/resources/voice-changer.d.mts.map +1 -1
  70. package/resources/voice-changer.d.ts +2 -2
  71. package/resources/voice-changer.d.ts.map +1 -1
  72. package/resources/voices.d.mts +70 -34
  73. package/resources/voices.d.mts.map +1 -1
  74. package/resources/voices.d.ts +70 -34
  75. package/resources/voices.d.ts.map +1 -1
  76. package/resources/voices.js +45 -3
  77. package/resources/voices.js.map +1 -1
  78. package/resources/voices.mjs +45 -3
  79. package/resources/voices.mjs.map +1 -1
  80. package/src/client.ts +2 -0
  81. package/src/resources/access-token.ts +6 -0
  82. package/src/resources/agents/agents.ts +1 -1
  83. package/src/resources/agents/metrics/index.ts +1 -0
  84. package/src/resources/agents/metrics/metrics.ts +2 -0
  85. package/src/resources/agents/metrics/results.ts +27 -23
  86. package/src/resources/index.ts +1 -0
  87. package/src/resources/infill.ts +3 -4
  88. package/src/resources/pronunciation-dicts.ts +0 -20
  89. package/src/resources/stt.ts +102 -104
  90. package/src/resources/tts/tts.ts +146 -128
  91. package/src/resources/voice-changer.ts +2 -2
  92. package/src/resources/voices.ts +105 -38
  93. package/src/version.ts +1 -1
  94. package/version.d.mts +1 -1
  95. package/version.d.ts +1 -1
  96. package/version.js +1 -1
  97. package/version.mjs +1 -1
@@ -107,111 +107,109 @@ export interface SttTranscribeParams {
107
107
  /**
108
108
  * Body param: The language of the input audio in ISO-639-1 format. Defaults to
109
109
  * `en`.
110
- *
111
- * <Accordion title="Supported languages">
112
- * - `en` (English)
113
- * - `zh` (Chinese)
114
- * - `de` (German)
115
- * - `es` (Spanish)
116
- * - `ru` (Russian)
117
- * - `ko` (Korean)
118
- * - `fr` (French)
119
- * - `ja` (Japanese)
120
- * - `pt` (Portuguese)
121
- * - `tr` (Turkish)
122
- * - `pl` (Polish)
123
- * - `ca` (Catalan)
124
- * - `nl` (Dutch)
125
- * - `ar` (Arabic)
126
- * - `sv` (Swedish)
127
- * - `it` (Italian)
128
- * - `id` (Indonesian)
129
- * - `hi` (Hindi)
130
- * - `fi` (Finnish)
131
- * - `vi` (Vietnamese)
132
- * - `he` (Hebrew)
133
- * - `uk` (Ukrainian)
134
- * - `el` (Greek)
135
- * - `ms` (Malay)
136
- * - `cs` (Czech)
137
- * - `ro` (Romanian)
138
- * - `da` (Danish)
139
- * - `hu` (Hungarian)
140
- * - `ta` (Tamil)
141
- * - `no` (Norwegian)
142
- * - `th` (Thai)
143
- * - `ur` (Urdu)
144
- * - `hr` (Croatian)
145
- * - `bg` (Bulgarian)
146
- * - `lt` (Lithuanian)
147
- * - `la` (Latin)
148
- * - `mi` (Maori)
149
- * - `ml` (Malayalam)
150
- * - `cy` (Welsh)
151
- * - `sk` (Slovak)
152
- * - `te` (Telugu)
153
- * - `fa` (Persian)
154
- * - `lv` (Latvian)
155
- * - `bn` (Bengali)
156
- * - `sr` (Serbian)
157
- * - `az` (Azerbaijani)
158
- * - `sl` (Slovenian)
159
- * - `kn` (Kannada)
160
- * - `et` (Estonian)
161
- * - `mk` (Macedonian)
162
- * - `br` (Breton)
163
- * - `eu` (Basque)
164
- * - `is` (Icelandic)
165
- * - `hy` (Armenian)
166
- * - `ne` (Nepali)
167
- * - `mn` (Mongolian)
168
- * - `bs` (Bosnian)
169
- * - `kk` (Kazakh)
170
- * - `sq` (Albanian)
171
- * - `sw` (Swahili)
172
- * - `gl` (Galician)
173
- * - `mr` (Marathi)
174
- * - `pa` (Punjabi)
175
- * - `si` (Sinhala)
176
- * - `km` (Khmer)
177
- * - `sn` (Shona)
178
- * - `yo` (Yoruba)
179
- * - `so` (Somali)
180
- * - `af` (Afrikaans)
181
- * - `oc` (Occitan)
182
- * - `ka` (Georgian)
183
- * - `be` (Belarusian)
184
- * - `tg` (Tajik)
185
- * - `sd` (Sindhi)
186
- * - `gu` (Gujarati)
187
- * - `am` (Amharic)
188
- * - `yi` (Yiddish)
189
- * - `lo` (Lao)
190
- * - `uz` (Uzbek)
191
- * - `fo` (Faroese)
192
- * - `ht` (Haitian Creole)
193
- * - `ps` (Pashto)
194
- * - `tk` (Turkmen)
195
- * - `nn` (Nynorsk)
196
- * - `mt` (Maltese)
197
- * - `sa` (Sanskrit)
198
- * - `lb` (Luxembourgish)
199
- * - `my` (Myanmar)
200
- * - `bo` (Tibetan)
201
- * - `tl` (Tagalog)
202
- * - `mg` (Malagasy)
203
- * - `as` (Assamese)
204
- * - `tt` (Tatar)
205
- * - `haw` (Hawaiian)
206
- * - `ln` (Lingala)
207
- * - `ha` (Hausa)
208
- * - `ba` (Bashkir)
209
- * - `jw` (Javanese)
210
- * - `su` (Sundanese)
211
- * - `yue` (Cantonese)
212
- * </Accordion>
213
110
  */
214
- language?: string | null;
111
+ language?:
112
+ | 'en'
113
+ | 'zh'
114
+ | 'de'
115
+ | 'es'
116
+ | 'ru'
117
+ | 'ko'
118
+ | 'fr'
119
+ | 'ja'
120
+ | 'pt'
121
+ | 'tr'
122
+ | 'pl'
123
+ | 'ca'
124
+ | 'nl'
125
+ | 'ar'
126
+ | 'sv'
127
+ | 'it'
128
+ | 'id'
129
+ | 'hi'
130
+ | 'fi'
131
+ | 'vi'
132
+ | 'he'
133
+ | 'uk'
134
+ | 'el'
135
+ | 'ms'
136
+ | 'cs'
137
+ | 'ro'
138
+ | 'da'
139
+ | 'hu'
140
+ | 'ta'
141
+ | 'no'
142
+ | 'th'
143
+ | 'ur'
144
+ | 'hr'
145
+ | 'bg'
146
+ | 'lt'
147
+ | 'la'
148
+ | 'mi'
149
+ | 'ml'
150
+ | 'cy'
151
+ | 'sk'
152
+ | 'te'
153
+ | 'fa'
154
+ | 'lv'
155
+ | 'bn'
156
+ | 'sr'
157
+ | 'az'
158
+ | 'sl'
159
+ | 'kn'
160
+ | 'et'
161
+ | 'mk'
162
+ | 'br'
163
+ | 'eu'
164
+ | 'is'
165
+ | 'hy'
166
+ | 'ne'
167
+ | 'mn'
168
+ | 'bs'
169
+ | 'kk'
170
+ | 'sq'
171
+ | 'sw'
172
+ | 'gl'
173
+ | 'mr'
174
+ | 'pa'
175
+ | 'si'
176
+ | 'km'
177
+ | 'sn'
178
+ | 'yo'
179
+ | 'so'
180
+ | 'af'
181
+ | 'oc'
182
+ | 'ka'
183
+ | 'be'
184
+ | 'tg'
185
+ | 'sd'
186
+ | 'gu'
187
+ | 'am'
188
+ | 'yi'
189
+ | 'lo'
190
+ | 'uz'
191
+ | 'fo'
192
+ | 'ht'
193
+ | 'ps'
194
+ | 'tk'
195
+ | 'nn'
196
+ | 'mt'
197
+ | 'sa'
198
+ | 'lb'
199
+ | 'my'
200
+ | 'bo'
201
+ | 'tl'
202
+ | 'mg'
203
+ | 'as'
204
+ | 'tt'
205
+ | 'haw'
206
+ | 'ln'
207
+ | 'ha'
208
+ | 'ba'
209
+ | 'jw'
210
+ | 'su'
211
+ | 'yue'
212
+ | null;
215
213
 
216
214
  /**
217
215
  * Body param: ID of the model to use for transcription. Use `ink-whisper` for the
@@ -34,42 +34,88 @@ export class TTS extends APIResource {
34
34
  }
35
35
 
36
36
  /**
37
- * Configure the various attributes of the generated speech. These controls are
38
- * only available for `sonic-3-preview` and will have no effect on earlier models.
37
+ * Configure the various attributes of the generated speech. These are only for
38
+ * `sonic-3` and have no effect on earlier models.
39
+ *
40
+ * See
41
+ * [Volume, Speed, and Emotion in Sonic-3](/build-with-cartesia/sonic-3/volume-speed-emotion)
42
+ * for a guide on this option.
39
43
  */
40
44
  export interface GenerationConfig {
41
45
  /**
42
- * These controls are **experimental** and subject to breaking changes.
43
- */
44
- experimental?: GenerationConfig.Experimental | null;
45
-
46
- /**
47
- * Adjust the speed of the generated speech between -1.0 (slower) and 1.0 (faster).
48
- * 0.0 is the default speed.
49
- */
50
- speed?: number | null;
51
-
52
- /**
53
- * Adjust the volume of the generated speech between -1.0 (softer) and 1.0
54
- * (louder). 0.0 is the default volume.
55
- */
56
- volume?: number | null;
57
- }
58
-
59
- export namespace GenerationConfig {
60
- /**
61
- * These controls are **experimental** and subject to breaking changes.
62
- */
63
- export interface Experimental {
64
- /**
65
- * Toggle accent localization: 0 (disabled, default) or 1 (enabled). When enabled,
66
- * the voice adapts to match the transcript language's accent while preserving
67
- * vocal characteristics. When disabled, maintains the original voice accent. For
68
- * more information, see
69
- * [Localize Voices](/build-with-sonic/capabilities/localize-voices).
70
- */
71
- accent_localization?: number | null;
72
- }
46
+ * Guide the emotion of the generated speech.
47
+ */
48
+ emotion?:
49
+ | 'neutral'
50
+ | 'happy'
51
+ | 'excited'
52
+ | 'enthusiastic'
53
+ | 'elated'
54
+ | 'euphoric'
55
+ | 'triumphant'
56
+ | 'amazed'
57
+ | 'surprised'
58
+ | 'flirtatious'
59
+ | 'curious'
60
+ | 'content'
61
+ | 'peaceful'
62
+ | 'serene'
63
+ | 'calm'
64
+ | 'grateful'
65
+ | 'affectionate'
66
+ | 'trust'
67
+ | 'sympathetic'
68
+ | 'anticipation'
69
+ | 'mysterious'
70
+ | 'angry'
71
+ | 'mad'
72
+ | 'outraged'
73
+ | 'frustrated'
74
+ | 'agitated'
75
+ | 'threatened'
76
+ | 'disgusted'
77
+ | 'contempt'
78
+ | 'envious'
79
+ | 'sarcastic'
80
+ | 'ironic'
81
+ | 'sad'
82
+ | 'dejected'
83
+ | 'melancholic'
84
+ | 'disappointed'
85
+ | 'hurt'
86
+ | 'guilty'
87
+ | 'bored'
88
+ | 'tired'
89
+ | 'rejected'
90
+ | 'nostalgic'
91
+ | 'wistful'
92
+ | 'apologetic'
93
+ | 'hesitant'
94
+ | 'insecure'
95
+ | 'confused'
96
+ | 'resigned'
97
+ | 'anxious'
98
+ | 'panicked'
99
+ | 'alarmed'
100
+ | 'scared'
101
+ | 'proud'
102
+ | 'confident'
103
+ | 'distant'
104
+ | 'skeptical'
105
+ | 'contemplative'
106
+ | 'determined';
107
+
108
+ /**
109
+ * Adjust the speed of the generated speech between 0.6x and 1.5x the original
110
+ * speed (default is 1.0x). Valid values are between [0.6, 1.5] inclusive.
111
+ */
112
+ speed?: number;
113
+
114
+ /**
115
+ * Adjust the volume of the generated speech between 0.5x and 2.0x the original
116
+ * volume (default is 1.0x). Valid values are between [0.5, 2.0] inclusive.
117
+ */
118
+ volume?: number;
73
119
  }
74
120
 
75
121
  export interface GenerationRequest {
@@ -117,30 +163,30 @@ export interface GenerationRequest {
117
163
  */
118
164
  continue?: boolean | null;
119
165
 
120
- /**
121
- * The maximum duration of the audio in seconds. You do not usually need to specify
122
- * this. If the duration is not appropriate for the length of the transcript, the
123
- * output audio may be truncated.
124
- */
125
- duration?: number | null;
126
-
127
166
  /**
128
167
  * Whether to flush the context.
129
168
  */
130
169
  flush?: boolean | null;
131
170
 
132
171
  /**
133
- * The language that the given voice should speak the transcript in.
172
+ * Configure the various attributes of the generated speech. These are only for
173
+ * `sonic-3` and have no effect on earlier models.
134
174
  *
135
- * Options: English (en), French (fr), German (de), Spanish (es), Portuguese (pt),
136
- * Chinese (zh), Japanese (ja), Hindi (hi), Italian (it), Korean (ko), Dutch (nl),
137
- * Polish (pl), Russian (ru), Swedish (sv), Turkish (tr).
175
+ * See
176
+ * [Volume, Speed, and Emotion in Sonic-3](/build-with-cartesia/sonic-3/volume-speed-emotion)
177
+ * for a guide on this option.
138
178
  */
139
- language?: VoicesAPI.SupportedLanguage | null;
179
+ generation_config?: GenerationConfig;
180
+
181
+ /**
182
+ * The language that the given voice should speak the transcript in. For valid
183
+ * options, see [Models](/build-with-cartesia/tts-models).
184
+ */
185
+ language?: VoicesAPI.SupportedLanguage;
140
186
 
141
187
  /**
142
188
  * The maximum time in milliseconds to buffer text before starting generation.
143
- * Values between [0, 1000]ms are supported. Defaults to 0 (no buffering).
189
+ * Values between [0, 5000]ms are supported. Defaults to 3000ms.
144
190
  *
145
191
  * When set, the model will buffer incoming text chunks until it's confident it has
146
192
  * enough context to generate high-quality speech, or the buffer delay elapses,
@@ -153,22 +199,18 @@ export interface GenerationRequest {
153
199
  max_buffer_delay_ms?: number | null;
154
200
 
155
201
  /**
156
- * A list of pronunciation dict IDs to use for the generation. This will be applied
157
- * in addition to the pinned pronunciation dict, which will be treated as the first
158
- * element of the list. If there are conflicts with dict items, the latest dict
159
- * will take precedence.
202
+ * The ID of a pronunciation dictionary to use for the generation. Pronunciation
203
+ * dictionaries are supported by `sonic-3` models and newer.
160
204
  */
161
- pronunciation_dict_ids?: Array<string> | null;
205
+ pronunciation_dict_id?: string | null;
162
206
 
163
207
  /**
164
- * > This feature is experimental and may not work for all voices.
165
- *
166
- * Speed setting for the model. Defaults to `normal`.
167
- *
168
- * Influences the speed of the generated speech. Faster speeds may reduce
169
- * hallucination rate.
208
+ * @deprecated Use `generation_config.speed` for sonic-3. Speed setting for the
209
+ * model. Defaults to `normal`. This feature is experimental and may not work for
210
+ * all voices. Influences the speed of the generated speech. Faster speeds may
211
+ * reduce hallucination rate.
170
212
  */
171
- speed?: ModelSpeed | null;
213
+ speed?: ModelSpeed;
172
214
 
173
215
  /**
174
216
  * Whether to use normalized timestamps (True) or original timestamps (False).
@@ -182,24 +224,22 @@ export namespace GenerationRequest {
182
224
 
183
225
  encoding: InfillAPI.RawEncoding;
184
226
 
185
- sample_rate: number;
227
+ sample_rate: 8000 | 16000 | 22050 | 24000 | 44100 | 48000;
186
228
  }
187
229
  }
188
230
 
189
231
  /**
190
- * > This feature is experimental and may not work for all voices.
191
- *
192
- * Speed setting for the model. Defaults to `normal`.
193
- *
194
- * Influences the speed of the generated speech. Faster speeds may reduce
195
- * hallucination rate.
232
+ * @deprecated Use `generation_config.speed` for sonic-3. Speed setting for the
233
+ * model. Defaults to `normal`. This feature is experimental and may not work for
234
+ * all voices. Influences the speed of the generated speech. Faster speeds may
235
+ * reduce hallucination rate.
196
236
  */
197
237
  export type ModelSpeed = 'slow' | 'normal' | 'fast';
198
238
 
199
239
  export interface RawOutputFormat {
200
240
  encoding: InfillAPI.RawEncoding;
201
241
 
202
- sample_rate: number;
242
+ sample_rate: 8000 | 16000 | 22050 | 24000 | 44100 | 48000;
203
243
  }
204
244
 
205
245
  export interface VoiceSpecifier {
@@ -245,16 +285,10 @@ export type WebsocketResponse =
245
285
 
246
286
  export namespace WebsocketResponse {
247
287
  export interface Chunk {
248
- data: string;
249
-
250
288
  done: boolean;
251
289
 
252
290
  status_code: number;
253
291
 
254
- step_time: number;
255
-
256
- type: 'chunk';
257
-
258
292
  /**
259
293
  * A unique identifier for the context. You can use any unique identifier, like a
260
294
  * UUID or human ID.
@@ -263,6 +297,8 @@ export namespace WebsocketResponse {
263
297
  * conversation IDs) as context IDs.
264
298
  */
265
299
  context_id?: string | null;
300
+
301
+ type?: 'chunk';
266
302
  }
267
303
 
268
304
  export interface FlushDone {
@@ -368,34 +404,26 @@ export interface TTSGenerateParams {
368
404
  voice: VoiceSpecifier;
369
405
 
370
406
  /**
371
- * The maximum duration of the audio in seconds. You do not usually need to specify
372
- * this. If the duration is not appropriate for the length of the transcript, the
373
- * output audio may be truncated.
374
- */
375
- duration?: number | null;
376
-
377
- /**
378
- * Configure the various attributes of the generated speech. These controls are
379
- * only available for `sonic-3-preview` and will have no effect on earlier models.
407
+ * Configure the various attributes of the generated speech. These are only for
408
+ * `sonic-3` and have no effect on earlier models.
409
+ *
410
+ * See
411
+ * [Volume, Speed, and Emotion in Sonic-3](/build-with-cartesia/sonic-3/volume-speed-emotion)
412
+ * for a guide on this option.
380
413
  */
381
- generation_config?: GenerationConfig | null;
414
+ generation_config?: GenerationConfig;
382
415
 
383
416
  /**
384
- * The language that the given voice should speak the transcript in.
385
- *
386
- * Options: English (en), French (fr), German (de), Spanish (es), Portuguese (pt),
387
- * Chinese (zh), Japanese (ja), Hindi (hi), Italian (it), Korean (ko), Dutch (nl),
388
- * Polish (pl), Russian (ru), Swedish (sv), Turkish (tr).
417
+ * The language that the given voice should speak the transcript in. For valid
418
+ * options, see [Models](/build-with-cartesia/tts-models).
389
419
  */
390
420
  language?: VoicesAPI.SupportedLanguage | null;
391
421
 
392
422
  /**
393
- * A list of pronunciation dict IDs to use for the generation. This will be applied
394
- * in addition to the pinned pronunciation dict, which will be treated as the first
395
- * element of the list. If there are conflicts with dict items, the latest dict
396
- * will take precedence.
423
+ * The ID of a pronunciation dictionary to use for the generation. Pronunciation
424
+ * dictionaries are supported by `sonic-3` models and newer.
397
425
  */
398
- pronunciation_dict_ids?: Array<string> | null;
426
+ pronunciation_dict_id?: string | null;
399
427
 
400
428
  /**
401
429
  * Whether to save the generated audio file. When true, the response will include a
@@ -404,14 +432,12 @@ export interface TTSGenerateParams {
404
432
  save?: boolean | null;
405
433
 
406
434
  /**
407
- * > This feature is experimental and may not work for all voices.
408
- *
409
- * Speed setting for the model. Defaults to `normal`.
410
- *
411
- * Influences the speed of the generated speech. Faster speeds may reduce
412
- * hallucination rate.
435
+ * @deprecated Use `generation_config.speed` for sonic-3. Speed setting for the
436
+ * model. Defaults to `normal`. This feature is experimental and may not work for
437
+ * all voices. Influences the speed of the generated speech. Faster speeds may
438
+ * reduce hallucination rate.
413
439
  */
414
- speed?: ModelSpeed | null;
440
+ speed?: ModelSpeed;
415
441
  }
416
442
 
417
443
  export namespace TTSGenerateParams {
@@ -424,13 +450,9 @@ export namespace TTSGenerateParams {
424
450
  }
425
451
 
426
452
  export interface MP3OutputFormat {
427
- /**
428
- * The bit rate of the audio in bits per second. Supported bit rates are 32000,
429
- * 64000, 96000, 128000, 192000.
430
- */
431
- bit_rate: number;
453
+ bit_rate: 32000 | 64000 | 96000 | 128000 | 192000;
432
454
 
433
- sample_rate: number;
455
+ sample_rate: 8000 | 16000 | 22050 | 24000 | 44100 | 48000;
434
456
 
435
457
  container?: 'mp3';
436
458
  }
@@ -469,38 +491,34 @@ export interface TTSGenerateSseParams {
469
491
  context_id?: string | null;
470
492
 
471
493
  /**
472
- * The maximum duration of the audio in seconds. You do not usually need to specify
473
- * this. If the duration is not appropriate for the length of the transcript, the
474
- * output audio may be truncated.
494
+ * Configure the various attributes of the generated speech. These are only for
495
+ * `sonic-3` and have no effect on earlier models.
496
+ *
497
+ * See
498
+ * [Volume, Speed, and Emotion in Sonic-3](/build-with-cartesia/sonic-3/volume-speed-emotion)
499
+ * for a guide on this option.
475
500
  */
476
- duration?: number | null;
501
+ generation_config?: GenerationConfig;
477
502
 
478
503
  /**
479
- * The language that the given voice should speak the transcript in.
480
- *
481
- * Options: English (en), French (fr), German (de), Spanish (es), Portuguese (pt),
482
- * Chinese (zh), Japanese (ja), Hindi (hi), Italian (it), Korean (ko), Dutch (nl),
483
- * Polish (pl), Russian (ru), Swedish (sv), Turkish (tr).
504
+ * The language that the given voice should speak the transcript in. For valid
505
+ * options, see [Models](/build-with-cartesia/tts-models).
484
506
  */
485
- language?: VoicesAPI.SupportedLanguage | null;
507
+ language?: VoicesAPI.SupportedLanguage;
486
508
 
487
509
  /**
488
- * A list of pronunciation dict IDs to use for the generation. This will be applied
489
- * in addition to the pinned pronunciation dict, which will be treated as the first
490
- * element of the list. If there are conflicts with dict items, the latest dict
491
- * will take precedence.
510
+ * The ID of a pronunciation dictionary to use for the generation. Pronunciation
511
+ * dictionaries are supported by `sonic-3` models and newer.
492
512
  */
493
- pronunciation_dict_ids?: Array<string> | null;
513
+ pronunciation_dict_id?: string | null;
494
514
 
495
515
  /**
496
- * > This feature is experimental and may not work for all voices.
497
- *
498
- * Speed setting for the model. Defaults to `normal`.
499
- *
500
- * Influences the speed of the generated speech. Faster speeds may reduce
501
- * hallucination rate.
516
+ * @deprecated Use `generation_config.speed` for sonic-3. Speed setting for the
517
+ * model. Defaults to `normal`. This feature is experimental and may not work for
518
+ * all voices. Influences the speed of the generated speech. Faster speeds may
519
+ * reduce hallucination rate.
502
520
  */
503
- speed?: ModelSpeed | null;
521
+ speed?: ModelSpeed;
504
522
 
505
523
  /**
506
524
  * Whether to use normalized timestamps (True) or original timestamps (False).
@@ -514,7 +532,7 @@ export namespace TTSGenerateSseParams {
514
532
 
515
533
  encoding: InfillAPI.RawEncoding;
516
534
 
517
- sample_rate: number;
535
+ sample_rate: 8000 | 16000 | 22050 | 24000 | 44100 | 48000;
518
536
  }
519
537
  }
520
538
 
@@ -54,7 +54,7 @@ export interface VoiceChangerChangeVoiceBytesParams {
54
54
  */
55
55
  'output_format[encoding]'?: InfillAPI.RawEncoding | null;
56
56
 
57
- 'output_format[sample_rate]'?: number;
57
+ 'output_format[sample_rate]'?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000;
58
58
 
59
59
  'voice[id]'?: string;
60
60
  }
@@ -74,7 +74,7 @@ export interface VoiceChangerChangeVoiceSseParams {
74
74
  */
75
75
  'output_format[encoding]'?: InfillAPI.RawEncoding | null;
76
76
 
77
- 'output_format[sample_rate]'?: number;
77
+ 'output_format[sample_rate]'?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000;
78
78
 
79
79
  'voice[id]'?: string;
80
80
  }