@cartesia/cartesia-js 3.0.0-b3 → 3.0.0-b4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/README.md +21 -21
- package/client.d.mts +2 -2
- package/client.d.mts.map +1 -1
- package/client.d.ts +2 -2
- package/client.d.ts.map +1 -1
- package/client.js.map +1 -1
- package/client.mjs.map +1 -1
- package/package.json +1 -1
- package/resources/access-token.d.mts +5 -0
- package/resources/access-token.d.mts.map +1 -1
- package/resources/access-token.d.ts +5 -0
- package/resources/access-token.d.ts.map +1 -1
- package/resources/agents/agents.js +1 -1
- package/resources/agents/agents.js.map +1 -1
- package/resources/agents/agents.mjs +1 -1
- package/resources/agents/agents.mjs.map +1 -1
- package/resources/agents/metrics/index.d.mts +1 -1
- package/resources/agents/metrics/index.d.mts.map +1 -1
- package/resources/agents/metrics/index.d.ts +1 -1
- package/resources/agents/metrics/index.d.ts.map +1 -1
- package/resources/agents/metrics/index.js.map +1 -1
- package/resources/agents/metrics/index.mjs.map +1 -1
- package/resources/agents/metrics/metrics.d.mts +2 -2
- package/resources/agents/metrics/metrics.d.mts.map +1 -1
- package/resources/agents/metrics/metrics.d.ts +2 -2
- package/resources/agents/metrics/metrics.d.ts.map +1 -1
- package/resources/agents/metrics/metrics.js.map +1 -1
- package/resources/agents/metrics/metrics.mjs.map +1 -1
- package/resources/agents/metrics/results.d.mts +23 -22
- package/resources/agents/metrics/results.d.mts.map +1 -1
- package/resources/agents/metrics/results.d.ts +23 -22
- package/resources/agents/metrics/results.d.ts.map +1 -1
- package/resources/agents/metrics/results.js +4 -5
- package/resources/agents/metrics/results.js.map +1 -1
- package/resources/agents/metrics/results.mjs +4 -5
- package/resources/agents/metrics/results.mjs.map +1 -1
- package/resources/index.d.mts +1 -1
- package/resources/index.d.mts.map +1 -1
- package/resources/index.d.ts +1 -1
- package/resources/index.d.ts.map +1 -1
- package/resources/index.js.map +1 -1
- package/resources/index.mjs.map +1 -1
- package/resources/infill.d.mts +3 -4
- package/resources/infill.d.mts.map +1 -1
- package/resources/infill.d.ts +3 -4
- package/resources/infill.d.ts.map +1 -1
- package/resources/infill.js +0 -2
- package/resources/infill.js.map +1 -1
- package/resources/infill.mjs +0 -2
- package/resources/infill.mjs.map +1 -1
- package/resources/pronunciation-dicts.d.mts +0 -8
- package/resources/pronunciation-dicts.d.mts.map +1 -1
- package/resources/pronunciation-dicts.d.ts +0 -8
- package/resources/pronunciation-dicts.d.ts.map +1 -1
- package/resources/pronunciation-dicts.js +0 -18
- package/resources/pronunciation-dicts.js.map +1 -1
- package/resources/pronunciation-dicts.mjs +0 -18
- package/resources/pronunciation-dicts.mjs.map +1 -1
- package/resources/stt.d.mts +1 -104
- package/resources/stt.d.mts.map +1 -1
- package/resources/stt.d.ts +1 -104
- package/resources/stt.d.ts.map +1 -1
- package/resources/tts/tts.d.mts +79 -115
- package/resources/tts/tts.d.mts.map +1 -1
- package/resources/tts/tts.d.ts +79 -115
- package/resources/tts/tts.d.ts.map +1 -1
- package/resources/voice-changer.d.mts +2 -2
- package/resources/voice-changer.d.mts.map +1 -1
- package/resources/voice-changer.d.ts +2 -2
- package/resources/voice-changer.d.ts.map +1 -1
- package/resources/voices.d.mts +70 -34
- package/resources/voices.d.mts.map +1 -1
- package/resources/voices.d.ts +70 -34
- package/resources/voices.d.ts.map +1 -1
- package/resources/voices.js +45 -3
- package/resources/voices.js.map +1 -1
- package/resources/voices.mjs +45 -3
- package/resources/voices.mjs.map +1 -1
- package/src/client.ts +2 -0
- package/src/resources/access-token.ts +6 -0
- package/src/resources/agents/agents.ts +1 -1
- package/src/resources/agents/metrics/index.ts +1 -0
- package/src/resources/agents/metrics/metrics.ts +2 -0
- package/src/resources/agents/metrics/results.ts +27 -23
- package/src/resources/index.ts +1 -0
- package/src/resources/infill.ts +3 -4
- package/src/resources/pronunciation-dicts.ts +0 -20
- package/src/resources/stt.ts +102 -104
- package/src/resources/tts/tts.ts +146 -128
- package/src/resources/voice-changer.ts +2 -2
- package/src/resources/voices.ts +105 -38
- package/src/version.ts +1 -1
- package/version.d.mts +1 -1
- package/version.d.ts +1 -1
- package/version.js +1 -1
- package/version.mjs +1 -1
package/src/resources/stt.ts
CHANGED
|
@@ -107,111 +107,109 @@ export interface SttTranscribeParams {
|
|
|
107
107
|
/**
|
|
108
108
|
* Body param: The language of the input audio in ISO-639-1 format. Defaults to
|
|
109
109
|
* `en`.
|
|
110
|
-
*
|
|
111
|
-
* <Accordion title="Supported languages">
|
|
112
|
-
* - `en` (English)
|
|
113
|
-
* - `zh` (Chinese)
|
|
114
|
-
* - `de` (German)
|
|
115
|
-
* - `es` (Spanish)
|
|
116
|
-
* - `ru` (Russian)
|
|
117
|
-
* - `ko` (Korean)
|
|
118
|
-
* - `fr` (French)
|
|
119
|
-
* - `ja` (Japanese)
|
|
120
|
-
* - `pt` (Portuguese)
|
|
121
|
-
* - `tr` (Turkish)
|
|
122
|
-
* - `pl` (Polish)
|
|
123
|
-
* - `ca` (Catalan)
|
|
124
|
-
* - `nl` (Dutch)
|
|
125
|
-
* - `ar` (Arabic)
|
|
126
|
-
* - `sv` (Swedish)
|
|
127
|
-
* - `it` (Italian)
|
|
128
|
-
* - `id` (Indonesian)
|
|
129
|
-
* - `hi` (Hindi)
|
|
130
|
-
* - `fi` (Finnish)
|
|
131
|
-
* - `vi` (Vietnamese)
|
|
132
|
-
* - `he` (Hebrew)
|
|
133
|
-
* - `uk` (Ukrainian)
|
|
134
|
-
* - `el` (Greek)
|
|
135
|
-
* - `ms` (Malay)
|
|
136
|
-
* - `cs` (Czech)
|
|
137
|
-
* - `ro` (Romanian)
|
|
138
|
-
* - `da` (Danish)
|
|
139
|
-
* - `hu` (Hungarian)
|
|
140
|
-
* - `ta` (Tamil)
|
|
141
|
-
* - `no` (Norwegian)
|
|
142
|
-
* - `th` (Thai)
|
|
143
|
-
* - `ur` (Urdu)
|
|
144
|
-
* - `hr` (Croatian)
|
|
145
|
-
* - `bg` (Bulgarian)
|
|
146
|
-
* - `lt` (Lithuanian)
|
|
147
|
-
* - `la` (Latin)
|
|
148
|
-
* - `mi` (Maori)
|
|
149
|
-
* - `ml` (Malayalam)
|
|
150
|
-
* - `cy` (Welsh)
|
|
151
|
-
* - `sk` (Slovak)
|
|
152
|
-
* - `te` (Telugu)
|
|
153
|
-
* - `fa` (Persian)
|
|
154
|
-
* - `lv` (Latvian)
|
|
155
|
-
* - `bn` (Bengali)
|
|
156
|
-
* - `sr` (Serbian)
|
|
157
|
-
* - `az` (Azerbaijani)
|
|
158
|
-
* - `sl` (Slovenian)
|
|
159
|
-
* - `kn` (Kannada)
|
|
160
|
-
* - `et` (Estonian)
|
|
161
|
-
* - `mk` (Macedonian)
|
|
162
|
-
* - `br` (Breton)
|
|
163
|
-
* - `eu` (Basque)
|
|
164
|
-
* - `is` (Icelandic)
|
|
165
|
-
* - `hy` (Armenian)
|
|
166
|
-
* - `ne` (Nepali)
|
|
167
|
-
* - `mn` (Mongolian)
|
|
168
|
-
* - `bs` (Bosnian)
|
|
169
|
-
* - `kk` (Kazakh)
|
|
170
|
-
* - `sq` (Albanian)
|
|
171
|
-
* - `sw` (Swahili)
|
|
172
|
-
* - `gl` (Galician)
|
|
173
|
-
* - `mr` (Marathi)
|
|
174
|
-
* - `pa` (Punjabi)
|
|
175
|
-
* - `si` (Sinhala)
|
|
176
|
-
* - `km` (Khmer)
|
|
177
|
-
* - `sn` (Shona)
|
|
178
|
-
* - `yo` (Yoruba)
|
|
179
|
-
* - `so` (Somali)
|
|
180
|
-
* - `af` (Afrikaans)
|
|
181
|
-
* - `oc` (Occitan)
|
|
182
|
-
* - `ka` (Georgian)
|
|
183
|
-
* - `be` (Belarusian)
|
|
184
|
-
* - `tg` (Tajik)
|
|
185
|
-
* - `sd` (Sindhi)
|
|
186
|
-
* - `gu` (Gujarati)
|
|
187
|
-
* - `am` (Amharic)
|
|
188
|
-
* - `yi` (Yiddish)
|
|
189
|
-
* - `lo` (Lao)
|
|
190
|
-
* - `uz` (Uzbek)
|
|
191
|
-
* - `fo` (Faroese)
|
|
192
|
-
* - `ht` (Haitian Creole)
|
|
193
|
-
* - `ps` (Pashto)
|
|
194
|
-
* - `tk` (Turkmen)
|
|
195
|
-
* - `nn` (Nynorsk)
|
|
196
|
-
* - `mt` (Maltese)
|
|
197
|
-
* - `sa` (Sanskrit)
|
|
198
|
-
* - `lb` (Luxembourgish)
|
|
199
|
-
* - `my` (Myanmar)
|
|
200
|
-
* - `bo` (Tibetan)
|
|
201
|
-
* - `tl` (Tagalog)
|
|
202
|
-
* - `mg` (Malagasy)
|
|
203
|
-
* - `as` (Assamese)
|
|
204
|
-
* - `tt` (Tatar)
|
|
205
|
-
* - `haw` (Hawaiian)
|
|
206
|
-
* - `ln` (Lingala)
|
|
207
|
-
* - `ha` (Hausa)
|
|
208
|
-
* - `ba` (Bashkir)
|
|
209
|
-
* - `jw` (Javanese)
|
|
210
|
-
* - `su` (Sundanese)
|
|
211
|
-
* - `yue` (Cantonese)
|
|
212
|
-
* </Accordion>
|
|
213
110
|
*/
|
|
214
|
-
language?:
|
|
111
|
+
language?:
|
|
112
|
+
| 'en'
|
|
113
|
+
| 'zh'
|
|
114
|
+
| 'de'
|
|
115
|
+
| 'es'
|
|
116
|
+
| 'ru'
|
|
117
|
+
| 'ko'
|
|
118
|
+
| 'fr'
|
|
119
|
+
| 'ja'
|
|
120
|
+
| 'pt'
|
|
121
|
+
| 'tr'
|
|
122
|
+
| 'pl'
|
|
123
|
+
| 'ca'
|
|
124
|
+
| 'nl'
|
|
125
|
+
| 'ar'
|
|
126
|
+
| 'sv'
|
|
127
|
+
| 'it'
|
|
128
|
+
| 'id'
|
|
129
|
+
| 'hi'
|
|
130
|
+
| 'fi'
|
|
131
|
+
| 'vi'
|
|
132
|
+
| 'he'
|
|
133
|
+
| 'uk'
|
|
134
|
+
| 'el'
|
|
135
|
+
| 'ms'
|
|
136
|
+
| 'cs'
|
|
137
|
+
| 'ro'
|
|
138
|
+
| 'da'
|
|
139
|
+
| 'hu'
|
|
140
|
+
| 'ta'
|
|
141
|
+
| 'no'
|
|
142
|
+
| 'th'
|
|
143
|
+
| 'ur'
|
|
144
|
+
| 'hr'
|
|
145
|
+
| 'bg'
|
|
146
|
+
| 'lt'
|
|
147
|
+
| 'la'
|
|
148
|
+
| 'mi'
|
|
149
|
+
| 'ml'
|
|
150
|
+
| 'cy'
|
|
151
|
+
| 'sk'
|
|
152
|
+
| 'te'
|
|
153
|
+
| 'fa'
|
|
154
|
+
| 'lv'
|
|
155
|
+
| 'bn'
|
|
156
|
+
| 'sr'
|
|
157
|
+
| 'az'
|
|
158
|
+
| 'sl'
|
|
159
|
+
| 'kn'
|
|
160
|
+
| 'et'
|
|
161
|
+
| 'mk'
|
|
162
|
+
| 'br'
|
|
163
|
+
| 'eu'
|
|
164
|
+
| 'is'
|
|
165
|
+
| 'hy'
|
|
166
|
+
| 'ne'
|
|
167
|
+
| 'mn'
|
|
168
|
+
| 'bs'
|
|
169
|
+
| 'kk'
|
|
170
|
+
| 'sq'
|
|
171
|
+
| 'sw'
|
|
172
|
+
| 'gl'
|
|
173
|
+
| 'mr'
|
|
174
|
+
| 'pa'
|
|
175
|
+
| 'si'
|
|
176
|
+
| 'km'
|
|
177
|
+
| 'sn'
|
|
178
|
+
| 'yo'
|
|
179
|
+
| 'so'
|
|
180
|
+
| 'af'
|
|
181
|
+
| 'oc'
|
|
182
|
+
| 'ka'
|
|
183
|
+
| 'be'
|
|
184
|
+
| 'tg'
|
|
185
|
+
| 'sd'
|
|
186
|
+
| 'gu'
|
|
187
|
+
| 'am'
|
|
188
|
+
| 'yi'
|
|
189
|
+
| 'lo'
|
|
190
|
+
| 'uz'
|
|
191
|
+
| 'fo'
|
|
192
|
+
| 'ht'
|
|
193
|
+
| 'ps'
|
|
194
|
+
| 'tk'
|
|
195
|
+
| 'nn'
|
|
196
|
+
| 'mt'
|
|
197
|
+
| 'sa'
|
|
198
|
+
| 'lb'
|
|
199
|
+
| 'my'
|
|
200
|
+
| 'bo'
|
|
201
|
+
| 'tl'
|
|
202
|
+
| 'mg'
|
|
203
|
+
| 'as'
|
|
204
|
+
| 'tt'
|
|
205
|
+
| 'haw'
|
|
206
|
+
| 'ln'
|
|
207
|
+
| 'ha'
|
|
208
|
+
| 'ba'
|
|
209
|
+
| 'jw'
|
|
210
|
+
| 'su'
|
|
211
|
+
| 'yue'
|
|
212
|
+
| null;
|
|
215
213
|
|
|
216
214
|
/**
|
|
217
215
|
* Body param: ID of the model to use for transcription. Use `ink-whisper` for the
|
package/src/resources/tts/tts.ts
CHANGED
|
@@ -34,42 +34,88 @@ export class TTS extends APIResource {
|
|
|
34
34
|
}
|
|
35
35
|
|
|
36
36
|
/**
|
|
37
|
-
* Configure the various attributes of the generated speech. These
|
|
38
|
-
*
|
|
37
|
+
* Configure the various attributes of the generated speech. These are only for
|
|
38
|
+
* `sonic-3` and have no effect on earlier models.
|
|
39
|
+
*
|
|
40
|
+
* See
|
|
41
|
+
* [Volume, Speed, and Emotion in Sonic-3](/build-with-cartesia/sonic-3/volume-speed-emotion)
|
|
42
|
+
* for a guide on this option.
|
|
39
43
|
*/
|
|
40
44
|
export interface GenerationConfig {
|
|
41
45
|
/**
|
|
42
|
-
*
|
|
43
|
-
*/
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
46
|
+
* Guide the emotion of the generated speech.
|
|
47
|
+
*/
|
|
48
|
+
emotion?:
|
|
49
|
+
| 'neutral'
|
|
50
|
+
| 'happy'
|
|
51
|
+
| 'excited'
|
|
52
|
+
| 'enthusiastic'
|
|
53
|
+
| 'elated'
|
|
54
|
+
| 'euphoric'
|
|
55
|
+
| 'triumphant'
|
|
56
|
+
| 'amazed'
|
|
57
|
+
| 'surprised'
|
|
58
|
+
| 'flirtatious'
|
|
59
|
+
| 'curious'
|
|
60
|
+
| 'content'
|
|
61
|
+
| 'peaceful'
|
|
62
|
+
| 'serene'
|
|
63
|
+
| 'calm'
|
|
64
|
+
| 'grateful'
|
|
65
|
+
| 'affectionate'
|
|
66
|
+
| 'trust'
|
|
67
|
+
| 'sympathetic'
|
|
68
|
+
| 'anticipation'
|
|
69
|
+
| 'mysterious'
|
|
70
|
+
| 'angry'
|
|
71
|
+
| 'mad'
|
|
72
|
+
| 'outraged'
|
|
73
|
+
| 'frustrated'
|
|
74
|
+
| 'agitated'
|
|
75
|
+
| 'threatened'
|
|
76
|
+
| 'disgusted'
|
|
77
|
+
| 'contempt'
|
|
78
|
+
| 'envious'
|
|
79
|
+
| 'sarcastic'
|
|
80
|
+
| 'ironic'
|
|
81
|
+
| 'sad'
|
|
82
|
+
| 'dejected'
|
|
83
|
+
| 'melancholic'
|
|
84
|
+
| 'disappointed'
|
|
85
|
+
| 'hurt'
|
|
86
|
+
| 'guilty'
|
|
87
|
+
| 'bored'
|
|
88
|
+
| 'tired'
|
|
89
|
+
| 'rejected'
|
|
90
|
+
| 'nostalgic'
|
|
91
|
+
| 'wistful'
|
|
92
|
+
| 'apologetic'
|
|
93
|
+
| 'hesitant'
|
|
94
|
+
| 'insecure'
|
|
95
|
+
| 'confused'
|
|
96
|
+
| 'resigned'
|
|
97
|
+
| 'anxious'
|
|
98
|
+
| 'panicked'
|
|
99
|
+
| 'alarmed'
|
|
100
|
+
| 'scared'
|
|
101
|
+
| 'proud'
|
|
102
|
+
| 'confident'
|
|
103
|
+
| 'distant'
|
|
104
|
+
| 'skeptical'
|
|
105
|
+
| 'contemplative'
|
|
106
|
+
| 'determined';
|
|
107
|
+
|
|
108
|
+
/**
|
|
109
|
+
* Adjust the speed of the generated speech between 0.6x and 1.5x the original
|
|
110
|
+
* speed (default is 1.0x). Valid values are between [0.6, 1.5] inclusive.
|
|
111
|
+
*/
|
|
112
|
+
speed?: number;
|
|
113
|
+
|
|
114
|
+
/**
|
|
115
|
+
* Adjust the volume of the generated speech between 0.5x and 2.0x the original
|
|
116
|
+
* volume (default is 1.0x). Valid values are between [0.5, 2.0] inclusive.
|
|
117
|
+
*/
|
|
118
|
+
volume?: number;
|
|
73
119
|
}
|
|
74
120
|
|
|
75
121
|
export interface GenerationRequest {
|
|
@@ -117,30 +163,30 @@ export interface GenerationRequest {
|
|
|
117
163
|
*/
|
|
118
164
|
continue?: boolean | null;
|
|
119
165
|
|
|
120
|
-
/**
|
|
121
|
-
* The maximum duration of the audio in seconds. You do not usually need to specify
|
|
122
|
-
* this. If the duration is not appropriate for the length of the transcript, the
|
|
123
|
-
* output audio may be truncated.
|
|
124
|
-
*/
|
|
125
|
-
duration?: number | null;
|
|
126
|
-
|
|
127
166
|
/**
|
|
128
167
|
* Whether to flush the context.
|
|
129
168
|
*/
|
|
130
169
|
flush?: boolean | null;
|
|
131
170
|
|
|
132
171
|
/**
|
|
133
|
-
*
|
|
172
|
+
* Configure the various attributes of the generated speech. These are only for
|
|
173
|
+
* `sonic-3` and have no effect on earlier models.
|
|
134
174
|
*
|
|
135
|
-
*
|
|
136
|
-
*
|
|
137
|
-
*
|
|
175
|
+
* See
|
|
176
|
+
* [Volume, Speed, and Emotion in Sonic-3](/build-with-cartesia/sonic-3/volume-speed-emotion)
|
|
177
|
+
* for a guide on this option.
|
|
138
178
|
*/
|
|
139
|
-
|
|
179
|
+
generation_config?: GenerationConfig;
|
|
180
|
+
|
|
181
|
+
/**
|
|
182
|
+
* The language that the given voice should speak the transcript in. For valid
|
|
183
|
+
* options, see [Models](/build-with-cartesia/tts-models).
|
|
184
|
+
*/
|
|
185
|
+
language?: VoicesAPI.SupportedLanguage;
|
|
140
186
|
|
|
141
187
|
/**
|
|
142
188
|
* The maximum time in milliseconds to buffer text before starting generation.
|
|
143
|
-
* Values between [0,
|
|
189
|
+
* Values between [0, 5000]ms are supported. Defaults to 3000ms.
|
|
144
190
|
*
|
|
145
191
|
* When set, the model will buffer incoming text chunks until it's confident it has
|
|
146
192
|
* enough context to generate high-quality speech, or the buffer delay elapses,
|
|
@@ -153,22 +199,18 @@ export interface GenerationRequest {
|
|
|
153
199
|
max_buffer_delay_ms?: number | null;
|
|
154
200
|
|
|
155
201
|
/**
|
|
156
|
-
*
|
|
157
|
-
*
|
|
158
|
-
* element of the list. If there are conflicts with dict items, the latest dict
|
|
159
|
-
* will take precedence.
|
|
202
|
+
* The ID of a pronunciation dictionary to use for the generation. Pronunciation
|
|
203
|
+
* dictionaries are supported by `sonic-3` models and newer.
|
|
160
204
|
*/
|
|
161
|
-
|
|
205
|
+
pronunciation_dict_id?: string | null;
|
|
162
206
|
|
|
163
207
|
/**
|
|
164
|
-
*
|
|
165
|
-
*
|
|
166
|
-
*
|
|
167
|
-
*
|
|
168
|
-
* Influences the speed of the generated speech. Faster speeds may reduce
|
|
169
|
-
* hallucination rate.
|
|
208
|
+
* @deprecated Use `generation_config.speed` for sonic-3. Speed setting for the
|
|
209
|
+
* model. Defaults to `normal`. This feature is experimental and may not work for
|
|
210
|
+
* all voices. Influences the speed of the generated speech. Faster speeds may
|
|
211
|
+
* reduce hallucination rate.
|
|
170
212
|
*/
|
|
171
|
-
speed?: ModelSpeed
|
|
213
|
+
speed?: ModelSpeed;
|
|
172
214
|
|
|
173
215
|
/**
|
|
174
216
|
* Whether to use normalized timestamps (True) or original timestamps (False).
|
|
@@ -182,24 +224,22 @@ export namespace GenerationRequest {
|
|
|
182
224
|
|
|
183
225
|
encoding: InfillAPI.RawEncoding;
|
|
184
226
|
|
|
185
|
-
sample_rate:
|
|
227
|
+
sample_rate: 8000 | 16000 | 22050 | 24000 | 44100 | 48000;
|
|
186
228
|
}
|
|
187
229
|
}
|
|
188
230
|
|
|
189
231
|
/**
|
|
190
|
-
*
|
|
191
|
-
*
|
|
192
|
-
*
|
|
193
|
-
*
|
|
194
|
-
* Influences the speed of the generated speech. Faster speeds may reduce
|
|
195
|
-
* hallucination rate.
|
|
232
|
+
* @deprecated Use `generation_config.speed` for sonic-3. Speed setting for the
|
|
233
|
+
* model. Defaults to `normal`. This feature is experimental and may not work for
|
|
234
|
+
* all voices. Influences the speed of the generated speech. Faster speeds may
|
|
235
|
+
* reduce hallucination rate.
|
|
196
236
|
*/
|
|
197
237
|
export type ModelSpeed = 'slow' | 'normal' | 'fast';
|
|
198
238
|
|
|
199
239
|
export interface RawOutputFormat {
|
|
200
240
|
encoding: InfillAPI.RawEncoding;
|
|
201
241
|
|
|
202
|
-
sample_rate:
|
|
242
|
+
sample_rate: 8000 | 16000 | 22050 | 24000 | 44100 | 48000;
|
|
203
243
|
}
|
|
204
244
|
|
|
205
245
|
export interface VoiceSpecifier {
|
|
@@ -245,16 +285,10 @@ export type WebsocketResponse =
|
|
|
245
285
|
|
|
246
286
|
export namespace WebsocketResponse {
|
|
247
287
|
export interface Chunk {
|
|
248
|
-
data: string;
|
|
249
|
-
|
|
250
288
|
done: boolean;
|
|
251
289
|
|
|
252
290
|
status_code: number;
|
|
253
291
|
|
|
254
|
-
step_time: number;
|
|
255
|
-
|
|
256
|
-
type: 'chunk';
|
|
257
|
-
|
|
258
292
|
/**
|
|
259
293
|
* A unique identifier for the context. You can use any unique identifier, like a
|
|
260
294
|
* UUID or human ID.
|
|
@@ -263,6 +297,8 @@ export namespace WebsocketResponse {
|
|
|
263
297
|
* conversation IDs) as context IDs.
|
|
264
298
|
*/
|
|
265
299
|
context_id?: string | null;
|
|
300
|
+
|
|
301
|
+
type?: 'chunk';
|
|
266
302
|
}
|
|
267
303
|
|
|
268
304
|
export interface FlushDone {
|
|
@@ -368,34 +404,26 @@ export interface TTSGenerateParams {
|
|
|
368
404
|
voice: VoiceSpecifier;
|
|
369
405
|
|
|
370
406
|
/**
|
|
371
|
-
*
|
|
372
|
-
*
|
|
373
|
-
*
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
/**
|
|
378
|
-
* Configure the various attributes of the generated speech. These controls are
|
|
379
|
-
* only available for `sonic-3-preview` and will have no effect on earlier models.
|
|
407
|
+
* Configure the various attributes of the generated speech. These are only for
|
|
408
|
+
* `sonic-3` and have no effect on earlier models.
|
|
409
|
+
*
|
|
410
|
+
* See
|
|
411
|
+
* [Volume, Speed, and Emotion in Sonic-3](/build-with-cartesia/sonic-3/volume-speed-emotion)
|
|
412
|
+
* for a guide on this option.
|
|
380
413
|
*/
|
|
381
|
-
generation_config?: GenerationConfig
|
|
414
|
+
generation_config?: GenerationConfig;
|
|
382
415
|
|
|
383
416
|
/**
|
|
384
|
-
* The language that the given voice should speak the transcript in.
|
|
385
|
-
*
|
|
386
|
-
* Options: English (en), French (fr), German (de), Spanish (es), Portuguese (pt),
|
|
387
|
-
* Chinese (zh), Japanese (ja), Hindi (hi), Italian (it), Korean (ko), Dutch (nl),
|
|
388
|
-
* Polish (pl), Russian (ru), Swedish (sv), Turkish (tr).
|
|
417
|
+
* The language that the given voice should speak the transcript in. For valid
|
|
418
|
+
* options, see [Models](/build-with-cartesia/tts-models).
|
|
389
419
|
*/
|
|
390
420
|
language?: VoicesAPI.SupportedLanguage | null;
|
|
391
421
|
|
|
392
422
|
/**
|
|
393
|
-
*
|
|
394
|
-
*
|
|
395
|
-
* element of the list. If there are conflicts with dict items, the latest dict
|
|
396
|
-
* will take precedence.
|
|
423
|
+
* The ID of a pronunciation dictionary to use for the generation. Pronunciation
|
|
424
|
+
* dictionaries are supported by `sonic-3` models and newer.
|
|
397
425
|
*/
|
|
398
|
-
|
|
426
|
+
pronunciation_dict_id?: string | null;
|
|
399
427
|
|
|
400
428
|
/**
|
|
401
429
|
* Whether to save the generated audio file. When true, the response will include a
|
|
@@ -404,14 +432,12 @@ export interface TTSGenerateParams {
|
|
|
404
432
|
save?: boolean | null;
|
|
405
433
|
|
|
406
434
|
/**
|
|
407
|
-
*
|
|
408
|
-
*
|
|
409
|
-
*
|
|
410
|
-
*
|
|
411
|
-
* Influences the speed of the generated speech. Faster speeds may reduce
|
|
412
|
-
* hallucination rate.
|
|
435
|
+
* @deprecated Use `generation_config.speed` for sonic-3. Speed setting for the
|
|
436
|
+
* model. Defaults to `normal`. This feature is experimental and may not work for
|
|
437
|
+
* all voices. Influences the speed of the generated speech. Faster speeds may
|
|
438
|
+
* reduce hallucination rate.
|
|
413
439
|
*/
|
|
414
|
-
speed?: ModelSpeed
|
|
440
|
+
speed?: ModelSpeed;
|
|
415
441
|
}
|
|
416
442
|
|
|
417
443
|
export namespace TTSGenerateParams {
|
|
@@ -424,13 +450,9 @@ export namespace TTSGenerateParams {
|
|
|
424
450
|
}
|
|
425
451
|
|
|
426
452
|
export interface MP3OutputFormat {
|
|
427
|
-
|
|
428
|
-
* The bit rate of the audio in bits per second. Supported bit rates are 32000,
|
|
429
|
-
* 64000, 96000, 128000, 192000.
|
|
430
|
-
*/
|
|
431
|
-
bit_rate: number;
|
|
453
|
+
bit_rate: 32000 | 64000 | 96000 | 128000 | 192000;
|
|
432
454
|
|
|
433
|
-
sample_rate:
|
|
455
|
+
sample_rate: 8000 | 16000 | 22050 | 24000 | 44100 | 48000;
|
|
434
456
|
|
|
435
457
|
container?: 'mp3';
|
|
436
458
|
}
|
|
@@ -469,38 +491,34 @@ export interface TTSGenerateSseParams {
|
|
|
469
491
|
context_id?: string | null;
|
|
470
492
|
|
|
471
493
|
/**
|
|
472
|
-
*
|
|
473
|
-
*
|
|
474
|
-
*
|
|
494
|
+
* Configure the various attributes of the generated speech. These are only for
|
|
495
|
+
* `sonic-3` and have no effect on earlier models.
|
|
496
|
+
*
|
|
497
|
+
* See
|
|
498
|
+
* [Volume, Speed, and Emotion in Sonic-3](/build-with-cartesia/sonic-3/volume-speed-emotion)
|
|
499
|
+
* for a guide on this option.
|
|
475
500
|
*/
|
|
476
|
-
|
|
501
|
+
generation_config?: GenerationConfig;
|
|
477
502
|
|
|
478
503
|
/**
|
|
479
|
-
* The language that the given voice should speak the transcript in.
|
|
480
|
-
*
|
|
481
|
-
* Options: English (en), French (fr), German (de), Spanish (es), Portuguese (pt),
|
|
482
|
-
* Chinese (zh), Japanese (ja), Hindi (hi), Italian (it), Korean (ko), Dutch (nl),
|
|
483
|
-
* Polish (pl), Russian (ru), Swedish (sv), Turkish (tr).
|
|
504
|
+
* The language that the given voice should speak the transcript in. For valid
|
|
505
|
+
* options, see [Models](/build-with-cartesia/tts-models).
|
|
484
506
|
*/
|
|
485
|
-
language?: VoicesAPI.SupportedLanguage
|
|
507
|
+
language?: VoicesAPI.SupportedLanguage;
|
|
486
508
|
|
|
487
509
|
/**
|
|
488
|
-
*
|
|
489
|
-
*
|
|
490
|
-
* element of the list. If there are conflicts with dict items, the latest dict
|
|
491
|
-
* will take precedence.
|
|
510
|
+
* The ID of a pronunciation dictionary to use for the generation. Pronunciation
|
|
511
|
+
* dictionaries are supported by `sonic-3` models and newer.
|
|
492
512
|
*/
|
|
493
|
-
|
|
513
|
+
pronunciation_dict_id?: string | null;
|
|
494
514
|
|
|
495
515
|
/**
|
|
496
|
-
*
|
|
497
|
-
*
|
|
498
|
-
*
|
|
499
|
-
*
|
|
500
|
-
* Influences the speed of the generated speech. Faster speeds may reduce
|
|
501
|
-
* hallucination rate.
|
|
516
|
+
* @deprecated Use `generation_config.speed` for sonic-3. Speed setting for the
|
|
517
|
+
* model. Defaults to `normal`. This feature is experimental and may not work for
|
|
518
|
+
* all voices. Influences the speed of the generated speech. Faster speeds may
|
|
519
|
+
* reduce hallucination rate.
|
|
502
520
|
*/
|
|
503
|
-
speed?: ModelSpeed
|
|
521
|
+
speed?: ModelSpeed;
|
|
504
522
|
|
|
505
523
|
/**
|
|
506
524
|
* Whether to use normalized timestamps (True) or original timestamps (False).
|
|
@@ -514,7 +532,7 @@ export namespace TTSGenerateSseParams {
|
|
|
514
532
|
|
|
515
533
|
encoding: InfillAPI.RawEncoding;
|
|
516
534
|
|
|
517
|
-
sample_rate:
|
|
535
|
+
sample_rate: 8000 | 16000 | 22050 | 24000 | 44100 | 48000;
|
|
518
536
|
}
|
|
519
537
|
}
|
|
520
538
|
|
|
package/src/resources/voice-changer.ts
CHANGED
|
@@ -54,7 +54,7 @@ export interface VoiceChangerChangeVoiceBytesParams {
|
|
|
54
54
|
*/
|
|
55
55
|
'output_format[encoding]'?: InfillAPI.RawEncoding | null;
|
|
56
56
|
|
|
57
|
-
'output_format[sample_rate]'?:
|
|
57
|
+
'output_format[sample_rate]'?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000;
|
|
58
58
|
|
|
59
59
|
'voice[id]'?: string;
|
|
60
60
|
}
|
|
@@ -74,7 +74,7 @@ export interface VoiceChangerChangeVoiceSseParams {
|
|
|
74
74
|
*/
|
|
75
75
|
'output_format[encoding]'?: InfillAPI.RawEncoding | null;
|
|
76
76
|
|
|
77
|
-
'output_format[sample_rate]'?:
|
|
77
|
+
'output_format[sample_rate]'?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000;
|
|
78
78
|
|
|
79
79
|
'voice[id]'?: string;
|
|
80
80
|
}
|