@cartesia/cartesia-js 3.0.0-b11 → 3.0.0-b12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/backcompat/errors.d.mts +1 -1
- package/backcompat/errors.d.mts.map +1 -1
- package/backcompat/errors.d.ts +1 -1
- package/backcompat/errors.d.ts.map +1 -1
- package/backcompat/errors.js +1 -1
- package/backcompat/errors.js.map +1 -1
- package/backcompat/errors.mjs +1 -1
- package/backcompat/errors.mjs.map +1 -1
- package/backcompat/index.d.mts.map +1 -1
- package/backcompat/index.d.ts.map +1 -1
- package/backcompat/index.js +5 -5
- package/backcompat/index.js.map +1 -1
- package/backcompat/index.mjs +5 -5
- package/backcompat/index.mjs.map +1 -1
- package/backcompat/tts-wrapper.d.mts +33 -10
- package/backcompat/tts-wrapper.d.mts.map +1 -1
- package/backcompat/tts-wrapper.d.ts +33 -10
- package/backcompat/tts-wrapper.d.ts.map +1 -1
- package/backcompat/tts-wrapper.js +83 -26
- package/backcompat/tts-wrapper.js.map +1 -1
- package/backcompat/tts-wrapper.mjs +85 -28
- package/backcompat/tts-wrapper.mjs.map +1 -1
- package/backcompat/types.d.mts +1 -1
- package/backcompat/types.d.mts.map +1 -1
- package/backcompat/types.d.ts +1 -1
- package/backcompat/types.d.ts.map +1 -1
- package/backcompat/voice-changer-wrapper.d.mts +4 -4
- package/backcompat/voice-changer-wrapper.d.mts.map +1 -1
- package/backcompat/voice-changer-wrapper.d.ts +4 -4
- package/backcompat/voice-changer-wrapper.d.ts.map +1 -1
- package/backcompat/voice-changer-wrapper.js +6 -6
- package/backcompat/voice-changer-wrapper.js.map +1 -1
- package/backcompat/voice-changer-wrapper.mjs +7 -7
- package/backcompat/voice-changer-wrapper.mjs.map +1 -1
- package/backcompat/voices-wrapper.d.mts +18 -5
- package/backcompat/voices-wrapper.d.mts.map +1 -1
- package/backcompat/voices-wrapper.d.ts +18 -5
- package/backcompat/voices-wrapper.d.ts.map +1 -1
- package/backcompat/voices-wrapper.js +63 -0
- package/backcompat/voices-wrapper.js.map +1 -1
- package/backcompat/voices-wrapper.mjs +63 -0
- package/backcompat/voices-wrapper.mjs.map +1 -1
- package/package.json +1 -1
- package/resources/agents/agents.d.mts +4 -4
- package/resources/agents/agents.d.mts.map +1 -1
- package/resources/agents/agents.d.ts +4 -4
- package/resources/agents/agents.d.ts.map +1 -1
- package/resources/agents/agents.js +9 -9
- package/resources/agents/agents.js.map +1 -1
- package/resources/agents/agents.mjs +9 -9
- package/resources/agents/agents.mjs.map +1 -1
- package/resources/agents/calls.d.mts +1 -1
- package/resources/agents/calls.d.mts.map +1 -1
- package/resources/agents/calls.d.ts +1 -1
- package/resources/agents/calls.d.ts.map +1 -1
- package/resources/agents/calls.js +1 -2
- package/resources/agents/calls.js.map +1 -1
- package/resources/agents/calls.mjs +1 -2
- package/resources/agents/calls.mjs.map +1 -1
- package/resources/datasets/datasets.d.mts +3 -3
- package/resources/datasets/datasets.d.mts.map +1 -1
- package/resources/datasets/datasets.d.ts +3 -3
- package/resources/datasets/datasets.d.ts.map +1 -1
- package/resources/datasets/datasets.js +1 -1
- package/resources/datasets/datasets.mjs +1 -1
- package/resources/datasets/files.d.mts +3 -3
- package/resources/datasets/files.d.ts +3 -3
- package/resources/datasets/files.js +1 -1
- package/resources/datasets/files.mjs +1 -1
- package/resources/datasets/index.d.mts +1 -1
- package/resources/datasets/index.d.ts +1 -1
- package/resources/infill.d.mts +1 -1
- package/resources/infill.d.mts.map +1 -1
- package/resources/infill.d.ts +1 -1
- package/resources/infill.d.ts.map +1 -1
- package/resources/infill.js +1 -6
- package/resources/infill.js.map +1 -1
- package/resources/infill.mjs +1 -6
- package/resources/infill.mjs.map +1 -1
- package/resources/pronunciation-dicts.d.mts +1 -1
- package/resources/pronunciation-dicts.d.ts +1 -1
- package/resources/pronunciation-dicts.js +1 -1
- package/resources/pronunciation-dicts.mjs +1 -1
- package/resources/voice-changer.d.mts +1 -1
- package/resources/voice-changer.d.mts.map +1 -1
- package/resources/voice-changer.d.ts +1 -1
- package/resources/voice-changer.d.ts.map +1 -1
- package/resources/voice-changer.js +1 -6
- package/resources/voice-changer.js.map +1 -1
- package/resources/voice-changer.mjs +1 -6
- package/resources/voice-changer.mjs.map +1 -1
- package/resources/voices.d.mts +9 -9
- package/resources/voices.d.mts.map +1 -1
- package/resources/voices.d.ts +9 -9
- package/resources/voices.d.ts.map +1 -1
- package/resources/voices.js +14 -14
- package/resources/voices.js.map +1 -1
- package/resources/voices.mjs +14 -14
- package/resources/voices.mjs.map +1 -1
- package/src/backcompat/errors.ts +32 -40
- package/src/backcompat/index.ts +64 -67
- package/src/backcompat/tts-wrapper.ts +405 -322
- package/src/backcompat/types.ts +13 -13
- package/src/backcompat/voice-changer-wrapper.ts +58 -56
- package/src/backcompat/voices-wrapper.ts +217 -150
- package/src/resources/agents/agents.ts +10 -10
- package/src/resources/agents/calls.ts +2 -3
- package/src/resources/datasets/datasets.ts +3 -3
- package/src/resources/datasets/files.ts +3 -3
- package/src/resources/datasets/index.ts +1 -1
- package/src/resources/infill.ts +2 -7
- package/src/resources/pronunciation-dicts.ts +1 -1
- package/src/resources/voice-changer.ts +2 -7
- package/src/resources/voices.ts +15 -15
- package/src/version.ts +1 -1
- package/version.d.mts +1 -1
- package/version.d.ts +1 -1
- package/version.js +1 -1
- package/version.mjs +1 -1
|
@@ -1,351 +1,434 @@
|
|
|
1
|
-
import WebSocket from
|
|
2
|
-
import { Cartesia } from
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
import { Readable } from "stream";
|
|
1
|
+
import WebSocket from 'ws';
|
|
2
|
+
import { Cartesia } from '../client';
|
|
3
|
+
import { BackCompatRequestOptions } from './types';
|
|
4
|
+
import { wrap } from './errors';
|
|
5
|
+
import { Readable } from 'stream';
|
|
7
6
|
|
|
8
7
|
// Define compatible interfaces to match the old SDK types for WebSocket
|
|
9
8
|
/**
 * Connection-level audio format for the back-compat TTS WebSocket.
 *
 * `WebSocketWrapper.send` uses these values to build `output_format` whenever
 * a request does not carry its own `outputFormat`.
 */
export interface BackCompatWebSocketOptions {
  /** Audio container, forwarded as `output_format.container`. */
  container?: 'raw' | 'wav' | 'mp3';
  /** Sample encoding, forwarded as `output_format.encoding`. */
  encoding?: 'pcm_f32le' | 'pcm_s16le' | 'pcm_alaw' | 'pcm_mulaw';
  /** Sample rate, forwarded as `output_format.sample_rate` (presumably Hz — confirm against the API docs). */
  sampleRate: number;
}
|
|
14
13
|
|
|
15
14
|
/**
 * Voice selector accepted by the back-compat TTS requests.
 *
 * Discriminated on `mode`: either a stored voice referenced by `id`, or a raw
 * `embedding` vector. The object is forwarded to the API unchanged as `voice`.
 */
export type BackCompatTtsRequestVoiceSpecifier =
  | { mode: 'id'; id: string }
  | { mode: 'embedding'; embedding: number[] };
|
|
18
17
|
|
|
19
18
|
/**
 * Optional generation controls for back-compat TTS requests.
 *
 * The object is forwarded as-is under `generation_config`; valid ranges and
 * values are defined by the API, not by this type.
 */
export interface BackCompatGenerationConfig {
  /** Volume control value. */
  volume?: number;
  /** Speed control value. */
  speed?: number;
  /** Emotion tags. Simplified from a strict union for backcompat flexibility. */
  emotion?: string[];
}
|
|
24
23
|
|
|
25
24
|
/**
 * Request accepted by `WebSocketWrapper.send`.
 *
 * Field names are camelCase here; `send` maps them onto the snake_case keys of
 * the WebSocket payload (`model_id`, `context_id`, `output_format`, ...).
 */
export interface BackCompatWebSocketTtsRequest {
  /** Model to generate with, sent as `model_id`. */
  modelId: string;
  /** Text to synthesize. */
  transcript: string;
  /** Voice to synthesize with, forwarded unchanged. */
  voice: BackCompatTtsRequestVoiceSpecifier;
  /** Optional generation controls, sent as `generation_config`. */
  generationConfig?: BackCompatGenerationConfig;
  /** Per-request audio format; when omitted, `send` uses the connection-level config instead. */
  outputFormat?: {
    container?: 'raw' | 'wav' | 'mp3';
    encoding?: 'pcm_f32le' | 'pcm_s16le' | 'pcm_alaw' | 'pcm_mulaw';
    sampleRate?: number;
    bitRate?: number;
  };
  /** Routing key for responses; `send` generates one when it is missing or empty. */
  contextId?: string; // Backcompat might pass this in request?
  // Add other fields as needed
  /** Sent as `continue` when defined. */
  continue?: boolean;
  /** Sent as `duration` when defined. */
  duration?: number;
  /** Sent as `add_timestamps` when defined. */
  addTimestamps?: boolean;
  /** Sent as `add_phoneme_timestamps` when defined. */
  addPhonemeTimestamps?: boolean;
}
|
|
43
42
|
|
|
44
43
|
/**
 * Request accepted by `TTSWrapper.bytes`.
 *
 * Field names are camelCase here; `bytes` maps them onto the snake_case
 * parameters of `client.tts.generate`.
 */
export interface BackCompatTtsRequest {
  /** Model to generate with, sent as `model_id`. */
  modelId: string;
  /** Text to synthesize. */
  transcript: string;
  /** Voice to synthesize with, forwarded unchanged. */
  voice: BackCompatTtsRequestVoiceSpecifier;
  /** Forwarded as `language`. */
  language?: string;
  /** Audio format of the returned stream, sent as `output_format`. */
  outputFormat: {
    container: 'raw' | 'wav' | 'mp3';
    encoding?: 'pcm_f32le' | 'pcm_s16le' | 'pcm_alaw' | 'pcm_mulaw';
    sampleRate: number;
    bitRate?: number;
  };
  /** Optional generation controls, sent as `generation_config`. */
  generationConfig?: BackCompatGenerationConfig;
  /** Forwarded as `duration`. */
  duration?: number;
  /** Forwarded as `speed`. */
  speed?: 'slow' | 'normal' | 'fast';
  /** Sent as `pronunciation_dict_id`. */
  pronunciationDictId?: string;
}
|
|
60
59
|
|
|
61
60
|
/**
 * Helper for generating UUIDs. Not cryptographically secure.
 *
 * Fills the version-4 template with hex digits drawn from `Math.random()`:
 * every `x` becomes a random nibble, and `y` becomes one of `8`, `9`, `a`, `b`
 * (top two bits forced to `10`).
 *
 * @returns A lowercase string shaped like `xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx`.
 */
function uuidv4(): string {
  return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {
    const r = (Math.random() * 16) | 0;
    const v = c === 'x' ? r : (r & 0x3) | 0x8;
    return v.toString(16);
  });
}
|
|
68
68
|
|
|
69
69
|
/**
 * In-memory queue of audio bytes that a consumer drains as 32-bit float samples.
 *
 * A producer calls `push` for every received chunk and `markDone` once no more
 * data will arrive; a consumer repeatedly awaits `read` until it returns 0.
 *
 * Only one pending `read` is supported at a time: the waiter slot holds a
 * single resolver, so overlapping reads would overwrite each other.
 */
class AudioSource {
  /** Queued chunks; each one is non-empty and a whole number of 4-byte samples long. */
  private buffers: Buffer[] = [];
  /** Trailing bytes of a sample that was split across two pushed chunks. */
  private remainder: Buffer | null = null;
  /** Resolver of the currently blocked `read`, if any. */
  private waiter: (() => void) | null = null;
  public isDone = false;

  /**
   * Queues a chunk of raw sample bytes and wakes a blocked reader.
   *
   * Chunks do not have to end on a sample boundary: a trailing partial sample
   * is held back and prepended to the next chunk, so samples stay aligned and
   * `read` always counts whole floats. Chunks that do not complete a sample
   * (including empty ones) do not wake the reader, because there is nothing to
   * read yet and a 0 result would look like end-of-stream.
   */
  push(data: Buffer) {
    const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;

    let chunk = data;
    if (this.remainder) {
      chunk = Buffer.concat([this.remainder, data]);
      this.remainder = null;
    }

    const tailBytes = chunk.length % bytesPerFloat;
    const wholeBytes = chunk.length - tailBytes;
    if (tailBytes > 0) {
      this.remainder = chunk.subarray(wholeBytes);
    }
    if (wholeBytes === 0) {
      return;
    }

    this.buffers.push(tailBytes > 0 ? chunk.subarray(0, wholeBytes) : chunk);
    this.wake();
  }

  /** Marks the stream as finished and wakes a blocked reader so it can observe the end. */
  markDone() {
    this.isDone = true;
    // An incomplete trailing sample can never be completed once the stream ends.
    this.remainder = null;
    this.wake();
  }

  /**
   * Copies queued samples into `outBuffer`, waiting first if nothing is queued
   * and the stream is still open.
   *
   * Bytes are reinterpreted through a `Float32Array` view, i.e. in the host's
   * byte order.
   *
   * @param outBuffer Destination; at most `outBuffer.length` samples are written, starting at index 0.
   * @returns Number of samples written; 0 once the stream is done and drained.
   */
  async read(outBuffer: Float32Array): Promise<number> {
    if (this.buffers.length === 0 && !this.isDone) {
      await new Promise<void>((resolve) => {
        this.waiter = resolve;
      });
    }

    const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
    const maxFloats = outBuffer.length;
    let totalFloatsRead = 0;

    while (totalFloatsRead < maxFloats) {
      const buf = this.buffers[0];
      if (buf === undefined) {
        break;
      }

      // Integer by construction: push() only queues whole samples.
      const floatsInBuf = buf.length / bytesPerFloat;
      const floatsToCopy = Math.min(floatsInBuf, maxFloats - totalFloatsRead);
      const bytesToCopy = floatsToCopy * bytesPerFloat;

      // A Float32Array view needs a byteOffset that is a multiple of 4; when the
      // chunk is not aligned inside its ArrayBuffer, copy the needed bytes into a
      // fresh (offset 0) buffer first.
      const srcFloats =
        buf.byteOffset % bytesPerFloat === 0 ?
          new Float32Array(buf.buffer, buf.byteOffset, floatsToCopy)
        : new Float32Array(new Uint8Array(buf.subarray(0, bytesToCopy)).buffer);

      outBuffer.set(srcFloats, totalFloatsRead);
      totalFloatsRead += floatsToCopy;

      if (floatsToCopy < floatsInBuf) {
        // Partially consumed: keep the unread tail at the head of the queue.
        this.buffers[0] = buf.subarray(bytesToCopy);
      } else {
        this.buffers.shift();
      }
    }

    return totalFloatsRead;
  }

  /** Resolves and clears the pending reader, if there is one. */
  private wake() {
    const resolve = this.waiter;
    if (resolve) {
      this.waiter = null;
      resolve();
    }
  }
}
|
|
141
143
|
|
|
142
144
|
/**
 * Back-compat wrapper around a raw TTS WebSocket connection.
 *
 * `connect` opens the socket, `send` submits one generation request and returns
 * the `AudioSource` that receives its audio, and `disconnect` closes the socket.
 * Incoming messages are routed to sources by their `context_id`.
 */
export class WebSocketWrapper {
  private client: Cartesia;
  private config: BackCompatWebSocketOptions;
  private socket: WebSocket | null = null;
  /** Sources registered by `send`, keyed by context id. */
  private sources: Map<string, AudioSource> = new Map();
  // Fallback source for messages without context_id or if we just want to capture everything (legacy behavior?)
  // The original test didn't use context_id explicitly in send() but expected a response source.
  // We'll map context_id to source.
  private defaultSource: AudioSource | null = null;

  /**
   * @param client Client whose `baseURL` and `apiKey` are used to connect.
   * @param config Connection-level output format used when a request has none.
   */
  constructor(client: Cartesia, config: BackCompatWebSocketOptions) {
    this.client = client;
    this.config = config;
  }

  /**
   * Opens the WebSocket and installs the message/close handlers.
   *
   * The URL is derived from the client's `baseURL` by switching the scheme to
   * `ws`/`wss` and appending `/tts/websocket` unless it is already present.
   *
   * @returns A promise that resolves on `open` and rejects on an `error` emitted before that.
   */
  async connect() {
    const baseURL = this.client.baseURL;
    // baseURL is like https://api.cartesia.ai
    let urlStr = baseURL.replace(/^http/, 'ws');
    if (!urlStr.includes('/tts/websocket')) {
      urlStr += urlStr.endsWith('/') ? 'tts/websocket' : '/tts/websocket';
    }
    const url = new URL(urlStr);

    const headers: Record<string, string> = {
      'cartesia-version': '2025-04-16',
    };
    if (this.client.apiKey) {
      headers['Authorization'] = `Bearer ${this.client.apiKey}`;
    }

    // Keep a local reference so the handlers never need non-null assertions and
    // can tell whether they still belong to the current connection.
    const socket = new WebSocket(url.toString(), { headers });
    this.socket = socket;

    return new Promise<void>((resolve, reject) => {
      socket.on('open', () => {
        console.log('WebSocket connected.');
        resolve();
      });

      socket.on('error', (err) => {
        console.error('WebSocket error:', err);
        // No effect once the promise has already been resolved by 'open'.
        reject(err);
      });

      socket.on('message', (data) => {
        this.handleMessage(data);
      });

      socket.on('close', () => {
        console.log('WebSocket closed.');
        this.sources.forEach((s) => {
          s.markDone();
        });
        if (this.defaultSource) this.defaultSource.markDone();

        // Nothing more can arrive for these sources: release them so the
        // wrapper does not keep every source of the connection alive.
        this.sources.clear();
        this.defaultSource = null;
        if (this.socket === socket) {
          // Makes a later send() fail with 'WebSocket not connected'.
          this.socket = null;
        }
      });
    });
  }

  /**
   * Parses one server message and routes it to the matching source.
   *
   * `chunk` messages carry base64 audio in `data`; `done` and `error` finish the
   * source. Messages whose `context_id` is missing or unknown go to the default
   * source. Parse failures are logged and otherwise ignored.
   */
  private handleMessage(data: WebSocket.Data) {
    try {
      const msg: { type?: string; data?: string; context_id?: string } = JSON.parse(data.toString());

      const contextId = msg.context_id;
      // If we received a message for a context we don't know about, and we have a default source, use it
      const source = (contextId ? this.sources.get(contextId) : undefined) ?? this.defaultSource;

      if (msg.type === 'chunk' && msg.data) {
        const audioData = Buffer.from(msg.data, 'base64');
        if (source) source.push(audioData);
      } else if (msg.type === 'done') {
        if (source) source.markDone();
      } else if (msg.type === 'error') {
        console.error('Server error:', msg);
        if (source) source.markDone(); // Fail the stream?
      }
    } catch (e) {
      console.error('Error parsing message:', e);
    }
  }

  /**
   * Sends one generation request over the open socket.
   *
   * A context id is generated when the request has none. If the context id
   * already has a source that is not done (e.g. a continuation on the same
   * context), that source is reused so the earlier caller keeps receiving audio
   * and the completion signal instead of being silently replaced.
   *
   * NOTE(review): finished sources stay in the map until the socket closes.
   *
   * @param request The request to send; see `BackCompatWebSocketTtsRequest`.
   * @returns An object whose `source` receives the audio for this context.
   * @throws Error if `connect` has not been called or the socket has closed;
   *   rethrows anything thrown by the underlying `socket.send`.
   */
  async send(request: BackCompatWebSocketTtsRequest) {
    const socket = this.socket;
    if (!socket) {
      throw new Error('WebSocket not connected');
    }

    // Ensure request has a context_id so we can route the response
    const contextId = request.contextId || uuidv4();

    const existing = this.sources.get(contextId);
    const source = existing && !existing.isDone ? existing : new AudioSource();
    const previousDefault = this.defaultSource;
    this.sources.set(contextId, source);
    // Also set as default source if none exists, for compatibility with simple tests
    if (!this.defaultSource) {
      this.defaultSource = source;
    }

    // Construct payload
    const payload: Record<string, unknown> = {
      model_id: request.modelId,
      transcript: request.transcript,
      voice: request.voice,
      context_id: contextId,
    };

    // Output Format
    if (request.outputFormat) {
      payload.output_format = {
        container: request.outputFormat.container,
        encoding: request.outputFormat.encoding,
        sample_rate: request.outputFormat.sampleRate,
        bit_rate: request.outputFormat.bitRate,
      };
    } else if (this.config) {
      payload.output_format = {
        container: this.config.container,
        encoding: this.config.encoding,
        sample_rate: this.config.sampleRate,
      };
    }

    // Generation Config
    if (request.generationConfig) {
      payload.generation_config = request.generationConfig;
    }

    // Other fields
    if (request.continue !== undefined) payload.continue = request.continue;
    if (request.duration !== undefined) payload.duration = request.duration;
    if (request.addTimestamps !== undefined) payload.add_timestamps = request.addTimestamps;
    if (request.addPhonemeTimestamps !== undefined)
      payload.add_phoneme_timestamps = request.addPhonemeTimestamps;

    try {
      socket.send(JSON.stringify(payload));
    } catch (err) {
      // The request never left: restore the routing state from before this call
      // so no orphaned source stays registered.
      if (existing) {
        this.sources.set(contextId, existing);
      } else {
        this.sources.delete(contextId);
      }
      this.defaultSource = previousDefault;
      throw err;
    }

    return {
      source: source,
    };
  }

  /** Closes the socket if one exists; sources are finished by the `close` handler. */
  disconnect() {
    if (this.socket) {
      this.socket.close();
    }
  }
}
|
|
301
|
+
|
|
302
|
+
/**
 * Optional settings for `TTSWrapper.generate`.
 *
 * Field names are camelCase here; `generate` maps them onto the snake_case
 * parameters of `client.tts.generate`.
 */
export interface BackCompatTtsGenerateOptions {
  /** Model to generate with; `generate` falls back to `'sonic-2'` when omitted. */
  modelId?: string;
  /** Audio format; `generate` falls back to WAV / `pcm_s16le` / 44100 when omitted. */
  outputFormat?: {
    container: 'raw' | 'wav' | 'mp3';
    encoding?: 'pcm_f32le' | 'pcm_s16le' | 'pcm_alaw' | 'pcm_mulaw';
    sampleRate: number;
    bitRate?: number;
  };
  /** Forwarded as `language` when set. */
  language?: string;
  /** Forwarded as `generation_config` when set. */
  generationConfig?: BackCompatGenerationConfig;
  /** Forwarded as `speed` when set. */
  speed?: 'slow' | 'normal' | 'fast';
  /** Sent as `pronunciation_dict_id` when set. */
  pronunciationDictId?: string;
}
|
|
296
315
|
|
|
297
316
|
/** Maps a camelCase back-compat output format onto the snake_case shape the API expects. */
function toApiOutputFormat(format: {
  container?: 'raw' | 'wav' | 'mp3';
  encoding?: 'pcm_f32le' | 'pcm_s16le' | 'pcm_alaw' | 'pcm_mulaw';
  sampleRate?: number;
  bitRate?: number;
}) {
  return {
    container: format.container,
    encoding: format.encoding,
    sample_rate: format.sampleRate,
    bit_rate: format.bitRate,
  };
}

/** Converts a web response body into a Node `Readable`, rejecting a missing body. */
function toNodeReadable(body: Parameters<typeof Readable.fromWeb>[0] | null | undefined): Readable {
  if (!body) {
    throw new Error('Response body is null');
  }
  return Readable.fromWeb(body);
}

/** @deprecated Use the new SDK's tts methods on the {@link Cartesia} instance instead. */
export class TTSWrapper {
  private client: Cartesia;

  /** @param client Client used for every request made through this wrapper. */
  constructor(client: Cartesia) {
    this.client = client;
  }

  /** @deprecated Use {@link Cartesia.tts.websocket} instead. */
  websocket(config: BackCompatWebSocketOptions) {
    return new WebSocketWrapper(this.client, config);
  }

  /**
   * Generate speech from text.
   *
   * Defaults to model `'sonic-2'` and to WAV / `pcm_s16le` / 44100 output when
   * the corresponding options are omitted.
   *
   * @param transcript The text to convert to speech
   * @param voiceId The voice ID to use
   * @param options Generation options
   * @param signal Optional abort signal
   * @param _source Optional source identifier (e.g., "playground_tts") - for tracking purposes; not used by this method
   * @returns A Node stream of the generated audio.
   * @throws Error if the response has no body; request failures pass through `wrap`.
   * @deprecated Use {@link Cartesia.tts.generate} instead.
   */
  async generate(
    transcript: string,
    voiceId: string,
    options?: BackCompatTtsGenerateOptions,
    signal?: AbortSignal,
    _source?: string,
  ): Promise<Readable> {
    // `any`: the back-compat shapes are looser than the typed parameters of the
    // new client, so the request is assembled untyped.
    const params: any = {
      model_id: options?.modelId ?? 'sonic-2',
      transcript,
      voice: { mode: 'id', id: voiceId },
    };

    if (options?.outputFormat) {
      params.output_format = toApiOutputFormat(options.outputFormat);
    } else {
      // Default output format
      params.output_format = {
        container: 'wav',
        encoding: 'pcm_s16le',
        sample_rate: 44100,
      };
    }

    if (options?.language) {
      params.language = options.language;
    }
    if (options?.generationConfig) {
      params.generation_config = options.generationConfig;
    }
    if (options?.speed) {
      params.speed = options.speed;
    }
    if (options?.pronunciationDictId) {
      params.pronunciation_dict_id = options.pronunciationDictId;
    }

    const requestOptions: any = {};
    if (signal) {
      requestOptions.signal = signal;
    }

    const response = await wrap(this.client.tts.generate(params, requestOptions));
    return toNodeReadable(response.body);
  }

  /**
   * Generate speech from a full back-compat request object.
   *
   * @param request Request fields, mapped onto the snake_case API parameters.
   * @param requestOptions Optional per-request timeout (seconds), retry count, headers and abort signal.
   * @returns A Node stream of the generated audio.
   * @throws Error if the response has no body; request failures pass through `wrap`.
   * @deprecated Use {@link Cartesia.tts.generate} instead.
   */
  async bytes(request: BackCompatTtsRequest, requestOptions?: BackCompatRequestOptions): Promise<Readable> {
    const params: any = {
      model_id: request.modelId,
      transcript: request.transcript,
      voice: request.voice,
      generation_config: request.generationConfig,
      duration: request.duration,
      language: request.language,
      speed: request.speed,
      pronunciation_dict_id: request.pronunciationDictId,
    };

    if (request.outputFormat) {
      params.output_format = toApiOutputFormat(request.outputFormat);
    }

    const options: any = {};
    if (requestOptions) {
      // NOTE(review): a timeout of 0 is treated as "not set" here, while
      // maxRetries: 0 is honoured below — confirm this asymmetry is intended.
      if (requestOptions.timeoutInSeconds) {
        options.timeout = requestOptions.timeoutInSeconds * 1000; // seconds -> milliseconds
      }
      if (requestOptions.maxRetries !== undefined) {
        options.maxRetries = requestOptions.maxRetries;
      }
      options.headers = requestOptions.headers;
      options.signal = requestOptions.abortSignal;
    }

    const response = await wrap(this.client.tts.generate(params, options));
    return toNodeReadable(response.body);
  }
}
|