kugelaudio 0.2.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/README.md +35 -14
- package/dist/index.d.mts +518 -26
- package/dist/index.d.ts +518 -26
- package/dist/index.js +864 -111
- package/dist/index.mjs +858 -111
- package/package.json +8 -7
- package/src/client.test.ts +548 -0
- package/src/client.ts +885 -103
- package/src/errors.ts +266 -18
- package/src/index.ts +17 -2
- package/src/types.ts +215 -9
- package/src/websocket.ts +38 -18
package/src/client.ts
CHANGED
|
@@ -3,26 +3,50 @@
|
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
5
|
import {
|
|
6
|
-
|
|
7
|
-
InsufficientCreditsError,
|
|
6
|
+
ConnectionError,
|
|
8
7
|
KugelAudioError,
|
|
9
|
-
|
|
8
|
+
ValidationError,
|
|
9
|
+
classifyHttpError,
|
|
10
|
+
classifyWsClose,
|
|
11
|
+
classifyWsFrame,
|
|
12
|
+
classifyWsHandshakeError,
|
|
10
13
|
} from './errors';
|
|
11
14
|
import type {
|
|
12
15
|
AudioChunk,
|
|
13
16
|
AudioResponse,
|
|
17
|
+
CreateVoiceOptions,
|
|
14
18
|
GenerateOptions,
|
|
15
19
|
GenerationStats,
|
|
16
20
|
KugelAudioOptions,
|
|
17
21
|
Model,
|
|
18
22
|
StreamCallbacks,
|
|
19
|
-
|
|
23
|
+
StreamConfig,
|
|
24
|
+
StreamingSessionCallbacks,
|
|
25
|
+
UpdateVoiceOptions,
|
|
26
|
+
VoiceDetail,
|
|
27
|
+
VoiceListResponse,
|
|
28
|
+
VoiceReference,
|
|
20
29
|
WordTimestamp
|
|
21
30
|
} from './types';
|
|
22
31
|
import { base64ToArrayBuffer } from './utils';
|
|
23
32
|
import { getWebSocket } from './websocket';
|
|
24
33
|
|
|
34
|
+
import type { Region } from './types';
|
|
35
|
+
|
|
25
36
|
/** Default API base URL. */
const DEFAULT_API_URL = 'https://api.kugelaudio.com';
/** API base URL on the `eu` host. */
const EU_API_URL = 'https://api.eu.kugelaudio.com';
/** Region identifiers known to this client. */
const SUPPORTED_REGIONS = ['eu', 'us', 'global'] as const;
|
|
39
|
+
|
|
40
|
+
/** Key prefixes that encode a region: the region name followed by a dash. */
const REGION_PREFIXES = ['eu-', 'us-', 'global-'] as const;

/**
 * Split an optional region prefix off an API key.
 *
 * Matching is case-sensitive and only looks at the start of the key.
 *
 * @param apiKey - Key as supplied by the caller, with or without a region prefix.
 * @returns `cleanKey` is the key without the prefix; `detectedRegion` is the
 *   prefix minus its trailing dash, and is omitted when no prefix matched.
 */
function parseApiKey(apiKey: string): { cleanKey: string; detectedRegion?: Region } {
  const matched = REGION_PREFIXES.find((candidate) => apiKey.startsWith(candidate));
  if (matched === undefined) {
    return { cleanKey: apiKey };
  }
  return {
    cleanKey: apiKey.slice(matched.length),
    // Drop the trailing '-' to turn the prefix into the region name.
    detectedRegion: matched.slice(0, -1) as Region,
  };
}
|
|
26
50
|
|
|
27
51
|
/**
|
|
28
52
|
* Create a new WebSocket instance.
|
|
@@ -37,6 +61,23 @@ function createWs(url: string): WebSocket {
|
|
|
37
61
|
/** WebSocket OPEN readyState constant. */
|
|
38
62
|
const WS_OPEN = 1;
|
|
39
63
|
|
|
64
|
+
/** Set once the missing-language warning has been printed, so it is shown at most once. */
let _languageWarningLogged = false;

/**
 * Print a one-time console warning when normalization is enabled but no
 * language was supplied.
 *
 * @param language - Language given by the caller; any falsy value counts as unset.
 * @param normalize - Normalization flag; `undefined` is treated as enabled.
 */
function warnIfNoLanguage(
  language: string | undefined,
  normalize: boolean | undefined
): void {
  if (_languageWarningLogged || language) return;

  const normEnabled = normalize === undefined || normalize;
  if (!normEnabled) return;

  _languageWarningLogged = true;
  console.warn(
    "[KugelAudio] No 'language' set with normalization enabled — the server " +
      'will auto-detect the language, adding ~60-150ms to TTFA. Set language ' +
      "(e.g., language: 'en') for optimal latency."
  );
}
|
|
80
|
+
|
|
40
81
|
/**
|
|
41
82
|
* Models resource for listing TTS models.
|
|
42
83
|
*/
|
|
@@ -72,52 +113,211 @@ class VoicesResource {
|
|
|
72
113
|
language?: string;
|
|
73
114
|
includePublic?: boolean;
|
|
74
115
|
limit?: number;
|
|
75
|
-
|
|
116
|
+
offset?: number;
|
|
117
|
+
}): Promise<VoiceListResponse> {
|
|
76
118
|
const params = new URLSearchParams();
|
|
77
119
|
if (options?.language) params.set('language', options.language);
|
|
78
120
|
if (options?.includePublic !== undefined) {
|
|
79
121
|
params.set('include_public', String(options.includePublic));
|
|
80
122
|
}
|
|
81
123
|
if (options?.limit) params.set('limit', String(options.limit));
|
|
124
|
+
if (options?.offset) params.set('offset', String(options.offset));
|
|
82
125
|
|
|
83
126
|
const query = params.toString();
|
|
84
127
|
const path = query ? `/v1/voices?${query}` : '/v1/voices';
|
|
85
|
-
const response = await this.client.request<{ voices: any[] }>('GET', path);
|
|
128
|
+
const response = await this.client.request<{ voices: any[]; total: number; limit: number; offset: number }>('GET', path);
|
|
86
129
|
|
|
87
|
-
return
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
130
|
+
return {
|
|
131
|
+
voices: response.voices.map((v) => ({
|
|
132
|
+
id: v.id,
|
|
133
|
+
name: v.name,
|
|
134
|
+
description: v.description,
|
|
135
|
+
category: v.category,
|
|
136
|
+
sex: v.sex,
|
|
137
|
+
age: v.age,
|
|
138
|
+
supportedLanguages: v.supported_languages || [],
|
|
139
|
+
sampleText: v.sample_text,
|
|
140
|
+
avatarUrl: v.avatar_url,
|
|
141
|
+
sampleUrl: v.sample_url,
|
|
142
|
+
isPublic: v.is_public || false,
|
|
143
|
+
verified: v.verified || false,
|
|
144
|
+
})),
|
|
145
|
+
total: response.total,
|
|
146
|
+
limit: response.limit,
|
|
147
|
+
offset: response.offset,
|
|
148
|
+
};
|
|
101
149
|
}
|
|
102
150
|
|
|
103
151
|
/**
 * Get a specific voice by ID.
 *
 * @param voiceId - Voice ID, interpolated into `/v1/voices/{voiceId}`.
 * @returns The response mapped through `mapVoiceDetail`.
 */
async get(voiceId: number): Promise<VoiceDetail> {
  const raw = await this.client.request<any>('GET', `/v1/voices/${voiceId}`);
  return this.mapVoiceDetail(raw);
}
|
|
158
|
+
|
|
159
|
+
/**
 * Create a new voice.
 *
 * Sends a multipart POST to `/v1/voices`: a JSON `metadata` part plus one
 * `files` part per entry in `options.referenceFiles`.
 *
 * Defaults for omitted options: description `''`, category
 * `'conversational'`, age `'middle_age'`, quality `'mid'`, supported
 * languages `['en']`, isPublic `false`, sampleText `''`.
 *
 * @param options - Voice attributes and optional reference audio files.
 * @returns The created voice, mapped through `mapVoiceDetail`.
 */
async create(options: CreateVoiceOptions): Promise<VoiceDetail> {
  const metadata = {
    name: options.name,
    sex: options.sex,
    description: options.description ?? '',
    category: options.category ?? 'conversational',
    age: options.age ?? 'middle_age',
    quality: options.quality ?? 'mid',
    supported_languages: options.supportedLanguages ?? ['en'],
    is_public: options.isPublic ?? false,
    sample_text: options.sampleText ?? '',
  };

  const formData = new FormData();
  // The metadata travels as its own JSON-typed part.
  formData.append(
    'metadata',
    new Blob([JSON.stringify(metadata)], { type: 'application/json' }),
  );

  for (const file of options.referenceFiles ?? []) {
    formData.append('files', file);
  }

  const raw = await this.client.requestMultipart<any>('POST', '/v1/voices', formData);
  return this.mapVoiceDetail(raw);
}
|
|
190
|
+
|
|
191
|
+
/**
 * Update an existing voice. Only provided fields are updated.
 *
 * A field is sent when it is not `undefined`, so `''` and `false` are
 * transmitted. camelCase option names are converted to the snake_case keys
 * used in the PATCH payload.
 *
 * @param voiceId - Voice ID, interpolated into `/v1/voices/{voiceId}`.
 * @param options - Fields to change.
 * @returns The updated voice, mapped through `mapVoiceDetail`.
 */
async update(voiceId: number, options: UpdateVoiceOptions): Promise<VoiceDetail> {
  const payload: Record<string, unknown> = {};
  if (options.name !== undefined) payload.name = options.name;
  if (options.description !== undefined) payload.description = options.description;
  if (options.category !== undefined) payload.category = options.category;
  if (options.age !== undefined) payload.age = options.age;
  if (options.sex !== undefined) payload.sex = options.sex;
  if (options.quality !== undefined) payload.quality = options.quality;
  if (options.supportedLanguages !== undefined) payload.supported_languages = options.supportedLanguages;
  if (options.isPublic !== undefined) payload.is_public = options.isPublic;
  if (options.sampleText !== undefined) payload.sample_text = options.sampleText;

  const raw = await this.client.request<any>('PATCH', `/v1/voices/${voiceId}`, payload);
  return this.mapVoiceDetail(raw);
}
|
|
209
|
+
|
|
210
|
+
/**
 * Delete a voice.
 *
 * @param voiceId - Voice ID, interpolated into `/v1/voices/{voiceId}`.
 * @returns Resolves when the DELETE request completes; the response body is ignored.
 */
async delete(voiceId: number): Promise<void> {
  await this.client.request<any>('DELETE', `/v1/voices/${voiceId}`);
}
|
|
216
|
+
|
|
217
|
+
// -- Reference management --
|
|
218
|
+
|
|
219
|
+
/**
 * List reference audio files for a voice.
 *
 * @param voiceId - Voice ID, interpolated into `/v1/voices/{voiceId}/references`.
 * @returns Each entry mapped through `mapVoiceReference`; an empty array
 *   when the response carries no `references` field.
 */
async listReferences(voiceId: number): Promise<VoiceReference[]> {
  const response = await this.client.request<{ references: any[] }>(
    'GET',
    `/v1/voices/${voiceId}/references`,
  );
  // Treat a missing list as empty instead of throwing on `.map`.
  const references = response.references ?? [];
  return references.map((r) => this.mapVoiceReference(r));
}
|
|
229
|
+
|
|
230
|
+
/**
 * Upload a reference audio file to a voice.
 *
 * Sends a multipart POST to `/v1/voices/{voiceId}/references` with the audio
 * in a `file` part. `reference_text` is added only when `referenceText` is a
 * non-empty string.
 *
 * @param voiceId - Voice ID
 * @param file - Audio file (File in browser, Blob in Node.js)
 * @param referenceText - Optional transcript of the reference audio
 * @returns The created reference, mapped through `mapVoiceReference`.
 */
async addReference(
  voiceId: number,
  file: File | Blob,
  referenceText?: string,
): Promise<VoiceReference> {
  const formData = new FormData();
  formData.append('file', file);
  if (referenceText) {
    formData.append('reference_text', referenceText);
  }

  const raw = await this.client.requestMultipart<any>(
    'POST',
    `/v1/voices/${voiceId}/references`,
    formData,
  );
  return this.mapVoiceReference(raw);
}
|
|
255
|
+
|
|
256
|
+
/**
 * Delete a reference audio file from a voice.
 *
 * @param voiceId - ID of the voice that owns the reference.
 * @param referenceId - ID of the reference to remove.
 * @returns Resolves when the DELETE request completes; the response body is ignored.
 */
async deleteReference(voiceId: number, referenceId: number): Promise<void> {
  await this.client.request<any>(
    'DELETE',
    `/v1/voices/${voiceId}/references/${referenceId}`,
  );
}
|
|
265
|
+
|
|
266
|
+
// -- Publishing --
|
|
267
|
+
|
|
268
|
+
/**
 * Request publication of a voice. Sets it as public and marks it
 * as pending verification by an admin.
 *
 * @param voiceId - Voice ID, interpolated into `/v1/voices/{voiceId}/publish`.
 * @returns The voice returned by the POST, mapped through `mapVoiceDetail`.
 */
async publish(voiceId: number): Promise<VoiceDetail> {
  const raw = await this.client.request<any>('POST', `/v1/voices/${voiceId}/publish`);
  return this.mapVoiceDetail(raw);
}
|
|
276
|
+
|
|
277
|
+
// -- Sample generation --
|
|
278
|
+
|
|
279
|
+
/**
 * Trigger sample audio generation for a voice.
 *
 * @param voiceId - Voice ID, interpolated into `/v1/voices/{voiceId}/generate-sample`.
 * @returns The voice returned by the POST, mapped through `mapVoiceDetail`.
 */
async generateSample(voiceId: number): Promise<VoiceDetail> {
  const raw = await this.client.request<any>(
    'POST',
    `/v1/voices/${voiceId}/generate-sample`,
  );
  return this.mapVoiceDetail(raw);
}
|
|
289
|
+
|
|
290
|
+
// -- Helpers --
|
|
291
|
+
|
|
292
|
+
/**
 * Convert a raw snake_case voice payload into a camelCase `VoiceDetail`.
 *
 * Null or undefined fields fall back to: `''` for text fields, `[]` for
 * supported languages, `'cloned'` for category, `'mid'` for quality and
 * `false` for the boolean flags. `id`, `name`, `age`, `sex`, `sampleUrl` and
 * `avatarUrl` are copied as-is.
 *
 * @param v - Untyped response object from the voices endpoints.
 */
private mapVoiceDetail(v: any): VoiceDetail {
  return {
    id: v.id,
    name: v.name,
    description: v.description ?? '',
    generativeVoiceDescription: v.generative_voice_description ?? '',
    supportedLanguages: v.supported_languages ?? [],
    category: v.category ?? 'cloned',
    age: v.age,
    sex: v.sex,
    quality: v.quality ?? 'mid',
    isPublic: v.is_public ?? false,
    verified: v.verified ?? false,
    pendingVerification: v.pending_verification ?? false,
    sampleUrl: v.sample_url,
    avatarUrl: v.avatar_url,
    sampleText: v.sample_text ?? '',
  };
}
|
|
311
|
+
|
|
312
|
+
/**
 * Convert a raw snake_case reference payload into a camelCase `VoiceReference`.
 *
 * Null or undefined `name`, `reference_text` and `s3_path` become `''`, and
 * `is_generated` becomes `false`. `id`, `voice_id` and `audio_url` are
 * copied as-is.
 *
 * @param r - Untyped response object from the references endpoints.
 */
private mapVoiceReference(r: any): VoiceReference {
  return {
    id: r.id,
    voiceId: r.voice_id,
    name: r.name ?? '',
    referenceText: r.reference_text ?? '',
    s3Path: r.s3_path ?? '',
    audioUrl: r.audio_url,
    isGenerated: r.is_generated ?? false,
  };
}
|
|
123
323
|
}
|
|
@@ -135,6 +335,7 @@ class TTSResource {
|
|
|
135
335
|
reject: (error: Error) => void;
|
|
136
336
|
}> = new Map();
|
|
137
337
|
private requestCounter = 0;
|
|
338
|
+
private keepaliveTimer: ReturnType<typeof setInterval> | null = null;
|
|
138
339
|
|
|
139
340
|
constructor(private client: KugelAudio) {}
|
|
140
341
|
|
|
@@ -207,6 +408,67 @@ class TTSResource {
|
|
|
207
408
|
};
|
|
208
409
|
}
|
|
209
410
|
|
|
411
|
+
/**
 * Stream audio and return a Node.js Readable stream of raw PCM16 binary data.
 *
 * **Node.js only** — this method requires the `stream` built-in module and is
 * intended for server-side integrations such as Vapi custom TTS endpoints,
 * Express/Fastify handlers, or any pipeline that expects a Node.js `Readable`.
 *
 * Compared to manually wiring `onChunk` to a `Readable`, this method avoids
 * a common race condition: the stream object is created and returned **before**
 * any chunks arrive, so the caller can safely pipe or attach listeners before
 * the first audio byte is pushed.
 *
 * Each chunk's base64 `audio` is decoded to a `Buffer` and pushed. `onFinal`
 * ends the stream. An `onError` callback or a rejection of `stream()`
 * destroys the stream with that error.
 *
 * @example Vapi custom TTS endpoint
 * ```typescript
 * app.post('/synthesize', (req, res) => {
 *   res.setHeader('Content-Type', 'audio/pcm');
 *   res.setHeader('Transfer-Encoding', 'chunked');
 *
 *   const readable = client.tts.toReadable({
 *     text: req.body.message.text,
 *     modelId: 'kugel-1-turbo',
 *     sampleRate: req.body.message.sampleRate,
 *     language: 'en',
 *   });
 *
 *   readable.pipe(res);
 * });
 * ```
 *
 * @param options - TTS generation options (same as `stream()`)
 * @param reuseConnection - Reuse the pooled WebSocket connection (default: true)
 * @returns Node.js Readable stream emitting raw PCM16 binary Buffer chunks
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
toReadable(options: GenerateOptions, reuseConnection = true): any {
  // Dynamic require keeps browser bundles free of Node.js built-ins.
  // eslint-disable-next-line @typescript-eslint/no-require-imports
  const { Readable } = require('stream') as typeof import('stream');
  // No-op read(): data is pushed from the stream callbacks below.
  const readable = new Readable({ read() {} });

  const callbacks = {
    onChunk: (chunk: AudioChunk) => {
      readable.push(Buffer.from(chunk.audio, 'base64'));
    },
    onFinal: () => {
      // Pushing null signals end-of-stream to consumers.
      readable.push(null);
    },
    onError: (error: Error) => {
      readable.destroy(error);
    },
  };

  // Not awaited: the readable is returned first and fed as chunks arrive.
  this.stream(options, callbacks, reuseConnection).catch((error: Error) => {
    readable.destroy(error);
  });

  return readable;
}
|
|
471
|
+
|
|
210
472
|
/**
|
|
211
473
|
* Build the WebSocket URL with appropriate auth param.
|
|
212
474
|
*/
|
|
@@ -265,11 +527,20 @@ class TTSResource {
|
|
|
265
527
|
this.wsConnection = ws;
|
|
266
528
|
this.wsUrl = url;
|
|
267
529
|
this.setupMessageHandler(ws);
|
|
530
|
+
this.startKeepalive(ws);
|
|
268
531
|
resolve(ws);
|
|
269
532
|
};
|
|
270
533
|
|
|
271
|
-
ws.onerror = () => {
|
|
272
|
-
|
|
534
|
+
ws.onerror = (event: unknown) => {
|
|
535
|
+
const underlying = (event as { error?: unknown } | null)?.error ?? event;
|
|
536
|
+
const typed = classifyWsHandshakeError(underlying);
|
|
537
|
+
reject(
|
|
538
|
+
typed ??
|
|
539
|
+
new ConnectionError(
|
|
540
|
+
`Could not establish KugelAudio WebSocket connection to ${url}. ` +
|
|
541
|
+
'Check network connectivity.',
|
|
542
|
+
),
|
|
543
|
+
);
|
|
273
544
|
};
|
|
274
545
|
});
|
|
275
546
|
}
|
|
@@ -293,7 +564,7 @@ class TTSResource {
|
|
|
293
564
|
if (!pending) return;
|
|
294
565
|
|
|
295
566
|
if (data.error) {
|
|
296
|
-
const error = this.parseError(data
|
|
567
|
+
const error = this.parseError(data);
|
|
297
568
|
pending.callbacks.onError?.(error);
|
|
298
569
|
this.pendingRequests.delete(requestId);
|
|
299
570
|
pending.reject(error);
|
|
@@ -307,7 +578,6 @@ class TTSResource {
|
|
|
307
578
|
totalSamples: data.total_samples,
|
|
308
579
|
durationMs: data.dur_ms,
|
|
309
580
|
generationMs: data.gen_ms,
|
|
310
|
-
ttfaMs: data.ttfa_ms,
|
|
311
581
|
rtf: data.rtf,
|
|
312
582
|
error: data.error,
|
|
313
583
|
};
|
|
@@ -347,17 +617,25 @@ class TTSResource {
|
|
|
347
617
|
};
|
|
348
618
|
|
|
349
619
|
ws.onclose = (event) => {
|
|
350
|
-
// Clear connection pool
|
|
620
|
+
// Clear connection pool and keepalive
|
|
621
|
+
this.stopKeepalive();
|
|
351
622
|
this.wsConnection = null;
|
|
352
623
|
this.wsUrl = null;
|
|
353
624
|
|
|
354
|
-
// Reject all pending requests
|
|
625
|
+
// Reject all pending requests with appropriate error types
|
|
355
626
|
for (const [id, pending] of this.pendingRequests) {
|
|
356
627
|
pending.callbacks.onClose?.();
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
628
|
+
// Only surface server-initiated error close codes; normal closes
|
|
629
|
+
// (1000, 1001) should not reject pending requests with an error.
|
|
630
|
+
if (
|
|
631
|
+
event.code === 4001 ||
|
|
632
|
+
event.code === 4003 ||
|
|
633
|
+
event.code === 4029 ||
|
|
634
|
+
event.code === 4500
|
|
635
|
+
) {
|
|
636
|
+
const error = classifyWsClose(event.code, event.reason);
|
|
637
|
+
pending.callbacks.onError?.(error);
|
|
638
|
+
pending.reject(error);
|
|
361
639
|
}
|
|
362
640
|
this.pendingRequests.delete(id);
|
|
363
641
|
}
|
|
@@ -365,7 +643,9 @@ class TTSResource {
|
|
|
365
643
|
|
|
366
644
|
ws.onerror = () => {
|
|
367
645
|
// Reject all pending requests
|
|
368
|
-
const error = new
|
|
646
|
+
const error = new ConnectionError(
|
|
647
|
+
'KugelAudio WebSocket connection error. Check network connectivity.',
|
|
648
|
+
);
|
|
369
649
|
for (const [id, pending] of this.pendingRequests) {
|
|
370
650
|
pending.callbacks.onError?.(error);
|
|
371
651
|
pending.reject(error);
|
|
@@ -400,6 +680,7 @@ class TTSResource {
|
|
|
400
680
|
options: GenerateOptions,
|
|
401
681
|
callbacks: StreamCallbacks
|
|
402
682
|
): Promise<void> {
|
|
683
|
+
warnIfNoLanguage(options.language, options.normalize);
|
|
403
684
|
const ws = await this.getConnection();
|
|
404
685
|
const requestId = ++this.requestCounter;
|
|
405
686
|
|
|
@@ -413,11 +694,14 @@ class TTSResource {
|
|
|
413
694
|
model_id: options.modelId || 'kugel-1-turbo',
|
|
414
695
|
voice_id: options.voiceId,
|
|
415
696
|
cfg_scale: options.cfgScale ?? 2.0,
|
|
697
|
+
...(options.temperature !== undefined && { temperature: options.temperature }),
|
|
416
698
|
max_new_tokens: options.maxNewTokens ?? 2048,
|
|
417
699
|
sample_rate: options.sampleRate ?? 24000,
|
|
418
700
|
normalize: options.normalize ?? true,
|
|
419
701
|
...(options.language && { language: options.language }),
|
|
420
702
|
...(options.wordTimestamps && { word_timestamps: true }),
|
|
703
|
+
...(options.speed !== undefined && { speed: options.speed }),
|
|
704
|
+
...(options.projectId !== undefined && { project_id: options.projectId }),
|
|
421
705
|
}));
|
|
422
706
|
});
|
|
423
707
|
}
|
|
@@ -429,6 +713,7 @@ class TTSResource {
|
|
|
429
713
|
options: GenerateOptions,
|
|
430
714
|
callbacks: StreamCallbacks
|
|
431
715
|
): Promise<void> {
|
|
716
|
+
warnIfNoLanguage(options.language, options.normalize);
|
|
432
717
|
return new Promise((resolve, reject) => {
|
|
433
718
|
const url = this.buildWsUrl();
|
|
434
719
|
const ws = createWs(url);
|
|
@@ -446,6 +731,8 @@ class TTSResource {
|
|
|
446
731
|
normalize: options.normalize ?? true,
|
|
447
732
|
...(options.language && { language: options.language }),
|
|
448
733
|
...(options.wordTimestamps && { word_timestamps: true }),
|
|
734
|
+
...(options.speed !== undefined && { speed: options.speed }),
|
|
735
|
+
...(options.projectId !== undefined && { project_id: options.projectId }),
|
|
449
736
|
}));
|
|
450
737
|
};
|
|
451
738
|
|
|
@@ -460,7 +747,7 @@ class TTSResource {
|
|
|
460
747
|
const data = JSON.parse(messageData);
|
|
461
748
|
|
|
462
749
|
if (data.error) {
|
|
463
|
-
const error = this.parseError(data
|
|
750
|
+
const error = this.parseError(data);
|
|
464
751
|
callbacks.onError?.(error);
|
|
465
752
|
ws.close();
|
|
466
753
|
reject(error);
|
|
@@ -474,7 +761,6 @@ class TTSResource {
|
|
|
474
761
|
totalSamples: data.total_samples,
|
|
475
762
|
durationMs: data.dur_ms,
|
|
476
763
|
generationMs: data.gen_ms,
|
|
477
|
-
ttfaMs: data.ttfa_ms,
|
|
478
764
|
rtf: data.rtf,
|
|
479
765
|
error: data.error,
|
|
480
766
|
};
|
|
@@ -513,27 +799,68 @@ class TTSResource {
|
|
|
513
799
|
}
|
|
514
800
|
};
|
|
515
801
|
|
|
516
|
-
ws.onerror = () => {
|
|
517
|
-
const
|
|
802
|
+
ws.onerror = (event: unknown) => {
|
|
803
|
+
const underlying = (event as { error?: unknown } | null)?.error ?? event;
|
|
804
|
+
const error =
|
|
805
|
+
classifyWsHandshakeError(underlying) ??
|
|
806
|
+
new ConnectionError(
|
|
807
|
+
'KugelAudio WebSocket connection error. Check network connectivity.',
|
|
808
|
+
);
|
|
518
809
|
callbacks.onError?.(error);
|
|
519
810
|
reject(error);
|
|
520
811
|
};
|
|
521
812
|
|
|
522
813
|
ws.onclose = (event) => {
|
|
523
814
|
callbacks.onClose?.();
|
|
524
|
-
if (
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
815
|
+
if (
|
|
816
|
+
event.code === 4001 ||
|
|
817
|
+
event.code === 4003 ||
|
|
818
|
+
event.code === 4029 ||
|
|
819
|
+
event.code === 4500
|
|
820
|
+
) {
|
|
821
|
+
const error = classifyWsClose(event.code, event.reason);
|
|
822
|
+
callbacks.onError?.(error);
|
|
823
|
+
reject(error);
|
|
528
824
|
}
|
|
529
825
|
};
|
|
530
826
|
});
|
|
531
827
|
}
|
|
532
828
|
|
|
829
|
+
/**
 * Start periodic keepalive pings on the pooled connection.
 * Uses the ws package's ping() in Node.js; silently skips in browsers
 * where WebSocket doesn't expose a ping method.
 *
 * Any previous timer is stopped first. No timer is started when
 * `client.keepalivePingInterval` is null, undefined, zero or negative.
 *
 * @param ws - The socket to ping. The timer stops itself once this socket is
 *   no longer the pooled connection or is no longer OPEN.
 */
private startKeepalive(ws: WebSocket): void {
  this.stopKeepalive();

  const intervalMs = this.client.keepalivePingInterval;
  if (intervalMs == null || intervalMs <= 0) return;

  this.keepaliveTimer = setInterval(() => {
    const stale = this.wsConnection !== ws || ws.readyState !== WS_OPEN;
    if (stale) {
      this.stopKeepalive();
      return;
    }
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    if (typeof (ws as any).ping === 'function') {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      (ws as any).ping();
    }
  }, intervalMs);
}
|
|
851
|
+
|
|
852
|
+
/** Cancel the keepalive timer if one is running. Safe to call repeatedly. */
private stopKeepalive(): void {
  if (this.keepaliveTimer === null) return;
  clearInterval(this.keepaliveTimer);
  this.keepaliveTimer = null;
}
|
|
858
|
+
|
|
533
859
|
/**
|
|
534
860
|
* Close the pooled WebSocket connection.
|
|
535
861
|
*/
|
|
536
862
|
close(): void {
|
|
863
|
+
this.stopKeepalive();
|
|
537
864
|
if (this.wsConnection) {
|
|
538
865
|
try {
|
|
539
866
|
this.wsConnection.close();
|
|
@@ -545,15 +872,43 @@ class TTSResource {
|
|
|
545
872
|
}
|
|
546
873
|
}
|
|
547
874
|
|
|
548
|
-
/**
 * Turn an error frame received over the WebSocket into an `Error`.
 *
 * Delegates entirely to `classifyWsFrame` from `./errors`.
 *
 * @param data - Parsed frame; `error`, `error_code` and `retry_after` are the
 *   fields passed on.
 */
private parseError(data: { error?: string; error_code?: string; retry_after?: number }): Error {
  return classifyWsFrame(data);
}
|
|
878
|
+
|
|
879
|
+
/**
 * Create a streaming session for LLM integration.
 *
 * The session connects to `/ws/tts/stream` and keeps a persistent
 * connection across multiple {@link StreamingSession.send} calls.
 * The server auto-chunks text at sentence boundaries — no client-side
 * flushing required.
 *
 * This method only constructs the session object; it opens no connection.
 *
 * @param config - Session configuration (voice, model, chunking strategy).
 * @param callbacks - Callbacks for audio chunks and session lifecycle events.
 * @returns A {@link StreamingSession} instance. Call `.connect()` before sending.
 *
 * @example
 * ```typescript
 * const session = client.tts.streamingSession(
 *   { voiceId: 123, autoMode: true, chunkLengthSchedule: [50, 100, 150, 250] },
 *   { onChunk: (chunk) => playAudio(chunk.audio) },
 * );
 *
 * session.connect();
 *
 * for await (const token of llmStream) {
 *   session.send(token);
 * }
 *
 * await session.close();
 * ```
 */
streamingSession(
  config: StreamConfig,
  callbacks: StreamingSessionCallbacks
): StreamingSession {
  return new StreamingSession(this.client, config, callbacks);
}
|
|
558
913
|
|
|
559
914
|
/**
|
|
@@ -574,7 +929,7 @@ class TTSResource {
|
|
|
574
929
|
* console.log(`Audio from ${chunk.contextId}`);
|
|
575
930
|
* playAudio(chunk.audio);
|
|
576
931
|
* },
|
|
577
|
-
*
|
|
932
|
+
* onContextClosed: (contextId) => {
|
|
578
933
|
* console.log(`${contextId} finished`);
|
|
579
934
|
* },
|
|
580
935
|
* });
|
|
@@ -625,8 +980,13 @@ class MultiContextSession {
|
|
|
625
980
|
|
|
626
981
|
/**
|
|
627
982
|
* Connect to the multi-context WebSocket endpoint.
|
|
983
|
+
*
|
|
984
|
+
* The returned promise resolves once the WebSocket is OPEN so callers can
|
|
985
|
+
* ``await session.connect(callbacks)`` before invoking
|
|
986
|
+
* {@link createContext} / {@link send}. Pre-open errors reject with the
|
|
987
|
+
* typed error.
|
|
628
988
|
*/
|
|
629
|
-
connect(callbacks: import('./types').MultiContextCallbacks): void {
|
|
989
|
+
connect(callbacks: import('./types').MultiContextCallbacks): Promise<void> {
|
|
630
990
|
this.callbacks = callbacks;
|
|
631
991
|
|
|
632
992
|
const wsUrl = this.client.ttsUrl
|
|
@@ -644,12 +1004,9 @@ class MultiContextSession {
|
|
|
644
1004
|
|
|
645
1005
|
const url = `${wsUrl}/ws/tts/multi?${authParam}=${this.client.apiKey}`;
|
|
646
1006
|
this.ws = createWs(url);
|
|
1007
|
+
const ws = this.ws;
|
|
647
1008
|
|
|
648
|
-
|
|
649
|
-
// Connection established, ready to create contexts
|
|
650
|
-
};
|
|
651
|
-
|
|
652
|
-
this.ws.onmessage = (event: { data: unknown }) => {
|
|
1009
|
+
ws.onmessage = (event: { data: unknown }) => {
|
|
653
1010
|
try {
|
|
654
1011
|
// Handle both browser (string) and Node.js (Buffer) message formats
|
|
655
1012
|
const messageData = typeof event.data === 'string'
|
|
@@ -690,10 +1047,6 @@ class MultiContextSession {
|
|
|
690
1047
|
this.callbacks.onChunk?.(chunk);
|
|
691
1048
|
}
|
|
692
1049
|
|
|
693
|
-
if (data.is_final) {
|
|
694
|
-
this.callbacks.onContextFinal?.(data.context_id);
|
|
695
|
-
}
|
|
696
|
-
|
|
697
1050
|
if (data.context_closed) {
|
|
698
1051
|
this.contexts.delete(data.context_id);
|
|
699
1052
|
this.callbacks.onContextClosed?.(data.context_id);
|
|
@@ -712,20 +1065,51 @@ class MultiContextSession {
|
|
|
712
1065
|
}
|
|
713
1066
|
};
|
|
714
1067
|
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
};
|
|
1068
|
+
return new Promise<void>((resolve, reject) => {
|
|
1069
|
+
let opened = false;
|
|
718
1070
|
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
}
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
1071
|
+
ws.onopen = () => {
|
|
1072
|
+
opened = true;
|
|
1073
|
+
resolve();
|
|
1074
|
+
};
|
|
1075
|
+
|
|
1076
|
+
ws.onerror = (event: unknown) => {
|
|
1077
|
+
const underlying = (event as { error?: unknown } | null)?.error ?? event;
|
|
1078
|
+
const err =
|
|
1079
|
+
classifyWsHandshakeError(underlying) ??
|
|
1080
|
+
new ConnectionError(
|
|
1081
|
+
'KugelAudio multi-context WebSocket connection error. ' +
|
|
1082
|
+
'Check network connectivity.',
|
|
1083
|
+
);
|
|
1084
|
+
if (!opened) reject(err);
|
|
1085
|
+
this.callbacks.onError?.(err);
|
|
1086
|
+
};
|
|
1087
|
+
|
|
1088
|
+
ws.onclose = (event) => {
|
|
1089
|
+
let typedErr: KugelAudioError | null = null;
|
|
1090
|
+
if (
|
|
1091
|
+
event.code === 4001 ||
|
|
1092
|
+
event.code === 4003 ||
|
|
1093
|
+
event.code === 4029 ||
|
|
1094
|
+
event.code === 4500
|
|
1095
|
+
) {
|
|
1096
|
+
typedErr = classifyWsClose(event.code, event.reason);
|
|
1097
|
+
this.callbacks.onError?.(typedErr);
|
|
1098
|
+
}
|
|
1099
|
+
if (!opened) {
|
|
1100
|
+
reject(
|
|
1101
|
+
typedErr ??
|
|
1102
|
+
new ConnectionError(
|
|
1103
|
+
`KugelAudio multi-context WebSocket closed before ready ` +
|
|
1104
|
+
`(code ${event.code}).`,
|
|
1105
|
+
),
|
|
1106
|
+
);
|
|
1107
|
+
}
|
|
1108
|
+
this.ws = null;
|
|
1109
|
+
this.isStarted = false;
|
|
1110
|
+
this.contexts.clear();
|
|
1111
|
+
};
|
|
1112
|
+
});
|
|
729
1113
|
}
|
|
730
1114
|
|
|
731
1115
|
/**
|
|
@@ -749,10 +1133,13 @@ class MultiContextSession {
|
|
|
749
1133
|
|
|
750
1134
|
// Include session config on first context
|
|
751
1135
|
if (!this.isStarted) {
|
|
1136
|
+
warnIfNoLanguage(this.config.language, this.config.normalize);
|
|
752
1137
|
if (this.config.sampleRate) msg.sample_rate = this.config.sampleRate;
|
|
753
1138
|
if (this.config.cfgScale) msg.cfg_scale = this.config.cfgScale;
|
|
1139
|
+
if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
|
|
754
1140
|
if (this.config.maxNewTokens) msg.max_new_tokens = this.config.maxNewTokens;
|
|
755
1141
|
if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
|
|
1142
|
+
if (this.config.language) msg.language = this.config.language;
|
|
756
1143
|
if (this.config.inactivityTimeout) msg.inactivity_timeout = this.config.inactivityTimeout;
|
|
757
1144
|
}
|
|
758
1145
|
|
|
@@ -857,6 +1244,336 @@ class MultiContextSession {
|
|
|
857
1244
|
}
|
|
858
1245
|
}
|
|
859
1246
|
|
|
1247
|
+
/**
 * Streaming session for LLM integration via `/ws/tts/stream`.
 *
 * The server accumulates text across multiple {@link send} calls and
 * auto-chunks it at sentence boundaries, keeping the KV cache warm between
 * chunks for natural prosody. You never need to call `flush` explicitly —
 * configure {@link StreamConfig.chunkLengthSchedule} or
 * {@link StreamConfig.autoMode} instead.
 *
 * @example
 * ```typescript
 * const session = client.tts.streamingSession({
 *   voiceId: 123,
 *   autoMode: true,
 *   chunkLengthSchedule: [50, 100, 150, 250],
 * }, {
 *   onChunk: (chunk) => playAudio(chunk.audio),
 *   onSessionClosed: (totalSecs) => console.log(`Done: ${totalSecs}s`),
 * });
 *
 * await session.connect();
 *
 * for await (const token of llmStream) {
 *   session.send(token);
 * }
 *
 * await session.close();
 * ```
 */
class StreamingSession {
  private ws: WebSocket | null = null;
  private config: StreamConfig;
  private callbacks: StreamingSessionCallbacks;
  private client: KugelAudio;
  /** True once the session configuration has been attached to an outgoing message. */
  private configSent = false;
  /** In-flight {@link endSession} promise, shared by overlapping callers. */
  private pendingEnd: Promise<void> | null = null;

  constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks) {
    this.client = client;
    // Shallow copy: updateConfig() merges into this object and must not
    // mutate the object the caller handed in.
    this.config = { ...config };
    this.callbacks = callbacks;
  }

  /**
   * Convert a WebSocket frame payload to text.
   *
   * `Buffer` only exists in Node-like runtimes, so its presence is checked
   * before the `instanceof` test to avoid a ReferenceError elsewhere.
   */
  private static frameToText(data: unknown): string {
    if (typeof data === 'string') return data;
    if (typeof Buffer !== 'undefined' && data instanceof Buffer) return data.toString();
    return String(data);
  }

  /**
   * Open the WebSocket connection and authenticate.
   *
   * The returned promise resolves once the WebSocket is OPEN, so callers can
   * `await session.connect()` and then `send()` without racing the
   * handshake. Pre-open errors (network failure, 4001 unauthorized, …) reject
   * the promise with the typed error.
   */
  connect(): Promise<void> {
    const wsUrl = this.client.ttsUrl
      .replace('https://', 'wss://')
      .replace('http://', 'ws://');

    let authParam: string;
    if (this.client.isToken) {
      authParam = 'token';
    } else if (this.client.isMasterKey) {
      authParam = 'master_key';
    } else {
      authParam = 'api_key';
    }

    // Encode the credential so reserved characters survive the query string.
    const url = `${wsUrl}/ws/tts/stream?${authParam}=${encodeURIComponent(this.client.apiKey)}`;
    this.ws = createWs(url);
    const ws = this.ws;

    ws.onmessage = (event: { data: unknown }) => {
      try {
        const data = JSON.parse(StreamingSession.frameToText(event.data));

        if (data.error) {
          this.callbacks.onError?.(new KugelAudioError(data.error));
          return;
        }

        if (data.audio) {
          const chunk: AudioChunk = {
            audio: data.audio,
            encoding: data.enc || 'pcm_s16le',
            index: data.idx,
            sampleRate: data.sr,
            samples: data.samples,
          };
          this.callbacks.onChunk?.(chunk);
        }

        if (data.word_timestamps) {
          const timestamps = data.word_timestamps.map((w: Record<string, unknown>) => ({
            word: w.word as string,
            startMs: w.start_ms as number,
            endMs: w.end_ms as number,
            charStart: w.char_start as number,
            charEnd: w.char_end as number,
            score: (w.score as number) ?? 1.0,
          }));
          this.callbacks.onWordTimestamps?.(timestamps);
        }

        if (data.chunk_complete) {
          this.callbacks.onChunkComplete?.(
            data.chunk_id ?? 0,
            data.audio_seconds ?? 0,
            data.gen_ms ?? 0,
          );
        }

        if (data.generation_started) {
          this.callbacks.onGenerationStarted?.(data.chunk_id ?? 0, data.text ?? '');
        }

        if (data.session_closed) {
          this.callbacks.onSessionClosed?.(
            data.total_audio_seconds ?? 0,
            data.total_text_chunks ?? 0,
            data.total_audio_chunks ?? 0,
          );
        }
      } catch (e) {
        console.error('[KugelAudio] Failed to parse streaming session message:', e);
      }
    };

    return new Promise<void>((resolve, reject) => {
      let opened = false;

      ws.onopen = () => {
        opened = true;
        resolve();
      };

      ws.onerror = (event: unknown) => {
        const underlying = (event as { error?: unknown } | null)?.error ?? event;
        const err =
          classifyWsHandshakeError(underlying) ??
          new ConnectionError(
            'KugelAudio streaming WebSocket connection error. ' +
            'Check network connectivity.',
          );
        if (!opened) reject(err);
        this.callbacks.onError?.(err);
      };

      ws.onclose = (event) => {
        let typedErr: KugelAudioError | null = null;
        if (
          event.code === 4001 ||
          event.code === 4003 ||
          event.code === 4029 ||
          event.code === 4500
        ) {
          typedErr = classifyWsClose(event.code, event.reason);
          this.callbacks.onError?.(typedErr);
        }
        if (!opened) {
          reject(
            typedErr ??
              new ConnectionError(
                `KugelAudio streaming WebSocket closed before ready ` +
                `(code ${event.code}).`,
              ),
          );
        }
        // Only reset state if this socket is still the active one. A late
        // close event from a previous socket (e.g. close() followed by a new
        // connect()) must not wipe out the replacement connection.
        if (this.ws === ws) {
          this.ws = null;
          this.configSent = false;
        }
      };
    });
  }

  /**
   * Send a text chunk to the server (e.g. one LLM output token).
   *
   * The server buffers text across multiple calls and starts generating at
   * natural sentence boundaries automatically — no need to call `flush`.
   * The session configuration is attached to the first message of each
   * session.
   *
   * @param text - Raw text or LLM token to append to the server buffer.
   * @param flush - Force immediate generation of whatever is buffered.
   *   **Avoid calling this per-sentence from the client.** Doing so bypasses
   *   the server's semantic chunking, incurs a fresh model prefill cost on
   *   every flush, and makes latency *worse*, not better. Let the server
   *   handle chunking via `chunkLengthSchedule` / `autoMode` instead.
   * @throws KugelAudioError if the WebSocket is not open.
   */
  send(text: string, flush = false): void {
    if (!this.ws || this.ws.readyState !== WS_OPEN) {
      throw new KugelAudioError('StreamingSession not connected. Call connect() first.');
    }

    const msg: Record<string, unknown> = { text, flush };
    const includeConfig = !this.configSent;

    if (includeConfig) {
      if (this.config.voiceId !== undefined) msg.voice_id = this.config.voiceId;
      if (this.config.modelId !== undefined) msg.model_id = this.config.modelId;
      if (this.config.cfgScale !== undefined) msg.cfg_scale = this.config.cfgScale;
      if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
      if (this.config.maxNewTokens !== undefined) msg.max_new_tokens = this.config.maxNewTokens;
      if (this.config.sampleRate !== undefined) msg.sample_rate = this.config.sampleRate;
      if (this.config.flushTimeoutMs !== undefined) msg.flush_timeout_ms = this.config.flushTimeoutMs;
      if (this.config.maxBufferLength !== undefined) msg.max_buffer_length = this.config.maxBufferLength;
      if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
      if (this.config.language !== undefined) msg.language = this.config.language;
      if (this.config.wordTimestamps) msg.word_timestamps = true;
      if (this.config.autoMode !== undefined) msg.auto_mode = this.config.autoMode;
      if (this.config.chunkLengthSchedule?.length) msg.chunk_length_schedule = this.config.chunkLengthSchedule;
      if (this.config.speed !== undefined) msg.speed = this.config.speed;
    }

    this.ws.send(JSON.stringify(msg));
    // Mark the config as delivered only after send() did not throw, so a
    // failed first message does not silently drop the session settings.
    if (includeConfig) this.configSent = true;
  }

  /**
   * End the current session but keep the WebSocket connection open.
   *
   * This allows starting a new session on the same connection, avoiding
   * the overhead of a new WebSocket handshake (~200-300ms). After calling
   * this, optionally call {@link updateConfig} to change voice/model settings,
   * then call {@link send} to start the next session.
   *
   * The returned promise resolves once the server confirms with a
   * `session_closed` message, or after a 15 s **quiet** timeout — i.e. 15 s
   * elapse without *any* server message arriving. The timer resets on every
   * incoming frame so a long final flush that streams audio for tens of
   * seconds is not truncated; only a genuinely silent server trips the fuse.
   *
   * Calling this again while a previous call is still pending returns the
   * same promise instead of sending a second close request.
   */
  endSession(): Promise<void> {
    if (this.pendingEnd) return this.pendingEnd;
    if (!this.ws || this.ws.readyState !== WS_OPEN) return Promise.resolve();

    const ws = this.ws;
    // Quiet timeout: resets on every incoming server message and trips only
    // when the server has been silent for this long.
    const QUIET_TIMEOUT_MS = 15_000;

    const pending = new Promise<void>((resolve, reject) => {
      let settled = false;
      let timer: ReturnType<typeof setTimeout>;

      const prevMessage = ws.onmessage;
      const prevClose = ws.onclose;

      // Tear down the temporary wrappers exactly once. Returns false when the
      // promise has already been settled.
      const cleanup = (): boolean => {
        if (settled) return false;
        settled = true;
        clearTimeout(timer);
        // Restore the original handlers so the typed-error onclose installed
        // by connect() remains in effect for the next session.
        ws.onmessage = prevMessage;
        ws.onclose = prevClose;
        this.configSent = false;
        return true;
      };

      const done = () => {
        if (cleanup()) resolve();
      };

      const armQuietTimer = () => {
        if (settled) return;
        clearTimeout(timer);
        timer = setTimeout(done, QUIET_TIMEOUT_MS);
      };

      armQuietTimer();

      ws.onmessage = (event: MessageEvent) => {
        // Reset the quiet timer on EVERY incoming frame — audio chunks for
        // the final flush count as liveness, not just session_closed.
        armQuietTimer();
        if (prevMessage) prevMessage.call(ws, event);
        try {
          const raw = StreamingSession.frameToText(event.data);
          if (JSON.parse(raw).session_closed) done();
        } catch { /* ignore parse errors */ }
      };

      ws.onclose = (event: CloseEvent) => {
        if (this.ws === ws) this.ws = null;
        if (prevClose) prevClose.call(ws, event);
        done();
      };

      try {
        ws.send(JSON.stringify({ close: true }));
      } catch (err) {
        // The close request never left: undo the wrappers and timer right
        // away instead of leaving them armed until the quiet timeout.
        if (cleanup()) reject(err);
      }
    });

    this.pendingEnd = pending;
    const release = () => {
      if (this.pendingEnd === pending) this.pendingEnd = null;
    };
    pending.then(release, release);
    return pending;
  }

  /**
   * Update session configuration for the next session.
   *
   * Call this after {@link endSession} and before the next {@link send}
   * to change voice, model, language, or other settings. The values are
   * merged into the session's own copy of the configuration.
   */
  updateConfig(config: Partial<StreamConfig>): void {
    Object.assign(this.config, config);
    this.configSent = false;
  }

  /**
   * Close the session and the WebSocket connection.
   *
   * For session reuse without closing the connection, use
   * {@link endSession} instead.
   *
   * The returned promise resolves once the server confirms the close with a
   * `session_closed` message, or after a 15 s **quiet** timeout (no traffic
   * from the server in that window). Audio frames from the server-side
   * final-flush of the still-buffered text are delivered to your callbacks
   * before this promise resolves, and each frame resets the quiet timer.
   */
  async close(): Promise<void> {
    await this.endSession();

    if (this.ws) {
      try { this.ws.close(); } catch { /* already closed */ }
      this.ws = null;
    }
  }

  /** Whether the underlying WebSocket is open. */
  get isConnected(): boolean {
    return this.ws !== null && this.ws.readyState === WS_OPEN;
  }
}
|
|
1576
|
+
|
|
860
1577
|
/**
|
|
861
1578
|
* KugelAudio API client.
|
|
862
1579
|
*
|
|
@@ -870,13 +1587,13 @@ class MultiContextSession {
|
|
|
870
1587
|
* // List voices
|
|
871
1588
|
* const voices = await client.voices.list();
|
|
872
1589
|
*
|
|
873
|
-
* // Generate audio with fast model
|
|
1590
|
+
* // Generate audio with fast model
|
|
874
1591
|
* const audio = await client.tts.generate({
|
|
875
1592
|
* text: 'Hello, world!',
|
|
876
1593
|
* modelId: 'kugel-1-turbo',
|
|
877
1594
|
* });
|
|
878
1595
|
*
|
|
879
|
-
* // Generate audio with premium model
|
|
1596
|
+
* // Generate audio with premium model
|
|
880
1597
|
* const audio = await client.tts.generate({
|
|
881
1598
|
* text: 'Hello, world!',
|
|
882
1599
|
* modelId: 'kugel-1',
|
|
@@ -891,6 +1608,7 @@ export class KugelAudio {
|
|
|
891
1608
|
private _apiUrl: string;
|
|
892
1609
|
private _ttsUrl: string;
|
|
893
1610
|
private _timeout: number;
|
|
1611
|
+
private _keepalivePingInterval: number | null;
|
|
894
1612
|
|
|
895
1613
|
/** Models resource */
|
|
896
1614
|
public readonly models: ModelsResource;
|
|
@@ -901,17 +1619,40 @@ export class KugelAudio {
|
|
|
901
1619
|
|
|
902
1620
|
constructor(options: KugelAudioOptions) {
|
|
903
1621
|
if (!options.apiKey) {
|
|
904
|
-
throw new
|
|
1622
|
+
throw new ValidationError(
|
|
1623
|
+
'KugelAudio API key is missing. Set the KUGELAUDIO_API_KEY ' +
|
|
1624
|
+
'environment variable or pass { apiKey: ... } to the client. ' +
|
|
1625
|
+
'Get a key at https://app.kugelaudio.com/settings/api-keys.',
|
|
1626
|
+
);
|
|
905
1627
|
}
|
|
906
1628
|
|
|
907
|
-
|
|
1629
|
+
const { cleanKey, detectedRegion } = parseApiKey(options.apiKey);
|
|
1630
|
+
this._apiKey = cleanKey;
|
|
908
1631
|
this._isMasterKey = options.isMasterKey || false;
|
|
909
1632
|
this._isToken = options.isToken || false;
|
|
910
1633
|
this._orgId = options.orgId;
|
|
911
|
-
|
|
1634
|
+
|
|
1635
|
+
if (options.apiUrl) {
|
|
1636
|
+
this._apiUrl = options.apiUrl.replace(/\/$/, '');
|
|
1637
|
+
} else {
|
|
1638
|
+
const effectiveRegion = options.region || detectedRegion;
|
|
1639
|
+
if (!effectiveRegion) {
|
|
1640
|
+
this._apiUrl = DEFAULT_API_URL;
|
|
1641
|
+
} else if (!SUPPORTED_REGIONS.includes(effectiveRegion as Region)) {
|
|
1642
|
+
throw new ValidationError(
|
|
1643
|
+
`Invalid region '${effectiveRegion}'. Must be one of: ${SUPPORTED_REGIONS.join(', ')}.`,
|
|
1644
|
+
);
|
|
1645
|
+
} else {
|
|
1646
|
+
this._apiUrl = effectiveRegion === 'eu' ? EU_API_URL : DEFAULT_API_URL;
|
|
1647
|
+
}
|
|
1648
|
+
}
|
|
1649
|
+
|
|
912
1650
|
// If ttsUrl not specified, use apiUrl (backend proxies to TTS server)
|
|
913
1651
|
this._ttsUrl = (options.ttsUrl || this._apiUrl).replace(/\/$/, '');
|
|
914
1652
|
this._timeout = options.timeout || 60000;
|
|
1653
|
+
this._keepalivePingInterval = options.keepalivePingInterval !== undefined
|
|
1654
|
+
? options.keepalivePingInterval
|
|
1655
|
+
: 20000;
|
|
915
1656
|
|
|
916
1657
|
this.models = new ModelsResource(this);
|
|
917
1658
|
this.voices = new VoicesResource(this);
|
|
@@ -965,6 +1706,11 @@ export class KugelAudio {
|
|
|
965
1706
|
return this._ttsUrl;
|
|
966
1707
|
}
|
|
967
1708
|
|
|
1709
|
+
/**
 * Keepalive ping interval in milliseconds.
 *
 * @returns The configured interval, or `null` if keepalive pings are disabled.
 */
get keepalivePingInterval(): number | null {
  const { _keepalivePingInterval: interval } = this;
  return interval;
}
|
|
1713
|
+
|
|
968
1714
|
/**
|
|
969
1715
|
* Close the client and release resources.
|
|
970
1716
|
* This closes any pooled WebSocket connections.
|
|
@@ -1027,25 +1773,57 @@ export class KugelAudio {
|
|
|
1027
1773
|
|
|
1028
1774
|
clearTimeout(timeoutId);
|
|
1029
1775
|
|
|
1030
|
-
if (response.
|
|
1031
|
-
|
|
1776
|
+
if (!response.ok) {
|
|
1777
|
+
const text = await response.text();
|
|
1778
|
+
throw classifyHttpError(response.status, text, response.headers);
|
|
1032
1779
|
}
|
|
1033
|
-
|
|
1034
|
-
|
|
1780
|
+
|
|
1781
|
+
return await response.json();
|
|
1782
|
+
} catch (error) {
|
|
1783
|
+
clearTimeout(timeoutId);
|
|
1784
|
+
if (error instanceof KugelAudioError) {
|
|
1785
|
+
throw error;
|
|
1035
1786
|
}
|
|
1036
|
-
if (
|
|
1037
|
-
throw new
|
|
1787
|
+
if ((error as Error).name === 'AbortError') {
|
|
1788
|
+
throw new ConnectionError(
|
|
1789
|
+
`Request to ${method} ${path} timed out after ${this._timeout}ms.`,
|
|
1790
|
+
);
|
|
1038
1791
|
}
|
|
1792
|
+
throw new ConnectionError(
|
|
1793
|
+
`Could not reach KugelAudio at ${url}: ${(error as Error).message}. ` +
|
|
1794
|
+
'Check network connectivity.',
|
|
1795
|
+
);
|
|
1796
|
+
}
|
|
1797
|
+
}
|
|
1798
|
+
|
|
1799
|
+
/**
|
|
1800
|
+
* Make a multipart/form-data request (for file uploads).
|
|
1801
|
+
* @internal Used by VoicesResource for reference file uploads.
|
|
1802
|
+
*/
|
|
1803
|
+
async requestMultipart<T>(method: string, path: string, formData: FormData): Promise<T> {
|
|
1804
|
+
const url = `${this._apiUrl}${path}`;
|
|
1805
|
+
|
|
1806
|
+
const headers: Record<string, string> = {
|
|
1807
|
+
'X-API-Key': this._apiKey,
|
|
1808
|
+
'Authorization': `Bearer ${this._apiKey}`,
|
|
1809
|
+
};
|
|
1810
|
+
|
|
1811
|
+
const controller = new AbortController();
|
|
1812
|
+
const timeoutId = setTimeout(() => controller.abort(), this._timeout);
|
|
1813
|
+
|
|
1814
|
+
try {
|
|
1815
|
+
const response = await fetch(url, {
|
|
1816
|
+
method,
|
|
1817
|
+
headers,
|
|
1818
|
+
body: formData,
|
|
1819
|
+
signal: controller.signal,
|
|
1820
|
+
});
|
|
1821
|
+
|
|
1822
|
+
clearTimeout(timeoutId);
|
|
1823
|
+
|
|
1039
1824
|
if (!response.ok) {
|
|
1040
1825
|
const text = await response.text();
|
|
1041
|
-
|
|
1042
|
-
try {
|
|
1043
|
-
const json = JSON.parse(text);
|
|
1044
|
-
message = json.detail || json.error || message;
|
|
1045
|
-
} catch {
|
|
1046
|
-
message = text || message;
|
|
1047
|
-
}
|
|
1048
|
-
throw new KugelAudioError(message, response.status);
|
|
1826
|
+
throw classifyHttpError(response.status, text, response.headers);
|
|
1049
1827
|
}
|
|
1050
1828
|
|
|
1051
1829
|
return await response.json();
|
|
@@ -1055,10 +1833,14 @@ export class KugelAudio {
|
|
|
1055
1833
|
throw error;
|
|
1056
1834
|
}
|
|
1057
1835
|
if ((error as Error).name === 'AbortError') {
|
|
1058
|
-
throw new
|
|
1836
|
+
throw new ConnectionError(
|
|
1837
|
+
`Request to ${method} ${path} timed out after ${this._timeout}ms.`,
|
|
1838
|
+
);
|
|
1059
1839
|
}
|
|
1060
|
-
throw new
|
|
1840
|
+
throw new ConnectionError(
|
|
1841
|
+
`Could not reach KugelAudio at ${url}: ${(error as Error).message}. ` +
|
|
1842
|
+
'Check network connectivity.',
|
|
1843
|
+
);
|
|
1061
1844
|
}
|
|
1062
1845
|
}
|
|
1063
1846
|
}
|
|
1064
|
-
|