kugelaudio 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -13
- package/dist/index.d.mts +518 -26
- package/dist/index.d.ts +518 -26
- package/dist/index.js +864 -112
- package/dist/index.mjs +858 -112
- package/package.json +9 -8
- package/src/client.test.ts +548 -0
- package/src/client.ts +885 -103
- package/src/errors.ts +266 -18
- package/src/index.ts +17 -2
- package/src/types.ts +215 -8
- package/src/websocket.ts +38 -18
package/src/client.ts
CHANGED
|
@@ -3,26 +3,52 @@
|
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
5
|
import {
|
|
6
|
-
|
|
7
|
-
InsufficientCreditsError,
|
|
6
|
+
ConnectionError,
|
|
8
7
|
KugelAudioError,
|
|
9
|
-
|
|
8
|
+
ValidationError,
|
|
9
|
+
classifyHttpError,
|
|
10
|
+
classifyWsClose,
|
|
11
|
+
classifyWsFrame,
|
|
12
|
+
classifyWsHandshakeError,
|
|
10
13
|
} from './errors';
|
|
11
14
|
import type {
|
|
12
15
|
AudioChunk,
|
|
13
16
|
AudioResponse,
|
|
17
|
+
CreateVoiceOptions,
|
|
14
18
|
GenerateOptions,
|
|
15
19
|
GenerationStats,
|
|
16
20
|
KugelAudioOptions,
|
|
17
21
|
Model,
|
|
18
22
|
StreamCallbacks,
|
|
19
|
-
|
|
23
|
+
StreamConfig,
|
|
24
|
+
StreamingSessionCallbacks,
|
|
25
|
+
UpdateVoiceOptions,
|
|
26
|
+
VoiceDetail,
|
|
27
|
+
VoiceListResponse,
|
|
28
|
+
VoiceReference,
|
|
20
29
|
WordTimestamp
|
|
21
30
|
} from './types';
|
|
22
31
|
import { base64ToArrayBuffer } from './utils';
|
|
23
32
|
import { getWebSocket } from './websocket';
|
|
24
33
|
|
|
25
|
-
|
|
34
|
+
import type { Region } from './types';
|
|
35
|
+
|
|
36
|
+
/** Base REST endpoint for each supported region. */
const REGION_URLS: Record<Region, string> = {
  eu: 'https://api.kugelaudio.com',
  us: 'https://us-api.kugelaudio.com',
  global: 'https://global-api.kugelaudio.com',
};

/** Key prefixes that encode a region; each is the region name followed by a dash. */
const REGION_PREFIXES = ['eu-', 'us-', 'global-'] as const;

/**
 * Split an API key into its region prefix (if any) and the remaining key.
 *
 * @param apiKey - Key as supplied by the caller, optionally starting with
 *   one of {@link REGION_PREFIXES}.
 * @returns `cleanKey` with the prefix removed, plus `detectedRegion` when a
 *   prefix matched. Keys without a known prefix are returned unchanged and
 *   `detectedRegion` is omitted.
 */
function parseApiKey(apiKey: string): { cleanKey: string; detectedRegion?: Region } {
  const matched = REGION_PREFIXES.find((candidate) => apiKey.startsWith(candidate));
  if (matched === undefined) {
    return { cleanKey: apiKey };
  }

  return {
    cleanKey: apiKey.slice(matched.length),
    // Drop the trailing dash to turn the prefix back into the region name.
    detectedRegion: matched.slice(0, -1) as Region,
  };
}
|
|
26
52
|
|
|
27
53
|
/**
|
|
28
54
|
* Create a new WebSocket instance.
|
|
@@ -37,6 +63,23 @@ function createWs(url: string): WebSocket {
|
|
|
37
63
|
/** WebSocket OPEN readyState constant. */
|
|
38
64
|
const WS_OPEN = 1;
|
|
39
65
|
|
|
66
|
+
/** Flipped to true after the missing-language hint has been printed once. */
let _languageWarningLogged = false;

/**
 * Emit a one-time `console.warn` when a request carries no language while
 * text normalization is enabled.
 *
 * @param language - Language passed by the caller; any falsy value counts as "not set".
 * @param normalize - Normalization flag; `undefined` is treated as enabled.
 */
function warnIfNoLanguage(
  language: string | undefined,
  normalize: boolean | undefined
): void {
  // Already warned during this module's lifetime: stay quiet.
  if (_languageWarningLogged) return;
  // A language was supplied, nothing to warn about.
  if (language) return;
  // Normalization was explicitly switched off.
  if (normalize !== undefined && !normalize) return;

  _languageWarningLogged = true;
  console.warn(
    "[KugelAudio] No 'language' set with normalization enabled — the server " +
      'will auto-detect the language, adding ~60-150ms to TTFA. Set language ' +
      "(e.g., language: 'en') for optimal latency."
  );
}
|
|
82
|
+
|
|
40
83
|
/**
|
|
41
84
|
* Models resource for listing TTS models.
|
|
42
85
|
*/
|
|
@@ -72,52 +115,211 @@ class VoicesResource {
|
|
|
72
115
|
language?: string;
|
|
73
116
|
includePublic?: boolean;
|
|
74
117
|
limit?: number;
|
|
75
|
-
|
|
118
|
+
offset?: number;
|
|
119
|
+
}): Promise<VoiceListResponse> {
|
|
76
120
|
const params = new URLSearchParams();
|
|
77
121
|
if (options?.language) params.set('language', options.language);
|
|
78
122
|
if (options?.includePublic !== undefined) {
|
|
79
123
|
params.set('include_public', String(options.includePublic));
|
|
80
124
|
}
|
|
81
125
|
if (options?.limit) params.set('limit', String(options.limit));
|
|
126
|
+
if (options?.offset) params.set('offset', String(options.offset));
|
|
82
127
|
|
|
83
128
|
const query = params.toString();
|
|
84
129
|
const path = query ? `/v1/voices?${query}` : '/v1/voices';
|
|
85
|
-
const response = await this.client.request<{ voices: any[] }>('GET', path);
|
|
130
|
+
const response = await this.client.request<{ voices: any[]; total: number; limit: number; offset: number }>('GET', path);
|
|
86
131
|
|
|
87
|
-
return
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
132
|
+
return {
|
|
133
|
+
voices: response.voices.map((v) => ({
|
|
134
|
+
id: v.id,
|
|
135
|
+
name: v.name,
|
|
136
|
+
description: v.description,
|
|
137
|
+
category: v.category,
|
|
138
|
+
sex: v.sex,
|
|
139
|
+
age: v.age,
|
|
140
|
+
supportedLanguages: v.supported_languages || [],
|
|
141
|
+
sampleText: v.sample_text,
|
|
142
|
+
avatarUrl: v.avatar_url,
|
|
143
|
+
sampleUrl: v.sample_url,
|
|
144
|
+
isPublic: v.is_public || false,
|
|
145
|
+
verified: v.verified || false,
|
|
146
|
+
})),
|
|
147
|
+
total: response.total,
|
|
148
|
+
limit: response.limit,
|
|
149
|
+
offset: response.offset,
|
|
150
|
+
};
|
|
101
151
|
}
|
|
102
152
|
|
|
103
153
|
/**
|
|
104
154
|
* Get a specific voice by ID.
|
|
105
155
|
*/
|
|
106
|
-
async get(voiceId: number): Promise<
|
|
156
|
+
async get(voiceId: number): Promise<VoiceDetail> {
  // Fetch the raw record, then convert its snake_case fields into a VoiceDetail.
  const endpoint = `/v1/voices/${voiceId}`;
  const raw = await this.client.request<any>('GET', endpoint);
  return this.mapVoiceDetail(raw);
}
|
|
160
|
+
|
|
161
|
+
/**
 * Create a new voice via a multipart `POST /v1/voices`.
 *
 * The request carries a JSON `metadata` part and one `files` part per entry
 * in `options.referenceFiles`.
 *
 * @param options - Voice attributes. Omitted optional fields fall back to:
 *   empty description and sample text, category `conversational`, age
 *   `middle_age`, quality `mid`, languages `['en']`, and non-public.
 * @returns The created voice, mapped to a {@link VoiceDetail}.
 */
async create(options: CreateVoiceOptions): Promise<VoiceDetail> {
  const body = new FormData();

  // The metadata travels as a JSON blob so it sits alongside the binary parts.
  const metadataJson = JSON.stringify({
    name: options.name,
    sex: options.sex,
    description: options.description ?? '',
    category: options.category ?? 'conversational',
    age: options.age ?? 'middle_age',
    quality: options.quality ?? 'mid',
    supported_languages: options.supportedLanguages ?? ['en'],
    is_public: options.isPublic ?? false,
    sample_text: options.sampleText ?? '',
  });
  body.append('metadata', new Blob([metadataJson], { type: 'application/json' }));

  const { referenceFiles } = options;
  if (referenceFiles) {
    for (const referenceFile of referenceFiles) {
      body.append('files', referenceFile);
    }
  }

  const created = await this.client.requestMultipart<any>('POST', '/v1/voices', body);
  return this.mapVoiceDetail(created);
}
|
|
192
|
+
|
|
193
|
+
/**
 * Patch an existing voice via `PATCH /v1/voices/{voiceId}`.
 *
 * Only options whose value is not `undefined` are copied into the request
 * payload, using the snake_case field names of the wire format.
 *
 * @param voiceId - ID of the voice to modify.
 * @param options - Fields to change.
 * @returns The updated voice, mapped to a {@link VoiceDetail}.
 */
async update(voiceId: number, options: UpdateVoiceOptions): Promise<VoiceDetail> {
  // Option name -> wire field name, in the order the fields are written.
  const fieldMap: ReadonlyArray<readonly [keyof UpdateVoiceOptions, string]> = [
    ['name', 'name'],
    ['description', 'description'],
    ['category', 'category'],
    ['age', 'age'],
    ['sex', 'sex'],
    ['quality', 'quality'],
    ['supportedLanguages', 'supported_languages'],
    ['isPublic', 'is_public'],
    ['sampleText', 'sample_text'],
  ];

  const payload: Record<string, unknown> = {};
  for (const [optionKey, wireKey] of fieldMap) {
    const value = options[optionKey];
    if (value !== undefined) {
      payload[wireKey] = value;
    }
  }

  const updated = await this.client.request<any>('PATCH', `/v1/voices/${voiceId}`, payload);
  return this.mapVoiceDetail(updated);
}
|
|
211
|
+
|
|
212
|
+
/**
 * Delete a voice via `DELETE /v1/voices/{voiceId}`.
 *
 * @param voiceId - ID of the voice to remove.
 * @returns Resolves once the request has completed; the response body is discarded.
 */
async delete(voiceId: number): Promise<void> {
  const endpoint = `/v1/voices/${voiceId}`;
  await this.client.request<any>('DELETE', endpoint);
}
|
|
218
|
+
|
|
219
|
+
// -- Reference management --
|
|
220
|
+
|
|
221
|
+
/**
 * Fetch the reference audio entries of a voice via
 * `GET /v1/voices/{voiceId}/references`.
 *
 * @param voiceId - ID of the voice whose references are listed.
 * @returns The `references` array of the response, each entry mapped to a
 *   {@link VoiceReference}.
 */
async listReferences(voiceId: number): Promise<VoiceReference[]> {
  const endpoint = `/v1/voices/${voiceId}/references`;
  const { references } = await this.client.request<{ references: any[] }>('GET', endpoint);
  return references.map((rawReference) => this.mapVoiceReference(rawReference));
}
|
|
231
|
+
|
|
232
|
+
/**
 * Attach a reference audio file to a voice via a multipart
 * `POST /v1/voices/{voiceId}/references`.
 *
 * @param voiceId - Voice ID
 * @param file - Audio payload, sent as the `file` form part
 * @param referenceText - Optional transcript; sent as `reference_text` only
 *   when it is a non-empty string
 * @returns The stored reference, mapped to a {@link VoiceReference}.
 */
async addReference(
  voiceId: number,
  file: File | Blob,
  referenceText?: string,
): Promise<VoiceReference> {
  const body = new FormData();
  body.append('file', file);

  // An empty or missing transcript is left out of the form entirely.
  if (referenceText) {
    body.append('reference_text', referenceText);
  }

  const endpoint = `/v1/voices/${voiceId}/references`;
  const stored = await this.client.requestMultipart<any>('POST', endpoint, body);
  return this.mapVoiceReference(stored);
}
|
|
257
|
+
|
|
258
|
+
/**
 * Remove one reference audio entry from a voice via
 * `DELETE /v1/voices/{voiceId}/references/{referenceId}`.
 *
 * @param voiceId - ID of the voice that owns the reference.
 * @param referenceId - ID of the reference to remove.
 * @returns Resolves once the request has completed; the response body is discarded.
 */
async deleteReference(voiceId: number, referenceId: number): Promise<void> {
  const endpoint = `/v1/voices/${voiceId}/references/${referenceId}`;
  await this.client.request<any>('DELETE', endpoint);
}
|
|
267
|
+
|
|
268
|
+
// -- Publishing --
|
|
269
|
+
|
|
270
|
+
/**
 * Request publication of a voice via `POST /v1/voices/{voiceId}/publish`.
 *
 * NOTE(review): the previous documentation states that the server marks the
 * voice as public and pending admin verification; that happens server-side
 * and is not visible in this client code.
 *
 * @param voiceId - ID of the voice to publish.
 * @returns The voice as returned by the server, mapped to a {@link VoiceDetail}.
 */
async publish(voiceId: number): Promise<VoiceDetail> {
  const endpoint = `/v1/voices/${voiceId}/publish`;
  const published = await this.client.request<any>('POST', endpoint);
  return this.mapVoiceDetail(published);
}
|
|
278
|
+
|
|
279
|
+
// -- Sample generation --
|
|
280
|
+
|
|
281
|
+
/**
 * Trigger sample audio generation for a voice via
 * `POST /v1/voices/{voiceId}/generate-sample`.
 *
 * @param voiceId - ID of the voice to generate a sample for.
 * @returns The voice as returned by the server, mapped to a {@link VoiceDetail}.
 */
async generateSample(voiceId: number): Promise<VoiceDetail> {
  const endpoint = `/v1/voices/${voiceId}/generate-sample`;
  const refreshed = await this.client.request<any>('POST', endpoint);
  return this.mapVoiceDetail(refreshed);
}
|
|
291
|
+
|
|
292
|
+
// -- Helpers --
|
|
293
|
+
|
|
294
|
+
private mapVoiceDetail(v: any): VoiceDetail {
  // Pull the snake_case wire fields out under their camelCase names.
  const {
    id,
    name,
    description,
    generative_voice_description: generativeVoiceDescription,
    supported_languages: supportedLanguages,
    category,
    age,
    sex,
    quality,
    is_public: isPublic,
    verified,
    pending_verification: pendingVerification,
    sample_url: sampleUrl,
    avatar_url: avatarUrl,
    sample_text: sampleText,
  } = v;

  // Null or missing values get a default; id, name, age, sex and the two
  // URLs are passed through exactly as received.
  return {
    id,
    name,
    description: description ?? '',
    generativeVoiceDescription: generativeVoiceDescription ?? '',
    supportedLanguages: supportedLanguages ?? [],
    category: category ?? 'cloned',
    age,
    sex,
    quality: quality ?? 'mid',
    isPublic: isPublic ?? false,
    verified: verified ?? false,
    pendingVerification: pendingVerification ?? false,
    sampleUrl,
    avatarUrl,
    sampleText: sampleText ?? '',
  };
}
|
|
313
|
+
|
|
314
|
+
private mapVoiceReference(r: any): VoiceReference {
  // Pull the snake_case wire fields out under their camelCase names.
  const {
    id,
    voice_id: voiceId,
    name,
    reference_text: referenceText,
    s3_path: s3Path,
    audio_url: audioUrl,
    is_generated: isGenerated,
  } = r;

  // Null or missing text fields become '', a missing generated flag becomes
  // false; id, voiceId and audioUrl are passed through as received.
  return {
    id,
    voiceId,
    name: name ?? '',
    referenceText: referenceText ?? '',
    s3Path: s3Path ?? '',
    audioUrl,
    isGenerated: isGenerated ?? false,
  };
}
|
|
123
325
|
}
|
|
@@ -135,6 +337,7 @@ class TTSResource {
|
|
|
135
337
|
reject: (error: Error) => void;
|
|
136
338
|
}> = new Map();
|
|
137
339
|
private requestCounter = 0;
|
|
340
|
+
private keepaliveTimer: ReturnType<typeof setInterval> | null = null;
|
|
138
341
|
|
|
139
342
|
constructor(private client: KugelAudio) {}
|
|
140
343
|
|
|
@@ -207,6 +410,67 @@ class TTSResource {
|
|
|
207
410
|
};
|
|
208
411
|
}
|
|
209
412
|
|
|
413
|
+
/**
|
|
414
|
+
* Stream audio and return a Node.js Readable stream of raw PCM16 binary data.
|
|
415
|
+
*
|
|
416
|
+
* **Node.js only** — this method requires the `stream` built-in module and is
|
|
417
|
+
* intended for server-side integrations such as Vapi custom TTS endpoints,
|
|
418
|
+
* Express/Fastify handlers, or any pipeline that expects a Node.js `Readable`.
|
|
419
|
+
*
|
|
420
|
+
* Compared to manually wiring `onChunk` to a `Readable`, this method avoids
|
|
421
|
+
* a common race-condition: the stream object is created and returned **before**
|
|
422
|
+
* any chunks arrive, so the caller can safely pipe or attach listeners before
|
|
423
|
+
* the first audio byte is pushed.
|
|
424
|
+
*
|
|
425
|
+
* @example Vapi custom TTS endpoint
|
|
426
|
+
* ```typescript
|
|
427
|
+
* app.post('/synthesize', (req, res) => {
|
|
428
|
+
* res.setHeader('Content-Type', 'audio/pcm');
|
|
429
|
+
* res.setHeader('Transfer-Encoding', 'chunked');
|
|
430
|
+
*
|
|
431
|
+
* const readable = client.tts.toReadable({
|
|
432
|
+
* text: req.body.message.text,
|
|
433
|
+
* modelId: 'kugel-1-turbo',
|
|
434
|
+
* sampleRate: req.body.message.sampleRate,
|
|
435
|
+
* language: 'en',
|
|
436
|
+
* });
|
|
437
|
+
*
|
|
438
|
+
* readable.pipe(res);
|
|
439
|
+
* });
|
|
440
|
+
* ```
|
|
441
|
+
*
|
|
442
|
+
* @param options - TTS generation options (same as `stream()`)
|
|
443
|
+
* @param reuseConnection - Reuse the pooled WebSocket connection (default: true)
|
|
444
|
+
* @returns Node.js Readable stream emitting raw PCM16 binary Buffer chunks
|
|
445
|
+
*/
|
|
446
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
447
|
+
toReadable(options: GenerateOptions, reuseConnection = true): any {
  // The Node.js built-in is loaded lazily so that merely importing this
  // module does not pull `stream` into browser bundles.
  // eslint-disable-next-line @typescript-eslint/no-require-imports
  const nodeStream = require('stream') as typeof import('stream');

  // read() is a no-op: data is pushed whenever the TTS stream delivers a chunk.
  const output = new nodeStream.Readable({ read() {} });

  // Any failure, whether reported through the callback or through the
  // rejected promise, tears the readable down with that error.
  const fail = (error: Error): void => {
    output.destroy(error);
  };

  const callbacks: StreamCallbacks = {
    // chunk.audio is decoded from base64 into raw bytes before being pushed.
    onChunk: (chunk: AudioChunk) => {
      output.push(Buffer.from(chunk.audio, 'base64'));
    },
    // Pushing null signals end-of-stream to consumers.
    onFinal: () => {
      output.push(null);
    },
    onError: fail,
  };

  // Deliberately not awaited: the readable is handed back immediately so the
  // caller can attach listeners or pipe it before the first chunk arrives.
  this.stream(options, callbacks, reuseConnection).catch(fail);

  return output;
}
|
|
473
|
+
|
|
210
474
|
/**
|
|
211
475
|
* Build the WebSocket URL with appropriate auth param.
|
|
212
476
|
*/
|
|
@@ -265,11 +529,20 @@ class TTSResource {
|
|
|
265
529
|
this.wsConnection = ws;
|
|
266
530
|
this.wsUrl = url;
|
|
267
531
|
this.setupMessageHandler(ws);
|
|
532
|
+
this.startKeepalive(ws);
|
|
268
533
|
resolve(ws);
|
|
269
534
|
};
|
|
270
535
|
|
|
271
|
-
ws.onerror = () => {
|
|
272
|
-
|
|
536
|
+
ws.onerror = (event: unknown) => {
|
|
537
|
+
const underlying = (event as { error?: unknown } | null)?.error ?? event;
|
|
538
|
+
const typed = classifyWsHandshakeError(underlying);
|
|
539
|
+
reject(
|
|
540
|
+
typed ??
|
|
541
|
+
new ConnectionError(
|
|
542
|
+
`Could not establish KugelAudio WebSocket connection to ${url}. ` +
|
|
543
|
+
'Check network connectivity.',
|
|
544
|
+
),
|
|
545
|
+
);
|
|
273
546
|
};
|
|
274
547
|
});
|
|
275
548
|
}
|
|
@@ -293,7 +566,7 @@ class TTSResource {
|
|
|
293
566
|
if (!pending) return;
|
|
294
567
|
|
|
295
568
|
if (data.error) {
|
|
296
|
-
const error = this.parseError(data
|
|
569
|
+
const error = this.parseError(data);
|
|
297
570
|
pending.callbacks.onError?.(error);
|
|
298
571
|
this.pendingRequests.delete(requestId);
|
|
299
572
|
pending.reject(error);
|
|
@@ -307,7 +580,6 @@ class TTSResource {
|
|
|
307
580
|
totalSamples: data.total_samples,
|
|
308
581
|
durationMs: data.dur_ms,
|
|
309
582
|
generationMs: data.gen_ms,
|
|
310
|
-
ttfaMs: data.ttfa_ms,
|
|
311
583
|
rtf: data.rtf,
|
|
312
584
|
error: data.error,
|
|
313
585
|
};
|
|
@@ -347,17 +619,25 @@ class TTSResource {
|
|
|
347
619
|
};
|
|
348
620
|
|
|
349
621
|
ws.onclose = (event) => {
|
|
350
|
-
// Clear connection pool
|
|
622
|
+
// Clear connection pool and keepalive
|
|
623
|
+
this.stopKeepalive();
|
|
351
624
|
this.wsConnection = null;
|
|
352
625
|
this.wsUrl = null;
|
|
353
626
|
|
|
354
|
-
// Reject all pending requests
|
|
627
|
+
// Reject all pending requests with appropriate error types
|
|
355
628
|
for (const [id, pending] of this.pendingRequests) {
|
|
356
629
|
pending.callbacks.onClose?.();
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
630
|
+
// Only surface server-initiated error close codes; normal closes
|
|
631
|
+
// (1000, 1001) should not reject pending requests with an error.
|
|
632
|
+
if (
|
|
633
|
+
event.code === 4001 ||
|
|
634
|
+
event.code === 4003 ||
|
|
635
|
+
event.code === 4029 ||
|
|
636
|
+
event.code === 4500
|
|
637
|
+
) {
|
|
638
|
+
const error = classifyWsClose(event.code, event.reason);
|
|
639
|
+
pending.callbacks.onError?.(error);
|
|
640
|
+
pending.reject(error);
|
|
361
641
|
}
|
|
362
642
|
this.pendingRequests.delete(id);
|
|
363
643
|
}
|
|
@@ -365,7 +645,9 @@ class TTSResource {
|
|
|
365
645
|
|
|
366
646
|
ws.onerror = () => {
|
|
367
647
|
// Reject all pending requests
|
|
368
|
-
const error = new
|
|
648
|
+
const error = new ConnectionError(
|
|
649
|
+
'KugelAudio WebSocket connection error. Check network connectivity.',
|
|
650
|
+
);
|
|
369
651
|
for (const [id, pending] of this.pendingRequests) {
|
|
370
652
|
pending.callbacks.onError?.(error);
|
|
371
653
|
pending.reject(error);
|
|
@@ -400,6 +682,7 @@ class TTSResource {
|
|
|
400
682
|
options: GenerateOptions,
|
|
401
683
|
callbacks: StreamCallbacks
|
|
402
684
|
): Promise<void> {
|
|
685
|
+
warnIfNoLanguage(options.language, options.normalize);
|
|
403
686
|
const ws = await this.getConnection();
|
|
404
687
|
const requestId = ++this.requestCounter;
|
|
405
688
|
|
|
@@ -413,11 +696,14 @@ class TTSResource {
|
|
|
413
696
|
model_id: options.modelId || 'kugel-1-turbo',
|
|
414
697
|
voice_id: options.voiceId,
|
|
415
698
|
cfg_scale: options.cfgScale ?? 2.0,
|
|
699
|
+
...(options.temperature !== undefined && { temperature: options.temperature }),
|
|
416
700
|
max_new_tokens: options.maxNewTokens ?? 2048,
|
|
417
701
|
sample_rate: options.sampleRate ?? 24000,
|
|
418
702
|
normalize: options.normalize ?? true,
|
|
419
703
|
...(options.language && { language: options.language }),
|
|
420
704
|
...(options.wordTimestamps && { word_timestamps: true }),
|
|
705
|
+
...(options.speed !== undefined && { speed: options.speed }),
|
|
706
|
+
...(options.projectId !== undefined && { project_id: options.projectId }),
|
|
421
707
|
}));
|
|
422
708
|
});
|
|
423
709
|
}
|
|
@@ -429,6 +715,7 @@ class TTSResource {
|
|
|
429
715
|
options: GenerateOptions,
|
|
430
716
|
callbacks: StreamCallbacks
|
|
431
717
|
): Promise<void> {
|
|
718
|
+
warnIfNoLanguage(options.language, options.normalize);
|
|
432
719
|
return new Promise((resolve, reject) => {
|
|
433
720
|
const url = this.buildWsUrl();
|
|
434
721
|
const ws = createWs(url);
|
|
@@ -446,6 +733,8 @@ class TTSResource {
|
|
|
446
733
|
normalize: options.normalize ?? true,
|
|
447
734
|
...(options.language && { language: options.language }),
|
|
448
735
|
...(options.wordTimestamps && { word_timestamps: true }),
|
|
736
|
+
...(options.speed !== undefined && { speed: options.speed }),
|
|
737
|
+
...(options.projectId !== undefined && { project_id: options.projectId }),
|
|
449
738
|
}));
|
|
450
739
|
};
|
|
451
740
|
|
|
@@ -460,7 +749,7 @@ class TTSResource {
|
|
|
460
749
|
const data = JSON.parse(messageData);
|
|
461
750
|
|
|
462
751
|
if (data.error) {
|
|
463
|
-
const error = this.parseError(data
|
|
752
|
+
const error = this.parseError(data);
|
|
464
753
|
callbacks.onError?.(error);
|
|
465
754
|
ws.close();
|
|
466
755
|
reject(error);
|
|
@@ -474,7 +763,6 @@ class TTSResource {
|
|
|
474
763
|
totalSamples: data.total_samples,
|
|
475
764
|
durationMs: data.dur_ms,
|
|
476
765
|
generationMs: data.gen_ms,
|
|
477
|
-
ttfaMs: data.ttfa_ms,
|
|
478
766
|
rtf: data.rtf,
|
|
479
767
|
error: data.error,
|
|
480
768
|
};
|
|
@@ -513,27 +801,68 @@ class TTSResource {
|
|
|
513
801
|
}
|
|
514
802
|
};
|
|
515
803
|
|
|
516
|
-
ws.onerror = () => {
|
|
517
|
-
const
|
|
804
|
+
ws.onerror = (event: unknown) => {
|
|
805
|
+
const underlying = (event as { error?: unknown } | null)?.error ?? event;
|
|
806
|
+
const error =
|
|
807
|
+
classifyWsHandshakeError(underlying) ??
|
|
808
|
+
new ConnectionError(
|
|
809
|
+
'KugelAudio WebSocket connection error. Check network connectivity.',
|
|
810
|
+
);
|
|
518
811
|
callbacks.onError?.(error);
|
|
519
812
|
reject(error);
|
|
520
813
|
};
|
|
521
814
|
|
|
522
815
|
ws.onclose = (event) => {
|
|
523
816
|
callbacks.onClose?.();
|
|
524
|
-
if (
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
817
|
+
if (
|
|
818
|
+
event.code === 4001 ||
|
|
819
|
+
event.code === 4003 ||
|
|
820
|
+
event.code === 4029 ||
|
|
821
|
+
event.code === 4500
|
|
822
|
+
) {
|
|
823
|
+
const error = classifyWsClose(event.code, event.reason);
|
|
824
|
+
callbacks.onError?.(error);
|
|
825
|
+
reject(error);
|
|
528
826
|
}
|
|
529
827
|
};
|
|
530
828
|
});
|
|
531
829
|
}
|
|
532
830
|
|
|
831
|
+
/**
 * Begin periodic keepalive pings on the pooled connection.
 *
 * Any previously running timer is stopped first. Nothing is scheduled when
 * the client's `keepalivePingInterval` is null, undefined, zero or negative.
 * On every tick the timer stops itself if `ws` is no longer the pooled
 * connection or is no longer OPEN. A ping is sent only when the socket
 * exposes a `ping()` method; sockets without one are silently skipped.
 *
 * @param ws - The socket that was just stored as the pooled connection.
 */
private startKeepalive(ws: WebSocket): void {
  this.stopKeepalive();

  const intervalMs = this.client.keepalivePingInterval;
  if (intervalMs == null || intervalMs <= 0) return;

  const tick = (): void => {
    const stillPooledAndOpen = this.wsConnection === ws && ws.readyState === WS_OPEN;
    if (!stillPooledAndOpen) {
      this.stopKeepalive();
      return;
    }

    // ping() is not part of the standard WebSocket type, hence the cast.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const pingable = ws as any;
    if (typeof pingable.ping === 'function') {
      pingable.ping();
    }
  };

  this.keepaliveTimer = setInterval(tick, intervalMs);
}
|
|
853
|
+
|
|
854
|
+
private stopKeepalive(): void {
  // Nothing scheduled, nothing to cancel.
  if (this.keepaliveTimer === null) return;

  clearInterval(this.keepaliveTimer);
  this.keepaliveTimer = null;
}
|
|
860
|
+
|
|
533
861
|
/**
|
|
534
862
|
* Close the pooled WebSocket connection.
|
|
535
863
|
*/
|
|
536
864
|
close(): void {
|
|
865
|
+
this.stopKeepalive();
|
|
537
866
|
if (this.wsConnection) {
|
|
538
867
|
try {
|
|
539
868
|
this.wsConnection.close();
|
|
@@ -545,15 +874,43 @@ class TTSResource {
|
|
|
545
874
|
}
|
|
546
875
|
}
|
|
547
876
|
|
|
548
|
-
private parseError(
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
877
|
+
private parseError(data: { error?: string; error_code?: string; retry_after?: number }): Error {
  // Classification of server error frames is delegated to the shared helper in ./errors.
  const classified = classifyWsFrame(data);
  return classified;
}
|
|
880
|
+
|
|
881
|
+
/**
|
|
882
|
+
* Create a streaming session for LLM integration.
|
|
883
|
+
*
|
|
884
|
+
* The session connects to `/ws/tts/stream` and keeps a persistent
|
|
885
|
+
* connection across multiple {@link StreamingSession.send} calls.
|
|
886
|
+
* The server auto-chunks text at sentence boundaries — no client-side
|
|
887
|
+
* flushing required.
|
|
888
|
+
*
|
|
889
|
+
* @param config - Session configuration (voice, model, chunking strategy).
|
|
890
|
+
* @param callbacks - Callbacks for audio chunks and session lifecycle events.
|
|
891
|
+
* @returns A {@link StreamingSession} instance. Call `.connect()` before sending.
|
|
892
|
+
*
|
|
893
|
+
* @example
|
|
894
|
+
* ```typescript
|
|
895
|
+
* const session = client.tts.streamingSession(
|
|
896
|
+
* { voiceId: 123, autoMode: true, chunkLengthSchedule: [50, 100, 150, 250] },
|
|
897
|
+
* { onChunk: (chunk) => playAudio(chunk.audio) },
|
|
898
|
+
* );
|
|
899
|
+
*
|
|
900
|
+
* await session.connect();
|
|
901
|
+
*
|
|
902
|
+
* for await (const token of llmStream) {
|
|
903
|
+
* session.send(token);
|
|
904
|
+
* }
|
|
905
|
+
*
|
|
906
|
+
* await session.close();
|
|
907
|
+
* ```
|
|
908
|
+
*/
|
|
909
|
+
streamingSession(
  config: StreamConfig,
  callbacks: StreamingSessionCallbacks
): StreamingSession {
  // Construction only: no socket is opened until connect() is called on the session.
  const session = new StreamingSession(this.client, config, callbacks);
  return session;
}
|
|
558
915
|
|
|
559
916
|
/**
|
|
@@ -574,7 +931,7 @@ class TTSResource {
|
|
|
574
931
|
* console.log(`Audio from ${chunk.contextId}`);
|
|
575
932
|
* playAudio(chunk.audio);
|
|
576
933
|
* },
|
|
577
|
-
*
|
|
934
|
+
* onContextClosed: (contextId) => {
|
|
578
935
|
* console.log(`${contextId} finished`);
|
|
579
936
|
* },
|
|
580
937
|
* });
|
|
@@ -625,8 +982,13 @@ class MultiContextSession {
|
|
|
625
982
|
|
|
626
983
|
/**
|
|
627
984
|
* Connect to the multi-context WebSocket endpoint.
|
|
985
|
+
*
|
|
986
|
+
* The returned promise resolves once the WebSocket is OPEN so callers can
|
|
987
|
+
* ``await session.connect(callbacks)`` before invoking
|
|
988
|
+
* {@link createContext} / {@link send}. Pre-open errors reject with the
|
|
989
|
+
* typed error.
|
|
628
990
|
*/
|
|
629
|
-
connect(callbacks: import('./types').MultiContextCallbacks): void {
|
|
991
|
+
connect(callbacks: import('./types').MultiContextCallbacks): Promise<void> {
|
|
630
992
|
this.callbacks = callbacks;
|
|
631
993
|
|
|
632
994
|
const wsUrl = this.client.ttsUrl
|
|
@@ -644,12 +1006,9 @@ class MultiContextSession {
|
|
|
644
1006
|
|
|
645
1007
|
const url = `${wsUrl}/ws/tts/multi?${authParam}=${this.client.apiKey}`;
|
|
646
1008
|
this.ws = createWs(url);
|
|
1009
|
+
const ws = this.ws;
|
|
647
1010
|
|
|
648
|
-
|
|
649
|
-
// Connection established, ready to create contexts
|
|
650
|
-
};
|
|
651
|
-
|
|
652
|
-
this.ws.onmessage = (event: { data: unknown }) => {
|
|
1011
|
+
ws.onmessage = (event: { data: unknown }) => {
|
|
653
1012
|
try {
|
|
654
1013
|
// Handle both browser (string) and Node.js (Buffer) message formats
|
|
655
1014
|
const messageData = typeof event.data === 'string'
|
|
@@ -690,10 +1049,6 @@ class MultiContextSession {
|
|
|
690
1049
|
this.callbacks.onChunk?.(chunk);
|
|
691
1050
|
}
|
|
692
1051
|
|
|
693
|
-
if (data.is_final) {
|
|
694
|
-
this.callbacks.onContextFinal?.(data.context_id);
|
|
695
|
-
}
|
|
696
|
-
|
|
697
1052
|
if (data.context_closed) {
|
|
698
1053
|
this.contexts.delete(data.context_id);
|
|
699
1054
|
this.callbacks.onContextClosed?.(data.context_id);
|
|
@@ -712,20 +1067,51 @@ class MultiContextSession {
|
|
|
712
1067
|
}
|
|
713
1068
|
};
|
|
714
1069
|
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
};
|
|
1070
|
+
return new Promise<void>((resolve, reject) => {
|
|
1071
|
+
let opened = false;
|
|
718
1072
|
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
}
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
1073
|
+
ws.onopen = () => {
|
|
1074
|
+
opened = true;
|
|
1075
|
+
resolve();
|
|
1076
|
+
};
|
|
1077
|
+
|
|
1078
|
+
ws.onerror = (event: unknown) => {
|
|
1079
|
+
const underlying = (event as { error?: unknown } | null)?.error ?? event;
|
|
1080
|
+
const err =
|
|
1081
|
+
classifyWsHandshakeError(underlying) ??
|
|
1082
|
+
new ConnectionError(
|
|
1083
|
+
'KugelAudio multi-context WebSocket connection error. ' +
|
|
1084
|
+
'Check network connectivity.',
|
|
1085
|
+
);
|
|
1086
|
+
if (!opened) reject(err);
|
|
1087
|
+
this.callbacks.onError?.(err);
|
|
1088
|
+
};
|
|
1089
|
+
|
|
1090
|
+
ws.onclose = (event) => {
|
|
1091
|
+
let typedErr: KugelAudioError | null = null;
|
|
1092
|
+
if (
|
|
1093
|
+
event.code === 4001 ||
|
|
1094
|
+
event.code === 4003 ||
|
|
1095
|
+
event.code === 4029 ||
|
|
1096
|
+
event.code === 4500
|
|
1097
|
+
) {
|
|
1098
|
+
typedErr = classifyWsClose(event.code, event.reason);
|
|
1099
|
+
this.callbacks.onError?.(typedErr);
|
|
1100
|
+
}
|
|
1101
|
+
if (!opened) {
|
|
1102
|
+
reject(
|
|
1103
|
+
typedErr ??
|
|
1104
|
+
new ConnectionError(
|
|
1105
|
+
`KugelAudio multi-context WebSocket closed before ready ` +
|
|
1106
|
+
`(code ${event.code}).`,
|
|
1107
|
+
),
|
|
1108
|
+
);
|
|
1109
|
+
}
|
|
1110
|
+
this.ws = null;
|
|
1111
|
+
this.isStarted = false;
|
|
1112
|
+
this.contexts.clear();
|
|
1113
|
+
};
|
|
1114
|
+
});
|
|
729
1115
|
}
|
|
730
1116
|
|
|
731
1117
|
/**
|
|
@@ -749,10 +1135,13 @@ class MultiContextSession {
|
|
|
749
1135
|
|
|
750
1136
|
// Include session config on first context
|
|
751
1137
|
if (!this.isStarted) {
|
|
1138
|
+
warnIfNoLanguage(this.config.language, this.config.normalize);
|
|
752
1139
|
if (this.config.sampleRate) msg.sample_rate = this.config.sampleRate;
|
|
753
1140
|
if (this.config.cfgScale) msg.cfg_scale = this.config.cfgScale;
|
|
1141
|
+
if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
|
|
754
1142
|
if (this.config.maxNewTokens) msg.max_new_tokens = this.config.maxNewTokens;
|
|
755
1143
|
if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
|
|
1144
|
+
if (this.config.language) msg.language = this.config.language;
|
|
756
1145
|
if (this.config.inactivityTimeout) msg.inactivity_timeout = this.config.inactivityTimeout;
|
|
757
1146
|
}
|
|
758
1147
|
|
|
@@ -857,6 +1246,336 @@ class MultiContextSession {
|
|
|
857
1246
|
}
|
|
858
1247
|
}
|
|
859
1248
|
|
|
1249
|
+
/**
|
|
1250
|
+
* Streaming session for LLM integration via `/ws/tts/stream`.
|
|
1251
|
+
*
|
|
1252
|
+
* The server accumulates text across multiple {@link send} calls and
|
|
1253
|
+
* auto-chunks it at sentence boundaries, keeping the KV cache warm between
|
|
1254
|
+
* chunks for natural prosody. You never need to call `flush` explicitly —
|
|
1255
|
+
* configure {@link StreamConfig.chunkLengthSchedule} or
|
|
1256
|
+
* {@link StreamConfig.autoMode} instead.
|
|
1257
|
+
*
|
|
1258
|
+
* @example
|
|
1259
|
+
* ```typescript
|
|
1260
|
+
* const session = client.tts.streamingSession({
|
|
1261
|
+
* voiceId: 123,
|
|
1262
|
+
* autoMode: true,
|
|
1263
|
+
* chunkLengthSchedule: [50, 100, 150, 250],
|
|
1264
|
+
* }, {
|
|
1265
|
+
* onChunk: (chunk) => playAudio(chunk.audio),
|
|
1266
|
+
* onSessionClosed: (totalSecs) => console.log(`Done: ${totalSecs}s`),
|
|
1267
|
+
* });
|
|
1268
|
+
*
|
|
1269
|
+
* await session.connect();
|
|
1270
|
+
*
|
|
1271
|
+
* for await (const token of llmStream) {
|
|
1272
|
+
* session.send(token);
|
|
1273
|
+
* }
|
|
1274
|
+
*
|
|
1275
|
+
* await session.close();
|
|
1276
|
+
* ```
|
|
1277
|
+
*/
|
|
1278
|
+
class StreamingSession {
|
|
1279
|
+
private ws: WebSocket | null = null;
|
|
1280
|
+
private config: StreamConfig;
|
|
1281
|
+
private callbacks: StreamingSessionCallbacks;
|
|
1282
|
+
private client: KugelAudio;
|
|
1283
|
+
private configSent = false;
|
|
1284
|
+
|
|
1285
|
+
constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks) {
|
|
1286
|
+
this.client = client;
|
|
1287
|
+
this.config = config;
|
|
1288
|
+
this.callbacks = callbacks;
|
|
1289
|
+
}
|
|
1290
|
+
|
|
1291
|
+
/**
|
|
1292
|
+
* Open the WebSocket connection and authenticate.
|
|
1293
|
+
*
|
|
1294
|
+
* The returned promise resolves once the WebSocket is OPEN, so callers can
|
|
1295
|
+
* ``await session.connect()`` and then ``send()`` without racing the
|
|
1296
|
+
* handshake. Pre-open errors (network failure, 4001 unauthorized, …) reject
|
|
1297
|
+
* the promise with the typed error.
|
|
1298
|
+
*/
|
|
1299
|
+
connect(): Promise<void> {
|
|
1300
|
+
const wsUrl = this.client.ttsUrl
|
|
1301
|
+
.replace('https://', 'wss://')
|
|
1302
|
+
.replace('http://', 'ws://');
|
|
1303
|
+
|
|
1304
|
+
let authParam: string;
|
|
1305
|
+
if (this.client.isToken) {
|
|
1306
|
+
authParam = 'token';
|
|
1307
|
+
} else if (this.client.isMasterKey) {
|
|
1308
|
+
authParam = 'master_key';
|
|
1309
|
+
} else {
|
|
1310
|
+
authParam = 'api_key';
|
|
1311
|
+
}
|
|
1312
|
+
|
|
1313
|
+
const url = `${wsUrl}/ws/tts/stream?${authParam}=${this.client.apiKey}`;
|
|
1314
|
+
this.ws = createWs(url);
|
|
1315
|
+
const ws = this.ws;
|
|
1316
|
+
|
|
1317
|
+
ws.onmessage = (event: { data: unknown }) => {
|
|
1318
|
+
try {
|
|
1319
|
+
const messageData = typeof event.data === 'string'
|
|
1320
|
+
? event.data
|
|
1321
|
+
: event.data instanceof Buffer
|
|
1322
|
+
? event.data.toString()
|
|
1323
|
+
: String(event.data);
|
|
1324
|
+
const data = JSON.parse(messageData);
|
|
1325
|
+
|
|
1326
|
+
if (data.error) {
|
|
1327
|
+
this.callbacks.onError?.(new KugelAudioError(data.error));
|
|
1328
|
+
return;
|
|
1329
|
+
}
|
|
1330
|
+
|
|
1331
|
+
if (data.audio) {
|
|
1332
|
+
const chunk: AudioChunk = {
|
|
1333
|
+
audio: data.audio,
|
|
1334
|
+
encoding: data.enc || 'pcm_s16le',
|
|
1335
|
+
index: data.idx,
|
|
1336
|
+
sampleRate: data.sr,
|
|
1337
|
+
samples: data.samples,
|
|
1338
|
+
};
|
|
1339
|
+
this.callbacks.onChunk?.(chunk);
|
|
1340
|
+
}
|
|
1341
|
+
|
|
1342
|
+
if (data.word_timestamps) {
|
|
1343
|
+
const timestamps = data.word_timestamps.map((w: Record<string, unknown>) => ({
|
|
1344
|
+
word: w.word as string,
|
|
1345
|
+
startMs: w.start_ms as number,
|
|
1346
|
+
endMs: w.end_ms as number,
|
|
1347
|
+
charStart: w.char_start as number,
|
|
1348
|
+
charEnd: w.char_end as number,
|
|
1349
|
+
score: (w.score as number) ?? 1.0,
|
|
1350
|
+
}));
|
|
1351
|
+
this.callbacks.onWordTimestamps?.(timestamps);
|
|
1352
|
+
}
|
|
1353
|
+
|
|
1354
|
+
if (data.chunk_complete) {
|
|
1355
|
+
this.callbacks.onChunkComplete?.(
|
|
1356
|
+
data.chunk_id ?? 0,
|
|
1357
|
+
data.audio_seconds ?? 0,
|
|
1358
|
+
data.gen_ms ?? 0,
|
|
1359
|
+
);
|
|
1360
|
+
}
|
|
1361
|
+
|
|
1362
|
+
if (data.generation_started) {
|
|
1363
|
+
this.callbacks.onGenerationStarted?.(data.chunk_id ?? 0, data.text ?? '');
|
|
1364
|
+
}
|
|
1365
|
+
|
|
1366
|
+
if (data.session_closed) {
|
|
1367
|
+
this.callbacks.onSessionClosed?.(
|
|
1368
|
+
data.total_audio_seconds ?? 0,
|
|
1369
|
+
data.total_text_chunks ?? 0,
|
|
1370
|
+
data.total_audio_chunks ?? 0,
|
|
1371
|
+
);
|
|
1372
|
+
}
|
|
1373
|
+
} catch (e) {
|
|
1374
|
+
console.error('[KugelAudio] Failed to parse streaming session message:', e);
|
|
1375
|
+
}
|
|
1376
|
+
};
|
|
1377
|
+
|
|
1378
|
+
return new Promise<void>((resolve, reject) => {
|
|
1379
|
+
let opened = false;
|
|
1380
|
+
|
|
1381
|
+
ws.onopen = () => {
|
|
1382
|
+
opened = true;
|
|
1383
|
+
resolve();
|
|
1384
|
+
};
|
|
1385
|
+
|
|
1386
|
+
ws.onerror = (event: unknown) => {
|
|
1387
|
+
const underlying = (event as { error?: unknown } | null)?.error ?? event;
|
|
1388
|
+
const err =
|
|
1389
|
+
classifyWsHandshakeError(underlying) ??
|
|
1390
|
+
new ConnectionError(
|
|
1391
|
+
'KugelAudio streaming WebSocket connection error. ' +
|
|
1392
|
+
'Check network connectivity.',
|
|
1393
|
+
);
|
|
1394
|
+
if (!opened) reject(err);
|
|
1395
|
+
this.callbacks.onError?.(err);
|
|
1396
|
+
};
|
|
1397
|
+
|
|
1398
|
+
ws.onclose = (event) => {
|
|
1399
|
+
let typedErr: KugelAudioError | null = null;
|
|
1400
|
+
if (
|
|
1401
|
+
event.code === 4001 ||
|
|
1402
|
+
event.code === 4003 ||
|
|
1403
|
+
event.code === 4029 ||
|
|
1404
|
+
event.code === 4500
|
|
1405
|
+
) {
|
|
1406
|
+
typedErr = classifyWsClose(event.code, event.reason);
|
|
1407
|
+
this.callbacks.onError?.(typedErr);
|
|
1408
|
+
}
|
|
1409
|
+
if (!opened) {
|
|
1410
|
+
reject(
|
|
1411
|
+
typedErr ??
|
|
1412
|
+
new ConnectionError(
|
|
1413
|
+
`KugelAudio streaming WebSocket closed before ready ` +
|
|
1414
|
+
`(code ${event.code}).`,
|
|
1415
|
+
),
|
|
1416
|
+
);
|
|
1417
|
+
}
|
|
1418
|
+
this.ws = null;
|
|
1419
|
+
this.configSent = false;
|
|
1420
|
+
};
|
|
1421
|
+
});
|
|
1422
|
+
}
|
|
1423
|
+
|
|
1424
|
+
/**
|
|
1425
|
+
* Send a text chunk to the server (e.g. one LLM output token).
|
|
1426
|
+
*
|
|
1427
|
+
* The server buffers text across multiple calls and starts generating at
|
|
1428
|
+
* natural sentence boundaries automatically — no need to call `flush`.
|
|
1429
|
+
*
|
|
1430
|
+
* @param text - Raw text or LLM token to append to the server buffer.
|
|
1431
|
+
* @param flush - Force immediate generation of whatever is buffered.
|
|
1432
|
+
* **Avoid calling this per-sentence from the client.** Doing so bypasses
|
|
1433
|
+
* the server's semantic chunking, incurs a fresh model prefill cost on
|
|
1434
|
+
* every flush, and makes latency *worse*, not better. Let the server
|
|
1435
|
+
* handle chunking via `chunkLengthSchedule` / `autoMode` instead.
|
|
1436
|
+
*/
|
|
1437
|
+
send(text: string, flush = false): void {
|
|
1438
|
+
if (!this.ws || this.ws.readyState !== WS_OPEN) {
|
|
1439
|
+
throw new KugelAudioError('StreamingSession not connected. Call connect() first.');
|
|
1440
|
+
}
|
|
1441
|
+
|
|
1442
|
+
const msg: Record<string, unknown> = { text, flush };
|
|
1443
|
+
|
|
1444
|
+
if (!this.configSent) {
|
|
1445
|
+
if (this.config.voiceId !== undefined) msg.voice_id = this.config.voiceId;
|
|
1446
|
+
if (this.config.modelId !== undefined) msg.model_id = this.config.modelId;
|
|
1447
|
+
if (this.config.cfgScale !== undefined) msg.cfg_scale = this.config.cfgScale;
|
|
1448
|
+
if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
|
|
1449
|
+
if (this.config.maxNewTokens !== undefined) msg.max_new_tokens = this.config.maxNewTokens;
|
|
1450
|
+
if (this.config.sampleRate !== undefined) msg.sample_rate = this.config.sampleRate;
|
|
1451
|
+
if (this.config.flushTimeoutMs !== undefined) msg.flush_timeout_ms = this.config.flushTimeoutMs;
|
|
1452
|
+
if (this.config.maxBufferLength !== undefined) msg.max_buffer_length = this.config.maxBufferLength;
|
|
1453
|
+
if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
|
|
1454
|
+
if (this.config.language !== undefined) msg.language = this.config.language;
|
|
1455
|
+
if (this.config.wordTimestamps) msg.word_timestamps = true;
|
|
1456
|
+
if (this.config.autoMode !== undefined) msg.auto_mode = this.config.autoMode;
|
|
1457
|
+
if (this.config.chunkLengthSchedule?.length) msg.chunk_length_schedule = this.config.chunkLengthSchedule;
|
|
1458
|
+
if (this.config.speed !== undefined) msg.speed = this.config.speed;
|
|
1459
|
+
this.configSent = true;
|
|
1460
|
+
}
|
|
1461
|
+
|
|
1462
|
+
this.ws.send(JSON.stringify(msg));
|
|
1463
|
+
}
|
|
1464
|
+
|
|
1465
|
+
/**
|
|
1466
|
+
* End the current session but keep the WebSocket connection open.
|
|
1467
|
+
*
|
|
1468
|
+
* This allows starting a new session on the same connection, avoiding
|
|
1469
|
+
* the overhead of a new WebSocket handshake (~200-300ms). After calling
|
|
1470
|
+
* this, optionally call {@link updateConfig} to change voice/model settings,
|
|
1471
|
+
* then call {@link send} to start the next session.
|
|
1472
|
+
*
|
|
1473
|
+
* The returned promise resolves once the server confirms with a
|
|
1474
|
+
* `session_closed` message, or after a 15 s **quiet** timeout — i.e. 15 s
|
|
1475
|
+
* elapse without *any* server message arriving. The timer resets on every
|
|
1476
|
+
* incoming frame so a long final flush that streams audio for tens of
|
|
1477
|
+
* seconds is not truncated; only a genuinely silent server trips the fuse.
|
|
1478
|
+
*/
|
|
1479
|
+
endSession(): Promise<void> {
|
|
1480
|
+
if (!this.ws || this.ws.readyState !== WS_OPEN) return Promise.resolve();
|
|
1481
|
+
|
|
1482
|
+
const ws = this.ws;
|
|
1483
|
+
// Quiet timeout: resets on every incoming server message. Trips only when
|
|
1484
|
+
// the server has been silent for this long. The previous wall-clock fuse
|
|
1485
|
+
// (10 s total) silently truncated audio when the final flushed chunk
|
|
1486
|
+
// took longer to generate than the budget — see fix in this commit.
|
|
1487
|
+
const QUIET_TIMEOUT_MS = 15_000;
|
|
1488
|
+
|
|
1489
|
+
return new Promise<void>((resolve) => {
|
|
1490
|
+
let settled = false;
|
|
1491
|
+
let timer: ReturnType<typeof setTimeout>;
|
|
1492
|
+
|
|
1493
|
+
const prevMessage = ws.onmessage;
|
|
1494
|
+
const prevClose = ws.onclose;
|
|
1495
|
+
|
|
1496
|
+
const done = () => {
|
|
1497
|
+
if (settled) return;
|
|
1498
|
+
settled = true;
|
|
1499
|
+
clearTimeout(timer);
|
|
1500
|
+
// Restore the original handlers so subsequent endSession() calls
|
|
1501
|
+
// don't stack wrappers and so the typed-error onclose installed
|
|
1502
|
+
// by connect() remains in effect for the next session.
|
|
1503
|
+
ws.onmessage = prevMessage;
|
|
1504
|
+
ws.onclose = prevClose;
|
|
1505
|
+
this.configSent = false;
|
|
1506
|
+
resolve();
|
|
1507
|
+
};
|
|
1508
|
+
|
|
1509
|
+
const armQuietTimer = () => {
|
|
1510
|
+
clearTimeout(timer);
|
|
1511
|
+
timer = setTimeout(done, QUIET_TIMEOUT_MS);
|
|
1512
|
+
};
|
|
1513
|
+
|
|
1514
|
+
armQuietTimer();
|
|
1515
|
+
|
|
1516
|
+
ws.onmessage = (event: MessageEvent) => {
|
|
1517
|
+
// Reset the quiet timer on EVERY incoming frame — audio chunks for
|
|
1518
|
+
// the final flush count as liveness, not just session_closed.
|
|
1519
|
+
armQuietTimer();
|
|
1520
|
+
if (prevMessage) prevMessage.call(ws, event);
|
|
1521
|
+
try {
|
|
1522
|
+
const raw = typeof event.data === 'string'
|
|
1523
|
+
? event.data
|
|
1524
|
+
: event.data instanceof Buffer
|
|
1525
|
+
? event.data.toString()
|
|
1526
|
+
: String(event.data);
|
|
1527
|
+
if (JSON.parse(raw).session_closed) done();
|
|
1528
|
+
} catch { /* ignore parse errors */ }
|
|
1529
|
+
};
|
|
1530
|
+
|
|
1531
|
+
ws.onclose = (event: CloseEvent) => {
|
|
1532
|
+
this.ws = null;
|
|
1533
|
+
if (prevClose) prevClose.call(ws, event);
|
|
1534
|
+
done();
|
|
1535
|
+
};
|
|
1536
|
+
|
|
1537
|
+
ws.send(JSON.stringify({ close: true }));
|
|
1538
|
+
});
|
|
1539
|
+
}
|
|
1540
|
+
|
|
1541
|
+
/**
|
|
1542
|
+
* Update session configuration for the next session.
|
|
1543
|
+
*
|
|
1544
|
+
* Call this after {@link endSession} and before the next {@link send}
|
|
1545
|
+
* to change voice, model, language, or other settings.
|
|
1546
|
+
*/
|
|
1547
|
+
updateConfig(config: Partial<StreamConfig>): void {
|
|
1548
|
+
Object.assign(this.config, config);
|
|
1549
|
+
this.configSent = false;
|
|
1550
|
+
}
|
|
1551
|
+
|
|
1552
|
+
/**
|
|
1553
|
+
* Close the session and the WebSocket connection.
|
|
1554
|
+
*
|
|
1555
|
+
* For session reuse without closing the connection, use
|
|
1556
|
+
* {@link endSession} instead.
|
|
1557
|
+
*
|
|
1558
|
+
* The returned promise resolves once the server confirms the close with a
|
|
1559
|
+
* `session_closed` message, or after a 15 s **quiet** timeout (no traffic
|
|
1560
|
+
* from the server in that window). Audio frames from the server-side
|
|
1561
|
+
* final-flush of the still-buffered text are delivered to your callbacks
|
|
1562
|
+
* before this promise resolves, and each frame resets the quiet timer.
|
|
1563
|
+
*/
|
|
1564
|
+
async close(): Promise<void> {
|
|
1565
|
+
await this.endSession();
|
|
1566
|
+
|
|
1567
|
+
if (this.ws) {
|
|
1568
|
+
try { this.ws.close(); } catch { /* already closed */ }
|
|
1569
|
+
this.ws = null;
|
|
1570
|
+
}
|
|
1571
|
+
}
|
|
1572
|
+
|
|
1573
|
+
/** Whether the underlying WebSocket is open. */
|
|
1574
|
+
get isConnected(): boolean {
|
|
1575
|
+
return this.ws !== null && this.ws.readyState === WS_OPEN;
|
|
1576
|
+
}
|
|
1577
|
+
}
|
|
1578
|
+
|
|
860
1579
|
/**
|
|
861
1580
|
* KugelAudio API client.
|
|
862
1581
|
*
|
|
@@ -870,13 +1589,13 @@ class MultiContextSession {
|
|
|
870
1589
|
* // List voices
|
|
871
1590
|
* const voices = await client.voices.list();
|
|
872
1591
|
*
|
|
873
|
-
* // Generate audio with fast model
|
|
1592
|
+
* // Generate audio with fast model
|
|
874
1593
|
* const audio = await client.tts.generate({
|
|
875
1594
|
* text: 'Hello, world!',
|
|
876
1595
|
* modelId: 'kugel-1-turbo',
|
|
877
1596
|
* });
|
|
878
1597
|
*
|
|
879
|
-
* // Generate audio with premium model
|
|
1598
|
+
* // Generate audio with premium model
|
|
880
1599
|
* const audio = await client.tts.generate({
|
|
881
1600
|
* text: 'Hello, world!',
|
|
882
1601
|
* modelId: 'kugel-1',
|
|
@@ -891,6 +1610,7 @@ export class KugelAudio {
|
|
|
891
1610
|
private _apiUrl: string;
|
|
892
1611
|
private _ttsUrl: string;
|
|
893
1612
|
private _timeout: number;
|
|
1613
|
+
private _keepalivePingInterval: number | null;
|
|
894
1614
|
|
|
895
1615
|
/** Models resource */
|
|
896
1616
|
public readonly models: ModelsResource;
|
|
@@ -901,17 +1621,37 @@ export class KugelAudio {
|
|
|
901
1621
|
|
|
902
1622
|
constructor(options: KugelAudioOptions) {
|
|
903
1623
|
if (!options.apiKey) {
|
|
904
|
-
throw new
|
|
1624
|
+
throw new ValidationError(
|
|
1625
|
+
'KugelAudio API key is missing. Set the KUGELAUDIO_API_KEY ' +
|
|
1626
|
+
'environment variable or pass { apiKey: ... } to the client. ' +
|
|
1627
|
+
'Get a key at https://app.kugelaudio.com/settings/api-keys.',
|
|
1628
|
+
);
|
|
905
1629
|
}
|
|
906
1630
|
|
|
907
|
-
|
|
1631
|
+
const { cleanKey, detectedRegion } = parseApiKey(options.apiKey);
|
|
1632
|
+
this._apiKey = cleanKey;
|
|
908
1633
|
this._isMasterKey = options.isMasterKey || false;
|
|
909
1634
|
this._isToken = options.isToken || false;
|
|
910
1635
|
this._orgId = options.orgId;
|
|
911
|
-
|
|
1636
|
+
|
|
1637
|
+
if (options.apiUrl) {
|
|
1638
|
+
this._apiUrl = options.apiUrl.replace(/\/$/, '');
|
|
1639
|
+
} else {
|
|
1640
|
+
const effectiveRegion = options.region || detectedRegion || 'eu';
|
|
1641
|
+
if (!(effectiveRegion in REGION_URLS)) {
|
|
1642
|
+
throw new ValidationError(
|
|
1643
|
+
`Invalid region '${effectiveRegion}'. Must be one of: ${Object.keys(REGION_URLS).join(', ')}.`,
|
|
1644
|
+
);
|
|
1645
|
+
}
|
|
1646
|
+
this._apiUrl = REGION_URLS[effectiveRegion as Region];
|
|
1647
|
+
}
|
|
1648
|
+
|
|
912
1649
|
// If ttsUrl not specified, use apiUrl (backend proxies to TTS server)
|
|
913
1650
|
this._ttsUrl = (options.ttsUrl || this._apiUrl).replace(/\/$/, '');
|
|
914
1651
|
this._timeout = options.timeout || 60000;
|
|
1652
|
+
this._keepalivePingInterval = options.keepalivePingInterval !== undefined
|
|
1653
|
+
? options.keepalivePingInterval
|
|
1654
|
+
: 20000;
|
|
915
1655
|
|
|
916
1656
|
this.models = new ModelsResource(this);
|
|
917
1657
|
this.voices = new VoicesResource(this);
|
|
@@ -965,6 +1705,11 @@ export class KugelAudio {
|
|
|
965
1705
|
return this._ttsUrl;
|
|
966
1706
|
}
|
|
967
1707
|
|
|
1708
|
+
/**
 * Keepalive ping interval in milliseconds; `null` when keepalive pings are disabled.
 */
get keepalivePingInterval(): number | null {
  const { _keepalivePingInterval: interval } = this;
  return interval;
}
|
|
1712
|
+
|
|
968
1713
|
/**
|
|
969
1714
|
* Close the client and release resources.
|
|
970
1715
|
* This closes any pooled WebSocket connections.
|
|
@@ -1027,25 +1772,57 @@ export class KugelAudio {
|
|
|
1027
1772
|
|
|
1028
1773
|
clearTimeout(timeoutId);
|
|
1029
1774
|
|
|
1030
|
-
if (response.
|
|
1031
|
-
|
|
1775
|
+
if (!response.ok) {
|
|
1776
|
+
const text = await response.text();
|
|
1777
|
+
throw classifyHttpError(response.status, text, response.headers);
|
|
1032
1778
|
}
|
|
1033
|
-
|
|
1034
|
-
|
|
1779
|
+
|
|
1780
|
+
return await response.json();
|
|
1781
|
+
} catch (error) {
|
|
1782
|
+
clearTimeout(timeoutId);
|
|
1783
|
+
if (error instanceof KugelAudioError) {
|
|
1784
|
+
throw error;
|
|
1035
1785
|
}
|
|
1036
|
-
if (
|
|
1037
|
-
throw new
|
|
1786
|
+
if ((error as Error).name === 'AbortError') {
|
|
1787
|
+
throw new ConnectionError(
|
|
1788
|
+
`Request to ${method} ${path} timed out after ${this._timeout}ms.`,
|
|
1789
|
+
);
|
|
1038
1790
|
}
|
|
1791
|
+
throw new ConnectionError(
|
|
1792
|
+
`Could not reach KugelAudio at ${url}: ${(error as Error).message}. ` +
|
|
1793
|
+
'Check network connectivity.',
|
|
1794
|
+
);
|
|
1795
|
+
}
|
|
1796
|
+
}
|
|
1797
|
+
|
|
1798
|
+
/**
|
|
1799
|
+
* Make a multipart/form-data request (for file uploads).
|
|
1800
|
+
* @internal Used by VoicesResource for reference file uploads.
|
|
1801
|
+
*/
|
|
1802
|
+
async requestMultipart<T>(method: string, path: string, formData: FormData): Promise<T> {
|
|
1803
|
+
const url = `${this._apiUrl}${path}`;
|
|
1804
|
+
|
|
1805
|
+
const headers: Record<string, string> = {
|
|
1806
|
+
'X-API-Key': this._apiKey,
|
|
1807
|
+
'Authorization': `Bearer ${this._apiKey}`,
|
|
1808
|
+
};
|
|
1809
|
+
|
|
1810
|
+
const controller = new AbortController();
|
|
1811
|
+
const timeoutId = setTimeout(() => controller.abort(), this._timeout);
|
|
1812
|
+
|
|
1813
|
+
try {
|
|
1814
|
+
const response = await fetch(url, {
|
|
1815
|
+
method,
|
|
1816
|
+
headers,
|
|
1817
|
+
body: formData,
|
|
1818
|
+
signal: controller.signal,
|
|
1819
|
+
});
|
|
1820
|
+
|
|
1821
|
+
clearTimeout(timeoutId);
|
|
1822
|
+
|
|
1039
1823
|
if (!response.ok) {
|
|
1040
1824
|
const text = await response.text();
|
|
1041
|
-
|
|
1042
|
-
try {
|
|
1043
|
-
const json = JSON.parse(text);
|
|
1044
|
-
message = json.detail || json.error || message;
|
|
1045
|
-
} catch {
|
|
1046
|
-
message = text || message;
|
|
1047
|
-
}
|
|
1048
|
-
throw new KugelAudioError(message, response.status);
|
|
1825
|
+
throw classifyHttpError(response.status, text, response.headers);
|
|
1049
1826
|
}
|
|
1050
1827
|
|
|
1051
1828
|
return await response.json();
|
|
@@ -1055,9 +1832,14 @@ export class KugelAudio {
|
|
|
1055
1832
|
throw error;
|
|
1056
1833
|
}
|
|
1057
1834
|
if ((error as Error).name === 'AbortError') {
|
|
1058
|
-
throw new
|
|
1835
|
+
throw new ConnectionError(
|
|
1836
|
+
`Request to ${method} ${path} timed out after ${this._timeout}ms.`,
|
|
1837
|
+
);
|
|
1059
1838
|
}
|
|
1060
|
-
throw new
|
|
1839
|
+
throw new ConnectionError(
|
|
1840
|
+
`Could not reach KugelAudio at ${url}: ${(error as Error).message}. ` +
|
|
1841
|
+
'Check network connectivity.',
|
|
1842
|
+
);
|
|
1061
1843
|
}
|
|
1062
1844
|
}
|
|
1063
1845
|
}
|