kugelaudio 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -13
- package/dist/index.d.mts +550 -26
- package/dist/index.d.ts +550 -26
- package/dist/index.js +898 -113
- package/dist/index.mjs +892 -113
- package/package.json +9 -8
- package/src/client.test.ts +548 -0
- package/src/client.ts +921 -103
- package/src/errors.ts +266 -18
- package/src/index.ts +19 -3
- package/src/types.ts +248 -8
- package/src/websocket.ts +38 -18
package/src/client.ts
CHANGED
|
@@ -3,25 +3,52 @@
|
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
5
|
import {
|
|
6
|
-
|
|
7
|
-
InsufficientCreditsError,
|
|
6
|
+
ConnectionError,
|
|
8
7
|
KugelAudioError,
|
|
9
|
-
|
|
8
|
+
ValidationError,
|
|
9
|
+
classifyHttpError,
|
|
10
|
+
classifyWsClose,
|
|
11
|
+
classifyWsFrame,
|
|
12
|
+
classifyWsHandshakeError,
|
|
10
13
|
} from './errors';
|
|
11
14
|
import type {
|
|
12
15
|
AudioChunk,
|
|
13
16
|
AudioResponse,
|
|
17
|
+
CreateVoiceOptions,
|
|
14
18
|
GenerateOptions,
|
|
15
19
|
GenerationStats,
|
|
16
20
|
KugelAudioOptions,
|
|
17
21
|
Model,
|
|
18
22
|
StreamCallbacks,
|
|
19
|
-
|
|
23
|
+
StreamConfig,
|
|
24
|
+
StreamingSessionCallbacks,
|
|
25
|
+
UpdateVoiceOptions,
|
|
26
|
+
VoiceDetail,
|
|
27
|
+
VoiceListResponse,
|
|
28
|
+
VoiceReference,
|
|
29
|
+
WordTimestamp
|
|
20
30
|
} from './types';
|
|
21
31
|
import { base64ToArrayBuffer } from './utils';
|
|
22
32
|
import { getWebSocket } from './websocket';
|
|
23
33
|
|
|
24
|
-
|
|
34
|
+
import type { Region } from './types';
|
|
35
|
+
|
|
36
|
+
const REGION_URLS: Record<Region, string> = {
|
|
37
|
+
eu: 'https://api.kugelaudio.com',
|
|
38
|
+
us: 'https://us-api.kugelaudio.com',
|
|
39
|
+
global: 'https://global-api.kugelaudio.com',
|
|
40
|
+
};
|
|
41
|
+
|
|
42
|
+
const REGION_PREFIXES = ['eu-', 'us-', 'global-'] as const;
|
|
43
|
+
|
|
44
|
+
function parseApiKey(apiKey: string): { cleanKey: string; detectedRegion?: Region } {
|
|
45
|
+
for (const prefix of REGION_PREFIXES) {
|
|
46
|
+
if (apiKey.startsWith(prefix)) {
|
|
47
|
+
return { cleanKey: apiKey.slice(prefix.length), detectedRegion: prefix.slice(0, -1) as Region };
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
return { cleanKey: apiKey };
|
|
51
|
+
}
|
|
25
52
|
|
|
26
53
|
/**
|
|
27
54
|
* Create a new WebSocket instance.
|
|
@@ -36,6 +63,23 @@ function createWs(url: string): WebSocket {
|
|
|
36
63
|
/** WebSocket OPEN readyState constant. */
|
|
37
64
|
const WS_OPEN = 1;
|
|
38
65
|
|
|
66
|
+
let _languageWarningLogged = false;
|
|
67
|
+
|
|
68
|
+
function warnIfNoLanguage(
|
|
69
|
+
language: string | undefined,
|
|
70
|
+
normalize: boolean | undefined
|
|
71
|
+
): void {
|
|
72
|
+
const normEnabled = normalize === undefined || normalize;
|
|
73
|
+
if (!language && normEnabled && !_languageWarningLogged) {
|
|
74
|
+
_languageWarningLogged = true;
|
|
75
|
+
console.warn(
|
|
76
|
+
"[KugelAudio] No 'language' set with normalization enabled — the server " +
|
|
77
|
+
'will auto-detect the language, adding ~60-150ms to TTFA. Set language ' +
|
|
78
|
+
"(e.g., language: 'en') for optimal latency."
|
|
79
|
+
);
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
39
83
|
/**
|
|
40
84
|
* Models resource for listing TTS models.
|
|
41
85
|
*/
|
|
@@ -71,52 +115,211 @@ class VoicesResource {
|
|
|
71
115
|
language?: string;
|
|
72
116
|
includePublic?: boolean;
|
|
73
117
|
limit?: number;
|
|
74
|
-
|
|
118
|
+
offset?: number;
|
|
119
|
+
}): Promise<VoiceListResponse> {
|
|
75
120
|
const params = new URLSearchParams();
|
|
76
121
|
if (options?.language) params.set('language', options.language);
|
|
77
122
|
if (options?.includePublic !== undefined) {
|
|
78
123
|
params.set('include_public', String(options.includePublic));
|
|
79
124
|
}
|
|
80
125
|
if (options?.limit) params.set('limit', String(options.limit));
|
|
126
|
+
if (options?.offset) params.set('offset', String(options.offset));
|
|
81
127
|
|
|
82
128
|
const query = params.toString();
|
|
83
129
|
const path = query ? `/v1/voices?${query}` : '/v1/voices';
|
|
84
|
-
const response = await this.client.request<{ voices: any[] }>('GET', path);
|
|
130
|
+
const response = await this.client.request<{ voices: any[]; total: number; limit: number; offset: number }>('GET', path);
|
|
85
131
|
|
|
86
|
-
return
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
132
|
+
return {
|
|
133
|
+
voices: response.voices.map((v) => ({
|
|
134
|
+
id: v.id,
|
|
135
|
+
name: v.name,
|
|
136
|
+
description: v.description,
|
|
137
|
+
category: v.category,
|
|
138
|
+
sex: v.sex,
|
|
139
|
+
age: v.age,
|
|
140
|
+
supportedLanguages: v.supported_languages || [],
|
|
141
|
+
sampleText: v.sample_text,
|
|
142
|
+
avatarUrl: v.avatar_url,
|
|
143
|
+
sampleUrl: v.sample_url,
|
|
144
|
+
isPublic: v.is_public || false,
|
|
145
|
+
verified: v.verified || false,
|
|
146
|
+
})),
|
|
147
|
+
total: response.total,
|
|
148
|
+
limit: response.limit,
|
|
149
|
+
offset: response.offset,
|
|
150
|
+
};
|
|
100
151
|
}
|
|
101
152
|
|
|
102
153
|
/**
|
|
103
154
|
* Get a specific voice by ID.
|
|
104
155
|
*/
|
|
105
|
-
async get(voiceId: number): Promise<
|
|
156
|
+
async get(voiceId: number): Promise<VoiceDetail> {
|
|
106
157
|
const v = await this.client.request<any>('GET', `/v1/voices/${voiceId}`);
|
|
158
|
+
return this.mapVoiceDetail(v);
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
/**
|
|
162
|
+
* Create a new voice.
|
|
163
|
+
*/
|
|
164
|
+
async create(options: CreateVoiceOptions): Promise<VoiceDetail> {
|
|
165
|
+
const metadata = {
|
|
166
|
+
name: options.name,
|
|
167
|
+
sex: options.sex,
|
|
168
|
+
description: options.description ?? '',
|
|
169
|
+
category: options.category ?? 'conversational',
|
|
170
|
+
age: options.age ?? 'middle_age',
|
|
171
|
+
quality: options.quality ?? 'mid',
|
|
172
|
+
supported_languages: options.supportedLanguages ?? ['en'],
|
|
173
|
+
is_public: options.isPublic ?? false,
|
|
174
|
+
sample_text: options.sampleText ?? '',
|
|
175
|
+
};
|
|
176
|
+
|
|
177
|
+
const formData = new FormData();
|
|
178
|
+
formData.append(
|
|
179
|
+
'metadata',
|
|
180
|
+
new Blob([JSON.stringify(metadata)], { type: 'application/json' }),
|
|
181
|
+
);
|
|
182
|
+
|
|
183
|
+
if (options.referenceFiles) {
|
|
184
|
+
for (const file of options.referenceFiles) {
|
|
185
|
+
formData.append('files', file);
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
const v = await this.client.requestMultipart<any>('POST', '/v1/voices', formData);
|
|
190
|
+
return this.mapVoiceDetail(v);
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
/**
|
|
194
|
+
* Update an existing voice. Only provided fields are updated.
|
|
195
|
+
*/
|
|
196
|
+
async update(voiceId: number, options: UpdateVoiceOptions): Promise<VoiceDetail> {
|
|
197
|
+
const payload: Record<string, unknown> = {};
|
|
198
|
+
if (options.name !== undefined) payload.name = options.name;
|
|
199
|
+
if (options.description !== undefined) payload.description = options.description;
|
|
200
|
+
if (options.category !== undefined) payload.category = options.category;
|
|
201
|
+
if (options.age !== undefined) payload.age = options.age;
|
|
202
|
+
if (options.sex !== undefined) payload.sex = options.sex;
|
|
203
|
+
if (options.quality !== undefined) payload.quality = options.quality;
|
|
204
|
+
if (options.supportedLanguages !== undefined) payload.supported_languages = options.supportedLanguages;
|
|
205
|
+
if (options.isPublic !== undefined) payload.is_public = options.isPublic;
|
|
206
|
+
if (options.sampleText !== undefined) payload.sample_text = options.sampleText;
|
|
207
|
+
|
|
208
|
+
const v = await this.client.request<any>('PATCH', `/v1/voices/${voiceId}`, payload);
|
|
209
|
+
return this.mapVoiceDetail(v);
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
/**
|
|
213
|
+
* Delete a voice.
|
|
214
|
+
*/
|
|
215
|
+
async delete(voiceId: number): Promise<void> {
|
|
216
|
+
await this.client.request<any>('DELETE', `/v1/voices/${voiceId}`);
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
// -- Reference management --
|
|
220
|
+
|
|
221
|
+
/**
|
|
222
|
+
* List reference audio files for a voice.
|
|
223
|
+
*/
|
|
224
|
+
async listReferences(voiceId: number): Promise<VoiceReference[]> {
|
|
225
|
+
const response = await this.client.request<{ references: any[] }>(
|
|
226
|
+
'GET',
|
|
227
|
+
`/v1/voices/${voiceId}/references`,
|
|
228
|
+
);
|
|
229
|
+
return response.references.map((r) => this.mapVoiceReference(r));
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
/**
|
|
233
|
+
* Upload a reference audio file to a voice.
|
|
234
|
+
*
|
|
235
|
+
* @param voiceId - Voice ID
|
|
236
|
+
* @param file - Audio file (File in browser, Blob in Node.js)
|
|
237
|
+
* @param referenceText - Optional transcript of the reference audio
|
|
238
|
+
*/
|
|
239
|
+
async addReference(
|
|
240
|
+
voiceId: number,
|
|
241
|
+
file: File | Blob,
|
|
242
|
+
referenceText?: string,
|
|
243
|
+
): Promise<VoiceReference> {
|
|
244
|
+
const formData = new FormData();
|
|
245
|
+
formData.append('file', file);
|
|
246
|
+
if (referenceText) {
|
|
247
|
+
formData.append('reference_text', referenceText);
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
const r = await this.client.requestMultipart<any>(
|
|
251
|
+
'POST',
|
|
252
|
+
`/v1/voices/${voiceId}/references`,
|
|
253
|
+
formData,
|
|
254
|
+
);
|
|
255
|
+
return this.mapVoiceReference(r);
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
/**
|
|
259
|
+
* Delete a reference audio file from a voice.
|
|
260
|
+
*/
|
|
261
|
+
async deleteReference(voiceId: number, referenceId: number): Promise<void> {
|
|
262
|
+
await this.client.request<any>(
|
|
263
|
+
'DELETE',
|
|
264
|
+
`/v1/voices/${voiceId}/references/${referenceId}`,
|
|
265
|
+
);
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
// -- Publishing --
|
|
269
|
+
|
|
270
|
+
/**
|
|
271
|
+
* Request publication of a voice. Sets it as public and marks it
|
|
272
|
+
* as pending verification by an admin.
|
|
273
|
+
*/
|
|
274
|
+
async publish(voiceId: number): Promise<VoiceDetail> {
|
|
275
|
+
const v = await this.client.request<any>('POST', `/v1/voices/${voiceId}/publish`);
|
|
276
|
+
return this.mapVoiceDetail(v);
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
// -- Sample generation --
|
|
280
|
+
|
|
281
|
+
/**
|
|
282
|
+
* Trigger sample audio generation for a voice.
|
|
283
|
+
*/
|
|
284
|
+
async generateSample(voiceId: number): Promise<VoiceDetail> {
|
|
285
|
+
const v = await this.client.request<any>(
|
|
286
|
+
'POST',
|
|
287
|
+
`/v1/voices/${voiceId}/generate-sample`,
|
|
288
|
+
);
|
|
289
|
+
return this.mapVoiceDetail(v);
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
// -- Helpers --
|
|
293
|
+
|
|
294
|
+
private mapVoiceDetail(v: any): VoiceDetail {
|
|
107
295
|
return {
|
|
108
296
|
id: v.id,
|
|
109
297
|
name: v.name,
|
|
110
|
-
description: v.description,
|
|
111
|
-
|
|
112
|
-
|
|
298
|
+
description: v.description ?? '',
|
|
299
|
+
generativeVoiceDescription: v.generative_voice_description ?? '',
|
|
300
|
+
supportedLanguages: v.supported_languages ?? [],
|
|
301
|
+
category: v.category ?? 'cloned',
|
|
113
302
|
age: v.age,
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
303
|
+
sex: v.sex,
|
|
304
|
+
quality: v.quality ?? 'mid',
|
|
305
|
+
isPublic: v.is_public ?? false,
|
|
306
|
+
verified: v.verified ?? false,
|
|
307
|
+
pendingVerification: v.pending_verification ?? false,
|
|
117
308
|
sampleUrl: v.sample_url,
|
|
118
|
-
|
|
119
|
-
|
|
309
|
+
avatarUrl: v.avatar_url,
|
|
310
|
+
sampleText: v.sample_text ?? '',
|
|
311
|
+
};
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
private mapVoiceReference(r: any): VoiceReference {
|
|
315
|
+
return {
|
|
316
|
+
id: r.id,
|
|
317
|
+
voiceId: r.voice_id,
|
|
318
|
+
name: r.name ?? '',
|
|
319
|
+
referenceText: r.reference_text ?? '',
|
|
320
|
+
s3Path: r.s3_path ?? '',
|
|
321
|
+
audioUrl: r.audio_url,
|
|
322
|
+
isGenerated: r.is_generated ?? false,
|
|
120
323
|
};
|
|
121
324
|
}
|
|
122
325
|
}
|
|
@@ -134,6 +337,7 @@ class TTSResource {
|
|
|
134
337
|
reject: (error: Error) => void;
|
|
135
338
|
}> = new Map();
|
|
136
339
|
private requestCounter = 0;
|
|
340
|
+
private keepaliveTimer: ReturnType<typeof setInterval> | null = null;
|
|
137
341
|
|
|
138
342
|
constructor(private client: KugelAudio) {}
|
|
139
343
|
|
|
@@ -172,11 +376,15 @@ class TTSResource {
|
|
|
172
376
|
async generate(options: GenerateOptions): Promise<AudioResponse> {
|
|
173
377
|
const chunks: ArrayBuffer[] = [];
|
|
174
378
|
let finalStats: GenerationStats | undefined;
|
|
379
|
+
const allTimestamps: WordTimestamp[] = [];
|
|
175
380
|
|
|
176
381
|
await this.stream(options, {
|
|
177
382
|
onChunk: (chunk) => {
|
|
178
383
|
chunks.push(base64ToArrayBuffer(chunk.audio));
|
|
179
384
|
},
|
|
385
|
+
onWordTimestamps: (timestamps) => {
|
|
386
|
+
allTimestamps.push(...timestamps);
|
|
387
|
+
},
|
|
180
388
|
onFinal: (stats) => {
|
|
181
389
|
finalStats = stats;
|
|
182
390
|
},
|
|
@@ -198,9 +406,71 @@ class TTSResource {
|
|
|
198
406
|
durationMs: finalStats ? finalStats.durationMs : 0,
|
|
199
407
|
generationMs: finalStats ? finalStats.generationMs : 0,
|
|
200
408
|
rtf: finalStats ? finalStats.rtf : 0,
|
|
409
|
+
wordTimestamps: allTimestamps,
|
|
201
410
|
};
|
|
202
411
|
}
|
|
203
412
|
|
|
413
|
+
/**
|
|
414
|
+
* Stream audio and return a Node.js Readable stream of raw PCM16 binary data.
|
|
415
|
+
*
|
|
416
|
+
* **Node.js only** — this method requires the `stream` built-in module and is
|
|
417
|
+
* intended for server-side integrations such as Vapi custom TTS endpoints,
|
|
418
|
+
* Express/Fastify handlers, or any pipeline that expects a Node.js `Readable`.
|
|
419
|
+
*
|
|
420
|
+
* Compared to manually wiring `onChunk` to a `Readable`, this method avoids
|
|
421
|
+
* a common race-condition: the stream object is created and returned **before**
|
|
422
|
+
* any chunks arrive, so the caller can safely pipe or attach listeners before
|
|
423
|
+
* the first audio byte is pushed.
|
|
424
|
+
*
|
|
425
|
+
* @example Vapi custom TTS endpoint
|
|
426
|
+
* ```typescript
|
|
427
|
+
* app.post('/synthesize', (req, res) => {
|
|
428
|
+
* res.setHeader('Content-Type', 'audio/pcm');
|
|
429
|
+
* res.setHeader('Transfer-Encoding', 'chunked');
|
|
430
|
+
*
|
|
431
|
+
* const readable = client.tts.toReadable({
|
|
432
|
+
* text: req.body.message.text,
|
|
433
|
+
* modelId: 'kugel-1-turbo',
|
|
434
|
+
* sampleRate: req.body.message.sampleRate,
|
|
435
|
+
* language: 'en',
|
|
436
|
+
* });
|
|
437
|
+
*
|
|
438
|
+
* readable.pipe(res);
|
|
439
|
+
* });
|
|
440
|
+
* ```
|
|
441
|
+
*
|
|
442
|
+
* @param options - TTS generation options (same as `stream()`)
|
|
443
|
+
* @param reuseConnection - Reuse the pooled WebSocket connection (default: true)
|
|
444
|
+
* @returns Node.js Readable stream emitting raw PCM16 binary Buffer chunks
|
|
445
|
+
*/
|
|
446
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
447
|
+
toReadable(options: GenerateOptions, reuseConnection = true): any {
|
|
448
|
+
// Dynamic require keeps browser bundles free of Node.js built-ins.
|
|
449
|
+
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
|
450
|
+
const { Readable } = require('stream') as typeof import('stream');
|
|
451
|
+
const readable = new Readable({ read() {} });
|
|
452
|
+
|
|
453
|
+
this.stream(
|
|
454
|
+
options,
|
|
455
|
+
{
|
|
456
|
+
onChunk: (chunk: AudioChunk) => {
|
|
457
|
+
readable.push(Buffer.from(chunk.audio, 'base64'));
|
|
458
|
+
},
|
|
459
|
+
onFinal: () => {
|
|
460
|
+
readable.push(null);
|
|
461
|
+
},
|
|
462
|
+
onError: (error: Error) => {
|
|
463
|
+
readable.destroy(error);
|
|
464
|
+
},
|
|
465
|
+
},
|
|
466
|
+
reuseConnection
|
|
467
|
+
).catch((error: Error) => {
|
|
468
|
+
readable.destroy(error);
|
|
469
|
+
});
|
|
470
|
+
|
|
471
|
+
return readable;
|
|
472
|
+
}
|
|
473
|
+
|
|
204
474
|
/**
|
|
205
475
|
* Build the WebSocket URL with appropriate auth param.
|
|
206
476
|
*/
|
|
@@ -259,11 +529,20 @@ class TTSResource {
|
|
|
259
529
|
this.wsConnection = ws;
|
|
260
530
|
this.wsUrl = url;
|
|
261
531
|
this.setupMessageHandler(ws);
|
|
532
|
+
this.startKeepalive(ws);
|
|
262
533
|
resolve(ws);
|
|
263
534
|
};
|
|
264
535
|
|
|
265
|
-
ws.onerror = () => {
|
|
266
|
-
|
|
536
|
+
ws.onerror = (event: unknown) => {
|
|
537
|
+
const underlying = (event as { error?: unknown } | null)?.error ?? event;
|
|
538
|
+
const typed = classifyWsHandshakeError(underlying);
|
|
539
|
+
reject(
|
|
540
|
+
typed ??
|
|
541
|
+
new ConnectionError(
|
|
542
|
+
`Could not establish KugelAudio WebSocket connection to ${url}. ` +
|
|
543
|
+
'Check network connectivity.',
|
|
544
|
+
),
|
|
545
|
+
);
|
|
267
546
|
};
|
|
268
547
|
});
|
|
269
548
|
}
|
|
@@ -287,7 +566,7 @@ class TTSResource {
|
|
|
287
566
|
if (!pending) return;
|
|
288
567
|
|
|
289
568
|
if (data.error) {
|
|
290
|
-
const error = this.parseError(data
|
|
569
|
+
const error = this.parseError(data);
|
|
291
570
|
pending.callbacks.onError?.(error);
|
|
292
571
|
this.pendingRequests.delete(requestId);
|
|
293
572
|
pending.reject(error);
|
|
@@ -301,7 +580,6 @@ class TTSResource {
|
|
|
301
580
|
totalSamples: data.total_samples,
|
|
302
581
|
durationMs: data.dur_ms,
|
|
303
582
|
generationMs: data.gen_ms,
|
|
304
|
-
ttfaMs: data.ttfa_ms,
|
|
305
583
|
rtf: data.rtf,
|
|
306
584
|
error: data.error,
|
|
307
585
|
};
|
|
@@ -321,23 +599,45 @@ class TTSResource {
|
|
|
321
599
|
};
|
|
322
600
|
pending.callbacks.onChunk?.(chunk);
|
|
323
601
|
}
|
|
602
|
+
|
|
603
|
+
if (data.word_timestamps) {
|
|
604
|
+
const timestamps: WordTimestamp[] = data.word_timestamps.map(
|
|
605
|
+
(w: Record<string, unknown>) => ({
|
|
606
|
+
word: w.word as string,
|
|
607
|
+
startMs: w.start_ms as number,
|
|
608
|
+
endMs: w.end_ms as number,
|
|
609
|
+
charStart: w.char_start as number,
|
|
610
|
+
charEnd: w.char_end as number,
|
|
611
|
+
score: (w.score as number) ?? 1.0,
|
|
612
|
+
})
|
|
613
|
+
);
|
|
614
|
+
pending.callbacks.onWordTimestamps?.(timestamps);
|
|
615
|
+
}
|
|
324
616
|
} catch (e) {
|
|
325
617
|
console.error('Failed to parse WebSocket message:', e);
|
|
326
618
|
}
|
|
327
619
|
};
|
|
328
620
|
|
|
329
621
|
ws.onclose = (event) => {
|
|
330
|
-
// Clear connection pool
|
|
622
|
+
// Clear connection pool and keepalive
|
|
623
|
+
this.stopKeepalive();
|
|
331
624
|
this.wsConnection = null;
|
|
332
625
|
this.wsUrl = null;
|
|
333
626
|
|
|
334
|
-
// Reject all pending requests
|
|
627
|
+
// Reject all pending requests with appropriate error types
|
|
335
628
|
for (const [id, pending] of this.pendingRequests) {
|
|
336
629
|
pending.callbacks.onClose?.();
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
630
|
+
// Only surface server-initiated error close codes; normal closes
|
|
631
|
+
// (1000, 1001) should not reject pending requests with an error.
|
|
632
|
+
if (
|
|
633
|
+
event.code === 4001 ||
|
|
634
|
+
event.code === 4003 ||
|
|
635
|
+
event.code === 4029 ||
|
|
636
|
+
event.code === 4500
|
|
637
|
+
) {
|
|
638
|
+
const error = classifyWsClose(event.code, event.reason);
|
|
639
|
+
pending.callbacks.onError?.(error);
|
|
640
|
+
pending.reject(error);
|
|
341
641
|
}
|
|
342
642
|
this.pendingRequests.delete(id);
|
|
343
643
|
}
|
|
@@ -345,7 +645,9 @@ class TTSResource {
|
|
|
345
645
|
|
|
346
646
|
ws.onerror = () => {
|
|
347
647
|
// Reject all pending requests
|
|
348
|
-
const error = new
|
|
648
|
+
const error = new ConnectionError(
|
|
649
|
+
'KugelAudio WebSocket connection error. Check network connectivity.',
|
|
650
|
+
);
|
|
349
651
|
for (const [id, pending] of this.pendingRequests) {
|
|
350
652
|
pending.callbacks.onError?.(error);
|
|
351
653
|
pending.reject(error);
|
|
@@ -380,6 +682,7 @@ class TTSResource {
|
|
|
380
682
|
options: GenerateOptions,
|
|
381
683
|
callbacks: StreamCallbacks
|
|
382
684
|
): Promise<void> {
|
|
685
|
+
warnIfNoLanguage(options.language, options.normalize);
|
|
383
686
|
const ws = await this.getConnection();
|
|
384
687
|
const requestId = ++this.requestCounter;
|
|
385
688
|
|
|
@@ -393,10 +696,14 @@ class TTSResource {
|
|
|
393
696
|
model_id: options.modelId || 'kugel-1-turbo',
|
|
394
697
|
voice_id: options.voiceId,
|
|
395
698
|
cfg_scale: options.cfgScale ?? 2.0,
|
|
699
|
+
...(options.temperature !== undefined && { temperature: options.temperature }),
|
|
396
700
|
max_new_tokens: options.maxNewTokens ?? 2048,
|
|
397
701
|
sample_rate: options.sampleRate ?? 24000,
|
|
398
702
|
normalize: options.normalize ?? true,
|
|
399
703
|
...(options.language && { language: options.language }),
|
|
704
|
+
...(options.wordTimestamps && { word_timestamps: true }),
|
|
705
|
+
...(options.speed !== undefined && { speed: options.speed }),
|
|
706
|
+
...(options.projectId !== undefined && { project_id: options.projectId }),
|
|
400
707
|
}));
|
|
401
708
|
});
|
|
402
709
|
}
|
|
@@ -408,6 +715,7 @@ class TTSResource {
|
|
|
408
715
|
options: GenerateOptions,
|
|
409
716
|
callbacks: StreamCallbacks
|
|
410
717
|
): Promise<void> {
|
|
718
|
+
warnIfNoLanguage(options.language, options.normalize);
|
|
411
719
|
return new Promise((resolve, reject) => {
|
|
412
720
|
const url = this.buildWsUrl();
|
|
413
721
|
const ws = createWs(url);
|
|
@@ -424,6 +732,9 @@ class TTSResource {
|
|
|
424
732
|
sample_rate: options.sampleRate ?? 24000,
|
|
425
733
|
normalize: options.normalize ?? true,
|
|
426
734
|
...(options.language && { language: options.language }),
|
|
735
|
+
...(options.wordTimestamps && { word_timestamps: true }),
|
|
736
|
+
...(options.speed !== undefined && { speed: options.speed }),
|
|
737
|
+
...(options.projectId !== undefined && { project_id: options.projectId }),
|
|
427
738
|
}));
|
|
428
739
|
};
|
|
429
740
|
|
|
@@ -438,7 +749,7 @@ class TTSResource {
|
|
|
438
749
|
const data = JSON.parse(messageData);
|
|
439
750
|
|
|
440
751
|
if (data.error) {
|
|
441
|
-
const error = this.parseError(data
|
|
752
|
+
const error = this.parseError(data);
|
|
442
753
|
callbacks.onError?.(error);
|
|
443
754
|
ws.close();
|
|
444
755
|
reject(error);
|
|
@@ -452,7 +763,6 @@ class TTSResource {
|
|
|
452
763
|
totalSamples: data.total_samples,
|
|
453
764
|
durationMs: data.dur_ms,
|
|
454
765
|
generationMs: data.gen_ms,
|
|
455
|
-
ttfaMs: data.ttfa_ms,
|
|
456
766
|
rtf: data.rtf,
|
|
457
767
|
error: data.error,
|
|
458
768
|
};
|
|
@@ -472,32 +782,87 @@ class TTSResource {
|
|
|
472
782
|
};
|
|
473
783
|
callbacks.onChunk?.(chunk);
|
|
474
784
|
}
|
|
785
|
+
|
|
786
|
+
if (data.word_timestamps) {
|
|
787
|
+
const timestamps: WordTimestamp[] = data.word_timestamps.map(
|
|
788
|
+
(w: Record<string, unknown>) => ({
|
|
789
|
+
word: w.word as string,
|
|
790
|
+
startMs: w.start_ms as number,
|
|
791
|
+
endMs: w.end_ms as number,
|
|
792
|
+
charStart: w.char_start as number,
|
|
793
|
+
charEnd: w.char_end as number,
|
|
794
|
+
score: (w.score as number) ?? 1.0,
|
|
795
|
+
})
|
|
796
|
+
);
|
|
797
|
+
callbacks.onWordTimestamps?.(timestamps);
|
|
798
|
+
}
|
|
475
799
|
} catch (e) {
|
|
476
800
|
console.error('Failed to parse WebSocket message:', e);
|
|
477
801
|
}
|
|
478
802
|
};
|
|
479
803
|
|
|
480
|
-
ws.onerror = () => {
|
|
481
|
-
const
|
|
804
|
+
ws.onerror = (event: unknown) => {
|
|
805
|
+
const underlying = (event as { error?: unknown } | null)?.error ?? event;
|
|
806
|
+
const error =
|
|
807
|
+
classifyWsHandshakeError(underlying) ??
|
|
808
|
+
new ConnectionError(
|
|
809
|
+
'KugelAudio WebSocket connection error. Check network connectivity.',
|
|
810
|
+
);
|
|
482
811
|
callbacks.onError?.(error);
|
|
483
812
|
reject(error);
|
|
484
813
|
};
|
|
485
814
|
|
|
486
815
|
ws.onclose = (event) => {
|
|
487
816
|
callbacks.onClose?.();
|
|
488
|
-
if (
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
817
|
+
if (
|
|
818
|
+
event.code === 4001 ||
|
|
819
|
+
event.code === 4003 ||
|
|
820
|
+
event.code === 4029 ||
|
|
821
|
+
event.code === 4500
|
|
822
|
+
) {
|
|
823
|
+
const error = classifyWsClose(event.code, event.reason);
|
|
824
|
+
callbacks.onError?.(error);
|
|
825
|
+
reject(error);
|
|
492
826
|
}
|
|
493
827
|
};
|
|
494
828
|
});
|
|
495
829
|
}
|
|
496
830
|
|
|
831
|
+
/**
|
|
832
|
+
* Start periodic keepalive pings on the pooled connection.
|
|
833
|
+
* Uses the ws package's ping() in Node.js; silently skips in browsers
|
|
834
|
+
* where WebSocket doesn't expose a ping method.
|
|
835
|
+
*/
|
|
836
|
+
private startKeepalive(ws: WebSocket): void {
|
|
837
|
+
this.stopKeepalive();
|
|
838
|
+
const intervalMs = this.client.keepalivePingInterval;
|
|
839
|
+
if (intervalMs == null || intervalMs <= 0) return;
|
|
840
|
+
|
|
841
|
+
this.keepaliveTimer = setInterval(() => {
|
|
842
|
+
if (this.wsConnection !== ws || ws.readyState !== WS_OPEN) {
|
|
843
|
+
this.stopKeepalive();
|
|
844
|
+
return;
|
|
845
|
+
}
|
|
846
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
847
|
+
if (typeof (ws as any).ping === 'function') {
|
|
848
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
849
|
+
(ws as any).ping();
|
|
850
|
+
}
|
|
851
|
+
}, intervalMs);
|
|
852
|
+
}
|
|
853
|
+
|
|
854
|
+
private stopKeepalive(): void {
|
|
855
|
+
if (this.keepaliveTimer !== null) {
|
|
856
|
+
clearInterval(this.keepaliveTimer);
|
|
857
|
+
this.keepaliveTimer = null;
|
|
858
|
+
}
|
|
859
|
+
}
|
|
860
|
+
|
|
497
861
|
/**
|
|
498
862
|
* Close the pooled WebSocket connection.
|
|
499
863
|
*/
|
|
500
864
|
close(): void {
|
|
865
|
+
this.stopKeepalive();
|
|
501
866
|
if (this.wsConnection) {
|
|
502
867
|
try {
|
|
503
868
|
this.wsConnection.close();
|
|
@@ -509,15 +874,43 @@ class TTSResource {
|
|
|
509
874
|
}
|
|
510
875
|
}
|
|
511
876
|
|
|
512
|
-
private parseError(
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
877
|
+
private parseError(data: { error?: string; error_code?: string; retry_after?: number }): Error {
|
|
878
|
+
return classifyWsFrame(data);
|
|
879
|
+
}
|
|
880
|
+
|
|
881
|
+
/**
|
|
882
|
+
* Create a streaming session for LLM integration.
|
|
883
|
+
*
|
|
884
|
+
* The session connects to `/ws/tts/stream` and keeps a persistent
|
|
885
|
+
* connection across multiple {@link StreamingSession.send} calls.
|
|
886
|
+
* The server auto-chunks text at sentence boundaries — no client-side
|
|
887
|
+
* flushing required.
|
|
888
|
+
*
|
|
889
|
+
* @param config - Session configuration (voice, model, chunking strategy).
|
|
890
|
+
* @param callbacks - Callbacks for audio chunks and session lifecycle events.
|
|
891
|
+
* @returns A {@link StreamingSession} instance. Call `.connect()` before sending.
|
|
892
|
+
*
|
|
893
|
+
* @example
|
|
894
|
+
* ```typescript
|
|
895
|
+
* const session = client.tts.streamingSession(
|
|
896
|
+
* { voiceId: 123, autoMode: true, chunkLengthSchedule: [50, 100, 150, 250] },
|
|
897
|
+
* { onChunk: (chunk) => playAudio(chunk.audio) },
|
|
898
|
+
* );
|
|
899
|
+
*
|
|
900
|
+
* session.connect();
|
|
901
|
+
*
|
|
902
|
+
* for await (const token of llmStream) {
|
|
903
|
+
* session.send(token);
|
|
904
|
+
* }
|
|
905
|
+
*
|
|
906
|
+
* await session.close();
|
|
907
|
+
* ```
|
|
908
|
+
*/
|
|
909
|
+
streamingSession(
|
|
910
|
+
config: StreamConfig,
|
|
911
|
+
callbacks: StreamingSessionCallbacks
|
|
912
|
+
): StreamingSession {
|
|
913
|
+
return new StreamingSession(this.client, config, callbacks);
|
|
521
914
|
}
|
|
522
915
|
|
|
523
916
|
/**
|
|
@@ -538,7 +931,7 @@ class TTSResource {
|
|
|
538
931
|
* console.log(`Audio from ${chunk.contextId}`);
|
|
539
932
|
* playAudio(chunk.audio);
|
|
540
933
|
* },
|
|
541
|
-
*
|
|
934
|
+
* onContextClosed: (contextId) => {
|
|
542
935
|
* console.log(`${contextId} finished`);
|
|
543
936
|
* },
|
|
544
937
|
* });
|
|
@@ -589,8 +982,13 @@ class MultiContextSession {
|
|
|
589
982
|
|
|
590
983
|
/**
|
|
591
984
|
* Connect to the multi-context WebSocket endpoint.
|
|
985
|
+
*
|
|
986
|
+
* The returned promise resolves once the WebSocket is OPEN so callers can
|
|
987
|
+
* ``await session.connect(callbacks)`` before invoking
|
|
988
|
+
* {@link createContext} / {@link send}. Pre-open errors reject with the
|
|
989
|
+
* typed error.
|
|
592
990
|
*/
|
|
593
|
-
connect(callbacks: import('./types').MultiContextCallbacks): void {
|
|
991
|
+
connect(callbacks: import('./types').MultiContextCallbacks): Promise<void> {
|
|
594
992
|
this.callbacks = callbacks;
|
|
595
993
|
|
|
596
994
|
const wsUrl = this.client.ttsUrl
|
|
@@ -608,12 +1006,9 @@ class MultiContextSession {
|
|
|
608
1006
|
|
|
609
1007
|
const url = `${wsUrl}/ws/tts/multi?${authParam}=${this.client.apiKey}`;
|
|
610
1008
|
this.ws = createWs(url);
|
|
1009
|
+
const ws = this.ws;
|
|
611
1010
|
|
|
612
|
-
|
|
613
|
-
// Connection established, ready to create contexts
|
|
614
|
-
};
|
|
615
|
-
|
|
616
|
-
this.ws.onmessage = (event: { data: unknown }) => {
|
|
1011
|
+
ws.onmessage = (event: { data: unknown }) => {
|
|
617
1012
|
try {
|
|
618
1013
|
// Handle both browser (string) and Node.js (Buffer) message formats
|
|
619
1014
|
const messageData = typeof event.data === 'string'
|
|
@@ -654,10 +1049,6 @@ class MultiContextSession {
|
|
|
654
1049
|
this.callbacks.onChunk?.(chunk);
|
|
655
1050
|
}
|
|
656
1051
|
|
|
657
|
-
if (data.is_final) {
|
|
658
|
-
this.callbacks.onContextFinal?.(data.context_id);
|
|
659
|
-
}
|
|
660
|
-
|
|
661
1052
|
if (data.context_closed) {
|
|
662
1053
|
this.contexts.delete(data.context_id);
|
|
663
1054
|
this.callbacks.onContextClosed?.(data.context_id);
|
|
@@ -676,20 +1067,51 @@ class MultiContextSession {
|
|
|
676
1067
|
}
|
|
677
1068
|
};
|
|
678
1069
|
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
};
|
|
1070
|
+
return new Promise<void>((resolve, reject) => {
|
|
1071
|
+
let opened = false;
|
|
682
1072
|
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
}
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
1073
|
+
ws.onopen = () => {
|
|
1074
|
+
opened = true;
|
|
1075
|
+
resolve();
|
|
1076
|
+
};
|
|
1077
|
+
|
|
1078
|
+
ws.onerror = (event: unknown) => {
|
|
1079
|
+
const underlying = (event as { error?: unknown } | null)?.error ?? event;
|
|
1080
|
+
const err =
|
|
1081
|
+
classifyWsHandshakeError(underlying) ??
|
|
1082
|
+
new ConnectionError(
|
|
1083
|
+
'KugelAudio multi-context WebSocket connection error. ' +
|
|
1084
|
+
'Check network connectivity.',
|
|
1085
|
+
);
|
|
1086
|
+
if (!opened) reject(err);
|
|
1087
|
+
this.callbacks.onError?.(err);
|
|
1088
|
+
};
|
|
1089
|
+
|
|
1090
|
+
ws.onclose = (event) => {
|
|
1091
|
+
let typedErr: KugelAudioError | null = null;
|
|
1092
|
+
if (
|
|
1093
|
+
event.code === 4001 ||
|
|
1094
|
+
event.code === 4003 ||
|
|
1095
|
+
event.code === 4029 ||
|
|
1096
|
+
event.code === 4500
|
|
1097
|
+
) {
|
|
1098
|
+
typedErr = classifyWsClose(event.code, event.reason);
|
|
1099
|
+
this.callbacks.onError?.(typedErr);
|
|
1100
|
+
}
|
|
1101
|
+
if (!opened) {
|
|
1102
|
+
reject(
|
|
1103
|
+
typedErr ??
|
|
1104
|
+
new ConnectionError(
|
|
1105
|
+
`KugelAudio multi-context WebSocket closed before ready ` +
|
|
1106
|
+
`(code ${event.code}).`,
|
|
1107
|
+
),
|
|
1108
|
+
);
|
|
1109
|
+
}
|
|
1110
|
+
this.ws = null;
|
|
1111
|
+
this.isStarted = false;
|
|
1112
|
+
this.contexts.clear();
|
|
1113
|
+
};
|
|
1114
|
+
});
|
|
693
1115
|
}
|
|
694
1116
|
|
|
695
1117
|
/**
|
|
@@ -713,10 +1135,13 @@ class MultiContextSession {
|
|
|
713
1135
|
|
|
714
1136
|
// Include session config on first context
|
|
715
1137
|
if (!this.isStarted) {
|
|
1138
|
+
warnIfNoLanguage(this.config.language, this.config.normalize);
|
|
716
1139
|
if (this.config.sampleRate) msg.sample_rate = this.config.sampleRate;
|
|
717
1140
|
if (this.config.cfgScale) msg.cfg_scale = this.config.cfgScale;
|
|
1141
|
+
if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
|
|
718
1142
|
if (this.config.maxNewTokens) msg.max_new_tokens = this.config.maxNewTokens;
|
|
719
1143
|
if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
|
|
1144
|
+
if (this.config.language) msg.language = this.config.language;
|
|
720
1145
|
if (this.config.inactivityTimeout) msg.inactivity_timeout = this.config.inactivityTimeout;
|
|
721
1146
|
}
|
|
722
1147
|
|
|
@@ -821,6 +1246,336 @@ class MultiContextSession {
|
|
|
821
1246
|
}
|
|
822
1247
|
}
|
|
823
1248
|
|
|
1249
|
+
/**
 * Streaming session for LLM integration via `/ws/tts/stream`.
 *
 * The server accumulates text across multiple {@link send} calls and
 * auto-chunks it at sentence boundaries, keeping the KV cache warm between
 * chunks for natural prosody. You never need to call `flush` explicitly —
 * configure {@link StreamConfig.chunkLengthSchedule} or
 * {@link StreamConfig.autoMode} instead.
 *
 * The session keeps its own shallow copy of the configuration, so
 * {@link updateConfig} never mutates the object passed to the constructor.
 *
 * @example
 * ```typescript
 * const session = client.tts.streamingSession({
 *   voiceId: 123,
 *   autoMode: true,
 *   chunkLengthSchedule: [50, 100, 150, 250],
 * }, {
 *   onChunk: (chunk) => playAudio(chunk.audio),
 *   onSessionClosed: (totalSecs) => console.log(`Done: ${totalSecs}s`),
 * });
 *
 * // send() throws until the socket is OPEN, so wait for the handshake.
 * await session.connect();
 *
 * for await (const token of llmStream) {
 *   session.send(token);
 * }
 *
 * await session.close();
 * ```
 */
class StreamingSession {
  private ws: WebSocket | null = null;
  private config: StreamConfig;
  private callbacks: StreamingSessionCallbacks;
  private client: KugelAudio;
  /** True once the session settings have been attached to an outgoing message. */
  private configSent = false;
  /** In-flight {@link endSession} promise, shared by overlapping callers. */
  private pendingEnd: Promise<void> | null = null;

  constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks) {
    this.client = client;
    // Shallow copy: updateConfig() merges into this object and must not
    // write through to the caller's configuration.
    this.config = { ...config };
    this.callbacks = callbacks;
  }

  /**
   * Convert a raw WebSocket frame payload to text.
   *
   * Strings pass through; Node `Buffer` payloads are decoded with
   * `toString()`; anything else is coerced with `String()`. `Buffer` is
   * probed with `typeof` first because it does not exist in browsers.
   */
  private static frameToText(data: unknown): string {
    if (typeof data === 'string') return data;
    if (typeof Buffer !== 'undefined' && data instanceof Buffer) return data.toString();
    return String(data);
  }

  /**
   * Open the WebSocket connection and authenticate.
   *
   * The returned promise resolves once the WebSocket is OPEN, so callers can
   * `await session.connect()` and then `send()` without racing the
   * handshake. Pre-open errors (network failure, 4001 unauthorized, …) reject
   * the promise with the typed error. Errors are additionally reported
   * through the `onError` callback, both before and after the socket opens.
   *
   * @returns Promise that settles when the handshake succeeds or fails.
   */
  connect(): Promise<void> {
    const wsUrl = this.client.ttsUrl
      .replace('https://', 'wss://')
      .replace('http://', 'ws://');

    let authParam: string;
    if (this.client.isToken) {
      authParam = 'token';
    } else if (this.client.isMasterKey) {
      authParam = 'master_key';
    } else {
      authParam = 'api_key';
    }

    // Percent-encode the credential so characters such as '+', '/', '=' or
    // '&' cannot be misread as query-string syntax.
    const url = `${wsUrl}/ws/tts/stream?${authParam}=${encodeURIComponent(this.client.apiKey)}`;
    this.ws = createWs(url);
    const ws = this.ws;

    ws.onmessage = (event: { data: unknown }) => {
      // Note: the try block also covers the user callbacks, so an exception
      // thrown by a callback is logged here rather than propagated.
      try {
        const data = JSON.parse(StreamingSession.frameToText(event.data));

        if (data.error) {
          this.callbacks.onError?.(new KugelAudioError(data.error));
          return;
        }

        if (data.audio) {
          const chunk: AudioChunk = {
            audio: data.audio,
            encoding: data.enc || 'pcm_s16le',
            index: data.idx,
            sampleRate: data.sr,
            samples: data.samples,
          };
          this.callbacks.onChunk?.(chunk);
        }

        if (data.word_timestamps) {
          const timestamps = data.word_timestamps.map((w: Record<string, unknown>) => ({
            word: w.word as string,
            startMs: w.start_ms as number,
            endMs: w.end_ms as number,
            charStart: w.char_start as number,
            charEnd: w.char_end as number,
            score: (w.score as number) ?? 1.0,
          }));
          this.callbacks.onWordTimestamps?.(timestamps);
        }

        if (data.chunk_complete) {
          this.callbacks.onChunkComplete?.(
            data.chunk_id ?? 0,
            data.audio_seconds ?? 0,
            data.gen_ms ?? 0,
          );
        }

        if (data.generation_started) {
          this.callbacks.onGenerationStarted?.(data.chunk_id ?? 0, data.text ?? '');
        }

        if (data.session_closed) {
          this.callbacks.onSessionClosed?.(
            data.total_audio_seconds ?? 0,
            data.total_text_chunks ?? 0,
            data.total_audio_chunks ?? 0,
          );
        }
      } catch (e) {
        console.error('[KugelAudio] Failed to parse streaming session message:', e);
      }
    };

    return new Promise<void>((resolve, reject) => {
      let opened = false;

      ws.onopen = () => {
        opened = true;
        resolve();
      };

      ws.onerror = (event: unknown) => {
        // Some WebSocket implementations attach the cause as `event.error`;
        // fall back to the event itself otherwise.
        const underlying = (event as { error?: unknown } | null)?.error ?? event;
        const err =
          classifyWsHandshakeError(underlying) ??
          new ConnectionError(
            'KugelAudio streaming WebSocket connection error. ' +
              'Check network connectivity.',
          );
        if (!opened) reject(err);
        this.callbacks.onError?.(err);
      };

      ws.onclose = (event) => {
        let typedErr: KugelAudioError | null = null;
        // Only these application close codes are mapped to typed errors.
        if (
          event.code === 4001 ||
          event.code === 4003 ||
          event.code === 4029 ||
          event.code === 4500
        ) {
          typedErr = classifyWsClose(event.code, event.reason);
          this.callbacks.onError?.(typedErr);
        }
        if (!opened) {
          reject(
            typedErr ??
              new ConnectionError(
                `KugelAudio streaming WebSocket closed before ready ` +
                  `(code ${event.code}).`,
              ),
          );
        }
        this.ws = null;
        this.configSent = false;
      };
    });
  }

  /**
   * Send a text chunk to the server (e.g. one LLM output token).
   *
   * The server buffers text across multiple calls and starts generating at
   * natural sentence boundaries automatically — no need to call `flush`.
   * The first message of each session also carries every configured
   * session setting.
   *
   * @param text - Raw text or LLM token to append to the server buffer.
   * @param flush - Force immediate generation of whatever is buffered.
   *   **Avoid calling this per-sentence from the client.** Doing so bypasses
   *   the server's semantic chunking, incurs a fresh model prefill cost on
   *   every flush, and makes latency *worse*, not better. Let the server
   *   handle chunking via `chunkLengthSchedule` / `autoMode` instead.
   * @throws KugelAudioError if the WebSocket is not open.
   */
  send(text: string, flush = false): void {
    if (!this.ws || this.ws.readyState !== WS_OPEN) {
      throw new KugelAudioError('StreamingSession not connected. Call connect() first.');
    }

    const msg: Record<string, unknown> = { text, flush };

    if (!this.configSent) {
      if (this.config.voiceId !== undefined) msg.voice_id = this.config.voiceId;
      if (this.config.modelId !== undefined) msg.model_id = this.config.modelId;
      if (this.config.cfgScale !== undefined) msg.cfg_scale = this.config.cfgScale;
      if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
      if (this.config.maxNewTokens !== undefined) msg.max_new_tokens = this.config.maxNewTokens;
      if (this.config.sampleRate !== undefined) msg.sample_rate = this.config.sampleRate;
      if (this.config.flushTimeoutMs !== undefined) msg.flush_timeout_ms = this.config.flushTimeoutMs;
      if (this.config.maxBufferLength !== undefined) msg.max_buffer_length = this.config.maxBufferLength;
      if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
      if (this.config.language !== undefined) msg.language = this.config.language;
      if (this.config.wordTimestamps) msg.word_timestamps = true;
      if (this.config.autoMode !== undefined) msg.auto_mode = this.config.autoMode;
      if (this.config.chunkLengthSchedule?.length) msg.chunk_length_schedule = this.config.chunkLengthSchedule;
      if (this.config.speed !== undefined) msg.speed = this.config.speed;
      this.configSent = true;
    }

    this.ws.send(JSON.stringify(msg));
  }

  /**
   * End the current session but keep the WebSocket connection open.
   *
   * This allows starting a new session on the same connection, avoiding
   * the overhead of a new WebSocket handshake (~200-300ms). After calling
   * this, optionally call {@link updateConfig} to change voice/model settings,
   * then call {@link send} to start the next session.
   *
   * The returned promise resolves once the server confirms with a
   * `session_closed` message, when the socket closes, or after a 15 s
   * **quiet** timeout — i.e. 15 s elapse without *any* server message
   * arriving. The timer resets on every incoming frame so a long final flush
   * that streams audio for tens of seconds is not truncated; only a
   * genuinely silent server trips the fuse.
   *
   * Overlapping calls share one in-flight promise, so handler wrappers are
   * never stacked and only one `close` request is sent per session. If the
   * socket is not open, the promise resolves immediately. If sending the
   * `close` request throws, the handlers are restored and the promise
   * rejects with that error.
   */
  endSession(): Promise<void> {
    if (this.pendingEnd) return this.pendingEnd;
    if (!this.ws || this.ws.readyState !== WS_OPEN) return Promise.resolve();

    const ws = this.ws;
    // Quiet timeout: re-armed on every incoming server message, so it trips
    // only when the server has been silent for this long. A fixed wall-clock
    // budget would truncate audio whenever the final flushed chunk takes
    // longer to generate than the budget.
    const QUIET_TIMEOUT_MS = 15_000;

    let settled = false;

    const pending = new Promise<void>((resolve, reject) => {
      let timer: ReturnType<typeof setTimeout> | undefined;

      const prevMessage = ws.onmessage;
      const prevClose = ws.onclose;

      // Single exit point: restores the handlers installed by connect() so
      // later endSession() calls do not stack wrappers, re-enables sending
      // the config with the next send(), then settles the promise.
      const finish = (failure?: unknown) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        ws.onmessage = prevMessage;
        ws.onclose = prevClose;
        this.configSent = false;
        this.pendingEnd = null;
        if (failure !== undefined) {
          reject(failure);
        } else {
          resolve();
        }
      };

      const armQuietTimer = () => {
        clearTimeout(timer);
        timer = setTimeout(() => finish(), QUIET_TIMEOUT_MS);
      };

      armQuietTimer();

      ws.onmessage = (event: MessageEvent) => {
        // Every incoming frame counts as liveness — audio chunks of the
        // final flush included, not just session_closed.
        armQuietTimer();
        if (prevMessage) prevMessage.call(ws, event);
        try {
          if (JSON.parse(StreamingSession.frameToText(event.data)).session_closed) finish();
        } catch { /* ignore parse errors */ }
      };

      ws.onclose = (event: CloseEvent) => {
        this.ws = null;
        if (prevClose) prevClose.call(ws, event);
        finish();
      };

      try {
        ws.send(JSON.stringify({ close: true }));
      } catch (sendError) {
        // Do not leave wrappers and a live timer behind when the request
        // could not even be sent.
        finish(sendError ?? new KugelAudioError('Failed to send close request.'));
      }
    });

    // Only publish the promise while it is still in flight.
    if (!settled) this.pendingEnd = pending;
    return pending;
  }

  /**
   * Update session configuration for the next session.
   *
   * Call this after {@link endSession} and before the next {@link send}
   * to change voice, model, language, or other settings. The values are
   * merged into the session's own copy of the configuration and are sent
   * with the next {@link send} call.
   *
   * @param config - Settings to override; omitted keys keep their values.
   */
  updateConfig(config: Partial<StreamConfig>): void {
    Object.assign(this.config, config);
    this.configSent = false;
  }

  /**
   * Close the session and the WebSocket connection.
   *
   * For session reuse without closing the connection, use
   * {@link endSession} instead.
   *
   * The returned promise resolves once the server confirms the close with a
   * `session_closed` message, or after a 15 s **quiet** timeout (no traffic
   * from the server in that window). Audio frames from the server-side
   * final-flush of the still-buffered text are delivered to your callbacks
   * before this promise resolves, and each frame resets the quiet timer.
   * The socket is closed even when ending the session fails; the failure is
   * still propagated to the caller.
   */
  async close(): Promise<void> {
    try {
      await this.endSession();
    } finally {
      if (this.ws) {
        try { this.ws.close(); } catch { /* already closed */ }
        this.ws = null;
      }
    }
  }

  /** Whether the underlying WebSocket is open. */
  get isConnected(): boolean {
    return this.ws !== null && this.ws.readyState === WS_OPEN;
  }
}
|
|
1578
|
+
|
|
824
1579
|
/**
|
|
825
1580
|
* KugelAudio API client.
|
|
826
1581
|
*
|
|
@@ -834,13 +1589,13 @@ class MultiContextSession {
|
|
|
834
1589
|
* // List voices
|
|
835
1590
|
* const voices = await client.voices.list();
|
|
836
1591
|
*
|
|
837
|
-
* // Generate audio with fast model
|
|
1592
|
+
* // Generate audio with fast model
|
|
838
1593
|
* const audio = await client.tts.generate({
|
|
839
1594
|
* text: 'Hello, world!',
|
|
840
1595
|
* modelId: 'kugel-1-turbo',
|
|
841
1596
|
* });
|
|
842
1597
|
*
|
|
843
|
-
* // Generate audio with premium model
|
|
1598
|
+
* // Generate audio with premium model
|
|
844
1599
|
* const audio = await client.tts.generate({
|
|
845
1600
|
* text: 'Hello, world!',
|
|
846
1601
|
* modelId: 'kugel-1',
|
|
@@ -855,6 +1610,7 @@ export class KugelAudio {
|
|
|
855
1610
|
private _apiUrl: string;
|
|
856
1611
|
private _ttsUrl: string;
|
|
857
1612
|
private _timeout: number;
|
|
1613
|
+
private _keepalivePingInterval: number | null;
|
|
858
1614
|
|
|
859
1615
|
/** Models resource */
|
|
860
1616
|
public readonly models: ModelsResource;
|
|
@@ -865,17 +1621,37 @@ export class KugelAudio {
|
|
|
865
1621
|
|
|
866
1622
|
constructor(options: KugelAudioOptions) {
|
|
867
1623
|
if (!options.apiKey) {
|
|
868
|
-
throw new
|
|
1624
|
+
throw new ValidationError(
|
|
1625
|
+
'KugelAudio API key is missing. Set the KUGELAUDIO_API_KEY ' +
|
|
1626
|
+
'environment variable or pass { apiKey: ... } to the client. ' +
|
|
1627
|
+
'Get a key at https://app.kugelaudio.com/settings/api-keys.',
|
|
1628
|
+
);
|
|
869
1629
|
}
|
|
870
1630
|
|
|
871
|
-
|
|
1631
|
+
const { cleanKey, detectedRegion } = parseApiKey(options.apiKey);
|
|
1632
|
+
this._apiKey = cleanKey;
|
|
872
1633
|
this._isMasterKey = options.isMasterKey || false;
|
|
873
1634
|
this._isToken = options.isToken || false;
|
|
874
1635
|
this._orgId = options.orgId;
|
|
875
|
-
|
|
1636
|
+
|
|
1637
|
+
if (options.apiUrl) {
|
|
1638
|
+
this._apiUrl = options.apiUrl.replace(/\/$/, '');
|
|
1639
|
+
} else {
|
|
1640
|
+
const effectiveRegion = options.region || detectedRegion || 'eu';
|
|
1641
|
+
if (!(effectiveRegion in REGION_URLS)) {
|
|
1642
|
+
throw new ValidationError(
|
|
1643
|
+
`Invalid region '${effectiveRegion}'. Must be one of: ${Object.keys(REGION_URLS).join(', ')}.`,
|
|
1644
|
+
);
|
|
1645
|
+
}
|
|
1646
|
+
this._apiUrl = REGION_URLS[effectiveRegion as Region];
|
|
1647
|
+
}
|
|
1648
|
+
|
|
876
1649
|
// If ttsUrl not specified, use apiUrl (backend proxies to TTS server)
|
|
877
1650
|
this._ttsUrl = (options.ttsUrl || this._apiUrl).replace(/\/$/, '');
|
|
878
1651
|
this._timeout = options.timeout || 60000;
|
|
1652
|
+
this._keepalivePingInterval = options.keepalivePingInterval !== undefined
|
|
1653
|
+
? options.keepalivePingInterval
|
|
1654
|
+
: 20000;
|
|
879
1655
|
|
|
880
1656
|
this.models = new ModelsResource(this);
|
|
881
1657
|
this.voices = new VoicesResource(this);
|
|
@@ -929,6 +1705,11 @@ export class KugelAudio {
|
|
|
929
1705
|
return this._ttsUrl;
|
|
930
1706
|
}
|
|
931
1707
|
|
|
1708
|
+
/** Get keepalive ping interval in milliseconds, or null if disabled. */
|
|
1709
|
+
get keepalivePingInterval(): number | null {
|
|
1710
|
+
return this._keepalivePingInterval;
|
|
1711
|
+
}
|
|
1712
|
+
|
|
932
1713
|
/**
|
|
933
1714
|
* Close the client and release resources.
|
|
934
1715
|
* This closes any pooled WebSocket connections.
|
|
@@ -991,25 +1772,57 @@ export class KugelAudio {
|
|
|
991
1772
|
|
|
992
1773
|
clearTimeout(timeoutId);
|
|
993
1774
|
|
|
994
|
-
if (response.
|
|
995
|
-
|
|
1775
|
+
if (!response.ok) {
|
|
1776
|
+
const text = await response.text();
|
|
1777
|
+
throw classifyHttpError(response.status, text, response.headers);
|
|
996
1778
|
}
|
|
997
|
-
|
|
998
|
-
|
|
1779
|
+
|
|
1780
|
+
return await response.json();
|
|
1781
|
+
} catch (error) {
|
|
1782
|
+
clearTimeout(timeoutId);
|
|
1783
|
+
if (error instanceof KugelAudioError) {
|
|
1784
|
+
throw error;
|
|
999
1785
|
}
|
|
1000
|
-
if (
|
|
1001
|
-
throw new
|
|
1786
|
+
if ((error as Error).name === 'AbortError') {
|
|
1787
|
+
throw new ConnectionError(
|
|
1788
|
+
`Request to ${method} ${path} timed out after ${this._timeout}ms.`,
|
|
1789
|
+
);
|
|
1002
1790
|
}
|
|
1791
|
+
throw new ConnectionError(
|
|
1792
|
+
`Could not reach KugelAudio at ${url}: ${(error as Error).message}. ` +
|
|
1793
|
+
'Check network connectivity.',
|
|
1794
|
+
);
|
|
1795
|
+
}
|
|
1796
|
+
}
|
|
1797
|
+
|
|
1798
|
+
/**
|
|
1799
|
+
* Make a multipart/form-data request (for file uploads).
|
|
1800
|
+
* @internal Used by VoicesResource for reference file uploads.
|
|
1801
|
+
*/
|
|
1802
|
+
async requestMultipart<T>(method: string, path: string, formData: FormData): Promise<T> {
|
|
1803
|
+
const url = `${this._apiUrl}${path}`;
|
|
1804
|
+
|
|
1805
|
+
const headers: Record<string, string> = {
|
|
1806
|
+
'X-API-Key': this._apiKey,
|
|
1807
|
+
'Authorization': `Bearer ${this._apiKey}`,
|
|
1808
|
+
};
|
|
1809
|
+
|
|
1810
|
+
const controller = new AbortController();
|
|
1811
|
+
const timeoutId = setTimeout(() => controller.abort(), this._timeout);
|
|
1812
|
+
|
|
1813
|
+
try {
|
|
1814
|
+
const response = await fetch(url, {
|
|
1815
|
+
method,
|
|
1816
|
+
headers,
|
|
1817
|
+
body: formData,
|
|
1818
|
+
signal: controller.signal,
|
|
1819
|
+
});
|
|
1820
|
+
|
|
1821
|
+
clearTimeout(timeoutId);
|
|
1822
|
+
|
|
1003
1823
|
if (!response.ok) {
|
|
1004
1824
|
const text = await response.text();
|
|
1005
|
-
|
|
1006
|
-
try {
|
|
1007
|
-
const json = JSON.parse(text);
|
|
1008
|
-
message = json.detail || json.error || message;
|
|
1009
|
-
} catch {
|
|
1010
|
-
message = text || message;
|
|
1011
|
-
}
|
|
1012
|
-
throw new KugelAudioError(message, response.status);
|
|
1825
|
+
throw classifyHttpError(response.status, text, response.headers);
|
|
1013
1826
|
}
|
|
1014
1827
|
|
|
1015
1828
|
return await response.json();
|
|
@@ -1019,9 +1832,14 @@ export class KugelAudio {
|
|
|
1019
1832
|
throw error;
|
|
1020
1833
|
}
|
|
1021
1834
|
if ((error as Error).name === 'AbortError') {
|
|
1022
|
-
throw new
|
|
1835
|
+
throw new ConnectionError(
|
|
1836
|
+
`Request to ${method} ${path} timed out after ${this._timeout}ms.`,
|
|
1837
|
+
);
|
|
1023
1838
|
}
|
|
1024
|
-
throw new
|
|
1839
|
+
throw new ConnectionError(
|
|
1840
|
+
`Could not reach KugelAudio at ${url}: ${(error as Error).message}. ` +
|
|
1841
|
+
'Check network connectivity.',
|
|
1842
|
+
);
|
|
1025
1843
|
}
|
|
1026
1844
|
}
|
|
1027
1845
|
}
|