kugelaudio 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -13
- package/dist/index.d.mts +518 -26
- package/dist/index.d.ts +518 -26
- package/dist/index.js +864 -112
- package/dist/index.mjs +858 -112
- package/package.json +9 -8
- package/src/client.test.ts +548 -0
- package/src/client.ts +885 -103
- package/src/errors.ts +266 -18
- package/src/index.ts +17 -2
- package/src/types.ts +215 -8
- package/src/websocket.ts +38 -18
package/dist/index.d.mts
CHANGED
|
@@ -41,6 +41,81 @@ interface Voice {
|
|
|
41
41
|
isPublic: boolean;
|
|
42
42
|
verified: boolean;
|
|
43
43
|
}
|
|
44
|
+
/**
|
|
45
|
+
* Paginated response from the voices list endpoint.
|
|
46
|
+
*/
|
|
47
|
+
interface VoiceListResponse {
|
|
48
|
+
voices: Voice[];
|
|
49
|
+
total: number;
|
|
50
|
+
limit: number;
|
|
51
|
+
offset: number;
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Voice quality levels.
|
|
55
|
+
*/
|
|
56
|
+
type VoiceQuality = 'low' | 'mid' | 'high';
|
|
57
|
+
/**
|
|
58
|
+
* Extended voice information returned by voice management endpoints.
|
|
59
|
+
*/
|
|
60
|
+
interface VoiceDetail {
|
|
61
|
+
id: number;
|
|
62
|
+
name: string;
|
|
63
|
+
description: string;
|
|
64
|
+
generativeVoiceDescription: string;
|
|
65
|
+
supportedLanguages: string[];
|
|
66
|
+
category: string;
|
|
67
|
+
age?: string;
|
|
68
|
+
sex?: string;
|
|
69
|
+
quality: string;
|
|
70
|
+
isPublic: boolean;
|
|
71
|
+
verified: boolean;
|
|
72
|
+
pendingVerification: boolean;
|
|
73
|
+
sampleUrl?: string;
|
|
74
|
+
avatarUrl?: string;
|
|
75
|
+
sampleText: string;
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Voice reference audio metadata.
|
|
79
|
+
*/
|
|
80
|
+
interface VoiceReference {
|
|
81
|
+
id: number;
|
|
82
|
+
voiceId: number;
|
|
83
|
+
name: string;
|
|
84
|
+
referenceText: string;
|
|
85
|
+
s3Path: string;
|
|
86
|
+
audioUrl?: string;
|
|
87
|
+
isGenerated: boolean;
|
|
88
|
+
}
|
|
89
|
+
/**
|
|
90
|
+
* Options for creating a new voice.
|
|
91
|
+
*/
|
|
92
|
+
interface CreateVoiceOptions {
|
|
93
|
+
name: string;
|
|
94
|
+
sex: string;
|
|
95
|
+
description?: string;
|
|
96
|
+
category?: string;
|
|
97
|
+
age?: string;
|
|
98
|
+
quality?: string;
|
|
99
|
+
supportedLanguages?: string[];
|
|
100
|
+
isPublic?: boolean;
|
|
101
|
+
sampleText?: string;
|
|
102
|
+
/** Reference audio files (File objects in browser, Blob in Node.js — wrap a Buffer with `new Blob([buffer])`) */
|
|
103
|
+
referenceFiles?: Array<File | Blob>;
|
|
104
|
+
}
|
|
105
|
+
/**
|
|
106
|
+
* Options for updating an existing voice.
|
|
107
|
+
*/
|
|
108
|
+
interface UpdateVoiceOptions {
|
|
109
|
+
name?: string;
|
|
110
|
+
description?: string;
|
|
111
|
+
category?: string;
|
|
112
|
+
age?: string;
|
|
113
|
+
sex?: string;
|
|
114
|
+
quality?: string;
|
|
115
|
+
supportedLanguages?: string[];
|
|
116
|
+
isPublic?: boolean;
|
|
117
|
+
sampleText?: string;
|
|
118
|
+
}
|
|
44
119
|
/**
|
|
45
120
|
* Word-level timestamp from server-side forced alignment.
|
|
46
121
|
*/
|
|
@@ -64,12 +139,20 @@ interface WordTimestamp {
|
|
|
64
139
|
interface GenerateOptions {
|
|
65
140
|
/** Text to synthesize */
|
|
66
141
|
text: string;
|
|
67
|
-
/** Model to use: 'kugel-1-turbo' (
|
|
142
|
+
/** Model to use: 'kugel-1-turbo' (fast) or 'kugel-1' (premium). Default: 'kugel-1-turbo' */
|
|
68
143
|
modelId?: string;
|
|
69
144
|
/** Voice ID to use */
|
|
70
145
|
voiceId?: number;
|
|
71
146
|
/** CFG scale for generation (default: 2.0) */
|
|
72
147
|
cfgScale?: number;
|
|
148
|
+
/**
|
|
149
|
+
* Sampling variance. Range [0.0, 1.0]. 0 = most stable (near-greedy),
|
|
150
|
+
* 1 = most variance. Default: 0.5.
|
|
151
|
+
*
|
|
152
|
+
* Lower values produce more consistent reads across regenerations —
|
|
153
|
+
* useful for stable voiceovers, IVR prompts, and e-learning.
|
|
154
|
+
*/
|
|
155
|
+
temperature?: number;
|
|
73
156
|
/** Maximum tokens to generate (default: 2048) */
|
|
74
157
|
maxNewTokens?: number;
|
|
75
158
|
/** Output sample rate (default: 24000) */
|
|
@@ -89,7 +172,8 @@ interface GenerateOptions {
|
|
|
89
172
|
* (adds ~150ms latency).
|
|
90
173
|
*
|
|
91
174
|
* Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
|
|
92
|
-
* el, uk, bg, tr, vi, ar, hi, zh, ja, ko
|
|
175
|
+
* el, uk, bg, tr, vi, ar, hi, zh, ja, ko, sk, sl, hr, sr, ru,
|
|
176
|
+
* he, fa, ur, bn, ta, yue, th, id, ms
|
|
93
177
|
*/
|
|
94
178
|
language?: string;
|
|
95
179
|
/**
|
|
@@ -98,15 +182,51 @@ interface GenerateOptions {
|
|
|
98
182
|
* Default: false
|
|
99
183
|
*/
|
|
100
184
|
wordTimestamps?: boolean;
|
|
185
|
+
/**
|
|
186
|
+
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
187
|
+
*
|
|
188
|
+
* Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
|
|
189
|
+
* can also be used for per-segment speed control.
|
|
190
|
+
* Range: [0.8, 1.2]. Default: 1.0.
|
|
191
|
+
*/
|
|
192
|
+
speed?: number;
|
|
193
|
+
/**
|
|
194
|
+
* Optional project ID for project-scoped features (custom dictionary
|
|
195
|
+
* replacements, per-project rate limits). The caller MUST verify the
|
|
196
|
+
* authenticated user has access to this project before passing it; the
|
|
197
|
+
* server treats the value as trusted once received.
|
|
198
|
+
*/
|
|
199
|
+
projectId?: number;
|
|
101
200
|
}
|
|
102
201
|
/**
|
|
103
|
-
* Streaming session configuration
|
|
202
|
+
* Streaming session configuration for `/ws/tts/stream`.
|
|
203
|
+
*
|
|
204
|
+
* The server accumulates LLM tokens internally and starts generation at natural
|
|
205
|
+
* sentence boundaries. Use {@link chunkLengthSchedule} to tune how eagerly the
|
|
206
|
+
* server begins generating, or set {@link autoMode} to start at the very first
|
|
207
|
+
* clean boundary — equivalent to ElevenLabs' `auto_mode=true`.
|
|
208
|
+
*
|
|
209
|
+
* @example Low-latency preset
|
|
210
|
+
* ```typescript
|
|
211
|
+
* const session = client.tts.streamingSession({
|
|
212
|
+
* voiceId: 123,
|
|
213
|
+
* autoMode: true,
|
|
214
|
+
* chunkLengthSchedule: [50, 100, 150, 250],
|
|
215
|
+
* }, { onChunk: (chunk) => playAudio(chunk.audio) });
|
|
216
|
+
* ```
|
|
104
217
|
*/
|
|
105
218
|
interface StreamConfig {
|
|
106
219
|
/** Voice ID to use */
|
|
107
220
|
voiceId?: number;
|
|
221
|
+
/** Model ID ('kugel-1-turbo' or 'kugel-1'). Default: 'kugel-1-turbo' */
|
|
222
|
+
modelId?: string;
|
|
108
223
|
/** CFG scale for generation */
|
|
109
224
|
cfgScale?: number;
|
|
225
|
+
/**
|
|
226
|
+
* Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
|
|
227
|
+
* Default: 0.5.
|
|
228
|
+
*/
|
|
229
|
+
temperature?: number;
|
|
110
230
|
/** Maximum tokens per generation */
|
|
111
231
|
maxNewTokens?: number;
|
|
112
232
|
/** Output sample rate */
|
|
@@ -130,6 +250,63 @@ interface StreamConfig {
|
|
|
130
250
|
* Default: false
|
|
131
251
|
*/
|
|
132
252
|
wordTimestamps?: boolean;
|
|
253
|
+
/**
|
|
254
|
+
* Minimum buffer sizes (in characters) the server must accumulate before
|
|
255
|
+
* auto-emitting each successive chunk. Entry `i` applies to chunk `i`; the
|
|
256
|
+
* last value is reused for all subsequent chunks.
|
|
257
|
+
*
|
|
258
|
+
* Smaller values produce lower TTFA at the cost of less prosody context.
|
|
259
|
+
* Larger values improve naturalness but increase TTFA.
|
|
260
|
+
*
|
|
261
|
+
* @example
|
|
262
|
+
* ```typescript
|
|
263
|
+
* chunkLengthSchedule: [50, 100, 150, 250] // low-latency
|
|
264
|
+
* chunkLengthSchedule: [120, 200, 300] // high-quality prosody
|
|
265
|
+
* ```
|
|
266
|
+
*/
|
|
267
|
+
chunkLengthSchedule?: number[];
|
|
268
|
+
/**
|
|
269
|
+
* When `true`, the server starts generating audio at the very first clean
|
|
270
|
+
* sentence boundary, regardless of `chunkLengthSchedule`. Equivalent to
|
|
271
|
+
* ElevenLabs' `auto_mode=true`. Prioritises low TTFA; may produce slightly
|
|
272
|
+
* less natural prosody on the first chunk.
|
|
273
|
+
*/
|
|
274
|
+
autoMode?: boolean;
|
|
275
|
+
/**
|
|
276
|
+
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
277
|
+
*
|
|
278
|
+
* Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
|
|
279
|
+
* can also be used for per-segment speed control.
|
|
280
|
+
* Range: [0.8, 1.2]. Default: 1.0.
|
|
281
|
+
*/
|
|
282
|
+
speed?: number;
|
|
283
|
+
}
|
|
284
|
+
/**
|
|
285
|
+
* Event callbacks for a streaming session (`/ws/tts/stream`).
|
|
286
|
+
*
|
|
287
|
+
* This is the LLM-integration endpoint: forward raw tokens via
|
|
288
|
+
* {@link StreamingSession.send} and the server auto-chunks them at sentence
|
|
289
|
+
* boundaries.
|
|
290
|
+
*/
|
|
291
|
+
interface StreamingSessionCallbacks {
|
|
292
|
+
/** Called when an audio chunk arrives for any segment. */
|
|
293
|
+
onChunk?: (chunk: AudioChunk) => void;
|
|
294
|
+
/**
|
|
295
|
+
* Called when all audio for one flushed text segment is complete.
|
|
296
|
+
* Carries the segment index, total audio duration, and generation time.
|
|
297
|
+
*/
|
|
298
|
+
onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
|
|
299
|
+
/**
|
|
300
|
+
* Called when the session is fully closed (after `session.close()`).
|
|
301
|
+
* Equivalent to `onFinal` on the one-shot endpoint.
|
|
302
|
+
*/
|
|
303
|
+
onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
|
|
304
|
+
/** Called when the server begins generating audio for a text segment. */
|
|
305
|
+
onGenerationStarted?: (chunkId: number, text: string) => void;
|
|
306
|
+
/** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
|
|
307
|
+
onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
|
|
308
|
+
/** Called on any error. */
|
|
309
|
+
onError?: (error: Error) => void;
|
|
133
310
|
}
|
|
134
311
|
/**
|
|
135
312
|
* Audio chunk from streaming TTS.
|
|
@@ -160,8 +337,6 @@ interface GenerationStats {
|
|
|
160
337
|
durationMs: number;
|
|
161
338
|
/** Generation time in milliseconds */
|
|
162
339
|
generationMs: number;
|
|
163
|
-
/** Time to first audio in milliseconds */
|
|
164
|
-
ttfaMs: number | null;
|
|
165
340
|
/** Real-time factor */
|
|
166
341
|
rtf: number;
|
|
167
342
|
/** Error message if any */
|
|
@@ -203,11 +378,18 @@ interface StreamCallbacks {
|
|
|
203
378
|
/** Called when connection closes */
|
|
204
379
|
onClose?: () => void;
|
|
205
380
|
}
|
|
381
|
+
/**
|
|
382
|
+
* Deployment region. Controls which API endpoint the SDK connects to.
|
|
383
|
+
* - `'eu'` — `api.kugelaudio.com` (default)
|
|
384
|
+
* - `'us'` — `us-api.kugelaudio.com`
|
|
385
|
+
* - `'global'` — `global-api.kugelaudio.com` (geo-routed)
|
|
386
|
+
*/
|
|
387
|
+
type Region = 'eu' | 'us' | 'global';
|
|
206
388
|
/**
|
|
207
389
|
* KugelAudio client options.
|
|
208
390
|
*/
|
|
209
391
|
interface KugelAudioOptions {
|
|
210
|
-
/** Your KugelAudio API key or JWT token */
|
|
392
|
+
/** Your KugelAudio API key or JWT token. Can be prefixed with `eu-`, `us-`, or `global-` to select a region (prefix is stripped before auth). */
|
|
211
393
|
apiKey: string;
|
|
212
394
|
/** Whether apiKey is a master key (for internal/server-side use). Master keys bypass billing. */
|
|
213
395
|
isMasterKey?: boolean;
|
|
@@ -215,12 +397,20 @@ interface KugelAudioOptions {
|
|
|
215
397
|
isToken?: boolean;
|
|
216
398
|
/** Organisation ID to bill usage against (required for token auth to enable usage recording). */
|
|
217
399
|
orgId?: number;
|
|
400
|
+
/** Deployment region. Takes precedence over API-key prefix but not over `apiUrl`. */
|
|
401
|
+
region?: Region;
|
|
218
402
|
/** API base URL (default: https://api.kugelaudio.com) */
|
|
219
403
|
apiUrl?: string;
|
|
220
404
|
/** TTS server URL (default: same as apiUrl) */
|
|
221
405
|
ttsUrl?: string;
|
|
222
406
|
/** Request timeout in milliseconds (default: 60000) */
|
|
223
407
|
timeout?: number;
|
|
408
|
+
/**
|
|
409
|
+
* Interval in milliseconds between WebSocket ping frames sent on the pooled connection
|
|
410
|
+
* to prevent idle timeouts (default: 20000). Set to 0 or null to disable.
|
|
411
|
+
* Pings are sent only when the `ws` package is in use (Node.js); they are skipped in native WebSocket environments such as browsers.
|
|
412
|
+
*/
|
|
413
|
+
keepalivePingInterval?: number | null;
|
|
224
414
|
}
|
|
225
415
|
/**
|
|
226
416
|
* Multi-context session configuration.
|
|
@@ -232,10 +422,21 @@ interface MultiContextConfig {
|
|
|
232
422
|
sampleRate?: number;
|
|
233
423
|
/** CFG scale for generation (default: 2.0) */
|
|
234
424
|
cfgScale?: number;
|
|
425
|
+
/**
|
|
426
|
+
* Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
|
|
427
|
+
* Default: 0.5.
|
|
428
|
+
*/
|
|
429
|
+
temperature?: number;
|
|
235
430
|
/** Maximum tokens to generate (default: 2048) */
|
|
236
431
|
maxNewTokens?: number;
|
|
237
432
|
/** Enable text normalization (default: true) */
|
|
238
433
|
normalize?: boolean;
|
|
434
|
+
/**
|
|
435
|
+
* ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
|
|
436
|
+
* If not set and normalize is true (default), the server auto-detects
|
|
437
|
+
* the language, which adds ~60-150ms to time-to-first-audio.
|
|
438
|
+
*/
|
|
439
|
+
language?: string;
|
|
239
440
|
/** Seconds before context auto-closes (default: 20.0) */
|
|
240
441
|
inactivityTimeout?: number;
|
|
241
442
|
}
|
|
@@ -271,8 +472,6 @@ interface MultiContextCallbacks {
|
|
|
271
472
|
onContextCreated?: (contextId: string) => void;
|
|
272
473
|
/** Called when an audio chunk is received */
|
|
273
474
|
onChunk?: (chunk: MultiContextAudioChunk) => void;
|
|
274
|
-
/** Called when a context finishes generating */
|
|
275
|
-
onContextFinal?: (contextId: string) => void;
|
|
276
475
|
/** Called when a context is closed */
|
|
277
476
|
onContextClosed?: (contextId: string) => void;
|
|
278
477
|
/** Called when a context times out */
|
|
@@ -307,11 +506,51 @@ declare class VoicesResource {
|
|
|
307
506
|
language?: string;
|
|
308
507
|
includePublic?: boolean;
|
|
309
508
|
limit?: number;
|
|
310
|
-
|
|
509
|
+
offset?: number;
|
|
510
|
+
}): Promise<VoiceListResponse>;
|
|
311
511
|
/**
|
|
312
512
|
* Get a specific voice by ID.
|
|
313
513
|
*/
|
|
314
|
-
get(voiceId: number): Promise<
|
|
514
|
+
get(voiceId: number): Promise<VoiceDetail>;
|
|
515
|
+
/**
|
|
516
|
+
* Create a new voice.
|
|
517
|
+
*/
|
|
518
|
+
create(options: CreateVoiceOptions): Promise<VoiceDetail>;
|
|
519
|
+
/**
|
|
520
|
+
* Update an existing voice. Only provided fields are updated.
|
|
521
|
+
*/
|
|
522
|
+
update(voiceId: number, options: UpdateVoiceOptions): Promise<VoiceDetail>;
|
|
523
|
+
/**
|
|
524
|
+
* Delete a voice.
|
|
525
|
+
*/
|
|
526
|
+
delete(voiceId: number): Promise<void>;
|
|
527
|
+
/**
|
|
528
|
+
* List reference audio files for a voice.
|
|
529
|
+
*/
|
|
530
|
+
listReferences(voiceId: number): Promise<VoiceReference[]>;
|
|
531
|
+
/**
|
|
532
|
+
* Upload a reference audio file to a voice.
|
|
533
|
+
*
|
|
534
|
+
* @param voiceId - Voice ID
|
|
535
|
+
* @param file - Audio file (File in browser, Blob in Node.js)
|
|
536
|
+
* @param referenceText - Optional transcript of the reference audio
|
|
537
|
+
*/
|
|
538
|
+
addReference(voiceId: number, file: File | Blob, referenceText?: string): Promise<VoiceReference>;
|
|
539
|
+
/**
|
|
540
|
+
* Delete a reference audio file from a voice.
|
|
541
|
+
*/
|
|
542
|
+
deleteReference(voiceId: number, referenceId: number): Promise<void>;
|
|
543
|
+
/**
|
|
544
|
+
* Request publication of a voice. Sets it as public and marks it
|
|
545
|
+
* as pending verification by an admin.
|
|
546
|
+
*/
|
|
547
|
+
publish(voiceId: number): Promise<VoiceDetail>;
|
|
548
|
+
/**
|
|
549
|
+
* Trigger sample audio generation for a voice.
|
|
550
|
+
*/
|
|
551
|
+
generateSample(voiceId: number): Promise<VoiceDetail>;
|
|
552
|
+
private mapVoiceDetail;
|
|
553
|
+
private mapVoiceReference;
|
|
315
554
|
}
|
|
316
555
|
/**
|
|
317
556
|
* TTS resource for text-to-speech generation.
|
|
@@ -322,6 +561,7 @@ declare class TTSResource {
|
|
|
322
561
|
private wsUrl;
|
|
323
562
|
private pendingRequests;
|
|
324
563
|
private requestCounter;
|
|
564
|
+
private keepaliveTimer;
|
|
325
565
|
constructor(client: KugelAudio);
|
|
326
566
|
/**
|
|
327
567
|
* Pre-establish WebSocket connection for faster first request.
|
|
@@ -350,6 +590,40 @@ declare class TTSResource {
|
|
|
350
590
|
* Returns complete audio after all chunks are received.
|
|
351
591
|
*/
|
|
352
592
|
generate(options: GenerateOptions): Promise<AudioResponse>;
|
|
593
|
+
/**
|
|
594
|
+
* Stream audio and return a Node.js Readable stream of raw PCM16 binary data.
|
|
595
|
+
*
|
|
596
|
+
* **Node.js only** — this method requires the `stream` built-in module and is
|
|
597
|
+
* intended for server-side integrations such as Vapi custom TTS endpoints,
|
|
598
|
+
* Express/Fastify handlers, or any pipeline that expects a Node.js `Readable`.
|
|
599
|
+
*
|
|
600
|
+
* Compared to manually wiring `onChunk` to a `Readable`, this method avoids
|
|
601
|
+
* a common race condition: the stream object is created and returned **before**
|
|
602
|
+
* any chunks arrive, so the caller can safely pipe or attach listeners before
|
|
603
|
+
* the first audio byte is pushed.
|
|
604
|
+
*
|
|
605
|
+
* @example Vapi custom TTS endpoint
|
|
606
|
+
* ```typescript
|
|
607
|
+
* app.post('/synthesize', (req, res) => {
|
|
608
|
+
* res.setHeader('Content-Type', 'audio/pcm');
|
|
609
|
+
* res.setHeader('Transfer-Encoding', 'chunked');
|
|
610
|
+
*
|
|
611
|
+
* const readable = client.tts.toReadable({
|
|
612
|
+
* text: req.body.message.text,
|
|
613
|
+
* modelId: 'kugel-1-turbo',
|
|
614
|
+
* sampleRate: req.body.message.sampleRate,
|
|
615
|
+
* language: 'en',
|
|
616
|
+
* });
|
|
617
|
+
*
|
|
618
|
+
* readable.pipe(res);
|
|
619
|
+
* });
|
|
620
|
+
* ```
|
|
621
|
+
*
|
|
622
|
+
* @param options - TTS generation options (same as `stream()`)
|
|
623
|
+
* @param reuseConnection - Reuse the pooled WebSocket connection (default: true)
|
|
624
|
+
* @returns Node.js Readable stream emitting raw PCM16 binary Buffer chunks
|
|
625
|
+
*/
|
|
626
|
+
toReadable(options: GenerateOptions, reuseConnection?: boolean): any;
|
|
353
627
|
/**
|
|
354
628
|
* Build the WebSocket URL with appropriate auth param.
|
|
355
629
|
*/
|
|
@@ -380,11 +654,47 @@ declare class TTSResource {
|
|
|
380
654
|
* Stream without connection pooling (original behavior).
|
|
381
655
|
*/
|
|
382
656
|
private streamWithoutPooling;
|
|
657
|
+
/**
|
|
658
|
+
* Start periodic keepalive pings on the pooled connection.
|
|
659
|
+
* Uses the ws package's ping() in Node.js; silently skips in browsers
|
|
660
|
+
* where WebSocket doesn't expose a ping method.
|
|
661
|
+
*/
|
|
662
|
+
private startKeepalive;
|
|
663
|
+
private stopKeepalive;
|
|
383
664
|
/**
|
|
384
665
|
* Close the pooled WebSocket connection.
|
|
385
666
|
*/
|
|
386
667
|
close(): void;
|
|
387
668
|
private parseError;
|
|
669
|
+
/**
|
|
670
|
+
* Create a streaming session for LLM integration.
|
|
671
|
+
*
|
|
672
|
+
* The session connects to `/ws/tts/stream` and keeps a persistent
|
|
673
|
+
* connection across multiple {@link StreamingSession.send} calls.
|
|
674
|
+
* The server auto-chunks text at sentence boundaries — no client-side
|
|
675
|
+
* flushing required.
|
|
676
|
+
*
|
|
677
|
+
* @param config - Session configuration (voice, model, chunking strategy).
|
|
678
|
+
* @param callbacks - Callbacks for audio chunks and session lifecycle events.
|
|
679
|
+
* @returns A {@link StreamingSession} instance. Call `.connect()` before sending.
|
|
680
|
+
*
|
|
681
|
+
* @example
|
|
682
|
+
* ```typescript
|
|
683
|
+
* const session = client.tts.streamingSession(
|
|
684
|
+
* { voiceId: 123, autoMode: true, chunkLengthSchedule: [50, 100, 150, 250] },
|
|
685
|
+
* { onChunk: (chunk) => playAudio(chunk.audio) },
|
|
686
|
+
* );
|
|
687
|
+
*
|
|
688
|
+
* await session.connect();
|
|
689
|
+
*
|
|
690
|
+
* for await (const token of llmStream) {
|
|
691
|
+
* session.send(token);
|
|
692
|
+
* }
|
|
693
|
+
*
|
|
694
|
+
* await session.close();
|
|
695
|
+
* ```
|
|
696
|
+
*/
|
|
697
|
+
streamingSession(config: StreamConfig, callbacks: StreamingSessionCallbacks): StreamingSession;
|
|
388
698
|
/**
|
|
389
699
|
* Create a multi-context session for concurrent TTS streams.
|
|
390
700
|
*
|
|
@@ -403,7 +713,7 @@ declare class TTSResource {
|
|
|
403
713
|
* console.log(`Audio from ${chunk.contextId}`);
|
|
404
714
|
* playAudio(chunk.audio);
|
|
405
715
|
* },
|
|
406
|
-
*
|
|
716
|
+
* onContextClosed: (contextId) => {
|
|
407
717
|
* console.log(`${contextId} finished`);
|
|
408
718
|
* },
|
|
409
719
|
* });
|
|
@@ -440,8 +750,13 @@ declare class MultiContextSession {
|
|
|
440
750
|
get sessionId(): string | null;
|
|
441
751
|
/**
|
|
442
752
|
* Connect to the multi-context WebSocket endpoint.
|
|
753
|
+
*
|
|
754
|
+
* The returned promise resolves once the WebSocket is OPEN so callers can
|
|
755
|
+
* ``await session.connect(callbacks)`` before invoking
|
|
756
|
+
* {@link createContext} / {@link send}. Pre-open errors reject with the
|
|
757
|
+
* typed error.
|
|
443
758
|
*/
|
|
444
|
-
connect(callbacks: MultiContextCallbacks): void
|
|
759
|
+
connect(callbacks: MultiContextCallbacks): Promise<void>;
|
|
445
760
|
/**
|
|
446
761
|
* Create a new context with optional voice settings.
|
|
447
762
|
*/
|
|
@@ -478,6 +793,103 @@ declare class MultiContextSession {
|
|
|
478
793
|
*/
|
|
479
794
|
get isConnected(): boolean;
|
|
480
795
|
}
|
|
796
|
+
/**
|
|
797
|
+
* Streaming session for LLM integration via `/ws/tts/stream`.
|
|
798
|
+
*
|
|
799
|
+
* The server accumulates text across multiple {@link send} calls and
|
|
800
|
+
* auto-chunks it at sentence boundaries, keeping the KV cache warm between
|
|
801
|
+
* chunks for natural prosody. You never need to call `flush` explicitly —
|
|
802
|
+
* configure {@link StreamConfig.chunkLengthSchedule} or
|
|
803
|
+
* {@link StreamConfig.autoMode} instead.
|
|
804
|
+
*
|
|
805
|
+
* @example
|
|
806
|
+
* ```typescript
|
|
807
|
+
* const session = client.tts.streamingSession({
|
|
808
|
+
* voiceId: 123,
|
|
809
|
+
* autoMode: true,
|
|
810
|
+
* chunkLengthSchedule: [50, 100, 150, 250],
|
|
811
|
+
* }, {
|
|
812
|
+
* onChunk: (chunk) => playAudio(chunk.audio),
|
|
813
|
+
* onSessionClosed: (totalSecs) => console.log(`Done: ${totalSecs}s`),
|
|
814
|
+
* });
|
|
815
|
+
*
|
|
816
|
+
* await session.connect();
|
|
817
|
+
*
|
|
818
|
+
* for await (const token of llmStream) {
|
|
819
|
+
* session.send(token);
|
|
820
|
+
* }
|
|
821
|
+
*
|
|
822
|
+
* await session.close();
|
|
823
|
+
* ```
|
|
824
|
+
*/
|
|
825
|
+
declare class StreamingSession {
|
|
826
|
+
private ws;
|
|
827
|
+
private config;
|
|
828
|
+
private callbacks;
|
|
829
|
+
private client;
|
|
830
|
+
private configSent;
|
|
831
|
+
constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks);
|
|
832
|
+
/**
|
|
833
|
+
* Open the WebSocket connection and authenticate.
|
|
834
|
+
*
|
|
835
|
+
* The returned promise resolves once the WebSocket is OPEN, so callers can
|
|
836
|
+
* ``await session.connect()`` and then ``send()`` without racing the
|
|
837
|
+
* handshake. Pre-open errors (network failure, 4001 unauthorized, …) reject
|
|
838
|
+
* the promise with the typed error.
|
|
839
|
+
*/
|
|
840
|
+
connect(): Promise<void>;
|
|
841
|
+
/**
|
|
842
|
+
* Send a text chunk to the server (e.g. one LLM output token).
|
|
843
|
+
*
|
|
844
|
+
* The server buffers text across multiple calls and starts generating at
|
|
845
|
+
* natural sentence boundaries automatically — no need to call `flush`.
|
|
846
|
+
*
|
|
847
|
+
* @param text - Raw text or LLM token to append to the server buffer.
|
|
848
|
+
* @param flush - Force immediate generation of whatever is buffered.
|
|
849
|
+
* **Avoid calling this per-sentence from the client.** Doing so bypasses
|
|
850
|
+
* the server's semantic chunking, incurs a fresh model prefill cost on
|
|
851
|
+
* every flush, and makes latency *worse*, not better. Let the server
|
|
852
|
+
* handle chunking via `chunkLengthSchedule` / `autoMode` instead.
|
|
853
|
+
*/
|
|
854
|
+
send(text: string, flush?: boolean): void;
|
|
855
|
+
/**
|
|
856
|
+
* End the current session but keep the WebSocket connection open.
|
|
857
|
+
*
|
|
858
|
+
* This allows starting a new session on the same connection, avoiding
|
|
859
|
+
* the overhead of a new WebSocket handshake (~200-300ms). After calling
|
|
860
|
+
* this, optionally call {@link updateConfig} to change voice/model settings,
|
|
861
|
+
* then call {@link send} to start the next session.
|
|
862
|
+
*
|
|
863
|
+
* The returned promise resolves once the server confirms with a
|
|
864
|
+
* `session_closed` message, or after a 15 s **quiet** timeout — i.e. 15 s
|
|
865
|
+
* elapse without *any* server message arriving. The timer resets on every
|
|
866
|
+
* incoming frame so a long final flush that streams audio for tens of
|
|
867
|
+
* seconds is not truncated; only a genuinely silent server trips the fuse.
|
|
868
|
+
*/
|
|
869
|
+
endSession(): Promise<void>;
|
|
870
|
+
/**
|
|
871
|
+
* Update session configuration for the next session.
|
|
872
|
+
*
|
|
873
|
+
* Call this after {@link endSession} and before the next {@link send}
|
|
874
|
+
* to change voice, model, language, or other settings.
|
|
875
|
+
*/
|
|
876
|
+
updateConfig(config: Partial<StreamConfig>): void;
|
|
877
|
+
/**
|
|
878
|
+
* Close the session and the WebSocket connection.
|
|
879
|
+
*
|
|
880
|
+
* For session reuse without closing the connection, use
|
|
881
|
+
* {@link endSession} instead.
|
|
882
|
+
*
|
|
883
|
+
* The returned promise resolves once the server confirms the close with a
|
|
884
|
+
* `session_closed` message, or after a 15 s **quiet** timeout (no traffic
|
|
885
|
+
* from the server in that window). Audio frames from the server-side
|
|
886
|
+
* final-flush of the still-buffered text are delivered to your callbacks
|
|
887
|
+
* before this promise resolves, and each frame resets the quiet timer.
|
|
888
|
+
*/
|
|
889
|
+
close(): Promise<void>;
|
|
890
|
+
/** Whether the underlying WebSocket is open. */
|
|
891
|
+
get isConnected(): boolean;
|
|
892
|
+
}
|
|
481
893
|
/**
|
|
482
894
|
* KugelAudio API client.
|
|
483
895
|
*
|
|
@@ -491,13 +903,13 @@ declare class MultiContextSession {
|
|
|
491
903
|
* // List voices
|
|
492
904
|
* const voices = await client.voices.list();
|
|
493
905
|
*
|
|
494
|
-
* // Generate audio with fast model
|
|
906
|
+
* // Generate audio with fast model
|
|
495
907
|
* const audio = await client.tts.generate({
|
|
496
908
|
* text: 'Hello, world!',
|
|
497
909
|
* modelId: 'kugel-1-turbo',
|
|
498
910
|
* });
|
|
499
911
|
*
|
|
500
|
-
* // Generate audio with premium model
|
|
912
|
+
* // Generate audio with premium model
|
|
501
913
|
* const audio = await client.tts.generate({
|
|
502
914
|
* text: 'Hello, world!',
|
|
503
915
|
* modelId: 'kugel-1',
|
|
@@ -512,6 +924,7 @@ declare class KugelAudio {
|
|
|
512
924
|
private _apiUrl;
|
|
513
925
|
private _ttsUrl;
|
|
514
926
|
private _timeout;
|
|
927
|
+
private _keepalivePingInterval;
|
|
515
928
|
/** Models resource */
|
|
516
929
|
readonly models: ModelsResource;
|
|
517
930
|
/** Voices resource */
|
|
@@ -546,6 +959,8 @@ declare class KugelAudio {
|
|
|
546
959
|
get orgId(): number | undefined;
|
|
547
960
|
/** Get TTS URL */
|
|
548
961
|
get ttsUrl(): string;
|
|
962
|
+
/** Get keepalive ping interval in milliseconds, or null if disabled. */
|
|
963
|
+
get keepalivePingInterval(): number | null;
|
|
549
964
|
/**
|
|
550
965
|
* Close the client and release resources.
|
|
551
966
|
* This closes any pooled WebSocket connections.
|
|
@@ -578,48 +993,125 @@ declare class KugelAudio {
|
|
|
578
993
|
* @internal
|
|
579
994
|
*/
|
|
580
995
|
request<T>(method: string, path: string, body?: unknown): Promise<T>;
|
|
996
|
+
/**
|
|
997
|
+
* Make a multipart/form-data request (for file uploads).
|
|
998
|
+
* @internal Used by VoicesResource for reference file uploads.
|
|
999
|
+
*/
|
|
1000
|
+
requestMultipart<T>(method: string, path: string, formData: FormData): Promise<T>;
|
|
581
1001
|
}
|
|
582
1002
|
|
|
583
1003
|
/**
|
|
584
1004
|
* Custom errors for KugelAudio SDK.
|
|
1005
|
+
*
|
|
1006
|
+
* All SDK errors inherit from {@link KugelAudioError}. Specific subclasses
|
|
1007
|
+
* map to the server's `error_code` field (see the server-side `ErrorCode`
|
|
1008
|
+
* enum at `tts/src/serving/deployments/errors.py`) so callers can
|
|
1009
|
+
* `instanceof AuthenticationError` without matching on message text.
|
|
585
1010
|
*/
|
|
1011
|
+
declare const ErrorCodes: {
|
|
1012
|
+
readonly UNAUTHORIZED: "UNAUTHORIZED";
|
|
1013
|
+
readonly RATE_LIMITED: "RATE_LIMITED";
|
|
1014
|
+
readonly INSUFFICIENT_CREDITS: "INSUFFICIENT_CREDITS";
|
|
1015
|
+
readonly MODEL_UNAVAILABLE: "MODEL_UNAVAILABLE";
|
|
1016
|
+
readonly EMPTY_AUDIO: "EMPTY_AUDIO";
|
|
1017
|
+
readonly VALIDATION: "VALIDATION_ERROR";
|
|
1018
|
+
readonly INTERNAL: "INTERNAL_ERROR";
|
|
1019
|
+
readonly NOT_FOUND: "NOT_FOUND";
|
|
1020
|
+
};
|
|
1021
|
+
type ErrorCode = typeof ErrorCodes[keyof typeof ErrorCodes];
|
|
1022
|
+
declare const WsCloseCodes: {
|
|
1023
|
+
readonly UNAUTHORIZED: 4001;
|
|
1024
|
+
readonly INSUFFICIENT_CREDITS: 4003;
|
|
1025
|
+
readonly RATE_LIMITED: 4029;
|
|
1026
|
+
readonly MODEL_UNAVAILABLE: 4500;
|
|
1027
|
+
};
|
|
1028
|
+
interface KugelAudioErrorOptions {
|
|
1029
|
+
statusCode?: number;
|
|
1030
|
+
errorCode?: string;
|
|
1031
|
+
requestId?: string;
|
|
1032
|
+
retryAfter?: number;
|
|
1033
|
+
cause?: unknown;
|
|
1034
|
+
}
|
|
586
1035
|
/**
|
|
587
1036
|
* Base error class for KugelAudio SDK.
|
|
588
1037
|
*/
|
|
589
1038
|
declare class KugelAudioError extends Error {
|
|
590
1039
|
readonly statusCode?: number;
|
|
591
|
-
|
|
1040
|
+
readonly errorCode?: string;
|
|
1041
|
+
readonly requestId?: string;
|
|
1042
|
+
readonly retryAfter?: number;
|
|
1043
|
+
constructor(message: string, options?: KugelAudioErrorOptions);
|
|
592
1044
|
}
|
|
593
1045
|
/**
|
|
594
|
-
*
|
|
1046
|
+
* API key was missing, malformed, or rejected by the server.
|
|
595
1047
|
*/
|
|
596
1048
|
declare class AuthenticationError extends KugelAudioError {
|
|
597
|
-
constructor(message?: string);
|
|
1049
|
+
constructor(message?: string, options?: KugelAudioErrorOptions);
|
|
598
1050
|
}
|
|
599
1051
|
/**
|
|
600
|
-
*
|
|
1052
|
+
* Request was rejected by the per-org rate limiter.
|
|
601
1053
|
*/
|
|
602
1054
|
declare class RateLimitError extends KugelAudioError {
|
|
603
|
-
constructor(message?: string);
|
|
1055
|
+
constructor(message?: string, options?: KugelAudioErrorOptions);
|
|
604
1056
|
}
|
|
605
1057
|
/**
|
|
606
|
-
*
|
|
1058
|
+
* Account is out of TTS credits.
|
|
607
1059
|
*/
|
|
608
1060
|
declare class InsufficientCreditsError extends KugelAudioError {
|
|
609
|
-
constructor(message?: string);
|
|
1061
|
+
constructor(message?: string, options?: KugelAudioErrorOptions);
|
|
610
1062
|
}
|
|
611
1063
|
/**
|
|
612
|
-
*
|
|
1064
|
+
* Request was rejected as invalid (bad params, missing fields, etc.).
|
|
613
1065
|
*/
|
|
614
1066
|
declare class ValidationError extends KugelAudioError {
|
|
615
|
-
constructor(message: string);
|
|
1067
|
+
constructor(message: string, options?: KugelAudioErrorOptions);
|
|
616
1068
|
}
|
|
617
1069
|
/**
|
|
618
|
-
*
|
|
1070
|
+
* The SDK could not reach KugelAudio (network error, server down,
|
|
1071
|
+
* or model deployment temporarily unavailable).
|
|
619
1072
|
*/
|
|
620
1073
|
declare class ConnectionError extends KugelAudioError {
|
|
621
|
-
constructor(message?:
|
|
1074
|
+
constructor(message: string, options?: KugelAudioErrorOptions);
|
|
1075
|
+
}
|
|
1076
|
+
interface HttpResponseLike {
|
|
1077
|
+
status: number;
|
|
1078
|
+
headers: {
|
|
1079
|
+
get(name: string): string | null;
|
|
1080
|
+
} | Record<string, string | undefined>;
|
|
1081
|
+
text?: () => Promise<string>;
|
|
622
1082
|
}
|
|
1083
|
+
/**
|
|
1084
|
+
* Build the appropriate `KugelAudioError` from an HTTP response body that
|
|
1085
|
+
* was already parsed. `bodyText` is the raw text fallback.
|
|
1086
|
+
*/
|
|
1087
|
+
declare function classifyHttpError(status: number, bodyText: string, headers: HttpResponseLike['headers']): KugelAudioError;
|
|
1088
|
+
/**
|
|
1089
|
+
* Build a `KugelAudioError` from a server-sent WebSocket error frame
|
|
1090
|
+
* (`{error, error_code, retry_after}`).
|
|
1091
|
+
*/
|
|
1092
|
+
declare function classifyWsFrame(data: {
|
|
1093
|
+
error?: string;
|
|
1094
|
+
error_code?: string;
|
|
1095
|
+
retry_after?: number;
|
|
1096
|
+
}): KugelAudioError;
|
|
1097
|
+
/**
|
|
1098
|
+
* Build a `KugelAudioError` from a WebSocket close code + reason.
|
|
1099
|
+
*/
|
|
1100
|
+
declare function classifyWsClose(code: number | undefined, reason?: string): KugelAudioError;
|
|
1101
|
+
/**
|
|
1102
|
+
* Extract the HTTP status from a `ws` package handshake-rejection error and
|
|
1103
|
+
* return a typed `KugelAudioError`. Returns `null` if the error doesn't look
|
|
1104
|
+
* like a handshake rejection (e.g. pure network failure).
|
|
1105
|
+
*
|
|
1106
|
+
* The `ws` library surfaces rejected upgrades via:
|
|
1107
|
+
* - an Error whose `.message` is `"Unexpected server response: <status>"`
|
|
1108
|
+
* - `error.code === 'EUNEXPECTEDRESPONSE'`, with `error.statusCode` on some versions
|
|
1109
|
+
*
|
|
1110
|
+
* The TTS server rejects WS upgrades with a bare API key using HTTP 403
|
|
1111
|
+
* (not 401), so we treat 403 here as an auth failure — HTTP API callers
|
|
1112
|
+
* keep the generic 403 semantics via {@link classifyHttpError}.
|
|
1113
|
+
*/
|
|
1114
|
+
declare function classifyWsHandshakeError(err: unknown): KugelAudioError | null;
|
|
623
1115
|
|
|
624
1116
|
/**
|
|
625
1117
|
* Utility functions for KugelAudio SDK.
|
|
@@ -641,4 +1133,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
|
|
|
641
1133
|
*/
|
|
642
1134
|
declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
|
|
643
1135
|
|
|
644
|
-
export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, type WordTimestamp, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
|
|
1136
|
+
export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type CreateVoiceOptions, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type Region, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16 };
|