kugelaudio 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -13
- package/dist/index.d.mts +550 -26
- package/dist/index.d.ts +550 -26
- package/dist/index.js +898 -113
- package/dist/index.mjs +892 -113
- package/package.json +9 -8
- package/src/client.test.ts +548 -0
- package/src/client.ts +921 -103
- package/src/errors.ts +266 -18
- package/src/index.ts +19 -3
- package/src/types.ts +248 -8
- package/src/websocket.ts +38 -18
package/dist/index.d.ts
CHANGED
|
@@ -41,18 +41,118 @@ interface Voice {
|
|
|
41
41
|
isPublic: boolean;
|
|
42
42
|
verified: boolean;
|
|
43
43
|
}
|
|
44
|
+
/**
|
|
45
|
+
* Paginated response from the voices list endpoint.
|
|
46
|
+
*/
|
|
47
|
+
interface VoiceListResponse {
|
|
48
|
+
voices: Voice[];
|
|
49
|
+
total: number;
|
|
50
|
+
limit: number;
|
|
51
|
+
offset: number;
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Voice quality levels.
|
|
55
|
+
*/
|
|
56
|
+
type VoiceQuality = 'low' | 'mid' | 'high';
|
|
57
|
+
/**
|
|
58
|
+
* Extended voice information returned by voice management endpoints.
|
|
59
|
+
*/
|
|
60
|
+
interface VoiceDetail {
|
|
61
|
+
id: number;
|
|
62
|
+
name: string;
|
|
63
|
+
description: string;
|
|
64
|
+
generativeVoiceDescription: string;
|
|
65
|
+
supportedLanguages: string[];
|
|
66
|
+
category: string;
|
|
67
|
+
age?: string;
|
|
68
|
+
sex?: string;
|
|
69
|
+
quality: string;
|
|
70
|
+
isPublic: boolean;
|
|
71
|
+
verified: boolean;
|
|
72
|
+
pendingVerification: boolean;
|
|
73
|
+
sampleUrl?: string;
|
|
74
|
+
avatarUrl?: string;
|
|
75
|
+
sampleText: string;
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Voice reference audio metadata.
|
|
79
|
+
*/
|
|
80
|
+
interface VoiceReference {
|
|
81
|
+
id: number;
|
|
82
|
+
voiceId: number;
|
|
83
|
+
name: string;
|
|
84
|
+
referenceText: string;
|
|
85
|
+
s3Path: string;
|
|
86
|
+
audioUrl?: string;
|
|
87
|
+
isGenerated: boolean;
|
|
88
|
+
}
|
|
89
|
+
/**
|
|
90
|
+
* Options for creating a new voice.
|
|
91
|
+
*/
|
|
92
|
+
interface CreateVoiceOptions {
|
|
93
|
+
name: string;
|
|
94
|
+
sex: string;
|
|
95
|
+
description?: string;
|
|
96
|
+
category?: string;
|
|
97
|
+
age?: string;
|
|
98
|
+
quality?: string;
|
|
99
|
+
supportedLanguages?: string[];
|
|
100
|
+
isPublic?: boolean;
|
|
101
|
+
sampleText?: string;
|
|
102
|
+
/** Reference audio files (File objects in browser, Buffer/Blob in Node.js) */
|
|
103
|
+
referenceFiles?: Array<File | Blob>;
|
|
104
|
+
}
|
|
105
|
+
/**
|
|
106
|
+
* Options for updating an existing voice.
|
|
107
|
+
*/
|
|
108
|
+
interface UpdateVoiceOptions {
|
|
109
|
+
name?: string;
|
|
110
|
+
description?: string;
|
|
111
|
+
category?: string;
|
|
112
|
+
age?: string;
|
|
113
|
+
sex?: string;
|
|
114
|
+
quality?: string;
|
|
115
|
+
supportedLanguages?: string[];
|
|
116
|
+
isPublic?: boolean;
|
|
117
|
+
sampleText?: string;
|
|
118
|
+
}
|
|
119
|
+
/**
|
|
120
|
+
* Word-level timestamp from server-side forced alignment.
|
|
121
|
+
*/
|
|
122
|
+
interface WordTimestamp {
|
|
123
|
+
/** The aligned word */
|
|
124
|
+
word: string;
|
|
125
|
+
/** Start time in milliseconds (relative to chunk/audio start) */
|
|
126
|
+
startMs: number;
|
|
127
|
+
/** End time in milliseconds (relative to chunk/audio start) */
|
|
128
|
+
endMs: number;
|
|
129
|
+
/** Start character offset in the original text */
|
|
130
|
+
charStart: number;
|
|
131
|
+
/** End character offset in the original text */
|
|
132
|
+
charEnd: number;
|
|
133
|
+
/** Alignment confidence score (0.0 - 1.0) */
|
|
134
|
+
score: number;
|
|
135
|
+
}
|
|
44
136
|
/**
|
|
45
137
|
* TTS generation request options.
|
|
46
138
|
*/
|
|
47
139
|
interface GenerateOptions {
|
|
48
140
|
/** Text to synthesize */
|
|
49
141
|
text: string;
|
|
50
|
-
/** Model to use: 'kugel-1-turbo' (
|
|
142
|
+
/** Model to use: 'kugel-1-turbo' (fast) or 'kugel-1' (premium). Default: 'kugel-1-turbo' */
|
|
51
143
|
modelId?: string;
|
|
52
144
|
/** Voice ID to use */
|
|
53
145
|
voiceId?: number;
|
|
54
146
|
/** CFG scale for generation (default: 2.0) */
|
|
55
147
|
cfgScale?: number;
|
|
148
|
+
/**
|
|
149
|
+
* Sampling variance. Range [0.0, 1.0]. 0 = most stable (near-greedy),
|
|
150
|
+
* 1 = most variance. Default: 0.5.
|
|
151
|
+
*
|
|
152
|
+
* Lower values produce more consistent reads across regenerations —
|
|
153
|
+
* useful for stable voiceovers, IVR prompts, and e-learning.
|
|
154
|
+
*/
|
|
155
|
+
temperature?: number;
|
|
56
156
|
/** Maximum tokens to generate (default: 2048) */
|
|
57
157
|
maxNewTokens?: number;
|
|
58
158
|
/** Output sample rate (default: 24000) */
|
|
@@ -72,18 +172,61 @@ interface GenerateOptions {
|
|
|
72
172
|
* (adds ~150ms latency).
|
|
73
173
|
*
|
|
74
174
|
* Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
|
|
75
|
-
* el, uk, bg, tr, vi, ar, hi, zh, ja, ko
|
|
175
|
+
* el, uk, bg, tr, vi, ar, hi, zh, ja, ko, sk, sl, hr, sr, ru,
|
|
176
|
+
* he, fa, ur, bn, ta, yue, th, id, ms
|
|
76
177
|
*/
|
|
77
178
|
language?: string;
|
|
179
|
+
/**
|
|
180
|
+
* Request word-level timestamps alongside audio.
|
|
181
|
+
* When true, the server performs forced alignment and returns per-word timing boundaries.
|
|
182
|
+
* Default: false
|
|
183
|
+
*/
|
|
184
|
+
wordTimestamps?: boolean;
|
|
185
|
+
/**
|
|
186
|
+
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
187
|
+
*
|
|
188
|
+
* Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
|
|
189
|
+
* can also be used for per-segment speed control.
|
|
190
|
+
* Range: [0.8, 1.2]. Default: 1.0.
|
|
191
|
+
*/
|
|
192
|
+
speed?: number;
|
|
193
|
+
/**
|
|
194
|
+
* Optional project ID for project-scoped features (custom dictionary
|
|
195
|
+
* replacements, per-project rate limits). The caller MUST verify the
|
|
196
|
+
* authenticated user has access to this project before passing it; the
|
|
197
|
+
* server treats the value as trusted once received.
|
|
198
|
+
*/
|
|
199
|
+
projectId?: number;
|
|
78
200
|
}
|
|
79
201
|
/**
|
|
80
|
-
* Streaming session configuration
|
|
202
|
+
* Streaming session configuration for `/ws/tts/stream`.
|
|
203
|
+
*
|
|
204
|
+
* The server accumulates LLM tokens internally and starts generation at natural
|
|
205
|
+
* sentence boundaries. Use {@link chunkLengthSchedule} to tune how eagerly the
|
|
206
|
+
* server begins generating, or set {@link autoMode} to start at the very first
|
|
207
|
+
* clean boundary — equivalent to ElevenLabs' `auto_mode=true`.
|
|
208
|
+
*
|
|
209
|
+
* @example Low-latency preset
|
|
210
|
+
* ```typescript
|
|
211
|
+
* const session = client.tts.streamingSession({
|
|
212
|
+
* voiceId: 123,
|
|
213
|
+
* autoMode: true,
|
|
214
|
+
* chunkLengthSchedule: [50, 100, 150, 250],
|
|
215
|
+
* });
|
|
216
|
+
* ```
|
|
81
217
|
*/
|
|
82
218
|
interface StreamConfig {
|
|
83
219
|
/** Voice ID to use */
|
|
84
220
|
voiceId?: number;
|
|
221
|
+
/** Model ID ('kugel-1-turbo' or 'kugel-1'). Default: 'kugel-1-turbo' */
|
|
222
|
+
modelId?: string;
|
|
85
223
|
/** CFG scale for generation */
|
|
86
224
|
cfgScale?: number;
|
|
225
|
+
/**
|
|
226
|
+
* Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
|
|
227
|
+
* Default: 0.5.
|
|
228
|
+
*/
|
|
229
|
+
temperature?: number;
|
|
87
230
|
/** Maximum tokens per generation */
|
|
88
231
|
maxNewTokens?: number;
|
|
89
232
|
/** Output sample rate */
|
|
@@ -102,6 +245,68 @@ interface StreamConfig {
|
|
|
102
245
|
* Specify to avoid ~150ms auto-detection latency.
|
|
103
246
|
*/
|
|
104
247
|
language?: string;
|
|
248
|
+
/**
|
|
249
|
+
* Request word-level timestamps alongside audio.
|
|
250
|
+
* Default: false
|
|
251
|
+
*/
|
|
252
|
+
wordTimestamps?: boolean;
|
|
253
|
+
/**
|
|
254
|
+
* Minimum buffer sizes (in characters) the server must accumulate before
|
|
255
|
+
* auto-emitting each successive chunk. Entry `i` applies to chunk `i`; the
|
|
256
|
+
* last value is reused for all subsequent chunks.
|
|
257
|
+
*
|
|
258
|
+
* Smaller values produce lower TTFA at the cost of less prosody context.
|
|
259
|
+
* Larger values improve naturalness but increase TTFA.
|
|
260
|
+
*
|
|
261
|
+
* @example
|
|
262
|
+
* ```typescript
|
|
263
|
+
* chunkLengthSchedule: [50, 100, 150, 250] // low-latency
|
|
264
|
+
* chunkLengthSchedule: [120, 200, 300] // high-quality prosody
|
|
265
|
+
* ```
|
|
266
|
+
*/
|
|
267
|
+
chunkLengthSchedule?: number[];
|
|
268
|
+
/**
|
|
269
|
+
* When `true`, the server starts generating audio at the very first clean
|
|
270
|
+
* sentence boundary, regardless of `chunkLengthSchedule`. Equivalent to
|
|
271
|
+
* ElevenLabs' `auto_mode=true`. Prioritises low TTFA; may produce slightly
|
|
272
|
+
* less natural prosody on the first chunk.
|
|
273
|
+
*/
|
|
274
|
+
autoMode?: boolean;
|
|
275
|
+
/**
|
|
276
|
+
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
277
|
+
*
|
|
278
|
+
* Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
|
|
279
|
+
* can also be used for per-segment speed control.
|
|
280
|
+
* Range: [0.8, 1.2]. Default: 1.0.
|
|
281
|
+
*/
|
|
282
|
+
speed?: number;
|
|
283
|
+
}
|
|
284
|
+
/**
|
|
285
|
+
* Event callbacks for a streaming session (`/ws/tts/stream`).
|
|
286
|
+
*
|
|
287
|
+
* This is the LLM-integration endpoint: forward raw tokens via
|
|
288
|
+
* {@link StreamingSession.send} and the server auto-chunks them at sentence
|
|
289
|
+
* boundaries.
|
|
290
|
+
*/
|
|
291
|
+
interface StreamingSessionCallbacks {
|
|
292
|
+
/** Called when an audio chunk arrives for any segment. */
|
|
293
|
+
onChunk?: (chunk: AudioChunk) => void;
|
|
294
|
+
/**
|
|
295
|
+
* Called when all audio for one flushed text segment is complete.
|
|
296
|
+
* Carries the segment index, total audio duration, and generation time.
|
|
297
|
+
*/
|
|
298
|
+
onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
|
|
299
|
+
/**
|
|
300
|
+
* Called when the session is fully closed (after `session.close()`).
|
|
301
|
+
* Equivalent to `onFinal` on the one-shot endpoint.
|
|
302
|
+
*/
|
|
303
|
+
onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
|
|
304
|
+
/** Called when the server begins generating audio for a text segment. */
|
|
305
|
+
onGenerationStarted?: (chunkId: number, text: string) => void;
|
|
306
|
+
/** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
|
|
307
|
+
onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
|
|
308
|
+
/** Called on any error. */
|
|
309
|
+
onError?: (error: Error) => void;
|
|
105
310
|
}
|
|
106
311
|
/**
|
|
107
312
|
* Audio chunk from streaming TTS.
|
|
@@ -132,8 +337,6 @@ interface GenerationStats {
|
|
|
132
337
|
durationMs: number;
|
|
133
338
|
/** Generation time in milliseconds */
|
|
134
339
|
generationMs: number;
|
|
135
|
-
/** Time to first audio in milliseconds */
|
|
136
|
-
ttfaMs: number | null;
|
|
137
340
|
/** Real-time factor */
|
|
138
341
|
rtf: number;
|
|
139
342
|
/** Error message if any */
|
|
@@ -155,6 +358,8 @@ interface AudioResponse {
|
|
|
155
358
|
generationMs: number;
|
|
156
359
|
/** Real-time factor */
|
|
157
360
|
rtf: number;
|
|
361
|
+
/** Per-word timing boundaries (populated when `wordTimestamps: true`) */
|
|
362
|
+
wordTimestamps: WordTimestamp[];
|
|
158
363
|
}
|
|
159
364
|
/**
|
|
160
365
|
* Event callbacks for streaming.
|
|
@@ -162,6 +367,8 @@ interface AudioResponse {
|
|
|
162
367
|
interface StreamCallbacks {
|
|
163
368
|
/** Called when an audio chunk is received */
|
|
164
369
|
onChunk?: (chunk: AudioChunk) => void;
|
|
370
|
+
/** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
|
|
371
|
+
onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
|
|
165
372
|
/** Called when generation is complete */
|
|
166
373
|
onFinal?: (stats: GenerationStats) => void;
|
|
167
374
|
/** Called on error */
|
|
@@ -171,11 +378,18 @@ interface StreamCallbacks {
|
|
|
171
378
|
/** Called when connection closes */
|
|
172
379
|
onClose?: () => void;
|
|
173
380
|
}
|
|
381
|
+
/**
|
|
382
|
+
* Deployment region. Controls which API endpoint the SDK connects to.
|
|
383
|
+
* - `'eu'` — `api.kugelaudio.com` (default)
|
|
384
|
+
* - `'us'` — `us-api.kugelaudio.com`
|
|
385
|
+
* - `'global'` — `global-api.kugelaudio.com` (geo-routed)
|
|
386
|
+
*/
|
|
387
|
+
type Region = 'eu' | 'us' | 'global';
|
|
174
388
|
/**
|
|
175
389
|
* KugelAudio client options.
|
|
176
390
|
*/
|
|
177
391
|
interface KugelAudioOptions {
|
|
178
|
-
/** Your KugelAudio API key or JWT token */
|
|
392
|
+
/** Your KugelAudio API key or JWT token. Can be prefixed with `eu-`, `us-`, or `global-` to select a region (prefix is stripped before auth). */
|
|
179
393
|
apiKey: string;
|
|
180
394
|
/** Whether apiKey is a master key (for internal/server-side use). Master keys bypass billing. */
|
|
181
395
|
isMasterKey?: boolean;
|
|
@@ -183,12 +397,20 @@ interface KugelAudioOptions {
|
|
|
183
397
|
isToken?: boolean;
|
|
184
398
|
/** Organisation ID to bill usage against (required for token auth to enable usage recording). */
|
|
185
399
|
orgId?: number;
|
|
400
|
+
/** Deployment region. Takes precedence over API-key prefix but not over `apiUrl`. */
|
|
401
|
+
region?: Region;
|
|
186
402
|
/** API base URL (default: https://api.kugelaudio.com) */
|
|
187
403
|
apiUrl?: string;
|
|
188
404
|
/** TTS server URL (default: same as apiUrl) */
|
|
189
405
|
ttsUrl?: string;
|
|
190
406
|
/** Request timeout in milliseconds (default: 60000) */
|
|
191
407
|
timeout?: number;
|
|
408
|
+
/**
|
|
409
|
+
* Interval in milliseconds between WebSocket ping frames sent on the pooled connection
|
|
410
|
+
* to prevent idle timeouts (default: 20000). Set to 0 or null to disable.
|
|
411
|
+
* Pings are sent only when the socket comes from the `ws` package (Node.js); they are skipped in native WebSocket environments such as browsers.
|
|
412
|
+
*/
|
|
413
|
+
keepalivePingInterval?: number | null;
|
|
192
414
|
}
|
|
193
415
|
/**
|
|
194
416
|
* Multi-context session configuration.
|
|
@@ -200,10 +422,21 @@ interface MultiContextConfig {
|
|
|
200
422
|
sampleRate?: number;
|
|
201
423
|
/** CFG scale for generation (default: 2.0) */
|
|
202
424
|
cfgScale?: number;
|
|
425
|
+
/**
|
|
426
|
+
* Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
|
|
427
|
+
* Default: 0.5.
|
|
428
|
+
*/
|
|
429
|
+
temperature?: number;
|
|
203
430
|
/** Maximum tokens to generate (default: 2048) */
|
|
204
431
|
maxNewTokens?: number;
|
|
205
432
|
/** Enable text normalization (default: true) */
|
|
206
433
|
normalize?: boolean;
|
|
434
|
+
/**
|
|
435
|
+
* ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
|
|
436
|
+
* If not set and normalize is true (default), the server auto-detects
|
|
437
|
+
* the language, which adds ~60-150ms to time-to-first-audio.
|
|
438
|
+
*/
|
|
439
|
+
language?: string;
|
|
207
440
|
/** Seconds before context auto-closes (default: 20.0) */
|
|
208
441
|
inactivityTimeout?: number;
|
|
209
442
|
}
|
|
@@ -239,8 +472,6 @@ interface MultiContextCallbacks {
|
|
|
239
472
|
onContextCreated?: (contextId: string) => void;
|
|
240
473
|
/** Called when an audio chunk is received */
|
|
241
474
|
onChunk?: (chunk: MultiContextAudioChunk) => void;
|
|
242
|
-
/** Called when a context finishes generating */
|
|
243
|
-
onContextFinal?: (contextId: string) => void;
|
|
244
475
|
/** Called when a context is closed */
|
|
245
476
|
onContextClosed?: (contextId: string) => void;
|
|
246
477
|
/** Called when a context times out */
|
|
@@ -275,11 +506,51 @@ declare class VoicesResource {
|
|
|
275
506
|
language?: string;
|
|
276
507
|
includePublic?: boolean;
|
|
277
508
|
limit?: number;
|
|
278
|
-
|
|
509
|
+
offset?: number;
|
|
510
|
+
}): Promise<VoiceListResponse>;
|
|
279
511
|
/**
|
|
280
512
|
* Get a specific voice by ID.
|
|
281
513
|
*/
|
|
282
|
-
get(voiceId: number): Promise<
|
|
514
|
+
get(voiceId: number): Promise<VoiceDetail>;
|
|
515
|
+
/**
|
|
516
|
+
* Create a new voice.
|
|
517
|
+
*/
|
|
518
|
+
create(options: CreateVoiceOptions): Promise<VoiceDetail>;
|
|
519
|
+
/**
|
|
520
|
+
* Update an existing voice. Only provided fields are updated.
|
|
521
|
+
*/
|
|
522
|
+
update(voiceId: number, options: UpdateVoiceOptions): Promise<VoiceDetail>;
|
|
523
|
+
/**
|
|
524
|
+
* Delete a voice.
|
|
525
|
+
*/
|
|
526
|
+
delete(voiceId: number): Promise<void>;
|
|
527
|
+
/**
|
|
528
|
+
* List reference audio files for a voice.
|
|
529
|
+
*/
|
|
530
|
+
listReferences(voiceId: number): Promise<VoiceReference[]>;
|
|
531
|
+
/**
|
|
532
|
+
* Upload a reference audio file to a voice.
|
|
533
|
+
*
|
|
534
|
+
* @param voiceId - Voice ID
|
|
535
|
+
* @param file - Audio file (File in browser, Blob in Node.js)
|
|
536
|
+
* @param referenceText - Optional transcript of the reference audio
|
|
537
|
+
*/
|
|
538
|
+
addReference(voiceId: number, file: File | Blob, referenceText?: string): Promise<VoiceReference>;
|
|
539
|
+
/**
|
|
540
|
+
* Delete a reference audio file from a voice.
|
|
541
|
+
*/
|
|
542
|
+
deleteReference(voiceId: number, referenceId: number): Promise<void>;
|
|
543
|
+
/**
|
|
544
|
+
* Request publication of a voice. Sets it as public and marks it
|
|
545
|
+
* as pending verification by an admin.
|
|
546
|
+
*/
|
|
547
|
+
publish(voiceId: number): Promise<VoiceDetail>;
|
|
548
|
+
/**
|
|
549
|
+
* Trigger sample audio generation for a voice.
|
|
550
|
+
*/
|
|
551
|
+
generateSample(voiceId: number): Promise<VoiceDetail>;
|
|
552
|
+
private mapVoiceDetail;
|
|
553
|
+
private mapVoiceReference;
|
|
283
554
|
}
|
|
284
555
|
/**
|
|
285
556
|
* TTS resource for text-to-speech generation.
|
|
@@ -290,6 +561,7 @@ declare class TTSResource {
|
|
|
290
561
|
private wsUrl;
|
|
291
562
|
private pendingRequests;
|
|
292
563
|
private requestCounter;
|
|
564
|
+
private keepaliveTimer;
|
|
293
565
|
constructor(client: KugelAudio);
|
|
294
566
|
/**
|
|
295
567
|
* Pre-establish WebSocket connection for faster first request.
|
|
@@ -318,6 +590,40 @@ declare class TTSResource {
|
|
|
318
590
|
* Returns complete audio after all chunks are received.
|
|
319
591
|
*/
|
|
320
592
|
generate(options: GenerateOptions): Promise<AudioResponse>;
|
|
593
|
+
/**
|
|
594
|
+
* Stream audio and return a Node.js Readable stream of raw PCM16 binary data.
|
|
595
|
+
*
|
|
596
|
+
* **Node.js only** — this method requires the `stream` built-in module and is
|
|
597
|
+
* intended for server-side integrations such as Vapi custom TTS endpoints,
|
|
598
|
+
* Express/Fastify handlers, or any pipeline that expects a Node.js `Readable`.
|
|
599
|
+
*
|
|
600
|
+
* Compared to manually wiring `onChunk` to a `Readable`, this method avoids
|
|
601
|
+
* a common race-condition: the stream object is created and returned **before**
|
|
602
|
+
* any chunks arrive, so the caller can safely pipe or attach listeners before
|
|
603
|
+
* the first audio byte is pushed.
|
|
604
|
+
*
|
|
605
|
+
* @example Vapi custom TTS endpoint
|
|
606
|
+
* ```typescript
|
|
607
|
+
* app.post('/synthesize', (req, res) => {
|
|
608
|
+
* res.setHeader('Content-Type', 'audio/pcm');
|
|
609
|
+
* res.setHeader('Transfer-Encoding', 'chunked');
|
|
610
|
+
*
|
|
611
|
+
* const readable = client.tts.toReadable({
|
|
612
|
+
* text: req.body.message.text,
|
|
613
|
+
* modelId: 'kugel-1-turbo',
|
|
614
|
+
* sampleRate: req.body.message.sampleRate,
|
|
615
|
+
* language: 'en',
|
|
616
|
+
* });
|
|
617
|
+
*
|
|
618
|
+
* readable.pipe(res);
|
|
619
|
+
* });
|
|
620
|
+
* ```
|
|
621
|
+
*
|
|
622
|
+
* @param options - TTS generation options (same as `stream()`)
|
|
623
|
+
* @param reuseConnection - Reuse the pooled WebSocket connection (default: true)
|
|
624
|
+
* @returns Node.js Readable stream emitting raw PCM16 binary Buffer chunks
|
|
625
|
+
*/
|
|
626
|
+
toReadable(options: GenerateOptions, reuseConnection?: boolean): any;
|
|
321
627
|
/**
|
|
322
628
|
* Build the WebSocket URL with appropriate auth param.
|
|
323
629
|
*/
|
|
@@ -348,11 +654,47 @@ declare class TTSResource {
|
|
|
348
654
|
* Stream without connection pooling (original behavior).
|
|
349
655
|
*/
|
|
350
656
|
private streamWithoutPooling;
|
|
657
|
+
/**
|
|
658
|
+
* Start periodic keepalive pings on the pooled connection.
|
|
659
|
+
* Uses the ws package's ping() in Node.js; silently skips in browsers
|
|
660
|
+
* where WebSocket doesn't expose a ping method.
|
|
661
|
+
*/
|
|
662
|
+
private startKeepalive;
|
|
663
|
+
private stopKeepalive;
|
|
351
664
|
/**
|
|
352
665
|
* Close the pooled WebSocket connection.
|
|
353
666
|
*/
|
|
354
667
|
close(): void;
|
|
355
668
|
private parseError;
|
|
669
|
+
/**
|
|
670
|
+
* Create a streaming session for LLM integration.
|
|
671
|
+
*
|
|
672
|
+
* The session connects to `/ws/tts/stream` and keeps a persistent
|
|
673
|
+
* connection across multiple {@link StreamingSession.send} calls.
|
|
674
|
+
* The server auto-chunks text at sentence boundaries — no client-side
|
|
675
|
+
* flushing required.
|
|
676
|
+
*
|
|
677
|
+
* @param config - Session configuration (voice, model, chunking strategy).
|
|
678
|
+
* @param callbacks - Callbacks for audio chunks and session lifecycle events.
|
|
679
|
+
* @returns A {@link StreamingSession} instance. Await `.connect()` before sending.
|
|
680
|
+
*
|
|
681
|
+
* @example
|
|
682
|
+
* ```typescript
|
|
683
|
+
* const session = client.tts.streamingSession(
|
|
684
|
+
* { voiceId: 123, autoMode: true, chunkLengthSchedule: [50, 100, 150, 250] },
|
|
685
|
+
* { onChunk: (chunk) => playAudio(chunk.audio) },
|
|
686
|
+
* );
|
|
687
|
+
*
|
|
688
|
+
* await session.connect();
|
|
689
|
+
*
|
|
690
|
+
* for await (const token of llmStream) {
|
|
691
|
+
* session.send(token);
|
|
692
|
+
* }
|
|
693
|
+
*
|
|
694
|
+
* await session.close();
|
|
695
|
+
* ```
|
|
696
|
+
*/
|
|
697
|
+
streamingSession(config: StreamConfig, callbacks: StreamingSessionCallbacks): StreamingSession;
|
|
356
698
|
/**
|
|
357
699
|
* Create a multi-context session for concurrent TTS streams.
|
|
358
700
|
*
|
|
@@ -371,7 +713,7 @@ declare class TTSResource {
|
|
|
371
713
|
* console.log(`Audio from ${chunk.contextId}`);
|
|
372
714
|
* playAudio(chunk.audio);
|
|
373
715
|
* },
|
|
374
|
-
*
|
|
716
|
+
* onContextClosed: (contextId) => {
|
|
375
717
|
* console.log(`${contextId} finished`);
|
|
376
718
|
* },
|
|
377
719
|
* });
|
|
@@ -408,8 +750,13 @@ declare class MultiContextSession {
|
|
|
408
750
|
get sessionId(): string | null;
|
|
409
751
|
/**
|
|
410
752
|
* Connect to the multi-context WebSocket endpoint.
|
|
753
|
+
*
|
|
754
|
+
* The returned promise resolves once the WebSocket is OPEN so callers can
|
|
755
|
+
* ``await session.connect(callbacks)`` before invoking
|
|
756
|
+
* {@link createContext} / {@link send}. Pre-open errors reject with the
|
|
757
|
+
* typed error.
|
|
411
758
|
*/
|
|
412
|
-
connect(callbacks: MultiContextCallbacks): void
|
|
759
|
+
connect(callbacks: MultiContextCallbacks): Promise<void>;
|
|
413
760
|
/**
|
|
414
761
|
* Create a new context with optional voice settings.
|
|
415
762
|
*/
|
|
@@ -446,6 +793,103 @@ declare class MultiContextSession {
|
|
|
446
793
|
*/
|
|
447
794
|
get isConnected(): boolean;
|
|
448
795
|
}
|
|
796
|
+
/**
|
|
797
|
+
* Streaming session for LLM integration via `/ws/tts/stream`.
|
|
798
|
+
*
|
|
799
|
+
* The server accumulates text across multiple {@link send} calls and
|
|
800
|
+
* auto-chunks it at sentence boundaries, keeping the KV cache warm between
|
|
801
|
+
* chunks for natural prosody. You never need to call `flush` explicitly —
|
|
802
|
+
* configure {@link StreamConfig.chunkLengthSchedule} or
|
|
803
|
+
* {@link StreamConfig.autoMode} instead.
|
|
804
|
+
*
|
|
805
|
+
* @example
|
|
806
|
+
* ```typescript
|
|
807
|
+
* const session = client.tts.streamingSession({
|
|
808
|
+
* voiceId: 123,
|
|
809
|
+
* autoMode: true,
|
|
810
|
+
* chunkLengthSchedule: [50, 100, 150, 250],
|
|
811
|
+
* }, {
|
|
812
|
+
* onChunk: (chunk) => playAudio(chunk.audio),
|
|
813
|
+
* onSessionClosed: (totalSecs) => console.log(`Done: ${totalSecs}s`),
|
|
814
|
+
* });
|
|
815
|
+
*
|
|
816
|
+
* await session.connect();
|
|
817
|
+
*
|
|
818
|
+
* for await (const token of llmStream) {
|
|
819
|
+
* session.send(token);
|
|
820
|
+
* }
|
|
821
|
+
*
|
|
822
|
+
* await session.close();
|
|
823
|
+
* ```
|
|
824
|
+
*/
|
|
825
|
+
declare class StreamingSession {
|
|
826
|
+
private ws;
|
|
827
|
+
private config;
|
|
828
|
+
private callbacks;
|
|
829
|
+
private client;
|
|
830
|
+
private configSent;
|
|
831
|
+
constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks);
|
|
832
|
+
/**
|
|
833
|
+
* Open the WebSocket connection and authenticate.
|
|
834
|
+
*
|
|
835
|
+
* The returned promise resolves once the WebSocket is OPEN, so callers can
|
|
836
|
+
* ``await session.connect()`` and then ``send()`` without racing the
|
|
837
|
+
* handshake. Pre-open errors (network failure, 4001 unauthorized, …) reject
|
|
838
|
+
* the promise with the typed error.
|
|
839
|
+
*/
|
|
840
|
+
connect(): Promise<void>;
|
|
841
|
+
/**
|
|
842
|
+
* Send a text chunk to the server (e.g. one LLM output token).
|
|
843
|
+
*
|
|
844
|
+
* The server buffers text across multiple calls and starts generating at
|
|
845
|
+
* natural sentence boundaries automatically — no need to call `flush`.
|
|
846
|
+
*
|
|
847
|
+
* @param text - Raw text or LLM token to append to the server buffer.
|
|
848
|
+
* @param flush - Force immediate generation of whatever is buffered.
|
|
849
|
+
* **Avoid calling this per-sentence from the client.** Doing so bypasses
|
|
850
|
+
* the server's semantic chunking, incurs a fresh model prefill cost on
|
|
851
|
+
* every flush, and makes latency *worse*, not better. Let the server
|
|
852
|
+
* handle chunking via `chunkLengthSchedule` / `autoMode` instead.
|
|
853
|
+
*/
|
|
854
|
+
send(text: string, flush?: boolean): void;
|
|
855
|
+
/**
|
|
856
|
+
* End the current session but keep the WebSocket connection open.
|
|
857
|
+
*
|
|
858
|
+
* This allows starting a new session on the same connection, avoiding
|
|
859
|
+
* the overhead of a new WebSocket handshake (~200-300ms). After calling
|
|
860
|
+
* this, optionally call {@link updateConfig} to change voice/model settings,
|
|
861
|
+
* then call {@link send} to start the next session.
|
|
862
|
+
*
|
|
863
|
+
* The returned promise resolves once the server confirms with a
|
|
864
|
+
* `session_closed` message, or after a 15 s **quiet** timeout — i.e. 15 s
|
|
865
|
+
* elapse without *any* server message arriving. The timer resets on every
|
|
866
|
+
* incoming frame so a long final flush that streams audio for tens of
|
|
867
|
+
* seconds is not truncated; only a genuinely silent server trips the fuse.
|
|
868
|
+
*/
|
|
869
|
+
endSession(): Promise<void>;
|
|
870
|
+
/**
|
|
871
|
+
* Update session configuration for the next session.
|
|
872
|
+
*
|
|
873
|
+
* Call this after {@link endSession} and before the next {@link send}
|
|
874
|
+
* to change voice, model, language, or other settings.
|
|
875
|
+
*/
|
|
876
|
+
updateConfig(config: Partial<StreamConfig>): void;
|
|
877
|
+
/**
|
|
878
|
+
* Close the session and the WebSocket connection.
|
|
879
|
+
*
|
|
880
|
+
* For session reuse without closing the connection, use
|
|
881
|
+
* {@link endSession} instead.
|
|
882
|
+
*
|
|
883
|
+
* The returned promise resolves once the server confirms the close with a
|
|
884
|
+
* `session_closed` message, or after a 15 s **quiet** timeout (no traffic
|
|
885
|
+
* from the server in that window). Audio frames from the server-side
|
|
886
|
+
* final-flush of the still-buffered text are delivered to your callbacks
|
|
887
|
+
* before this promise resolves, and each frame resets the quiet timer.
|
|
888
|
+
*/
|
|
889
|
+
close(): Promise<void>;
|
|
890
|
+
/** Whether the underlying WebSocket is open. */
|
|
891
|
+
get isConnected(): boolean;
|
|
892
|
+
}
|
|
449
893
|
/**
|
|
450
894
|
* KugelAudio API client.
|
|
451
895
|
*
|
|
@@ -459,13 +903,13 @@ declare class MultiContextSession {
|
|
|
459
903
|
* // List voices
|
|
460
904
|
* const voices = await client.voices.list();
|
|
461
905
|
*
|
|
462
|
-
* // Generate audio with fast model
|
|
906
|
+
* // Generate audio with fast model
|
|
463
907
|
* const audio = await client.tts.generate({
|
|
464
908
|
* text: 'Hello, world!',
|
|
465
909
|
* modelId: 'kugel-1-turbo',
|
|
466
910
|
* });
|
|
467
911
|
*
|
|
468
|
-
* // Generate audio with premium model
|
|
912
|
+
* // Generate audio with premium model
|
|
469
913
|
* const audio = await client.tts.generate({
|
|
470
914
|
* text: 'Hello, world!',
|
|
471
915
|
* modelId: 'kugel-1',
|
|
@@ -480,6 +924,7 @@ declare class KugelAudio {
|
|
|
480
924
|
private _apiUrl;
|
|
481
925
|
private _ttsUrl;
|
|
482
926
|
private _timeout;
|
|
927
|
+
private _keepalivePingInterval;
|
|
483
928
|
/** Models resource */
|
|
484
929
|
readonly models: ModelsResource;
|
|
485
930
|
/** Voices resource */
|
|
@@ -514,6 +959,8 @@ declare class KugelAudio {
|
|
|
514
959
|
get orgId(): number | undefined;
|
|
515
960
|
/** Get TTS URL */
|
|
516
961
|
get ttsUrl(): string;
|
|
962
|
+
/** Get keepalive ping interval in milliseconds, or null if disabled. */
|
|
963
|
+
get keepalivePingInterval(): number | null;
|
|
517
964
|
/**
|
|
518
965
|
* Close the client and release resources.
|
|
519
966
|
* This closes any pooled WebSocket connections.
|
|
@@ -546,48 +993,125 @@ declare class KugelAudio {
|
|
|
546
993
|
* @internal
|
|
547
994
|
*/
|
|
548
995
|
request<T>(method: string, path: string, body?: unknown): Promise<T>;
|
|
996
|
+
/**
|
|
997
|
+
* Make a multipart/form-data request (for file uploads).
|
|
998
|
+
* @internal Used by VoicesResource for reference file uploads.
|
|
999
|
+
*/
|
|
1000
|
+
requestMultipart<T>(method: string, path: string, formData: FormData): Promise<T>;
|
|
549
1001
|
}
|
|
550
1002
|
|
|
551
1003
|
/**
|
|
552
1004
|
* Custom errors for KugelAudio SDK.
|
|
1005
|
+
*
|
|
1006
|
+
* All SDK errors inherit from {@link KugelAudioError}. Specific subclasses
|
|
1007
|
+
* map to the server's `error_code` field (see the server-side `ErrorCode`
|
|
1008
|
+
* enum at `tts/src/serving/deployments/errors.py`) so callers can
|
|
1009
|
+
* `instanceof AuthenticationError` without matching on message text.
|
|
553
1010
|
*/
|
|
1011
|
+
declare const ErrorCodes: {
|
|
1012
|
+
readonly UNAUTHORIZED: "UNAUTHORIZED";
|
|
1013
|
+
readonly RATE_LIMITED: "RATE_LIMITED";
|
|
1014
|
+
readonly INSUFFICIENT_CREDITS: "INSUFFICIENT_CREDITS";
|
|
1015
|
+
readonly MODEL_UNAVAILABLE: "MODEL_UNAVAILABLE";
|
|
1016
|
+
readonly EMPTY_AUDIO: "EMPTY_AUDIO";
|
|
1017
|
+
readonly VALIDATION: "VALIDATION_ERROR";
|
|
1018
|
+
readonly INTERNAL: "INTERNAL_ERROR";
|
|
1019
|
+
readonly NOT_FOUND: "NOT_FOUND";
|
|
1020
|
+
};
|
|
1021
|
+
type ErrorCode = typeof ErrorCodes[keyof typeof ErrorCodes];
|
|
1022
|
+
declare const WsCloseCodes: {
|
|
1023
|
+
readonly UNAUTHORIZED: 4001;
|
|
1024
|
+
readonly INSUFFICIENT_CREDITS: 4003;
|
|
1025
|
+
readonly RATE_LIMITED: 4029;
|
|
1026
|
+
readonly MODEL_UNAVAILABLE: 4500;
|
|
1027
|
+
};
|
|
1028
|
+
interface KugelAudioErrorOptions {
|
|
1029
|
+
statusCode?: number;
|
|
1030
|
+
errorCode?: string;
|
|
1031
|
+
requestId?: string;
|
|
1032
|
+
retryAfter?: number;
|
|
1033
|
+
cause?: unknown;
|
|
1034
|
+
}
|
|
554
1035
|
/**
|
|
555
1036
|
* Base error class for KugelAudio SDK.
|
|
556
1037
|
*/
|
|
557
1038
|
declare class KugelAudioError extends Error {
|
|
558
1039
|
readonly statusCode?: number;
|
|
559
|
-
|
|
1040
|
+
readonly errorCode?: string;
|
|
1041
|
+
readonly requestId?: string;
|
|
1042
|
+
readonly retryAfter?: number;
|
|
1043
|
+
constructor(message: string, options?: KugelAudioErrorOptions);
|
|
560
1044
|
}
|
|
561
1045
|
/**
|
|
562
|
-
*
|
|
1046
|
+
* API key was missing, malformed, or rejected by the server.
|
|
563
1047
|
*/
|
|
564
1048
|
declare class AuthenticationError extends KugelAudioError {
|
|
565
|
-
constructor(message?: string);
|
|
1049
|
+
constructor(message?: string, options?: KugelAudioErrorOptions);
|
|
566
1050
|
}
|
|
567
1051
|
/**
|
|
568
|
-
*
|
|
1052
|
+
* Request was rejected by the per-org rate limiter.
|
|
569
1053
|
*/
|
|
570
1054
|
declare class RateLimitError extends KugelAudioError {
|
|
571
|
-
constructor(message?: string);
|
|
1055
|
+
constructor(message?: string, options?: KugelAudioErrorOptions);
|
|
572
1056
|
}
|
|
573
1057
|
/**
|
|
574
|
-
*
|
|
1058
|
+
* Account is out of TTS credits.
|
|
575
1059
|
*/
|
|
576
1060
|
declare class InsufficientCreditsError extends KugelAudioError {
|
|
577
|
-
constructor(message?: string);
|
|
1061
|
+
constructor(message?: string, options?: KugelAudioErrorOptions);
|
|
578
1062
|
}
|
|
579
1063
|
/**
|
|
580
|
-
*
|
|
1064
|
+
* Request was rejected as invalid (bad params, missing fields, etc.).
|
|
581
1065
|
*/
|
|
582
1066
|
declare class ValidationError extends KugelAudioError {
|
|
583
|
-
constructor(message: string);
|
|
1067
|
+
constructor(message: string, options?: KugelAudioErrorOptions);
|
|
584
1068
|
}
|
|
585
1069
|
/**
|
|
586
|
-
*
|
|
1070
|
+
* The SDK could not reach KugelAudio (network error, server down,
|
|
1071
|
+
* or model deployment temporarily unavailable).
|
|
587
1072
|
*/
|
|
588
1073
|
declare class ConnectionError extends KugelAudioError {
|
|
589
|
-
constructor(message?: string);
|
|
1074
|
+
constructor(message: string, options?: KugelAudioErrorOptions);
|
|
1075
|
+
}
|
|
1076
|
+
interface HttpResponseLike {
|
|
1077
|
+
status: number;
|
|
1078
|
+
headers: {
|
|
1079
|
+
get(name: string): string | null;
|
|
1080
|
+
} | Record<string, string | undefined>;
|
|
1081
|
+
text?: () => Promise<string>;
|
|
590
1082
|
}
|
|
1083
|
+
/**
|
|
1084
|
+
* Build the appropriate `KugelAudioError` from an HTTP response body that
|
|
1085
|
+
* was already parsed. `bodyText` is the raw text fallback.
|
|
1086
|
+
*/
|
|
1087
|
+
declare function classifyHttpError(status: number, bodyText: string, headers: HttpResponseLike['headers']): KugelAudioError;
|
|
1088
|
+
/**
|
|
1089
|
+
* Build a `KugelAudioError` from a server-sent WebSocket error frame
|
|
1090
|
+
* (`{error, error_code, retry_after}`).
|
|
1091
|
+
*/
|
|
1092
|
+
declare function classifyWsFrame(data: {
|
|
1093
|
+
error?: string;
|
|
1094
|
+
error_code?: string;
|
|
1095
|
+
retry_after?: number;
|
|
1096
|
+
}): KugelAudioError;
|
|
1097
|
+
/**
|
|
1098
|
+
* Build a `KugelAudioError` from a WebSocket close code + reason.
|
|
1099
|
+
*/
|
|
1100
|
+
declare function classifyWsClose(code: number | undefined, reason?: string): KugelAudioError;
|
|
1101
|
+
/**
|
|
1102
|
+
* Extract the HTTP status from a `ws` package handshake-rejection error and
|
|
1103
|
+
* return a typed `KugelAudioError`. Returns `null` if the error doesn't look
|
|
1104
|
+
* like a handshake rejection (e.g. pure network failure).
|
|
1105
|
+
*
|
|
1106
|
+
* The `ws` library surfaces rejected upgrades via:
|
|
1107
|
+
* - an Error whose `.message` is `"Unexpected server response: <status>"`
|
|
1108
|
+
* - `error.code === 'EUNEXPECTEDRESPONSE'`, with `error.statusCode` on some versions
|
|
1109
|
+
*
|
|
1110
|
+
* The TTS server rejects WS upgrades with a bare API key using HTTP 403
|
|
1111
|
+
* (not 401), so we treat 403 here as an auth failure — HTTP API callers
|
|
1112
|
+
* keep the generic 403 semantics via {@link classifyHttpError}.
|
|
1113
|
+
*/
|
|
1114
|
+
declare function classifyWsHandshakeError(err: unknown): KugelAudioError | null;
|
|
591
1115
|
|
|
592
1116
|
/**
|
|
593
1117
|
* Utility functions for KugelAudio SDK.
|
|
@@ -609,4 +1133,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
|
|
|
609
1133
|
*/
|
|
610
1134
|
declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
|
|
611
1135
|
|
|
612
|
-
export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
|
|
1136
|
+
export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type CreateVoiceOptions, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type Region, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16 };
|