kugelaudio 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -13
- package/dist/index.d.mts +518 -26
- package/dist/index.d.ts +518 -26
- package/dist/index.js +864 -112
- package/dist/index.mjs +858 -112
- package/package.json +9 -8
- package/src/client.test.ts +548 -0
- package/src/client.ts +885 -103
- package/src/errors.ts +266 -18
- package/src/index.ts +17 -2
- package/src/types.ts +215 -8
- package/src/websocket.ts +38 -18
package/src/types.ts
CHANGED
|
@@ -47,6 +47,87 @@ export interface Voice {
|
|
|
47
47
|
verified: boolean;
|
|
48
48
|
}
|
|
49
49
|
|
|
50
|
+
/**
|
|
51
|
+
* Paginated response from the voices list endpoint.
|
|
52
|
+
*/
|
|
53
|
+
export interface VoiceListResponse {
|
|
54
|
+
voices: Voice[];
|
|
55
|
+
total: number;
|
|
56
|
+
limit: number;
|
|
57
|
+
offset: number;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/**
|
|
61
|
+
* Voice quality levels.
|
|
62
|
+
*/
|
|
63
|
+
export type VoiceQuality = 'low' | 'mid' | 'high';
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Extended voice information returned by voice management endpoints.
|
|
67
|
+
*/
|
|
68
|
+
export interface VoiceDetail {
|
|
69
|
+
id: number;
|
|
70
|
+
name: string;
|
|
71
|
+
description: string;
|
|
72
|
+
generativeVoiceDescription: string;
|
|
73
|
+
supportedLanguages: string[];
|
|
74
|
+
category: string;
|
|
75
|
+
age?: string;
|
|
76
|
+
sex?: string;
|
|
77
|
+
quality: string;
|
|
78
|
+
isPublic: boolean;
|
|
79
|
+
verified: boolean;
|
|
80
|
+
pendingVerification: boolean;
|
|
81
|
+
sampleUrl?: string;
|
|
82
|
+
avatarUrl?: string;
|
|
83
|
+
sampleText: string;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/**
|
|
87
|
+
* Voice reference audio metadata.
|
|
88
|
+
*/
|
|
89
|
+
export interface VoiceReference {
|
|
90
|
+
id: number;
|
|
91
|
+
voiceId: number;
|
|
92
|
+
name: string;
|
|
93
|
+
referenceText: string;
|
|
94
|
+
s3Path: string;
|
|
95
|
+
audioUrl?: string;
|
|
96
|
+
isGenerated: boolean;
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* Options for creating a new voice.
|
|
101
|
+
*/
|
|
102
|
+
export interface CreateVoiceOptions {
|
|
103
|
+
name: string;
|
|
104
|
+
sex: string;
|
|
105
|
+
description?: string;
|
|
106
|
+
category?: string;
|
|
107
|
+
age?: string;
|
|
108
|
+
quality?: string;
|
|
109
|
+
supportedLanguages?: string[];
|
|
110
|
+
isPublic?: boolean;
|
|
111
|
+
sampleText?: string;
|
|
112
|
+
/** Reference audio files (File objects in the browser, Blob in Node.js; wrap a Buffer as `new Blob([buffer])`) */
|
|
113
|
+
referenceFiles?: Array<File | Blob>;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/**
|
|
117
|
+
* Options for updating an existing voice.
|
|
118
|
+
*/
|
|
119
|
+
export interface UpdateVoiceOptions {
|
|
120
|
+
name?: string;
|
|
121
|
+
description?: string;
|
|
122
|
+
category?: string;
|
|
123
|
+
age?: string;
|
|
124
|
+
sex?: string;
|
|
125
|
+
quality?: string;
|
|
126
|
+
supportedLanguages?: string[];
|
|
127
|
+
isPublic?: boolean;
|
|
128
|
+
sampleText?: string;
|
|
129
|
+
}
|
|
130
|
+
|
|
50
131
|
/**
|
|
51
132
|
* Word-level timestamp from server-side forced alignment.
|
|
52
133
|
*/
|
|
@@ -71,12 +152,20 @@ export interface WordTimestamp {
|
|
|
71
152
|
export interface GenerateOptions {
|
|
72
153
|
/** Text to synthesize */
|
|
73
154
|
text: string;
|
|
74
|
-
/** Model to use: 'kugel-1-turbo' (
|
|
155
|
+
/** Model to use: 'kugel-1-turbo' (fast) or 'kugel-1' (premium). Default: 'kugel-1-turbo' */
|
|
75
156
|
modelId?: string;
|
|
76
157
|
/** Voice ID to use */
|
|
77
158
|
voiceId?: number;
|
|
78
159
|
/** CFG scale for generation (default: 2.0) */
|
|
79
160
|
cfgScale?: number;
|
|
161
|
+
/**
|
|
162
|
+
* Sampling variance. Range [0.0, 1.0]. 0 = most stable (near-greedy),
|
|
163
|
+
* 1 = most variance. Default: 0.5.
|
|
164
|
+
*
|
|
165
|
+
* Lower values produce more consistent reads across regenerations —
|
|
166
|
+
* useful for stable voiceovers, IVR prompts, and e-learning.
|
|
167
|
+
*/
|
|
168
|
+
temperature?: number;
|
|
80
169
|
/** Maximum tokens to generate (default: 2048) */
|
|
81
170
|
maxNewTokens?: number;
|
|
82
171
|
/** Output sample rate (default: 24000) */
|
|
@@ -96,7 +185,8 @@ export interface GenerateOptions {
|
|
|
96
185
|
* (adds ~150ms latency).
|
|
97
186
|
*
|
|
98
187
|
* Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
|
|
99
|
-
* el, uk, bg, tr, vi, ar, hi, zh, ja, ko
|
|
188
|
+
* el, uk, bg, tr, vi, ar, hi, zh, ja, ko, sk, sl, hr, sr, ru,
|
|
189
|
+
* he, fa, ur, bn, ta, yue, th, id, ms
|
|
100
190
|
*/
|
|
101
191
|
language?: string;
|
|
102
192
|
/**
|
|
@@ -105,16 +195,52 @@ export interface GenerateOptions {
|
|
|
105
195
|
* Default: false
|
|
106
196
|
*/
|
|
107
197
|
wordTimestamps?: boolean;
|
|
198
|
+
/**
|
|
199
|
+
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
200
|
+
*
|
|
201
|
+
* Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
|
|
202
|
+
* can also be used for per-segment speed control.
|
|
203
|
+
* Range: [0.8, 1.2]. Default: 1.0.
|
|
204
|
+
*/
|
|
205
|
+
speed?: number;
|
|
206
|
+
/**
|
|
207
|
+
* Optional project ID for project-scoped features (custom dictionary
|
|
208
|
+
* replacements, per-project rate limits). The caller MUST verify the
|
|
209
|
+
* authenticated user has access to this project before passing it; the
|
|
210
|
+
* server treats the value as trusted once received.
|
|
211
|
+
*/
|
|
212
|
+
projectId?: number;
|
|
108
213
|
}
|
|
109
214
|
|
|
110
215
|
/**
|
|
111
|
-
* Streaming session configuration
|
|
216
|
+
* Streaming session configuration for `/ws/tts/stream`.
|
|
217
|
+
*
|
|
218
|
+
* The server accumulates LLM tokens internally and starts generation at natural
|
|
219
|
+
* sentence boundaries. Use {@link chunkLengthSchedule} to tune how eagerly the
|
|
220
|
+
* server begins generating, or set {@link autoMode} to start at the very first
|
|
221
|
+
* clean boundary — equivalent to ElevenLabs' `auto_mode=true`.
|
|
222
|
+
*
|
|
223
|
+
* @example Low-latency preset
|
|
224
|
+
* ```typescript
|
|
225
|
+
* const session = client.tts.streamingSession({
|
|
226
|
+
* voiceId: 123,
|
|
227
|
+
* autoMode: true,
|
|
228
|
+
* chunkLengthSchedule: [50, 100, 150, 250],
|
|
229
|
+
* });
|
|
230
|
+
* ```
|
|
112
231
|
*/
|
|
113
232
|
export interface StreamConfig {
|
|
114
233
|
/** Voice ID to use */
|
|
115
234
|
voiceId?: number;
|
|
235
|
+
/** Model ID ('kugel-1-turbo' or 'kugel-1'). Default: 'kugel-1-turbo' */
|
|
236
|
+
modelId?: string;
|
|
116
237
|
/** CFG scale for generation */
|
|
117
238
|
cfgScale?: number;
|
|
239
|
+
/**
|
|
240
|
+
* Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
|
|
241
|
+
* Default: 0.5.
|
|
242
|
+
*/
|
|
243
|
+
temperature?: number;
|
|
118
244
|
/** Maximum tokens per generation */
|
|
119
245
|
maxNewTokens?: number;
|
|
120
246
|
/** Output sample rate */
|
|
@@ -138,6 +264,64 @@ export interface StreamConfig {
|
|
|
138
264
|
* Default: false
|
|
139
265
|
*/
|
|
140
266
|
wordTimestamps?: boolean;
|
|
267
|
+
/**
|
|
268
|
+
* Minimum buffer sizes (in characters) the server must accumulate before
|
|
269
|
+
* auto-emitting each successive chunk. Entry `i` applies to chunk `i`; the
|
|
270
|
+
* last value is reused for all subsequent chunks.
|
|
271
|
+
*
|
|
272
|
+
* Smaller values produce lower TTFA at the cost of less prosody context.
|
|
273
|
+
* Larger values improve naturalness but increase TTFA.
|
|
274
|
+
*
|
|
275
|
+
* @example
|
|
276
|
+
* ```typescript
|
|
277
|
+
* chunkLengthSchedule: [50, 100, 150, 250] // low-latency
|
|
278
|
+
* chunkLengthSchedule: [120, 200, 300] // high-quality prosody
|
|
279
|
+
* ```
|
|
280
|
+
*/
|
|
281
|
+
chunkLengthSchedule?: number[];
|
|
282
|
+
/**
|
|
283
|
+
* When `true`, the server starts generating audio at the very first clean
|
|
284
|
+
* sentence boundary, regardless of `chunkLengthSchedule`. Equivalent to
|
|
285
|
+
* ElevenLabs' `auto_mode=true`. Prioritises low TTFA; may produce slightly
|
|
286
|
+
* less natural prosody on the first chunk.
|
|
287
|
+
*/
|
|
288
|
+
autoMode?: boolean;
|
|
289
|
+
/**
|
|
290
|
+
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
291
|
+
*
|
|
292
|
+
* Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
|
|
293
|
+
* can also be used for per-segment speed control.
|
|
294
|
+
* Range: [0.8, 1.2]. Default: 1.0.
|
|
295
|
+
*/
|
|
296
|
+
speed?: number;
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
/**
|
|
300
|
+
* Event callbacks for a streaming session (`/ws/tts/stream`).
|
|
301
|
+
*
|
|
302
|
+
* This is the LLM-integration endpoint: forward raw tokens via
|
|
303
|
+
* {@link StreamingSession.send} and the server auto-chunks them at sentence
|
|
304
|
+
* boundaries.
|
|
305
|
+
*/
|
|
306
|
+
export interface StreamingSessionCallbacks {
|
|
307
|
+
/** Called when an audio chunk arrives for any segment. */
|
|
308
|
+
onChunk?: (chunk: AudioChunk) => void;
|
|
309
|
+
/**
|
|
310
|
+
* Called when all audio for one flushed text segment is complete.
|
|
311
|
+
* Carries the segment index, total audio duration, and generation time.
|
|
312
|
+
*/
|
|
313
|
+
onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
|
|
314
|
+
/**
|
|
315
|
+
* Called when the session is fully closed (after `session.close()`).
|
|
316
|
+
* Equivalent to `onFinal` on the one-shot endpoint.
|
|
317
|
+
*/
|
|
318
|
+
onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
|
|
319
|
+
/** Called when the server begins generating audio for a text segment. */
|
|
320
|
+
onGenerationStarted?: (chunkId: number, text: string) => void;
|
|
321
|
+
/** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
|
|
322
|
+
onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
|
|
323
|
+
/** Called on any error. */
|
|
324
|
+
onError?: (error: Error) => void;
|
|
141
325
|
}
|
|
142
326
|
|
|
143
327
|
/**
|
|
@@ -170,8 +354,6 @@ export interface GenerationStats {
|
|
|
170
354
|
durationMs: number;
|
|
171
355
|
/** Generation time in milliseconds */
|
|
172
356
|
generationMs: number;
|
|
173
|
-
/** Time to first audio in milliseconds */
|
|
174
|
-
ttfaMs: number | null;
|
|
175
357
|
/** Real-time factor */
|
|
176
358
|
rtf: number;
|
|
177
359
|
/** Error message if any */
|
|
@@ -216,11 +398,19 @@ export interface StreamCallbacks {
|
|
|
216
398
|
onClose?: () => void;
|
|
217
399
|
}
|
|
218
400
|
|
|
401
|
+
/**
|
|
402
|
+
* Deployment region. Controls which API endpoint the SDK connects to.
|
|
403
|
+
* - `'eu'` — `api.kugelaudio.com` (default)
|
|
404
|
+
* - `'us'` — `us-api.kugelaudio.com`
|
|
405
|
+
* - `'global'` — `global-api.kugelaudio.com` (geo-routed)
|
|
406
|
+
*/
|
|
407
|
+
export type Region = 'eu' | 'us' | 'global';
|
|
408
|
+
|
|
219
409
|
/**
|
|
220
410
|
* KugelAudio client options.
|
|
221
411
|
*/
|
|
222
412
|
export interface KugelAudioOptions {
|
|
223
|
-
/** Your KugelAudio API key or JWT token */
|
|
413
|
+
/** Your KugelAudio API key or JWT token. Can be prefixed with `eu-`, `us-`, or `global-` to select a region (prefix is stripped before auth). */
|
|
224
414
|
apiKey: string;
|
|
225
415
|
/** Whether apiKey is a master key (for internal/server-side use). Master keys bypass billing. */
|
|
226
416
|
isMasterKey?: boolean;
|
|
@@ -228,12 +418,20 @@ export interface KugelAudioOptions {
|
|
|
228
418
|
isToken?: boolean;
|
|
229
419
|
/** Organisation ID to bill usage against (required for token auth to enable usage recording). */
|
|
230
420
|
orgId?: number;
|
|
421
|
+
/** Deployment region. Takes precedence over API-key prefix but not over `apiUrl`. */
|
|
422
|
+
region?: Region;
|
|
231
423
|
/** API base URL (default: https://api.kugelaudio.com) */
|
|
232
424
|
apiUrl?: string;
|
|
233
425
|
/** TTS server URL (default: same as apiUrl) */
|
|
234
426
|
ttsUrl?: string;
|
|
235
427
|
/** Request timeout in milliseconds (default: 60000) */
|
|
236
428
|
timeout?: number;
|
|
429
|
+
/**
|
|
430
|
+
* Interval in milliseconds between WebSocket ping frames sent on the pooled connection
|
|
431
|
+
* to prevent idle timeouts (default: 20000). Set to 0 or null to disable.
|
|
432
|
+
* Pings are sent only when the `ws` package is in use; they are skipped in native WebSocket environments such as browsers.
|
|
433
|
+
*/
|
|
434
|
+
keepalivePingInterval?: number | null;
|
|
237
435
|
}
|
|
238
436
|
|
|
239
437
|
/**
|
|
@@ -255,10 +453,21 @@ export interface MultiContextConfig {
|
|
|
255
453
|
sampleRate?: number;
|
|
256
454
|
/** CFG scale for generation (default: 2.0) */
|
|
257
455
|
cfgScale?: number;
|
|
456
|
+
/**
|
|
457
|
+
* Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
|
|
458
|
+
* Default: 0.5.
|
|
459
|
+
*/
|
|
460
|
+
temperature?: number;
|
|
258
461
|
/** Maximum tokens to generate (default: 2048) */
|
|
259
462
|
maxNewTokens?: number;
|
|
260
463
|
/** Enable text normalization (default: true) */
|
|
261
464
|
normalize?: boolean;
|
|
465
|
+
/**
|
|
466
|
+
* ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
|
|
467
|
+
* If not set and normalize is true (default), the server auto-detects
|
|
468
|
+
* the language, which adds ~60-150ms to time-to-first-audio.
|
|
469
|
+
*/
|
|
470
|
+
language?: string;
|
|
262
471
|
/** Seconds before context auto-closes (default: 20.0) */
|
|
263
472
|
inactivityTimeout?: number;
|
|
264
473
|
}
|
|
@@ -297,8 +506,6 @@ export interface MultiContextCallbacks {
|
|
|
297
506
|
onContextCreated?: (contextId: string) => void;
|
|
298
507
|
/** Called when an audio chunk is received */
|
|
299
508
|
onChunk?: (chunk: MultiContextAudioChunk) => void;
|
|
300
|
-
/** Called when a context finishes generating */
|
|
301
|
-
onContextFinal?: (contextId: string) => void;
|
|
302
509
|
/** Called when a context is closed */
|
|
303
510
|
onContextClosed?: (contextId: string) => void;
|
|
304
511
|
/** Called when a context times out */
|
package/src/websocket.ts
CHANGED
|
@@ -8,36 +8,56 @@
|
|
|
8
8
|
|
|
9
9
|
let _cachedWs: typeof WebSocket | null = null;
|
|
10
10
|
|
|
11
|
+
/**
|
|
12
|
+
* Detect whether we are running in Node.js (vs. browser / edge / Deno).
|
|
13
|
+
* We prefer the `ws` package in Node because Node's built-in WebSocket
|
|
14
|
+
* (added in Node 22) surfaces a useless opaque message on handshake
|
|
15
|
+
* failures ("Received network error or non-101 status code"), whereas
|
|
16
|
+
* `ws` exposes the rejected HTTP status in the error, which the error
|
|
17
|
+
* classifier uses to raise `AuthenticationError` / `RateLimitError` etc.
|
|
18
|
+
*/
|
|
19
|
+
function isNodeJs(): boolean {
|
|
20
|
+
return (
|
|
21
|
+
typeof process !== 'undefined' &&
|
|
22
|
+
!!process.versions &&
|
|
23
|
+
typeof process.versions.node === 'string'
|
|
24
|
+
);
|
|
25
|
+
}
|
|
26
|
+
|
|
11
27
|
/**
|
|
12
28
|
* Get the WebSocket constructor for the current environment.
|
|
13
|
-
*
|
|
29
|
+
* Prefers the `ws` package in Node.js (for richer handshake errors),
|
|
30
|
+
* falls back to the native `globalThis.WebSocket` elsewhere.
|
|
14
31
|
* Result is cached after first call.
|
|
15
32
|
*/
|
|
16
33
|
export function getWebSocket(): typeof WebSocket {
|
|
17
34
|
if (_cachedWs) return _cachedWs;
|
|
18
35
|
|
|
19
|
-
//
|
|
36
|
+
// Node.js — prefer the `ws` package so handshake rejections carry the
|
|
37
|
+
// HTTP status code (see isNodeJs doc above).
|
|
38
|
+
if (isNodeJs()) {
|
|
39
|
+
try {
|
|
40
|
+
// Use Function constructor to hide require from static analysis by bundlers
|
|
41
|
+
// eslint-disable-next-line no-new-func
|
|
42
|
+
const _require = typeof require !== 'undefined'
|
|
43
|
+
? require
|
|
44
|
+
: Function('return typeof require !== "undefined" ? require : undefined')();
|
|
45
|
+
if (_require) {
|
|
46
|
+
const ws = _require('ws');
|
|
47
|
+
_cachedWs = ws.default || ws;
|
|
48
|
+
return _cachedWs!;
|
|
49
|
+
}
|
|
50
|
+
} catch {
|
|
51
|
+
// Fall through to native if the `ws` package isn't installed.
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
// Browser / edge / Deno environment — use native WebSocket.
|
|
20
56
|
if (typeof globalThis !== 'undefined' && typeof (globalThis as any).WebSocket !== 'undefined') {
|
|
21
57
|
_cachedWs = (globalThis as any).WebSocket;
|
|
22
58
|
return _cachedWs!;
|
|
23
59
|
}
|
|
24
60
|
|
|
25
|
-
// Node.js environment - use ws package via dynamic require
|
|
26
|
-
try {
|
|
27
|
-
// Use Function constructor to hide require from static analysis by bundlers
|
|
28
|
-
// eslint-disable-next-line no-new-func
|
|
29
|
-
const _require = typeof require !== 'undefined'
|
|
30
|
-
? require
|
|
31
|
-
: Function('return typeof require !== "undefined" ? require : undefined')();
|
|
32
|
-
if (_require) {
|
|
33
|
-
const ws = _require('ws');
|
|
34
|
-
_cachedWs = ws.default || ws;
|
|
35
|
-
return _cachedWs!;
|
|
36
|
-
}
|
|
37
|
-
} catch {
|
|
38
|
-
// Fall through to error
|
|
39
|
-
}
|
|
40
|
-
|
|
41
61
|
throw new Error(
|
|
42
62
|
'WebSocket not available. In Node.js, install the "ws" package: npm install ws'
|
|
43
63
|
);
|