kugelaudio 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/types.ts CHANGED
@@ -47,6 +47,87 @@ export interface Voice {
47
47
  verified: boolean;
48
48
  }
49
49
 
50
/**
 * Paginated response from the voices list endpoint.
 */
export interface VoiceListResponse {
  /** Voices contained in this page of results. */
  voices: Voice[];
  /** Count reported by the server — presumably all matching voices across pages; confirm against the API. */
  total: number;
  /** Page-size value echoed in the response — presumably the maximum number of voices per page; confirm. */
  limit: number;
  /** Offset value echoed in the response — presumably the zero-based index of the first voice in `voices`; confirm. */
  offset: number;
}
59
+
60
/**
 * Voice quality levels.
 *
 * A closed set of string literals: `'low'`, `'mid'` or `'high'`.
 */
export type VoiceQuality = 'low' | 'mid' | 'high';
64
+
65
/**
 * Extended voice information returned by voice management endpoints.
 *
 * `age`, `sex`, `sampleUrl` and `avatarUrl` are optional and may be absent;
 * every other member is always present.
 */
export interface VoiceDetail {
  /** Numeric voice identifier. */
  id: number;
  /** Voice name. */
  name: string;
  /** Free-text description of the voice. */
  description: string;
  /** Description text associated with generative voices — NOTE(review): exact semantics are not visible here; confirm. */
  generativeVoiceDescription: string;
  /** Language codes the voice supports. */
  supportedLanguages: string[];
  /** Voice category label. */
  category: string;
  /** Age label, when known. */
  age?: string;
  /** Sex label, when known. */
  sex?: string;
  /**
   * Quality level.
   * NOTE(review): typed as plain `string` although `VoiceQuality` enumerates
   * 'low' | 'mid' | 'high' — confirm whether other values can occur before narrowing.
   */
  quality: string;
  /** Whether the voice is public. */
  isPublic: boolean;
  /** Whether the voice has been verified. */
  verified: boolean;
  /** Whether a verification is still pending for this voice. */
  pendingVerification: boolean;
  /** URL of an audio sample, when available. */
  sampleUrl?: string;
  /** URL of an avatar image, when available. */
  avatarUrl?: string;
  /** Sample text associated with the voice. */
  sampleText: string;
}
85
+
86
/**
 * Voice reference audio metadata.
 */
export interface VoiceReference {
  /** Numeric identifier of this reference entry. */
  id: number;
  /** Identifier of the voice this reference belongs to. */
  voiceId: number;
  /** Name of the reference entry. */
  name: string;
  /** Text associated with the reference audio — presumably its transcript; confirm. */
  referenceText: string;
  /** Storage path of the audio — presumably an S3 object key; confirm. */
  s3Path: string;
  /** URL of the audio, when available. */
  audioUrl?: string;
  /** Whether the reference is flagged as generated — presumably synthesized rather than uploaded; confirm. */
  isGenerated: boolean;
}
98
+
99
/**
 * Options for creating a new voice.
 *
 * Only `name` and `sex` are required; every other member is optional.
 */
export interface CreateVoiceOptions {
  /** Voice name (required). */
  name: string;
  /** Sex label for the voice (required). */
  sex: string;
  /** Free-text description of the voice. */
  description?: string;
  /** Voice category label. */
  category?: string;
  /** Age label. */
  age?: string;
  /** Quality level, passed as a plain string. */
  quality?: string;
  /** Language codes the voice should support. */
  supportedLanguages?: string[];
  /** Whether the voice should be public. */
  isPublic?: boolean;
  /** Sample text to associate with the voice. */
  sampleText?: string;
  /**
   * Reference audio files: `File` or `Blob` objects.
   *
   * A Node.js `Buffer` is not assignable to `File | Blob`; wrap it first,
   * e.g. `new Blob([buffer])`.
   */
  referenceFiles?: Array<File | Blob>;
}
115
+
116
/**
 * Options for updating an existing voice.
 *
 * Every member is optional, so an update may carry any subset of fields.
 * NOTE(review): omitted fields are presumably left unchanged by the server — confirm.
 */
export interface UpdateVoiceOptions {
  /** New voice name. */
  name?: string;
  /** New free-text description. */
  description?: string;
  /** New category label. */
  category?: string;
  /** New age label. */
  age?: string;
  /** New sex label. */
  sex?: string;
  /** New quality level, passed as a plain string. */
  quality?: string;
  /** New list of supported language codes. */
  supportedLanguages?: string[];
  /** New public/private flag. */
  isPublic?: boolean;
  /** New sample text. */
  sampleText?: string;
}
130
+
50
131
  /**
51
132
  * Word-level timestamp from server-side forced alignment.
52
133
  */
@@ -71,12 +152,20 @@ export interface WordTimestamp {
71
152
  export interface GenerateOptions {
72
153
  /** Text to synthesize */
73
154
  text: string;
74
- /** Model to use: 'kugel-1-turbo' (1.5B, fast) or 'kugel-1' (7B, premium). Default: 'kugel-1-turbo' */
155
+ /** Model to use: 'kugel-1-turbo' (fast) or 'kugel-1' (premium). Default: 'kugel-1-turbo' */
75
156
  modelId?: string;
76
157
  /** Voice ID to use */
77
158
  voiceId?: number;
78
159
  /** CFG scale for generation (default: 2.0) */
79
160
  cfgScale?: number;
161
+ /**
162
+ * Sampling variance. Range [0.0, 1.0]. 0 = most stable (near-greedy),
163
+ * 1 = most variance. Default: 0.5.
164
+ *
165
+ * Lower values produce more consistent reads across regenerations —
166
+ * useful for stable voiceovers, IVR prompts, and e-learning.
167
+ */
168
+ temperature?: number;
80
169
  /** Maximum tokens to generate (default: 2048) */
81
170
  maxNewTokens?: number;
82
171
  /** Output sample rate (default: 24000) */
@@ -96,7 +185,8 @@ export interface GenerateOptions {
96
185
  * (adds ~150ms latency).
97
186
  *
98
187
  * Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
99
- * el, uk, bg, tr, vi, ar, hi, zh, ja, ko
188
+ * el, uk, bg, tr, vi, ar, hi, zh, ja, ko, sk, sl, hr, sr, ru,
189
+ * he, fa, ur, bn, ta, yue, th, id, ms
100
190
  */
101
191
  language?: string;
102
192
  /**
@@ -105,16 +195,52 @@ export interface GenerateOptions {
105
195
  * Default: false
106
196
  */
107
197
  wordTimestamps?: boolean;
198
+ /**
199
+ * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
200
+ *
201
+ * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
202
+ * can also be used for per-segment speed control.
203
+ * Range: [0.8, 1.2]. Default: 1.0.
204
+ */
205
+ speed?: number;
206
+ /**
207
+ * Optional project ID for project-scoped features (custom dictionary
208
+ * replacements, per-project rate limits). The caller MUST verify the
209
+ * authenticated user has access to this project before passing it; the
210
+ * server treats the value as trusted once received.
211
+ */
212
+ projectId?: number;
108
213
  }
109
214
 
110
215
  /**
111
- * Streaming session configuration.
216
+ * Streaming session configuration for `/ws/tts/stream`.
217
+ *
218
+ * The server accumulates LLM tokens internally and starts generation at natural
219
+ * sentence boundaries. Use {@link chunkLengthSchedule} to tune how eagerly the
220
+ * server begins generating, or set {@link autoMode} to start at the very first
221
+ * clean boundary — equivalent to ElevenLabs' `auto_mode=true`.
222
+ *
223
+ * @example Low-latency preset
224
+ * ```typescript
225
+ * const session = client.tts.streamingSession({
226
+ * voiceId: 123,
227
+ * autoMode: true,
228
+ * chunkLengthSchedule: [50, 100, 150, 250],
229
+ * });
230
+ * ```
112
231
  */
113
232
  export interface StreamConfig {
114
233
  /** Voice ID to use */
115
234
  voiceId?: number;
235
+ /** Model ID ('kugel-1-turbo' or 'kugel-1'). Default: 'kugel-1-turbo' */
236
+ modelId?: string;
116
237
  /** CFG scale for generation */
117
238
  cfgScale?: number;
239
+ /**
240
+ * Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
241
+ * Default: 0.5.
242
+ */
243
+ temperature?: number;
118
244
  /** Maximum tokens per generation */
119
245
  maxNewTokens?: number;
120
246
  /** Output sample rate */
@@ -138,6 +264,64 @@ export interface StreamConfig {
138
264
  * Default: false
139
265
  */
140
266
  wordTimestamps?: boolean;
267
+ /**
268
+ * Minimum buffer sizes (in characters) the server must accumulate before
269
+ * auto-emitting each successive chunk. Entry `i` applies to chunk `i`; the
270
+ * last value is reused for all subsequent chunks.
271
+ *
272
+ * Smaller values produce lower TTFA at the cost of less prosody context.
273
+ * Larger values improve naturalness but increase TTFA.
274
+ *
275
+ * @example
276
+ * ```typescript
277
+ * chunkLengthSchedule: [50, 100, 150, 250] // low-latency
278
+ * chunkLengthSchedule: [120, 200, 300] // high-quality prosody
279
+ * ```
280
+ */
281
+ chunkLengthSchedule?: number[];
282
+ /**
283
+ * When `true`, the server starts generating audio at the very first clean
284
+ * sentence boundary, regardless of `chunkLengthSchedule`. Equivalent to
285
+ * ElevenLabs' `auto_mode=true`. Prioritises low TTFA; may produce slightly
286
+ * less natural prosody on the first chunk.
287
+ */
288
+ autoMode?: boolean;
289
+ /**
290
+ * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
291
+ *
292
+ * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
293
+ * can also be used for per-segment speed control.
294
+ * Range: [0.8, 1.2]. Default: 1.0.
295
+ */
296
+ speed?: number;
297
+ }
298
+
299
/**
 * Event callbacks for a streaming session (`/ws/tts/stream`).
 *
 * This is the LLM-integration endpoint: forward raw tokens via
 * {@link StreamingSession.send} and the server auto-chunks them at sentence
 * boundaries.
 *
 * Every callback is optional; supply only the ones you need.
 */
export interface StreamingSessionCallbacks {
  /** Called when an audio chunk arrives for any segment. */
  onChunk?: (chunk: AudioChunk) => void;
  /**
   * Called when all audio for one flushed text segment is complete.
   *
   * Receives the segment index (`chunkId`), the total audio duration
   * (`audioSeconds`) and the generation time (`genMs`).
   */
  onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
  /**
   * Called when the session is fully closed (after `session.close()`).
   * Equivalent to `onFinal` on the one-shot endpoint.
   *
   * Receives session-wide totals: `totalAudioSeconds`, `totalTextChunks`
   * and `totalAudioChunks`.
   */
  onSessionClosed?: (
    totalAudioSeconds: number,
    totalTextChunks: number,
    totalAudioChunks: number,
  ) => void;
  /**
   * Called when the server begins generating audio for a text segment.
   * Receives the segment's `chunkId` and its `text`.
   */
  onGenerationStarted?: (chunkId: number, text: string) => void;
  /** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
  onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
  /** Called on any error. */
  onError?: (error: Error) => void;
}
142
326
 
143
327
  /**
@@ -170,8 +354,6 @@ export interface GenerationStats {
170
354
  durationMs: number;
171
355
  /** Generation time in milliseconds */
172
356
  generationMs: number;
173
- /** Time to first audio in milliseconds */
174
- ttfaMs: number | null;
175
357
  /** Real-time factor */
176
358
  rtf: number;
177
359
  /** Error message if any */
@@ -216,11 +398,19 @@ export interface StreamCallbacks {
216
398
  onClose?: () => void;
217
399
  }
218
400
 
401
/**
 * Deployment region. Controls which API endpoint the SDK connects to.
 * - `'eu'` — `api.kugelaudio.com` (default)
 * - `'us'` — `us-api.kugelaudio.com`
 * - `'global'` — `global-api.kugelaudio.com` (geo-routed)
 */
export type Region = 'eu' | 'us' | 'global';
408
+
219
409
  /**
220
410
  * KugelAudio client options.
221
411
  */
222
412
  export interface KugelAudioOptions {
223
- /** Your KugelAudio API key or JWT token */
413
+ /** Your KugelAudio API key or JWT token. Can be prefixed with `eu-`, `us-`, or `global-` to select a region (prefix is stripped before auth). */
224
414
  apiKey: string;
225
415
  /** Whether apiKey is a master key (for internal/server-side use). Master keys bypass billing. */
226
416
  isMasterKey?: boolean;
@@ -228,12 +418,20 @@ export interface KugelAudioOptions {
228
418
  isToken?: boolean;
229
419
  /** Organisation ID to bill usage against (required for token auth to enable usage recording). */
230
420
  orgId?: number;
421
+ /** Deployment region. Takes precedence over API-key prefix but not over `apiUrl`. */
422
+ region?: Region;
231
423
  /** API base URL (default: https://api.kugelaudio.com) */
232
424
  apiUrl?: string;
233
425
  /** TTS server URL (default: same as apiUrl) */
234
426
  ttsUrl?: string;
235
427
  /** Request timeout in milliseconds (default: 60000) */
236
428
  timeout?: number;
429
+ /**
430
+ * Interval in milliseconds between WebSocket ping frames sent on the pooled connection
431
+ * to prevent idle timeouts (default: 20000). Set to 0 or null to disable.
432
+ * Pings are only sent when the `ws` package is in use; they are skipped in native WebSocket environments such as browsers.
433
+ */
434
+ keepalivePingInterval?: number | null;
237
435
  }
238
436
 
239
437
  /**
@@ -255,10 +453,21 @@ export interface MultiContextConfig {
255
453
  sampleRate?: number;
256
454
  /** CFG scale for generation (default: 2.0) */
257
455
  cfgScale?: number;
456
+ /**
457
+ * Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
458
+ * Default: 0.5.
459
+ */
460
+ temperature?: number;
258
461
  /** Maximum tokens to generate (default: 2048) */
259
462
  maxNewTokens?: number;
260
463
  /** Enable text normalization (default: true) */
261
464
  normalize?: boolean;
465
+ /**
466
+ * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
467
+ * If not set and normalize is true (default), the server auto-detects
468
+ * the language, which adds ~60-150ms to time-to-first-audio.
469
+ */
470
+ language?: string;
262
471
  /** Seconds before context auto-closes (default: 20.0) */
263
472
  inactivityTimeout?: number;
264
473
  }
@@ -297,8 +506,6 @@ export interface MultiContextCallbacks {
297
506
  onContextCreated?: (contextId: string) => void;
298
507
  /** Called when an audio chunk is received */
299
508
  onChunk?: (chunk: MultiContextAudioChunk) => void;
300
- /** Called when a context finishes generating */
301
- onContextFinal?: (contextId: string) => void;
302
509
  /** Called when a context is closed */
303
510
  onContextClosed?: (contextId: string) => void;
304
511
  /** Called when a context times out */
package/src/websocket.ts CHANGED
@@ -8,36 +8,56 @@
8
8
 
9
9
  let _cachedWs: typeof WebSocket | null = null;
10
10
 
11
/**
 * Detect whether we are running in Node.js (vs. browser / edge / Deno).
 *
 * We prefer the `ws` package in Node because Node's built-in WebSocket
 * (added in Node 22) surfaces a useless opaque message on handshake
 * failures ("Received network error or non-101 status code"), whereas
 * `ws` exposes the rejected HTTP status in the error, which the error
 * classifier uses to raise `AuthenticationError` / `RateLimitError` etc.
 *
 * NOTE(review): the check only looks for a string `process.versions.node`,
 * so any runtime that emulates Node's `process` global also reports `true` —
 * confirm that is acceptable for those runtimes.
 *
 * @returns `true` when a `process` global with a string `versions.node` exists.
 */
function isNodeJs(): boolean {
  // `typeof` keeps this safe when no `process` global is declared at all.
  if (typeof process === 'undefined' || !process.versions) {
    return false;
  }
  return typeof process.versions.node === 'string';
}
26
+
11
27
  /**
12
28
  * Get the WebSocket constructor for the current environment.
13
- * Uses native WebSocket in browsers, ws package in Node.js.
29
+ * Prefers the `ws` package in Node.js (for richer handshake errors),
30
+ * falls back to the native `globalThis.WebSocket` elsewhere.
14
31
  * Result is cached after first call.
15
32
  */
16
33
  export function getWebSocket(): typeof WebSocket {
17
34
  if (_cachedWs) return _cachedWs;
18
35
 
19
- // Browser environment
36
+ // Node.js — prefer the `ws` package so handshake rejections carry the
37
+ // HTTP status code (see isNodeJs doc above).
38
+ if (isNodeJs()) {
39
+ try {
40
+ // Use Function constructor to hide require from static analysis by bundlers
41
+ // eslint-disable-next-line no-new-func
42
+ const _require = typeof require !== 'undefined'
43
+ ? require
44
+ : Function('return typeof require !== "undefined" ? require : undefined')();
45
+ if (_require) {
46
+ const ws = _require('ws');
47
+ _cachedWs = ws.default || ws;
48
+ return _cachedWs!;
49
+ }
50
+ } catch {
51
+ // Fall through to native if the `ws` package isn't installed.
52
+ }
53
+ }
54
+
55
+ // Browser / edge / Deno environment — use native WebSocket.
20
56
  if (typeof globalThis !== 'undefined' && typeof (globalThis as any).WebSocket !== 'undefined') {
21
57
  _cachedWs = (globalThis as any).WebSocket;
22
58
  return _cachedWs!;
23
59
  }
24
60
 
25
- // Node.js environment - use ws package via dynamic require
26
- try {
27
- // Use Function constructor to hide require from static analysis by bundlers
28
- // eslint-disable-next-line no-new-func
29
- const _require = typeof require !== 'undefined'
30
- ? require
31
- : Function('return typeof require !== "undefined" ? require : undefined')();
32
- if (_require) {
33
- const ws = _require('ws');
34
- _cachedWs = ws.default || ws;
35
- return _cachedWs!;
36
- }
37
- } catch {
38
- // Fall through to error
39
- }
40
-
41
61
  throw new Error(
42
62
  'WebSocket not available. In Node.js, install the "ws" package: npm install ws'
43
63
  );