kugelaudio 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/types.ts CHANGED
@@ -47,18 +47,125 @@ export interface Voice {
47
47
  verified: boolean;
48
48
  }
49
49
 
50
+ /**
51
+ * Paginated response from the voices list endpoint.
52
+ */
53
+ export interface VoiceListResponse {
54
+ voices: Voice[];
55
+ total: number;
56
+ limit: number;
57
+ offset: number;
58
+ }
59
+
60
+ /**
61
+ * Voice quality levels.
62
+ */
63
+ export type VoiceQuality = 'low' | 'mid' | 'high';
64
+
65
+ /**
66
+ * Extended voice information returned by voice management endpoints.
67
+ */
68
+ export interface VoiceDetail {
69
+ id: number;
70
+ name: string;
71
+ description: string;
72
+ generativeVoiceDescription: string;
73
+ supportedLanguages: string[];
74
+ category: string;
75
+ age?: string;
76
+ sex?: string;
77
+ quality: string;
78
+ isPublic: boolean;
79
+ verified: boolean;
80
+ pendingVerification: boolean;
81
+ sampleUrl?: string;
82
+ avatarUrl?: string;
83
+ sampleText: string;
84
+ }
85
+
86
+ /**
87
+ * Voice reference audio metadata.
88
+ */
89
+ export interface VoiceReference {
90
+ id: number;
91
+ voiceId: number;
92
+ name: string;
93
+ referenceText: string;
94
+ s3Path: string;
95
+ audioUrl?: string;
96
+ isGenerated: boolean;
97
+ }
98
+
99
+ /**
100
+ * Options for creating a new voice.
101
+ */
102
+ export interface CreateVoiceOptions {
103
+ name: string;
104
+ sex: string;
105
+ description?: string;
106
+ category?: string;
107
+ age?: string;
108
+ quality?: string;
109
+ supportedLanguages?: string[];
110
+ isPublic?: boolean;
111
+ sampleText?: string;
112
+ /** Reference audio files (File objects in the browser, Blob in Node.js — wrap a Buffer with `new Blob([buffer])`) */
113
+ referenceFiles?: Array<File | Blob>;
114
+ }
115
+
116
+ /**
117
+ * Options for updating an existing voice.
118
+ */
119
+ export interface UpdateVoiceOptions {
120
+ name?: string;
121
+ description?: string;
122
+ category?: string;
123
+ age?: string;
124
+ sex?: string;
125
+ quality?: string;
126
+ supportedLanguages?: string[];
127
+ isPublic?: boolean;
128
+ sampleText?: string;
129
+ }
130
+
131
+ /**
132
+ * Word-level timestamp from server-side forced alignment.
133
+ */
134
+ export interface WordTimestamp {
135
+ /** The aligned word */
136
+ word: string;
137
+ /** Start time in milliseconds (relative to chunk/audio start) */
138
+ startMs: number;
139
+ /** End time in milliseconds (relative to chunk/audio start) */
140
+ endMs: number;
141
+ /** Start character offset in the original text */
142
+ charStart: number;
143
+ /** End character offset in the original text */
144
+ charEnd: number;
145
+ /** Alignment confidence score (0.0 - 1.0) */
146
+ score: number;
147
+ }
148
+
50
149
  /**
51
150
  * TTS generation request options.
52
151
  */
53
152
  export interface GenerateOptions {
54
153
  /** Text to synthesize */
55
154
  text: string;
56
- /** Model to use: 'kugel-1-turbo' (1.5B, fast) or 'kugel-1' (7B, premium). Default: 'kugel-1-turbo' */
155
+ /** Model to use: 'kugel-1-turbo' (fast) or 'kugel-1' (premium). Default: 'kugel-1-turbo' */
57
156
  modelId?: string;
58
157
  /** Voice ID to use */
59
158
  voiceId?: number;
60
159
  /** CFG scale for generation (default: 2.0) */
61
160
  cfgScale?: number;
161
+ /**
162
+ * Sampling variance. Range [0.0, 1.0]. 0 = most stable (near-greedy),
163
+ * 1 = most variance. Default: 0.5.
164
+ *
165
+ * Lower values produce more consistent reads across regenerations —
166
+ * useful for stable voiceovers, IVR prompts, and e-learning.
167
+ */
168
+ temperature?: number;
62
169
  /** Maximum tokens to generate (default: 2048) */
63
170
  maxNewTokens?: number;
64
171
  /** Output sample rate (default: 24000) */
@@ -78,19 +185,62 @@ export interface GenerateOptions {
78
185
  * (adds ~150ms latency).
79
186
  *
80
187
  * Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
81
- * el, uk, bg, tr, vi, ar, hi, zh, ja, ko
188
+ * el, uk, bg, tr, vi, ar, hi, zh, ja, ko, sk, sl, hr, sr, ru,
189
+ * he, fa, ur, bn, ta, yue, th, id, ms
82
190
  */
83
191
  language?: string;
192
+ /**
193
+ * Request word-level timestamps alongside audio.
194
+ * When true, the server performs forced alignment and returns per-word timing boundaries.
195
+ * Default: false
196
+ */
197
+ wordTimestamps?: boolean;
198
+ /**
199
+ * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
200
+ *
201
+ * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
202
+ * can also be used for per-segment speed control.
203
+ * Range: [0.8, 1.2]. Default: 1.0.
204
+ */
205
+ speed?: number;
206
+ /**
207
+ * Optional project ID for project-scoped features (custom dictionary
208
+ * replacements, per-project rate limits). The caller MUST verify the
209
+ * authenticated user has access to this project before passing it; the
210
+ * server treats the value as trusted once received.
211
+ */
212
+ projectId?: number;
84
213
  }
85
214
 
86
215
  /**
87
- * Streaming session configuration.
216
+ * Streaming session configuration for `/ws/tts/stream`.
217
+ *
218
+ * The server accumulates LLM tokens internally and starts generation at natural
219
+ * sentence boundaries. Use {@link chunkLengthSchedule} to tune how eagerly the
220
+ * server begins generating, or set {@link autoMode} to start at the very first
221
+ * clean boundary — equivalent to ElevenLabs' `auto_mode=true`.
222
+ *
223
+ * @example Low-latency preset
224
+ * ```typescript
225
+ * const session = client.tts.streamingSession({
226
+ * voiceId: 123,
227
+ * autoMode: true,
228
+ * chunkLengthSchedule: [50, 100, 150, 250],
229
+ * });
230
+ * ```
88
231
  */
89
232
  export interface StreamConfig {
90
233
  /** Voice ID to use */
91
234
  voiceId?: number;
235
+ /** Model ID ('kugel-1-turbo' or 'kugel-1'). Default: 'kugel-1-turbo' */
236
+ modelId?: string;
92
237
  /** CFG scale for generation */
93
238
  cfgScale?: number;
239
+ /**
240
+ * Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
241
+ * Default: 0.5.
242
+ */
243
+ temperature?: number;
94
244
  /** Maximum tokens per generation */
95
245
  maxNewTokens?: number;
96
246
  /** Output sample rate */
@@ -109,6 +259,69 @@ export interface StreamConfig {
109
259
  * Specify to avoid ~150ms auto-detection latency.
110
260
  */
111
261
  language?: string;
262
+ /**
263
+ * Request word-level timestamps alongside audio.
264
+ * Default: false
265
+ */
266
+ wordTimestamps?: boolean;
267
+ /**
268
+ * Minimum buffer sizes (in characters) the server must accumulate before
269
+ * auto-emitting each successive chunk. Entry `i` applies to chunk `i`; the
270
+ * last value is reused for all subsequent chunks.
271
+ *
272
+ * Smaller values produce lower TTFA at the cost of less prosody context.
273
+ * Larger values improve naturalness but increase TTFA.
274
+ *
275
+ * @example
276
+ * ```typescript
277
+ * chunkLengthSchedule: [50, 100, 150, 250] // low-latency
278
+ * chunkLengthSchedule: [120, 200, 300] // high-quality prosody
279
+ * ```
280
+ */
281
+ chunkLengthSchedule?: number[];
282
+ /**
283
+ * When `true`, the server starts generating audio at the very first clean
284
+ * sentence boundary, regardless of `chunkLengthSchedule`. Equivalent to
285
+ * ElevenLabs' `auto_mode=true`. Prioritises low TTFA; may produce slightly
286
+ * less natural prosody on the first chunk.
287
+ */
288
+ autoMode?: boolean;
289
+ /**
290
+ * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
291
+ *
292
+ * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
293
+ * can also be used for per-segment speed control.
294
+ * Range: [0.8, 1.2]. Default: 1.0.
295
+ */
296
+ speed?: number;
297
+ }
298
+
299
+ /**
300
+ * Event callbacks for a streaming session (`/ws/tts/stream`).
301
+ *
302
+ * This is the LLM-integration endpoint: forward raw tokens via
303
+ * {@link StreamingSession.send} and the server auto-chunks them at sentence
304
+ * boundaries.
305
+ */
306
+ export interface StreamingSessionCallbacks {
307
+ /** Called when an audio chunk arrives for any segment. */
308
+ onChunk?: (chunk: AudioChunk) => void;
309
+ /**
310
+ * Called when all audio for one flushed text segment is complete.
311
+ * Carries the segment index, the segment's audio duration in seconds, and its generation time in milliseconds.
312
+ */
313
+ onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
314
+ /**
315
+ * Called when the session is fully closed (after `session.close()`).
316
+ * Equivalent to `onFinal` on the one-shot endpoint.
317
+ */
318
+ onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
319
+ /** Called when the server begins generating audio for a text segment. */
320
+ onGenerationStarted?: (chunkId: number, text: string) => void;
321
+ /** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
322
+ onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
323
+ /** Called on any error. */
324
+ onError?: (error: Error) => void;
112
325
  }
113
326
 
114
327
  /**
@@ -141,8 +354,6 @@ export interface GenerationStats {
141
354
  durationMs: number;
142
355
  /** Generation time in milliseconds */
143
356
  generationMs: number;
144
- /** Time to first audio in milliseconds */
145
- ttfaMs: number | null;
146
357
  /** Real-time factor */
147
358
  rtf: number;
148
359
  /** Error message if any */
@@ -165,6 +376,8 @@ export interface AudioResponse {
165
376
  generationMs: number;
166
377
  /** Real-time factor */
167
378
  rtf: number;
379
+ /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
380
+ wordTimestamps: WordTimestamp[];
168
381
  }
169
382
 
170
383
  /**
@@ -173,6 +386,8 @@ export interface AudioResponse {
173
386
  export interface StreamCallbacks {
174
387
  /** Called when an audio chunk is received */
175
388
  onChunk?: (chunk: AudioChunk) => void;
389
+ /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
390
+ onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
176
391
  /** Called when generation is complete */
177
392
  onFinal?: (stats: GenerationStats) => void;
178
393
  /** Called on error */
@@ -183,11 +398,19 @@ export interface StreamCallbacks {
183
398
  onClose?: () => void;
184
399
  }
185
400
 
401
+ /**
402
+ * Deployment region. Controls which API endpoint the SDK connects to.
403
+ * - `'eu'` — `api.kugelaudio.com` (default)
404
+ * - `'us'` — `us-api.kugelaudio.com`
405
+ * - `'global'` — `global-api.kugelaudio.com` (geo-routed)
406
+ */
407
+ export type Region = 'eu' | 'us' | 'global';
408
+
186
409
  /**
187
410
  * KugelAudio client options.
188
411
  */
189
412
  export interface KugelAudioOptions {
190
- /** Your KugelAudio API key or JWT token */
413
+ /** Your KugelAudio API key or JWT token. Can be prefixed with `eu-`, `us-`, or `global-` to select a region (prefix is stripped before auth). */
191
414
  apiKey: string;
192
415
  /** Whether apiKey is a master key (for internal/server-side use). Master keys bypass billing. */
193
416
  isMasterKey?: boolean;
@@ -195,12 +418,20 @@ export interface KugelAudioOptions {
195
418
  isToken?: boolean;
196
419
  /** Organisation ID to bill usage against (required for token auth to enable usage recording). */
197
420
  orgId?: number;
421
+ /** Deployment region. Takes precedence over API-key prefix but not over `apiUrl`. */
422
+ region?: Region;
198
423
  /** API base URL (default: https://api.kugelaudio.com) */
199
424
  apiUrl?: string;
200
425
  /** TTS server URL (default: same as apiUrl) */
201
426
  ttsUrl?: string;
202
427
  /** Request timeout in milliseconds (default: 60000) */
203
428
  timeout?: number;
429
+ /**
430
+ * Interval in milliseconds between WebSocket ping frames sent on the pooled connection
431
+ * to prevent idle timeouts (default: 20000). Set to 0 or null to disable.
432
+ * In browsers, pings are sent via the ws package only (skipped in native WebSocket environments).
433
+ */
434
+ keepalivePingInterval?: number | null;
204
435
  }
205
436
 
206
437
  /**
@@ -222,10 +453,21 @@ export interface MultiContextConfig {
222
453
  sampleRate?: number;
223
454
  /** CFG scale for generation (default: 2.0) */
224
455
  cfgScale?: number;
456
+ /**
457
+ * Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
458
+ * Default: 0.5.
459
+ */
460
+ temperature?: number;
225
461
  /** Maximum tokens to generate (default: 2048) */
226
462
  maxNewTokens?: number;
227
463
  /** Enable text normalization (default: true) */
228
464
  normalize?: boolean;
465
+ /**
466
+ * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
467
+ * If not set and normalize is true (default), the server auto-detects
468
+ * the language, which adds ~60-150ms to time-to-first-audio.
469
+ */
470
+ language?: string;
229
471
  /** Seconds before context auto-closes (default: 20.0) */
230
472
  inactivityTimeout?: number;
231
473
  }
@@ -264,8 +506,6 @@ export interface MultiContextCallbacks {
264
506
  onContextCreated?: (contextId: string) => void;
265
507
  /** Called when an audio chunk is received */
266
508
  onChunk?: (chunk: MultiContextAudioChunk) => void;
267
- /** Called when a context finishes generating */
268
- onContextFinal?: (contextId: string) => void;
269
509
  /** Called when a context is closed */
270
510
  onContextClosed?: (contextId: string) => void;
271
511
  /** Called when a context times out */
package/src/websocket.ts CHANGED
@@ -8,36 +8,56 @@
8
8
 
9
9
  let _cachedWs: typeof WebSocket | null = null;
10
10
 
11
+ /**
12
+ * Detect whether we are running in Node.js (vs. browser / edge / Deno).
13
+ * We prefer the `ws` package in Node because Node's built-in WebSocket
14
+ * (added in Node 22) surfaces a useless opaque message on handshake
15
+ * failures ("Received network error or non-101 status code"), whereas
16
+ * `ws` exposes the rejected HTTP status in the error, which the error
17
+ * classifier uses to raise `AuthenticationError` / `RateLimitError` etc.
18
+ */
19
+ function isNodeJs(): boolean {
20
+ return (
21
+ typeof process !== 'undefined' &&
22
+ !!process.versions &&
23
+ typeof process.versions.node === 'string'
24
+ );
25
+ }
26
+
11
27
  /**
12
28
  * Get the WebSocket constructor for the current environment.
13
- * Uses native WebSocket in browsers, ws package in Node.js.
29
+ * Prefers the `ws` package in Node.js (for richer handshake errors),
30
+ * falls back to the native `globalThis.WebSocket` elsewhere.
14
31
  * Result is cached after first call.
15
32
  */
16
33
  export function getWebSocket(): typeof WebSocket {
17
34
  if (_cachedWs) return _cachedWs;
18
35
 
19
- // Browser environment
36
+ // Node.js — prefer the `ws` package so handshake rejections carry the
37
+ // HTTP status code (see isNodeJs doc above).
38
+ if (isNodeJs()) {
39
+ try {
40
+ // Use Function constructor to hide require from static analysis by bundlers
41
+ // eslint-disable-next-line no-new-func
42
+ const _require = typeof require !== 'undefined'
43
+ ? require
44
+ : Function('return typeof require !== "undefined" ? require : undefined')();
45
+ if (_require) {
46
+ const ws = _require('ws');
47
+ _cachedWs = ws.default || ws;
48
+ return _cachedWs!;
49
+ }
50
+ } catch {
51
+ // Fall through to native if the `ws` package isn't installed.
52
+ }
53
+ }
54
+
55
+ // Browser / edge / Deno environment — use native WebSocket.
20
56
  if (typeof globalThis !== 'undefined' && typeof (globalThis as any).WebSocket !== 'undefined') {
21
57
  _cachedWs = (globalThis as any).WebSocket;
22
58
  return _cachedWs!;
23
59
  }
24
60
 
25
- // Node.js environment - use ws package via dynamic require
26
- try {
27
- // Use Function constructor to hide require from static analysis by bundlers
28
- // eslint-disable-next-line no-new-func
29
- const _require = typeof require !== 'undefined'
30
- ? require
31
- : Function('return typeof require !== "undefined" ? require : undefined')();
32
- if (_require) {
33
- const ws = _require('ws');
34
- _cachedWs = ws.default || ws;
35
- return _cachedWs!;
36
- }
37
- } catch {
38
- // Fall through to error
39
- }
40
-
41
61
  throw new Error(
42
62
  'WebSocket not available. In Node.js, install the "ws" package: npm install ws'
43
63
  );