kugelaudio 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -15,7 +15,7 @@ interface Model {
15
15
  /**
16
16
  * Voice category types.
17
17
  */
18
- type VoiceCategory = 'premade' | 'cloned' | 'designed';
18
+ type VoiceCategory = 'premade' | 'cloned' | 'designed' | 'conversational' | 'narrative' | 'narrative_story' | 'characters';
19
19
  /**
20
20
  * Voice sex types.
21
21
  */
@@ -41,6 +41,23 @@ interface Voice {
41
41
  isPublic: boolean;
42
42
  verified: boolean;
43
43
  }
44
+ /**
45
+ * Word-level timestamp from server-side forced alignment.
46
+ */
47
+ interface WordTimestamp {
48
+ /** The aligned word */
49
+ word: string;
50
+ /** Start time in milliseconds (relative to chunk/audio start) */
51
+ startMs: number;
52
+ /** End time in milliseconds (relative to chunk/audio start) */
53
+ endMs: number;
54
+ /** Start character offset in the original text */
55
+ charStart: number;
56
+ /** End character offset in the original text */
57
+ charEnd: number;
58
+ /** Alignment confidence score (0.0 - 1.0) */
59
+ score: number;
60
+ }
44
61
  /**
45
62
  * TTS generation request options.
46
63
  */
@@ -48,7 +65,7 @@ interface GenerateOptions {
48
65
  /** Text to synthesize */
49
66
  text: string;
50
67
  /** Model to use: 'kugel-1-turbo' (1.5B, fast) or 'kugel-1' (7B, premium). Default: 'kugel-1-turbo' */
51
- model?: string;
68
+ modelId?: string;
52
69
  /** Voice ID to use */
53
70
  voiceId?: number;
54
71
  /** CFG scale for generation (default: 2.0) */
@@ -57,27 +74,30 @@ interface GenerateOptions {
57
74
  maxNewTokens?: number;
58
75
  /** Output sample rate (default: 24000) */
59
76
  sampleRate?: number;
60
- /** Whether to add speaker prefix (default: true) */
61
- speakerPrefix?: boolean;
62
77
  /**
63
78
  * Enable text normalization (converts numbers, dates, etc. to spoken words).
64
79
  * When true, text will be normalized before TTS generation.
65
- * Default: false
80
+ * Default: true
66
81
  *
67
- * ⚠️ WARNING: Using normalize=true without specifying language adds ~150ms
68
- * latency for language auto-detection. For best performance, always specify
69
- * the language parameter when using normalization.
82
+ * ⚠️ For best performance, always specify the language parameter when using
83
+ * normalization. Without it, language auto-detection adds ~150ms latency.
70
84
  */
71
85
  normalize?: boolean;
72
86
  /**
73
87
  * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
74
- * If not provided and normalize is true, language will be auto-detected
88
+ * If not provided and normalize is true (default), language will be auto-detected
75
89
  * (adds ~150ms latency).
76
90
  *
77
91
  * Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
78
92
  * el, uk, bg, tr, vi, ar, hi, zh, ja, ko
79
93
  */
80
94
  language?: string;
95
+ /**
96
+ * Request word-level timestamps alongside audio.
97
+ * When true, the server performs forced alignment and returns per-word timing boundaries.
98
+ * Default: false
99
+ */
100
+ wordTimestamps?: boolean;
81
101
  }
82
102
  /**
83
103
  * Streaming session configuration.
@@ -91,12 +111,25 @@ interface StreamConfig {
91
111
  maxNewTokens?: number;
92
112
  /** Output sample rate */
93
113
  sampleRate?: number;
94
- /** Whether to add speaker prefix */
95
- speakerPrefix?: boolean;
96
114
  /** Auto-flush timeout in milliseconds */
97
115
  flushTimeoutMs?: number;
98
116
  /** Maximum buffer length */
99
117
  maxBufferLength?: number;
118
+ /**
119
+ * Enable text normalization (converts numbers, dates, etc. to spoken words).
120
+ * Default: true
121
+ */
122
+ normalize?: boolean;
123
+ /**
124
+ * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
125
+ * Specify to avoid ~150ms auto-detection latency.
126
+ */
127
+ language?: string;
128
+ /**
129
+ * Request word-level timestamps alongside audio.
130
+ * Default: false
131
+ */
132
+ wordTimestamps?: boolean;
100
133
  }
101
134
  /**
102
135
  * Audio chunk from streaming TTS.
@@ -150,6 +183,8 @@ interface AudioResponse {
150
183
  generationMs: number;
151
184
  /** Real-time factor */
152
185
  rtf: number;
186
+ /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
187
+ wordTimestamps: WordTimestamp[];
153
188
  }
154
189
  /**
155
190
  * Event callbacks for streaming.
@@ -157,6 +192,8 @@ interface AudioResponse {
157
192
  interface StreamCallbacks {
158
193
  /** Called when an audio chunk is received */
159
194
  onChunk?: (chunk: AudioChunk) => void;
195
+ /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
196
+ onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
160
197
  /** Called when generation is complete */
161
198
  onFinal?: (stats: GenerationStats) => void;
162
199
  /** Called on error */
@@ -176,17 +213,75 @@ interface KugelAudioOptions {
176
213
  isMasterKey?: boolean;
177
214
  /** Whether apiKey is a JWT token (for user authentication). Takes precedence over isMasterKey. */
178
215
  isToken?: boolean;
216
+ /** Organisation ID to bill usage against (required for token auth to enable usage recording). */
217
+ orgId?: number;
179
218
  /** API base URL (default: https://api.kugelaudio.com) */
180
219
  apiUrl?: string;
181
- /** TTS server URL (default: https://eu.kugelaudio.com) */
220
+ /** TTS server URL (default: same as apiUrl) */
182
221
  ttsUrl?: string;
183
222
  /** Request timeout in milliseconds (default: 60000) */
184
223
  timeout?: number;
185
224
  }
186
-
187
225
  /**
188
- * KugelAudio API Client.
226
+ * Multi-context session configuration.
189
227
  */
228
+ interface MultiContextConfig {
229
+ /** Default voice ID for new contexts */
230
+ defaultVoiceId?: number;
231
+ /** Output sample rate (default: 24000) */
232
+ sampleRate?: number;
233
+ /** CFG scale for generation (default: 2.0) */
234
+ cfgScale?: number;
235
+ /** Maximum tokens to generate (default: 2048) */
236
+ maxNewTokens?: number;
237
+ /** Enable text normalization (default: true) */
238
+ normalize?: boolean;
239
+ /** Seconds before context auto-closes (default: 20.0) */
240
+ inactivityTimeout?: number;
241
+ }
242
+ /**
243
+ * Voice settings for a specific context.
244
+ */
245
+ interface ContextVoiceSettings {
246
+ /** Stability (0.0-1.0) */
247
+ stability?: number;
248
+ /** Similarity boost (0.0-1.0) */
249
+ similarityBoost?: number;
250
+ /** Style (0.0-1.0) */
251
+ style?: number;
252
+ /** Use speaker boost */
253
+ useSpeakerBoost?: boolean;
254
+ /** Speed multiplier */
255
+ speed?: number;
256
+ }
257
+ /**
258
+ * Audio chunk from multi-context streaming.
259
+ */
260
+ interface MultiContextAudioChunk extends AudioChunk {
261
+ /** Context ID this audio belongs to */
262
+ contextId: string;
263
+ }
264
+ /**
265
+ * Event callbacks for multi-context streaming.
266
+ */
267
+ interface MultiContextCallbacks {
268
+ /** Called when session is started */
269
+ onSessionStarted?: (sessionId: string) => void;
270
+ /** Called when a context is created */
271
+ onContextCreated?: (contextId: string) => void;
272
+ /** Called when an audio chunk is received */
273
+ onChunk?: (chunk: MultiContextAudioChunk) => void;
274
+ /** Called when a context finishes generating */
275
+ onContextFinal?: (contextId: string) => void;
276
+ /** Called when a context is closed */
277
+ onContextClosed?: (contextId: string) => void;
278
+ /** Called when a context times out */
279
+ onContextTimeout?: (contextId: string) => void;
280
+ /** Called when session is closed */
281
+ onSessionClosed?: (stats: Record<string, unknown>) => void;
282
+ /** Called on error */
283
+ onError?: (error: Error, contextId?: string) => void;
284
+ }
190
285
 
191
286
  /**
192
287
  * Models resource for listing TTS models.
@@ -290,6 +385,98 @@ declare class TTSResource {
290
385
  */
291
386
  close(): void;
292
387
  private parseError;
388
+ /**
389
+ * Create a multi-context session for concurrent TTS streams.
390
+ *
391
+ * Allows managing up to 5 independent audio generation contexts
392
+ * over a single WebSocket connection. Each context has its own
393
+ * text buffer, voice settings, and generation queue.
394
+ *
395
+ * @example
396
+ * ```typescript
397
+ * const session = client.tts.createMultiContextSession({
398
+ * defaultVoiceId: 123,
399
+ * });
400
+ *
401
+ * session.connect({
402
+ * onChunk: (chunk) => {
403
+ * console.log(`Audio from ${chunk.contextId}`);
404
+ * playAudio(chunk.audio);
405
+ * },
406
+ * onContextFinal: (contextId) => {
407
+ * console.log(`${contextId} finished`);
408
+ * },
409
+ * });
410
+ *
411
+ * // Create contexts with different voices
412
+ * session.createContext('narrator', { voiceId: 123 });
413
+ * session.createContext('character', { voiceId: 456 });
414
+ *
415
+ * // Send text to different speakers
416
+ * session.send('narrator', 'The story begins.', true);
417
+ * session.send('character', 'Hello!', true);
418
+ *
419
+ * // Close when done
420
+ * session.close();
421
+ * ```
422
+ */
423
+ createMultiContextSession(config?: MultiContextConfig): MultiContextSession;
424
+ }
425
+ /**
426
+ * Multi-context WebSocket session for concurrent TTS streams.
427
+ */
428
+ declare class MultiContextSession {
429
+ private client;
430
+ private ws;
431
+ private config;
432
+ private callbacks;
433
+ private contexts;
434
+ private _sessionId;
435
+ private isStarted;
436
+ constructor(client: KugelAudio, config?: MultiContextConfig);
437
+ /**
438
+ * Get the current session ID, or null if not connected.
439
+ */
440
+ get sessionId(): string | null;
441
+ /**
442
+ * Connect to the multi-context WebSocket endpoint.
443
+ */
444
+ connect(callbacks: MultiContextCallbacks): void;
445
+ /**
446
+ * Create a new context with optional voice settings.
447
+ */
448
+ createContext(contextId: string, options?: {
449
+ voiceId?: number;
450
+ voiceSettings?: ContextVoiceSettings;
451
+ }): void;
452
+ /**
453
+ * Send text to a specific context.
454
+ */
455
+ send(contextId: string, text: string, flush?: boolean): void;
456
+ /**
457
+ * Flush a context's buffer.
458
+ */
459
+ flush(contextId: string): void;
460
+ /**
461
+ * Close a specific context.
462
+ */
463
+ closeContext(contextId: string): void;
464
+ /**
465
+ * Send keep-alive to reset a context's inactivity timeout.
466
+ */
467
+ keepAlive(contextId: string): void;
468
+ /**
469
+ * Close the session and all contexts.
470
+ */
471
+ close(): void;
472
+ /**
473
+ * Get active context IDs.
474
+ */
475
+ get activeContexts(): string[];
476
+ /**
477
+ * Check if connected.
478
+ */
479
+ get isConnected(): boolean;
293
480
  }
294
481
  /**
295
482
  * KugelAudio API client.
@@ -307,13 +494,13 @@ declare class TTSResource {
307
494
  * // Generate audio with fast model (1.5B params)
308
495
  * const audio = await client.tts.generate({
309
496
  * text: 'Hello, world!',
310
- * model: 'kugel-1-turbo',
497
+ * modelId: 'kugel-1-turbo',
311
498
  * });
312
499
  *
313
500
  * // Generate audio with premium model (7B params)
314
501
  * const audio = await client.tts.generate({
315
502
  * text: 'Hello, world!',
316
- * model: 'kugel-1',
503
+ * modelId: 'kugel-1',
317
504
  * });
318
505
  * ```
319
506
  */
@@ -321,6 +508,7 @@ declare class KugelAudio {
321
508
  private _apiKey;
322
509
  private _isMasterKey;
323
510
  private _isToken;
511
+ private _orgId;
324
512
  private _apiUrl;
325
513
  private _ttsUrl;
326
514
  private _timeout;
@@ -354,6 +542,8 @@ declare class KugelAudio {
354
542
  get isMasterKey(): boolean;
355
543
  /** Check if using JWT token authentication */
356
544
  get isToken(): boolean;
545
+ /** Get organisation ID for billing */
546
+ get orgId(): number | undefined;
357
547
  /** Get TTS URL */
358
548
  get ttsUrl(): string;
359
549
  /**
@@ -451,4 +641,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
451
641
  */
452
642
  declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
453
643
 
454
- export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
644
+ export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, type WordTimestamp, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };