kugelaudio 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/client.ts CHANGED
@@ -16,12 +16,27 @@ import type {
16
16
  KugelAudioOptions,
17
17
  Model,
18
18
  StreamCallbacks,
19
- Voice
19
+ Voice,
20
+ WordTimestamp
20
21
  } from './types';
21
22
  import { base64ToArrayBuffer } from './utils';
23
+ import { getWebSocket } from './websocket';
22
24
 
23
25
  const DEFAULT_API_URL = 'https://api.kugelaudio.com';
24
26
 
27
/**
 * Build a WebSocket for the given URL.
 *
 * The constructor is looked up through getWebSocket() at call time instead of
 * at module load, which keeps this module free of the top-level side effects
 * that break server-side bundlers (Turbopack/Webpack).
 *
 * @param url - WebSocket endpoint to connect to.
 * @returns The newly constructed socket.
 */
function createWs(url: string): WebSocket {
  return new (getWebSocket())(url);
}

/** Numeric `readyState` value of an open WebSocket. */
const WS_OPEN = 1;
39
+
25
40
  /**
26
41
  * Models resource for listing TTS models.
27
42
  */
@@ -111,6 +126,7 @@ class VoicesResource {
111
126
  * TTS resource for text-to-speech generation.
112
127
  */
113
128
  class TTSResource {
129
+ // Typed as WebSocket (not any); instances come from createWs(), which supports both browser WebSocket and ws package
114
130
  private wsConnection: WebSocket | null = null;
115
131
  private wsUrl: string | null = null;
116
132
  private pendingRequests: Map<number, {
@@ -147,7 +163,7 @@ class TTSResource {
147
163
  * Check if WebSocket connection is established and open.
148
164
  */
149
165
  isConnected(): boolean {
150
- return this.wsConnection !== null && this.wsConnection.readyState === WebSocket.OPEN;
166
+ return this.wsConnection !== null && this.wsConnection.readyState === WS_OPEN;
151
167
  }
152
168
 
153
169
  /**
@@ -157,11 +173,15 @@ class TTSResource {
157
173
  async generate(options: GenerateOptions): Promise<AudioResponse> {
158
174
  const chunks: ArrayBuffer[] = [];
159
175
  let finalStats: GenerationStats | undefined;
176
+ const allTimestamps: WordTimestamp[] = [];
160
177
 
161
178
  await this.stream(options, {
162
179
  onChunk: (chunk) => {
163
180
  chunks.push(base64ToArrayBuffer(chunk.audio));
164
181
  },
182
+ onWordTimestamps: (timestamps) => {
183
+ allTimestamps.push(...timestamps);
184
+ },
165
185
  onFinal: (stats) => {
166
186
  finalStats = stats;
167
187
  },
@@ -183,6 +203,7 @@ class TTSResource {
183
203
  durationMs: finalStats ? finalStats.durationMs : 0,
184
204
  generationMs: finalStats ? finalStats.generationMs : 0,
185
205
  rtf: finalStats ? finalStats.rtf : 0,
206
+ wordTimestamps: allTimestamps,
186
207
  };
187
208
  }
188
209
 
@@ -202,7 +223,12 @@ class TTSResource {
202
223
  } else {
203
224
  authParam = 'api_key';
204
225
  }
205
- return `${wsUrl}/ws/tts?${authParam}=${this.client.apiKey}`;
226
+ let url = `${wsUrl}/ws/tts?${authParam}=${this.client.apiKey}`;
227
+ // Append org_id whenever one is configured (intended for token auth) so usage is recorded against the org
228
+ if (this.client.orgId !== undefined) {
229
+ url += `&org_id=${this.client.orgId}`;
230
+ }
231
+ return url;
206
232
  }
207
233
 
208
234
  /**
@@ -216,7 +242,7 @@ class TTSResource {
216
242
  if (
217
243
  this.wsConnection &&
218
244
  this.wsUrl === url &&
219
- this.wsConnection.readyState === WebSocket.OPEN
245
+ this.wsConnection.readyState === WS_OPEN
220
246
  ) {
221
247
  return this.wsConnection;
222
248
  }
@@ -233,7 +259,7 @@ class TTSResource {
233
259
 
234
260
  // Create new connection
235
261
  return new Promise((resolve, reject) => {
236
- const ws = new WebSocket(url);
262
+ const ws = createWs(url);
237
263
 
238
264
  ws.onopen = () => {
239
265
  this.wsConnection = ws;
@@ -252,9 +278,15 @@ class TTSResource {
252
278
  * Setup message handler for pooled connection.
253
279
  */
254
280
  private setupMessageHandler(ws: WebSocket): void {
255
- ws.onmessage = (event) => {
281
+ ws.onmessage = (event: { data: unknown }) => {
256
282
  try {
257
- const data = JSON.parse(event.data);
283
+ // Handle both browser (string) and Node.js (Buffer) message formats
284
+ const messageData = typeof event.data === 'string'
285
+ ? event.data
286
+ : event.data instanceof Buffer
287
+ ? event.data.toString()
288
+ : String(event.data);
289
+ const data = JSON.parse(messageData);
258
290
 
259
291
  // Get the current pending request (we process one at a time)
260
292
  const [requestId, pending] = [...this.pendingRequests.entries()][0] || [];
@@ -295,6 +327,20 @@ class TTSResource {
295
327
  };
296
328
  pending.callbacks.onChunk?.(chunk);
297
329
  }
330
+
331
+ if (data.word_timestamps) {
332
+ const timestamps: WordTimestamp[] = data.word_timestamps.map(
333
+ (w: Record<string, unknown>) => ({
334
+ word: w.word as string,
335
+ startMs: w.start_ms as number,
336
+ endMs: w.end_ms as number,
337
+ charStart: w.char_start as number,
338
+ charEnd: w.char_end as number,
339
+ score: (w.score as number) ?? 1.0,
340
+ })
341
+ );
342
+ pending.callbacks.onWordTimestamps?.(timestamps);
343
+ }
298
344
  } catch (e) {
299
345
  console.error('Failed to parse WebSocket message:', e);
300
346
  }
@@ -364,14 +410,14 @@ class TTSResource {
364
410
 
365
411
  ws.send(JSON.stringify({
366
412
  text: options.text,
367
- model: options.model || 'kugel-1-turbo',
413
+ model_id: options.modelId || 'kugel-1-turbo',
368
414
  voice_id: options.voiceId,
369
415
  cfg_scale: options.cfgScale ?? 2.0,
370
416
  max_new_tokens: options.maxNewTokens ?? 2048,
371
417
  sample_rate: options.sampleRate ?? 24000,
372
- speaker_prefix: options.speakerPrefix ?? true,
373
- normalize: options.normalize ?? false,
418
+ normalize: options.normalize ?? true,
374
419
  ...(options.language && { language: options.language }),
420
+ ...(options.wordTimestamps && { word_timestamps: true }),
375
421
  }));
376
422
  });
377
423
  }
@@ -385,27 +431,33 @@ class TTSResource {
385
431
  ): Promise<void> {
386
432
  return new Promise((resolve, reject) => {
387
433
  const url = this.buildWsUrl();
388
- const ws = new WebSocket(url);
434
+ const ws = createWs(url);
389
435
 
390
436
  ws.onopen = () => {
391
437
  callbacks.onOpen?.();
392
438
  // Send TTS request
393
439
  ws.send(JSON.stringify({
394
440
  text: options.text,
395
- model: options.model || 'kugel-1-turbo',
441
+ model_id: options.modelId || 'kugel-1-turbo',
396
442
  voice_id: options.voiceId,
397
443
  cfg_scale: options.cfgScale ?? 2.0,
398
444
  max_new_tokens: options.maxNewTokens ?? 2048,
399
445
  sample_rate: options.sampleRate ?? 24000,
400
- speaker_prefix: options.speakerPrefix ?? true,
401
- normalize: options.normalize ?? false,
446
+ normalize: options.normalize ?? true,
402
447
  ...(options.language && { language: options.language }),
448
+ ...(options.wordTimestamps && { word_timestamps: true }),
403
449
  }));
404
450
  };
405
451
 
406
- ws.onmessage = (event) => {
452
+ ws.onmessage = (event: { data: unknown }) => {
407
453
  try {
408
- const data = JSON.parse(event.data);
454
+ // Handle both browser (string) and Node.js (Buffer) message formats
455
+ const messageData = typeof event.data === 'string'
456
+ ? event.data
457
+ : event.data instanceof Buffer
458
+ ? event.data.toString()
459
+ : String(event.data);
460
+ const data = JSON.parse(messageData);
409
461
 
410
462
  if (data.error) {
411
463
  const error = this.parseError(data.error);
@@ -442,6 +494,20 @@ class TTSResource {
442
494
  };
443
495
  callbacks.onChunk?.(chunk);
444
496
  }
497
+
498
+ if (data.word_timestamps) {
499
+ const timestamps: WordTimestamp[] = data.word_timestamps.map(
500
+ (w: Record<string, unknown>) => ({
501
+ word: w.word as string,
502
+ startMs: w.start_ms as number,
503
+ endMs: w.end_ms as number,
504
+ charStart: w.char_start as number,
505
+ charEnd: w.char_end as number,
506
+ score: (w.score as number) ?? 1.0,
507
+ })
508
+ );
509
+ callbacks.onWordTimestamps?.(timestamps);
510
+ }
445
511
  } catch (e) {
446
512
  console.error('Failed to parse WebSocket message:', e);
447
513
  }
@@ -489,6 +555,306 @@ class TTSResource {
489
555
  }
490
556
  return new KugelAudioError(message);
491
557
  }
558
+
559
+ /**
560
+ * Create a multi-context session for concurrent TTS streams.
561
+ *
562
+ * Allows managing up to 5 independent audio generation contexts
563
+ * over a single WebSocket connection. Each context has its own
564
+ * text buffer, voice settings, and generation queue.
565
+ *
566
+ * @example
567
+ * ```typescript
568
+ * const session = client.tts.createMultiContextSession({
569
+ * defaultVoiceId: 123,
570
+ * });
571
+ *
572
+ * session.connect({
573
+ * onChunk: (chunk) => {
574
+ * console.log(`Audio from ${chunk.contextId}`);
575
+ * playAudio(chunk.audio);
576
+ * },
577
+ * onContextFinal: (contextId) => {
578
+ * console.log(`${contextId} finished`);
579
+ * },
580
+ * });
581
+ *
582
+ * // Create contexts with different voices
583
+ * session.createContext('narrator', { voiceId: 123 });
584
+ * session.createContext('character', { voiceId: 456 });
585
+ *
586
+ * // Send text to different speakers
587
+ * session.send('narrator', 'The story begins.', true);
588
+ * session.send('character', 'Hello!', true);
589
+ *
590
+ * // Close when done
591
+ * session.close();
592
+ * ```
593
+ */
594
+ createMultiContextSession(
595
+ config?: import('./types').MultiContextConfig
596
+ ): MultiContextSession {
597
+ return new MultiContextSession(this.client, config);
598
+ }
599
+ }
600
+
601
/**
 * Multi-context WebSocket session for concurrent TTS streams.
 *
 * One socket carries several independent contexts, each addressed by a
 * caller-chosen `context_id`. Server events are reported through the
 * callbacks passed to `connect()`.
 *
 * Messages submitted while the socket is still connecting are queued and
 * sent, in order, as soon as it opens, so the session can be used directly
 * after `connect()`.
 */
class MultiContextSession {
  /** Numeric `readyState` value of a WebSocket whose handshake is in progress. */
  private static readonly CONNECTING = 0;

  private ws: WebSocket | null = null;
  private config: import('./types').MultiContextConfig;
  private callbacks: import('./types').MultiContextCallbacks = {};
  /** Context IDs the server has confirmed and not yet closed or timed out. */
  private contexts: Set<string> = new Set();
  private _sessionId: string | null = null;
  /** True once the server has sent `session_started` on the current socket. */
  private isStarted = false;
  /** Serialized frames submitted while the socket was still connecting. */
  private outbox: string[] = [];

  /**
   * @param client - Client that supplies the TTS URL and credentials.
   * @param config - Optional session-wide settings; defaults to `{}`.
   */
  constructor(
    private client: KugelAudio,
    config?: import('./types').MultiContextConfig
  ) {
    this.config = config || {};
  }

  /**
   * Session ID from the most recent `session_started` message, or null if
   * none has been received. The value is not cleared when the socket closes.
   */
  get sessionId(): string | null {
    return this._sessionId;
  }

  /**
   * Connect to the multi-context WebSocket endpoint.
   *
   * Returns immediately; the socket opens asynchronously. A socket left over
   * from an earlier call is closed first so it is not leaked.
   *
   * @param callbacks - Handlers for server events and errors.
   */
  connect(callbacks: import('./types').MultiContextCallbacks): void {
    this.callbacks = callbacks;

    // Replace, rather than leak, a socket from a previous connect().
    if (this.ws) {
      this.close();
    }

    const wsUrl = this.client.ttsUrl
      .replace('https://', 'wss://')
      .replace('http://', 'ws://');

    let authParam: string;
    if (this.client.isToken) {
      authParam = 'token';
    } else if (this.client.isMasterKey) {
      authParam = 'master_key';
    } else {
      authParam = 'api_key';
    }

    let url = `${wsUrl}/ws/tts/multi?${authParam}=${this.client.apiKey}`;
    // Same org attribution the single-stream URL builder applies.
    if (this.client.orgId !== undefined) {
      url += `&org_id=${this.client.orgId}`;
    }

    const socket = createWs(url);
    this.ws = socket;

    socket.onopen = () => {
      // Ignore a socket that was replaced or closed while connecting.
      if (this.ws !== socket) return;
      // Deliver everything submitted during the handshake, in order.
      const queued = this.outbox;
      this.outbox = [];
      for (const frame of queued) {
        socket.send(frame);
      }
    };

    socket.onmessage = (event: { data: unknown }) => {
      this.handleMessage(event.data);
    };

    socket.onerror = () => {
      this.callbacks.onError?.(new KugelAudioError('WebSocket connection error'));
    };

    socket.onclose = (event) => {
      if (event.code === 4001) {
        this.callbacks.onError?.(new AuthenticationError('Authentication failed'));
      } else if (event.code === 4003) {
        this.callbacks.onError?.(new InsufficientCreditsError('Insufficient credits'));
      }
      // A late close event from a replaced socket must not wipe the state
      // of the connection that superseded it.
      if (this.ws !== null && this.ws !== socket) return;
      this.resetState();
    };
  }

  /**
   * Create a new context with optional voice settings.
   *
   * Session-wide settings from the config are attached as long as the server
   * has not yet confirmed the session. Values are forwarded whenever they are
   * set, including explicit zeros.
   *
   * @param contextId - Caller-chosen identifier for the context.
   * @param options - Optional per-context voice ID and voice settings.
   * @throws KugelAudioError if the session has no open or connecting socket.
   */
  createContext(
    contextId: string,
    options?: {
      voiceId?: number;
      voiceSettings?: import('./types').ContextVoiceSettings;
    }
  ): void {
    const msg: Record<string, unknown> = {
      text: ' ',
      context_id: contextId,
    };

    // Include session config until the server has acknowledged the session
    if (!this.isStarted) {
      const { sampleRate, cfgScale, maxNewTokens, normalize, inactivityTimeout } = this.config;
      if (sampleRate != null) msg.sample_rate = sampleRate;
      if (cfgScale != null) msg.cfg_scale = cfgScale;
      if (maxNewTokens != null) msg.max_new_tokens = maxNewTokens;
      if (normalize !== undefined) msg.normalize = normalize;
      if (inactivityTimeout != null) msg.inactivity_timeout = inactivityTimeout;
    }

    // Per-context voice, falling back to the session default
    const voiceId = options?.voiceId ?? this.config.defaultVoiceId;
    if (voiceId != null) msg.voice_id = voiceId;

    if (options?.voiceSettings) {
      const settings = options.voiceSettings;
      msg.voice_settings = {
        stability: settings.stability,
        similarity_boost: settings.similarityBoost,
        style: settings.style,
        use_speaker_boost: settings.useSpeakerBoost,
        speed: settings.speed,
      };
    }

    if (!this.post(msg)) {
      throw new KugelAudioError('WebSocket not connected');
    }
  }

  /**
   * Send text to a specific context.
   *
   * @param contextId - Target context.
   * @param text - Text to append to the context.
   * @param flush - Forwarded as the message's `flush` flag; defaults to false.
   * @throws KugelAudioError if the session has no open or connecting socket.
   */
  send(contextId: string, text: string, flush = false): void {
    // Before the session is confirmed, route the first message for an unknown
    // context through createContext() so the session config is attached.
    if (!this.contexts.has(contextId) && !this.isStarted) {
      this.createContext(contextId);
    }

    const accepted = this.post({
      text,
      context_id: contextId,
      flush,
    });
    if (!accepted) {
      throw new KugelAudioError('WebSocket not connected');
    }
  }

  /**
   * Flush a context's buffer. Does nothing when the session has no open or
   * connecting socket.
   */
  flush(contextId: string): void {
    this.post({
      flush: true,
      context_id: contextId,
    });
  }

  /**
   * Close a specific context. Does nothing when the session has no open or
   * connecting socket.
   */
  closeContext(contextId: string): void {
    this.post({
      close_context: true,
      context_id: contextId,
    });
  }

  /**
   * Send keep-alive to reset a context's inactivity timeout. Does nothing
   * when the session has no open or connecting socket.
   */
  keepAlive(contextId: string): void {
    this.post({
      text: '',
      context_id: contextId,
    });
  }

  /**
   * Close the session and all contexts.
   *
   * An open socket is sent `close_socket` first. A socket that is still
   * connecting is closed as well, so it cannot open later unnoticed.
   * Anything still queued for sending is discarded.
   */
  close(): void {
    const socket = this.ws;
    if (socket) {
      if (socket.readyState === WS_OPEN) {
        socket.send(JSON.stringify({ close_socket: true }));
      }
      // Also aborts an in-progress handshake; no-op when already closing/closed.
      socket.close();
    }
    this.resetState();
  }

  /**
   * Get active context IDs.
   */
  get activeContexts(): string[] {
    return Array.from(this.contexts);
  }

  /**
   * Check if connected.
   */
  get isConnected(): boolean {
    return this.ws !== null && this.ws.readyState === WS_OPEN;
  }

  /** Convert a message payload (string, Node.js Buffer, or other) to text. */
  private static decodeFrame(payload: unknown): string {
    if (typeof payload === 'string') return payload;
    // Buffer does not exist in browsers; guard before using it with instanceof.
    if (typeof Buffer !== 'undefined' && payload instanceof Buffer) {
      return payload.toString();
    }
    return String(payload);
  }

  /** Parse one server message and invoke the matching callbacks. */
  private handleMessage(raw: unknown): void {
    try {
      const data = JSON.parse(MultiContextSession.decodeFrame(raw));

      if (data.error) {
        this.callbacks.onError?.(
          new KugelAudioError(data.error),
          data.context_id
        );
        return;
      }

      if (data.session_started) {
        this._sessionId = data.session_id;
        this.isStarted = true;
        this.callbacks.onSessionStarted?.(data.session_id);
      }

      if (data.context_created) {
        this.contexts.add(data.context_id);
        this.callbacks.onContextCreated?.(data.context_id);
      }

      if (data.audio) {
        const chunk: import('./types').MultiContextAudioChunk = {
          audio: data.audio,
          encoding: 'pcm_s16le',
          index: data.idx || 0,
          sampleRate: data.sr || 24000,
          samples: data.samples || 0,
          contextId: data.context_id,
        };
        this.callbacks.onChunk?.(chunk);
      }

      if (data.is_final) {
        this.callbacks.onContextFinal?.(data.context_id);
      }

      if (data.context_closed) {
        this.contexts.delete(data.context_id);
        this.callbacks.onContextClosed?.(data.context_id);
      }

      if (data.context_timeout) {
        this.contexts.delete(data.context_id);
        this.callbacks.onContextTimeout?.(data.context_id);
      }

      if (data.session_closed) {
        this.callbacks.onSessionClosed?.(data);
      }
    } catch (e) {
      console.error('Failed to parse WebSocket message:', e);
    }
  }

  /**
   * Send a message now if the socket is open, or queue it if the socket is
   * still connecting.
   *
   * @returns false when there is no socket or it is closing/closed.
   */
  private post(message: Record<string, unknown>): boolean {
    const socket = this.ws;
    if (!socket) return false;

    const frame = JSON.stringify(message);
    if (socket.readyState === WS_OPEN) {
      socket.send(frame);
      return true;
    }
    if (socket.readyState === MultiContextSession.CONNECTING) {
      this.outbox.push(frame);
      return true;
    }
    return false;
  }

  /** Forget the socket and all per-connection state. */
  private resetState(): void {
    this.ws = null;
    this.isStarted = false;
    this.contexts.clear();
    this.outbox = [];
  }
}
493
859
 
494
860
  /**
@@ -507,13 +873,13 @@ class TTSResource {
507
873
  * // Generate audio with fast model (1.5B params)
508
874
  * const audio = await client.tts.generate({
509
875
  * text: 'Hello, world!',
510
- * model: 'kugel-1-turbo',
876
+ * modelId: 'kugel-1-turbo',
511
877
  * });
512
878
  *
513
879
  * // Generate audio with premium model (7B params)
514
880
  * const audio = await client.tts.generate({
515
881
  * text: 'Hello, world!',
516
- * model: 'kugel-1',
882
+ * modelId: 'kugel-1',
517
883
  * });
518
884
  * ```
519
885
  */
@@ -521,6 +887,7 @@ export class KugelAudio {
521
887
  private _apiKey: string;
522
888
  private _isMasterKey: boolean;
523
889
  private _isToken: boolean;
890
+ private _orgId: number | undefined;
524
891
  private _apiUrl: string;
525
892
  private _ttsUrl: string;
526
893
  private _timeout: number;
@@ -540,6 +907,7 @@ export class KugelAudio {
540
907
  this._apiKey = options.apiKey;
541
908
  this._isMasterKey = options.isMasterKey || false;
542
909
  this._isToken = options.isToken || false;
910
+ this._orgId = options.orgId;
543
911
  this._apiUrl = (options.apiUrl || DEFAULT_API_URL).replace(/\/$/, '');
544
912
  // If ttsUrl not specified, use apiUrl (backend proxies to TTS server)
545
913
  this._ttsUrl = (options.ttsUrl || this._apiUrl).replace(/\/$/, '');
@@ -587,6 +955,11 @@ export class KugelAudio {
587
955
  return this._isToken;
588
956
  }
589
957
 
958
+ /** Get organisation ID for billing */
959
+ get orgId(): number | undefined {
960
+ return this._orgId;
961
+ }
962
+
590
963
  /** Get TTS URL */
591
964
  get ttsUrl(): string {
592
965
  return this._ttsUrl;
package/src/index.ts CHANGED
@@ -18,13 +18,13 @@
18
18
  * // Generate audio (non-streaming)
19
19
  * const audio = await client.tts.generate({
20
20
  * text: 'Hello, world!',
21
- * model: 'kugel-1-turbo',
21
+ * modelId: 'kugel-1-turbo',
22
22
  * voiceId: 123,
23
23
  * });
24
24
  *
25
25
  * // Generate audio (streaming)
26
26
  * await client.tts.stream(
27
- * { text: 'Hello, world!', model: 'kugel-1-turbo' },
27
+ * { text: 'Hello, world!', modelId: 'kugel-1-turbo' },
28
28
  * {
29
29
  * onChunk: (chunk) => {
30
30
  * // Process audio chunk
@@ -46,16 +46,21 @@ export { KugelAudio } from './client';
46
46
  export type {
47
47
  AudioChunk,
48
48
  AudioResponse,
49
+ ContextVoiceSettings,
49
50
  GenerateOptions,
50
51
  GenerationStats,
51
52
  KugelAudioOptions,
52
53
  Model,
54
+ MultiContextAudioChunk,
55
+ MultiContextCallbacks,
56
+ MultiContextConfig,
53
57
  StreamCallbacks,
54
58
  StreamConfig,
55
59
  Voice,
56
60
  VoiceAge,
57
61
  VoiceCategory,
58
- VoiceSex
62
+ VoiceSex,
63
+ WordTimestamp
59
64
  } from './types';
60
65
 
61
66
  // Errors