kugelaudio 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -135,8 +135,35 @@ function createWavBlob(audio, sampleRate) {
135
135
  return new Blob([wavBuffer], { type: "audio/wav" });
136
136
  }
137
137
 
138
// src/websocket.ts
// WebSocket constructor resolved by the first successful getWebSocket() call;
// later calls return it without probing the environment again.
var _cachedWs = null;
/**
 * Resolve a WebSocket constructor for the current runtime.
 *
 * Resolution order:
 *   1. the constructor cached by a previous call,
 *   2. `globalThis.WebSocket` when it is defined,
 *   3. the "ws" package, loaded through `require` when a `require` function
 *      can be found.
 *
 * @returns The WebSocket constructor (for "ws": `ws.default` when present,
 *   otherwise the module export itself).
 * @throws {Error} When none of the above yields a constructor. If loading
 *   "ws" itself failed, that failure is attached as `cause` so the real
 *   reason is not lost behind the generic installation hint.
 */
function getWebSocket() {
  if (_cachedWs) return _cachedWs;
  if (typeof globalThis !== "undefined" && typeof globalThis.WebSocket !== "undefined") {
    _cachedWs = globalThis.WebSocket;
    return _cachedWs;
  }
  let loadError;
  try {
    // `require` is looked up indirectly (constant source string, no external
    // input) when it is not lexically in scope.
    // NOTE(review): presumably done so bundlers do not try to resolve "ws"
    // statically - confirm before replacing the Function() call.
    const _require = typeof require !== "undefined" ? require : Function('return typeof require !== "undefined" ? require : undefined')();
    if (_require) {
      const ws = _require("ws");
      _cachedWs = ws.default || ws;
      return _cachedWs;
    }
  } catch (err) {
    // Still best-effort: fall through to the generic error below, but keep
    // the underlying failure for diagnosis instead of discarding it.
    loadError = err;
  }
  throw new Error(
    'WebSocket not available. In Node.js, install the "ws" package: npm install ws',
    loadError === undefined ? undefined : { cause: loadError }
  );
}
159
+
138
160
  // src/client.ts
139
161
  var DEFAULT_API_URL = "https://api.kugelaudio.com";
162
/**
 * Open a WebSocket to `url` with whichever constructor getWebSocket()
 * resolves for the current runtime.
 */
function createWs(url) {
  const WebSocketImpl = getWebSocket();
  const socket = new WebSocketImpl(url);
  return socket;
}
// readyState value compared against wherever this file checks that a socket
// is open; a plain number, so no global WebSocket object has to exist.
var WS_OPEN = 1;
140
167
  var ModelsResource = class {
141
168
  constructor(client) {
142
169
  this.client = client;
@@ -212,6 +239,7 @@ var VoicesResource = class {
212
239
  var TTSResource = class {
213
240
  constructor(client) {
214
241
  this.client = client;
242
+ // Using any for WebSocket to support both browser WebSocket and ws package
215
243
  this.wsConnection = null;
216
244
  this.wsUrl = null;
217
245
  this.pendingRequests = /* @__PURE__ */ new Map();
@@ -241,7 +269,7 @@ var TTSResource = class {
241
269
  * Check if WebSocket connection is established and open.
242
270
  */
243
271
  isConnected() {
244
- return this.wsConnection !== null && this.wsConnection.readyState === WebSocket.OPEN;
272
+ return this.wsConnection !== null && this.wsConnection.readyState === WS_OPEN;
245
273
  }
246
274
  /**
247
275
  * Generate audio from text with streaming via WebSocket.
@@ -250,10 +278,14 @@ var TTSResource = class {
250
278
  async generate(options) {
251
279
  const chunks = [];
252
280
  let finalStats;
281
+ const allTimestamps = [];
253
282
  await this.stream(options, {
254
283
  onChunk: (chunk) => {
255
284
  chunks.push(base64ToArrayBuffer(chunk.audio));
256
285
  },
286
+ onWordTimestamps: (timestamps) => {
287
+ allTimestamps.push(...timestamps);
288
+ },
257
289
  onFinal: (stats) => {
258
290
  finalStats = stats;
259
291
  }
@@ -271,7 +303,8 @@ var TTSResource = class {
271
303
  samples: finalStats ? finalStats.totalSamples : totalLength / 2,
272
304
  durationMs: finalStats ? finalStats.durationMs : 0,
273
305
  generationMs: finalStats ? finalStats.generationMs : 0,
274
- rtf: finalStats ? finalStats.rtf : 0
306
+ rtf: finalStats ? finalStats.rtf : 0,
307
+ wordTimestamps: allTimestamps
275
308
  };
276
309
  }
277
310
  /**
@@ -287,7 +320,11 @@ var TTSResource = class {
287
320
  } else {
288
321
  authParam = "api_key";
289
322
  }
290
- return `${wsUrl}/ws/tts?${authParam}=${this.client.apiKey}`;
323
+ let url = `${wsUrl}/ws/tts?${authParam}=${this.client.apiKey}`;
324
+ if (this.client.orgId !== void 0) {
325
+ url += `&org_id=${this.client.orgId}`;
326
+ }
327
+ return url;
291
328
  }
292
329
  /**
293
330
  * Get or create a WebSocket connection for connection pooling.
@@ -295,7 +332,7 @@ var TTSResource = class {
295
332
  */
296
333
  async getConnection() {
297
334
  const url = this.buildWsUrl();
298
- if (this.wsConnection && this.wsUrl === url && this.wsConnection.readyState === WebSocket.OPEN) {
335
+ if (this.wsConnection && this.wsUrl === url && this.wsConnection.readyState === WS_OPEN) {
299
336
  return this.wsConnection;
300
337
  }
301
338
  if (this.wsConnection) {
@@ -306,7 +343,7 @@ var TTSResource = class {
306
343
  this.wsConnection = null;
307
344
  }
308
345
  return new Promise((resolve, reject) => {
309
- const ws = new WebSocket(url);
346
+ const ws = createWs(url);
310
347
  ws.onopen = () => {
311
348
  this.wsConnection = ws;
312
349
  this.wsUrl = url;
@@ -324,7 +361,8 @@ var TTSResource = class {
324
361
  setupMessageHandler(ws) {
325
362
  ws.onmessage = (event) => {
326
363
  try {
327
- const data = JSON.parse(event.data);
364
+ const messageData = typeof event.data === "string" ? event.data : event.data instanceof Buffer ? event.data.toString() : String(event.data);
365
+ const data = JSON.parse(messageData);
328
366
  const [requestId, pending] = [...this.pendingRequests.entries()][0] || [];
329
367
  if (!pending) return;
330
368
  if (data.error) {
@@ -360,6 +398,19 @@ var TTSResource = class {
360
398
  };
361
399
  pending.callbacks.onChunk?.(chunk);
362
400
  }
401
+ if (data.word_timestamps) {
402
+ const timestamps = data.word_timestamps.map(
403
+ (w) => ({
404
+ word: w.word,
405
+ startMs: w.start_ms,
406
+ endMs: w.end_ms,
407
+ charStart: w.char_start,
408
+ charEnd: w.char_end,
409
+ score: w.score ?? 1
410
+ })
411
+ );
412
+ pending.callbacks.onWordTimestamps?.(timestamps);
413
+ }
363
414
  } catch (e) {
364
415
  console.error("Failed to parse WebSocket message:", e);
365
416
  }
@@ -411,14 +462,14 @@ var TTSResource = class {
411
462
  callbacks.onOpen?.();
412
463
  ws.send(JSON.stringify({
413
464
  text: options.text,
414
- model: options.model || "kugel-1-turbo",
465
+ model_id: options.modelId || "kugel-1-turbo",
415
466
  voice_id: options.voiceId,
416
467
  cfg_scale: options.cfgScale ?? 2,
417
468
  max_new_tokens: options.maxNewTokens ?? 2048,
418
469
  sample_rate: options.sampleRate ?? 24e3,
419
- speaker_prefix: options.speakerPrefix ?? true,
420
- normalize: options.normalize ?? false,
421
- ...options.language && { language: options.language }
470
+ normalize: options.normalize ?? true,
471
+ ...options.language && { language: options.language },
472
+ ...options.wordTimestamps && { word_timestamps: true }
422
473
  }));
423
474
  });
424
475
  }
@@ -428,24 +479,25 @@ var TTSResource = class {
428
479
  streamWithoutPooling(options, callbacks) {
429
480
  return new Promise((resolve, reject) => {
430
481
  const url = this.buildWsUrl();
431
- const ws = new WebSocket(url);
482
+ const ws = createWs(url);
432
483
  ws.onopen = () => {
433
484
  callbacks.onOpen?.();
434
485
  ws.send(JSON.stringify({
435
486
  text: options.text,
436
- model: options.model || "kugel-1-turbo",
487
+ model_id: options.modelId || "kugel-1-turbo",
437
488
  voice_id: options.voiceId,
438
489
  cfg_scale: options.cfgScale ?? 2,
439
490
  max_new_tokens: options.maxNewTokens ?? 2048,
440
491
  sample_rate: options.sampleRate ?? 24e3,
441
- speaker_prefix: options.speakerPrefix ?? true,
442
- normalize: options.normalize ?? false,
443
- ...options.language && { language: options.language }
492
+ normalize: options.normalize ?? true,
493
+ ...options.language && { language: options.language },
494
+ ...options.wordTimestamps && { word_timestamps: true }
444
495
  }));
445
496
  };
446
497
  ws.onmessage = (event) => {
447
498
  try {
448
- const data = JSON.parse(event.data);
499
+ const messageData = typeof event.data === "string" ? event.data : event.data instanceof Buffer ? event.data.toString() : String(event.data);
500
+ const data = JSON.parse(messageData);
449
501
  if (data.error) {
450
502
  const error = this.parseError(data.error);
451
503
  callbacks.onError?.(error);
@@ -479,6 +531,19 @@ var TTSResource = class {
479
531
  };
480
532
  callbacks.onChunk?.(chunk);
481
533
  }
534
+ if (data.word_timestamps) {
535
+ const timestamps = data.word_timestamps.map(
536
+ (w) => ({
537
+ word: w.word,
538
+ startMs: w.start_ms,
539
+ endMs: w.end_ms,
540
+ charStart: w.char_start,
541
+ charEnd: w.char_end,
542
+ score: w.score ?? 1
543
+ })
544
+ );
545
+ callbacks.onWordTimestamps?.(timestamps);
546
+ }
482
547
  } catch (e) {
483
548
  console.error("Failed to parse WebSocket message:", e);
484
549
  }
@@ -521,6 +586,243 @@ var TTSResource = class {
521
586
  }
522
587
  return new KugelAudioError(message);
523
588
  }
589
+ /**
590
+ * Create a multi-context session for concurrent TTS streams.
591
+ *
592
+ * Allows managing up to 5 independent audio generation contexts
593
+ * over a single WebSocket connection. Each context has its own
594
+ * text buffer, voice settings, and generation queue.
595
+ *
596
+ * @example
597
+ * ```typescript
598
+ * const session = client.tts.createMultiContextSession({
599
+ * defaultVoiceId: 123,
600
+ * });
601
+ *
602
+ * session.connect({
603
+ * onChunk: (chunk) => {
604
+ * console.log(`Audio from ${chunk.contextId}`);
605
+ * playAudio(chunk.audio);
606
+ * },
607
+ * onContextFinal: (contextId) => {
608
+ * console.log(`${contextId} finished`);
609
+ * },
610
+ * });
611
+ *
612
+ * // Create contexts with different voices
613
+ * session.createContext('narrator', { voiceId: 123 });
614
+ * session.createContext('character', { voiceId: 456 });
615
+ *
616
+ * // Send text to different speakers
617
+ * session.send('narrator', 'The story begins.', true);
618
+ * session.send('character', 'Hello!', true);
619
+ *
620
+ * // Close when done
621
+ * session.close();
622
+ * ```
623
+ */
624
+ createMultiContextSession(config) {
625
+ return new MultiContextSession(this.client, config);
626
+ }
627
+ };
628
/**
 * Session that multiplexes several TTS contexts over one WebSocket
 * (`/ws/tts/multi`). Each context is addressed by a caller-chosen context ID.
 *
 * Lifecycle: `connect(callbacks)` opens the socket; once it is open
 * (`isConnected`, or the optional `onOpen` callback) contexts can be created
 * and fed with text; `close()` ends the session.
 */
var MultiContextSession = class {
  /**
   * @param client Client object; `connect()` reads its `ttsUrl`, `apiKey`,
   *   `isToken`, `isMasterKey` and `orgId`.
   * @param config Optional session defaults: `sampleRate`, `cfgScale`,
   *   `maxNewTokens`, `normalize`, `inactivityTimeout`, `defaultVoiceId`.
   */
  constructor(client, config) {
    this.client = client;
    this.ws = null;
    this.callbacks = {};
    // Context IDs the server has confirmed via `context_created`.
    this.contexts = /* @__PURE__ */ new Set();
    this._sessionId = null;
    // True once the server has sent `session_started`.
    this.isStarted = false;
    this.config = config || {};
  }
  /**
   * Get the current session ID, or null if not connected.
   */
  get sessionId() {
    return this._sessionId;
  }
  /**
   * Connect to the multi-context WebSocket endpoint.
   *
   * Returns immediately; the socket opens asynchronously. Any connection
   * still held by this session is closed first.
   *
   * @param callbacks Optional handlers: `onOpen`, `onSessionStarted`,
   *   `onContextCreated`, `onChunk`, `onContextFinal`, `onContextClosed`,
   *   `onContextTimeout`, `onSessionClosed`, `onError`.
   */
  connect(callbacks) {
    // Do not leak a previous socket when connect() is called again.
    if (this.ws) this.close();
    this.callbacks = callbacks || {};
    const wsUrl = this.client.ttsUrl.replace("https://", "wss://").replace("http://", "ws://");
    let authParam;
    if (this.client.isToken) {
      authParam = "token";
    } else if (this.client.isMasterKey) {
      authParam = "master_key";
    } else {
      authParam = "api_key";
    }
    // Encode the credential: characters such as "+", "&" or "=" would
    // otherwise corrupt the query string.
    let url = `${wsUrl}/ws/tts/multi?${authParam}=${encodeURIComponent(this.client.apiKey)}`;
    // Mirrors the single-stream URL builder, which forwards the client's
    // organisation ID as `org_id`.
    // NOTE(review): assumes the multi endpoint accepts `org_id` - confirm.
    if (this.client.orgId != null) {
      url += `&org_id=${encodeURIComponent(this.client.orgId)}`;
    }
    const ws = createWs(url);
    this.ws = ws;
    ws.onopen = () => {
      this.callbacks.onOpen?.();
    };
    ws.onmessage = (event) => {
      // Callbacks run inside the try as well, so an exception thrown by user
      // code is logged here rather than escaping into the socket's dispatcher.
      try {
        this._handleMessage(JSON.parse(this._decodeMessage(event.data)));
      } catch (e) {
        console.error("Failed to parse WebSocket message:", e);
      }
    };
    ws.onerror = () => {
      this.callbacks.onError?.(new KugelAudioError("WebSocket connection error"));
    };
    ws.onclose = (event) => {
      if (event.code === 4001) {
        this.callbacks.onError?.(new AuthenticationError("Authentication failed"));
      } else if (event.code === 4003) {
        this.callbacks.onError?.(new InsufficientCreditsError("Insufficient credits"));
      }
      // A late close event from an earlier socket must not wipe the state of
      // a connection opened afterwards.
      if (this.ws === ws) this._resetState();
    };
  }
  /**
   * Convert a WebSocket message payload to text. Strings pass through;
   * Buffers, ArrayBuffers and typed arrays are decoded as UTF-8; anything
   * else is stringified.
   */
  _decodeMessage(raw) {
    if (typeof raw === "string") return raw;
    // `Buffer` is not defined in every runtime; probe before using it.
    if (typeof Buffer !== "undefined" && raw instanceof Buffer) return raw.toString();
    if (raw instanceof ArrayBuffer || ArrayBuffer.isView(raw)) {
      return new TextDecoder().decode(raw);
    }
    return String(raw);
  }
  /**
   * Apply one parsed server message: update session state and invoke the
   * matching callbacks. An `error` field short-circuits everything else.
   */
  _handleMessage(data) {
    if (data.error) {
      this.callbacks.onError?.(
        new KugelAudioError(data.error),
        data.context_id
      );
      return;
    }
    if (data.session_started) {
      this._sessionId = data.session_id;
      this.isStarted = true;
      this.callbacks.onSessionStarted?.(data.session_id);
    }
    if (data.context_created) {
      this.contexts.add(data.context_id);
      this.callbacks.onContextCreated?.(data.context_id);
    }
    if (data.audio) {
      const chunk = {
        audio: data.audio,
        encoding: "pcm_s16le",
        index: data.idx || 0,
        sampleRate: data.sr || 24e3,
        samples: data.samples || 0,
        contextId: data.context_id
      };
      this.callbacks.onChunk?.(chunk);
    }
    if (data.is_final) {
      this.callbacks.onContextFinal?.(data.context_id);
    }
    if (data.context_closed) {
      this.contexts.delete(data.context_id);
      this.callbacks.onContextClosed?.(data.context_id);
    }
    if (data.context_timeout) {
      this.contexts.delete(data.context_id);
      this.callbacks.onContextTimeout?.(data.context_id);
    }
    if (data.session_closed) {
      this.callbacks.onSessionClosed?.(data);
    }
  }
  /**
   * Forget the current connection and everything learned from it.
   */
  _resetState() {
    this.ws = null;
    this.isStarted = false;
    this._sessionId = null;
    this.contexts.clear();
  }
  /**
   * Create a new context with optional voice settings.
   *
   * Until the server has confirmed the session (`session_started`), the
   * session-level settings from the constructor config are attached too.
   *
   * @throws {KugelAudioError} When the socket is not open.
   */
  createContext(contextId, options) {
    if (!this.ws || this.ws.readyState !== WS_OPEN) {
      throw new KugelAudioError("WebSocket not connected");
    }
    const msg = {
      text: " ",
      context_id: contextId
    };
    if (!this.isStarted) {
      if (this.config.sampleRate) msg.sample_rate = this.config.sampleRate;
      // 0 is a value a caller can set deliberately, so test for presence
      // rather than truthiness (the single-stream path uses `??` likewise).
      if (this.config.cfgScale != null) msg.cfg_scale = this.config.cfgScale;
      if (this.config.maxNewTokens) msg.max_new_tokens = this.config.maxNewTokens;
      if (this.config.normalize !== void 0) msg.normalize = this.config.normalize;
      if (this.config.inactivityTimeout) msg.inactivity_timeout = this.config.inactivityTimeout;
    }
    const voiceId = options?.voiceId || this.config.defaultVoiceId;
    if (voiceId) msg.voice_id = voiceId;
    if (options?.voiceSettings) {
      msg.voice_settings = {
        stability: options.voiceSettings.stability,
        similarity_boost: options.voiceSettings.similarityBoost,
        style: options.voiceSettings.style,
        use_speaker_boost: options.voiceSettings.useSpeakerBoost,
        speed: options.voiceSettings.speed
      };
    }
    this.ws.send(JSON.stringify(msg));
  }
  /**
   * Send text to a specific context.
   *
   * If the session has not been confirmed yet and the context is unknown, a
   * `createContext(contextId)` message is sent first.
   *
   * @param flush Forwarded to the server as the `flush` field.
   * @throws {KugelAudioError} When the socket is not open.
   */
  send(contextId, text, flush = false) {
    if (!this.ws || this.ws.readyState !== WS_OPEN) {
      throw new KugelAudioError("WebSocket not connected");
    }
    if (!this.contexts.has(contextId) && !this.isStarted) {
      this.createContext(contextId);
    }
    this.ws.send(JSON.stringify({
      text,
      context_id: contextId,
      flush
    }));
  }
  /**
   * Flush a context's buffer. No-op when the socket is not open.
   */
  flush(contextId) {
    if (!this.ws || this.ws.readyState !== WS_OPEN) return;
    this.ws.send(JSON.stringify({
      flush: true,
      context_id: contextId
    }));
  }
  /**
   * Close a specific context. No-op when the socket is not open.
   */
  closeContext(contextId) {
    if (!this.ws || this.ws.readyState !== WS_OPEN) return;
    this.ws.send(JSON.stringify({
      close_context: true,
      context_id: contextId
    }));
  }
  /**
   * Send keep-alive to reset a context's inactivity timeout.
   * No-op when the socket is not open.
   */
  keepAlive(contextId) {
    if (!this.ws || this.ws.readyState !== WS_OPEN) return;
    this.ws.send(JSON.stringify({
      text: "",
      context_id: contextId
    }));
  }
  /**
   * Close the session and all contexts.
   *
   * An open socket is sent `close_socket` and then closed. A socket that is
   * still connecting is closed as well, so the attempt is not left running.
   */
  close() {
    const ws = this.ws;
    this._resetState();
    if (!ws) return;
    if (ws.readyState === WS_OPEN) {
      ws.send(JSON.stringify({ close_socket: true }));
      ws.close();
      return;
    }
    const WS_CONNECTING = 0;
    if (ws.readyState === WS_CONNECTING) {
      // The caller abandoned this attempt: silence its handlers so the abort
      // is not reported as a connection error. Handlers are replaced rather
      // than removed because some implementations report the abort through
      // an error event, which needs a listener.
      const ignore = () => {
      };
      ws.onopen = ignore;
      ws.onmessage = ignore;
      ws.onerror = ignore;
      ws.onclose = ignore;
      ws.close();
    }
  }
  /**
   * Get active context IDs.
   */
  get activeContexts() {
    return Array.from(this.contexts);
  }
  /**
   * Check if connected.
   */
  get isConnected() {
    return this.ws !== null && this.ws.readyState === WS_OPEN;
  }
};
525
827
  var KugelAudio = class _KugelAudio {
526
828
  constructor(options) {
@@ -530,6 +832,7 @@ var KugelAudio = class _KugelAudio {
530
832
  this._apiKey = options.apiKey;
531
833
  this._isMasterKey = options.isMasterKey || false;
532
834
  this._isToken = options.isToken || false;
835
+ this._orgId = options.orgId;
533
836
  this._apiUrl = (options.apiUrl || DEFAULT_API_URL).replace(/\/$/, "");
534
837
  this._ttsUrl = (options.ttsUrl || this._apiUrl).replace(/\/$/, "");
535
838
  this._timeout = options.timeout || 6e4;
@@ -570,6 +873,10 @@ var KugelAudio = class _KugelAudio {
570
873
  get isToken() {
571
874
  return this._isToken;
572
875
  }
876
+ /** Get organisation ID for billing: the `orgId` option given to the constructor, or undefined when none was supplied. */
877
+ get orgId() {
878
+ return this._orgId;
879
+ }
573
880
  /** Get TTS URL */
574
881
  get ttsUrl() {
575
882
  return this._ttsUrl;