kugelaudio 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,3 +1,10 @@
1
+ var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
2
+ get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
3
+ }) : x)(function(x) {
4
+ if (typeof require !== "undefined") return require.apply(this, arguments);
5
+ throw Error('Dynamic require of "' + x + '" is not supported');
6
+ });
7
+
1
8
  // src/errors.ts
2
9
  var KugelAudioError = class _KugelAudioError extends Error {
3
10
  constructor(message, statusCode) {
@@ -99,8 +106,35 @@ function createWavBlob(audio, sampleRate) {
99
106
  return new Blob([wavBuffer], { type: "audio/wav" });
100
107
  }
101
108
 
109
+ // src/websocket.ts
110
+ var _cachedWs = null;
111
+ function getWebSocket() {
112
+ if (_cachedWs) return _cachedWs;
113
+ if (typeof globalThis !== "undefined" && typeof globalThis.WebSocket !== "undefined") {
114
+ _cachedWs = globalThis.WebSocket;
115
+ return _cachedWs;
116
+ }
117
+ try {
118
+ const _require = typeof __require !== "undefined" ? __require : Function('return typeof require !== "undefined" ? require : undefined')();
119
+ if (_require) {
120
+ const ws = _require("ws");
121
+ _cachedWs = ws.default || ws;
122
+ return _cachedWs;
123
+ }
124
+ } catch {
125
+ }
126
+ throw new Error(
127
+ 'WebSocket not available. In Node.js, install the "ws" package: npm install ws'
128
+ );
129
+ }
130
+
102
131
  // src/client.ts
103
132
  var DEFAULT_API_URL = "https://api.kugelaudio.com";
133
+ function createWs(url) {
134
+ const WS = getWebSocket();
135
+ return new WS(url);
136
+ }
137
+ var WS_OPEN = 1;
104
138
  var ModelsResource = class {
105
139
  constructor(client) {
106
140
  this.client = client;
@@ -176,6 +210,7 @@ var VoicesResource = class {
176
210
  var TTSResource = class {
177
211
  constructor(client) {
178
212
  this.client = client;
213
+ // Using any for WebSocket to support both browser WebSocket and ws package
179
214
  this.wsConnection = null;
180
215
  this.wsUrl = null;
181
216
  this.pendingRequests = /* @__PURE__ */ new Map();
@@ -205,7 +240,7 @@ var TTSResource = class {
205
240
  * Check if WebSocket connection is established and open.
206
241
  */
207
242
  isConnected() {
208
- return this.wsConnection !== null && this.wsConnection.readyState === WebSocket.OPEN;
243
+ return this.wsConnection !== null && this.wsConnection.readyState === WS_OPEN;
209
244
  }
210
245
  /**
211
246
  * Generate audio from text with streaming via WebSocket.
@@ -214,10 +249,14 @@ var TTSResource = class {
214
249
  async generate(options) {
215
250
  const chunks = [];
216
251
  let finalStats;
252
+ const allTimestamps = [];
217
253
  await this.stream(options, {
218
254
  onChunk: (chunk) => {
219
255
  chunks.push(base64ToArrayBuffer(chunk.audio));
220
256
  },
257
+ onWordTimestamps: (timestamps) => {
258
+ allTimestamps.push(...timestamps);
259
+ },
221
260
  onFinal: (stats) => {
222
261
  finalStats = stats;
223
262
  }
@@ -235,7 +274,8 @@ var TTSResource = class {
235
274
  samples: finalStats ? finalStats.totalSamples : totalLength / 2,
236
275
  durationMs: finalStats ? finalStats.durationMs : 0,
237
276
  generationMs: finalStats ? finalStats.generationMs : 0,
238
- rtf: finalStats ? finalStats.rtf : 0
277
+ rtf: finalStats ? finalStats.rtf : 0,
278
+ wordTimestamps: allTimestamps
239
279
  };
240
280
  }
241
281
  /**
@@ -251,7 +291,11 @@ var TTSResource = class {
251
291
  } else {
252
292
  authParam = "api_key";
253
293
  }
254
- return `${wsUrl}/ws/tts?${authParam}=${this.client.apiKey}`;
294
+ let url = `${wsUrl}/ws/tts?${authParam}=${this.client.apiKey}`;
295
+ if (this.client.orgId !== void 0) {
296
+ url += `&org_id=${this.client.orgId}`;
297
+ }
298
+ return url;
255
299
  }
256
300
  /**
257
301
  * Get or create a WebSocket connection for connection pooling.
@@ -259,7 +303,7 @@ var TTSResource = class {
259
303
  */
260
304
  async getConnection() {
261
305
  const url = this.buildWsUrl();
262
- if (this.wsConnection && this.wsUrl === url && this.wsConnection.readyState === WebSocket.OPEN) {
306
+ if (this.wsConnection && this.wsUrl === url && this.wsConnection.readyState === WS_OPEN) {
263
307
  return this.wsConnection;
264
308
  }
265
309
  if (this.wsConnection) {
@@ -270,7 +314,7 @@ var TTSResource = class {
270
314
  this.wsConnection = null;
271
315
  }
272
316
  return new Promise((resolve, reject) => {
273
- const ws = new WebSocket(url);
317
+ const ws = createWs(url);
274
318
  ws.onopen = () => {
275
319
  this.wsConnection = ws;
276
320
  this.wsUrl = url;
@@ -288,7 +332,8 @@ var TTSResource = class {
288
332
  setupMessageHandler(ws) {
289
333
  ws.onmessage = (event) => {
290
334
  try {
291
- const data = JSON.parse(event.data);
335
+ const messageData = typeof event.data === "string" ? event.data : event.data instanceof Buffer ? event.data.toString() : String(event.data);
336
+ const data = JSON.parse(messageData);
292
337
  const [requestId, pending] = [...this.pendingRequests.entries()][0] || [];
293
338
  if (!pending) return;
294
339
  if (data.error) {
@@ -324,6 +369,19 @@ var TTSResource = class {
324
369
  };
325
370
  pending.callbacks.onChunk?.(chunk);
326
371
  }
372
+ if (data.word_timestamps) {
373
+ const timestamps = data.word_timestamps.map(
374
+ (w) => ({
375
+ word: w.word,
376
+ startMs: w.start_ms,
377
+ endMs: w.end_ms,
378
+ charStart: w.char_start,
379
+ charEnd: w.char_end,
380
+ score: w.score ?? 1
381
+ })
382
+ );
383
+ pending.callbacks.onWordTimestamps?.(timestamps);
384
+ }
327
385
  } catch (e) {
328
386
  console.error("Failed to parse WebSocket message:", e);
329
387
  }
@@ -375,14 +433,14 @@ var TTSResource = class {
375
433
  callbacks.onOpen?.();
376
434
  ws.send(JSON.stringify({
377
435
  text: options.text,
378
- model: options.model || "kugel-1-turbo",
436
+ model_id: options.modelId || "kugel-1-turbo",
379
437
  voice_id: options.voiceId,
380
438
  cfg_scale: options.cfgScale ?? 2,
381
439
  max_new_tokens: options.maxNewTokens ?? 2048,
382
440
  sample_rate: options.sampleRate ?? 24e3,
383
- speaker_prefix: options.speakerPrefix ?? true,
384
- normalize: options.normalize ?? false,
385
- ...options.language && { language: options.language }
441
+ normalize: options.normalize ?? true,
442
+ ...options.language && { language: options.language },
443
+ ...options.wordTimestamps && { word_timestamps: true }
386
444
  }));
387
445
  });
388
446
  }
@@ -392,24 +450,25 @@ var TTSResource = class {
392
450
  streamWithoutPooling(options, callbacks) {
393
451
  return new Promise((resolve, reject) => {
394
452
  const url = this.buildWsUrl();
395
- const ws = new WebSocket(url);
453
+ const ws = createWs(url);
396
454
  ws.onopen = () => {
397
455
  callbacks.onOpen?.();
398
456
  ws.send(JSON.stringify({
399
457
  text: options.text,
400
- model: options.model || "kugel-1-turbo",
458
+ model_id: options.modelId || "kugel-1-turbo",
401
459
  voice_id: options.voiceId,
402
460
  cfg_scale: options.cfgScale ?? 2,
403
461
  max_new_tokens: options.maxNewTokens ?? 2048,
404
462
  sample_rate: options.sampleRate ?? 24e3,
405
- speaker_prefix: options.speakerPrefix ?? true,
406
- normalize: options.normalize ?? false,
407
- ...options.language && { language: options.language }
463
+ normalize: options.normalize ?? true,
464
+ ...options.language && { language: options.language },
465
+ ...options.wordTimestamps && { word_timestamps: true }
408
466
  }));
409
467
  };
410
468
  ws.onmessage = (event) => {
411
469
  try {
412
- const data = JSON.parse(event.data);
470
+ const messageData = typeof event.data === "string" ? event.data : event.data instanceof Buffer ? event.data.toString() : String(event.data);
471
+ const data = JSON.parse(messageData);
413
472
  if (data.error) {
414
473
  const error = this.parseError(data.error);
415
474
  callbacks.onError?.(error);
@@ -443,6 +502,19 @@ var TTSResource = class {
443
502
  };
444
503
  callbacks.onChunk?.(chunk);
445
504
  }
505
+ if (data.word_timestamps) {
506
+ const timestamps = data.word_timestamps.map(
507
+ (w) => ({
508
+ word: w.word,
509
+ startMs: w.start_ms,
510
+ endMs: w.end_ms,
511
+ charStart: w.char_start,
512
+ charEnd: w.char_end,
513
+ score: w.score ?? 1
514
+ })
515
+ );
516
+ callbacks.onWordTimestamps?.(timestamps);
517
+ }
446
518
  } catch (e) {
447
519
  console.error("Failed to parse WebSocket message:", e);
448
520
  }
@@ -485,6 +557,243 @@ var TTSResource = class {
485
557
  }
486
558
  return new KugelAudioError(message);
487
559
  }
560
+ /**
561
+ * Create a multi-context session for concurrent TTS streams.
562
+ *
563
+ * Allows managing up to 5 independent audio generation contexts
564
+ * over a single WebSocket connection. Each context has its own
565
+ * text buffer, voice settings, and generation queue.
566
+ *
567
+ * @example
568
+ * ```typescript
569
+ * const session = client.tts.createMultiContextSession({
570
+ * defaultVoiceId: 123,
571
+ * });
572
+ *
573
+ * session.connect({
574
+ * onChunk: (chunk) => {
575
+ * console.log(`Audio from ${chunk.contextId}`);
576
+ * playAudio(chunk.audio);
577
+ * },
578
+ * onContextFinal: (contextId) => {
579
+ * console.log(`${contextId} finished`);
580
+ * },
581
+ * });
582
+ *
583
+ * // Create contexts with different voices
584
+ * session.createContext('narrator', { voiceId: 123 });
585
+ * session.createContext('character', { voiceId: 456 });
586
+ *
587
+ * // Send text to different speakers
588
+ * session.send('narrator', 'The story begins.', true);
589
+ * session.send('character', 'Hello!', true);
590
+ *
591
+ * // Close when done
592
+ * session.close();
593
+ * ```
594
+ */
595
+ createMultiContextSession(config) {
596
+ return new MultiContextSession(this.client, config);
597
+ }
598
+ };
599
+ var MultiContextSession = class {
600
+ constructor(client, config) {
601
+ this.client = client;
602
+ this.ws = null;
603
+ this.callbacks = {};
604
+ this.contexts = /* @__PURE__ */ new Set();
605
+ this._sessionId = null;
606
+ this.isStarted = false;
607
+ this.config = config || {};
608
+ }
609
+ /**
610
+ * Get the current session ID, or null if not connected.
611
+ */
612
+ get sessionId() {
613
+ return this._sessionId;
614
+ }
615
+ /**
616
+ * Connect to the multi-context WebSocket endpoint.
617
+ */
618
+ connect(callbacks) {
619
+ this.callbacks = callbacks;
620
+ const wsUrl = this.client.ttsUrl.replace("https://", "wss://").replace("http://", "ws://");
621
+ let authParam;
622
+ if (this.client.isToken) {
623
+ authParam = "token";
624
+ } else if (this.client.isMasterKey) {
625
+ authParam = "master_key";
626
+ } else {
627
+ authParam = "api_key";
628
+ }
629
+ const url = `${wsUrl}/ws/tts/multi?${authParam}=${this.client.apiKey}`;
630
+ this.ws = createWs(url);
631
+ this.ws.onopen = () => {
632
+ };
633
+ this.ws.onmessage = (event) => {
634
+ try {
635
+ const messageData = typeof event.data === "string" ? event.data : event.data instanceof Buffer ? event.data.toString() : String(event.data);
636
+ const data = JSON.parse(messageData);
637
+ if (data.error) {
638
+ this.callbacks.onError?.(
639
+ new KugelAudioError(data.error),
640
+ data.context_id
641
+ );
642
+ return;
643
+ }
644
+ if (data.session_started) {
645
+ this._sessionId = data.session_id;
646
+ this.isStarted = true;
647
+ this.callbacks.onSessionStarted?.(data.session_id);
648
+ }
649
+ if (data.context_created) {
650
+ this.contexts.add(data.context_id);
651
+ this.callbacks.onContextCreated?.(data.context_id);
652
+ }
653
+ if (data.audio) {
654
+ const chunk = {
655
+ audio: data.audio,
656
+ encoding: "pcm_s16le",
657
+ index: data.idx || 0,
658
+ sampleRate: data.sr || 24e3,
659
+ samples: data.samples || 0,
660
+ contextId: data.context_id
661
+ };
662
+ this.callbacks.onChunk?.(chunk);
663
+ }
664
+ if (data.is_final) {
665
+ this.callbacks.onContextFinal?.(data.context_id);
666
+ }
667
+ if (data.context_closed) {
668
+ this.contexts.delete(data.context_id);
669
+ this.callbacks.onContextClosed?.(data.context_id);
670
+ }
671
+ if (data.context_timeout) {
672
+ this.contexts.delete(data.context_id);
673
+ this.callbacks.onContextTimeout?.(data.context_id);
674
+ }
675
+ if (data.session_closed) {
676
+ this.callbacks.onSessionClosed?.(data);
677
+ }
678
+ } catch (e) {
679
+ console.error("Failed to parse WebSocket message:", e);
680
+ }
681
+ };
682
+ this.ws.onerror = () => {
683
+ this.callbacks.onError?.(new KugelAudioError("WebSocket connection error"));
684
+ };
685
+ this.ws.onclose = (event) => {
686
+ if (event.code === 4001) {
687
+ this.callbacks.onError?.(new AuthenticationError("Authentication failed"));
688
+ } else if (event.code === 4003) {
689
+ this.callbacks.onError?.(new InsufficientCreditsError("Insufficient credits"));
690
+ }
691
+ this.ws = null;
692
+ this.isStarted = false;
693
+ this.contexts.clear();
694
+ };
695
+ }
696
+ /**
697
+ * Create a new context with optional voice settings.
698
+ */
699
+ createContext(contextId, options) {
700
+ if (!this.ws || this.ws.readyState !== WS_OPEN) {
701
+ throw new KugelAudioError("WebSocket not connected");
702
+ }
703
+ const msg = {
704
+ text: " ",
705
+ context_id: contextId
706
+ };
707
+ if (!this.isStarted) {
708
+ if (this.config.sampleRate) msg.sample_rate = this.config.sampleRate;
709
+ if (this.config.cfgScale) msg.cfg_scale = this.config.cfgScale;
710
+ if (this.config.maxNewTokens) msg.max_new_tokens = this.config.maxNewTokens;
711
+ if (this.config.normalize !== void 0) msg.normalize = this.config.normalize;
712
+ if (this.config.inactivityTimeout) msg.inactivity_timeout = this.config.inactivityTimeout;
713
+ }
714
+ const voiceId = options?.voiceId || this.config.defaultVoiceId;
715
+ if (voiceId) msg.voice_id = voiceId;
716
+ if (options?.voiceSettings) {
717
+ msg.voice_settings = {
718
+ stability: options.voiceSettings.stability,
719
+ similarity_boost: options.voiceSettings.similarityBoost,
720
+ style: options.voiceSettings.style,
721
+ use_speaker_boost: options.voiceSettings.useSpeakerBoost,
722
+ speed: options.voiceSettings.speed
723
+ };
724
+ }
725
+ this.ws.send(JSON.stringify(msg));
726
+ }
727
+ /**
728
+ * Send text to a specific context.
729
+ */
730
+ send(contextId, text, flush = false) {
731
+ if (!this.ws || this.ws.readyState !== WS_OPEN) {
732
+ throw new KugelAudioError("WebSocket not connected");
733
+ }
734
+ if (!this.contexts.has(contextId) && !this.isStarted) {
735
+ this.createContext(contextId);
736
+ }
737
+ this.ws.send(JSON.stringify({
738
+ text,
739
+ context_id: contextId,
740
+ flush
741
+ }));
742
+ }
743
+ /**
744
+ * Flush a context's buffer.
745
+ */
746
+ flush(contextId) {
747
+ if (!this.ws || this.ws.readyState !== WS_OPEN) return;
748
+ this.ws.send(JSON.stringify({
749
+ flush: true,
750
+ context_id: contextId
751
+ }));
752
+ }
753
+ /**
754
+ * Close a specific context.
755
+ */
756
+ closeContext(contextId) {
757
+ if (!this.ws || this.ws.readyState !== WS_OPEN) return;
758
+ this.ws.send(JSON.stringify({
759
+ close_context: true,
760
+ context_id: contextId
761
+ }));
762
+ }
763
+ /**
764
+ * Send keep-alive to reset a context's inactivity timeout.
765
+ */
766
+ keepAlive(contextId) {
767
+ if (!this.ws || this.ws.readyState !== WS_OPEN) return;
768
+ this.ws.send(JSON.stringify({
769
+ text: "",
770
+ context_id: contextId
771
+ }));
772
+ }
773
+ /**
774
+ * Close the session and all contexts.
775
+ */
776
+ close() {
777
+ if (this.ws && this.ws.readyState === WS_OPEN) {
778
+ this.ws.send(JSON.stringify({ close_socket: true }));
779
+ this.ws.close();
780
+ }
781
+ this.ws = null;
782
+ this.isStarted = false;
783
+ this.contexts.clear();
784
+ }
785
+ /**
786
+ * Get active context IDs.
787
+ */
788
+ get activeContexts() {
789
+ return Array.from(this.contexts);
790
+ }
791
+ /**
792
+ * Check if connected.
793
+ */
794
+ get isConnected() {
795
+ return this.ws !== null && this.ws.readyState === WS_OPEN;
796
+ }
488
797
  };
489
798
  var KugelAudio = class _KugelAudio {
490
799
  constructor(options) {
@@ -494,6 +803,7 @@ var KugelAudio = class _KugelAudio {
494
803
  this._apiKey = options.apiKey;
495
804
  this._isMasterKey = options.isMasterKey || false;
496
805
  this._isToken = options.isToken || false;
806
+ this._orgId = options.orgId;
497
807
  this._apiUrl = (options.apiUrl || DEFAULT_API_URL).replace(/\/$/, "");
498
808
  this._ttsUrl = (options.ttsUrl || this._apiUrl).replace(/\/$/, "");
499
809
  this._timeout = options.timeout || 6e4;
@@ -534,6 +844,10 @@ var KugelAudio = class _KugelAudio {
534
844
  get isToken() {
535
845
  return this._isToken;
536
846
  }
847
+ /** Get organisation ID for billing */
848
+ get orgId() {
849
+ return this._orgId;
850
+ }
537
851
  /** Get TTS URL */
538
852
  get ttsUrl() {
539
853
  return this._ttsUrl;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "kugelaudio",
3
- "version": "0.2.0",
3
+ "version": "0.2.3",
4
4
  "description": "Official JavaScript/TypeScript SDK for KugelAudio TTS API",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",
@@ -52,5 +52,9 @@
52
52
  },
53
53
  "engines": {
54
54
  "node": ">=18.0.0"
55
+ },
56
+ "dependencies": {
57
+ "tsx": "^4.21.0",
58
+ "ws": "^8.18.0"
55
59
  }
56
60
  }