npm - kugelaudio - Versions diffs - 0.2.0 → 0.2.2 - Mend

kugelaudio 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/src/client.ts CHANGED Viewed

@@ -19,9 +19,23 @@ import type {
     Voice
 } from './types';
 import { base64ToArrayBuffer } from './utils';
+import { getWebSocket } from './websocket';
 const DEFAULT_API_URL = 'https://api.kugelaudio.com';
+/**
+ * Open a WebSocket to `url`.
+ * The constructor is fetched through getWebSocket() at call time rather than
+ * at module load, keeping this module free of the top-level side-effects
+ * that break server-side bundlers (Turbopack/Webpack).
+ */
+function createWs(url: string): WebSocket {
+  const WebSocketCtor = getWebSocket();
+  const socket = new WebSocketCtor(url);
+  return socket;
+}
+/** WebSocket OPEN readyState constant. */
+const WS_OPEN = 1;
 /**
  * Models resource for listing TTS models.
  */
@@ -111,6 +125,7 @@ class VoicesResource {
  * TTS resource for text-to-speech generation.
  */
 class TTSResource {
+  // Typed as the DOM WebSocket interface; at runtime the instance comes from createWs() (browser WebSocket or the ws package)
   private wsConnection: WebSocket | null = null;
   private wsUrl: string | null = null;
   private pendingRequests: Map<number, {
@@ -147,7 +162,7 @@ class TTSResource {
    * Check if WebSocket connection is established and open.
    */
   isConnected(): boolean {
-    return this.wsConnection !== null && this.wsConnection.readyState === WebSocket.OPEN;
+    return this.wsConnection !== null && this.wsConnection.readyState === WS_OPEN;
   }
   /**
@@ -202,7 +217,12 @@ class TTSResource {
     } else {
       authParam = 'api_key';
     }
-    return `${wsUrl}/ws/tts?${authParam}=${this.client.apiKey}`;
+    let url = `${wsUrl}/ws/tts?${authParam}=${this.client.apiKey}`;
+    // Append org_id whenever one is configured; token auth needs it so usage is recorded against the org
+    if (this.client.orgId !== undefined) {
+      url += `&org_id=${this.client.orgId}`;
+    }
+    return url;
   }
   /**
@@ -216,7 +236,7 @@ class TTSResource {
     if (
       this.wsConnection &&
       this.wsUrl === url &&
-      this.wsConnection.readyState === WebSocket.OPEN
+      this.wsConnection.readyState === WS_OPEN
     ) {
       return this.wsConnection;
     }
@@ -233,7 +253,7 @@ class TTSResource {
     // Create new connection
     return new Promise((resolve, reject) => {
-      const ws = new WebSocket(url);
+      const ws = createWs(url);
       ws.onopen = () => {
         this.wsConnection = ws;
@@ -252,9 +272,15 @@ class TTSResource {
    * Setup message handler for pooled connection.
    */
   private setupMessageHandler(ws: WebSocket): void {
-    ws.onmessage = (event) => {
+    ws.onmessage = (event: { data: unknown }) => {
       try {
-        const data = JSON.parse(event.data);
+        // Handle both browser (string) and Node.js (Buffer) message formats
+        const messageData = typeof event.data === 'string'
+          ? event.data
+          : event.data instanceof Buffer
+            ? event.data.toString()
+            : String(event.data);
+        const data = JSON.parse(messageData);
         // Get the current pending request (we process one at a time)
         const [requestId, pending] = [...this.pendingRequests.entries()][0] || [];
@@ -364,13 +390,12 @@ class TTSResource {
       ws.send(JSON.stringify({
         text: options.text,
-        model: options.model || 'kugel-1-turbo',
+        model_id: options.modelId || 'kugel-1-turbo',
         voice_id: options.voiceId,
         cfg_scale: options.cfgScale ?? 2.0,
         max_new_tokens: options.maxNewTokens ?? 2048,
         sample_rate: options.sampleRate ?? 24000,
-        speaker_prefix: options.speakerPrefix ?? true,
-        normalize: options.normalize ?? false,
+        normalize: options.normalize ?? true,
         ...(options.language && { language: options.language }),
       }));
     });
@@ -385,27 +410,32 @@ class TTSResource {
   ): Promise<void> {
     return new Promise((resolve, reject) => {
       const url = this.buildWsUrl();
-      const ws = new WebSocket(url);
+      const ws = createWs(url);
       ws.onopen = () => {
         callbacks.onOpen?.();
         // Send TTS request
         ws.send(JSON.stringify({
           text: options.text,
-          model: options.model || 'kugel-1-turbo',
+          model_id: options.modelId || 'kugel-1-turbo',
           voice_id: options.voiceId,
           cfg_scale: options.cfgScale ?? 2.0,
           max_new_tokens: options.maxNewTokens ?? 2048,
           sample_rate: options.sampleRate ?? 24000,
-          speaker_prefix: options.speakerPrefix ?? true,
-          normalize: options.normalize ?? false,
+          normalize: options.normalize ?? true,
           ...(options.language && { language: options.language }),
         }));
       };
-      ws.onmessage = (event) => {
+      ws.onmessage = (event: { data: unknown }) => {
         try {
-          const data = JSON.parse(event.data);
+          // Handle both browser (string) and Node.js (Buffer) message formats
+          const messageData = typeof event.data === 'string'
+            ? event.data
+            : event.data instanceof Buffer
+              ? event.data.toString()
+              : String(event.data);
+          const data = JSON.parse(messageData);
           if (data.error) {
             const error = this.parseError(data.error);
@@ -489,6 +519,306 @@ class TTSResource {
     }
     return new KugelAudioError(message);
   }
+  /**
+   * Build a session that multiplexes several TTS streams over one WebSocket.
+   *
+   * A session manages up to 5 independent audio generation contexts; every
+   * context keeps its own text buffer, voice settings, and generation queue.
+   * The returned session is not connected yet: call `connect()` on it. The
+   * socket opens asynchronously, so check `session.isConnected` before
+   * creating contexts or sending text.
+   *
+   * @param config Optional session-wide defaults (voice, sample rate, ...).
+   * @returns A new, unconnected `MultiContextSession` bound to this client.
+   *
+   * @example
+   * ```typescript
+   * const session = client.tts.createMultiContextSession({
+   *   defaultVoiceId: 123,
+   * });
+   *
+   * session.connect({
+   *   onChunk: (chunk) => {
+   *     console.log(`Audio from ${chunk.contextId}`);
+   *     playAudio(chunk.audio);
+   *   },
+   *   onContextFinal: (contextId) => {
+   *     console.log(`${contextId} finished`);
+   *   },
+   * });
+   *
+   * // One context per speaker, each with its own voice
+   * session.createContext('narrator', { voiceId: 123 });
+   * session.createContext('character', { voiceId: 456 });
+   *
+   * // Route text to the individual speakers
+   * session.send('narrator', 'The story begins.', true);
+   * session.send('character', 'Hello!', true);
+   *
+   * // Tear everything down when finished
+   * session.close();
+   * ```
+   */
+  createMultiContextSession(
+    config?: import('./types').MultiContextConfig
+  ): MultiContextSession {
+    const session = new MultiContextSession(this.client, config);
+    return session;
+  }
+}
+/**
+ * Multi-context WebSocket session for concurrent TTS streams.
+ *
+ * Call `connect()` first. `createContext()` and `send()` issued while the
+ * socket is still connecting are queued and transmitted, in order, once it
+ * opens; without any socket they throw `KugelAudioError`.
+ */
+class MultiContextSession {
+  /** WebSocket CONNECTING readyState constant. */
+  private static readonly WS_CONNECTING = 0;
+  private ws: WebSocket | null = null;
+  private config: import('./types').MultiContextConfig;
+  private callbacks: import('./types').MultiContextCallbacks = {};
+  /** Context IDs the server has confirmed via `context_created`. */
+  private contexts: Set<string> = new Set();
+  /** Context IDs whose creation message was issued but is not yet confirmed. */
+  private requestedContexts: Set<string> = new Set();
+  /** Serialized messages issued before the socket opened, in send order. */
+  private outbox: string[] = [];
+  private _sessionId: string | null = null;
+  private isStarted = false;
+  constructor(
+    private client: KugelAudio,
+    config?: import('./types').MultiContextConfig
+  ) {
+    this.config = config || {};
+  }
+  /**
+   * Get the current session ID, or null if not connected.
+   */
+  get sessionId(): string | null {
+    return this._sessionId;
+  }
+  /**
+   * Connect to the multi-context WebSocket endpoint.
+   *
+   * A socket left over from an earlier `connect()` is closed first so it is
+   * not leaked. Connection failures, server-reported errors and the close
+   * codes 4001 / 4003 are delivered through `callbacks.onError`.
+   *
+   * @param callbacks Event handlers for this connection.
+   */
+  connect(callbacks: import('./types').MultiContextCallbacks): void {
+    if (this.ws) {
+      this.close();
+    }
+    this.callbacks = callbacks;
+    const ws = createWs(this.buildUrl());
+    this.ws = ws;
+    ws.onopen = () => {
+      // Ignore a socket that was replaced or closed before it finished opening.
+      if (this.ws !== ws) return;
+      // Transmit everything that was requested while connecting.
+      for (const payload of this.outbox.splice(0)) {
+        ws.send(payload);
+      }
+    };
+    ws.onmessage = (event: { data: unknown }) => {
+      try {
+        const data = JSON.parse(this.decodeMessage(event.data));
+        if (data.error) {
+          this.callbacks.onError?.(
+            new KugelAudioError(data.error),
+            data.context_id
+          );
+          return;
+        }
+        if (data.session_started) {
+          this._sessionId = data.session_id;
+          this.isStarted = true;
+          this.callbacks.onSessionStarted?.(data.session_id);
+        }
+        if (data.context_created) {
+          this.requestedContexts.delete(data.context_id);
+          this.contexts.add(data.context_id);
+          this.callbacks.onContextCreated?.(data.context_id);
+        }
+        if (data.audio) {
+          const chunk: import('./types').MultiContextAudioChunk = {
+            audio: data.audio,
+            encoding: 'pcm_s16le',
+            index: data.idx || 0,
+            sampleRate: data.sr || 24000,
+            samples: data.samples || 0,
+            contextId: data.context_id,
+          };
+          this.callbacks.onChunk?.(chunk);
+        }
+        if (data.is_final) {
+          this.callbacks.onContextFinal?.(data.context_id);
+        }
+        if (data.context_closed) {
+          this.forgetContext(data.context_id);
+          this.callbacks.onContextClosed?.(data.context_id);
+        }
+        if (data.context_timeout) {
+          this.forgetContext(data.context_id);
+          this.callbacks.onContextTimeout?.(data.context_id);
+        }
+        if (data.session_closed) {
+          this.callbacks.onSessionClosed?.(data);
+        }
+      } catch (e) {
+        console.error('Failed to parse WebSocket message:', e);
+      }
+    };
+    ws.onerror = () => {
+      // A socket already discarded by close() must not raise spurious errors.
+      if (this.ws !== ws) return;
+      this.callbacks.onError?.(new KugelAudioError('WebSocket connection error'));
+    };
+    ws.onclose = (event) => {
+      if (event.code === 4001) {
+        this.callbacks.onError?.(new AuthenticationError('Authentication failed'));
+      } else if (event.code === 4003) {
+        this.callbacks.onError?.(new InsufficientCreditsError('Insufficient credits'));
+      }
+      // Only reset when this socket is still the active one; a late close
+      // event from an old socket must not wipe a newer connection.
+      if (this.ws === ws) {
+        this.resetState();
+      }
+    };
+  }
+  /**
+   * Create a new context with optional voice settings.
+   *
+   * Session-level config is attached until the server has confirmed the
+   * session start. Values of `0` are sent as given rather than dropped.
+   *
+   * @param contextId Caller-chosen identifier for the context.
+   * @param options Per-context voice; falls back to `config.defaultVoiceId`.
+   * @throws KugelAudioError when there is no open or connecting socket.
+   */
+  createContext(
+    contextId: string,
+    options?: {
+      voiceId?: number;
+      voiceSettings?: import('./types').ContextVoiceSettings;
+    }
+  ): void {
+    const msg: Record<string, unknown> = {
+      text: ' ',
+      context_id: contextId,
+    };
+    // Include session config until the session is confirmed as started
+    if (!this.isStarted) {
+      const { sampleRate, cfgScale, maxNewTokens, normalize, inactivityTimeout } = this.config;
+      if (sampleRate != null) msg.sample_rate = sampleRate;
+      if (cfgScale != null) msg.cfg_scale = cfgScale;
+      if (maxNewTokens != null) msg.max_new_tokens = maxNewTokens;
+      if (normalize !== undefined) msg.normalize = normalize;
+      if (inactivityTimeout != null) msg.inactivity_timeout = inactivityTimeout;
+    }
+    // Per-context voice (?? so that a voice ID of 0 is not replaced by the default)
+    const voiceId = options?.voiceId ?? this.config.defaultVoiceId;
+    if (voiceId != null) msg.voice_id = voiceId;
+    if (options?.voiceSettings) {
+      msg.voice_settings = {
+        stability: options.voiceSettings.stability,
+        similarity_boost: options.voiceSettings.similarityBoost,
+        style: options.voiceSettings.style,
+        use_speaker_boost: options.voiceSettings.useSpeakerBoost,
+        speed: options.voiceSettings.speed,
+      };
+    }
+    if (!this.trySend(msg)) {
+      throw new KugelAudioError('WebSocket not connected');
+    }
+    // Remember the request so send() does not re-create the context (and
+    // overwrite its voice) before the server has confirmed it.
+    this.requestedContexts.add(contextId);
+  }
+  /**
+   * Send text to a specific context.
+   *
+   * Before the session has started, a context that was neither confirmed
+   * nor already requested is created first with default settings.
+   *
+   * @param contextId Target context.
+   * @param text Text to append to the context's buffer.
+   * @param flush Whether to ask the server to flush the buffer immediately.
+   * @throws KugelAudioError when there is no open or connecting socket.
+   */
+  send(contextId: string, text: string, flush = false): void {
+    const known = this.contexts.has(contextId) || this.requestedContexts.has(contextId);
+    // Auto-create context if needed
+    if (!known && !this.isStarted) {
+      this.createContext(contextId);
+    }
+    const accepted = this.trySend({
+      text,
+      context_id: contextId,
+      flush,
+    });
+    if (!accepted) {
+      throw new KugelAudioError('WebSocket not connected');
+    }
+  }
+  /**
+   * Flush a context's buffer. Silently does nothing without a usable socket.
+   */
+  flush(contextId: string): void {
+    this.trySend({
+      flush: true,
+      context_id: contextId,
+    });
+  }
+  /**
+   * Close a specific context. Silently does nothing without a usable socket.
+   */
+  closeContext(contextId: string): void {
+    this.trySend({
+      close_context: true,
+      context_id: contextId,
+    });
+  }
+  /**
+   * Send keep-alive to reset a context's inactivity timeout.
+   * Silently does nothing without a usable socket.
+   */
+  keepAlive(contextId: string): void {
+    this.trySend({
+      text: '',
+      context_id: contextId,
+    });
+  }
+  /**
+   * Close the session and all contexts.
+   *
+   * Also closes a socket that is still connecting, so it is not left open
+   * with nothing referencing it.
+   */
+  close(): void {
+    const ws = this.ws;
+    // Drop the reference first so the socket's handlers treat it as discarded.
+    this.resetState();
+    if (!ws) return;
+    if (ws.readyState === WS_OPEN) {
+      ws.send(JSON.stringify({ close_socket: true }));
+    }
+    ws.close();
+  }
+  /**
+   * Get active context IDs (those confirmed by the server).
+   */
+  get activeContexts(): string[] {
+    return Array.from(this.contexts);
+  }
+  /**
+   * Check if connected.
+   */
+  get isConnected(): boolean {
+    return this.ws !== null && this.ws.readyState === WS_OPEN;
+  }
+  /**
+   * Build the endpoint URL, mirroring the auth-parameter choice and the
+   * `org_id` suffix used for the single-stream TTS endpoint.
+   */
+  private buildUrl(): string {
+    const base = this.client.ttsUrl
+      .replace('https://', 'wss://')
+      .replace('http://', 'ws://');
+    const authParam = this.client.isToken
+      ? 'token'
+      : this.client.isMasterKey
+        ? 'master_key'
+        : 'api_key';
+    // Encode the credential so characters such as '+' or '=' survive the query string.
+    let url = `${base}/ws/tts/multi?${authParam}=${encodeURIComponent(this.client.apiKey)}`;
+    if (this.client.orgId !== undefined) {
+      url += `&org_id=${this.client.orgId}`;
+    }
+    return url;
+  }
+  /**
+   * Normalize a message payload to text: browsers deliver strings, the ws
+   * package may deliver a Buffer.
+   */
+  private decodeMessage(raw: unknown): string {
+    if (typeof raw === 'string') return raw;
+    // `Buffer` does not exist in browsers; referencing it unguarded throws.
+    if (typeof Buffer !== 'undefined' && raw instanceof Buffer) {
+      return raw.toString();
+    }
+    return String(raw);
+  }
+  /**
+   * Send now when the socket is open, or queue while it is still connecting.
+   * @returns false when there is no usable socket (none, closing or closed).
+   */
+  private trySend(payload: Record<string, unknown>): boolean {
+    const ws = this.ws;
+    if (!ws) return false;
+    if (ws.readyState === WS_OPEN) {
+      ws.send(JSON.stringify(payload));
+      return true;
+    }
+    if (ws.readyState === MultiContextSession.WS_CONNECTING) {
+      this.outbox.push(JSON.stringify(payload));
+      return true;
+    }
+    return false;
+  }
+  /** Remove a context from both the confirmed and the requested set. */
+  private forgetContext(contextId: string): void {
+    this.contexts.delete(contextId);
+    this.requestedContexts.delete(contextId);
+  }
+  /** Return to the disconnected state, dropping anything still queued. */
+  private resetState(): void {
+    this.ws = null;
+    this.isStarted = false;
+    this._sessionId = null;
+    this.contexts.clear();
+    this.requestedContexts.clear();
+    this.outbox.length = 0;
+  }
 }
 /**
@@ -507,13 +837,13 @@ class TTSResource {
  * // Generate audio with fast model (1.5B params)
  * const audio = await client.tts.generate({
  *   text: 'Hello, world!',
- *   model: 'kugel-1-turbo',
+ *   modelId: 'kugel-1-turbo',
  * });
  *
  * // Generate audio with premium model (7B params)
  * const audio = await client.tts.generate({
  *   text: 'Hello, world!',
- *   model: 'kugel-1',
+ *   modelId: 'kugel-1',
  * });
  * ```
  */
@@ -521,6 +851,7 @@ export class KugelAudio {
   private _apiKey: string;
   private _isMasterKey: boolean;
   private _isToken: boolean;
+  private _orgId: number | undefined;
   private _apiUrl: string;
   private _ttsUrl: string;
   private _timeout: number;
@@ -540,6 +871,7 @@ export class KugelAudio {
     this._apiKey = options.apiKey;
     this._isMasterKey = options.isMasterKey || false;
     this._isToken = options.isToken || false;
+    this._orgId = options.orgId;
     this._apiUrl = (options.apiUrl || DEFAULT_API_URL).replace(/\/$/, '');
     // If ttsUrl not specified, use apiUrl (backend proxies to TTS server)
     this._ttsUrl = (options.ttsUrl || this._apiUrl).replace(/\/$/, '');
@@ -587,6 +919,11 @@ export class KugelAudio {
     return this._isToken;
   }
+  /**
+   * Get organisation ID for billing.
+   * This is the `orgId` passed in the constructor options, or `undefined`
+   * when none was given.
+   */
+  get orgId(): number | undefined {
+    return this._orgId;
+  }
   /** Get TTS URL */
   get ttsUrl(): string {
     return this._ttsUrl;

package/src/index.ts CHANGED Viewed

@@ -18,13 +18,13 @@
  * // Generate audio (non-streaming)
  * const audio = await client.tts.generate({
  *   text: 'Hello, world!',
- *   model: 'kugel-1-turbo',
+ *   modelId: 'kugel-1-turbo',
  *   voiceId: 123,
  * });
  *
  * // Generate audio (streaming)
  * await client.tts.stream(
- *   { text: 'Hello, world!', model: 'kugel-1-turbo' },
+ *   { text: 'Hello, world!', modelId: 'kugel-1-turbo' },
  *   {
  *     onChunk: (chunk) => {
  *       // Process audio chunk
@@ -46,10 +46,14 @@ export { KugelAudio } from './client';
 export type {
     AudioChunk,
     AudioResponse,
+    ContextVoiceSettings,
     GenerateOptions,
     GenerationStats,
     KugelAudioOptions,
     Model,
+    MultiContextAudioChunk,
+    MultiContextCallbacks,
+    MultiContextConfig,
     StreamCallbacks,
     StreamConfig,
     Voice,

package/src/types.ts CHANGED Viewed

@@ -17,7 +17,7 @@ export interface Model {
 /**
  * Voice category types.
  */
-export type VoiceCategory = 'premade' | 'cloned' | 'designed';
+export type VoiceCategory = 'premade' | 'cloned' | 'designed' | 'conversational' | 'narrative' | 'narrative_story' | 'characters';
 /**
  * Voice sex types.
@@ -54,7 +54,7 @@ export interface GenerateOptions {
   /** Text to synthesize */
   text: string;
   /** Model to use: 'kugel-1-turbo' (1.5B, fast) or 'kugel-1' (7B, premium). Default: 'kugel-1-turbo' */
-  model?: string;
+  modelId?: string;
   /** Voice ID to use */
   voiceId?: number;
   /** CFG scale for generation (default: 2.0) */
@@ -63,21 +63,18 @@ export interface GenerateOptions {
   maxNewTokens?: number;
   /** Output sample rate (default: 24000) */
   sampleRate?: number;
-  /** Whether to add speaker prefix (default: true) */
-  speakerPrefix?: boolean;
   /**
    * Enable text normalization (converts numbers, dates, etc. to spoken words).
    * When true, text will be normalized before TTS generation.
-   * Default: false
+   * Default: true
    *
-   * ⚠️ WARNING: Using normalize=true without specifying language adds ~150ms
-   * latency for language auto-detection. For best performance, always specify
-   * the language parameter when using normalization.
+   * ⚠️ For best performance, always specify the language parameter when using
+   * normalization. Without it, language auto-detection adds ~150ms latency.
    */
   normalize?: boolean;
   /**
    * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
-   * If not provided and normalize is true, language will be auto-detected
+   * If not provided and normalize is true (default), language will be auto-detected
    * (adds ~150ms latency).
    *
    * Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
@@ -98,12 +95,20 @@ export interface StreamConfig {
   maxNewTokens?: number;
   /** Output sample rate */
   sampleRate?: number;
-  /** Whether to add speaker prefix */
-  speakerPrefix?: boolean;
   /** Auto-flush timeout in milliseconds */
   flushTimeoutMs?: number;
   /** Maximum buffer length */
   maxBufferLength?: number;
+  /**
+   * Enable text normalization (converts numbers, dates, etc. to spoken words).
+   * Default: true
+   */
+  normalize?: boolean;
+  /**
+   * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
+   * Specify to avoid ~150ms auto-detection latency.
+   */
+  language?: string;
 }
 /**
@@ -188,9 +193,11 @@ export interface KugelAudioOptions {
   isMasterKey?: boolean;
   /** Whether apiKey is a JWT token (for user authentication). Takes precedence over isMasterKey. */
   isToken?: boolean;
+  /** Organisation ID to bill usage against (required for token auth to enable usage recording). */
+  orgId?: number;
   /** API base URL (default: https://api.kugelaudio.com) */
   apiUrl?: string;
-  /** TTS server URL (default: https://eu.kugelaudio.com) */
+  /** TTS server URL (default: same as apiUrl) */
   ttsUrl?: string;
   /** Request timeout in milliseconds (default: 60000) */
   timeout?: number;
@@ -205,3 +212,67 @@ export interface ApiError {
   statusCode?: number;
 }
+/**
+ * Multi-context session configuration.
+ *
+ * Passed to `client.tts.createMultiContextSession()`. The session-level
+ * options (everything except `defaultVoiceId`) are attached to
+ * context-creation messages only until the server has confirmed the session
+ * start.
+ */
+export interface MultiContextConfig {
+  /** Default voice ID for new contexts; used when `createContext()` is called without a `voiceId` */
+  defaultVoiceId?: number;
+  /** Output sample rate (default: 24000); sent as `sample_rate` */
+  sampleRate?: number;
+  /** CFG scale for generation (default: 2.0); sent as `cfg_scale` */
+  cfgScale?: number;
+  /** Maximum tokens to generate (default: 2048); sent as `max_new_tokens` */
+  maxNewTokens?: number;
+  /** Enable text normalization (default: true); sent as `normalize` whenever it is not `undefined` */
+  normalize?: boolean;
+  /** Seconds before context auto-closes (default: 20.0); sent as `inactivity_timeout` */
+  inactivityTimeout?: number;
+}
+/**
+ * Voice settings for a specific context.
+ *
+ * `createContext()` forwards these under the `voice_settings` key of the
+ * context-creation message, using snake_case field names.
+ */
+export interface ContextVoiceSettings {
+  /** Stability (0.0-1.0); sent as `stability` */
+  stability?: number;
+  /** Similarity boost (0.0-1.0); sent as `similarity_boost` */
+  similarityBoost?: number;
+  /** Style (0.0-1.0); sent as `style` */
+  style?: number;
+  /** Use speaker boost; sent as `use_speaker_boost` */
+  useSpeakerBoost?: boolean;
+  /** Speed multiplier; sent as `speed` */
+  speed?: number;
+}
+/**
+ * Audio chunk from multi-context streaming.
+ *
+ * The session builds one of these for every incoming message that carries
+ * an `audio` field and hands it to `MultiContextCallbacks.onChunk`.
+ */
+export interface MultiContextAudioChunk extends AudioChunk {
+  /** Context ID this audio belongs to (the `context_id` of the server message) */
+  contextId: string;
+}
+/**
+ * Event callbacks for multi-context streaming.
+ *
+ * All callbacks are optional. A single server message can trigger several
+ * of them, because the session checks each message flag independently.
+ */
+export interface MultiContextCallbacks {
+  /** Called when session is started; receives the `session_id` reported by the server */
+  onSessionStarted?: (sessionId: string) => void;
+  /** Called when a context is created (server sent `context_created`) */
+  onContextCreated?: (contextId: string) => void;
+  /** Called when an audio chunk is received */
+  onChunk?: (chunk: MultiContextAudioChunk) => void;
+  /** Called when a context finishes generating (server sent `is_final`) */
+  onContextFinal?: (contextId: string) => void;
+  /** Called when a context is closed; the context has already been removed from the active set */
+  onContextClosed?: (contextId: string) => void;
+  /** Called when a context times out; the context has already been removed from the active set */
+  onContextTimeout?: (contextId: string) => void;
+  /** Called when session is closed; receives the whole parsed `session_closed` message */
+  onSessionClosed?: (stats: Record<string, unknown>) => void;
+  /**
+   * Called on error. `contextId` is the `context_id` of a server-reported
+   * error message; it is omitted for connection errors and for the close
+   * codes 4001 (authentication) and 4003 (insufficient credits).
+   */
+  onError?: (error: Error, contextId?: string) => void;
+}