npm - kugelaudio - Versions diffs - 0.2.2 → 0.3.0 - Mend

kugelaudio 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/src/types.ts CHANGED Viewed

@@ -47,18 +47,125 @@ export interface Voice {
   verified: boolean;
 }
+/**
+ * Paginated response from the voices list endpoint.
+ */
+export interface VoiceListResponse {
+  voices: Voice[];
+  total: number;
+  limit: number;
+  offset: number;
+}
+/**
+ * Voice quality levels.
+ */
+export type VoiceQuality = 'low' | 'mid' | 'high';
+/**
+ * Extended voice information returned by voice management endpoints.
+ */
+export interface VoiceDetail {
+  id: number;
+  name: string;
+  description: string;
+  generativeVoiceDescription: string;
+  supportedLanguages: string[];
+  category: string;
+  age?: string;
+  sex?: string;
+  quality: string;
+  isPublic: boolean;
+  verified: boolean;
+  pendingVerification: boolean;
+  sampleUrl?: string;
+  avatarUrl?: string;
+  sampleText: string;
+}
+/**
+ * Voice reference audio metadata.
+ */
+export interface VoiceReference {
+  id: number;
+  voiceId: number;
+  name: string;
+  referenceText: string;
+  s3Path: string;
+  audioUrl?: string;
+  isGenerated: boolean;
+}
+/**
+ * Options for creating a new voice.
+ */
+export interface CreateVoiceOptions {
+  name: string;
+  sex: string;
+  description?: string;
+  category?: string;
+  age?: string;
+  quality?: string;
+  supportedLanguages?: string[];
+  isPublic?: boolean;
+  sampleText?: string;
+  /** Reference audio files (File objects in browser, Buffer/Blob in Node.js) */
+  referenceFiles?: Array<File | Blob>;
+}
+/**
+ * Options for updating an existing voice.
+ */
+export interface UpdateVoiceOptions {
+  name?: string;
+  description?: string;
+  category?: string;
+  age?: string;
+  sex?: string;
+  quality?: string;
+  supportedLanguages?: string[];
+  isPublic?: boolean;
+  sampleText?: string;
+}
+/**
+ * Word-level timestamp from server-side forced alignment.
+ */
+export interface WordTimestamp {
+  /** The aligned word */
+  word: string;
+  /** Start time in milliseconds (relative to chunk/audio start) */
+  startMs: number;
+  /** End time in milliseconds (relative to chunk/audio start) */
+  endMs: number;
+  /** Start character offset in the original text */
+  charStart: number;
+  /** End character offset in the original text */
+  charEnd: number;
+  /** Alignment confidence score (0.0 - 1.0) */
+  score: number;
+}
 /**
  * TTS generation request options.
  */
 export interface GenerateOptions {
   /** Text to synthesize */
   text: string;
-  /** Model to use: 'kugel-1-turbo' (1.5B, fast) or 'kugel-1' (7B, premium). Default: 'kugel-1-turbo' */
+  /** Model to use: 'kugel-1-turbo' (fast) or 'kugel-1' (premium). Default: 'kugel-1-turbo' */
   modelId?: string;
   /** Voice ID to use */
   voiceId?: number;
   /** CFG scale for generation (default: 2.0) */
   cfgScale?: number;
+  /**
+   * Sampling variance. Range [0.0, 1.0]. 0 = most stable (near-greedy),
+   * 1 = most variance. Default: 0.5.
+   *
+   * Lower values produce more consistent reads across regenerations —
+   * useful for stable voiceovers, IVR prompts, and e-learning.
+   */
+  temperature?: number;
   /** Maximum tokens to generate (default: 2048) */
   maxNewTokens?: number;
   /** Output sample rate (default: 24000) */
@@ -78,19 +185,62 @@ export interface GenerateOptions {
    * (adds ~150ms latency).
    *
    * Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
-   *            el, uk, bg, tr, vi, ar, hi, zh, ja, ko
+   *            el, uk, bg, tr, vi, ar, hi, zh, ja, ko, sk, sl, hr, sr, ru,
+   *            he, fa, ur, bn, ta, yue, th, id, ms
    */
   language?: string;
+  /**
+   * Request word-level timestamps alongside audio.
+   * When true, the server performs forced alignment and returns per-word timing boundaries.
+   * Default: false
+   */
+  wordTimestamps?: boolean;
+  /**
+   * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
+   *
+   * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
+   * can also be used for per-segment speed control.
+   * Range: [0.8, 1.2]. Default: 1.0.
+   */
+  speed?: number;
+  /**
+   * Optional project ID for project-scoped features (custom dictionary
+   * replacements, per-project rate limits). The caller MUST verify the
+   * authenticated user has access to this project before passing it; the
+   * server treats the value as trusted once received.
+   */
+  projectId?: number;
 }
 /**
- * Streaming session configuration.
+ * Streaming session configuration for `/ws/tts/stream`.
+ *
+ * The server accumulates LLM tokens internally and starts generation at natural
+ * sentence boundaries. Use {@link chunkLengthSchedule} to tune how eagerly the
+ * server begins generating, or set {@link autoMode} to start at the very first
+ * clean boundary — equivalent to ElevenLabs' `auto_mode=true`.
+ *
+ * @example Low-latency preset
+ * ```typescript
+ * const session = client.tts.streamingSession({
+ *   voiceId: 123,
+ *   autoMode: true,
+ *   chunkLengthSchedule: [50, 100, 150, 250],
+ * });
+ * ```
  */
 export interface StreamConfig {
   /** Voice ID to use */
   voiceId?: number;
+  /** Model ID ('kugel-1-turbo' or 'kugel-1'). Default: 'kugel-1-turbo' */
+  modelId?: string;
   /** CFG scale for generation */
   cfgScale?: number;
+  /**
+   * Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
+   * Default: 0.5.
+   */
+  temperature?: number;
   /** Maximum tokens per generation */
   maxNewTokens?: number;
   /** Output sample rate */
@@ -109,6 +259,69 @@ export interface StreamConfig {
    * Specify to avoid ~150ms auto-detection latency.
    */
   language?: string;
+  /**
+   * Request word-level timestamps alongside audio.
+   * Default: false
+   */
+  wordTimestamps?: boolean;
+  /**
+   * Minimum buffer sizes (in characters) the server must accumulate before
+   * auto-emitting each successive chunk. Entry `i` applies to chunk `i`; the
+   * last value is reused for all subsequent chunks.
+   *
+   * Smaller values produce lower TTFA at the cost of less prosody context.
+   * Larger values improve naturalness but increase TTFA.
+   *
+   * @example
+   * ```typescript
+   * chunkLengthSchedule: [50, 100, 150, 250]  // low-latency
+   * chunkLengthSchedule: [120, 200, 300]       // high-quality prosody
+   * ```
+   */
+  chunkLengthSchedule?: number[];
+  /**
+   * When `true`, the server starts generating audio at the very first clean
+   * sentence boundary, regardless of `chunkLengthSchedule`. Equivalent to
+   * ElevenLabs' `auto_mode=true`. Prioritises low TTFA; may produce slightly
+   * less natural prosody on the first chunk.
+   */
+  autoMode?: boolean;
+  /**
+   * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
+   *
+   * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
+   * can also be used for per-segment speed control.
+   * Range: [0.8, 1.2]. Default: 1.0.
+   */
+  speed?: number;
+}
+/**
+ * Event callbacks for a streaming session (`/ws/tts/stream`).
+ *
+ * This is the LLM-integration endpoint: forward raw tokens via
+ * {@link StreamingSession.send} and the server auto-chunks them at sentence
+ * boundaries.
+ */
+export interface StreamingSessionCallbacks {
+  /** Called when an audio chunk arrives for any segment. */
+  onChunk?: (chunk: AudioChunk) => void;
+  /**
+   * Called when all audio for one flushed text segment is complete.
+   * Carries the segment index, total audio duration, and generation time.
+   */
+  onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
+  /**
+   * Called when the session is fully closed (after `session.close()`).
+   * Equivalent to `onFinal` on the one-shot endpoint.
+   */
+  onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
+  /** Called when the server begins generating audio for a text segment. */
+  onGenerationStarted?: (chunkId: number, text: string) => void;
+  /** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
+  onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
+  /** Called on any error. */
+  onError?: (error: Error) => void;
 }
 /**
@@ -141,8 +354,6 @@ export interface GenerationStats {
   durationMs: number;
   /** Generation time in milliseconds */
   generationMs: number;
-  /** Time to first audio in milliseconds */
-  ttfaMs: number | null;
   /** Real-time factor */
   rtf: number;
   /** Error message if any */
@@ -165,6 +376,8 @@ export interface AudioResponse {
   generationMs: number;
   /** Real-time factor */
   rtf: number;
+  /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
+  wordTimestamps: WordTimestamp[];
 }
 /**
@@ -173,6 +386,8 @@ export interface AudioResponse {
 export interface StreamCallbacks {
   /** Called when an audio chunk is received */
   onChunk?: (chunk: AudioChunk) => void;
+  /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
+  onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
   /** Called when generation is complete */
   onFinal?: (stats: GenerationStats) => void;
   /** Called on error */
@@ -183,11 +398,19 @@ export interface StreamCallbacks {
   onClose?: () => void;
 }
+/**
+ * Deployment region. Controls which API endpoint the SDK connects to.
+ * - `'eu'` — `api.kugelaudio.com` (default)
+ * - `'us'` — `us-api.kugelaudio.com`
+ * - `'global'` — `global-api.kugelaudio.com` (geo-routed)
+ */
+export type Region = 'eu' | 'us' | 'global';
 /**
  * KugelAudio client options.
  */
 export interface KugelAudioOptions {
-  /** Your KugelAudio API key or JWT token */
+  /** Your KugelAudio API key or JWT token. Can be prefixed with `eu-`, `us-`, or `global-` to select a region (prefix is stripped before auth). */
   apiKey: string;
   /** Whether apiKey is a master key (for internal/server-side use). Master keys bypass billing. */
   isMasterKey?: boolean;
@@ -195,12 +418,20 @@ export interface KugelAudioOptions {
   isToken?: boolean;
   /** Organisation ID to bill usage against (required for token auth to enable usage recording). */
   orgId?: number;
+  /** Deployment region. Takes precedence over API-key prefix but not over `apiUrl`. */
+  region?: Region;
   /** API base URL (default: https://api.kugelaudio.com) */
   apiUrl?: string;
   /** TTS server URL (default: same as apiUrl) */
   ttsUrl?: string;
   /** Request timeout in milliseconds (default: 60000) */
   timeout?: number;
+  /**
+   * Interval in milliseconds between WebSocket ping frames sent on the pooled connection
+   * to prevent idle timeouts (default: 20000). Set to 0 or null to disable.
+   * Pings are sent only when the `ws` package is in use (Node.js); they are skipped in native WebSocket environments such as browsers.
+   */
+  keepalivePingInterval?: number | null;
 }
 /**
@@ -222,10 +453,21 @@ export interface MultiContextConfig {
   sampleRate?: number;
   /** CFG scale for generation (default: 2.0) */
   cfgScale?: number;
+  /**
+   * Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
+   * Default: 0.5.
+   */
+  temperature?: number;
   /** Maximum tokens to generate (default: 2048) */
   maxNewTokens?: number;
   /** Enable text normalization (default: true) */
   normalize?: boolean;
+  /**
+   * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
+   * If not set and normalize is true (default), the server auto-detects
+   * the language, which adds ~60-150ms to time-to-first-audio.
+   */
+  language?: string;
   /** Seconds before context auto-closes (default: 20.0) */
   inactivityTimeout?: number;
 }
@@ -264,8 +506,6 @@ export interface MultiContextCallbacks {
   onContextCreated?: (contextId: string) => void;
   /** Called when an audio chunk is received */
   onChunk?: (chunk: MultiContextAudioChunk) => void;
-  /** Called when a context finishes generating */
-  onContextFinal?: (contextId: string) => void;
   /** Called when a context is closed */
   onContextClosed?: (contextId: string) => void;
   /** Called when a context times out */

package/src/websocket.ts CHANGED Viewed

@@ -8,36 +8,56 @@
 let _cachedWs: typeof WebSocket | null = null;
+/**
+ * Detect whether we are running in Node.js (vs. browser / edge / Deno).
+ * We prefer the `ws` package in Node because Node's built-in WebSocket
+ * (added in Node 22) surfaces a useless opaque message on handshake
+ * failures ("Received network error or non-101 status code"), whereas
+ * `ws` exposes the rejected HTTP status in the error, which the error
+ * classifier uses to raise `AuthenticationError` / `RateLimitError` etc.
+ */
+function isNodeJs(): boolean {
+    return (
+        typeof process !== 'undefined' &&
+        !!process.versions &&
+        typeof process.versions.node === 'string'
+    );
+}
 /**
  * Get the WebSocket constructor for the current environment.
- * Uses native WebSocket in browsers, ws package in Node.js.
+ * Prefers the `ws` package in Node.js (for richer handshake errors),
+ * falls back to the native `globalThis.WebSocket` elsewhere.
  * Result is cached after first call.
  */
 export function getWebSocket(): typeof WebSocket {
     if (_cachedWs) return _cachedWs;
-    // Browser environment
+    // Node.js — prefer the `ws` package so handshake rejections carry the
+    // HTTP status code (see isNodeJs doc above).
+    if (isNodeJs()) {
+        try {
+            // Use Function constructor to hide require from static analysis by bundlers
+            // eslint-disable-next-line no-new-func
+            const _require = typeof require !== 'undefined'
+                ? require
+                : Function('return typeof require !== "undefined" ? require : undefined')();
+            if (_require) {
+                const ws = _require('ws');
+                _cachedWs = ws.default || ws;
+                return _cachedWs!;
+            }
+        } catch {
+            // Fall through to native if the `ws` package isn't installed.
+        }
+    }
+    // Browser / edge / Deno environment — use native WebSocket.
     if (typeof globalThis !== 'undefined' && typeof (globalThis as any).WebSocket !== 'undefined') {
         _cachedWs = (globalThis as any).WebSocket;
         return _cachedWs!;
     }
-    // Node.js environment - use ws package via dynamic require
-    try {
-        // Use Function constructor to hide require from static analysis by bundlers
-        // eslint-disable-next-line no-new-func
-        const _require = typeof require !== 'undefined'
-            ? require
-            : Function('return typeof require !== "undefined" ? require : undefined')();
-        if (_require) {
-            const ws = _require('ws');
-            _cachedWs = ws.default || ws;
-            return _cachedWs!;
-        }
-    } catch {
-        // Fall through to error
-    }
     throw new Error(
         'WebSocket not available. In Node.js, install the "ws" package: npm install ws'
     );