npm - kugelaudio - Versions diffs - 0.2.0 → 0.2.3 - Mend

kugelaudio 0.2.0 → 0.2.3

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.

Files changed (10) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -15,7 +15,7 @@ interface Model {
 /**
  * Voice category types.
  */
-type VoiceCategory = 'premade' | 'cloned' | 'designed';
+type VoiceCategory = 'premade' | 'cloned' | 'designed' | 'conversational' | 'narrative' | 'narrative_story' | 'characters';
 /**
  * Voice sex types.
  */
@@ -41,6 +41,23 @@ interface Voice {
     isPublic: boolean;
     verified: boolean;
 }
+/**
+ * Word-level timestamp from server-side forced alignment.
+ */
+interface WordTimestamp {
+    /** The aligned word */
+    word: string;
+    /** Start time in milliseconds (relative to chunk/audio start) */
+    startMs: number;
+    /** End time in milliseconds (relative to chunk/audio start) */
+    endMs: number;
+    /** Start character offset in the original text */
+    charStart: number;
+    /** End character offset in the original text */
+    charEnd: number;
+    /** Alignment confidence score (0.0 - 1.0) */
+    score: number;
+}
 /**
  * TTS generation request options.
  */
@@ -48,7 +65,7 @@ interface GenerateOptions {
     /** Text to synthesize */
     text: string;
     /** Model to use: 'kugel-1-turbo' (1.5B, fast) or 'kugel-1' (7B, premium). Default: 'kugel-1-turbo' */
-    model?: string;
+    modelId?: string;
     /** Voice ID to use */
     voiceId?: number;
     /** CFG scale for generation (default: 2.0) */
@@ -57,27 +74,30 @@ interface GenerateOptions {
     maxNewTokens?: number;
     /** Output sample rate (default: 24000) */
     sampleRate?: number;
-    /** Whether to add speaker prefix (default: true) */
-    speakerPrefix?: boolean;
     /**
      * Enable text normalization (converts numbers, dates, etc. to spoken words).
      * When true, text will be normalized before TTS generation.
-     * Default: false
+     * Default: true
      *
-     * ⚠️ WARNING: Using normalize=true without specifying language adds ~150ms
-     * latency for language auto-detection. For best performance, always specify
-     * the language parameter when using normalization.
+     * ⚠️ For best performance, always specify the language parameter when using
+     * normalization. Without it, language auto-detection adds ~150ms latency.
      */
     normalize?: boolean;
     /**
      * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
-     * If not provided and normalize is true, language will be auto-detected
+     * If not provided and normalize is true (default), language will be auto-detected
      * (adds ~150ms latency).
      *
      * Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
      *            el, uk, bg, tr, vi, ar, hi, zh, ja, ko
      */
     language?: string;
+    /**
+     * Request word-level timestamps alongside audio.
+     * When true, the server performs forced alignment and returns per-word timing boundaries.
+     * Default: false
+     */
+    wordTimestamps?: boolean;
 }
 /**
  * Streaming session configuration.
@@ -91,12 +111,25 @@ interface StreamConfig {
     maxNewTokens?: number;
     /** Output sample rate */
     sampleRate?: number;
-    /** Whether to add speaker prefix */
-    speakerPrefix?: boolean;
     /** Auto-flush timeout in milliseconds */
     flushTimeoutMs?: number;
     /** Maximum buffer length */
     maxBufferLength?: number;
+    /**
+     * Enable text normalization (converts numbers, dates, etc. to spoken words).
+     * Default: true
+     */
+    normalize?: boolean;
+    /**
+     * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
+     * Specify to avoid ~150ms auto-detection latency.
+     */
+    language?: string;
+    /**
+     * Request word-level timestamps alongside audio.
+     * Default: false
+     */
+    wordTimestamps?: boolean;
 }
 /**
  * Audio chunk from streaming TTS.
@@ -150,6 +183,8 @@ interface AudioResponse {
     generationMs: number;
     /** Real-time factor */
     rtf: number;
+    /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
+    wordTimestamps: WordTimestamp[];
 }
 /**
  * Event callbacks for streaming.
@@ -157,6 +192,8 @@ interface AudioResponse {
 interface StreamCallbacks {
     /** Called when an audio chunk is received */
     onChunk?: (chunk: AudioChunk) => void;
+    /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
+    onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
     /** Called when generation is complete */
     onFinal?: (stats: GenerationStats) => void;
     /** Called on error */
@@ -176,17 +213,75 @@ interface KugelAudioOptions {
     isMasterKey?: boolean;
     /** Whether apiKey is a JWT token (for user authentication). Takes precedence over isMasterKey. */
     isToken?: boolean;
+    /** Organisation ID to bill usage against (required for token auth to enable usage recording). */
+    orgId?: number;
     /** API base URL (default: https://api.kugelaudio.com) */
     apiUrl?: string;
-    /** TTS server URL (default: https://eu.kugelaudio.com) */
+    /** TTS server URL (default: same as apiUrl) */
     ttsUrl?: string;
     /** Request timeout in milliseconds (default: 60000) */
     timeout?: number;
 }
 /**
- * KugelAudio API Client.
+ * Multi-context session configuration.
  */
+interface MultiContextConfig {
+    /** Default voice ID for new contexts */
+    defaultVoiceId?: number;
+    /** Output sample rate (default: 24000) */
+    sampleRate?: number;
+    /** CFG scale for generation (default: 2.0) */
+    cfgScale?: number;
+    /** Maximum tokens to generate (default: 2048) */
+    maxNewTokens?: number;
+    /** Enable text normalization (default: true) */
+    normalize?: boolean;
+    /** Seconds before context auto-closes (default: 20.0) */
+    inactivityTimeout?: number;
+}
+/**
+ * Voice settings for a specific context.
+ */
+interface ContextVoiceSettings {
+    /** Stability (0.0-1.0) */
+    stability?: number;
+    /** Similarity boost (0.0-1.0) */
+    similarityBoost?: number;
+    /** Style (0.0-1.0) */
+    style?: number;
+    /** Use speaker boost */
+    useSpeakerBoost?: boolean;
+    /** Speed multiplier */
+    speed?: number;
+}
+/**
+ * Audio chunk from multi-context streaming.
+ */
+interface MultiContextAudioChunk extends AudioChunk {
+    /** Context ID this audio belongs to */
+    contextId: string;
+}
+/**
+ * Event callbacks for multi-context streaming.
+ */
+interface MultiContextCallbacks {
+    /** Called when session is started */
+    onSessionStarted?: (sessionId: string) => void;
+    /** Called when a context is created */
+    onContextCreated?: (contextId: string) => void;
+    /** Called when an audio chunk is received */
+    onChunk?: (chunk: MultiContextAudioChunk) => void;
+    /** Called when a context finishes generating */
+    onContextFinal?: (contextId: string) => void;
+    /** Called when a context is closed */
+    onContextClosed?: (contextId: string) => void;
+    /** Called when a context times out */
+    onContextTimeout?: (contextId: string) => void;
+    /** Called when session is closed */
+    onSessionClosed?: (stats: Record<string, unknown>) => void;
+    /** Called on error */
+    onError?: (error: Error, contextId?: string) => void;
+}
 /**
  * Models resource for listing TTS models.
@@ -290,6 +385,98 @@ declare class TTSResource {
      */
     close(): void;
     private parseError;
+    /**
+     * Create a multi-context session for concurrent TTS streams.
+     *
+     * Allows managing up to 5 independent audio generation contexts
+     * over a single WebSocket connection. Each context has its own
+     * text buffer, voice settings, and generation queue.
+     *
+     * @example
+     * ```typescript
+     * const session = client.tts.createMultiContextSession({
+     *   defaultVoiceId: 123,
+     * });
+     *
+     * session.connect({
+     *   onChunk: (chunk) => {
+     *     console.log(`Audio from ${chunk.contextId}`);
+     *     playAudio(chunk.audio);
+     *   },
+     *   onContextFinal: (contextId) => {
+     *     console.log(`${contextId} finished`);
+     *   },
+     * });
+     *
+     * // Create contexts with different voices
+     * session.createContext('narrator', { voiceId: 123 });
+     * session.createContext('character', { voiceId: 456 });
+     *
+     * // Send text to different speakers
+     * session.send('narrator', 'The story begins.', true);
+     * session.send('character', 'Hello!', true);
+     *
+     * // Close when done
+     * session.close();
+     * ```
+     */
+    createMultiContextSession(config?: MultiContextConfig): MultiContextSession;
+}
+/**
+ * Multi-context WebSocket session for concurrent TTS streams.
+ */
+declare class MultiContextSession {
+    private client;
+    private ws;
+    private config;
+    private callbacks;
+    private contexts;
+    private _sessionId;
+    private isStarted;
+    constructor(client: KugelAudio, config?: MultiContextConfig);
+    /**
+     * Get the current session ID, or null if not connected.
+     */
+    get sessionId(): string | null;
+    /**
+     * Connect to the multi-context WebSocket endpoint.
+     */
+    connect(callbacks: MultiContextCallbacks): void;
+    /**
+     * Create a new context with optional voice settings.
+     */
+    createContext(contextId: string, options?: {
+        voiceId?: number;
+        voiceSettings?: ContextVoiceSettings;
+    }): void;
+    /**
+     * Send text to a specific context.
+     */
+    send(contextId: string, text: string, flush?: boolean): void;
+    /**
+     * Flush a context's buffer.
+     */
+    flush(contextId: string): void;
+    /**
+     * Close a specific context.
+     */
+    closeContext(contextId: string): void;
+    /**
+     * Send keep-alive to reset a context's inactivity timeout.
+     */
+    keepAlive(contextId: string): void;
+    /**
+     * Close the session and all contexts.
+     */
+    close(): void;
+    /**
+     * Get active context IDs.
+     */
+    get activeContexts(): string[];
+    /**
+     * Check if connected.
+     */
+    get isConnected(): boolean;
 }
 /**
  * KugelAudio API client.
@@ -307,13 +494,13 @@ declare class TTSResource {
  * // Generate audio with fast model (1.5B params)
  * const audio = await client.tts.generate({
  *   text: 'Hello, world!',
- *   model: 'kugel-1-turbo',
+ *   modelId: 'kugel-1-turbo',
  * });
  *
  * // Generate audio with premium model (7B params)
  * const audio = await client.tts.generate({
  *   text: 'Hello, world!',
- *   model: 'kugel-1',
+ *   modelId: 'kugel-1',
  * });
  * ```
  */
@@ -321,6 +508,7 @@ declare class KugelAudio {
     private _apiKey;
     private _isMasterKey;
     private _isToken;
+    private _orgId;
     private _apiUrl;
     private _ttsUrl;
     private _timeout;
@@ -354,6 +542,8 @@ declare class KugelAudio {
     get isMasterKey(): boolean;
     /** Check if using JWT token authentication */
     get isToken(): boolean;
+    /** Get organisation ID for billing */
+    get orgId(): number | undefined;
     /** Get TTS URL */
     get ttsUrl(): string;
     /**
@@ -451,4 +641,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
  */
 declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
-export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
+export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, type WordTimestamp, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };