npm - kugelaudio - Versions diffs - 0.2.2 → 0.3.0 - Mend

kugelaudio 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/dist/index.d.ts (changed)

@@ -41,18 +41,118 @@ interface Voice {
     isPublic: boolean;
     verified: boolean;
 }
+/**
+ * Paginated response from the voices list endpoint.
+ */
+interface VoiceListResponse {
+    voices: Voice[];
+    total: number;
+    limit: number;
+    offset: number;
+}
+/**
+ * Voice quality levels.
+ */
+type VoiceQuality = 'low' | 'mid' | 'high';
+/**
+ * Extended voice information returned by voice management endpoints.
+ */
+interface VoiceDetail {
+    id: number;
+    name: string;
+    description: string;
+    generativeVoiceDescription: string;
+    supportedLanguages: string[];
+    category: string;
+    age?: string;
+    sex?: string;
+    quality: string;
+    isPublic: boolean;
+    verified: boolean;
+    pendingVerification: boolean;
+    sampleUrl?: string;
+    avatarUrl?: string;
+    sampleText: string;
+}
+/**
+ * Voice reference audio metadata.
+ */
+interface VoiceReference {
+    id: number;
+    voiceId: number;
+    name: string;
+    referenceText: string;
+    s3Path: string;
+    audioUrl?: string;
+    isGenerated: boolean;
+}
+/**
+ * Options for creating a new voice.
+ */
+interface CreateVoiceOptions {
+    name: string;
+    sex: string;
+    description?: string;
+    category?: string;
+    age?: string;
+    quality?: string;
+    supportedLanguages?: string[];
+    isPublic?: boolean;
+    sampleText?: string;
+    /** Reference audio files (File objects in browser, Blob in Node.js) */
+    referenceFiles?: Array<File | Blob>;
+}
+/**
+ * Options for updating an existing voice.
+ */
+interface UpdateVoiceOptions {
+    name?: string;
+    description?: string;
+    category?: string;
+    age?: string;
+    sex?: string;
+    quality?: string;
+    supportedLanguages?: string[];
+    isPublic?: boolean;
+    sampleText?: string;
+}
+/**
+ * Word-level timestamp from server-side forced alignment.
+ */
+interface WordTimestamp {
+    /** The aligned word */
+    word: string;
+    /** Start time in milliseconds (relative to chunk/audio start) */
+    startMs: number;
+    /** End time in milliseconds (relative to chunk/audio start) */
+    endMs: number;
+    /** Start character offset in the original text */
+    charStart: number;
+    /** End character offset in the original text */
+    charEnd: number;
+    /** Alignment confidence score (0.0 - 1.0) */
+    score: number;
+}
 /**
  * TTS generation request options.
  */
 interface GenerateOptions {
     /** Text to synthesize */
     text: string;
-    /** Model to use: 'kugel-1-turbo' (1.5B, fast) or 'kugel-1' (7B, premium). Default: 'kugel-1-turbo' */
+    /** Model to use: 'kugel-1-turbo' (fast) or 'kugel-1' (premium). Default: 'kugel-1-turbo' */
     modelId?: string;
     /** Voice ID to use */
     voiceId?: number;
     /** CFG scale for generation (default: 2.0) */
     cfgScale?: number;
+    /**
+     * Sampling variance. Range [0.0, 1.0]. 0 = most stable (near-greedy),
+     * 1 = most variance. Default: 0.5.
+     *
+     * Lower values produce more consistent reads across regenerations —
+     * useful for stable voiceovers, IVR prompts, and e-learning.
+     */
+    temperature?: number;
     /** Maximum tokens to generate (default: 2048) */
     maxNewTokens?: number;
     /** Output sample rate (default: 24000) */
@@ -72,18 +172,61 @@ interface GenerateOptions {
      * (adds ~150ms latency).
      *
      * Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
-     *            el, uk, bg, tr, vi, ar, hi, zh, ja, ko
+     *            el, uk, bg, tr, vi, ar, hi, zh, ja, ko, sk, sl, hr, sr, ru,
+     *            he, fa, ur, bn, ta, yue, th, id, ms
      */
     language?: string;
+    /**
+     * Request word-level timestamps alongside audio.
+     * When true, the server performs forced alignment and returns per-word timing boundaries.
+     * Default: false
+     */
+    wordTimestamps?: boolean;
+    /**
+     * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
+     *
+     * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
+     * can also be used for per-segment speed control.
+     * Range: [0.8, 1.2]. Default: 1.0.
+     */
+    speed?: number;
+    /**
+     * Optional project ID for project-scoped features (custom dictionary
+     * replacements, per-project rate limits). The caller MUST verify the
+     * authenticated user has access to this project before passing it; the
+     * server treats the value as trusted once received.
+     */
+    projectId?: number;
 }
 /**
- * Streaming session configuration.
+ * Streaming session configuration for `/ws/tts/stream`.
+ *
+ * The server accumulates LLM tokens internally and starts generation at natural
+ * sentence boundaries. Use {@link chunkLengthSchedule} to tune how eagerly the
+ * server begins generating, or set {@link autoMode} to start at the very first
+ * clean boundary — equivalent to ElevenLabs' `auto_mode=true`.
+ *
+ * @example Low-latency preset
+ * ```typescript
+ * const session = client.tts.streamingSession({
+ *   voiceId: 123,
+ *   autoMode: true,
+ *   chunkLengthSchedule: [50, 100, 150, 250],
+ * });
+ * ```
  */
 interface StreamConfig {
     /** Voice ID to use */
     voiceId?: number;
+    /** Model ID ('kugel-1-turbo' or 'kugel-1'). Default: 'kugel-1-turbo' */
+    modelId?: string;
     /** CFG scale for generation */
     cfgScale?: number;
+    /**
+     * Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
+     * Default: 0.5.
+     */
+    temperature?: number;
     /** Maximum tokens per generation */
     maxNewTokens?: number;
     /** Output sample rate */
@@ -102,6 +245,68 @@ interface StreamConfig {
      * Specify to avoid ~150ms auto-detection latency.
      */
     language?: string;
+    /**
+     * Request word-level timestamps alongside audio.
+     * Default: false
+     */
+    wordTimestamps?: boolean;
+    /**
+     * Minimum buffer sizes (in characters) the server must accumulate before
+     * auto-emitting each successive chunk. Entry `i` applies to chunk `i`; the
+     * last value is reused for all subsequent chunks.
+     *
+     * Smaller values produce lower TTFA at the cost of less prosody context.
+     * Larger values improve naturalness but increase TTFA.
+     *
+     * @example
+     * ```typescript
+     * chunkLengthSchedule: [50, 100, 150, 250]  // low-latency
+     * chunkLengthSchedule: [120, 200, 300]       // high-quality prosody
+     * ```
+     */
+    chunkLengthSchedule?: number[];
+    /**
+     * When `true`, the server starts generating audio at the very first clean
+     * sentence boundary, regardless of `chunkLengthSchedule`. Equivalent to
+     * ElevenLabs' `auto_mode=true`. Prioritises low TTFA; may produce slightly
+     * less natural prosody on the first chunk.
+     */
+    autoMode?: boolean;
+    /**
+     * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
+     *
+     * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
+     * can also be used for per-segment speed control.
+     * Range: [0.8, 1.2]. Default: 1.0.
+     */
+    speed?: number;
+}
+/**
+ * Event callbacks for a streaming session (`/ws/tts/stream`).
+ *
+ * This is the LLM-integration endpoint: forward raw tokens via
+ * {@link StreamingSession.send} and the server auto-chunks them at sentence
+ * boundaries.
+ */
+interface StreamingSessionCallbacks {
+    /** Called when an audio chunk arrives for any segment. */
+    onChunk?: (chunk: AudioChunk) => void;
+    /**
+     * Called when all audio for one flushed text segment is complete.
+     * Carries the segment index, total audio duration, and generation time.
+     */
+    onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
+    /**
+     * Called when the session is fully closed (after `session.close()`).
+     * Equivalent to `onFinal` on the one-shot endpoint.
+     */
+    onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
+    /** Called when the server begins generating audio for a text segment. */
+    onGenerationStarted?: (chunkId: number, text: string) => void;
+    /** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
+    onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
+    /** Called on any error. */
+    onError?: (error: Error) => void;
 }
 /**
  * Audio chunk from streaming TTS.
@@ -132,8 +337,6 @@ interface GenerationStats {
     durationMs: number;
     /** Generation time in milliseconds */
     generationMs: number;
-    /** Time to first audio in milliseconds */
-    ttfaMs: number | null;
     /** Real-time factor */
     rtf: number;
     /** Error message if any */
@@ -155,6 +358,8 @@ interface AudioResponse {
     generationMs: number;
     /** Real-time factor */
     rtf: number;
+    /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
+    wordTimestamps: WordTimestamp[];
 }
 /**
  * Event callbacks for streaming.
@@ -162,6 +367,8 @@ interface AudioResponse {
 interface StreamCallbacks {
     /** Called when an audio chunk is received */
     onChunk?: (chunk: AudioChunk) => void;
+    /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
+    onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
     /** Called when generation is complete */
     onFinal?: (stats: GenerationStats) => void;
     /** Called on error */
@@ -171,11 +378,18 @@ interface StreamCallbacks {
     /** Called when connection closes */
     onClose?: () => void;
 }
+/**
+ * Deployment region. Controls which API endpoint the SDK connects to.
+ * - `'eu'` — `api.kugelaudio.com` (default)
+ * - `'us'` — `us-api.kugelaudio.com`
+ * - `'global'` — `global-api.kugelaudio.com` (geo-routed)
+ */
+type Region = 'eu' | 'us' | 'global';
 /**
  * KugelAudio client options.
  */
 interface KugelAudioOptions {
-    /** Your KugelAudio API key or JWT token */
+    /** Your KugelAudio API key or JWT token. Can be prefixed with `eu-`, `us-`, or `global-` to select a region (prefix is stripped before auth). */
     apiKey: string;
     /** Whether apiKey is a master key (for internal/server-side use). Master keys bypass billing. */
     isMasterKey?: boolean;
@@ -183,12 +397,20 @@ interface KugelAudioOptions {
     isToken?: boolean;
     /** Organisation ID to bill usage against (required for token auth to enable usage recording). */
     orgId?: number;
+    /** Deployment region. Takes precedence over API-key prefix but not over `apiUrl`. */
+    region?: Region;
     /** API base URL (default: https://api.kugelaudio.com) */
     apiUrl?: string;
     /** TTS server URL (default: same as apiUrl) */
     ttsUrl?: string;
     /** Request timeout in milliseconds (default: 60000) */
     timeout?: number;
+    /**
+     * Interval in milliseconds between WebSocket ping frames sent on the pooled connection
+     * to prevent idle timeouts (default: 20000). Set to 0 or null to disable.
+     * Pings are sent only when the ws package is in use (Node.js); they are skipped in native WebSocket environments such as browsers.
+     */
+    keepalivePingInterval?: number | null;
 }
 /**
  * Multi-context session configuration.
@@ -200,10 +422,21 @@ interface MultiContextConfig {
     sampleRate?: number;
     /** CFG scale for generation (default: 2.0) */
     cfgScale?: number;
+    /**
+     * Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
+     * Default: 0.5.
+     */
+    temperature?: number;
     /** Maximum tokens to generate (default: 2048) */
     maxNewTokens?: number;
     /** Enable text normalization (default: true) */
     normalize?: boolean;
+    /**
+     * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
+     * If not set and normalize is true (default), the server auto-detects
+     * the language, which adds ~60-150ms to time-to-first-audio.
+     */
+    language?: string;
     /** Seconds before context auto-closes (default: 20.0) */
     inactivityTimeout?: number;
 }
@@ -239,8 +472,6 @@ interface MultiContextCallbacks {
     onContextCreated?: (contextId: string) => void;
     /** Called when an audio chunk is received */
     onChunk?: (chunk: MultiContextAudioChunk) => void;
-    /** Called when a context finishes generating */
-    onContextFinal?: (contextId: string) => void;
     /** Called when a context is closed */
     onContextClosed?: (contextId: string) => void;
     /** Called when a context times out */
@@ -275,11 +506,51 @@ declare class VoicesResource {
         language?: string;
         includePublic?: boolean;
         limit?: number;
-    }): Promise<Voice[]>;
+        offset?: number;
+    }): Promise<VoiceListResponse>;
     /**
      * Get a specific voice by ID.
      */
-    get(voiceId: number): Promise<Voice>;
+    get(voiceId: number): Promise<VoiceDetail>;
+    /**
+     * Create a new voice.
+     */
+    create(options: CreateVoiceOptions): Promise<VoiceDetail>;
+    /**
+     * Update an existing voice. Only provided fields are updated.
+     */
+    update(voiceId: number, options: UpdateVoiceOptions): Promise<VoiceDetail>;
+    /**
+     * Delete a voice.
+     */
+    delete(voiceId: number): Promise<void>;
+    /**
+     * List reference audio files for a voice.
+     */
+    listReferences(voiceId: number): Promise<VoiceReference[]>;
+    /**
+     * Upload a reference audio file to a voice.
+     *
+     * @param voiceId - Voice ID
+     * @param file - Audio file (File in browser, Blob in Node.js)
+     * @param referenceText - Optional transcript of the reference audio
+     */
+    addReference(voiceId: number, file: File | Blob, referenceText?: string): Promise<VoiceReference>;
+    /**
+     * Delete a reference audio file from a voice.
+     */
+    deleteReference(voiceId: number, referenceId: number): Promise<void>;
+    /**
+     * Request publication of a voice. Sets it as public and marks it
+     * as pending verification by an admin.
+     */
+    publish(voiceId: number): Promise<VoiceDetail>;
+    /**
+     * Trigger sample audio generation for a voice.
+     */
+    generateSample(voiceId: number): Promise<VoiceDetail>;
+    private mapVoiceDetail;
+    private mapVoiceReference;
 }
 /**
  * TTS resource for text-to-speech generation.
@@ -290,6 +561,7 @@ declare class TTSResource {
     private wsUrl;
     private pendingRequests;
     private requestCounter;
+    private keepaliveTimer;
     constructor(client: KugelAudio);
     /**
      * Pre-establish WebSocket connection for faster first request.
@@ -318,6 +590,40 @@ declare class TTSResource {
      * Returns complete audio after all chunks are received.
      */
     generate(options: GenerateOptions): Promise<AudioResponse>;
+    /**
+     * Stream audio and return a Node.js Readable stream of raw PCM16 binary data.
+     *
+     * **Node.js only** — this method requires the `stream` built-in module and is
+     * intended for server-side integrations such as Vapi custom TTS endpoints,
+     * Express/Fastify handlers, or any pipeline that expects a Node.js `Readable`.
+     *
+     * Compared to manually wiring `onChunk` to a `Readable`, this method avoids
+     * a common race-condition: the stream object is created and returned **before**
+     * any chunks arrive, so the caller can safely pipe or attach listeners before
+     * the first audio byte is pushed.
+     *
+     * @example Vapi custom TTS endpoint
+     * ```typescript
+     * app.post('/synthesize', (req, res) => {
+     *   res.setHeader('Content-Type', 'audio/pcm');
+     *   res.setHeader('Transfer-Encoding', 'chunked');
+     *
+     *   const readable = client.tts.toReadable({
+     *     text: req.body.message.text,
+     *     modelId: 'kugel-1-turbo',
+     *     sampleRate: req.body.message.sampleRate,
+     *     language: 'en',
+     *   });
+     *
+     *   readable.pipe(res);
+     * });
+     * ```
+     *
+     * @param options - TTS generation options (same as `stream()`)
+     * @param reuseConnection - Reuse the pooled WebSocket connection (default: true)
+     * @returns Node.js Readable stream emitting raw PCM16 binary Buffer chunks
+     */
+    toReadable(options: GenerateOptions, reuseConnection?: boolean): any;
     /**
      * Build the WebSocket URL with appropriate auth param.
      */
@@ -348,11 +654,47 @@ declare class TTSResource {
      * Stream without connection pooling (original behavior).
      */
     private streamWithoutPooling;
+    /**
+     * Start periodic keepalive pings on the pooled connection.
+     * Uses the ws package's ping() in Node.js; silently skips in browsers
+     * where WebSocket doesn't expose a ping method.
+     */
+    private startKeepalive;
+    private stopKeepalive;
     /**
      * Close the pooled WebSocket connection.
      */
     close(): void;
     private parseError;
+    /**
+     * Create a streaming session for LLM integration.
+     *
+     * The session connects to `/ws/tts/stream` and keeps a persistent
+     * connection across multiple {@link StreamingSession.send} calls.
+     * The server auto-chunks text at sentence boundaries — no client-side
+     * flushing required.
+     *
+     * @param config - Session configuration (voice, model, chunking strategy).
+     * @param callbacks - Callbacks for audio chunks and session lifecycle events.
+     * @returns A {@link StreamingSession} instance. Await `.connect()` before sending.
+     *
+     * @example
+     * ```typescript
+     * const session = client.tts.streamingSession(
+     *   { voiceId: 123, autoMode: true, chunkLengthSchedule: [50, 100, 150, 250] },
+     *   { onChunk: (chunk) => playAudio(chunk.audio) },
+     * );
+     *
+     * await session.connect();
+     *
+     * for await (const token of llmStream) {
+     *   session.send(token);
+     * }
+     *
+     * await session.close();
+     * ```
+     */
+    streamingSession(config: StreamConfig, callbacks: StreamingSessionCallbacks): StreamingSession;
     /**
      * Create a multi-context session for concurrent TTS streams.
      *
@@ -371,7 +713,7 @@ declare class TTSResource {
      *     console.log(`Audio from ${chunk.contextId}`);
      *     playAudio(chunk.audio);
      *   },
-     *   onContextFinal: (contextId) => {
+     *   onContextClosed: (contextId) => {
      *     console.log(`${contextId} finished`);
      *   },
      * });
@@ -408,8 +750,13 @@ declare class MultiContextSession {
     get sessionId(): string | null;
     /**
      * Connect to the multi-context WebSocket endpoint.
+     *
+     * The returned promise resolves once the WebSocket is OPEN so callers can
+     * ``await session.connect(callbacks)`` before invoking
+     * {@link createContext} / {@link send}. Pre-open errors reject with the
+     * typed error.
      */
-    connect(callbacks: MultiContextCallbacks): void;
+    connect(callbacks: MultiContextCallbacks): Promise<void>;
     /**
      * Create a new context with optional voice settings.
      */
@@ -446,6 +793,103 @@ declare class MultiContextSession {
      */
     get isConnected(): boolean;
 }
+/**
+ * Streaming session for LLM integration via `/ws/tts/stream`.
+ *
+ * The server accumulates text across multiple {@link send} calls and
+ * auto-chunks it at sentence boundaries, keeping the KV cache warm between
+ * chunks for natural prosody.  You never need to call `flush` explicitly —
+ * configure {@link StreamConfig.chunkLengthSchedule} or
+ * {@link StreamConfig.autoMode} instead.
+ *
+ * @example
+ * ```typescript
+ * const session = client.tts.streamingSession({
+ *   voiceId: 123,
+ *   autoMode: true,
+ *   chunkLengthSchedule: [50, 100, 150, 250],
+ * }, {
+ *   onChunk: (chunk) => playAudio(chunk.audio),
+ *   onSessionClosed: (totalSecs) => console.log(`Done: ${totalSecs}s`),
+ * });
+ *
+ * await session.connect();
+ *
+ * for await (const token of llmStream) {
+ *   session.send(token);
+ * }
+ *
+ * await session.close();
+ * ```
+ */
+declare class StreamingSession {
+    private ws;
+    private config;
+    private callbacks;
+    private client;
+    private configSent;
+    constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks);
+    /**
+     * Open the WebSocket connection and authenticate.
+     *
+     * The returned promise resolves once the WebSocket is OPEN, so callers can
+     * ``await session.connect()`` and then ``send()`` without racing the
+     * handshake. Pre-open errors (network failure, 4001 unauthorized, …) reject
+     * the promise with the typed error.
+     */
+    connect(): Promise<void>;
+    /**
+     * Send a text chunk to the server (e.g. one LLM output token).
+     *
+     * The server buffers text across multiple calls and starts generating at
+     * natural sentence boundaries automatically — no need to call `flush`.
+     *
+     * @param text - Raw text or LLM token to append to the server buffer.
+     * @param flush - Force immediate generation of whatever is buffered.
+     *   **Avoid calling this per-sentence from the client.** Doing so bypasses
+     *   the server's semantic chunking, incurs a fresh model prefill cost on
+     *   every flush, and makes latency *worse*, not better.  Let the server
+     *   handle chunking via `chunkLengthSchedule` / `autoMode` instead.
+     */
+    send(text: string, flush?: boolean): void;
+    /**
+     * End the current session but keep the WebSocket connection open.
+     *
+     * This allows starting a new session on the same connection, avoiding
+     * the overhead of a new WebSocket handshake (~200-300ms). After calling
+     * this, optionally call {@link updateConfig} to change voice/model settings,
+     * then call {@link send} to start the next session.
+     *
+     * The returned promise resolves once the server confirms with a
+     * `session_closed` message, or after a 15 s **quiet** timeout — i.e. 15 s
+     * elapse without *any* server message arriving. The timer resets on every
+     * incoming frame so a long final flush that streams audio for tens of
+     * seconds is not truncated; only a genuinely silent server trips the fuse.
+     */
+    endSession(): Promise<void>;
+    /**
+     * Update session configuration for the next session.
+     *
+     * Call this after {@link endSession} and before the next {@link send}
+     * to change voice, model, language, or other settings.
+     */
+    updateConfig(config: Partial<StreamConfig>): void;
+    /**
+     * Close the session and the WebSocket connection.
+     *
+     * For session reuse without closing the connection, use
+     * {@link endSession} instead.
+     *
+     * The returned promise resolves once the server confirms the close with a
+     * `session_closed` message, or after a 15 s **quiet** timeout (no traffic
+     * from the server in that window). Audio frames from the server-side
+     * final-flush of the still-buffered text are delivered to your callbacks
+     * before this promise resolves, and each frame resets the quiet timer.
+     */
+    close(): Promise<void>;
+    /** Whether the underlying WebSocket is open. */
+    get isConnected(): boolean;
+}
 /**
  * KugelAudio API client.
  *
@@ -459,13 +903,13 @@ declare class MultiContextSession {
  * // List voices
  * const voices = await client.voices.list();
  *
- * // Generate audio with fast model (1.5B params)
+ * // Generate audio with fast model
  * const audio = await client.tts.generate({
  *   text: 'Hello, world!',
  *   modelId: 'kugel-1-turbo',
  * });
  *
- * // Generate audio with premium model (7B params)
+ * // Generate audio with premium model
  * const audio = await client.tts.generate({
  *   text: 'Hello, world!',
  *   modelId: 'kugel-1',
@@ -480,6 +924,7 @@ declare class KugelAudio {
     private _apiUrl;
     private _ttsUrl;
     private _timeout;
+    private _keepalivePingInterval;
     /** Models resource */
     readonly models: ModelsResource;
     /** Voices resource */
@@ -514,6 +959,8 @@ declare class KugelAudio {
     get orgId(): number | undefined;
     /** Get TTS URL */
     get ttsUrl(): string;
+    /** Get keepalive ping interval in milliseconds, or null if disabled. */
+    get keepalivePingInterval(): number | null;
     /**
      * Close the client and release resources.
      * This closes any pooled WebSocket connections.
@@ -546,48 +993,125 @@ declare class KugelAudio {
      * @internal
      */
     request<T>(method: string, path: string, body?: unknown): Promise<T>;
+    /**
+     * Make a multipart/form-data request (for file uploads).
+     * @internal Used by VoicesResource for reference file uploads.
+     */
+    requestMultipart<T>(method: string, path: string, formData: FormData): Promise<T>;
 }
 /**
  * Custom errors for KugelAudio SDK.
+ *
+ * All SDK errors inherit from {@link KugelAudioError}. Specific subclasses
+ * map to the server's `error_code` field (see the server-side `ErrorCode`
+ * enum at `tts/src/serving/deployments/errors.py`) so callers can
+ * `instanceof AuthenticationError` without matching on message text.
  */
+declare const ErrorCodes: {
+    readonly UNAUTHORIZED: "UNAUTHORIZED";
+    readonly RATE_LIMITED: "RATE_LIMITED";
+    readonly INSUFFICIENT_CREDITS: "INSUFFICIENT_CREDITS";
+    readonly MODEL_UNAVAILABLE: "MODEL_UNAVAILABLE";
+    readonly EMPTY_AUDIO: "EMPTY_AUDIO";
+    readonly VALIDATION: "VALIDATION_ERROR";
+    readonly INTERNAL: "INTERNAL_ERROR";
+    readonly NOT_FOUND: "NOT_FOUND";
+};
+type ErrorCode = typeof ErrorCodes[keyof typeof ErrorCodes];
+declare const WsCloseCodes: {
+    readonly UNAUTHORIZED: 4001;
+    readonly INSUFFICIENT_CREDITS: 4003;
+    readonly RATE_LIMITED: 4029;
+    readonly MODEL_UNAVAILABLE: 4500;
+};
+interface KugelAudioErrorOptions {
+    statusCode?: number;
+    errorCode?: string;
+    requestId?: string;
+    retryAfter?: number;
+    cause?: unknown;
+}
 /**
  * Base error class for KugelAudio SDK.
  */
 declare class KugelAudioError extends Error {
     readonly statusCode?: number;
-    constructor(message: string, statusCode?: number);
+    readonly errorCode?: string;
+    readonly requestId?: string;
+    readonly retryAfter?: number;
+    constructor(message: string, options?: KugelAudioErrorOptions);
 }
 /**
- * Thrown when authentication fails.
+ * API key was missing, malformed, or rejected by the server.
  */
 declare class AuthenticationError extends KugelAudioError {
-    constructor(message?: string);
+    constructor(message?: string, options?: KugelAudioErrorOptions);
 }
 /**
- * Thrown when rate limit is exceeded.
+ * Request was rejected by the per-org rate limiter.
  */
 declare class RateLimitError extends KugelAudioError {
-    constructor(message?: string);
+    constructor(message?: string, options?: KugelAudioErrorOptions);
 }
 /**
- * Thrown when user has insufficient credits.
+ * Account is out of TTS credits.
  */
 declare class InsufficientCreditsError extends KugelAudioError {
-    constructor(message?: string);
+    constructor(message?: string, options?: KugelAudioErrorOptions);
 }
 /**
- * Thrown when request validation fails.
+ * Request was rejected as invalid (bad params, missing fields, etc.).
  */
 declare class ValidationError extends KugelAudioError {
-    constructor(message: string);
+    constructor(message: string, options?: KugelAudioErrorOptions);
 }
 /**
- * Thrown when connection to server fails.
+ * The SDK could not reach KugelAudio (network error, server down,
+ * or model deployment temporarily unavailable).
  */
 declare class ConnectionError extends KugelAudioError {
-    constructor(message?: string);
+    constructor(message: string, options?: KugelAudioErrorOptions);
+}
+interface HttpResponseLike {
+    status: number;
+    headers: {
+        get(name: string): string | null;
+    } | Record<string, string | undefined>;
+    text?: () => Promise<string>;
 }
+/**
+ * Build the appropriate `KugelAudioError` from an HTTP response body that
+ * was already parsed. `bodyText` is the raw text fallback.
+ */
+declare function classifyHttpError(status: number, bodyText: string, headers: HttpResponseLike['headers']): KugelAudioError;
+/**
+ * Build a `KugelAudioError` from a server-sent WebSocket error frame
+ * (`{error, error_code, retry_after}`).
+ */
+declare function classifyWsFrame(data: {
+    error?: string;
+    error_code?: string;
+    retry_after?: number;
+}): KugelAudioError;
+/**
+ * Build a `KugelAudioError` from a WebSocket close code + reason.
+ */
+declare function classifyWsClose(code: number | undefined, reason?: string): KugelAudioError;
+/**
+ * Extract the HTTP status from a `ws` package handshake-rejection error and
+ * return a typed `KugelAudioError`. Returns `null` if the error doesn't look
+ * like a handshake rejection (e.g. pure network failure).
+ *
+ * The `ws` library surfaces rejected upgrades via:
+ *  - an Error whose `.message` is `"Unexpected server response: <status>"`
+ *  - `error.code === 'EUNEXPECTEDRESPONSE'`, with `error.statusCode` on some versions
+ *
+ * The TTS server rejects WS upgrades with a bare API key using HTTP 403
+ * (not 401), so we treat 403 here as an auth failure — HTTP API callers
+ * keep the generic 403 semantics via {@link classifyHttpError}.
+ */
+declare function classifyWsHandshakeError(err: unknown): KugelAudioError | null;
 /**
  * Utility functions for KugelAudio SDK.
@@ -609,4 +1133,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
  */
 declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
-export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
+export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type CreateVoiceOptions, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type Region, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16 };