kugelaudio 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -41,6 +41,81 @@ interface Voice {
41
41
  isPublic: boolean;
42
42
  verified: boolean;
43
43
  }
44
+ /**
45
+ * Paginated response from the voices list endpoint.
46
+ */
47
+ interface VoiceListResponse {
48
+ voices: Voice[];
49
+ total: number;
50
+ limit: number;
51
+ offset: number;
52
+ }
53
+ /**
54
+ * Voice quality levels.
55
+ */
56
+ type VoiceQuality = 'low' | 'mid' | 'high';
57
+ /**
58
+ * Extended voice information returned by voice management endpoints.
59
+ */
60
+ interface VoiceDetail {
61
+ id: number;
62
+ name: string;
63
+ description: string;
64
+ generativeVoiceDescription: string;
65
+ supportedLanguages: string[];
66
+ category: string;
67
+ age?: string;
68
+ sex?: string;
69
+ quality: string;
70
+ isPublic: boolean;
71
+ verified: boolean;
72
+ pendingVerification: boolean;
73
+ sampleUrl?: string;
74
+ avatarUrl?: string;
75
+ sampleText: string;
76
+ }
77
+ /**
78
+ * Voice reference audio metadata.
79
+ */
80
+ interface VoiceReference {
81
+ id: number;
82
+ voiceId: number;
83
+ name: string;
84
+ referenceText: string;
85
+ s3Path: string;
86
+ audioUrl?: string;
87
+ isGenerated: boolean;
88
+ }
89
+ /**
90
+ * Options for creating a new voice.
91
+ */
92
+ interface CreateVoiceOptions {
93
+ name: string;
94
+ sex: string;
95
+ description?: string;
96
+ category?: string;
97
+ age?: string;
98
+ quality?: string;
99
+ supportedLanguages?: string[];
100
+ isPublic?: boolean;
101
+ sampleText?: string;
102
+ /** Reference audio files (File objects in browser, Blob in Node.js — wrap a Buffer with `new Blob([buffer])`) */
103
+ referenceFiles?: Array<File | Blob>;
104
+ }
105
+ /**
106
+ * Options for updating an existing voice.
107
+ */
108
+ interface UpdateVoiceOptions {
109
+ name?: string;
110
+ description?: string;
111
+ category?: string;
112
+ age?: string;
113
+ sex?: string;
114
+ quality?: string;
115
+ supportedLanguages?: string[];
116
+ isPublic?: boolean;
117
+ sampleText?: string;
118
+ }
44
119
  /**
45
120
  * Word-level timestamp from server-side forced alignment.
46
121
  */
@@ -64,12 +139,20 @@ interface WordTimestamp {
64
139
  interface GenerateOptions {
65
140
  /** Text to synthesize */
66
141
  text: string;
67
- /** Model to use: 'kugel-1-turbo' (1.5B, fast) or 'kugel-1' (7B, premium). Default: 'kugel-1-turbo' */
142
+ /** Model to use: 'kugel-1-turbo' (fast) or 'kugel-1' (premium). Default: 'kugel-1-turbo' */
68
143
  modelId?: string;
69
144
  /** Voice ID to use */
70
145
  voiceId?: number;
71
146
  /** CFG scale for generation (default: 2.0) */
72
147
  cfgScale?: number;
148
+ /**
149
+ * Sampling variance. Range [0.0, 1.0]. 0 = most stable (near-greedy),
150
+ * 1 = most variance. Default: 0.5.
151
+ *
152
+ * Lower values produce more consistent reads across regenerations —
153
+ * useful for stable voiceovers, IVR prompts, and e-learning.
154
+ */
155
+ temperature?: number;
73
156
  /** Maximum tokens to generate (default: 2048) */
74
157
  maxNewTokens?: number;
75
158
  /** Output sample rate (default: 24000) */
@@ -89,7 +172,8 @@ interface GenerateOptions {
89
172
  * (adds ~150ms latency).
90
173
  *
91
174
  * Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
92
- * el, uk, bg, tr, vi, ar, hi, zh, ja, ko
175
+ * el, uk, bg, tr, vi, ar, hi, zh, ja, ko, sk, sl, hr, sr, ru,
176
+ * he, fa, ur, bn, ta, yue, th, id, ms
93
177
  */
94
178
  language?: string;
95
179
  /**
@@ -98,15 +182,51 @@ interface GenerateOptions {
98
182
  * Default: false
99
183
  */
100
184
  wordTimestamps?: boolean;
185
+ /**
186
+ * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
187
+ *
188
+ * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
189
+ * can also be used for per-segment speed control.
190
+ * Range: [0.8, 1.2]. Default: 1.0.
191
+ */
192
+ speed?: number;
193
+ /**
194
+ * Optional project ID for project-scoped features (custom dictionary
195
+ * replacements, per-project rate limits). The caller MUST verify the
196
+ * authenticated user has access to this project before passing it; the
197
+ * server treats the value as trusted once received.
198
+ */
199
+ projectId?: number;
101
200
  }
102
201
  /**
103
- * Streaming session configuration.
202
+ * Streaming session configuration for `/ws/tts/stream`.
203
+ *
204
+ * The server accumulates LLM tokens internally and starts generation at natural
205
+ * sentence boundaries. Use {@link chunkLengthSchedule} to tune how eagerly the
206
+ * server begins generating, or set {@link autoMode} to start at the very first
207
+ * clean boundary — equivalent to ElevenLabs' `auto_mode=true`.
208
+ *
209
+ * @example Low-latency preset
210
+ * ```typescript
211
+ * const session = client.tts.streamingSession({
212
+ * voiceId: 123,
213
+ * autoMode: true,
214
+ * chunkLengthSchedule: [50, 100, 150, 250],
215
+ * }, { onChunk: (chunk) => playAudio(chunk.audio) });
216
+ * ```
104
217
  */
105
218
  interface StreamConfig {
106
219
  /** Voice ID to use */
107
220
  voiceId?: number;
221
+ /** Model ID ('kugel-1-turbo' or 'kugel-1'). Default: 'kugel-1-turbo' */
222
+ modelId?: string;
108
223
  /** CFG scale for generation */
109
224
  cfgScale?: number;
225
+ /**
226
+ * Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
227
+ * Default: 0.5.
228
+ */
229
+ temperature?: number;
110
230
  /** Maximum tokens per generation */
111
231
  maxNewTokens?: number;
112
232
  /** Output sample rate */
@@ -130,6 +250,63 @@ interface StreamConfig {
130
250
  * Default: false
131
251
  */
132
252
  wordTimestamps?: boolean;
253
+ /**
254
+ * Minimum buffer sizes (in characters) the server must accumulate before
255
+ * auto-emitting each successive chunk. Entry `i` applies to chunk `i`; the
256
+ * last value is reused for all subsequent chunks.
257
+ *
258
+ * Smaller values produce lower TTFA at the cost of less prosody context.
259
+ * Larger values improve naturalness but increase TTFA.
260
+ *
261
+ * @example
262
+ * ```typescript
263
+ * chunkLengthSchedule: [50, 100, 150, 250] // low-latency
264
+ * chunkLengthSchedule: [120, 200, 300] // high-quality prosody
265
+ * ```
266
+ */
267
+ chunkLengthSchedule?: number[];
268
+ /**
269
+ * When `true`, the server starts generating audio at the very first clean
270
+ * sentence boundary, regardless of `chunkLengthSchedule`. Equivalent to
271
+ * ElevenLabs' `auto_mode=true`. Prioritises low TTFA; may produce slightly
272
+ * less natural prosody on the first chunk.
273
+ */
274
+ autoMode?: boolean;
275
+ /**
276
+ * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
277
+ *
278
+ * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
279
+ * can also be used for per-segment speed control.
280
+ * Range: [0.8, 1.2]. Default: 1.0.
281
+ */
282
+ speed?: number;
283
+ }
284
+ /**
285
+ * Event callbacks for a streaming session (`/ws/tts/stream`).
286
+ *
287
+ * This is the LLM-integration endpoint: forward raw tokens via
288
+ * {@link StreamingSession.send} and the server auto-chunks them at sentence
289
+ * boundaries.
290
+ */
291
+ interface StreamingSessionCallbacks {
292
+ /** Called when an audio chunk arrives for any segment. */
293
+ onChunk?: (chunk: AudioChunk) => void;
294
+ /**
295
+ * Called when all audio for one flushed text segment is complete.
296
+ * Carries the segment index, total audio duration, and generation time.
297
+ */
298
+ onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
299
+ /**
300
+ * Called when the session is fully closed (after `session.close()`).
301
+ * Equivalent to `onFinal` on the one-shot endpoint.
302
+ */
303
+ onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
304
+ /** Called when the server begins generating audio for a text segment. */
305
+ onGenerationStarted?: (chunkId: number, text: string) => void;
306
+ /** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
307
+ onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
308
+ /** Called on any error. */
309
+ onError?: (error: Error) => void;
133
310
  }
134
311
  /**
135
312
  * Audio chunk from streaming TTS.
@@ -160,8 +337,6 @@ interface GenerationStats {
160
337
  durationMs: number;
161
338
  /** Generation time in milliseconds */
162
339
  generationMs: number;
163
- /** Time to first audio in milliseconds */
164
- ttfaMs: number | null;
165
340
  /** Real-time factor */
166
341
  rtf: number;
167
342
  /** Error message if any */
@@ -203,11 +378,18 @@ interface StreamCallbacks {
203
378
  /** Called when connection closes */
204
379
  onClose?: () => void;
205
380
  }
381
+ /**
382
+ * Deployment region. Controls which API endpoint the SDK connects to.
383
+ * - `'eu'` — `api.kugelaudio.com` (default)
384
+ * - `'us'` — `us-api.kugelaudio.com`
385
+ * - `'global'` — `global-api.kugelaudio.com` (geo-routed)
386
+ */
387
+ type Region = 'eu' | 'us' | 'global';
206
388
  /**
207
389
  * KugelAudio client options.
208
390
  */
209
391
  interface KugelAudioOptions {
210
- /** Your KugelAudio API key or JWT token */
392
+ /** Your KugelAudio API key or JWT token. Can be prefixed with `eu-`, `us-`, or `global-` to select a region (prefix is stripped before auth). */
211
393
  apiKey: string;
212
394
  /** Whether apiKey is a master key (for internal/server-side use). Master keys bypass billing. */
213
395
  isMasterKey?: boolean;
@@ -215,12 +397,20 @@ interface KugelAudioOptions {
215
397
  isToken?: boolean;
216
398
  /** Organisation ID to bill usage against (required for token auth to enable usage recording). */
217
399
  orgId?: number;
400
+ /** Deployment region. Takes precedence over API-key prefix but not over `apiUrl`. */
401
+ region?: Region;
218
402
  /** API base URL. Overrides `region` and any API-key region prefix (default: https://api.kugelaudio.com, the `eu` endpoint). */
219
403
  apiUrl?: string;
220
404
  /** TTS server URL (default: same as apiUrl) */
221
405
  ttsUrl?: string;
222
406
  /** Request timeout in milliseconds (default: 60000) */
223
407
  timeout?: number;
408
+ /**
409
+ * Interval in milliseconds between WebSocket ping frames sent on the pooled connection
410
+ * to prevent idle timeouts (default: 20000). Set to 0 or null to disable.
411
+ * Pings are sent only when the ws package is in use (Node.js); they are skipped in native WebSocket environments such as browsers.
412
+ */
413
+ keepalivePingInterval?: number | null;
224
414
  }
225
415
  /**
226
416
  * Multi-context session configuration.
@@ -232,10 +422,21 @@ interface MultiContextConfig {
232
422
  sampleRate?: number;
233
423
  /** CFG scale for generation (default: 2.0) */
234
424
  cfgScale?: number;
425
+ /**
426
+ * Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
427
+ * Default: 0.5.
428
+ */
429
+ temperature?: number;
235
430
  /** Maximum tokens to generate (default: 2048) */
236
431
  maxNewTokens?: number;
237
432
  /** Enable text normalization (default: true) */
238
433
  normalize?: boolean;
434
+ /**
435
+ * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
436
+ * If not set and normalize is true (default), the server auto-detects
437
+ * the language, which adds ~60-150ms to time-to-first-audio.
438
+ */
439
+ language?: string;
239
440
  /** Seconds before context auto-closes (default: 20.0) */
240
441
  inactivityTimeout?: number;
241
442
  }
@@ -271,8 +472,6 @@ interface MultiContextCallbacks {
271
472
  onContextCreated?: (contextId: string) => void;
272
473
  /** Called when an audio chunk is received */
273
474
  onChunk?: (chunk: MultiContextAudioChunk) => void;
274
- /** Called when a context finishes generating */
275
- onContextFinal?: (contextId: string) => void;
276
475
  /** Called when a context is closed */
277
476
  onContextClosed?: (contextId: string) => void;
278
477
  /** Called when a context times out */
@@ -307,11 +506,51 @@ declare class VoicesResource {
307
506
  language?: string;
308
507
  includePublic?: boolean;
309
508
  limit?: number;
310
- }): Promise<Voice[]>;
509
+ offset?: number;
510
+ }): Promise<VoiceListResponse>;
311
511
  /**
312
512
  * Get a specific voice by ID.
313
513
  */
314
- get(voiceId: number): Promise<Voice>;
514
+ get(voiceId: number): Promise<VoiceDetail>;
515
+ /**
516
+ * Create a new voice.
517
+ */
518
+ create(options: CreateVoiceOptions): Promise<VoiceDetail>;
519
+ /**
520
+ * Update an existing voice. Only provided fields are updated.
521
+ */
522
+ update(voiceId: number, options: UpdateVoiceOptions): Promise<VoiceDetail>;
523
+ /**
524
+ * Delete a voice.
525
+ */
526
+ delete(voiceId: number): Promise<void>;
527
+ /**
528
+ * List reference audio files for a voice.
529
+ */
530
+ listReferences(voiceId: number): Promise<VoiceReference[]>;
531
+ /**
532
+ * Upload a reference audio file to a voice.
533
+ *
534
+ * @param voiceId - Voice ID
535
+ * @param file - Audio file (File in browser, Blob in Node.js)
536
+ * @param referenceText - Optional transcript of the reference audio
537
+ */
538
+ addReference(voiceId: number, file: File | Blob, referenceText?: string): Promise<VoiceReference>;
539
+ /**
540
+ * Delete a reference audio file from a voice.
541
+ */
542
+ deleteReference(voiceId: number, referenceId: number): Promise<void>;
543
+ /**
544
+ * Request publication of a voice. Sets it as public and marks it
545
+ * as pending verification by an admin.
546
+ */
547
+ publish(voiceId: number): Promise<VoiceDetail>;
548
+ /**
549
+ * Trigger sample audio generation for a voice.
550
+ */
551
+ generateSample(voiceId: number): Promise<VoiceDetail>;
552
+ private mapVoiceDetail;
553
+ private mapVoiceReference;
315
554
  }
316
555
  /**
317
556
  * TTS resource for text-to-speech generation.
@@ -322,6 +561,7 @@ declare class TTSResource {
322
561
  private wsUrl;
323
562
  private pendingRequests;
324
563
  private requestCounter;
564
+ private keepaliveTimer;
325
565
  constructor(client: KugelAudio);
326
566
  /**
327
567
  * Pre-establish WebSocket connection for faster first request.
@@ -350,6 +590,40 @@ declare class TTSResource {
350
590
  * Returns complete audio after all chunks are received.
351
591
  */
352
592
  generate(options: GenerateOptions): Promise<AudioResponse>;
593
+ /**
594
+ * Stream audio and return a Node.js Readable stream of raw PCM16 binary data.
595
+ *
596
+ * **Node.js only** — this method requires the `stream` built-in module and is
597
+ * intended for server-side integrations such as Vapi custom TTS endpoints,
598
+ * Express/Fastify handlers, or any pipeline that expects a Node.js `Readable`.
599
+ *
600
+ * Compared to manually wiring `onChunk` to a `Readable`, this method avoids
601
+ * a common race-condition: the stream object is created and returned **before**
602
+ * any chunks arrive, so the caller can safely pipe or attach listeners before
603
+ * the first audio byte is pushed.
604
+ *
605
+ * @example Vapi custom TTS endpoint
606
+ * ```typescript
607
+ * app.post('/synthesize', (req, res) => {
608
+ * res.setHeader('Content-Type', 'audio/pcm');
609
+ * res.setHeader('Transfer-Encoding', 'chunked');
610
+ *
611
+ * const readable = client.tts.toReadable({
612
+ * text: req.body.message.text,
613
+ * modelId: 'kugel-1-turbo',
614
+ * sampleRate: req.body.message.sampleRate,
615
+ * language: 'en',
616
+ * });
617
+ *
618
+ * readable.pipe(res);
619
+ * });
620
+ * ```
621
+ *
622
+ * @param options - TTS generation options (same as `stream()`)
623
+ * @param reuseConnection - Reuse the pooled WebSocket connection (default: true)
624
+ * @returns Node.js Readable stream emitting raw PCM16 binary Buffer chunks
625
+ */
626
+ toReadable(options: GenerateOptions, reuseConnection?: boolean): any;
353
627
  /**
354
628
  * Build the WebSocket URL with appropriate auth param.
355
629
  */
@@ -380,11 +654,47 @@ declare class TTSResource {
380
654
  * Stream without connection pooling (original behavior).
381
655
  */
382
656
  private streamWithoutPooling;
657
+ /**
658
+ * Start periodic keepalive pings on the pooled connection.
659
+ * Uses the ws package's ping() in Node.js; silently skips in browsers
660
+ * where WebSocket doesn't expose a ping method.
661
+ */
662
+ private startKeepalive;
663
+ private stopKeepalive;
383
664
  /**
384
665
  * Close the pooled WebSocket connection.
385
666
  */
386
667
  close(): void;
387
668
  private parseError;
669
+ /**
670
+ * Create a streaming session for LLM integration.
671
+ *
672
+ * The session connects to `/ws/tts/stream` and keeps a persistent
673
+ * connection across multiple {@link StreamingSession.send} calls.
674
+ * The server auto-chunks text at sentence boundaries — no client-side
675
+ * flushing required.
676
+ *
677
+ * @param config - Session configuration (voice, model, chunking strategy).
678
+ * @param callbacks - Callbacks for audio chunks and session lifecycle events.
679
+ * @returns A {@link StreamingSession} instance. Call `.connect()` before sending.
680
+ *
681
+ * @example
682
+ * ```typescript
683
+ * const session = client.tts.streamingSession(
684
+ * { voiceId: 123, autoMode: true, chunkLengthSchedule: [50, 100, 150, 250] },
685
+ * { onChunk: (chunk) => playAudio(chunk.audio) },
686
+ * );
687
+ *
688
+ * await session.connect();
689
+ *
690
+ * for await (const token of llmStream) {
691
+ * session.send(token);
692
+ * }
693
+ *
694
+ * await session.close();
695
+ * ```
696
+ */
697
+ streamingSession(config: StreamConfig, callbacks: StreamingSessionCallbacks): StreamingSession;
388
698
  /**
389
699
  * Create a multi-context session for concurrent TTS streams.
390
700
  *
@@ -403,7 +713,7 @@ declare class TTSResource {
403
713
  * console.log(`Audio from ${chunk.contextId}`);
404
714
  * playAudio(chunk.audio);
405
715
  * },
406
- * onContextFinal: (contextId) => {
716
+ * onContextClosed: (contextId) => {
407
717
  * console.log(`${contextId} finished`);
408
718
  * },
409
719
  * });
@@ -440,8 +750,13 @@ declare class MultiContextSession {
440
750
  get sessionId(): string | null;
441
751
  /**
442
752
  * Connect to the multi-context WebSocket endpoint.
753
+ *
754
+ * The returned promise resolves once the WebSocket is OPEN so callers can
755
+ * `await session.connect(callbacks)` before invoking
756
+ * {@link createContext} / {@link send}. Pre-open errors reject with the
757
+ * typed error.
443
758
  */
444
- connect(callbacks: MultiContextCallbacks): void;
759
+ connect(callbacks: MultiContextCallbacks): Promise<void>;
445
760
  /**
446
761
  * Create a new context with optional voice settings.
447
762
  */
@@ -478,6 +793,103 @@ declare class MultiContextSession {
478
793
  */
479
794
  get isConnected(): boolean;
480
795
  }
796
+ /**
797
+ * Streaming session for LLM integration via `/ws/tts/stream`.
798
+ *
799
+ * The server accumulates text across multiple {@link send} calls and
800
+ * auto-chunks it at sentence boundaries, keeping the KV cache warm between
801
+ * chunks for natural prosody. You never need to call `flush` explicitly —
802
+ * configure {@link StreamConfig.chunkLengthSchedule} or
803
+ * {@link StreamConfig.autoMode} instead.
804
+ *
805
+ * @example
806
+ * ```typescript
807
+ * const session = client.tts.streamingSession({
808
+ * voiceId: 123,
809
+ * autoMode: true,
810
+ * chunkLengthSchedule: [50, 100, 150, 250],
811
+ * }, {
812
+ * onChunk: (chunk) => playAudio(chunk.audio),
813
+ * onSessionClosed: (totalSecs) => console.log(`Done: ${totalSecs}s`),
814
+ * });
815
+ *
816
+ * await session.connect();
817
+ *
818
+ * for await (const token of llmStream) {
819
+ * session.send(token);
820
+ * }
821
+ *
822
+ * await session.close();
823
+ * ```
824
+ */
825
+ declare class StreamingSession {
826
+ private ws;
827
+ private config;
828
+ private callbacks;
829
+ private client;
830
+ private configSent;
831
+ constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks);
832
+ /**
833
+ * Open the WebSocket connection and authenticate.
834
+ *
835
+ * The returned promise resolves once the WebSocket is OPEN, so callers can
836
+ * `await session.connect()` and then `send()` without racing the
837
+ * handshake. Pre-open errors (network failure, 4001 unauthorized, …) reject
838
+ * the promise with the typed error.
839
+ */
840
+ connect(): Promise<void>;
841
+ /**
842
+ * Send a text chunk to the server (e.g. one LLM output token).
843
+ *
844
+ * The server buffers text across multiple calls and starts generating at
845
+ * natural sentence boundaries automatically — no need to call `flush`.
846
+ *
847
+ * @param text - Raw text or LLM token to append to the server buffer.
848
+ * @param flush - Force immediate generation of whatever is buffered.
849
+ * **Avoid calling this per-sentence from the client.** Doing so bypasses
850
+ * the server's semantic chunking, incurs a fresh model prefill cost on
851
+ * every flush, and makes latency *worse*, not better. Let the server
852
+ * handle chunking via `chunkLengthSchedule` / `autoMode` instead.
853
+ */
854
+ send(text: string, flush?: boolean): void;
855
+ /**
856
+ * End the current session but keep the WebSocket connection open.
857
+ *
858
+ * This allows starting a new session on the same connection, avoiding
859
+ * the overhead of a new WebSocket handshake (~200-300ms). After calling
860
+ * this, optionally call {@link updateConfig} to change voice/model settings,
861
+ * then call {@link send} to start the next session.
862
+ *
863
+ * The returned promise resolves once the server confirms with a
864
+ * `session_closed` message, or after a 15 s **quiet** timeout — i.e. 15 s
865
+ * elapse without *any* server message arriving. The timer resets on every
866
+ * incoming frame so a long final flush that streams audio for tens of
867
+ * seconds is not truncated; only a genuinely silent server trips the fuse.
868
+ */
869
+ endSession(): Promise<void>;
870
+ /**
871
+ * Update session configuration for the next session.
872
+ *
873
+ * Call this after {@link endSession} and before the next {@link send}
874
+ * to change voice, model, language, or other settings.
875
+ */
876
+ updateConfig(config: Partial<StreamConfig>): void;
877
+ /**
878
+ * Close the session and the WebSocket connection.
879
+ *
880
+ * For session reuse without closing the connection, use
881
+ * {@link endSession} instead.
882
+ *
883
+ * The returned promise resolves once the server confirms the close with a
884
+ * `session_closed` message, or after a 15 s **quiet** timeout (no traffic
885
+ * from the server in that window). Audio frames from the server-side
886
+ * final-flush of the still-buffered text are delivered to your callbacks
887
+ * before this promise resolves, and each frame resets the quiet timer.
888
+ */
889
+ close(): Promise<void>;
890
+ /** Whether the underlying WebSocket is open. */
891
+ get isConnected(): boolean;
892
+ }
481
893
  /**
482
894
  * KugelAudio API client.
483
895
  *
@@ -491,13 +903,13 @@ declare class MultiContextSession {
491
903
  * // List voices
492
904
  * const { voices } = await client.voices.list();
493
905
  *
494
- * // Generate audio with fast model (1.5B params)
906
+ * // Generate audio with fast model
495
907
  * const audio = await client.tts.generate({
496
908
  * text: 'Hello, world!',
497
909
  * modelId: 'kugel-1-turbo',
498
910
  * });
499
911
  *
500
- * // Generate audio with premium model (7B params)
912
+ * // Generate audio with premium model
501
913
  * const audio = await client.tts.generate({
502
914
  * text: 'Hello, world!',
503
915
  * modelId: 'kugel-1',
@@ -512,6 +924,7 @@ declare class KugelAudio {
512
924
  private _apiUrl;
513
925
  private _ttsUrl;
514
926
  private _timeout;
927
+ private _keepalivePingInterval;
515
928
  /** Models resource */
516
929
  readonly models: ModelsResource;
517
930
  /** Voices resource */
@@ -546,6 +959,8 @@ declare class KugelAudio {
546
959
  get orgId(): number | undefined;
547
960
  /** Get TTS URL */
548
961
  get ttsUrl(): string;
962
+ /** Get keepalive ping interval in milliseconds, or null if disabled. */
963
+ get keepalivePingInterval(): number | null;
549
964
  /**
550
965
  * Close the client and release resources.
551
966
  * This closes any pooled WebSocket connections.
@@ -578,48 +993,125 @@ declare class KugelAudio {
578
993
  * @internal
579
994
  */
580
995
  request<T>(method: string, path: string, body?: unknown): Promise<T>;
996
+ /**
997
+ * Make a multipart/form-data request (for file uploads).
998
+ * @internal Used by VoicesResource for reference file uploads.
999
+ */
1000
+ requestMultipart<T>(method: string, path: string, formData: FormData): Promise<T>;
581
1001
  }
582
1002
 
583
1003
  /**
584
1004
  * Custom errors for KugelAudio SDK.
1005
+ *
1006
+ * All SDK errors inherit from {@link KugelAudioError}. Specific subclasses
1007
+ * map to the server's `error_code` field (see the server-side `ErrorCode`
1008
+ * enum at `tts/src/serving/deployments/errors.py`) so callers can
1009
+ * `instanceof AuthenticationError` without matching on message text.
585
1010
  */
1011
+ declare const ErrorCodes: {
1012
+ readonly UNAUTHORIZED: "UNAUTHORIZED";
1013
+ readonly RATE_LIMITED: "RATE_LIMITED";
1014
+ readonly INSUFFICIENT_CREDITS: "INSUFFICIENT_CREDITS";
1015
+ readonly MODEL_UNAVAILABLE: "MODEL_UNAVAILABLE";
1016
+ readonly EMPTY_AUDIO: "EMPTY_AUDIO";
1017
+ readonly VALIDATION: "VALIDATION_ERROR";
1018
+ readonly INTERNAL: "INTERNAL_ERROR";
1019
+ readonly NOT_FOUND: "NOT_FOUND";
1020
+ };
1021
+ type ErrorCode = typeof ErrorCodes[keyof typeof ErrorCodes];
1022
+ declare const WsCloseCodes: {
1023
+ readonly UNAUTHORIZED: 4001;
1024
+ readonly INSUFFICIENT_CREDITS: 4003;
1025
+ readonly RATE_LIMITED: 4029;
1026
+ readonly MODEL_UNAVAILABLE: 4500;
1027
+ };
1028
+ interface KugelAudioErrorOptions {
1029
+ statusCode?: number;
1030
+ errorCode?: string;
1031
+ requestId?: string;
1032
+ retryAfter?: number;
1033
+ cause?: unknown;
1034
+ }
586
1035
  /**
587
1036
  * Base error class for KugelAudio SDK.
588
1037
  */
589
1038
  declare class KugelAudioError extends Error {
590
1039
  readonly statusCode?: number;
591
- constructor(message: string, statusCode?: number);
1040
+ readonly errorCode?: string;
1041
+ readonly requestId?: string;
1042
+ readonly retryAfter?: number;
1043
+ constructor(message: string, options?: KugelAudioErrorOptions);
592
1044
  }
593
1045
  /**
594
- * Thrown when authentication fails.
1046
+ * API key was missing, malformed, or rejected by the server.
595
1047
  */
596
1048
  declare class AuthenticationError extends KugelAudioError {
597
- constructor(message?: string);
1049
+ constructor(message?: string, options?: KugelAudioErrorOptions);
598
1050
  }
599
1051
  /**
600
- * Thrown when rate limit is exceeded.
1052
+ * Request was rejected by the per-org rate limiter.
601
1053
  */
602
1054
  declare class RateLimitError extends KugelAudioError {
603
- constructor(message?: string);
1055
+ constructor(message?: string, options?: KugelAudioErrorOptions);
604
1056
  }
605
1057
  /**
606
- * Thrown when user has insufficient credits.
1058
+ * Account is out of TTS credits.
607
1059
  */
608
1060
  declare class InsufficientCreditsError extends KugelAudioError {
609
- constructor(message?: string);
1061
+ constructor(message?: string, options?: KugelAudioErrorOptions);
610
1062
  }
611
1063
  /**
612
- * Thrown when request validation fails.
1064
+ * Request was rejected as invalid (bad params, missing fields, etc.).
613
1065
  */
614
1066
  declare class ValidationError extends KugelAudioError {
615
- constructor(message: string);
1067
+ constructor(message: string, options?: KugelAudioErrorOptions);
616
1068
  }
617
1069
  /**
618
- * Thrown when connection to server fails.
1070
+ * The SDK could not reach KugelAudio (network error, server down,
1071
+ * or model deployment temporarily unavailable).
619
1072
  */
620
1073
  declare class ConnectionError extends KugelAudioError {
621
- constructor(message?: string);
1074
+ constructor(message: string, options?: KugelAudioErrorOptions);
1075
+ }
1076
+ interface HttpResponseLike {
1077
+ status: number;
1078
+ headers: {
1079
+ get(name: string): string | null;
1080
+ } | Record<string, string | undefined>;
1081
+ text?: () => Promise<string>;
622
1082
  }
1083
+ /**
1084
+ * Build the appropriate `KugelAudioError` from an HTTP response body that
1085
+ * was already parsed. `bodyText` is the raw text fallback.
1086
+ */
1087
+ declare function classifyHttpError(status: number, bodyText: string, headers: HttpResponseLike['headers']): KugelAudioError;
1088
+ /**
1089
+ * Build a `KugelAudioError` from a server-sent WebSocket error frame
1090
+ * (`{error, error_code, retry_after}`).
1091
+ */
1092
+ declare function classifyWsFrame(data: {
1093
+ error?: string;
1094
+ error_code?: string;
1095
+ retry_after?: number;
1096
+ }): KugelAudioError;
1097
+ /**
1098
+ * Build a `KugelAudioError` from a WebSocket close code + reason.
1099
+ */
1100
+ declare function classifyWsClose(code: number | undefined, reason?: string): KugelAudioError;
1101
+ /**
1102
+ * Extract the HTTP status from a `ws` package handshake-rejection error and
1103
+ * return a typed `KugelAudioError`. Returns `null` if the error doesn't look
1104
+ * like a handshake rejection (e.g. pure network failure).
1105
+ *
1106
+ * The `ws` library surfaces rejected upgrades via:
1107
+ * - an Error whose `.message` is `"Unexpected server response: <status>"`
1108
+ * - `error.code === 'EUNEXPECTEDRESPONSE'`, with `error.statusCode` on some versions
1109
+ *
1110
+ * The TTS server rejects WS upgrades with a bare API key using HTTP 403
1111
+ * (not 401), so we treat 403 here as an auth failure — HTTP API callers
1112
+ * keep the generic 403 semantics via {@link classifyHttpError}.
1113
+ */
1114
+ declare function classifyWsHandshakeError(err: unknown): KugelAudioError | null;
623
1115
 
624
1116
  /**
625
1117
  * Utility functions for KugelAudio SDK.
@@ -641,4 +1133,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
641
1133
  */
642
1134
  declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
643
1135
 
644
- export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, type WordTimestamp, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
1136
+ export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type CreateVoiceOptions, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type Region, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16 };