kugelaudio 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -41,18 +41,118 @@ interface Voice {
41
41
  isPublic: boolean;
42
42
  verified: boolean;
43
43
  }
44
+ /**
45
+ * Paginated response from the voices list endpoint.
46
+ */
47
+ interface VoiceListResponse {
48
+ voices: Voice[];
49
+ total: number;
50
+ limit: number;
51
+ offset: number;
52
+ }
53
+ /**
54
+ * Voice quality levels.
55
+ */
56
+ type VoiceQuality = 'low' | 'mid' | 'high';
57
+ /**
58
+ * Extended voice information returned by voice management endpoints.
59
+ */
60
+ interface VoiceDetail {
61
+ id: number;
62
+ name: string;
63
+ description: string;
64
+ generativeVoiceDescription: string;
65
+ supportedLanguages: string[];
66
+ category: string;
67
+ age?: string;
68
+ sex?: string;
69
+ quality: string;
70
+ isPublic: boolean;
71
+ verified: boolean;
72
+ pendingVerification: boolean;
73
+ sampleUrl?: string;
74
+ avatarUrl?: string;
75
+ sampleText: string;
76
+ }
77
+ /**
78
+ * Voice reference audio metadata.
79
+ */
80
+ interface VoiceReference {
81
+ id: number;
82
+ voiceId: number;
83
+ name: string;
84
+ referenceText: string;
85
+ s3Path: string;
86
+ audioUrl?: string;
87
+ isGenerated: boolean;
88
+ }
89
+ /**
90
+ * Options for creating a new voice.
91
+ */
92
+ interface CreateVoiceOptions {
93
+ name: string;
94
+ sex: string;
95
+ description?: string;
96
+ category?: string;
97
+ age?: string;
98
+ quality?: string;
99
+ supportedLanguages?: string[];
100
+ isPublic?: boolean;
101
+ sampleText?: string;
102
+ /** Reference audio files (File objects in browser, Blob in Node.js — wrap a Buffer with `new Blob([buffer])`) */
103
+ referenceFiles?: Array<File | Blob>;
104
+ }
105
+ /**
106
+ * Options for updating an existing voice.
107
+ */
108
+ interface UpdateVoiceOptions {
109
+ name?: string;
110
+ description?: string;
111
+ category?: string;
112
+ age?: string;
113
+ sex?: string;
114
+ quality?: string;
115
+ supportedLanguages?: string[];
116
+ isPublic?: boolean;
117
+ sampleText?: string;
118
+ }
119
+ /**
120
+ * Word-level timestamp from server-side forced alignment.
121
+ */
122
+ interface WordTimestamp {
123
+ /** The aligned word */
124
+ word: string;
125
+ /** Start time in milliseconds (relative to chunk/audio start) */
126
+ startMs: number;
127
+ /** End time in milliseconds (relative to chunk/audio start) */
128
+ endMs: number;
129
+ /** Start character offset in the original text */
130
+ charStart: number;
131
+ /** End character offset in the original text */
132
+ charEnd: number;
133
+ /** Alignment confidence score (0.0 - 1.0) */
134
+ score: number;
135
+ }
44
136
  /**
45
137
  * TTS generation request options.
46
138
  */
47
139
  interface GenerateOptions {
48
140
  /** Text to synthesize */
49
141
  text: string;
50
- /** Model to use: 'kugel-1-turbo' (1.5B, fast) or 'kugel-1' (7B, premium). Default: 'kugel-1-turbo' */
142
+ /** Model to use: 'kugel-1-turbo' (fast) or 'kugel-1' (premium). Default: 'kugel-1-turbo' */
51
143
  modelId?: string;
52
144
  /** Voice ID to use */
53
145
  voiceId?: number;
54
146
  /** CFG scale for generation (default: 2.0) */
55
147
  cfgScale?: number;
148
+ /**
149
+ * Sampling variance. Range [0.0, 1.0]. 0 = most stable (near-greedy),
150
+ * 1 = most variance. Default: 0.5.
151
+ *
152
+ * Lower values produce more consistent reads across regenerations —
153
+ * useful for stable voiceovers, IVR prompts, and e-learning.
154
+ */
155
+ temperature?: number;
56
156
  /** Maximum tokens to generate (default: 2048) */
57
157
  maxNewTokens?: number;
58
158
  /** Output sample rate (default: 24000) */
@@ -72,18 +172,61 @@ interface GenerateOptions {
72
172
  * (adds ~150ms latency).
73
173
  *
74
174
  * Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
75
- * el, uk, bg, tr, vi, ar, hi, zh, ja, ko
175
+ * el, uk, bg, tr, vi, ar, hi, zh, ja, ko, sk, sl, hr, sr, ru,
176
+ * he, fa, ur, bn, ta, yue, th, id, ms
76
177
  */
77
178
  language?: string;
179
+ /**
180
+ * Request word-level timestamps alongside audio.
181
+ * When true, the server performs forced alignment and returns per-word timing boundaries.
182
+ * Default: false
183
+ */
184
+ wordTimestamps?: boolean;
185
+ /**
186
+ * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
187
+ *
188
+ * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
189
+ * can also be used for per-segment speed control.
190
+ * Range: [0.8, 1.2]. Default: 1.0.
191
+ */
192
+ speed?: number;
193
+ /**
194
+ * Optional project ID for project-scoped features (custom dictionary
195
+ * replacements, per-project rate limits). The caller MUST verify the
196
+ * authenticated user has access to this project before passing it; the
197
+ * server treats the value as trusted once received.
198
+ */
199
+ projectId?: number;
78
200
  }
79
201
  /**
80
- * Streaming session configuration.
202
+ * Streaming session configuration for `/ws/tts/stream`.
203
+ *
204
+ * The server accumulates LLM tokens internally and starts generation at natural
205
+ * sentence boundaries. Use {@link chunkLengthSchedule} to tune how eagerly the
206
+ * server begins generating, or set {@link autoMode} to start at the very first
207
+ * clean boundary — equivalent to ElevenLabs' `auto_mode=true`.
208
+ *
209
+ * @example Low-latency preset
210
+ * ```typescript
211
+ * const session = client.tts.streamingSession({
212
+ * voiceId: 123,
213
+ * autoMode: true,
214
+ * chunkLengthSchedule: [50, 100, 150, 250],
215
+ * });
216
+ * ```
81
217
  */
82
218
  interface StreamConfig {
83
219
  /** Voice ID to use */
84
220
  voiceId?: number;
221
+ /** Model ID ('kugel-1-turbo' or 'kugel-1'). Default: 'kugel-1-turbo' */
222
+ modelId?: string;
85
223
  /** CFG scale for generation */
86
224
  cfgScale?: number;
225
+ /**
226
+ * Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
227
+ * Default: 0.5.
228
+ */
229
+ temperature?: number;
87
230
  /** Maximum tokens per generation */
88
231
  maxNewTokens?: number;
89
232
  /** Output sample rate */
@@ -102,6 +245,68 @@ interface StreamConfig {
102
245
  * Specify to avoid ~150ms auto-detection latency.
103
246
  */
104
247
  language?: string;
248
+ /**
249
+ * Request word-level timestamps alongside audio.
250
+ * Default: false
251
+ */
252
+ wordTimestamps?: boolean;
253
+ /**
254
+ * Minimum buffer sizes (in characters) the server must accumulate before
255
+ * auto-emitting each successive chunk. Entry `i` applies to chunk `i`; the
256
+ * last value is reused for all subsequent chunks.
257
+ *
258
+ * Smaller values produce lower TTFA at the cost of less prosody context.
259
+ * Larger values improve naturalness but increase TTFA.
260
+ *
261
+ * @example
262
+ * ```typescript
263
+ * chunkLengthSchedule: [50, 100, 150, 250] // low-latency
264
+ * chunkLengthSchedule: [120, 200, 300] // high-quality prosody
265
+ * ```
266
+ */
267
+ chunkLengthSchedule?: number[];
268
+ /**
269
+ * When `true`, the server starts generating audio at the very first clean
270
+ * sentence boundary, regardless of `chunkLengthSchedule`. Equivalent to
271
+ * ElevenLabs' `auto_mode=true`. Prioritises low TTFA; may produce slightly
272
+ * less natural prosody on the first chunk.
273
+ */
274
+ autoMode?: boolean;
275
+ /**
276
+ * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
277
+ *
278
+ * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
279
+ * can also be used for per-segment speed control.
280
+ * Range: [0.8, 1.2]. Default: 1.0.
281
+ */
282
+ speed?: number;
283
+ }
284
+ /**
285
+ * Event callbacks for a streaming session (`/ws/tts/stream`).
286
+ *
287
+ * This is the LLM-integration endpoint: forward raw tokens via
288
+ * {@link StreamingSession.send} and the server auto-chunks them at sentence
289
+ * boundaries.
290
+ */
291
+ interface StreamingSessionCallbacks {
292
+ /** Called when an audio chunk arrives for any segment. */
293
+ onChunk?: (chunk: AudioChunk) => void;
294
+ /**
295
+ * Called when all audio for one flushed text segment is complete.
296
+ * Carries the segment index, total audio duration, and generation time.
297
+ */
298
+ onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
299
+ /**
300
+ * Called when the session is fully closed (after `session.close()`).
301
+ * Equivalent to `onFinal` on the one-shot endpoint.
302
+ */
303
+ onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
304
+ /** Called when the server begins generating audio for a text segment. */
305
+ onGenerationStarted?: (chunkId: number, text: string) => void;
306
+ /** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
307
+ onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
308
+ /** Called on any error. */
309
+ onError?: (error: Error) => void;
105
310
  }
106
311
  /**
107
312
  * Audio chunk from streaming TTS.
@@ -132,8 +337,6 @@ interface GenerationStats {
132
337
  durationMs: number;
133
338
  /** Generation time in milliseconds */
134
339
  generationMs: number;
135
- /** Time to first audio in milliseconds */
136
- ttfaMs: number | null;
137
340
  /** Real-time factor */
138
341
  rtf: number;
139
342
  /** Error message if any */
@@ -155,6 +358,8 @@ interface AudioResponse {
155
358
  generationMs: number;
156
359
  /** Real-time factor */
157
360
  rtf: number;
361
+ /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
362
+ wordTimestamps: WordTimestamp[];
158
363
  }
159
364
  /**
160
365
  * Event callbacks for streaming.
@@ -162,6 +367,8 @@ interface AudioResponse {
162
367
  interface StreamCallbacks {
163
368
  /** Called when an audio chunk is received */
164
369
  onChunk?: (chunk: AudioChunk) => void;
370
+ /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
371
+ onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
165
372
  /** Called when generation is complete */
166
373
  onFinal?: (stats: GenerationStats) => void;
167
374
  /** Called on error */
@@ -171,11 +378,18 @@ interface StreamCallbacks {
171
378
  /** Called when connection closes */
172
379
  onClose?: () => void;
173
380
  }
381
+ /**
382
+ * Deployment region. Controls which API endpoint the SDK connects to.
383
+ * - `'eu'` — `api.kugelaudio.com` (default)
384
+ * - `'us'` — `us-api.kugelaudio.com`
385
+ * - `'global'` — `global-api.kugelaudio.com` (geo-routed)
386
+ */
387
+ type Region = 'eu' | 'us' | 'global';
174
388
  /**
175
389
  * KugelAudio client options.
176
390
  */
177
391
  interface KugelAudioOptions {
178
- /** Your KugelAudio API key or JWT token */
392
+ /** Your KugelAudio API key or JWT token. Can be prefixed with `eu-`, `us-`, or `global-` to select a region (prefix is stripped before auth). */
179
393
  apiKey: string;
180
394
  /** Whether apiKey is a master key (for internal/server-side use). Master keys bypass billing. */
181
395
  isMasterKey?: boolean;
@@ -183,12 +397,20 @@ interface KugelAudioOptions {
183
397
  isToken?: boolean;
184
398
  /** Organisation ID to bill usage against (required for token auth to enable usage recording). */
185
399
  orgId?: number;
400
+ /** Deployment region. Takes precedence over API-key prefix but not over `apiUrl`. */
401
+ region?: Region;
186
402
  /** API base URL (default: https://api.kugelaudio.com) */
187
403
  apiUrl?: string;
188
404
  /** TTS server URL (default: same as apiUrl) */
189
405
  ttsUrl?: string;
190
406
  /** Request timeout in milliseconds (default: 60000) */
191
407
  timeout?: number;
408
+ /**
409
+ * Interval in milliseconds between WebSocket ping frames sent on the pooled connection
410
+ * to prevent idle timeouts (default: 20000). Set to 0 or null to disable.
411
+ * Pings are only sent when the connection uses the `ws` package (Node.js); they are skipped in native WebSocket environments such as browsers.
412
+ */
413
+ keepalivePingInterval?: number | null;
192
414
  }
193
415
  /**
194
416
  * Multi-context session configuration.
@@ -200,10 +422,21 @@ interface MultiContextConfig {
200
422
  sampleRate?: number;
201
423
  /** CFG scale for generation (default: 2.0) */
202
424
  cfgScale?: number;
425
+ /**
426
+ * Sampling variance. Range [0.0, 1.0]. 0 = most stable, 1 = most variance.
427
+ * Default: 0.5.
428
+ */
429
+ temperature?: number;
203
430
  /** Maximum tokens to generate (default: 2048) */
204
431
  maxNewTokens?: number;
205
432
  /** Enable text normalization (default: true) */
206
433
  normalize?: boolean;
434
+ /**
435
+ * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
436
+ * If not set and normalize is true (default), the server auto-detects
437
+ * the language, which adds ~60-150ms to time-to-first-audio.
438
+ */
439
+ language?: string;
207
440
  /** Seconds before context auto-closes (default: 20.0) */
208
441
  inactivityTimeout?: number;
209
442
  }
@@ -239,8 +472,6 @@ interface MultiContextCallbacks {
239
472
  onContextCreated?: (contextId: string) => void;
240
473
  /** Called when an audio chunk is received */
241
474
  onChunk?: (chunk: MultiContextAudioChunk) => void;
242
- /** Called when a context finishes generating */
243
- onContextFinal?: (contextId: string) => void;
244
475
  /** Called when a context is closed */
245
476
  onContextClosed?: (contextId: string) => void;
246
477
  /** Called when a context times out */
@@ -275,11 +506,51 @@ declare class VoicesResource {
275
506
  language?: string;
276
507
  includePublic?: boolean;
277
508
  limit?: number;
278
- }): Promise<Voice[]>;
509
+ offset?: number;
510
+ }): Promise<VoiceListResponse>;
279
511
  /**
280
512
  * Get a specific voice by ID.
281
513
  */
282
- get(voiceId: number): Promise<Voice>;
514
+ get(voiceId: number): Promise<VoiceDetail>;
515
+ /**
516
+ * Create a new voice.
517
+ */
518
+ create(options: CreateVoiceOptions): Promise<VoiceDetail>;
519
+ /**
520
+ * Update an existing voice. Only provided fields are updated.
521
+ */
522
+ update(voiceId: number, options: UpdateVoiceOptions): Promise<VoiceDetail>;
523
+ /**
524
+ * Delete a voice.
525
+ */
526
+ delete(voiceId: number): Promise<void>;
527
+ /**
528
+ * List reference audio files for a voice.
529
+ */
530
+ listReferences(voiceId: number): Promise<VoiceReference[]>;
531
+ /**
532
+ * Upload a reference audio file to a voice.
533
+ *
534
+ * @param voiceId - Voice ID
535
+ * @param file - Audio file (File in browser, Blob in Node.js)
536
+ * @param referenceText - Optional transcript of the reference audio
537
+ */
538
+ addReference(voiceId: number, file: File | Blob, referenceText?: string): Promise<VoiceReference>;
539
+ /**
540
+ * Delete a reference audio file from a voice.
541
+ */
542
+ deleteReference(voiceId: number, referenceId: number): Promise<void>;
543
+ /**
544
+ * Request publication of a voice. Sets it as public and marks it
545
+ * as pending verification by an admin.
546
+ */
547
+ publish(voiceId: number): Promise<VoiceDetail>;
548
+ /**
549
+ * Trigger sample audio generation for a voice.
550
+ */
551
+ generateSample(voiceId: number): Promise<VoiceDetail>;
552
+ private mapVoiceDetail;
553
+ private mapVoiceReference;
283
554
  }
284
555
  /**
285
556
  * TTS resource for text-to-speech generation.
@@ -290,6 +561,7 @@ declare class TTSResource {
290
561
  private wsUrl;
291
562
  private pendingRequests;
292
563
  private requestCounter;
564
+ private keepaliveTimer;
293
565
  constructor(client: KugelAudio);
294
566
  /**
295
567
  * Pre-establish WebSocket connection for faster first request.
@@ -318,6 +590,40 @@ declare class TTSResource {
318
590
  * Returns complete audio after all chunks are received.
319
591
  */
320
592
  generate(options: GenerateOptions): Promise<AudioResponse>;
593
+ /**
594
+ * Stream audio and return a Node.js Readable stream of raw PCM16 binary data.
595
+ *
596
+ * **Node.js only** — this method requires the `stream` built-in module and is
597
+ * intended for server-side integrations such as Vapi custom TTS endpoints,
598
+ * Express/Fastify handlers, or any pipeline that expects a Node.js `Readable`.
599
+ *
600
+ * Compared to manually wiring `onChunk` to a `Readable`, this method avoids
601
+ * a common race-condition: the stream object is created and returned **before**
602
+ * any chunks arrive, so the caller can safely pipe or attach listeners before
603
+ * the first audio byte is pushed.
604
+ *
605
+ * @example Vapi custom TTS endpoint
606
+ * ```typescript
607
+ * app.post('/synthesize', (req, res) => {
608
+ * res.setHeader('Content-Type', 'audio/pcm');
609
+ * res.setHeader('Transfer-Encoding', 'chunked');
610
+ *
611
+ * const readable = client.tts.toReadable({
612
+ * text: req.body.message.text,
613
+ * modelId: 'kugel-1-turbo',
614
+ * sampleRate: req.body.message.sampleRate,
615
+ * language: 'en',
616
+ * });
617
+ *
618
+ * readable.pipe(res);
619
+ * });
620
+ * ```
621
+ *
622
+ * @param options - TTS generation options (same as `stream()`)
623
+ * @param reuseConnection - Reuse the pooled WebSocket connection (default: true)
624
+ * @returns Node.js Readable stream emitting raw PCM16 binary Buffer chunks
625
+ */
626
+ toReadable(options: GenerateOptions, reuseConnection?: boolean): any;
321
627
  /**
322
628
  * Build the WebSocket URL with appropriate auth param.
323
629
  */
@@ -348,11 +654,47 @@ declare class TTSResource {
348
654
  * Stream without connection pooling (original behavior).
349
655
  */
350
656
  private streamWithoutPooling;
657
+ /**
658
+ * Start periodic keepalive pings on the pooled connection.
659
+ * Uses the ws package's ping() in Node.js; silently skips in browsers
660
+ * where WebSocket doesn't expose a ping method.
661
+ */
662
+ private startKeepalive;
663
+ private stopKeepalive;
351
664
  /**
352
665
  * Close the pooled WebSocket connection.
353
666
  */
354
667
  close(): void;
355
668
  private parseError;
669
+ /**
670
+ * Create a streaming session for LLM integration.
671
+ *
672
+ * The session connects to `/ws/tts/stream` and keeps a persistent
673
+ * connection across multiple {@link StreamingSession.send} calls.
674
+ * The server auto-chunks text at sentence boundaries — no client-side
675
+ * flushing required.
676
+ *
677
+ * @param config - Session configuration (voice, model, chunking strategy).
678
+ * @param callbacks - Callbacks for audio chunks and session lifecycle events.
679
+ * @returns A {@link StreamingSession} instance. Call and await `.connect()` before sending.
680
+ *
681
+ * @example
682
+ * ```typescript
683
+ * const session = client.tts.streamingSession(
684
+ * { voiceId: 123, autoMode: true, chunkLengthSchedule: [50, 100, 150, 250] },
685
+ * { onChunk: (chunk) => playAudio(chunk.audio) },
686
+ * );
687
+ *
688
+ * await session.connect();
689
+ *
690
+ * for await (const token of llmStream) {
691
+ * session.send(token);
692
+ * }
693
+ *
694
+ * await session.close();
695
+ * ```
696
+ */
697
+ streamingSession(config: StreamConfig, callbacks: StreamingSessionCallbacks): StreamingSession;
356
698
  /**
357
699
  * Create a multi-context session for concurrent TTS streams.
358
700
  *
@@ -371,7 +713,7 @@ declare class TTSResource {
371
713
  * console.log(`Audio from ${chunk.contextId}`);
372
714
  * playAudio(chunk.audio);
373
715
  * },
374
- * onContextFinal: (contextId) => {
716
+ * onContextClosed: (contextId) => {
375
717
  * console.log(`${contextId} finished`);
376
718
  * },
377
719
  * });
@@ -408,8 +750,13 @@ declare class MultiContextSession {
408
750
  get sessionId(): string | null;
409
751
  /**
410
752
  * Connect to the multi-context WebSocket endpoint.
753
+ *
754
+ * The returned promise resolves once the WebSocket is OPEN so callers can
755
+ * ``await session.connect(callbacks)`` before invoking
756
+ * {@link createContext} / {@link send}. Pre-open errors reject with the
757
+ * typed error.
411
758
  */
412
- connect(callbacks: MultiContextCallbacks): void;
759
+ connect(callbacks: MultiContextCallbacks): Promise<void>;
413
760
  /**
414
761
  * Create a new context with optional voice settings.
415
762
  */
@@ -446,6 +793,103 @@ declare class MultiContextSession {
446
793
  */
447
794
  get isConnected(): boolean;
448
795
  }
796
+ /**
797
+ * Streaming session for LLM integration via `/ws/tts/stream`.
798
+ *
799
+ * The server accumulates text across multiple {@link send} calls and
800
+ * auto-chunks it at sentence boundaries, keeping the KV cache warm between
801
+ * chunks for natural prosody. You never need to call `flush` explicitly —
802
+ * configure {@link StreamConfig.chunkLengthSchedule} or
803
+ * {@link StreamConfig.autoMode} instead.
804
+ *
805
+ * @example
806
+ * ```typescript
807
+ * const session = client.tts.streamingSession({
808
+ * voiceId: 123,
809
+ * autoMode: true,
810
+ * chunkLengthSchedule: [50, 100, 150, 250],
811
+ * }, {
812
+ * onChunk: (chunk) => playAudio(chunk.audio),
813
+ * onSessionClosed: (totalSecs) => console.log(`Done: ${totalSecs}s`),
814
+ * });
815
+ *
816
+ * await session.connect();
817
+ *
818
+ * for await (const token of llmStream) {
819
+ * session.send(token);
820
+ * }
821
+ *
822
+ * await session.close();
823
+ * ```
824
+ */
825
+ declare class StreamingSession {
826
+ private ws;
827
+ private config;
828
+ private callbacks;
829
+ private client;
830
+ private configSent;
831
+ constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks);
832
+ /**
833
+ * Open the WebSocket connection and authenticate.
834
+ *
835
+ * The returned promise resolves once the WebSocket is OPEN, so callers can
836
+ * ``await session.connect()`` and then ``send()`` without racing the
837
+ * handshake. Pre-open errors (network failure, 4001 unauthorized, …) reject
838
+ * the promise with the typed error.
839
+ */
840
+ connect(): Promise<void>;
841
+ /**
842
+ * Send a text chunk to the server (e.g. one LLM output token).
843
+ *
844
+ * The server buffers text across multiple calls and starts generating at
845
+ * natural sentence boundaries automatically — no need to call `flush`.
846
+ *
847
+ * @param text - Raw text or LLM token to append to the server buffer.
848
+ * @param flush - Force immediate generation of whatever is buffered.
849
+ * **Avoid calling this per-sentence from the client.** Doing so bypasses
850
+ * the server's semantic chunking, incurs a fresh model prefill cost on
851
+ * every flush, and makes latency *worse*, not better. Let the server
852
+ * handle chunking via `chunkLengthSchedule` / `autoMode` instead.
853
+ */
854
+ send(text: string, flush?: boolean): void;
855
+ /**
856
+ * End the current session but keep the WebSocket connection open.
857
+ *
858
+ * This allows starting a new session on the same connection, avoiding
859
+ * the overhead of a new WebSocket handshake (~200-300ms). After calling
860
+ * this, optionally call {@link updateConfig} to change voice/model settings,
861
+ * then call {@link send} to start the next session.
862
+ *
863
+ * The returned promise resolves once the server confirms with a
864
+ * `session_closed` message, or after a 15 s **quiet** timeout — i.e. 15 s
865
+ * elapse without *any* server message arriving. The timer resets on every
866
+ * incoming frame so a long final flush that streams audio for tens of
867
+ * seconds is not truncated; only a genuinely silent server trips the fuse.
868
+ */
869
+ endSession(): Promise<void>;
870
+ /**
871
+ * Update session configuration for the next session.
872
+ *
873
+ * Call this after {@link endSession} and before the next {@link send}
874
+ * to change voice, model, language, or other settings.
875
+ */
876
+ updateConfig(config: Partial<StreamConfig>): void;
877
+ /**
878
+ * Close the session and the WebSocket connection.
879
+ *
880
+ * For session reuse without closing the connection, use
881
+ * {@link endSession} instead.
882
+ *
883
+ * The returned promise resolves once the server confirms the close with a
884
+ * `session_closed` message, or after a 15 s **quiet** timeout (no traffic
885
+ * from the server in that window). Audio frames from the server-side
886
+ * final-flush of the still-buffered text are delivered to your callbacks
887
+ * before this promise resolves, and each frame resets the quiet timer.
888
+ */
889
+ close(): Promise<void>;
890
+ /** Whether the underlying WebSocket is open. */
891
+ get isConnected(): boolean;
892
+ }
449
893
  /**
450
894
  * KugelAudio API client.
451
895
  *
@@ -459,13 +903,13 @@ declare class MultiContextSession {
459
903
  * // List voices
460
904
  * const voices = await client.voices.list();
461
905
  *
462
- * // Generate audio with fast model (1.5B params)
906
+ * // Generate audio with fast model
463
907
  * const audio = await client.tts.generate({
464
908
  * text: 'Hello, world!',
465
909
  * modelId: 'kugel-1-turbo',
466
910
  * });
467
911
  *
468
- * // Generate audio with premium model (7B params)
912
+ * // Generate audio with premium model
469
913
  * const audio = await client.tts.generate({
470
914
  * text: 'Hello, world!',
471
915
  * modelId: 'kugel-1',
@@ -480,6 +924,7 @@ declare class KugelAudio {
480
924
  private _apiUrl;
481
925
  private _ttsUrl;
482
926
  private _timeout;
927
+ private _keepalivePingInterval;
483
928
  /** Models resource */
484
929
  readonly models: ModelsResource;
485
930
  /** Voices resource */
@@ -514,6 +959,8 @@ declare class KugelAudio {
514
959
  get orgId(): number | undefined;
515
960
  /** Get TTS URL */
516
961
  get ttsUrl(): string;
962
+ /** Get keepalive ping interval in milliseconds, or null if disabled. */
963
+ get keepalivePingInterval(): number | null;
517
964
  /**
518
965
  * Close the client and release resources.
519
966
  * This closes any pooled WebSocket connections.
@@ -546,48 +993,125 @@ declare class KugelAudio {
546
993
  * @internal
547
994
  */
548
995
  request<T>(method: string, path: string, body?: unknown): Promise<T>;
996
+ /**
997
+ * Make a multipart/form-data request (for file uploads).
998
+ * @internal Used by VoicesResource for reference file uploads.
999
+ */
1000
+ requestMultipart<T>(method: string, path: string, formData: FormData): Promise<T>;
549
1001
  }
550
1002
 
551
1003
  /**
552
1004
  * Custom errors for KugelAudio SDK.
1005
+ *
1006
+ * All SDK errors inherit from {@link KugelAudioError}. Specific subclasses
1007
+ * map to the server's `error_code` field (see the server-side `ErrorCode`
1008
+ * enum at `tts/src/serving/deployments/errors.py`) so callers can
1009
+ * `instanceof AuthenticationError` without matching on message text.
553
1010
  */
1011
+ declare const ErrorCodes: {
1012
+ readonly UNAUTHORIZED: "UNAUTHORIZED";
1013
+ readonly RATE_LIMITED: "RATE_LIMITED";
1014
+ readonly INSUFFICIENT_CREDITS: "INSUFFICIENT_CREDITS";
1015
+ readonly MODEL_UNAVAILABLE: "MODEL_UNAVAILABLE";
1016
+ readonly EMPTY_AUDIO: "EMPTY_AUDIO";
1017
+ readonly VALIDATION: "VALIDATION_ERROR";
1018
+ readonly INTERNAL: "INTERNAL_ERROR";
1019
+ readonly NOT_FOUND: "NOT_FOUND";
1020
+ };
1021
+ type ErrorCode = typeof ErrorCodes[keyof typeof ErrorCodes];
1022
+ declare const WsCloseCodes: {
1023
+ readonly UNAUTHORIZED: 4001;
1024
+ readonly INSUFFICIENT_CREDITS: 4003;
1025
+ readonly RATE_LIMITED: 4029;
1026
+ readonly MODEL_UNAVAILABLE: 4500;
1027
+ };
1028
+ interface KugelAudioErrorOptions {
1029
+ statusCode?: number;
1030
+ errorCode?: string;
1031
+ requestId?: string;
1032
+ retryAfter?: number;
1033
+ cause?: unknown;
1034
+ }
554
1035
  /**
555
1036
  * Base error class for KugelAudio SDK.
556
1037
  */
557
1038
  declare class KugelAudioError extends Error {
558
1039
  readonly statusCode?: number;
559
- constructor(message: string, statusCode?: number);
1040
+ readonly errorCode?: string;
1041
+ readonly requestId?: string;
1042
+ readonly retryAfter?: number;
1043
+ constructor(message: string, options?: KugelAudioErrorOptions);
560
1044
  }
561
1045
  /**
562
- * Thrown when authentication fails.
1046
+ * API key was missing, malformed, or rejected by the server.
563
1047
  */
564
1048
  declare class AuthenticationError extends KugelAudioError {
565
- constructor(message?: string);
1049
+ constructor(message?: string, options?: KugelAudioErrorOptions);
566
1050
  }
567
1051
  /**
568
- * Thrown when rate limit is exceeded.
1052
+ * Request was rejected by the per-org rate limiter.
569
1053
  */
570
1054
  declare class RateLimitError extends KugelAudioError {
571
- constructor(message?: string);
1055
+ constructor(message?: string, options?: KugelAudioErrorOptions);
572
1056
  }
573
1057
  /**
574
- * Thrown when user has insufficient credits.
1058
+ * Account is out of TTS credits.
575
1059
  */
576
1060
  declare class InsufficientCreditsError extends KugelAudioError {
577
- constructor(message?: string);
1061
+ constructor(message?: string, options?: KugelAudioErrorOptions);
578
1062
  }
579
1063
  /**
580
- * Thrown when request validation fails.
1064
+ * Request was rejected as invalid (bad params, missing fields, etc.).
581
1065
  */
582
1066
  declare class ValidationError extends KugelAudioError {
583
- constructor(message: string);
1067
+ constructor(message: string, options?: KugelAudioErrorOptions);
584
1068
  }
585
1069
  /**
586
- * Thrown when connection to server fails.
1070
+ * The SDK could not reach KugelAudio (network error, server down,
1071
+ * or model deployment temporarily unavailable).
587
1072
  */
588
1073
  declare class ConnectionError extends KugelAudioError {
589
- constructor(message?: string);
1074
+ constructor(message: string, options?: KugelAudioErrorOptions);
1075
+ }
1076
+ interface HttpResponseLike {
1077
+ status: number;
1078
+ headers: {
1079
+ get(name: string): string | null;
1080
+ } | Record<string, string | undefined>;
1081
+ text?: () => Promise<string>;
590
1082
  }
1083
+ /**
1084
+ * Build the appropriate `KugelAudioError` from an HTTP response body that
1085
+ * was already parsed. `bodyText` is the raw text fallback.
1086
+ */
1087
+ declare function classifyHttpError(status: number, bodyText: string, headers: HttpResponseLike['headers']): KugelAudioError;
1088
+ /**
1089
+ * Build a `KugelAudioError` from a server-sent WebSocket error frame
1090
+ * (`{error, error_code, retry_after}`).
1091
+ */
1092
+ declare function classifyWsFrame(data: {
1093
+ error?: string;
1094
+ error_code?: string;
1095
+ retry_after?: number;
1096
+ }): KugelAudioError;
1097
+ /**
1098
+ * Build a `KugelAudioError` from a WebSocket close code + reason.
1099
+ */
1100
+ declare function classifyWsClose(code: number | undefined, reason?: string): KugelAudioError;
1101
+ /**
1102
+ * Extract the HTTP status from a `ws` package handshake-rejection error and
1103
+ * return a typed `KugelAudioError`. Returns `null` if the error doesn't look
1104
+ * like a handshake rejection (e.g. pure network failure).
1105
+ *
1106
+ * The `ws` library surfaces rejected upgrades via:
1107
+ * - an Error whose `.message` is `"Unexpected server response: <status>"`
1108
+ * - `error.code === 'EUNEXPECTEDRESPONSE'`, with `error.statusCode` on some versions
1109
+ *
1110
+ * The TTS server rejects WS upgrades with a bare API key using HTTP 403
1111
+ * (not 401), so we treat 403 here as an auth failure — HTTP API callers
1112
+ * keep the generic 403 semantics via {@link classifyHttpError}.
1113
+ */
1114
+ declare function classifyWsHandshakeError(err: unknown): KugelAudioError | null;
591
1115
 
592
1116
  /**
593
1117
  * Utility functions for KugelAudio SDK.
@@ -609,4 +1133,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
609
1133
  */
610
1134
  declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
611
1135
 
612
- export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
1136
+ export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type CreateVoiceOptions, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type Region, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16 };