kugelaudio 0.6.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,27 @@
1
+ ## [kugelaudio-v0.8.0](https://github.com/Kugelaudio/KugelAudio/compare/js-sdk-v0.7.0...js-sdk-v0.8.0) (2026-06-10)
2
+
3
+ ### Features
4
+
5
+ * **ingress,python-sdk,js-sdk,java-sdk:** per-session usage over WebSocket ([#1346](https://github.com/Kugelaudio/KugelAudio/issues/1346)) ([2881881](https://github.com/Kugelaudio/KugelAudio/commit/28818816dca9c8d222391691d70f458c0eb28ed8))
6
+ * **ingress,python-sdk,js-sdk:** streaming final end-of-audio frame ([#1362](https://github.com/Kugelaudio/KugelAudio/issues/1362)) ([3fa95d2](https://github.com/Kugelaudio/KugelAudio/commit/3fa95d2f8597e6c9ced0aaf8370682dbcb123c71))
7
+ * **ingress:** output_format token + server-side G.711 (KUG-1190) ([#1345](https://github.com/Kugelaudio/KugelAudio/issues/1345)) ([3723291](https://github.com/Kugelaudio/KugelAudio/commit/372329196c4c91aa41fe2111783872874b6e895b))
8
+ * per-request dictionary selection (KUG-1094) ([#1361](https://github.com/Kugelaudio/KugelAudio/issues/1361)) ([3c28968](https://github.com/Kugelaudio/KugelAudio/commit/3c28968d32018bf3cafe1d312f32831668ea96b8))
9
+
10
+ ### Bug Fixes
11
+
12
+ * **js-sdk,java-sdk,ingress:** multi-turn conversations work end-to-end + live SDK e2e bench in CI (KUG-1233) ([#1363](https://github.com/Kugelaudio/KugelAudio/issues/1363)) ([c0ed2a9](https://github.com/Kugelaudio/KugelAudio/commit/c0ed2a9cf41025bac5c7182c1a281eb600d8dd36))
13
+
14
+ ## [kugelaudio-v0.7.0](https://github.com/Kugelaudio/KugelAudio/compare/js-sdk-v0.6.1...js-sdk-v0.7.0) (2026-06-06)
15
+
16
+ ### Features
17
+
18
+ * **ingress:** add request observability metadata ([#1321](https://github.com/Kugelaudio/KugelAudio/issues/1321)) ([a9c5178](https://github.com/Kugelaudio/KugelAudio/commit/a9c5178193cb8b746a8bbd9b566b11f7b1d00f6d))
19
+ * **sdks:** default all SDKs to kugel-3 model ([#1323](https://github.com/Kugelaudio/KugelAudio/issues/1323)) ([c4de212](https://github.com/Kugelaudio/KugelAudio/commit/c4de212c91e16326a15dbee5622acacc83ed85bb))
20
+
21
+ ### Bug Fixes
22
+
23
+ * **js-sdk:** type SDK metadata fetch mock ([#1334](https://github.com/Kugelaudio/KugelAudio/issues/1334)) ([e8f6f59](https://github.com/Kugelaudio/KugelAudio/commit/e8f6f59595e123eaae8b44670c94fb4e7bc8d06c))
24
+
1
25
  ## [kugelaudio-v0.6.1](https://github.com/Kugelaudio/KugelAudio/compare/js-sdk-v0.6.0...js-sdk-v0.6.1) (2026-06-04)
2
26
 
3
27
  ### Bug Fixes
package/README.md CHANGED
@@ -173,6 +173,7 @@ const audio = await client.tts.generate({
173
173
  cfgScale: 2.0, // Guidance scale (1.0-5.0)
174
174
  maxNewTokens: 2048, // Maximum tokens to generate
175
175
  sampleRate: 24000, // Output sample rate
176
+ outputFormat: undefined, // Optional: 'pcm_24000', 'ulaw_8000', 'alaw_8000', ...
176
177
  normalize: true, // Enable text normalization (see below)
177
178
  language: 'en', // Language for normalization
178
179
  });
@@ -298,12 +299,19 @@ for await (const token of llmTokenStream) {
298
299
  // Triggers the server-side final flush of any trailing text,
299
300
  // streams the resulting audio through onChunk, then closes the WS.
300
301
  await session.close();
302
+
303
+ // Per-session usage — bill your own customers per conversation.
304
+ // `costCents` is the actual charge in EUR cents (null if undetermined).
305
+ const usage = session.lastUsage;
306
+ if (usage) {
307
+ console.log(`audio: ${usage.audioSeconds}s, cost: ${usage.costCents ?? 'n/a'} ct`);
308
+ }
301
309
  ```
302
310
 
303
311
  > ⚠️ **Do not call `session.send(text, true)` (`flush=true`) between
304
312
  > sentences or words.** Each explicit flush is a separate TTS request
305
313
  > that pays the full model time-to-first-audio (TTFA) again and produces
306
- > an audible gap. See [Streaming best practices](https://docs.kugelaudio.com/streaming-best-practices)
314
+ > an audible gap. See [Chunking & per-segment latency](https://docs.kugelaudio.com/streaming/chunking-and-latency)
307
315
  > for the full rationale, chunk-size ordering, and ElevenLabs migration
308
316
  > notes.
309
317
 
@@ -413,6 +421,7 @@ interface GenerateOptions {
413
421
  cfgScale?: number; // Default: 2.0
414
422
  maxNewTokens?: number; // Default: 2048
415
423
  sampleRate?: number; // Default: 24000
424
+ outputFormat?: string; // 'pcm_8000' | 'pcm_16000' | 'pcm_22050' | 'pcm_24000' | 'ulaw_8000' | 'alaw_8000'
416
425
  normalize?: boolean; // Default: true - Enable text normalization
417
426
  language?: string; // ISO 639-1 code for normalization (e.g., 'en', 'de')
418
427
  }
@@ -425,7 +434,7 @@ interface GenerateOptions {
425
434
  ```typescript
426
435
  interface AudioChunk {
427
436
  audio: string; // Base64-encoded PCM16 audio
428
- encoding: string; // 'pcm_s16le'
437
+ encoding: string; // 'pcm_s16le' | 'mulaw' | 'alaw' (G.711 when outputFormat set)
429
438
  index: number; // Chunk index (0-based)
430
439
  sampleRate: number; // Sample rate (24000)
431
440
  samples: number; // Number of samples in chunk
package/dist/index.d.mts CHANGED
@@ -217,7 +217,7 @@ interface WordTimestamp {
217
217
  interface GenerateOptions {
218
218
  /** Text to synthesize */
219
219
  text: string;
220
- /** Model to use: 'kugel-1-turbo' (fast) or 'kugel-1' (premium). Default: 'kugel-1-turbo' */
220
+ /** Model to use. Default: 'kugel-3'. Legacy ids (e.g. kugel-2.5, kugel-1-turbo) are still accepted; they alias to kugel-3 server-side. */
221
221
  modelId?: string;
222
222
  /** Voice ID to use */
223
223
  voiceId?: number;
@@ -235,6 +235,12 @@ interface GenerateOptions {
235
235
  maxNewTokens?: number;
236
236
  /** Output sample rate (default: 24000) */
237
237
  sampleRate?: number;
238
+ /**
239
+ * Combined codec+rate token, e.g. 'ulaw_8000' / 'alaw_8000' / 'pcm_8000'.
240
+ * Opt-in; when set it is authoritative and must not contradict sampleRate.
241
+ * Absent ⇒ legacy PCM16 at sampleRate.
242
+ */
243
+ outputFormat?: string;
238
244
  /**
239
245
  * Enable text normalization (converts numbers, dates, etc. to spoken words).
240
246
  * When true, text will be normalized before TTS generation.
@@ -263,8 +269,8 @@ interface GenerateOptions {
263
269
  /**
264
270
  * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
265
271
  *
266
- * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
267
- * can also be used for per-segment speed control.
272
+ * Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
273
+ * whole request (no per-span control).
268
274
  * Range: [0.8, 1.2]. Default: 1.0.
269
275
  */
270
276
  speed?: number;
@@ -275,6 +281,14 @@ interface GenerateOptions {
275
281
  * server treats the value as trusted once received.
276
282
  */
277
283
  projectId?: number;
284
+ /**
285
+ * Per-request dictionary selection. Omit for the default behavior (all
286
+ * active dictionaries of the project apply, filtered by language). An
287
+ * empty array disables dictionaries for this request. A list of
288
+ * dictionary IDs applies exactly those dictionaries — including
289
+ * inactive ones — bypassing the language filter.
290
+ */
291
+ dictionaryIds?: number[];
278
292
  }
279
293
  /**
280
294
  * Streaming session configuration for `/ws/tts/stream`.
@@ -296,7 +310,7 @@ interface GenerateOptions {
296
310
  interface StreamConfig {
297
311
  /** Voice ID to use */
298
312
  voiceId?: number;
299
- /** Model ID ('kugel-1-turbo' or 'kugel-1'). Default: 'kugel-1-turbo' */
313
+ /** Model ID. Default: 'kugel-3'. Legacy ids still accepted; they alias to kugel-3 server-side. */
300
314
  modelId?: string;
301
315
  /** CFG scale for generation */
302
316
  cfgScale?: number;
@@ -309,6 +323,8 @@ interface StreamConfig {
309
323
  maxNewTokens?: number;
310
324
  /** Output sample rate */
311
325
  sampleRate?: number;
326
+ /** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per session. */
327
+ outputFormat?: string;
312
328
  /** Auto-flush timeout in milliseconds */
313
329
  flushTimeoutMs?: number;
314
330
  /** Maximum buffer length */
@@ -353,11 +369,19 @@ interface StreamConfig {
353
369
  /**
354
370
  * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
355
371
  *
356
- * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
357
- * can also be used for per-segment speed control.
372
+ * Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
373
+ * whole request (no per-span control).
358
374
  * Range: [0.8, 1.2]. Default: 1.0.
359
375
  */
360
376
  speed?: number;
377
+ /**
378
+ * Per-request dictionary selection. Omit for the default behavior (all
379
+ * active dictionaries of the project apply, filtered by language). An
380
+ * empty array disables dictionaries for this request. A list of
381
+ * dictionary IDs applies exactly those dictionaries — including
382
+ * inactive ones — bypassing the language filter.
383
+ */
384
+ dictionaryIds?: number[];
361
385
  }
362
386
  /**
363
387
  * Event callbacks for a streaming session (`/ws/tts/stream`).
@@ -374,9 +398,18 @@ interface StreamingSessionCallbacks {
374
398
  * Carries the segment index, total audio duration, and generation time.
375
399
  */
376
400
  onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
401
+ /**
402
+ * Called when the server marks the end of a turn's audio
403
+ * (`{"final": true, ...}` — sent after the last audio frame of every
404
+ * gracefully completed turn, right before `session_closed`). The
405
+ * ElevenLabs `isFinal` equivalent: once this fires, no further audio
406
+ * for the turn will arrive. Not fired on a barge-in cancel — that
407
+ * path fires {@link onInterrupted} instead.
408
+ */
409
+ onFinal?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
377
410
  /**
378
411
  * Called when the session is fully closed (after `session.close()`).
379
- * Equivalent to `onFinal` on the one-shot endpoint.
412
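+ * Fires right after {@link onFinal}. The callback receives only the totals; the per-session usage reported in the `session_closed` frame is read via `StreamingSession.lastUsage`.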
380
413
  */
381
414
  onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
382
415
  /** Called when the server begins generating audio for a text segment. */
@@ -393,14 +426,45 @@ interface StreamingSessionCallbacks {
393
426
  /** Called on any error. */
394
427
  onError?: (error: Error) => void;
395
428
  }
429
+ /**
430
+ * Per-session usage reported in the `session_closed` frame (KUG-1192).
431
+ *
432
+ * Lets you bill your own customers per conversation. `costCents` is the
433
+ * actual amount charged in **EUR cents**. When the charge could not be
434
+ * determined at session end (e.g. a transient billing error) `costCents` is
435
+ * `null` and `costAvailable` is `false` — never a misleading `0`.
436
+ * `audioSeconds` is always reported. On `/ws/tts/multi` usage is reported per
437
+ * context (per conversation) on each `context_closed` frame, not aggregated
438
+ * across contexts.
439
+ */
440
+ interface SessionUsage {
441
+ /** Total audio generated this session, in seconds (the unit we bill on). */
442
+ audioSeconds: number;
443
+ /** Actual amount charged in EUR cents, or `null` if undetermined. */
444
+ costCents: number | null;
445
+ /** Currency of `costCents` (`"eur"`); present only when `costCents` is set. */
446
+ currency?: string;
447
+ /** Total input characters submitted this session, if reported. */
448
+ characters?: number;
449
+ /** Model that produced the audio, if reported. */
450
+ modelId?: string;
451
+ /** `true` when an authoritative charge was returned for this session. */
452
+ costAvailable: boolean;
453
+ }
454
+ /**
455
+ * Parse the raw `usage` object (or a legacy `session_closed` payload without
456
+ * one) into a typed {@link SessionUsage}. Returns `null` when no usage info
457
+ * is present.
458
+ */
459
+ declare function parseSessionUsage(data: Record<string, unknown>): SessionUsage | null;
396
460
  /**
397
461
  * Audio chunk from streaming TTS.
398
462
  */
399
463
  interface AudioChunk {
400
464
  /** Raw PCM16 audio as base64 */
401
465
  audio: string;
402
- /** Encoding format */
403
- encoding: 'pcm_s16le';
466
+ /** Encoding format. 'mulaw' / 'alaw' only when output_format requested G.711. */
467
+ encoding: 'pcm_s16le' | 'mulaw' | 'alaw';
404
468
  /** Chunk index */
405
469
  index: number;
406
470
  /** Sample rate */
@@ -426,6 +490,12 @@ interface GenerationStats {
426
490
  rtf: number;
427
491
  /** Error message if any */
428
492
  error?: string;
493
+ /**
494
+ * Per-request usage (audio time + amount charged), for billing your own
495
+ * customers. Undefined when the server reports no usage. See
496
+ * {@link SessionUsage}.
497
+ */
498
+ usage?: SessionUsage;
429
499
  }
430
500
  /**
431
501
  * Complete audio response from TTS generation.
@@ -505,6 +575,8 @@ interface MultiContextConfig {
505
575
  defaultVoiceId?: number;
506
576
  /** Output sample rate (default: 24000) */
507
577
  sampleRate?: number;
578
+ /** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per context. */
579
+ outputFormat?: string;
508
580
  /** CFG scale for generation (default: 2.0) */
509
581
  cfgScale?: number;
510
582
  /**
@@ -522,6 +594,14 @@ interface MultiContextConfig {
522
594
  * the language, which adds ~60-150ms to time-to-first-audio.
523
595
  */
524
596
  language?: string;
597
+ /**
598
+ * Per-request dictionary selection. Omit for the default behavior (all
599
+ * active dictionaries of the project apply, filtered by language). An
600
+ * empty array disables dictionaries for this request. A list of
601
+ * dictionary IDs applies exactly those dictionaries — including
602
+ * inactive ones — bypassing the language filter.
603
+ */
604
+ dictionaryIds?: number[];
525
605
  /** Seconds before context auto-closes (default: 20.0) */
526
606
  inactivityTimeout?: number;
527
607
  }
@@ -557,8 +637,20 @@ interface MultiContextCallbacks {
557
637
  onContextCreated?: (contextId: string) => void;
558
638
  /** Called when an audio chunk is received */
559
639
  onChunk?: (chunk: MultiContextAudioChunk) => void;
560
- /** Called when a context is closed */
561
- onContextClosed?: (contextId: string) => void;
640
+ /**
641
+ * Called when all audio admitted before a `{flush: true}` has been
642
+ * delivered for a context (`{"final": true, "context_id": ...}`), and
643
+ * once more before {@link onContextClosed} on a graceful close. The
644
+ * ElevenLabs multi-context `is_final` equivalent. Not fired on an
645
+ * immediate (barge-in) close.
646
+ */
647
+ onFinal?: (contextId: string) => void;
648
+ /**
649
+ * Called when a context is closed (terminal). `usage` carries this
650
+ * conversation's audio time + amount charged (undefined if not reported).
651
+ * See {@link SessionUsage}.
652
+ */
653
+ onContextClosed?: (contextId: string, usage?: SessionUsage) => void;
562
654
  /** Called when a context times out */
563
655
  onContextTimeout?: (contextId: string) => void;
564
656
  /** Called when session is closed */
@@ -912,13 +1004,25 @@ declare class MultiContextSession {
912
1004
  private config;
913
1005
  private callbacks;
914
1006
  private contexts;
1007
+ /** Contexts a create message has been sent for (not yet necessarily
1008
+ * confirmed by the server via context_created). */
1009
+ private requestedContexts;
915
1010
  private _sessionId;
1011
+ private _contextUsage;
916
1012
  private isStarted;
917
1013
  constructor(client: KugelAudio, config?: MultiContextConfig);
918
1014
  /**
919
1015
  * Get the current session ID, or null if not connected.
920
1016
  */
921
1017
  get sessionId(): string | null;
1018
+ /**
1019
+ * Per-context usage (audio time + amount charged) for a closed context, or
1020
+ * null if that context hasn't closed yet. Each context is its own
1021
+ * conversation — use this to bill per conversation. See {@link SessionUsage}.
1022
+ */
1023
+ usageFor(contextId: string): SessionUsage | null;
1024
+ /** Map of context_id → per-context usage for all closed contexts. */
1025
+ get contextUsage(): Map<string, SessionUsage>;
922
1026
  /**
923
1027
  * Connect to the multi-context WebSocket endpoint.
924
1028
  *
@@ -1005,7 +1109,14 @@ declare class StreamingSession {
1005
1109
  private callbacks;
1006
1110
  private client;
1007
1111
  private configSent;
1112
+ private _lastUsage;
1008
1113
  constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks);
1114
+ /**
1115
+ * Per-session usage from the most recently closed session, or null before
1116
+ * the first session closes. Use this to bill your own customers per
1117
+ * conversation. See {@link SessionUsage}.
1118
+ */
1119
+ get lastUsage(): SessionUsage | null;
1009
1120
  /**
1010
1121
  * Open the WebSocket connection and authenticate.
1011
1122
  *
@@ -1354,4 +1465,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
1354
1465
  */
1355
1466
  declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
1356
1467
 
1357
- export { type AudioChunk, type AudioResponse, AuthenticationError, type BulkReplaceResult, ConnectionError, type ContextVoiceSettings, type CreateDictionaryOptions, type CreateVoiceOptions, DictionariesResource, type Dictionary, DictionaryEntriesResource, type DictionaryEntry, type DictionaryEntryInput, type DictionaryEntryListResponse, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, NotFoundError, RateLimitError, type Region, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateDictionaryEntryOptions, type UpdateDictionaryOptions, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16 };
1468
+ export { type AudioChunk, type AudioResponse, AuthenticationError, type BulkReplaceResult, ConnectionError, type ContextVoiceSettings, type CreateDictionaryOptions, type CreateVoiceOptions, DictionariesResource, type Dictionary, DictionaryEntriesResource, type DictionaryEntry, type DictionaryEntryInput, type DictionaryEntryListResponse, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, NotFoundError, RateLimitError, type Region, type SessionUsage, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateDictionaryEntryOptions, type UpdateDictionaryOptions, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16, parseSessionUsage };
package/dist/index.d.ts CHANGED
@@ -217,7 +217,7 @@ interface WordTimestamp {
217
217
  interface GenerateOptions {
218
218
  /** Text to synthesize */
219
219
  text: string;
220
- /** Model to use: 'kugel-1-turbo' (fast) or 'kugel-1' (premium). Default: 'kugel-1-turbo' */
220
+ /** Model to use. Default: 'kugel-3'. Legacy ids (e.g. kugel-2.5, kugel-1-turbo) are still accepted; they alias to kugel-3 server-side. */
221
221
  modelId?: string;
222
222
  /** Voice ID to use */
223
223
  voiceId?: number;
@@ -235,6 +235,12 @@ interface GenerateOptions {
235
235
  maxNewTokens?: number;
236
236
  /** Output sample rate (default: 24000) */
237
237
  sampleRate?: number;
238
+ /**
239
+ * Combined codec+rate token, e.g. 'ulaw_8000' / 'alaw_8000' / 'pcm_8000'.
240
+ * Opt-in; when set it is authoritative and must not contradict sampleRate.
241
+ * Absent ⇒ legacy PCM16 at sampleRate.
242
+ */
243
+ outputFormat?: string;
238
244
  /**
239
245
  * Enable text normalization (converts numbers, dates, etc. to spoken words).
240
246
  * When true, text will be normalized before TTS generation.
@@ -263,8 +269,8 @@ interface GenerateOptions {
263
269
  /**
264
270
  * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
265
271
  *
266
- * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
267
- * can also be used for per-segment speed control.
272
+ * Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
273
+ * whole request (no per-span control).
268
274
  * Range: [0.8, 1.2]. Default: 1.0.
269
275
  */
270
276
  speed?: number;
@@ -275,6 +281,14 @@ interface GenerateOptions {
275
281
  * server treats the value as trusted once received.
276
282
  */
277
283
  projectId?: number;
284
+ /**
285
+ * Per-request dictionary selection. Omit for the default behavior (all
286
+ * active dictionaries of the project apply, filtered by language). An
287
+ * empty array disables dictionaries for this request. A list of
288
+ * dictionary IDs applies exactly those dictionaries — including
289
+ * inactive ones — bypassing the language filter.
290
+ */
291
+ dictionaryIds?: number[];
278
292
  }
279
293
  /**
280
294
  * Streaming session configuration for `/ws/tts/stream`.
@@ -296,7 +310,7 @@ interface GenerateOptions {
296
310
  interface StreamConfig {
297
311
  /** Voice ID to use */
298
312
  voiceId?: number;
299
- /** Model ID ('kugel-1-turbo' or 'kugel-1'). Default: 'kugel-1-turbo' */
313
+ /** Model ID. Default: 'kugel-3'. Legacy ids still accepted; they alias to kugel-3 server-side. */
300
314
  modelId?: string;
301
315
  /** CFG scale for generation */
302
316
  cfgScale?: number;
@@ -309,6 +323,8 @@ interface StreamConfig {
309
323
  maxNewTokens?: number;
310
324
  /** Output sample rate */
311
325
  sampleRate?: number;
326
+ /** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per session. */
327
+ outputFormat?: string;
312
328
  /** Auto-flush timeout in milliseconds */
313
329
  flushTimeoutMs?: number;
314
330
  /** Maximum buffer length */
@@ -353,11 +369,19 @@ interface StreamConfig {
353
369
  /**
354
370
  * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
355
371
  *
356
- * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
357
- * can also be used for per-segment speed control.
372
+ * Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
373
+ * whole request (no per-span control).
358
374
  * Range: [0.8, 1.2]. Default: 1.0.
359
375
  */
360
376
  speed?: number;
377
+ /**
378
+ * Per-request dictionary selection. Omit for the default behavior (all
379
+ * active dictionaries of the project apply, filtered by language). An
380
+ * empty array disables dictionaries for this request. A list of
381
+ * dictionary IDs applies exactly those dictionaries — including
382
+ * inactive ones — bypassing the language filter.
383
+ */
384
+ dictionaryIds?: number[];
361
385
  }
362
386
  /**
363
387
  * Event callbacks for a streaming session (`/ws/tts/stream`).
@@ -374,9 +398,18 @@ interface StreamingSessionCallbacks {
374
398
  * Carries the segment index, total audio duration, and generation time.
375
399
  */
376
400
  onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
401
+ /**
402
+ * Called when the server marks the end of a turn's audio
403
+ * (`{"final": true, ...}` — sent after the last audio frame of every
404
+ * gracefully completed turn, right before `session_closed`). The
405
+ * ElevenLabs `isFinal` equivalent: once this fires, no further audio
406
+ * for the turn will arrive. Not fired on a barge-in cancel — that
407
+ * path fires {@link onInterrupted} instead.
408
+ */
409
+ onFinal?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
377
410
  /**
378
411
  * Called when the session is fully closed (after `session.close()`).
379
- * Equivalent to `onFinal` on the one-shot endpoint.
412
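+ * Fires right after {@link onFinal}. The callback receives only the totals; the per-session usage reported in the `session_closed` frame is read via `StreamingSession.lastUsage`.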
380
413
  */
381
414
  onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
382
415
  /** Called when the server begins generating audio for a text segment. */
@@ -393,14 +426,45 @@ interface StreamingSessionCallbacks {
393
426
  /** Called on any error. */
394
427
  onError?: (error: Error) => void;
395
428
  }
429
+ /**
430
+ * Per-session usage reported in the `session_closed` frame (KUG-1192).
431
+ *
432
+ * Lets you bill your own customers per conversation. `costCents` is the
433
+ * actual amount charged in **EUR cents**. When the charge could not be
434
+ * determined at session end (e.g. a transient billing error) `costCents` is
435
+ * `null` and `costAvailable` is `false` — never a misleading `0`.
436
+ * `audioSeconds` is always reported. On `/ws/tts/multi` usage is reported per
437
+ * context (per conversation) on each `context_closed` frame, not aggregated
438
+ * across contexts.
439
+ */
440
+ interface SessionUsage {
441
+ /** Total audio generated this session, in seconds (the unit we bill on). */
442
+ audioSeconds: number;
443
+ /** Actual amount charged in EUR cents, or `null` if undetermined. */
444
+ costCents: number | null;
445
+ /** Currency of `costCents` (`"eur"`); present only when `costCents` is set. */
446
+ currency?: string;
447
+ /** Total input characters submitted this session, if reported. */
448
+ characters?: number;
449
+ /** Model that produced the audio, if reported. */
450
+ modelId?: string;
451
+ /** `true` when an authoritative charge was returned for this session. */
452
+ costAvailable: boolean;
453
+ }
454
+ /**
455
+ * Parse the raw `usage` object (or a legacy `session_closed` payload without
456
+ * one) into a typed {@link SessionUsage}. Returns `null` when no usage info
457
+ * is present.
458
+ */
459
+ declare function parseSessionUsage(data: Record<string, unknown>): SessionUsage | null;
396
460
  /**
397
461
  * Audio chunk from streaming TTS.
398
462
  */
399
463
  interface AudioChunk {
400
464
  /** Raw PCM16 audio as base64 */
401
465
  audio: string;
402
- /** Encoding format */
403
- encoding: 'pcm_s16le';
466
+ /** Encoding format. 'mulaw' / 'alaw' only when output_format requested G.711. */
467
+ encoding: 'pcm_s16le' | 'mulaw' | 'alaw';
404
468
  /** Chunk index */
405
469
  index: number;
406
470
  /** Sample rate */
@@ -426,6 +490,12 @@ interface GenerationStats {
426
490
  rtf: number;
427
491
  /** Error message if any */
428
492
  error?: string;
493
+ /**
494
+ * Per-request usage (audio time + amount charged), for billing your own
495
+ * customers. Undefined when the server reports no usage. See
496
+ * {@link SessionUsage}.
497
+ */
498
+ usage?: SessionUsage;
429
499
  }
430
500
  /**
431
501
  * Complete audio response from TTS generation.
@@ -505,6 +575,8 @@ interface MultiContextConfig {
505
575
  defaultVoiceId?: number;
506
576
  /** Output sample rate (default: 24000) */
507
577
  sampleRate?: number;
578
+ /** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per context. */
579
+ outputFormat?: string;
508
580
  /** CFG scale for generation (default: 2.0) */
509
581
  cfgScale?: number;
510
582
  /**
@@ -522,6 +594,14 @@ interface MultiContextConfig {
522
594
  * the language, which adds ~60-150ms to time-to-first-audio.
523
595
  */
524
596
  language?: string;
597
+ /**
598
+ * Per-request dictionary selection. Omit for the default behavior (all
599
+ * active dictionaries of the project apply, filtered by language). An
600
+ * empty array disables dictionaries for this request. A list of
601
+ * dictionary IDs applies exactly those dictionaries — including
602
+ * inactive ones — bypassing the language filter.
603
+ */
604
+ dictionaryIds?: number[];
525
605
  /** Seconds before context auto-closes (default: 20.0) */
526
606
  inactivityTimeout?: number;
527
607
  }
@@ -557,8 +637,20 @@ interface MultiContextCallbacks {
557
637
  onContextCreated?: (contextId: string) => void;
558
638
  /** Called when an audio chunk is received */
559
639
  onChunk?: (chunk: MultiContextAudioChunk) => void;
560
- /** Called when a context is closed */
561
- onContextClosed?: (contextId: string) => void;
640
+ /**
641
+ * Called when all audio admitted before a `{flush: true}` has been
642
+ * delivered for a context (`{"final": true, "context_id": ...}`), and
643
+ * once more before {@link onContextClosed} on a graceful close. The
644
+ * ElevenLabs multi-context `is_final` equivalent. Not fired on an
645
+ * immediate (barge-in) close.
646
+ */
647
+ onFinal?: (contextId: string) => void;
648
+ /**
649
+ * Called when a context is closed (terminal). `usage` carries this
650
+ * conversation's audio time + amount charged (undefined if not reported).
651
+ * See {@link SessionUsage}.
652
+ */
653
+ onContextClosed?: (contextId: string, usage?: SessionUsage) => void;
562
654
  /** Called when a context times out */
563
655
  onContextTimeout?: (contextId: string) => void;
564
656
  /** Called when session is closed */
@@ -912,13 +1004,25 @@ declare class MultiContextSession {
912
1004
  private config;
913
1005
  private callbacks;
914
1006
  private contexts;
1007
+ /** Contexts a create message has been sent for (not yet necessarily
1008
+ * confirmed by the server via context_created). */
1009
+ private requestedContexts;
915
1010
  private _sessionId;
1011
+ private _contextUsage;
916
1012
  private isStarted;
917
1013
  constructor(client: KugelAudio, config?: MultiContextConfig);
918
1014
  /**
919
1015
  * Get the current session ID, or null if not connected.
920
1016
  */
921
1017
  get sessionId(): string | null;
1018
+ /**
1019
+ * Per-context usage (audio time + amount charged) for a closed context, or
1020
+ * null if that context hasn't closed yet. Each context is its own
1021
+ * conversation — use this to bill per conversation. See {@link SessionUsage}.
1022
+ */
1023
+ usageFor(contextId: string): SessionUsage | null;
1024
+ /** Map of context_id → per-context usage for all closed contexts. */
1025
+ get contextUsage(): Map<string, SessionUsage>;
922
1026
  /**
923
1027
  * Connect to the multi-context WebSocket endpoint.
924
1028
  *
@@ -1005,7 +1109,14 @@ declare class StreamingSession {
1005
1109
  private callbacks;
1006
1110
  private client;
1007
1111
  private configSent;
1112
+ private _lastUsage;
1008
1113
  constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks);
1114
+ /**
1115
+ * Per-session usage from the most recently closed session, or null before
1116
+ * the first session closes. Use this to bill your own customers per
1117
+ * conversation. See {@link SessionUsage}.
1118
+ */
1119
+ get lastUsage(): SessionUsage | null;
1009
1120
  /**
1010
1121
  * Open the WebSocket connection and authenticate.
1011
1122
  *
@@ -1354,4 +1465,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
1354
1465
  */
1355
1466
  declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
1356
1467
 
1357
- export { type AudioChunk, type AudioResponse, AuthenticationError, type BulkReplaceResult, ConnectionError, type ContextVoiceSettings, type CreateDictionaryOptions, type CreateVoiceOptions, DictionariesResource, type Dictionary, DictionaryEntriesResource, type DictionaryEntry, type DictionaryEntryInput, type DictionaryEntryListResponse, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, NotFoundError, RateLimitError, type Region, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateDictionaryEntryOptions, type UpdateDictionaryOptions, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16 };
1468
+ export { type AudioChunk, type AudioResponse, AuthenticationError, type BulkReplaceResult, ConnectionError, type ContextVoiceSettings, type CreateDictionaryOptions, type CreateVoiceOptions, DictionariesResource, type Dictionary, DictionaryEntriesResource, type DictionaryEntry, type DictionaryEntryInput, type DictionaryEntryListResponse, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, NotFoundError, RateLimitError, type Region, type SessionUsage, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateDictionaryEntryOptions, type UpdateDictionaryOptions, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16, parseSessionUsage };