kugelaudio 0.6.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/client.ts CHANGED
@@ -29,14 +29,18 @@ import type {
29
29
  VoiceReference,
30
30
  WordTimestamp
31
31
  } from './types';
32
+ import { parseSessionUsage } from './types';
32
33
  import { base64ToArrayBuffer } from './utils';
33
34
  import { getWebSocket } from './websocket';
34
35
 
35
36
  import type { Region } from './types';
37
+ import packageJson from '../package.json';
36
38
 
37
39
  const DEFAULT_API_URL = 'https://api.kugelaudio.com';
38
40
  const EU_API_URL = 'https://api.eu.kugelaudio.com';
39
41
  const SUPPORTED_REGIONS = ['eu', 'us', 'global'] as const;
42
+ const SDK_NAME = 'js';
43
+ const SDK_VERSION = packageJson.version;
40
44
 
41
45
  const REGION_PREFIXES = ['eu-', 'us-', 'global-'] as const;
42
46
 
@@ -49,6 +53,18 @@ function parseApiKey(apiKey: string): { cleanKey: string; detectedRegion?: Regio
49
53
  return { cleanKey: apiKey };
50
54
  }
51
55
 
56
+ function sdkHeaders(): Record<string, string> {
57
+ return {
58
+ 'X-KugelAudio-SDK': SDK_NAME,
59
+ 'X-KugelAudio-SDK-Version': SDK_VERSION,
60
+ };
61
+ }
62
+
63
+ function appendSdkQuery(url: string): string {
64
+ const separator = url.includes('?') ? '&' : '?';
65
+ return `${url}${separator}sdk=${encodeURIComponent(SDK_NAME)}&sdk_version=${encodeURIComponent(SDK_VERSION)}`;
66
+ }
67
+
52
68
  /**
53
69
  * Create a new WebSocket instance.
54
70
  * Lazily resolves the constructor to avoid top-level side-effects
@@ -491,7 +507,7 @@ class TTSResource {
491
507
  if (this.client.orgId !== undefined) {
492
508
  url += `&org_id=${this.client.orgId}`;
493
509
  }
494
- return url;
510
+ return appendSdkQuery(url);
495
511
  }
496
512
 
497
513
  /**
@@ -581,6 +597,7 @@ class TTSResource {
581
597
  generationMs: data.gen_ms,
582
598
  rtf: data.rtf,
583
599
  error: data.error,
600
+ usage: parseSessionUsage(data) ?? undefined,
584
601
  };
585
602
  pending.callbacks.onFinal?.(stats);
586
603
  this.pendingRequests.delete(requestId);
@@ -692,17 +709,21 @@ class TTSResource {
692
709
 
693
710
  ws.send(JSON.stringify({
694
711
  text: options.text,
695
- model_id: options.modelId || 'kugel-1-turbo',
712
+ model_id: options.modelId || 'kugel-3',
696
713
  voice_id: options.voiceId,
697
714
  cfg_scale: options.cfgScale ?? 2.0,
698
715
  ...(options.temperature !== undefined && { temperature: options.temperature }),
699
716
  max_new_tokens: options.maxNewTokens ?? 2048,
700
717
  sample_rate: options.sampleRate ?? 24000,
718
+ ...(options.outputFormat && { output_format: options.outputFormat }),
701
719
  normalize: options.normalize ?? true,
702
720
  ...(options.language && { language: options.language }),
703
721
  ...(options.wordTimestamps && { word_timestamps: true }),
704
722
  ...(options.speed !== undefined && { speed: options.speed }),
705
723
  ...(options.projectId !== undefined && { project_id: options.projectId }),
724
+ // [] is meaningful (explicit opt-out) and must be sent; only
725
+ // undefined (use the project default) is omitted.
726
+ ...(options.dictionaryIds !== undefined && { dictionary_ids: options.dictionaryIds }),
706
727
  }));
707
728
  });
708
729
  }
@@ -724,16 +745,20 @@ class TTSResource {
724
745
  // Send TTS request
725
746
  ws.send(JSON.stringify({
726
747
  text: options.text,
727
- model_id: options.modelId || 'kugel-1-turbo',
748
+ model_id: options.modelId || 'kugel-3',
728
749
  voice_id: options.voiceId,
729
750
  cfg_scale: options.cfgScale ?? 2.0,
730
751
  max_new_tokens: options.maxNewTokens ?? 2048,
731
752
  sample_rate: options.sampleRate ?? 24000,
753
+ ...(options.outputFormat && { output_format: options.outputFormat }),
732
754
  normalize: options.normalize ?? true,
733
755
  ...(options.language && { language: options.language }),
734
756
  ...(options.wordTimestamps && { word_timestamps: true }),
735
757
  ...(options.speed !== undefined && { speed: options.speed }),
736
758
  ...(options.projectId !== undefined && { project_id: options.projectId }),
759
+ // [] is meaningful (explicit opt-out) and must be sent; only
760
+ // undefined (use the project default) is omitted.
761
+ ...(options.dictionaryIds !== undefined && { dictionary_ids: options.dictionaryIds }),
737
762
  }));
738
763
  };
739
764
 
@@ -764,6 +789,7 @@ class TTSResource {
764
789
  generationMs: data.gen_ms,
765
790
  rtf: data.rtf,
766
791
  error: data.error,
792
+ usage: parseSessionUsage(data) ?? undefined,
767
793
  };
768
794
  callbacks.onFinal?.(stats);
769
795
  ws.close();
@@ -962,7 +988,11 @@ class MultiContextSession {
962
988
  private config: import('./types').MultiContextConfig;
963
989
  private callbacks: import('./types').MultiContextCallbacks = {};
964
990
  private contexts: Set<string> = new Set();
991
+ /** Contexts a create message has been sent for (not yet necessarily
992
+ * confirmed by the server via context_created). */
993
+ private requestedContexts: Set<string> = new Set();
965
994
  private _sessionId: string | null = null;
995
+ private _contextUsage: Map<string, import('./types').SessionUsage> = new Map();
966
996
  private isStarted = false;
967
997
 
968
998
  constructor(
@@ -979,6 +1009,20 @@ class MultiContextSession {
979
1009
  return this._sessionId;
980
1010
  }
981
1011
 
1012
+ /**
1013
+ * Per-context usage (audio time + amount charged) for a closed context, or
1014
+ * null if that context hasn't closed yet. Each context is its own
1015
+ * conversation — use this to bill per conversation. See {@link SessionUsage}.
1016
+ */
1017
+ usageFor(contextId: string): import('./types').SessionUsage | null {
1018
+ return this._contextUsage.get(contextId) ?? null;
1019
+ }
1020
+
1021
+ /** Map of context_id → per-context usage for all closed contexts. */
1022
+ get contextUsage(): Map<string, import('./types').SessionUsage> {
1023
+ return new Map(this._contextUsage);
1024
+ }
1025
+
982
1026
  /**
983
1027
  * Connect to the multi-context WebSocket endpoint.
984
1028
  *
@@ -1003,7 +1047,7 @@ class MultiContextSession {
1003
1047
  authParam = 'api_key';
1004
1048
  }
1005
1049
 
1006
- const url = `${wsUrl}/ws/tts/multi?${authParam}=${this.client.apiKey}`;
1050
+ const url = appendSdkQuery(`${wsUrl}/ws/tts/multi?${authParam}=${this.client.apiKey}`);
1007
1051
  this.ws = createWs(url);
1008
1052
  const ws = this.ws;
1009
1053
 
@@ -1048,13 +1092,25 @@ class MultiContextSession {
1048
1092
  this.callbacks.onChunk?.(chunk);
1049
1093
  }
1050
1094
 
1095
+ if (data.final && data.context_id) {
1096
+ // Per-context end-of-audio marker (KUG-1238): all audio admitted
1097
+ // before the client's flush has been delivered; also precedes
1098
+ // context_closed on a graceful close.
1099
+ this.callbacks.onFinal?.(data.context_id);
1100
+ }
1101
+
1051
1102
  if (data.context_closed) {
1052
1103
  this.contexts.delete(data.context_id);
1053
- this.callbacks.onContextClosed?.(data.context_id);
1104
+ this.requestedContexts.delete(data.context_id);
1105
+ // Per-context (per-conversation) usage rides on context_closed.
1106
+ const ctxUsage = parseSessionUsage(data) ?? undefined;
1107
+ if (ctxUsage) this._contextUsage.set(data.context_id, ctxUsage);
1108
+ this.callbacks.onContextClosed?.(data.context_id, ctxUsage);
1054
1109
  }
1055
1110
 
1056
1111
  if (data.context_timeout) {
1057
1112
  this.contexts.delete(data.context_id);
1113
+ this.requestedContexts.delete(data.context_id);
1058
1114
  this.callbacks.onContextTimeout?.(data.context_id);
1059
1115
  }
1060
1116
 
@@ -1109,6 +1165,7 @@ class MultiContextSession {
1109
1165
  this.ws = null;
1110
1166
  this.isStarted = false;
1111
1167
  this.contexts.clear();
1168
+ this.requestedContexts.clear();
1112
1169
  };
1113
1170
  });
1114
1171
  }
@@ -1126,6 +1183,7 @@ class MultiContextSession {
1126
1183
  if (!this.ws || this.ws.readyState !== WS_OPEN) {
1127
1184
  throw new KugelAudioError('WebSocket not connected');
1128
1185
  }
1186
+ this.requestedContexts.add(contextId);
1129
1187
 
1130
1188
  const msg: Record<string, unknown> = {
1131
1189
  text: ' ',
@@ -1136,26 +1194,36 @@ class MultiContextSession {
1136
1194
  if (!this.isStarted) {
1137
1195
  warnIfNoLanguage(this.config.language, this.config.normalize);
1138
1196
  if (this.config.sampleRate) msg.sample_rate = this.config.sampleRate;
1197
+ if (this.config.outputFormat) msg.output_format = this.config.outputFormat;
1139
1198
  if (this.config.cfgScale) msg.cfg_scale = this.config.cfgScale;
1140
1199
  if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
1141
1200
  if (this.config.maxNewTokens) msg.max_new_tokens = this.config.maxNewTokens;
1142
1201
  if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
1143
1202
  if (this.config.language) msg.language = this.config.language;
1203
+ // [] is meaningful (explicit opt-out) and must be sent; only
1204
+ // undefined (use the project default) is omitted.
1205
+ if (this.config.dictionaryIds !== undefined) msg.dictionary_ids = this.config.dictionaryIds;
1144
1206
  if (this.config.inactivityTimeout) msg.inactivity_timeout = this.config.inactivityTimeout;
1145
1207
  }
1146
1208
 
1147
- // Per-context voice
1209
+ // Per-context voice. The server binds a context's voice ONLY from
1210
+ // voice_settings.voice_id at context creation — a top-level voice_id
1211
+ // merely updates the session config and leaves the context voiceless,
1212
+ // which the server rejects with MISSING_VOICE_ID on the first text
1213
+ // (KUG-1233). This matches the Python SDK's wire format.
1214
+ const voiceSettings: Record<string, unknown> = {};
1148
1215
  const voiceId = options?.voiceId || this.config.defaultVoiceId;
1149
- if (voiceId) msg.voice_id = voiceId;
1216
+ if (voiceId) voiceSettings.voice_id = voiceId;
1150
1217
 
1151
1218
  if (options?.voiceSettings) {
1152
- msg.voice_settings = {
1153
- stability: options.voiceSettings.stability,
1154
- similarity_boost: options.voiceSettings.similarityBoost,
1155
- style: options.voiceSettings.style,
1156
- use_speaker_boost: options.voiceSettings.useSpeakerBoost,
1157
- speed: options.voiceSettings.speed,
1158
- };
1219
+ voiceSettings.stability = options.voiceSettings.stability;
1220
+ voiceSettings.similarity_boost = options.voiceSettings.similarityBoost;
1221
+ voiceSettings.style = options.voiceSettings.style;
1222
+ voiceSettings.use_speaker_boost = options.voiceSettings.useSpeakerBoost;
1223
+ voiceSettings.speed = options.voiceSettings.speed;
1224
+ }
1225
+ if (Object.keys(voiceSettings).length > 0) {
1226
+ msg.voice_settings = voiceSettings;
1159
1227
  }
1160
1228
 
1161
1229
  this.ws.send(JSON.stringify(msg));
@@ -1169,8 +1237,12 @@ class MultiContextSession {
1169
1237
  throw new KugelAudioError('WebSocket not connected');
1170
1238
  }
1171
1239
 
1172
- // Auto-create context if needed
1173
- if (!this.contexts.has(contextId) && !this.isStarted) {
1240
+ // Auto-create context if needed. Tracked via requestedContexts (sent
1241
+ // creates, not yet necessarily confirmed) rather than this.contexts
1242
+ // (server-confirmed) — otherwise a send() to a new context after the
1243
+ // session started goes out bare, and the server auto-creates the
1244
+ // context without voice_settings → MISSING_VOICE_ID (KUG-1233).
1245
+ if (!this.requestedContexts.has(contextId) && !this.contexts.has(contextId)) {
1174
1246
  this.createContext(contextId);
1175
1247
  }
1176
1248
 
@@ -1236,6 +1308,7 @@ class MultiContextSession {
1236
1308
  this.ws = null;
1237
1309
  this.isStarted = false;
1238
1310
  this.contexts.clear();
1311
+ this.requestedContexts.clear();
1239
1312
  }
1240
1313
 
1241
1314
  /**
@@ -1288,6 +1361,7 @@ class StreamingSession {
1288
1361
  private callbacks: StreamingSessionCallbacks;
1289
1362
  private client: KugelAudio;
1290
1363
  private configSent = false;
1364
+ private _lastUsage: import('./types').SessionUsage | null = null;
1291
1365
 
1292
1366
  constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks) {
1293
1367
  this.client = client;
@@ -1295,6 +1369,15 @@ class StreamingSession {
1295
1369
  this.callbacks = callbacks;
1296
1370
  }
1297
1371
 
1372
+ /**
1373
+ * Per-session usage from the most recently closed session, or null before
1374
+ * the first session closes. Use this to bill your own customers per
1375
+ * conversation. See {@link SessionUsage}.
1376
+ */
1377
+ get lastUsage(): import('./types').SessionUsage | null {
1378
+ return this._lastUsage;
1379
+ }
1380
+
1298
1381
  /**
1299
1382
  * Open the WebSocket connection and authenticate.
1300
1383
  *
@@ -1317,7 +1400,7 @@ class StreamingSession {
1317
1400
  authParam = 'api_key';
1318
1401
  }
1319
1402
 
1320
- const url = `${wsUrl}/ws/tts/stream?${authParam}=${this.client.apiKey}`;
1403
+ const url = appendSdkQuery(`${wsUrl}/ws/tts/stream?${authParam}=${this.client.apiKey}`);
1321
1404
  this.ws = createWs(url);
1322
1405
  const ws = this.ws;
1323
1406
 
@@ -1374,7 +1457,18 @@ class StreamingSession {
1374
1457
  this.callbacks.onInterrupted?.();
1375
1458
  }
1376
1459
 
1460
+ if (data.final) {
1461
+ // End-of-audio marker for the turn (KUG-1238) — arrives after
1462
+ // the last audio frame and before session_closed.
1463
+ this.callbacks.onFinal?.(
1464
+ data.total_audio_seconds ?? 0,
1465
+ data.total_text_chunks ?? 0,
1466
+ data.total_audio_chunks ?? 0,
1467
+ );
1468
+ }
1469
+
1377
1470
  if (data.session_closed) {
1471
+ this._lastUsage = parseSessionUsage(data);
1378
1472
  this.callbacks.onSessionClosed?.(
1379
1473
  data.total_audio_seconds ?? 0,
1380
1474
  data.total_text_chunks ?? 0,
@@ -1459,6 +1553,7 @@ class StreamingSession {
1459
1553
  if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
1460
1554
  if (this.config.maxNewTokens !== undefined) msg.max_new_tokens = this.config.maxNewTokens;
1461
1555
  if (this.config.sampleRate !== undefined) msg.sample_rate = this.config.sampleRate;
1556
+ if (this.config.outputFormat !== undefined) msg.output_format = this.config.outputFormat;
1462
1557
  if (this.config.flushTimeoutMs !== undefined) msg.flush_timeout_ms = this.config.flushTimeoutMs;
1463
1558
  if (this.config.maxBufferLength !== undefined) msg.max_buffer_length = this.config.maxBufferLength;
1464
1559
  if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
@@ -1467,6 +1562,9 @@ class StreamingSession {
1467
1562
  if (this.config.autoMode !== undefined) msg.auto_mode = this.config.autoMode;
1468
1563
  if (this.config.chunkLengthSchedule?.length) msg.chunk_length_schedule = this.config.chunkLengthSchedule;
1469
1564
  if (this.config.speed !== undefined) msg.speed = this.config.speed;
1565
+ // [] is meaningful (explicit opt-out) and must be sent; only
1566
+ // undefined (use the project default) is omitted.
1567
+ if (this.config.dictionaryIds !== undefined) msg.dictionary_ids = this.config.dictionaryIds;
1470
1568
  this.configSent = true;
1471
1569
  }
1472
1570
 
@@ -1865,6 +1963,7 @@ export class KugelAudio {
1865
1963
  'Content-Type': 'application/json',
1866
1964
  'X-API-Key': this._apiKey,
1867
1965
  'Authorization': `Bearer ${this._apiKey}`,
1966
+ ...sdkHeaders(),
1868
1967
  };
1869
1968
 
1870
1969
  const controller = new AbortController();
@@ -1913,6 +2012,7 @@ export class KugelAudio {
1913
2012
  const headers: Record<string, string> = {
1914
2013
  'X-API-Key': this._apiKey,
1915
2014
  'Authorization': `Bearer ${this._apiKey}`,
2015
+ ...sdkHeaders(),
1916
2016
  };
1917
2017
 
1918
2018
  const controller = new AbortController();
package/src/index.ts CHANGED
@@ -62,6 +62,7 @@ export type {
62
62
  MultiContextAudioChunk,
63
63
  MultiContextCallbacks,
64
64
  MultiContextConfig,
65
+ SessionUsage,
65
66
  StreamCallbacks,
66
67
  StreamConfig,
67
68
  StreamingSessionCallbacks,
@@ -78,6 +79,7 @@ export type {
78
79
  VoiceSex,
79
80
  WordTimestamp
80
81
  } from './types';
82
+ export { parseSessionUsage } from './types';
81
83
 
82
84
  export { DictionariesResource, DictionaryEntriesResource } from './dictionaries';
83
85
 
package/src/types.ts CHANGED
@@ -240,7 +240,7 @@ export interface WordTimestamp {
240
240
  export interface GenerateOptions {
241
241
  /** Text to synthesize */
242
242
  text: string;
243
- /** Model to use: 'kugel-1-turbo' (fast) or 'kugel-1' (premium). Default: 'kugel-1-turbo' */
243
+ /** Model to use. Default: 'kugel-3'. Legacy ids (kugel-2.5, kugel-1-turbo) still accepted; they alias to kugel-3 server-side. */
244
244
  modelId?: string;
245
245
  /** Voice ID to use */
246
246
  voiceId?: number;
@@ -258,7 +258,13 @@ export interface GenerateOptions {
258
258
  maxNewTokens?: number;
259
259
  /** Output sample rate (default: 24000) */
260
260
  sampleRate?: number;
261
- /**
261
+ /**
262
+ * Combined codec+rate token, e.g. 'ulaw_8000' / 'alaw_8000' / 'pcm_8000'.
263
+ * Opt-in; when set it is authoritative and must not contradict sampleRate.
264
+ * Absent ⇒ legacy PCM16 at sampleRate.
265
+ */
266
+ outputFormat?: string;
267
+ /**
262
268
  * Enable text normalization (converts numbers, dates, etc. to spoken words).
263
269
  * When true, text will be normalized before TTS generation.
264
270
  * Default: true
@@ -286,8 +292,8 @@ export interface GenerateOptions {
286
292
  /**
287
293
  * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
288
294
  *
289
- * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
290
- * can also be used for per-segment speed control.
295
+ * Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
296
+ * whole request (no per-span control).
291
297
  * Range: [0.8, 1.2]. Default: 1.0.
292
298
  */
293
299
  speed?: number;
@@ -298,6 +304,14 @@ export interface GenerateOptions {
298
304
  * server treats the value as trusted once received.
299
305
  */
300
306
  projectId?: number;
307
+ /**
308
+ * Per-request dictionary selection. Omit for the default behavior (all
309
+ * active dictionaries of the project apply, filtered by language). An
310
+ * empty array disables dictionaries for this request. A list of
311
+ * dictionary IDs applies exactly those dictionaries — including
312
+ * inactive ones — bypassing the language filter.
313
+ */
314
+ dictionaryIds?: number[];
301
315
  }
302
316
 
303
317
  /**
@@ -320,7 +334,7 @@ export interface GenerateOptions {
320
334
  export interface StreamConfig {
321
335
  /** Voice ID to use */
322
336
  voiceId?: number;
323
- /** Model ID ('kugel-1-turbo' or 'kugel-1'). Default: 'kugel-1-turbo' */
337
+ /** Model ID. Default: 'kugel-3'. Legacy ids still accepted; they alias to kugel-3 server-side. */
324
338
  modelId?: string;
325
339
  /** CFG scale for generation */
326
340
  cfgScale?: number;
@@ -333,6 +347,8 @@ export interface StreamConfig {
333
347
  maxNewTokens?: number;
334
348
  /** Output sample rate */
335
349
  sampleRate?: number;
350
+ /** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per session. */
351
+ outputFormat?: string;
336
352
  /** Auto-flush timeout in milliseconds */
337
353
  flushTimeoutMs?: number;
338
354
  /** Maximum buffer length */
@@ -377,11 +393,19 @@ export interface StreamConfig {
377
393
  /**
378
394
  * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
379
395
  *
380
- * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
381
- * can also be used for per-segment speed control.
396
+ * Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
397
+ * whole request (no per-span control).
382
398
  * Range: [0.8, 1.2]. Default: 1.0.
383
399
  */
384
400
  speed?: number;
401
+ /**
402
+ * Per-request dictionary selection. Omit for the default behavior (all
403
+ * active dictionaries of the project apply, filtered by language). An
404
+ * empty array disables dictionaries for this request. A list of
405
+ * dictionary IDs applies exactly those dictionaries — including
406
+ * inactive ones — bypassing the language filter.
407
+ */
408
+ dictionaryIds?: number[];
385
409
  }
386
410
 
387
411
  /**
@@ -399,9 +423,18 @@ export interface StreamingSessionCallbacks {
399
423
  * Carries the segment index, total audio duration, and generation time.
400
424
  */
401
425
  onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
426
+ /**
427
+ * Called when the server marks the end of a turn's audio
428
+ * (`{"final": true, ...}` — sent after the last audio frame of every
429
+ * gracefully completed turn, right before `session_closed`). The
430
+ * ElevenLabs `isFinal` equivalent: once this fires, no further audio
431
+ * for the turn will arrive. Not fired on a barge-in cancel — that
432
+ * path fires {@link onInterrupted} instead.
433
+ */
434
+ onFinal?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
402
435
  /**
403
436
  * Called when the session is fully closed (after `session.close()`).
404
- * Equivalent to `onFinal` on the one-shot endpoint.
437
+ * Fires right after {@link onFinal}; per-session usage is then available via the session's `lastUsage` getter.
405
438
  */
406
439
  onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
407
440
  /** Called when the server begins generating audio for a text segment. */
@@ -419,14 +452,71 @@ export interface StreamingSessionCallbacks {
419
452
  onError?: (error: Error) => void;
420
453
  }
421
454
 
455
+ /**
456
+ * Per-session usage reported in the `session_closed` frame (KUG-1192).
457
+ *
458
+ * Lets you bill your own customers per conversation. `costCents` is the
459
+ * actual amount charged in **EUR cents**. When the charge could not be
460
+ * determined at session end (e.g. a transient billing error) `costCents` is
461
+ * `null` and `costAvailable` is `false` — never a misleading `0`.
462
+ * `audioSeconds` is always reported. On `/ws/tts/multi` usage is reported per
463
+ * context (per conversation) on each `context_closed` frame, not aggregated
464
+ * across contexts.
465
+ */
466
+ export interface SessionUsage {
467
+ /** Total audio generated this session, in seconds (the unit we bill on). */
468
+ audioSeconds: number;
469
+ /** Actual amount charged in EUR cents, or `null` if undetermined. */
470
+ costCents: number | null;
471
+ /** Currency of `costCents` (`"eur"`); present only when `costCents` is set. */
472
+ currency?: string;
473
+ /** Total input characters submitted this session, if reported. */
474
+ characters?: number;
475
+ /** Model that produced the audio, if reported. */
476
+ modelId?: string;
477
+ /** `true` when an authoritative charge was returned for this session. */
478
+ costAvailable: boolean;
479
+ }
480
+
481
+ /**
482
+ * Parse the raw `usage` object (or a legacy `session_closed` payload without
483
+ * one) into a typed {@link SessionUsage}. Returns `null` when no usage info
484
+ * is present.
485
+ */
486
+ export function parseSessionUsage(
487
+ data: Record<string, unknown>,
488
+ ): SessionUsage | null {
489
+ const raw = data.usage as Record<string, unknown> | undefined;
490
+ const source = raw && typeof raw === 'object' ? raw : data;
491
+ const audioSeconds =
492
+ typeof source.audio_seconds === 'number'
493
+ ? source.audio_seconds
494
+ : typeof data.total_audio_seconds === 'number'
495
+ ? data.total_audio_seconds
496
+ : undefined;
497
+ if (audioSeconds === undefined) return null;
498
+ const costCents =
499
+ typeof source.cost_cents === 'number' ? source.cost_cents : null;
500
+ return {
501
+ audioSeconds,
502
+ costCents,
503
+ currency:
504
+ typeof source.currency === 'string' ? source.currency : undefined,
505
+ characters:
506
+ typeof source.characters === 'number' ? source.characters : undefined,
507
+ modelId: typeof source.model_id === 'string' ? source.model_id : undefined,
508
+ costAvailable: costCents !== null,
509
+ };
510
+ }
511
+
422
512
  /**
423
513
  * Audio chunk from streaming TTS.
424
514
  */
425
515
  export interface AudioChunk {
426
516
  /** Audio payload as base64: raw PCM16 by default, G.711 (mu-law / A-law) when `encoding` says so */
427
517
  audio: string;
428
- /** Encoding format */
429
- encoding: 'pcm_s16le';
518
+ /** Encoding format. 'mulaw' / 'alaw' only when output_format requested G.711. */
519
+ encoding: 'pcm_s16le' | 'mulaw' | 'alaw';
430
520
  /** Chunk index */
431
521
  index: number;
432
522
  /** Sample rate */
@@ -453,6 +543,12 @@ export interface GenerationStats {
453
543
  rtf: number;
454
544
  /** Error message if any */
455
545
  error?: string;
546
+ /**
547
+ * Per-request usage (audio time + amount charged), for billing your own
548
+ * customers. Undefined when the server reports no usage. See
549
+ * {@link SessionUsage}.
550
+ */
551
+ usage?: SessionUsage;
456
552
  }
457
553
 
458
554
  /**
@@ -546,6 +642,8 @@ export interface MultiContextConfig {
546
642
  defaultVoiceId?: number;
547
643
  /** Output sample rate (default: 24000) */
548
644
  sampleRate?: number;
645
+ /** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per context. */
646
+ outputFormat?: string;
549
647
  /** CFG scale for generation (default: 2.0) */
550
648
  cfgScale?: number;
551
649
  /**
@@ -563,6 +661,14 @@ export interface MultiContextConfig {
563
661
  * the language, which adds ~60-150ms to time-to-first-audio.
564
662
  */
565
663
  language?: string;
664
+ /**
665
+ * Per-request dictionary selection. Omit for the default behavior (all
666
+ * active dictionaries of the project apply, filtered by language). An
667
+ * empty array disables dictionaries for this request. A list of
668
+ * dictionary IDs applies exactly those dictionaries — including
669
+ * inactive ones — bypassing the language filter.
670
+ */
671
+ dictionaryIds?: number[];
566
672
  /** Seconds before context auto-closes (default: 20.0) */
567
673
  inactivityTimeout?: number;
568
674
  }
@@ -601,8 +707,20 @@ export interface MultiContextCallbacks {
601
707
  onContextCreated?: (contextId: string) => void;
602
708
  /** Called when an audio chunk is received */
603
709
  onChunk?: (chunk: MultiContextAudioChunk) => void;
604
- /** Called when a context is closed */
605
- onContextClosed?: (contextId: string) => void;
710
+ /**
711
+ * Called when all audio admitted before a `{flush: true}` has been
712
+ * delivered for a context (`{"final": true, "context_id": ...}`), and
713
+ * once more before {@link onContextClosed} on a graceful close. The
714
+ * ElevenLabs multi-context `is_final` equivalent. Not fired on an
715
+ * immediate (barge-in) close.
716
+ */
717
+ onFinal?: (contextId: string) => void;
718
+ /**
719
+ * Called when a context is closed (terminal). `usage` carries this
720
+ * conversation's audio time + amount charged (undefined if not reported).
721
+ * See {@link SessionUsage}.
722
+ */
723
+ onContextClosed?: (contextId: string, usage?: SessionUsage) => void;
606
724
  /** Called when a context times out */
607
725
  onContextTimeout?: (contextId: string) => void;
608
726
  /** Called when session is closed */