kugelaudio 0.6.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -0
- package/README.md +11 -2
- package/dist/index.d.mts +123 -12
- package/dist/index.d.ts +123 -12
- package/dist/index.js +178 -23
- package/dist/index.mjs +176 -22
- package/package.json +1 -1
- package/src/client.test.ts +395 -1
- package/src/client.ts +117 -17
- package/src/index.ts +2 -0
- package/src/types.ts +130 -12
package/src/client.ts
CHANGED
|
@@ -29,14 +29,18 @@ import type {
|
|
|
29
29
|
VoiceReference,
|
|
30
30
|
WordTimestamp
|
|
31
31
|
} from './types';
|
|
32
|
+
import { parseSessionUsage } from './types';
|
|
32
33
|
import { base64ToArrayBuffer } from './utils';
|
|
33
34
|
import { getWebSocket } from './websocket';
|
|
34
35
|
|
|
35
36
|
import type { Region } from './types';
|
|
37
|
+
import packageJson from '../package.json';
|
|
36
38
|
|
|
37
39
|
const DEFAULT_API_URL = 'https://api.kugelaudio.com';
|
|
38
40
|
const EU_API_URL = 'https://api.eu.kugelaudio.com';
|
|
39
41
|
const SUPPORTED_REGIONS = ['eu', 'us', 'global'] as const;
|
|
42
|
+
// Identifier for this SDK; sent as the `X-KugelAudio-SDK` header and the `sdk` query parameter.
const SDK_NAME = 'js';
|
|
43
|
+
// SDK version taken from the `version` field of the imported package.json.
const SDK_VERSION = packageJson.version;
|
|
40
44
|
|
|
41
45
|
const REGION_PREFIXES = ['eu-', 'us-', 'global-'] as const;
|
|
42
46
|
|
|
@@ -49,6 +53,18 @@ function parseApiKey(apiKey: string): { cleanKey: string; detectedRegion?: Regio
|
|
|
49
53
|
return { cleanKey: apiKey };
|
|
50
54
|
}
|
|
51
55
|
|
|
56
|
+
/**
 * Build the HTTP headers that identify this SDK and its version.
 *
 * @returns A fresh header map holding `X-KugelAudio-SDK` and
 *   `X-KugelAudio-SDK-Version`, suitable for spreading into request headers.
 */
function sdkHeaders(): Record<string, string> {
  const identification: Record<string, string> = {};
  identification['X-KugelAudio-SDK'] = SDK_NAME;
  identification['X-KugelAudio-SDK-Version'] = SDK_VERSION;
  return identification;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Append the SDK identification query parameters (`sdk`, `sdk_version`) to a URL.
 *
 * The parameters are joined with `&` when the URL already contains a `?`,
 * otherwise they start a new query string with `?`.
 *
 * @param url - URL to extend; it is not modified or re-encoded.
 * @returns The URL followed by the URI-encoded SDK name and version.
 */
function appendSdkQuery(url: string): string {
  const joiner = url.indexOf('?') === -1 ? '?' : '&';
  const sdkParams = [
    `sdk=${encodeURIComponent(SDK_NAME)}`,
    `sdk_version=${encodeURIComponent(SDK_VERSION)}`,
  ].join('&');
  return url + joiner + sdkParams;
}
|
|
67
|
+
|
|
52
68
|
/**
|
|
53
69
|
* Create a new WebSocket instance.
|
|
54
70
|
* Lazily resolves the constructor to avoid top-level side-effects
|
|
@@ -491,7 +507,7 @@ class TTSResource {
|
|
|
491
507
|
if (this.client.orgId !== undefined) {
|
|
492
508
|
url += `&org_id=${this.client.orgId}`;
|
|
493
509
|
}
|
|
494
|
-
return url;
|
|
510
|
+
return appendSdkQuery(url);
|
|
495
511
|
}
|
|
496
512
|
|
|
497
513
|
/**
|
|
@@ -581,6 +597,7 @@ class TTSResource {
|
|
|
581
597
|
generationMs: data.gen_ms,
|
|
582
598
|
rtf: data.rtf,
|
|
583
599
|
error: data.error,
|
|
600
|
+
usage: parseSessionUsage(data) ?? undefined,
|
|
584
601
|
};
|
|
585
602
|
pending.callbacks.onFinal?.(stats);
|
|
586
603
|
this.pendingRequests.delete(requestId);
|
|
@@ -692,17 +709,21 @@ class TTSResource {
|
|
|
692
709
|
|
|
693
710
|
ws.send(JSON.stringify({
|
|
694
711
|
text: options.text,
|
|
695
|
-
model_id: options.modelId || 'kugel-
|
|
712
|
+
model_id: options.modelId || 'kugel-3',
|
|
696
713
|
voice_id: options.voiceId,
|
|
697
714
|
cfg_scale: options.cfgScale ?? 2.0,
|
|
698
715
|
...(options.temperature !== undefined && { temperature: options.temperature }),
|
|
699
716
|
max_new_tokens: options.maxNewTokens ?? 2048,
|
|
700
717
|
sample_rate: options.sampleRate ?? 24000,
|
|
718
|
+
...(options.outputFormat && { output_format: options.outputFormat }),
|
|
701
719
|
normalize: options.normalize ?? true,
|
|
702
720
|
...(options.language && { language: options.language }),
|
|
703
721
|
...(options.wordTimestamps && { word_timestamps: true }),
|
|
704
722
|
...(options.speed !== undefined && { speed: options.speed }),
|
|
705
723
|
...(options.projectId !== undefined && { project_id: options.projectId }),
|
|
724
|
+
// [] is meaningful (explicit opt-out) and must be sent; only
|
|
725
|
+
// undefined (use the project default) is omitted.
|
|
726
|
+
...(options.dictionaryIds !== undefined && { dictionary_ids: options.dictionaryIds }),
|
|
706
727
|
}));
|
|
707
728
|
});
|
|
708
729
|
}
|
|
@@ -724,16 +745,20 @@ class TTSResource {
|
|
|
724
745
|
// Send TTS request
|
|
725
746
|
ws.send(JSON.stringify({
|
|
726
747
|
text: options.text,
|
|
727
|
-
model_id: options.modelId || 'kugel-
|
|
748
|
+
model_id: options.modelId || 'kugel-3',
|
|
728
749
|
voice_id: options.voiceId,
|
|
729
750
|
cfg_scale: options.cfgScale ?? 2.0,
|
|
730
751
|
max_new_tokens: options.maxNewTokens ?? 2048,
|
|
731
752
|
sample_rate: options.sampleRate ?? 24000,
|
|
753
|
+
...(options.outputFormat && { output_format: options.outputFormat }),
|
|
732
754
|
normalize: options.normalize ?? true,
|
|
733
755
|
...(options.language && { language: options.language }),
|
|
734
756
|
...(options.wordTimestamps && { word_timestamps: true }),
|
|
735
757
|
...(options.speed !== undefined && { speed: options.speed }),
|
|
736
758
|
...(options.projectId !== undefined && { project_id: options.projectId }),
|
|
759
|
+
// [] is meaningful (explicit opt-out) and must be sent; only
|
|
760
|
+
// undefined (use the project default) is omitted.
|
|
761
|
+
...(options.dictionaryIds !== undefined && { dictionary_ids: options.dictionaryIds }),
|
|
737
762
|
}));
|
|
738
763
|
};
|
|
739
764
|
|
|
@@ -764,6 +789,7 @@ class TTSResource {
|
|
|
764
789
|
generationMs: data.gen_ms,
|
|
765
790
|
rtf: data.rtf,
|
|
766
791
|
error: data.error,
|
|
792
|
+
usage: parseSessionUsage(data) ?? undefined,
|
|
767
793
|
};
|
|
768
794
|
callbacks.onFinal?.(stats);
|
|
769
795
|
ws.close();
|
|
@@ -962,7 +988,11 @@ class MultiContextSession {
|
|
|
962
988
|
private config: import('./types').MultiContextConfig;
|
|
963
989
|
private callbacks: import('./types').MultiContextCallbacks = {};
|
|
964
990
|
private contexts: Set<string> = new Set();
|
|
991
|
+
/** Contexts a create message has been sent for (not yet necessarily
|
|
992
|
+
* confirmed by the server via context_created). */
|
|
993
|
+
private requestedContexts: Set<string> = new Set();
|
|
965
994
|
private _sessionId: string | null = null;
|
|
995
|
+
private _contextUsage: Map<string, import('./types').SessionUsage> = new Map();
|
|
966
996
|
private isStarted = false;
|
|
967
997
|
|
|
968
998
|
constructor(
|
|
@@ -979,6 +1009,20 @@ class MultiContextSession {
|
|
|
979
1009
|
return this._sessionId;
|
|
980
1010
|
}
|
|
981
1011
|
|
|
1012
|
+
/**
|
|
1013
|
+
* Per-context usage (audio time + amount charged) for a closed context, or
|
|
1014
|
+
* null if that context hasn't closed yet. Each context is its own
|
|
1015
|
+
* conversation — use this to bill per conversation. See {@link SessionUsage}.
|
|
1016
|
+
*/
|
|
1017
|
+
usageFor(contextId: string): import('./types').SessionUsage | null {
  // Entries are only recorded when a context_closed frame reported usage,
  // so a missing entry is surfaced as null rather than undefined.
  const recorded = this._contextUsage.get(contextId);
  return recorded === undefined ? null : recorded;
}
|
|
1020
|
+
|
|
1021
|
+
/** Map of context_id → per-context usage for all closed contexts. */
|
|
1022
|
+
get contextUsage(): Map<string, import('./types').SessionUsage> {
  // Return a copy so callers cannot mutate the session's own bookkeeping map.
  const snapshot = new Map<string, import('./types').SessionUsage>();
  this._contextUsage.forEach((usage, contextId) => {
    snapshot.set(contextId, usage);
  });
  return snapshot;
}
|
|
1025
|
+
|
|
982
1026
|
/**
|
|
983
1027
|
* Connect to the multi-context WebSocket endpoint.
|
|
984
1028
|
*
|
|
@@ -1003,7 +1047,7 @@ class MultiContextSession {
|
|
|
1003
1047
|
authParam = 'api_key';
|
|
1004
1048
|
}
|
|
1005
1049
|
|
|
1006
|
-
const url = `${wsUrl}/ws/tts/multi?${authParam}=${this.client.apiKey}
|
|
1050
|
+
const url = appendSdkQuery(`${wsUrl}/ws/tts/multi?${authParam}=${this.client.apiKey}`);
|
|
1007
1051
|
this.ws = createWs(url);
|
|
1008
1052
|
const ws = this.ws;
|
|
1009
1053
|
|
|
@@ -1048,13 +1092,25 @@ class MultiContextSession {
|
|
|
1048
1092
|
this.callbacks.onChunk?.(chunk);
|
|
1049
1093
|
}
|
|
1050
1094
|
|
|
1095
|
+
if (data.final && data.context_id) {
|
|
1096
|
+
// Per-context end-of-audio marker (KUG-1238): all audio admitted
|
|
1097
|
+
// before the client's flush has been delivered; also precedes
|
|
1098
|
+
// context_closed on a graceful close.
|
|
1099
|
+
this.callbacks.onFinal?.(data.context_id);
|
|
1100
|
+
}
|
|
1101
|
+
|
|
1051
1102
|
if (data.context_closed) {
|
|
1052
1103
|
this.contexts.delete(data.context_id);
|
|
1053
|
-
this.
|
|
1104
|
+
this.requestedContexts.delete(data.context_id);
|
|
1105
|
+
// Per-context (per-conversation) usage rides on context_closed.
|
|
1106
|
+
const ctxUsage = parseSessionUsage(data) ?? undefined;
|
|
1107
|
+
if (ctxUsage) this._contextUsage.set(data.context_id, ctxUsage);
|
|
1108
|
+
this.callbacks.onContextClosed?.(data.context_id, ctxUsage);
|
|
1054
1109
|
}
|
|
1055
1110
|
|
|
1056
1111
|
if (data.context_timeout) {
|
|
1057
1112
|
this.contexts.delete(data.context_id);
|
|
1113
|
+
this.requestedContexts.delete(data.context_id);
|
|
1058
1114
|
this.callbacks.onContextTimeout?.(data.context_id);
|
|
1059
1115
|
}
|
|
1060
1116
|
|
|
@@ -1109,6 +1165,7 @@ class MultiContextSession {
|
|
|
1109
1165
|
this.ws = null;
|
|
1110
1166
|
this.isStarted = false;
|
|
1111
1167
|
this.contexts.clear();
|
|
1168
|
+
this.requestedContexts.clear();
|
|
1112
1169
|
};
|
|
1113
1170
|
});
|
|
1114
1171
|
}
|
|
@@ -1126,6 +1183,7 @@ class MultiContextSession {
|
|
|
1126
1183
|
if (!this.ws || this.ws.readyState !== WS_OPEN) {
|
|
1127
1184
|
throw new KugelAudioError('WebSocket not connected');
|
|
1128
1185
|
}
|
|
1186
|
+
this.requestedContexts.add(contextId);
|
|
1129
1187
|
|
|
1130
1188
|
const msg: Record<string, unknown> = {
|
|
1131
1189
|
text: ' ',
|
|
@@ -1136,26 +1194,36 @@ class MultiContextSession {
|
|
|
1136
1194
|
if (!this.isStarted) {
|
|
1137
1195
|
warnIfNoLanguage(this.config.language, this.config.normalize);
|
|
1138
1196
|
if (this.config.sampleRate) msg.sample_rate = this.config.sampleRate;
|
|
1197
|
+
if (this.config.outputFormat) msg.output_format = this.config.outputFormat;
|
|
1139
1198
|
if (this.config.cfgScale) msg.cfg_scale = this.config.cfgScale;
|
|
1140
1199
|
if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
|
|
1141
1200
|
if (this.config.maxNewTokens) msg.max_new_tokens = this.config.maxNewTokens;
|
|
1142
1201
|
if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
|
|
1143
1202
|
if (this.config.language) msg.language = this.config.language;
|
|
1203
|
+
// [] is meaningful (explicit opt-out) and must be sent; only
|
|
1204
|
+
// undefined (use the project default) is omitted.
|
|
1205
|
+
if (this.config.dictionaryIds !== undefined) msg.dictionary_ids = this.config.dictionaryIds;
|
|
1144
1206
|
if (this.config.inactivityTimeout) msg.inactivity_timeout = this.config.inactivityTimeout;
|
|
1145
1207
|
}
|
|
1146
1208
|
|
|
1147
|
-
// Per-context voice
|
|
1209
|
+
// Per-context voice. The server binds a context's voice ONLY from
|
|
1210
|
+
// voice_settings.voice_id at context creation — a top-level voice_id
|
|
1211
|
+
// merely updates the session config and leaves the context voiceless,
|
|
1212
|
+
// which the server rejects with MISSING_VOICE_ID on the first text
|
|
1213
|
+
// (KUG-1233). This matches the Python SDK's wire format.
|
|
1214
|
+
const voiceSettings: Record<string, unknown> = {};
|
|
1148
1215
|
const voiceId = options?.voiceId || this.config.defaultVoiceId;
|
|
1149
|
-
if (voiceId)
|
|
1216
|
+
if (voiceId) voiceSettings.voice_id = voiceId;
|
|
1150
1217
|
|
|
1151
1218
|
if (options?.voiceSettings) {
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1219
|
+
voiceSettings.stability = options.voiceSettings.stability;
|
|
1220
|
+
voiceSettings.similarity_boost = options.voiceSettings.similarityBoost;
|
|
1221
|
+
voiceSettings.style = options.voiceSettings.style;
|
|
1222
|
+
voiceSettings.use_speaker_boost = options.voiceSettings.useSpeakerBoost;
|
|
1223
|
+
voiceSettings.speed = options.voiceSettings.speed;
|
|
1224
|
+
}
|
|
1225
|
+
if (Object.keys(voiceSettings).length > 0) {
|
|
1226
|
+
msg.voice_settings = voiceSettings;
|
|
1159
1227
|
}
|
|
1160
1228
|
|
|
1161
1229
|
this.ws.send(JSON.stringify(msg));
|
|
@@ -1169,8 +1237,12 @@ class MultiContextSession {
|
|
|
1169
1237
|
throw new KugelAudioError('WebSocket not connected');
|
|
1170
1238
|
}
|
|
1171
1239
|
|
|
1172
|
-
// Auto-create context if needed
|
|
1173
|
-
|
|
1240
|
+
// Auto-create context if needed. Tracked via requestedContexts (sent
|
|
1241
|
+
// creates, not yet necessarily confirmed) rather than this.contexts
|
|
1242
|
+
// (server-confirmed) — otherwise a send() to a new context after the
|
|
1243
|
+
// session started goes out bare, and the server auto-creates the
|
|
1244
|
+
// context without voice_settings → MISSING_VOICE_ID (KUG-1233).
|
|
1245
|
+
if (!this.requestedContexts.has(contextId) && !this.contexts.has(contextId)) {
|
|
1174
1246
|
this.createContext(contextId);
|
|
1175
1247
|
}
|
|
1176
1248
|
|
|
@@ -1236,6 +1308,7 @@ class MultiContextSession {
|
|
|
1236
1308
|
this.ws = null;
|
|
1237
1309
|
this.isStarted = false;
|
|
1238
1310
|
this.contexts.clear();
|
|
1311
|
+
this.requestedContexts.clear();
|
|
1239
1312
|
}
|
|
1240
1313
|
|
|
1241
1314
|
/**
|
|
@@ -1288,6 +1361,7 @@ class StreamingSession {
|
|
|
1288
1361
|
private callbacks: StreamingSessionCallbacks;
|
|
1289
1362
|
private client: KugelAudio;
|
|
1290
1363
|
private configSent = false;
|
|
1364
|
+
private _lastUsage: import('./types').SessionUsage | null = null;
|
|
1291
1365
|
|
|
1292
1366
|
constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks) {
|
|
1293
1367
|
this.client = client;
|
|
@@ -1295,6 +1369,15 @@ class StreamingSession {
|
|
|
1295
1369
|
this.callbacks = callbacks;
|
|
1296
1370
|
}
|
|
1297
1371
|
|
|
1372
|
+
/**
|
|
1373
|
+
* Per-session usage from the most recently closed session, or null before
|
|
1374
|
+
* the first session closes. Use this to bill your own customers per
|
|
1375
|
+
* conversation. See {@link SessionUsage}.
|
|
1376
|
+
*/
|
|
1377
|
+
get lastUsage(): import('./types').SessionUsage | null {
  // Assigned when a `session_closed` frame is handled; null until then, or
  // when that frame carried no usage information.
  const { _lastUsage: mostRecent } = this;
  return mostRecent;
}
|
|
1380
|
+
|
|
1298
1381
|
/**
|
|
1299
1382
|
* Open the WebSocket connection and authenticate.
|
|
1300
1383
|
*
|
|
@@ -1317,7 +1400,7 @@ class StreamingSession {
|
|
|
1317
1400
|
authParam = 'api_key';
|
|
1318
1401
|
}
|
|
1319
1402
|
|
|
1320
|
-
const url = `${wsUrl}/ws/tts/stream?${authParam}=${this.client.apiKey}
|
|
1403
|
+
const url = appendSdkQuery(`${wsUrl}/ws/tts/stream?${authParam}=${this.client.apiKey}`);
|
|
1321
1404
|
this.ws = createWs(url);
|
|
1322
1405
|
const ws = this.ws;
|
|
1323
1406
|
|
|
@@ -1374,7 +1457,18 @@ class StreamingSession {
|
|
|
1374
1457
|
this.callbacks.onInterrupted?.();
|
|
1375
1458
|
}
|
|
1376
1459
|
|
|
1460
|
+
if (data.final) {
|
|
1461
|
+
// End-of-audio marker for the turn (KUG-1238) — arrives after
|
|
1462
|
+
// the last audio frame and before session_closed.
|
|
1463
|
+
this.callbacks.onFinal?.(
|
|
1464
|
+
data.total_audio_seconds ?? 0,
|
|
1465
|
+
data.total_text_chunks ?? 0,
|
|
1466
|
+
data.total_audio_chunks ?? 0,
|
|
1467
|
+
);
|
|
1468
|
+
}
|
|
1469
|
+
|
|
1377
1470
|
if (data.session_closed) {
|
|
1471
|
+
this._lastUsage = parseSessionUsage(data);
|
|
1378
1472
|
this.callbacks.onSessionClosed?.(
|
|
1379
1473
|
data.total_audio_seconds ?? 0,
|
|
1380
1474
|
data.total_text_chunks ?? 0,
|
|
@@ -1459,6 +1553,7 @@ class StreamingSession {
|
|
|
1459
1553
|
if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
|
|
1460
1554
|
if (this.config.maxNewTokens !== undefined) msg.max_new_tokens = this.config.maxNewTokens;
|
|
1461
1555
|
if (this.config.sampleRate !== undefined) msg.sample_rate = this.config.sampleRate;
|
|
1556
|
+
if (this.config.outputFormat !== undefined) msg.output_format = this.config.outputFormat;
|
|
1462
1557
|
if (this.config.flushTimeoutMs !== undefined) msg.flush_timeout_ms = this.config.flushTimeoutMs;
|
|
1463
1558
|
if (this.config.maxBufferLength !== undefined) msg.max_buffer_length = this.config.maxBufferLength;
|
|
1464
1559
|
if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
|
|
@@ -1467,6 +1562,9 @@ class StreamingSession {
|
|
|
1467
1562
|
if (this.config.autoMode !== undefined) msg.auto_mode = this.config.autoMode;
|
|
1468
1563
|
if (this.config.chunkLengthSchedule?.length) msg.chunk_length_schedule = this.config.chunkLengthSchedule;
|
|
1469
1564
|
if (this.config.speed !== undefined) msg.speed = this.config.speed;
|
|
1565
|
+
// [] is meaningful (explicit opt-out) and must be sent; only
|
|
1566
|
+
// undefined (use the project default) is omitted.
|
|
1567
|
+
if (this.config.dictionaryIds !== undefined) msg.dictionary_ids = this.config.dictionaryIds;
|
|
1470
1568
|
this.configSent = true;
|
|
1471
1569
|
}
|
|
1472
1570
|
|
|
@@ -1865,6 +1963,7 @@ export class KugelAudio {
|
|
|
1865
1963
|
'Content-Type': 'application/json',
|
|
1866
1964
|
'X-API-Key': this._apiKey,
|
|
1867
1965
|
'Authorization': `Bearer ${this._apiKey}`,
|
|
1966
|
+
...sdkHeaders(),
|
|
1868
1967
|
};
|
|
1869
1968
|
|
|
1870
1969
|
const controller = new AbortController();
|
|
@@ -1913,6 +2012,7 @@ export class KugelAudio {
|
|
|
1913
2012
|
const headers: Record<string, string> = {
|
|
1914
2013
|
'X-API-Key': this._apiKey,
|
|
1915
2014
|
'Authorization': `Bearer ${this._apiKey}`,
|
|
2015
|
+
...sdkHeaders(),
|
|
1916
2016
|
};
|
|
1917
2017
|
|
|
1918
2018
|
const controller = new AbortController();
|
package/src/index.ts
CHANGED
|
@@ -62,6 +62,7 @@ export type {
|
|
|
62
62
|
MultiContextAudioChunk,
|
|
63
63
|
MultiContextCallbacks,
|
|
64
64
|
MultiContextConfig,
|
|
65
|
+
SessionUsage,
|
|
65
66
|
StreamCallbacks,
|
|
66
67
|
StreamConfig,
|
|
67
68
|
StreamingSessionCallbacks,
|
|
@@ -78,6 +79,7 @@ export type {
|
|
|
78
79
|
VoiceSex,
|
|
79
80
|
WordTimestamp
|
|
80
81
|
} from './types';
|
|
82
|
+
export { parseSessionUsage } from './types';
|
|
81
83
|
|
|
82
84
|
export { DictionariesResource, DictionaryEntriesResource } from './dictionaries';
|
|
83
85
|
|
package/src/types.ts
CHANGED
|
@@ -240,7 +240,7 @@ export interface WordTimestamp {
|
|
|
240
240
|
export interface GenerateOptions {
|
|
241
241
|
/** Text to synthesize */
|
|
242
242
|
text: string;
|
|
243
|
-
/** Model to use: 'kugel-
|
|
243
|
+
/** Model to use. Default: 'kugel-3'. Legacy ids (kugel-2.5, kugel-1-turbo, …) still accepted; they alias to kugel-3 server-side. */
|
|
244
244
|
modelId?: string;
|
|
245
245
|
/** Voice ID to use */
|
|
246
246
|
voiceId?: number;
|
|
@@ -258,7 +258,13 @@ export interface GenerateOptions {
|
|
|
258
258
|
maxNewTokens?: number;
|
|
259
259
|
/** Output sample rate (default: 24000) */
|
|
260
260
|
sampleRate?: number;
|
|
261
|
-
/**
|
|
261
|
+
/**
|
|
262
|
+
* Combined codec+rate token, e.g. 'ulaw_8000' / 'alaw_8000' / 'pcm_8000'.
|
|
263
|
+
* Opt-in; when set it is authoritative and must not contradict sampleRate.
|
|
264
|
+
* Absent ⇒ legacy PCM16 at sampleRate.
|
|
265
|
+
*/
|
|
266
|
+
outputFormat?: string;
|
|
267
|
+
/**
|
|
262
268
|
* Enable text normalization (converts numbers, dates, etc. to spoken words).
|
|
263
269
|
* When true, text will be normalized before TTS generation.
|
|
264
270
|
* Default: true
|
|
@@ -286,8 +292,8 @@ export interface GenerateOptions {
|
|
|
286
292
|
/**
|
|
287
293
|
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
288
294
|
*
|
|
289
|
-
* Uses pitch-preserving time-stretching (WSOLA)
|
|
290
|
-
*
|
|
295
|
+
* Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
|
|
296
|
+
* whole request (no per-span control).
|
|
291
297
|
* Range: [0.8, 1.2]. Default: 1.0.
|
|
292
298
|
*/
|
|
293
299
|
speed?: number;
|
|
@@ -298,6 +304,14 @@ export interface GenerateOptions {
|
|
|
298
304
|
* server treats the value as trusted once received.
|
|
299
305
|
*/
|
|
300
306
|
projectId?: number;
|
|
307
|
+
/**
|
|
308
|
+
* Per-request dictionary selection. Omit for the default behavior (all
|
|
309
|
+
* active dictionaries of the project apply, filtered by language). An
|
|
310
|
+
* empty array disables dictionaries for this request. A list of
|
|
311
|
+
* dictionary IDs applies exactly those dictionaries — including
|
|
312
|
+
* inactive ones — bypassing the language filter.
|
|
313
|
+
*/
|
|
314
|
+
dictionaryIds?: number[];
|
|
301
315
|
}
|
|
302
316
|
|
|
303
317
|
/**
|
|
@@ -320,7 +334,7 @@ export interface GenerateOptions {
|
|
|
320
334
|
export interface StreamConfig {
|
|
321
335
|
/** Voice ID to use */
|
|
322
336
|
voiceId?: number;
|
|
323
|
-
/** Model ID
|
|
337
|
+
/** Model ID. Default: 'kugel-3'. Legacy ids still accepted; they alias to kugel-3 server-side. */
|
|
324
338
|
modelId?: string;
|
|
325
339
|
/** CFG scale for generation */
|
|
326
340
|
cfgScale?: number;
|
|
@@ -333,6 +347,8 @@ export interface StreamConfig {
|
|
|
333
347
|
maxNewTokens?: number;
|
|
334
348
|
/** Output sample rate */
|
|
335
349
|
sampleRate?: number;
|
|
350
|
+
/** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per session. */
|
|
351
|
+
outputFormat?: string;
|
|
336
352
|
/** Auto-flush timeout in milliseconds */
|
|
337
353
|
flushTimeoutMs?: number;
|
|
338
354
|
/** Maximum buffer length */
|
|
@@ -377,11 +393,19 @@ export interface StreamConfig {
|
|
|
377
393
|
/**
|
|
378
394
|
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
379
395
|
*
|
|
380
|
-
* Uses pitch-preserving time-stretching (WSOLA)
|
|
381
|
-
*
|
|
396
|
+
* Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
|
|
397
|
+
* whole request (no per-span control).
|
|
382
398
|
* Range: [0.8, 1.2]. Default: 1.0.
|
|
383
399
|
*/
|
|
384
400
|
speed?: number;
|
|
401
|
+
/**
|
|
402
|
+
* Per-request dictionary selection. Omit for the default behavior (all
|
|
403
|
+
* active dictionaries of the project apply, filtered by language). An
|
|
404
|
+
* empty array disables dictionaries for this request. A list of
|
|
405
|
+
* dictionary IDs applies exactly those dictionaries — including
|
|
406
|
+
* inactive ones — bypassing the language filter.
|
|
407
|
+
*/
|
|
408
|
+
dictionaryIds?: number[];
|
|
385
409
|
}
|
|
386
410
|
|
|
387
411
|
/**
|
|
@@ -399,9 +423,18 @@ export interface StreamingSessionCallbacks {
|
|
|
399
423
|
* Carries the segment index, total audio duration, and generation time.
|
|
400
424
|
*/
|
|
401
425
|
onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
|
|
426
|
+
/**
|
|
427
|
+
* Called when the server marks the end of a turn's audio
|
|
428
|
+
* (`{"final": true, ...}` — sent after the last audio frame of every
|
|
429
|
+
* gracefully completed turn, right before `session_closed`). The
|
|
430
|
+
* ElevenLabs `isFinal` equivalent: once this fires, no further audio
|
|
431
|
+
* for the turn will arrive. Not fired on a barge-in cancel — that
|
|
432
|
+
* path fires {@link onInterrupted} instead.
|
|
433
|
+
*/
|
|
434
|
+
onFinal?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
|
|
402
435
|
/**
|
|
403
436
|
* Called when the session is fully closed (after `session.close()`).
|
|
404
|
-
*
|
|
437
|
+
* Fires right after {@link onFinal}. Usage is not passed to this callback; read it from the session's `lastUsage` getter.
|
|
405
438
|
*/
|
|
406
439
|
onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
|
|
407
440
|
/** Called when the server begins generating audio for a text segment. */
|
|
@@ -419,14 +452,71 @@ export interface StreamingSessionCallbacks {
|
|
|
419
452
|
onError?: (error: Error) => void;
|
|
420
453
|
}
|
|
421
454
|
|
|
455
|
+
/**
|
|
456
|
+
* Per-session usage reported in the `session_closed` frame (KUG-1192).
|
|
457
|
+
*
|
|
458
|
+
* Lets you bill your own customers per conversation. `costCents` is the
|
|
459
|
+
* actual amount charged in **EUR cents**. When the charge could not be
|
|
460
|
+
* determined at session end (e.g. a transient billing error) `costCents` is
|
|
461
|
+
* `null` and `costAvailable` is `false` — never a misleading `0`.
|
|
462
|
+
* `audioSeconds` is always reported. On `/ws/tts/multi` usage is reported per
|
|
463
|
+
* context (per conversation) on each `context_closed` frame, not aggregated
|
|
464
|
+
* across contexts.
|
|
465
|
+
*/
|
|
466
|
+
export interface SessionUsage {
  /** Total audio generated this session, in seconds (the unit billing is based on). */
  audioSeconds: number;
  /** Actual amount charged in EUR cents, or `null` if it could not be determined. */
  costCents: number | null;
  /** Currency of `costCents` (`"eur"`); present only when `costCents` is set. */
  currency?: string;
  /** Total input characters submitted this session, if reported. */
  characters?: number;
  /** Model that produced the audio, if reported. */
  modelId?: string;
  /** `true` when an authoritative charge was returned for this session. */
  costAvailable: boolean;
}

/**
 * Parse a decoded server frame into a typed {@link SessionUsage}.
 *
 * Fields are read from the nested `usage` object when the frame has one;
 * otherwise from the frame's own top-level fields. The audio duration falls
 * back to the frame's `total_audio_seconds` (legacy `session_closed`
 * payloads without a `usage` object).
 *
 * @param data - Decoded frame. Non-object input (e.g. `null`) yields `null`
 *   instead of throwing, since this function is part of the public API.
 * @returns The parsed usage, or `null` when no finite audio duration is
 *   present. `costCents` is `null` (and `costAvailable` is `false`) unless a
 *   finite numeric `cost_cents` was reported.
 */
export function parseSessionUsage(
  data: Record<string, unknown>,
): SessionUsage | null {
  // Arrays are `typeof 'object'` too, so they are excluded explicitly.
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value);
  // NaN / Infinity are never meaningful durations or charges.
  const finite = (value: unknown): number | undefined =>
    typeof value === 'number' && Number.isFinite(value) ? value : undefined;

  if (!isRecord(data)) return null;

  const nested: unknown = data.usage;
  const source = isRecord(nested) ? nested : data;

  const audioSeconds =
    finite(source.audio_seconds) ?? finite(data.total_audio_seconds);
  if (audioSeconds === undefined) return null;

  const costCents = finite(source.cost_cents) ?? null;

  return {
    audioSeconds,
    costCents,
    currency:
      typeof source.currency === 'string' ? source.currency : undefined,
    characters: finite(source.characters),
    modelId: typeof source.model_id === 'string' ? source.model_id : undefined,
    costAvailable: costCents !== null,
  };
}
|
|
511
|
+
|
|
422
512
|
/**
|
|
423
513
|
* Audio chunk from streaming TTS.
|
|
424
514
|
*/
|
|
425
515
|
export interface AudioChunk {
|
|
426
516
|
/** Raw PCM16 audio as base64 */
|
|
427
517
|
audio: string;
|
|
428
|
-
/** Encoding format */
|
|
429
|
-
encoding: 'pcm_s16le';
|
|
518
|
+
/** Encoding format. 'mulaw' / 'alaw' only when output_format requested G.711. */
|
|
519
|
+
encoding: 'pcm_s16le' | 'mulaw' | 'alaw';
|
|
430
520
|
/** Chunk index */
|
|
431
521
|
index: number;
|
|
432
522
|
/** Sample rate */
|
|
@@ -453,6 +543,12 @@ export interface GenerationStats {
|
|
|
453
543
|
rtf: number;
|
|
454
544
|
/** Error message if any */
|
|
455
545
|
error?: string;
|
|
546
|
+
/**
|
|
547
|
+
* Per-request usage (audio time + amount charged), for billing your own
|
|
548
|
+
* customers. Undefined when the server reports no usage. See
|
|
549
|
+
* {@link SessionUsage}.
|
|
550
|
+
*/
|
|
551
|
+
usage?: SessionUsage;
|
|
456
552
|
}
|
|
457
553
|
|
|
458
554
|
/**
|
|
@@ -546,6 +642,8 @@ export interface MultiContextConfig {
|
|
|
546
642
|
defaultVoiceId?: number;
|
|
547
643
|
/** Output sample rate (default: 24000) */
|
|
548
644
|
sampleRate?: number;
|
|
645
|
+
/** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per context. */
|
|
646
|
+
outputFormat?: string;
|
|
549
647
|
/** CFG scale for generation (default: 2.0) */
|
|
550
648
|
cfgScale?: number;
|
|
551
649
|
/**
|
|
@@ -563,6 +661,14 @@ export interface MultiContextConfig {
|
|
|
563
661
|
* the language, which adds ~60-150ms to time-to-first-audio.
|
|
564
662
|
*/
|
|
565
663
|
language?: string;
|
|
664
|
+
/**
|
|
665
|
+
* Per-request dictionary selection. Omit for the default behavior (all
|
|
666
|
+
* active dictionaries of the project apply, filtered by language). An
|
|
667
|
+
* empty array disables dictionaries for this request. A list of
|
|
668
|
+
* dictionary IDs applies exactly those dictionaries — including
|
|
669
|
+
* inactive ones — bypassing the language filter.
|
|
670
|
+
*/
|
|
671
|
+
dictionaryIds?: number[];
|
|
566
672
|
/** Seconds before context auto-closes (default: 20.0) */
|
|
567
673
|
inactivityTimeout?: number;
|
|
568
674
|
}
|
|
@@ -601,8 +707,20 @@ export interface MultiContextCallbacks {
|
|
|
601
707
|
onContextCreated?: (contextId: string) => void;
|
|
602
708
|
/** Called when an audio chunk is received */
|
|
603
709
|
onChunk?: (chunk: MultiContextAudioChunk) => void;
|
|
604
|
-
/**
|
|
605
|
-
|
|
710
|
+
/**
|
|
711
|
+
* Called when all audio admitted before a `{flush: true}` has been
|
|
712
|
+
* delivered for a context (`{"final": true, "context_id": ...}`), and
|
|
713
|
+
* once more before {@link onContextClosed} on a graceful close. The
|
|
714
|
+
* ElevenLabs multi-context `is_final` equivalent. Not fired on an
|
|
715
|
+
* immediate (barge-in) close.
|
|
716
|
+
*/
|
|
717
|
+
onFinal?: (contextId: string) => void;
|
|
718
|
+
/**
|
|
719
|
+
* Called when a context is closed (terminal). `usage` carries this
|
|
720
|
+
* conversation's audio time + amount charged (undefined if not reported).
|
|
721
|
+
* See {@link SessionUsage}.
|
|
722
|
+
*/
|
|
723
|
+
onContextClosed?: (contextId: string, usage?: SessionUsage) => void;
|
|
606
724
|
/** Called when a context times out */
|
|
607
725
|
onContextTimeout?: (contextId: string) => void;
|
|
608
726
|
/** Called when session is closed */
|