kugelaudio 0.6.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -0
- package/README.md +11 -2
- package/dist/index.d.mts +123 -12
- package/dist/index.d.ts +123 -12
- package/dist/index.js +178 -23
- package/dist/index.mjs +176 -22
- package/package.json +1 -1
- package/src/client.test.ts +395 -1
- package/src/client.ts +117 -17
- package/src/index.ts +2 -0
- package/src/types.ts +130 -12
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,27 @@
|
|
|
1
|
+
## [kugelaudio-v0.8.0](https://github.com/Kugelaudio/KugelAudio/compare/js-sdk-v0.7.0...js-sdk-v0.8.0) (2026-06-10)
|
|
2
|
+
|
|
3
|
+
### Features
|
|
4
|
+
|
|
5
|
+
* **ingress,python-sdk,js-sdk,java-sdk:** per-session usage over WebSocket ([#1346](https://github.com/Kugelaudio/KugelAudio/issues/1346)) ([2881881](https://github.com/Kugelaudio/KugelAudio/commit/28818816dca9c8d222391691d70f458c0eb28ed8))
|
|
6
|
+
* **ingress,python-sdk,js-sdk:** streaming final end-of-audio frame ([#1362](https://github.com/Kugelaudio/KugelAudio/issues/1362)) ([3fa95d2](https://github.com/Kugelaudio/KugelAudio/commit/3fa95d2f8597e6c9ced0aaf8370682dbcb123c71))
|
|
7
|
+
* **ingress:** output_format token + server-side G.711 (KUG-1190) ([#1345](https://github.com/Kugelaudio/KugelAudio/issues/1345)) ([3723291](https://github.com/Kugelaudio/KugelAudio/commit/372329196c4c91aa41fe2111783872874b6e895b))
|
|
8
|
+
* per-request dictionary selection (KUG-1094) ([#1361](https://github.com/Kugelaudio/KugelAudio/issues/1361)) ([3c28968](https://github.com/Kugelaudio/KugelAudio/commit/3c28968d32018bf3cafe1d312f32831668ea96b8))
|
|
9
|
+
|
|
10
|
+
### Bug Fixes
|
|
11
|
+
|
|
12
|
+
* **js-sdk,java-sdk,ingress:** multi-turn conversations work end-to-end + live SDK e2e bench in CI (KUG-1233) ([#1363](https://github.com/Kugelaudio/KugelAudio/issues/1363)) ([c0ed2a9](https://github.com/Kugelaudio/KugelAudio/commit/c0ed2a9cf41025bac5c7182c1a281eb600d8dd36))
|
|
13
|
+
|
|
14
|
+
## [kugelaudio-v0.7.0](https://github.com/Kugelaudio/KugelAudio/compare/js-sdk-v0.6.1...js-sdk-v0.7.0) (2026-06-06)
|
|
15
|
+
|
|
16
|
+
### Features
|
|
17
|
+
|
|
18
|
+
* **ingress:** add request observability metadata ([#1321](https://github.com/Kugelaudio/KugelAudio/issues/1321)) ([a9c5178](https://github.com/Kugelaudio/KugelAudio/commit/a9c5178193cb8b746a8bbd9b566b11f7b1d00f6d))
|
|
19
|
+
* **sdks:** default all SDKs to kugel-3 model ([#1323](https://github.com/Kugelaudio/KugelAudio/issues/1323)) ([c4de212](https://github.com/Kugelaudio/KugelAudio/commit/c4de212c91e16326a15dbee5622acacc83ed85bb))
|
|
20
|
+
|
|
21
|
+
### Bug Fixes
|
|
22
|
+
|
|
23
|
+
* **js-sdk:** type SDK metadata fetch mock ([#1334](https://github.com/Kugelaudio/KugelAudio/issues/1334)) ([e8f6f59](https://github.com/Kugelaudio/KugelAudio/commit/e8f6f59595e123eaae8b44670c94fb4e7bc8d06c))
|
|
24
|
+
|
|
1
25
|
## [kugelaudio-v0.6.1](https://github.com/Kugelaudio/KugelAudio/compare/js-sdk-v0.6.0...js-sdk-v0.6.1) (2026-06-04)
|
|
2
26
|
|
|
3
27
|
### Bug Fixes
|
package/README.md
CHANGED
|
@@ -173,6 +173,7 @@ const audio = await client.tts.generate({
|
|
|
173
173
|
cfgScale: 2.0, // Guidance scale (1.0-5.0)
|
|
174
174
|
maxNewTokens: 2048, // Maximum tokens to generate
|
|
175
175
|
sampleRate: 24000, // Output sample rate
|
|
176
|
+
outputFormat: undefined, // Optional: 'pcm_24000', 'ulaw_8000', 'alaw_8000', ...
|
|
176
177
|
normalize: true, // Enable text normalization (see below)
|
|
177
178
|
language: 'en', // Language for normalization
|
|
178
179
|
});
|
|
@@ -298,12 +299,19 @@ for await (const token of llmTokenStream) {
|
|
|
298
299
|
// Triggers the server-side final flush of any trailing text,
|
|
299
300
|
// streams the resulting audio through onChunk, then closes the WS.
|
|
300
301
|
await session.close();
|
|
302
|
+
|
|
303
|
+
// Per-session usage — bill your own customers per conversation.
|
|
304
|
+
// `costCents` is the actual charge in EUR cents (null if undetermined).
|
|
305
|
+
const usage = session.lastUsage;
|
|
306
|
+
if (usage) {
|
|
307
|
+
console.log(`audio: ${usage.audioSeconds}s, cost: ${usage.costCents ?? 'n/a'} ct`);
|
|
308
|
+
}
|
|
301
309
|
```
|
|
302
310
|
|
|
303
311
|
> ⚠️ **Do not call `session.send(text, true)` (`flush=true`) between
|
|
304
312
|
> sentences or words.** Each explicit flush is a separate TTS request
|
|
305
313
|
> that pays the full model time-to-first-audio (TTFA) again and produces
|
|
306
|
-
> an audible gap. See [
|
|
314
|
+
> an audible gap. See [Chunking & per-segment latency](https://docs.kugelaudio.com/streaming/chunking-and-latency)
|
|
307
315
|
> for the full rationale, chunk-size ordering, and ElevenLabs migration
|
|
308
316
|
> notes.
|
|
309
317
|
|
|
@@ -413,6 +421,7 @@ interface GenerateOptions {
|
|
|
413
421
|
cfgScale?: number; // Default: 2.0
|
|
414
422
|
maxNewTokens?: number; // Default: 2048
|
|
415
423
|
sampleRate?: number; // Default: 24000
|
|
424
|
+
outputFormat?: string; // 'pcm_8000' | 'pcm_16000' | 'pcm_22050' | 'pcm_24000' | 'ulaw_8000' | 'alaw_8000'
|
|
416
425
|
normalize?: boolean; // Default: true - Enable text normalization
|
|
417
426
|
language?: string; // ISO 639-1 code for normalization (e.g., 'en', 'de')
|
|
418
427
|
}
|
|
@@ -425,7 +434,7 @@ interface GenerateOptions {
|
|
|
425
434
|
```typescript
|
|
426
435
|
interface AudioChunk {
|
|
427
436
|
audio: string; // Base64-encoded audio (PCM16 by default; G.711 when outputFormat requests it)
|
|
428
|
-
encoding: string; // 'pcm_s16le'
|
|
437
|
+
encoding: string; // 'pcm_s16le' | 'mulaw' | 'alaw' (G.711 when outputFormat set)
|
|
429
438
|
index: number; // Chunk index (0-based)
|
|
430
439
|
sampleRate: number; // Sample rate (24000 by default)
|
|
431
440
|
samples: number; // Number of samples in chunk
|
package/dist/index.d.mts
CHANGED
|
@@ -217,7 +217,7 @@ interface WordTimestamp {
|
|
|
217
217
|
interface GenerateOptions {
|
|
218
218
|
/** Text to synthesize */
|
|
219
219
|
text: string;
|
|
220
|
-
/** Model to use: 'kugel-
|
|
220
|
+
/** Model to use. Default: 'kugel-3'. Legacy ids (kugel-2.5, kugel-1-turbo, …) still accepted; they alias to kugel-3 server-side. */
|
|
221
221
|
modelId?: string;
|
|
222
222
|
/** Voice ID to use */
|
|
223
223
|
voiceId?: number;
|
|
@@ -235,6 +235,12 @@ interface GenerateOptions {
|
|
|
235
235
|
maxNewTokens?: number;
|
|
236
236
|
/** Output sample rate (default: 24000) */
|
|
237
237
|
sampleRate?: number;
|
|
238
|
+
/**
|
|
239
|
+
* Combined codec+rate token, e.g. 'ulaw_8000' / 'alaw_8000' / 'pcm_8000'.
|
|
240
|
+
* Opt-in; when set it is authoritative and must not contradict sampleRate.
|
|
241
|
+
* Absent ⇒ legacy PCM16 at sampleRate.
|
|
242
|
+
*/
|
|
243
|
+
outputFormat?: string;
|
|
238
244
|
/**
|
|
239
245
|
* Enable text normalization (converts numbers, dates, etc. to spoken words).
|
|
240
246
|
* When true, text will be normalized before TTS generation.
|
|
@@ -263,8 +269,8 @@ interface GenerateOptions {
|
|
|
263
269
|
/**
|
|
264
270
|
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
265
271
|
*
|
|
266
|
-
* Uses pitch-preserving time-stretching (WSOLA)
|
|
267
|
-
*
|
|
272
|
+
* Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
|
|
273
|
+
* whole request (no per-span control).
|
|
268
274
|
* Range: [0.8, 1.2]. Default: 1.0.
|
|
269
275
|
*/
|
|
270
276
|
speed?: number;
|
|
@@ -275,6 +281,14 @@ interface GenerateOptions {
|
|
|
275
281
|
* server treats the value as trusted once received.
|
|
276
282
|
*/
|
|
277
283
|
projectId?: number;
|
|
284
|
+
/**
|
|
285
|
+
* Per-request dictionary selection. Omit for the default behavior (all
|
|
286
|
+
* active dictionaries of the project apply, filtered by language). An
|
|
287
|
+
* empty array disables dictionaries for this request. A list of
|
|
288
|
+
* dictionary IDs applies exactly those dictionaries — including
|
|
289
|
+
* inactive ones — bypassing the language filter.
|
|
290
|
+
*/
|
|
291
|
+
dictionaryIds?: number[];
|
|
278
292
|
}
|
|
279
293
|
/**
|
|
280
294
|
* Streaming session configuration for `/ws/tts/stream`.
|
|
@@ -296,7 +310,7 @@ interface GenerateOptions {
|
|
|
296
310
|
interface StreamConfig {
|
|
297
311
|
/** Voice ID to use */
|
|
298
312
|
voiceId?: number;
|
|
299
|
-
/** Model ID
|
|
313
|
+
/** Model ID. Default: 'kugel-3'. Legacy ids still accepted; they alias to kugel-3 server-side. */
|
|
300
314
|
modelId?: string;
|
|
301
315
|
/** CFG scale for generation */
|
|
302
316
|
cfgScale?: number;
|
|
@@ -309,6 +323,8 @@ interface StreamConfig {
|
|
|
309
323
|
maxNewTokens?: number;
|
|
310
324
|
/** Output sample rate */
|
|
311
325
|
sampleRate?: number;
|
|
326
|
+
/** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per session. */
|
|
327
|
+
outputFormat?: string;
|
|
312
328
|
/** Auto-flush timeout in milliseconds */
|
|
313
329
|
flushTimeoutMs?: number;
|
|
314
330
|
/** Maximum buffer length */
|
|
@@ -353,11 +369,19 @@ interface StreamConfig {
|
|
|
353
369
|
/**
|
|
354
370
|
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
355
371
|
*
|
|
356
|
-
* Uses pitch-preserving time-stretching (WSOLA)
|
|
357
|
-
*
|
|
372
|
+
* Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
|
|
373
|
+
* whole request (no per-span control).
|
|
358
374
|
* Range: [0.8, 1.2]. Default: 1.0.
|
|
359
375
|
*/
|
|
360
376
|
speed?: number;
|
|
377
|
+
/**
|
|
378
|
+
* Per-request dictionary selection. Omit for the default behavior (all
|
|
379
|
+
* active dictionaries of the project apply, filtered by language). An
|
|
380
|
+
* empty array disables dictionaries for this request. A list of
|
|
381
|
+
* dictionary IDs applies exactly those dictionaries — including
|
|
382
|
+
* inactive ones — bypassing the language filter.
|
|
383
|
+
*/
|
|
384
|
+
dictionaryIds?: number[];
|
|
361
385
|
}
|
|
362
386
|
/**
|
|
363
387
|
* Event callbacks for a streaming session (`/ws/tts/stream`).
|
|
@@ -374,9 +398,18 @@ interface StreamingSessionCallbacks {
|
|
|
374
398
|
* Carries the segment index, total audio duration, and generation time.
|
|
375
399
|
*/
|
|
376
400
|
onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
|
|
401
|
+
/**
|
|
402
|
+
* Called when the server marks the end of a turn's audio
|
|
403
|
+
* (`{"final": true, ...}` — sent after the last audio frame of every
|
|
404
|
+
* gracefully completed turn, right before `session_closed`). The
|
|
405
|
+
* ElevenLabs `isFinal` equivalent: once this fires, no further audio
|
|
406
|
+
* for the turn will arrive. Not fired on a barge-in cancel — that
|
|
407
|
+
* path fires {@link onInterrupted} instead.
|
|
408
|
+
*/
|
|
409
|
+
onFinal?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
|
|
377
410
|
/**
|
|
378
411
|
* Called when the session is fully closed (after `session.close()`).
|
|
379
|
-
*
|
|
412
|
+
* Fires right after {@link onFinal}. Usage from the `session_closed` frame is not passed to this callback; read it from `StreamingSession.lastUsage`.
|
|
380
413
|
*/
|
|
381
414
|
onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
|
|
382
415
|
/** Called when the server begins generating audio for a text segment. */
|
|
@@ -393,14 +426,45 @@ interface StreamingSessionCallbacks {
|
|
|
393
426
|
/** Called on any error. */
|
|
394
427
|
onError?: (error: Error) => void;
|
|
395
428
|
}
|
|
429
|
+
/**
|
|
430
|
+
* Per-session usage reported in the `session_closed` frame (KUG-1192).
|
|
431
|
+
*
|
|
432
|
+
* Lets you bill your own customers per conversation. `costCents` is the
|
|
433
|
+
* actual amount charged in **EUR cents**. When the charge could not be
|
|
434
|
+
* determined at session end (e.g. a transient billing error) `costCents` is
|
|
435
|
+
* `null` and `costAvailable` is `false` — never a misleading `0`.
|
|
436
|
+
* `audioSeconds` is always reported. On `/ws/tts/multi` usage is reported per
|
|
437
|
+
* context (per conversation) on each `context_closed` frame, not aggregated
|
|
438
|
+
* across contexts.
|
|
439
|
+
*/
|
|
440
|
+
interface SessionUsage {
|
|
441
|
+
/** Total audio generated this session, in seconds (the unit we bill on). */
|
|
442
|
+
audioSeconds: number;
|
|
443
|
+
/** Actual amount charged in EUR cents, or `null` if undetermined. */
|
|
444
|
+
costCents: number | null;
|
|
445
|
+
/** Currency of `costCents` (`"eur"`); present only when `costCents` is set. */
|
|
446
|
+
currency?: string;
|
|
447
|
+
/** Total input characters submitted this session, if reported. */
|
|
448
|
+
characters?: number;
|
|
449
|
+
/** Model that produced the audio, if reported. */
|
|
450
|
+
modelId?: string;
|
|
451
|
+
/** `true` when an authoritative charge was returned for this session. */
|
|
452
|
+
costAvailable: boolean;
|
|
453
|
+
}
|
|
454
|
+
/**
|
|
455
|
+
* Parse the raw `usage` object (or a legacy `session_closed` payload without
|
|
456
|
+
* one) into a typed {@link SessionUsage}. Returns `null` when no usage info
|
|
457
|
+
* is present.
|
|
458
|
+
*/
|
|
459
|
+
declare function parseSessionUsage(data: Record<string, unknown>): SessionUsage | null;
|
|
396
460
|
/**
|
|
397
461
|
* Audio chunk from streaming TTS.
|
|
398
462
|
*/
|
|
399
463
|
interface AudioChunk {
|
|
400
464
|
/** Raw audio as base64 (PCM16 by default; G.711 when `encoding` is 'mulaw' / 'alaw') */
|
|
401
465
|
audio: string;
|
|
402
|
-
/** Encoding format */
|
|
403
|
-
encoding: 'pcm_s16le';
|
|
466
|
+
/** Encoding format. 'mulaw' / 'alaw' only when output_format requested G.711. */
|
|
467
|
+
encoding: 'pcm_s16le' | 'mulaw' | 'alaw';
|
|
404
468
|
/** Chunk index */
|
|
405
469
|
index: number;
|
|
406
470
|
/** Sample rate */
|
|
@@ -426,6 +490,12 @@ interface GenerationStats {
|
|
|
426
490
|
rtf: number;
|
|
427
491
|
/** Error message if any */
|
|
428
492
|
error?: string;
|
|
493
|
+
/**
|
|
494
|
+
* Per-request usage (audio time + amount charged), for billing your own
|
|
495
|
+
* customers. Undefined when the server reports no usage. See
|
|
496
|
+
* {@link SessionUsage}.
|
|
497
|
+
*/
|
|
498
|
+
usage?: SessionUsage;
|
|
429
499
|
}
|
|
430
500
|
/**
|
|
431
501
|
* Complete audio response from TTS generation.
|
|
@@ -505,6 +575,8 @@ interface MultiContextConfig {
|
|
|
505
575
|
defaultVoiceId?: number;
|
|
506
576
|
/** Output sample rate (default: 24000) */
|
|
507
577
|
sampleRate?: number;
|
|
578
|
+
/** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per context. */
|
|
579
|
+
outputFormat?: string;
|
|
508
580
|
/** CFG scale for generation (default: 2.0) */
|
|
509
581
|
cfgScale?: number;
|
|
510
582
|
/**
|
|
@@ -522,6 +594,14 @@ interface MultiContextConfig {
|
|
|
522
594
|
* the language, which adds ~60-150ms to time-to-first-audio.
|
|
523
595
|
*/
|
|
524
596
|
language?: string;
|
|
597
|
+
/**
|
|
598
|
+
* Per-request dictionary selection. Omit for the default behavior (all
|
|
599
|
+
* active dictionaries of the project apply, filtered by language). An
|
|
600
|
+
* empty array disables dictionaries for this request. A list of
|
|
601
|
+
* dictionary IDs applies exactly those dictionaries — including
|
|
602
|
+
* inactive ones — bypassing the language filter.
|
|
603
|
+
*/
|
|
604
|
+
dictionaryIds?: number[];
|
|
525
605
|
/** Seconds before context auto-closes (default: 20.0) */
|
|
526
606
|
inactivityTimeout?: number;
|
|
527
607
|
}
|
|
@@ -557,8 +637,20 @@ interface MultiContextCallbacks {
|
|
|
557
637
|
onContextCreated?: (contextId: string) => void;
|
|
558
638
|
/** Called when an audio chunk is received */
|
|
559
639
|
onChunk?: (chunk: MultiContextAudioChunk) => void;
|
|
560
|
-
/**
|
|
561
|
-
|
|
640
|
+
/**
|
|
641
|
+
* Called when all audio admitted before a `{flush: true}` has been
|
|
642
|
+
* delivered for a context (`{"final": true, "context_id": ...}`), and
|
|
643
|
+
* once more before {@link onContextClosed} on a graceful close. The
|
|
644
|
+
* ElevenLabs multi-context `is_final` equivalent. Not fired on an
|
|
645
|
+
* immediate (barge-in) close.
|
|
646
|
+
*/
|
|
647
|
+
onFinal?: (contextId: string) => void;
|
|
648
|
+
/**
|
|
649
|
+
* Called when a context is closed (terminal). `usage` carries this
|
|
650
|
+
* conversation's audio time + amount charged (undefined if not reported).
|
|
651
|
+
* See {@link SessionUsage}.
|
|
652
|
+
*/
|
|
653
|
+
onContextClosed?: (contextId: string, usage?: SessionUsage) => void;
|
|
562
654
|
/** Called when a context times out */
|
|
563
655
|
onContextTimeout?: (contextId: string) => void;
|
|
564
656
|
/** Called when session is closed */
|
|
@@ -912,13 +1004,25 @@ declare class MultiContextSession {
|
|
|
912
1004
|
private config;
|
|
913
1005
|
private callbacks;
|
|
914
1006
|
private contexts;
|
|
1007
|
+
/** Contexts for which a create message has been sent (not necessarily yet
|
|
1008
|
+
* confirmed by the server via context_created). */
|
|
1009
|
+
private requestedContexts;
|
|
915
1010
|
private _sessionId;
|
|
1011
|
+
private _contextUsage;
|
|
916
1012
|
private isStarted;
|
|
917
1013
|
constructor(client: KugelAudio, config?: MultiContextConfig);
|
|
918
1014
|
/**
|
|
919
1015
|
* Get the current session ID, or null if not connected.
|
|
920
1016
|
*/
|
|
921
1017
|
get sessionId(): string | null;
|
|
1018
|
+
/**
|
|
1019
|
+
* Per-context usage (audio time + amount charged) for a closed context, or
|
|
1020
|
+
* null if that context hasn't closed yet. Each context is its own
|
|
1021
|
+
* conversation — use this to bill per conversation. See {@link SessionUsage}.
|
|
1022
|
+
*/
|
|
1023
|
+
usageFor(contextId: string): SessionUsage | null;
|
|
1024
|
+
/** Map of context_id → per-context usage for all closed contexts. */
|
|
1025
|
+
get contextUsage(): Map<string, SessionUsage>;
|
|
922
1026
|
/**
|
|
923
1027
|
* Connect to the multi-context WebSocket endpoint.
|
|
924
1028
|
*
|
|
@@ -1005,7 +1109,14 @@ declare class StreamingSession {
|
|
|
1005
1109
|
private callbacks;
|
|
1006
1110
|
private client;
|
|
1007
1111
|
private configSent;
|
|
1112
|
+
private _lastUsage;
|
|
1008
1113
|
constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks);
|
|
1114
|
+
/**
|
|
1115
|
+
* Per-session usage from the most recently closed session, or null before
|
|
1116
|
+
* the first session closes. Use this to bill your own customers per
|
|
1117
|
+
* conversation. See {@link SessionUsage}.
|
|
1118
|
+
*/
|
|
1119
|
+
get lastUsage(): SessionUsage | null;
|
|
1009
1120
|
/**
|
|
1010
1121
|
* Open the WebSocket connection and authenticate.
|
|
1011
1122
|
*
|
|
@@ -1354,4 +1465,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
|
|
|
1354
1465
|
*/
|
|
1355
1466
|
declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
|
|
1356
1467
|
|
|
1357
|
-
export { type AudioChunk, type AudioResponse, AuthenticationError, type BulkReplaceResult, ConnectionError, type ContextVoiceSettings, type CreateDictionaryOptions, type CreateVoiceOptions, DictionariesResource, type Dictionary, DictionaryEntriesResource, type DictionaryEntry, type DictionaryEntryInput, type DictionaryEntryListResponse, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, NotFoundError, RateLimitError, type Region, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateDictionaryEntryOptions, type UpdateDictionaryOptions, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16 };
|
|
1468
|
+
export { type AudioChunk, type AudioResponse, AuthenticationError, type BulkReplaceResult, ConnectionError, type ContextVoiceSettings, type CreateDictionaryOptions, type CreateVoiceOptions, DictionariesResource, type Dictionary, DictionaryEntriesResource, type DictionaryEntry, type DictionaryEntryInput, type DictionaryEntryListResponse, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, NotFoundError, RateLimitError, type Region, type SessionUsage, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateDictionaryEntryOptions, type UpdateDictionaryOptions, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16, parseSessionUsage };
|
package/dist/index.d.ts
CHANGED
|
@@ -217,7 +217,7 @@ interface WordTimestamp {
|
|
|
217
217
|
interface GenerateOptions {
|
|
218
218
|
/** Text to synthesize */
|
|
219
219
|
text: string;
|
|
220
|
-
/** Model to use: 'kugel-
|
|
220
|
+
/** Model to use. Default: 'kugel-3'. Legacy ids (kugel-2.5, kugel-1-turbo, …) still accepted; they alias to kugel-3 server-side. */
|
|
221
221
|
modelId?: string;
|
|
222
222
|
/** Voice ID to use */
|
|
223
223
|
voiceId?: number;
|
|
@@ -235,6 +235,12 @@ interface GenerateOptions {
|
|
|
235
235
|
maxNewTokens?: number;
|
|
236
236
|
/** Output sample rate (default: 24000) */
|
|
237
237
|
sampleRate?: number;
|
|
238
|
+
/**
|
|
239
|
+
* Combined codec+rate token, e.g. 'ulaw_8000' / 'alaw_8000' / 'pcm_8000'.
|
|
240
|
+
* Opt-in; when set it is authoritative and must not contradict sampleRate.
|
|
241
|
+
* Absent ⇒ legacy PCM16 at sampleRate.
|
|
242
|
+
*/
|
|
243
|
+
outputFormat?: string;
|
|
238
244
|
/**
|
|
239
245
|
* Enable text normalization (converts numbers, dates, etc. to spoken words).
|
|
240
246
|
* When true, text will be normalized before TTS generation.
|
|
@@ -263,8 +269,8 @@ interface GenerateOptions {
|
|
|
263
269
|
/**
|
|
264
270
|
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
265
271
|
*
|
|
266
|
-
* Uses pitch-preserving time-stretching (WSOLA)
|
|
267
|
-
*
|
|
272
|
+
* Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
|
|
273
|
+
* whole request (no per-span control).
|
|
268
274
|
* Range: [0.8, 1.2]. Default: 1.0.
|
|
269
275
|
*/
|
|
270
276
|
speed?: number;
|
|
@@ -275,6 +281,14 @@ interface GenerateOptions {
|
|
|
275
281
|
* server treats the value as trusted once received.
|
|
276
282
|
*/
|
|
277
283
|
projectId?: number;
|
|
284
|
+
/**
|
|
285
|
+
* Per-request dictionary selection. Omit for the default behavior (all
|
|
286
|
+
* active dictionaries of the project apply, filtered by language). An
|
|
287
|
+
* empty array disables dictionaries for this request. A list of
|
|
288
|
+
* dictionary IDs applies exactly those dictionaries — including
|
|
289
|
+
* inactive ones — bypassing the language filter.
|
|
290
|
+
*/
|
|
291
|
+
dictionaryIds?: number[];
|
|
278
292
|
}
|
|
279
293
|
/**
|
|
280
294
|
* Streaming session configuration for `/ws/tts/stream`.
|
|
@@ -296,7 +310,7 @@ interface GenerateOptions {
|
|
|
296
310
|
interface StreamConfig {
|
|
297
311
|
/** Voice ID to use */
|
|
298
312
|
voiceId?: number;
|
|
299
|
-
/** Model ID
|
|
313
|
+
/** Model ID. Default: 'kugel-3'. Legacy ids still accepted; they alias to kugel-3 server-side. */
|
|
300
314
|
modelId?: string;
|
|
301
315
|
/** CFG scale for generation */
|
|
302
316
|
cfgScale?: number;
|
|
@@ -309,6 +323,8 @@ interface StreamConfig {
|
|
|
309
323
|
maxNewTokens?: number;
|
|
310
324
|
/** Output sample rate */
|
|
311
325
|
sampleRate?: number;
|
|
326
|
+
/** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per session. */
|
|
327
|
+
outputFormat?: string;
|
|
312
328
|
/** Auto-flush timeout in milliseconds */
|
|
313
329
|
flushTimeoutMs?: number;
|
|
314
330
|
/** Maximum buffer length */
|
|
@@ -353,11 +369,19 @@ interface StreamConfig {
|
|
|
353
369
|
/**
|
|
354
370
|
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
355
371
|
*
|
|
356
|
-
* Uses pitch-preserving time-stretching (WSOLA)
|
|
357
|
-
*
|
|
372
|
+
* Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
|
|
373
|
+
* whole request (no per-span control).
|
|
358
374
|
* Range: [0.8, 1.2]. Default: 1.0.
|
|
359
375
|
*/
|
|
360
376
|
speed?: number;
|
|
377
|
+
/**
|
|
378
|
+
* Per-request dictionary selection. Omit for the default behavior (all
|
|
379
|
+
* active dictionaries of the project apply, filtered by language). An
|
|
380
|
+
* empty array disables dictionaries for this request. A list of
|
|
381
|
+
* dictionary IDs applies exactly those dictionaries — including
|
|
382
|
+
* inactive ones — bypassing the language filter.
|
|
383
|
+
*/
|
|
384
|
+
dictionaryIds?: number[];
|
|
361
385
|
}
|
|
362
386
|
/**
|
|
363
387
|
* Event callbacks for a streaming session (`/ws/tts/stream`).
|
|
@@ -374,9 +398,18 @@ interface StreamingSessionCallbacks {
|
|
|
374
398
|
* Carries the segment index, total audio duration, and generation time.
|
|
375
399
|
*/
|
|
376
400
|
onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
|
|
401
|
+
/**
|
|
402
|
+
* Called when the server marks the end of a turn's audio
|
|
403
|
+
* (`{"final": true, ...}` — sent after the last audio frame of every
|
|
404
|
+
* gracefully completed turn, right before `session_closed`). The
|
|
405
|
+
* ElevenLabs `isFinal` equivalent: once this fires, no further audio
|
|
406
|
+
* for the turn will arrive. Not fired on a barge-in cancel — that
|
|
407
|
+
* path fires {@link onInterrupted} instead.
|
|
408
|
+
*/
|
|
409
|
+
onFinal?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
|
|
377
410
|
/**
|
|
378
411
|
* Called when the session is fully closed (after `session.close()`).
|
|
379
|
-
*
|
|
412
|
+
* Fires right after {@link onFinal}. Usage from the `session_closed` frame is not passed to this callback; read it from `StreamingSession.lastUsage`.
|
|
380
413
|
*/
|
|
381
414
|
onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
|
|
382
415
|
/** Called when the server begins generating audio for a text segment. */
|
|
@@ -393,14 +426,45 @@ interface StreamingSessionCallbacks {
|
|
|
393
426
|
/** Called on any error. */
|
|
394
427
|
onError?: (error: Error) => void;
|
|
395
428
|
}
|
|
429
|
+
/**
|
|
430
|
+
* Per-session usage reported in the `session_closed` frame (KUG-1192).
|
|
431
|
+
*
|
|
432
|
+
* Lets you bill your own customers per conversation. `costCents` is the
|
|
433
|
+
* actual amount charged in **EUR cents**. When the charge could not be
|
|
434
|
+
* determined at session end (e.g. a transient billing error) `costCents` is
|
|
435
|
+
* `null` and `costAvailable` is `false` — never a misleading `0`.
|
|
436
|
+
* `audioSeconds` is always reported. On `/ws/tts/multi` usage is reported per
|
|
437
|
+
* context (per conversation) on each `context_closed` frame, not aggregated
|
|
438
|
+
* across contexts.
|
|
439
|
+
*/
|
|
440
|
+
interface SessionUsage {
|
|
441
|
+
/** Total audio generated this session, in seconds (the unit we bill on). */
|
|
442
|
+
audioSeconds: number;
|
|
443
|
+
/** Actual amount charged in EUR cents, or `null` if undetermined. */
|
|
444
|
+
costCents: number | null;
|
|
445
|
+
/** Currency of `costCents` (`"eur"`); present only when `costCents` is set. */
|
|
446
|
+
currency?: string;
|
|
447
|
+
/** Total input characters submitted this session, if reported. */
|
|
448
|
+
characters?: number;
|
|
449
|
+
/** Model that produced the audio, if reported. */
|
|
450
|
+
modelId?: string;
|
|
451
|
+
/** `true` when an authoritative charge was returned for this session. */
|
|
452
|
+
costAvailable: boolean;
|
|
453
|
+
}
|
|
454
|
+
/**
|
|
455
|
+
* Parse the raw `usage` object (or a legacy `session_closed` payload without
|
|
456
|
+
* one) into a typed {@link SessionUsage}. Returns `null` when no usage info
|
|
457
|
+
* is present.
|
|
458
|
+
*/
|
|
459
|
+
declare function parseSessionUsage(data: Record<string, unknown>): SessionUsage | null;
|
|
396
460
|
/**
|
|
397
461
|
* Audio chunk from streaming TTS.
|
|
398
462
|
*/
|
|
399
463
|
interface AudioChunk {
|
|
400
464
|
/** Raw audio as base64 (PCM16 by default; G.711 when `encoding` is 'mulaw' / 'alaw') */
|
|
401
465
|
audio: string;
|
|
402
|
-
/** Encoding format */
|
|
403
|
-
encoding: 'pcm_s16le';
|
|
466
|
+
/** Encoding format. 'mulaw' / 'alaw' only when output_format requested G.711. */
|
|
467
|
+
encoding: 'pcm_s16le' | 'mulaw' | 'alaw';
|
|
404
468
|
/** Chunk index */
|
|
405
469
|
index: number;
|
|
406
470
|
/** Sample rate */
|
|
@@ -426,6 +490,12 @@ interface GenerationStats {
|
|
|
426
490
|
rtf: number;
|
|
427
491
|
/** Error message if any */
|
|
428
492
|
error?: string;
|
|
493
|
+
/**
|
|
494
|
+
* Per-request usage (audio time + amount charged), for billing your own
|
|
495
|
+
* customers. Undefined when the server reports no usage. See
|
|
496
|
+
* {@link SessionUsage}.
|
|
497
|
+
*/
|
|
498
|
+
usage?: SessionUsage;
|
|
429
499
|
}
|
|
430
500
|
/**
|
|
431
501
|
* Complete audio response from TTS generation.
|
|
@@ -505,6 +575,8 @@ interface MultiContextConfig {
|
|
|
505
575
|
defaultVoiceId?: number;
|
|
506
576
|
/** Output sample rate (default: 24000) */
|
|
507
577
|
sampleRate?: number;
|
|
578
|
+
/** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per context. */
|
|
579
|
+
outputFormat?: string;
|
|
508
580
|
/** CFG scale for generation (default: 2.0) */
|
|
509
581
|
cfgScale?: number;
|
|
510
582
|
/**
|
|
@@ -522,6 +594,14 @@ interface MultiContextConfig {
|
|
|
522
594
|
* the language, which adds ~60-150ms to time-to-first-audio.
|
|
523
595
|
*/
|
|
524
596
|
language?: string;
|
|
597
|
+
/**
|
|
598
|
+
* Per-request dictionary selection. Omit for the default behavior (all
|
|
599
|
+
* active dictionaries of the project apply, filtered by language). An
|
|
600
|
+
* empty array disables dictionaries for this request. A list of
|
|
601
|
+
* dictionary IDs applies exactly those dictionaries — including
|
|
602
|
+
* inactive ones — bypassing the language filter.
|
|
603
|
+
*/
|
|
604
|
+
dictionaryIds?: number[];
|
|
525
605
|
/** Seconds before context auto-closes (default: 20.0) */
|
|
526
606
|
inactivityTimeout?: number;
|
|
527
607
|
}
|
|
@@ -557,8 +637,20 @@ interface MultiContextCallbacks {
|
|
|
557
637
|
onContextCreated?: (contextId: string) => void;
|
|
558
638
|
/** Called when an audio chunk is received */
|
|
559
639
|
onChunk?: (chunk: MultiContextAudioChunk) => void;
|
|
560
|
-
/**
|
|
561
|
-
|
|
640
|
+
/**
|
|
641
|
+
* Called when all audio admitted before a `{flush: true}` has been
|
|
642
|
+
* delivered for a context (`{"final": true, "context_id": ...}`), and
|
|
643
|
+
* once more before {@link onContextClosed} on a graceful close. The
|
|
644
|
+
* ElevenLabs multi-context `is_final` equivalent. Not fired on an
|
|
645
|
+
* immediate (barge-in) close.
|
|
646
|
+
*/
|
|
647
|
+
onFinal?: (contextId: string) => void;
|
|
648
|
+
/**
|
|
649
|
+
* Called when a context is closed (terminal). `usage` carries this
|
|
650
|
+
* conversation's audio time + amount charged (undefined if not reported).
|
|
651
|
+
* See {@link SessionUsage}.
|
|
652
|
+
*/
|
|
653
|
+
onContextClosed?: (contextId: string, usage?: SessionUsage) => void;
|
|
562
654
|
/** Called when a context times out */
|
|
563
655
|
onContextTimeout?: (contextId: string) => void;
|
|
564
656
|
/** Called when session is closed */
|
|
@@ -912,13 +1004,25 @@ declare class MultiContextSession {
|
|
|
912
1004
|
private config;
|
|
913
1005
|
private callbacks;
|
|
914
1006
|
private contexts;
|
|
1007
|
+
/** Contexts a create message has been sent for (not yet necessarily
|
|
1008
|
+
* confirmed by the server via context_created). */
|
|
1009
|
+
private requestedContexts;
|
|
915
1010
|
private _sessionId;
|
|
1011
|
+
private _contextUsage;
|
|
916
1012
|
private isStarted;
|
|
917
1013
|
constructor(client: KugelAudio, config?: MultiContextConfig);
|
|
918
1014
|
/**
|
|
919
1015
|
* Get the current session ID, or null if not connected.
|
|
920
1016
|
*/
|
|
921
1017
|
get sessionId(): string | null;
|
|
1018
|
+
/**
|
|
1019
|
+
* Per-context usage (audio time + amount charged) for a closed context, or
|
|
1020
|
+
* null if that context hasn't closed yet. Each context is its own
|
|
1021
|
+
* conversation — use this to bill per conversation. See {@link SessionUsage}.
|
|
1022
|
+
*/
|
|
1023
|
+
usageFor(contextId: string): SessionUsage | null;
|
|
1024
|
+
/** Map of context_id → per-context usage for all closed contexts. */
|
|
1025
|
+
get contextUsage(): Map<string, SessionUsage>;
|
|
922
1026
|
/**
|
|
923
1027
|
* Connect to the multi-context WebSocket endpoint.
|
|
924
1028
|
*
|
|
@@ -1005,7 +1109,14 @@ declare class StreamingSession {
|
|
|
1005
1109
|
private callbacks;
|
|
1006
1110
|
private client;
|
|
1007
1111
|
private configSent;
|
|
1112
|
+
private _lastUsage;
|
|
1008
1113
|
constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks);
|
|
1114
|
+
/**
|
|
1115
|
+
* Per-session usage from the most recently closed session, or null before
|
|
1116
|
+
* the first session closes. Use this to bill your own customers per
|
|
1117
|
+
* conversation. See {@link SessionUsage}.
|
|
1118
|
+
*/
|
|
1119
|
+
get lastUsage(): SessionUsage | null;
|
|
1009
1120
|
/**
|
|
1010
1121
|
* Open the WebSocket connection and authenticate.
|
|
1011
1122
|
*
|
|
@@ -1354,4 +1465,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
|
|
|
1354
1465
|
*/
|
|
1355
1466
|
declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
|
|
1356
1467
|
|
|
1357
|
-
export { type AudioChunk, type AudioResponse, AuthenticationError, type BulkReplaceResult, ConnectionError, type ContextVoiceSettings, type CreateDictionaryOptions, type CreateVoiceOptions, DictionariesResource, type Dictionary, DictionaryEntriesResource, type DictionaryEntry, type DictionaryEntryInput, type DictionaryEntryListResponse, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, NotFoundError, RateLimitError, type Region, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateDictionaryEntryOptions, type UpdateDictionaryOptions, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16 };
|
|
1468
|
+
export { type AudioChunk, type AudioResponse, AuthenticationError, type BulkReplaceResult, ConnectionError, type ContextVoiceSettings, type CreateDictionaryOptions, type CreateVoiceOptions, DictionariesResource, type Dictionary, DictionaryEntriesResource, type DictionaryEntry, type DictionaryEntryInput, type DictionaryEntryListResponse, type ErrorCode, ErrorCodes, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioErrorOptions, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, NotFoundError, RateLimitError, type Region, type SessionUsage, type StreamCallbacks, type StreamConfig, type StreamingSessionCallbacks, type UpdateDictionaryEntryOptions, type UpdateDictionaryOptions, type UpdateVoiceOptions, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceDetail, type VoiceListResponse, type VoiceQuality, type VoiceReference, type VoiceSex, type WordTimestamp, WsCloseCodes, base64ToArrayBuffer, classifyHttpError, classifyWsClose, classifyWsFrame, classifyWsHandshakeError, createWavBlob, createWavFile, decodePCM16, parseSessionUsage };
|