kugelaudio 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -0
- package/README.md +11 -2
- package/dist/index.d.mts +121 -10
- package/dist/index.d.ts +121 -10
- package/dist/index.js +95 -17
- package/dist/index.mjs +93 -16
- package/package.json +1 -1
- package/src/client.test.ts +356 -2
- package/src/client.ts +95 -12
- package/src/index.ts +2 -0
- package/src/types.ts +128 -10
package/src/types.ts
CHANGED
|
@@ -258,7 +258,13 @@ export interface GenerateOptions {
|
|
|
258
258
|
maxNewTokens?: number;
|
|
259
259
|
/** Output sample rate (default: 24000) */
|
|
260
260
|
sampleRate?: number;
|
|
261
|
-
/**
|
|
261
|
+
/**
|
|
262
|
+
* Combined codec+rate token, e.g. 'ulaw_8000' / 'alaw_8000' / 'pcm_8000'.
|
|
263
|
+
* Opt-in; when set it is authoritative and must not contradict sampleRate.
|
|
264
|
+
* Absent ⇒ legacy PCM16 at sampleRate.
|
|
265
|
+
*/
|
|
266
|
+
outputFormat?: string;
|
|
267
|
+
/**
|
|
262
268
|
* Enable text normalization (converts numbers, dates, etc. to spoken words).
|
|
263
269
|
* When true, text will be normalized before TTS generation.
|
|
264
270
|
* Default: true
|
|
@@ -286,8 +292,8 @@ export interface GenerateOptions {
|
|
|
286
292
|
/**
|
|
287
293
|
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
288
294
|
*
|
|
289
|
-
* Uses pitch-preserving time-stretching (WSOLA)
|
|
290
|
-
*
|
|
295
|
+
* Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
|
|
296
|
+
* whole request (no per-span control).
|
|
291
297
|
* Range: [0.8, 1.2]. Default: 1.0.
|
|
292
298
|
*/
|
|
293
299
|
speed?: number;
|
|
@@ -298,6 +304,14 @@ export interface GenerateOptions {
|
|
|
298
304
|
* server treats the value as trusted once received.
|
|
299
305
|
*/
|
|
300
306
|
projectId?: number;
|
|
307
|
+
/**
|
|
308
|
+
* Per-request dictionary selection. Omit for the default behavior (all
|
|
309
|
+
* active dictionaries of the project apply, filtered by language). An
|
|
310
|
+
* empty array disables dictionaries for this request. A list of
|
|
311
|
+
* dictionary IDs applies exactly those dictionaries — including
|
|
312
|
+
* inactive ones — bypassing the language filter.
|
|
313
|
+
*/
|
|
314
|
+
dictionaryIds?: number[];
|
|
301
315
|
}
|
|
302
316
|
|
|
303
317
|
/**
|
|
@@ -333,6 +347,8 @@ export interface StreamConfig {
|
|
|
333
347
|
maxNewTokens?: number;
|
|
334
348
|
/** Output sample rate */
|
|
335
349
|
sampleRate?: number;
|
|
350
|
+
/** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per session. */
|
|
351
|
+
outputFormat?: string;
|
|
336
352
|
/** Auto-flush timeout in milliseconds */
|
|
337
353
|
flushTimeoutMs?: number;
|
|
338
354
|
/** Maximum buffer length */
|
|
@@ -377,11 +393,19 @@ export interface StreamConfig {
|
|
|
377
393
|
/**
|
|
378
394
|
* Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
|
|
379
395
|
*
|
|
380
|
-
* Uses pitch-preserving time-stretching (WSOLA)
|
|
381
|
-
*
|
|
396
|
+
* Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
|
|
397
|
+
* whole request (no per-span control).
|
|
382
398
|
* Range: [0.8, 1.2]. Default: 1.0.
|
|
383
399
|
*/
|
|
384
400
|
speed?: number;
|
|
401
|
+
/**
|
|
402
|
+
* Per-request dictionary selection. Omit for the default behavior (all
|
|
403
|
+
* active dictionaries of the project apply, filtered by language). An
|
|
404
|
+
* empty array disables dictionaries for this request. A list of
|
|
405
|
+
* dictionary IDs applies exactly those dictionaries — including
|
|
406
|
+
* inactive ones — bypassing the language filter.
|
|
407
|
+
*/
|
|
408
|
+
dictionaryIds?: number[];
|
|
385
409
|
}
|
|
386
410
|
|
|
387
411
|
/**
|
|
@@ -399,9 +423,18 @@ export interface StreamingSessionCallbacks {
|
|
|
399
423
|
* Carries the segment index, total audio duration, and generation time.
|
|
400
424
|
*/
|
|
401
425
|
onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
|
|
426
|
+
/**
|
|
427
|
+
* Called when the server marks the end of a turn's audio
|
|
428
|
+
* (`{"final": true, ...}` — sent after the last audio frame of every
|
|
429
|
+
* gracefully completed turn, right before `session_closed`). The
|
|
430
|
+
* ElevenLabs `isFinal` equivalent: once this fires, no further audio
|
|
431
|
+
* for the turn will arrive. Not fired on a barge-in cancel — that
|
|
432
|
+
* path fires {@link onInterrupted} instead.
|
|
433
|
+
*/
|
|
434
|
+
onFinal?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
|
|
402
435
|
/**
|
|
403
436
|
* Called when the session is fully closed (after `session.close()`).
|
|
404
|
-
*
|
|
437
|
+
* Fires right after {@link onFinal}. The `session_closed` frame also carries usage (see {@link SessionUsage}); it is not among this callback's arguments.
|
|
405
438
|
*/
|
|
406
439
|
onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
|
|
407
440
|
/** Called when the server begins generating audio for a text segment. */
|
|
@@ -419,14 +452,71 @@ export interface StreamingSessionCallbacks {
|
|
|
419
452
|
onError?: (error: Error) => void;
|
|
420
453
|
}
|
|
421
454
|
|
|
455
|
+
/**
|
|
456
|
+
* Per-session usage reported in the `session_closed` frame (KUG-1192).
|
|
457
|
+
*
|
|
458
|
+
* Lets you bill your own customers per conversation. `costCents` is the
|
|
459
|
+
* actual amount charged in **EUR cents**. When the charge could not be
|
|
460
|
+
* determined at session end (e.g. a transient billing error) `costCents` is
|
|
461
|
+
* `null` and `costAvailable` is `false` — never a misleading `0`.
|
|
462
|
+
* `audioSeconds` is always reported. On `/ws/tts/multi` usage is reported per
|
|
463
|
+
* context (per conversation) on each `context_closed` frame, not aggregated
|
|
464
|
+
* across contexts.
|
|
465
|
+
*/
|
|
466
|
+
export interface SessionUsage {
|
|
467
|
+
/** Total audio generated this session, in seconds (the unit we bill on). */
|
|
468
|
+
audioSeconds: number;
|
|
469
|
+
/** Actual amount charged in EUR cents, or `null` if undetermined. */
|
|
470
|
+
costCents: number | null;
|
|
471
|
+
/** Currency of `costCents` (`"eur"`); present only when `costCents` is set. */
|
|
472
|
+
currency?: string;
|
|
473
|
+
/** Total input characters submitted this session, if reported. */
|
|
474
|
+
characters?: number;
|
|
475
|
+
/** Model that produced the audio, if reported. */
|
|
476
|
+
modelId?: string;
|
|
477
|
+
/** `true` when an authoritative charge was returned for this session. */
|
|
478
|
+
costAvailable: boolean;
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
/**
|
|
482
|
+
* Parse the `usage` object of a raw frame payload (or a legacy `session_closed` payload without
|
|
483
|
+
* one) into a typed {@link SessionUsage}. Returns `null` when no usage info
|
|
484
|
+
* is present.
|
|
485
|
+
*/
|
|
486
|
+
export function parseSessionUsage(
|
|
487
|
+
data: Record<string, unknown>,
|
|
488
|
+
): SessionUsage | null {
|
|
489
|
+
const raw = data.usage as Record<string, unknown> | undefined;
|
|
490
|
+
const source = raw && typeof raw === 'object' ? raw : data;
|
|
491
|
+
const audioSeconds =
|
|
492
|
+
typeof source.audio_seconds === 'number'
|
|
493
|
+
? source.audio_seconds
|
|
494
|
+
: typeof data.total_audio_seconds === 'number'
|
|
495
|
+
? data.total_audio_seconds
|
|
496
|
+
: undefined;
|
|
497
|
+
if (audioSeconds === undefined) return null;
|
|
498
|
+
const costCents =
|
|
499
|
+
typeof source.cost_cents === 'number' ? source.cost_cents : null;
|
|
500
|
+
return {
|
|
501
|
+
audioSeconds,
|
|
502
|
+
costCents,
|
|
503
|
+
currency:
|
|
504
|
+
typeof source.currency === 'string' ? source.currency : undefined,
|
|
505
|
+
characters:
|
|
506
|
+
typeof source.characters === 'number' ? source.characters : undefined,
|
|
507
|
+
modelId: typeof source.model_id === 'string' ? source.model_id : undefined,
|
|
508
|
+
costAvailable: costCents !== null,
|
|
509
|
+
};
|
|
510
|
+
}
|
|
511
|
+
|
|
422
512
|
/**
|
|
423
513
|
* Audio chunk from streaming TTS.
|
|
424
514
|
*/
|
|
425
515
|
export interface AudioChunk {
|
|
426
516
|
/** Audio payload as base64 (raw PCM16 by default; G.711 mu-law / A-law when `encoding` says so) */
|
|
427
517
|
audio: string;
|
|
428
|
-
/** Encoding format */
|
|
429
|
-
encoding: 'pcm_s16le';
|
|
518
|
+
/** Encoding format. 'mulaw' / 'alaw' only when `outputFormat` requested G.711 (e.g. 'ulaw_8000' / 'alaw_8000'). */
|
|
519
|
+
encoding: 'pcm_s16le' | 'mulaw' | 'alaw';
|
|
430
520
|
/** Chunk index */
|
|
431
521
|
index: number;
|
|
432
522
|
/** Sample rate */
|
|
@@ -453,6 +543,12 @@ export interface GenerationStats {
|
|
|
453
543
|
rtf: number;
|
|
454
544
|
/** Error message if any */
|
|
455
545
|
error?: string;
|
|
546
|
+
/**
|
|
547
|
+
* Per-request usage (audio time + amount charged), for billing your own
|
|
548
|
+
* customers. Undefined when the server reports no usage. See
|
|
549
|
+
* {@link SessionUsage}.
|
|
550
|
+
*/
|
|
551
|
+
usage?: SessionUsage;
|
|
456
552
|
}
|
|
457
553
|
|
|
458
554
|
/**
|
|
@@ -546,6 +642,8 @@ export interface MultiContextConfig {
|
|
|
546
642
|
defaultVoiceId?: number;
|
|
547
643
|
/** Output sample rate (default: 24000) */
|
|
548
644
|
sampleRate?: number;
|
|
645
|
+
/** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per context. */
|
|
646
|
+
outputFormat?: string;
|
|
549
647
|
/** CFG scale for generation (default: 2.0) */
|
|
550
648
|
cfgScale?: number;
|
|
551
649
|
/**
|
|
@@ -563,6 +661,14 @@ export interface MultiContextConfig {
|
|
|
563
661
|
* the language, which adds ~60-150ms to time-to-first-audio.
|
|
564
662
|
*/
|
|
565
663
|
language?: string;
|
|
664
|
+
/**
|
|
665
|
+
* Per-request dictionary selection. Omit for the default behavior (all
|
|
666
|
+
* active dictionaries of the project apply, filtered by language). An
|
|
667
|
+
* empty array disables dictionaries for this request. A list of
|
|
668
|
+
* dictionary IDs applies exactly those dictionaries — including
|
|
669
|
+
* inactive ones — bypassing the language filter.
|
|
670
|
+
*/
|
|
671
|
+
dictionaryIds?: number[];
|
|
566
672
|
/** Seconds before context auto-closes (default: 20.0) */
|
|
567
673
|
inactivityTimeout?: number;
|
|
568
674
|
}
|
|
@@ -601,8 +707,20 @@ export interface MultiContextCallbacks {
|
|
|
601
707
|
onContextCreated?: (contextId: string) => void;
|
|
602
708
|
/** Called when an audio chunk is received */
|
|
603
709
|
onChunk?: (chunk: MultiContextAudioChunk) => void;
|
|
604
|
-
/**
|
|
605
|
-
|
|
710
|
+
/**
|
|
711
|
+
* Called when all audio admitted before a `{flush: true}` has been
|
|
712
|
+
* delivered for a context (`{"final": true, "context_id": ...}`), and
|
|
713
|
+
* once more before {@link onContextClosed} on a graceful close. The
|
|
714
|
+
* ElevenLabs multi-context `is_final` equivalent. Not fired on an
|
|
715
|
+
* immediate (barge-in) close.
|
|
716
|
+
*/
|
|
717
|
+
onFinal?: (contextId: string) => void;
|
|
718
|
+
/**
|
|
719
|
+
* Called when a context is closed (terminal). `usage` carries this
|
|
720
|
+
* conversation's audio time + amount charged (undefined if not reported).
|
|
721
|
+
* See {@link SessionUsage}.
|
|
722
|
+
*/
|
|
723
|
+
onContextClosed?: (contextId: string, usage?: SessionUsage) => void;
|
|
606
724
|
/** Called when a context times out */
|
|
607
725
|
onContextTimeout?: (contextId: string) => void;
|
|
608
726
|
/** Called when session is closed */
|