kugelaudio 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/types.ts CHANGED
@@ -258,7 +258,13 @@ export interface GenerateOptions {
258
258
  maxNewTokens?: number;
259
259
  /** Output sample rate (default: 24000) */
260
260
  sampleRate?: number;
261
- /**
261
+ /**
262
+ * Combined codec+rate token, e.g. 'ulaw_8000' / 'alaw_8000' / 'pcm_8000'.
263
+ * Opt-in; when set it is authoritative and must not contradict sampleRate.
264
+ * Absent ⇒ legacy PCM16 at sampleRate.
265
+ */
266
+ outputFormat?: string;
267
+ /**
262
268
  * Enable text normalization (converts numbers, dates, etc. to spoken words).
263
269
  * When true, text will be normalized before TTS generation.
264
270
  * Default: true
@@ -286,8 +292,8 @@ export interface GenerateOptions {
286
292
  /**
287
293
  * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
288
294
  *
289
- * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
290
- * can also be used for per-segment speed control.
295
+ * Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
296
+ * whole request (no per-span control).
291
297
  * Range: [0.8, 1.2]. Default: 1.0.
292
298
  */
293
299
  speed?: number;
@@ -298,6 +304,14 @@ export interface GenerateOptions {
298
304
  * server treats the value as trusted once received.
299
305
  */
300
306
  projectId?: number;
307
+ /**
308
+ * Per-request dictionary selection. Omit for the default behavior (all
309
+ * active dictionaries of the project apply, filtered by language). An
310
+ * empty array disables dictionaries for this request. A list of
311
+ * dictionary IDs applies exactly those dictionaries — including
312
+ * inactive ones — bypassing the language filter.
313
+ */
314
+ dictionaryIds?: number[];
301
315
  }
302
316
 
303
317
  /**
@@ -333,6 +347,8 @@ export interface StreamConfig {
333
347
  maxNewTokens?: number;
334
348
  /** Output sample rate */
335
349
  sampleRate?: number;
350
+ /** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per session. */
351
+ outputFormat?: string;
336
352
  /** Auto-flush timeout in milliseconds */
337
353
  flushTimeoutMs?: number;
338
354
  /** Maximum buffer length */
@@ -377,11 +393,19 @@ export interface StreamConfig {
377
393
  /**
378
394
  * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
379
395
  *
380
- * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
381
- * can also be used for per-segment speed control.
396
+ * Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
397
+ * whole request (no per-span control).
382
398
  * Range: [0.8, 1.2]. Default: 1.0.
383
399
  */
384
400
  speed?: number;
401
+ /**
402
+ * Per-request dictionary selection. Omit for the default behavior (all
403
+ * active dictionaries of the project apply, filtered by language). An
404
+ * empty array disables dictionaries for this request. A list of
405
+ * dictionary IDs applies exactly those dictionaries — including
406
+ * inactive ones — bypassing the language filter.
407
+ */
408
+ dictionaryIds?: number[];
385
409
  }
386
410
 
387
411
  /**
@@ -399,9 +423,18 @@ export interface StreamingSessionCallbacks {
399
423
  * Carries the segment index, total audio duration, and generation time.
400
424
  */
401
425
  onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
426
+ /**
427
+ * Called when the server marks the end of a turn's audio
428
+ * (`{"final": true, ...}` — sent after the last audio frame of every
429
+ * gracefully completed turn, right before `session_closed`). The
430
+ * ElevenLabs `isFinal` equivalent: once this fires, no further audio
431
+ * for the turn will arrive. Not fired on a barge-in cancel — that
432
+ * path fires {@link onInterrupted} instead.
433
+ */
434
+ onFinal?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
402
435
  /**
403
436
  * Called when the session is fully closed (after `session.close()`).
404
- * Equivalent to `onFinal` on the one-shot endpoint.
437
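+ * Fires right after {@link onFinal}. The `session_closed` frame behind it additionally carries usage (see {@link SessionUsage}); the callback itself receives only the three session totals.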
405
438
  */
406
439
  onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
407
440
  /** Called when the server begins generating audio for a text segment. */
@@ -419,14 +452,71 @@ export interface StreamingSessionCallbacks {
419
452
  onError?: (error: Error) => void;
420
453
  }
421
454
 
455
+ /**
456
+ * Per-session usage reported in the `session_closed` frame (KUG-1192).
457
+ *
458
+ * Lets you bill your own customers per conversation. `costCents` is the
459
+ * actual amount charged in **EUR cents**. When the charge could not be
460
+ * determined at session end (e.g. a transient billing error) `costCents` is
461
+ * `null` and `costAvailable` is `false` — never a misleading `0`.
462
+ * `audioSeconds` is always reported. On `/ws/tts/multi` usage is reported per
463
+ * context (per conversation) on each `context_closed` frame, not aggregated
464
+ * across contexts.
465
+ */
466
+ export interface SessionUsage {
467
+ /** Total audio generated this session, in seconds (the unit we bill on). */
468
+ audioSeconds: number;
469
+ /** Actual amount charged in EUR cents, or `null` if undetermined. */
470
+ costCents: number | null;
471
+ /** Currency of `costCents` (`"eur"`); present only when `costCents` is set. */
472
+ currency?: string;
473
+ /** Total input characters submitted this session, if reported. */
474
+ characters?: number;
475
+ /** Model that produced the audio, if reported. */
476
+ modelId?: string;
477
+ /** `true` when an authoritative charge was returned for this session. */
478
+ costAvailable: boolean;
479
+ }
480
+
481
+ /**
482
+ * Parse the raw `usage` object (or a legacy `session_closed` payload without
483
+ * one) into a typed {@link SessionUsage}. Returns `null` when no usage info
484
+ * is present (neither a numeric `audio_seconds` nor a numeric `total_audio_seconds`).
485
+ */
486
+ export function parseSessionUsage(
487
+ data: Record<string, unknown>,
488
+ ): SessionUsage | null {
489
+ const raw = data.usage as Record<string, unknown> | undefined;
490
+ const source = raw && typeof raw === 'object' ? raw : data;
491
+ const audioSeconds =
492
+ typeof source.audio_seconds === 'number'
493
+ ? source.audio_seconds
494
+ : typeof data.total_audio_seconds === 'number'
495
+ ? data.total_audio_seconds
496
+ : undefined;
497
+ if (audioSeconds === undefined) return null;
498
+ const costCents =
499
+ typeof source.cost_cents === 'number' ? source.cost_cents : null;
500
+ return {
501
+ audioSeconds,
502
+ costCents,
503
+ currency:
504
+ typeof source.currency === 'string' ? source.currency : undefined,
505
+ characters:
506
+ typeof source.characters === 'number' ? source.characters : undefined,
507
+ modelId: typeof source.model_id === 'string' ? source.model_id : undefined,
508
+ costAvailable: costCents !== null,
509
+ };
510
+ }
511
+
422
512
  /**
423
513
  * Audio chunk from streaming TTS.
424
514
  */
425
515
  export interface AudioChunk {
426
516
  /** Raw audio as base64: PCM16 by default, or G.711 µ-law / A-law as indicated by `encoding` */
427
517
  audio: string;
428
- /** Encoding format */
429
- encoding: 'pcm_s16le';
518
+ /** Encoding format. 'mulaw' / 'alaw' only when output_format requested G.711. */
519
+ encoding: 'pcm_s16le' | 'mulaw' | 'alaw';
430
520
  /** Chunk index */
431
521
  index: number;
432
522
  /** Sample rate */
@@ -453,6 +543,12 @@ export interface GenerationStats {
453
543
  rtf: number;
454
544
  /** Error message if any */
455
545
  error?: string;
546
+ /**
547
+ * Per-request usage (audio time + amount charged), for billing your own
548
+ * customers. Undefined when the server reports no usage. See
549
+ * {@link SessionUsage}.
550
+ */
551
+ usage?: SessionUsage;
456
552
  }
457
553
 
458
554
  /**
@@ -546,6 +642,8 @@ export interface MultiContextConfig {
546
642
  defaultVoiceId?: number;
547
643
  /** Output sample rate (default: 24000) */
548
644
  sampleRate?: number;
645
+ /** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per context. */
646
+ outputFormat?: string;
549
647
  /** CFG scale for generation (default: 2.0) */
550
648
  cfgScale?: number;
551
649
  /**
@@ -563,6 +661,14 @@ export interface MultiContextConfig {
563
661
  * the language, which adds ~60-150ms to time-to-first-audio.
564
662
  */
565
663
  language?: string;
664
+ /**
665
+ * Per-request dictionary selection. Omit for the default behavior (all
666
+ * active dictionaries of the project apply, filtered by language). An
667
+ * empty array disables dictionaries for this request. A list of
668
+ * dictionary IDs applies exactly those dictionaries — including
669
+ * inactive ones — bypassing the language filter.
670
+ */
671
+ dictionaryIds?: number[];
566
672
  /** Seconds before context auto-closes (default: 20.0) */
567
673
  inactivityTimeout?: number;
568
674
  }
@@ -601,8 +707,20 @@ export interface MultiContextCallbacks {
601
707
  onContextCreated?: (contextId: string) => void;
602
708
  /** Called when an audio chunk is received */
603
709
  onChunk?: (chunk: MultiContextAudioChunk) => void;
604
- /** Called when a context is closed */
605
- onContextClosed?: (contextId: string) => void;
710
+ /**
711
+ * Called when all audio admitted before a `{flush: true}` has been
712
+ * delivered for a context (`{"final": true, "context_id": ...}`), and
713
+ * once more before {@link onContextClosed} on a graceful close. The
714
+ * ElevenLabs multi-context `is_final` equivalent. Not fired on an
715
+ * immediate (barge-in) close.
716
+ */
717
+ onFinal?: (contextId: string) => void;
718
+ /**
719
+ * Called when a context is closed (terminal). `usage` carries this
720
+ * conversation's audio time + amount charged (undefined if not reported).
721
+ * See {@link SessionUsage}.
722
+ */
723
+ onContextClosed?: (contextId: string, usage?: SessionUsage) => void;
606
724
  /** Called when a context times out */
607
725
  onContextTimeout?: (contextId: string) => void;
608
726
  /** Called when session is closed */