npm - kugelaudio - Versions diffs - 0.7.0 → 0.8.0 - Mend

kugelaudio 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/src/types.ts CHANGED Viewed

@@ -258,7 +258,13 @@ export interface GenerateOptions {
   maxNewTokens?: number;
   /** Output sample rate (default: 24000) */
   sampleRate?: number;
-  /**
+  /**
+   * Combined codec+rate token, e.g. 'ulaw_8000' / 'alaw_8000' / 'pcm_8000'.
+   * Opt-in; when set it is authoritative and must not contradict sampleRate.
+   * Absent ⇒ legacy PCM16 at sampleRate.
+   */
+  outputFormat?: string;
+  /**
    * Enable text normalization (converts numbers, dates, etc. to spoken words).
    * When true, text will be normalized before TTS generation.
    * Default: true
@@ -286,8 +292,8 @@ export interface GenerateOptions {
   /**
    * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
    *
-   * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
-   * can also be used for per-segment speed control.
+   * Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
+   * whole request (no per-span control).
    * Range: [0.8, 1.2]. Default: 1.0.
    */
   speed?: number;
@@ -298,6 +304,14 @@ export interface GenerateOptions {
    * server treats the value as trusted once received.
    */
   projectId?: number;
+  /**
+   * Per-request dictionary selection. Omit for the default behavior (all
+   * active dictionaries of the project apply, filtered by language). An
+   * empty array disables dictionaries for this request. A list of
+   * dictionary IDs applies exactly those dictionaries — including
+   * inactive ones — bypassing the language filter.
+   */
+  dictionaryIds?: number[];
 }
 /**
@@ -333,6 +347,8 @@ export interface StreamConfig {
   maxNewTokens?: number;
   /** Output sample rate */
   sampleRate?: number;
+  /** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per session. */
+  outputFormat?: string;
   /** Auto-flush timeout in milliseconds */
   flushTimeoutMs?: number;
   /** Maximum buffer length */
@@ -377,11 +393,19 @@ export interface StreamConfig {
   /**
    * Playback speed multiplier (0.8 = slower, 1.0 = normal, 1.2 = faster).
    *
-   * Uses pitch-preserving time-stretching (WSOLA). Inline `<prosody rate="...">` tags
-   * can also be used for per-segment speed control.
+   * Uses pitch-preserving time-stretching (WSOLA); applies uniformly to the
+   * whole request (no per-span control).
    * Range: [0.8, 1.2]. Default: 1.0.
    */
   speed?: number;
+  /**
+   * Per-request dictionary selection. Omit for the default behavior (all
+   * active dictionaries of the project apply, filtered by language). An
+   * empty array disables dictionaries for this request. A list of
+   * dictionary IDs applies exactly those dictionaries — including
+   * inactive ones — bypassing the language filter.
+   */
+  dictionaryIds?: number[];
 }
 /**
@@ -399,9 +423,18 @@ export interface StreamingSessionCallbacks {
    * Carries the segment index, total audio duration, and generation time.
    */
   onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
+  /**
+   * Called when the server marks the end of a turn's audio
+   * (`{"final": true, ...}` — sent after the last audio frame of every
+   * gracefully completed turn, right before `session_closed`). The
+   * ElevenLabs `isFinal` equivalent: once this fires, no further audio
+   * for the turn will arrive. Not fired on a barge-in cancel — that
+   * path fires {@link onInterrupted} instead.
+   */
+  onFinal?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
   /**
    * Called when the session is fully closed (after `session.close()`).
-   * Equivalent to `onFinal` on the one-shot endpoint.
+   * Fires right after {@link onFinal} and additionally carries usage.
    */
   onSessionClosed?: (totalAudioSeconds: number, totalTextChunks: number, totalAudioChunks: number) => void;
   /** Called when the server begins generating audio for a text segment. */
@@ -419,14 +452,71 @@ export interface StreamingSessionCallbacks {
   onError?: (error: Error) => void;
 }
+/**
+ * Per-session usage reported in the `session_closed` frame (KUG-1192).
+ *
+ * Lets you bill your own customers per conversation. `costCents` is the
+ * actual amount charged in **EUR cents**. When the charge could not be
+ * determined at session end (e.g. a transient billing error), `costCents` is
+ * `null` and `costAvailable` is `false` — never a misleading `0`.
+ * `audioSeconds` is always reported. On `/ws/tts/multi`, usage is reported
+ * per context (per conversation) on each `context_closed` frame, not
+ * aggregated across contexts.
+ */
+export interface SessionUsage {
+  /** Total audio generated this session, in seconds (the billing unit). */
+  audioSeconds: number;
+  /** Amount actually charged, in EUR cents; `null` when it is undetermined. */
+  costCents: number | null;
+  /** Currency of `costCents` (`"eur"`); present only when `costCents` is set. */
+  currency?: string;
+  /** Total input characters submitted this session, if reported. */
+  characters?: number;
+  /** Model that produced the audio, if reported. */
+  modelId?: string;
+  /** `true` iff `costCents` is non-null, i.e. an authoritative charge was returned. */
+  costAvailable: boolean;
+}
+/**
+ * Builds a typed {@link SessionUsage} from a frame payload.
+ * @param data Frame payload; fields are read from `data.usage` when it is an object, else from `data` itself (legacy payload).
+ * @returns The usage, or `null` when neither the source's `audio_seconds` nor `data.total_audio_seconds` is a number.
+ */
+export function parseSessionUsage(
+  data: Record<string, unknown>,
+): SessionUsage | null {
+  const raw = data.usage as Record<string, unknown> | undefined;
+  const source = raw && typeof raw === 'object' ? raw : data; // no usage object: read the payload's top level
+  const audioSeconds =
+    typeof source.audio_seconds === 'number'
+      ? source.audio_seconds
+      : typeof data.total_audio_seconds === 'number' // fallback is always read from data, not from source
+        ? data.total_audio_seconds
+        : undefined;
+  if (audioSeconds === undefined) return null; // no numeric duration: report "no usage info"
+  const costCents =
+    typeof source.cost_cents === 'number' ? source.cost_cents : null; // missing or non-numeric cost becomes null, not 0
+  return {
+    audioSeconds,
+    costCents,
+    currency:
+      typeof source.currency === 'string' ? source.currency : undefined,
+    characters:
+      typeof source.characters === 'number' ? source.characters : undefined,
+    modelId: typeof source.model_id === 'string' ? source.model_id : undefined,
+    costAvailable: costCents !== null, // derived locally from costCents
+  };
+}
 /**
  * Audio chunk from streaming TTS.
  */
 export interface AudioChunk {
   /** Base64-encoded audio: PCM16, or G.711 when `encoding` is 'mulaw' / 'alaw' */
   audio: string;
-  /** Encoding format */
-  encoding: 'pcm_s16le';
+  /** Encoding format. 'mulaw' / 'alaw' only when output_format requested G.711. */
+  encoding: 'pcm_s16le' | 'mulaw' | 'alaw';
   /** Chunk index */
   index: number;
   /** Sample rate */
@@ -453,6 +543,12 @@ export interface GenerationStats {
   rtf: number;
   /** Error message if any */
   error?: string;
+  /**
+   * Per-request usage (audio time + amount charged), for billing your own
+   * customers. Undefined when the server reports no usage. See
+   * {@link SessionUsage}.
+   */
+  usage?: SessionUsage;
 }
 /**
@@ -546,6 +642,8 @@ export interface MultiContextConfig {
   defaultVoiceId?: number;
   /** Output sample rate (default: 24000) */
   sampleRate?: number;
+  /** Combined codec+rate token (e.g. 'ulaw_8000'); opt-in, set-once per context. */
+  outputFormat?: string;
   /** CFG scale for generation (default: 2.0) */
   cfgScale?: number;
   /**
@@ -563,6 +661,14 @@ export interface MultiContextConfig {
    * the language, which adds ~60-150ms to time-to-first-audio.
    */
   language?: string;
+  /**
+   * Per-request dictionary selection. Omit for the default behavior (all
+   * active dictionaries of the project apply, filtered by language). An
+   * empty array disables dictionaries for this request. A list of
+   * dictionary IDs applies exactly those dictionaries — including
+   * inactive ones — bypassing the language filter.
+   */
+  dictionaryIds?: number[];
   /** Seconds before context auto-closes (default: 20.0) */
   inactivityTimeout?: number;
 }
@@ -601,8 +707,20 @@ export interface MultiContextCallbacks {
   onContextCreated?: (contextId: string) => void;
   /** Called when an audio chunk is received */
   onChunk?: (chunk: MultiContextAudioChunk) => void;
-  /** Called when a context is closed */
-  onContextClosed?: (contextId: string) => void;
+  /**
+   * Called when all audio admitted before a `{flush: true}` has been
+   * delivered for a context (`{"final": true, "context_id": ...}`), and
+   * once more before {@link onContextClosed} on a graceful close. The
+   * ElevenLabs multi-context `is_final` equivalent. Not fired on an
+   * immediate (barge-in) close.
+   */
+  onFinal?: (contextId: string) => void;
+  /**
+   * Called when a context is closed (terminal). `usage` carries this
+   * conversation's audio time + amount charged (undefined if not reported).
+   * See {@link SessionUsage}.
+   */
+  onContextClosed?: (contextId: string, usage?: SessionUsage) => void;
   /** Called when a context times out */
   onContextTimeout?: (contextId: string) => void;
   /** Called when session is closed */