npm - kugelaudio - Versions diffs - 0.2.2 → 0.2.3 - Mend

kugelaudio 0.2.2 → 0.2.3

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (8)

package/dist/index.d.mts CHANGED Viewed

@@ -41,6 +41,23 @@ interface Voice {
     isPublic: boolean;
     verified: boolean;
 }
+/**
+ * Word-level timestamp from server-side forced alignment.
+ */
+interface WordTimestamp {
+    /** The aligned word */
+    word: string;
+    /** Start time in milliseconds (relative to chunk/audio start) */
+    startMs: number;
+    /** End time in milliseconds (relative to chunk/audio start) */
+    endMs: number;
+    /** Start character offset in the original text */
+    charStart: number;
+    /** End character offset in the original text */
+    charEnd: number;
+    /** Alignment confidence score (0.0 - 1.0) */
+    score: number;
+}
 /**
  * TTS generation request options.
  */
@@ -75,6 +92,12 @@ interface GenerateOptions {
      *            el, uk, bg, tr, vi, ar, hi, zh, ja, ko
      */
     language?: string;
+    /**
+     * Request word-level timestamps alongside audio.
+     * When true, the server performs forced alignment and returns per-word timing boundaries.
+     * Default: false
+     */
+    wordTimestamps?: boolean;
 }
 /**
  * Streaming session configuration.
@@ -102,6 +125,11 @@ interface StreamConfig {
      * Specify to avoid ~150ms auto-detection latency.
      */
     language?: string;
+    /**
+     * Request word-level timestamps alongside audio.
+     * Default: false
+     */
+    wordTimestamps?: boolean;
 }
 /**
  * Audio chunk from streaming TTS.
@@ -155,6 +183,8 @@ interface AudioResponse {
     generationMs: number;
     /** Real-time factor */
     rtf: number;
+    /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
+    wordTimestamps: WordTimestamp[];
 }
 /**
  * Event callbacks for streaming.
@@ -162,6 +192,8 @@ interface AudioResponse {
 interface StreamCallbacks {
     /** Called when an audio chunk is received */
     onChunk?: (chunk: AudioChunk) => void;
+    /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
+    onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
     /** Called when generation is complete */
     onFinal?: (stats: GenerationStats) => void;
     /** Called on error */
@@ -609,4 +641,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
  */
 declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
-export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
+export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, type WordTimestamp, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };

package/dist/index.d.ts CHANGED Viewed

@@ -41,6 +41,23 @@ interface Voice {
     isPublic: boolean;
     verified: boolean;
 }
+/**
+ * Word-level timestamp from server-side forced alignment.
+ */
+interface WordTimestamp {
+    /** The aligned word */
+    word: string;
+    /** Start time in milliseconds (relative to chunk/audio start) */
+    startMs: number;
+    /** End time in milliseconds (relative to chunk/audio start) */
+    endMs: number;
+    /** Start character offset in the original text */
+    charStart: number;
+    /** End character offset in the original text */
+    charEnd: number;
+    /** Alignment confidence score (0.0 - 1.0) */
+    score: number;
+}
 /**
  * TTS generation request options.
  */
@@ -75,6 +92,12 @@ interface GenerateOptions {
      *            el, uk, bg, tr, vi, ar, hi, zh, ja, ko
      */
     language?: string;
+    /**
+     * Request word-level timestamps alongside audio.
+     * When true, the server performs forced alignment and returns per-word timing boundaries.
+     * Default: false
+     */
+    wordTimestamps?: boolean;
 }
 /**
  * Streaming session configuration.
@@ -102,6 +125,11 @@ interface StreamConfig {
      * Specify to avoid ~150ms auto-detection latency.
      */
     language?: string;
+    /**
+     * Request word-level timestamps alongside audio.
+     * Default: false
+     */
+    wordTimestamps?: boolean;
 }
 /**
  * Audio chunk from streaming TTS.
@@ -155,6 +183,8 @@ interface AudioResponse {
     generationMs: number;
     /** Real-time factor */
     rtf: number;
+    /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
+    wordTimestamps: WordTimestamp[];
 }
 /**
  * Event callbacks for streaming.
@@ -162,6 +192,8 @@ interface AudioResponse {
 interface StreamCallbacks {
     /** Called when an audio chunk is received */
     onChunk?: (chunk: AudioChunk) => void;
+    /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
+    onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
     /** Called when generation is complete */
     onFinal?: (stats: GenerationStats) => void;
     /** Called on error */
@@ -609,4 +641,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
  */
 declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
-export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
+export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, type WordTimestamp, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };

package/dist/index.js CHANGED Viewed

@@ -278,10 +278,14 @@ var TTSResource = class {
   async generate(options) {
     const chunks = [];
     let finalStats;
+    const allTimestamps = [];
     await this.stream(options, {
       onChunk: (chunk) => {
         chunks.push(base64ToArrayBuffer(chunk.audio));
       },
+      onWordTimestamps: (timestamps) => {
+        allTimestamps.push(...timestamps);
+      },
       onFinal: (stats) => {
         finalStats = stats;
       }
@@ -299,7 +303,8 @@ var TTSResource = class {
       samples: finalStats ? finalStats.totalSamples : totalLength / 2,
       durationMs: finalStats ? finalStats.durationMs : 0,
       generationMs: finalStats ? finalStats.generationMs : 0,
-      rtf: finalStats ? finalStats.rtf : 0
+      rtf: finalStats ? finalStats.rtf : 0,
+      wordTimestamps: allTimestamps
     };
   }
   /**
@@ -393,6 +398,19 @@ var TTSResource = class {
           };
           pending.callbacks.onChunk?.(chunk);
         }
+        if (data.word_timestamps) {
+          const timestamps = data.word_timestamps.map(
+            (w) => ({
+              word: w.word,
+              startMs: w.start_ms,
+              endMs: w.end_ms,
+              charStart: w.char_start,
+              charEnd: w.char_end,
+              score: w.score ?? 1
+            })
+          );
+          pending.callbacks.onWordTimestamps?.(timestamps);
+        }
       } catch (e) {
         console.error("Failed to parse WebSocket message:", e);
       }
@@ -450,7 +468,8 @@ var TTSResource = class {
         max_new_tokens: options.maxNewTokens ?? 2048,
         sample_rate: options.sampleRate ?? 24e3,
         normalize: options.normalize ?? true,
-        ...options.language && { language: options.language }
+        ...options.language && { language: options.language },
+        ...options.wordTimestamps && { word_timestamps: true }
       }));
     });
   }
@@ -471,7 +490,8 @@ var TTSResource = class {
           max_new_tokens: options.maxNewTokens ?? 2048,
           sample_rate: options.sampleRate ?? 24e3,
           normalize: options.normalize ?? true,
-          ...options.language && { language: options.language }
+          ...options.language && { language: options.language },
+          ...options.wordTimestamps && { word_timestamps: true }
         }));
       };
       ws.onmessage = (event) => {
@@ -511,6 +531,19 @@ var TTSResource = class {
             };
             callbacks.onChunk?.(chunk);
           }
+          if (data.word_timestamps) {
+            const timestamps = data.word_timestamps.map(
+              (w) => ({
+                word: w.word,
+                startMs: w.start_ms,
+                endMs: w.end_ms,
+                charStart: w.char_start,
+                charEnd: w.char_end,
+                score: w.score ?? 1
+              })
+            );
+            callbacks.onWordTimestamps?.(timestamps);
+          }
         } catch (e) {
           console.error("Failed to parse WebSocket message:", e);
         }

package/dist/index.mjs CHANGED Viewed

@@ -249,10 +249,14 @@ var TTSResource = class {
   async generate(options) {
     const chunks = [];
     let finalStats;
+    const allTimestamps = [];
     await this.stream(options, {
       onChunk: (chunk) => {
         chunks.push(base64ToArrayBuffer(chunk.audio));
       },
+      onWordTimestamps: (timestamps) => {
+        allTimestamps.push(...timestamps);
+      },
       onFinal: (stats) => {
         finalStats = stats;
       }
@@ -270,7 +274,8 @@ var TTSResource = class {
       samples: finalStats ? finalStats.totalSamples : totalLength / 2,
       durationMs: finalStats ? finalStats.durationMs : 0,
       generationMs: finalStats ? finalStats.generationMs : 0,
-      rtf: finalStats ? finalStats.rtf : 0
+      rtf: finalStats ? finalStats.rtf : 0,
+      wordTimestamps: allTimestamps
     };
   }
   /**
@@ -364,6 +369,19 @@ var TTSResource = class {
           };
           pending.callbacks.onChunk?.(chunk);
         }
+        if (data.word_timestamps) {
+          const timestamps = data.word_timestamps.map(
+            (w) => ({
+              word: w.word,
+              startMs: w.start_ms,
+              endMs: w.end_ms,
+              charStart: w.char_start,
+              charEnd: w.char_end,
+              score: w.score ?? 1
+            })
+          );
+          pending.callbacks.onWordTimestamps?.(timestamps);
+        }
       } catch (e) {
         console.error("Failed to parse WebSocket message:", e);
       }
@@ -421,7 +439,8 @@ var TTSResource = class {
         max_new_tokens: options.maxNewTokens ?? 2048,
         sample_rate: options.sampleRate ?? 24e3,
         normalize: options.normalize ?? true,
-        ...options.language && { language: options.language }
+        ...options.language && { language: options.language },
+        ...options.wordTimestamps && { word_timestamps: true }
       }));
     });
   }
@@ -442,7 +461,8 @@ var TTSResource = class {
           max_new_tokens: options.maxNewTokens ?? 2048,
           sample_rate: options.sampleRate ?? 24e3,
           normalize: options.normalize ?? true,
-          ...options.language && { language: options.language }
+          ...options.language && { language: options.language },
+          ...options.wordTimestamps && { word_timestamps: true }
         }));
       };
       ws.onmessage = (event) => {
@@ -482,6 +502,19 @@ var TTSResource = class {
             };
             callbacks.onChunk?.(chunk);
           }
+          if (data.word_timestamps) {
+            const timestamps = data.word_timestamps.map(
+              (w) => ({
+                word: w.word,
+                startMs: w.start_ms,
+                endMs: w.end_ms,
+                charStart: w.char_start,
+                charEnd: w.char_end,
+                score: w.score ?? 1
+              })
+            );
+            callbacks.onWordTimestamps?.(timestamps);
+          }
         } catch (e) {
           console.error("Failed to parse WebSocket message:", e);
         }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "kugelaudio",
-  "version": "0.2.2",
+  "version": "0.2.3",
   "description": "Official JavaScript/TypeScript SDK for KugelAudio TTS API",
   "main": "dist/index.js",
   "module": "dist/index.mjs",

package/src/client.ts CHANGED Viewed

@@ -16,7 +16,8 @@ import type {
     KugelAudioOptions,
     Model,
     StreamCallbacks,
-    Voice
+    Voice,
+    WordTimestamp
 } from './types';
 import { base64ToArrayBuffer } from './utils';
 import { getWebSocket } from './websocket';
@@ -172,11 +173,15 @@ class TTSResource {
   async generate(options: GenerateOptions): Promise<AudioResponse> {
     const chunks: ArrayBuffer[] = [];
     let finalStats: GenerationStats | undefined;
+    const allTimestamps: WordTimestamp[] = [];
     await this.stream(options, {
       onChunk: (chunk) => {
         chunks.push(base64ToArrayBuffer(chunk.audio));
       },
+      onWordTimestamps: (timestamps) => {
+        allTimestamps.push(...timestamps);
+      },
       onFinal: (stats) => {
         finalStats = stats;
       },
@@ -198,6 +203,7 @@ class TTSResource {
       durationMs: finalStats ? finalStats.durationMs : 0,
       generationMs: finalStats ? finalStats.generationMs : 0,
       rtf: finalStats ? finalStats.rtf : 0,
+      wordTimestamps: allTimestamps,
     };
   }
@@ -321,6 +327,20 @@ class TTSResource {
           };
           pending.callbacks.onChunk?.(chunk);
         }
+        if (data.word_timestamps) {
+          const timestamps: WordTimestamp[] = data.word_timestamps.map(
+            (w: Record<string, unknown>) => ({
+              word: w.word as string,
+              startMs: w.start_ms as number,
+              endMs: w.end_ms as number,
+              charStart: w.char_start as number,
+              charEnd: w.char_end as number,
+              score: (w.score as number) ?? 1.0,
+            })
+          );
+          pending.callbacks.onWordTimestamps?.(timestamps);
+        }
       } catch (e) {
         console.error('Failed to parse WebSocket message:', e);
       }
@@ -397,6 +417,7 @@ class TTSResource {
         sample_rate: options.sampleRate ?? 24000,
         normalize: options.normalize ?? true,
         ...(options.language && { language: options.language }),
+        ...(options.wordTimestamps && { word_timestamps: true }),
       }));
     });
   }
@@ -424,6 +445,7 @@ class TTSResource {
           sample_rate: options.sampleRate ?? 24000,
           normalize: options.normalize ?? true,
           ...(options.language && { language: options.language }),
+          ...(options.wordTimestamps && { word_timestamps: true }),
         }));
       };
@@ -472,6 +494,20 @@ class TTSResource {
             };
             callbacks.onChunk?.(chunk);
           }
+          if (data.word_timestamps) {
+            const timestamps: WordTimestamp[] = data.word_timestamps.map(
+              (w: Record<string, unknown>) => ({
+                word: w.word as string,
+                startMs: w.start_ms as number,
+                endMs: w.end_ms as number,
+                charStart: w.char_start as number,
+                charEnd: w.char_end as number,
+                score: (w.score as number) ?? 1.0,
+              })
+            );
+            callbacks.onWordTimestamps?.(timestamps);
+          }
         } catch (e) {
           console.error('Failed to parse WebSocket message:', e);
         }

package/src/index.ts CHANGED Viewed

@@ -59,7 +59,8 @@ export type {
     Voice,
     VoiceAge,
     VoiceCategory,
-    VoiceSex
+    VoiceSex,
+    WordTimestamp
 } from './types';
 // Errors

package/src/types.ts CHANGED Viewed

@@ -47,6 +47,24 @@ export interface Voice {
   verified: boolean;
 }
+/**
+ * Word-level timestamp from server-side forced alignment.
+ */
+export interface WordTimestamp {
+  /** The aligned word */
+  word: string;
+  /** Start time in milliseconds (relative to chunk/audio start) */
+  startMs: number;
+  /** End time in milliseconds (relative to chunk/audio start) */
+  endMs: number;
+  /** Start character offset in the original text */
+  charStart: number;
+  /** End character offset in the original text */
+  charEnd: number;
+  /** Alignment confidence score (0.0 - 1.0) */
+  score: number;
+}
 /**
  * TTS generation request options.
  */
@@ -81,6 +99,12 @@ export interface GenerateOptions {
    *            el, uk, bg, tr, vi, ar, hi, zh, ja, ko
    */
   language?: string;
+  /**
+   * Request word-level timestamps alongside audio.
+   * When true, the server performs forced alignment and returns per-word timing boundaries.
+   * Default: false
+   */
+  wordTimestamps?: boolean;
 }
 /**
@@ -109,6 +133,11 @@ export interface StreamConfig {
    * Specify to avoid ~150ms auto-detection latency.
    */
   language?: string;
+  /**
+   * Request word-level timestamps alongside audio.
+   * Default: false
+   */
+  wordTimestamps?: boolean;
 }
 /**
@@ -165,6 +194,8 @@ export interface AudioResponse {
   generationMs: number;
   /** Real-time factor */
   rtf: number;
+  /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
+  wordTimestamps: WordTimestamp[];
 }
 /**
@@ -173,6 +204,8 @@ export interface AudioResponse {
 export interface StreamCallbacks {
   /** Called when an audio chunk is received */
   onChunk?: (chunk: AudioChunk) => void;
+  /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
+  onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
   /** Called when generation is complete */
   onFinal?: (stats: GenerationStats) => void;
   /** Called on error */