kugelaudio 0.2.2 → 0.2.3

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -41,6 +41,23 @@ interface Voice {
41
41
  isPublic: boolean;
42
42
  verified: boolean;
43
43
  }
44
+ /**
45
+ * Word-level timestamp from server-side forced alignment.
46
+ */
47
+ interface WordTimestamp {
48
+ /** The aligned word */
49
+ word: string;
50
+ /** Start time in milliseconds (relative to chunk/audio start) */
51
+ startMs: number;
52
+ /** End time in milliseconds (relative to chunk/audio start) */
53
+ endMs: number;
54
+ /** Start character offset in the original text */
55
+ charStart: number;
56
+ /** End character offset in the original text */
57
+ charEnd: number;
58
+ /** Alignment confidence score (0.0 - 1.0) */
59
+ score: number;
60
+ }
44
61
  /**
45
62
  * TTS generation request options.
46
63
  */
@@ -75,6 +92,12 @@ interface GenerateOptions {
75
92
  * el, uk, bg, tr, vi, ar, hi, zh, ja, ko
76
93
  */
77
94
  language?: string;
95
+ /**
96
+ * Request word-level timestamps alongside audio.
97
+ * When true, the server performs forced alignment and returns per-word timing boundaries.
98
+ * Default: false
99
+ */
100
+ wordTimestamps?: boolean;
78
101
  }
79
102
  /**
80
103
  * Streaming session configuration.
@@ -102,6 +125,11 @@ interface StreamConfig {
102
125
  * Specify to avoid ~150ms auto-detection latency.
103
126
  */
104
127
  language?: string;
128
+ /**
129
+ * Request word-level timestamps alongside audio.
130
+ * Default: false
131
+ */
132
+ wordTimestamps?: boolean;
105
133
  }
106
134
  /**
107
135
  * Audio chunk from streaming TTS.
@@ -155,6 +183,8 @@ interface AudioResponse {
155
183
  generationMs: number;
156
184
  /** Real-time factor */
157
185
  rtf: number;
186
+ /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
187
+ wordTimestamps: WordTimestamp[];
158
188
  }
159
189
  /**
160
190
  * Event callbacks for streaming.
@@ -162,6 +192,8 @@ interface AudioResponse {
162
192
  interface StreamCallbacks {
163
193
  /** Called when an audio chunk is received */
164
194
  onChunk?: (chunk: AudioChunk) => void;
195
+ /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
196
+ onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
165
197
  /** Called when generation is complete */
166
198
  onFinal?: (stats: GenerationStats) => void;
167
199
  /** Called on error */
@@ -609,4 +641,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
609
641
  */
610
642
  declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
611
643
 
612
- export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
644
+ export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, type WordTimestamp, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
package/dist/index.d.ts CHANGED
@@ -41,6 +41,23 @@ interface Voice {
41
41
  isPublic: boolean;
42
42
  verified: boolean;
43
43
  }
44
+ /**
45
+ * Word-level timestamp from server-side forced alignment.
46
+ */
47
+ interface WordTimestamp {
48
+ /** The aligned word */
49
+ word: string;
50
+ /** Start time in milliseconds (relative to chunk/audio start) */
51
+ startMs: number;
52
+ /** End time in milliseconds (relative to chunk/audio start) */
53
+ endMs: number;
54
+ /** Start character offset in the original text */
55
+ charStart: number;
56
+ /** End character offset in the original text */
57
+ charEnd: number;
58
+ /** Alignment confidence score (0.0 - 1.0) */
59
+ score: number;
60
+ }
44
61
  /**
45
62
  * TTS generation request options.
46
63
  */
@@ -75,6 +92,12 @@ interface GenerateOptions {
75
92
  * el, uk, bg, tr, vi, ar, hi, zh, ja, ko
76
93
  */
77
94
  language?: string;
95
+ /**
96
+ * Request word-level timestamps alongside audio.
97
+ * When true, the server performs forced alignment and returns per-word timing boundaries.
98
+ * Default: false
99
+ */
100
+ wordTimestamps?: boolean;
78
101
  }
79
102
  /**
80
103
  * Streaming session configuration.
@@ -102,6 +125,11 @@ interface StreamConfig {
102
125
  * Specify to avoid ~150ms auto-detection latency.
103
126
  */
104
127
  language?: string;
128
+ /**
129
+ * Request word-level timestamps alongside audio.
130
+ * Default: false
131
+ */
132
+ wordTimestamps?: boolean;
105
133
  }
106
134
  /**
107
135
  * Audio chunk from streaming TTS.
@@ -155,6 +183,8 @@ interface AudioResponse {
155
183
  generationMs: number;
156
184
  /** Real-time factor */
157
185
  rtf: number;
186
+ /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
187
+ wordTimestamps: WordTimestamp[];
158
188
  }
159
189
  /**
160
190
  * Event callbacks for streaming.
@@ -162,6 +192,8 @@ interface AudioResponse {
162
192
  interface StreamCallbacks {
163
193
  /** Called when an audio chunk is received */
164
194
  onChunk?: (chunk: AudioChunk) => void;
195
+ /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
196
+ onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
165
197
  /** Called when generation is complete */
166
198
  onFinal?: (stats: GenerationStats) => void;
167
199
  /** Called on error */
@@ -609,4 +641,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
609
641
  */
610
642
  declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
611
643
 
612
- export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
644
+ export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, type WordTimestamp, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
package/dist/index.js CHANGED
@@ -278,10 +278,14 @@ var TTSResource = class {
278
278
  async generate(options) {
279
279
  const chunks = [];
280
280
  let finalStats;
281
+ const allTimestamps = [];
281
282
  await this.stream(options, {
282
283
  onChunk: (chunk) => {
283
284
  chunks.push(base64ToArrayBuffer(chunk.audio));
284
285
  },
286
+ onWordTimestamps: (timestamps) => {
287
+ allTimestamps.push(...timestamps);
288
+ },
285
289
  onFinal: (stats) => {
286
290
  finalStats = stats;
287
291
  }
@@ -299,7 +303,8 @@ var TTSResource = class {
299
303
  samples: finalStats ? finalStats.totalSamples : totalLength / 2,
300
304
  durationMs: finalStats ? finalStats.durationMs : 0,
301
305
  generationMs: finalStats ? finalStats.generationMs : 0,
302
- rtf: finalStats ? finalStats.rtf : 0
306
+ rtf: finalStats ? finalStats.rtf : 0,
307
+ wordTimestamps: allTimestamps
303
308
  };
304
309
  }
305
310
  /**
@@ -393,6 +398,19 @@ var TTSResource = class {
393
398
  };
394
399
  pending.callbacks.onChunk?.(chunk);
395
400
  }
401
+ if (data.word_timestamps) {
402
+ const timestamps = data.word_timestamps.map(
403
+ (w) => ({
404
+ word: w.word,
405
+ startMs: w.start_ms,
406
+ endMs: w.end_ms,
407
+ charStart: w.char_start,
408
+ charEnd: w.char_end,
409
+ score: w.score ?? 1
410
+ })
411
+ );
412
+ pending.callbacks.onWordTimestamps?.(timestamps);
413
+ }
396
414
  } catch (e) {
397
415
  console.error("Failed to parse WebSocket message:", e);
398
416
  }
@@ -450,7 +468,8 @@ var TTSResource = class {
450
468
  max_new_tokens: options.maxNewTokens ?? 2048,
451
469
  sample_rate: options.sampleRate ?? 24e3,
452
470
  normalize: options.normalize ?? true,
453
- ...options.language && { language: options.language }
471
+ ...options.language && { language: options.language },
472
+ ...options.wordTimestamps && { word_timestamps: true }
454
473
  }));
455
474
  });
456
475
  }
@@ -471,7 +490,8 @@ var TTSResource = class {
471
490
  max_new_tokens: options.maxNewTokens ?? 2048,
472
491
  sample_rate: options.sampleRate ?? 24e3,
473
492
  normalize: options.normalize ?? true,
474
- ...options.language && { language: options.language }
493
+ ...options.language && { language: options.language },
494
+ ...options.wordTimestamps && { word_timestamps: true }
475
495
  }));
476
496
  };
477
497
  ws.onmessage = (event) => {
@@ -511,6 +531,19 @@ var TTSResource = class {
511
531
  };
512
532
  callbacks.onChunk?.(chunk);
513
533
  }
534
+ if (data.word_timestamps) {
535
+ const timestamps = data.word_timestamps.map(
536
+ (w) => ({
537
+ word: w.word,
538
+ startMs: w.start_ms,
539
+ endMs: w.end_ms,
540
+ charStart: w.char_start,
541
+ charEnd: w.char_end,
542
+ score: w.score ?? 1
543
+ })
544
+ );
545
+ callbacks.onWordTimestamps?.(timestamps);
546
+ }
514
547
  } catch (e) {
515
548
  console.error("Failed to parse WebSocket message:", e);
516
549
  }
package/dist/index.mjs CHANGED
@@ -249,10 +249,14 @@ var TTSResource = class {
249
249
  async generate(options) {
250
250
  const chunks = [];
251
251
  let finalStats;
252
+ const allTimestamps = [];
252
253
  await this.stream(options, {
253
254
  onChunk: (chunk) => {
254
255
  chunks.push(base64ToArrayBuffer(chunk.audio));
255
256
  },
257
+ onWordTimestamps: (timestamps) => {
258
+ allTimestamps.push(...timestamps);
259
+ },
256
260
  onFinal: (stats) => {
257
261
  finalStats = stats;
258
262
  }
@@ -270,7 +274,8 @@ var TTSResource = class {
270
274
  samples: finalStats ? finalStats.totalSamples : totalLength / 2,
271
275
  durationMs: finalStats ? finalStats.durationMs : 0,
272
276
  generationMs: finalStats ? finalStats.generationMs : 0,
273
- rtf: finalStats ? finalStats.rtf : 0
277
+ rtf: finalStats ? finalStats.rtf : 0,
278
+ wordTimestamps: allTimestamps
274
279
  };
275
280
  }
276
281
  /**
@@ -364,6 +369,19 @@ var TTSResource = class {
364
369
  };
365
370
  pending.callbacks.onChunk?.(chunk);
366
371
  }
372
+ if (data.word_timestamps) {
373
+ const timestamps = data.word_timestamps.map(
374
+ (w) => ({
375
+ word: w.word,
376
+ startMs: w.start_ms,
377
+ endMs: w.end_ms,
378
+ charStart: w.char_start,
379
+ charEnd: w.char_end,
380
+ score: w.score ?? 1
381
+ })
382
+ );
383
+ pending.callbacks.onWordTimestamps?.(timestamps);
384
+ }
367
385
  } catch (e) {
368
386
  console.error("Failed to parse WebSocket message:", e);
369
387
  }
@@ -421,7 +439,8 @@ var TTSResource = class {
421
439
  max_new_tokens: options.maxNewTokens ?? 2048,
422
440
  sample_rate: options.sampleRate ?? 24e3,
423
441
  normalize: options.normalize ?? true,
424
- ...options.language && { language: options.language }
442
+ ...options.language && { language: options.language },
443
+ ...options.wordTimestamps && { word_timestamps: true }
425
444
  }));
426
445
  });
427
446
  }
@@ -442,7 +461,8 @@ var TTSResource = class {
442
461
  max_new_tokens: options.maxNewTokens ?? 2048,
443
462
  sample_rate: options.sampleRate ?? 24e3,
444
463
  normalize: options.normalize ?? true,
445
- ...options.language && { language: options.language }
464
+ ...options.language && { language: options.language },
465
+ ...options.wordTimestamps && { word_timestamps: true }
446
466
  }));
447
467
  };
448
468
  ws.onmessage = (event) => {
@@ -482,6 +502,19 @@ var TTSResource = class {
482
502
  };
483
503
  callbacks.onChunk?.(chunk);
484
504
  }
505
+ if (data.word_timestamps) {
506
+ const timestamps = data.word_timestamps.map(
507
+ (w) => ({
508
+ word: w.word,
509
+ startMs: w.start_ms,
510
+ endMs: w.end_ms,
511
+ charStart: w.char_start,
512
+ charEnd: w.char_end,
513
+ score: w.score ?? 1
514
+ })
515
+ );
516
+ callbacks.onWordTimestamps?.(timestamps);
517
+ }
485
518
  } catch (e) {
486
519
  console.error("Failed to parse WebSocket message:", e);
487
520
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "kugelaudio",
3
- "version": "0.2.2",
3
+ "version": "0.2.3",
4
4
  "description": "Official JavaScript/TypeScript SDK for KugelAudio TTS API",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",
package/src/client.ts CHANGED
@@ -16,7 +16,8 @@ import type {
16
16
  KugelAudioOptions,
17
17
  Model,
18
18
  StreamCallbacks,
19
- Voice
19
+ Voice,
20
+ WordTimestamp
20
21
  } from './types';
21
22
  import { base64ToArrayBuffer } from './utils';
22
23
  import { getWebSocket } from './websocket';
@@ -172,11 +173,15 @@ class TTSResource {
172
173
  async generate(options: GenerateOptions): Promise<AudioResponse> {
173
174
  const chunks: ArrayBuffer[] = [];
174
175
  let finalStats: GenerationStats | undefined;
176
+ const allTimestamps: WordTimestamp[] = [];
175
177
 
176
178
  await this.stream(options, {
177
179
  onChunk: (chunk) => {
178
180
  chunks.push(base64ToArrayBuffer(chunk.audio));
179
181
  },
182
+ onWordTimestamps: (timestamps) => {
183
+ allTimestamps.push(...timestamps);
184
+ },
180
185
  onFinal: (stats) => {
181
186
  finalStats = stats;
182
187
  },
@@ -198,6 +203,7 @@ class TTSResource {
198
203
  durationMs: finalStats ? finalStats.durationMs : 0,
199
204
  generationMs: finalStats ? finalStats.generationMs : 0,
200
205
  rtf: finalStats ? finalStats.rtf : 0,
206
+ wordTimestamps: allTimestamps,
201
207
  };
202
208
  }
203
209
 
@@ -321,6 +327,20 @@ class TTSResource {
321
327
  };
322
328
  pending.callbacks.onChunk?.(chunk);
323
329
  }
330
+
331
+ if (data.word_timestamps) {
332
+ const timestamps: WordTimestamp[] = data.word_timestamps.map(
333
+ (w: Record<string, unknown>) => ({
334
+ word: w.word as string,
335
+ startMs: w.start_ms as number,
336
+ endMs: w.end_ms as number,
337
+ charStart: w.char_start as number,
338
+ charEnd: w.char_end as number,
339
+ score: (w.score as number) ?? 1.0,
340
+ })
341
+ );
342
+ pending.callbacks.onWordTimestamps?.(timestamps);
343
+ }
324
344
  } catch (e) {
325
345
  console.error('Failed to parse WebSocket message:', e);
326
346
  }
@@ -397,6 +417,7 @@ class TTSResource {
397
417
  sample_rate: options.sampleRate ?? 24000,
398
418
  normalize: options.normalize ?? true,
399
419
  ...(options.language && { language: options.language }),
420
+ ...(options.wordTimestamps && { word_timestamps: true }),
400
421
  }));
401
422
  });
402
423
  }
@@ -424,6 +445,7 @@ class TTSResource {
424
445
  sample_rate: options.sampleRate ?? 24000,
425
446
  normalize: options.normalize ?? true,
426
447
  ...(options.language && { language: options.language }),
448
+ ...(options.wordTimestamps && { word_timestamps: true }),
427
449
  }));
428
450
  };
429
451
 
@@ -472,6 +494,20 @@ class TTSResource {
472
494
  };
473
495
  callbacks.onChunk?.(chunk);
474
496
  }
497
+
498
+ if (data.word_timestamps) {
499
+ const timestamps: WordTimestamp[] = data.word_timestamps.map(
500
+ (w: Record<string, unknown>) => ({
501
+ word: w.word as string,
502
+ startMs: w.start_ms as number,
503
+ endMs: w.end_ms as number,
504
+ charStart: w.char_start as number,
505
+ charEnd: w.char_end as number,
506
+ score: (w.score as number) ?? 1.0,
507
+ })
508
+ );
509
+ callbacks.onWordTimestamps?.(timestamps);
510
+ }
475
511
  } catch (e) {
476
512
  console.error('Failed to parse WebSocket message:', e);
477
513
  }
package/src/index.ts CHANGED
@@ -59,7 +59,8 @@ export type {
59
59
  Voice,
60
60
  VoiceAge,
61
61
  VoiceCategory,
62
- VoiceSex
62
+ VoiceSex,
63
+ WordTimestamp
63
64
  } from './types';
64
65
 
65
66
  // Errors
package/src/types.ts CHANGED
@@ -47,6 +47,24 @@ export interface Voice {
47
47
  verified: boolean;
48
48
  }
49
49
 
50
+ /**
51
+ * Word-level timestamp from server-side forced alignment.
52
+ */
53
+ export interface WordTimestamp {
54
+ /** The aligned word */
55
+ word: string;
56
+ /** Start time in milliseconds (relative to chunk/audio start) */
57
+ startMs: number;
58
+ /** End time in milliseconds (relative to chunk/audio start) */
59
+ endMs: number;
60
+ /** Start character offset in the original text */
61
+ charStart: number;
62
+ /** End character offset in the original text */
63
+ charEnd: number;
64
+ /** Alignment confidence score (0.0 - 1.0) */
65
+ score: number;
66
+ }
67
+
50
68
  /**
51
69
  * TTS generation request options.
52
70
  */
@@ -81,6 +99,12 @@ export interface GenerateOptions {
81
99
  * el, uk, bg, tr, vi, ar, hi, zh, ja, ko
82
100
  */
83
101
  language?: string;
102
+ /**
103
+ * Request word-level timestamps alongside audio.
104
+ * When true, the server performs forced alignment and returns per-word timing boundaries.
105
+ * Default: false
106
+ */
107
+ wordTimestamps?: boolean;
84
108
  }
85
109
 
86
110
  /**
@@ -109,6 +133,11 @@ export interface StreamConfig {
109
133
  * Specify to avoid ~150ms auto-detection latency.
110
134
  */
111
135
  language?: string;
136
+ /**
137
+ * Request word-level timestamps alongside audio.
138
+ * Default: false
139
+ */
140
+ wordTimestamps?: boolean;
112
141
  }
113
142
 
114
143
  /**
@@ -165,6 +194,8 @@ export interface AudioResponse {
165
194
  generationMs: number;
166
195
  /** Real-time factor */
167
196
  rtf: number;
197
+ /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
198
+ wordTimestamps: WordTimestamp[];
168
199
  }
169
200
 
170
201
  /**
@@ -173,6 +204,8 @@ export interface AudioResponse {
173
204
  export interface StreamCallbacks {
174
205
  /** Called when an audio chunk is received */
175
206
  onChunk?: (chunk: AudioChunk) => void;
207
+ /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
208
+ onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
176
209
  /** Called when generation is complete */
177
210
  onFinal?: (stats: GenerationStats) => void;
178
211
  /** Called on error */