kugelaudio 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +33 -1
- package/dist/index.d.ts +33 -1
- package/dist/index.js +36 -3
- package/dist/index.mjs +36 -3
- package/package.json +1 -1
- package/src/client.ts +37 -1
- package/src/index.ts +2 -1
- package/src/types.ts +33 -0
package/dist/index.d.mts
CHANGED
|
@@ -41,6 +41,23 @@ interface Voice {
|
|
|
41
41
|
isPublic: boolean;
|
|
42
42
|
verified: boolean;
|
|
43
43
|
}
|
|
44
|
+
/**
|
|
45
|
+
* Word-level timestamp from server-side forced alignment.
|
|
46
|
+
*/
|
|
47
|
+
interface WordTimestamp {
|
|
48
|
+
/** The aligned word */
|
|
49
|
+
word: string;
|
|
50
|
+
/** Start time in milliseconds (relative to chunk/audio start) */
|
|
51
|
+
startMs: number;
|
|
52
|
+
/** End time in milliseconds (relative to chunk/audio start) */
|
|
53
|
+
endMs: number;
|
|
54
|
+
/** Start character offset in the original text */
|
|
55
|
+
charStart: number;
|
|
56
|
+
/** End character offset in the original text */
|
|
57
|
+
charEnd: number;
|
|
58
|
+
/** Alignment confidence score (0.0 - 1.0) */
|
|
59
|
+
score: number;
|
|
60
|
+
}
|
|
44
61
|
/**
|
|
45
62
|
* TTS generation request options.
|
|
46
63
|
*/
|
|
@@ -75,6 +92,12 @@ interface GenerateOptions {
|
|
|
75
92
|
* el, uk, bg, tr, vi, ar, hi, zh, ja, ko
|
|
76
93
|
*/
|
|
77
94
|
language?: string;
|
|
95
|
+
/**
|
|
96
|
+
* Request word-level timestamps alongside audio.
|
|
97
|
+
* When true, the server performs forced alignment and returns per-word timing boundaries.
|
|
98
|
+
* Default: false
|
|
99
|
+
*/
|
|
100
|
+
wordTimestamps?: boolean;
|
|
78
101
|
}
|
|
79
102
|
/**
|
|
80
103
|
* Streaming session configuration.
|
|
@@ -102,6 +125,11 @@ interface StreamConfig {
|
|
|
102
125
|
* Specify to avoid ~150ms auto-detection latency.
|
|
103
126
|
*/
|
|
104
127
|
language?: string;
|
|
128
|
+
/**
|
|
129
|
+
* Request word-level timestamps alongside audio.
|
|
130
|
+
* Default: false
|
|
131
|
+
*/
|
|
132
|
+
wordTimestamps?: boolean;
|
|
105
133
|
}
|
|
106
134
|
/**
|
|
107
135
|
* Audio chunk from streaming TTS.
|
|
@@ -155,6 +183,8 @@ interface AudioResponse {
|
|
|
155
183
|
generationMs: number;
|
|
156
184
|
/** Real-time factor */
|
|
157
185
|
rtf: number;
|
|
186
|
+
/** Per-word timing boundaries (populated when `wordTimestamps: true`) */
|
|
187
|
+
wordTimestamps: WordTimestamp[];
|
|
158
188
|
}
|
|
159
189
|
/**
|
|
160
190
|
* Event callbacks for streaming.
|
|
@@ -162,6 +192,8 @@ interface AudioResponse {
|
|
|
162
192
|
interface StreamCallbacks {
|
|
163
193
|
/** Called when an audio chunk is received */
|
|
164
194
|
onChunk?: (chunk: AudioChunk) => void;
|
|
195
|
+
/** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
|
|
196
|
+
onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
|
|
165
197
|
/** Called when generation is complete */
|
|
166
198
|
onFinal?: (stats: GenerationStats) => void;
|
|
167
199
|
/** Called on error */
|
|
@@ -609,4 +641,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
|
|
|
609
641
|
*/
|
|
610
642
|
declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
|
|
611
643
|
|
|
612
|
-
export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
|
|
644
|
+
export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, type WordTimestamp, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
|
package/dist/index.d.ts
CHANGED
|
@@ -41,6 +41,23 @@ interface Voice {
|
|
|
41
41
|
isPublic: boolean;
|
|
42
42
|
verified: boolean;
|
|
43
43
|
}
|
|
44
|
+
/**
|
|
45
|
+
* Word-level timestamp from server-side forced alignment.
|
|
46
|
+
*/
|
|
47
|
+
interface WordTimestamp {
|
|
48
|
+
/** The aligned word */
|
|
49
|
+
word: string;
|
|
50
|
+
/** Start time in milliseconds (relative to chunk/audio start) */
|
|
51
|
+
startMs: number;
|
|
52
|
+
/** End time in milliseconds (relative to chunk/audio start) */
|
|
53
|
+
endMs: number;
|
|
54
|
+
/** Start character offset in the original text */
|
|
55
|
+
charStart: number;
|
|
56
|
+
/** End character offset in the original text */
|
|
57
|
+
charEnd: number;
|
|
58
|
+
/** Alignment confidence score (0.0 - 1.0) */
|
|
59
|
+
score: number;
|
|
60
|
+
}
|
|
44
61
|
/**
|
|
45
62
|
* TTS generation request options.
|
|
46
63
|
*/
|
|
@@ -75,6 +92,12 @@ interface GenerateOptions {
|
|
|
75
92
|
* el, uk, bg, tr, vi, ar, hi, zh, ja, ko
|
|
76
93
|
*/
|
|
77
94
|
language?: string;
|
|
95
|
+
/**
|
|
96
|
+
* Request word-level timestamps alongside audio.
|
|
97
|
+
* When true, the server performs forced alignment and returns per-word timing boundaries.
|
|
98
|
+
* Default: false
|
|
99
|
+
*/
|
|
100
|
+
wordTimestamps?: boolean;
|
|
78
101
|
}
|
|
79
102
|
/**
|
|
80
103
|
* Streaming session configuration.
|
|
@@ -102,6 +125,11 @@ interface StreamConfig {
|
|
|
102
125
|
* Specify to avoid ~150ms auto-detection latency.
|
|
103
126
|
*/
|
|
104
127
|
language?: string;
|
|
128
|
+
/**
|
|
129
|
+
* Request word-level timestamps alongside audio.
|
|
130
|
+
* Default: false
|
|
131
|
+
*/
|
|
132
|
+
wordTimestamps?: boolean;
|
|
105
133
|
}
|
|
106
134
|
/**
|
|
107
135
|
* Audio chunk from streaming TTS.
|
|
@@ -155,6 +183,8 @@ interface AudioResponse {
|
|
|
155
183
|
generationMs: number;
|
|
156
184
|
/** Real-time factor */
|
|
157
185
|
rtf: number;
|
|
186
|
+
/** Per-word timing boundaries (populated when `wordTimestamps: true`) */
|
|
187
|
+
wordTimestamps: WordTimestamp[];
|
|
158
188
|
}
|
|
159
189
|
/**
|
|
160
190
|
* Event callbacks for streaming.
|
|
@@ -162,6 +192,8 @@ interface AudioResponse {
|
|
|
162
192
|
interface StreamCallbacks {
|
|
163
193
|
/** Called when an audio chunk is received */
|
|
164
194
|
onChunk?: (chunk: AudioChunk) => void;
|
|
195
|
+
/** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
|
|
196
|
+
onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
|
|
165
197
|
/** Called when generation is complete */
|
|
166
198
|
onFinal?: (stats: GenerationStats) => void;
|
|
167
199
|
/** Called on error */
|
|
@@ -609,4 +641,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
|
|
|
609
641
|
*/
|
|
610
642
|
declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
|
|
611
643
|
|
|
612
|
-
export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
|
|
644
|
+
export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, type WordTimestamp, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
|
package/dist/index.js
CHANGED
|
@@ -278,10 +278,14 @@ var TTSResource = class {
|
|
|
278
278
|
async generate(options) {
|
|
279
279
|
const chunks = [];
|
|
280
280
|
let finalStats;
|
|
281
|
+
const allTimestamps = [];
|
|
281
282
|
await this.stream(options, {
|
|
282
283
|
onChunk: (chunk) => {
|
|
283
284
|
chunks.push(base64ToArrayBuffer(chunk.audio));
|
|
284
285
|
},
|
|
286
|
+
onWordTimestamps: (timestamps) => {
|
|
287
|
+
allTimestamps.push(...timestamps);
|
|
288
|
+
},
|
|
285
289
|
onFinal: (stats) => {
|
|
286
290
|
finalStats = stats;
|
|
287
291
|
}
|
|
@@ -299,7 +303,8 @@ var TTSResource = class {
|
|
|
299
303
|
samples: finalStats ? finalStats.totalSamples : totalLength / 2,
|
|
300
304
|
durationMs: finalStats ? finalStats.durationMs : 0,
|
|
301
305
|
generationMs: finalStats ? finalStats.generationMs : 0,
|
|
302
|
-
rtf: finalStats ? finalStats.rtf : 0
|
|
306
|
+
rtf: finalStats ? finalStats.rtf : 0,
|
|
307
|
+
wordTimestamps: allTimestamps
|
|
303
308
|
};
|
|
304
309
|
}
|
|
305
310
|
/**
|
|
@@ -393,6 +398,19 @@ var TTSResource = class {
|
|
|
393
398
|
};
|
|
394
399
|
pending.callbacks.onChunk?.(chunk);
|
|
395
400
|
}
|
|
401
|
+
if (data.word_timestamps) {
|
|
402
|
+
const timestamps = data.word_timestamps.map(
|
|
403
|
+
(w) => ({
|
|
404
|
+
word: w.word,
|
|
405
|
+
startMs: w.start_ms,
|
|
406
|
+
endMs: w.end_ms,
|
|
407
|
+
charStart: w.char_start,
|
|
408
|
+
charEnd: w.char_end,
|
|
409
|
+
score: w.score ?? 1
|
|
410
|
+
})
|
|
411
|
+
);
|
|
412
|
+
pending.callbacks.onWordTimestamps?.(timestamps);
|
|
413
|
+
}
|
|
396
414
|
} catch (e) {
|
|
397
415
|
console.error("Failed to parse WebSocket message:", e);
|
|
398
416
|
}
|
|
@@ -450,7 +468,8 @@ var TTSResource = class {
|
|
|
450
468
|
max_new_tokens: options.maxNewTokens ?? 2048,
|
|
451
469
|
sample_rate: options.sampleRate ?? 24e3,
|
|
452
470
|
normalize: options.normalize ?? true,
|
|
453
|
-
...options.language && { language: options.language }
|
|
471
|
+
...options.language && { language: options.language },
|
|
472
|
+
...options.wordTimestamps && { word_timestamps: true }
|
|
454
473
|
}));
|
|
455
474
|
});
|
|
456
475
|
}
|
|
@@ -471,7 +490,8 @@ var TTSResource = class {
|
|
|
471
490
|
max_new_tokens: options.maxNewTokens ?? 2048,
|
|
472
491
|
sample_rate: options.sampleRate ?? 24e3,
|
|
473
492
|
normalize: options.normalize ?? true,
|
|
474
|
-
...options.language && { language: options.language }
|
|
493
|
+
...options.language && { language: options.language },
|
|
494
|
+
...options.wordTimestamps && { word_timestamps: true }
|
|
475
495
|
}));
|
|
476
496
|
};
|
|
477
497
|
ws.onmessage = (event) => {
|
|
@@ -511,6 +531,19 @@ var TTSResource = class {
|
|
|
511
531
|
};
|
|
512
532
|
callbacks.onChunk?.(chunk);
|
|
513
533
|
}
|
|
534
|
+
if (data.word_timestamps) {
|
|
535
|
+
const timestamps = data.word_timestamps.map(
|
|
536
|
+
(w) => ({
|
|
537
|
+
word: w.word,
|
|
538
|
+
startMs: w.start_ms,
|
|
539
|
+
endMs: w.end_ms,
|
|
540
|
+
charStart: w.char_start,
|
|
541
|
+
charEnd: w.char_end,
|
|
542
|
+
score: w.score ?? 1
|
|
543
|
+
})
|
|
544
|
+
);
|
|
545
|
+
callbacks.onWordTimestamps?.(timestamps);
|
|
546
|
+
}
|
|
514
547
|
} catch (e) {
|
|
515
548
|
console.error("Failed to parse WebSocket message:", e);
|
|
516
549
|
}
|
package/dist/index.mjs
CHANGED
|
@@ -249,10 +249,14 @@ var TTSResource = class {
|
|
|
249
249
|
async generate(options) {
|
|
250
250
|
const chunks = [];
|
|
251
251
|
let finalStats;
|
|
252
|
+
const allTimestamps = [];
|
|
252
253
|
await this.stream(options, {
|
|
253
254
|
onChunk: (chunk) => {
|
|
254
255
|
chunks.push(base64ToArrayBuffer(chunk.audio));
|
|
255
256
|
},
|
|
257
|
+
onWordTimestamps: (timestamps) => {
|
|
258
|
+
allTimestamps.push(...timestamps);
|
|
259
|
+
},
|
|
256
260
|
onFinal: (stats) => {
|
|
257
261
|
finalStats = stats;
|
|
258
262
|
}
|
|
@@ -270,7 +274,8 @@ var TTSResource = class {
|
|
|
270
274
|
samples: finalStats ? finalStats.totalSamples : totalLength / 2,
|
|
271
275
|
durationMs: finalStats ? finalStats.durationMs : 0,
|
|
272
276
|
generationMs: finalStats ? finalStats.generationMs : 0,
|
|
273
|
-
rtf: finalStats ? finalStats.rtf : 0
|
|
277
|
+
rtf: finalStats ? finalStats.rtf : 0,
|
|
278
|
+
wordTimestamps: allTimestamps
|
|
274
279
|
};
|
|
275
280
|
}
|
|
276
281
|
/**
|
|
@@ -364,6 +369,19 @@ var TTSResource = class {
|
|
|
364
369
|
};
|
|
365
370
|
pending.callbacks.onChunk?.(chunk);
|
|
366
371
|
}
|
|
372
|
+
if (data.word_timestamps) {
|
|
373
|
+
const timestamps = data.word_timestamps.map(
|
|
374
|
+
(w) => ({
|
|
375
|
+
word: w.word,
|
|
376
|
+
startMs: w.start_ms,
|
|
377
|
+
endMs: w.end_ms,
|
|
378
|
+
charStart: w.char_start,
|
|
379
|
+
charEnd: w.char_end,
|
|
380
|
+
score: w.score ?? 1
|
|
381
|
+
})
|
|
382
|
+
);
|
|
383
|
+
pending.callbacks.onWordTimestamps?.(timestamps);
|
|
384
|
+
}
|
|
367
385
|
} catch (e) {
|
|
368
386
|
console.error("Failed to parse WebSocket message:", e);
|
|
369
387
|
}
|
|
@@ -421,7 +439,8 @@ var TTSResource = class {
|
|
|
421
439
|
max_new_tokens: options.maxNewTokens ?? 2048,
|
|
422
440
|
sample_rate: options.sampleRate ?? 24e3,
|
|
423
441
|
normalize: options.normalize ?? true,
|
|
424
|
-
...options.language && { language: options.language }
|
|
442
|
+
...options.language && { language: options.language },
|
|
443
|
+
...options.wordTimestamps && { word_timestamps: true }
|
|
425
444
|
}));
|
|
426
445
|
});
|
|
427
446
|
}
|
|
@@ -442,7 +461,8 @@ var TTSResource = class {
|
|
|
442
461
|
max_new_tokens: options.maxNewTokens ?? 2048,
|
|
443
462
|
sample_rate: options.sampleRate ?? 24e3,
|
|
444
463
|
normalize: options.normalize ?? true,
|
|
445
|
-
...options.language && { language: options.language }
|
|
464
|
+
...options.language && { language: options.language },
|
|
465
|
+
...options.wordTimestamps && { word_timestamps: true }
|
|
446
466
|
}));
|
|
447
467
|
};
|
|
448
468
|
ws.onmessage = (event) => {
|
|
@@ -482,6 +502,19 @@ var TTSResource = class {
|
|
|
482
502
|
};
|
|
483
503
|
callbacks.onChunk?.(chunk);
|
|
484
504
|
}
|
|
505
|
+
if (data.word_timestamps) {
|
|
506
|
+
const timestamps = data.word_timestamps.map(
|
|
507
|
+
(w) => ({
|
|
508
|
+
word: w.word,
|
|
509
|
+
startMs: w.start_ms,
|
|
510
|
+
endMs: w.end_ms,
|
|
511
|
+
charStart: w.char_start,
|
|
512
|
+
charEnd: w.char_end,
|
|
513
|
+
score: w.score ?? 1
|
|
514
|
+
})
|
|
515
|
+
);
|
|
516
|
+
callbacks.onWordTimestamps?.(timestamps);
|
|
517
|
+
}
|
|
485
518
|
} catch (e) {
|
|
486
519
|
console.error("Failed to parse WebSocket message:", e);
|
|
487
520
|
}
|
package/package.json
CHANGED
package/src/client.ts
CHANGED
|
@@ -16,7 +16,8 @@ import type {
|
|
|
16
16
|
KugelAudioOptions,
|
|
17
17
|
Model,
|
|
18
18
|
StreamCallbacks,
|
|
19
|
-
Voice
|
|
19
|
+
Voice,
|
|
20
|
+
WordTimestamp
|
|
20
21
|
} from './types';
|
|
21
22
|
import { base64ToArrayBuffer } from './utils';
|
|
22
23
|
import { getWebSocket } from './websocket';
|
|
@@ -172,11 +173,15 @@ class TTSResource {
|
|
|
172
173
|
async generate(options: GenerateOptions): Promise<AudioResponse> {
|
|
173
174
|
const chunks: ArrayBuffer[] = [];
|
|
174
175
|
let finalStats: GenerationStats | undefined;
|
|
176
|
+
const allTimestamps: WordTimestamp[] = [];
|
|
175
177
|
|
|
176
178
|
await this.stream(options, {
|
|
177
179
|
onChunk: (chunk) => {
|
|
178
180
|
chunks.push(base64ToArrayBuffer(chunk.audio));
|
|
179
181
|
},
|
|
182
|
+
onWordTimestamps: (timestamps) => {
|
|
183
|
+
allTimestamps.push(...timestamps);
|
|
184
|
+
},
|
|
180
185
|
onFinal: (stats) => {
|
|
181
186
|
finalStats = stats;
|
|
182
187
|
},
|
|
@@ -198,6 +203,7 @@ class TTSResource {
|
|
|
198
203
|
durationMs: finalStats ? finalStats.durationMs : 0,
|
|
199
204
|
generationMs: finalStats ? finalStats.generationMs : 0,
|
|
200
205
|
rtf: finalStats ? finalStats.rtf : 0,
|
|
206
|
+
wordTimestamps: allTimestamps,
|
|
201
207
|
};
|
|
202
208
|
}
|
|
203
209
|
|
|
@@ -321,6 +327,20 @@ class TTSResource {
|
|
|
321
327
|
};
|
|
322
328
|
pending.callbacks.onChunk?.(chunk);
|
|
323
329
|
}
|
|
330
|
+
|
|
331
|
+
if (data.word_timestamps) {
|
|
332
|
+
const timestamps: WordTimestamp[] = data.word_timestamps.map(
|
|
333
|
+
(w: Record<string, unknown>) => ({
|
|
334
|
+
word: w.word as string,
|
|
335
|
+
startMs: w.start_ms as number,
|
|
336
|
+
endMs: w.end_ms as number,
|
|
337
|
+
charStart: w.char_start as number,
|
|
338
|
+
charEnd: w.char_end as number,
|
|
339
|
+
score: (w.score as number) ?? 1.0,
|
|
340
|
+
})
|
|
341
|
+
);
|
|
342
|
+
pending.callbacks.onWordTimestamps?.(timestamps);
|
|
343
|
+
}
|
|
324
344
|
} catch (e) {
|
|
325
345
|
console.error('Failed to parse WebSocket message:', e);
|
|
326
346
|
}
|
|
@@ -397,6 +417,7 @@ class TTSResource {
|
|
|
397
417
|
sample_rate: options.sampleRate ?? 24000,
|
|
398
418
|
normalize: options.normalize ?? true,
|
|
399
419
|
...(options.language && { language: options.language }),
|
|
420
|
+
...(options.wordTimestamps && { word_timestamps: true }),
|
|
400
421
|
}));
|
|
401
422
|
});
|
|
402
423
|
}
|
|
@@ -424,6 +445,7 @@ class TTSResource {
|
|
|
424
445
|
sample_rate: options.sampleRate ?? 24000,
|
|
425
446
|
normalize: options.normalize ?? true,
|
|
426
447
|
...(options.language && { language: options.language }),
|
|
448
|
+
...(options.wordTimestamps && { word_timestamps: true }),
|
|
427
449
|
}));
|
|
428
450
|
};
|
|
429
451
|
|
|
@@ -472,6 +494,20 @@ class TTSResource {
|
|
|
472
494
|
};
|
|
473
495
|
callbacks.onChunk?.(chunk);
|
|
474
496
|
}
|
|
497
|
+
|
|
498
|
+
if (data.word_timestamps) {
|
|
499
|
+
const timestamps: WordTimestamp[] = data.word_timestamps.map(
|
|
500
|
+
(w: Record<string, unknown>) => ({
|
|
501
|
+
word: w.word as string,
|
|
502
|
+
startMs: w.start_ms as number,
|
|
503
|
+
endMs: w.end_ms as number,
|
|
504
|
+
charStart: w.char_start as number,
|
|
505
|
+
charEnd: w.char_end as number,
|
|
506
|
+
score: (w.score as number) ?? 1.0,
|
|
507
|
+
})
|
|
508
|
+
);
|
|
509
|
+
callbacks.onWordTimestamps?.(timestamps);
|
|
510
|
+
}
|
|
475
511
|
} catch (e) {
|
|
476
512
|
console.error('Failed to parse WebSocket message:', e);
|
|
477
513
|
}
|
package/src/index.ts
CHANGED
package/src/types.ts
CHANGED
|
@@ -47,6 +47,24 @@ export interface Voice {
|
|
|
47
47
|
verified: boolean;
|
|
48
48
|
}
|
|
49
49
|
|
|
50
|
+
/**
|
|
51
|
+
* Word-level timestamp from server-side forced alignment.
|
|
52
|
+
*/
|
|
53
|
+
export interface WordTimestamp {
|
|
54
|
+
/** The aligned word */
|
|
55
|
+
word: string;
|
|
56
|
+
/** Start time in milliseconds (relative to chunk/audio start) */
|
|
57
|
+
startMs: number;
|
|
58
|
+
/** End time in milliseconds (relative to chunk/audio start) */
|
|
59
|
+
endMs: number;
|
|
60
|
+
/** Start character offset in the original text */
|
|
61
|
+
charStart: number;
|
|
62
|
+
/** End character offset in the original text */
|
|
63
|
+
charEnd: number;
|
|
64
|
+
/** Alignment confidence score (0.0 - 1.0) */
|
|
65
|
+
score: number;
|
|
66
|
+
}
|
|
67
|
+
|
|
50
68
|
/**
|
|
51
69
|
* TTS generation request options.
|
|
52
70
|
*/
|
|
@@ -81,6 +99,12 @@ export interface GenerateOptions {
|
|
|
81
99
|
* el, uk, bg, tr, vi, ar, hi, zh, ja, ko
|
|
82
100
|
*/
|
|
83
101
|
language?: string;
|
|
102
|
+
/**
|
|
103
|
+
* Request word-level timestamps alongside audio.
|
|
104
|
+
* When true, the server performs forced alignment and returns per-word timing boundaries.
|
|
105
|
+
* Default: false
|
|
106
|
+
*/
|
|
107
|
+
wordTimestamps?: boolean;
|
|
84
108
|
}
|
|
85
109
|
|
|
86
110
|
/**
|
|
@@ -109,6 +133,11 @@ export interface StreamConfig {
|
|
|
109
133
|
* Specify to avoid ~150ms auto-detection latency.
|
|
110
134
|
*/
|
|
111
135
|
language?: string;
|
|
136
|
+
/**
|
|
137
|
+
* Request word-level timestamps alongside audio.
|
|
138
|
+
* Default: false
|
|
139
|
+
*/
|
|
140
|
+
wordTimestamps?: boolean;
|
|
112
141
|
}
|
|
113
142
|
|
|
114
143
|
/**
|
|
@@ -165,6 +194,8 @@ export interface AudioResponse {
|
|
|
165
194
|
generationMs: number;
|
|
166
195
|
/** Real-time factor */
|
|
167
196
|
rtf: number;
|
|
197
|
+
/** Per-word timing boundaries (populated when `wordTimestamps: true`) */
|
|
198
|
+
wordTimestamps: WordTimestamp[];
|
|
168
199
|
}
|
|
169
200
|
|
|
170
201
|
/**
|
|
@@ -173,6 +204,8 @@ export interface AudioResponse {
|
|
|
173
204
|
export interface StreamCallbacks {
|
|
174
205
|
/** Called when an audio chunk is received */
|
|
175
206
|
onChunk?: (chunk: AudioChunk) => void;
|
|
207
|
+
/** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
|
|
208
|
+
onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
|
|
176
209
|
/** Called when generation is complete */
|
|
177
210
|
onFinal?: (stats: GenerationStats) => void;
|
|
178
211
|
/** Called on error */
|