@inworld/tts 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of @inworld/tts might be problematic. Click here for more details.
- package/CHANGELOG.md +9 -0
- package/LICENSE +21 -0
- package/README.md +332 -0
- package/dist/index.cjs +1580 -0
- package/package.json +77 -0
- package/src/client.js +929 -0
- package/src/config.js +135 -0
- package/src/encoding.js +23 -0
- package/src/errors.js +31 -0
- package/src/index.d.ts +363 -0
- package/src/index.js +149 -0
- package/src/player.browser.js +53 -0
- package/src/player.js +143 -0
- package/src/voice.js +498 -0
- package/src/write-file.browser.js +7 -0
- package/src/write-file.js +11 -0
package/src/client.js
ADDED
|
@@ -0,0 +1,929 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTTP client for generate and stream (aligned with Inworld TTS API).
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import {
|
|
6
|
+
BITS_PER_SAMPLE,
|
|
7
|
+
CHANNELS,
|
|
8
|
+
MAX_CHUNK_SIZE,
|
|
9
|
+
MIN_CHUNK_SIZE,
|
|
10
|
+
SAMPLE_RATE,
|
|
11
|
+
SPLICE_BREAK_SECONDS,
|
|
12
|
+
MAX_CONCURRENT_REQUESTS,
|
|
13
|
+
GENERATE_MAX_CHARS,
|
|
14
|
+
STREAM_MAX_CHARS,
|
|
15
|
+
CHARS_PER_SECOND,
|
|
16
|
+
CJK_CHAR_WEIGHT,
|
|
17
|
+
getTimeoutSignal,
|
|
18
|
+
debugLog,
|
|
19
|
+
withRetry,
|
|
20
|
+
} from './config.js';
|
|
21
|
+
import { ApiError, MissingApiKeyError, NetworkError } from './errors.js';
|
|
22
|
+
import { writeFileSafe } from './write-file.js';
|
|
23
|
+
import { playFile, play as playAudio } from './player.js';
|
|
24
|
+
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
// Browser-compatible byte utilities
|
|
27
|
+
// (Buffer extends Uint8Array in Node, so returning Uint8Array is backward-compatible)
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
/**
 * Decode a base64 string into raw bytes.
 * Uses Buffer when available (Node); falls back to atob() in browsers.
 * Buffer extends Uint8Array, so callers can treat the result uniformly.
 * @param {string} b64 - Base64-encoded payload.
 * @returns {Uint8Array} Decoded bytes (a Buffer in Node).
 */
function base64ToBytes(b64) {
  if (typeof Buffer === 'undefined') {
    const decoded = atob(b64);
    const bytes = new Uint8Array(decoded.length);
    for (let pos = 0; pos < decoded.length; pos++) {
      bytes[pos] = decoded.charCodeAt(pos);
    }
    return bytes;
  }
  return Buffer.from(b64, 'base64');
}
|
|
37
|
+
|
|
38
|
+
/**
 * Concatenate a list of byte arrays into a single contiguous array.
 * Uses Buffer.concat in Node; falls back to a manual copy in browsers.
 * @param {Uint8Array[]} arrays - Parts to join, in order.
 * @returns {Uint8Array} One array containing all bytes (a Buffer in Node).
 */
function concatBytes(arrays) {
  if (typeof Buffer === 'undefined') {
    const totalLength = arrays.reduce((sum, part) => sum + part.length, 0);
    const joined = new Uint8Array(totalLength);
    let cursor = 0;
    for (const part of arrays) {
      joined.set(part, cursor);
      cursor += part.length;
    }
    return joined;
  }
  return Buffer.concat(arrays);
}
|
|
46
|
+
|
|
47
|
+
// ---------------------------------------------------------------------------
|
|
48
|
+
// Effective length estimation (SSML <break> + CJK weight)
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
/**
 * Count CJK characters (Chinese, Japanese Kanji/Kana, Korean Hangul).
 * @param {string} text
 * @returns {number} Number of CJK code units; 0 for null/undefined input.
 */
function _countCjk(text) {
  const matches = text?.match(/[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]/g);
  return matches ? matches.length : 0;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Return text length with SSML <break> durations and CJK weight factored in.
 * Without this, chunks stuffed with <break time="Xs"/> tags look short in chars
 * but produce huge audio, potentially exceeding API response size limits. CJK
 * characters also produce ~3x more audio per character than Latin text.
 * @param {string} text
 * @param {number} [charsPerSecond] - Conversion rate from pause seconds to "characters".
 * @returns {number} Estimated effective character count.
 */
function estimateEffectiveLength(text, charsPerSecond = CHARS_PER_SECOND) {
  // Sum up explicit pause durations declared via <break time="..."/> tags.
  let breakSeconds = 0;
  for (const m of text.matchAll(/<break\s+time="([\d.]+)(m?s)"\s*\/?>/gi)) {
    const value = parseFloat(m[1]);
    if (isNaN(value)) continue;
    breakSeconds += m[2].toLowerCase() === 'ms' ? value / 1000 : value;
  }
  // Count the remaining text, weighting CJK characters more heavily.
  const stripped = text.replace(/<break\s[^>]*\/?>/gi, '');
  const cjk = _countCjk(stripped);
  const weighted = (stripped.length - cjk) + Math.floor(cjk * CJK_CHAR_WEIGHT);
  return weighted + Math.floor(breakSeconds * charsPerSecond);
}
|
|
82
|
+
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
// Long text chunking (per api-examples)
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
|
|
87
|
+
/**
 * Find a good position to split `text`, preferring (in order): a paragraph
 * break, a line break, a sentence end at or after `minPos`, the last sentence
 * end anywhere before `maxPos`, the last space, and finally a hard cut at
 * `maxPos`.
 * @param {string} text
 * @param {number} minPos - Preferred earliest split position.
 * @param {number} maxPos - Exclusive upper bound of the split window.
 * @returns {number} Index one past the chosen break.
 */
function findBreakPoint(text, minPos, maxPos) {
  const window = text.slice(0, maxPos);

  // Paragraph break first, then single newline, searched from minPos onward.
  for (const [token, skip] of [['\n\n', 2], ['\n', 1]]) {
    const at = window.indexOf(token, minPos);
    if (at !== -1) return at + skip;
  }

  // Sentence-ending punctuation (Latin or CJK) followed by whitespace/end.
  const sentenceEnd = /[.!?]["']?\s+|[。!?]["']?\s*|[.!?。!?]["']?$/g;
  let hit;
  while ((hit = sentenceEnd.exec(window)) !== null) {
    if (hit.index >= minPos) return hit.index + hit[0].length;
  }

  // No sentence end past minPos: fall back to the final one anywhere.
  sentenceEnd.lastIndex = 0;
  let lastEnd = -1;
  while ((hit = sentenceEnd.exec(window)) !== null) {
    lastEnd = hit.index + hit[0].length;
  }
  if (lastEnd > 0) return lastEnd;

  // Last resort: break after the final space, or hard-cut at maxPos.
  const lastSpace = window.lastIndexOf(' ');
  return lastSpace > 0 ? lastSpace + 1 : maxPos;
}
|
|
117
|
+
|
|
118
|
+
/**
 * Split long text into chunks whose *effective* length (per
 * estimateEffectiveLength: raw chars + SSML <break> time + CJK weighting)
 * fits within MAX_CHUNK_SIZE. Chunks are trimmed; empty chunks are dropped.
 * @param {string} text
 * @returns {string[]} Ordered, trimmed chunks.
 */
function chunkText(text) {
  const chunks = [];
  let pos = 0;
  while (pos < text.length) {
    const rest = text.slice(pos);
    // Remainder fits in one chunk: take it all and stop.
    if (estimateEffectiveLength(rest) <= MAX_CHUNK_SIZE) {
      const s = rest.trim();
      if (s) chunks.push(s);
      break;
    }
    // When <break> tags or CJK inflate effective length, scale down the search window
    const candidate = rest.slice(0, MAX_CHUNK_SIZE);
    const effLen = estimateEffectiveLength(candidate);
    let effectiveMax, effectiveMin;
    if (effLen > MAX_CHUNK_SIZE) {
      // Shrink min/max proportionally so the effective length of the chosen
      // slice lands near MAX_CHUNK_SIZE rather than its raw char count.
      const scale = MAX_CHUNK_SIZE / effLen;
      effectiveMax = Math.max(1, Math.floor(candidate.length * scale));
      effectiveMin = Math.max(1, Math.floor(MIN_CHUNK_SIZE * scale));
    } else {
      effectiveMax = MAX_CHUNK_SIZE;
      effectiveMin = MIN_CHUNK_SIZE;
    }
    // findBreakPoint always returns a positive index (worst case: maxPos),
    // so pos strictly advances and the loop terminates.
    const end = findBreakPoint(rest, effectiveMin, effectiveMax);
    const s = rest.slice(0, end).trim();
    if (s) chunks.push(s);
    pos += end;
  }
  return chunks;
}
|
|
147
|
+
|
|
148
|
+
// ---------------------------------------------------------------------------
|
|
149
|
+
// Audio merge for long-text generate
|
|
150
|
+
// ---------------------------------------------------------------------------
|
|
151
|
+
|
|
152
|
+
/**
 * Strip a 44-byte WAV (RIFF) header if one is present, returning the raw
 * PCM payload; data without a RIFF magic is returned untouched.
 * @param {Uint8Array} audioData
 * @returns {Uint8Array} A subarray view past the header, or the input itself.
 */
function extractRawPcm(audioData) {
  // ASCII bytes of the 'RIFF' magic.
  const RIFF = [0x52, 0x49, 0x46, 0x46];
  const hasWavHeader =
    audioData.length > 44 && RIFF.every((byte, i) => audioData[i] === byte);
  return hasWavHeader ? audioData.subarray(44) : audioData;
}
|
|
161
|
+
|
|
162
|
+
/**
 * Build a buffer of LINEAR16 silence (zero-valued samples).
 * The byte count is truncated to an even number so 16-bit sample
 * boundaries stay aligned.
 * @param {number} seconds - Duration of silence.
 * @param {number} [sampleRate]
 * @returns {Uint8Array} Zero-initialized PCM bytes.
 */
function createSilence(seconds, sampleRate = SAMPLE_RATE) {
  const bytesPerSecond = sampleRate * (BITS_PER_SAMPLE / 8) * CHANNELS;
  const byteCount = Math.floor(bytesPerSecond * seconds);
  return new Uint8Array(byteCount - (byteCount % 2));
}
|
|
168
|
+
|
|
169
|
+
/**
 * Join per-chunk LINEAR16 buffers into one PCM stream, stripping any WAV
 * headers and inserting `spliceBreak` seconds of silence between chunks.
 * A single buffer is returned as-is (header included).
 * @param {Uint8Array[]} buffers
 * @param {number} [spliceBreak] - Silence inserted between chunks, seconds.
 * @param {number} [sampleRate]
 * @returns {Uint8Array} Merged raw PCM bytes.
 */
function mergeLinear16Buffers(buffers, spliceBreak = SPLICE_BREAK_SECONDS, sampleRate = SAMPLE_RATE) {
  if (buffers.length === 0) return new Uint8Array(0);
  if (buffers.length === 1) return buffers[0];
  const gap = spliceBreak > 0 ? createSilence(spliceBreak, sampleRate) : null;
  const parts = [];
  buffers.forEach((buf, i) => {
    if (i > 0 && gap) parts.push(gap);
    parts.push(extractRawPcm(buf));
  });
  return concatBytes(parts);
}
|
|
181
|
+
|
|
182
|
+
/**
 * Build a canonical 44-byte WAV header for 16-bit PCM data.
 * @param {number} dataLen - Byte length of the PCM payload that follows.
 * @param {number} [sampleRate]
 * @returns {Uint8Array} The 44-byte RIFF/WAVE header.
 */
function wavHeader(dataLen, sampleRate = SAMPLE_RATE) {
  const bytesPerSample = BITS_PER_SAMPLE / 8;
  const byteRate = sampleRate * bytesPerSample * CHANNELS;
  const blockAlign = CHANNELS * bytesPerSample;
  const header = new DataView(new ArrayBuffer(44));
  // Write an ASCII tag byte-by-byte at the given offset.
  const writeTag = (offset, tag) => {
    for (let i = 0; i < tag.length; i++) header.setUint8(offset + i, tag.charCodeAt(i));
  };
  // RIFF chunk descriptor.
  writeTag(0, 'RIFF');
  header.setUint32(4, 36 + dataLen, true);
  writeTag(8, 'WAVE');
  // fmt sub-chunk: PCM, channel count, rates, alignment.
  writeTag(12, 'fmt ');
  header.setUint32(16, 16, true);  // fmt chunk size
  header.setUint16(20, 1, true);   // audio format 1 = PCM
  header.setUint16(22, CHANNELS, true);
  header.setUint32(24, sampleRate, true);
  header.setUint32(28, byteRate, true);
  header.setUint16(32, blockAlign, true);
  header.setUint16(34, BITS_PER_SAMPLE, true);
  // data sub-chunk.
  writeTag(36, 'data');
  header.setUint32(40, dataLen, true);
  return new Uint8Array(header.buffer);
}
|
|
205
|
+
|
|
206
|
+
// ---------------------------------------------------------------------------
|
|
207
|
+
// outputFile extension check
|
|
208
|
+
// ---------------------------------------------------------------------------
|
|
209
|
+
|
|
210
|
+
// Audio encodings accepted by this client (validated in parseOptions).
const VALID_ENCODINGS = ['MP3', 'OGG_OPUS', 'LINEAR16', 'WAV', 'PCM', 'FLAC', 'ALAW', 'MULAW'];

// File extensions conventionally paired with each encoding. Used only by
// warnExtensionMismatch to flag a likely outputFile mistake; never enforced.
const ENCODING_EXTENSIONS = {
  MP3: ['.mp3'],
  OGG_OPUS: ['.ogg', '.opus'],
  LINEAR16: ['.wav'],
  WAV: ['.wav'],
  PCM: ['.pcm'],
  FLAC: ['.flac'],
  ALAW: ['.alaw'],
  MULAW: ['.mulaw'],
};
|
|
222
|
+
|
|
223
|
+
/**
 * Warn (advisory only) when outputFile's extension does not match the
 * requested audio encoding; the file is written regardless.
 * @param {string} outputFile - Destination path supplied by the caller.
 * @param {string} encoding - One of VALID_ENCODINGS.
 */
function warnExtensionMismatch(outputFile, encoding) {
  const expected = ENCODING_EXTENSIONS[encoding];
  if (!expected) return;
  // Bug fix: lastIndexOf('.') returns -1 when there is no dot, and
  // slice(-1) would then yield the file's last character as a bogus
  // "extension". Treat a dotless path as having no extension instead.
  const dot = outputFile.lastIndexOf('.');
  const ext = dot === -1 ? '' : outputFile.slice(dot).toLowerCase();
  if (!expected.includes(ext)) {
    console.warn(
      `[inworld-tts] Warning: outputFile "${outputFile}" has extension "${ext}" ` +
      `but encoding is "${encoding}" (expected ${expected.join(' or ')}). ` +
      `The file will contain ${encoding} audio data.`
    );
  }
}
|
|
234
|
+
|
|
235
|
+
// ---------------------------------------------------------------------------
|
|
236
|
+
// Option parsing and request body
|
|
237
|
+
// ---------------------------------------------------------------------------
|
|
238
|
+
|
|
239
|
+
// Option keys parseOptions understands; anything else triggers an
// "unknown option" warning and is ignored.
const KNOWN_OPTIONS = new Set([
  'text', 'voice', 'model', 'encoding', 'sampleRate',
  'bitRate', 'speakingRate', 'temperature', 'timestampType', 'timestampTransportStrategy',
  'applyTextNormalization', 'outputFile', 'play',
]);

// Legacy option names (old API-style keys) mapped to their current names;
// parseOptions warns when it sees one.
const RENAMED_OPTIONS = {
  voiceId: 'voice',
  modelId: 'model',
  audioEncoding: 'encoding',
  sampleRateHertz: 'sampleRate',
};
|
|
251
|
+
|
|
252
|
+
/**
 * Validate and normalize user-supplied TTS options into a flat internal
 * shape consumed by buildBody.
 * @param {Object} options - Caller options (camelCase keys).
 * @param {string} defaultModelId - Model used when options.model is absent.
 * @returns {Object} Normalized options (text, voice, model, encoding, ...).
 * @throws {ApiError} When options is not an object, voice is missing, or
 *   encoding is not in VALID_ENCODINGS.
 */
function parseOptions(options, defaultModelId) {
  if (!options || typeof options !== 'object') throw new ApiError('options object is required');

  // Warn on unknown keys. Bug fix: the previous version warned that legacy
  // keys (voiceId, modelId, ...) were renamed but silently DROPPED their
  // values, so old call sites still failed. Now legacy values are applied
  // as fallbacks when the new-style key is absent.
  const normalized = { ...options };
  for (const key of Object.keys(options)) {
    if (RENAMED_OPTIONS[key]) {
      console.warn(`[inworld-tts] Warning: "${key}" has been renamed to "${RENAMED_OPTIONS[key]}". Please update your code.`);
      normalized[RENAMED_OPTIONS[key]] ??= options[key];
    } else if (!KNOWN_OPTIONS.has(key)) {
      console.warn(`[inworld-tts] Warning: unknown option "${key}" will be ignored. Use camelCase keys (e.g. voice, encoding).`);
    }
  }

  if (!normalized.voice) throw new ApiError('options.voice is required');

  // Compute encoding once (the previous version recomputed it for sampleRate).
  const encoding = (normalized.encoding || 'MP3').toUpperCase();
  if (!VALID_ENCODINGS.includes(encoding)) {
    throw new ApiError(`encoding "${encoding}" is not supported. Valid values: ${VALID_ENCODINGS.join(', ')}`);
  }

  let sampleRate = normalized.sampleRate ?? 48000;
  if ((encoding === 'ALAW' || encoding === 'MULAW') && normalized.sampleRate == null) {
    console.warn(`[inworld-tts] ${encoding} only supports sampleRate 8000 — defaulting to 8000.`);
    sampleRate = 8000;
  }

  return {
    text: normalized.text || '',
    voice: normalized.voice,
    model: normalized.model || defaultModelId,
    encoding,
    sampleRate,
    bitRate: normalized.bitRate ?? 128000,
    speakingRate: normalized.speakingRate ?? 1.0,
    temperature: normalized.temperature ?? 1.0,
    timestampType: normalized.timestampType,
    timestampTransportStrategy: normalized.timestampTransportStrategy,
    applyTextNormalization: normalized.applyTextNormalization === 'none' ? 'APPLY_TEXT_NORMALIZATION_UNSPECIFIED' : normalized.applyTextNormalization,
  };
}
|
|
291
|
+
|
|
292
|
+
/**
 * Assemble the JSON request body for the TTS voice endpoints from
 * normalized options.
 * @param {Object} opts - Output of parseOptions.
 * @param {string} [textOverride] - Per-chunk text replacing opts.text.
 * @returns {Object} Request body ready for JSON.stringify.
 */
function buildBody(opts, textOverride) {
  const audioConfig = {
    audioEncoding: opts.encoding,
    sampleRateHertz: opts.sampleRate,
    speakingRate: opts.speakingRate,
  };
  // bitRate only applies to lossy, bitrate-based codecs.
  if (opts.encoding === 'MP3' || opts.encoding === 'OGG_OPUS') {
    audioConfig.bitRate = opts.bitRate;
  }
  const body = {
    text: textOverride ?? opts.text,
    voiceId: opts.voice,
    modelId: opts.model,
    audioConfig,
    temperature: opts.temperature,
  };
  // Optional fields are omitted entirely rather than sent as undefined.
  if (opts.timestampType) body.timestampType = opts.timestampType;
  if (opts.timestampTransportStrategy) body.timestampTransportStrategy = opts.timestampTransportStrategy;
  if (opts.applyTextNormalization) body.applyTextNormalization = opts.applyTextNormalization;
  return body;
}
|
|
311
|
+
|
|
312
|
+
// ---------------------------------------------------------------------------
|
|
313
|
+
// HTTP request helper
|
|
314
|
+
// ---------------------------------------------------------------------------
|
|
315
|
+
|
|
316
|
+
/**
 * Perform one POST to a TTS endpoint and decode the JSON response.
 * @param {string} url - Full endpoint URL.
 * @param {Object} headers - Request headers (Content-Type + Authorization).
 * @param {Object} body - JSON-serializable request body (see buildBody).
 * @param {AbortSignal} signal - Cancels the fetch on timeout/abort.
 * @param {Object} [config] - Reserved; currently unused here.
 * @returns {Promise<{audio: Uint8Array, timestampInfo: Object|null}>}
 * @throws {ApiError} On non-2xx status, unparseable JSON, or missing audioContent.
 * @throws {NetworkError} On connection-level fetch failures.
 */
async function generateOne(url, headers, body, signal, config = {}) {
  const fetchOpts = { method: 'POST', headers, body: JSON.stringify(body), signal };
  let response;
  try {
    response = await fetch(url, fetchOpts);
  } catch (e) {
    // Let AbortError/TimeoutError pass through unwrapped so withRetry sees them
    // as non-retryable; convert connection failures to NetworkError so withRetry
    // can retry them.
    if (e.name === 'AbortError' || e.name === 'TimeoutError') throw e;
    throw new NetworkError(e.message);
  }
  if (!response.ok) {
    // Prefer the API's JSON error message; fall back to the HTTP status text
    // when the error body is absent or not JSON.
    let errMsg = response.statusText;
    let details = {};
    try {
      details = await response.json();
      errMsg = details.message || JSON.stringify(details);
    } catch (_) {}
    throw new ApiError(errMsg, response.status, details);
  }
  let data;
  try {
    data = await response.json();
  } catch (_) {
    throw new ApiError('unexpected response: failed to parse JSON');
  }
  if (!data.audioContent) {
    throw new ApiError('unexpected response: missing audioContent');
  }
  // audioContent arrives base64-encoded; timestampInfo is optional.
  return { audio: base64ToBytes(data.audioContent), timestampInfo: data.timestampInfo ?? null };
}
|
|
348
|
+
|
|
349
|
+
// ---------------------------------------------------------------------------
|
|
350
|
+
// Timestamp helpers (for generateWithTimestamps)
|
|
351
|
+
// ---------------------------------------------------------------------------
|
|
352
|
+
|
|
353
|
+
/**
 * True when a timestampInfo payload carries no usable alignment entries
 * (missing entirely, or its word/character list is absent or empty).
 * @param {Object|null|undefined} ts
 * @returns {boolean}
 */
function isEmptyTimestampInfo(ts) {
  if (!ts) return true;
  if (ts.wordAlignment) return (ts.wordAlignment.words?.length ?? 0) === 0;
  if (ts.characterAlignment) return (ts.characterAlignment.characters?.length ?? 0) === 0;
  return true;
}
|
|
363
|
+
|
|
364
|
+
/**
 * Last alignment end time (seconds) in a timestampInfo, preferring word
 * alignment over character alignment; 0 when neither has entries.
 * @param {Object|null|undefined} ts
 * @returns {number}
 */
function getLastEndTime(ts) {
  if (!ts) return 0;
  const wordEnds = ts.wordAlignment?.wordEndTimeSeconds;
  if (wordEnds?.length) return wordEnds[wordEnds.length - 1];
  const charEnds = ts.characterAlignment?.characterEndTimeSeconds;
  if (charEnds?.length) return charEnds[charEnds.length - 1];
  return 0;
}
|
|
376
|
+
|
|
377
|
+
/**
 * Seconds of silence requested by a trailing SSML <break time="..."/> tag,
 * or 0 when the text does not end with one. Supports "s" and "ms" units.
 * @param {string} text
 * @returns {number}
 */
function extractTrailingBreakSeconds(text) {
  const match = /<break\s+time="([\d.]+)(m?s)"\s*\/?>\s*$/i.exec(text);
  if (match === null) return 0;
  const amount = parseFloat(match[1]);
  return match[2].toLowerCase() === 'ms' ? amount / 1000 : amount;
}
|
|
383
|
+
|
|
384
|
+
/**
 * Shift every start/end time in a timestampInfo by `offset` seconds.
 * Returns the input object unchanged when offset is 0, or {} when ts is
 * missing. Word/character labels are carried through untouched.
 * @param {Object|null|undefined} ts
 * @param {number} offset - Seconds to add to every timestamp.
 * @returns {Object}
 */
function applyTimestampOffset(ts, offset) {
  if (!ts || offset === 0) return ts ?? {};
  const shift = (times) => times?.map((t) => t + offset) ?? [];
  const shifted = {};
  const { wordAlignment: wa, characterAlignment: ca } = ts;
  if (wa) {
    shifted.wordAlignment = {
      words: wa.words,
      wordStartTimeSeconds: shift(wa.wordStartTimeSeconds),
      wordEndTimeSeconds: shift(wa.wordEndTimeSeconds),
      phoneticDetails: wa.phoneticDetails?.map((detail) => ({
        ...detail,
        phones: detail.phones?.map((p) => ({ ...p, startTimeSeconds: p.startTimeSeconds + offset })),
      })) ?? [],
    };
  }
  if (ca) {
    shifted.characterAlignment = {
      characters: ca.characters,
      characterStartTimeSeconds: shift(ca.characterStartTimeSeconds),
      characterEndTimeSeconds: shift(ca.characterEndTimeSeconds),
    };
  }
  return shifted;
}
|
|
407
|
+
|
|
408
|
+
/**
 * Merge per-chunk timestampInfo objects into one combined alignment.
 * Word/character arrays are concatenated in order; phoneticDetails word
 * indices are rebased so they point into the merged word list. Alignments
 * absent from every chunk are omitted from the result.
 * @param {Array<Object|null|undefined>} infos
 * @returns {Object}
 */
function mergeTimestampInfos(infos) {
  const merged = {};

  if (infos.some((ts) => ts?.wordAlignment)) {
    const wordAlignment = { words: [], wordStartTimeSeconds: [], wordEndTimeSeconds: [], phoneticDetails: [] };
    let wordBase = 0;
    for (const ts of infos) {
      const wa = ts?.wordAlignment;
      if (!wa) continue;
      wordAlignment.words.push(...(wa.words ?? []));
      wordAlignment.wordStartTimeSeconds.push(...(wa.wordStartTimeSeconds ?? []));
      wordAlignment.wordEndTimeSeconds.push(...(wa.wordEndTimeSeconds ?? []));
      // Rebase each phone group's word index into the merged word list.
      wordAlignment.phoneticDetails.push(
        ...(wa.phoneticDetails ?? []).map((pd) => ({ ...pd, wordIndex: pd.wordIndex + wordBase }))
      );
      wordBase += wa.words?.length ?? 0;
    }
    merged.wordAlignment = wordAlignment;
  }

  if (infos.some((ts) => ts?.characterAlignment)) {
    const characterAlignment = { characters: [], characterStartTimeSeconds: [], characterEndTimeSeconds: [] };
    for (const ts of infos) {
      const ca = ts?.characterAlignment;
      if (!ca) continue;
      characterAlignment.characters.push(...(ca.characters ?? []));
      characterAlignment.characterStartTimeSeconds.push(...(ca.characterStartTimeSeconds ?? []));
      characterAlignment.characterEndTimeSeconds.push(...(ca.characterEndTimeSeconds ?? []));
    }
    merged.characterAlignment = characterAlignment;
  }

  return merged;
}
|
|
455
|
+
|
|
456
|
+
// ---------------------------------------------------------------------------
|
|
457
|
+
// Generate (handles any text length automatically)
|
|
458
|
+
// ---------------------------------------------------------------------------
|
|
459
|
+
|
|
460
|
+
/**
 * Generate speech for text of any length. Short text is sent as a single
 * request; long text is split via chunkText, synthesized concurrently, and
 * the audio chunks are merged per-encoding. Optionally writes the result to
 * outputFile and/or plays it.
 * @param {Object} options - User options (see parseOptions) plus outputFile/play.
 * @param {Object} [config] - Client config (_authHeader, _baseUrl, timeout, ...).
 * @returns {Promise<Uint8Array>} The complete audio bytes.
 * @throws {MissingApiKeyError} When config carries no auth header.
 * @throws {ApiError|NetworkError} On request failure.
 */
export async function generate(options, config = {}) {
  if (!config._authHeader) throw new MissingApiKeyError();
  const url = `${config._baseUrl}/tts/v1/voice`;
  // outputFile/play are client-side conveniences, not API options.
  const { outputFile, play: shouldPlay, ...rest } = options || {};
  const opts = parseOptions(rest, 'inworld-tts-1.5-max');

  if (!opts.text) {
    // Echo what the caller actually passed to make the error actionable.
    const received = (options || {}).text;
    const typeInfo = received === undefined ? 'undefined' : received === null ? 'null' : `${typeof received} (${JSON.stringify(received)})`;
    throw new ApiError(`options.text is required (received: ${typeInfo})`);
  }

  const { signal, clear } = getTimeoutSignal(config.timeout ?? 60_000);
  const maxConcurrent = config.maxConcurrentRequests ?? MAX_CONCURRENT_REQUESTS;
  const spliceBreak = SPLICE_BREAK_SECONDS;

  const headers = {
    'Content-Type': 'application/json',
    Authorization: config._authHeader,
  };

  const generateStart = performance.now();
  let audio;

  try {
    if (estimateEffectiveLength(opts.text) <= GENERATE_MAX_CHARS) {
      // Short text: single request with retry.
      let result;
      try {
        result = await withRetry(() => generateOne(url, headers, buildBody(opts), signal, config), config);
      } catch (e) {
        if (e instanceof ApiError || e instanceof NetworkError) throw e;
        // Anything else here is the timeout signal firing.
        throw new NetworkError('Request timed out');
      }
      audio = result.audio;
    } else {
      // Long text: chunk, then synthesize chunks with a bounded worker pool.
      const chunks = chunkText(opts.text);
      if (chunks.length === 0) return new Uint8Array(0);
      debugLog(config, `long text: ${chunks.length} chunks, concurrency=${Math.min(maxConcurrent, chunks.length)}`);

      const results = new Array(chunks.length);
      const queue = chunks.map((t, i) => ({ text: t, index: i }));
      const concurrency = Math.min(maxConcurrent, chunks.length);

      // Dedicated controller so one failed chunk aborts the rest, while the
      // outer timeout signal still propagates in.
      const ac = new AbortController();
      signal.addEventListener('abort', () => ac.abort(signal.reason), { once: true });

      async function worker() {
        while (queue.length > 0) {
          if (ac.signal.aborted) return;
          const { text: t, index: i } = queue.shift();
          try {
            results[i] = await withRetry(() => generateOne(url, headers, buildBody(opts, t), ac.signal, config), config);
          } catch (e) {
            // Cancel in-flight siblings before surfacing the error.
            ac.abort(e);
            if (e instanceof ApiError || e instanceof NetworkError) throw e;
            throw new NetworkError('Request timed out');
          }
        }
      }

      await Promise.all(Array.from({ length: concurrency }, () => worker()));

      const listBuffers = results.filter(r => r?.audio instanceof Uint8Array).map(r => r.audio);
      if (opts.encoding === 'LINEAR16' || opts.encoding === 'WAV') {
        // 16-bit PCM with WAV container: merge with silence, add WAV header
        const merged = mergeLinear16Buffers(listBuffers, spliceBreak, opts.sampleRate);
        audio = concatBytes([wavHeader(merged.length, opts.sampleRate), merged]);
      } else if (opts.encoding === 'PCM') {
        // Raw 16-bit PCM, no container: merge with silence, no WAV header
        audio = mergeLinear16Buffers(listBuffers, spliceBreak, opts.sampleRate);
      } else {
        // MP3, OGG_OPUS, FLAC, ALAW, MULAW: concatenate chunks as-is
        audio = concatBytes(listBuffers);
      }
    }
  } finally {
    // Always release the timeout timer, even on error/early return.
    clear();
  }

  debugLog(config, `generate: ${audio.length.toLocaleString()} bytes (${((performance.now() - generateStart) / 1000).toFixed(2)}s)`);
  if (outputFile) {
    warnExtensionMismatch(outputFile, opts.encoding);
    await writeFileSafe(outputFile, audio);
  }
  if (shouldPlay) {
    if (outputFile) {
      await playFile(outputFile, opts.encoding);
    } else {
      await playAudio(audio, { encoding: opts.encoding });
    }
  }
  return audio;
}
|
|
553
|
+
|
|
554
|
+
// ---------------------------------------------------------------------------
|
|
555
|
+
// Generate with timestamps
|
|
556
|
+
// ---------------------------------------------------------------------------
|
|
557
|
+
|
|
558
|
+
/**
 * Like generate(), but also returns word/character timing alignment.
 * For long text, per-chunk timestamps are shifted by the accumulated audio
 * duration (last alignment end + any trailing <break> + PCM splice gap) and
 * merged into a single alignment.
 * @param {Object} options - User options; timestampType ("WORD"/"CHARACTER") is required.
 * @param {Object} [config] - Client config (_authHeader, _baseUrl, timeout, ...).
 * @returns {Promise<{audio: Uint8Array, timestamps: Object}>}
 * @throws {MissingApiKeyError} When config carries no auth header.
 * @throws {ApiError|NetworkError} On invalid options or request failure.
 */
export async function generateWithTimestamps(options, config = {}) {
  if (!config._authHeader) throw new MissingApiKeyError();
  const url = `${config._baseUrl}/tts/v1/voice`;
  const { outputFile, play: shouldPlay, timestampType, ...rest } = options || {};
  if (!timestampType) throw new ApiError('options.timestampType is required ("WORD" or "CHARACTER")');
  const opts = parseOptions({ ...rest, timestampType }, 'inworld-tts-1.5-max');

  if (!opts.text) {
    // Echo what the caller actually passed to make the error actionable.
    const received = (options || {}).text;
    const typeInfo = received === undefined ? 'undefined' : received === null ? 'null' : `${typeof received} (${JSON.stringify(received)})`;
    throw new ApiError(`options.text is required (received: ${typeInfo})`);
  }

  const { signal, clear } = getTimeoutSignal(config.timeout ?? 60_000);
  const maxConcurrent = config.maxConcurrentRequests ?? MAX_CONCURRENT_REQUESTS;

  const headers = {
    'Content-Type': 'application/json',
    Authorization: config._authHeader,
  };

  const generateStart = performance.now();
  let audio;
  let timestamps;

  try {
    if (estimateEffectiveLength(opts.text) <= GENERATE_MAX_CHARS) {
      // Short text: single request with retry.
      let result;
      try {
        result = await withRetry(() => generateOne(url, headers, buildBody(opts), signal, config), config);
      } catch (e) {
        if (e instanceof ApiError || e instanceof NetworkError) throw e;
        // Anything else here is the timeout signal firing.
        throw new NetworkError('Request timed out');
      }
      audio = result.audio;
      timestamps = result.timestampInfo ?? {};
    } else {
      // Long text: chunk, then synthesize chunks with a bounded worker pool.
      const chunks = chunkText(opts.text);
      if (chunks.length === 0) return { audio: new Uint8Array(0), timestamps: {} };
      debugLog(config, `generateWithTimestamps: ${chunks.length} chunks, concurrency=${Math.min(maxConcurrent, chunks.length)}`);

      const results = new Array(chunks.length);
      const queue = chunks.map((t, i) => ({ text: t, index: i }));
      const concurrency = Math.min(maxConcurrent, chunks.length);

      // Dedicated controller so one failed chunk aborts the rest, while the
      // outer timeout signal still propagates in.
      const ac = new AbortController();
      signal.addEventListener('abort', () => ac.abort(signal.reason), { once: true });

      async function worker() {
        while (queue.length > 0) {
          if (ac.signal.aborted) return;
          const { text: t, index: i } = queue.shift();
          try {
            results[i] = await withRetry(() => generateOne(url, headers, buildBody(opts, t), ac.signal, config), config);
          } catch (e) {
            // Cancel in-flight siblings before surfacing the error.
            ac.abort(e);
            if (e instanceof ApiError || e instanceof NetworkError) throw e;
            throw new NetworkError('Request timed out');
          }
        }
      }

      await Promise.all(Array.from({ length: concurrency }, () => worker()));

      // Sequential pass: compute timestamp offsets and merge
      // Splice silence is only inserted for raw-PCM encodings, so only then
      // does it contribute to the running offset.
      const spliceGap = ['LINEAR16', 'WAV', 'PCM'].includes(opts.encoding) ? SPLICE_BREAK_SECONDS : 0;
      let offset = 0;
      const adjustedTimestamps = [];
      const audioBuffers = [];

      for (let i = 0; i < chunks.length; i++) {
        const { audio: chunkAudio, timestampInfo } = results[i];
        audioBuffers.push(chunkAudio);
        adjustedTimestamps.push(applyTimestampOffset(timestampInfo, offset));
        // Advance by this chunk's audible length plus any trailing SSML break.
        offset += getLastEndTime(timestampInfo) + extractTrailingBreakSeconds(chunks[i]) + spliceGap;
      }

      timestamps = mergeTimestampInfos(adjustedTimestamps);

      if (opts.encoding === 'LINEAR16' || opts.encoding === 'WAV') {
        // 16-bit PCM with WAV container: merge with silence, add WAV header.
        const merged = mergeLinear16Buffers(audioBuffers, SPLICE_BREAK_SECONDS, opts.sampleRate);
        audio = concatBytes([wavHeader(merged.length, opts.sampleRate), merged]);
      } else if (opts.encoding === 'PCM') {
        // Raw 16-bit PCM: merge with silence, no container header.
        audio = mergeLinear16Buffers(audioBuffers, SPLICE_BREAK_SECONDS, opts.sampleRate);
      } else {
        // Compressed encodings: concatenate chunks as-is.
        audio = concatBytes(audioBuffers);
      }
    }
  } finally {
    // Always release the timeout timer, even on error/early return.
    clear();
  }

  debugLog(config, `generateWithTimestamps: ${audio.length.toLocaleString()} bytes (${((performance.now() - generateStart) / 1000).toFixed(2)}s)`);
  if (outputFile) {
    warnExtensionMismatch(outputFile, opts.encoding);
    await writeFileSafe(outputFile, audio);
  }
  if (shouldPlay) {
    if (outputFile) {
      await playFile(outputFile, opts.encoding);
    } else {
      await playAudio(audio, { encoding: opts.encoding });
    }
  }
  return { audio, timestamps };
}
|
|
664
|
+
|
|
665
|
+
// ---------------------------------------------------------------------------
|
|
666
|
+
// Stream (HTTP streaming, async iterable)
|
|
667
|
+
// ---------------------------------------------------------------------------
|
|
668
|
+
|
|
669
|
+
/**
 * Stream synthesized audio chunks from the Inworld TTS streaming endpoint.
 *
 * POSTs to `/tts/v1/voice:stream` and yields each audio chunk (Uint8Array)
 * as it is decoded from the NDJSON response body. When `options.outputFile`
 * and/or `options.play` are set, chunks are additionally collected so the
 * complete audio can be written to disk and/or played once the stream ends.
 *
 * @param {object} options - Synthesis options; `text` is required. Also
 *   honors `outputFile`, `play`, and any field understood by parseOptions.
 * @param {object} [config] - Client config (`_authHeader`, `_baseUrl`,
 *   `timeout`, retry/debug settings).
 * @yields {Uint8Array} One decoded audio chunk per NDJSON record.
 * @throws {MissingApiKeyError} When no auth header is configured.
 * @throws {ApiError} On invalid options, non-OK HTTP status, or empty body.
 * @throws {NetworkError} On transport failure or timeout.
 */
export async function* stream(options, config = {}) {
  if (!config._authHeader) throw new MissingApiKeyError();
  const url = `${config._baseUrl}/tts/v1/voice:stream`;
  const outputFile = options?.outputFile;
  const shouldPlay = options?.play;
  // File output requires fs access — Node only.
  if (outputFile && typeof window !== 'undefined') {
    throw new ApiError('outputFile is not supported in browser environments. Use stream() to collect chunks or generate() to get a Uint8Array.');
  }
  const opts = parseOptions(options, 'inworld-tts-1.5-mini');

  if (!opts.text) {
    const received = (options || {}).text;
    const typeInfo = received === undefined ? 'undefined' : received === null ? 'null' : `${typeof received} (${JSON.stringify(received)})`;
    throw new ApiError(`options.text is required (received: ${typeInfo})`);
  }
  // Weighted length (see estimateEffectiveLength) is what the cap applies to.
  const effectiveLen = estimateEffectiveLength(opts.text);
  if (effectiveLen > STREAM_MAX_CHARS) {
    throw new ApiError(
      `text exceeds ${STREAM_MAX_CHARS} character limit for stream() (effective length ${effectiveLen}, raw length ${opts.text.length}). Use generate() instead — it handles any text length automatically.`
    );
  }

  const { signal, clear } = getTimeoutSignal(config.timeout ?? 60_000);
  const headers = {
    'Content-Type': 'application/json',
    Authorization: config._authHeader,
  };

  const body = buildBody(opts);
  const fetchOpts = { method: 'POST', headers, body: JSON.stringify(body), signal };
  const streamStart = performance.now();
  // Only buffer chunks when the full audio is needed afterwards.
  const collectedChunks = (outputFile || shouldPlay) ? [] : null;
  try {
    let res;
    try {
      // Retry transport failures and 5xx responses per client config.
      res = await withRetry(async () => {
        let r;
        try {
          r = await fetch(url, fetchOpts);
        } catch (e) {
          // Abort/timeout propagate unchanged so the outer catch can
          // translate them; anything else is a transport-level failure.
          if (e.name === 'AbortError' || e.name === 'TimeoutError') throw e;
          throw new NetworkError(e.message);
        }
        if (r.status >= 500) {
          const details = await r.json().catch(() => ({}));
          throw new ApiError(details.message || r.statusText, r.status, details);
        }
        return r;
      }, config);
    } catch (e) {
      if (e instanceof ApiError || e instanceof NetworkError) throw e;
      throw new NetworkError('Request timed out');
    }
    if (!res.ok) {
      let errMsg = res.statusText;
      let details = {};
      try {
        details = await res.json();
        errMsg = details.message || JSON.stringify(details);
      } catch (_) {
        errMsg = await res.text().catch(() => `HTTP ${res.status}`);
      }
      throw new ApiError(errMsg, res.status, details);
    }
    const resBody = res.body;
    if (!resBody) throw new ApiError('empty response body');
    const reader = resBody.getReader();
    const decoder = new TextDecoder();
    let buf = '';
    let chunkCount = 0;
    let firstChunkMs = null;
    let totalBytes = 0;

    // Parse one NDJSON line and yield its audio payload, if any.
    // Malformed lines are ignored (best-effort, matches the sibling
    // streamWithTimestamps implementation).
    const processLine = function* (line) {
      if (!line.trim()) return;
      try {
        const data = JSON.parse(line);
        const result = data.result;
        if (result && result.audioContent) {
          const chunk = base64ToBytes(result.audioContent);
          if (firstChunkMs === null) firstChunkMs = performance.now() - streamStart;
          chunkCount++;
          totalBytes += chunk.length;
          debugLog(config, `stream chunk #${chunkCount} (${chunk.length} bytes)`);
          if (collectedChunks) collectedChunks.push(chunk);
          yield chunk;
        }
      } catch (_) {}
    };

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buf += decoder.decode(value, { stream: true });
        const lines = buf.split('\n');
        buf = lines.pop() || ''; // keep the trailing partial line for the next read
        for (const line of lines) yield* processLine(line);
      }
      if (buf.trim()) yield* processLine(buf); // flush the final unterminated line
      if (chunkCount > 0) {
        debugLog(config, `stream: first chunk ${Math.round(firstChunkMs)}ms, ${chunkCount} chunks, ${totalBytes.toLocaleString()} bytes total`);
      }
    } finally {
      reader.releaseLock?.();
    }
  } finally {
    clear(); // always cancel the timeout timer
  }

  if (collectedChunks) {
    const audio = concatBytes(collectedChunks);
    const enc = (opts.encoding || 'MP3').toUpperCase();
    if (outputFile) {
      warnExtensionMismatch(outputFile, enc);
      await writeFileSafe(outputFile, audio);
      if (shouldPlay) await playFile(outputFile, enc);
    } else {
      await playAudio(audio, { encoding: enc });
    }
  }
}
|
|
802
|
+
|
|
803
|
+
// ---------------------------------------------------------------------------
|
|
804
|
+
// Stream with timestamps (hardcodes SYNC strategy)
|
|
805
|
+
// ---------------------------------------------------------------------------
|
|
806
|
+
|
|
807
|
+
/**
 * Stream audio chunks together with word/character timestamps.
 *
 * Same transport as stream(), but requires `options.timestampType` and
 * forces the SYNC timestamp transport strategy so each NDJSON record can
 * carry a timestampInfo payload alongside its audio. Yields objects of
 * shape `{ audio }` or `{ audio, timestamps }` (Uint8Array audio).
 *
 * @param {object} options - Synthesis options; `text` and `timestampType`
 *   ("WORD" or "CHARACTER") are required. Also honors `outputFile` and `play`.
 * @param {object} [config] - Client config (`_authHeader`, `_baseUrl`,
 *   `timeout`, retry/debug settings).
 * @yields {{audio: Uint8Array, timestamps?: object}}
 * @throws {MissingApiKeyError} When no auth header is configured.
 * @throws {ApiError} On invalid options, non-OK HTTP status, or empty body.
 * @throws {NetworkError} On transport failure or timeout.
 */
export async function* streamWithTimestamps(options, config = {}) {
  if (!config._authHeader) throw new MissingApiKeyError();
  const endpoint = `${config._baseUrl}/tts/v1/voice:stream`;
  const { timestampType, outputFile, play: shouldPlay, ...rest } = options || {};
  if (!timestampType) throw new ApiError('options.timestampType is required ("WORD" or "CHARACTER")');
  if (outputFile && typeof window !== 'undefined') {
    throw new ApiError('outputFile is not supported in browser environments.');
  }
  const opts = parseOptions({ ...rest, timestampType }, 'inworld-tts-1.5-mini');

  if (!opts.text) {
    const received = (options || {}).text;
    let typeInfo;
    if (received === undefined) {
      typeInfo = 'undefined';
    } else if (received === null) {
      typeInfo = 'null';
    } else {
      typeInfo = `${typeof received} (${JSON.stringify(received)})`;
    }
    throw new ApiError(`options.text is required (received: ${typeInfo})`);
  }
  const effectiveLen = estimateEffectiveLength(opts.text);
  if (effectiveLen > STREAM_MAX_CHARS) {
    throw new ApiError(
      `text exceeds ${STREAM_MAX_CHARS} character limit for streamWithTimestamps() (effective length ${effectiveLen}, raw length ${opts.text.length}). Use generateWithTimestamps() instead.`
    );
  }

  const { signal, clear } = getTimeoutSignal(config.timeout ?? 60_000);
  const requestBody = buildBody(opts);
  requestBody.timestampTransportStrategy = 'SYNC'; // timestamps delivered inline with audio
  const fetchOpts = {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', Authorization: config._authHeader },
    body: JSON.stringify(requestBody),
    signal,
  };
  const startedAt = performance.now();
  // Only buffer chunks when the full audio is needed afterwards.
  const collected = outputFile || shouldPlay ? [] : null;

  try {
    let response;
    try {
      // Retry transport failures and 5xx responses per client config.
      response = await withRetry(async () => {
        let attempt;
        try {
          attempt = await fetch(endpoint, fetchOpts);
        } catch (err) {
          if (err.name === 'AbortError' || err.name === 'TimeoutError') throw err;
          throw new NetworkError(err.message);
        }
        if (attempt.status >= 500) {
          const info = await attempt.json().catch(() => ({}));
          throw new ApiError(info.message || attempt.statusText, attempt.status, info);
        }
        return attempt;
      }, config);
    } catch (err) {
      if (err instanceof ApiError || err instanceof NetworkError) throw err;
      throw new NetworkError('Request timed out');
    }
    if (!response.ok) {
      let message = response.statusText;
      let info = {};
      try {
        info = await response.json();
        message = info.message || JSON.stringify(info);
      } catch (_) {
        message = await response.text().catch(() => `HTTP ${response.status}`);
      }
      throw new ApiError(message, response.status, info);
    }
    const bodyStream = response.body;
    if (!bodyStream) throw new ApiError('empty response body');
    const bodyReader = bodyStream.getReader();
    const textDecoder = new TextDecoder();
    let pending = '';
    let emitted = 0;
    let firstChunkMs = null;

    // Decode one NDJSON record; yield audio plus timestamps when present.
    // Malformed lines are silently skipped (best-effort framing).
    const emitParsedLine = function* (line) {
      if (!line.trim()) return;
      try {
        const payload = JSON.parse(line);
        const result = payload.result;
        if (!result || !result.audioContent) return;
        const chunk = base64ToBytes(result.audioContent);
        if (firstChunkMs === null) firstChunkMs = performance.now() - startedAt;
        emitted += 1;
        debugLog(config, `streamWithTimestamps chunk #${emitted} (${chunk.length} bytes)`);
        if (collected) collected.push(chunk);
        const ts = result.timestampInfo;
        yield isEmptyTimestampInfo(ts) ? { audio: chunk } : { audio: chunk, timestamps: ts };
      } catch (_) {}
    };

    try {
      for (;;) {
        const { done, value } = await bodyReader.read();
        if (done) break;
        pending += textDecoder.decode(value, { stream: true });
        const parts = pending.split('\n');
        pending = parts.pop() || ''; // carry the trailing partial line forward
        for (const part of parts) yield* emitParsedLine(part);
      }
      if (pending.trim()) yield* emitParsedLine(pending); // flush final unterminated line
      if (emitted > 0) {
        debugLog(config, `streamWithTimestamps: first chunk ${Math.round(firstChunkMs)}ms, ${emitted} chunks total`);
      }
    } finally {
      bodyReader.releaseLock?.();
    }
  } finally {
    clear(); // always cancel the timeout timer
  }

  if (collected) {
    const audio = concatBytes(collected);
    const enc = (opts.encoding || 'MP3').toUpperCase();
    if (outputFile) {
      warnExtensionMismatch(outputFile, enc);
      await writeFileSafe(outputFile, audio);
      if (shouldPlay) await playFile(outputFile, enc);
    } else {
      await playAudio(audio, { encoding: enc });
    }
  }
}
|