webtalk 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1121 @@
+ // Pocket TTS ONNX Web Worker
+ console.log('Pocket TTS Worker Starting...');
+ self.postMessage({ type: 'status', status: 'Worker Thread Started', state: 'idle' });
+
+ // ONNX Runtime handle; loaded dynamically in loadModels (this is a module worker)
+ let ort = null;
+
+ // Configuration
+ const MODELS = {
+   mimi_encoder: '/models/tts/mimi_encoder.onnx',
+   text_conditioner: '/models/tts/text_conditioner.onnx',
+   flow_lm_main: '/models/tts/flow_lm_main_int8.onnx',
+   flow_lm_flow: '/models/tts/flow_lm_flow_int8.onnx',
+   mimi_decoder: '/models/tts/mimi_decoder_int8.onnx',
+   tokenizer: '/models/tts/tokenizer.model',
+   voices: '/models/tts/voices.bin'
+ };
+
+ const SAMPLE_RATE = 24000;
+ const SAMPLES_PER_FRAME = 1920;
+ const MAX_FRAMES = 500;
+ const DEBUG_LOGS = true;
+ // Text chunking target; lower if long passages hit generation limits.
+ const CHUNK_TARGET_TOKENS = 50;
+ const CHUNK_GAP_SEC = 0.25;
+ // If true, re-run voice conditioning per chunk to avoid stale AR state.
+ const RESET_FLOW_STATE_EACH_CHUNK = true;
+ // If true, reset decoder state per chunk to avoid carry-over artifacts.
+ const RESET_MIMI_STATE_EACH_CHUNK = true;
+
+ // State
+ let mimiEncoderSession = null;
+ let textConditionerSession = null;
+ let flowLmMainSession = null;
+ let flowLmFlowSession = null;
+ let mimiDecoderSession = null;
+ let tokenizerProcessor = null;
+ let tokenizerModelB64 = null;
+ let predefinedVoices = {};
+ let stTensors = {}; // Optimization: pre-allocated s/t tensors, keyed by LSD step count
+ let isGenerating = false;
+ let isReady = false;
+
+ // Dynamic LSD (Latent Solver/Diffusion steps)
+ const MAX_LSD = 10; // Default/max quality
+ let currentLSD = MAX_LSD;
+
+ // Current voice embedding (cached)
+ let currentVoiceEmbedding = null;
+ let currentVoiceName = null;
+
+ // Text preprocessing utilities
+ const ONES = ['', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen'];
+ const TENS = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'];
+ const ORDINAL_ONES = ['', 'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth', 'thirteenth', 'fourteenth', 'fifteenth', 'sixteenth', 'seventeenth', 'eighteenth', 'nineteenth'];
+ const ORDINAL_TENS = ['', '', 'twentieth', 'thirtieth', 'fortieth', 'fiftieth', 'sixtieth', 'seventieth', 'eightieth', 'ninetieth'];
+
+ function numberToWords(num, options = {}) {
+   const { andword = '', zero = 'zero', group = 0 } = options;
+   if (num === 0) return zero;
+   const convert = (n) => {
+     if (n < 20) return ONES[n];
+     if (n < 100) return TENS[Math.floor(n / 10)] + (n % 10 ? ' ' + ONES[n % 10] : '');
+     if (n < 1000) {
+       const remainder = n % 100;
+       return ONES[Math.floor(n / 100)] + ' hundred' + (remainder ? (andword ? ' ' + andword + ' ' : ' ') + convert(remainder) : '');
+     }
+     if (n < 1000000) {
+       const thousands = Math.floor(n / 1000);
+       const remainder = n % 1000;
+       return convert(thousands) + ' thousand' + (remainder ? ' ' + convert(remainder) : '');
+     }
+     if (n < 1000000000) {
+       const millions = Math.floor(n / 1000000);
+       const remainder = n % 1000000;
+       return convert(millions) + ' million' + (remainder ? ' ' + convert(remainder) : '');
+     }
+     const billions = Math.floor(n / 1000000000);
+     const remainder = n % 1000000000;
+     return convert(billions) + ' billion' + (remainder ? ' ' + convert(remainder) : '');
+   };
+   if (group === 2 && num > 1000 && num < 10000) {
+     const high = Math.floor(num / 100);
+     const low = num % 100;
+     if (low === 0) return convert(high) + ' hundred';
+     else if (low < 10) return convert(high) + ' ' + (zero === 'oh' ? 'oh' : zero) + ' ' + ONES[low];
+     else return convert(high) + ' ' + convert(low);
+   }
+   return convert(num);
+ }
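+ // Hand-checked examples:
+ //   numberToWords(42) -> 'forty two'
+ //   numberToWords(1999, { zero: 'oh', group: 2 }) -> 'nineteen ninety nine'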
+
+ function ordinalToWords(num) {
+   if (num < 20) return ORDINAL_ONES[num] || numberToWords(num) + 'th';
+   if (num < 100) {
+     const tens = Math.floor(num / 10);
+     const ones = num % 10;
+     if (ones === 0) return ORDINAL_TENS[tens];
+     return TENS[tens] + ' ' + ORDINAL_ONES[ones];
+   }
+   const cardinal = numberToWords(num);
+   if (cardinal.endsWith('y')) return cardinal.slice(0, -1) + 'ieth';
+   if (cardinal.endsWith('one')) return cardinal.slice(0, -3) + 'first';
+   if (cardinal.endsWith('two')) return cardinal.slice(0, -3) + 'second';
+   if (cardinal.endsWith('three')) return cardinal.slice(0, -5) + 'third';
+   if (cardinal.endsWith('ve')) return cardinal.slice(0, -2) + 'fth';
+   if (cardinal.endsWith('e')) return cardinal.slice(0, -1) + 'th';
+   if (cardinal.endsWith('t')) return cardinal + 'h';
+   return cardinal + 'th';
+ }
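+ // E.g. ordinalToWords(3) -> 'third', ordinalToWords(21) -> 'twenty first',
+ // ordinalToWords(40) -> 'fortieth', ordinalToWords(100) -> 'one hundredth'.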
+
+ const UNICODE_MAP = {
+   'à': 'a', 'á': 'a', 'â': 'a', 'ã': 'a', 'ä': 'a', 'å': 'a', 'æ': 'ae', 'ç': 'c',
+   'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e', 'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i',
+   'ñ': 'n', 'ò': 'o', 'ó': 'o', 'ô': 'o', 'õ': 'o', 'ö': 'o', 'ø': 'o',
+   'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u', 'ý': 'y', 'ÿ': 'y', 'ß': 'ss', 'œ': 'oe', 'ð': 'd', 'þ': 'th',
+   'À': 'A', 'Á': 'A', 'Â': 'A', 'Ã': 'A', 'Ä': 'A', 'Å': 'A', 'Æ': 'AE', 'Ç': 'C',
+   'È': 'E', 'É': 'E', 'Ê': 'E', 'Ë': 'E', 'Ì': 'I', 'Í': 'I', 'Î': 'I', 'Ï': 'I',
+   'Ñ': 'N', 'Ò': 'O', 'Ó': 'O', 'Ô': 'O', 'Õ': 'O', 'Ö': 'O', 'Ø': 'O',
+   'Ù': 'U', 'Ú': 'U', 'Û': 'U', 'Ü': 'U', 'Ý': 'Y',
+   '\u201C': '"', '\u201D': '"', '\u2018': "'", '\u2019': "'", '\u2026': '...', '\u2013': '-', '\u2014': '-'
+ };
+
+ function convertToAscii(text) {
+   return text.split('').map(c => UNICODE_MAP[c] || c).join('').normalize('NFD').replace(/[\u0300-\u036f]/g, '');
+ }
+
+ const ABBREVIATIONS = [
+   [/\bmrs\./gi, 'misuss'], [/\bms\./gi, 'miss'], [/\bmr\./gi, 'mister'], [/\bdr\./gi, 'doctor'],
+   [/\bst\./gi, 'saint'], [/\bco\./gi, 'company'], [/\bjr\./gi, 'junior'], [/\bmaj\./gi, 'major'],
+   [/\bgen\./gi, 'general'], [/\bdrs\./gi, 'doctors'], [/\brev\./gi, 'reverend'], [/\blt\./gi, 'lieutenant'],
+   [/\bhon\./gi, 'honorable'], [/\bsgt\./gi, 'sergeant'], [/\bcapt\./gi, 'captain'], [/\besq\./gi, 'esquire'],
+   [/\bltd\./gi, 'limited'], [/\bcol\./gi, 'colonel'], [/\bft\./gi, 'fort']
+ ];
+ const CASED_ABBREVIATIONS = [
+   [/\bTTS\b/g, 'text to speech'], [/\bHz\b/g, 'hertz'], [/\bkHz\b/g, 'kilohertz'],
+   [/\bKBs\b/g, 'kilobytes'], [/\bKB\b/g, 'kilobyte'], [/\bMBs\b/g, 'megabytes'], [/\bMB\b/g, 'megabyte'],
+   [/\bGBs\b/g, 'gigabytes'], [/\bGB\b/g, 'gigabyte'], [/\bTBs\b/g, 'terabytes'], [/\bTB\b/g, 'terabyte'],
+   [/\bAPIs\b/g, "a p i's"], [/\bAPI\b/g, 'a p i'], [/\bCLIs\b/g, "c l i's"], [/\bCLI\b/g, 'c l i'],
+   [/\bCPUs\b/g, "c p u's"], [/\bCPU\b/g, 'c p u'], [/\bGPUs\b/g, "g p u's"], [/\bGPU\b/g, 'g p u'],
+   [/\bAve\b/g, 'avenue'], [/\betc\b/g, 'etcetera']
+ ];
+
+ function expandAbbreviations(text) {
+   for (const [regex, replacement] of [...ABBREVIATIONS, ...CASED_ABBREVIATIONS]) text = text.replace(regex, replacement);
+   return text;
+ }
+
+ const NUM_PREFIX_RE = /#(\d)/g;
+ const NUM_SUFFIX_RE = /(\d)([KMBT])/gi;
+ const NUM_LETTER_SPLIT_RE = /(\d)([a-z])|([a-z])(\d)/gi;
+ const COMMA_NUMBER_RE = /(\d[\d,]+\d)/g;
+ const DATE_RE = /(^|[^/])(\d\d?[/-]\d\d?[/-]\d\d(?:\d\d)?)($|[^/])/g;
+ const PHONE_NUMBER_RE = /\(?\d{3}\)?[-.\s]\d{3}[-.\s]?\d{4}/g;
+ const TIME_RE = /(\d\d?):(\d\d)(?::(\d\d))?/g;
+ const POUNDS_RE = /£([\d,]*\d+)/g;
+ const DOLLARS_RE = /\$([\d.,]*\d+)/g;
+ const DECIMAL_NUMBER_RE = /(\d+(?:\.\d+)+)/g;
+ const MULTIPLY_RE = /(\d)\s?\*\s?(\d)/g;
+ const DIVIDE_RE = /(\d)\s?\/\s?(\d)/g;
+ const ADD_RE = /(\d)\s?\+\s?(\d)/g;
+ const SUBTRACT_RE = /(\d)?\s?-\s?(\d)/g;
+ const FRACTION_RE = /(\d+)\/(\d+)/g;
+ const ORDINAL_RE = /(\d+)(st|nd|rd|th)/gi;
+ const NUMBER_RE = /\d+/g;
+
+ function normalizeNumbers(text) {
+   text = text.replace(NUM_PREFIX_RE, (_, d) => `number ${d}`);
+   text = text.replace(NUM_SUFFIX_RE, (_, num, suffix) => {
+     const map = { k: 'thousand', m: 'million', b: 'billion', t: 'trillion' };
+     return `${num} ${map[suffix.toLowerCase()]}`;
+   });
+   for (let i = 0; i < 2; i++) {
+     text = text.replace(NUM_LETTER_SPLIT_RE, (m, d1, l1, l2, d2) => {
+       if (d1 && l1) return `${d1} ${l1}`;
+       if (l2 && d2) return `${l2} ${d2}`;
+       return m;
+     });
+   }
+   text = text.replace(COMMA_NUMBER_RE, m => m.replace(/,/g, ''));
+   text = text.replace(DATE_RE, (_, pre, date, post) => pre + date.split(/[./-]/).join(' dash ') + post);
+   text = text.replace(PHONE_NUMBER_RE, m => {
+     const digits = m.replace(/\D/g, '');
+     return digits.length === 10 ? `${digits.slice(0, 3).split('').join(' ')}, ${digits.slice(3, 6).split('').join(' ')}, ${digits.slice(6).split('').join(' ')}` : m;
+   });
+   text = text.replace(TIME_RE, (_, hours, minutes, seconds) => {
+     const h = parseInt(hours), m = parseInt(minutes), s = seconds ? parseInt(seconds) : 0;
+     if (!seconds) return m === 0 ? (h === 0 ? '0' : h > 12 ? `${hours} minutes` : `${hours} o'clock`) : minutes.startsWith('0') ? `${hours} oh ${minutes[1]}` : `${hours} ${minutes}`;
+     let res = '';
+     if (h !== 0) res = hours + ' ' + (m === 0 ? 'oh oh' : minutes.startsWith('0') ? `oh ${minutes[1]}` : minutes);
+     else if (m !== 0) res = minutes + ' ' + (s === 0 ? 'oh oh' : seconds.startsWith('0') ? `oh ${seconds[1]}` : seconds);
+     else res = seconds;
+     return res + ' ' + (s === 0 ? '' : seconds.startsWith('0') ? `oh ${seconds[1]}` : seconds);
+   });
+   text = text.replace(POUNDS_RE, (_, amount) => `${amount.replace(/,/g, '')} pounds`);
+   text = text.replace(DOLLARS_RE, (_, amount) => {
+     const parts = amount.replace(/,/g, '').split('.');
+     const dollars = parseInt(parts[0]) || 0;
+     const cents = parts[1] ? parseInt(parts[1]) : 0;
+     if (dollars && cents) return `${dollars} ${dollars === 1 ? 'dollar' : 'dollars'}, ${cents} ${cents === 1 ? 'cent' : 'cents'}`;
+     if (dollars) return `${dollars} ${dollars === 1 ? 'dollar' : 'dollars'}`;
+     if (cents) return `${cents} ${cents === 1 ? 'cent' : 'cents'}`;
+     return 'zero dollars';
+   });
+   // Read decimals digit by digit: "3.14" -> "3 point 1 4"
+   text = text.replace(DECIMAL_NUMBER_RE, m => m.split('.').map(part => part.split('').join(' ')).join(' point '));
+   text = text.replace(MULTIPLY_RE, '$1 times $2');
+   text = text.replace(DIVIDE_RE, '$1 over $2');
+   text = text.replace(ADD_RE, '$1 plus $2');
+   text = text.replace(SUBTRACT_RE, (_, a, b) => (a ? a : '') + ' minus ' + b);
+   text = text.replace(FRACTION_RE, '$1 over $2');
+   text = text.replace(ORDINAL_RE, (_, num) => ordinalToWords(parseInt(num)));
+   text = text.replace(NUMBER_RE, m => {
+     const num = parseInt(m);
+     if (num > 1000 && num < 3000) {
+       if (num === 2000) return 'two thousand';
+       if (num > 2000 && num < 2010) return 'two thousand ' + numberToWords(num % 100);
+       if (num % 100 === 0) return numberToWords(Math.floor(num / 100)) + ' hundred';
+       return numberToWords(num, { zero: 'oh', group: 2 });
+     }
+     return numberToWords(num);
+   });
+   return text;
+ }
+
+ const SPECIAL_CHARACTERS = [
+   [/@/g, ' at '], [/&/g, ' and '], [/%/g, ' percent '], [/:/g, '.'], [/;/g, ','], [/\+/g, ' plus '],
+   [/\\/g, ' backslash '], [/~/g, ' about '], [/(^| )<3/g, ' heart '],
+   [/<=/g, ' less than or equal to '], [/>=/g, ' greater than or equal to '],
+   [/</g, ' less than '], [/>/g, ' greater than '], [/=/g, ' equals '], [/\//g, ' slash '], [/_/g, ' ']
+ ];
+ const LINK_HEADER_RE = /https?:\/\//gi;
+ const DASH_RE = /(.) - (.)/g;
+ const DOT_RE = /([A-Z])\.([A-Z])/gi;
+ const PARENTHESES_RE = /[\(\[\{][^\)\]\}]*[\)\]\}](.)?/g;
+
+ function normalizeSpecial(text) {
+   text = text.replace(LINK_HEADER_RE, 'h t t p s colon slash slash ');
+   text = text.replace(DASH_RE, '$1, $2');
+   text = text.replace(DOT_RE, '$1 dot $2');
+   text = text.replace(PARENTHESES_RE, (m, after) => {
+     let result = m.replace(/[\(\[\{]/g, ', ').replace(/[\)\]\}]/g, ', ');
+     if (after && /[$.!?,]/.test(after)) result = result.slice(0, -2) + after;
+     return result;
+   });
+   return text;
+ }
+
+ function expandSpecialCharacters(text) {
+   for (const [regex, replacement] of SPECIAL_CHARACTERS) text = text.replace(regex, replacement);
+   return text;
+ }
+
+ function collapseWhitespace(text) {
+   return text.replace(/\s+/g, ' ').replace(/ ([.\?!,])/g, '$1');
+ }
+
+ function dedupPunctuation(text) {
+   return text
+     .replace(/\.\.\.+/g, '[ELLIPSIS]')
+     .replace(/,+/g, ',')
+     .replace(/[.,]*\.[.,]*/g, '.')
+     .replace(/[.,!]*![.,!]*/g, '!')
+     .replace(/[.,!?]*\?[.,!?]*/g, '?')
+     .replace(/\[ELLIPSIS\]/g, '...');
+ }
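+ // E.g. dedupPunctuation('Really?!?') -> 'Really?', while a true ellipsis
+ // ('wait...') survives intact via the [ELLIPSIS] placeholder.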
+
+ const SENTENCE_SPLIT_RE = /[^.!?]+[.!?]+|[^.!?]+$/g;
+
+ function splitTextIntoSentences(text) {
+   const matches = text.match(SENTENCE_SPLIT_RE);
+   if (!matches) return [];
+   return matches.map(sentence => sentence.trim()).filter(Boolean);
+ }
+
+ function splitTokenIdsIntoChunks(tokenIds, maxTokens) {
+   const chunks = [];
+   for (let i = 0; i < tokenIds.length; i += maxTokens) {
+     const chunkText = tokenizerProcessor.decodeIds(tokenIds.slice(i, i + maxTokens)).trim();
+     if (chunkText) chunks.push(chunkText);
+   }
+   return chunks;
+ }
+
+ // Split text into sentence chunks (target <= CHUNK_TARGET_TOKENS tokens)
+ function splitIntoBestSentences(text) {
+   const preparedText = prepareText(text);
+   if (!preparedText) return [];
+
+   const sentences = splitTextIntoSentences(preparedText);
+   if (sentences.length === 0) return [];
+
+   // Merge sentences into chunks that stay within the token target
+   const chunks = [];
+   let currentChunk = '';
+   for (const sentenceText of sentences) {
+     const sentenceTokenIds = tokenizerProcessor.encodeIds(sentenceText);
+     const sentenceTokens = sentenceTokenIds.length;
+
+     if (sentenceTokens > CHUNK_TARGET_TOKENS) {
+       if (currentChunk !== '') {
+         chunks.push(currentChunk.trim());
+         currentChunk = '';
+       }
+       const splitChunks = splitTokenIdsIntoChunks(sentenceTokenIds, CHUNK_TARGET_TOKENS);
+       for (const splitChunk of splitChunks) {
+         if (splitChunk) chunks.push(splitChunk.trim());
+       }
+       continue;
+     }
+
+     if (currentChunk === '') {
+       currentChunk = sentenceText;
+       continue;
+     }
+
+     const combined = `${currentChunk} ${sentenceText}`;
+     const combinedTokens = tokenizerProcessor.encodeIds(combined).length;
+     if (combinedTokens > CHUNK_TARGET_TOKENS) {
+       chunks.push(currentChunk.trim());
+       currentChunk = sentenceText;
+     } else {
+       currentChunk = combined;
+     }
+   }
+
+   if (currentChunk !== '') {
+     chunks.push(currentChunk.trim());
+   }
+
+   return chunks;
+ }
+
+ // Pocket TTS specific text preprocessing
+ function prepareText(text) {
+   text = text.trim();
+   if (!text) return '';
+
+   // Convert to ASCII
+   text = convertToAscii(text);
+
+   // Normalize numbers first
+   text = normalizeNumbers(text);
+
+   // Normalize special characters
+   text = normalizeSpecial(text);
+
+   // Expand abbreviations
+   text = expandAbbreviations(text);
+
+   // Expand special characters
+   text = expandSpecialCharacters(text);
+
+   // Collapse whitespace
+   text = collapseWhitespace(text);
+
+   // Deduplicate punctuation
+   text = dedupPunctuation(text);
+
+   // Final cleanup
+   text = text.trim();
+
+   // Ensure proper punctuation at end
+   if (text && text[text.length - 1].match(/[a-zA-Z0-9]/)) {
+     text = text + '.';
+   }
+
+   // Capitalize first letter
+   if (text && !text[0].match(/[A-Z]/)) {
+     text = text[0].toUpperCase() + text.slice(1);
+   }
+
+   return text;
+ }
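+ // Hand-traced example of the full preprocessing pipeline:
+ //   prepareText("meet mr. smith at 3:00") -> "Meet mister smith at three o'clock."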
+
+ // ----------------------------------------------------------------------------
+ // Worker Logic
+ // ----------------------------------------------------------------------------
+
+ self.onmessage = async (e) => {
+   const { type, data } = e.data;
+   console.log('Worker received message:', type);
+
+   if (type === 'load') {
+     try {
+       await loadModels();
+       postMessage({ type: 'loaded' });
+     } catch (err) {
+       postMessage({ type: 'error', error: err.toString() });
+     }
+   } else if (type === 'generate') {
+     if (!isReady) {
+       postMessage({ type: 'error', error: 'Models are not loaded yet.' });
+       return;
+     }
+     if (isGenerating) return;
+     try {
+       await startGeneration(data.text, data.voice);
+     } catch (err) {
+       console.error('Generation Error:', err);
+       postMessage({ type: 'error', error: err.toString() });
+     }
+   } else if (type === 'encode_voice') {
+     if (!isReady) {
+       postMessage({ type: 'error', error: 'Models are not loaded yet.' });
+       return;
+     }
+     try {
+       const embedding = await encodeVoiceAudio(data.audio);
+       currentVoiceEmbedding = embedding;
+       currentVoiceName = 'custom';
+       postMessage({ type: 'voice_encoded', voiceName: 'custom' });
+     } catch (err) {
+       console.error('Voice encoding error:', err);
+       postMessage({ type: 'error', error: 'Failed to encode voice: ' + err.toString() });
+     }
+   } else if (type === 'set_voice') {
+     if (!isReady) {
+       postMessage({ type: 'error', error: 'Models are not loaded yet.' });
+       return;
+     }
+     if (data.voiceName === 'custom') {
+       // Custom voice already set via encode_voice
+       postMessage({ type: 'voice_set', voiceName: 'custom' });
+     } else if (predefinedVoices[data.voiceName]) {
+       currentVoiceEmbedding = predefinedVoices[data.voiceName];
+       currentVoiceName = data.voiceName;
+       postMessage({ type: 'voice_set', voiceName: data.voiceName });
+     } else {
+       postMessage({ type: 'error', error: `Unknown voice: ${data.voiceName}` });
+     }
+   } else if (type === 'set_lsd') {
+     // Dynamic LSD adjustment for edge devices
+     const newLSD = Math.max(1, Math.min(MAX_LSD, data.lsd));
+     if (newLSD !== currentLSD) {
+       console.log(`LSD adjusted: ${currentLSD} → ${newLSD}`);
+       currentLSD = newLSD;
+     }
+   } else if (type === 'stop') {
+     isGenerating = false;
+     postMessage({ type: 'status', status: 'Stopped', state: 'idle' });
+   }
+ };
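+ // Illustrative host-side usage (the file name and playback queue here are
+ // assumptions, not part of this worker):
+ //   const worker = new Worker('/tts-worker.js', { type: 'module' });
+ //   worker.onmessage = (e) => {
+ //     if (e.data.type === 'loaded') {
+ //       worker.postMessage({ type: 'generate', data: { text: 'Hello there!', voice: 'cosette' } });
+ //     } else if (e.data.type === 'audio_chunk') {
+ //       playbackQueue.push(e.data.data); // Float32Array of mono samples at 24 kHz
+ //     }
+ //   };
+ //   worker.postMessage({ type: 'load' });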
+
+ async function loadModels() {
+   if (mimiEncoderSession) return;
+
+   postMessage({ type: 'status', status: 'Loading ONNX Runtime...', state: 'loading' });
+
+   // Load ONNX Runtime dynamically
+   const version = '1.20.0';
+   const cdnBase = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${version}/dist/`;
+
+   try {
+     const ortModule = await import(`${cdnBase}ort.min.mjs`);
+     ort = ortModule.default || ortModule;
+   } catch (e) {
+     console.error('Failed to load ONNX Runtime:', e);
+     throw new Error('Failed to load ONNX Runtime: ' + e.message);
+   }
+
+   if (!ort) {
+     throw new Error('ONNX Runtime failed to load');
+   }
+
+   postMessage({ type: 'status', status: 'Loading models...', state: 'loading' });
+
+   // Configure WASM paths
+   ort.env.wasm.wasmPaths = cdnBase;
+
+   // Enable SIMD for significant performance boost (2-4x faster)
+   ort.env.wasm.simd = true;
+
+   // Configure multi-threading
+   if (!self.crossOriginIsolated) {
+     console.warn('Environment is not cross-origin isolated. Disabling WASM multi-threading.');
+     console.warn('To enable multi-threading, serve with headers:');
+     console.warn('  Cross-Origin-Opener-Policy: same-origin');
+     console.warn('  Cross-Origin-Embedder-Policy: require-corp');
+     ort.env.wasm.numThreads = 1;
+   } else {
+     const threads = Math.min(navigator.hardwareConcurrency || 4, 8);
+     ort.env.wasm.numThreads = threads;
+     if (DEBUG_LOGS) {
+       console.log(`Multi-threading enabled with ${threads} threads`);
+     }
+   }
+
+   console.log(`ORT: crossOriginIsolated=${self.crossOriginIsolated}, simd=${ort.env.wasm.simd}, threads=${ort.env.wasm.numThreads}`);
+
+   try {
+     const sessionOptions = {
+       executionProviders: ['wasm'],
+       graphOptimizationLevel: 'all'
+     };
+
+     // Load all models in parallel
+     postMessage({ type: 'status', status: 'Loading MIMI encoder...', state: 'loading' });
+     if (DEBUG_LOGS) {
+       console.log('Loading MIMI encoder...');
+     }
+
+     const [encoderRes, textCondRes, flowMainRes, flowFlowRes, decoderRes] = await Promise.all([
+       ort.InferenceSession.create(MODELS.mimi_encoder, sessionOptions),
+       ort.InferenceSession.create(MODELS.text_conditioner, sessionOptions),
+       ort.InferenceSession.create(MODELS.flow_lm_main, sessionOptions),
+       ort.InferenceSession.create(MODELS.flow_lm_flow, sessionOptions),
+       ort.InferenceSession.create(MODELS.mimi_decoder, sessionOptions)
+     ]);
+
+     mimiEncoderSession = encoderRes;
+     textConditionerSession = textCondRes;
+     flowLmMainSession = flowMainRes;
+     flowLmFlowSession = flowFlowRes;
+     mimiDecoderSession = decoderRes;
+
+     if (DEBUG_LOGS) {
+       console.log('All models loaded successfully');
+       console.log('Flow LM Main inputs:', flowLmMainSession.inputNames);
+       console.log('Flow LM Main outputs:', flowLmMainSession.outputNames);
+       console.log('MIMI decoder inputs:', mimiDecoderSession.inputNames);
+       console.log('MIMI decoder outputs:', mimiDecoderSession.outputNames);
+     }
+
+     // Load tokenizer
+     postMessage({ type: 'status', status: 'Loading tokenizer...', state: 'loading' });
+     if (DEBUG_LOGS) {
+       console.log('Loading tokenizer...');
+     }
+
+     const tokenizerResponse = await fetch(MODELS.tokenizer);
+     if (!tokenizerResponse.ok) {
+       throw new Error(`Failed to load tokenizer: ${tokenizerResponse.statusText}`);
+     }
+     const tokenizerBuffer = await tokenizerResponse.arrayBuffer();
+     // Base64-encode in chunks; spreading the whole buffer into
+     // String.fromCharCode(...) can overflow the call stack for large models.
+     const tokenizerBytes = new Uint8Array(tokenizerBuffer);
+     let tokenizerBinary = '';
+     for (let i = 0; i < tokenizerBytes.length; i += 0x8000) {
+       tokenizerBinary += String.fromCharCode(...tokenizerBytes.subarray(i, i + 0x8000));
+     }
+     tokenizerModelB64 = btoa(tokenizerBinary);
+
+     // Import and initialize sentencepiece processor
+     const spModule = await import('./sentencepiece.js?v=2');
+     const SentencePieceProcessor = spModule.SentencePieceProcessor;
+     if (!SentencePieceProcessor) {
+       throw new Error('SentencePieceProcessor not found in sentencepiece.js');
+     }
+     tokenizerProcessor = new SentencePieceProcessor();
+     await tokenizerProcessor.loadFromB64StringModel(tokenizerModelB64);
+     if (DEBUG_LOGS) {
+       console.log('Tokenizer loaded');
+     }
+
+     // Load predefined voices
+     postMessage({ type: 'status', status: 'Loading voices...', state: 'loading' });
+     if (DEBUG_LOGS) {
+       console.log('Loading predefined voices...');
+     }
+
+     try {
+       const voicesResponse = await fetch(MODELS.voices);
+       if (voicesResponse.ok) {
+         const voicesData = await voicesResponse.arrayBuffer();
+         predefinedVoices = parseVoicesBin(voicesData);
+         if (DEBUG_LOGS) {
+           console.log('Loaded voices:', Object.keys(predefinedVoices));
+         }
+
+         // Set default voice
+         if (predefinedVoices['cosette']) {
+           currentVoiceEmbedding = predefinedVoices['cosette'];
+           currentVoiceName = 'cosette';
+         } else {
+           // Use first available voice
+           const firstVoice = Object.keys(predefinedVoices)[0];
+           if (firstVoice) {
+             currentVoiceEmbedding = predefinedVoices[firstVoice];
+             currentVoiceName = firstVoice;
+           }
+         }
+       }
+     } catch (e) {
+       console.warn('Could not load predefined voices:', e);
+       postMessage({ type: 'status', status: 'Voice load error: ' + e.message, state: 'loading' });
+     }
+
+     // Send list of available voices
+     postMessage({
+       type: 'voices_loaded',
+       voices: Object.keys(predefinedVoices),
+       defaultVoice: currentVoiceName
+     });
+
+     // Pre-allocate s/t tensors for the flow matching loop (optimization).
+     // Pre-allocate for every LSD up to MAX_LSD to support dynamic switching.
+     if (DEBUG_LOGS) {
+       console.log(`Pre-allocating Flow Matching tensors for LSD 1-${MAX_LSD}...`);
+     }
+     stTensors = {};
+
+     for (let lsd = 1; lsd <= MAX_LSD; lsd++) {
+       stTensors[lsd] = [];
+       const dt = 1.0 / lsd;
+       for (let j = 0; j < lsd; j++) {
+         const s = j / lsd;
+         const t = s + dt;
+         stTensors[lsd].push({
+           s: new ort.Tensor('float32', new Float32Array([s]), [1, 1]),
+           t: new ort.Tensor('float32', new Float32Array([t]), [1, 1])
+         });
+       }
+     }
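+     // Layout: stTensors[lsd][j] holds the [s, t] interval for Euler step j of an
+     // lsd-step solve, e.g. stTensors[4][1] is { s: 0.25, t: 0.5 }.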
+
+     isReady = true;
+     postMessage({ type: 'status', status: 'Ready', state: 'idle' });
+     postMessage({ type: 'model_status', status: 'ready', text: 'Ready' });
+     postMessage({ type: 'loaded' });
+
+   } catch (err) {
+     console.error('Model load failed:', err);
+     throw err;
+   }
+ }
+
+ function parseVoicesBin(buffer) {
+   // Simple binary format:
+   //   Header: 4 bytes (uint32) = number of voices
+   //   For each voice:
+   //     - 32 bytes: voice name (null-terminated string)
+   //     - 4 bytes (uint32): number of frames
+   //     - 4 bytes (uint32): embedding dim (1024)
+   //     - frames * dim * 4 bytes: float32 embeddings
+
+   const voices = {};
+   const view = new DataView(buffer);
+   let offset = 0;
+
+   const numVoices = view.getUint32(offset, true);
+   offset += 4;
+
+   for (let i = 0; i < numVoices; i++) {
+     // Read voice name
+     const nameBytes = new Uint8Array(buffer, offset, 32);
+     const nameEnd = nameBytes.indexOf(0);
+     const name = new TextDecoder().decode(nameBytes.subarray(0, nameEnd > 0 ? nameEnd : 32)).trim();
+     offset += 32;
+
+     // Read dimensions
+     const numFrames = view.getUint32(offset, true);
+     offset += 4;
+     const embDim = view.getUint32(offset, true);
+     offset += 4;
+
+     // Read embeddings (slice copies the bytes, avoiding alignment issues)
+     const embSize = numFrames * embDim;
+     const embeddings = new Float32Array(buffer.slice(offset, offset + embSize * 4));
+     offset += embSize * 4;
+
+     voices[name] = {
+       data: embeddings,
+       shape: [1, numFrames, embDim]
+     };
+
+     console.log(`Loaded voice '${name}': ${numFrames} frames, ${embDim} dim`);
+   }
+
+   return voices;
+ }
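+ // E.g. one voice record with 125 frames at dim 1024 occupies
+ // 32 + 4 + 4 + 125 * 1024 * 4 = 512,040 bytes after the 4-byte file header.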
+
+ async function encodeVoiceAudio(audioData) {
+   // audioData should be Float32Array at 24kHz, mono
+   // Reshape to [1, 1, samples]
+   const input = new ort.Tensor('float32', audioData, [1, 1, audioData.length]);
+
+   const outputs = await mimiEncoderSession.run({ audio: input });
+   const embeddings = outputs[mimiEncoderSession.outputNames[0]];
+
+   return {
+     data: new Float32Array(embeddings.data),
+     shape: embeddings.dims
+   };
+ }
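+ // Illustrative host-side prep for 'encode_voice' (an assumption, not part of this
+ // worker): decode an uploaded file and resample to 24 kHz mono with Web Audio.
+ //   const decoded = await new AudioContext().decodeAudioData(fileArrayBuffer);
+ //   const off = new OfflineAudioContext(1, Math.ceil(decoded.duration * 24000), 24000);
+ //   const src = off.createBufferSource();
+ //   src.buffer = decoded; src.connect(off.destination); src.start();
+ //   const mono = (await off.startRendering()).getChannelData(0); // Float32Array
+ //   worker.postMessage({ type: 'encode_voice', data: { audio: mono } });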
+
+ // Hardcoded state shapes extracted from ONNX model metadata.
+ // These are the initial shapes - dynamic dimensions start at 0.
+ const FLOW_LM_STATE_SHAPES = {
+   // KV cache layers: [kv=2, batch=1, max_seq=1000, heads=16, head_dim=64]
+   state_0: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
+   state_1: { shape: [0], dtype: 'float32' }, // dynamic
+   state_2: { shape: [1], dtype: 'int64' }, // step counter
+   state_3: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
+   state_4: { shape: [0], dtype: 'float32' },
+   state_5: { shape: [1], dtype: 'int64' },
+   state_6: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
+   state_7: { shape: [0], dtype: 'float32' },
+   state_8: { shape: [1], dtype: 'int64' },
+   state_9: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
+   state_10: { shape: [0], dtype: 'float32' },
+   state_11: { shape: [1], dtype: 'int64' },
+   state_12: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
+   state_13: { shape: [0], dtype: 'float32' },
+   state_14: { shape: [1], dtype: 'int64' },
+   state_15: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
+   state_16: { shape: [0], dtype: 'float32' },
+   state_17: { shape: [1], dtype: 'int64' },
+ };
+
+ const MIMI_DECODER_STATE_SHAPES = {
+   state_0: { shape: [1], dtype: 'bool' },
+   state_1: { shape: [1, 512, 6], dtype: 'float32' },
+   state_2: { shape: [1], dtype: 'bool' },
+   state_3: { shape: [1, 64, 2], dtype: 'float32' },
+   state_4: { shape: [1, 256, 6], dtype: 'float32' },
+   state_5: { shape: [1], dtype: 'bool' },
+   state_6: { shape: [1, 256, 2], dtype: 'float32' },
+   state_7: { shape: [1], dtype: 'bool' },
+   state_8: { shape: [1, 128, 0], dtype: 'float32' }, // dynamic
+   state_9: { shape: [1, 128, 5], dtype: 'float32' },
+   state_10: { shape: [1], dtype: 'bool' },
+   state_11: { shape: [1, 128, 2], dtype: 'float32' },
+   state_12: { shape: [1], dtype: 'bool' },
+   state_13: { shape: [1, 64, 0], dtype: 'float32' }, // dynamic
+   state_14: { shape: [1, 64, 4], dtype: 'float32' },
+   state_15: { shape: [1], dtype: 'bool' },
+   state_16: { shape: [1, 64, 2], dtype: 'float32' },
+   state_17: { shape: [1], dtype: 'bool' },
+   state_18: { shape: [1, 32, 0], dtype: 'float32' }, // dynamic
+   state_19: { shape: [2, 1, 8, 1000, 64], dtype: 'float32' },
+   state_20: { shape: [1], dtype: 'int64' },
+   state_21: { shape: [1], dtype: 'int64' },
+   state_22: { shape: [2, 1, 8, 1000, 64], dtype: 'float32' },
+   state_23: { shape: [1], dtype: 'int64' },
+   state_24: { shape: [1], dtype: 'int64' },
+   state_25: { shape: [1], dtype: 'bool' },
+   state_26: { shape: [1, 512, 16], dtype: 'float32' },
+   state_27: { shape: [1], dtype: 'bool' },
+   state_28: { shape: [1, 1, 6], dtype: 'float32' },
+   state_29: { shape: [1], dtype: 'bool' },
+   state_30: { shape: [1, 64, 2], dtype: 'float32' },
+   state_31: { shape: [1], dtype: 'bool' },
+   state_32: { shape: [1, 32, 0], dtype: 'float32' }, // dynamic
+   state_33: { shape: [1], dtype: 'bool' },
+   state_34: { shape: [1, 512, 2], dtype: 'float32' },
+   state_35: { shape: [1], dtype: 'bool' },
+   state_36: { shape: [1, 64, 4], dtype: 'float32' },
+   state_37: { shape: [1], dtype: 'bool' },
+   state_38: { shape: [1, 128, 2], dtype: 'float32' },
+   state_39: { shape: [1], dtype: 'bool' },
+   state_40: { shape: [1, 64, 0], dtype: 'float32' }, // dynamic
+   state_41: { shape: [1], dtype: 'bool' },
+   state_42: { shape: [1, 128, 5], dtype: 'float32' },
+   state_43: { shape: [1], dtype: 'bool' },
+   state_44: { shape: [1, 256, 2], dtype: 'float32' },
+   state_45: { shape: [1], dtype: 'bool' },
+   state_46: { shape: [1, 128, 0], dtype: 'float32' }, // dynamic
+   state_47: { shape: [1], dtype: 'bool' },
+   state_48: { shape: [1, 256, 6], dtype: 'float32' },
+   state_49: { shape: [2, 1, 8, 1000, 64], dtype: 'float32' },
+   state_50: { shape: [1], dtype: 'int64' },
+   state_51: { shape: [1], dtype: 'int64' },
+   state_52: { shape: [2, 1, 8, 1000, 64], dtype: 'float32' },
+   state_53: { shape: [1], dtype: 'int64' },
+   state_54: { shape: [1], dtype: 'int64' },
+   state_55: { shape: [1, 512, 16], dtype: 'float32' },
+ };
+
+ function initState(session, stateShapes) {
+   /**
+    * Initialize state tensors for a stateful ONNX model using hardcoded shapes.
+    */
+   const state = {};
+
+   for (const inputName of session.inputNames) {
+     if (inputName.startsWith('state_')) {
+       const stateInfo = stateShapes[inputName];
+       if (!stateInfo) {
+         console.warn(`Unknown state input: ${inputName}, skipping`);
+         continue;
+       }
+
+       const { shape, dtype } = stateInfo;
+       const size = shape.reduce((a, b) => a * b, 1);
+
+       let data;
+       if (dtype === 'int64') {
+         data = new BigInt64Array(size);
+       } else if (dtype === 'bool') {
+         data = new Uint8Array(size);
+       } else {
+         data = new Float32Array(size);
+       }
+
+       state[inputName] = new ort.Tensor(dtype, data, shape);
+       if (DEBUG_LOGS) {
+         console.log(`Init state ${inputName}: shape=${JSON.stringify(shape)}, dtype=${dtype}`);
+       }
+     }
+   }
+
+   return state;
+ }
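+ // All state tensors start zeroed. E.g. state_0 of the flow LM is a float32 KV
+ // cache of shape [2, 1, 1000, 16, 64]: 2 * 1000 * 16 * 64 * 4 bytes = 8,192,000
+ // bytes, roughly 8 MB per cached layer.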
+
+ async function startGeneration(text, voiceName) {
+   isGenerating = true;
+   currentLSD = MAX_LSD; // Reset to max quality for each new generation
+   postMessage({ type: 'status', status: 'Generating...', state: 'running' });
+   postMessage({ type: 'generation_started', data: { time: performance.now() } });
+
+   try {
+     // Split text into sentence chunks (target <= CHUNK_TARGET_TOKENS tokens)
+     const chunks = splitIntoBestSentences(text);
+     console.log(`Split into ${chunks.length} chunks:`, chunks);
+
+     if (chunks.length === 0) {
+       throw new Error('No text to generate');
+     }
+
+     // Get voice embedding
+     let voiceEmb = currentVoiceEmbedding;
+     if (voiceName && voiceName !== currentVoiceName) {
+       if (predefinedVoices[voiceName]) {
+         voiceEmb = predefinedVoices[voiceName];
+         currentVoiceEmbedding = voiceEmb;
+         currentVoiceName = voiceName;
+       }
+     }
+
+     if (!voiceEmb) {
+       throw new Error('No voice embedding available. Please select a voice or upload custom audio.');
+     }
+
+     // Run generation pipeline with chunks
+     await runGenerationPipeline(voiceEmb, chunks);
+
+   } catch (err) {
+     console.error('Generation error:', err);
+     postMessage({ type: 'error', error: err.toString() });
+   } finally {
+     if (isGenerating) {
+       postMessage({ type: 'stream_ended' });
+       postMessage({ type: 'status', status: 'Finished', state: 'idle' });
+     }
+     isGenerating = false;
+   }
+ }
+
+ async function runGenerationPipeline(voiceEmb, chunks) {
+   // Initialize state - may be reset per chunk
+   let mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
+   const emptySeq = new ort.Tensor('float32', new Float32Array(0), [1, 0, 32]);
+   const emptyTextEmb = new ort.Tensor('float32', new Float32Array(0), [1, 0, 1024]);
+
+   // Voice embedding tensor
+   const voiceTensor = new ort.Tensor('float32', voiceEmb.data, voiceEmb.shape);
+   console.log('Voice embeddings shape:', voiceEmb.shape);
+
+   async function buildVoiceConditionedState() {
+     let flowLmState = initState(flowLmMainSession, FLOW_LM_STATE_SHAPES);
+     console.log('Running voice conditioning...');
+     const voiceCondInputs = {
+       sequence: emptySeq,
+       text_embeddings: voiceTensor,
+       ...flowLmState
+     };
+
+     let condResult = await flowLmMainSession.run(voiceCondInputs);
+
+     // Update state from voice conditioning
+     for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
+       const outputName = flowLmMainSession.outputNames[i];
+       if (outputName.startsWith('out_state_')) {
+         const stateIdx = parseInt(outputName.replace('out_state_', ''));
+         flowLmState[`state_${stateIdx}`] = condResult[outputName];
+       }
+     }
+     return flowLmState;
+   }
+
+   let flowLmState = await buildVoiceConditionedState();
+
+   // Streaming parameters
+   const FIRST_CHUNK_FRAMES = 3;
+   const NORMAL_CHUNK_FRAMES = 12;
+
+   // Tracking across all chunks
+   const allGeneratedLatents = [];
+   let isFirstAudioChunk = true;
+   let totalDecodedFrames = 0;
+   let totalFlowLmTime = 0;
+   let totalDecodeTime = 0;
+   const arStartTime = performance.now();
+
+   // Process each text chunk
+   for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) {
+     if (!isGenerating) break;
+
+     if (RESET_FLOW_STATE_EACH_CHUNK && chunkIdx > 0) {
+       flowLmState = await buildVoiceConditionedState();
+     }
+     if (RESET_MIMI_STATE_EACH_CHUNK && chunkIdx > 0) {
+       mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
+     }
+
+     const chunkText = chunks[chunkIdx];
+     console.log(`Processing chunk ${chunkIdx + 1}/${chunks.length}: "${chunkText}"`);
+
+     let isFirstAudioChunkOfTextChunk = true;
+
+     // Tokenize this chunk
+     const tokenIds = tokenizerProcessor.encodeIds(chunkText);
+     console.log(`Chunk ${chunkIdx + 1} tokens:`, tokenIds.length);
+
+     // Text conditioning for this chunk
+     const textInput = new ort.Tensor('int64', BigInt64Array.from(tokenIds.map(x => BigInt(x))), [1, tokenIds.length]);
+     const textCondResult = await textConditionerSession.run({ token_ids: textInput });
+     let textEmb = textCondResult[textConditionerSession.outputNames[0]];
+
+     if (textEmb.dims.length === 2) {
+       textEmb = new ort.Tensor('float32', textEmb.data, [1, textEmb.dims[0], textEmb.dims[1]]);
+     }
+
+     const textCondInputs = {
+       sequence: emptySeq,
+       text_embeddings: textEmb,
+       ...flowLmState
+     };
+
+     let condResult = await flowLmMainSession.run(textCondInputs);
+
+     // Update state from text conditioning
+     for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
+       const outputName = flowLmMainSession.outputNames[i];
+       if (outputName.startsWith('out_state_')) {
+         const stateIdx = parseInt(outputName.replace('out_state_', ''));
+         flowLmState[`state_${stateIdx}`] = condResult[outputName];
+       }
+     }
+
+     // AR generation for this chunk
+     const chunkLatents = [];
+     let currentLatent = new ort.Tensor('float32', new Float32Array(32).fill(NaN), [1, 1, 32]);
+     let chunkDecodedFrames = 0;
+     const FRAMES_AFTER_EOS = 3; // Match PyTorch behavior - generate extra frames after EOS
+     let eosStep = null;
+
+     let chunkEnded = false;
+     let chunkGenTimeMs = 0;
+     for (let step = 0; step < MAX_FRAMES; step++) {
+       if (!isGenerating) break;
+
+       // Yield every 4 steps to allow message processing (e.g., set_lsd)
+       if (step > 0 && step % 4 === 0) {
+         await new Promise(r => setTimeout(r, 0));
+       }
+
+       const arInputs = {
+         sequence: currentLatent,
+         text_embeddings: emptyTextEmb,
+         ...flowLmState
+       };
+
+       const stepStart = performance.now();
+       const arResult = await flowLmMainSession.run(arInputs);
+       const stepElapsed = performance.now() - stepStart;
+       chunkGenTimeMs += stepElapsed;
+
+       const conditioning = arResult['conditioning'];
+       const eosLogit = arResult['eos_logit'].data[0];
+       const isEos = eosLogit > -4.0;
+
+       // Track when EOS is first detected
+       if (isEos && eosStep === null) {
+         eosStep = step;
+       }
+
+       // Only stop after FRAMES_AFTER_EOS additional frames
+       const shouldStop = eosStep !== null && step >= eosStep + FRAMES_AFTER_EOS;
+
+       // Flow matching (LSD loop) - uses currentLSD which can be adjusted dynamically
+       const TEMP = 0.7;
+       const STD = Math.sqrt(TEMP);
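+       // Draw x ~ N(0, TEMP * I) with the Box-Muller transform: two uniforms
+       // u, v in (0, 1) yield a standard normal sqrt(-2 ln u) * cos(2 pi v),
+       // scaled by STD = sqrt(TEMP).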
+       let xData = new Float32Array(32);
+       for (let i = 0; i < 32; i++) {
+         let u = 0, v = 0;
+         while (u === 0) u = Math.random();
+         while (v === 0) v = Math.random();
+         xData[i] = Math.sqrt(-2.0 * Math.log(u)) * Math.cos(2.0 * Math.PI * v) * STD;
+       }
+
+       const lsdSteps = currentLSD;
+       const dt = 1.0 / lsdSteps;
+
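+       // Integrate the learned flow from noise (s = 0) toward a clean latent
+       // (t = 1) with lsdSteps explicit Euler steps: x <- x + v(x, s, t) * dt.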
+       for (let j = 0; j < lsdSteps; j++) {
+         const flowInputs = {
+           c: conditioning,
+           s: stTensors[lsdSteps][j].s,
+           t: stTensors[lsdSteps][j].t,
+           x: new ort.Tensor('float32', xData, [1, 32])
+         };
+
+         const flowResult = await flowLmFlowSession.run(flowInputs);
+         const v = flowResult['flow_dir'].data;
+
+         for (let k = 0; k < 32; k++) {
+           xData[k] += v[k] * dt;
+         }
+       }
+
+       totalFlowLmTime += stepElapsed;
+
+       const latentData = xData;
+       chunkLatents.push(new Float32Array(latentData));
+       allGeneratedLatents.push(new Float32Array(latentData));
+
+       // Update state
+       currentLatent = new ort.Tensor('float32', latentData, [1, 1, 32]);
+       for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
+         const outputName = flowLmMainSession.outputNames[i];
+         if (outputName.startsWith('out_state_')) {
+           const stateIdx = parseInt(outputName.replace('out_state_', ''));
+           flowLmState[`state_${stateIdx}`] = arResult[outputName];
+         }
+       }
+
+       // Decode audio chunks
+       const pending = chunkLatents.length - chunkDecodedFrames;
+       let decodeSize = 0;
+
+       if (shouldStop) {
+         decodeSize = pending;
+       } else if (isFirstAudioChunk && pending >= FIRST_CHUNK_FRAMES) {
+         decodeSize = FIRST_CHUNK_FRAMES;
+       } else if (pending >= NORMAL_CHUNK_FRAMES) {
+         decodeSize = NORMAL_CHUNK_FRAMES;
+       }
+
+       if (decodeSize > 0) {
+         const decodeLatents = new Float32Array(decodeSize * 32);
+         for (let i = 0; i < decodeSize; i++) {
+           decodeLatents.set(chunkLatents[chunkDecodedFrames + i], i * 32);
+         }
+
+         const latentTensor = new ort.Tensor('float32', decodeLatents, [1, decodeSize, 32]);
+         const decodeInputs = { latent: latentTensor, ...mimiState };
+
+         const decStart = performance.now();
+         const decodeResult = await mimiDecoderSession.run(decodeInputs);
+         const decElapsed = performance.now() - decStart;
+         totalDecodeTime += decElapsed;
+         chunkGenTimeMs += decElapsed;
+         const audioChunk = decodeResult[mimiDecoderSession.outputNames[0]].data;
+
+         // Update MIMI state
+         for (let i = 1; i < mimiDecoderSession.outputNames.length; i++) {
+           const outputName = mimiDecoderSession.outputNames[i];
+           const stateIdx = i - 1;
+           mimiState[`state_${stateIdx}`] = decodeResult[outputName];
+         }
+
+         chunkDecodedFrames += decodeSize;
+         totalDecodedFrames += decodeSize;
+
+         const audioFloat32 = new Float32Array(audioChunk);
+         const isLastChunk = shouldStop && chunkIdx === chunks.length - 1;
+         postMessage({
+           type: 'audio_chunk',
+           data: audioFloat32,
+           metrics: {
+             bbTime: 0,
+             decTime: 0,
+             chunkDuration: audioFloat32.length / SAMPLE_RATE,
+             genTimeSec: chunkGenTimeMs / 1000,
+             isFirst: isFirstAudioChunk,
+             isLast: isLastChunk,
+             chunkStart: isFirstAudioChunkOfTextChunk
+           }
+         }, [audioFloat32.buffer]);
+
+         isFirstAudioChunk = false;
+         isFirstAudioChunkOfTextChunk = false;
+         chunkGenTimeMs = 0;
+       }
+
+       if (shouldStop) {
+         console.log(`Chunk ${chunkIdx + 1} EOS at step ${eosStep}, stopped at step ${step}, ${chunkLatents.length} frames`);
+         chunkEnded = true;
+         break;
+       }
+     }
+
+     if (chunkEnded && isGenerating && chunkIdx < chunks.length - 1) {
+       const gapSamples = Math.max(1, Math.floor(CHUNK_GAP_SEC * SAMPLE_RATE));
+       const silence = new Float32Array(gapSamples);
+       postMessage({
+         type: 'audio_chunk',
+         data: silence,
+         metrics: {
+           bbTime: 0,
+           decTime: 0,
+           chunkDuration: gapSamples / SAMPLE_RATE,
+           isFirst: false,
+           isLast: false,
+           isSilence: true
+         }
+       }, [silence.buffer]);
+     }
+   }
+
+   const totalTime = (performance.now() - arStartTime) / 1000;
+   const audioSeconds = allGeneratedLatents.length * SAMPLES_PER_FRAME / SAMPLE_RATE;
+
+   // RTFx based on actual generation time (flow LM + decoder), not including conditioning
+   const genTime = (totalFlowLmTime + totalDecodeTime) / 1000;
+   const rtfx = audioSeconds / genTime;
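+   // E.g. 12.0 s of audio synthesized in 4.8 s of compute gives RTFx = 2.5,
+   // i.e. 2.5x faster than real time.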
+
+   console.log(`Generation complete: ${allGeneratedLatents.length} frames (${audioSeconds.toFixed(2)}s audio)`);
+   console.log(` Total time: ${totalTime.toFixed(2)}s`);
+   console.log(` Gen time: ${genTime.toFixed(2)}s, RTFx: ${rtfx.toFixed(2)}x`);
+   console.log(` Flow LM: ${(totalFlowLmTime / 1000).toFixed(2)}s (${(totalFlowLmTime / allGeneratedLatents.length).toFixed(1)}ms/step)`);
+   console.log(` Decoder: ${(totalDecodeTime / 1000).toFixed(2)}s`);
+
+   postMessage({
+     type: 'status',
+     status: `Finished (RTFx: ${rtfx.toFixed(2)}x)`,
+     state: 'idle',
+     metrics: { rtfx, genTime, totalTime, audioDuration: audioSeconds }
+   });
+ }
+
+ // Pre-allocated buffers for step counter updates (avoid GC pressure in hot loop)
+ const stepBuffers = {};
+
+ function updateStateSteps(state, increment) {
+   // Update step counters in state dict - reuse buffers to avoid allocation
+   const incBigInt = BigInt(increment);
+   for (const key in state) {
+     if (key.includes('step') && state[key]) {
+       const tensor = state[key];
+       if (tensor.data instanceof BigInt64Array) {
+         // Reuse buffer if same size, otherwise create new one
+         if (!stepBuffers[key] || stepBuffers[key].length !== tensor.data.length) {
+           stepBuffers[key] = new BigInt64Array(tensor.data.length);
+         }
+         const buf = stepBuffers[key];
+         for (let i = 0; i < tensor.data.length; i++) {
+           buf[i] = tensor.data[i] + incBigInt;
+         }
+         state[key] = new ort.Tensor('int64', buf, tensor.dims);
+       }
+     }
+   }
+ }
+ }