webtalk 1.0.0
- package/.gitattributes +35 -0
- package/.github/workflows/publish.yml +26 -0
- package/README.md +1 -0
- package/app.html +519 -0
- package/assets/index-ClpvH5Vn.js +40 -0
- package/assets/index-DUYekU7u.css +1 -0
- package/assets/worker-BPxxCWVT.js +2679 -0
- package/config.js +36 -0
- package/debug.js +21 -0
- package/download-lock.js +26 -0
- package/hot-reload.js +78 -0
- package/middleware.js +62 -0
- package/package.json +33 -0
- package/persistent-state.js +62 -0
- package/sdk.js +22 -0
- package/serve-static.js +45 -0
- package/server.js +177 -0
- package/setup-npm-publishing.sh +140 -0
- package/stt.js +141 -0
- package/test.mp3 +0 -0
- package/tts/EventEmitter.js +59 -0
- package/tts/PCMPlayerWorklet.js +563 -0
- package/tts/inference-worker.js +1121 -0
- package/tts/onnx-streaming.js +721 -0
- package/tts-models.js +97 -0
- package/tts-utils.js +52 -0
- package/tts.js +167 -0
- package/whisper-models.js +161 -0
- package/worker-patch.js +32 -0
package/tts/inference-worker.js
@@ -0,0 +1,1121 @@
// Pocket TTS ONNX Web Worker
console.log('Pocket TTS Worker Starting...');
self.postMessage({ type: 'status', status: 'Worker Thread Started', state: 'idle' });

// Load ONNX Runtime (will be loaded dynamically in loadModels for module worker)
let ort = null;

// Configuration
const MODELS = {
  mimi_encoder: '/models/tts/mimi_encoder.onnx',
  text_conditioner: '/models/tts/text_conditioner.onnx',
  flow_lm_main: '/models/tts/flow_lm_main_int8.onnx',
  flow_lm_flow: '/models/tts/flow_lm_flow_int8.onnx',
  mimi_decoder: '/models/tts/mimi_decoder_int8.onnx',
  tokenizer: '/models/tts/tokenizer.model',
  voices: '/models/tts/voices.bin'
};

const SAMPLE_RATE = 24000;
const SAMPLES_PER_FRAME = 1920;
const MAX_FRAMES = 500;
const DEBUG_LOGS = true;
// Text chunking target; lower if long passages hit generation limits.
const CHUNK_TARGET_TOKENS = 50;
const CHUNK_GAP_SEC = 0.25;
// If true, re-run voice conditioning per chunk to avoid stale AR state.
const RESET_FLOW_STATE_EACH_CHUNK = true;
// If true, reset decoder state per chunk to avoid carry-over artifacts.
const RESET_MIMI_STATE_EACH_CHUNK = true;

// State
let mimiEncoderSession = null;
let textConditionerSession = null;
let flowLmMainSession = null;
let flowLmFlowSession = null;
let mimiDecoderSession = null;
let tokenizerProcessor = null;
let tokenizerModelB64 = null;
let predefinedVoices = {};
let stTensors = {}; // Optimization: pre-allocated s/t tensors, keyed by LSD step count
let isGenerating = false;
let isReady = false;

// Dynamic LSD (Latent Solver/Diffusion steps)
const MAX_LSD = 10; // Default/max quality
let currentLSD = MAX_LSD;

// Current voice embedding (cached)
let currentVoiceEmbedding = null;
let currentVoiceName = null;

// Text preprocessing utilities
const ONES = ['', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen'];
const TENS = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'];
const ORDINAL_ONES = ['', 'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth', 'thirteenth', 'fourteenth', 'fifteenth', 'sixteenth', 'seventeenth', 'eighteenth', 'nineteenth'];
const ORDINAL_TENS = ['', '', 'twentieth', 'thirtieth', 'fortieth', 'fiftieth', 'sixtieth', 'seventieth', 'eightieth', 'ninetieth'];

function numberToWords(num, options = {}) {
  const { andword = '', zero = 'zero', group = 0 } = options;
  if (num === 0) return zero;
  const convert = (n) => {
    if (n < 20) return ONES[n];
    if (n < 100) return TENS[Math.floor(n / 10)] + (n % 10 ? ' ' + ONES[n % 10] : '');
    if (n < 1000) {
      const remainder = n % 100;
      return ONES[Math.floor(n / 100)] + ' hundred' + (remainder ? (andword ? ' ' + andword + ' ' : ' ') + convert(remainder) : '');
    }
    if (n < 1000000) {
      const thousands = Math.floor(n / 1000);
      const remainder = n % 1000;
      return convert(thousands) + ' thousand' + (remainder ? ' ' + convert(remainder) : '');
    }
    if (n < 1000000000) {
      const millions = Math.floor(n / 1000000);
      const remainder = n % 1000000;
      return convert(millions) + ' million' + (remainder ? ' ' + convert(remainder) : '');
    }
    const billions = Math.floor(n / 1000000000);
    const remainder = n % 1000000000;
    return convert(billions) + ' billion' + (remainder ? ' ' + convert(remainder) : '');
  };
  if (group === 2 && num > 1000 && num < 10000) {
    const high = Math.floor(num / 100);
    const low = num % 100;
    if (low === 0) return convert(high) + ' hundred';
    else if (low < 10) return convert(high) + ' ' + (zero === 'oh' ? 'oh' : zero) + ' ' + ONES[low];
    else return convert(high) + ' ' + convert(low);
  }
  return convert(num);
}
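// Worked examples (added for clarity, traced through the branches above):
//   numberToWords(342)                            -> "three hundred forty two"
//   numberToWords(1984, { group: 2 })             -> "nineteen eighty four"
//   numberToWords(1907, { zero: 'oh', group: 2 }) -> "nineteen oh seven"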

function ordinalToWords(num) {
  if (num < 20) return ORDINAL_ONES[num] || numberToWords(num) + 'th';
  if (num < 100) {
    const tens = Math.floor(num / 10);
    const ones = num % 10;
    if (ones === 0) return ORDINAL_TENS[tens];
    return TENS[tens] + ' ' + ORDINAL_ONES[ones];
  }
  const cardinal = numberToWords(num);
  if (cardinal.endsWith('y')) return cardinal.slice(0, -1) + 'ieth';
  if (cardinal.endsWith('one')) return cardinal.slice(0, -3) + 'first';
  if (cardinal.endsWith('two')) return cardinal.slice(0, -3) + 'second';
  if (cardinal.endsWith('three')) return cardinal.slice(0, -5) + 'third';
  if (cardinal.endsWith('ve')) return cardinal.slice(0, -2) + 'fth';
  if (cardinal.endsWith('e')) return cardinal.slice(0, -1) + 'th';
  if (cardinal.endsWith('t')) return cardinal + 'h';
  return cardinal + 'th';
}

const UNICODE_MAP = {
  'à': 'a', 'á': 'a', 'â': 'a', 'ã': 'a', 'ä': 'a', 'å': 'a', 'æ': 'ae', 'ç': 'c', 'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e', 'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i', 'ñ': 'n', 'ò': 'o', 'ó': 'o', 'ô': 'o', 'õ': 'o', 'ö': 'o', 'ø': 'o', 'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u', 'ý': 'y', 'ÿ': 'y', 'ß': 'ss', 'œ': 'oe', 'ð': 'd', 'þ': 'th', 'À': 'A', 'Á': 'A', 'Â': 'A', 'Ã': 'A', 'Ä': 'A', 'Å': 'A', 'Æ': 'AE', 'Ç': 'C', 'È': 'E', 'É': 'E', 'Ê': 'E', 'Ë': 'E', 'Ì': 'I', 'Í': 'I', 'Î': 'I', 'Ï': 'I', 'Ñ': 'N', 'Ò': 'O', 'Ó': 'O', 'Ô': 'O', 'Õ': 'O', 'Ö': 'O', 'Ø': 'O', 'Ù': 'U', 'Ú': 'U', 'Û': 'U', 'Ü': 'U', 'Ý': 'Y', '\u201C': '"', '\u201D': '"', '\u2018': "'", '\u2019': "'", '\u2026': '...', '\u2013': '-', '\u2014': '-'
};

function convertToAscii(text) {
  return text.split('').map(c => UNICODE_MAP[c] || c).join('').normalize('NFD').replace(/[\u0300-\u036f]/g, '');
}

const ABBREVIATIONS = [
  [/\bmrs\./gi, 'misuss'], [/\bms\./gi, 'miss'], [/\bmr\./gi, 'mister'], [/\bdr\./gi, 'doctor'], [/\bst\./gi, 'saint'], [/\bco\./gi, 'company'], [/\bjr\./gi, 'junior'], [/\bmaj\./gi, 'major'], [/\bgen\./gi, 'general'], [/\bdrs\./gi, 'doctors'], [/\brev\./gi, 'reverend'], [/\blt\./gi, 'lieutenant'], [/\bhon\./gi, 'honorable'], [/\bsgt\./gi, 'sergeant'], [/\bcapt\./gi, 'captain'], [/\besq\./gi, 'esquire'], [/\bltd\./gi, 'limited'], [/\bcol\./gi, 'colonel'], [/\bft\./gi, 'fort']
];
const CASED_ABBREVIATIONS = [
  [/\bTTS\b/g, 'text to speech'], [/\bHz\b/g, 'hertz'], [/\bkHz\b/g, 'kilohertz'], [/\bKBs\b/g, 'kilobytes'], [/\bKB\b/g, 'kilobyte'], [/\bMBs\b/g, 'megabytes'], [/\bMB\b/g, 'megabyte'], [/\bGBs\b/g, 'gigabytes'], [/\bGB\b/g, 'gigabyte'], [/\bTBs\b/g, 'terabytes'], [/\bTB\b/g, 'terabyte'], [/\bAPIs\b/g, "a p i's"], [/\bAPI\b/g, 'a p i'], [/\bCLIs\b/g, "c l i's"], [/\bCLI\b/g, 'c l i'], [/\bCPUs\b/g, "c p u's"], [/\bCPU\b/g, 'c p u'], [/\bGPUs\b/g, "g p u's"], [/\bGPU\b/g, 'g p u'], [/\bAve\b/g, 'avenue'], [/\betc\b/g, 'etcetera']
];

function expandAbbreviations(text) {
  for (const [regex, replacement] of [...ABBREVIATIONS, ...CASED_ABBREVIATIONS]) text = text.replace(regex, replacement);
  return text;
}

const NUM_PREFIX_RE = /#(\d)/g;
const NUM_SUFFIX_RE = /(\d)([KMBT])/gi;
const NUM_LETTER_SPLIT_RE = /(\d)([a-z])|([a-z])(\d)/gi;
const COMMA_NUMBER_RE = /(\d[\d,]+\d)/g;
const DATE_RE = /(^|[^/])(\d\d?[/-]\d\d?[/-]\d\d(?:\d\d)?)($|[^/])/g;
const PHONE_NUMBER_RE = /\(?\d{3}\)?[-.\s]\d{3}[-.\s]?\d{4}/g;
const TIME_RE = /(\d\d?):(\d\d)(?::(\d\d))?/g;
const POUNDS_RE = /£([\d,]*\d+)/g;
const DOLLARS_RE = /\$([\d.,]*\d+)/g;
const DECIMAL_NUMBER_RE = /(\d+(?:\.\d+)+)/g;
const MULTIPLY_RE = /(\d)\s?\*\s?(\d)/g;
const DIVIDE_RE = /(\d)\s?\/\s?(\d)/g;
const ADD_RE = /(\d)\s?\+\s?(\d)/g;
const SUBTRACT_RE = /(\d)?\s?-\s?(\d)/g;
const FRACTION_RE = /(\d+)\/(\d+)/g;
const ORDINAL_RE = /(\d+)(st|nd|rd|th)/gi;
const NUMBER_RE = /\d+/g;

function normalizeNumbers(text) {
  text = text.replace(NUM_PREFIX_RE, (_, d) => `number ${d}`);
  text = text.replace(NUM_SUFFIX_RE, (_, num, suffix) => {
    const map = { k: 'thousand', m: 'million', b: 'billion', t: 'trillion' };
    return `${num} ${map[suffix.toLowerCase()]}`;
  });
  for (let i = 0; i < 2; i++) {
    text = text.replace(NUM_LETTER_SPLIT_RE, (m, d1, l1, l2, d2) => {
      if (d1 && l1) return `${d1} ${l1}`;
      if (l2 && d2) return `${l2} ${d2}`;
      return m;
    });
  }
  text = text.replace(COMMA_NUMBER_RE, m => m.replace(/,/g, ''));
  text = text.replace(DATE_RE, (_, pre, date, post) => pre + date.split(/[./-]/).join(' dash ') + post);
  text = text.replace(PHONE_NUMBER_RE, m => {
    const digits = m.replace(/\D/g, '');
    return digits.length === 10 ? `${digits.slice(0, 3).split('').join(' ')}, ${digits.slice(3, 6).split('').join(' ')}, ${digits.slice(6).split('').join(' ')}` : m;
  });
  text = text.replace(TIME_RE, (_, hours, minutes, seconds) => {
    const h = parseInt(hours), m = parseInt(minutes), s = seconds ? parseInt(seconds) : 0;
    if (!seconds) return m === 0 ? (h === 0 ? '0' : h > 12 ? `${hours} minutes` : `${hours} o'clock`) : minutes.startsWith('0') ? `${hours} oh ${minutes[1]}` : `${hours} ${minutes}`;
    let res = '';
    if (h !== 0) res = hours + ' ' + (m === 0 ? 'oh oh' : minutes.startsWith('0') ? `oh ${minutes[1]}` : minutes);
    else if (m !== 0) res = minutes + ' ' + (s === 0 ? 'oh oh' : seconds.startsWith('0') ? `oh ${seconds[1]}` : seconds);
    else res = seconds;
    return res + ' ' + (s === 0 ? '' : seconds.startsWith('0') ? `oh ${seconds[1]}` : seconds);
  });
  text = text.replace(POUNDS_RE, (_, amount) => `${amount.replace(/,/g, '')} pounds`);
  text = text.replace(DOLLARS_RE, (_, amount) => {
    const parts = amount.replace(/,/g, '').split('.');
    const dollars = parseInt(parts[0]) || 0;
    const cents = parts[1] ? parseInt(parts[1]) : 0;
    if (dollars && cents) return `${dollars} ${dollars === 1 ? 'dollar' : 'dollars'}, ${cents} ${cents === 1 ? 'cent' : 'cents'}`;
    if (dollars) return `${dollars} ${dollars === 1 ? 'dollar' : 'dollars'}`;
    if (cents) return `${cents} ${cents === 1 ? 'cent' : 'cents'}`;
    return 'zero dollars';
  });
  // Read decimals digit by digit: "3.14" -> "3 point 1 4"
  text = text.replace(DECIMAL_NUMBER_RE, m => m.split('.').map(part => part.split('').join(' ')).join(' point '));
  text = text.replace(MULTIPLY_RE, '$1 times $2');
  text = text.replace(DIVIDE_RE, '$1 over $2');
  text = text.replace(ADD_RE, '$1 plus $2');
  text = text.replace(SUBTRACT_RE, (_, a, b) => (a ? a : '') + ' minus ' + b);
  text = text.replace(FRACTION_RE, '$1 over $2');
  text = text.replace(ORDINAL_RE, (_, num) => ordinalToWords(parseInt(num)));
  text = text.replace(NUMBER_RE, m => {
    const num = parseInt(m);
    if (num > 1000 && num < 3000) {
      if (num === 2000) return 'two thousand';
      if (num > 2000 && num < 2010) return 'two thousand ' + numberToWords(num % 100);
      if (num % 100 === 0) return numberToWords(Math.floor(num / 100)) + ' hundred';
      return numberToWords(num, { zero: 'oh', group: 2 });
    }
    return numberToWords(num);
  });
  return text;
}
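// Worked examples (traced through the replacement order above):
//   "$12.50"         -> "12 dollars, 50 cents"
//   "(555) 123-4567" -> "5 5 5, 1 2 3, 4 5 6 7"
//   "3.14"           -> "3 point 1 4"
//   "5K"             -> "5 thousand" -> "five thousand"
//   "1984"           -> "nineteen eighty four"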

const SPECIAL_CHARACTERS = [
  [/@/g, ' at '], [/&/g, ' and '], [/%/g, ' percent '], [/:/g, '.'], [/;/g, ','], [/\+/g, ' plus '], [/\\/g, ' backslash '], [/~/g, ' about '], [/(^| )<3/g, ' heart '], [/<=/g, ' less than or equal to '], [/>=/g, ' greater than or equal to '], [/</g, ' less than '], [/>/g, ' greater than '], [/=/g, ' equals '], [/\//g, ' slash '], [/_/g, ' '],
];
const LINK_HEADER_RE = /https?:\/\//gi;
const DASH_RE = /(.) - (.)/g;
const DOT_RE = /([A-Z])\.([A-Z])/gi;
const PARENTHESES_RE = /[\(\[\{][^\)\]\}]*[\)\]\}](.)?/g;

function normalizeSpecial(text) {
  text = text.replace(LINK_HEADER_RE, 'h t t p s colon slash slash ');
  text = text.replace(DASH_RE, '$1, $2');
  text = text.replace(DOT_RE, '$1 dot $2');
  text = text.replace(PARENTHESES_RE, (m, after) => {
    let result = m.replace(/[\(\[\{]/g, ', ').replace(/[\)\]\}]/g, ', ');
    if (after && /[$.!?,]/.test(after)) result = result.slice(0, -2) + after;
    return result;
  });
  return text;
}

function expandSpecialCharacters(text) {
  for (const [regex, replacement] of SPECIAL_CHARACTERS) text = text.replace(regex, replacement);
  return text;
}

function collapseWhitespace(text) {
  return text.replace(/\s+/g, ' ').replace(/ ([.\?!,])/g, '$1');
}

function dedupPunctuation(text) {
  return text.replace(/\.\.\.+/g, '[ELLIPSIS]').replace(/,+/g, ',').replace(/[.,]*\.[.,]*/g, '.').replace(/[.,!]*![.,!]*/g, '!').replace(/[.,!?]*\?[.,!?]*/g, '?').replace(/\[ELLIPSIS\]/g, '...');
}

const SENTENCE_SPLIT_RE = /[^.!?]+[.!?]+|[^.!?]+$/g;

function splitTextIntoSentences(text) {
  const matches = text.match(SENTENCE_SPLIT_RE);
  if (!matches) return [];
  return matches.map(sentence => sentence.trim()).filter(Boolean);
}

function splitTokenIdsIntoChunks(tokenIds, maxTokens) {
  const chunks = [];
  for (let i = 0; i < tokenIds.length; i += maxTokens) {
    const chunkText = tokenizerProcessor.decodeIds(tokenIds.slice(i, i + maxTokens)).trim();
    if (chunkText) chunks.push(chunkText);
  }
  return chunks;
}

// Split text into sentence chunks (target <= CHUNK_TARGET_TOKENS tokens)
function splitIntoBestSentences(text) {
  const preparedText = prepareText(text);
  if (!preparedText) return [];

  const sentences = splitTextIntoSentences(preparedText);
  if (sentences.length === 0) return [];

  // Merge sentences into chunks that stay within the token target
  const chunks = [];
  let currentChunk = '';
  for (const sentenceText of sentences) {
    const sentenceTokenIds = tokenizerProcessor.encodeIds(sentenceText);
    const sentenceTokens = sentenceTokenIds.length;

    if (sentenceTokens > CHUNK_TARGET_TOKENS) {
      if (currentChunk !== '') {
        chunks.push(currentChunk.trim());
        currentChunk = '';
      }
      const splitChunks = splitTokenIdsIntoChunks(sentenceTokenIds, CHUNK_TARGET_TOKENS);
      for (const splitChunk of splitChunks) {
        if (splitChunk) chunks.push(splitChunk.trim());
      }
      continue;
    }

    if (currentChunk === '') {
      currentChunk = sentenceText;
      continue;
    }

    const combined = `${currentChunk} ${sentenceText}`;
    const combinedTokens = tokenizerProcessor.encodeIds(combined).length;
    if (combinedTokens > CHUNK_TARGET_TOKENS) {
      chunks.push(currentChunk.trim());
      currentChunk = sentenceText;
    } else {
      currentChunk = combined;
    }
  }

  if (currentChunk !== '') {
    chunks.push(currentChunk.trim());
  }

  return chunks;
}
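// Example (hypothetical token counts): with CHUNK_TARGET_TOKENS = 50, three
// sentences of ~20 tokens each merge into [s1 s2] (40 tokens; adding s3 would
// exceed 50) and [s3] -- two chunks total.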

// Pocket TTS specific text preprocessing
function prepareText(text) {
  text = text.trim();
  if (!text) return '';

  // Convert to ASCII
  text = convertToAscii(text);

  // Normalize numbers first
  text = normalizeNumbers(text);

  // Normalize special characters
  text = normalizeSpecial(text);

  // Expand abbreviations
  text = expandAbbreviations(text);

  // Expand special characters
  text = expandSpecialCharacters(text);

  // Collapse whitespace
  text = collapseWhitespace(text);

  // Deduplicate punctuation
  text = dedupPunctuation(text);

  // Final cleanup
  text = text.trim();

  // Ensure proper punctuation at end
  if (text && text[text.length - 1].match(/[a-zA-Z0-9]/)) {
    text = text + '.';
  }

  // Capitalize first letter
  if (text && !text[0].match(/[A-Z]/)) {
    text = text[0].toUpperCase() + text.slice(1);
  }

  return text;
}

// ----------------------------------------------------------------------------
// Worker Logic
// ----------------------------------------------------------------------------

self.onmessage = async (e) => {
  const { type, data } = e.data;
  console.log('Worker received message:', type);

  if (type === 'load') {
    try {
      await loadModels();
      postMessage({ type: 'loaded' });
    } catch (err) {
      postMessage({ type: 'error', error: err.toString() });
    }
  } else if (type === 'generate') {
    if (!isReady) {
      postMessage({ type: 'error', error: 'Models are not loaded yet.' });
      return;
    }
    if (isGenerating) return;
    try {
      await startGeneration(data.text, data.voice);
    } catch (err) {
      console.error('Generation Error:', err);
      postMessage({ type: 'error', error: err.toString() });
    }
  } else if (type === 'encode_voice') {
    if (!isReady) {
      postMessage({ type: 'error', error: 'Models are not loaded yet.' });
      return;
    }
    try {
      const embedding = await encodeVoiceAudio(data.audio);
      currentVoiceEmbedding = embedding;
      currentVoiceName = 'custom';
      postMessage({ type: 'voice_encoded', voiceName: 'custom' });
    } catch (err) {
      console.error('Voice encoding error:', err);
      postMessage({ type: 'error', error: 'Failed to encode voice: ' + err.toString() });
    }
  } else if (type === 'set_voice') {
    if (!isReady) {
      postMessage({ type: 'error', error: 'Models are not loaded yet.' });
      return;
    }
    if (data.voiceName === 'custom') {
      // Custom voice already set via encode_voice
      postMessage({ type: 'voice_set', voiceName: 'custom' });
    } else if (predefinedVoices[data.voiceName]) {
      currentVoiceEmbedding = predefinedVoices[data.voiceName];
      currentVoiceName = data.voiceName;
      postMessage({ type: 'voice_set', voiceName: data.voiceName });
    } else {
      postMessage({ type: 'error', error: `Unknown voice: ${data.voiceName}` });
    }
  } else if (type === 'set_lsd') {
    // Dynamic LSD adjustment for edge devices
    const newLSD = Math.max(1, Math.min(MAX_LSD, data.lsd));
    if (newLSD !== currentLSD) {
      console.log(`LSD adjusted: ${currentLSD} → ${newLSD}`);
      currentLSD = newLSD;
    }
  } else if (type === 'stop') {
    isGenerating = false;
    postMessage({ type: 'status', status: 'Stopped', state: 'idle' });
  }
};
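// Main-thread protocol sketch (the worker construction line is an assumption;
// the message shapes match the handler above):
//   const worker = new Worker('tts/inference-worker.js', { type: 'module' });
//   worker.postMessage({ type: 'load' });
//   worker.postMessage({ type: 'set_voice', data: { voiceName: 'cosette' } });
//   worker.postMessage({ type: 'generate', data: { text: 'Hello world.', voice: 'cosette' } });
//   worker.postMessage({ type: 'set_lsd', data: { lsd: 4 } }); // trade quality for speed
//   worker.postMessage({ type: 'stop' });
// Replies arrive as 'status', 'loaded', 'voices_loaded', 'voice_set',
// 'audio_chunk', 'stream_ended', and 'error' messages.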

async function loadModels() {
  if (mimiEncoderSession) return;

  postMessage({ type: 'status', status: 'Loading ONNX Runtime...', state: 'loading' });

  // Load ONNX Runtime dynamically
  const version = '1.20.0';
  const cdnBase = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${version}/dist/`;

  try {
    const ortModule = await import(`https://cdn.jsdelivr.net/npm/onnxruntime-web@${version}/dist/ort.min.mjs`);
    ort = ortModule.default || ortModule;
  } catch (e) {
    console.error('Failed to load ONNX Runtime:', e);
    throw new Error('Failed to load ONNX Runtime: ' + e.message);
  }

  if (!ort) {
    throw new Error('ONNX Runtime failed to load');
  }

  postMessage({ type: 'status', status: 'Loading models...', state: 'loading' });

  // Configure WASM Paths
  ort.env.wasm.wasmPaths = cdnBase;

  // Enable SIMD for significant performance boost (2-4x faster)
  ort.env.wasm.simd = true;

  // Configure multi-threading
  if (!self.crossOriginIsolated) {
    console.warn('Environment is not cross-origin isolated. Disabling WASM multi-threading.');
    console.warn('To enable multi-threading, serve with headers:');
    console.warn(' Cross-Origin-Opener-Policy: same-origin');
    console.warn(' Cross-Origin-Embedder-Policy: require-corp');
    ort.env.wasm.numThreads = 1;
  } else {
    const threads = Math.min(navigator.hardwareConcurrency || 4, 8);
    ort.env.wasm.numThreads = threads;
    if (DEBUG_LOGS) {
      console.log(`Multi-threading enabled with ${threads} threads`);
    }
  }

  console.log(`ORT: crossOriginIsolated=${self.crossOriginIsolated}, simd=${ort.env.wasm.simd}, threads=${ort.env.wasm.numThreads}`);

  try {
    const sessionOptions = {
      executionProviders: ['wasm'],
      graphOptimizationLevel: 'all'
    };

    // Load all models in parallel
    postMessage({ type: 'status', status: 'Loading MIMI encoder...', state: 'loading' });
    if (DEBUG_LOGS) {
      console.log('Loading MIMI encoder...');
    }

    const [encoderRes, textCondRes, flowMainRes, flowFlowRes, decoderRes] = await Promise.all([
      ort.InferenceSession.create(MODELS.mimi_encoder, sessionOptions),
      ort.InferenceSession.create(MODELS.text_conditioner, sessionOptions),
      ort.InferenceSession.create(MODELS.flow_lm_main, sessionOptions),
      ort.InferenceSession.create(MODELS.flow_lm_flow, sessionOptions),
      ort.InferenceSession.create(MODELS.mimi_decoder, sessionOptions)
    ]);

    mimiEncoderSession = encoderRes;
    textConditionerSession = textCondRes;
    flowLmMainSession = flowMainRes;
    flowLmFlowSession = flowFlowRes;
    mimiDecoderSession = decoderRes;

    if (DEBUG_LOGS) {
      console.log('All models loaded successfully');
      console.log('Flow LM Main inputs:', flowLmMainSession.inputNames);
      console.log('Flow LM Main outputs:', flowLmMainSession.outputNames);
      console.log('MIMI decoder inputs:', mimiDecoderSession.inputNames);
      console.log('MIMI decoder outputs:', mimiDecoderSession.outputNames);
    }

    // Load tokenizer
    postMessage({ type: 'status', status: 'Loading tokenizer...', state: 'loading' });
    if (DEBUG_LOGS) {
      console.log('Loading tokenizer...');
    }

    const tokenizerResponse = await fetch(MODELS.tokenizer);
    if (!tokenizerResponse.ok) {
      throw new Error(`Failed to load tokenizer: ${tokenizerResponse.statusText}`);
    }
    const tokenizerBuffer = await tokenizerResponse.arrayBuffer();
    // Base64-encode in 32 KB chunks; spreading the whole buffer into
    // String.fromCharCode can exceed the engine's argument-count limit.
    {
      const bytes = new Uint8Array(tokenizerBuffer);
      let binary = '';
      for (let i = 0; i < bytes.length; i += 0x8000) {
        binary += String.fromCharCode(...bytes.subarray(i, i + 0x8000));
      }
      tokenizerModelB64 = btoa(binary);
    }

    // Import and initialize sentencepiece processor
    const spModule = await import('./sentencepiece.js?v=2');
    const SentencePieceProcessor = spModule.SentencePieceProcessor;
    if (!SentencePieceProcessor) {
      throw new Error('SentencePieceProcessor not found in sentencepiece.js');
    }
    tokenizerProcessor = new SentencePieceProcessor();
    await tokenizerProcessor.loadFromB64StringModel(tokenizerModelB64);
    if (DEBUG_LOGS) {
      console.log('Tokenizer loaded');
    }

    // Load predefined voices
    postMessage({ type: 'status', status: 'Loading voices...', state: 'loading' });
    if (DEBUG_LOGS) {
      console.log('Loading predefined voices...');
    }

    try {
      const voicesResponse = await fetch(MODELS.voices);
      if (voicesResponse.ok) {
        const voicesData = await voicesResponse.arrayBuffer();
        predefinedVoices = parseVoicesBin(voicesData);
        if (DEBUG_LOGS) {
          console.log('Loaded voices:', Object.keys(predefinedVoices));
        }

        // Set default voice
        if (predefinedVoices['cosette']) {
          currentVoiceEmbedding = predefinedVoices['cosette'];
          currentVoiceName = 'cosette';
        } else {
          // Use first available voice
          const firstVoice = Object.keys(predefinedVoices)[0];
          if (firstVoice) {
            currentVoiceEmbedding = predefinedVoices[firstVoice];
            currentVoiceName = firstVoice;
          }
        }
      }
    } catch (e) {
      console.warn('Could not load predefined voices:', e);
      postMessage({ type: 'status', status: 'Voice load error: ' + e.message, state: 'loading' });
    }

    // Send list of available voices
    postMessage({
      type: 'voices_loaded',
      voices: Object.keys(predefinedVoices),
      defaultVoice: currentVoiceName
    });

    // Pre-allocate s/t tensors for Flow Matching Loop (Optimization)
    // Pre-allocate for MAX_LSD to support dynamic switching
    if (DEBUG_LOGS) {
      console.log(`Pre-allocating Flow Matching tensors for LSD 1-${MAX_LSD}...`);
    }
    stTensors = {};

    for (let lsd = 1; lsd <= MAX_LSD; lsd++) {
      stTensors[lsd] = [];
      const dt = 1.0 / lsd;
      for (let j = 0; j < lsd; j++) {
        const s = j / lsd;
        const t = s + dt;
        stTensors[lsd].push({
          s: new ort.Tensor('float32', new Float32Array([s]), [1, 1]),
          t: new ort.Tensor('float32', new Float32Array([t]), [1, 1])
        });
      }
    }
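    // e.g., lsd = 4 pre-allocates (s, t) pairs (0, 0.25), (0.25, 0.5), (0.5, 0.75),
    // (0.75, 1.0): one Euler step per pair when integrating the flow from s = 0 to 1.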

    isReady = true;
    postMessage({ type: 'status', status: 'Ready', state: 'idle' });
    postMessage({ type: 'model_status', status: 'ready', text: 'Ready' });
    postMessage({ type: 'loaded' });

  } catch (err) {
    console.error('Model load failed:', err);
    throw err;
  }
}

function parseVoicesBin(buffer) {
  // Simple binary format:
  // Header: 4 bytes (uint32) = number of voices
  // For each voice:
  //   - 32 bytes: voice name (null-terminated string)
  //   - 4 bytes (uint32): number of frames
  //   - 4 bytes (uint32): embedding dim (1024)
  //   - frames * dim * 4 bytes: float32 embeddings
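  // Size check (hypothetical voice with 125 frames x 1024 dims):
  //   4 (count) + 32 (name) + 4 (frames) + 4 (dim) + 125 * 1024 * 4 = 512,044 bytes.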

  const voices = {};
  const view = new DataView(buffer);
  let offset = 0;

  const numVoices = view.getUint32(offset, true);
  offset += 4;

  for (let i = 0; i < numVoices; i++) {
    // Read voice name
    const nameBytes = new Uint8Array(buffer, offset, 32);
    const nameEnd = nameBytes.indexOf(0);
    const name = new TextDecoder().decode(nameBytes.subarray(0, nameEnd > 0 ? nameEnd : 32)).trim();
    offset += 32;

    // Read dimensions
    const numFrames = view.getUint32(offset, true);
    offset += 4;
    const embDim = view.getUint32(offset, true);
    offset += 4;

    // Read embeddings (copy to avoid alignment issues)
    const embSize = numFrames * embDim;
    const embBytes = buffer.slice(offset, offset + embSize * 4);
    const embeddings = new Float32Array(embBytes);
    offset += embSize * 4;

    voices[name] = {
      data: embeddings,
      shape: [1, numFrames, embDim]
    };

    console.log(`Loaded voice '${name}': ${numFrames} frames, ${embDim} dim`);
  }

  return voices;
}

async function encodeVoiceAudio(audioData) {
  // audioData should be Float32Array at 24kHz, mono
  // Reshape to [1, 1, samples]
  const input = new ort.Tensor('float32', audioData, [1, 1, audioData.length]);

  const outputs = await mimiEncoderSession.run({ audio: input });
  const embeddings = outputs[mimiEncoderSession.outputNames[0]];

  return {
    data: new Float32Array(embeddings.data),
    shape: embeddings.dims
  };
}

// Hardcoded state shapes extracted from ONNX model metadata
// These are the initial shapes - dynamic dimensions start at 0
const FLOW_LM_STATE_SHAPES = {
  // KV cache layers: [kv=2, batch=1, max_seq=1000, heads=16, head_dim=64]
  state_0: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
  state_1: { shape: [0], dtype: 'float32' }, // dynamic
  state_2: { shape: [1], dtype: 'int64' }, // step counter
  state_3: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
  state_4: { shape: [0], dtype: 'float32' },
  state_5: { shape: [1], dtype: 'int64' },
  state_6: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
  state_7: { shape: [0], dtype: 'float32' },
  state_8: { shape: [1], dtype: 'int64' },
  state_9: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
  state_10: { shape: [0], dtype: 'float32' },
  state_11: { shape: [1], dtype: 'int64' },
  state_12: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
  state_13: { shape: [0], dtype: 'float32' },
  state_14: { shape: [1], dtype: 'int64' },
  state_15: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
  state_16: { shape: [0], dtype: 'float32' },
  state_17: { shape: [1], dtype: 'int64' },
};
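// Each [2, 1, 1000, 16, 64] float32 KV cache above holds 2 * 1000 * 16 * 64 =
// 2,048,000 values (~8 MB), so the six layers reserve roughly 49 MB up front.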

const MIMI_DECODER_STATE_SHAPES = {
  state_0: { shape: [1], dtype: 'bool' },
  state_1: { shape: [1, 512, 6], dtype: 'float32' },
  state_2: { shape: [1], dtype: 'bool' },
  state_3: { shape: [1, 64, 2], dtype: 'float32' },
  state_4: { shape: [1, 256, 6], dtype: 'float32' },
  state_5: { shape: [1], dtype: 'bool' },
  state_6: { shape: [1, 256, 2], dtype: 'float32' },
  state_7: { shape: [1], dtype: 'bool' },
  state_8: { shape: [1, 128, 0], dtype: 'float32' }, // dynamic
  state_9: { shape: [1, 128, 5], dtype: 'float32' },
  state_10: { shape: [1], dtype: 'bool' },
  state_11: { shape: [1, 128, 2], dtype: 'float32' },
  state_12: { shape: [1], dtype: 'bool' },
  state_13: { shape: [1, 64, 0], dtype: 'float32' }, // dynamic
  state_14: { shape: [1, 64, 4], dtype: 'float32' },
  state_15: { shape: [1], dtype: 'bool' },
  state_16: { shape: [1, 64, 2], dtype: 'float32' },
  state_17: { shape: [1], dtype: 'bool' },
  state_18: { shape: [1, 32, 0], dtype: 'float32' }, // dynamic
  state_19: { shape: [2, 1, 8, 1000, 64], dtype: 'float32' },
  state_20: { shape: [1], dtype: 'int64' },
  state_21: { shape: [1], dtype: 'int64' },
  state_22: { shape: [2, 1, 8, 1000, 64], dtype: 'float32' },
  state_23: { shape: [1], dtype: 'int64' },
  state_24: { shape: [1], dtype: 'int64' },
  state_25: { shape: [1], dtype: 'bool' },
  state_26: { shape: [1, 512, 16], dtype: 'float32' },
  state_27: { shape: [1], dtype: 'bool' },
  state_28: { shape: [1, 1, 6], dtype: 'float32' },
  state_29: { shape: [1], dtype: 'bool' },
  state_30: { shape: [1, 64, 2], dtype: 'float32' },
  state_31: { shape: [1], dtype: 'bool' },
  state_32: { shape: [1, 32, 0], dtype: 'float32' }, // dynamic
  state_33: { shape: [1], dtype: 'bool' },
  state_34: { shape: [1, 512, 2], dtype: 'float32' },
  state_35: { shape: [1], dtype: 'bool' },
  state_36: { shape: [1, 64, 4], dtype: 'float32' },
  state_37: { shape: [1], dtype: 'bool' },
  state_38: { shape: [1, 128, 2], dtype: 'float32' },
  state_39: { shape: [1], dtype: 'bool' },
  state_40: { shape: [1, 64, 0], dtype: 'float32' }, // dynamic
  state_41: { shape: [1], dtype: 'bool' },
  state_42: { shape: [1, 128, 5], dtype: 'float32' },
  state_43: { shape: [1], dtype: 'bool' },
  state_44: { shape: [1, 256, 2], dtype: 'float32' },
  state_45: { shape: [1], dtype: 'bool' },
  state_46: { shape: [1, 128, 0], dtype: 'float32' }, // dynamic
  state_47: { shape: [1], dtype: 'bool' },
  state_48: { shape: [1, 256, 6], dtype: 'float32' },
  state_49: { shape: [2, 1, 8, 1000, 64], dtype: 'float32' },
  state_50: { shape: [1], dtype: 'int64' },
  state_51: { shape: [1], dtype: 'int64' },
  state_52: { shape: [2, 1, 8, 1000, 64], dtype: 'float32' },
  state_53: { shape: [1], dtype: 'int64' },
  state_54: { shape: [1], dtype: 'int64' },
  state_55: { shape: [1, 512, 16], dtype: 'float32' },
};

function initState(session, stateShapes) {
  /**
   * Initialize state tensors for a stateful ONNX model using hardcoded shapes.
   */
  const state = {};

  for (const inputName of session.inputNames) {
    if (inputName.startsWith('state_')) {
      const stateInfo = stateShapes[inputName];
      if (!stateInfo) {
        console.warn(`Unknown state input: ${inputName}, skipping`);
        continue;
      }

      const { shape, dtype } = stateInfo;
      const size = shape.reduce((a, b) => a * b, 1);

      let data;
      if (dtype === 'int64') {
        data = new BigInt64Array(size);
      } else if (dtype === 'bool') {
        data = new Uint8Array(size);
      } else {
        data = new Float32Array(size);
      }

      state[inputName] = new ort.Tensor(dtype, data, shape);
      if (DEBUG_LOGS) {
        console.log(`Init state ${inputName}: shape=${JSON.stringify(shape)}, dtype=${dtype}`);
      }
    }
  }

  return state;
}

async function startGeneration(text, voiceName) {
  isGenerating = true;
  currentLSD = MAX_LSD; // Reset to max quality for each new generation
  postMessage({ type: 'status', status: 'Generating...', state: 'running' });
  postMessage({ type: 'generation_started', data: { time: performance.now() } });

  try {
    // Split text into sentence chunks (target <= CHUNK_TARGET_TOKENS tokens)
    const chunks = splitIntoBestSentences(text);
    console.log(`Split into ${chunks.length} chunks:`, chunks);

    if (chunks.length === 0) {
      throw new Error('No text to generate');
    }

    // Get voice embedding
    let voiceEmb = currentVoiceEmbedding;
    if (voiceName && voiceName !== currentVoiceName) {
      if (predefinedVoices[voiceName]) {
        voiceEmb = predefinedVoices[voiceName];
        currentVoiceEmbedding = voiceEmb;
        currentVoiceName = voiceName;
      }
    }

    if (!voiceEmb) {
      throw new Error('No voice embedding available. Please select a voice or upload custom audio.');
    }

    // Run generation pipeline with chunks
    await runGenerationPipeline(voiceEmb, chunks);

  } catch (err) {
    console.error('Generation error:', err);
    postMessage({ type: 'error', error: err.toString() });
  } finally {
    if (isGenerating) {
      postMessage({ type: 'stream_ended' });
      postMessage({ type: 'status', status: 'Finished', state: 'idle' });
    }
    isGenerating = false;
  }
}

async function runGenerationPipeline(voiceEmb, chunks) {
  // Initialize state - may be reset per chunk
  let mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
  const emptySeq = new ort.Tensor('float32', new Float32Array(0), [1, 0, 32]);
  const emptyTextEmb = new ort.Tensor('float32', new Float32Array(0), [1, 0, 1024]);

  // Voice embedding tensor
  const voiceTensor = new ort.Tensor('float32', voiceEmb.data, voiceEmb.shape);
  console.log('Voice embeddings shape:', voiceEmb.shape);

  async function buildVoiceConditionedState() {
    let flowLmState = initState(flowLmMainSession, FLOW_LM_STATE_SHAPES);
    console.log('Running voice conditioning...');
    const voiceCondInputs = {
      sequence: emptySeq,
      text_embeddings: voiceTensor,
      ...flowLmState
    };

    let condResult = await flowLmMainSession.run(voiceCondInputs);

    // Update state from voice conditioning
    for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
      const outputName = flowLmMainSession.outputNames[i];
      if (outputName.startsWith('out_state_')) {
        const stateIdx = parseInt(outputName.replace('out_state_', ''));
        flowLmState[`state_${stateIdx}`] = condResult[outputName];
      }
    }
    return flowLmState;
  }

  let flowLmState = await buildVoiceConditionedState();

  // Streaming parameters
  const FIRST_CHUNK_FRAMES = 3;
  const NORMAL_CHUNK_FRAMES = 12;

  // Tracking across all chunks
  const allGeneratedLatents = [];
  let isFirstAudioChunk = true;
  let totalDecodedFrames = 0;
  let totalFlowLmTime = 0;
  let totalDecodeTime = 0;
  const arStartTime = performance.now();

  // Process each text chunk
  for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) {
    if (!isGenerating) break;

    if (RESET_FLOW_STATE_EACH_CHUNK && chunkIdx > 0) {
      flowLmState = await buildVoiceConditionedState();
    }
    if (RESET_MIMI_STATE_EACH_CHUNK && chunkIdx > 0) {
      mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
    }

    const chunkText = chunks[chunkIdx];
    console.log(`Processing chunk ${chunkIdx + 1}/${chunks.length}: "${chunkText}"`);

    let isFirstAudioChunkOfTextChunk = true;

    // Tokenize this chunk
    const tokenIds = tokenizerProcessor.encodeIds(chunkText);
    console.log(`Chunk ${chunkIdx + 1} tokens:`, tokenIds.length);

    // Text conditioning for this chunk
    const textInput = new ort.Tensor('int64', BigInt64Array.from(tokenIds.map(x => BigInt(x))), [1, tokenIds.length]);
    const textCondResult = await textConditionerSession.run({ token_ids: textInput });
    let textEmb = textCondResult[textConditionerSession.outputNames[0]];

    if (textEmb.dims.length === 2) {
      textEmb = new ort.Tensor('float32', textEmb.data, [1, textEmb.dims[0], textEmb.dims[1]]);
    }

    const textCondInputs = {
      sequence: emptySeq,
      text_embeddings: textEmb,
      ...flowLmState
    };

    let condResult = await flowLmMainSession.run(textCondInputs);

    // Update state from text conditioning
    for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
      const outputName = flowLmMainSession.outputNames[i];
      if (outputName.startsWith('out_state_')) {
        const stateIdx = parseInt(outputName.replace('out_state_', ''));
        flowLmState[`state_${stateIdx}`] = condResult[outputName];
      }
    }

    // AR generation for this chunk
    const chunkLatents = [];
    let currentLatent = new ort.Tensor('float32', new Float32Array(32).fill(NaN), [1, 1, 32]);
    let chunkDecodedFrames = 0;
    const FRAMES_AFTER_EOS = 3; // Match PyTorch behavior - generate extra frames after EOS
    let eosStep = null;

    let chunkEnded = false;
    let chunkGenTimeMs = 0;
    for (let step = 0; step < MAX_FRAMES; step++) {
      if (!isGenerating) break;

      // Yield every 4 steps to allow message processing (e.g., set_lsd)
      if (step > 0 && step % 4 === 0) {
        await new Promise(r => setTimeout(r, 0));
      }

      const arInputs = {
        sequence: currentLatent,
        text_embeddings: emptyTextEmb,
        ...flowLmState
      };

      const stepStart = performance.now();
      const arResult = await flowLmMainSession.run(arInputs);
      const stepElapsed = performance.now() - stepStart;
      chunkGenTimeMs += stepElapsed;

      const conditioning = arResult['conditioning'];
      const eosLogit = arResult['eos_logit'].data[0];
      const isEos = eosLogit > -4.0;
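      // -4.0 is a permissive EOS cutoff; if eos_logit feeds a sigmoid (an
      // assumption, not visible from this file), it maps to p(EOS) ~= 0.018.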

      // Track when EOS is first detected
      if (isEos && eosStep === null) {
        eosStep = step;
      }

      // Only stop after FRAMES_AFTER_EOS additional frames
      const shouldStop = eosStep !== null && step >= eosStep + FRAMES_AFTER_EOS;

      // Flow matching (LSD loop) - uses currentLSD which can be adjusted dynamically
      const TEMP = 0.7;
      const STD = Math.sqrt(TEMP);
      let xData = new Float32Array(32);
      for (let i = 0; i < 32; i++) {
        let u = 0, v = 0;
        while (u === 0) u = Math.random();
        while (v === 0) v = Math.random();
        xData[i] = Math.sqrt(-2.0 * Math.log(u)) * Math.cos(2.0 * Math.PI * v) * STD;
      }
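      // Box-Muller transform: for u, v ~ Uniform(0, 1),
      //   z = sqrt(-2 ln u) * cos(2 pi v) ~ N(0, 1),
      // scaled by STD = sqrt(TEMP) so the initial noise has variance TEMP = 0.7.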

      const lsdSteps = currentLSD;
      const dt = 1.0 / lsdSteps;

      for (let j = 0; j < lsdSteps; j++) {
        const flowInputs = {
          c: conditioning,
          s: stTensors[lsdSteps][j].s,
          t: stTensors[lsdSteps][j].t,
          x: new ort.Tensor('float32', xData, [1, 32])
        };

        const flowResult = await flowLmFlowSession.run(flowInputs);
        const v = flowResult['flow_dir'].data;

        for (let k = 0; k < 32; k++) {
          xData[k] += v[k] * dt;
        }
      }
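      // Forward-Euler integration of the learned flow: starting from noise,
      //   x <- x + flow_dir(x, s, t) * dt
      // over lsdSteps uniform steps from s = 0 to 1; fewer steps trade
      // quality for speed.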

      totalFlowLmTime += stepElapsed;

      const latentData = xData;
      chunkLatents.push(new Float32Array(latentData));
      allGeneratedLatents.push(new Float32Array(latentData));

      // Update state
      currentLatent = new ort.Tensor('float32', latentData, [1, 1, 32]);
      for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
        const outputName = flowLmMainSession.outputNames[i];
        if (outputName.startsWith('out_state_')) {
          const stateIdx = parseInt(outputName.replace('out_state_', ''));
          flowLmState[`state_${stateIdx}`] = arResult[outputName];
        }
      }

      // Decode audio chunks
      const pending = chunkLatents.length - chunkDecodedFrames;
      let decodeSize = 0;

      if (shouldStop) {
        decodeSize = pending;
      } else if (isFirstAudioChunk && pending >= FIRST_CHUNK_FRAMES) {
        decodeSize = FIRST_CHUNK_FRAMES;
      } else if (pending >= NORMAL_CHUNK_FRAMES) {
        decodeSize = NORMAL_CHUNK_FRAMES;
      }

      if (decodeSize > 0) {
        const decodeLatents = new Float32Array(decodeSize * 32);
        for (let i = 0; i < decodeSize; i++) {
          decodeLatents.set(chunkLatents[chunkDecodedFrames + i], i * 32);
        }

        const latentTensor = new ort.Tensor('float32', decodeLatents, [1, decodeSize, 32]);
        const decodeInputs = { latent: latentTensor, ...mimiState };

        const decStart = performance.now();
        const decodeResult = await mimiDecoderSession.run(decodeInputs);
        const decElapsed = performance.now() - decStart;
        totalDecodeTime += decElapsed;
        chunkGenTimeMs += decElapsed;
        const audioChunk = decodeResult[mimiDecoderSession.outputNames[0]].data;

        // Update MIMI state
        for (let i = 1; i < mimiDecoderSession.outputNames.length; i++) {
          const outputName = mimiDecoderSession.outputNames[i];
          const stateIdx = i - 1;
          mimiState[`state_${stateIdx}`] = decodeResult[outputName];
        }

        chunkDecodedFrames += decodeSize;
        totalDecodedFrames += decodeSize;

        const audioFloat32 = new Float32Array(audioChunk);
        const isLastChunk = shouldStop && chunkIdx === chunks.length - 1;
        postMessage({
          type: 'audio_chunk',
          data: audioFloat32,
          metrics: {
            bbTime: 0,
            decTime: 0,
            chunkDuration: audioFloat32.length / SAMPLE_RATE,
            genTimeSec: chunkGenTimeMs / 1000,
            isFirst: isFirstAudioChunk,
            isLast: isLastChunk,
            chunkStart: isFirstAudioChunkOfTextChunk
          }
        }, [audioFloat32.buffer]);

        isFirstAudioChunk = false;
        isFirstAudioChunkOfTextChunk = false;
        chunkGenTimeMs = 0;
      }

      if (shouldStop) {
        console.log(`Chunk ${chunkIdx + 1} EOS at step ${eosStep}, stopped at step ${step}, ${chunkLatents.length} frames`);
        chunkEnded = true;
        break;
      }
    }

    if (chunkEnded && isGenerating && chunkIdx < chunks.length - 1) {
      const gapSamples = Math.max(1, Math.floor(CHUNK_GAP_SEC * SAMPLE_RATE));
      const silence = new Float32Array(gapSamples);
      postMessage({
        type: 'audio_chunk',
        data: silence,
        metrics: {
          bbTime: 0,
          decTime: 0,
          chunkDuration: gapSamples / SAMPLE_RATE,
          isFirst: false,
          isLast: false,
          isSilence: true
        }
      }, [silence.buffer]);
    }
  }

  const totalTime = (performance.now() - arStartTime) / 1000;
  const audioSeconds = allGeneratedLatents.length * SAMPLES_PER_FRAME / SAMPLE_RATE;

  // RTFx based on actual generation time (flow LM + decoder), not including conditioning
  const genTime = (totalFlowLmTime + totalDecodeTime) / 1000;
  const rtfx = audioSeconds / genTime;
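  // RTFx > 1 means faster than real time: e.g., 12 s of audio generated in 3 s
  // gives RTFx = 4.0. Each frame covers SAMPLES_PER_FRAME / SAMPLE_RATE =
  // 1920 / 24000 = 0.08 s of audio.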

  console.log(`Generation complete: ${allGeneratedLatents.length} frames (${audioSeconds.toFixed(2)}s audio)`);
  console.log(` Total time: ${totalTime.toFixed(2)}s`);
  console.log(` Gen time: ${genTime.toFixed(2)}s, RTFx: ${rtfx.toFixed(2)}x`);
  console.log(` Flow LM: ${(totalFlowLmTime / 1000).toFixed(2)}s (${(totalFlowLmTime / allGeneratedLatents.length).toFixed(1)}ms/step)`);
  console.log(` Decoder: ${(totalDecodeTime / 1000).toFixed(2)}s`);

  postMessage({
    type: 'status',
    status: `Finished (RTFx: ${rtfx.toFixed(2)}x)`,
    state: 'idle',
    metrics: { rtfx, genTime, totalTime, audioDuration: audioSeconds }
  });
}

// Pre-allocated buffers for step counter updates (avoid GC pressure in hot loop)
const stepBuffers = {};

function updateStateSteps(state, increment) {
  // Update step counters in state dict - reuse buffers to avoid allocation
  const incBigInt = BigInt(increment);
  for (const key in state) {
    if (key.includes('step') && state[key]) {
      const tensor = state[key];
      if (tensor.data instanceof BigInt64Array) {
        // Reuse buffer if same size, otherwise create new one
        if (!stepBuffers[key] || stepBuffers[key].length !== tensor.data.length) {
          stepBuffers[key] = new BigInt64Array(tensor.data.length);
        }
        const buf = stepBuffers[key];
        for (let i = 0; i < tensor.data.length; i++) {
          buf[i] = tensor.data[i] + incBigInt;
        }
        state[key] = new ort.Tensor('int64', buf, tensor.dims);
      }
    }
  }
}