refacil-sdd-ai 5.2.2 → 5.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/NOTICE.md +46 -0
- package/README.md +209 -42
- package/agents/auditor.md +46 -0
- package/agents/debugger.md +41 -1
- package/agents/implementer.md +76 -10
- package/agents/investigator.md +36 -0
- package/agents/proposer.md +46 -2
- package/agents/tester.md +45 -8
- package/agents/validator.md +67 -13
- package/bin/cli.js +428 -83
- package/bin/postinstall.js +20 -0
- package/lib/bus/broker.js +121 -3
- package/lib/bus/spawn.js +189 -121
- package/lib/check-review.js +102 -0
- package/lib/codegraph-telemetry.js +135 -0
- package/lib/codegraph.js +273 -0
- package/lib/commands/autopilot.js +120 -0
- package/lib/commands/bus.js +29 -36
- package/lib/commands/compact.js +185 -46
- package/lib/commands/read-spec.js +352 -0
- package/lib/commands/sdd.js +429 -44
- package/lib/compact-guidance.js +122 -77
- package/lib/config.js +136 -0
- package/lib/global-paths.js +56 -20
- package/lib/hooks.js +32 -4
- package/lib/ide-detection.js +1 -1
- package/lib/ignore-files.js +5 -1
- package/lib/installer.js +202 -19
- package/lib/kapso.js +241 -0
- package/lib/methodology-migration-pending.js +13 -0
- package/lib/open-browser.js +32 -0
- package/lib/opencode-migrate.js +148 -0
- package/lib/opencode-plugin/index.js +84 -104
- package/lib/opencode-plugin/rules.js +236 -0
- package/lib/project-root.js +154 -0
- package/lib/repo-ide-sync.js +5 -0
- package/lib/spec-reader/lang.js +72 -0
- package/lib/spec-reader/md-parser.js +299 -0
- package/lib/spec-reader/session.js +139 -0
- package/lib/spec-reader/ui/app.js +685 -0
- package/lib/spec-reader/ui/index.html +59 -0
- package/lib/spec-reader/ui/mixed-lang.js +200 -0
- package/lib/spec-reader/ui/model-cache.js +117 -0
- package/lib/spec-reader/ui/style.css +294 -0
- package/lib/spec-reader/ui/supertonic-helper.js +565 -0
- package/lib/spec-sync.js +258 -0
- package/lib/test-scope.js +713 -0
- package/lib/testing-policy-sync.js +14 -2
- package/package.json +6 -3
- package/skills/apply/SKILL.md +39 -64
- package/skills/archive/SKILL.md +74 -48
- package/skills/ask/SKILL.md +43 -8
- package/skills/autopilot/SKILL.md +476 -0
- package/skills/bug/SKILL.md +52 -53
- package/skills/explore/SKILL.md +48 -1
- package/skills/guide/SKILL.md +31 -13
- package/skills/inbox/SKILL.md +9 -0
- package/skills/join/SKILL.md +1 -1
- package/skills/prereqs/BUS-CROSS-REPO.md +33 -16
- package/skills/prereqs/METHODOLOGY-CONTRACT.md +96 -17
- package/skills/prereqs/SKILL.md +1 -1
- package/skills/propose/SKILL.md +74 -19
- package/skills/read-spec/SKILL.md +76 -0
- package/skills/reply/SKILL.md +42 -9
- package/skills/review/SKILL.md +63 -25
- package/skills/review/checklist.md +2 -2
- package/skills/say/SKILL.md +40 -4
- package/skills/setup/SKILL.md +59 -5
- package/skills/setup/troubleshooting.md +11 -3
- package/skills/stats/SKILL.md +157 -0
- package/skills/test/SKILL.md +35 -10
- package/skills/up-code/SKILL.md +20 -13
- package/skills/update/SKILL.md +32 -1
- package/skills/verify/SKILL.md +78 -41
- package/templates/compact-guidance.md +10 -0
- package/templates/methodology-guide.md +5 -0
|
@@ -0,0 +1,565 @@
|
|
|
1
|
+
import * as ort from 'onnxruntime-web';
|
|
2
|
+
|
|
3
|
+
// Available languages for multilingual TTS
|
|
4
|
+
// Language codes accepted by isValidLang(); UnicodeProcessor.preprocessText() rejects any
// other code and uses the accepted code as the <lang>…</lang> wrapper tag.
// NOTE(review): 'na' looks like a "no specific language" placeholder rather than a spoken
// language — confirm against the model's documentation.
export const AVAILABLE_LANGS = ['en', 'ko', 'ja', 'ar', 'bg', 'cs', 'da', 'de', 'el', 'es', 'et', 'fi', 'fr', 'hi', 'hr', 'hu', 'id', 'it', 'lt', 'lv', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sv', 'tr', 'uk', 'vi', 'na'];
|
|
5
|
+
|
|
6
|
+
/**
 * Report whether a language code is one of the supported TTS languages.
 *
 * @param {string} lang - Candidate language code, e.g. 'en'.
 * @returns {boolean} true when `lang` equals an entry of AVAILABLE_LANGS, false otherwise.
 */
export function isValidLang(lang) {
    const isSupported = AVAILABLE_LANGS.some((code) => code === lang);
    return isSupported;
}
|
|
9
|
+
|
|
10
|
+
/**
 * Unicode text processor.
 *
 * Normalizes raw text, wraps it in language tags and converts it into padded
 * rows of token ids (looked up by Unicode code point in `indexer`) plus a
 * matching 0/1 mask.
 */
export class UnicodeProcessor {
    /**
     * @param {ArrayLike<number>} indexer - Lookup table indexed by code point; the value at a
     *   code point is used as that character's token id.
     */
    constructor(indexer) {
        this.indexer = indexer;
    }

    /**
     * Convert a batch of texts into padded id rows and a mask.
     *
     * @param {string[]} textList - Raw texts.
     * @param {string[]} langList - Language code for each text (same order as `textList`).
     * @returns {{textIds: number[][], textMask: number[][][]}} `textIds` has one row per text,
     *   right-padded with 0 to the longest row; `textMask` has shape [batch][1][maxLen] with
     *   1.0 on real positions and 0.0 on padding. Both are empty for an empty batch.
     * @throws {Error} If any language code is not supported (see preprocessText).
     */
    call(textList, langList) {
        // An empty batch would otherwise reach `new Array(-Infinity)` via Math.max().
        if (textList.length === 0) {
            return { textIds: [], textMask: [] };
        }

        // Iterate by code point, not by UTF-16 unit: a character outside the BMP must yield
        // exactly one id (and one mask position), not an id for the pair plus one for the
        // trailing lone surrogate.
        const codePointRows = textList.map((text, i) =>
            Array.from(this.preprocessText(text, langList[i]), (ch) => ch.codePointAt(0)));

        const textIdsLengths = codePointRows.map((row) => row.length);
        const maxLen = Math.max(...textIdsLengths);

        const textIds = codePointRows.map((codePoints) => {
            const row = new Array(maxLen).fill(0);
            codePoints.forEach((codePoint, j) => {
                // Code points beyond the table are marked -1 (unknown character).
                row[j] = (codePoint < this.indexer.length) ? this.indexer[codePoint] : -1;
            });
            return row;
        });

        const textMask = this.lengthToMask(textIdsLengths, maxLen);
        return { textIds, textMask };
    }

    /**
     * Normalize one text and wrap it in `<lang>…</lang>` tags.
     *
     * @param {string} text - Raw input text.
     * @param {string} lang - Language code; must satisfy isValidLang().
     * @returns {string} The cleaned text, guaranteed to end in punctuation, wrapped in tags.
     * @throws {Error} If `lang` is not a supported language code.
     */
    preprocessText(text, lang) {
        // Validate first so an unsupported language fails before any work is done.
        if (!isValidLang(lang)) {
            throw new Error(`Invalid language: ${lang}. Available: ${AVAILABLE_LANGS.join(', ')}`);
        }

        // U+00B4 (acute accent) must be mapped BEFORE normalizing: NFKD decomposes it into
        // space + U+0301, after which the '´' → "'" rule below can never match.
        let out = text.replaceAll('\u00B4', "'");

        // TODO: Need advanced normalizer for better performance
        // NFKD decomposes accented chars into base + combining mark (e.g. ó → o + ◌́).
        // The model indexer was trained on NFKD codepoints — do NOT use NFC here, because
        // NFC keeps ó as a single codepoint U+00F3 which may be absent from the indexer,
        // causing it to return -1 and mispronounce the character.
        out = out.normalize('NFKD');

        // Remove emojis (wide Unicode range)
        const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
        out = out.replace(emojiPattern, '');

        // Replace various dashes and symbols
        const replacements = {
            '–': '-',
            '‑': '-',
            '\u2010': '-', // NFKD turns the non-breaking hyphen U+2011 into U+2010
            '—': '-',
            '_': ' ',
            '\u201C': '"', // left double quote
            '\u201D': '"', // right double quote
            '\u2018': "'", // left single quote
            '\u2019': "'", // right single quote
            '´': "'",
            '`': "'",
            '[': ' ',
            ']': ' ',
            '|': ' ',
            '/': ' ',
            '#': ' ',
            '→': ' ',
            '←': ' ',
        };
        for (const [from, to] of Object.entries(replacements)) {
            out = out.replaceAll(from, to);
        }

        // Remove special symbols
        out = out.replace(/[♥☆♡©\\]/g, '');

        // Replace known expressions
        const exprReplacements = {
            '@': ' at ',
            'e.g.,': 'for example, ',
            'i.e.,': 'that is, ',
        };
        for (const [from, to] of Object.entries(exprReplacements)) {
            out = out.replaceAll(from, to);
        }

        // Drop the single space that directly precedes punctuation or an apostrophe.
        out = out.replace(/ ([,.!?;:'])/g, '$1');

        // Collapse runs of repeated quotes to one. Backticks need no handling here: they were
        // all mapped to "'" by the replacement table above.
        out = out.replace(/"{2,}/g, '"').replace(/'{2,}/g, "'");

        // Remove extra spaces
        out = out.replace(/\s+/g, ' ').trim();

        // If text doesn't end with punctuation, quotes, or closing brackets, add a period
        if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(out)) {
            out += '.';
        }

        // Wrap text with language tags
        return `<${lang}>${out}</${lang}>`;
    }

    /**
     * Build the padding mask for a batch of row lengths, sized to the longest row.
     *
     * @param {number[]} textIdsLengths - Number of real positions in each row.
     * @returns {number[][][]} Mask of shape [batch][1][max(textIdsLengths)].
     */
    getTextMask(textIdsLengths) {
        const maxLen = Math.max(...textIdsLengths);
        return this.lengthToMask(textIdsLengths, maxLen);
    }

    /**
     * Turn lengths into 0/1 mask rows.
     *
     * @param {number[]} lengths - Number of leading positions to set to 1.0 in each row.
     * @param {?number} [maxLen=null] - Row width; defaults to the largest length. An explicit
     *   0 is honoured (yields empty rows) rather than being treated as "not given".
     * @returns {number[][][]} One `[row]` per length, each row `maxLen` wide.
     */
    lengthToMask(lengths, maxLen = null) {
        const actualMaxLen = maxLen ?? Math.max(...lengths);
        return lengths.map((len) => {
            const row = new Array(actualMaxLen).fill(0.0);
            row.fill(1.0, 0, Math.max(0, Math.min(len, actualMaxLen)));
            return [row];
        });
    }
}
|
|
141
|
+
|
|
142
|
+
/**
 * Plain holder for the two style tensors of a voice.
 *
 * `ttl` is the tensor TextToSpeech feeds to its sessions as `style_ttl`;
 * `dp` is the one it feeds as `style_dp`.
 */
export class Style {
    /**
     * @param {*} ttlTensor - Stored as `this.ttl`.
     * @param {*} dpTensor - Stored as `this.dp`.
     */
    constructor(ttlTensor, dpTensor) {
        Object.assign(this, { ttl: ttlTensor, dp: dpTensor });
    }
}
|
|
151
|
+
|
|
152
|
+
/**
 * Text-to-speech pipeline.
 *
 * Drives four inference sessions in sequence: a duration predictor, a text
 * encoder, a vector estimator that is run `totalStep` times in a denoising loop,
 * and a vocoder that turns the final latent into a waveform.
 */
export class TextToSpeech {
    /**
     * @param {object} cfgs - Model configuration. This class reads `ae.sample_rate`,
     *   `ae.base_chunk_size`, `ttl.chunk_compress_factor` and `ttl.latent_dim`.
     * @param {object} textProcessor - Object whose `call(textList, langList)` returns
     *   `{ textIds, textMask }` (see UnicodeProcessor).
     * @param {object} dpOrt - Session producing `duration` from `text_ids`, `style_dp`, `text_mask`.
     * @param {object} textEncOrt - Session producing `text_emb`.
     * @param {object} vectorEstOrt - Session producing `denoised_latent`.
     * @param {object} vocoderOrt - Session producing `wav_tts` from `latent`.
     */
    constructor(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt) {
        this.cfgs = cfgs;
        this.textProcessor = textProcessor;
        this.dpOrt = dpOrt;
        this.textEncOrt = textEncOrt;
        this.vectorEstOrt = vectorEstOrt;
        this.vocoderOrt = vocoderOrt;
        this.sampleRate = cfgs.ae.sample_rate;
    }

    /**
     * Run the full pipeline on one batch of texts.
     *
     * @param {string[]} textList - Texts to synthesize (one batch entry each).
     * @param {string[]} langList - Language code per text.
     * @param {Style} style - Style tensors (`ttl`, `dp`) matching the batch.
     * @param {number} totalStep - Number of denoising iterations.
     * @param {number} [speed=1.05] - Predicted durations are divided by this factor.
     * @param {?function(number, number): void} [progressCallback=null] - Called as
     *   `(step, totalStep)` with a 1-based step before each denoising iteration.
     * @returns {Promise<{wav: number[], duration: number[]}>} Flat vocoder output and the
     *   speed-adjusted duration per batch entry (multiplied by the sample rate below, so
     *   presumably seconds).
     * @throws {Error} If `textList` is empty or `speed` is not a positive number.
     */
    async _infer(textList, langList, style, totalStep, speed = 1.05, progressCallback = null) {
        const bsz = textList.length;
        if (bsz === 0) {
            throw new Error('textList must contain at least one text');
        }
        // speed === 0 would give infinite durations and an unbounded latent-sampling loop.
        if (!(speed > 0)) {
            throw new Error(`speed must be a positive number, got ${speed}`);
        }

        // Process text
        const { textIds, textMask } = this.textProcessor.call(textList, langList);

        const textIdsTensor = new ort.Tensor(
            'int64',
            BigInt64Array.from(textIds.flat(), (id) => BigInt(id)),
            [bsz, textIds[0].length],
        );
        const textMaskTensor = new ort.Tensor(
            'float32',
            new Float32Array(textMask.flat(2)),
            [bsz, 1, textMask[0][0].length],
        );

        // Predict duration and apply the speed factor
        const dpOutputs = await this.dpOrt.run({
            text_ids: textIdsTensor,
            style_dp: style.dp,
            text_mask: textMaskTensor,
        });
        const duration = Array.from(dpOutputs.duration.data, (d) => d / speed);

        // Encode text
        const textEncOutputs = await this.textEncOrt.run({
            text_ids: textIdsTensor,
            style_ttl: style.ttl,
            text_mask: textMaskTensor,
        });
        const textEmb = textEncOutputs.text_emb;

        // Sample noisy latent
        const { xt, latentMask } = this.sampleNoisyLatent(
            duration,
            this.sampleRate,
            this.cfgs.ae.base_chunk_size,
            this.cfgs.ttl.chunk_compress_factor,
            this.cfgs.ttl.latent_dim,
        );
        const latentShape = [bsz, xt[0].length, xt[0][0].length];

        const latentMaskTensor = new ort.Tensor(
            'float32',
            new Float32Array(latentMask.flat(2)),
            [bsz, 1, latentMask[0][0].length],
        );

        // Tensors that stay constant across denoising steps
        const totalStepTensor = new ort.Tensor('float32', new Float32Array(bsz).fill(totalStep), [bsz]);

        // Denoising loop. The latent is kept as one flat Float32Array between steps; the shape
        // never changes, so there is no need to rebuild a nested array on every iteration.
        let latent = new Float32Array(xt.flat(2));
        for (let step = 0; step < totalStep; step++) {
            if (progressCallback) {
                progressCallback(step + 1, totalStep);
            }

            const currentStepTensor = new ort.Tensor('float32', new Float32Array(bsz).fill(step), [bsz]);

            const vectorEstOutputs = await this.vectorEstOrt.run({
                noisy_latent: new ort.Tensor('float32', latent, latentShape),
                text_emb: textEmb,
                style_ttl: style.ttl,
                latent_mask: latentMaskTensor,
                text_mask: textMaskTensor,
                current_step: currentStepTensor,
                total_step: totalStepTensor,
            });

            // Copy so the next input does not alias a buffer owned by the session output.
            latent = Float32Array.from(vectorEstOutputs.denoised_latent.data);
        }

        // Generate waveform
        const vocoderOutputs = await this.vocoderOrt.run({
            latent: new ort.Tensor('float32', latent, latentShape),
        });

        const wav = Array.from(vocoderOutputs.wav_tts.data);

        return { wav, duration };
    }

    /**
     * Synthesize one (possibly long) text with a single style.
     *
     * The text is split with chunkText() (max 120 characters for 'ko'/'ja', 300 otherwise),
     * each chunk is synthesized on its own, and the waveforms are concatenated with
     * `silenceDuration` of zeros between consecutive chunks.
     *
     * @param {string} text - Text to speak.
     * @param {string} lang - Language code applied to every chunk.
     * @param {Style} style - Style whose `ttl` tensor has a leading dimension of 1.
     * @param {number} totalStep - Number of denoising iterations per chunk.
     * @param {number} [speed=1.05] - Speed factor (see _infer).
     * @param {number} [silenceDuration=0.3] - Gap between chunks, in the same unit as the
     *   predicted durations.
     * @param {?function(number, number): void} [progressCallback=null] - Forwarded to _infer
     *   for every chunk.
     * @returns {Promise<{wav: number[], duration: number[]}>} Concatenated samples and a
     *   one-element array with the total duration including gaps.
     * @throws {Error} If the style holds more than one entry.
     */
    async call(text, lang, style, totalStep, speed = 1.05, silenceDuration = 0.3, progressCallback = null) {
        if (style.ttl.dims[0] !== 1) {
            throw new Error('Single speaker text to speech only supports single style');
        }
        const maxLen = (lang === 'ko' || lang === 'ja') ? 120 : 300;
        const textList = chunkText(text, maxLen);

        // Collect the pieces and join once at the end instead of re-copying the whole
        // waveform for every chunk.
        const parts = [];
        let durCat = 0;

        for (let i = 0; i < textList.length; i++) {
            const { wav, duration } = await this._infer([textList[i]], [lang], style, totalStep, speed, progressCallback);

            // Decide "first chunk" by index: an empty waveform from an earlier chunk must not
            // make a later chunk look like the first one and discard the accumulated duration.
            if (i === 0) {
                durCat = duration[0];
            } else {
                const silenceLen = Math.floor(silenceDuration * this.sampleRate);
                parts.push(new Array(silenceLen).fill(0));
                durCat += duration[0] + silenceDuration;
            }
            parts.push(wav);
        }

        return { wav: parts.flat(), duration: [durCat] };
    }

    /**
     * Synthesize a batch of texts in a single pass (thin wrapper over _infer).
     *
     * @param {string[]} textList
     * @param {string[]} langList
     * @param {Style} style
     * @param {number} totalStep
     * @param {number} [speed=1.05]
     * @param {?function(number, number): void} [progressCallback=null]
     * @returns {Promise<{wav: number[], duration: number[]}>}
     */
    async batch(textList, langList, style, totalStep, speed = 1.05, progressCallback = null) {
        return await this._infer(textList, langList, style, totalStep, speed, progressCallback);
    }

    /**
     * Draw the initial Gaussian latent for a batch and the mask marking its valid frames.
     *
     * @param {number[]} duration - Duration per batch entry.
     * @param {number} sampleRate - Samples per unit of duration.
     * @param {number} baseChunkSize - Multiplied by `chunkCompress` to get samples per latent frame.
     * @param {number} chunkCompress - Also multiplies `latentDim` to get the channel count.
     * @param {number} latentDim - Base latent dimension.
     * @returns {{xt: number[][][], latentMask: number[][][]}} `xt` is
     *   [batch][latentDim * chunkCompress][latentLen] with masked-out frames zeroed;
     *   `latentMask` is [batch][1][latentLen].
     */
    sampleNoisyLatent(duration, sampleRate, baseChunkSize, chunkCompress, latentDim) {
        const chunkSize = baseChunkSize * chunkCompress;
        // Samples for a duration, then ceil-divided into latent frames.
        const framesFor = (d) => Math.floor((Math.floor(d * sampleRate) + chunkSize - 1) / chunkSize);

        const latentLen = framesFor(Math.max(...duration));
        const latentDimVal = latentDim * chunkCompress;
        const latentMask = this.lengthToMask(duration.map(framesFor), latentLen);

        // Box-Muller transform; u1 is clamped away from 0 so Math.log stays finite.
        const gaussian = () => {
            const u1 = Math.max(0.0001, Math.random());
            const u2 = Math.random();
            return Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2);
        };

        const xt = latentMask.map(([maskRow]) =>
            Array.from({ length: latentDimVal }, () => maskRow.map((keep) => gaussian() * keep)));

        return { xt, latentMask };
    }

    /**
     * Turn lengths into 0/1 mask rows.
     *
     * @param {number[]} lengths - Number of leading positions to set to 1.0 in each row.
     * @param {?number} [maxLen=null] - Row width; defaults to the largest length. An explicit
     *   0 is honoured rather than being treated as "not given".
     * @returns {number[][][]} One `[row]` per length, each row `maxLen` wide.
     */
    lengthToMask(lengths, maxLen = null) {
        const actualMaxLen = maxLen ?? Math.max(...lengths);
        return lengths.map((len) => {
            const row = new Array(actualMaxLen).fill(0.0);
            row.fill(1.0, 0, Math.max(0, Math.min(len, actualMaxLen)));
            return [row];
        });
    }
}
|
|
360
|
+
|
|
361
|
+
/**
 * Load one or more voice-style JSON files and stack them into batched tensors.
 *
 * Each file is expected to provide `style_ttl` and `style_dp`, both with a `dims`
 * array and a nested `data` array. The first file's `dims[1]` and `dims[2]` define
 * the per-style shape; every file must supply exactly that many values.
 *
 * @param {string[]} voiceStylePaths - URLs of the style JSON files (at least one).
 * @param {boolean} [verbose=false] - Log the number of loaded styles.
 * @returns {Promise<Style>} Style whose `ttl`/`dp` tensors have a leading dimension equal
 *   to the number of paths.
 * @throws {Error} If no path is given, a request fails, or a file's data size does not match
 *   the first file's dimensions.
 */
export async function loadVoiceStyle(voiceStylePaths, verbose = false) {
    const bsz = voiceStylePaths.length;
    if (bsz === 0) {
        throw new Error('loadVoiceStyle requires at least one voice style path');
    }

    // Fetch every file exactly once, in parallel (the first file doubles as the shape source).
    const voiceStyles = await Promise.all(voiceStylePaths.map(async (path) => {
        const response = await fetch(path);
        if (!response.ok) {
            throw new Error(`Failed to load voice style ${path}: ${response.status} ${response.statusText}`);
        }
        return response.json();
    }));

    const [, ttlDim1, ttlDim2] = voiceStyles[0].style_ttl.dims;
    const [, dpDim1, dpDim2] = voiceStyles[0].style_dp.dims;
    const ttlStride = ttlDim1 * ttlDim2;
    const dpStride = dpDim1 * dpDim2;

    // Pre-allocate arrays with full batch size
    const ttlFlat = new Float32Array(bsz * ttlStride);
    const dpFlat = new Float32Array(bsz * dpStride);

    voiceStyles.forEach((voiceStyle, i) => {
        const ttlData = voiceStyle.style_ttl.data.flat(Infinity);
        const dpData = voiceStyle.style_dp.data.flat(Infinity);

        // A short payload would silently leave zeros; a long one would spill into the next slot.
        if (ttlData.length !== ttlStride || dpData.length !== dpStride) {
            throw new Error(
                `Voice style ${voiceStylePaths[i]} has ${ttlData.length} TTL / ${dpData.length} DP values; ` +
                `expected ${ttlStride} / ${dpStride}`,
            );
        }

        ttlFlat.set(ttlData, i * ttlStride);
        dpFlat.set(dpData, i * dpStride);
    });

    const ttlTensor = new ort.Tensor('float32', ttlFlat, [bsz, ttlDim1, ttlDim2]);
    const dpTensor = new ort.Tensor('float32', dpFlat, [bsz, dpDim1, dpDim2]);

    if (verbose) {
        console.log(`Loaded ${bsz} voice styles`);
    }

    return new Style(ttlTensor, dpTensor);
}
|
|
413
|
+
|
|
414
|
+
/**
 * Load the TTS configuration from `<onnxDir>/tts.json`.
 *
 * @param {string} onnxDir - Base URL/directory containing `tts.json`.
 * @returns {Promise<object>} The parsed JSON configuration.
 * @throws {Error} If the HTTP request does not succeed (non-2xx status).
 */
export async function loadCfgs(onnxDir) {
    const url = `${onnxDir}/tts.json`;
    const response = await fetch(url);
    // Without this check a 404 page surfaces later as an unrelated JSON syntax error.
    if (!response.ok) {
        throw new Error(`Failed to load TTS config ${url}: ${response.status} ${response.statusText}`);
    }
    return response.json();
}
|
|
422
|
+
|
|
423
|
+
/**
 * Load `<onnxDir>/unicode_indexer.json` and wrap it in a UnicodeProcessor.
 *
 * @param {string} onnxDir - Base URL/directory containing `unicode_indexer.json`.
 * @returns {Promise<UnicodeProcessor>} Processor constructed from the parsed indexer.
 * @throws {Error} If the HTTP request does not succeed (non-2xx status).
 */
export async function loadTextProcessor(onnxDir) {
    const url = `${onnxDir}/unicode_indexer.json`;
    const response = await fetch(url);
    // Without this check a 404 page surfaces later as an unrelated JSON syntax error.
    if (!response.ok) {
        throw new Error(`Failed to load unicode indexer ${url}: ${response.status} ${response.statusText}`);
    }
    const indexer = await response.json();
    return new UnicodeProcessor(indexer);
}
|
|
431
|
+
|
|
432
|
+
/**
 * Create an inference session for a single ONNX model.
 *
 * @param {string} onnxPath - Location of the model, passed straight to the runtime.
 * @param {object} [options] - Session options, passed straight to the runtime.
 * @returns {Promise<object>} The session resolved by `ort.InferenceSession.create`.
 */
export async function loadOnnx(onnxPath, options) {
    return ort.InferenceSession.create(onnxPath, options);
}
|
|
439
|
+
|
|
440
|
+
/**
 * Load the configuration, the four ONNX sessions and the text processor from
 * `onnxDir`, and assemble them into a TextToSpeech instance.
 *
 * Sessions are created one after another so the progress callback reflects the
 * model currently being loaded.
 *
 * @param {string} onnxDir - Base URL/directory holding the model files.
 * @param {object} [sessionOptions={}] - Options handed to every loadOnnx() call.
 * @param {?function(string, number, number): void} [progressCallback=null] - Called as
 *   `(modelName, index, total)` with a 1-based index before each model starts loading.
 * @returns {Promise<{textToSpeech: TextToSpeech, cfgs: object}>}
 */
export async function loadTextToSpeech(onnxDir, sessionOptions = {}, progressCallback = null) {
    console.log('Using WebAssembly/WebGPU for inference');

    const cfgs = await loadCfgs(onnxDir);

    // Order matters: it is the positional order of the TextToSpeech constructor arguments.
    const models = [
        ['Duration Predictor', 'duration_predictor.onnx'],
        ['Text Encoder', 'text_encoder.onnx'],
        ['Vector Estimator', 'vector_estimator.onnx'],
        ['Vocoder', 'vocoder.onnx'],
    ];

    const sessions = [];
    for (const [position, [label, fileName]] of models.entries()) {
        if (progressCallback) {
            progressCallback(label, position + 1, models.length);
        }
        sessions.push(await loadOnnx(`${onnxDir}/${fileName}`, sessionOptions));
    }

    const textProcessor = await loadTextProcessor(onnxDir);
    const textToSpeech = new TextToSpeech(cfgs, textProcessor, ...sessions);

    return { textToSpeech, cfgs };
}
|
|
476
|
+
|
|
477
|
+
/**
 * Break a sentence that is longer than `maxLen` into units that each fit.
 *
 * @param {string} sentence - One sentence.
 * @param {number} maxLen - Maximum unit length in UTF-16 code units.
 * @returns {string[]} `[sentence]` when it already fits; otherwise its whitespace-separated
 *   words, with any single word longer than `maxLen` cut into pieces at code-point boundaries.
 */
function splitOversizedSentence(sentence, maxLen) {
    if (sentence.length <= maxLen) {
        return [sentence];
    }

    const units = [];
    for (const word of sentence.split(/\s+/).filter(Boolean)) {
        if (word.length <= maxLen) {
            units.push(word);
            continue;
        }
        // No whitespace to break on (e.g. text written without spaces): cut by code point so
        // a surrogate pair is never split in half.
        let piece = '';
        for (const ch of word) {
            if (piece && piece.length + ch.length > maxLen) {
                units.push(piece);
                piece = '';
            }
            piece += ch;
        }
        if (piece) {
            units.push(piece);
        }
    }
    return units;
}

/**
 * Chunk text into manageable segments.
 *
 * Paragraphs (separated by blank lines) are never merged. Inside a paragraph,
 * sentences are packed greedily, joined by a single space, into chunks of at most
 * `maxLen` characters. A sentence that alone exceeds `maxLen` is broken on
 * whitespace (and, failing that, mid-word) so that no chunk exceeds the limit.
 *
 * @param {string} text - Text to split.
 * @param {number} [maxLen=300] - Maximum chunk length in UTF-16 code units.
 * @returns {string[]} Chunks in reading order; empty for blank input.
 * @throws {Error} If `text` is not a string.
 */
function chunkText(text, maxLen = 300) {
    if (typeof text !== 'string') {
        throw new Error(`chunkText expects a string, got ${typeof text}`);
    }

    // Split by sentence boundaries (period, question mark, exclamation mark followed by space)
    // But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F.
    const sentenceBoundary = /(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/;

    const chunks = [];

    // Split by paragraph (two or more newlines)
    for (const rawParagraph of text.trim().split(/\n\s*\n+/)) {
        const paragraph = rawParagraph.trim();
        if (!paragraph) continue;

        let currentChunk = '';

        for (const sentence of paragraph.split(sentenceBoundary)) {
            for (const unit of splitOversizedSentence(sentence, maxLen)) {
                // +1 accounts for the joining space.
                if (currentChunk.length + unit.length + 1 <= maxLen) {
                    currentChunk += (currentChunk ? ' ' : '') + unit;
                } else {
                    if (currentChunk) {
                        chunks.push(currentChunk.trim());
                    }
                    currentChunk = unit;
                }
            }
        }

        if (currentChunk) {
            chunks.push(currentChunk.trim());
        }
    }

    return chunks;
}
|
|
518
|
+
|
|
519
|
+
/**
 * Encode mono float samples as a 16-bit PCM WAV file.
 *
 * @param {ArrayLike<number>} audioData - Samples; values are clamped to [-1, 1] and scaled
 *   by 32767.
 * @param {number} sampleRate - Sample rate written into the header.
 * @returns {ArrayBuffer} A 44-byte RIFF/WAVE header followed by the little-endian samples.
 */
export function writeWavFile(audioData, sampleRate) {
    const numChannels = 1;
    const bitsPerSample = 16;
    const bytesPerSample = bitsPerSample / 8;
    const blockAlign = numChannels * bytesPerSample;
    const byteRate = sampleRate * blockAlign;
    const headerSize = 44;
    const dataSize = audioData.length * bytesPerSample;

    const buffer = new ArrayBuffer(headerSize + dataSize);
    const view = new DataView(buffer);

    const writeAscii = (offset, tag) => {
        for (let i = 0; i < tag.length; i++) {
            view.setUint8(offset + i, tag.charCodeAt(i));
        }
    };

    // RIFF header
    writeAscii(0, 'RIFF');
    view.setUint32(4, 36 + dataSize, true); // file size minus the 8-byte RIFF preamble
    writeAscii(8, 'WAVE');

    // fmt chunk
    writeAscii(12, 'fmt ');
    view.setUint32(16, 16, true); // fmt chunk size
    view.setUint16(20, 1, true); // PCM
    view.setUint16(22, numChannels, true);
    view.setUint32(24, sampleRate, true);
    view.setUint32(28, byteRate, true);
    view.setUint16(32, blockAlign, true);
    view.setUint16(34, bitsPerSample, true);

    // data chunk
    writeAscii(36, 'data');
    view.setUint32(40, dataSize, true);

    // Write samples through the DataView with explicit little-endian order. Copying the raw
    // bytes of an Int16Array would use the platform's byte order, which is wrong for WAV on
    // big-endian hosts.
    for (let i = 0; i < audioData.length; i++) {
        const clamped = Math.max(-1.0, Math.min(1.0, audioData[i]));
        view.setInt16(headerSize + i * bytesPerSample, Math.floor(clamped * 32767), true);
    }

    return buffer;
}
|