@vortex-os/computer-use 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,296 +1,296 @@
1
- // computer-use — Supertonic TTS speaker helper (Node + onnxruntime-node; SEPARATE INSTALL, Windows-first).
2
- //
3
- // Speaks ONE already-finalized utterance and exits — the drop-in higher-quality alternative to speak.ps1
4
- // (System.Speech/Heami). The CALLER (Node reflex path in mcp-stdio.mjs) owns the security: provenance
5
- // prefix, sanitization, and the speech budget / no-overlap. This helper just renders audio, as its own
6
- // short-lived process, so it never blocks the resident worker. Engine selection + Heami fallback live in
7
- // the caller; this script assumes models are present (the caller probes first).
8
- //
9
- // Contract (mirrors speak.ps1): --text <utt> [--voice F1] [--lang ko] [--model-dir <dir>] [--to-wav <path>]
10
- // [--earcon] [--speed 1.05] [--steps 8] [--max-chars 600]. One JSON line on stdout
11
- // {ok, voice, chars, ms} (or {ok:false, error}, exit 1). --to-wav renders to a file (silent, for tests).
12
- //
13
- // Inference logic is adapted from Supertone's official Node example (supertone-inc/supertonic, nodejs/helper.js,
14
- // MIT). Model weights (Supertone/supertonic-3) are OpenRAIL-M and are downloaded separately (fetch-supertonic.mjs),
15
- // never bundled. onnxruntime-node is an optionalDependency.
16
-
17
- import fs from 'node:fs';
18
- import os from 'node:os';
19
- import path from 'node:path';
20
- import { spawn } from 'node:child_process';
21
- import { fileURLToPath } from 'node:url';
22
-
23
- const __dirname = path.dirname(fileURLToPath(import.meta.url));
24
-
25
// Serialize `o` as JSON and write it to stdout as one newline-terminated line.
function emit(o) {
  const line = `${JSON.stringify(o)}\n`;
  process.stdout.write(line);
}
26
-
27
- // ── arg parse ──
28
// Parse the helper's command-line flags (argv[2] onwards) into an options object.
// Unknown flags are ignored. A flag whose value is missing, or a numeric flag whose value does not
// parse to a usable number, keeps its default instead of storing undefined/NaN: a NaN maxChars would
// make the caller's `text.length > maxChars` cap always false, and a NaN speed/steps would corrupt
// the duration scaling and the denoising loop.
// Returns { text, voice, lang, modelDir, toWav, earcon, speed, steps, maxChars }.
function parseArgs(argv) {
  const a = { text: '', voice: 'F1', lang: 'ko', modelDir: '', toWav: '', earcon: false, speed: 1.05, steps: 8, maxChars: 600 };
  // Flags that take one string value, mapped to the option key they fill.
  const stringFlags = new Map([
    ['--text', 'text'],
    ['--voice', 'voice'],
    ['--lang', 'lang'],
    ['--model-dir', 'modelDir'],
    ['--to-wav', 'toWav'],
  ]);
  for (let i = 2; i < argv.length; i++) {
    const k = argv[i];
    if (k === '--earcon') {
      a.earcon = true;
    } else if (stringFlags.has(k)) {
      const v = argv[++i];
      if (v !== undefined) a[stringFlags.get(k)] = v;
    } else if (k === '--speed') {
      const v = Number.parseFloat(argv[++i]);
      if (Number.isFinite(v) && v > 0) a.speed = v; // speed is used as a divisor downstream
    } else if (k === '--steps') {
      const v = Number.parseInt(argv[++i], 10);
      if (Number.isInteger(v) && v > 0) a.steps = v;
    } else if (k === '--max-chars') {
      const v = Number.parseInt(argv[++i], 10);
      if (Number.isInteger(v) && v >= 0) a.maxChars = v;
    }
  }
  return a;
}
44
-
45
// Pick the model directory: the explicit argument wins, then the VORTEX_CU_TTS_MODEL_DIR environment
// variable, and finally the per-user cache under ~/.vortex/computer-use/supertonic-3.
function resolveModelDir(arg) {
  const fromEnv = process.env.VORTEX_CU_TTS_MODEL_DIR;
  const userCache = () => path.join(os.homedir(), '.vortex', 'computer-use', 'supertonic-3');
  return arg || fromEnv || userCache();
}
51
-
52
const AVAILABLE_LANGS = ['en', 'ko', 'ja', 'ar', 'bg', 'cs', 'da', 'de', 'el', 'es', 'et', 'fi', 'fr', 'hi', 'hr', 'hu', 'id', 'it', 'lt', 'lv', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sv', 'tr', 'uk', 'vi', 'na'];

// ── text preprocessing (port of UnicodeProcessor) ──
// Cleans an utterance, wraps it in <lang>…</lang> tags and maps each character to an id through the
// lookup table parsed from the indexer JSON file.
class UnicodeProcessor {
  constructor(indexerPath) {
    const raw = fs.readFileSync(indexerPath, 'utf8');
    this.indexer = JSON.parse(raw);
  }

  // Normalize one utterance and wrap it in language tags.
  // Throws Error(`invalid lang: …`) when `lang` is not listed in AVAILABLE_LANGS.
  _pre(text, lang) {
    const emoji = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
    // Single-character substitutions, applied in this order.
    const charSwaps = [
      ['–', '-'], ['‑', '-'], ['—', '-'], ['_', ' '],
      ['“', '"'], ['”', '"'], ['‘', "'"], ['’', "'"], ['´', "'"], ['`', "'"],
      ['[', ' '], [']', ' '], ['|', ' '], ['/', ' '], ['#', ' '], ['→', ' '], ['←', ' '],
    ];
    // Symbols and abbreviations that are spelled out.
    const phraseSwaps = [['@', ' at '], ['e.g.,', 'for example, '], ['i.e.,', 'that is, ']];
    // Marks that must not be preceded by a space.
    const tightMarks = [',', '.', '!', '?', ';', ':', "'"];

    let out = text.normalize('NFKD').replace(emoji, '');
    for (const [from, to] of charSwaps) out = out.replaceAll(from, to);
    out = out.replace(/[♥☆♡©\\]/g, '');
    for (const [from, to] of phraseSwaps) out = out.replaceAll(from, to);
    for (const mark of tightMarks) out = out.replaceAll(` ${mark}`, mark);
    // Collapse runs of repeated quotes down to a single quote character.
    out = out.replace(/"{2,}/g, '"').replace(/'{2,}/g, "'");
    out = out.replace(/\s+/g, ' ').trim();
    // Append a period when the text does not already end in closing punctuation.
    const endsClosed = /[.!?;:,'"')\]}…。」』】〉》›»]$/.test(out);
    if (!endsClosed) out += '.';
    if (!AVAILABLE_LANGS.includes(lang)) throw new Error(`invalid lang: ${lang}`);
    return `<${lang}>${out}</${lang}>`;
  }

  // Preprocess each text with its language and build zero-padded id rows plus a matching mask.
  // Returns { textIds, textMask }; row width is the longest processed string's length.
  call(textList, langList) {
    const processed = textList.map((t, i) => this._pre(t, langList[i]));
    const lengths = processed.map((t) => t.length);
    const width = Math.max(...lengths);
    const textIds = processed.map((t) => {
      const row = new Array(width).fill(0);
      let col = 0;
      for (const ch of t) row[col++] = this.indexer[ch.charCodeAt(0)];
      return row;
    });
    return { textIds, textMask: lengthToMask(lengths) };
  }
}
88
-
89
// Build one [1 x width] mask per length: 1.0 for positions below the length, 0.0 for the rest.
// `width` is maxLen when that is truthy, otherwise the largest entry of `lengths`.
function lengthToMask(lengths, maxLen = null) {
  const width = maxLen || Math.max(...lengths);
  const maskRow = (len) => Array.from({ length: width }, (_, pos) => (pos < len ? 1.0 : 0.0));
  return lengths.map((len) => [maskRow(len)]);
}
93
// Convert waveform lengths into latent-frame lengths (ceiling division by baseChunkSize * ccf)
// and return the corresponding mask from lengthToMask.
function getLatentMask(wavLengths, baseChunkSize, ccf) {
  const samplesPerFrame = baseChunkSize * ccf;
  const frameCounts = wavLengths.map((len) => Math.floor((len + samplesPerFrame - 1) / samplesPerFrame));
  return lengthToMask(frameCounts);
}
97
-
98
let ort; // lazy so a missing optionalDependency yields a clean JSON error, not an import crash

// Flatten a nested numeric array and wrap it as a float32 ort.Tensor with the given dims.
function tensorF32(array, dims) {
  const flat = Float32Array.from(array.flat(Infinity));
  return new ort.Tensor('float32', flat, dims);
}

// Flatten a nested numeric array, convert each entry to BigInt and wrap it as an int64 ort.Tensor.
function tensorI64(array, dims) {
  const flat = BigInt64Array.from(array.flat(Infinity), (x) => BigInt(x));
  return new ort.Tensor('int64', flat, dims);
}
101
-
102
// Drives synthesis over the four sessions handed to the constructor (dp, te, ve, vo) and the text
// processor tp, using sizes read from the parsed config object.
class TextToSpeech {
  constructor(cfgs, tp, dp, te, ve, vo) {
    Object.assign(this, { cfgs, tp, dp, te, ve, vo });
    const { ae, ttl } = cfgs;
    this.sampleRate = ae.sample_rate;
    this.baseChunk = ae.base_chunk_size;
    this.ccf = ttl.chunk_compress_factor;
    this.ldim = ttl.latent_dim;
  }

  // Draw a [batch][latentDim][latentLen] array of Gaussian noise (Box–Muller) sized from the
  // durations, zeroed wherever the latent mask is 0. Returns { noisy, mask }.
  _sampleNoisy(duration) {
    const samplesPerFrame = this.baseChunk * this.ccf;
    const longestWav = Math.max(...duration) * this.sampleRate;
    const latentLen = Math.floor((longestWav + samplesPerFrame - 1) / samplesPerFrame);
    const latentDim = this.ldim * this.ccf;
    const wavLengths = duration.map((sec) => Math.floor(sec * this.sampleRate));
    const mask = getLatentMask(wavLengths, this.baseChunk, this.ccf);

    const gaussian = () => {
      const u1 = Math.max(1e-10, Math.random()); // keep log() away from 0
      const u2 = Math.random();
      return Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2);
    };

    const noisy = [];
    for (let b = 0; b < duration.length; b++) {
      const keep = mask[b][0];
      const channels = [];
      for (let d = 0; d < latentDim; d++) {
        const frames = [];
        for (let t = 0; t < latentLen; t++) frames.push(gaussian() * keep[t]);
        channels.push(frames);
      }
      noisy.push(channels);
    }
    return { noisy, mask };
  }

  // Synthesize one batch: dp yields durations (divided by `speed`), te yields the text embedding,
  // ve is run `totalStep` times to refine the latent, and vo turns the latent into samples.
  // Returns { wav, duration }.
  async _infer(textList, langList, style, totalStep, speed) {
    const batch = textList.length;
    const { textIds, textMask } = this.tp.call(textList, langList);
    const idsDims = [batch, textIds[0].length];
    const textMaskT = tensorF32(textMask, [batch, 1, textMask[0][0].length]);

    const durOut = await this.dp.run({
      text_ids: tensorI64(textIds, idsDims),
      style_dp: style.dp,
      text_mask: textMaskT,
    });
    const dur = Array.from(durOut.duration.data, (d) => d / speed);

    const encOut = await this.te.run({
      text_ids: tensorI64(textIds, idsDims),
      style_ttl: style.ttl,
      text_mask: textMaskT,
    });
    const textEmb = encOut.text_emb;

    const { noisy, mask } = this._sampleNoisy(dur);
    const latDims = [batch, noisy[0].length, noisy[0][0].length];
    const latMaskT = tensorF32(mask, [batch, 1, mask[0][0].length]);
    const totalStepT = tensorF32(new Array(batch).fill(totalStep), [batch]);

    for (let step = 0; step < totalStep; step++) {
      const out = await this.ve.run({
        noisy_latent: tensorF32(noisy, latDims),
        text_emb: textEmb,
        style_ttl: style.ttl,
        text_mask: textMaskT,
        latent_mask: latMaskT,
        total_step: totalStepT,
        current_step: tensorF32(new Array(batch).fill(step), [batch]),
      });
      // Copy the flat output back into the nested latent, in [batch][dim][frame] order.
      const den = Array.from(out.denoised_latent.data);
      let k = 0;
      for (const channels of noisy) {
        for (const frames of channels) {
          for (let t = 0; t < frames.length; t++) frames[t] = den[k++];
        }
      }
    }

    const vocOut = await this.vo.run({ latent: tensorF32(noisy, latDims) });
    return { wav: Array.from(vocOut.wav_tts.data), duration: dur };
  }

  // Synthesize `text` chunk by chunk (chunks of at most 120 chars for ko/ja, 300 otherwise) and
  // join the chunk audio with `silence` seconds of zeros. Returns { wav, duration: [seconds] }.
  async call(text, lang, style, totalStep, speed, silence = 0.3) {
    const maxLen = (lang === 'ko' || lang === 'ja') ? 120 : 300;
    let wavCat = null;
    let durCat = 0;
    for (const chunk of chunkText(text, maxLen)) {
      const { wav, duration } = await this._infer([chunk], [lang], style, totalStep, speed);
      if (wavCat === null) {
        wavCat = wav;
        durCat = duration[0];
        continue;
      }
      const gap = new Array(Math.floor(silence * this.sampleRate)).fill(0);
      wavCat = wavCat.concat(gap, wav);
      durCat += duration[0] + silence;
    }
    return { wav: wavCat, duration: [durCat] };
  }
}
169
-
170
// Split text into chunks: paragraphs are separated by blank lines, each paragraph is cut after
// . ! or ? followed by whitespace, and consecutive sentences are packed together while the running
// length (plus one joining space) stays within maxLen. A single sentence longer than maxLen is kept
// whole. Falls back to the trimmed input when nothing was produced.
function chunkText(text, maxLen) {
  const chunks = [];
  const paragraphs = text.trim().split(/\n\s*\n+/).map((p) => p.trim()).filter(Boolean);
  for (const para of paragraphs) {
    let pending = '';
    for (const sentence of para.split(/(?<=[.!?])\s+/)) {
      const fits = pending.length + sentence.length + 1 <= maxLen;
      if (fits) {
        pending = pending ? `${pending} ${sentence}` : sentence;
        continue;
      }
      if (pending) chunks.push(pending.trim());
      pending = sentence;
    }
    if (pending) chunks.push(pending.trim());
  }
  return chunks.length > 0 ? chunks : [text.trim()];
}
186
-
187
// Read a voice-style JSON file and turn its style_ttl and style_dp entries into float32 tensors of
// shape [1, dims[1], dims[2]]. Returns { ttl, dp }.
function loadVoiceStyle(p) {
  const { style_ttl: ttlSpec, style_dp: dpSpec } = JSON.parse(fs.readFileSync(p, 'utf8'));
  const toTensor = ({ data, dims }) => {
    const flat = Float32Array.from(data.flat(Infinity));
    return new ort.Tensor('float32', flat, [1, dims[1], dims[2]]);
  };
  return { ttl: toTensor(ttlSpec), dp: toTensor(dpSpec) };
}
194
-
195
// Load everything under <modelDir>/onnx: the tts.json config, the four ONNX sessions (created in
// parallel) and the unicode indexer, and assemble them into a TextToSpeech instance.
async function loadTTS(modelDir) {
  const onnxDir = path.join(modelDir, 'onnx');
  const inOnnx = (name) => path.join(onnxDir, name);
  const cfgs = JSON.parse(fs.readFileSync(inOnnx('tts.json'), 'utf8'));
  const sessionOpts = {};
  const modelFiles = ['duration_predictor.onnx', 'text_encoder.onnx', 'vector_estimator.onnx', 'vocoder.onnx'];
  const [dp, te, ve, vo] = await Promise.all(
    modelFiles.map((file) => ort.InferenceSession.create(inOnnx(file), sessionOpts)),
  );
  const tp = new UnicodeProcessor(inOnnx('unicode_indexer.json'));
  return new TextToSpeech(cfgs, tp, dp, te, ve, vo);
}
208
-
209
// ── WAV (16-bit PCM mono) ──
// Encode float samples as a WAV file image: a 44-byte RIFF/WAVE header followed by little-endian
// 16-bit samples. Each sample is clamped to [-1, 1] and scaled by 32767 (rounded down).
function floatToWav(samples, sampleRate) {
  const BYTES_PER_SAMPLE = 2;
  const HEADER_BYTES = 44;
  const payload = samples.length * BYTES_PER_SAMPLE;
  const out = Buffer.alloc(HEADER_BYTES + payload);

  out.write('RIFF', 0);
  out.writeUInt32LE(36 + payload, 4); // bytes remaining after this field
  out.write('WAVE', 8);
  out.write('fmt ', 12);
  out.writeUInt32LE(16, 16); // fmt chunk size
  out.writeUInt16LE(1, 20); // format tag: PCM
  out.writeUInt16LE(1, 22); // channels: mono
  out.writeUInt32LE(sampleRate, 24);
  out.writeUInt32LE(sampleRate * BYTES_PER_SAMPLE, 28); // byte rate
  out.writeUInt16LE(BYTES_PER_SAMPLE, 32); // block align
  out.writeUInt16LE(16, 34); // bits per sample
  out.write('data', 36);
  out.writeUInt32LE(payload, 40);

  for (let i = 0, at = HEADER_BYTES; i < samples.length; i++, at += BYTES_PER_SAMPLE) {
    const v = samples[i];
    const clamped = v > 1 ? 1 : (v < -1 ? -1 : v);
    out.writeInt16LE(Math.floor(clamped * 32767), at);
  }
  return out;
}
223
-
224
// Voice "slightly up": scale the samples IN PLACE so the loudest one reaches `target`, giving the
// synthesized voice a consistent presence. Only ever boosts: a gain of 1.0 or less leaves the data
// untouched, and the gain is capped at `maxGain` so near-silence isn't blown up. All-zero or empty
// input is returned unchanged. Returns the same array it was given.
function normalizePeak(samples, target = 0.9, maxGain = 3.0) {
  let loudest = 0;
  for (const value of samples) {
    const magnitude = Math.abs(value);
    loudest = magnitude > loudest ? magnitude : loudest;
  }
  if (loudest <= 0) return samples;

  const gain = Math.min(target / loudest, maxGain);
  if (gain <= 1.0) return samples;

  let i = samples.length;
  while (i-- > 0) samples[i] *= gain;
  return samples;
}
235
-
236
// Provenance chime (two-tone 1175→1568 Hz) matching speak.ps1's earcon, as PCM samples to prepend:
// 90 ms at 1175 Hz, 110 ms at 1568 Hz (both at amplitude 0.25), then 60 ms of silence.
function earconSamples(sampleRate) {
  const chime = [];
  const appendTone = (freq, ms) => {
    const count = Math.floor((ms / 1000) * sampleRate);
    for (let i = 0; i < count; i++) {
      chime.push(0.25 * Math.sin((2 * Math.PI * freq * i) / sampleRate));
    }
  };
  appendTone(1175, 90);
  appendTone(1568, 110);
  const tail = new Array(Math.floor(0.06 * sampleRate)).fill(0);
  return chime.concat(tail);
}
244
-
245
// Play a WAV through audio-duck.ps1 (win32) and wait for it to finish: the script ducks OTHER apps'
// sessions while the voice plays and restores them in a finally, excluding its own process so the
// voice is never ducked. No native deps. If ducking is unavailable it still plays (audio-duck falls
// back to a plain SoundPlayer). VORTEX_CU_DUCK=off disables ducking (factor 1.0);
// VORTEX_CU_DUCK_FACTOR tunes how much others drop (0.5 = to 50%).
// Resolves true once the player process exits, false when it could not be spawned or the platform
// is not win32. Never rejects.
function playWav(wavPath) {
  return new Promise((resolve) => {
    if (process.platform !== 'win32') {
      resolve(false);
      return;
    }
    const duckScript = path.join(__dirname, 'audio-duck.ps1');
    const factor = process.env.VORTEX_CU_DUCK === 'off' ? '1.0' : (process.env.VORTEX_CU_DUCK_FACTOR || '0.3');
    const ps = spawn('pwsh', ['-NoProfile', '-NonInteractive', '-File', duckScript, '-WavPath', wavPath, '-Factor', factor], { stdio: 'ignore' });
    // Forward termination signals to the player so it is not left running after us.
    const kill = () => { try { ps.kill(); } catch { /* child already gone */ } };
    // Detach the signal handlers once the child is done; leaving them registered would keep
    // swallowing a later SIGTERM/SIGINT on behalf of a process that no longer exists.
    const finish = (played) => {
      process.removeListener('SIGTERM', kill);
      process.removeListener('SIGINT', kill);
      resolve(played);
    };
    process.once('SIGTERM', kill);
    process.once('SIGINT', kill);
    ps.on('exit', () => finish(true));
    ps.on('error', () => finish(false));
  });
}
261
-
262
// Entry point: parse flags, synthesize the utterance, then either write it to --to-wav or play it
// from a temporary file. Prints exactly one JSON line on stdout: {ok:true, voice, chars, ms} on
// success, or {ok:false, error} with exit status 1 on failure.
async function main() {
  const a = parseArgs(process.argv);

  // Report a failure. The exit status is set through process.exitCode instead of calling
  // process.exit(), so the JSON line just written to stdout is flushed before the process ends.
  const fail = (error) => {
    emit({ ok: false, error });
    process.exitCode = 1;
  };

  let text = String(a.text || '');
  if (text.length > a.maxChars) {
    text = text.slice(0, a.maxChars);
    // Do not leave half of a surrogate pair at the cut.
    const last = text.charCodeAt(text.length - 1);
    if (last >= 0xd800 && last <= 0xdbff) text = text.slice(0, -1);
  }
  if (!text.trim()) return fail('empty text');

  const modelDir = resolveModelDir(a.modelDir);
  const t0 = Date.now();
  try {
    const mod = await import('onnxruntime-node');
    ort = mod.default ?? mod;
  } catch {
    return fail('onnxruntime-node not installed');
  }

  try {
    const stylePath = path.join(modelDir, 'voice_styles', `${a.voice}.json`);
    if (!fs.existsSync(stylePath)) return fail(`voice not found: ${a.voice}`);
    const tts = await loadTTS(modelDir);
    const style = loadVoiceStyle(stylePath);
    const { wav } = await tts.call(text, a.lang, style, a.steps, a.speed);
    const voice = normalizePeak(wav); // voice "slightly up" — consistent presence
    const samples = a.earcon ? [...earconSamples(tts.sampleRate), ...voice] : voice;
    const wavBuf = floatToWav(samples, tts.sampleRate);
    if (a.toWav) {
      fs.writeFileSync(a.toWav, wavBuf);
    } else {
      const tmp = path.join(os.tmpdir(), `vortex-cu-tts-${process.pid}-${Date.now()}.wav`);
      try {
        fs.writeFileSync(tmp, wavBuf);
        await playWav(tmp);
      } finally {
        // Best-effort cleanup, also when writing the file failed part-way.
        try { fs.unlinkSync(tmp); } catch { /* nothing to remove */ }
      }
    }
    emit({ ok: true, voice: a.voice, chars: text.length, ms: Date.now() - t0 });
  } catch (e) {
    // stdout keeps the fixed one-line contract; the cause goes to stderr for diagnosis.
    process.stderr.write(`supertonic tts failed: ${e?.message ?? e}\n`);
    fail('tts failed');
  }
}
295
-
296
- main();
1
+ // computer-use — Supertonic TTS speaker helper (Node + onnxruntime-node; SEPARATE INSTALL, Windows-first).
2
+ //
3
+ // Speaks ONE already-finalized utterance and exits — the drop-in higher-quality alternative to speak.ps1
4
+ // (System.Speech/Heami). The CALLER (Node reflex path in mcp-stdio.mjs) owns the security: provenance
5
+ // prefix, sanitization, and the speech budget / no-overlap. This helper just renders audio, as its own
6
+ // short-lived process, so it never blocks the resident worker. Engine selection + Heami fallback live in
7
+ // the caller; this script assumes models are present (the caller probes first).
8
+ //
9
+ // Contract (mirrors speak.ps1): --text <utt> [--voice F1] [--lang ko] [--model-dir <dir>] [--to-wav <path>]
10
+ // [--earcon] [--speed 1.05] [--steps 8] [--max-chars 600]. One JSON line on stdout
11
+ // {ok, voice, chars, ms} (or {ok:false, error}, exit 1). --to-wav renders to a file (silent, for tests).
12
+ //
13
+ // Inference logic is adapted from Supertone's official Node example (supertone-inc/supertonic, nodejs/helper.js,
14
+ // MIT). Model weights (Supertone/supertonic-3) are OpenRAIL-M and are downloaded separately (fetch-supertonic.mjs),
15
+ // never bundled. onnxruntime-node is an optionalDependency.
16
+
17
+ import fs from 'node:fs';
18
+ import os from 'node:os';
19
+ import path from 'node:path';
20
+ import { spawn } from 'node:child_process';
21
+ import { fileURLToPath } from 'node:url';
22
+
23
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
24
+
25
// Serialize `o` as JSON and write it to stdout as one newline-terminated line.
function emit(o) {
  const line = `${JSON.stringify(o)}\n`;
  process.stdout.write(line);
}
26
+
27
+ // ── arg parse ──
28
// Parse the helper's command-line flags (argv[2] onwards) into an options object.
// Unknown flags are ignored. A flag whose value is missing, or a numeric flag whose value does not
// parse to a usable number, keeps its default instead of storing undefined/NaN: a NaN maxChars would
// make the caller's `text.length > maxChars` cap always false, and a NaN speed/steps would corrupt
// the duration scaling and the denoising loop.
// Returns { text, voice, lang, modelDir, toWav, earcon, speed, steps, maxChars }.
function parseArgs(argv) {
  const a = { text: '', voice: 'F1', lang: 'ko', modelDir: '', toWav: '', earcon: false, speed: 1.05, steps: 8, maxChars: 600 };
  // Flags that take one string value, mapped to the option key they fill.
  const stringFlags = new Map([
    ['--text', 'text'],
    ['--voice', 'voice'],
    ['--lang', 'lang'],
    ['--model-dir', 'modelDir'],
    ['--to-wav', 'toWav'],
  ]);
  for (let i = 2; i < argv.length; i++) {
    const k = argv[i];
    if (k === '--earcon') {
      a.earcon = true;
    } else if (stringFlags.has(k)) {
      const v = argv[++i];
      if (v !== undefined) a[stringFlags.get(k)] = v;
    } else if (k === '--speed') {
      const v = Number.parseFloat(argv[++i]);
      if (Number.isFinite(v) && v > 0) a.speed = v; // speed is used as a divisor downstream
    } else if (k === '--steps') {
      const v = Number.parseInt(argv[++i], 10);
      if (Number.isInteger(v) && v > 0) a.steps = v;
    } else if (k === '--max-chars') {
      const v = Number.parseInt(argv[++i], 10);
      if (Number.isInteger(v) && v >= 0) a.maxChars = v;
    }
  }
  return a;
}
44
+
45
// Pick the model directory: the explicit argument wins, then the VORTEX_CU_TTS_MODEL_DIR environment
// variable, and finally the per-user cache under ~/.vortex/computer-use/supertonic-3.
function resolveModelDir(arg) {
  const fromEnv = process.env.VORTEX_CU_TTS_MODEL_DIR;
  const userCache = () => path.join(os.homedir(), '.vortex', 'computer-use', 'supertonic-3');
  return arg || fromEnv || userCache();
}
51
+
52
const AVAILABLE_LANGS = ['en', 'ko', 'ja', 'ar', 'bg', 'cs', 'da', 'de', 'el', 'es', 'et', 'fi', 'fr', 'hi', 'hr', 'hu', 'id', 'it', 'lt', 'lv', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sv', 'tr', 'uk', 'vi', 'na'];

// ── text preprocessing (port of UnicodeProcessor) ──
// Cleans an utterance, wraps it in <lang>…</lang> tags and maps each character to an id through the
// lookup table parsed from the indexer JSON file.
class UnicodeProcessor {
  constructor(indexerPath) {
    const raw = fs.readFileSync(indexerPath, 'utf8');
    this.indexer = JSON.parse(raw);
  }

  // Normalize one utterance and wrap it in language tags.
  // Throws Error(`invalid lang: …`) when `lang` is not listed in AVAILABLE_LANGS.
  _pre(text, lang) {
    const emoji = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
    // Single-character substitutions, applied in this order.
    const charSwaps = [
      ['–', '-'], ['‑', '-'], ['—', '-'], ['_', ' '],
      ['“', '"'], ['”', '"'], ['‘', "'"], ['’', "'"], ['´', "'"], ['`', "'"],
      ['[', ' '], [']', ' '], ['|', ' '], ['/', ' '], ['#', ' '], ['→', ' '], ['←', ' '],
    ];
    // Symbols and abbreviations that are spelled out.
    const phraseSwaps = [['@', ' at '], ['e.g.,', 'for example, '], ['i.e.,', 'that is, ']];
    // Marks that must not be preceded by a space.
    const tightMarks = [',', '.', '!', '?', ';', ':', "'"];

    let out = text.normalize('NFKD').replace(emoji, '');
    for (const [from, to] of charSwaps) out = out.replaceAll(from, to);
    out = out.replace(/[♥☆♡©\\]/g, '');
    for (const [from, to] of phraseSwaps) out = out.replaceAll(from, to);
    for (const mark of tightMarks) out = out.replaceAll(` ${mark}`, mark);
    // Collapse runs of repeated quotes down to a single quote character.
    out = out.replace(/"{2,}/g, '"').replace(/'{2,}/g, "'");
    out = out.replace(/\s+/g, ' ').trim();
    // Append a period when the text does not already end in closing punctuation.
    const endsClosed = /[.!?;:,'"')\]}…。」』】〉》›»]$/.test(out);
    if (!endsClosed) out += '.';
    if (!AVAILABLE_LANGS.includes(lang)) throw new Error(`invalid lang: ${lang}`);
    return `<${lang}>${out}</${lang}>`;
  }

  // Preprocess each text with its language and build zero-padded id rows plus a matching mask.
  // Returns { textIds, textMask }; row width is the longest processed string's length.
  call(textList, langList) {
    const processed = textList.map((t, i) => this._pre(t, langList[i]));
    const lengths = processed.map((t) => t.length);
    const width = Math.max(...lengths);
    const textIds = processed.map((t) => {
      const row = new Array(width).fill(0);
      let col = 0;
      for (const ch of t) row[col++] = this.indexer[ch.charCodeAt(0)];
      return row;
    });
    return { textIds, textMask: lengthToMask(lengths) };
  }
}
88
+
89
// Build one [1 x width] mask per length: 1.0 for positions below the length, 0.0 for the rest.
// `width` is maxLen when that is truthy, otherwise the largest entry of `lengths`.
function lengthToMask(lengths, maxLen = null) {
  const width = maxLen || Math.max(...lengths);
  const maskRow = (len) => Array.from({ length: width }, (_, pos) => (pos < len ? 1.0 : 0.0));
  return lengths.map((len) => [maskRow(len)]);
}
93
// Convert waveform lengths into latent-frame lengths (ceiling division by baseChunkSize * ccf)
// and return the corresponding mask from lengthToMask.
function getLatentMask(wavLengths, baseChunkSize, ccf) {
  const samplesPerFrame = baseChunkSize * ccf;
  const frameCounts = wavLengths.map((len) => Math.floor((len + samplesPerFrame - 1) / samplesPerFrame));
  return lengthToMask(frameCounts);
}
97
+
98
let ort; // lazy so a missing optionalDependency yields a clean JSON error, not an import crash

// Flatten a nested numeric array and wrap it as a float32 ort.Tensor with the given dims.
function tensorF32(array, dims) {
  const flat = Float32Array.from(array.flat(Infinity));
  return new ort.Tensor('float32', flat, dims);
}

// Flatten a nested numeric array, convert each entry to BigInt and wrap it as an int64 ort.Tensor.
function tensorI64(array, dims) {
  const flat = BigInt64Array.from(array.flat(Infinity), (x) => BigInt(x));
  return new ort.Tensor('int64', flat, dims);
}
101
+
102
// Drives synthesis over the four sessions handed to the constructor (dp, te, ve, vo) and the text
// processor tp, using sizes read from the parsed config object.
class TextToSpeech {
  constructor(cfgs, tp, dp, te, ve, vo) {
    Object.assign(this, { cfgs, tp, dp, te, ve, vo });
    const { ae, ttl } = cfgs;
    this.sampleRate = ae.sample_rate;
    this.baseChunk = ae.base_chunk_size;
    this.ccf = ttl.chunk_compress_factor;
    this.ldim = ttl.latent_dim;
  }

  // Draw a [batch][latentDim][latentLen] array of Gaussian noise (Box–Muller) sized from the
  // durations, zeroed wherever the latent mask is 0. Returns { noisy, mask }.
  _sampleNoisy(duration) {
    const samplesPerFrame = this.baseChunk * this.ccf;
    const longestWav = Math.max(...duration) * this.sampleRate;
    const latentLen = Math.floor((longestWav + samplesPerFrame - 1) / samplesPerFrame);
    const latentDim = this.ldim * this.ccf;
    const wavLengths = duration.map((sec) => Math.floor(sec * this.sampleRate));
    const mask = getLatentMask(wavLengths, this.baseChunk, this.ccf);

    const gaussian = () => {
      const u1 = Math.max(1e-10, Math.random()); // keep log() away from 0
      const u2 = Math.random();
      return Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2);
    };

    const noisy = [];
    for (let b = 0; b < duration.length; b++) {
      const keep = mask[b][0];
      const channels = [];
      for (let d = 0; d < latentDim; d++) {
        const frames = [];
        for (let t = 0; t < latentLen; t++) frames.push(gaussian() * keep[t]);
        channels.push(frames);
      }
      noisy.push(channels);
    }
    return { noisy, mask };
  }

  // Synthesize one batch: dp yields durations (divided by `speed`), te yields the text embedding,
  // ve is run `totalStep` times to refine the latent, and vo turns the latent into samples.
  // Returns { wav, duration }.
  async _infer(textList, langList, style, totalStep, speed) {
    const batch = textList.length;
    const { textIds, textMask } = this.tp.call(textList, langList);
    const idsDims = [batch, textIds[0].length];
    const textMaskT = tensorF32(textMask, [batch, 1, textMask[0][0].length]);

    const durOut = await this.dp.run({
      text_ids: tensorI64(textIds, idsDims),
      style_dp: style.dp,
      text_mask: textMaskT,
    });
    const dur = Array.from(durOut.duration.data, (d) => d / speed);

    const encOut = await this.te.run({
      text_ids: tensorI64(textIds, idsDims),
      style_ttl: style.ttl,
      text_mask: textMaskT,
    });
    const textEmb = encOut.text_emb;

    const { noisy, mask } = this._sampleNoisy(dur);
    const latDims = [batch, noisy[0].length, noisy[0][0].length];
    const latMaskT = tensorF32(mask, [batch, 1, mask[0][0].length]);
    const totalStepT = tensorF32(new Array(batch).fill(totalStep), [batch]);

    for (let step = 0; step < totalStep; step++) {
      const out = await this.ve.run({
        noisy_latent: tensorF32(noisy, latDims),
        text_emb: textEmb,
        style_ttl: style.ttl,
        text_mask: textMaskT,
        latent_mask: latMaskT,
        total_step: totalStepT,
        current_step: tensorF32(new Array(batch).fill(step), [batch]),
      });
      // Copy the flat output back into the nested latent, in [batch][dim][frame] order.
      const den = Array.from(out.denoised_latent.data);
      let k = 0;
      for (const channels of noisy) {
        for (const frames of channels) {
          for (let t = 0; t < frames.length; t++) frames[t] = den[k++];
        }
      }
    }

    const vocOut = await this.vo.run({ latent: tensorF32(noisy, latDims) });
    return { wav: Array.from(vocOut.wav_tts.data), duration: dur };
  }

  // Synthesize `text` chunk by chunk (chunks of at most 120 chars for ko/ja, 300 otherwise) and
  // join the chunk audio with `silence` seconds of zeros. Returns { wav, duration: [seconds] }.
  async call(text, lang, style, totalStep, speed, silence = 0.3) {
    const maxLen = (lang === 'ko' || lang === 'ja') ? 120 : 300;
    let wavCat = null;
    let durCat = 0;
    for (const chunk of chunkText(text, maxLen)) {
      const { wav, duration } = await this._infer([chunk], [lang], style, totalStep, speed);
      if (wavCat === null) {
        wavCat = wav;
        durCat = duration[0];
        continue;
      }
      const gap = new Array(Math.floor(silence * this.sampleRate)).fill(0);
      wavCat = wavCat.concat(gap, wav);
      durCat += duration[0] + silence;
    }
    return { wav: wavCat, duration: [durCat] };
  }
}
169
+
170
// Split text into chunks: paragraphs are separated by blank lines, each paragraph is cut after
// . ! or ? followed by whitespace, and consecutive sentences are packed together while the running
// length (plus one joining space) stays within maxLen. A single sentence longer than maxLen is kept
// whole. Falls back to the trimmed input when nothing was produced.
function chunkText(text, maxLen) {
  const chunks = [];
  const paragraphs = text.trim().split(/\n\s*\n+/).map((p) => p.trim()).filter(Boolean);
  for (const para of paragraphs) {
    let pending = '';
    for (const sentence of para.split(/(?<=[.!?])\s+/)) {
      const fits = pending.length + sentence.length + 1 <= maxLen;
      if (fits) {
        pending = pending ? `${pending} ${sentence}` : sentence;
        continue;
      }
      if (pending) chunks.push(pending.trim());
      pending = sentence;
    }
    if (pending) chunks.push(pending.trim());
  }
  return chunks.length > 0 ? chunks : [text.trim()];
}
186
+
187
// Read a voice-style JSON file and turn its style_ttl and style_dp entries into float32 tensors of
// shape [1, dims[1], dims[2]]. Returns { ttl, dp }.
function loadVoiceStyle(p) {
  const { style_ttl: ttlSpec, style_dp: dpSpec } = JSON.parse(fs.readFileSync(p, 'utf8'));
  const toTensor = ({ data, dims }) => {
    const flat = Float32Array.from(data.flat(Infinity));
    return new ort.Tensor('float32', flat, [1, dims[1], dims[2]]);
  };
  return { ttl: toTensor(ttlSpec), dp: toTensor(dpSpec) };
}
194
+
195
// Load everything under <modelDir>/onnx: the tts.json config, the four ONNX sessions (created in
// parallel) and the unicode indexer, and assemble them into a TextToSpeech instance.
async function loadTTS(modelDir) {
  const onnxDir = path.join(modelDir, 'onnx');
  const inOnnx = (name) => path.join(onnxDir, name);
  const cfgs = JSON.parse(fs.readFileSync(inOnnx('tts.json'), 'utf8'));
  const sessionOpts = {};
  const modelFiles = ['duration_predictor.onnx', 'text_encoder.onnx', 'vector_estimator.onnx', 'vocoder.onnx'];
  const [dp, te, ve, vo] = await Promise.all(
    modelFiles.map((file) => ort.InferenceSession.create(inOnnx(file), sessionOpts)),
  );
  const tp = new UnicodeProcessor(inOnnx('unicode_indexer.json'));
  return new TextToSpeech(cfgs, tp, dp, te, ve, vo);
}
208
+
209
// ── WAV (16-bit PCM mono) ──
// Encode float samples as a WAV file image: a 44-byte RIFF/WAVE header followed by little-endian
// 16-bit samples. Each sample is clamped to [-1, 1] and scaled by 32767 (rounded down).
function floatToWav(samples, sampleRate) {
  const BYTES_PER_SAMPLE = 2;
  const HEADER_BYTES = 44;
  const payload = samples.length * BYTES_PER_SAMPLE;
  const out = Buffer.alloc(HEADER_BYTES + payload);

  out.write('RIFF', 0);
  out.writeUInt32LE(36 + payload, 4); // bytes remaining after this field
  out.write('WAVE', 8);
  out.write('fmt ', 12);
  out.writeUInt32LE(16, 16); // fmt chunk size
  out.writeUInt16LE(1, 20); // format tag: PCM
  out.writeUInt16LE(1, 22); // channels: mono
  out.writeUInt32LE(sampleRate, 24);
  out.writeUInt32LE(sampleRate * BYTES_PER_SAMPLE, 28); // byte rate
  out.writeUInt16LE(BYTES_PER_SAMPLE, 32); // block align
  out.writeUInt16LE(16, 34); // bits per sample
  out.write('data', 36);
  out.writeUInt32LE(payload, 40);

  for (let i = 0, at = HEADER_BYTES; i < samples.length; i++, at += BYTES_PER_SAMPLE) {
    const v = samples[i];
    const clamped = v > 1 ? 1 : (v < -1 ? -1 : v);
    out.writeInt16LE(Math.floor(clamped * 32767), at);
  }
  return out;
}
223
+
224
// Voice "slightly up": scale the samples IN PLACE so the loudest one reaches `target`, giving the
// synthesized voice a consistent presence. Only ever boosts: a gain of 1.0 or less leaves the data
// untouched, and the gain is capped at `maxGain` so near-silence isn't blown up. All-zero or empty
// input is returned unchanged. Returns the same array it was given.
function normalizePeak(samples, target = 0.9, maxGain = 3.0) {
  let loudest = 0;
  for (const value of samples) {
    const magnitude = Math.abs(value);
    loudest = magnitude > loudest ? magnitude : loudest;
  }
  if (loudest <= 0) return samples;

  const gain = Math.min(target / loudest, maxGain);
  if (gain <= 1.0) return samples;

  let i = samples.length;
  while (i-- > 0) samples[i] *= gain;
  return samples;
}
235
+
236
// Provenance chime (two-tone 1175→1568 Hz) matching speak.ps1's earcon, as PCM samples to prepend:
// 90 ms at 1175 Hz, 110 ms at 1568 Hz (both at amplitude 0.25), then 60 ms of silence.
function earconSamples(sampleRate) {
  const chime = [];
  const appendTone = (freq, ms) => {
    const count = Math.floor((ms / 1000) * sampleRate);
    for (let i = 0; i < count; i++) {
      chime.push(0.25 * Math.sin((2 * Math.PI * freq * i) / sampleRate));
    }
  };
  appendTone(1175, 90);
  appendTone(1568, 110);
  const tail = new Array(Math.floor(0.06 * sampleRate)).fill(0);
  return chime.concat(tail);
}
244
+
245
// Play a WAV through audio-duck.ps1 (win32) and wait for it to finish: the script ducks OTHER apps'
// sessions while the voice plays and restores them in a finally, excluding its own process so the
// voice is never ducked. No native deps. If ducking is unavailable it still plays (audio-duck falls
// back to a plain SoundPlayer). VORTEX_CU_DUCK=off disables ducking (factor 1.0);
// VORTEX_CU_DUCK_FACTOR tunes how much others drop (0.5 = to 50%).
// Resolves true once the player process exits, false when it could not be spawned or the platform
// is not win32. Never rejects.
function playWav(wavPath) {
  return new Promise((resolve) => {
    if (process.platform !== 'win32') {
      resolve(false);
      return;
    }
    const duckScript = path.join(__dirname, 'audio-duck.ps1');
    const factor = process.env.VORTEX_CU_DUCK === 'off' ? '1.0' : (process.env.VORTEX_CU_DUCK_FACTOR || '0.3');
    const ps = spawn('pwsh', ['-NoProfile', '-NonInteractive', '-File', duckScript, '-WavPath', wavPath, '-Factor', factor], { stdio: 'ignore' });
    // Forward termination signals to the player so it is not left running after us.
    const kill = () => { try { ps.kill(); } catch { /* child already gone */ } };
    // Detach the signal handlers once the child is done; leaving them registered would keep
    // swallowing a later SIGTERM/SIGINT on behalf of a process that no longer exists.
    const finish = (played) => {
      process.removeListener('SIGTERM', kill);
      process.removeListener('SIGINT', kill);
      resolve(played);
    };
    process.once('SIGTERM', kill);
    process.once('SIGINT', kill);
    ps.on('exit', () => finish(true));
    ps.on('error', () => finish(false));
  });
}
261
+
262
// Entry point: parse flags, synthesize the utterance, then either write it to --to-wav or play it
// from a temporary file. Prints exactly one JSON line on stdout: {ok:true, voice, chars, ms} on
// success, or {ok:false, error} with exit status 1 on failure.
async function main() {
  const a = parseArgs(process.argv);

  // Report a failure. The exit status is set through process.exitCode instead of calling
  // process.exit(), so the JSON line just written to stdout is flushed before the process ends.
  const fail = (error) => {
    emit({ ok: false, error });
    process.exitCode = 1;
  };

  let text = String(a.text || '');
  if (text.length > a.maxChars) {
    text = text.slice(0, a.maxChars);
    // Do not leave half of a surrogate pair at the cut.
    const last = text.charCodeAt(text.length - 1);
    if (last >= 0xd800 && last <= 0xdbff) text = text.slice(0, -1);
  }
  if (!text.trim()) return fail('empty text');

  const modelDir = resolveModelDir(a.modelDir);
  const t0 = Date.now();
  try {
    const mod = await import('onnxruntime-node');
    ort = mod.default ?? mod;
  } catch {
    return fail('onnxruntime-node not installed');
  }

  try {
    const stylePath = path.join(modelDir, 'voice_styles', `${a.voice}.json`);
    if (!fs.existsSync(stylePath)) return fail(`voice not found: ${a.voice}`);
    const tts = await loadTTS(modelDir);
    const style = loadVoiceStyle(stylePath);
    const { wav } = await tts.call(text, a.lang, style, a.steps, a.speed);
    const voice = normalizePeak(wav); // voice "slightly up" — consistent presence
    const samples = a.earcon ? [...earconSamples(tts.sampleRate), ...voice] : voice;
    const wavBuf = floatToWav(samples, tts.sampleRate);
    if (a.toWav) {
      fs.writeFileSync(a.toWav, wavBuf);
    } else {
      const tmp = path.join(os.tmpdir(), `vortex-cu-tts-${process.pid}-${Date.now()}.wav`);
      try {
        fs.writeFileSync(tmp, wavBuf);
        await playWav(tmp);
      } finally {
        // Best-effort cleanup, also when writing the file failed part-way.
        try { fs.unlinkSync(tmp); } catch { /* nothing to remove */ }
      }
    }
    emit({ ok: true, voice: a.voice, chars: text.length, ms: Date.now() - t0 });
  } catch (e) {
    // stdout keeps the fixed one-line contract; the cause goes to stderr for diagnosis.
    process.stderr.write(`supertonic tts failed: ${e?.message ?? e}\n`);
    fail('tts failed');
  }
}
295
+
296
+ main();