npm - @vortex-os/computer-use - Versions diffs - 0.2.1 → 0.7.0 - Mend

@vortex-os/computer-use 0.2.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/README.md +108 -5
package/computer-use.config.example.json +16 -0
package/package.json +19 -4
package/scripts/activity.mjs +92 -0
package/scripts/audio-duck.ps1 +180 -0
package/scripts/classify.ps1 +8 -0
package/scripts/fetch-supertonic.mjs +65 -0
package/scripts/lib.ps1 +62 -0
package/scripts/mcp-stdio.mjs +714 -7
package/scripts/noise-filter.mjs +135 -0
package/scripts/ocr.ps1 +92 -0
package/scripts/speak-supertonic.mjs +296 -0
package/scripts/speak.ps1 +58 -0
package/scripts/speech-safety.mjs +104 -0
package/scripts/vlm.mjs +106 -0

package/scripts/noise-filter.mjs ADDED Viewed

@@ -0,0 +1,135 @@
+// @vortex-os/computer-use — noise filter for the watch layer (design §22.1).
+//
+// Problem: video / games / scrolling change every frame, so a raw per-frame change threshold
+// floods the agent with one event per ripple of the SAME activity (each event costs a capture
+// + an LLM look). Measured calibration: a playing video jitters ~2.5–4% frame-to-frame, a real
+// scene cut jumps ~16.8%. We want to ignore the steady jitter and report only meaningful,
+// settled changes — without ever going completely silent on a screen that keeps moving.
+//
+// Design: debounce + cooldown COMBINED, with hysteresis. Neither alone works:
+//   - debounce alone ("report once it goes quiet") starves on video/games that never go quiet.
+//   - cooldown alone ("report at most once per N s") fires on a half-drawn transition frame (bad quality).
+// So they split roles:
+//   - debounce  = QUALITY  — emit the frame AFTER motion settles (a clean, stable shot).
+//   - cooldown  = FREQUENCY — never emit more than once per cooldownMs (suppress the ripples).
+//   - maxWait   = ANTI-STARVATION — once WOKEN, if motion never settles, emit anyway every maxWaitMs, so a
+//                 screen with sustained ABOVE-threshold motion still yields periodic snapshots (not silence).
+//   - hysteresis = two thresholds so ambient jitter never even wakes the filter (no flapping).
+// Note on the silence boundary: a screen that changes only BELOW activityThreshold (e.g. steady video jitter
+// at 2.5-4%) is intentionally treated as "nothing happening" and produces no events — that IS the goal of
+// ignoring the ripple, and it means maxWait only applies once a real (above-threshold) change has woken the
+// filter. Lower activityThreshold if you want fainter sustained motion to count as activity.
+//
+// This module is pure (no I/O, no timers of its own — the caller feeds it samples with an explicit
+// `now`), so the state machine is deterministically unit-testable from a synthetic change sequence.
+// Calibrated against the measured numbers above. All tunable per start_watch.
// Default tuning for the noise filter. Frozen: this object is shared by every caller that omits an
// option, so an accidental in-place mutation by one importer would silently retune all filters.
// Pass per-watch overrides as options instead of editing this object.
export const FILTER_DEFAULTS = Object.freeze({
  // % frame-to-frame change to WAKE the filter (enter an "active" episode). Set well above the
  // measured video jitter ceiling (~4%) so steady playback never registers as a new event, yet
  // below a scene cut (~16.8%) so real transitions do. The whole-frame metric dilutes tiny local
  // changes (a clock, a cursor), so target a meaningful region/window for small UI events.
  activityThreshold: 8,
  // % below which a frame counts as "still". Hysteresis: must be < activityThreshold so a woken
  // episode keeps tracking motion down to a lower floor before it's declared settled.
  quietThreshold: 5,
  // Motion must stay below quietThreshold this long before we emit a "settled" event (quality gate).
  // Keep it a small multiple of the poll interval so a couple of quiet polls confirm the settle.
  debounceQuietMs: 900,
  // Minimum gap between emitted events (frequency cap). Suppresses the ripples of one activity.
  cooldownMs: 6000,
  // If an episode never settles (sustained motion: video, ongoing battle), emit anyway this often
  // so a continuously-moving screen still produces periodic snapshots instead of going silent.
  maxWaitMs: 8000,
});
// Coerce a caller-supplied option to a finite number clamped into [lo, hi].
// Returns `dflt` when the value is absent or not numeric.
//   v    — raw option value (number or numeric string)
//   lo   — inclusive lower bound
//   hi   — inclusive upper bound
//   dflt — value used when `v` carries no usable number
const clampNum = (v, lo, hi, dflt) => {
  // Only real numbers and non-blank strings are candidates. Number() maps null, '', [] and false
  // to 0 (and true to 1), which would turn an "unset" option into a real setting — e.g. a null
  // cooldownMs would disable the cooldown instead of using the default.
  if (typeof v !== 'number' && typeof v !== 'string') return dflt;
  if (typeof v === 'string' && v.trim() === '') return dflt;
  const n = Number(v);
  if (!Number.isFinite(n)) return dflt;
  return Math.min(hi, Math.max(lo, n));
};
// Sanitize caller-supplied options into a coherent, safe config. Every field is clamped into its
// allowed range (falling back to FILTER_DEFAULTS when missing or non-numeric) and the hysteresis
// invariant quietThreshold < activityThreshold is enforced, so a misconfiguration can't make the
// filter flap or never wake.
//   opts — optional overrides { activityThreshold, quietThreshold, debounceQuietMs, cooldownMs,
//          maxWaitMs }; undefined or null means "all defaults".
// Returns a fresh config object with exactly those five numeric fields.
export function resolveFilterConfig(opts = {}) {
  // The default parameter only covers `undefined`; an explicit null must not throw.
  const o = opts ?? {};
  const d = FILTER_DEFAULTS;
  const activityThreshold = clampNum(o.activityThreshold, 0.1, 100, d.activityThreshold);
  let quietThreshold = clampNum(o.quietThreshold, 0, 100, d.quietThreshold);
  // Keep the quiet floor strictly below the wake threshold (hysteresis). If a caller inverts them,
  // pull the quiet floor down to just under the wake threshold rather than silently swapping intent.
  // For wake thresholds of 1 or less, "minus one" would land on 0, where every sample (change >= 0)
  // counts as motion and an episode could never settle — use half the wake threshold instead.
  if (quietThreshold >= activityThreshold) {
    quietThreshold = activityThreshold > 1 ? activityThreshold - 1 : activityThreshold / 2;
  }
  return {
    activityThreshold,
    quietThreshold,
    debounceQuietMs: clampNum(o.debounceQuietMs, 0, 60000, d.debounceQuietMs),
    cooldownMs: clampNum(o.cooldownMs, 0, 600000, d.cooldownMs),
    maxWaitMs: clampNum(o.maxWaitMs, 100, 600000, d.maxWaitMs),
  };
}
// Debounce + cooldown + max-wait state machine over a stream of polled change percentages.
// Phases: idle (armed, waiting for a wake-level change) -> active (tracking an episode until it
// settles or maxWaitMs elapses) -> cooldown (silent until cooldownMs has passed since the last emit).
// Pure: it keeps no timers and does no I/O; the caller supplies the clock with every sample.
export class NoiseFilter {
  // opts — optional tuning overrides, sanitized through resolveFilterConfig.
  constructor(opts = {}) {
    this.cfg = resolveFilterConfig(opts);
    this.phase = 'idle';        // idle | active | cooldown
    this.activeStart = 0;       // ts the current episode woke
    this.lastMotionTs = 0;      // last ts change was >= quietThreshold (i.e. "still moving")
    this.lastEmitTs = -Infinity; // ts of the last emitted event
    this.peakPct = 0;           // strongest change seen during the current episode
    this.samples = 0;           // total samples fed (diagnostics)
  }
  // Feed one polled sample. Returns an emit descriptor { reason, peakPct, activeMs } when this sample
  // triggers an event, otherwise null. `reason` is 'settled' (motion stayed below quietThreshold for
  // debounceQuietMs) or 'maxwait' (the episode ran for maxWaitMs without settling).
  //   changePct — frame-to-frame change in percent; non-numeric values count as 0.
  //   now       — the caller's clock in ms.
  //   baseline  — true when the change metric is not meaningful for this sample (fresh baseline /
  //               lost watch state): any in-progress episode is abandoned.
  push({ changePct, now, baseline = false }) {
    this.samples++;
    const c = Number.isFinite(Number(changePct)) ? Number(changePct) : 0;
    const cfg = this.cfg;
    if (baseline) {
      // A (re)baseline carries no comparable diff — abandon any in-progress episode. Only an ACTIVE
      // episode is reset: a filter that is cooling down must stay in cooldown, otherwise a rebaseline
      // right after an emit would re-arm it and let a second event through inside cooldownMs.
      if (this.phase === 'active') this.phase = 'idle';
      this.peakPct = 0;
      return null;
    }
    // Re-arm after cooldown expires, then evaluate this same sample as idle (so a sample that arrives
    // already above the wake threshold starts the next episode immediately, no wasted poll).
    if (this.phase === 'cooldown') {
      if (now - this.lastEmitTs >= cfg.cooldownMs) this.phase = 'idle';
      else return null;
    }
    if (this.phase === 'idle') {
      if (c >= cfg.activityThreshold) {
        this.phase = 'active';
        this.activeStart = now;
        this.lastMotionTs = now;
        this.peakPct = c;
      }
      // The waking sample itself never emits; the earliest emit is on a later sample.
      return null;
    }
    // phase === 'active'
    if (c > this.peakPct) this.peakPct = c;
    if (c >= cfg.quietThreshold) this.lastMotionTs = now;   // still moving — push the settle clock back
    const quietFor = now - this.lastMotionTs;
    const activeFor = now - this.activeStart;
    // Settle is checked first so a clean, stable frame wins over the anti-starvation snapshot.
    if (quietFor >= cfg.debounceQuietMs) return this._emit('settled', now, activeFor);
    if (activeFor >= cfg.maxWaitMs) return this._emit('maxwait', now, activeFor);
    return null;
  }
  // Record the emit time, enter cooldown, reset the episode peak, and build the event descriptor.
  _emit(reason, now, activeMs) {
    const peakPct = this.peakPct;
    this.lastEmitTs = now;
    this.phase = 'cooldown';
    this.peakPct = 0;
    return { reason, peakPct, activeMs };
  }
  // Snapshot for status reporting / tests.
  get status() {
    return { phase: this.phase, samples: this.samples, peakPct: this.peakPct };
  }
}

package/scripts/ocr.ps1 ADDED Viewed

@@ -0,0 +1,92 @@
# computer-use — OCR helper (Windows PowerShell 5.1 ONLY; invoked via powershell.exe).
#
# Why a separate helper: the resident worker runs under pwsh 7 (.NET Core), which dropped the WinRT
# projection, so Windows.Media.Ocr (built-in, offline, NO install) cannot be loaded there. pwsh 7
# delegates the OCR step to this script run via the in-box `powershell.exe` (Windows PowerShell 5.1),
# where the WinRT types load. This powers the reflex path: read on-screen text WITHOUT a cloud LLM
# round-trip, so an alert can be spoken in well under a second.
#
# Contract: input -ImagePath <png>; output exactly ONE JSON line on stdout —
# {ok, text, lang, ms, truncated} on success, {ok:false, error} on failure (exit 1). `truncated` is
# true when the text was cut to -MaxChars. Nothing else is written to stdout (clean to parse).
# The PARENT must invoke this with an argument array (never a shell string): powershell.exe -NoProfile
# -NonInteractive -ExecutionPolicy Bypass -File <abs ocr.ps1> -ImagePath <abs temp png>, and must impose
# its OWN hard spawn timeout + temp-PNG cleanup + non-JSON-stdout handling (codex r1).
#
# SECURITY: the recognized text is UNTRUSTED input (it is screen content). It is for narration/alerting
# ONLY — it must never be turned into an action or an instruction the agent follows, and a speaking
# caller MUST add a provenance prefix + shape/cap it before TTS (design §14/§16/§24, codex r1 HIGH).
# Defence-in-depth here: only OCR files under the allowed temp root, bound every WinRT await with a
# timeout, and pre-shape/cap the emitted text so a hung or hostile crop can't stall or flood the caller.
param(
  [Parameter(Mandatory = $true)][string]$ImagePath,
  [string]$Lang = '',                         # optional BCP-47 tag (e.g. 'ko', 'en-US'); else user-profile default
  [int]$TimeoutMs = 4000,                     # per-await hard cap so a stuck WinRT call can't block forever
  [string]$AllowRoot = '',                    # only OCR files under this root (default: the user TEMP dir)
  [int]$MaxChars = 600                        # cap emitted text (a long spoken utterance is itself a risk)
)
$ErrorActionPreference = 'Stop'
try { [Console]::OutputEncoding = [System.Text.UTF8Encoding]::new($false) } catch {}
# Write one object to stdout as a single compact JSON line (the only thing this script prints).
function Emit($o) { [Console]::Out.WriteLine(($o | ConvertTo-Json -Compress)) }
try {
  # Normalize the numeric knobs before use. Task.Wait(0) returns immediately, Task.Wait(-1) waits
  # forever and anything below -1 throws, so a non-positive timeout falls back to the default cap.
  # A negative -MaxChars would make Substring throw; fall back to the default cap as well.
  if ($TimeoutMs -lt 1) { $TimeoutMs = 4000 }
  if ($MaxChars -lt 0) { $MaxChars = 600 }
  # Path policy: resolve to a full path and require it under the allowed root (worker-owned temp crops only).
  # The real guarantee is caller-side (it passes a denylist-gated, worker-created temp PNG); this is defence-in-depth.
  $root = if ($AllowRoot) { $AllowRoot } else { $env:TEMP }
  $rootFull = [System.IO.Path]::GetFullPath($root).TrimEnd('\') + '\'
  $full = [System.IO.Path]::GetFullPath($ImagePath)
  if (-not $full.StartsWith($rootFull, [System.StringComparison]::OrdinalIgnoreCase)) {
    Emit @{ ok = $false; error = 'image path is outside the allowed temp root' }; exit 1
  }
  if (-not (Test-Path -LiteralPath $full -PathType Leaf)) { Emit @{ ok = $false; error = 'image not found' }; exit 1 }
  $null = [Windows.Media.Ocr.OcrEngine, Windows.Foundation, ContentType = WindowsRuntime]
  Add-Type -AssemblyName System.Runtime.WindowsRuntime
  # The single-arg generic AsTask projection (IAsyncOperation<T> -> Task<T>) so we can synchronously await WinRT calls.
  $asTask = ([System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
      $_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1 -and $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1'
    })[0]
  if (-not $asTask) { Emit @{ ok = $false; error = 'WinRT AsTask projection unavailable' }; exit 1 }
  # Bound every await: if a WinRT call hangs, .Wait(timeout) returns false and we fail cleanly instead of blocking.
  function Await($op, $t) {
    $m = $asTask.MakeGenericMethod($t); $task = $m.Invoke($null, @($op))
    if (-not $task.Wait($TimeoutMs)) { throw "OCR step timed out after ${TimeoutMs}ms" }
    $task.Result
  }
  # Engine: a specific recognizer language if requested and installed, otherwise the user-profile default.
  $eng = $null
  if ($Lang) {
    try {
      $L = New-Object Windows.Globalization.Language $Lang
      if ([Windows.Media.Ocr.OcrEngine]::IsLanguageSupported($L)) { $eng = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($L) }
    } catch {}
  }
  if (-not $eng) { $eng = [Windows.Media.Ocr.OcrEngine]::TryCreateFromUserProfileLanguages() }
  if (-not $eng) { Emit @{ ok = $false; error = 'no OCR recognizer language installed on this machine' }; exit 1 }
  $sw = [System.Diagnostics.Stopwatch]::StartNew()
  $file = Await ([Windows.Storage.StorageFile, Windows.Storage, ContentType = WindowsRuntime]::GetFileFromPathAsync($full)) ([Windows.Storage.StorageFile])
  $stream = Await ($file.OpenAsync([Windows.Storage.FileAccessMode]::Read)) ([Windows.Storage.Streams.IRandomAccessStream])
  $decoder = Await ([Windows.Graphics.Imaging.BitmapDecoder, Windows.Graphics, ContentType = WindowsRuntime]::CreateAsync($stream)) ([Windows.Graphics.Imaging.BitmapDecoder])
  $soft = Await ($decoder.GetSoftwareBitmapAsync()) ([Windows.Graphics.Imaging.SoftwareBitmap])
  # Windows OCR is picky about pixel format on some decoders — normalize to Bgra8 for robustness (codex r1 low/med).
  if ($soft.BitmapPixelFormat -ne [Windows.Graphics.Imaging.BitmapPixelFormat]::Bgra8) {
    $soft = [Windows.Graphics.Imaging.SoftwareBitmap]::Convert($soft, [Windows.Graphics.Imaging.BitmapPixelFormat]::Bgra8, [Windows.Graphics.Imaging.BitmapAlphaMode]::Premultiplied)
  }
  $res = Await ($eng.RecognizeAsync($soft)) ([Windows.Media.Ocr.OcrResult])
  $sw.Stop()
  # Pre-shape the recognized text defensively: strip control/format chars (incl. bidi marks), collapse
  # whitespace, cap length. The SPEAKING caller still adds a provenance prefix + full sanitization/budget.
  $txt = [string]$res.Text
  # Recognition is done and the text is copied out: release the bitmap and the file stream so the
  # caller's temp-PNG cleanup is not blocked by an open handle. Best-effort — a failed release must
  # not turn a successful OCR into an error.
  foreach ($d in @($soft, $stream)) { if ($d) { try { $d.Dispose() } catch {} } }
  $txt = ($txt -replace '\p{C}', ' ') -replace '\s+', ' '
  $txt = $txt.Trim()
  $truncated = $false
  if ($txt.Length -gt $MaxChars) {
    # Don't cut between the two halves of a surrogate pair: a lone high surrogate is not valid text.
    $cut = $MaxChars
    if ($cut -gt 0 -and [char]::IsHighSurrogate($txt[$cut - 1])) { $cut-- }
    $txt = $txt.Substring(0, $cut); $truncated = $true
  }
  Emit @{ ok = $true; text = $txt; lang = $eng.RecognizerLanguage.LanguageTag; ms = [int]$sw.Elapsed.TotalMilliseconds; truncated = $truncated }
} catch {
  # Keep the reason generic: don't surface raw paths / exception internals into cloud-visible agent context (codex r1 low).
  Emit @{ ok = $false; error = 'ocr failed' }
  exit 1
}

package/scripts/speak-supertonic.mjs ADDED Viewed

@@ -0,0 +1,296 @@
+// computer-use — Supertonic TTS speaker helper (Node + onnxruntime-node; SEPARATE INSTALL, Windows-first).
+//
+// Speaks ONE already-finalized utterance and exits — the drop-in higher-quality alternative to speak.ps1
+// (System.Speech/Heami). The CALLER (Node reflex path in mcp-stdio.mjs) owns the security: provenance
+// prefix, sanitization, and the speech budget / no-overlap. This helper just renders audio, as its own
+// short-lived process, so it never blocks the resident worker. Engine selection + Heami fallback live in
+// the caller; this script assumes models are present (the caller probes first).
+//
+// Contract (mirrors speak.ps1): --text <utt> [--voice F1] [--lang ko] [--model-dir <dir>] [--to-wav <path>]
+//   [--earcon] [--speed 1.05] [--steps 8] [--max-chars 600]. One JSON line on stdout
+//   {ok, voice, chars, ms} (or {ok:false, error}, exit 1). --to-wav renders to a file (silent, for tests).
+//
+// Inference logic is adapted from Supertone's official Node example (supertone-inc/supertonic, nodejs/helper.js,
+// MIT). Model weights (Supertone/supertonic-3) are OpenRAIL-M and are downloaded separately (fetch-supertonic.mjs),
+// never bundled. onnxruntime-node is an optionalDependency.
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+import { spawn } from 'node:child_process';
+import { fileURLToPath } from 'node:url';
// Directory containing this script (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Print one result object to stdout as a single newline-terminated JSON line.
function emit(o) {
  const line = `${JSON.stringify(o)}\n`;
  process.stdout.write(line);
}
// ── arg parse ──
// Parse the helper's command line into an options object.
//   argv — a process.argv-style array; parsing starts at index 2.
// Returns { text, voice, lang, modelDir, toWav, earcon, speed, steps, maxChars }. Unknown tokens are
// ignored. A flag given without a value, or a numeric flag whose value is not a positive finite
// number, keeps its default — so a typo can never store NaN (which would, for --max-chars, silently
// switch the length cap off) or undefined.
function parseArgs(argv) {
  const a = { text: '', voice: 'F1', lang: 'ko', modelDir: '', toWav: '', earcon: false, speed: 1.05, steps: 8, maxChars: 600 };
  const stringFlags = new Map([
    ['--text', 'text'],
    ['--voice', 'voice'],
    ['--lang', 'lang'],
    ['--model-dir', 'modelDir'],
    ['--to-wav', 'toWav'],
  ]);
  const toInt = (s) => Number.parseInt(s, 10);
  const numberFlags = new Map([
    ['--speed', ['speed', (s) => Number.parseFloat(s)]],
    ['--steps', ['steps', toInt]],
    ['--max-chars', ['maxChars', toInt]],
  ]);
  for (let i = 2; i < argv.length; i++) {
    const flag = argv[i];
    if (flag === '--earcon') { a.earcon = true; continue; }
    const isString = stringFlags.has(flag);
    if (!isString && !numberFlags.has(flag)) continue;
    // Every remaining flag consumes the next token as its value.
    const raw = argv[++i];
    if (raw === undefined) continue;
    if (isString) { a[stringFlags.get(flag)] = raw; continue; }
    const [key, parse] = numberFlags.get(flag);
    const n = parse(raw);
    if (Number.isFinite(n) && n > 0) a[key] = n;
  }
  return a;
}
// Pick the model directory: an explicit argument wins, then the VORTEX_CU_TTS_MODEL_DIR environment
// variable, then the per-user cache under the home directory (where the fetch script writes).
function resolveModelDir(arg) {
  const perUserCache = () => path.join(os.homedir(), '.vortex', 'computer-use', 'supertonic-3');
  return arg || process.env.VORTEX_CU_TTS_MODEL_DIR || perUserCache();
}
// Language tags accepted by the text preprocessor; anything else is rejected with an error.
const AVAILABLE_LANGS = ['en', 'ko', 'ja', 'ar', 'bg', 'cs', 'da', 'de', 'el', 'es', 'et', 'fi', 'fr', 'hi', 'hr', 'hu', 'id', 'it', 'lt', 'lv', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sv', 'tr', 'uk', 'vi', 'na'];
// ── text preprocessing (port of UnicodeProcessor) ──
// Normalizes an utterance and converts it to rows of model token ids via a lookup table loaded
// from JSON (indexed by UTF-16 code unit value).
class UnicodeProcessor {
  // indexerPath — path of the JSON lookup table; read synchronously.
  constructor(indexerPath) {
    const raw = fs.readFileSync(indexerPath, 'utf8');
    this.indexer = JSON.parse(raw);
  }
  // Clean one utterance and wrap it in <lang>…</lang> tags. Throws on a language tag that is not in
  // AVAILABLE_LANGS. The steps run in a fixed order; each one sees the previous step's output.
  _pre(text, lang) {
    let s = text.normalize('NFKD');
    // Drop emoji and pictographs entirely.
    const emoji = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
    s = s.replace(emoji, '');
    // Typographic dashes/quotes to ASCII; structural symbols to spaces.
    const rep = { '–': '-', '‑': '-', '—': '-', '_': ' ', '“': '"', '”': '"', '‘': "'", '’': "'", '´': "'", '`': "'", '[': ' ', ']': ' ', '|': ' ', '/': ' ', '#': ' ', '→': ' ', '←': ' ' };
    s = Object.entries(rep).reduce((acc, [from, to]) => acc.replaceAll(from, to), s);
    s = s.replace(/[♥☆♡©\\]/g, '');
    // Spell out a few written shorthands.
    const expr = { '@': ' at ', 'e.g.,': 'for example, ', 'i.e.,': 'that is, ' };
    s = Object.entries(expr).reduce((acc, [from, to]) => acc.replaceAll(from, to), s);
    // Remove the space before punctuation, one mark at a time in this order.
    const tighten = [[/ ,/g, ','], [/ \./g, '.'], [/ !/g, '!'], [/ \?/g, '?'], [/ ;/g, ';'], [/ :/g, ':'], [/ '/g, "'"]];
    for (const [pattern, mark] of tighten) s = s.replace(pattern, mark);
    // Collapse runs of repeated quote characters to a single one.
    s = s.replace(/"{2,}/g, '"').replace(/'{2,}/g, "'");
    s = s.replace(/\s+/g, ' ').trim();
    // Make sure the utterance ends with some closing punctuation.
    const endsClosed = /[.!?;:,'"')\]}…。」』】〉》›»]$/.test(s);
    if (!endsClosed) s += '.';
    if (!AVAILABLE_LANGS.includes(lang)) throw new Error(`invalid lang: ${lang}`);
    return `<${lang}>` + s + `</${lang}>`;
  }
  // Preprocess a batch and build { textIds, textMask }: textIds has one zero-padded row per
  // utterance, as wide as the longest processed string; textMask comes from lengthToMask.
  call(textList, langList) {
    const prepared = textList.map((raw, idx) => this._pre(raw, langList[idx]));
    const lengths = prepared.map((s) => s.length);
    const width = Math.max(...lengths);
    const textIds = prepared.map((s) => {
      const row = new Array(width).fill(0);
      Array.from(s).forEach((ch, pos) => { row[pos] = this.indexer[ch.charCodeAt(0)]; });
      return row;
    });
    return { textIds, textMask: lengthToMask(lengths) };
  }
}
// Build one 0/1 mask row per length, each wrapped in its own single-element array:
// result[i][0][j] is 1 while j < lengths[i], else 0. Rows are `maxLen` wide, or as wide as the
// largest length when `maxLen` is omitted (or falsy).
function lengthToMask(lengths, maxLen = null) {
  const width = maxLen || Math.max(...lengths);
  const rowFor = (len) => Array.from({ length: width }, (_, col) => (col < len ? 1.0 : 0.0));
  return lengths.map((len) => [rowFor(len)]);
}
// Convert waveform lengths (in samples) to latent-frame counts — a ceiling division by
// baseChunkSize * ccf — and return the matching 0/1 mask from lengthToMask.
function getLatentMask(wavLengths, baseChunkSize, ccf) {
  const samplesPerFrame = baseChunkSize * ccf;
  const frameCounts = wavLengths.map((wavLen) => Math.floor((wavLen + samplesPerFrame - 1) / samplesPerFrame));
  return lengthToMask(frameCounts);
}
let ort; // lazy so a missing optionalDependency yields a clean JSON error, not an import crash

// Flatten a (possibly nested) numeric array into a float32 tensor with the given dims.
function tensorF32(array, dims) {
  const flat = array.flat(Infinity);
  return new ort.Tensor('float32', Float32Array.from(flat), dims);
}

// Flatten a (possibly nested) array into an int64 tensor; each value is converted with BigInt.
function tensorI64(array, dims) {
  const flat = array.flat(Infinity);
  return new ort.Tensor('int64', BigInt64Array.from(flat, (x) => BigInt(x)), dims);
}
// Text-to-speech pipeline over four ONNX sessions plus a text preprocessor. Per chunk it predicts a
// duration, encodes the text, iteratively denoises a random latent, and runs a vocoder to get samples.
class TextToSpeech {
  // cfgs — parsed model config; this class reads cfgs.ae.sample_rate, cfgs.ae.base_chunk_size,
  //        cfgs.ttl.chunk_compress_factor and cfgs.ttl.latent_dim.
  // tp   — text preprocessor whose call(textList, langList) returns { textIds, textMask }.
  // dp / te / ve / vo — sessions with an async run(feeds) method: duration predictor, text encoder,
  //        vector estimator and vocoder (named after the feeds/outputs used below).
  constructor(cfgs, tp, dp, te, ve, vo) {
    this.cfgs = cfgs; this.tp = tp; this.dp = dp; this.te = te; this.ve = ve; this.vo = vo;
    this.sampleRate = cfgs.ae.sample_rate; this.baseChunk = cfgs.ae.base_chunk_size;
    this.ccf = cfgs.ttl.chunk_compress_factor; this.ldim = cfgs.ttl.latent_dim;
  }
  // Draw the starting latent: Gaussian noise shaped [batch][ldim * ccf][latentLen], where latentLen
  // covers the longest duration in the batch. Positions past each item's own latent length are
  // zeroed by multiplying with the mask. Returns { noisy, mask }.
  //   duration — per-item durations; multiplied by the sample rate to get a length in samples.
  _sampleNoisy(duration) {
    const wavLenMax = Math.max(...duration) * this.sampleRate;
    const wavLengths = duration.map((d) => Math.floor(d * this.sampleRate));
    const chunkSize = this.baseChunk * this.ccf;
    // Ceiling division: number of latent frames needed for the longest item.
    const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize);
    const latentDim = this.ldim * this.ccf;
    const noisy = [];
    for (let b = 0; b < duration.length; b++) {
      const batch = [];
      for (let d = 0; d < latentDim; d++) {
        const row = [];
        for (let t = 0; t < latentLen; t++) {
          // Box–Muller transform: two uniform draws -> one standard-normal sample.
          // u1 is floored at 1e-10 so Math.log never sees 0.
          const u1 = Math.max(1e-10, Math.random()), u2 = Math.random();
          row.push(Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2));
        }
        batch.push(row);
      }
      noisy.push(batch);
    }
    const mask = getLatentMask(wavLengths, this.baseChunk, this.ccf);
    for (let b = 0; b < noisy.length; b++) for (let d = 0; d < noisy[b].length; d++) for (let t = 0; t < noisy[b][d].length; t++) noisy[b][d][t] *= mask[b][0][t];
    return { noisy, mask };
  }
  // Synthesize one batch. Returns { wav, duration }: `wav` is the vocoder's wav_tts output copied
  // into a plain array, `duration` the predicted per-item durations divided by `speed`.
  //   textList / langList — parallel arrays of utterances and their language tags.
  //   style     — { ttl, dp } style tensors fed to the sessions as style_ttl / style_dp.
  //   totalStep — number of denoising iterations.
  //   speed     — divisor applied to the predicted durations (larger = shorter audio).
  async _infer(textList, langList, style, totalStep, speed) {
    const bsz = textList.length;
    const { textIds, textMask } = this.tp.call(textList, langList);
    const idsShape = [bsz, textIds[0].length];
    const maskTensor = tensorF32(textMask, [bsz, 1, textMask[0][0].length]);
    const dpr = await this.dp.run({ text_ids: tensorI64(textIds, idsShape), style_dp: style.dp, text_mask: maskTensor });
    const dur = Array.from(dpr.duration.data).map((d) => d / speed);
    const ter = await this.te.run({ text_ids: tensorI64(textIds, idsShape), style_ttl: style.ttl, text_mask: maskTensor });
    const textEmb = ter.text_emb;
    let { noisy, mask } = this._sampleNoisy(dur);
    const latShape = [bsz, noisy[0].length, noisy[0][0].length];
    const latMaskTensor = tensorF32(mask, [bsz, 1, mask[0][0].length]);
    const stepTensor = tensorF32(new Array(bsz).fill(totalStep), [bsz]);
    for (let step = 0; step < totalStep; step++) {
      const r = await this.ve.run({
        noisy_latent: tensorF32(noisy, latShape), text_emb: textEmb, style_ttl: style.ttl,
        text_mask: maskTensor, latent_mask: latMaskTensor, total_step: stepTensor,
        current_step: tensorF32(new Array(bsz).fill(step), [bsz]),
      });
      // Copy the flat denoised output back into the nested latent, in [b][d][t] order, so it
      // becomes the input of the next step.
      const den = Array.from(r.denoised_latent.data);
      let idx = 0;
      for (let b = 0; b < noisy.length; b++) for (let d = 0; d < noisy[b].length; d++) for (let t = 0; t < noisy[b][d].length; t++) noisy[b][d][t] = den[idx++];
    }
    const vr = await this.vo.run({ latent: tensorF32(noisy, latShape) });
    return { wav: Array.from(vr.wav_tts.data), duration: dur };
  }
  // Synthesize a whole utterance: split it with chunkText (at most 120 characters per chunk for
  // 'ko'/'ja', otherwise 300), run each chunk as a batch of one, and join the audio with `silence`
  // seconds of zero samples between chunks. Returns { wav, duration: [totalSeconds] }; the total
  // includes the inserted silences.
  async call(text, lang, style, totalStep, speed, silence = 0.3) {
    const maxLen = (lang === 'ko' || lang === 'ja') ? 120 : 300;
    const chunks = chunkText(text, maxLen);
    let wavCat = null, durCat = 0;
    for (const chunk of chunks) {
      const { wav, duration } = await this._infer([chunk], [lang], style, totalStep, speed);
      if (wavCat === null) { wavCat = wav; durCat = duration[0]; }
      else { wavCat = [...wavCat, ...new Array(Math.floor(silence * this.sampleRate)).fill(0), ...wav]; durCat += duration[0] + silence; }
    }
    return { wav: wavCat, duration: [durCat] };
  }
}
// Split text into chunks of at most `maxLen` characters for synthesis.
// Paragraphs (separated by blank lines) are never merged. Within a paragraph, sentences (split after
// '.', '!' or '?' followed by whitespace) are packed greedily into a chunk while they fit.
// A single sentence longer than `maxLen` — common for unpunctuated screen text — is broken at
// whitespace, and an unbroken run longer than `maxLen` is cut hard, so no chunk exceeds the limit.
//   text   — the utterance.
//   maxLen — maximum chunk length in UTF-16 code units.
// Returns a non-empty array of trimmed chunks; blank input yields [text.trim()].
function chunkText(text, maxLen) {
  // Break one over-long sentence into pieces no longer than maxLen.
  const splitLong = (sentence) => {
    const pieces = [];
    let cur = '';
    for (const word of sentence.split(/\s+/)) {
      let w = word;
      while (w.length > maxLen) {
        if (cur) { pieces.push(cur); cur = ''; }
        let cut = maxLen;
        // Don't separate the two halves of a surrogate pair.
        const last = w.charCodeAt(cut - 1);
        if (cut > 1 && last >= 0xd800 && last <= 0xdbff) cut -= 1;
        pieces.push(w.slice(0, cut));
        w = w.slice(cut);
      }
      if (!w) continue;
      if (cur && cur.length + 1 + w.length > maxLen) { pieces.push(cur); cur = w; }
      else cur = cur ? `${cur} ${w}` : w;
    }
    if (cur) pieces.push(cur);
    return pieces;
  };
  const paras = text.trim().split(/\n\s*\n+/).filter((p) => p.trim());
  const chunks = [];
  for (const rawPara of paras) {
    const para = rawPara.trim();
    if (!para) continue;
    let cur = '';
    for (const sentence of para.split(/(?<=[.!?])\s+/)) {
      // Only sentences that actually exceed a usable limit are split; everything else is packed
      // exactly as before.
      const pieces = maxLen >= 1 && sentence.length > maxLen ? splitLong(sentence) : [sentence];
      for (const piece of pieces) {
        if (cur.length + piece.length + 1 <= maxLen) cur += (cur ? ' ' : '') + piece;
        else { if (cur) chunks.push(cur.trim()); cur = piece; }
      }
    }
    if (cur) chunks.push(cur.trim());
  }
  return chunks.length ? chunks : [text.trim()];
}
// Read a voice-style JSON file and return its two style tensors as { ttl, dp }.
// Each tensor is float32 with dims [1, dims[1], dims[2]] taken from the file's
// style_ttl / style_dp entries.
function loadVoiceStyle(p) {
  const parsed = JSON.parse(fs.readFileSync(p, 'utf8'));
  const toTensor = ({ data, dims }) => {
    const values = Float32Array.from(data.flat(Infinity));
    return new ort.Tensor('float32', values, [1, dims[1], dims[2]]);
  };
  return { ttl: toTensor(parsed.style_ttl), dp: toTensor(parsed.style_dp) };
}
// Load everything under <modelDir>/onnx and assemble a TextToSpeech instance: the tts.json config,
// the four inference sessions (created in parallel), and the text preprocessor's lookup table.
async function loadTTS(modelDir) {
  const onnxDir = path.join(modelDir, 'onnx');
  const inOnnx = (name) => path.join(onnxDir, name);
  const cfgs = JSON.parse(fs.readFileSync(inOnnx('tts.json'), 'utf8'));
  const sessionOpts = {};
  // Order matters: it matches the destructuring below and the TextToSpeech constructor.
  const modelFiles = ['duration_predictor.onnx', 'text_encoder.onnx', 'vector_estimator.onnx', 'vocoder.onnx'];
  const [dp, te, ve, vo] = await Promise.all(
    modelFiles.map((name) => ort.InferenceSession.create(inOnnx(name), sessionOpts)),
  );
  const tp = new UnicodeProcessor(inOnnx('unicode_indexer.json'));
  return new TextToSpeech(cfgs, tp, dp, te, ve, vo);
}
+// ── WAV (16-bit PCM mono) ──
// Encode float samples as a 16-bit PCM mono WAV file image (44-byte header + little-endian data).
// Samples are clamped to [-1, 1] before being scaled to 16-bit integers.
function floatToWav(samples, sampleRate) {
  const payloadBytes = samples.length * 2;
  const out = Buffer.alloc(44 + payloadBytes);
  // Four-character chunk tags.
  for (const [tag, at] of [['RIFF', 0], ['WAVE', 8], ['fmt ', 12], ['data', 36]]) out.write(tag, at);
  // 32-bit fields: RIFF size, fmt chunk size, sample rate, byte rate, data size.
  const u32 = [[36 + payloadBytes, 4], [16, 16], [sampleRate, 24], [sampleRate * 2, 28], [payloadBytes, 40]];
  for (const [value, at] of u32) out.writeUInt32LE(value, at);
  // 16-bit fields: format (1 = PCM), channels, block align, bits per sample.
  const u16 = [[1, 20], [1, 22], [2, 32], [16, 34]];
  for (const [value, at] of u16) out.writeUInt16LE(value, at);
  let offset = 44;
  for (let i = 0; i < samples.length; i++, offset += 2) {
    const clamped = Math.max(-1, Math.min(1, samples[i]));
    out.writeInt16LE(Math.floor(clamped * 32767), offset);
  }
  return out;
}
// Voice "slightly up": raise the synthesized voice so its loudest sample reaches `target`, giving a
// consistent presence. Boost-only (a take already at or above the target is left alone) and capped at
// `maxGain` so near-silence isn't blown up. Scales `samples` IN PLACE and returns the same array.
function normalizePeak(samples, target = 0.9, maxGain = 3.0) {
  let loudest = 0;
  for (const value of samples) {
    const magnitude = Math.abs(value);
    if (magnitude > loudest) loudest = magnitude;
  }
  if (loudest <= 0) return samples;
  const boost = Math.min(target / loudest, maxGain);
  if (boost <= 1.0) return samples;
  samples.forEach((value, i) => { samples[i] = value * boost; });
  return samples;
}
// Provenance chime (two-tone 1175→1568 Hz) matching speak.ps1's earcon, as PCM samples to prepend:
// 90 ms at 1175 Hz, 110 ms at 1568 Hz (both at amplitude 0.25), then 60 ms of silence.
function earconSamples(sampleRate) {
  const chime = [];
  for (const [freq, ms] of [[1175, 90], [1568, 110]]) {
    const count = Math.floor((ms / 1000) * sampleRate);
    for (let i = 0; i < count; i++) {
      chime.push(0.25 * Math.sin((2 * Math.PI * freq * i) / sampleRate));
    }
  }
  const gap = new Array(Math.floor(0.06 * sampleRate)).fill(0);
  return chime.concat(gap);
}
// Play a WAV through audio-duck.ps1 (win32) and wait for it to finish: it ducks OTHER apps' sessions
// while the voice plays and restores them in a finally, excluding its own process so the voice is never
// ducked. No native deps. If ducking is unavailable it still plays (audio-duck falls back to a plain
// SoundPlayer). VORTEX_CU_DUCK=off disables ducking (factor 1.0); VORTEX_CU_DUCK_FACTOR tunes how much
// others drop (0.5 = to 50%).
//   wavPath — path of the WAV file to play.
// Resolves true once the player process has exited, false when not on Windows or when the player
// could not be started. Never rejects.
function playWav(wavPath) {
  return new Promise((resolve) => {
    if (process.platform !== 'win32') return resolve(false);
    const duckScript = path.join(__dirname, 'audio-duck.ps1');
    const factor = process.env.VORTEX_CU_DUCK === 'off' ? '1.0' : (process.env.VORTEX_CU_DUCK_FACTOR || '0.3');
    const ps = spawn('pwsh', ['-NoProfile', '-NonInteractive', '-File', duckScript, '-WavPath', wavPath, '-Factor', factor], { stdio: 'ignore' });
    // If this helper is told to stop, take the player down with it so playback doesn't outlive us.
    const kill = () => { try { ps.kill(); } catch {} };
    // Remove the signal hooks as soon as the child is gone: left installed they would swallow a
    // later SIGTERM/SIGINT (calling kill on a dead child) instead of letting the default apply.
    const finish = (played) => {
      process.removeListener('SIGTERM', kill);
      process.removeListener('SIGINT', kill);
      resolve(played);
    };
    process.once('SIGTERM', kill); process.once('SIGINT', kill);
    ps.on('exit', () => finish(true));
    ps.on('error', () => finish(false));
  });
}
// Entry point: parse the command line, synthesize the utterance, then either write it to --to-wav or
// play it, and report the outcome as ONE JSON line on stdout: {ok:true, voice, chars, ms} on success,
// {ok:false, error} with exit code 1 on failure. Error reasons are deliberately generic.
async function main() {
  const a = parseArgs(process.argv);
  // Failures set process.exitCode and return instead of calling process.exit(): stdout is
  // asynchronous for pipes on Windows, so exiting right after the write can drop the JSON line the
  // parent is waiting for.
  const fail = (error) => { emit({ ok: false, error }); process.exitCode = 1; };
  // Defence-in-depth length cap. A NaN or non-positive cap would disable or corrupt the slice, so
  // fall back to the default.
  const cap = Number.isInteger(a.maxChars) && a.maxChars > 0 ? a.maxChars : 600;
  const text = String(a.text || '').slice(0, cap);
  if (!text.trim()) return fail('empty text');
  const modelDir = resolveModelDir(a.modelDir);
  const t0 = Date.now();
  try {
    // Loaded lazily so a missing optionalDependency yields a clean JSON error, not an import crash.
    const mod = await import('onnxruntime-node');
    ort = mod.default ?? mod;
  } catch { return fail('onnxruntime-node not installed'); }
  try {
    const stylePath = path.join(modelDir, 'voice_styles', `${a.voice}.json`);
    if (!fs.existsSync(stylePath)) return fail(`voice not found: ${a.voice}`);
    const tts = await loadTTS(modelDir);
    const style = loadVoiceStyle(stylePath);
    const { wav } = await tts.call(text, a.lang, style, a.steps, a.speed);
    const voice = normalizePeak(wav);   // voice "slightly up" — consistent presence
    const samples = a.earcon ? [...earconSamples(tts.sampleRate), ...voice] : voice;
    const wavBuf = floatToWav(samples, tts.sampleRate);
    if (a.toWav) {
      fs.writeFileSync(a.toWav, wavBuf);
    } else {
      const tmp = path.join(os.tmpdir(), `vortex-cu-tts-${process.pid}-${Date.now()}.wav`);
      let played = false;
      try {
        fs.writeFileSync(tmp, wavBuf);
        played = await playWav(tmp);
      } finally {
        // Always remove the temp file, even when writing or playback threw.
        try { fs.unlinkSync(tmp); } catch {}
      }
      // playWav resolves false when nothing could be played (not on Windows, player failed to
      // start). Report that, so the caller can fall back to another engine instead of believing
      // the utterance was spoken.
      if (!played) return fail('playback failed');
    }
    emit({ ok: true, voice: a.voice, chars: text.length, ms: Date.now() - t0 });
  } catch {
    return fail('tts failed');
  }
}
+main();

package/scripts/speak.ps1 ADDED Viewed

@@ -0,0 +1,58 @@
# computer-use — TTS speaker helper (pwsh 7; System.Speech, built-in on Windows, NO install).
#
# Speaks ONE already-finalized utterance and exits. The CALLER (Node reflex path) owns the security:
# provenance prefix, sanitization, and the speech budget / no-overlap (codex r1 HIGH/MED). This helper
# just renders audio so it never blocks the resident worker (it runs as its own short-lived process).
# `-ToWav` renders to a file instead of the speakers so tests/verification make no sound.
#
# Contract: -Text <utterance>; one JSON line on stdout {ok, voice, chars, ms} (or {ok:false,error}, exit 1).
param(
  [Parameter(Mandatory = $true)][string]$Text,
  [int]$Rate = 0,            # System.Speech rate -10..10 (0 = default)
  [string]$Voice = '',       # preferred voice-name substring; else first Korean voice; else default
  [string]$ToWav = '',       # render to this WAV path instead of the speakers (tests)
  [string]$Earcon = '',      # if set, play a short provenance chime through the speakers BEFORE speaking, marking
                             #   the utterance as screen-derived (non-verbal provenance for the reflex OCR/vision
                             #   path; skipped under -ToWav so verification stays silent)
  [int]$MaxChars = 600       # defence-in-depth cap (caller already caps/shapes)
)
$ErrorActionPreference = 'Stop'
try { [Console]::OutputEncoding = [System.Text.UTF8Encoding]::new($false) } catch {}
# Write one object to stdout as a single compact JSON line (the only thing this script prints).
function Emit($o) { [Console]::Out.WriteLine(($o | ConvertTo-Json -Compress)) }
try {
  # A negative cap would make Substring throw; fall back to the default cap.
  if ($MaxChars -lt 0) { $MaxChars = 600 }
  $t = [string]$Text
  if ($t.Length -gt $MaxChars) { $t = $t.Substring(0, $MaxChars) }
  if ([string]::IsNullOrWhiteSpace($t)) { Emit @{ ok = $false; error = 'empty text' }; exit 1 }
  Add-Type -AssemblyName System.Speech
  $syn = New-Object System.Speech.Synthesis.SpeechSynthesizer
  try {
    # Voice choice: the requested name substring, else the first Korean voice, else the engine default.
    $voices = @($syn.GetInstalledVoices() | Where-Object { $_.Enabled } | ForEach-Object { $_.VoiceInfo })
    $picked = $null
    if ($Voice) { $picked = @($voices | Where-Object { $_.Name -like "*$Voice*" })[0] }
    if (-not $picked) { $picked = @($voices | Where-Object { $_.Culture.Name -like 'ko*' })[0] }
    if ($picked) { $syn.SelectVoice($picked.Name) }
    $syn.Rate = [Math]::Max(-10, [Math]::Min(10, $Rate))
    if ($ToWav) { $syn.SetOutputToWaveFile($ToWav) } else { $syn.SetOutputToDefaultAudioDevice() }
    # Duck other apps' audio while speaking (per-app WASAPI), restored in finally — excludes THIS process so the
    # voice isn't ducked. Skipped under -ToWav (silent test) or VORTEX_CU_DUCK=off. Best-effort: never blocks speech.
    $duckHandle = $null
    if (-not $ToWav -and $env:VORTEX_CU_DUCK -ne 'off') {
      try {
        . (Join-Path $PSScriptRoot 'audio-duck.ps1')
        # Parse the factor with the invariant culture: under a comma-decimal locale the default
        # parse reads '0.3' with '.' as a thousands separator and yields 3 instead of 0.3.
        $df = 0.0
        $parsed = [double]::TryParse($env:VORTEX_CU_DUCK_FACTOR, [System.Globalization.NumberStyles]::Float, [System.Globalization.CultureInfo]::InvariantCulture, [ref]$df)
        if (-not $parsed -or $df -le 0) { $df = 0.3 }
        $duckHandle = Invoke-Duck $df @($PID)
      } catch {}
    }
    $sw = [System.Diagnostics.Stopwatch]::StartNew()
    try {
      # Provenance chime: a short, distinct two-tone played BEFORE screen-derived speech so the listener hears
      # "this next bit is raw screen text, not the assistant" without a verbal prefix. Only on real audio output
      # (never under -ToWav, so verification stays silent). Best-effort — a failed beep must not abort speech.
      if ($Earcon -and -not $ToWav) { try { [Console]::Beep(1175, 90); [Console]::Beep(1568, 110) } catch {} }
      $syn.Speak($t)
    } finally {
      $sw.Stop()
      if ($duckHandle) { Restore-Duck $duckHandle }
    }
  } finally {
    # Always release the synthesizer (and close the -ToWav file), including when speaking failed.
    try { $syn.SetOutputToNull() } catch {}
    try { $syn.Dispose() } catch {}
  }
  Emit @{ ok = $true; voice = $(if ($picked) { $picked.Name } else { 'default' }); chars = $t.Length; ms = [int]$sw.Elapsed.TotalMilliseconds }
} catch { Emit @{ ok = $false; error = 'tts failed' }; exit 1 }