npm - @vortex-os/computer-use - Versions diffs - 0.2.0 → 0.7.0 - Mend

@vortex-os/computer-use 0.2.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/README.md +108 -5
package/computer-use.config.example.json +16 -0
package/package.json +19 -4
package/scripts/activity.mjs +92 -0
package/scripts/audio-duck.ps1 +180 -0
package/scripts/classify.ps1 +8 -0
package/scripts/fetch-supertonic.mjs +65 -0
package/scripts/lib.ps1 +62 -0
package/scripts/mcp-stdio.mjs +726 -19
package/scripts/noise-filter.mjs +135 -0
package/scripts/ocr.ps1 +92 -0
package/scripts/speak-supertonic.mjs +296 -0
package/scripts/speak.ps1 +58 -0
package/scripts/speech-safety.mjs +104 -0
package/scripts/vlm.mjs +106 -0

package/scripts/mcp-stdio.mjs CHANGED Viewed

@@ -1,21 +1,23 @@
 #!/usr/bin/env node
 // @vortex-os/computer-use — read-only screen-perception MCP stdio server (Windows-first).
-// Tools: probe · read_ui · capture_screen · watch_capture · poll_change · beep. Control is out of scope.
+// Tools: probe · read_ui · classify_activity · capture_screen · watch_capture · poll_change · start_watch · get_events · stop_watch · beep · speak.
+// Control is out of scope.
 // Two modes (bin `vortex-mcp-computer-use`):
 //   - default: run the stdio server (what an MCP host launches).
 //   - `install`: self-register into the project `.mcp.json` under the non-reserved key
 //     `vortex-computer-use` (merge-safe). e.g. `npx vortex-mcp-computer-use install`.
-// Optional dep: @modelcontextprotocol/sdk (declared optional; imported by this entry — `install`
-// just registers and exits without connecting a transport).
-import { Server } from '@modelcontextprotocol/sdk/server/index.js';
-import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
-import { ListToolsRequestSchema, CallToolRequestSchema } from '@modelcontextprotocol/sdk/types.js';
+// Optional dep: @modelcontextprotocol/sdk — loaded DYNAMICALLY only on the serve path (see the
+// bottom dispatch), so `install` registers and exits without needing the SDK present.
 import { spawnSync, spawn } from 'node:child_process';
 import { fileURLToPath } from 'node:url';
 import { dirname, join } from 'node:path';
 import { readFileSync, unlinkSync, statSync, mkdtempSync, rmSync, existsSync, mkdirSync, writeFileSync, renameSync, appendFileSync } from 'node:fs';
 import { tmpdir, homedir } from 'node:os';
 import { createHmac, randomBytes } from 'node:crypto';
+import { NoiseFilter, resolveFilterConfig } from './noise-filter.mjs';
+import { sanitizeForSpeech, buildUtterance, estimateSpeechMs, SpeechBudget } from './speech-safety.mjs';
+import { parseVlmConfig, vlmGate, buildChatBody, extractText, SYNTH_PNG_B64, DEFAULT_VLM_PROMPT } from './vlm.mjs';
+import { classifyActivity } from './activity.mjs';
 const dir = dirname(fileURLToPath(import.meta.url));
 const plat = process.platform;
@@ -43,6 +45,43 @@ function loadRedactionConfig() {
 }
 const REDACTION = loadRedactionConfig();
+// ── TTS / audio-ducking config (file < env precedence, like the denylist) ──────────────────
+// Reads the `tts` section of computer-use.config.json and fills process.env ONLY where the matching env var is
+// unset, so a user can tune voice/engine/ducking in the config FILE (no env needed) while env still wins. The
+// values flow unchanged to the spawned speak helpers (speak-supertonic.mjs / speak.ps1), which read these env vars.
+function loadTtsConfig() {   // Seeds the VORTEX_CU_TTS_* / VORTEX_CU_DUCK* env vars from the "tts" section of computer-use.config.json (looked up next to this script); returns nothing.
+  let cfg = {};
+  try {
+    const cfgPath = join(dir, 'computer-use.config.json');
+    if (existsSync(cfgPath)) cfg = (JSON.parse(readFileSync(cfgPath, 'utf8')) || {}).tts || {};
+  } catch {}   // best-effort: an unreadable or malformed config file leaves cfg = {} so nothing is seeded
+  const setIfUnset = (k, v) => { if (v !== undefined && v !== null && (process.env[k] === undefined || process.env[k] === '')) process.env[k] = String(v); };   // null/undefined values are skipped; an empty-string env var counts as unset
+  setIfUnset('VORTEX_CU_TTS_ENGINE', cfg.engine);       // 'auto' (default) | 'supertonic' | 'heami'
+  setIfUnset('VORTEX_CU_TTS_VOICE', cfg.voice);         // Supertonic voice: F1..F5 / M1..M5
+  setIfUnset('VORTEX_CU_TTS_MODEL_DIR', cfg.modelDir);  // Supertonic model cache (default ~/.vortex/computer-use/supertonic-3)
+  setIfUnset('VORTEX_CU_TTS_LANG', cfg.lang);           // spoken language (defaults to the OCR language)
+  if (cfg.duck === false) setIfUnset('VORTEX_CU_DUCK', 'off');   // lower other apps while speaking (default on); only an explicit `false` writes the env var
+  setIfUnset('VORTEX_CU_DUCK_FACTOR', cfg.duckFactor);  // others -> original*factor during speech (0..1; default 0.3)
+}
+loadTtsConfig();
+// ── companion (adaptive screen companion) config — same file<env precedence ──
+// `companion.uiaCanvasMax` tunes the GPU-canvas (game/video) UIA cutoff; `companion.profiles` overrides per-class
+// cadence/proactivity (e.g. { "GAME": { "cadenceSec": 20 } }). Consumed by classify_activity.
+let COMPANION_PROFILES = {};
+function loadCompanionConfig() {   // Reads the "companion" section of computer-use.config.json; seeds VORTEX_CU_UIA_CANVAS_MAX (only when unset/empty) and replaces COMPANION_PROFILES.
+  let cfg = {};
+  try {
+    const cfgPath = join(dir, 'computer-use.config.json');
+    if (existsSync(cfgPath)) cfg = (JSON.parse(readFileSync(cfgPath, 'utf8')) || {}).companion || {};
+  } catch {}   // best-effort: an unreadable or malformed config file leaves cfg = {}
+  if (cfg.uiaCanvasMax != null && (process.env.VORTEX_CU_UIA_CANVAS_MAX === undefined || process.env.VORTEX_CU_UIA_CANVAS_MAX === '')) {   // env wins over the file; `!= null` skips both null and undefined
+    process.env.VORTEX_CU_UIA_CANVAS_MAX = String(cfg.uiaCanvasMax);
+  }
+  COMPANION_PROFILES = (cfg.profiles && typeof cfg.profiles === 'object') ? cfg.profiles : {};   // NOTE(review): an array also passes this typeof check — confirm the consumer tolerates it
+}
+loadCompanionConfig();
 // ── audit log (§8: metadata/HMAC only, original image not stored) ──────────────────────
 // Location = under LocalAppData (outside the instance data/ -> won't leak via corporate sync, codex MEDIUM). The key lives there too.
 const AUDIT_DIR = join(process.env.LOCALAPPDATA || join(homedir(), '.local', 'share'), 'vortex-computer-use', 'audit');
@@ -90,6 +129,7 @@ const B = {
   win32: {
     probe:   ['pwsh', ['-NoProfile', '-File', join(dir, 'probe.ps1')]],
     read:    ['pwsh', ['-NoProfile', '-File', join(dir, 'read-ui.ps1')]],
+    classify: ['pwsh', ['-NoProfile', '-File', join(dir, 'classify.ps1')]],
     capture: ['pwsh', ['-NoProfile', '-File', join(dir, 'point-to-ask.ps1')]],
   },
   darwin: {
@@ -100,11 +140,15 @@ const B = {
 };
 // Returns: { payload, isError } — backend abnormal exit / non-JSON output is surfaced as isError (so an error never flows as if normal in the watch loop).
-function runBackend(kind, extraArgs = []) {
+function runBackend(kind, extraArgs = [], timeoutMs = 0) {
   const b = B[plat]?.[kind];
   if (!b) return { payload: { error: `unsupported platform/op: ${plat}/${kind}`, grade: 'P0 (manual) fallback' }, isError: true };
   const [exe, base] = b;
-  const r = spawnSync(exe, [...base, ...extraArgs], { encoding: 'utf8', maxBuffer: 8 * 1024 * 1024 });
+  // Optional hard timeout: a one-shot tool (e.g. classify_activity) must not let a hung COM/UIA call freeze the
+  // synchronous spawn (and thus the event loop). On timeout, spawnSync kills the child and sets r.error.
+  const opts = { encoding: 'utf8', maxBuffer: 8 * 1024 * 1024 };
+  if (timeoutMs > 0) { opts.timeout = timeoutMs; opts.killSignal = 'SIGKILL'; }
+  const r = spawnSync(exe, [...base, ...extraArgs], opts);
   if (r.error) return { payload: { error: String(r.error) }, isError: true };
   const failed = r.status !== 0;
   try {
@@ -298,6 +342,528 @@ async function viaWorker(op, args, timeoutMs) {
   catch (e) { return { payload: { error: String((e && e.message) || e) }, isError: true }; }
 }
+// ── watch sessions: background noise-filtered watch + in-memory event buffer (design §22.1·§22.2) ──
+// start_watch spins a non-blocking poll loop OWNED BY THIS SERVER — not a separate process, and not the
+// single worker's request slot (the worker stays a dumb single-shot capture engine; long watches must
+// not occupy it, codex #1). Each tick polls the target's frame-to-frame change via the worker, feeds it
+// to a NoiseFilter (debounce + cooldown + maxWait), and on an emit captures the settled frame and
+// appends an event to a bounded in-memory ring buffer. get_events drains the buffer (non-blocking,
+// batched -> few LLM looks); stop_watch ends it. The buffer is memory-only with count/byte/TTL caps and
+// the frames live in RAM only (design §24.1 — no screen history on disk; the brief capture temp file is
+// materialized inline + unlinked at once). Denylist + volatility apply per frame via the reused worker ops.
+// Concurrency is deliberately conservative: every watch tick enqueues a poll_change on the SAME single
+// PowerShell worker the foreground tools use, so too many fast watches would starve the agent's own
+// capture_screen/read_ui calls (codex MED). 4 watches at a 400ms floor caps worst-case worker pressure at
+// ~10 polls/s; a global foreground-priority queue is the documented next step if this proves tight.
+const MAX_WATCHES = 4;                          // concurrent background watches cap
+const WATCH_MIN_INTERVAL = 400, WATCH_MAX_INTERVAL = 5000;   // ms; clamp range applied to a watch's pollIntervalMs
+const WATCH_DEFAULT_INTERVAL = 600;   // ms; used when pollIntervalMs is missing or not a positive finite number
+const WATCH_MAX_DURATION_MS = 30 * 60 * 1000;   // auto-stop a forgotten watch (privacy §8 + runaway guard)
+const EVENT_RING_MAX = 64;                       // max buffered events per watch (oldest dropped when full)
+const EVENT_TTL_MS = 5 * 60 * 1000;              // buffered events older than this are evicted unread (§24.1 TTL)
+const EVENT_IMG_MAX_BYTES = 4 * 1024 * 1024;     // per-event inline image cap
+const WATCH_BUF_MAX_BYTES = 24 * 1024 * 1024;    // total inline image bytes held across one watch buffer
+const GET_EVENTS_MAX = 12;                       // events returned per get_events call
+const GET_EVENTS_MAX_IMAGES = 8;                 // image items returned per get_events call (MCP response bound)
+const round2 = (n) => (Number.isFinite(Number(n)) ? Math.round(Number(n) * 100) / 100 : n);   // round to 2 decimals; a value that does not coerce to a finite number is returned unchanged
+function buildWatchTargetArgs(a) {   // Maps caller watch args to a target-args object (spread into the worker poll_change/capture calls by WatchSession); absent fields are omitted.
+  const t = {};
+  if (a.region) t.region = `${a.region.x},${a.region.y},${a.region.w},${a.region.h}`;   // region is flattened to an "x,y,w,h" string
+  if (a.window) t.windowMatch = String(a.window);   // note the key rename: window -> windowMatch
+  if (a.monitor != null) t.monitor = String(a.monitor);   // `!= null` so monitor 0 is kept
+  if (a.boxW) t.boxW = a.boxW;   // truthiness check: 0 is treated as "not given"
+  if (a.boxH) t.boxH = a.boxH;
+  if (a.detail) t.detail = String(a.detail);
+  return t;
+}
+function watchTargetLabel(a) {   // Human-readable label for the watch target; precedence is region, then window, then monitor, else 'cursor'.
+  if (a.region) return `region ${a.region.x},${a.region.y} ${a.region.w}x${a.region.h}`;
+  if (a.window) return `window "${a.window}"`;
+  if (a.monitor != null) return `monitor ${a.monitor}`;   // `!= null` so monitor 0 still gets a label
+  return 'cursor';   // no explicit target given
+}
+// ── reflex path: fixed-phrase / OCR readout spoken LOCALLY, no cloud round-trip (design §22.3) ──
+// A registered trigger crossing fires beep / say(fixed phrase) / ocr(read the region's text) directly from
+// the watch loop. Speech goes through the GLOBAL speech safety (codex r1): screen-derived text (ocr) is
+// never voiced raw — it gets a "화면 글자:" provenance prefix + control/secret shaping + a per-minute
+// utterance & seconds budget with no-overlap and auto-mute. OCR uses the in-box Windows PowerShell 5.1
+// (pwsh 7 can't load WinRT OCR); TTS uses pwsh 7 System.Speech. Both are spawned (non-blocking) and
+// degrade silently if absent. The OCR crop comes from the SAME denylist-gated worker capture (never an
+// arbitrary file), so a denylisted window blocks reflex OCR too (codex r1 MED).
+const PS51 = join(process.env.WINDIR || 'C:\\Windows', 'System32', 'WindowsPowerShell', 'v1.0', 'powershell.exe');   // in-box Windows PowerShell 5.1 executable, used to run the OCR helper
+const OCR_SCRIPT = join(dir, 'ocr.ps1');   // helper scripts are resolved next to this file
+const SPEAK_SCRIPT = join(dir, 'speak.ps1');
+const OCR_LANG = process.env.VORTEX_CU_OCR_LANG || 'ko';   // passed to ocr.ps1 as -Lang; also the fallback for TTS_LANG
+const SPEAK_TOWAV_DIR = process.env.VORTEX_CU_SPEAK_TOWAV_DIR || '';   // test hook: render speech to WAV instead of audio
+const speechBudget = new SpeechBudget();                              // GLOBAL across all watches (one set of ears)
+let speakingChild = null;   // the speaker child process that currently owns the budget reservation, or null
+let speakSeq = 0;   // counter used to name utt-<n>.wav files when SPEAK_TOWAV_DIR is set
+const MAX_SPEAK_MS = 30000;   // hard upper bound for one utterance so a hung TTS can't hold the no-overlap lock forever (codex r2 MED)
+// Provenance for screen-derived speech (ocr/vision): by DEFAULT a self-documenting verbal prefix ("화면 글자:" …)
+// so even a first-time listener knows the source — this is the prior HIGH control (voicing raw screen text is a
+// social-engineering channel) and a chime alone can't convey it, so spoken stays the default (codex r1 HIGH).
+// VORTEX_CU_SPEECH_PROVENANCE=earcon is an explicit opt-in to a non-verbal chime instead. Either way the source
+// is marked. Agent/user-authored speech ('agent' via the `speak` tool, and a fixed 'say' phrase) is trusted
+// content and carries NO provenance mark.
+const SPEECH_PROVENANCE_EARCON = process.env.VORTEX_CU_SPEECH_PROVENANCE === 'earcon';   // true only for the exact value 'earcon'; anything else keeps the spoken prefix
+const EARCON_EST_MS = 250;   // chime duration (ms) added to the reservation against the speech budget when earcon mode is on (codex r1 LOW)
+// ── TTS engine: Supertonic (separate-install ONNX neural, higher quality) with Heami fallback ──
+// The audio-RENDER step only. The safety/budget/provenance layer above (buildUtterance + speechBudget) is
+// engine-agnostic; only the final spawn differs. Engine is resolved ONCE at startup (not per-utterance): 'auto'
+// picks Supertonic when its models + onnxruntime-node are present, else the always-available Heami (speak.ps1).
+// VORTEX_CU_TTS_ENGINE=auto|supertonic|heami. Models live in VORTEX_CU_TTS_MODEL_DIR (fetch-supertonic.mjs writes
+// the default ~/.vortex/computer-use/supertonic-3). Voice VORTEX_CU_TTS_VOICE (F1..F5/M1..M5), lang follows OCR_LANG.
+const SPEAK_SUPERTONIC = join(dir, 'speak-supertonic.mjs');   // spawned with process.execPath when the Supertonic engine is selected
+const TTS_ENGINE_CFG = (process.env.VORTEX_CU_TTS_ENGINE || 'auto').toLowerCase();   // requested engine, case-insensitive; the resolved engine is TTS_ENGINE
+const TTS_MODEL_DIR = process.env.VORTEX_CU_TTS_MODEL_DIR || join(homedir(), '.vortex', 'computer-use', 'supertonic-3');
+const TTS_VOICE = process.env.VORTEX_CU_TTS_VOICE || 'F1';   // must match a <voice>.json under <model dir>/voice_styles for Supertonic to count as available
+const TTS_LANG = process.env.VORTEX_CU_TTS_LANG || OCR_LANG;   // spoken language falls back to the OCR language
+function supertonicAvailable() {   // True only when every required model file, the selected voice style, and the onnxruntime-node module can all be found; any error yields false.
+  try {
+    const onnx = join(TTS_MODEL_DIR, 'onnx');
+    const need = ['duration_predictor.onnx', 'text_encoder.onnx', 'vector_estimator.onnx', 'vocoder.onnx', 'tts.json', 'unicode_indexer.json'];   // all six must exist under <model dir>/onnx
+    if (!need.every((f) => existsSync(join(onnx, f)))) return false;
+    if (!existsSync(join(TTS_MODEL_DIR, 'voice_styles', `${TTS_VOICE}.json`))) return false;
+    import.meta.resolve('onnxruntime-node');   // throws if the optional dep isn't installed -> fall back to Heami. NOTE(review): this relies on the synchronous import.meta.resolve; confirm the supported Node range, since a flag-gated/async variant would not throw here
+    return true;
+  } catch { return false; }
+}
+// 'heami' forces SAPI; 'supertonic'/'auto' use Supertonic when actually available, else fall back (never go mute).
+const TTS_ENGINE = TTS_ENGINE_CFG === 'heami' ? 'heami'   // resolved once at module load; supertonicAvailable() is not called when 'heami' is forced
+  : ((TTS_ENGINE_CFG === 'supertonic' || TTS_ENGINE_CFG === 'auto') && supertonicAvailable() ? 'supertonic' : 'heami');   // any unrecognized config value also resolves to 'heami'
+// Speak a finalized utterance without blocking the watch loop.
+//   kind: 'agent' (the `speak` tool — agent's judged words, no mark, redacted) | 'say' (fixed phrase, no mark)
+//       | 'ocr' | 'vision' (screen-derived, untrusted — marked + shaped).
+function reflexSpeak(kind, text) {   // Returns { ok: true, uttered } once the speaker child is spawned, else { ok: false, reason } ('empty', 'spawn-failed') or the failed tryReserve result as-is.
+  const screenDerived = kind === 'ocr' || kind === 'vision';
+  const earcon = screenDerived && SPEECH_PROVENANCE_EARCON;   // chime-instead-of-prefix applies to screen-derived speech only
+  // Earcon mode carries provenance via the chime, so DON'T bake the spoken prefix — but keep the SAME shaping
+  // (control-char strip, secret redaction, length cap) buildUtterance applies to screen text.
+  const utt = earcon ? sanitizeForSpeech(text) : buildUtterance(kind, text);
+  if (!utt) return { ok: false, reason: 'empty' };   // nothing left to say after shaping
+  const res = speechBudget.tryReserve(estimateSpeechMs(utt) + (earcon ? EARCON_EST_MS : 0), Date.now());   // reserve estimated speech time (plus the chime in earcon mode) before spawning
+  if (!res.ok) return res;   // budget refused: hand its result back unchanged
+  try {
+    let child;
+    if (TTS_ENGINE === 'supertonic') {
+      // Render via the separate-install ONNX neural engine (higher quality). Same spawn lifecycle as Heami below:
+      // non-blocking, killed by the MAX_SPEAK_MS watchdog, budget released exactly once on exit.
+      const sargs = [SPEAK_SUPERTONIC, '--text', utt, '--voice', TTS_VOICE, '--lang', TTS_LANG, '--model-dir', TTS_MODEL_DIR];
+      if (earcon) sargs.push('--earcon');
+      if (SPEAK_TOWAV_DIR) sargs.push('--to-wav', join(SPEAK_TOWAV_DIR, `utt-${++speakSeq}.wav`));
+      child = spawn(process.execPath, sargs, { stdio: 'ignore' });   // runs the helper with the same Node binary as this server
+    } else {
+      const args = ['-NoProfile', '-File', SPEAK_SCRIPT, '-Text', utt];
+      if (earcon) args.push('-Earcon', 'screen');
+      if (SPEAK_TOWAV_DIR) args.push('-ToWav', join(SPEAK_TOWAV_DIR, `utt-${++speakSeq}.wav`));
+      child = spawn('pwsh', args, { stdio: 'ignore' });
+    }
+    speakingChild = child;
+    // Release the budget EXACTLY ONCE per child, and only if this child still owns the active reservation —
+    // a late/duplicate exit+error event must not free a newer utterance's slot (codex r2 LOW/MED). A hard
+    // timeout kills a hung speaker so it can't deadlock no-overlap (codex r2 MED).
+    let released = false;
+    const done = () => { if (released) return; released = true; clearTimeout(killer); if (speakingChild === child) { speakingChild = null; speechBudget.release(); } };   // `killer` is declared on the next line; done() only runs later (from an event or the timer), so it is initialized by then
+    const killer = setTimeout(() => { try { child.kill(); } catch {} done(); }, MAX_SPEAK_MS);
+    if (killer.unref) killer.unref();   // the watchdog timer alone must not keep the process alive
+    child.on('exit', done); child.on('error', done);
+    return { ok: true, uttered: utt };   // returned right after the spawn; speech completion is not awaited
+  } catch { speechBudget.release(); return { ok: false, reason: 'spawn-failed' }; }   // a synchronous throw from the try body: give the reservation back
+}
+// OCR a worker-captured temp PNG via the 5.1 helper, hard time-bounded; returns recognized text or null.
+// Resolves only AFTER the child has CLOSED (even on timeout: kill, then wait for close) so the PNG file
+// handle is released before the caller unlinks the crop — otherwise a still-open handle leaves a screen
+// crop on disk (codex r2 MED).
+function runOcr(pngPath) {   // Resolves with the helper's `text` when its stdout parses as JSON with a truthy `ok`; resolves null on spawn failure, timeout, oversized output or unparsable output. Never rejects.
+  return new Promise((resolve) => {
+    let out = '', settled = false, killed = false, child;
+    const done = (v) => { if (settled) return; settled = true; clearTimeout(timer); resolve(v); };   // settle at most once; `timer` is declared below but done() is only reachable after it is initialized
+    try {
+      child = spawn(PS51, ['-NoProfile', '-NonInteractive', '-ExecutionPolicy', 'Bypass', '-File', OCR_SCRIPT, '-ImagePath', pngPath, '-Lang', OCR_LANG], { stdio: ['ignore', 'pipe', 'ignore'] });   // only stdout is piped
+    } catch { return resolve(null); }
+    const timer = setTimeout(() => { killed = true; try { child.kill(); } catch {} }, 6000);   // 6 s hard bound: kill only, then let 'close' settle the promise
+    child.stdout.setEncoding('utf8');
+    child.stdout.on('data', (d) => { out += d; if (out.length > 65536) { killed = true; try { child.kill(); } catch {} } });   // cap accumulated output at 65536 chars; past that the run is treated as killed
+    child.on('error', () => done(null));
+    child.on('close', () => { if (killed) return done(null); try { const j = JSON.parse(out.trim()); done(j && j.ok ? j.text : null); } catch { done(null); } });   // a killed run always yields null, even if some JSON was received
+  });
+}
+// ── local VLM "middle path" (design §22.3 / §23.2 / §24): OPTIONAL, GPU-gated, off unless a trusted fast
+// local endpoint is reachable. The reflex/brain paths work with no GPU; this only adds a smarter local
+// description when the hardware allows. Capability is PROBED per session (never stored), with a SYNTHETIC
+// image first (never a real crop before the endpoint is trusted), and gated on a measured latency SLA.
+const VLM = parseVlmConfig();   // fields read elsewhere in this file: key, endpoint, model, slaMs, maxTokens
+const VLM_PROBE_TTL = 5 * 60 * 1000;   // ms a cached probe result stays valid
+let vlmProbe = null, vlmProbeAt = 0, vlmProbing = null;   // last probe result, when it was stored, and the in-flight probe promise (if any)
+const VLM_MAX_CROP_BYTES = 6 * 1024 * 1024;   // bound the data: URL we send (crop is already size-bounded; defence-in-depth)
+const VLM_MAX_RESP_BYTES = 256 * 1024;        // a short description reply is tiny — cap a hostile/huge response (codex MED)
+// Read a response body up to maxBytes, then stop (so a huge/streaming reply can't exhaust memory).
+async function readCappedText(r, maxBytes) {   // Returns the body decoded as UTF-8, or null once more than maxBytes have been received.
+  const reader = r.body && r.body.getReader ? r.body.getReader() : null;
+  if (!reader) { const t = await r.text(); return t.length <= maxBytes ? t : null; }   // no streaming reader: read it all, then check. NOTE(review): t.length counts UTF-16 units, not bytes — confirm this is acceptable for the fallback
+  let total = 0; const parts = [];
+  for (;;) {
+    const { done, value } = await reader.read();
+    if (done) break;
+    if (value) { total += value.length; if (total > maxBytes) { try { await reader.cancel(); } catch {} return null; } parts.push(Buffer.from(value)); }   // the chunk that crosses the cap is discarded and the stream is cancelled
+  }
+  return Buffer.concat(parts).toString('utf8');
+}
+async function httpChat(body, timeoutMs) {   // POSTs `body` as JSON to `${VLM.endpoint}/chat/completions`; always resolves to { ok: true, json } or { ok: false, ... } (status/redirected or error).
+  const ctrl = new AbortController();
+  const timer = setTimeout(() => ctrl.abort(), timeoutMs);   // abort the request after timeoutMs
+  try {
+    const headers = { 'content-type': 'application/json' };
+    if (VLM.key) headers.authorization = `Bearer ${VLM.key}`;   // bearer auth only when a key is configured
+    // redirect:'manual' so a 3xx can't replay this POST (with a real crop) to a DIFFERENT host, bypassing
+    // the remote-off trust gate (codex MED). A local VLM server never needs redirects → treat any non-2xx as fail.
+    const r = await fetch(`${VLM.endpoint}/chat/completions`, { method: 'POST', headers, body: JSON.stringify(body), signal: ctrl.signal, redirect: 'manual' });
+    if (!r.ok || r.type === 'opaqueredirect') return { ok: false, status: r.status, redirected: r.type === 'opaqueredirect' };
+    const clen = Number(r.headers.get('content-length') || 0);
+    if (clen && clen > VLM_MAX_RESP_BYTES) return { ok: false, error: 'response too large' };   // early reject on a declared size; the capped read below still applies when the header is absent
+    const txt = await readCappedText(r, VLM_MAX_RESP_BYTES);
+    if (txt === null) return { ok: false, error: 'response exceeded cap' };
+    try { return { ok: true, json: JSON.parse(txt) }; } catch { return { ok: false, error: 'non-JSON response' }; }
+  } catch (e) { return { ok: false, error: String((e && e.name === 'AbortError') ? 'timeout' : (e && e.message) || e) }; }   // an abort is reported as 'timeout'; other failures report their message
+  finally { clearTimeout(timer); }
+}
+// Probe the VLM with a SYNTHETIC image only (no real screen) — measure reachability + latency, confirm a
+// usable reply. Cached per process for VLM_PROBE_TTL. Note: synthetic latency is a LOWER bound (a real crop
+// is larger/slower), so it gates OUT a too-slow endpoint but doesn't guarantee a real call is within budget.
+async function probeVlm() {   // Resolves to { available, tier, reason?, model?, latencyMs? }; a gate refusal is returned immediately and is not cached.
+  const gate = vlmGate(VLM);
+  if (!gate.ok) return { available: false, reason: gate.reason, tier: gate.tier };
+  const now = Date.now();
+  if (vlmProbe && now - vlmProbeAt < VLM_PROBE_TTL) return vlmProbe;   // reuse a cached result (available or not) within the TTL
+  if (vlmProbing) return vlmProbing;   // a probe is already in flight: share its promise rather than start a second one
+  vlmProbing = (async () => {
+    const t0 = Date.now();
+    const res = await httpChat(buildChatBody(VLM.model, '이미지가 보이면 ok 라고만 답해.', SYNTH_PNG_B64, 8), VLM.slaMs);   // synthetic image only; the SLA doubles as the request timeout
+    const latencyMs = Date.now() - t0;
+    let out;
+    if (!res.ok) out = { available: false, reason: `endpoint not reachable (${res.error || 'http ' + res.status})`, tier: gate.tier };
+    else if (latencyMs > VLM.slaMs) out = { available: false, reason: `too slow (${latencyMs}ms > ${VLM.slaMs}ms SLA)`, tier: gate.tier, latencyMs };
+    else out = { available: true, tier: gate.tier, model: VLM.model, latencyMs };
+    vlmProbe = out; vlmProbeAt = Date.now(); vlmProbing = null;   // NOTE(review): vlmProbing is only cleared on this path — if buildChatBody ever threw, the rejected promise would stay cached; confirm it cannot throw
+    return out;
+  })();
+  return vlmProbing;
+}
+// Describe a worker-captured crop with the local VLM. Returns a short text or null. Output is UNTRUSTED
+// (the caller speaks it via buildUtterance('vision', …) so it gets the "로컬 비전:" prefix + shaping + budget).
+async function runVlm(pngPath) {   // Resolves to the text extracted from the VLM reply, or null when the file is unreadable/too large, the call fails, or no text comes back.
+  let b64;
+  try {
+    if (statSync(pngPath).size > VLM_MAX_CROP_BYTES) return null;   // bound the data: URL (codex MED)
+    b64 = readFileSync(pngPath).toString('base64');
+  } catch { return null; }   // stat/read failure -> no description
+  const res = await httpChat(buildChatBody(VLM.model, DEFAULT_VLM_PROMPT, b64, VLM.maxTokens), Math.min(20000, VLM.slaMs * 3));   // timeout is 3x the SLA, capped at 20 s
+  if (!res.ok) return null;
+  return extractText(res.json) || null;   // an empty/falsy extraction is normalized to null
+}
+// Validate caller triggers into safe internal trigger objects. Triggers evaluate against the watch's main
+// target region. Capped, clamped, action-gated. `say` content is sanitized (not screen-derived → not redacted).
+function parseTriggers(raw) {   // Returns an array of normalized trigger objects ([] when `raw` is not an array); non-object entries are skipped.
+  if (!Array.isArray(raw)) return [];
+  const out = [];
+  for (const t of raw.slice(0, 8)) {   // only the first 8 entries are considered
+    if (!t || typeof t !== 'object') continue;
+    const action = ['say', 'ocr', 'vision'].includes(t.action) ? t.action : 'beep';   // any unknown or missing action becomes 'beep'
+    const th = Number(t.threshold);
+    const threshold = Math.min(100, Math.max(0.5, Number.isFinite(th) && th > 0 ? th : 12));   // default 12, clamped to [0.5, 100]
+    const cd = Number(t.cooldownMs);
+    const cooldownMs = Math.min(600000, Math.max(1500, Number.isFinite(cd) && cd > 0 ? Math.floor(cd) : 8000));   // default 8000 ms, clamped to [1500, 600000]
+    const trg = { action, threshold, cooldownMs, armed: true, pending: false, pendingTs: 0, lastFireTs: 0, fires: 0 };   // the fields after cooldownMs are runtime state updated by the watch loop
+    if (action === 'say') trg.say = sanitizeForSpeech(String(t.say || ''), { redactTokens: false }) || '알림';   // an empty phrase after sanitizing falls back to the default phrase
+    if (action === 'beep') trg.beep = ['info', 'warn', 'urgent'].includes(t.beep) ? t.beep : 'warn';
+    if (action === 'ocr' || action === 'vision') { const dw = Number(t.dwellMs); trg.dwellMs = Math.min(3000, Math.max(0, Number.isFinite(dw) && dw >= 0 ? dw : 700)); }   // default 700 ms, clamped to [0, 3000]
+    out.push(trg);
+  }
+  return out;
+}
+class WatchSession {
+  constructor(watchId, a) {   // Builds the session state from the caller's start_watch args `a`; also creates a temp directory (outDir). Nothing is scheduled until start().
+    this.watchId = watchId;
+    this.bgId = `__watch_${watchId}`;             // worker poll-continuity slot, isolated from agent poll_change ids
+    this.targetArgs = buildWatchTargetArgs(a);
+    this.targetLabel = watchTargetLabel(a);
+    const iv = Number(a.pollIntervalMs);
+    this.pollIntervalMs = Math.min(WATCH_MAX_INTERVAL, Math.max(WATCH_MIN_INTERVAL, Number.isFinite(iv) && iv > 0 ? Math.floor(iv) : WATCH_DEFAULT_INTERVAL));   // default when missing/invalid, then clamped to [WATCH_MIN_INTERVAL, WATCH_MAX_INTERVAL]
+    this.filter = new NoiseFilter(a);
+    this.cfg = this.filter.cfg;   // the filter's resolved config, kept on the session
+    this.outDir = mkdtempSync(join(tmpdir(), 'vortex-cu-watch-'));   // per-session temp dir passed to the worker's capture op
+    this.ring = [];                               // [{seq, ts, reason, peakPct, ..., _img?}]
+    this.seq = 0; this.dropped = 0; this.bufBytes = 0;
+    this.polls = 0; this.emitted = 0; this.redactedFrames = 0;
+    this.lastChangePct = null; this.lastPollTs = null;
+    this.lastError = null; this.consecErrors = 0;
+    this.triggers = parseTriggers(a.triggers);    // reflex triggers on the main target region (§22.3)
+    this.reflexFires = 0; this.lastReflex = null;
+    this.startedAt = Date.now();   // reference point for the WATCH_MAX_DURATION_MS auto-stop
+    this.stopped = false; this.stopReason = null;
+    this._timer = null;   // handle of the pending poll timeout, set by _schedule
+  }
+  start() { this._schedule(0); return this.status(); }   // schedules the first tick with zero delay through the same guarded path as later ticks (no unhandled rejection, codex LOW), then returns the current status
+  _schedule(delayMs = this.pollIntervalMs) {   // Arms a one-shot timer for the next tick; does nothing once the session is stopped.
+    if (this.stopped) return;
+    this._timer = setTimeout(() => { this._tick().catch(() => { if (!this.stopped) this._schedule(); }); }, delayMs);   // a rejected tick is swallowed and the loop re-armed, so one failure cannot end the watch
+    if (this._timer.unref) this._timer.unref();   // the poll timer alone must not keep the process alive
+  }
+  async _tick() {   // One poll cycle: evict, check the duration cap, poll the worker, feed the noise filter, run reflex triggers, then re-schedule.
+    if (this.stopped) return;
+    this._evict();   // enforce the TTL on EVERY tick, not just on emit/get_events — a buffered frame must not outlive EVENT_TTL_MS even if the client never polls (codex HIGH, privacy §24.1)
+    if (Date.now() - this.startedAt > WATCH_MAX_DURATION_MS) { this._stop('max watch duration reached (auto-stopped)'); return; }
+    this.polls++;
+    const reset = this.polls === 1;   // the very first poll passes reset: true to the worker
+    const wa = { ...this.targetArgs, watchId: this.bgId };
+    if (reset) wa.reset = true;
+    const res = await viaWorker('poll_change', wa, OP_TIMEOUT_MS);
+    if (this.stopped) return;   // stop_watch/dispose may have fired during the await — don't resume into a torn-down session (codex HIGH race)
+    const p = res.payload || {};
+    if (res.isError) {
+      this.consecErrors++; this.lastError = String(p.error || 'poll failed');
+      if (this.consecErrors >= 5) { this._stop(`stopped after repeated poll errors: ${this.lastError}`); return; }   // five consecutive poll errors end the watch
+    } else {
+      this.consecErrors = 0;   // any successful poll resets the error streak
+      this.lastChangePct = p.changePct != null ? round2(p.changePct) : null;
+      this.lastPollTs = Date.now();
+      if (p.redacted) {
+        // A denylisted window overlaps the target -> no capture this frame. Treat as a blind gap (don't feed
+        // the filter a fake diff); surface the count in status so the agent knows the watch is partially blind.
+        this.redactedFrames++;
+      } else {
+        const baseline = p.baseline === true;     // includes a silent worker stateReset (fresh baseline)
+        const c = p.changePct != null ? p.changePct : 0;   // a missing changePct is treated as no change
+        const emit = this.filter.push({ changePct: c, now: this.lastPollTs, baseline });
+        if (emit) { try { await this._onEmit(emit, p); } catch (e) { this.lastError = `emit/capture failed: ${String((e && e.message) || e)}`; } }   // an emit failure is recorded in lastError, not thrown
+        // Reflex triggers run on the RAW per-tick change (not the settled-event path) — fast local beep/say/ocr.
+        if (!baseline && this.triggers.length) { try { await this._evalReflexes(c, this.lastPollTs); } catch (e) { this.lastError = `reflex failed: ${String((e && e.message) || e)}`; } }   // triggers are skipped on a baseline frame
+      }
+    }
+    if (!this.stopped) this._schedule();   // re-arm after the work completes, so ticks of one session never overlap
+  }
+  async _onEmit(emit, pollPayload) {   // Captures the settled frame via the worker and appends one event to the ring. `pollPayload` is not read in this method.
+    const cap = await viaWorker('capture', { ...this.targetArgs, outDir: this.outDir }, OP_TIMEOUT_MS);
+    if (this.stopped) {
+      materializeImages(cap.payload);   // stopped during the capture await — unlink the just-captured temp frame and drop it (don't push into a disposed session, codex HIGH race)
+      try { rmSync(this.outDir, { recursive: true, force: true }); } catch {}   // the worker may have recreated outDir to write that frame — clear the now-empty dir (codex r2 residual)
+      return;
+    }
+    const cp = cap.payload || {};
+    const ev = { seq: ++this.seq, ts: Date.now(), reason: emit.reason, peakPct: round2(emit.peakPct), activeMs: emit.activeMs, target: this.targetLabel };
+    if (cap.isError) {
+      ev.captureError = String(cp.error || 'capture failed');   // the event is still buffered, carrying the error instead of an image
+    } else if (cp.redacted) {
+      ev.redacted = true; ev.note = 'settled change detected but the frame was withheld (denylisted window in region)';
+    } else {
+      const items = materializeImages(cp);        // reads the temp PNG inline as base64 and unlinks it (§8 volatility)
+      const img = items[0];   // only the first image item is kept
+      if (img) {
+        const bytes = Buffer.from(img.data, 'base64').length;   // decoded size, used for the per-event and per-buffer caps
+        if (bytes <= EVENT_IMG_MAX_BYTES) { ev._img = img; ev.bytes = bytes; this.bufBytes += bytes; }
+        else ev.imageDropped = `image too large (${bytes} bytes)`;   // over the cap: keep the event metadata, drop the pixels
+      }
+      if (cp.outputSize) ev.outputSize = cp.outputSize;
+      if (cp.approxTokens != null) ev.approxTokens = cp.approxTokens;
+      if (cp.captureRect) ev.captureRect = cp.captureRect;
+    }
+    this.emitted++;
+    this.ring.push(ev);
+    this._evict();   // eviction pass right after the push
+  }
+  // Reflex evaluation: per trigger, fire when the raw change crosses its threshold — with hysteresis
+  // (re-arm only after it goes quiet), per-trigger cooldown, and (for ocr) a one-tick dwell so we read a
+  // settled frame rather than a half-drawn one (codex r1 MED). The reflex bypasses the noise-filter debounce
+  // for speed but keeps these throttles + the global speech budget (codex r1 — no denial-of-attention).
+  async _evalReflexes(changePct, now) {   // Evaluates every trigger against this tick's raw change; triggers are processed in order and each fire is awaited.
+    for (const trg of this.triggers) {
+      // A pending ocr dwell fires after dwellMs regardless of the current change (the point is a stable frame).
+      if (trg.pending) {
+        if (now - trg.pendingTs >= trg.dwellMs) { trg.pending = false; trg.armed = false; trg.lastFireTs = now; trg.fires++; await this._fireTrigger(trg); }
+        continue;   // while pending, the threshold/cooldown checks below are skipped entirely
+      }
+      if (changePct < trg.threshold * 0.5) trg.armed = true;            // hysteresis: re-arm once it settles below half
+      if (changePct < trg.threshold || !trg.armed) continue;
+      if (now - trg.lastFireTs < trg.cooldownMs) continue;             // per-trigger cooldown
+      if (trg.action === 'ocr') { trg.pending = true; trg.pendingTs = now; continue; }   // dwell one tick, then read. NOTE(review): parseTriggers also assigns dwellMs to 'vision' triggers, but only 'ocr' takes this dwell path — confirm whether 'vision' should dwell too
+      trg.armed = false; trg.lastFireTs = now; trg.fires++;   // disarm and start the cooldown before firing
+      await this._fireTrigger(trg);
+    }
+  }
+  async _fireTrigger(trg) {
+    if (this.stopped) return;
+    if (trg.action === 'beep') { this._reflexNote(trg, 'beep'); await viaWorker('beep', { pattern: trg.beep }); return; }
+    if (trg.action === 'say') { const r = reflexSpeak('say', trg.say); this._reflexNote(trg, r.ok ? 'say' : `say-skip:${r.reason}`); return; }
+    // 'vision' uses the local VLM if it's available this session; otherwise it degrades to OCR (A always works).
+    let kind = trg.action;   // 'ocr' | 'vision'
+    if (kind === 'vision') { const pv = await probeVlm(); if (this.stopped) return; if (!pv.available) kind = 'ocr'; }   // graceful degrade -> read text
+    // Outcome label: surface the degrade ("vision→ocr") in EVERY branch (redacted/nocapture/empty/said), not just success.
+    const tag = (k) => (trg.action === 'vision' && k === 'ocr') ? 'vision→ocr' : k;
+    // Capture the region through the SAME denylist-gated worker path (never an arbitrary file), then read it.
+    const cap = await viaWorker('capture', { ...this.targetArgs, outDir: this.outDir }, OP_TIMEOUT_MS);
+    if (this.stopped) { materializeImages(cap.payload); return; }
+    const cp = cap.payload || {};
+    if (cp.redacted) { this._reflexNote(trg, `${tag(kind)}-redacted`); return; }   // denylisted window in region -> stay blind
+    if (cap.isError || !cp.path) { this._reflexNote(trg, `${tag(kind)}-nocapture`); return; }
+    let text = null, usedKind = kind;
+    try {
+      if (kind === 'vision') {
+        text = await runVlm(cp.path);
+        // The probe said available but the LIVE call can still fail (model unloaded, timeout) — degrade to OCR (codex low).
+        if (text == null && !this.stopped) { usedKind = 'ocr'; text = await runOcr(cp.path); }
+      } else {
+        text = await runOcr(cp.path);
+      }
+    } finally { try { unlinkSync(cp.path); } catch {} }   // volatile: delete the crop after reading (§8)
+    if (this.stopped) return;
+    if (!text) { this._reflexNote(trg, `${tag(usedKind)}-empty`); return; }
+    const r = reflexSpeak(usedKind === 'vision' ? 'vision' : 'ocr', text);   // screen-derived (untrusted) -> provenance chime (or verbal prefix in spoken mode)
+    this._reflexNote(trg, r.ok ? `${tag(usedKind)}-said` : `${tag(usedKind)}-skip:${r.reason}`);
+  }
+  _reflexNote(trg, outcome) { this.reflexFires++; this.lastReflex = { ts: Date.now(), action: trg.action, outcome }; }
+  _evict() {
+    const cut = Date.now() - EVENT_TTL_MS;
+    while (this.ring.length && this.ring[0].ts < cut) this._drop(this.ring.shift());
+    while (this.ring.length > EVENT_RING_MAX) this._drop(this.ring.shift());
+    while (this.bufBytes > WATCH_BUF_MAX_BYTES && this.ring.length > 0) this._drop(this.ring.shift());
+  }
+  _drop(ev) { if (ev && ev.bytes) this.bufBytes -= ev.bytes; this.dropped++; }
+  drain(max = GET_EVENTS_MAX, maxImages = GET_EVENTS_MAX_IMAGES) {
+    this._evict();
+    // Drain only as far as we can return EVERYTHING we remove: stop BEFORE an image-bearing event that would
+    // exceed the per-call image cap, leaving it (and the rest) buffered for the next call (codex MED — never
+    // remove an event whose frame we then have to discard). Metadata-only events keep draining up to `max`.
+    const images = []; const records = [];
+    const cap = Math.max(1, max);
+    while (this.ring.length && records.length < cap) {
+      const ev = this.ring[0];
+      if (ev._img && images.length >= maxImages) break;   // would overflow images — leave it buffered, truthfully report remaining
+      this.ring.shift();
+      if (ev.bytes) this.bufBytes -= ev.bytes;
+      const rec = { ...ev }; delete rec._img;
+      if (ev._img) { images.push(ev._img); rec.image = 'inline'; }
+      records.push(rec);
+    }
+    return { records, images, remaining: this.ring.length };
+  }
+  status() {
+    this._evict();
+    return {
+      watchId: this.watchId, target: this.targetLabel, running: !this.stopped, stopReason: this.stopReason || undefined,
+      pollIntervalMs: this.pollIntervalMs, polls: this.polls, lastChangePct: this.lastChangePct,
+      ageMs: Date.now() - this.startedAt, buffered: this.ring.length, dropped: this.dropped, emitted: this.emitted,
+      redactedFrames: this.redactedFrames || undefined, filterPhase: this.filter.status.phase,
+      thresholds: { activityThreshold: this.cfg.activityThreshold, quietThreshold: this.cfg.quietThreshold, debounceQuietMs: this.cfg.debounceQuietMs, cooldownMs: this.cfg.cooldownMs, maxWaitMs: this.cfg.maxWaitMs },
+      triggers: this.triggers.length || undefined, reflexFires: this.reflexFires || undefined, lastReflex: this.lastReflex || undefined,
+      lastError: this.lastError || undefined,
+    };
+  }
+  // Stop the loop. On an INVOLUNTARY stop (auto-stop at max duration / repeated errors), the client may never
+  // call stop_watch, so we must not keep screen frames in RAM indefinitely: drop the captured frames now and
+  // remove the (otherwise empty) temp dir, but keep the lightweight metadata records so a later get_events can
+  // still report what happened (codex HIGH — privacy/TTL must hold without client cooperation, §24.1).
+  _stop(reason) {
+    if (this.stopped) return;
+    this.stopped = true; this.stopReason = reason;
+    if (this._timer) { clearTimeout(this._timer); this._timer = null; }
+    this._clearFrames();
+    try { rmSync(this.outDir, { recursive: true, force: true }); } catch {}
+  }
+  _clearFrames() {   // strip pixel data, keep metadata records
+    for (const ev of this.ring) { if (ev._img) { delete ev._img; ev.image = 'cleared (watch stopped)'; } }
+    this.bufBytes = 0;
+  }
+  dispose() {
+    this._stop(this.stopReason || 'disposed');
+    this.ring = []; this.bufBytes = 0;            // drop everything, including metadata
+  }
+}
+const watches = new Map();
+function startWatch(a) {
+  if (plat !== 'win32') return { payload: { error: 'start_watch is currently Windows-only', platform: plat }, isError: true };
+  // Require EXACTLY one fixed target — not "at least one" — so the worker's implicit precedence can't make a
+  // caller watch a different target than they passed (codex LOW). Cursor mode is disallowed (the cursor moves).
+  const targetCount = (a.region ? 1 : 0) + (a.window ? 1 : 0) + (a.monitor != null ? 1 : 0);
+  if (targetCount === 0) {
+    return { payload: { error: 'start_watch needs a fixed target — pass region, window, or monitor. Cursor mode is not allowed (the cursor moves, so every frame would differ).' }, isError: true };
+  }
+  if (targetCount > 1) {
+    return { payload: { error: 'start_watch takes exactly one target — pass only one of region, window, or monitor.' }, isError: true };
+  }
+  const watchId = a.watchId ? String(a.watchId) : 'default';
+  let replaced = false;
+  const existing = watches.get(watchId);
+  if (existing) { existing.dispose(); watches.delete(watchId); replaced = true; }
+  // Prune stopped/empty sessions before enforcing the cap, then reject if still over.
+  if (watches.size >= MAX_WATCHES) {
+    for (const [id, s] of watches) { if (s.stopped && s.ring.length === 0) { s.dispose(); watches.delete(id); } }
+  }
+  if (watches.size >= MAX_WATCHES) {
+    return { payload: { error: `too many concurrent watches (max ${MAX_WATCHES}) — stop one first with stop_watch.` }, isError: true };
+  }
+  const session = new WatchSession(watchId, a);
+  watches.set(watchId, session);
+  const status = session.start();
+  return { payload: { ok: true, action: replaced ? 'restarted' : 'started', ...status, hint: 'Watch runs in the background. Call get_events periodically to collect what changed; stop_watch when done.' }, isError: false };
+}
+function stopWatch(a) {
+  const watchId = a.watchId ? String(a.watchId) : null;
+  if (!watchId) {
+    const ids = [...watches.keys()];
+    let total = 0;
+    for (const [, s] of watches) { total += s.emitted; s.dispose(); }
+    watches.clear();
+    return { payload: { ok: true, action: 'stopped-all', stopped: ids, totalEmitted: total }, isError: false };
+  }
+  const s = watches.get(watchId);
+  if (!s) return { payload: { error: `no watch with id "${watchId}"`, active: [...watches.keys()] }, isError: true };
+  const summary = { ok: true, action: 'stopped', watchId, polls: s.polls, emitted: s.emitted, dropped: s.dropped, discardedUnread: s.ring.length, ageMs: Date.now() - s.startedAt };
+  s.dispose(); watches.delete(watchId);
+  return { payload: summary, isError: false };
+}
+// get_events returns the buffered events as text records plus their settled frames as MCP image items.
+function getEvents(a) {
+  const watchId = a.watchId ? String(a.watchId) : 'default';
+  const s = watches.get(watchId);
+  if (!s) return { result: { payload: { error: `no watch with id "${watchId}"`, active: [...watches.keys()] }, isError: true }, images: [] };
+  const max = Number.isFinite(Number(a.max)) ? Math.min(GET_EVENTS_MAX, Math.max(1, Math.floor(Number(a.max)))) : GET_EVENTS_MAX;
+  const { records, images, remaining } = s.drain(max);
+  const payload = { watchId, events: records, returned: records.length, remaining, status: s.status() };
+  return { result: { payload, isError: false }, images };
+}
 const TOOLS = [
   {
     name: 'probe',
@@ -316,6 +882,11 @@ const TOOLS = [
       additionalProperties: false,
     },
   },
+  {
+    name: 'classify_activity',
+    description: 'Adaptive companion: classify what the user is doing on the foreground window — returns { class: GAME|DEV|MEDIA|BROWSING|PRODUCTIVITY|UNKNOWN, process, title, notificationState, interruptible, canvas, uiaCount, fullscreen, profile, needsChangeRate }. Read-only, zero images. Use it to pick a help profile/cadence; for GAME, sample poll_change to split fast-action (break-gated) vs strategy (periodic). See docs/adaptive-companion.md.',
+    inputSchema: { type: 'object', properties: {}, additionalProperties: false },
+  },
   {
     name: 'capture_screen',
     description: 'Pixel capture — fallback for canvas/games that structured perception cannot read. Target (priority): region > window > monitor > (default) around the cursor. Returns a PNG path (volatility is the caller\'s job, §8).',
@@ -410,13 +981,93 @@ const TOOLS = [
       additionalProperties: false,
     },
   },
+  {
+    name: 'speak',
+    description:
+      "Speak ONE short line aloud in the user's language through the local TTS — the AGENT tier of the watch design. This is how you act as a companion while they look at a game / another screen: you SEE the screen (capture_screen / get_events), UNDERSTAND it, and say something useful — strategic advice, a warning about a threat/opportunity they might miss, brief commentary, or an answer to their question. These are YOUR OWN judged words (trusted), so they are spoken WITHOUT a provenance mark. Therefore do NOT pipe RAW screen text through here — summarize/judge it into your own sentence first; for unjudged raw screen text use an `ocr` trigger instead (it is marked as screen-derived). Keep it to one concise sentence. Globally rate-limited and never overlaps reflex speech (one set of ears). Windows-only.",
+    inputSchema: {
+      type: 'object',
+      properties: {
+        text: { type: 'string', description: 'The sentence to speak — your own words (a summary/advice/answer), not a raw screen dump. Kept short; long text is capped.' },
+      },
+      required: ['text'],
+      additionalProperties: false,
+    },
+  },
+  {
+    name: 'start_watch',
+    description:
+      'Start watching a fixed target in the BACKGROUND and return immediately (non-blocking). A built-in noise filter (debounce + cooldown) suppresses the per-frame ripple of video/games/scrolling and keeps only meaningful, SETTLED changes — so it works on screens that change every frame, where raw change-detection would flood you. Events accumulate in an in-memory buffer; collect them with get_events, end with stop_watch. Needs a fixed target (region/window/monitor — NOT the cursor). Tune with thresholds if needed. Windows-only. Auto-stops after 30 min.',
+    inputSchema: {
+      type: 'object',
+      properties: {
+        region: {
+          type: 'object',
+          description: 'Fixed region to watch (virtual-screen physical coordinates). Recommended for a game minimap/alert area.',
+          properties: { x: { type: 'number' }, y: { type: 'number' }, w: { type: 'number' }, h: { type: 'number' } },
+          required: ['x', 'y', 'w', 'h'],
+          additionalProperties: false,
+        },
+        window: { type: 'string', description: 'Window title substring to watch (tracks the window as it moves).' },
+        monitor: { type: 'string', description: "Monitor to watch: 1-based index ('2') or 'primary' (e.g. a game-only screen)." },
+        watchId: { type: 'string', description: 'Watch-session id (default "default"). Use distinct ids to run several watches at once; starting an existing id restarts it.' },
+        pollIntervalMs: { type: 'number', description: 'How often to sample the target (ms, default 600; clamped 400-5000). Lower = snappier + more CPU.' },
+        detail: { type: 'string', description: "Resolution preset for the captured settled frame: 'gist'/'normal'/'text'." },
+        activityThreshold: { type: 'number', description: 'Frame-to-frame % change that WAKES the filter (default 8). Set above your screen\'s ambient jitter (a playing video measures ~2.5-4%) and below a real transition (a scene cut ~16.8%).' },
+        quietThreshold: { type: 'number', description: 'Frame-to-frame % below which a frame counts as "still" (default 5). Hysteresis: forced below activityThreshold.' },
+        debounceQuietMs: { type: 'number', description: 'How long motion must stay quiet before the settled frame is emitted (ms, default 900 = quality gate).' },
+        cooldownMs: { type: 'number', description: 'Minimum gap between events (ms, default 6000 = frequency cap; suppresses ripples of one activity).' },
+        maxWaitMs: { type: 'number', description: 'If motion never settles, emit anyway this often (ms, default 8000 = anti-starvation for continuous motion).' },
+        triggers: {
+          type: 'array',
+          description: 'Reflex triggers (design §22.3): fire a local alert the INSTANT the watched region changes past a threshold — no cloud LLM round-trip, so it reaches the user in well under a second. Actions: `beep` (sound), `say` (speak a FIXED pre-written phrase — safest), `ocr` (read the region\'s text aloud via offline OCR), or `vision` (describe the scene via a LOCAL vision model if one is configured + fast enough, else auto-degrades to ocr). Spoken screen content is marked as screen-derived (by default a verbal "화면 글자:" / "로컬 비전:" prefix; a non-verbal chime when VORTEX_CU_SPEECH_PROVENANCE=earcon) and shaped; speech is globally rate-limited. Use a `beep`/`say` reflex for "ping me the moment X happens"; for the deeper, JUDGED commentary (advice, warnings, answers) look at the get_events frame and voice your own sentence with the `speak` tool — raw `ocr` readout is just a fallback.',
+          maxItems: 8,
+          items: {
+            type: 'object',
+            properties: {
+              action: { type: 'string', enum: ['beep', 'say', 'ocr', 'vision'], description: "What to do on a crossing." },
+              threshold: { type: 'number', description: 'Frame-to-frame % change that fires this trigger (default 12).' },
+              say: { type: 'string', description: "For action 'say': the fixed phrase to speak (e.g. '적 출현')." },
+              beep: { type: 'string', description: "For action 'beep': 'info'/'warn'/'urgent' (default warn)." },
+              cooldownMs: { type: 'number', description: 'Minimum gap between firings of this trigger (ms, default 8000).' },
+              dwellMs: { type: 'number', description: "For action 'ocr': wait this long after the change before reading, so the frame is settled (ms, default 700)." },
+            },
+            required: ['action'],
+            additionalProperties: false,
+          },
+        },
+      },
+      additionalProperties: false,
+    },
+  },
+  {
+    name: 'get_events',
+    description:
+      'Collect the changes a background watch (start_watch) has buffered since the last call — non-blocking, batched (so a long watch costs only a few looks). Returns one record per settled change (time, reason, change magnitude, capture metadata) plus the settled frames as inline images, and a status block (polls, buffered, dropped, filter phase). Drains what it returns; call again for any remainder.',
+    inputSchema: {
+      type: 'object',
+      properties: {
+        watchId: { type: 'string', description: 'Which watch to collect from (default "default").' },
+        max: { type: 'number', description: 'Max events to return this call (default/cap 12). Remaining stay buffered.' },
+      },
+      additionalProperties: false,
+    },
+  },
+  {
+    name: 'stop_watch',
+    description: 'Stop a background watch and discard its buffer + in-memory frames. Omit watchId to stop ALL watches. Returns a summary (polls, events emitted, unread discarded).',
+    inputSchema: {
+      type: 'object',
+      properties: {
+        watchId: { type: 'string', description: 'Which watch to stop. Omit to stop every active watch.' },
+      },
+      additionalProperties: false,
+    },
+  },
 ];
-const server = new Server({ name: 'computer-use', version: '0.0.1-poc' }, { capabilities: { tools: {} } });
-server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
-server.setRequestHandler(CallToolRequestSchema, async (req) => {
+// The CallTool handler — standalone (no SDK types), wired to the server only on the serve path.
+async function handleCallTool(req) {
   const { name, arguments: a = {} } = req.params;
   const useWorker = plat === 'win32';   // the resident worker is Windows PowerShell backend only
   let result;
@@ -427,6 +1078,11 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
   try {
   if (name === 'probe') {
     result = useWorker ? await viaWorker('probe', {}) : runBackend('probe');
+    // If a local VLM is configured, also report its availability (synthetic-image probe — no real screen sent).
+    // This is the only place probe touches the network, and only when the user opted in by setting an endpoint.
+    if (VLM.enabled && result && result.payload && typeof result.payload === 'object') {
+      try { result.payload.vlm = await probeVlm(); } catch (e) { result.payload.vlm = { available: false, reason: String((e && e.message) || e) }; }
+    }
   } else if (name === 'read_ui') {
     if (useWorker) {
       const wa = {};
@@ -439,6 +1095,25 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
       if (a.target) args.push('-Target', String(a.target));
       result = runBackend('read', args);
     }
+  } else if (name === 'classify_activity') {
+    if (plat !== 'win32') {
+      result = { payload: { error: 'classify_activity is currently Windows-only', platform: plat }, isError: true };
+    } else {
+      const raw = runBackend('classify', [], 8000);   // fast one-shot; 8s hard timeout so a hung UIA call can't freeze the loop
+      if (raw.isError) {
+        result = raw;
+      } else {
+        try {
+          const opts = { profiles: COMPANION_PROFILES };
+          if (process.env.VORTEX_CU_UIA_CANVAS_MAX) opts.uiaCanvasMax = Number(process.env.VORTEX_CU_UIA_CANVAS_MAX);
+          const d = classifyActivity(raw.payload, opts);
+          const p = raw.payload || {};
+          result = { payload: { ...d, redacted: !!p.redacted, reason: p.reason, procId: p.procId, hwnd: p.hwnd, uiaCapped: p.uiaCapped, uiaOk: p.uiaOk, notificationStateCode: p.notificationState } };
+        } catch (e) {
+          result = { payload: { error: 'classify failed', detail: String((e && e.message) || e), raw: raw.payload }, isError: true };
+        }
+      }
+    }
   } else if (name === 'capture_screen') {
     reqDir = mkdtempSync(join(tmpdir(), 'vortex-cu-'));
     if (useWorker) {
@@ -529,6 +1204,31 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
       if (a.durationMs != null) wa.durationMs = a.durationMs;
       result = await viaWorker('beep', wa);
     }
+  } else if (name === 'speak') {
+    // Agent-authored speech: trusted content -> no provenance mark (treated like a 'say' fixed phrase), but
+    // still shaped + globally budgeted + no-overlap with reflex speech (one set of ears). Non-blocking.
+    if (plat !== 'win32') {
+      result = { payload: { error: 'speak is currently Windows-only', platform: plat }, isError: true };
+    } else if (typeof a.text !== 'string' || !a.text.trim()) {
+      result = { payload: { ok: false, error: 'empty text' }, isError: true };
+    } else {
+      const r = reflexSpeak('agent', a.text);   // agent's judged words: no provenance mark, but redacted + shaped + budgeted
+      result = { payload: r.ok ? { ok: true, uttered: r.uttered } : { ok: false, skipped: r.reason } };
+    }
+  } else if (name === 'start_watch') {
+    result = startWatch(a);
+  } else if (name === 'stop_watch') {
+    result = stopWatch(a);
+  } else if (name === 'get_events') {
+    // get_events carries its OWN already-materialized image items (from the watch buffer) — return them directly
+    // and skip the generic materializeImages pass (there are no on-disk paths in the payload to re-read).
+    if (plat !== 'win32') {
+      result = { payload: { error: 'get_events is currently Windows-only', platform: plat }, isError: true };
+    } else {
+      const { result: r, images } = getEvents(a);
+      auditLog('get_events', r.payload, images);
+      return { content: [{ type: 'text', text: JSON.stringify(r.payload, null, 2) }, ...images], isError: r.isError };
+    }
   } else {
     return { content: [{ type: 'text', text: `unknown tool: ${name}` }], isError: true };
   }
@@ -538,10 +1238,11 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
   } finally {
     if (reqDir) { try { rmSync(reqDir, { recursive: true, force: true }); } catch {} }
   }
-});
+}
-// Clean up the worker on server shutdown (it would also auto-terminate via stdin EOF when the parent dies, but do it explicitly).
-process.on('exit', () => workerMgr.dispose());
+// Clean up the worker + any background watches on server shutdown (stops the loops, frees in-RAM frames,
+// removes the watch temp dirs). The worker also auto-terminates via stdin EOF when the parent dies.
+process.on('exit', () => { try { if (speakingChild) speakingChild.kill(); } catch {} for (const s of watches.values()) { try { s.dispose(); } catch {} } workerMgr.dispose(); });
 process.on('SIGINT', () => process.exit(0));
 process.on('SIGTERM', () => process.exit(0));
@@ -611,7 +1312,13 @@ function runInstall(argv) {
 if (process.argv.slice(2).includes('install')) {
   runInstall(process.argv.slice(2));
 } else {
-  const transport = new StdioServerTransport();
-  await server.connect(transport);
-  process.stderr.write(`[computer-use MCP] ready on stdio (worker=${plat === 'win32' ? 'on' : 'off'}; tools: probe, read_ui, capture_screen, watch_capture, poll_change, beep)\n`);
+  // Serve path: load the MCP SDK dynamically (so `install` never requires it), wire the handlers, connect.
+  const { Server } = await import('@modelcontextprotocol/sdk/server/index.js');
+  const { StdioServerTransport } = await import('@modelcontextprotocol/sdk/server/stdio.js');
+  const { ListToolsRequestSchema, CallToolRequestSchema } = await import('@modelcontextprotocol/sdk/types.js');
+  const server = new Server({ name: 'computer-use', version: '0.5.0' }, { capabilities: { tools: {} } });
+  server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
+  server.setRequestHandler(CallToolRequestSchema, handleCallTool);
+  await server.connect(new StdioServerTransport());
+  process.stderr.write(`[computer-use MCP] ready on stdio (worker=${plat === 'win32' ? 'on' : 'off'}; tools: probe, read_ui, classify_activity, capture_screen, watch_capture, poll_change, start_watch, get_events, stop_watch, beep, speak)\n`);
 }