@vortex-os/computer-use 0.2.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,21 +1,23 @@
1
1
  #!/usr/bin/env node
2
2
  // @vortex-os/computer-use — read-only screen-perception MCP stdio server (Windows-first).
3
- // Tools: probe · read_ui · capture_screen · watch_capture · poll_change · beep. Control is out of scope.
3
+ // Tools: probe · read_ui · classify_activity · capture_screen · watch_capture · poll_change · start_watch · get_events · stop_watch · beep · speak.
4
+ // Control is out of scope.
4
5
  // Two modes (bin `vortex-mcp-computer-use`):
5
6
  // - default: run the stdio server (what an MCP host launches).
6
7
  // - `install`: self-register into the project `.mcp.json` under the non-reserved key
7
8
  // `vortex-computer-use` (merge-safe). e.g. `npx vortex-mcp-computer-use install`.
8
- // Optional dep: @modelcontextprotocol/sdk (declared optional; imported by this entry `install`
9
- // just registers and exits without connecting a transport).
10
- import { Server } from '@modelcontextprotocol/sdk/server/index.js';
11
- import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
12
- import { ListToolsRequestSchema, CallToolRequestSchema } from '@modelcontextprotocol/sdk/types.js';
9
+ // Optional dep: @modelcontextprotocol/sdk loaded DYNAMICALLY only on the serve path (see the
10
+ // bottom dispatch), so `install` registers and exits without needing the SDK present.
13
11
  import { spawnSync, spawn } from 'node:child_process';
14
12
  import { fileURLToPath } from 'node:url';
15
13
  import { dirname, join } from 'node:path';
16
14
  import { readFileSync, unlinkSync, statSync, mkdtempSync, rmSync, existsSync, mkdirSync, writeFileSync, renameSync, appendFileSync } from 'node:fs';
17
15
  import { tmpdir, homedir } from 'node:os';
18
16
  import { createHmac, randomBytes } from 'node:crypto';
17
+ import { NoiseFilter, resolveFilterConfig } from './noise-filter.mjs';
18
+ import { sanitizeForSpeech, buildUtterance, estimateSpeechMs, SpeechBudget } from './speech-safety.mjs';
19
+ import { parseVlmConfig, vlmGate, buildChatBody, extractText, SYNTH_PNG_B64, DEFAULT_VLM_PROMPT } from './vlm.mjs';
20
+ import { classifyActivity } from './activity.mjs';
19
21
 
20
22
  const dir = dirname(fileURLToPath(import.meta.url));
21
23
  const plat = process.platform;
@@ -43,6 +45,43 @@ function loadRedactionConfig() {
43
45
  }
44
46
  const REDACTION = loadRedactionConfig();
45
47
 
48
+ // ── TTS / audio-ducking config (file < env precedence, like the denylist) ──────────────────
49
+ // Reads the `tts` section of computer-use.config.json and fills process.env ONLY where the matching env var is
50
+ // unset, so a user can tune voice/engine/ducking in the config FILE (no env needed) while env still wins. The
51
+ // values flow unchanged to the spawned speak helpers (speak-supertonic.mjs / speak.ps1), which read these env vars.
52
/**
 * Seed TTS / ducking environment variables from the `tts` section of
 * computer-use.config.json. An environment variable that is already set to a
 * non-empty value always wins; the file only fills the gaps. A missing or
 * unreadable/unparseable config file is treated as an empty section.
 */
function loadTtsConfig() {
  // Read the `tts` section, falling back to {} on any read/parse problem.
  const readTtsSection = () => {
    try {
      const file = join(dir, 'computer-use.config.json');
      if (!existsSync(file)) return {};
      const parsed = JSON.parse(readFileSync(file, 'utf8')) || {};
      return parsed.tts || {};
    } catch {
      return {};
    }
  };
  const tts = readTtsSection();

  // Write `value` into process.env[name] only when the file provides one and env does not.
  const fillEnv = (name, value) => {
    if (value === undefined || value === null) return;
    const current = process.env[name];
    if (current === undefined || current === '') process.env[name] = String(value);
  };

  const fromFile = [
    ['VORTEX_CU_TTS_ENGINE', tts.engine],        // 'auto' (default) | 'supertonic' | 'heami'
    ['VORTEX_CU_TTS_VOICE', tts.voice],          // Supertonic voice: F1..F5 / M1..M5
    ['VORTEX_CU_TTS_MODEL_DIR', tts.modelDir],   // Supertonic model cache directory
    ['VORTEX_CU_TTS_LANG', tts.lang],            // spoken language
    // Ducking is only ever switched OFF from the file; any other `duck` value leaves env untouched.
    ['VORTEX_CU_DUCK', tts.duck === false ? 'off' : undefined],
    ['VORTEX_CU_DUCK_FACTOR', tts.duckFactor],   // volume factor applied to other apps during speech
  ];
  for (const [name, value] of fromFile) fillEnv(name, value);
}
66
+ loadTtsConfig();
67
+
68
+ // ── companion (adaptive screen companion) config — same file<env precedence ──
69
+ // `companion.uiaCanvasMax` tunes the GPU-canvas (game/video) UIA cutoff; `companion.profiles` overrides per-class
70
+ // cadence/proactivity (e.g. { "GAME": { "cadenceSec": 20 } }). Consumed by classify_activity.
71
+ let COMPANION_PROFILES = {};
72
/**
 * Load the `companion` section of computer-use.config.json.
 *
 * - `companion.uiaCanvasMax` is copied into VORTEX_CU_UIA_CANVAS_MAX, but only
 *   when that env var is unset or empty (env wins over the file).
 * - `companion.profiles` replaces the module-level COMPANION_PROFILES map. Only a
 *   plain (non-array) object is accepted; anything else resets the map to {}.
 *
 * A missing or unreadable/unparseable config file is treated as an empty section.
 */
function loadCompanionConfig() {
  let cfg = {};
  try {
    const cfgPath = join(dir, 'computer-use.config.json');
    if (existsSync(cfgPath)) cfg = (JSON.parse(readFileSync(cfgPath, 'utf8')) || {}).companion || {};
  } catch {
    // Best-effort: a broken config file must not prevent the server from starting.
  }

  const envValue = process.env.VORTEX_CU_UIA_CANVAS_MAX;
  const envUnset = envValue === undefined || envValue === '';
  if (cfg.uiaCanvasMax != null && envUnset) {
    process.env.VORTEX_CU_UIA_CANVAS_MAX = String(cfg.uiaCanvasMax);
  }

  // `typeof [] === 'object'`, so arrays must be excluded explicitly: profiles are keyed by
  // activity class name, and an array would silently never match any class.
  const profiles = cfg.profiles;
  const isProfileMap = Boolean(profiles) && typeof profiles === 'object' && !Array.isArray(profiles);
  COMPANION_PROFILES = isProfileMap ? profiles : {};
}
83
+ loadCompanionConfig();
84
+
46
85
  // ── audit log (§8: metadata/HMAC only, original image not stored) ──────────────────────
47
86
  // Location = under LocalAppData (outside the instance data/ -> won't leak via corporate sync, codex MEDIUM). The key lives there too.
48
87
  const AUDIT_DIR = join(process.env.LOCALAPPDATA || join(homedir(), '.local', 'share'), 'vortex-computer-use', 'audit');
@@ -90,6 +129,7 @@ const B = {
90
129
  win32: {
91
130
  probe: ['pwsh', ['-NoProfile', '-File', join(dir, 'probe.ps1')]],
92
131
  read: ['pwsh', ['-NoProfile', '-File', join(dir, 'read-ui.ps1')]],
132
+ classify: ['pwsh', ['-NoProfile', '-File', join(dir, 'classify.ps1')]],
93
133
  capture: ['pwsh', ['-NoProfile', '-File', join(dir, 'point-to-ask.ps1')]],
94
134
  },
95
135
  darwin: {
@@ -100,11 +140,15 @@ const B = {
100
140
  };
101
141
 
102
142
  // Returns: { payload, isError } — backend abnormal exit / non-JSON output is surfaced as isError (so an error never flows as if normal in the watch loop).
103
- function runBackend(kind, extraArgs = []) {
143
+ function runBackend(kind, extraArgs = [], timeoutMs = 0) {
104
144
  const b = B[plat]?.[kind];
105
145
  if (!b) return { payload: { error: `unsupported platform/op: ${plat}/${kind}`, grade: 'P0 (manual) fallback' }, isError: true };
106
146
  const [exe, base] = b;
107
- const r = spawnSync(exe, [...base, ...extraArgs], { encoding: 'utf8', maxBuffer: 8 * 1024 * 1024 });
147
+ // Optional hard timeout: a one-shot tool (e.g. classify_activity) must not let a hung COM/UIA call freeze the
148
+ // synchronous spawn (and thus the event loop). On timeout, spawnSync kills the child and sets r.error.
149
+ const opts = { encoding: 'utf8', maxBuffer: 8 * 1024 * 1024 };
150
+ if (timeoutMs > 0) { opts.timeout = timeoutMs; opts.killSignal = 'SIGKILL'; }
151
+ const r = spawnSync(exe, [...base, ...extraArgs], opts);
108
152
  if (r.error) return { payload: { error: String(r.error) }, isError: true };
109
153
  const failed = r.status !== 0;
110
154
  try {
@@ -298,6 +342,528 @@ async function viaWorker(op, args, timeoutMs) {
298
342
  catch (e) { return { payload: { error: String((e && e.message) || e) }, isError: true }; }
299
343
  }
300
344
 
345
+ // ── watch sessions: background noise-filtered watch + in-memory event buffer (design §22.1·§22.2) ──
346
+ // start_watch spins a non-blocking poll loop OWNED BY THIS SERVER — not a separate process, and not the
347
+ // single worker's request slot (the worker stays a dumb single-shot capture engine; long watches must
348
+ // not occupy it, codex #1). Each tick polls the target's frame-to-frame change via the worker, feeds it
349
+ // to a NoiseFilter (debounce + cooldown + maxWait), and on an emit captures the settled frame and
350
+ // appends an event to a bounded in-memory ring buffer. get_events drains the buffer (non-blocking,
351
+ // batched -> few LLM looks); stop_watch ends it. The buffer is memory-only with count/byte/TTL caps and
352
+ // the frames live in RAM only (design §24.1 — no screen history on disk; the brief capture temp file is
353
+ // materialized inline + unlinked at once). Denylist + volatility apply per frame via the reused worker ops.
354
+ // Concurrency is deliberately conservative: every watch tick enqueues a poll_change on the SAME single
355
+ // PowerShell worker the foreground tools use, so too many fast watches would starve the agent's own
356
+ // capture_screen/read_ui calls (codex MED). 4 watches at a 400ms floor caps worst-case worker pressure at
357
+ // ~10 polls/s; a global foreground-priority queue is the documented next step if this proves tight.
358
+ const MAX_WATCHES = 4; // concurrent background watches cap
359
+ const WATCH_MIN_INTERVAL = 400, WATCH_MAX_INTERVAL = 5000;
360
+ const WATCH_DEFAULT_INTERVAL = 600;
361
+ const WATCH_MAX_DURATION_MS = 30 * 60 * 1000; // auto-stop a forgotten watch (privacy §8 + runaway guard)
362
+ const EVENT_RING_MAX = 64; // max buffered events per watch (oldest dropped when full)
363
+ const EVENT_TTL_MS = 5 * 60 * 1000; // buffered events older than this are evicted unread (§24.1 TTL)
364
+ const EVENT_IMG_MAX_BYTES = 4 * 1024 * 1024; // per-event inline image cap
365
+ const WATCH_BUF_MAX_BYTES = 24 * 1024 * 1024; // total inline image bytes held across one watch buffer
366
+ const GET_EVENTS_MAX = 12; // events returned per get_events call
367
+ const GET_EVENTS_MAX_IMAGES = 8; // image items returned per get_events call (MCP response bound)
368
// Round a numeric-like value to two decimals; anything that does not coerce to a finite number is returned untouched.
const round2 = (n) => {
  const asNumber = Number(n);
  if (!Number.isFinite(asNumber)) return n;
  return Math.round(asNumber * 100) / 100;
};
369
+
370
/**
 * Translate caller-facing watch arguments into the flat target-argument object
 * handed to the worker. Only options that are present are copied over.
 *
 * @param {object} a - caller arguments (`region`, `window`, `monitor`, `boxW`, `boxH`, `detail`).
 * @returns {object} target args: `region` as "x,y,w,h", `windowMatch`, `monitor`, `boxW`, `boxH`, `detail`.
 */
function buildWatchTargetArgs(a) {
  const { region, window: windowTitle, monitor, boxW, boxH, detail } = a;
  const target = {};

  if (region) {
    const { x, y, w, h } = region;
    target.region = [x, y, w, h].map(String).join(',');
  }
  if (windowTitle) {
    target.windowMatch = String(windowTitle);
  }
  // Monitor index 0 is valid, so test for null/undefined rather than truthiness.
  if (monitor != null) {
    target.monitor = String(monitor);
  }
  if (boxW) {
    target.boxW = boxW;
  }
  if (boxH) {
    target.boxH = boxH;
  }
  if (detail) {
    target.detail = String(detail);
  }
  return target;
}
380
/**
 * Human-readable label for a watch target. Precedence: region, then window,
 * then monitor; with none of them given the target is the cursor.
 *
 * @param {object} a - caller arguments (`region`, `window`, `monitor`).
 * @returns {string} label text.
 */
function watchTargetLabel(a) {
  const { region, window: windowTitle, monitor } = a;
  if (region) {
    const { x, y, w, h } = region;
    return `region ${x},${y} ${w}x${h}`;
  }
  if (windowTitle) {
    return `window "${windowTitle}"`;
  }
  // Monitor 0 is a real monitor, hence the null check instead of truthiness.
  return monitor != null ? `monitor ${monitor}` : 'cursor';
}
386
+
387
+ // ── reflex path: fixed-phrase / OCR readout spoken LOCALLY, no cloud round-trip (design §22.3) ──
388
+ // A registered trigger crossing fires beep / say(fixed phrase) / ocr(read the region's text) directly from
389
+ // the watch loop. Speech goes through the GLOBAL speech safety (codex r1): screen-derived text (ocr) is
390
+ // never voiced raw — it gets a "화면 글자:" provenance prefix + control/secret shaping + a per-minute
391
+ // utterance & seconds budget with no-overlap and auto-mute. OCR uses the in-box Windows PowerShell 5.1
392
+ // (pwsh 7 can't load WinRT OCR); TTS uses pwsh 7 System.Speech. Both are spawned (non-blocking) and
393
+ // degrade silently if absent. The OCR crop comes from the SAME denylist-gated worker capture (never an
394
+ // arbitrary file), so a denylisted window blocks reflex OCR too (codex r1 MED).
395
+ const PS51 = join(process.env.WINDIR || 'C:\\Windows', 'System32', 'WindowsPowerShell', 'v1.0', 'powershell.exe');
396
+ const OCR_SCRIPT = join(dir, 'ocr.ps1');
397
+ const SPEAK_SCRIPT = join(dir, 'speak.ps1');
398
+ const OCR_LANG = process.env.VORTEX_CU_OCR_LANG || 'ko';
399
+ const SPEAK_TOWAV_DIR = process.env.VORTEX_CU_SPEAK_TOWAV_DIR || ''; // test hook: render speech to WAV instead of audio
400
+ const speechBudget = new SpeechBudget(); // GLOBAL across all watches (one set of ears)
401
+ let speakingChild = null;
402
+ let speakSeq = 0;
403
+
404
+ const MAX_SPEAK_MS = 30000; // hard upper bound for one utterance so a hung TTS can't hold the no-overlap lock forever (codex r2 MED)
405
+
406
+ // Provenance for screen-derived speech (ocr/vision): by DEFAULT a self-documenting verbal prefix ("화면 글자:" …)
407
+ // so even a first-time listener knows the source — this is the prior HIGH control (voicing raw screen text is a
408
+ // social-engineering channel) and a chime alone can't convey it, so spoken stays the default (codex r1 HIGH).
409
+ // VORTEX_CU_SPEECH_PROVENANCE=earcon is an explicit opt-in to a non-verbal chime instead. Either way the source
410
+ // is marked. Agent/user-authored speech ('agent' via the `speak` tool, and a fixed 'say' phrase) is trusted
411
+ // content and carries NO provenance mark.
412
+ const SPEECH_PROVENANCE_EARCON = process.env.VORTEX_CU_SPEECH_PROVENANCE === 'earcon';
413
+ const EARCON_EST_MS = 250; // chime duration to reserve against the speech budget when earcon mode is on (codex r1 LOW)
414
+
415
+ // ── TTS engine: Supertonic (separate-install ONNX neural, higher quality) with Heami fallback ──
416
+ // The audio-RENDER step only. The safety/budget/provenance layer above (buildUtterance + speechBudget) is
417
+ // engine-agnostic; only the final spawn differs. Engine is resolved ONCE at startup (not per-utterance): 'auto'
418
+ // picks Supertonic when its models + onnxruntime-node are present, else the always-available Heami (speak.ps1).
419
+ // VORTEX_CU_TTS_ENGINE=auto|supertonic|heami. Models live in VORTEX_CU_TTS_MODEL_DIR (fetch-supertonic.mjs writes
420
+ // the default ~/.vortex/computer-use/supertonic-3). Voice VORTEX_CU_TTS_VOICE (F1..F5/M1..M5), lang follows OCR_LANG.
421
+ const SPEAK_SUPERTONIC = join(dir, 'speak-supertonic.mjs');
422
+ const TTS_ENGINE_CFG = (process.env.VORTEX_CU_TTS_ENGINE || 'auto').toLowerCase();
423
+ const TTS_MODEL_DIR = process.env.VORTEX_CU_TTS_MODEL_DIR || join(homedir(), '.vortex', 'computer-use', 'supertonic-3');
424
+ const TTS_VOICE = process.env.VORTEX_CU_TTS_VOICE || 'F1';
425
+ const TTS_LANG = process.env.VORTEX_CU_TTS_LANG || OCR_LANG;
426
/**
 * Report whether the Supertonic engine can be used: every required model file
 * exists under TTS_MODEL_DIR/onnx, the selected voice style file exists, and
 * `onnxruntime-node` resolves. Any exception along the way counts as "not available".
 *
 * @returns {boolean}
 */
function supertonicAvailable() {
  const MODEL_FILES = [
    'duration_predictor.onnx',
    'text_encoder.onnx',
    'vector_estimator.onnx',
    'vocoder.onnx',
    'tts.json',
    'unicode_indexer.json',
  ];
  try {
    const onnxDir = join(TTS_MODEL_DIR, 'onnx');
    for (const name of MODEL_FILES) {
      if (!existsSync(join(onnxDir, name))) return false;
    }
    const voiceFile = join(TTS_MODEL_DIR, 'voice_styles', `${TTS_VOICE}.json`);
    if (!existsSync(voiceFile)) return false;
    // Throws when the optional dependency is not installed, which lands in the catch below.
    import.meta.resolve('onnxruntime-node');
  } catch {
    return false;
  }
  return true;
}
436
+ // 'heami' forces SAPI; 'supertonic'/'auto' use Supertonic when actually available, else fall back (never go mute).
437
+ const TTS_ENGINE = TTS_ENGINE_CFG === 'heami' ? 'heami'
438
+ : ((TTS_ENGINE_CFG === 'supertonic' || TTS_ENGINE_CFG === 'auto') && supertonicAvailable() ? 'supertonic' : 'heami');
439
+
440
+ // Speak a finalized utterance without blocking the watch loop.
441
+ // kind: 'agent' (the `speak` tool — agent's judged words, no mark, redacted) | 'say' (fixed phrase, no mark)
442
+ // | 'ocr' | 'vision' (screen-derived, untrusted — marked + shaped).
443
/**
 * Render one utterance on the resolved TTS engine without blocking the caller.
 *
 * @param {string} kind - 'agent' | 'say' | 'ocr' | 'vision'. 'ocr' and 'vision' are
 *   screen-derived; in earcon mode they are shaped with sanitizeForSpeech and the helper
 *   is asked to play a chime, otherwise buildUtterance(kind, text) produces the phrase.
 * @param {string} text - raw text to speak.
 * @returns {object} `{ ok: true, uttered }` once the helper is spawned;
 *   `{ ok: false, reason: 'empty' }` when shaping leaves nothing to say; the
 *   speechBudget.tryReserve result when the reservation is refused; or
 *   `{ ok: false, reason: 'spawn-failed' }` when spawning throws.
 */
function reflexSpeak(kind, text) {
  const fromScreen = ['ocr', 'vision'].includes(kind);
  const chime = fromScreen && SPEECH_PROVENANCE_EARCON;
  const phrase = chime ? sanitizeForSpeech(text) : buildUtterance(kind, text);
  if (!phrase) return { ok: false, reason: 'empty' };

  // Reserve the estimated speaking time (plus the chime, when one will be played).
  const reserveMs = estimateSpeechMs(phrase) + (chime ? EARCON_EST_MS : 0);
  const slot = speechBudget.tryReserve(reserveMs, Date.now());
  if (!slot.ok) return slot;

  try {
    const useSupertonic = TTS_ENGINE === 'supertonic';
    const exe = useSupertonic ? process.execPath : 'pwsh';
    const argv = useSupertonic
      ? [SPEAK_SUPERTONIC, '--text', phrase, '--voice', TTS_VOICE, '--lang', TTS_LANG, '--model-dir', TTS_MODEL_DIR]
      : ['-NoProfile', '-File', SPEAK_SCRIPT, '-Text', phrase];
    if (chime) {
      argv.push(...(useSupertonic ? ['--earcon'] : ['-Earcon', 'screen']));
    }
    if (SPEAK_TOWAV_DIR) {
      // Test hook: render to a sequentially numbered WAV file instead of the audio device.
      speakSeq += 1;
      argv.push(useSupertonic ? '--to-wav' : '-ToWav', join(SPEAK_TOWAV_DIR, `utt-${speakSeq}.wav`));
    }

    const child = spawn(exe, argv, { stdio: 'ignore' });
    speakingChild = child;

    // The budget is released at most once per child, and only while this child is still the
    // registered speaker, so a late exit/error event cannot free a newer utterance's slot.
    let freed = false;
    let watchdog = null;
    const settle = () => {
      if (freed) return;
      freed = true;
      clearTimeout(watchdog);
      if (speakingChild !== child) return;
      speakingChild = null;
      speechBudget.release();
    };
    // Hard upper bound: kill a helper that is still running after MAX_SPEAK_MS and free the slot.
    watchdog = setTimeout(() => {
      try { child.kill(); } catch {}
      settle();
    }, MAX_SPEAK_MS);
    if (watchdog.unref) watchdog.unref();
    child.on('exit', settle);
    child.on('error', settle);

    return { ok: true, uttered: phrase };
  } catch {
    speechBudget.release();
    return { ok: false, reason: 'spawn-failed' };
  }
}
479
+
480
+ // OCR a worker-captured temp PNG via the 5.1 helper, hard time-bounded; returns recognized text or null.
481
+ // Resolves only AFTER the child has CLOSED (even on timeout: kill, then wait for close) so the PNG file
482
+ // handle is released before the caller unlinks the crop — otherwise a still-open handle leaves a screen
483
+ // crop on disk (codex r2 MED).
484
/**
 * Run the OCR helper script on a PNG and resolve with the recognized text.
 *
 * The promise resolves with null when spawning fails, when the child emits an
 * error, when the run was aborted (6 s deadline or more than 65536 characters of
 * output), or when stdout is not the expected `{ ok, text }` JSON. Except for the
 * spawn/error cases it resolves only from the child's 'close' event, i.e. after
 * the helper has gone away, so the caller can safely delete the image afterwards.
 *
 * @param {string} pngPath - path of the image to read.
 * @returns {Promise<string|null>}
 */
function runOcr(pngPath) {
  return new Promise((resolve) => {
    let child;
    try {
      child = spawn(
        PS51,
        ['-NoProfile', '-NonInteractive', '-ExecutionPolicy', 'Bypass', '-File', OCR_SCRIPT, '-ImagePath', pngPath, '-Lang', OCR_LANG],
        { stdio: ['ignore', 'pipe', 'ignore'] },
      );
    } catch {
      resolve(null);
      return;
    }

    let stdoutText = '';
    let aborted = false;
    let finished = false;
    let deadline = null;

    // Mark the run as aborted and ask the child to die; resolution still waits for 'close'.
    const abort = () => {
      aborted = true;
      try { child.kill(); } catch {}
    };
    const finish = (value) => {
      if (finished) return;
      finished = true;
      clearTimeout(deadline);
      resolve(value);
    };
    const parseOutput = () => {
      try {
        const parsed = JSON.parse(stdoutText.trim());
        return parsed && parsed.ok ? parsed.text : null;
      } catch {
        return null;
      }
    };

    deadline = setTimeout(abort, 6000);
    child.stdout.setEncoding('utf8');
    child.stdout.on('data', (chunk) => {
      stdoutText += chunk;
      if (stdoutText.length > 65536) abort();
    });
    child.on('error', () => finish(null));
    child.on('close', () => finish(aborted ? null : parseOutput()));
  });
}
498
+
499
+ // ── local VLM "middle path" (design §22.3 / §23.2 / §24): OPTIONAL, GPU-gated, off unless a trusted fast
500
+ // local endpoint is reachable. The reflex/brain paths work with no GPU; this only adds a smarter local
501
+ // description when the hardware allows. Capability is PROBED per session (never stored), with a SYNTHETIC
502
+ // image first (never a real crop before the endpoint is trusted), and gated on a measured latency SLA.
503
+ const VLM = parseVlmConfig();
504
+ const VLM_PROBE_TTL = 5 * 60 * 1000;
505
+ let vlmProbe = null, vlmProbeAt = 0, vlmProbing = null;
506
+
507
+ const VLM_MAX_CROP_BYTES = 6 * 1024 * 1024; // bound the data: URL we send (crop is already size-bounded; defence-in-depth)
508
+ const VLM_MAX_RESP_BYTES = 256 * 1024; // a short description reply is tiny — cap a hostile/huge response (codex MED)
509
+
510
+ // Read a response body up to maxBytes, then stop (so a huge/streaming reply can't exhaust memory).
511
/**
 * Read a response body as UTF-8 text, giving up once it exceeds `maxBytes`.
 *
 * When the body exposes a stream reader, chunks are counted as they arrive and
 * the stream is cancelled as soon as the running total passes the cap. Without a
 * reader the whole text is read first and then measured.
 *
 * @param {{ body?: { getReader?: Function }, text?: Function }} r - fetch-style response.
 * @param {number} maxBytes - maximum accepted body size in BYTES.
 * @returns {Promise<string|null>} the decoded text, or null when the cap was exceeded.
 */
async function readCappedText(r, maxBytes) {
  const reader = r.body && r.body.getReader ? r.body.getReader() : null;
  if (!reader) {
    const text = await r.text();
    // Measure encoded bytes, not UTF-16 code units: multi-byte text (e.g. Korean) would
    // otherwise slip past the cap at up to ~3x its intended size.
    return Buffer.byteLength(text, 'utf8') <= maxBytes ? text : null;
  }

  let total = 0;
  const parts = [];
  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    if (!value) continue;
    total += value.length;
    if (total > maxBytes) {
      try { await reader.cancel(); } catch {}
      return null;
    }
    parts.push(Buffer.from(value));
  }
  // Concatenate before decoding so a character split across chunks decodes correctly.
  return Buffer.concat(parts).toString('utf8');
}
522
+
523
/**
 * POST a chat-completions request to the configured VLM endpoint and parse the JSON reply.
 *
 * The whole exchange (request and body read) is bounded by `timeoutMs` through an
 * AbortController. Redirects are not followed. The reply is size-capped at
 * VLM_MAX_RESP_BYTES. Never throws: every failure is reported in the result object.
 *
 * @param {object} body - request payload, serialized with JSON.stringify.
 * @param {number} timeoutMs - abort deadline in milliseconds.
 * @returns {Promise<object>} `{ ok: true, json }` on success; otherwise `{ ok: false, ... }`
 *   carrying `status`/`redirected` for a non-2xx or redirect reply, or `error`
 *   ('timeout', 'response too large', 'response exceeded cap', 'non-JSON response',
 *   or the underlying error message).
 */
async function httpChat(body, timeoutMs) {
  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), timeoutMs);
  // A response whose body is never consumed keeps its connection tied up until GC; cancel it
  // explicitly on every path that returns without reading. Fire-and-forget by design.
  const discardBody = (resp) => {
    try { void resp.body?.cancel().catch(() => {}); } catch {}
  };
  try {
    const headers = { 'content-type': 'application/json' };
    if (VLM.key) headers.authorization = `Bearer ${VLM.key}`;
    // redirect:'manual' so a 3xx cannot replay this POST (possibly carrying a real crop) to a
    // different host; any non-2xx reply is simply treated as a failure.
    const r = await fetch(`${VLM.endpoint}/chat/completions`, {
      method: 'POST',
      headers,
      body: JSON.stringify(body),
      signal: ctrl.signal,
      redirect: 'manual',
    });
    const redirected = r.type === 'opaqueredirect';
    if (!r.ok || redirected) {
      discardBody(r);
      return { ok: false, status: r.status, redirected };
    }
    // Cheap early rejection when the server announces an oversized body.
    const clen = Number(r.headers.get('content-length') || 0);
    if (clen && clen > VLM_MAX_RESP_BYTES) {
      discardBody(r);
      return { ok: false, error: 'response too large' };
    }
    const txt = await readCappedText(r, VLM_MAX_RESP_BYTES);
    if (txt === null) {
      discardBody(r);
      return { ok: false, error: 'response exceeded cap' };
    }
    try {
      return { ok: true, json: JSON.parse(txt) };
    } catch {
      return { ok: false, error: 'non-JSON response' };
    }
  } catch (e) {
    return { ok: false, error: String((e && e.name === 'AbortError') ? 'timeout' : (e && e.message) || e) };
  } finally {
    clearTimeout(timer);
  }
}
541
+
542
+ // Probe the VLM with a SYNTHETIC image only (no real screen) — measure reachability + latency, confirm a
543
+ // usable reply. Cached per process for VLM_PROBE_TTL. Note: synthetic latency is a LOWER bound (a real crop
544
+ // is larger/slower), so it gates OUT a too-slow endpoint but doesn't guarantee a real call is within budget.
545
/**
 * Decide whether the local VLM can be used, by sending one request that carries
 * only the synthetic test image and timing it against the configured SLA.
 *
 * The outcome (available or not) is cached for VLM_PROBE_TTL, and concurrent
 * callers share a single in-flight probe. Never rejects: an unexpected failure
 * while probing is reported as `{ available: false, reason }`.
 *
 * @returns {Promise<object>} `{ available: true, tier, model, latencyMs }` or
 *   `{ available: false, reason, tier, latencyMs? }`.
 */
async function probeVlm() {
  const gate = vlmGate(VLM);
  if (!gate.ok) return { available: false, reason: gate.reason, tier: gate.tier };

  const now = Date.now();
  if (vlmProbe && now - vlmProbeAt < VLM_PROBE_TTL) return vlmProbe;
  if (vlmProbing) return vlmProbing;

  const pending = (async () => {
    let out;
    try {
      const t0 = Date.now();
      const res = await httpChat(buildChatBody(VLM.model, '이미지가 보이면 ok 라고만 답해.', SYNTH_PNG_B64, 8), VLM.slaMs);
      const latencyMs = Date.now() - t0;
      if (!res.ok) {
        out = { available: false, reason: `endpoint not reachable (${res.error || 'http ' + res.status})`, tier: gate.tier };
      } else if (latencyMs > VLM.slaMs) {
        out = { available: false, reason: `too slow (${latencyMs}ms > ${VLM.slaMs}ms SLA)`, tier: gate.tier, latencyMs };
      } else {
        out = { available: true, tier: gate.tier, model: VLM.model, latencyMs };
      }
    } catch (e) {
      // A throw while building or sending the probe must not leave a rejected promise behind
      // as the shared in-flight probe; report it as an ordinary "unavailable" outcome.
      out = { available: false, reason: `probe failed (${String((e && e.message) || e)})`, tier: gate.tier };
    }
    vlmProbe = out;
    vlmProbeAt = Date.now();
    return out;
  })();

  vlmProbing = pending;
  // Clear the in-flight marker once settled. Registered after the assignment above so it also
  // works when the probe body finishes without ever awaiting (e.g. a synchronous throw).
  const clearInFlight = () => { if (vlmProbing === pending) vlmProbing = null; };
  pending.then(clearInFlight, clearInFlight);
  return pending;
}
564
+
565
+ // Describe a worker-captured crop with the local VLM. Returns a short text or null. Output is UNTRUSTED
566
+ // (the caller speaks it via buildUtterance('vision', …) so it gets the "로컬 비전:" prefix + shaping + budget).
567
/**
 * Ask the configured VLM to describe a PNG crop.
 *
 * @param {string} pngPath - path of the crop to send.
 * @returns {Promise<string|null>} the extracted reply text, or null when the file is
 *   missing/unreadable, larger than VLM_MAX_CROP_BYTES, the request fails, or the
 *   reply contains no text.
 */
async function runVlm(pngPath) {
  // Load the crop as base64, refusing oversized files before reading them.
  const loadCrop = () => {
    try {
      const { size } = statSync(pngPath);
      if (size > VLM_MAX_CROP_BYTES) return null;
      return readFileSync(pngPath).toString('base64');
    } catch {
      return null;
    }
  };

  const imageB64 = loadCrop();
  if (imageB64 === null) return null;

  const request = buildChatBody(VLM.model, DEFAULT_VLM_PROMPT, imageB64, VLM.maxTokens);
  // A real crop gets three times the SLA as its deadline, capped at 20 s.
  const deadlineMs = Math.min(20000, VLM.slaMs * 3);
  const reply = await httpChat(request, deadlineMs);
  if (!reply.ok) return null;

  const description = extractText(reply.json);
  return description || null;
}
577
+
578
+ // Validate caller triggers into safe internal trigger objects. Triggers evaluate against the watch's main
579
+ // target region. Capped, clamped, action-gated. `say` content is sanitized (not screen-derived → not redacted).
580
/**
 * Turn caller-supplied trigger descriptions into validated internal trigger objects.
 *
 * Only the first 8 entries are considered and non-object entries are dropped.
 * Per trigger:
 *  - `action`: 'say' | 'ocr' | 'vision'; anything else becomes 'beep'.
 *  - `threshold`: clamped to 0.5..100, default 12.
 *  - `cooldownMs`: floored, clamped to 1500..600000, default 8000.
 *  - 'say' gets `say` (sanitized phrase, '알림' when empty); 'beep' gets `beep`
 *    ('info' | 'warn' | 'urgent', default 'warn'); 'ocr'/'vision' get `dwellMs`
 *    (clamped to 0..3000, default 700).
 *
 * @param {*} raw - caller input; anything but an array yields [].
 * @returns {object[]} trigger objects including their runtime state fields.
 */
function parseTriggers(raw) {
  if (!Array.isArray(raw)) return [];

  const SPOKEN_ACTIONS = ['say', 'ocr', 'vision'];
  const BEEP_PATTERNS = ['info', 'warn', 'urgent'];
  const clamp = (value, lo, hi) => Math.min(hi, Math.max(lo, value));
  // Numeric coercion with a fallback for anything that is not a finite number > 0.
  const positiveOr = (value, fallback) => {
    const n = Number(value);
    return Number.isFinite(n) && n > 0 ? n : fallback;
  };

  return raw
    .slice(0, 8)
    .filter((entry) => entry && typeof entry === 'object')
    .map((entry) => {
      const action = SPOKEN_ACTIONS.includes(entry.action) ? entry.action : 'beep';
      const trigger = {
        action,
        threshold: clamp(positiveOr(entry.threshold, 12), 0.5, 100),
        cooldownMs: clamp(Math.floor(positiveOr(entry.cooldownMs, 8000)), 1500, 600000),
        armed: true,
        pending: false,
        pendingTs: 0,
        lastFireTs: 0,
        fires: 0,
      };
      if (action === 'say') {
        trigger.say = sanitizeForSpeech(String(entry.say || ''), { redactTokens: false }) || '알림';
      } else if (action === 'beep') {
        trigger.beep = BEEP_PATTERNS.includes(entry.beep) ? entry.beep : 'warn';
      } else {
        // 'ocr' | 'vision': unlike the fields above, a dwell of exactly 0 is accepted.
        const dwell = Number(entry.dwellMs);
        trigger.dwellMs = clamp(Number.isFinite(dwell) && dwell >= 0 ? dwell : 700, 0, 3000);
      }
      return trigger;
    });
}
598
+
599
+ class WatchSession {
600
+ constructor(watchId, a) {
601
+ this.watchId = watchId;
602
+ this.bgId = `__watch_${watchId}`; // worker poll-continuity slot, isolated from agent poll_change ids
603
+ this.targetArgs = buildWatchTargetArgs(a);
604
+ this.targetLabel = watchTargetLabel(a);
605
+ const iv = Number(a.pollIntervalMs);
606
+ this.pollIntervalMs = Math.min(WATCH_MAX_INTERVAL, Math.max(WATCH_MIN_INTERVAL, Number.isFinite(iv) && iv > 0 ? Math.floor(iv) : WATCH_DEFAULT_INTERVAL));
607
+ this.filter = new NoiseFilter(a);
608
+ this.cfg = this.filter.cfg;
609
+ this.outDir = mkdtempSync(join(tmpdir(), 'vortex-cu-watch-'));
610
+ this.ring = []; // [{seq, ts, reason, peakPct, ..., _img?}]
611
+ this.seq = 0; this.dropped = 0; this.bufBytes = 0;
612
+ this.polls = 0; this.emitted = 0; this.redactedFrames = 0;
613
+ this.lastChangePct = null; this.lastPollTs = null;
614
+ this.lastError = null; this.consecErrors = 0;
615
+ this.triggers = parseTriggers(a.triggers); // reflex triggers on the main target region (§22.3)
616
+ this.reflexFires = 0; this.lastReflex = null;
617
+ this.startedAt = Date.now();
618
+ this.stopped = false; this.stopReason = null;
619
+ this._timer = null;
620
+ }
621
+
622
+ start() { this._schedule(0); return this.status(); } // first tick goes through the same guarded path (no unhandled rejection, codex LOW)
623
+
624
+ _schedule(delayMs = this.pollIntervalMs) {
625
+ if (this.stopped) return;
626
+ this._timer = setTimeout(() => { this._tick().catch(() => { if (!this.stopped) this._schedule(); }); }, delayMs);
627
+ if (this._timer.unref) this._timer.unref();
628
+ }
629
+
630
+ async _tick() {
631
+ if (this.stopped) return;
632
+ this._evict(); // enforce the TTL on EVERY tick, not just on emit/get_events — a buffered frame must not outlive EVENT_TTL_MS even if the client never polls (codex HIGH, privacy §24.1)
633
+ if (Date.now() - this.startedAt > WATCH_MAX_DURATION_MS) { this._stop('max watch duration reached (auto-stopped)'); return; }
634
+ this.polls++;
635
+ const reset = this.polls === 1;
636
+ const wa = { ...this.targetArgs, watchId: this.bgId };
637
+ if (reset) wa.reset = true;
638
+ const res = await viaWorker('poll_change', wa, OP_TIMEOUT_MS);
639
+ if (this.stopped) return; // stop_watch/dispose may have fired during the await — don't resume into a torn-down session (codex HIGH race)
640
+ const p = res.payload || {};
641
+ if (res.isError) {
642
+ this.consecErrors++; this.lastError = String(p.error || 'poll failed');
643
+ if (this.consecErrors >= 5) { this._stop(`stopped after repeated poll errors: ${this.lastError}`); return; }
644
+ } else {
645
+ this.consecErrors = 0;
646
+ this.lastChangePct = p.changePct != null ? round2(p.changePct) : null;
647
+ this.lastPollTs = Date.now();
648
+ if (p.redacted) {
649
+ // A denylisted window overlaps the target -> no capture this frame. Treat as a blind gap (don't feed
650
+ // the filter a fake diff); surface the count in status so the agent knows the watch is partially blind.
651
+ this.redactedFrames++;
652
+ } else {
653
+ const baseline = p.baseline === true; // includes a silent worker stateReset (fresh baseline)
654
+ const c = p.changePct != null ? p.changePct : 0;
655
+ const emit = this.filter.push({ changePct: c, now: this.lastPollTs, baseline });
656
+ if (emit) { try { await this._onEmit(emit, p); } catch (e) { this.lastError = `emit/capture failed: ${String((e && e.message) || e)}`; } }
657
+ // Reflex triggers run on the RAW per-tick change (not the settled-event path) — fast local beep/say/ocr.
658
+ if (!baseline && this.triggers.length) { try { await this._evalReflexes(c, this.lastPollTs); } catch (e) { this.lastError = `reflex failed: ${String((e && e.message) || e)}`; } }
659
+ }
660
+ }
661
+ if (!this.stopped) this._schedule();
662
+ }
663
+
664
+ async _onEmit(emit, pollPayload) {
665
+ const cap = await viaWorker('capture', { ...this.targetArgs, outDir: this.outDir }, OP_TIMEOUT_MS);
666
+ if (this.stopped) {
667
+ materializeImages(cap.payload); // stopped during the capture await — unlink the just-captured temp frame and drop it (don't push into a disposed session, codex HIGH race)
668
+ try { rmSync(this.outDir, { recursive: true, force: true }); } catch {} // the worker may have recreated outDir to write that frame — clear the now-empty dir (codex r2 residual)
669
+ return;
670
+ }
671
+ const cp = cap.payload || {};
672
+ const ev = { seq: ++this.seq, ts: Date.now(), reason: emit.reason, peakPct: round2(emit.peakPct), activeMs: emit.activeMs, target: this.targetLabel };
673
+ if (cap.isError) {
674
+ ev.captureError = String(cp.error || 'capture failed');
675
+ } else if (cp.redacted) {
676
+ ev.redacted = true; ev.note = 'settled change detected but the frame was withheld (denylisted window in region)';
677
+ } else {
678
+ const items = materializeImages(cp); // reads the temp PNG inline as base64 and unlinks it (§8 volatility)
679
+ const img = items[0];
680
+ if (img) {
681
+ const bytes = Buffer.from(img.data, 'base64').length;
682
+ if (bytes <= EVENT_IMG_MAX_BYTES) { ev._img = img; ev.bytes = bytes; this.bufBytes += bytes; }
683
+ else ev.imageDropped = `image too large (${bytes} bytes)`;
684
+ }
685
+ if (cp.outputSize) ev.outputSize = cp.outputSize;
686
+ if (cp.approxTokens != null) ev.approxTokens = cp.approxTokens;
687
+ if (cp.captureRect) ev.captureRect = cp.captureRect;
688
+ }
689
+ this.emitted++;
690
+ this.ring.push(ev);
691
+ this._evict();
692
+ }
693
+
694
+ // Reflex evaluation: per trigger, fire when the raw change crosses its threshold — with hysteresis
695
+ // (re-arm only after it goes quiet), per-trigger cooldown, and (for ocr) a one-tick dwell so we read a
696
+ // settled frame rather than a half-drawn one (codex r1 MED). The reflex bypasses the noise-filter debounce
697
+ // for speed but keeps these throttles + the global speech budget (codex r1 — no denial-of-attention).
698
+ async _evalReflexes(changePct, now) {
699
+ for (const trg of this.triggers) {
700
+ // A pending ocr dwell fires after dwellMs regardless of the current change (the point is a stable frame).
701
+ if (trg.pending) {
702
+ if (now - trg.pendingTs >= trg.dwellMs) { trg.pending = false; trg.armed = false; trg.lastFireTs = now; trg.fires++; await this._fireTrigger(trg); }
703
+ continue;
704
+ }
705
+ if (changePct < trg.threshold * 0.5) trg.armed = true; // hysteresis: re-arm once it settles below half
706
+ if (changePct < trg.threshold || !trg.armed) continue;
707
+ if (now - trg.lastFireTs < trg.cooldownMs) continue; // per-trigger cooldown
708
+ if (trg.action === 'ocr') { trg.pending = true; trg.pendingTs = now; continue; } // dwell one tick, then read
709
+ trg.armed = false; trg.lastFireTs = now; trg.fires++;
710
+ await this._fireTrigger(trg);
711
+ }
712
+ }
713
+
714
+ async _fireTrigger(trg) {
715
+ if (this.stopped) return;
716
+ if (trg.action === 'beep') { this._reflexNote(trg, 'beep'); await viaWorker('beep', { pattern: trg.beep }); return; }
717
+ if (trg.action === 'say') { const r = reflexSpeak('say', trg.say); this._reflexNote(trg, r.ok ? 'say' : `say-skip:${r.reason}`); return; }
718
+ // 'vision' uses the local VLM if it's available this session; otherwise it degrades to OCR (A always works).
719
+ let kind = trg.action; // 'ocr' | 'vision'
720
+ if (kind === 'vision') { const pv = await probeVlm(); if (this.stopped) return; if (!pv.available) kind = 'ocr'; } // graceful degrade -> read text
721
+ // Outcome label: surface the degrade ("vision→ocr") in EVERY branch (redacted/nocapture/empty/said), not just success.
722
+ const tag = (k) => (trg.action === 'vision' && k === 'ocr') ? 'vision→ocr' : k;
723
+ // Capture the region through the SAME denylist-gated worker path (never an arbitrary file), then read it.
724
+ const cap = await viaWorker('capture', { ...this.targetArgs, outDir: this.outDir }, OP_TIMEOUT_MS);
725
+ if (this.stopped) { materializeImages(cap.payload); return; }
726
+ const cp = cap.payload || {};
727
+ if (cp.redacted) { this._reflexNote(trg, `${tag(kind)}-redacted`); return; } // denylisted window in region -> stay blind
728
+ if (cap.isError || !cp.path) { this._reflexNote(trg, `${tag(kind)}-nocapture`); return; }
729
+ let text = null, usedKind = kind;
730
+ try {
731
+ if (kind === 'vision') {
732
+ text = await runVlm(cp.path);
733
+ // The probe said available but the LIVE call can still fail (model unloaded, timeout) — degrade to OCR (codex low).
734
+ if (text == null && !this.stopped) { usedKind = 'ocr'; text = await runOcr(cp.path); }
735
+ } else {
736
+ text = await runOcr(cp.path);
737
+ }
738
+ } finally { try { unlinkSync(cp.path); } catch {} } // volatile: delete the crop after reading (§8)
739
+ if (this.stopped) return;
740
+ if (!text) { this._reflexNote(trg, `${tag(usedKind)}-empty`); return; }
741
+ const r = reflexSpeak(usedKind === 'vision' ? 'vision' : 'ocr', text); // screen-derived (untrusted) -> provenance chime (or verbal prefix in spoken mode)
742
+ this._reflexNote(trg, r.ok ? `${tag(usedKind)}-said` : `${tag(usedKind)}-skip:${r.reason}`);
743
+ }
744
+
745
+ _reflexNote(trg, outcome) { this.reflexFires++; this.lastReflex = { ts: Date.now(), action: trg.action, outcome }; }
746
+
747
+ _evict() {
748
+ const cut = Date.now() - EVENT_TTL_MS;
749
+ while (this.ring.length && this.ring[0].ts < cut) this._drop(this.ring.shift());
750
+ while (this.ring.length > EVENT_RING_MAX) this._drop(this.ring.shift());
751
+ while (this.bufBytes > WATCH_BUF_MAX_BYTES && this.ring.length > 0) this._drop(this.ring.shift());
752
+ }
753
+ _drop(ev) { if (ev && ev.bytes) this.bufBytes -= ev.bytes; this.dropped++; }
754
+
755
+ drain(max = GET_EVENTS_MAX, maxImages = GET_EVENTS_MAX_IMAGES) {
756
+ this._evict();
757
+ // Drain only as far as we can return EVERYTHING we remove: stop BEFORE an image-bearing event that would
758
+ // exceed the per-call image cap, leaving it (and the rest) buffered for the next call (codex MED — never
759
+ // remove an event whose frame we then have to discard). Metadata-only events keep draining up to `max`.
760
+ const images = []; const records = [];
761
+ const cap = Math.max(1, max);
762
+ while (this.ring.length && records.length < cap) {
763
+ const ev = this.ring[0];
764
+ if (ev._img && images.length >= maxImages) break; // would overflow images — leave it buffered, truthfully report remaining
765
+ this.ring.shift();
766
+ if (ev.bytes) this.bufBytes -= ev.bytes;
767
+ const rec = { ...ev }; delete rec._img;
768
+ if (ev._img) { images.push(ev._img); rec.image = 'inline'; }
769
+ records.push(rec);
770
+ }
771
+ return { records, images, remaining: this.ring.length };
772
+ }
773
+
774
+ status() {
775
+ this._evict();
776
+ return {
777
+ watchId: this.watchId, target: this.targetLabel, running: !this.stopped, stopReason: this.stopReason || undefined,
778
+ pollIntervalMs: this.pollIntervalMs, polls: this.polls, lastChangePct: this.lastChangePct,
779
+ ageMs: Date.now() - this.startedAt, buffered: this.ring.length, dropped: this.dropped, emitted: this.emitted,
780
+ redactedFrames: this.redactedFrames || undefined, filterPhase: this.filter.status.phase,
781
+ thresholds: { activityThreshold: this.cfg.activityThreshold, quietThreshold: this.cfg.quietThreshold, debounceQuietMs: this.cfg.debounceQuietMs, cooldownMs: this.cfg.cooldownMs, maxWaitMs: this.cfg.maxWaitMs },
782
+ triggers: this.triggers.length || undefined, reflexFires: this.reflexFires || undefined, lastReflex: this.lastReflex || undefined,
783
+ lastError: this.lastError || undefined,
784
+ };
785
+ }
786
+
787
+ // Stop the loop. On an INVOLUNTARY stop (auto-stop at max duration / repeated errors), the client may never
788
+ // call stop_watch, so we must not keep screen frames in RAM indefinitely: drop the captured frames now and
789
+ // remove the (otherwise empty) temp dir, but keep the lightweight metadata records so a later get_events can
790
+ // still report what happened (codex HIGH — privacy/TTL must hold without client cooperation, §24.1).
791
+ _stop(reason) {
792
+ if (this.stopped) return;
793
+ this.stopped = true; this.stopReason = reason;
794
+ if (this._timer) { clearTimeout(this._timer); this._timer = null; }
795
+ this._clearFrames();
796
+ try { rmSync(this.outDir, { recursive: true, force: true }); } catch {}
797
+ }
798
+
799
+ _clearFrames() { // strip pixel data, keep metadata records
800
+ for (const ev of this.ring) { if (ev._img) { delete ev._img; ev.image = 'cleared (watch stopped)'; } }
801
+ this.bufBytes = 0;
802
+ }
803
+
804
+ dispose() {
805
+ this._stop(this.stopReason || 'disposed');
806
+ this.ring = []; this.bufBytes = 0; // drop everything, including metadata
807
+ }
808
+ }
809
+
810
+ const watches = new Map(); // registry of WatchSession objects keyed by watchId; read and updated by startWatch / stopWatch / getEvents
811
+
812
/**
 * Start (or restart) a background watch session.
 *
 * Exactly one fixed target must be given — `region`, `window` or `monitor`. Starting an id that
 * already exists disposes the old session first. When the registry is at MAX_WATCHES, stopped
 * sessions with an empty ring are pruned before the request is rejected.
 *
 * @param {object} a - tool arguments (`region` | `window` | `monitor`, optional `watchId`, plus
 *   whatever WatchSession reads from it).
 * @returns {{payload: object, isError: boolean}} the session's start status on success, or an
 *   error payload.
 */
function startWatch(a) {
  const reject = (payload) => ({ payload, isError: true });

  if (plat !== 'win32') {
    return reject({ error: 'start_watch is currently Windows-only', platform: plat });
  }

  // Count the targets that were actually supplied; anything other than exactly one is refused.
  const suppliedTargets = [Boolean(a.region), Boolean(a.window), a.monitor != null];
  const targetCount = suppliedTargets.filter(Boolean).length;
  if (targetCount === 0) {
    return reject({ error: 'start_watch needs a fixed target — pass region, window, or monitor. Cursor mode is not allowed (the cursor moves, so every frame would differ).' });
  }
  if (targetCount > 1) {
    return reject({ error: 'start_watch takes exactly one target — pass only one of region, window, or monitor.' });
  }

  const watchId = a.watchId ? String(a.watchId) : 'default';

  // Same id again means restart: tear the previous session down before creating the new one.
  const previous = watches.get(watchId);
  const replaced = Boolean(previous);
  if (previous) {
    previous.dispose();
    watches.delete(watchId);
  }

  if (watches.size >= MAX_WATCHES) {
    // Reclaim slots held by sessions that are stopped and have nothing left to read.
    for (const [id, candidate] of watches) {
      const reclaimable = candidate.stopped && candidate.ring.length === 0;
      if (reclaimable) {
        candidate.dispose();
        watches.delete(id);
      }
    }
    if (watches.size >= MAX_WATCHES) {
      return reject({ error: `too many concurrent watches (max ${MAX_WATCHES}) — stop one first with stop_watch.` });
    }
  }

  const session = new WatchSession(watchId, a);
  watches.set(watchId, session);
  const status = session.start();

  return {
    payload: {
      ok: true,
      action: replaced ? 'restarted' : 'started',
      ...status,
      hint: 'Watch runs in the background. Call get_events periodically to collect what changed; stop_watch when done.',
    },
    isError: false,
  };
}
839
+
840
/**
 * Stop one watch session, or every session when no `watchId` is given, and remove it from the
 * registry. Disposing a session discards its buffered events.
 *
 * @param {object} a - tool arguments; `watchId` selects a single session.
 * @returns {{payload: object, isError: boolean}} a summary of what was stopped, or an error
 *   payload listing the active ids when the requested id is unknown.
 */
function stopWatch(a) {
  const watchId = a.watchId ? String(a.watchId) : null;

  if (!watchId) {
    // No id: stop everything, reporting the ids and the combined emitted count.
    const stopped = Array.from(watches.keys());
    let totalEmitted = 0;
    for (const session of watches.values()) {
      totalEmitted += session.emitted;
      session.dispose();
    }
    watches.clear();
    return { payload: { ok: true, action: 'stopped-all', stopped, totalEmitted }, isError: false };
  }

  const session = watches.get(watchId);
  if (!session) {
    return {
      payload: { error: `no watch with id "${watchId}"`, active: [...watches.keys()] },
      isError: true,
    };
  }

  // Take the summary BEFORE disposing: dispose empties the ring.
  const { polls, emitted, dropped } = session;
  const summary = {
    ok: true,
    action: 'stopped',
    watchId,
    polls,
    emitted,
    dropped,
    discardedUnread: session.ring.length,
    ageMs: Date.now() - session.startedAt,
  };
  session.dispose();
  watches.delete(watchId);

  return { payload: summary, isError: false };
}
855
+
856
/**
 * Collect the events a watch session has buffered: text records plus their frames as image items.
 *
 * @param {object} a - tool arguments.
 * @param {string} [a.watchId] - session to read from (default "default").
 * @param {number} [a.max] - maximum events to return, floored and clamped to 1..GET_EVENTS_MAX.
 *   A missing, null or non-numeric value means GET_EVENTS_MAX.
 * @returns {{result: {payload: object, isError: boolean}, images: object[]}} the payload
 *   (`events`, `returned`, `remaining`, `status`) and the drained frames; an error payload listing
 *   the active ids when the session does not exist.
 */
function getEvents(a) {
  const watchId = a.watchId ? String(a.watchId) : 'default';
  const s = watches.get(watchId);
  if (!s) {
    return {
      result: { payload: { error: `no watch with id "${watchId}"`, active: [...watches.keys()] }, isError: true },
      images: [],
    };
  }

  // Number(null) is 0, which would clamp to a single event per call; treat null like an omitted
  // argument so clients that send null for unset optional params still get the default cap.
  const requested = a.max == null ? NaN : Number(a.max);
  const max = Number.isFinite(requested)
    ? Math.min(GET_EVENTS_MAX, Math.max(1, Math.floor(requested)))
    : GET_EVENTS_MAX;

  const { records, images, remaining } = s.drain(max);
  const payload = { watchId, events: records, returned: records.length, remaining, status: s.status() };
  return { result: { payload, isError: false }, images };
}
866
+
301
867
  const TOOLS = [
302
868
  {
303
869
  name: 'probe',
@@ -316,6 +882,11 @@ const TOOLS = [
316
882
  additionalProperties: false,
317
883
  },
318
884
  },
885
+ {
886
+ name: 'classify_activity',
887
+ description: 'Adaptive companion: classify what the user is doing on the foreground window — returns { class: GAME|DEV|MEDIA|BROWSING|PRODUCTIVITY|UNKNOWN, process, title, notificationState, interruptible, canvas, uiaCount, fullscreen, profile, needsChangeRate }. Read-only, zero images. Use it to pick a help profile/cadence; for GAME, sample poll_change to split fast-action (break-gated) vs strategy (periodic). See docs/adaptive-companion.md.',
888
+ inputSchema: { type: 'object', properties: {}, additionalProperties: false },
889
+ },
319
890
  {
320
891
  name: 'capture_screen',
321
892
  description: 'Pixel capture — fallback for canvas/games that structured perception cannot read. Target (priority): region > window > monitor > (default) around the cursor. Returns a PNG path (volatility is the caller\'s job, §8).',
@@ -410,13 +981,93 @@ const TOOLS = [
410
981
  additionalProperties: false,
411
982
  },
412
983
  },
984
+ {
985
+ name: 'speak',
986
+ description:
987
+ "Speak ONE short line aloud in the user's language through the local TTS — the AGENT tier of the watch design. This is how you act as a companion while they look at a game / another screen: you SEE the screen (capture_screen / get_events), UNDERSTAND it, and say something useful — strategic advice, a warning about a threat/opportunity they might miss, brief commentary, or an answer to their question. These are YOUR OWN judged words (trusted), so they are spoken WITHOUT a provenance mark. Therefore do NOT pipe RAW screen text through here — summarize/judge it into your own sentence first; for unjudged raw screen text use an `ocr` trigger instead (it is marked as screen-derived). Keep it to one concise sentence. Globally rate-limited and never overlaps reflex speech (one set of ears). Windows-only.",
988
+ inputSchema: {
989
+ type: 'object',
990
+ properties: {
991
+ text: { type: 'string', description: 'The sentence to speak — your own words (a summary/advice/answer), not a raw screen dump. Kept short; long text is capped.' },
992
+ },
993
+ required: ['text'],
994
+ additionalProperties: false,
995
+ },
996
+ },
997
+ {
998
+ name: 'start_watch',
999
+ description:
1000
+ 'Start watching a fixed target in the BACKGROUND and return immediately (non-blocking). A built-in noise filter (debounce + cooldown) suppresses the per-frame ripple of video/games/scrolling and keeps only meaningful, SETTLED changes — so it works on screens that change every frame, where raw change-detection would flood you. Events accumulate in an in-memory buffer; collect them with get_events, end with stop_watch. Needs a fixed target (region/window/monitor — NOT the cursor). Tune with thresholds if needed. Windows-only. Auto-stops after 30 min.',
1001
+ inputSchema: {
1002
+ type: 'object',
1003
+ properties: {
1004
+ region: {
1005
+ type: 'object',
1006
+ description: 'Fixed region to watch (virtual-screen physical coordinates). Recommended for a game minimap/alert area.',
1007
+ properties: { x: { type: 'number' }, y: { type: 'number' }, w: { type: 'number' }, h: { type: 'number' } },
1008
+ required: ['x', 'y', 'w', 'h'],
1009
+ additionalProperties: false,
1010
+ },
1011
+ window: { type: 'string', description: 'Window title substring to watch (tracks the window as it moves).' },
1012
+ monitor: { type: 'string', description: "Monitor to watch: 1-based index ('2') or 'primary' (e.g. a game-only screen)." },
1013
+ watchId: { type: 'string', description: 'Watch-session id (default "default"). Use distinct ids to run several watches at once; starting an existing id restarts it.' },
1014
+ pollIntervalMs: { type: 'number', description: 'How often to sample the target (ms, default 600; clamped 400-5000). Lower = snappier + more CPU.' },
1015
+ detail: { type: 'string', description: "Resolution preset for the captured settled frame: 'gist'/'normal'/'text'." },
1016
+ activityThreshold: { type: 'number', description: 'Frame-to-frame % change that WAKES the filter (default 8). Set above your screen\'s ambient jitter (a playing video measures ~2.5-4%) and below a real transition (a scene cut ~16.8%).' },
1017
+ quietThreshold: { type: 'number', description: 'Frame-to-frame % below which a frame counts as "still" (default 5). Hysteresis: forced below activityThreshold.' },
1018
+ debounceQuietMs: { type: 'number', description: 'How long motion must stay quiet before the settled frame is emitted (ms, default 900 = quality gate).' },
1019
+ cooldownMs: { type: 'number', description: 'Minimum gap between events (ms, default 6000 = frequency cap; suppresses ripples of one activity).' },
1020
+ maxWaitMs: { type: 'number', description: 'If motion never settles, emit anyway this often (ms, default 8000 = anti-starvation for continuous motion).' },
1021
+ triggers: {
1022
+ type: 'array',
1023
+ description: 'Reflex triggers (design §22.3): fire a local alert the INSTANT the watched region changes past a threshold — no cloud LLM round-trip, so it reaches the user in well under a second. Actions: `beep` (sound), `say` (speak a FIXED pre-written phrase — safest), `ocr` (read the region\'s text aloud via offline OCR), or `vision` (describe the scene via a LOCAL vision model if one is configured + fast enough, else auto-degrades to ocr). Spoken screen content is marked as screen-derived (by default a verbal "화면 글자:" / "로컬 비전:" prefix; a non-verbal chime when VORTEX_CU_SPEECH_PROVENANCE=earcon) and shaped; speech is globally rate-limited. Use a `beep`/`say` reflex for "ping me the moment X happens"; for the deeper, JUDGED commentary (advice, warnings, answers) look at the get_events frame and voice your own sentence with the `speak` tool — raw `ocr` readout is just a fallback.',
1024
+ maxItems: 8,
1025
+ items: {
1026
+ type: 'object',
1027
+ properties: {
1028
+ action: { type: 'string', enum: ['beep', 'say', 'ocr', 'vision'], description: "What to do on a crossing." },
1029
+ threshold: { type: 'number', description: 'Frame-to-frame % change that fires this trigger (default 12).' },
1030
+ say: { type: 'string', description: "For action 'say': the fixed phrase to speak (e.g. '적 출현')." },
1031
+ beep: { type: 'string', description: "For action 'beep': 'info'/'warn'/'urgent' (default warn)." },
1032
+ cooldownMs: { type: 'number', description: 'Minimum gap between firings of this trigger (ms, default 8000).' },
1033
+ dwellMs: { type: 'number', description: "For action 'ocr': wait this long after the change before reading, so the frame is settled (ms, default 700)." },
1034
+ },
1035
+ required: ['action'],
1036
+ additionalProperties: false,
1037
+ },
1038
+ },
1039
+ },
1040
+ additionalProperties: false,
1041
+ },
1042
+ },
1043
+ {
1044
+ name: 'get_events',
1045
+ description:
1046
+ 'Collect the changes a background watch (start_watch) has buffered since the last call — non-blocking, batched (so a long watch costs only a few looks). Returns one record per settled change (time, reason, change magnitude, capture metadata) plus the settled frames as inline images, and a status block (polls, buffered, dropped, filter phase). Drains what it returns; call again for any remainder.',
1047
+ inputSchema: {
1048
+ type: 'object',
1049
+ properties: {
1050
+ watchId: { type: 'string', description: 'Which watch to collect from (default "default").' },
1051
+ max: { type: 'number', description: 'Max events to return this call (default/cap 12). Remaining stay buffered.' },
1052
+ },
1053
+ additionalProperties: false,
1054
+ },
1055
+ },
1056
+ {
1057
+ name: 'stop_watch',
1058
+ description: 'Stop a background watch and discard its buffer + in-memory frames. Omit watchId to stop ALL watches. Returns a summary (polls, events emitted, unread discarded).',
1059
+ inputSchema: {
1060
+ type: 'object',
1061
+ properties: {
1062
+ watchId: { type: 'string', description: 'Which watch to stop. Omit to stop every active watch.' },
1063
+ },
1064
+ additionalProperties: false,
1065
+ },
1066
+ },
413
1067
  ];
414
1068
 
415
- const server = new Server({ name: 'computer-use', version: '0.0.1-poc' }, { capabilities: { tools: {} } });
416
-
417
- server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
418
-
419
- server.setRequestHandler(CallToolRequestSchema, async (req) => {
1069
+ // The CallTool handler — standalone (no SDK types), wired to the server only on the serve path.
1070
+ async function handleCallTool(req) {
420
1071
  const { name, arguments: a = {} } = req.params;
421
1072
  const useWorker = plat === 'win32'; // the resident worker is Windows PowerShell backend only
422
1073
  let result;
@@ -427,6 +1078,11 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
427
1078
  try {
428
1079
  if (name === 'probe') {
429
1080
  result = useWorker ? await viaWorker('probe', {}) : runBackend('probe');
1081
+ // If a local VLM is configured, also report its availability (synthetic-image probe — no real screen sent).
1082
+ // This is the only place probe touches the network, and only when the user opted in by setting an endpoint.
1083
+ if (VLM.enabled && result && result.payload && typeof result.payload === 'object') {
1084
+ try { result.payload.vlm = await probeVlm(); } catch (e) { result.payload.vlm = { available: false, reason: String((e && e.message) || e) }; }
1085
+ }
430
1086
  } else if (name === 'read_ui') {
431
1087
  if (useWorker) {
432
1088
  const wa = {};
@@ -439,6 +1095,25 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
439
1095
  if (a.target) args.push('-Target', String(a.target));
440
1096
  result = runBackend('read', args);
441
1097
  }
1098
+ } else if (name === 'classify_activity') {
1099
+ if (plat !== 'win32') {
1100
+ result = { payload: { error: 'classify_activity is currently Windows-only', platform: plat }, isError: true };
1101
+ } else {
1102
+ const raw = runBackend('classify', [], 8000); // fast one-shot; 8s hard timeout so a hung UIA call can't freeze the loop
1103
+ if (raw.isError) {
1104
+ result = raw;
1105
+ } else {
1106
+ try {
1107
+ const opts = { profiles: COMPANION_PROFILES };
1108
+ if (process.env.VORTEX_CU_UIA_CANVAS_MAX) opts.uiaCanvasMax = Number(process.env.VORTEX_CU_UIA_CANVAS_MAX);
1109
+ const d = classifyActivity(raw.payload, opts);
1110
+ const p = raw.payload || {};
1111
+ result = { payload: { ...d, redacted: !!p.redacted, reason: p.reason, procId: p.procId, hwnd: p.hwnd, uiaCapped: p.uiaCapped, uiaOk: p.uiaOk, notificationStateCode: p.notificationState } };
1112
+ } catch (e) {
1113
+ result = { payload: { error: 'classify failed', detail: String((e && e.message) || e), raw: raw.payload }, isError: true };
1114
+ }
1115
+ }
1116
+ }
442
1117
  } else if (name === 'capture_screen') {
443
1118
  reqDir = mkdtempSync(join(tmpdir(), 'vortex-cu-'));
444
1119
  if (useWorker) {
@@ -529,6 +1204,31 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
529
1204
  if (a.durationMs != null) wa.durationMs = a.durationMs;
530
1205
  result = await viaWorker('beep', wa);
531
1206
  }
1207
+ } else if (name === 'speak') {
1208
+ // Agent-authored speech: trusted content -> no provenance mark (treated like a 'say' fixed phrase), but
1209
+ // still shaped + globally budgeted + no-overlap with reflex speech (one set of ears). Non-blocking.
1210
+ if (plat !== 'win32') {
1211
+ result = { payload: { error: 'speak is currently Windows-only', platform: plat }, isError: true };
1212
+ } else if (typeof a.text !== 'string' || !a.text.trim()) {
1213
+ result = { payload: { ok: false, error: 'empty text' }, isError: true };
1214
+ } else {
1215
+ const r = reflexSpeak('agent', a.text); // agent's judged words: no provenance mark, but redacted + shaped + budgeted
1216
+ result = { payload: r.ok ? { ok: true, uttered: r.uttered } : { ok: false, skipped: r.reason } };
1217
+ }
1218
+ } else if (name === 'start_watch') {
1219
+ result = startWatch(a);
1220
+ } else if (name === 'stop_watch') {
1221
+ result = stopWatch(a);
1222
+ } else if (name === 'get_events') {
1223
+ // get_events carries its OWN already-materialized image items (from the watch buffer) — return them directly
1224
+ // and skip the generic materializeImages pass (there are no on-disk paths in the payload to re-read).
1225
+ if (plat !== 'win32') {
1226
+ result = { payload: { error: 'get_events is currently Windows-only', platform: plat }, isError: true };
1227
+ } else {
1228
+ const { result: r, images } = getEvents(a);
1229
+ auditLog('get_events', r.payload, images);
1230
+ return { content: [{ type: 'text', text: JSON.stringify(r.payload, null, 2) }, ...images], isError: r.isError };
1231
+ }
532
1232
  } else {
533
1233
  return { content: [{ type: 'text', text: `unknown tool: ${name}` }], isError: true };
534
1234
  }
@@ -538,10 +1238,11 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
538
1238
  } finally {
539
1239
  if (reqDir) { try { rmSync(reqDir, { recursive: true, force: true }); } catch {} }
540
1240
  }
541
- });
1241
+ }
542
1242
 
543
- // Clean up the worker on server shutdown (it would also auto-terminate via stdin EOF when the parent dies, but do it explicitly).
544
- process.on('exit', () => workerMgr.dispose());
1243
// Shutdown cleanup. On process exit: kill any speech child that is still running, dispose every
// background watch session, then dispose the worker manager. Failures while killing the speech
// child or disposing a single watch are ignored so the remaining cleanup still runs.
process.on('exit', () => {
  try {
    if (speakingChild) speakingChild.kill();
  } catch {}
  for (const session of watches.values()) {
    try {
      session.dispose();
    } catch {}
  }
  workerMgr.dispose();
});
// Turn termination signals into a normal exit so the 'exit' handler above runs.
for (const signal of ['SIGINT', 'SIGTERM']) {
  process.on(signal, () => process.exit(0));
}
547
1248
 
@@ -611,7 +1312,13 @@ function runInstall(argv) {
611
1312
  if (process.argv.slice(2).includes('install')) {
612
1313
  runInstall(process.argv.slice(2));
613
1314
  } else {
614
- const transport = new StdioServerTransport();
615
- await server.connect(transport);
616
- process.stderr.write(`[computer-use MCP] ready on stdio (worker=${plat === 'win32' ? 'on' : 'off'}; tools: probe, read_ui, capture_screen, watch_capture, poll_change, beep)\n`);
1315
+ // Serve path: load the MCP SDK dynamically (so `install` never requires it), wire the handlers, connect.
1316
+ const { Server } = await import('@modelcontextprotocol/sdk/server/index.js');
1317
+ const { StdioServerTransport } = await import('@modelcontextprotocol/sdk/server/stdio.js');
1318
+ const { ListToolsRequestSchema, CallToolRequestSchema } = await import('@modelcontextprotocol/sdk/types.js');
1319
+ const server = new Server({ name: 'computer-use', version: '0.5.0' }, { capabilities: { tools: {} } });
1320
+ server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
1321
+ server.setRequestHandler(CallToolRequestSchema, handleCallTool);
1322
+ await server.connect(new StdioServerTransport());
1323
+ process.stderr.write(`[computer-use MCP] ready on stdio (worker=${plat === 'win32' ? 'on' : 'off'}; tools: probe, read_ui, classify_activity, capture_screen, watch_capture, poll_change, start_watch, get_events, stop_watch, beep, speak)\n`);
617
1324
  }