@vortex-os/computer-use 0.7.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +181 -177
- package/computer-use.config.example.json +29 -28
- package/package.json +74 -73
- package/scripts/activity.mjs +92 -92
- package/scripts/audio-duck.ps1 +180 -180
- package/scripts/classify.ps1 +8 -8
- package/scripts/fetch-supertonic.mjs +82 -65
- package/scripts/lib.ps1 +679 -679
- package/scripts/mcp-stdio.mjs +1376 -1324
- package/scripts/noise-filter.mjs +135 -135
- package/scripts/ocr.ps1 +92 -92
- package/scripts/speak-supertonic.mjs +296 -296
- package/scripts/speak.ps1 +58 -58
- package/scripts/speech-safety.mjs +104 -104
- package/scripts/vlm.mjs +106 -106
package/scripts/mcp-stdio.mjs
CHANGED
|
@@ -1,1324 +1,1376 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
// @vortex-os/computer-use — read-only screen-perception MCP stdio server (Windows-first).
|
|
3
|
-
// Tools: probe · read_ui · classify_activity · capture_screen · watch_capture · poll_change · start_watch · get_events · stop_watch · beep · speak.
|
|
4
|
-
// Control is out of scope.
|
|
5
|
-
// Two modes (bin `vortex-mcp-computer-use`):
|
|
6
|
-
// - default: run the stdio server (what an MCP host launches).
|
|
7
|
-
// - `install`: self-register into the project `.mcp.json` under the non-reserved key
|
|
8
|
-
// `vortex-computer-use` (merge-safe). e.g. `npx vortex-mcp-computer-use install`.
|
|
9
|
-
// Optional dep: @modelcontextprotocol/sdk — loaded DYNAMICALLY only on the serve path (see the
|
|
10
|
-
// bottom dispatch), so `install` registers and exits without needing the SDK present.
|
|
11
|
-
import { spawnSync, spawn } from 'node:child_process';
|
|
12
|
-
import { fileURLToPath } from 'node:url';
|
|
13
|
-
import { dirname, join } from 'node:path';
|
|
14
|
-
import { readFileSync, unlinkSync, statSync, mkdtempSync, rmSync, existsSync, mkdirSync, writeFileSync, renameSync, appendFileSync } from 'node:fs';
|
|
15
|
-
import { tmpdir, homedir } from 'node:os';
|
|
16
|
-
import { createHmac, randomBytes } from 'node:crypto';
|
|
17
|
-
import { NoiseFilter, resolveFilterConfig } from './noise-filter.mjs';
|
|
18
|
-
import { sanitizeForSpeech, buildUtterance, estimateSpeechMs, SpeechBudget } from './speech-safety.mjs';
|
|
19
|
-
import { parseVlmConfig, vlmGate, buildChatBody, extractText, SYNTH_PNG_B64, DEFAULT_VLM_PROMPT } from './vlm.mjs';
|
|
20
|
-
import { classifyActivity } from './activity.mjs';
|
|
21
|
-
|
|
22
|
-
const dir = dirname(fileURLToPath(import.meta.url));
|
|
23
|
-
const plat = process.platform;
|
|
24
|
-
|
|
25
|
-
//
|
|
26
|
-
//
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
const
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
//
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
// ──
|
|
86
|
-
//
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
};
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
function
|
|
144
|
-
|
|
145
|
-
if (!
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
this.
|
|
277
|
-
}
|
|
278
|
-
}
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
if (this.
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
const
|
|
313
|
-
const
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
try {
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
}
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
if (
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
//
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
//
|
|
349
|
-
//
|
|
350
|
-
//
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
const
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
function
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
//
|
|
388
|
-
//
|
|
389
|
-
//
|
|
390
|
-
//
|
|
391
|
-
//
|
|
392
|
-
//
|
|
393
|
-
//
|
|
394
|
-
//
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
const
|
|
399
|
-
const
|
|
400
|
-
const
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
const
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
const
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
//
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
//
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
const
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
if (
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
if (!
|
|
585
|
-
const
|
|
586
|
-
|
|
587
|
-
const
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
const
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
const
|
|
639
|
-
|
|
640
|
-
const
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
}
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
this.
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
}
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
const
|
|
725
|
-
if (
|
|
726
|
-
|
|
727
|
-
if (cp.redacted) {
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
if (
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
this.
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
if (this.
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
}
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
this.
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
}
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
}
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
},
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
{
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
window: { type: 'string', description: '
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
},
|
|
941
|
-
},
|
|
942
|
-
{
|
|
943
|
-
name: '
|
|
944
|
-
description: '
|
|
945
|
-
inputSchema: {
|
|
946
|
-
type: 'object',
|
|
947
|
-
properties: {
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
},
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
}
|
|
1115
|
-
}
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
const
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
if (
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
if (a.
|
|
1174
|
-
if (a.
|
|
1175
|
-
if (a.
|
|
1176
|
-
if (a.
|
|
1177
|
-
if (a.
|
|
1178
|
-
|
|
1179
|
-
if (a.
|
|
1180
|
-
if (a.
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
if (
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
if (a.
|
|
1202
|
-
if (a.
|
|
1203
|
-
if (a.
|
|
1204
|
-
if (a.
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
result =
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
if (
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
if (
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
}
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
}
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// @vortex-os/computer-use — read-only screen-perception MCP stdio server (Windows-first).
|
|
3
|
+
// Tools: probe · read_ui · classify_activity · capture_screen · watch_capture · poll_change · start_watch · get_events · stop_watch · beep · speak.
|
|
4
|
+
// Control is out of scope.
|
|
5
|
+
// Two modes (bin `vortex-mcp-computer-use`):
|
|
6
|
+
// - default: run the stdio server (what an MCP host launches).
|
|
7
|
+
// - `install`: self-register into the project `.mcp.json` under the non-reserved key
|
|
8
|
+
// `vortex-computer-use` (merge-safe). e.g. `npx vortex-mcp-computer-use install`.
|
|
9
|
+
// Optional dep: @modelcontextprotocol/sdk — loaded DYNAMICALLY only on the serve path (see the
|
|
10
|
+
// bottom dispatch), so `install` registers and exits without needing the SDK present.
|
|
11
|
+
import { spawnSync, spawn } from 'node:child_process';
|
|
12
|
+
import { fileURLToPath } from 'node:url';
|
|
13
|
+
import { dirname, join } from 'node:path';
|
|
14
|
+
import { readFileSync, unlinkSync, statSync, mkdtempSync, rmSync, existsSync, mkdirSync, writeFileSync, renameSync, appendFileSync } from 'node:fs';
|
|
15
|
+
import { tmpdir, homedir } from 'node:os';
|
|
16
|
+
import { createHmac, randomBytes } from 'node:crypto';
|
|
17
|
+
import { NoiseFilter, resolveFilterConfig } from './noise-filter.mjs';
|
|
18
|
+
import { sanitizeForSpeech, buildUtterance, estimateSpeechMs, SpeechBudget } from './speech-safety.mjs';
|
|
19
|
+
import { parseVlmConfig, vlmGate, buildChatBody, extractText, SYNTH_PNG_B64, DEFAULT_VLM_PROMPT } from './vlm.mjs';
|
|
20
|
+
import { classifyActivity } from './activity.mjs';
|
|
21
|
+
|
|
22
|
+
const dir = dirname(fileURLToPath(import.meta.url));
|
|
23
|
+
const plat = process.platform;
|
|
24
|
+
|
|
25
|
+
// Package version (read from the shipped package.json — `dir` is scripts/, so the manifest is one up),
|
|
26
|
+
// reported as the MCP server version so the host sees the real version (was hardcoded + stale before).
|
|
27
|
+
/**
 * Version string of this package, read once at module load from the `package.json`
 * located one directory above this script. Evaluates to '0.0.0' when the manifest
 * cannot be read or parsed, or when it has no truthy `version` field.
 */
const PKG_VERSION = (() => {
  const FALLBACK_VERSION = '0.0.0';
  try {
    const manifestPath = join(dir, '..', 'package.json');
    const manifest = JSON.parse(readFileSync(manifestPath, 'utf8'));
    return manifest.version || FALLBACK_VERSION;
  } catch {
    // Any failure (missing file, bad JSON, null manifest) degrades to the placeholder version.
    return FALLBACK_VERSION;
  }
})();
|
|
31
|
+
|
|
32
|
+
// Resolve the user config file `computer-use.config.json`. HISTORY: it was read only from `dir` (this
|
|
33
|
+
// scripts/ folder INSIDE node_modules) — a path the docs never named and that npm wipes on every reinstall,
|
|
34
|
+
// so a documented redaction denylist / tts / companion config silently never loaded. NOW, in order:
|
|
35
|
+
// 1. VORTEX_CU_CONFIG — explicit absolute path override.
|
|
36
|
+
// 2. <cwd>/computer-use.config.json — the instance root (the MCP host launches us with cwd = instance root);
|
|
37
|
+
// durable, outside node_modules, and where users are now told to put it.
|
|
38
|
+
// 3. <dir>/computer-use.config.json — legacy scripts/ location, kept for backward compatibility.
|
|
39
|
+
// When none exists we return the cwd path (the canonical "put it here" location), so callers treat config as absent.
|
|
40
|
+
// Because a missing config means an EMPTY redaction denylist (no privacy protection), every "expected
|
|
41
|
+
// config not loaded" case warns to stderr rather than failing silently — a silent inert denylist is the
|
|
42
|
+
// exact trap this release fixes (codex r1, MEDIUM x2).
|
|
43
|
+
/**
 * Decide which `computer-use.config.json` the server reads.
 *
 * Candidates, first usable one wins:
 *   1. the path given in the VORTEX_CU_CONFIG environment variable;
 *   2. `computer-use.config.json` in the current working directory;
 *   3. `computer-use.config.json` next to this script (legacy location).
 *
 * A candidate is usable only when it exists and is NOT a directory. A plain existence check
 * would accept a directory, which the config loaders then fail to read, leaving the redaction
 * denylist empty. An unusable VORTEX_CU_CONFIG is reported on stderr before falling back.
 *
 * @returns {string} The chosen config path; when no candidate is usable, the working-directory
 *   path (which callers then find absent).
 */
function resolveConfigPath() {
  // True when `candidate` exists and is something other than a directory.
  const isUsable = (candidate) => {
    try {
      return !statSync(candidate).isDirectory();
    } catch {
      return false; // missing or inaccessible
    }
  };

  const env = process.env.VORTEX_CU_CONFIG;
  const cwd = join(process.cwd(), 'computer-use.config.json');
  const legacy = join(dir, 'computer-use.config.json');

  if (env && env.trim()) {
    if (isUsable(env)) return env;
    // An explicit override that cannot be used is most likely a typo: say so, then fall back.
    if (existsSync(env)) {
      process.stderr.write(`[computer-use MCP] WARNING: VORTEX_CU_CONFIG="${env}" is a directory, not a config file — config (incl. the redaction denylist) is NOT loaded from it; falling back to the instance root.\n`);
    } else {
      process.stderr.write(`[computer-use MCP] WARNING: VORTEX_CU_CONFIG="${env}" does not exist — config (incl. the redaction denylist) is NOT loaded from it; falling back to the instance root.\n`);
    }
  }

  if (isUsable(cwd)) {
    // Both locations populated: tell the user which one is being ignored.
    if (isUsable(legacy)) process.stderr.write(`[computer-use MCP] NOTE: using the instance-root computer-use.config.json; a legacy ${legacy} also exists and is IGNORED.\n`);
    return cwd;
  }
  if (isUsable(legacy)) return legacy;
  return cwd;
}
|
|
59
|
+
// Resolve ONCE at load (so the warnings above fire at most once, not per config section).
|
|
60
|
+
const CONFIG_PATH = resolveConfigPath();
|
|
61
|
+
|
|
62
|
+
// ── redaction config (§8·§14) ─────────────────────────────────────────────
|
|
63
|
+
// Normalize the denylist into env (JSON array) so children (worker / per-call spawn) inherit it -> no per-call args. Config source: env > config file.
|
|
64
|
+
// The actual blocking is done by the backend (lib.ps1 Test-AxDenylist) right before CopyFromScreen (Node doesn't know which windows are inside the region/monitor).
|
|
65
|
+
/**
 * Build the capture denylist and publish it for child processes.
 *
 * Sources, later ones override earlier ones:
 *   1. the `redaction.denyWindowTitles` / `redaction.denyProcesses` arrays of the config file
 *      at CONFIG_PATH;
 *   2. the VORTEX_CU_DENY_TITLES / VORTEX_CU_DENY_PROCS environment variables (JSON arrays).
 *
 * An unreadable or malformed config file, and an environment value that is not a JSON array,
 * are reported on stderr instead of being ignored silently: either one can leave the denylist
 * empty. An unusable environment value does not discard the list loaded from the file.
 *
 * Side effect: both environment variables are rewritten with the normalized lists (JSON
 * arrays of non-empty strings) so that spawned children inherit them.
 *
 * @returns {{ titles: string[], procs: string[] }} The normalized denylist.
 */
function loadRedactionConfig() {
  const warn = (message) => process.stderr.write(`[computer-use MCP] WARNING: ${message}\n`);

  let titles = [];
  let procs = [];

  if (existsSync(CONFIG_PATH)) {
    try {
      const section = (JSON.parse(readFileSync(CONFIG_PATH, 'utf8')) || {}).redaction || {};
      if (Array.isArray(section.denyWindowTitles)) titles = section.denyWindowTitles;
      if (Array.isArray(section.denyProcesses)) procs = section.denyProcesses;
    } catch (e) {
      warn(`could not read ${CONFIG_PATH} (${(e && e.message) || e}) — the redaction denylist from the config file is NOT loaded.`);
    }
  }

  // Env override: accepted only when it parses to an array; otherwise keep `fallback` and say why.
  const fromEnv = (name, fallback) => {
    const raw = process.env[name];
    if (!raw) return fallback;
    try {
      const parsed = JSON.parse(raw);
      if (Array.isArray(parsed)) return parsed;
      warn(`${name} is not a JSON array — ignored.`);
    } catch (e) {
      warn(`${name} is not valid JSON (${(e && e.message) || e}) — ignored.`);
    }
    return fallback;
  };
  titles = fromEnv('VORTEX_CU_DENY_TITLES', titles);
  procs = fromEnv('VORTEX_CU_DENY_PROCS', procs);

  // Normalize to non-empty strings before re-exporting.
  titles = titles.map(String).filter(Boolean);
  procs = procs.map(String).filter(Boolean);
  process.env.VORTEX_CU_DENY_TITLES = JSON.stringify(titles);
  process.env.VORTEX_CU_DENY_PROCS = JSON.stringify(procs);
  return { titles, procs };
}
|
|
83
|
+
const REDACTION = loadRedactionConfig();
|
|
84
|
+
|
|
85
|
+
// ── TTS / audio-ducking config (file < env precedence, like the denylist) ──────────────────
|
|
86
|
+
// Reads the `tts` section of computer-use.config.json and fills process.env ONLY where the matching env var is
|
|
87
|
+
// unset, so a user can tune voice/engine/ducking in the config FILE (no env needed) while env still wins. The
|
|
88
|
+
// values flow unchanged to the spawned speak helpers (speak-supertonic.mjs / speak.ps1), which read these env vars.
|
|
89
|
+
/**
 * Copy the `tts` section of the config file at CONFIG_PATH into environment variables.
 *
 * A variable is written only when it is currently unset or empty, so a value already present
 * in the environment always wins over the file. Keys that are absent (undefined/null) in the
 * file are left alone.
 *
 * `duckFactor` is clamped to the range 0..1 when it is numeric. A null or blank value is
 * treated as "not configured": `Number(null)` and `Number('')` are both 0, so without that
 * guard an empty setting would be exported as a factor of 0 instead of leaving the default.
 * A non-numeric value is passed through unchanged.
 *
 * @returns {void}
 */
function loadTtsConfig() {
  let cfg = {};
  try {
    if (existsSync(CONFIG_PATH)) cfg = (JSON.parse(readFileSync(CONFIG_PATH, 'utf8')) || {}).tts || {};
  } catch {
    // Best-effort: an unreadable or malformed file leaves every TTS setting untouched.
  }

  const setIfUnset = (key, value) => {
    if (value === undefined || value === null) return;
    const current = process.env[key];
    if (current === undefined || current === '') process.env[key] = String(value);
  };

  setIfUnset('VORTEX_CU_TTS_ENGINE', cfg.engine);       // 'auto' (default) | 'supertonic' | 'heami'
  setIfUnset('VORTEX_CU_TTS_VOICE', cfg.voice);         // Supertonic voice: F1..F5 / M1..M5
  setIfUnset('VORTEX_CU_TTS_MODEL_DIR', cfg.modelDir);  // Supertonic model cache directory
  setIfUnset('VORTEX_CU_TTS_LANG', cfg.lang);           // spoken language
  setIfUnset('VORTEX_CU_TTS_SPEED', cfg.speed);         // speech-rate multiplier
  if (cfg.duck === false) setIfUnset('VORTEX_CU_DUCK', 'off'); // only an explicit `false` turns ducking off

  const rawFactor = cfg.duckFactor;
  const factorIsBlank = rawFactor == null || (typeof rawFactor === 'string' && rawFactor.trim() === '');
  if (!factorIsBlank) {
    const numeric = Number(rawFactor);
    // Clamp a finite value so a typo such as 30 cannot pass through as a multiplier.
    const factor = Number.isFinite(numeric) ? Math.max(0, Math.min(1, numeric)) : rawFactor;
    setIfUnset('VORTEX_CU_DUCK_FACTOR', factor);
  }
}
|
|
106
|
+
loadTtsConfig();
|
|
107
|
+
|
|
108
|
+
// ── companion (adaptive screen companion) config — same file<env precedence ──
|
|
109
|
+
// `companion.uiaCanvasMax` tunes the GPU-canvas (game/video) UIA cutoff; `companion.profiles` overrides per-class
|
|
110
|
+
// cadence/proactivity (e.g. { "GAME": { "cadenceSec": 20 } }). Consumed by classify_activity.
|
|
111
|
+
let COMPANION_PROFILES = {};
|
|
112
|
+
/**
 * Apply the `companion` section of the config file at CONFIG_PATH.
 *
 * - `uiaCanvasMax`, when present (not null/undefined), is exported as VORTEX_CU_UIA_CANVAS_MAX
 *   unless that variable already holds a non-empty value.
 * - `profiles` replaces the module-level COMPANION_PROFILES when it is a truthy object;
 *   otherwise COMPANION_PROFILES is reset to an empty object.
 *
 * A missing, unreadable or malformed file behaves like an empty section.
 *
 * @returns {void}
 */
function loadCompanionConfig() {
  let section = {};
  try {
    if (existsSync(CONFIG_PATH)) {
      const parsed = JSON.parse(readFileSync(CONFIG_PATH, 'utf8')) || {};
      section = parsed.companion || {};
    }
  } catch {}

  const ENV_KEY = 'VORTEX_CU_UIA_CANVAS_MAX';
  const existing = process.env[ENV_KEY];
  const envIsUnset = existing === undefined || existing === '';
  if (section.uiaCanvasMax != null && envIsUnset) {
    process.env[ENV_KEY] = String(section.uiaCanvasMax);
  }

  const { profiles } = section;
  COMPANION_PROFILES = profiles && typeof profiles === 'object' ? profiles : {};
}
|
|
123
|
+
loadCompanionConfig();
|
|
124
|
+
|
|
125
|
+
// ── audit log (§8: metadata/HMAC only, original image not stored) ──────────────────────
|
|
126
|
+
// Location = under LocalAppData (outside the instance data/ -> won't leak via corporate sync, codex MEDIUM). The key lives there too.
|
|
127
|
+
const AUDIT_DIR = join(process.env.LOCALAPPDATA || join(homedir(), '.local', 'share'), 'vortex-computer-use', 'audit');
|
|
128
|
+
/**
 * Return the HMAC key used for audit records, creating it on first use.
 *
 * The key is kept in `.hmac-key` inside AUDIT_DIR (the directory is created if needed).
 * A non-blank stored key is returned trimmed; otherwise 32 random bytes are generated,
 * written to the file as hex with mode 0o600, and returned.
 *
 * @returns {string|null} The key, or null when any step fails (a warning is written to stderr).
 */
function loadAuditKey() {
  try {
    mkdirSync(AUDIT_DIR, { recursive: true });
    const keyFile = join(AUDIT_DIR, '.hmac-key');

    const stored = existsSync(keyFile) ? readFileSync(keyFile, 'utf8').trim() : '';
    if (stored) return stored;

    const fresh = randomBytes(32).toString('hex');
    writeFileSync(keyFile, fresh, { mode: 0o600 });
    return fresh;
  } catch (err) {
    // Key setup failing must not take the tools down: report it and carry on without auditing.
    const detail = (err && err.message) || err;
    process.stderr.write(`[computer-use MCP] WARNING: audit log disabled — could not set up HMAC key (${detail}). Perception still works; captures will NOT be audited.\n`);
    return null;
  }
}
|
|
142
|
+
const AUDIT_KEY = loadAuditKey();
|
|
143
|
+
/**
 * HMAC-SHA256 of `buf` keyed with AUDIT_KEY.
 *
 * @param {string|Buffer} buf - Data to authenticate.
 * @returns {string} The digest as a hex string.
 */
function auditHmac(buf) {
  const mac = createHmac('sha256', AUDIT_KEY);
  mac.update(buf);
  return mac.digest('hex');
}
|
|
144
|
+
/**
 * Append one metadata-only audit record to the daily JSONL file `cu-YYYY-MM-DD.jsonl`
 * in AUDIT_DIR.
 *
 * The record holds the timestamp, tool name, `payload.target` (when it is a string), a
 * truncated HMAC of `payload.window`, the redaction flag and reason, the decoded byte count
 * of the images, and a truncated HMAC over the image bytes (or over the JSON-serialized
 * payload when no images are given). Image data and the window title are not written.
 *
 * Does nothing when AUDIT_KEY is falsy; any error while building or writing the record is
 * swallowed so that logging never fails the calling tool.
 *
 * @param {string} tool - Name of the tool being audited.
 * @param {object} [payload] - Tool result; a falsy value is treated as an empty object.
 * @param {Array<{data: string}>} [imageItems] - Items whose `data` is base64-encoded.
 * @returns {void}
 */
function auditLog(tool, payload, imageItems) {
  if (!AUDIT_KEY) return;
  try {
    const data = payload || {};
    const ts = new Date().toISOString();

    // Redaction may be flagged at the top level or on any frame of a multi-frame capture.
    const anyFrameRedacted = Array.isArray(data.captures) && data.captures.some((frame) => frame && frame.redacted);
    const redacted = Boolean(data.redacted || data.partialRedacted || anyFrameRedacted);

    const mode = typeof data.target === 'string' ? data.target : undefined;
    const titleHmac = data.window ? auditHmac(String(data.window)).slice(0, 16) : undefined;

    const contentMac = createHmac('sha256', AUDIT_KEY);
    let outputBytes = 0;
    if (imageItems && imageItems.length) {
      for (const item of imageItems) {
        const bytes = Buffer.from(item.data, 'base64');
        outputBytes += bytes.length;
        contentMac.update(bytes);
      }
    } else {
      contentMac.update(JSON.stringify(data));
    }

    // Key order is part of the on-disk format; keep it stable.
    const record = {
      ts,
      tool,
      mode,
      titleHmac,
      redacted,
      reason: data.reason || undefined,
      outputBytes,
      contentHmac: contentMac.digest('hex').slice(0, 32),
    };

    const logFile = join(AUDIT_DIR, `cu-${ts.slice(0, 10)}.jsonl`);
    appendFileSync(logFile, JSON.stringify(record) + '\n');
  } catch {}
}
|
|
166
|
+
|
|
167
|
+
// OS-native backend (same as action-ext.mjs — a single source would be ideal, but it's duplicated since this is a PoC)
|
|
168
|
+
/**
 * Backend command table: platform -> operation -> [executable, base arguments].
 * Windows operations run PowerShell scripts that sit next to this file; macOS operations run
 * scripts from the `mac/` subdirectory. Platforms or operations without an entry are
 * unsupported.
 */
const B = (() => {
  // [exe, args] pair that runs a sibling PowerShell script.
  const pwsh = (script) => ['pwsh', ['-NoProfile', '-File', join(dir, script)]];
  // Absolute path of a file inside the mac/ subdirectory.
  const macFile = (file) => join(dir, 'mac', file);

  return {
    win32: {
      probe: pwsh('probe.ps1'),
      read: pwsh('read-ui.ps1'),
      classify: pwsh('classify.ps1'),
      capture: pwsh('point-to-ask.ps1'),
    },
    darwin: {
      probe: ['bash', [macFile('probe.sh')]],
      read: ['osascript', ['-l', 'JavaScript', macFile('read-ui.js')]],
      capture: ['bash', [macFile('point-to-ask.sh')]],
    },
  };
})();
|
|
181
|
+
|
|
182
|
+
// Returns: { payload, isError } — backend abnormal exit / non-JSON output is surfaced as isError (so an error never flows as if normal in the watch loop).
|
|
183
|
+
/**
 * Run a backend op synchronously and parse its stdout as JSON.
 *
 * @param {string} kind       Operation key looked up in the backend table for the current platform.
 * @param {string[]} [extraArgs] Extra argv appended after the op's base arguments.
 * @param {number} [timeoutMs]   Hard timeout in ms; 0 (default) means no timeout.
 * @returns {{payload: any, isError: boolean}} Parsed payload; isError is true on unsupported op,
 *   spawn failure/timeout, non-zero exit, or non-JSON output.
 */
function runBackend(kind, extraArgs = [], timeoutMs = 0) {
  const b = B[plat]?.[kind];
  if (!b) return { payload: { error: `unsupported platform/op: ${plat}/${kind}`, grade: 'P0 (manual) fallback' }, isError: true };
  const [exe, base] = b;
  // Optional hard timeout: a one-shot tool must not let a hung child freeze the synchronous spawn
  // (and thus the event loop). On timeout, spawnSync kills the child and sets r.error.
  const opts = { encoding: 'utf8', maxBuffer: 8 * 1024 * 1024 };
  if (timeoutMs > 0) { opts.timeout = timeoutMs; opts.killSignal = 'SIGKILL'; }
  const r = spawnSync(exe, [...base, ...extraArgs], opts);
  if (r.error) return { payload: { error: String(r.error) }, isError: true };
  const failed = r.status !== 0;
  try {
    return { payload: JSON.parse(r.stdout), isError: failed };
  } catch {
    // Clip the diagnostic echo (same 500-char bound the async runner uses) so a backend that
    // dumped megabytes of non-JSON output cannot balloon the error payload.
    const clip = (s) => (s || '').trim().slice(0, 500);
    return { payload: { error: 'backend produced non-JSON output', exitCode: r.status, raw: clip(r.stdout), stderr: clip(r.stderr) }, isError: true };
  }
}
|
|
200
|
+
|
|
201
|
+
// Async backend runner for long/streaming ops (watch_capture). Never blocks the event loop, and a hard
|
|
202
|
+
// timeout kills the child and returns isError — so one watch call can't hang the whole MCP server (codex #high).
|
|
203
|
+
/**
 * Run a backend op asynchronously (for long/streaming ops) and parse its stdout as JSON.
 * Never blocks the event loop; a hard timeout or stdout overflow aborts the child.
 *
 * @param {string} kind          Operation key looked up in the backend table for the current platform.
 * @param {string[]} [extraArgs] Extra argv appended after the op's base arguments.
 * @param {number} [timeoutMs]   Hard timeout in ms (default 120000).
 * @param {string|null} [cleanupDir] Directory this runner removes itself when the child fails to
 *   close within the abort grace window.
 * @returns {Promise<{payload: any, isError: boolean, cleanupOwned: boolean}>} Always resolves.
 *   cleanupOwned=true means this runner took over deleting cleanupDir and the caller must not.
 */
function runBackendAsync(kind, extraArgs = [], timeoutMs = 120000, cleanupDir = null) {
  return new Promise((resolve) => {
    const b = B[plat]?.[kind];
    if (!b) return resolve({ payload: { error: `unsupported platform/op: ${plat}/${kind}`, grade: 'P0 (manual) fallback' }, isError: true });
    const [exe, base] = b;
    const MAX_OUT = 16 * 1024 * 1024; // cap stdout/stderr accumulation; abort on overflow
    let out = '', err = '', settled = false, pending = null, killFallback = null;
    const child = spawn(exe, [...base, ...extraArgs], { stdio: ['ignore', 'pipe', 'pipe'] });
    const removeCleanupDir = () => { try { rmSync(cleanupDir, { recursive: true, force: true }); } catch {} };
    const resolveOnce = (payload, isError, cleanupOwned = false) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      if (killFallback) clearTimeout(killFallback);
      resolve({ payload, isError, cleanupOwned });
    };
    // Abort = decide the outcome + kill the child, but RESOLVE only after the child has CLOSED
    // (file handles released) so the caller's directory cleanup can't race a still-flushing process.
    // If close never arrives within the grace window, resolve anyway but take ownership of the
    // cleanupDir removal (deferred to the eventual close, plus a hard reaper).
    const abort = (payload) => {
      if (pending || settled) return;
      pending = payload;
      try { child.kill(); } catch {}
      killFallback = setTimeout(() => {
        // The polite signal was ignored for the whole grace window: escalate, matching the
        // SIGKILL the synchronous runner uses on timeout, so the child cannot outlive the call.
        try { child.kill('SIGKILL'); } catch {}
        if (cleanupDir) {
          child.once('close', removeCleanupDir);
          const reaper = setTimeout(removeCleanupDir, 30000);
          if (reaper.unref) reaper.unref();
        }
        resolveOnce(pending, true, cleanupDir != null);
      }, 3000);
      if (killFallback.unref) killFallback.unref();
    };
    const timer = setTimeout(() => abort({ error: `watch timed out after ${timeoutMs}ms`, partial: (out || '').slice(0, 200) }), timeoutMs);
    child.stdout.setEncoding('utf8');
    child.stdout.on('data', (d) => {
      if (pending || settled) return;
      out += d;
      if (out.length > MAX_OUT) abort({ error: 'backend stdout exceeded cap' });
    });
    child.stderr.setEncoding('utf8');
    child.stderr.on('data', (d) => {
      if (pending || settled || err.length >= MAX_OUT) return;
      err += d;
    });
    child.on('error', (e) => resolveOnce({ error: String((e && e.message) || e) }, true));
    child.on('close', (code) => {
      if (pending) return resolveOnce(pending, true); // aborted AND closed within grace -> caller can clean safely
      if (settled) return;
      try { resolveOnce(JSON.parse(out), code !== 0); }
      catch { resolveOnce({ error: 'backend produced non-JSON output', exitCode: code, raw: (out || '').trim().slice(0, 500), stderr: (err || '').trim().slice(0, 500) }, true); }
    });
  });
}
|
|
246
|
+
|
|
247
|
+
// ── resident PowerShell worker (Windows MCP only) ─────────────────────────────
|
|
248
|
+
// Removes the per-call pwsh re-spawn (~150-370ms setup) — keeps one worker alive and talks to it over JSON-lines.
|
|
249
|
+
// Safeguards (codex cross-check): lazy start (spawn only on first call) · idle shutdown (N seconds with no command) ·
|
|
250
|
+
// single-worker command queue serialization · id matching · a generation (= process identity) guard to ignore stale events ·
|
|
251
|
+
// on crash, reject in-flight (no auto-retry of side-effect ops) · on no-response timeout, kill the worker and re-spawn.
|
|
252
|
+
// Multi-instance: each MCP server has its own worker (dedicated pipe) -> no conflicts. On parent exit the
|
|
253
|
+
// worker auto-terminates via stdin EOF. watch isn't run on the worker (avoids long occupation) — done per-call.
|
|
254
|
+
const WORKER = ['pwsh', ['-NoProfile', '-File', join(dir, 'worker.ps1')]];
|
|
255
|
+
// Worker idle-shutdown delay and per-op response timeout (ms), overridable via environment.
// A blank, non-numeric or out-of-range override falls back to the default: passing NaN or a
// negative/oversized value to setTimeout makes it fire after ~1ms, which would time out every
// worker op immediately. Values are capped at the largest delay setTimeout accepts.
const WORKER_IDLE_MS = (() => {
  const raw = process.env.VORTEX_AX_WORKER_IDLE_MS;
  const ms = raw == null || String(raw).trim() === '' ? NaN : Number(raw);
  return Number.isFinite(ms) && ms >= 0 ? Math.min(ms, 2147483647) : 60000;
})();
const OP_TIMEOUT_MS = (() => {
  const raw = process.env.VORTEX_AX_OP_TIMEOUT_MS;
  const ms = raw == null || String(raw).trim() === '' ? NaN : Number(raw);
  return Number.isFinite(ms) && ms > 0 ? Math.min(ms, 2147483647) : 10000;
})();
|
|
257
|
+
|
|
258
|
+
/**
 * Owns one resident worker process and serializes requests to it over JSON-lines:
 * each request is written to the worker's stdin as `{id, op, args}` and answered by a stdout
 * line `{id, ok, result|error}`. One request is in flight at a time; the rest wait in `queue`.
 * The worker is spawned lazily on the first request and disposed after an idle period.
 */
class WorkerManager {
  /** @param {[string, string[]]} command Executable and argv used to spawn the worker. */
  constructor([exe, args]) {
    this.exe = exe;
    this.args = args;
    this.worker = null;    // current child process; doubles as the "generation" identity
    this.buf = '';         // partial stdout line carried between data chunks
    this.seq = 0;          // last request id handed out
    this.inFlight = null;  // { id, resolve, reject, timer } of the request awaiting a reply
    this.queue = [];       // pending jobs, FIFO
    this.idleTimer = null;
  }

  /**
   * Queue an op for the worker.
   * @returns {Promise<any>} Resolves with the worker's `result`; rejects on worker error,
   *   worker exit, write failure, or no reply within timeoutMs.
   */
  call(op, args, timeoutMs = OP_TIMEOUT_MS) {
    return new Promise((resolve, reject) => {
      this.queue.push({ op, args, timeoutMs, resolve, reject });
      this._pump();
    });
  }

  /** Start a fresh worker and wire its streams/events to this manager. */
  _spawn() {
    const child = spawn(this.exe, this.args, { stdio: ['pipe', 'pipe', 'pipe'] });
    this.worker = child;
    this.buf = '';
    child.stdout.setEncoding('utf8');
    child.stdout.on('data', (chunk) => {
      if (child !== this.worker) return; // generation guard: ignore the old worker
      this.buf += chunk;
      let nl;
      while ((nl = this.buf.indexOf('\n')) >= 0) {
        const line = this.buf.slice(0, nl).trim();
        this.buf = this.buf.slice(nl + 1);
        if (line) this._onLine(line);
      }
    });
    child.stderr.on('data', (d) => process.stderr.write(`[ax-worker] ${d}`));
    child.stdin.on('error', () => {}); // EPIPE etc. are cleaned up in exit/error — here we only prevent an unhandled crash
    child.on('error', (err) => this._onGone(child, `worker spawn/runtime error: ${err && err.message}`));
    child.on('exit', (code) => this._onGone(child, `worker exited (code ${code}) before responding`));
  }

  /** The current worker died: fail the in-flight request (never auto-retried) and keep the queue moving. */
  _onGone(child, reason) {
    if (child !== this.worker) return; // generation guard: ignore an old / already-replaced worker
    this.worker = null;
    const f = this.inFlight;
    if (f) { clearTimeout(f.timer); this.inFlight = null; f.reject(new Error(reason)); }
    if (this.queue.length) this._pump(); // run the backlog on a new worker
  }

  /** Handle one complete stdout line from the worker. */
  _onLine(line) {
    let msg;
    try { msg = JSON.parse(line); } catch { process.stderr.write(`[ax-worker] non-JSON line dropped: ${line.slice(0, 120)}\n`); return; }
    // Valid JSON that is not an object (notably `null`) has no id to match; reading `msg.id` on
    // null would throw inside the stdout 'data' handler and take the whole server down.
    if (msg === null || typeof msg !== 'object') {
      process.stderr.write(`[ax-worker] non-object line dropped: ${line.slice(0, 120)}\n`);
      return;
    }
    const f = this.inFlight;
    if (!f || msg.id !== f.id) return; // stale/mismatch
    clearTimeout(f.timer);
    this.inFlight = null;
    if (msg.ok) f.resolve(msg.result); else f.reject(new Error(msg.error || 'worker error'));
    this._pump();
  }

  /** Dispatch the next queued job if nothing is in flight; arm the idle timer when the queue is empty. */
  _pump() {
    if (this.inFlight) return;
    if (this.queue.length === 0) { this._scheduleIdle(); return; }
    if (this.idleTimer) { clearTimeout(this.idleTimer); this.idleTimer = null; }
    if (!this.worker) this._spawn();
    const job = this.queue.shift();
    const id = ++this.seq;
    const timer = setTimeout(() => {
      if (this.inFlight && this.inFlight.id === id) { // no response -> assume the worker is stuck and kill it
        const w = this.worker;
        this.worker = null; // detach first so the dying worker's exit event is ignored by the generation guard
        const rej = this.inFlight.reject;
        this.inFlight = null;
        if (w) { try { w.kill(); } catch {} }
        rej(new Error(`worker op timeout after ${job.timeoutMs}ms`));
        this._pump();
      }
    }, job.timeoutMs);
    this.inFlight = { id, resolve: job.resolve, reject: job.reject, timer };
    try {
      this.worker.stdin.write(JSON.stringify({ id, op: job.op, args: job.args }) + '\n');
    } catch (e) {
      // Assume the worker is dead: clean up and move on to the next request (prevents a queue stall).
      clearTimeout(timer);
      this.inFlight = null;
      const w = this.worker;
      this.worker = null;
      if (w) { try { w.kill(); } catch {} }
      job.reject(e);
      this._pump();
    }
  }

  /** Arm a one-shot timer that disposes the worker if it is still idle when the timer fires. */
  _scheduleIdle() {
    if (this.idleTimer || !this.worker) return;
    this.idleTimer = setTimeout(() => {
      this.idleTimer = null;
      if (!this.inFlight && this.queue.length === 0) this.dispose();
    }, WORKER_IDLE_MS);
    if (this.idleTimer.unref) this.idleTimer.unref();
  }

  /** Stop the worker (close stdin, then kill) and cancel the idle timer. */
  dispose() {
    if (this.idleTimer) { clearTimeout(this.idleTimer); this.idleTimer = null; }
    const w = this.worker;
    this.worker = null;
    if (w) { try { w.stdin.end(); } catch {} try { w.kill(); } catch {} }
  }
}
|
|
341
|
+
|
|
342
|
+
const workerMgr = new WorkerManager(WORKER);
|
|
343
|
+
|
|
344
|
+
// watchIds this server process has dispatched a poll_change for — to detect a SILENT baseline reset
|
|
345
|
+
// (the resident worker was idle-disposed / killed / crashed and lost its in-memory watch state). codex #high.
|
|
346
|
+
const pollSeen = new Set();
|
|
347
|
+
|
|
348
|
+
// Server-owned volatility (codex blocker, design §8): when a backend wrote screenshot file(s), read them inline as
|
|
349
|
+
// MCP image content and DELETE them immediately — no on-disk path is returned, so a crashed/idle caller can't leave
|
|
350
|
+
// sensitive screenshots behind. Bounded: at most MAX_INLINE_IMAGES are embedded; extras are still unlinked + counted.
|
|
351
|
+
const MAX_INLINE_IMAGES = 8;                        // most images embedded in one response
const MAX_IMAGE_BYTES = 8 * 1024 * 1024;            // per-image cap
const MAX_TOTAL_INLINE_BYTES = 24 * 1024 * 1024;    // total inline cap across a response

/**
 * Turn the screenshot file(s) named in a backend payload into inline MCP image items and delete
 * the files. `payload.path` and each `payload.captures[i].path` are removed from the payload and
 * replaced by `image: 'inline' | 'unavailable'`; files over a cap are still deleted and counted
 * in `payload.imagesDropped`.
 *
 * @param {any} payload Backend payload, mutated in place (non-objects are ignored).
 * @returns {Array<{type: 'image', data: string, mimeType: 'image/png'}>} Embedded images, in order.
 */
function materializeImages(payload) {
  const images = [];
  const tally = { inlined: 0, dropped: 0, bytes: 0 };

  // Embed one file if it fits the caps; always try to unlink it afterwards.
  const consume = (filePath) => {
    if (typeof filePath !== 'string' || filePath === '') return false;
    let embedded = false;
    try {
      const size = statSync(filePath).size;
      const fits = tally.inlined < MAX_INLINE_IMAGES
        && size <= MAX_IMAGE_BYTES
        && tally.bytes + size <= MAX_TOTAL_INLINE_BYTES;
      if (!fits) {
        tally.dropped += 1;
      } else {
        const data = readFileSync(filePath).toString('base64');
        images.push({ type: 'image', data, mimeType: 'image/png' });
        tally.inlined += 1;
        tally.bytes += size;
        embedded = true;
      }
    } catch { /* file already gone — fine */ }
    try { unlinkSync(filePath); } catch {}
    return embedded;
  };

  // Swap a holder's on-disk path for an availability marker.
  const stamp = (holder) => {
    const got = consume(holder.path);
    delete holder.path;
    holder.image = got ? 'inline' : 'unavailable';
  };

  if (!payload || typeof payload !== 'object') return images;
  if (payload.path) stamp(payload);
  if (Array.isArray(payload.captures)) {
    for (const frame of payload.captures) {
      if (frame && frame.path) stamp(frame);
    }
  }
  if (tally.dropped > 0) payload.imagesDropped = tally.dropped;
  return images;
}
|
|
379
|
+
|
|
380
|
+
/**
 * Run an op on the resident worker and normalize the outcome to the `{ payload, isError }`
 * shape the backend runners return. Never rejects: a failure becomes an error payload.
 */
async function viaWorker(op, args, timeoutMs) {
  let payload;
  try {
    payload = await workerMgr.call(op, args, timeoutMs);
  } catch (e) {
    return { payload: { error: String((e && e.message) || e) }, isError: true };
  }
  return { payload, isError: false };
}
|
|
384
|
+
|
|
385
|
+
// ── watch sessions: background noise-filtered watch + in-memory event buffer (design §22.1·§22.2) ──
|
|
386
|
+
// start_watch spins a non-blocking poll loop OWNED BY THIS SERVER — not a separate process, and not the
|
|
387
|
+
// single worker's request slot (the worker stays a dumb single-shot capture engine; long watches must
|
|
388
|
+
// not occupy it, codex #1). Each tick polls the target's frame-to-frame change via the worker, feeds it
|
|
389
|
+
// to a NoiseFilter (debounce + cooldown + maxWait), and on an emit captures the settled frame and
|
|
390
|
+
// appends an event to a bounded in-memory ring buffer. get_events drains the buffer (non-blocking,
|
|
391
|
+
// batched -> few LLM looks); stop_watch ends it. The buffer is memory-only with count/byte/TTL caps and
|
|
392
|
+
// the frames live in RAM only (design §24.1 — no screen history on disk; the brief capture temp file is
|
|
393
|
+
// materialized inline + unlinked at once). Denylist + volatility apply per frame via the reused worker ops.
|
|
394
|
+
// Concurrency is deliberately conservative: every watch tick enqueues a poll_change on the SAME single
|
|
395
|
+
// PowerShell worker the foreground tools use, so too many fast watches would starve the agent's own
|
|
396
|
+
// capture_screen/read_ui calls (codex MED). 4 watches at a 400ms floor caps worst-case worker pressure at
|
|
397
|
+
// ~10 polls/s; a global foreground-priority queue is the documented next step if this proves tight.
|
|
398
|
+
const MAX_WATCHES = 4; // concurrent background watches cap
|
|
399
|
+
const WATCH_MIN_INTERVAL = 400, WATCH_MAX_INTERVAL = 5000;
|
|
400
|
+
const WATCH_DEFAULT_INTERVAL = 600;
|
|
401
|
+
const WATCH_MAX_DURATION_MS = 30 * 60 * 1000; // auto-stop a forgotten watch (privacy §8 + runaway guard)
|
|
402
|
+
const EVENT_RING_MAX = 64; // max buffered events per watch (oldest dropped when full)
|
|
403
|
+
const EVENT_TTL_MS = 5 * 60 * 1000; // buffered events older than this are evicted unread (§24.1 TTL)
|
|
404
|
+
const EVENT_IMG_MAX_BYTES = 4 * 1024 * 1024; // per-event inline image cap
|
|
405
|
+
const WATCH_BUF_MAX_BYTES = 24 * 1024 * 1024; // total inline image bytes held across one watch buffer
|
|
406
|
+
const GET_EVENTS_MAX = 12; // events returned per get_events call
|
|
407
|
+
const GET_EVENTS_MAX_IMAGES = 8; // image items returned per get_events call (MCP response bound)
|
|
408
|
+
// Round a numeric-looking value to two decimals; anything that does not coerce to a finite
// number is handed back untouched.
const round2 = (n) => {
  const num = Number(n);
  if (!Number.isFinite(num)) return n;
  return Math.round(num * 100) / 100;
};
|
|
409
|
+
|
|
410
|
+
/**
 * Translate caller watch arguments into the worker's target-argument object.
 * Only the fields that were supplied are emitted; region is flattened to "x,y,w,h".
 */
function buildWatchTargetArgs(a) {
  const { region, window: title, monitor, boxW, boxH, detail } = a;
  return {
    ...(region ? { region: `${region.x},${region.y},${region.w},${region.h}` } : {}),
    ...(title ? { windowMatch: String(title) } : {}),
    ...(monitor != null ? { monitor: String(monitor) } : {}),
    ...(boxW ? { boxW } : {}),
    ...(boxH ? { boxH } : {}),
    ...(detail ? { detail: String(detail) } : {}),
  };
}
|
|
420
|
+
/**
 * Human-readable label for a watch target. Precedence: region, then window, then monitor;
 * with none of them given the label is 'cursor'.
 */
function watchTargetLabel(a) {
  const { region, window: title, monitor } = a;
  if (region) {
    const { x, y, w, h } = region;
    return `region ${x},${y} ${w}x${h}`;
  }
  if (title) return `window "${title}"`;
  return monitor != null ? `monitor ${monitor}` : 'cursor';
}
|
|
426
|
+
|
|
427
|
+
// ── reflex path: fixed-phrase / OCR readout spoken LOCALLY, no cloud round-trip (design §22.3) ──
|
|
428
|
+
// A registered trigger crossing fires beep / say(fixed phrase) / ocr(read the region's text) directly from
|
|
429
|
+
// the watch loop. Speech goes through the GLOBAL speech safety (codex r1): screen-derived text (ocr) is
|
|
430
|
+
// never voiced raw — it gets a "화면 글자:" provenance prefix + control/secret shaping + a per-minute
|
|
431
|
+
// utterance & seconds budget with no-overlap and auto-mute. OCR uses the in-box Windows PowerShell 5.1
|
|
432
|
+
// (pwsh 7 can't load WinRT OCR); TTS uses pwsh 7 System.Speech. Both are spawned (non-blocking) and
|
|
433
|
+
// degrade silently if absent. The OCR crop comes from the SAME denylist-gated worker capture (never an
|
|
434
|
+
// arbitrary file), so a denylisted window blocks reflex OCR too (codex r1 MED).
|
|
435
|
+
const PS51 = join(process.env.WINDIR || 'C:\\Windows', 'System32', 'WindowsPowerShell', 'v1.0', 'powershell.exe');
|
|
436
|
+
const OCR_SCRIPT = join(dir, 'ocr.ps1');
|
|
437
|
+
const SPEAK_SCRIPT = join(dir, 'speak.ps1');
|
|
438
|
+
const OCR_LANG = process.env.VORTEX_CU_OCR_LANG || 'ko';
|
|
439
|
+
const SPEAK_TOWAV_DIR = process.env.VORTEX_CU_SPEAK_TOWAV_DIR || ''; // test hook: render speech to WAV instead of audio
|
|
440
|
+
const speechBudget = new SpeechBudget(); // GLOBAL across all watches (one set of ears)
|
|
441
|
+
let speakingChild = null;
|
|
442
|
+
let speakSeq = 0;
|
|
443
|
+
|
|
444
|
+
const MAX_SPEAK_MS = 30000; // hard upper bound for one utterance so a hung TTS can't hold the no-overlap lock forever (codex r2 MED)
|
|
445
|
+
|
|
446
|
+
// Provenance for screen-derived speech (ocr/vision): by DEFAULT a self-documenting verbal prefix ("화면 글자:" …)
|
|
447
|
+
// so even a first-time listener knows the source — this is the prior HIGH control (voicing raw screen text is a
|
|
448
|
+
// social-engineering channel) and a chime alone can't convey it, so spoken stays the default (codex r1 HIGH).
|
|
449
|
+
// VORTEX_CU_SPEECH_PROVENANCE=earcon is an explicit opt-in to a non-verbal chime instead. Either way the source
|
|
450
|
+
// is marked. Agent/user-authored speech ('agent' via the `speak` tool, and a fixed 'say' phrase) is trusted
|
|
451
|
+
// content and carries NO provenance mark.
|
|
452
|
+
const SPEECH_PROVENANCE_EARCON = process.env.VORTEX_CU_SPEECH_PROVENANCE === 'earcon';
|
|
453
|
+
const EARCON_EST_MS = 250; // chime duration to reserve against the speech budget when earcon mode is on (codex r1 LOW)
|
|
454
|
+
|
|
455
|
+
// ── TTS engine: Supertonic (separate-install ONNX neural, higher quality) with Heami fallback ──
|
|
456
|
+
// The audio-RENDER step only. The safety/budget/provenance layer above (buildUtterance + speechBudget) is
|
|
457
|
+
// engine-agnostic; only the final spawn differs. Engine is resolved ONCE at startup (not per-utterance): 'auto'
|
|
458
|
+
// picks Supertonic when its models + onnxruntime-node are present, else the always-available Heami (speak.ps1).
|
|
459
|
+
// VORTEX_CU_TTS_ENGINE=auto|supertonic|heami. Models live in VORTEX_CU_TTS_MODEL_DIR (fetch-supertonic.mjs writes
|
|
460
|
+
// the default ~/.vortex/computer-use/supertonic-3). Voice VORTEX_CU_TTS_VOICE (F1..F5/M1..M5), lang follows OCR_LANG.
|
|
461
|
+
const SPEAK_SUPERTONIC = join(dir, 'speak-supertonic.mjs');
|
|
462
|
+
const TTS_ENGINE_CFG = (process.env.VORTEX_CU_TTS_ENGINE || 'auto').toLowerCase();
|
|
463
|
+
const TTS_MODEL_DIR = process.env.VORTEX_CU_TTS_MODEL_DIR || join(homedir(), '.vortex', 'computer-use', 'supertonic-3');
|
|
464
|
+
const TTS_VOICE = process.env.VORTEX_CU_TTS_VOICE || 'F1';
|
|
465
|
+
const TTS_LANG = process.env.VORTEX_CU_TTS_LANG || OCR_LANG;
|
|
466
|
+
// Speech-rate multiplier (config `tts.speed` / env VORTEX_CU_TTS_SPEED). Empty/invalid -> each engine keeps its own
|
|
467
|
+
// default (Supertonic 1.05; Heami SAPI rate 0). Supertonic takes the multiplier verbatim as --speed; Heami's SAPI
|
|
468
|
+
// rate is an integer -10..10, so map the multiplier linearly around 1.0 (1.18 -> +2) and clamp. The multiplier itself
|
|
469
|
+
// is clamped to 0.5..2.0 so a config typo (e.g. 18 for 1.8) can't produce unusable audio. Unset or blank ('' /
|
|
470
|
+
// whitespace) -> engine default (blank is checked BEFORE Number(), since Number('') is 0, not NaN); a non-numeric
|
|
471
|
+
// value -> NaN -> engine default; an explicit 0 is a real (invalid) speed that clamps up to 0.5.
|
|
472
|
+
// Raw override exactly as found in the environment (undefined when unset).
const TTS_SPEED_ENV = process.env.VORTEX_CU_TTS_SPEED;
// Unset or blank -> NaN (blank is screened before Number(), because Number('') is 0);
// anything else is coerced, so a non-numeric value also ends up NaN.
const TTS_SPEED_RAW = (() => {
  if (TTS_SPEED_ENV == null) return NaN;
  if (String(TTS_SPEED_ENV).trim() === '') return NaN;
  return Number(TTS_SPEED_ENV);
})();
// Multiplier clamped to 0.5..2.0, or null meaning "leave the engine's own default".
const TTS_SPEED = Number.isFinite(TTS_SPEED_RAW) ? Math.min(2.0, Math.max(0.5, TTS_SPEED_RAW)) : null;
// Integer rate in -10..10 for the Heami engine, mapped linearly around a multiplier of 1.0.
const HEAMI_RATE = TTS_SPEED == null ? null : Math.min(10, Math.max(-10, Math.round((TTS_SPEED - 1) * 10)));
|
|
476
|
+
/**
 * Report whether the Supertonic engine can be used: all model files exist under
 * TTS_MODEL_DIR/onnx, the configured voice style file exists, and `onnxruntime-node` resolves.
 * Any failure (including a thrown resolve) yields false.
 */
function supertonicAvailable() {
  const requiredFiles = [
    'duration_predictor.onnx',
    'text_encoder.onnx',
    'vector_estimator.onnx',
    'vocoder.onnx',
    'tts.json',
    'unicode_indexer.json',
  ];
  try {
    const onnxDir = join(TTS_MODEL_DIR, 'onnx');
    for (const name of requiredFiles) {
      if (!existsSync(join(onnxDir, name))) return false;
    }
    const voiceFile = join(TTS_MODEL_DIR, 'voice_styles', `${TTS_VOICE}.json`);
    if (!existsSync(voiceFile)) return false;
    import.meta.resolve('onnxruntime-node'); // throws if the optional dep isn't installed
    return true;
  } catch {
    return false;
  }
}
|
|
486
|
+
// 'heami' forces SAPI; 'supertonic'/'auto' use Supertonic when actually available, else fall back (never go mute).
|
|
487
|
+
// Resolved engine: Supertonic only when the config asks for 'supertonic' or 'auto' AND it is
// actually available; every other case (including 'heami' and unknown values) uses Heami.
// The availability probe is skipped entirely unless the config could select Supertonic.
const TTS_ENGINE = (['supertonic', 'auto'].includes(TTS_ENGINE_CFG) && supertonicAvailable())
  ? 'supertonic'
  : 'heami';
|
|
489
|
+
|
|
490
|
+
// Speak a finalized utterance without blocking the watch loop.
|
|
491
|
+
// kind: 'agent' (the `speak` tool — agent's judged words, no mark, redacted) | 'say' (fixed phrase, no mark)
|
|
492
|
+
// | 'ocr' | 'vision' (screen-derived, untrusted — marked + shaped).
|
|
493
|
+
/**
 * Speak one utterance through the configured TTS engine without blocking the caller.
 *
 * @param {'agent'|'say'|'ocr'|'vision'} kind Source of the text; 'ocr' and 'vision' are treated
 *   as screen-derived.
 * @param {string} text Text to voice.
 * @returns {{ok: true, uttered: string} | {ok: false, reason: string}} The budget's own refusal
 *   object is returned as-is when the reservation is denied.
 */
function reflexSpeak(kind, text) {
  const isScreenText = kind === 'ocr' || kind === 'vision';
  const useEarcon = isScreenText && SPEECH_PROVENANCE_EARCON;
  // In earcon mode the chime marks the source, so the utterance is only sanitized rather than
  // built with the spoken prefix.
  const utt = useEarcon ? sanitizeForSpeech(text) : buildUtterance(kind, text);
  if (!utt) return { ok: false, reason: 'empty' };

  const res = speechBudget.tryReserve(estimateSpeechMs(utt) + (useEarcon ? EARCON_EST_MS : 0), Date.now());
  if (!res.ok) return res;

  try {
    // Test hook target: each rendered utterance gets the next sequence number.
    const nextWavPath = () => join(SPEAK_TOWAV_DIR, `utt-${++speakSeq}.wav`);
    let exe;
    let argv;
    if (TTS_ENGINE === 'supertonic') {
      exe = process.execPath;
      argv = [SPEAK_SUPERTONIC, '--text', utt, '--voice', TTS_VOICE, '--lang', TTS_LANG, '--model-dir', TTS_MODEL_DIR];
      if (TTS_SPEED != null) argv.push('--speed', String(TTS_SPEED));
      if (useEarcon) argv.push('--earcon');
      if (SPEAK_TOWAV_DIR) argv.push('--to-wav', nextWavPath());
    } else {
      exe = 'pwsh';
      argv = ['-NoProfile', '-File', SPEAK_SCRIPT, '-Text', utt];
      if (HEAMI_RATE != null) argv.push('-Rate', String(HEAMI_RATE));
      if (useEarcon) argv.push('-Earcon', 'screen');
      if (SPEAK_TOWAV_DIR) argv.push('-ToWav', nextWavPath());
    }
    const child = spawn(exe, argv, { stdio: 'ignore' });
    speakingChild = child;

    // Release the reservation at most once per child, and only while this child is still the
    // active speaker, so a late or duplicate exit/error event cannot free a newer utterance's slot.
    let released = false;
    const finish = () => {
      if (released) return;
      released = true;
      clearTimeout(watchdog);
      if (speakingChild !== child) return;
      speakingChild = null;
      speechBudget.release();
    };
    // Hard upper bound: kill a speaker that runs past MAX_SPEAK_MS.
    const watchdog = setTimeout(() => {
      try { child.kill(); } catch {}
      finish();
    }, MAX_SPEAK_MS);
    if (watchdog.unref) watchdog.unref();
    child.on('exit', finish);
    child.on('error', finish);
    return { ok: true, uttered: utt };
  } catch {
    speechBudget.release();
    return { ok: false, reason: 'spawn-failed' };
  }
}
|
|
531
|
+
|
|
532
|
+
// OCR a worker-captured temp PNG via the 5.1 helper, hard time-bounded; returns recognized text or null.
|
|
533
|
+
// Resolves only AFTER the child has CLOSED (even on timeout: kill, then wait for close) so the PNG file
|
|
534
|
+
// handle is released before the caller unlinks the crop — otherwise a still-open handle leaves a screen
|
|
535
|
+
// crop on disk (codex r2 MED).
|
|
536
|
+
/**
 * OCR an image file by spawning the OCR helper script, bounded to 6 seconds and 64K characters
 * of output. The promise settles on the child's `close` (or `error`) event, so after a timeout
 * kill it still waits for the process to close.
 *
 * @param {string} pngPath Path of the image to recognize.
 * @returns {Promise<string|null>} Recognized text, or null on spawn failure, timeout, overflow,
 *   non-JSON output, or a helper reply without `ok`.
 */
function runOcr(pngPath) {
  return new Promise((resolve) => {
    let child;
    try {
      child = spawn(
        PS51,
        ['-NoProfile', '-NonInteractive', '-ExecutionPolicy', 'Bypass', '-File', OCR_SCRIPT, '-ImagePath', pngPath, '-Lang', OCR_LANG],
        { stdio: ['ignore', 'pipe', 'ignore'] },
      );
    } catch {
      resolve(null);
      return;
    }

    let collected = '';
    let finished = false;
    let aborted = false;
    // Mark the run as aborted and kill the child; the result is then discarded on close.
    const abortChild = () => {
      aborted = true;
      try { child.kill(); } catch {}
    };
    const watchdog = setTimeout(abortChild, 6000);
    const finish = (value) => {
      if (finished) return;
      finished = true;
      clearTimeout(watchdog);
      resolve(value);
    };

    child.stdout.setEncoding('utf8');
    child.stdout.on('data', (piece) => {
      collected += piece;
      if (collected.length > 65536) abortChild();
    });
    child.on('error', () => finish(null));
    child.on('close', () => {
      if (aborted) {
        finish(null);
        return;
      }
      try {
        const reply = JSON.parse(collected.trim());
        finish(reply && reply.ok ? reply.text : null);
      } catch {
        finish(null);
      }
    });
  });
}
|
|
550
|
+
|
|
551
|
+
// ── local VLM "middle path" (design §22.3 / §23.2 / §24): OPTIONAL, GPU-gated, off unless a trusted fast
|
|
552
|
+
// local endpoint is reachable. The reflex/brain paths work with no GPU; this only adds a smarter local
|
|
553
|
+
// description when the hardware allows. Capability is PROBED per session (never stored), with a SYNTHETIC
|
|
554
|
+
// image first (never a real crop before the endpoint is trusted), and gated on a measured latency SLA.
|
|
555
|
+
const VLM = parseVlmConfig();
|
|
556
|
+
const VLM_PROBE_TTL = 5 * 60 * 1000;
|
|
557
|
+
let vlmProbe = null, vlmProbeAt = 0, vlmProbing = null;
|
|
558
|
+
|
|
559
|
+
const VLM_MAX_CROP_BYTES = 6 * 1024 * 1024; // bound the data: URL we send (crop is already size-bounded; defence-in-depth)
|
|
560
|
+
const VLM_MAX_RESP_BYTES = 256 * 1024; // a short description reply is tiny — cap a hostile/huge response (codex MED)
|
|
561
|
+
|
|
562
|
+
// Read a response body up to maxBytes, then stop (so a huge/streaming reply can't exhaust memory).
|
|
563
|
+
/**
 * Read a response body as UTF-8 text, giving up once it exceeds a byte cap.
 *
 * @param {{body?: any, text?: Function}} r Fetch-style response.
 * @param {number} maxBytes Maximum body size in bytes.
 * @returns {Promise<string|null>} The body text, or null when it is larger than maxBytes.
 */
async function readCappedText(r, maxBytes) {
  const reader = r.body && r.body.getReader ? r.body.getReader() : null;
  if (!reader) {
    // No stream available: the body has to be read whole, then measured. Measure in BYTES
    // (string .length counts UTF-16 units, which under-counts multi-byte text) so both paths
    // enforce the same cap.
    const t = await r.text();
    return Buffer.byteLength(t, 'utf8') <= maxBytes ? t : null;
  }
  let total = 0;
  const parts = [];
  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    if (!value) continue;
    total += value.length;
    if (total > maxBytes) {
      // Stop pulling data as soon as the cap is crossed.
      try { await reader.cancel(); } catch {}
      return null;
    }
    parts.push(Buffer.from(value));
  }
  return Buffer.concat(parts).toString('utf8');
}
|
|
574
|
+
|
|
575
|
+
/**
 * POST a chat-completions request to the configured VLM endpoint with a hard timeout and a
 * bounded response size.
 *
 * @param {object} body Request body, serialized as JSON.
 * @param {number} timeoutMs Abort the request (including reading the reply) after this long.
 * @returns {Promise<{ok: true, json: any} | {ok: false, status?: number, redirected?: boolean, error?: string}>}
 *   Never rejects; a timeout is reported as error 'timeout'.
 */
async function httpChat(body, timeoutMs) {
  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), timeoutMs);
  // An unread response body keeps its connection occupied until garbage collection, so every
  // early return below cancels the body it is not going to read. Best-effort only.
  const discardBody = (resp) => {
    try {
      const p = resp.body?.cancel();
      if (p && typeof p.catch === 'function') p.catch(() => {});
    } catch {}
  };
  try {
    const headers = { 'content-type': 'application/json' };
    if (VLM.key) headers.authorization = `Bearer ${VLM.key}`;
    // redirect:'manual' so a 3xx can't replay this POST (possibly carrying a real crop) to a
    // DIFFERENT host; any non-2xx, redirects included, is treated as a failure.
    const r = await fetch(`${VLM.endpoint}/chat/completions`, { method: 'POST', headers, body: JSON.stringify(body), signal: ctrl.signal, redirect: 'manual' });
    if (!r.ok || r.type === 'opaqueredirect') {
      discardBody(r);
      return { ok: false, status: r.status, redirected: r.type === 'opaqueredirect' };
    }
    const clen = Number(r.headers.get('content-length') || 0);
    if (clen && clen > VLM_MAX_RESP_BYTES) {
      discardBody(r);
      return { ok: false, error: 'response too large' };
    }
    // content-length may be absent or wrong, so the read itself is capped as well.
    const txt = await readCappedText(r, VLM_MAX_RESP_BYTES);
    if (txt === null) return { ok: false, error: 'response exceeded cap' };
    try { return { ok: true, json: JSON.parse(txt) }; } catch { return { ok: false, error: 'non-JSON response' }; }
  } catch (e) {
    return { ok: false, error: String((e && e.name === 'AbortError') ? 'timeout' : (e && e.message) || e) };
  } finally {
    clearTimeout(timer);
  }
}
|
|
593
|
+
|
|
594
|
+
// Probe the VLM with a SYNTHETIC image only (no real screen) — measure reachability + latency, confirm a
|
|
595
|
+
// usable reply. Cached per process for VLM_PROBE_TTL. Note: synthetic latency is a LOWER bound (a real crop
|
|
596
|
+
// is larger/slower), so it gates OUT a too-slow endpoint but doesn't guarantee a real call is within budget.
|
|
597
|
+
/**
 * Probe the VLM endpoint with the synthetic image and report whether it is usable.
 * The result is cached for VLM_PROBE_TTL, and concurrent callers share one in-flight probe.
 *
 * @returns {Promise<{available: boolean, tier: any, reason?: string, model?: any, latencyMs?: number}>}
 *   available=false when the config gate refuses, the endpoint fails, or the reply is slower
 *   than the configured SLA.
 */
async function probeVlm() {
  const gate = vlmGate(VLM);
  if (!gate.ok) return { available: false, reason: gate.reason, tier: gate.tier };
  const now = Date.now();
  if (vlmProbe && now - vlmProbeAt < VLM_PROBE_TTL) return vlmProbe;
  if (vlmProbing) return vlmProbing;
  const run = (async () => {
    const t0 = Date.now();
    const res = await httpChat(buildChatBody(VLM.model, '이미지가 보이면 ok 라고만 답해.', SYNTH_PNG_B64, 8), VLM.slaMs);
    const latencyMs = Date.now() - t0;
    let out;
    if (!res.ok) out = { available: false, reason: `endpoint not reachable (${res.error || 'http ' + res.status})`, tier: gate.tier };
    else if (latencyMs > VLM.slaMs) out = { available: false, reason: `too slow (${latencyMs}ms > ${VLM.slaMs}ms SLA)`, tier: gate.tier, latencyMs };
    else out = { available: true, tier: gate.tier, model: VLM.model, latencyMs };
    vlmProbe = out;
    vlmProbeAt = Date.now();
    return out;
  })();
  vlmProbing = run;
  // Clear the in-flight latch on success AND on failure. Without the failure branch a probe
  // that rejected once would stay latched, and every later call would get that same rejection
  // instead of probing again. The identity check keeps a newer probe's latch intact.
  const clearLatch = () => { if (vlmProbing === run) vlmProbing = null; };
  run.then(clearLatch, clearLatch);
  return run;
}
|
|
616
|
+
|
|
617
|
+
// Describe a worker-captured crop with the local VLM. Returns a short text or null. Output is UNTRUSTED
|
|
618
|
+
// (the caller speaks it via buildUtterance('vision', …) so it gets the "로컬 비전:" prefix + shaping + budget).
|
|
619
|
+
/**
 * Ask the local VLM to describe an image file.
 *
 * @param {string} pngPath Path of the image to send.
 * @returns {Promise<string|null>} Extracted reply text, or null when the file is unreadable or
 *   over VLM_MAX_CROP_BYTES, the request fails, or the reply has no text.
 */
async function runVlm(pngPath) {
  // Load the crop as base64, refusing files over the size bound.
  const loadCrop = () => {
    try {
      const tooBig = statSync(pngPath).size > VLM_MAX_CROP_BYTES;
      return tooBig ? null : readFileSync(pngPath).toString('base64');
    } catch {
      return null;
    }
  };
  const b64 = loadCrop();
  if (b64 === null) return null;

  const chatBody = buildChatBody(VLM.model, DEFAULT_VLM_PROMPT, b64, VLM.maxTokens);
  // Allow three times the SLA for the call, but never more than 20 seconds.
  const budgetMs = Math.min(20000, VLM.slaMs * 3);
  const reply = await httpChat(chatBody, budgetMs);
  return reply.ok ? (extractText(reply.json) || null) : null;
}
|
|
629
|
+
|
|
630
|
+
// Turn caller-supplied trigger specs into safe internal trigger objects. Triggers are evaluated against the
// watch's main target region. At most the first 8 entries are considered, non-object entries are skipped, every
// numeric knob is clamped, and an unknown action falls back to 'beep'. The `say` phrase is sanitized but not
// token-redacted (it is caller-written, not screen-derived).
// Returns an array of { action, threshold, cooldownMs, armed, pending, pendingTs, lastFireTs, fires, … } plus the
// action-specific field (`say`, `beep`, or `dwellMs`).
function parseTriggers(raw) {
  if (!Array.isArray(raw)) return [];
  const clamp = (value, lo, hi) => Math.min(hi, Math.max(lo, value));
  const parsed = [];
  for (const spec of raw.slice(0, 8)) {
    if (!spec || typeof spec !== 'object') continue;

    const action = ['say', 'ocr', 'vision'].includes(spec.action) ? spec.action : 'beep';
    const rawThreshold = Number(spec.threshold);
    const rawCooldown = Number(spec.cooldownMs);
    const trigger = {
      action,
      // Percent change that fires the trigger: default 12, kept within 0.5..100.
      threshold: clamp(Number.isFinite(rawThreshold) && rawThreshold > 0 ? rawThreshold : 12, 0.5, 100),
      // Minimum gap between fires: default 8 s, kept within 1.5 s..10 min.
      cooldownMs: clamp(Number.isFinite(rawCooldown) && rawCooldown > 0 ? Math.floor(rawCooldown) : 8000, 1500, 600000),
      armed: true,
      pending: false,
      pendingTs: 0,
      lastFireTs: 0,
      fires: 0,
    };

    switch (action) {
      case 'say':
        trigger.say = sanitizeForSpeech(String(spec.say || ''), { redactTokens: false }) || '알림';
        break;
      case 'beep':
        trigger.beep = ['info', 'warn', 'urgent'].includes(spec.beep) ? spec.beep : 'warn';
        break;
      default: {
        // 'ocr' and 'vision': dwell before reading, default 700 ms, kept within 0..3000.
        const rawDwell = Number(spec.dwellMs);
        trigger.dwellMs = clamp(Number.isFinite(rawDwell) && rawDwell >= 0 ? rawDwell : 700, 0, 3000);
      }
    }
    parsed.push(trigger);
  }
  return parsed;
}
|
|
650
|
+
|
|
651
|
+
// One background watch over a single fixed target. A self-rescheduling timer polls the worker for the change
// rate, feeds it to a NoiseFilter, and on each settled emit captures a frame into a bounded in-memory ring
// (TTL + count + byte caps). Optional reflex triggers (beep / say / ocr / vision) run on the raw per-tick change.
// Lifecycle: start() -> ticks -> _stop(reason) (frames dropped, metadata kept) -> dispose() (everything dropped).
class WatchSession {
  // watchId: caller-facing session id. a: the start_watch arguments (target, pollIntervalMs, filter knobs, triggers).
  constructor(watchId, a) {
    this.watchId = watchId;
    this.bgId = `__watch_${watchId}`; // worker poll-continuity slot, isolated from agent poll_change ids
    this.targetArgs = buildWatchTargetArgs(a);
    this.targetLabel = watchTargetLabel(a);
    const iv = Number(a.pollIntervalMs);
    this.pollIntervalMs = Math.min(
      WATCH_MAX_INTERVAL,
      Math.max(WATCH_MIN_INTERVAL, Number.isFinite(iv) && iv > 0 ? Math.floor(iv) : WATCH_DEFAULT_INTERVAL),
    );
    this.filter = new NoiseFilter(a);
    this.cfg = this.filter.cfg;
    this.outDir = mkdtempSync(join(tmpdir(), 'vortex-cu-watch-'));
    this.ring = []; // [{seq, ts, reason, peakPct, ..., _img?}]
    this.seq = 0;
    this.dropped = 0;
    this.bufBytes = 0; // bytes of frames currently held in `ring` (events with an `_img`)
    this.polls = 0;
    this.emitted = 0;
    this.redactedFrames = 0;
    this.lastChangePct = null;
    this.lastPollTs = null;
    this.lastError = null;
    this.consecErrors = 0;
    this.triggers = parseTriggers(a.triggers); // reflex triggers on the main target region (§22.3)
    this.reflexFires = 0;
    this.lastReflex = null;
    this.startedAt = Date.now();
    this.stopped = false;
    this.stopReason = null;
    this._timer = null;
  }

  // Kick off the loop and return the initial status. The first tick goes through the same guarded timer path as
  // every later one, so it cannot surface as an unhandled rejection.
  start() {
    this._schedule(0);
    return this.status();
  }

  // Arm the next tick after `delayMs` (default: the poll interval). No-op once stopped.
  _schedule(delayMs = this.pollIntervalMs) {
    if (this.stopped) return;
    this._timer = setTimeout(() => {
      this._tick().catch((e) => {
        // An unexpected throw must not kill the loop, but it must not vanish either: surface it via status().
        this.lastError = `tick failed: ${String((e && e.message) || e)}`;
        if (!this.stopped) this._schedule();
      });
    }, delayMs);
    if (this._timer.unref) this._timer.unref(); // don't keep the process alive just for the watch
  }

  // One poll cycle: evict expired events, enforce the max duration, ask the worker for the change rate, run the
  // noise filter (+ capture on emit) and the reflex triggers, then schedule the next tick.
  async _tick() {
    if (this.stopped) return;
    // Enforce the TTL on EVERY tick, not just on emit/get_events: a buffered frame must not outlive
    // EVENT_TTL_MS even if the client never polls (privacy §24.1).
    this._evict();
    if (Date.now() - this.startedAt > WATCH_MAX_DURATION_MS) {
      this._stop('max watch duration reached (auto-stopped)');
      return;
    }
    this.polls++;
    const reset = this.polls === 1;
    const wa = { ...this.targetArgs, watchId: this.bgId };
    if (reset) wa.reset = true; // first poll starts a fresh baseline in the worker
    const res = await viaWorker('poll_change', wa, OP_TIMEOUT_MS);
    // stop_watch/dispose may have fired during the await: don't resume into a torn-down session.
    if (this.stopped) return;
    const p = res.payload || {};
    if (res.isError) {
      this.consecErrors++;
      this.lastError = String(p.error || 'poll failed');
      if (this.consecErrors >= 5) {
        this._stop(`stopped after repeated poll errors: ${this.lastError}`);
        return;
      }
    } else {
      this.consecErrors = 0;
      this.lastChangePct = p.changePct != null ? round2(p.changePct) : null;
      this.lastPollTs = Date.now();
      if (p.redacted) {
        // A denylisted window overlaps the target -> no capture this frame. Treat as a blind gap (don't feed
        // the filter a fake diff); surface the count in status so the agent knows the watch is partially blind.
        this.redactedFrames++;
      } else {
        const baseline = p.baseline === true; // includes a silent worker stateReset (fresh baseline)
        const c = p.changePct != null ? p.changePct : 0;
        const emit = this.filter.push({ changePct: c, now: this.lastPollTs, baseline });
        if (emit) {
          try {
            await this._onEmit(emit, p);
          } catch (e) {
            this.lastError = `emit/capture failed: ${String((e && e.message) || e)}`;
          }
        }
        // Reflex triggers run on the RAW per-tick change (not the settled-event path): fast local beep/say/ocr.
        if (!baseline && this.triggers.length) {
          try {
            await this._evalReflexes(c, this.lastPollTs);
          } catch (e) {
            this.lastError = `reflex failed: ${String((e && e.message) || e)}`;
          }
        }
      }
    }
    if (!this.stopped) this._schedule();
  }

  // A settled change was emitted by the filter: capture the target and push an event (with the frame inline when
  // it fits EVENT_IMG_MAX_BYTES) onto the ring.
  async _onEmit(emit, pollPayload) {
    const cap = await viaWorker('capture', { ...this.targetArgs, outDir: this.outDir }, OP_TIMEOUT_MS);
    if (this.stopped) {
      // Stopped during the capture await: unlink the just-captured temp frame and drop it (don't push into a
      // disposed session). The worker may have recreated outDir to write that frame, so clear it again.
      materializeImages(cap.payload);
      this._discardOutDir();
      return;
    }
    const cp = cap.payload || {};
    const ev = {
      seq: ++this.seq,
      ts: Date.now(),
      reason: emit.reason,
      peakPct: round2(emit.peakPct),
      activeMs: emit.activeMs,
      target: this.targetLabel,
    };
    if (cap.isError) {
      ev.captureError = String(cp.error || 'capture failed');
    } else if (cp.redacted) {
      ev.redacted = true;
      ev.note = 'settled change detected but the frame was withheld (denylisted window in region)';
    } else {
      const items = materializeImages(cp); // reads the temp PNG inline as base64 and unlinks it (§8 volatility)
      const img = items[0];
      if (img) {
        const bytes = Buffer.from(img.data, 'base64').length;
        if (bytes <= EVENT_IMG_MAX_BYTES) {
          ev._img = img;
          ev.bytes = bytes;
          this.bufBytes += bytes;
        } else {
          ev.imageDropped = `image too large (${bytes} bytes)`;
        }
      }
      if (cp.outputSize) ev.outputSize = cp.outputSize;
      if (cp.approxTokens != null) ev.approxTokens = cp.approxTokens;
      if (cp.captureRect) ev.captureRect = cp.captureRect;
    }
    this.emitted++;
    this.ring.push(ev);
    this._evict();
  }

  // Reflex evaluation: per trigger, fire when the raw change crosses its threshold, with hysteresis (re-arm
  // only after it goes quiet), a per-trigger cooldown, and, for the frame-reading actions (ocr and vision, the
  // two actions parseTriggers gives a dwellMs), a dwell so we read a settled frame rather than a half-drawn
  // one. The reflex bypasses the noise-filter debounce for speed but keeps these throttles + the global
  // speech budget (no denial-of-attention).
  async _evalReflexes(changePct, now) {
    for (const trg of this.triggers) {
      // A pending dwell fires after dwellMs regardless of the current change (the point is a stable frame).
      if (trg.pending) {
        if (now - trg.pendingTs >= trg.dwellMs) {
          trg.pending = false;
          trg.armed = false;
          trg.lastFireTs = now;
          trg.fires++;
          await this._fireTrigger(trg);
        }
        continue;
      }
      if (changePct < trg.threshold * 0.5) trg.armed = true; // hysteresis: re-arm once it settles below half
      if (changePct < trg.threshold || !trg.armed) continue;
      if (now - trg.lastFireTs < trg.cooldownMs) continue; // per-trigger cooldown
      if (trg.action === 'ocr' || trg.action === 'vision') {
        // Dwell at least one tick, then read the frame.
        trg.pending = true;
        trg.pendingTs = now;
        continue;
      }
      trg.armed = false;
      trg.lastFireTs = now;
      trg.fires++;
      await this._fireTrigger(trg);
    }
  }

  // Execute one trigger: beep, speak the fixed phrase, or capture the region and read it aloud (VLM or OCR).
  async _fireTrigger(trg) {
    if (this.stopped) return;
    if (trg.action === 'beep') {
      this._reflexNote(trg, 'beep');
      await viaWorker('beep', { pattern: trg.beep });
      return;
    }
    if (trg.action === 'say') {
      const r = reflexSpeak('say', trg.say);
      this._reflexNote(trg, r.ok ? 'say' : `say-skip:${r.reason}`);
      return;
    }
    // 'vision' uses the local VLM if it's available this session; otherwise it degrades to OCR.
    let kind = trg.action; // 'ocr' | 'vision'
    if (kind === 'vision') {
      const pv = await probeVlm();
      if (this.stopped) return;
      if (!pv.available) kind = 'ocr'; // graceful degrade -> read text
    }
    // Outcome label: surface the degrade ("vision→ocr") in EVERY branch (redacted/nocapture/empty/said).
    const tag = (k) => (trg.action === 'vision' && k === 'ocr') ? 'vision→ocr' : k;
    // Capture the region through the SAME denylist-gated worker path (never an arbitrary file), then read it.
    const cap = await viaWorker('capture', { ...this.targetArgs, outDir: this.outDir }, OP_TIMEOUT_MS);
    if (this.stopped) {
      // Same teardown as _onEmit: drop the frame and remove the directory the worker may have recreated.
      materializeImages(cap.payload);
      this._discardOutDir();
      return;
    }
    const cp = cap.payload || {};
    if (cp.redacted) { this._reflexNote(trg, `${tag(kind)}-redacted`); return; } // denylisted window in region -> stay blind
    if (cap.isError || !cp.path) { this._reflexNote(trg, `${tag(kind)}-nocapture`); return; }
    let text = null;
    let usedKind = kind;
    try {
      if (kind === 'vision') {
        text = await runVlm(cp.path);
        // The probe said available but the LIVE call can still fail (model unloaded, timeout): degrade to OCR.
        if (text == null && !this.stopped) { usedKind = 'ocr'; text = await runOcr(cp.path); }
      } else {
        text = await runOcr(cp.path);
      }
    } finally {
      try { unlinkSync(cp.path); } catch {} // volatile: delete the crop after reading (§8); best-effort
    }
    if (this.stopped) {
      this._discardOutDir(); // stopped while reading: don't leave a recreated temp dir behind
      return;
    }
    if (!text) { this._reflexNote(trg, `${tag(usedKind)}-empty`); return; }
    // Screen-derived (untrusted) -> provenance chime (or verbal prefix in spoken mode).
    const r = reflexSpeak(usedKind === 'vision' ? 'vision' : 'ocr', text);
    this._reflexNote(trg, r.ok ? `${tag(usedKind)}-said` : `${tag(usedKind)}-skip:${r.reason}`);
  }

  // Record the outcome of a reflex for status().
  _reflexNote(trg, outcome) {
    this.reflexFires++;
    this.lastReflex = { ts: Date.now(), action: trg.action, outcome };
  }

  // Best-effort removal of the session temp dir; a failure here must never break the watch loop or teardown.
  _discardOutDir() {
    try { rmSync(this.outDir, { recursive: true, force: true }); } catch {}
  }

  // Drop events from the head of the ring until the TTL, the event-count cap and the byte cap all hold.
  _evict() {
    const cut = Date.now() - EVENT_TTL_MS;
    while (this.ring.length && this.ring[0].ts < cut) this._drop(this.ring.shift());
    while (this.ring.length > EVENT_RING_MAX) this._drop(this.ring.shift());
    while (this.bufBytes > WATCH_BUF_MAX_BYTES && this.ring.length > 0) this._drop(this.ring.shift());
  }

  // Account for an evicted event. Only an event that still holds its frame contributes to bufBytes: after
  // _clearFrames the `bytes` field stays on the record as metadata and must not be subtracted again.
  _drop(ev) {
    if (ev && ev._img && ev.bytes) this.bufBytes -= ev.bytes;
    this.dropped++;
  }

  // Remove and return up to `max` events from the head of the ring, with at most `maxImages` frames.
  // Returns { records, images, remaining }.
  drain(max = GET_EVENTS_MAX, maxImages = GET_EVENTS_MAX_IMAGES) {
    this._evict();
    // Drain only as far as we can return EVERYTHING we remove: stop BEFORE an image-bearing event that would
    // exceed the per-call image cap, leaving it (and the rest) buffered for the next call (never remove an
    // event whose frame we then have to discard). Metadata-only events keep draining up to `max`.
    const images = [];
    const records = [];
    const cap = Math.max(1, max);
    while (this.ring.length && records.length < cap) {
      const ev = this.ring[0];
      if (ev._img && images.length >= maxImages) break; // would overflow images: leave it buffered
      this.ring.shift();
      if (ev._img && ev.bytes) this.bufBytes -= ev.bytes; // cleared frames were already released
      const rec = { ...ev };
      delete rec._img;
      if (ev._img) { images.push(ev._img); rec.image = 'inline'; }
      records.push(rec);
    }
    return { records, images, remaining: this.ring.length };
  }

  // Snapshot of the session for tool responses. Zero/empty optional counters are reported as undefined.
  status() {
    this._evict();
    return {
      watchId: this.watchId,
      target: this.targetLabel,
      running: !this.stopped,
      stopReason: this.stopReason || undefined,
      pollIntervalMs: this.pollIntervalMs,
      polls: this.polls,
      lastChangePct: this.lastChangePct,
      ageMs: Date.now() - this.startedAt,
      buffered: this.ring.length,
      dropped: this.dropped,
      emitted: this.emitted,
      redactedFrames: this.redactedFrames || undefined,
      filterPhase: this.filter.status.phase,
      thresholds: {
        activityThreshold: this.cfg.activityThreshold,
        quietThreshold: this.cfg.quietThreshold,
        debounceQuietMs: this.cfg.debounceQuietMs,
        cooldownMs: this.cfg.cooldownMs,
        maxWaitMs: this.cfg.maxWaitMs,
      },
      triggers: this.triggers.length || undefined,
      reflexFires: this.reflexFires || undefined,
      lastReflex: this.lastReflex || undefined,
      lastError: this.lastError || undefined,
    };
  }

  // Stop the loop. On an INVOLUNTARY stop (auto-stop at max duration / repeated errors), the client may never
  // call stop_watch, so we must not keep screen frames in RAM indefinitely: drop the captured frames now and
  // remove the (otherwise empty) temp dir, but keep the lightweight metadata records so a later get_events can
  // still report what happened (privacy/TTL must hold without client cooperation, §24.1).
  _stop(reason) {
    if (this.stopped) return;
    this.stopped = true;
    this.stopReason = reason;
    if (this._timer) { clearTimeout(this._timer); this._timer = null; }
    this._clearFrames();
    this._discardOutDir();
  }

  // Strip pixel data from every buffered event, keeping the metadata records.
  _clearFrames() {
    for (const ev of this.ring) {
      if (ev._img) { delete ev._img; ev.image = 'cleared (watch stopped)'; }
    }
    this.bufBytes = 0;
  }

  // Stop (if still running) and drop everything, including metadata.
  dispose() {
    this._stop(this.stopReason || 'disposed');
    this.ring = [];
    this.bufBytes = 0;
  }
}
|
|
861
|
+
|
|
862
|
+
// Registry of watch sessions keyed by watchId: startWatch adds entries, stopWatch removes them, getEvents looks them up.
const watches = new Map();
|
|
863
|
+
|
|
864
|
+
// start_watch handler: validate the target, replace any session that already uses the id, enforce the
// concurrent-watch cap, then create and start a WatchSession.
// Returns { payload, isError }; on success the payload is the session status plus `ok`, `action`
// ('started' | 'restarted') and a usage hint.
function startWatch(a) {
  const reject = (payload) => ({ payload, isError: true });

  if (plat !== 'win32') return reject({ error: 'start_watch is currently Windows-only', platform: plat });

  // Require EXACTLY one fixed target (not "at least one") so the worker's implicit precedence can't make a
  // caller watch a different target than they passed. Cursor mode is disallowed (the cursor moves).
  const targetCount = [Boolean(a.region), Boolean(a.window), a.monitor != null].filter(Boolean).length;
  if (targetCount === 0) {
    return reject({ error: 'start_watch needs a fixed target — pass region, window, or monitor. Cursor mode is not allowed (the cursor moves, so every frame would differ).' });
  }
  if (targetCount > 1) {
    return reject({ error: 'start_watch takes exactly one target — pass only one of region, window, or monitor.' });
  }

  const watchId = a.watchId ? String(a.watchId) : 'default';

  // Starting an id that already exists restarts it.
  const previous = watches.get(watchId);
  const replaced = Boolean(previous);
  if (previous) {
    previous.dispose();
    watches.delete(watchId);
  }

  // At the cap: first prune sessions that are stopped and fully drained, then reject if still over.
  if (watches.size >= MAX_WATCHES) {
    for (const [id, candidate] of watches) {
      if (!candidate.stopped || candidate.ring.length !== 0) continue;
      candidate.dispose();
      watches.delete(id);
    }
  }
  if (watches.size >= MAX_WATCHES) {
    return reject({ error: `too many concurrent watches (max ${MAX_WATCHES}) — stop one first with stop_watch.` });
  }

  const session = new WatchSession(watchId, a);
  watches.set(watchId, session);
  const status = session.start();
  return {
    payload: {
      ok: true,
      action: replaced ? 'restarted' : 'started',
      ...status,
      hint: 'Watch runs in the background. Call get_events periodically to collect what changed; stop_watch when done.',
    },
    isError: false,
  };
}
|
|
891
|
+
|
|
892
|
+
// stop_watch handler. Without a watchId every session is disposed and the registry is emptied; with one, only
// that session is disposed and removed. Returns { payload, isError }; an unknown id is an error that lists the
// active ids.
function stopWatch(a) {
  const succeed = (payload) => ({ payload, isError: false });
  const watchId = a.watchId ? String(a.watchId) : null;

  if (!watchId) {
    const stopped = Array.from(watches.keys());
    let totalEmitted = 0;
    watches.forEach((session) => {
      totalEmitted += session.emitted;
      session.dispose();
    });
    watches.clear();
    return succeed({ ok: true, action: 'stopped-all', stopped, totalEmitted });
  }

  const session = watches.get(watchId);
  if (!session) {
    return { payload: { error: `no watch with id "${watchId}"`, active: Array.from(watches.keys()) }, isError: true };
  }

  // Take the summary BEFORE disposing: dispose() empties the ring, which would zero `discardedUnread`.
  const summary = {
    ok: true,
    action: 'stopped',
    watchId,
    polls: session.polls,
    emitted: session.emitted,
    dropped: session.dropped,
    discardedUnread: session.ring.length,
    ageMs: Date.now() - session.startedAt,
  };
  session.dispose();
  watches.delete(watchId);
  return succeed(summary);
}
|
|
907
|
+
|
|
908
|
+
// get_events handler: drain buffered events from one watch session. The events come back as text records and
// their settled frames as image items.
// Returns { result: { payload, isError }, images }. `a.max` is floored and clamped to 1..GET_EVENTS_MAX; a
// value that does not convert to a finite number selects GET_EVENTS_MAX.
function getEvents(a) {
  const watchId = a.watchId ? String(a.watchId) : 'default';
  const session = watches.get(watchId);
  if (!session) {
    const payload = { error: `no watch with id "${watchId}"`, active: [...watches.keys()] };
    return { result: { payload, isError: true }, images: [] };
  }

  const requested = Number(a.max);
  let limit = GET_EVENTS_MAX;
  if (Number.isFinite(requested)) {
    limit = Math.min(GET_EVENTS_MAX, Math.max(1, Math.floor(requested)));
  }

  const drained = session.drain(limit);
  // status() is taken after the drain so `buffered` reflects what is left.
  const payload = {
    watchId,
    events: drained.records,
    returned: drained.records.length,
    remaining: drained.remaining,
    status: session.status(),
  };
  return { result: { payload, isError: false }, images: drained.images };
}
|
|
918
|
+
|
|
919
|
+
const TOOLS = [
|
|
920
|
+
{
|
|
921
|
+
name: 'probe',
|
|
922
|
+
description: 'Check whether screen perception works in this environment + measure display/capture latency (P0.5).',
|
|
923
|
+
inputSchema: { type: 'object', properties: {}, additionalProperties: false },
|
|
924
|
+
},
|
|
925
|
+
{
|
|
926
|
+
name: 'read_ui',
|
|
927
|
+
description: 'Structured perception of the active/targeted window — UIA(Win)/AX(mac) tree: elements, roles, coordinates, text. Zero images, ~0 tokens (P1 primary).',
|
|
928
|
+
inputSchema: {
|
|
929
|
+
type: 'object',
|
|
930
|
+
properties: {
|
|
931
|
+
window: { type: 'string', description: 'Target by a substring of the window title. If omitted, foreground.' },
|
|
932
|
+
target: { type: 'string', enum: ['foreground', 'cursor'], description: 'Target to use when no window is given.' },
|
|
933
|
+
},
|
|
934
|
+
additionalProperties: false,
|
|
935
|
+
},
|
|
936
|
+
},
|
|
937
|
+
{
|
|
938
|
+
name: 'classify_activity',
|
|
939
|
+
description: 'Adaptive companion: classify what the user is doing on the foreground window — returns { class: GAME|DEV|MEDIA|BROWSING|PRODUCTIVITY|UNKNOWN, process, title, notificationState, interruptible, canvas, uiaCount, fullscreen, profile, needsChangeRate }. Read-only, zero images. Use it to pick a help profile/cadence; for GAME, sample poll_change to split fast-action (break-gated) vs strategy (periodic). See docs/adaptive-companion.md.',
|
|
940
|
+
inputSchema: { type: 'object', properties: {}, additionalProperties: false },
|
|
941
|
+
},
|
|
942
|
+
{
|
|
943
|
+
name: 'capture_screen',
|
|
944
|
+
description: 'Pixel capture — fallback for canvas/games that structured perception cannot read. Target (priority): region > window > monitor > (default) around the cursor. Returns a PNG path (volatility is the caller\'s job, §8).',
|
|
945
|
+
inputSchema: {
|
|
946
|
+
type: 'object',
|
|
947
|
+
properties: {
|
|
948
|
+
boxW: { type: 'number', description: 'Cursor-mode box width (default 600).' },
|
|
949
|
+
boxH: { type: 'number', description: 'Cursor-mode box height (default 400).' },
|
|
950
|
+
region: {
|
|
951
|
+
type: 'object',
|
|
952
|
+
description: 'Capture a fixed region (ignores cursor). Virtual-screen physical coordinates.',
|
|
953
|
+
properties: { x: { type: 'number' }, y: { type: 'number' }, w: { type: 'number' }, h: { type: 'number' } },
|
|
954
|
+
required: ['x', 'y', 'w', 'h'],
|
|
955
|
+
additionalProperties: false,
|
|
956
|
+
},
|
|
957
|
+
window: { type: 'string', description: 'Window title substring -> capture that window\'s region (ignores cursor). Tracks the window even as it moves.' },
|
|
958
|
+
monitor: { type: 'string', description: "Target a monitor (ignores cursor): 1-based index ('3') or 'primary'. For continuously watching a game-only monitor." },
|
|
959
|
+
detail: { type: 'string', description: "Resolution preset: 'gist' (flow only, small) / 'normal' (default) / 'text' (text/code, large). For reading text, prefer region/window + 'text' over a whole monitor." },
|
|
960
|
+
scale: { type: 'number', description: 'Upscale factor (small regions). When set, overrides detail.' },
|
|
961
|
+
maxSide: { type: 'number', description: 'Upper bound on the output longest side in px. When set, overrides detail.' },
|
|
962
|
+
},
|
|
963
|
+
additionalProperties: false,
|
|
964
|
+
},
|
|
965
|
+
},
|
|
966
|
+
{
|
|
967
|
+
name: 'watch_capture',
|
|
968
|
+
description: 'Capture a fixed target N times at a set interval (within one pwsh process — avoids per-frame re-spawn cost). With changeOnly, save only frames that changed from the previous one. Synchronous/blocking (waits for the response over frames×interval) — Windows-only PoC. Target priority: region > window > monitor > cursor.',
|
|
969
|
+
inputSchema: {
|
|
970
|
+
type: 'object',
|
|
971
|
+
properties: {
|
|
972
|
+
frames: { type: 'number', description: 'Number of frames to capture (>1).' },
|
|
973
|
+
intervalMs: { type: 'number', description: 'Interval between frames (ms, default 1000).' },
|
|
974
|
+
changeOnly: { type: 'boolean', description: 'Save only frames that changed from the previous one (default false).' },
|
|
975
|
+
changeThreshold: { type: 'number', description: 'Change-detection threshold % (default 2.0).' },
|
|
976
|
+
region: {
|
|
977
|
+
type: 'object',
|
|
978
|
+
description: 'Fixed region (ignores cursor). Virtual-screen physical coordinates.',
|
|
979
|
+
properties: { x: { type: 'number' }, y: { type: 'number' }, w: { type: 'number' }, h: { type: 'number' } },
|
|
980
|
+
required: ['x', 'y', 'w', 'h'],
|
|
981
|
+
additionalProperties: false,
|
|
982
|
+
},
|
|
983
|
+
window: { type: 'string', description: 'Window title substring (ignores cursor, tracks movement).' },
|
|
984
|
+
monitor: { type: 'string', description: "Target a monitor (ignores cursor): index or 'primary'." },
|
|
985
|
+
boxW: { type: 'number', description: 'Cursor-mode box width.' },
|
|
986
|
+
boxH: { type: 'number', description: 'Cursor-mode box height.' },
|
|
987
|
+
detail: { type: 'string', description: "Resolution preset: 'gist'/'normal'/'text'. An explicit scale/maxSide takes precedence." },
|
|
988
|
+
scale: { type: 'number', description: 'Upscale factor. When set, overrides detail.' },
|
|
989
|
+
maxSide: { type: 'number', description: 'Upper bound on the output longest side in px. When set, overrides detail.' },
|
|
990
|
+
},
|
|
991
|
+
additionalProperties: false,
|
|
992
|
+
},
|
|
993
|
+
},
|
|
994
|
+
{
|
|
995
|
+
name: 'poll_change',
|
|
996
|
+
description: 'Look at the screen once (async watch primitive) — capture the target once, compare with the previous shot, and immediately return only the change rate. The resident worker remembers the previous state per watchId (continuity). **By default, metadata only (changed, changePct; no image saved = token savings)** — pass includeImage:true to also get the image (path) only when you actually need to see the screen. "Watching alongside" is built by the agent repeatedly calling this tool (polling) every 1-2 seconds — non-blocking, the user can step in at any time. Use reset:true on start / target change. Windows-only.',
|
|
997
|
+
inputSchema: {
|
|
998
|
+
type: 'object',
|
|
999
|
+
properties: {
|
|
1000
|
+
region: {
|
|
1001
|
+
type: 'object',
|
|
1002
|
+
description: 'Fixed region (ignores cursor). Virtual-screen physical coordinates.',
|
|
1003
|
+
properties: { x: { type: 'number' }, y: { type: 'number' }, w: { type: 'number' }, h: { type: 'number' } },
|
|
1004
|
+
required: ['x', 'y', 'w', 'h'],
|
|
1005
|
+
additionalProperties: false,
|
|
1006
|
+
},
|
|
1007
|
+
window: { type: 'string', description: 'Window title substring (ignores cursor, tracks movement).' },
|
|
1008
|
+
monitor: { type: 'string', description: "Target a monitor (ignores cursor): index or 'primary'." },
|
|
1009
|
+
boxW: { type: 'number', description: 'Cursor-mode box width.' },
|
|
1010
|
+
boxH: { type: 'number', description: 'Cursor-mode box height.' },
|
|
1011
|
+
scale: { type: 'number', description: 'Upscale factor when includeImage. When set, overrides detail.' },
|
|
1012
|
+
maxSide: { type: 'number', description: 'Upper bound on the output longest side in px when includeImage. When set, overrides detail.' },
|
|
1013
|
+
changeThreshold: { type: 'number', description: 'Change-detection threshold % (default 2).' },
|
|
1014
|
+
watchId: { type: 'string', description: 'Watch-session id (default "default"). The previous state is remembered under this id.' },
|
|
1015
|
+
reset: { type: 'boolean', description: 'If true, discard the previous state and start a new baseline (on watch start / target change).' },
|
|
1016
|
+
detail: { type: 'string', description: "Resolution preset: 'gist'/'normal'/'text' (the saved image size when includeImage)." },
|
|
1017
|
+
includeImage: { type: 'boolean', description: 'Default false = metadata only (changePct etc., token savings). If true, also save and return the changed/baseline frame as a PNG (path).' },
|
|
1018
|
+
},
|
|
1019
|
+
additionalProperties: false,
|
|
1020
|
+
},
|
|
1021
|
+
},
|
|
1022
|
+
{
|
|
1023
|
+
name: 'beep',
|
|
1024
|
+
description: 'Sound alert — call this to emit a beep when you have a message to show the user during watching (so they notice while looking at a game / another screen). Recommended to call right before printing the message. Windows-only. (A precursor to future voice TTS.)',
|
|
1025
|
+
inputSchema: {
|
|
1026
|
+
type: 'object',
|
|
1027
|
+
properties: {
|
|
1028
|
+
pattern: { type: 'string', description: "Beep pattern: 'info' (once) / 'warn' (twice) / 'urgent' (three times). Default info." },
|
|
1029
|
+
count: { type: 'number', description: 'Repeat count (overrides pattern when set). 1-10.' },
|
|
1030
|
+
frequency: { type: 'number', description: 'Pitch in Hz (37-32767).' },
|
|
1031
|
+
durationMs: { type: 'number', description: 'Duration of one beep in ms.' },
|
|
1032
|
+
},
|
|
1033
|
+
additionalProperties: false,
|
|
1034
|
+
},
|
|
1035
|
+
},
|
|
1036
|
+
{
|
|
1037
|
+
name: 'speak',
|
|
1038
|
+
description:
|
|
1039
|
+
"Speak ONE short line aloud in the user's language through the local TTS — the AGENT tier of the watch design. This is how you act as a companion while they look at a game / another screen: you SEE the screen (capture_screen / get_events), UNDERSTAND it, and say something useful — strategic advice, a warning about a threat/opportunity they might miss, brief commentary, or an answer to their question. These are YOUR OWN judged words (trusted), so they are spoken WITHOUT a provenance mark. Therefore do NOT pipe RAW screen text through here — summarize/judge it into your own sentence first; for unjudged raw screen text use an `ocr` trigger instead (it is marked as screen-derived). Keep it to one concise sentence. Globally rate-limited and never overlaps reflex speech (one set of ears). Windows-only.",
|
|
1040
|
+
inputSchema: {
|
|
1041
|
+
type: 'object',
|
|
1042
|
+
properties: {
|
|
1043
|
+
text: { type: 'string', description: 'The sentence to speak — your own words (a summary/advice/answer), not a raw screen dump. Kept short; long text is capped.' },
|
|
1044
|
+
},
|
|
1045
|
+
required: ['text'],
|
|
1046
|
+
additionalProperties: false,
|
|
1047
|
+
},
|
|
1048
|
+
},
|
|
1049
|
+
{
|
|
1050
|
+
name: 'start_watch',
|
|
1051
|
+
description:
|
|
1052
|
+
'Start watching a fixed target in the BACKGROUND and return immediately (non-blocking). A built-in noise filter (debounce + cooldown) suppresses the per-frame ripple of video/games/scrolling and keeps only meaningful, SETTLED changes — so it works on screens that change every frame, where raw change-detection would flood you. Events accumulate in an in-memory buffer; collect them with get_events, end with stop_watch. Needs a fixed target (region/window/monitor — NOT the cursor). Tune with thresholds if needed. Windows-only. Auto-stops after 30 min.',
|
|
1053
|
+
inputSchema: {
|
|
1054
|
+
type: 'object',
|
|
1055
|
+
properties: {
|
|
1056
|
+
region: {
|
|
1057
|
+
type: 'object',
|
|
1058
|
+
description: 'Fixed region to watch (virtual-screen physical coordinates). Recommended for a game minimap/alert area.',
|
|
1059
|
+
properties: { x: { type: 'number' }, y: { type: 'number' }, w: { type: 'number' }, h: { type: 'number' } },
|
|
1060
|
+
required: ['x', 'y', 'w', 'h'],
|
|
1061
|
+
additionalProperties: false,
|
|
1062
|
+
},
|
|
1063
|
+
window: { type: 'string', description: 'Window title substring to watch (tracks the window as it moves).' },
|
|
1064
|
+
monitor: { type: 'string', description: "Monitor to watch: 1-based index ('2') or 'primary' (e.g. a game-only screen)." },
|
|
1065
|
+
watchId: { type: 'string', description: 'Watch-session id (default "default"). Use distinct ids to run several watches at once; starting an existing id restarts it.' },
|
|
1066
|
+
pollIntervalMs: { type: 'number', description: 'How often to sample the target (ms, default 600; clamped 400-5000). Lower = snappier + more CPU.' },
|
|
1067
|
+
detail: { type: 'string', description: "Resolution preset for the captured settled frame: 'gist'/'normal'/'text'." },
|
|
1068
|
+
activityThreshold: { type: 'number', description: 'Frame-to-frame % change that WAKES the filter (default 8). Set above your screen\'s ambient jitter (a playing video measures ~2.5-4%) and below a real transition (a scene cut ~16.8%).' },
|
|
1069
|
+
quietThreshold: { type: 'number', description: 'Frame-to-frame % below which a frame counts as "still" (default 5). Hysteresis: forced below activityThreshold.' },
|
|
1070
|
+
debounceQuietMs: { type: 'number', description: 'How long motion must stay quiet before the settled frame is emitted (ms, default 900 = quality gate).' },
|
|
1071
|
+
cooldownMs: { type: 'number', description: 'Minimum gap between events (ms, default 6000 = frequency cap; suppresses ripples of one activity).' },
|
|
1072
|
+
maxWaitMs: { type: 'number', description: 'If motion never settles, emit anyway this often (ms, default 8000 = anti-starvation for continuous motion).' },
|
|
1073
|
+
triggers: {
|
|
1074
|
+
type: 'array',
|
|
1075
|
+
description: 'Reflex triggers (design §22.3): fire a local alert the INSTANT the watched region changes past a threshold — no cloud LLM round-trip, so it reaches the user in well under a second. Actions: `beep` (sound), `say` (speak a FIXED pre-written phrase — safest), `ocr` (read the region\'s text aloud via offline OCR), or `vision` (describe the scene via a LOCAL vision model if one is configured + fast enough, else auto-degrades to ocr). Spoken screen content is marked as screen-derived (by default a verbal "화면 글자:" / "로컬 비전:" prefix; a non-verbal chime when VORTEX_CU_SPEECH_PROVENANCE=earcon) and shaped; speech is globally rate-limited. Use a `beep`/`say` reflex for "ping me the moment X happens"; for the deeper, JUDGED commentary (advice, warnings, answers) look at the get_events frame and voice your own sentence with the `speak` tool — raw `ocr` readout is just a fallback.',
|
|
1076
|
+
maxItems: 8,
|
|
1077
|
+
items: {
|
|
1078
|
+
type: 'object',
|
|
1079
|
+
properties: {
|
|
1080
|
+
action: { type: 'string', enum: ['beep', 'say', 'ocr', 'vision'], description: "What to do on a crossing." },
|
|
1081
|
+
threshold: { type: 'number', description: 'Frame-to-frame % change that fires this trigger (default 12).' },
|
|
1082
|
+
say: { type: 'string', description: "For action 'say': the fixed phrase to speak (e.g. '적 출현')." },
|
|
1083
|
+
beep: { type: 'string', description: "For action 'beep': 'info'/'warn'/'urgent' (default warn)." },
|
|
1084
|
+
cooldownMs: { type: 'number', description: 'Minimum gap between firings of this trigger (ms, default 8000).' },
|
|
1085
|
+
dwellMs: { type: 'number', description: "For action 'ocr': wait this long after the change before reading, so the frame is settled (ms, default 700)." },
|
|
1086
|
+
},
|
|
1087
|
+
required: ['action'],
|
|
1088
|
+
additionalProperties: false,
|
|
1089
|
+
},
|
|
1090
|
+
},
|
|
1091
|
+
},
|
|
1092
|
+
additionalProperties: false,
|
|
1093
|
+
},
|
|
1094
|
+
},
|
|
1095
|
+
{
|
|
1096
|
+
name: 'get_events',
|
|
1097
|
+
description:
|
|
1098
|
+
'Collect the changes a background watch (start_watch) has buffered since the last call — non-blocking, batched (so a long watch costs only a few looks). Returns one record per settled change (time, reason, change magnitude, capture metadata) plus the settled frames as inline images, and a status block (polls, buffered, dropped, filter phase). Drains what it returns; call again for any remainder.',
|
|
1099
|
+
inputSchema: {
|
|
1100
|
+
type: 'object',
|
|
1101
|
+
properties: {
|
|
1102
|
+
watchId: { type: 'string', description: 'Which watch to collect from (default "default").' },
|
|
1103
|
+
max: { type: 'number', description: 'Max events to return this call (default/cap 12). Remaining stay buffered.' },
|
|
1104
|
+
},
|
|
1105
|
+
additionalProperties: false,
|
|
1106
|
+
},
|
|
1107
|
+
},
|
|
1108
|
+
{
|
|
1109
|
+
name: 'stop_watch',
|
|
1110
|
+
description: 'Stop a background watch and discard its buffer + in-memory frames. Omit watchId to stop ALL watches. Returns a summary (polls, events emitted, unread discarded).',
|
|
1111
|
+
inputSchema: {
|
|
1112
|
+
type: 'object',
|
|
1113
|
+
properties: {
|
|
1114
|
+
watchId: { type: 'string', description: 'Which watch to stop. Omit to stop every active watch.' },
|
|
1115
|
+
},
|
|
1116
|
+
additionalProperties: false,
|
|
1117
|
+
},
|
|
1118
|
+
},
|
|
1119
|
+
];
|
|
1120
|
+
|
|
1121
|
+
// The CallTool handler — standalone (no SDK types), wired to the server only on the serve path.
/**
 * Dispatch one MCP `tools/call` request to the matching backend and shape the reply.
 *
 * On Windows the fast one-shot tools go through the resident worker (`viaWorker`); elsewhere (and for
 * `classify_activity` / `watch_capture`) a per-call backend process is spawned. Screenshots are written
 * to a per-request temp dir that is removed in `finally` on every exit path.
 *
 * @param {{ params: { name: string, arguments?: object | null } }} req  The tools/call request.
 * @returns {Promise<{ content: object[], isError?: boolean }>}  A text item holding the JSON payload,
 *   followed by any image items; `isError` mirrors the backend result.
 */
async function handleCallTool(req) {
  const { name, arguments: rawArgs } = req.params;
  // Work on a shallow copy: a missing/null/non-object `arguments` becomes {}, and the clamps below
  // never write into the caller-owned request object.
  const a = rawArgs !== null && typeof rawArgs === 'object' && !Array.isArray(rawArgs) ? { ...rawArgs } : {};
  const useWorker = plat === 'win32'; // the resident worker is Windows PowerShell backend only
  let result;
  let reqDir = null; // per-request temp dir for screenshots — deleted in finally on EVERY exit path

  // Small local helpers for the pieces every capture-style tool repeats.
  const makeReqDir = () => mkdtempSync(join(tmpdir(), 'vortex-cu-'));
  const regionArg = (r) => `${r.x},${r.y},${r.w},${r.h}`;
  const windowsOnly = (tool) => ({ payload: { error: `${tool} is currently Windows-only`, platform: plat }, isError: true });

  // Clamp output-size knobs at the MCP boundary so a huge upscale can't blow up the response / PS render.
  if (a.scale != null) {
    const s = Number(a.scale);
    a.scale = Number.isFinite(s) ? Math.min(8, Math.max(0.1, s)) : undefined;
  }
  if (a.maxSide != null) {
    const m = Number(a.maxSide);
    a.maxSide = Number.isFinite(m) ? Math.min(4096, Math.max(16, Math.floor(m))) : undefined;
  }

  try {
    if (name === 'probe') {
      result = useWorker ? await viaWorker('probe', {}) : runBackend('probe');
      // If a local VLM is configured, also report its availability (synthetic-image probe — no real screen sent).
      // This is the only place probe touches the network, and only when the user opted in by setting an endpoint.
      if (VLM.enabled && result && result.payload && typeof result.payload === 'object') {
        try {
          result.payload.vlm = await probeVlm();
        } catch (e) {
          result.payload.vlm = { available: false, reason: String((e && e.message) || e) };
        }
      }
    } else if (name === 'read_ui') {
      if (useWorker) {
        const wa = {};
        if (a.window) wa.windowMatch = String(a.window);
        if (a.target) wa.target = String(a.target);
        result = await viaWorker('read_ui', wa);
      } else {
        const args = [];
        if (a.window) args.push('-WindowMatch', String(a.window));
        if (a.target) args.push('-Target', String(a.target));
        result = runBackend('read', args);
      }
    } else if (name === 'classify_activity') {
      if (plat !== 'win32') {
        result = windowsOnly('classify_activity');
      } else {
        const raw = runBackend('classify', [], 8000); // fast one-shot; 8s hard timeout so a hung UIA call can't freeze the loop
        if (raw.isError) {
          result = raw;
        } else {
          try {
            const opts = { profiles: COMPANION_PROFILES };
            // Only forward the env override when it parses to a real number; a typo must not inject NaN.
            const canvasMaxEnv = process.env.VORTEX_CU_UIA_CANVAS_MAX;
            if (canvasMaxEnv) {
              const canvasMax = Number(canvasMaxEnv);
              if (Number.isFinite(canvasMax)) opts.uiaCanvasMax = canvasMax;
            }
            const d = classifyActivity(raw.payload, opts);
            const p = raw.payload || {};
            result = {
              payload: {
                ...d,
                redacted: !!p.redacted,
                reason: p.reason,
                procId: p.procId,
                hwnd: p.hwnd,
                uiaCapped: p.uiaCapped,
                uiaOk: p.uiaOk,
                notificationStateCode: p.notificationState,
              },
            };
          } catch (e) {
            result = { payload: { error: 'classify failed', detail: String((e && e.message) || e), raw: raw.payload }, isError: true };
          }
        }
      }
    } else if (name === 'capture_screen') {
      reqDir = makeReqDir();
      if (useWorker) {
        const wa = { outDir: reqDir };
        if (a.region) wa.region = regionArg(a.region);
        if (a.window) wa.windowMatch = String(a.window);
        if (a.monitor != null) wa.monitor = String(a.monitor);
        if (a.boxW) wa.boxW = a.boxW;
        if (a.boxH) wa.boxH = a.boxH;
        if (a.detail) wa.detail = String(a.detail);
        if (a.scale != null) wa.scale = a.scale;
        if (a.maxSide != null) wa.maxSide = a.maxSide;
        result = await viaWorker('capture', wa);
      } else {
        const args = ['-OutDir', reqDir];
        if (a.boxW) args.push('-BoxW', String(a.boxW));
        if (a.boxH) args.push('-BoxH', String(a.boxH));
        result = runBackend('capture', args);
      }
    } else if (name === 'watch_capture') {
      // watch isn't run on the resident worker (avoids blocking other calls with a long occupation) -> always per-call spawn.
      if (plat !== 'win32') {
        result = windowsOnly('watch_capture');
      } else {
        // Hard caps so one watch call can't run unbounded; async runner so it never blocks the server.
        reqDir = makeReqDir();
        const MAX_FRAMES = 60;
        const MIN_INTERVAL = 100;
        const MAX_INTERVAL = 10000;
        const MAX_WATCH_MS = 180000;
        const reqFrames = Number(a.frames);
        const reqInterval = Number(a.intervalMs);
        const frames = Math.min(MAX_FRAMES, Math.max(1, Number.isFinite(reqFrames) && reqFrames > 0 ? Math.floor(reqFrames) : 2));
        const intervalMs = Math.min(MAX_INTERVAL, Math.max(MIN_INTERVAL, Number.isFinite(reqInterval) && reqInterval > 0 ? Math.floor(reqInterval) : 1000));
        const args = ['-WatchFrames', String(frames), '-IntervalMs', String(intervalMs), '-OutDir', reqDir];
        if (a.changeOnly) args.push('-ChangeOnly');
        if (a.changeThreshold != null) args.push('-ChangeThreshold', String(a.changeThreshold));
        if (a.region) args.push('-Region', regionArg(a.region));
        if (a.window) args.push('-WindowMatch', String(a.window));
        if (a.monitor != null) args.push('-Monitor', String(a.monitor));
        if (a.boxW) args.push('-BoxW', String(a.boxW));
        if (a.boxH) args.push('-BoxH', String(a.boxH));
        if (a.detail) args.push('-Detail', String(a.detail));
        if (a.scale != null) args.push('-Scale', String(a.scale));
        if (a.maxSide != null) args.push('-MaxSide', String(a.maxSide));
        const timeoutMs = Math.min(MAX_WATCH_MS, frames * intervalMs + 15000);
        result = await runBackendAsync('capture', args, timeoutMs, reqDir);
        if (result.cleanupOwned) reqDir = null; // runBackendAsync deferred cleanup (timeout/cap fallback) — don't race it in finally
      }
    } else if (name === 'poll_change') {
      // async watch primitive — a fast single shot, so it goes through the worker (which also holds the previous state). win32 only.
      if (plat !== 'win32') {
        result = windowsOnly('poll_change');
      } else {
        const wa = {};
        if (a.includeImage) {
          reqDir = makeReqDir();
          wa.outDir = reqDir;
          wa.includeImage = true;
        }
        if (a.region) wa.region = regionArg(a.region);
        if (a.window) wa.windowMatch = String(a.window);
        if (a.monitor != null) wa.monitor = String(a.monitor);
        if (a.boxW) wa.boxW = a.boxW;
        if (a.boxH) wa.boxH = a.boxH;
        if (a.scale != null) wa.scale = a.scale;
        if (a.maxSide != null) wa.maxSide = a.maxSide;
        if (a.detail) wa.detail = String(a.detail);
        if (a.changeThreshold != null) wa.changeThreshold = a.changeThreshold;
        const wid = a.watchId ? String(a.watchId) : 'default';
        if (a.watchId) wa.watchId = wid;
        if (a.reset) wa.reset = true;
        const seenBefore = pollSeen.has(wid);
        result = await viaWorker('poll_change', wa);
        // Record the watchId only after a SUCCESSFUL poll (payload carries a boolean baseline) — a failed first call must
        // not make the next valid baseline look like a reset; bound the set so caller-chosen watchIds can't grow it forever.
        if (result && result.payload && typeof result.payload.baseline === 'boolean') {
          if (pollSeen.size > 1000) pollSeen.clear();
          pollSeen.add(wid);
        }
        // A non-reset follow-up that comes back baseline=true means the worker lost its state (restart/idle-dispose/crash).
        // Surface it as a reset so the agent treats it as a fresh baseline — NOT as "no change" (a real change in the gap
        // would otherwise be silently missed).
        if (!a.reset && seenBefore && result && result.payload && result.payload.baseline === true) {
          result.payload.stateReset = true;
          result.payload.warning = 'watch state was lost (worker restarted/idle-disposed) — this is a fresh baseline, not a "no change" result; a change during the gap may have been missed.';
        }
      }
    } else if (name === 'beep') {
      if (plat !== 'win32') {
        result = windowsOnly('beep');
      } else {
        const wa = {};
        if (a.pattern) wa.pattern = String(a.pattern);
        if (a.count != null) wa.count = a.count;
        if (a.frequency != null) wa.frequency = a.frequency;
        if (a.durationMs != null) wa.durationMs = a.durationMs;
        result = await viaWorker('beep', wa);
      }
    } else if (name === 'speak') {
      // Agent-authored speech: trusted content -> no provenance mark (treated like a 'say' fixed phrase), but
      // still shaped + globally budgeted + no-overlap with reflex speech (one set of ears). Non-blocking.
      if (plat !== 'win32') {
        result = windowsOnly('speak');
      } else if (typeof a.text !== 'string' || !a.text.trim()) {
        result = { payload: { ok: false, error: 'empty text' }, isError: true };
      } else {
        const r = reflexSpeak('agent', a.text); // agent's judged words: no provenance mark, but redacted + shaped + budgeted
        result = { payload: r.ok ? { ok: true, uttered: r.uttered } : { ok: false, skipped: r.reason } };
      }
    } else if (name === 'start_watch') {
      result = startWatch(a);
    } else if (name === 'stop_watch') {
      result = stopWatch(a);
    } else if (name === 'get_events') {
      // get_events carries its OWN already-materialized image items (from the watch buffer) — return them directly
      // and skip the generic materializeImages pass (there are no on-disk paths in the payload to re-read).
      if (plat !== 'win32') {
        result = windowsOnly('get_events');
      } else {
        const { result: r, images } = getEvents(a);
        auditLog('get_events', r.payload, images);
        return { content: [{ type: 'text', text: JSON.stringify(r.payload, null, 2) }, ...images], isError: r.isError };
      }
    } else {
      return { content: [{ type: 'text', text: `unknown tool: ${name}` }], isError: true };
    }
    const imageItems = materializeImages(result.payload);
    auditLog(name, result.payload, imageItems); // metadata/HMAC only — original image and text are not stored (§8)
    return { content: [{ type: 'text', text: JSON.stringify(result.payload, null, 2) }, ...imageItems], isError: result.isError };
  } finally {
    // Best-effort removal of the per-request screenshot dir; a failure here must not mask the tool result.
    if (reqDir) {
      try { rmSync(reqDir, { recursive: true, force: true }); } catch {}
    }
  }
}
|
|
1294
|
+
|
|
1295
|
+
// Clean up the worker + any background watches on server shutdown (stops the loops, frees in-RAM frames,
|
|
1296
|
+
// removes the watch temp dirs). The worker also auto-terminates via stdin EOF when the parent dies.
|
|
1297
|
+
process.on('exit', () => { try { if (speakingChild) speakingChild.kill(); } catch {} for (const s of watches.values()) { try { s.dispose(); } catch {} } workerMgr.dispose(); });
|
|
1298
|
+
process.on('SIGINT', () => process.exit(0));
|
|
1299
|
+
process.on('SIGTERM', () => process.exit(0));
|
|
1300
|
+
|
|
1301
|
+
// ── install mode: self-register into the project .mcp.json (stage 1) ─────────
|
|
1302
|
+
// `vortex-mcp-computer-use install [--path <file>]` — merge-safe registration. ALWAYS uses the
|
|
1303
|
+
// non-reserved key "vortex-computer-use" (the host reserves "computer-use" and silently won't load
|
|
1304
|
+
// it), preserves every other server + top-level field, and refuses to overwrite a malformed file.
|
|
1305
|
+
const SERVER_KEY = 'vortex-computer-use';
|
|
1306
|
+
const SERVER_ENTRY = { command: 'node', args: ['node_modules/@vortex-os/computer-use/scripts/mcp-stdio.mjs'] };
|
|
1307
|
+
|
|
1308
|
+
const isObjLit = (v) => v !== null && typeof v === 'object' && !Array.isArray(v);
|
|
1309
|
+
const refuse = (msg) => { process.stderr.write(`[install] ${msg}\n`); process.exit(1); };
|
|
1310
|
+
|
|
1311
|
+
function runInstall(argv) {
|
|
1312
|
+
const i = argv.indexOf('--path');
|
|
1313
|
+
let target;
|
|
1314
|
+
if (i >= 0) {
|
|
1315
|
+
const v = argv[i + 1];
|
|
1316
|
+
if (!v || v.startsWith('--')) refuse('--path requires a file path argument.');
|
|
1317
|
+
target = v;
|
|
1318
|
+
} else {
|
|
1319
|
+
target = join(process.cwd(), '.mcp.json');
|
|
1320
|
+
}
|
|
1321
|
+
let existing = {};
|
|
1322
|
+
if (existsSync(target)) {
|
|
1323
|
+
let txt;
|
|
1324
|
+
try { txt = readFileSync(target, 'utf8').trim(); }
|
|
1325
|
+
catch (e) { refuse(`${target} could not be read — left untouched: ${e.message}`); }
|
|
1326
|
+
if (txt) {
|
|
1327
|
+
let parsed;
|
|
1328
|
+
try { parsed = JSON.parse(txt); }
|
|
1329
|
+
catch (e) { refuse(`${target} is not valid JSON — refusing to overwrite (fix it first): ${e.message}`); }
|
|
1330
|
+
if (!isObjLit(parsed)) refuse(`${target} is not a JSON object — refusing to overwrite.`);
|
|
1331
|
+
if (parsed.mcpServers !== undefined && !isObjLit(parsed.mcpServers)) refuse(`${target} has a non-object "mcpServers" — refusing to overwrite.`);
|
|
1332
|
+
existing = parsed;
|
|
1333
|
+
}
|
|
1334
|
+
}
|
|
1335
|
+
const servers = isObjLit(existing.mcpServers) ? existing.mcpServers : {};
|
|
1336
|
+
// ADD-ONLY: never clobber an existing "vortex-computer-use" entry (the user may have customized it).
|
|
1337
|
+
if (Object.prototype.hasOwnProperty.call(servers, SERVER_KEY)) {
|
|
1338
|
+
process.stdout.write(JSON.stringify({ ok: true, action: 'already-present', path: target, serverKey: SERVER_KEY }, null, 2) + '\n');
|
|
1339
|
+
process.stderr.write(`[install] "${SERVER_KEY}" already registered in ${target} — left unchanged.\n`);
|
|
1340
|
+
return;
|
|
1341
|
+
}
|
|
1342
|
+
const preserved = Object.keys(servers);
|
|
1343
|
+
const merged = { ...existing, mcpServers: { ...servers, [SERVER_KEY]: SERVER_ENTRY } };
|
|
1344
|
+
// Atomic write via a PRIVATE temp dir (random name, not a guessable sibling): write inside with
|
|
1345
|
+
// exclusive-create, rename onto the target, then remove the dir. Avoids following a pre-placed
|
|
1346
|
+
// temp symlink and leaves no stray temp on failure (codex r2 MEDIUM).
|
|
1347
|
+
try { mkdirSync(dirname(target), { recursive: true }); } catch {}
|
|
1348
|
+
const tmpDir = mkdtempSync(join(dirname(target), '.mcp-cu-'));
|
|
1349
|
+
const tmp = join(tmpDir, 'mcp.json');
|
|
1350
|
+
try {
|
|
1351
|
+
writeFileSync(tmp, JSON.stringify(merged, null, 2) + '\n', { encoding: 'utf8', flag: 'wx' });
|
|
1352
|
+
renameSync(tmp, target);
|
|
1353
|
+
} finally {
|
|
1354
|
+
try { rmSync(tmpDir, { recursive: true, force: true }); } catch {}
|
|
1355
|
+
}
|
|
1356
|
+
process.stdout.write(JSON.stringify({ ok: true, action: 'added', path: target, serverKey: SERVER_KEY, preservedServers: preserved }, null, 2) + '\n');
|
|
1357
|
+
process.stderr.write(
|
|
1358
|
+
`[install] added "${SERVER_KEY}" in ${target}` +
|
|
1359
|
+
(preserved.length ? ` (kept: ${preserved.join(', ')})` : '') + '\n' +
|
|
1360
|
+
`[install] Restart the agent — or approve the new MCP server when prompted — to load the computer-use tools.\n`,
|
|
1361
|
+
);
|
|
1362
|
+
}
|
|
1363
|
+
|
|
1364
|
+
if (process.argv.slice(2).includes('install')) {
|
|
1365
|
+
runInstall(process.argv.slice(2));
|
|
1366
|
+
} else {
|
|
1367
|
+
// Serve path: load the MCP SDK dynamically (so `install` never requires it), wire the handlers, connect.
|
|
1368
|
+
const { Server } = await import('@modelcontextprotocol/sdk/server/index.js');
|
|
1369
|
+
const { StdioServerTransport } = await import('@modelcontextprotocol/sdk/server/stdio.js');
|
|
1370
|
+
const { ListToolsRequestSchema, CallToolRequestSchema } = await import('@modelcontextprotocol/sdk/types.js');
|
|
1371
|
+
const server = new Server({ name: 'computer-use', version: PKG_VERSION }, { capabilities: { tools: {} } });
|
|
1372
|
+
server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
|
|
1373
|
+
server.setRequestHandler(CallToolRequestSchema, handleCallTool);
|
|
1374
|
+
await server.connect(new StdioServerTransport());
|
|
1375
|
+
process.stderr.write(`[computer-use MCP] ready on stdio (worker=${plat === 'win32' ? 'on' : 'off'}; tools: probe, read_ui, classify_activity, capture_screen, watch_capture, poll_change, start_watch, get_events, stop_watch, beep, speak)\n`);
|
|
1376
|
+
}
|