ucu-mcp 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34,6 +34,12 @@ export declare class PermissionError extends UcuError {
34
34
  export declare class WindowNotFoundError extends UcuError {
35
35
  constructor(windowId: string);
36
36
  }
37
/**
 * Error type for a requested accessibility element ID that no longer resolves.
 */
export declare class ElementNotFoundError extends UcuError {
    /**
     * @param elementId - ID of the element that could not be resolved.
     */
    constructor(elementId: string);
}
37
43
  /**
38
44
  * Click/scroll target is outside screen bounds.
39
45
  */
@@ -68,6 +68,14 @@ export class WindowNotFoundError extends UcuError {
68
68
  super(`Window ${windowId} not found. It may have been closed. Run list_windows to get fresh IDs.`, "WINDOW_NOT_FOUND", false);
69
69
  }
70
70
  }
71
/**
 * Raised when a requested accessibility element ID no longer resolves.
 *
 * The error carries the code "ELEMENT_NOT_FOUND"; the third argument passed
 * to UcuError is `false` (its meaning is defined by UcuError, not shown here).
 */
export class ElementNotFoundError extends UcuError {
    /**
     * @param elementId - ID of the element that could not be found; it is
     *   interpolated into the error message.
     */
    constructor(elementId) {
        const message = `Element ${elementId} not found. It may have been removed or invalidated. Run find_element to get a fresh ID.`;
        super(message, "ELEMENT_NOT_FOUND", false);
    }
}
71
79
  // ---------------------------------------------------------------------------
72
80
  // Input Errors
73
81
  // ---------------------------------------------------------------------------
@@ -9,10 +9,51 @@
9
9
  * Windows: Uses SendInput (stub).
10
10
  * Linux: Uses xdotool (stub).
11
11
  */
12
- import { execFile } from "node:child_process";
12
+ import { execFile, execFileSync } from "node:child_process";
13
13
  import { promisify } from "node:util";
14
+ import { join, dirname } from "node:path";
15
+ import { fileURLToPath } from "node:url";
14
16
  import { logger } from "../util/logger.js";
15
17
  const execFileAsync = promisify(execFile);
18
// ── Native CGEvent helper (macOS) ──────────────────────────────────────
// JXA (osascript -l JavaScript) cannot call CGEventPost without segfault,
// so a small Swift binary performs the native CGEvent injection instead.
import { existsSync } from "node:fs";
// Directory containing this module (ESM has no built-in __dirname).
const __dirname = dirname(fileURLToPath(import.meta.url));
// Primary candidate: native/cgevent/cgevent-helper three directories above this module.
// NOTE(review): for a module emitted under dist/src/<dir>/ that is the package root — confirm layout.
const nativeHelperPath = join(__dirname, "../../..", "native/cgevent/cgevent-helper");
// Secondary candidate: the same relative location, but only two directories up
// (presumably the project root when running straight from src/<dir>/ — verify).
const nativeHelperPathAlt = join(__dirname, "../..", "native/cgevent/cgevent-helper");
// Prefer the primary candidate when it exists on disk; otherwise fall back to
// the secondary candidate without checking that it exists.
const resolvedNativePath = !existsSync(nativeHelperPath) ? nativeHelperPathAlt : nativeHelperPath;
29
// Cached probe result; stays undefined until the first isNativeAvailable() call.
let _nativeAvailable;
/**
 * Reports whether the native helper binary can be executed.
 *
 * On the first call the helper is run synchronously with a `ping` command
 * (3 s timeout). The helper counts as available when its stdout contains
 * `"ok"`; any exception from spawning or running it counts as unavailable.
 * The outcome is cached, so later calls never spawn the helper again.
 *
 * @returns true when the helper answered the ping, false otherwise.
 */
function isNativeAvailable() {
    if (_nativeAvailable === undefined) {
        let probeSucceeded;
        try {
            const reply = execFileSync(resolvedNativePath, [], {
                input: '{"command":"ping"}',
                encoding: "utf8",
                timeout: 3000,
            });
            probeSucceeded = reply.includes('"ok"');
        }
        catch {
            // Missing binary, non-zero exit, timeout, ... — treat all as "not available".
            probeSucceeded = false;
        }
        _nativeAvailable = probeSucceeded;
    }
    return _nativeAvailable;
}
46
/**
 * Runs the native helper once with `payload` as its JSON stdin and throws
 * unless the helper produced a well-formed, error-free response.
 *
 * The call is synchronous (execFileSync) and therefore blocks the event loop
 * until the helper exits or the 10 s timeout kills it.
 * NOTE(review): a drag whose durationMs approaches the 10 s timeout would be
 * killed mid-gesture — confirm callers bound the duration.
 *
 * @param payload - Command object serialized with JSON.stringify
 *   (e.g. `{ command: "click", x, y, button }`).
 * @throws Error when the helper cannot be run, exits non-zero or times out
 *   (raised by execFileSync), when its output is not a JSON object, or when
 *   the response carries a truthy `error` field.
 */
function runNativeChecked(payload) {
    const raw = execFileSync(resolvedNativePath, [], {
        input: JSON.stringify(payload),
        encoding: "utf8",
        timeout: 10000,
    }).trim();
    let resp;
    try {
        resp = JSON.parse(raw);
    }
    catch {
        // Empty or garbled stdout: surface what was received instead of a bare SyntaxError.
        throw new Error(`native helper error: unparseable response ${JSON.stringify(raw.slice(0, 200))}`);
    }
    // JSON.parse may legally yield null or a primitive; reading `.error` on null would throw a TypeError.
    if (resp === null || typeof resp !== "object") {
        throw new Error(`native helper error: unexpected response ${JSON.stringify(raw.slice(0, 200))}`);
    }
    if (resp.error) {
        throw new Error(`native helper error: ${resp.error}`);
    }
}
16
57
  // ── Dry-run mode ──────────────────────────────────────────────────────────
17
58
  const isDryRun = () => process.env.UCU_DRY_RUN === "true";
18
59
  function logDryRun(action, details) {
@@ -63,6 +104,10 @@ export async function click(x, y, button = "left", _platform = process.platform)
63
104
  return;
64
105
  }
65
106
  if (_platform === "darwin") {
107
+ if (isNativeAvailable()) {
108
+ runNativeChecked({ command: "click", x, y, button });
109
+ return;
110
+ }
66
111
  const btnType = { left: 0, right: 1, middle: 2 }[button];
67
112
  await runJXA(`
68
113
  ObjC.import('CoreGraphics');
@@ -91,6 +136,10 @@ export async function doubleClick(x, y, button = "left", _platform = process.pla
91
136
  return;
92
137
  }
93
138
  if (_platform === "darwin") {
139
+ if (isNativeAvailable()) {
140
+ runNativeChecked({ command: "doubleClick", x, y, button });
141
+ return;
142
+ }
94
143
  const btnType = { left: 0, right: 1, middle: 2 }[button];
95
144
  await runJXA(`
96
145
  ObjC.import('CoreGraphics');
@@ -125,6 +174,10 @@ export async function move(x, y, _platform = process.platform) {
125
174
  return;
126
175
  }
127
176
  if (_platform === "darwin") {
177
+ if (isNativeAvailable()) {
178
+ runNativeChecked({ command: "move", x, y });
179
+ return;
180
+ }
128
181
  await runJXA(`
129
182
  ObjC.import('CoreGraphics');
130
183
  var loc = $.CGPointMake(${x}, ${y});
@@ -146,6 +199,10 @@ export async function drag(fromX, fromY, toX, toY, button = "left", duration = 3
146
199
  return;
147
200
  }
148
201
  if (_platform === "darwin") {
202
+ if (isNativeAvailable()) {
203
+ runNativeChecked({ command: "drag", fromX, fromY, toX, toY, button, durationMs: duration });
204
+ return;
205
+ }
149
206
  const btnType = { left: 0, right: 1, middle: 2 }[button];
150
207
  const steps = Math.max(2, Math.min(60, Math.ceil(duration / 16)));
151
208
  const delayMicros = Math.max(0, Math.floor((duration * 1000) / steps));
@@ -192,6 +249,10 @@ export async function scroll(x, y, deltaX, deltaY, _platform = process.platform)
192
249
  return;
193
250
  }
194
251
  if (_platform === "darwin") {
252
+ if (isNativeAvailable()) {
253
+ runNativeChecked({ command: "scroll", x, y, deltaX, deltaY });
254
+ return;
255
+ }
195
256
  const verticalDelta = -deltaY;
196
257
  const horizontalDelta = deltaX;
197
258
  await runJXA(`
@@ -314,23 +375,28 @@ export async function typeText(text, delay = 20, _platform = process.platform) {
314
375
  // Process each batch
315
376
  for (const batch of batches) {
316
377
  if (batch.cgEvent && Array.isArray(batch.chars)) {
317
- // Build a single JXA script that types all chars in this CGEvent batch
318
- const keyStatements = batch.chars.map(({ code, shift }) => {
319
- const flags = shift ? SHIFT_FLAG : 0;
320
- return `
321
- kd = $.CGEventCreateKeyboardEvent(null, ${code}, true);
322
- ku = $.CGEventCreateKeyboardEvent(null, ${code}, false);
323
- if (${flags}) { $.CGEventSetFlags(kd, ${flags}); $.CGEventSetFlags(ku, ${flags}); }
324
- $.CGEventPost(0, kd);
325
- $.CGEventPost(0, ku);
326
- $.CFRelease(kd);
327
- $.CFRelease(ku);`;
328
- }).join("\n");
329
- await runJXA(`
330
- ObjC.import('CoreGraphics');
331
- var kd, ku;
332
- ${keyStatements}
333
- `);
378
+ if (isNativeAvailable()) {
379
+ runNativeChecked({ command: "typeBatch", keys: batch.chars });
380
+ }
381
+ else {
382
+ // Build a single JXA script that types all chars in this CGEvent batch
383
+ const keyStatements = batch.chars.map(({ code, shift }) => {
384
+ const flags = shift ? SHIFT_FLAG : 0;
385
+ return `
386
+ kd = $.CGEventCreateKeyboardEvent(null, ${code}, true);
387
+ ku = $.CGEventCreateKeyboardEvent(null, ${code}, false);
388
+ if (${flags}) { $.CGEventSetFlags(kd, ${flags}); $.CGEventSetFlags(ku, ${flags}); }
389
+ $.CGEventPost(0, kd);
390
+ $.CGEventPost(0, ku);
391
+ $.CFRelease(kd);
392
+ $.CFRelease(ku);`;
393
+ }).join("\n");
394
+ await runJXA(`
395
+ ObjC.import('CoreGraphics');
396
+ var kd, ku;
397
+ ${keyStatements}
398
+ `);
399
+ }
334
400
  }
335
401
  else {
336
402
  // Fallback: use osascript keystroke for unsupported chars (emoji, CJK, etc.)
@@ -369,6 +435,10 @@ export async function pressKey(key, modifiers = [], _platform = process.platform
369
435
  }
370
436
  flags |= flag;
371
437
  }
438
+ if (isNativeAvailable()) {
439
+ runNativeChecked({ command: "pressKey", keyCode, flags });
440
+ return;
441
+ }
372
442
  await runJXA(`
373
443
  ObjC.import('CoreGraphics');
374
444
  var flags = ${flags};
Binary file
@@ -0,0 +1,126 @@
1
+ import CoreGraphics
2
+ import Foundation
3
+
4
/// Request payload decoded from the JSON read on stdin.
///
/// Deliberately a simple flat struct: every field except `command` is
/// optional, which avoids recursive enum decoding issues. Each command
/// handler reads only the fields it needs.
struct Input: Decodable {
    /// One keystroke of a `typeBatch` request.
    struct KeyEntry: Decodable {
        /// Virtual key code.
        let code: Int
        /// Whether the shift flag is applied; absent is treated as false by `doTypeBatch`.
        let shift: Bool?
    }

    /// Name of the command to dispatch on.
    let command: String

    // Pointer location, read by doClick / doDoubleClick / doMove.
    let x: Double?
    let y: Double?

    // Drag start and end points, read by doDrag.
    let fromX: Double?
    let fromY: Double?
    let toX: Double?
    let toY: Double?

    /// Mouse button name ("right", "middle", anything else means left — see `btn`).
    let button: String?
    /// Drag duration in milliseconds (doDrag defaults it to 300).
    let durationMs: Int?

    // Scroll deltas, read by doScroll.
    let deltaX: Int?
    let deltaY: Int?

    // Key code and modifier flag bits, read by doPressKey.
    let keyCode: Int?
    let flags: Int64?

    /// Keystrokes for `typeBatch`.
    let keys: [KeyEntry]?
}
22
+
23
/// Writes `json` followed by a newline to standard output, then flushes stdout.
func out(_ json: String) {
    let line = Data((json + "\n").utf8)
    FileHandle.standardOutput.write(line)
    fflush(stdout)
}
24
/// Posts `event` at the HID event tap.
func post(_ event: CGEvent) {
    event.post(tap: .cghidEventTap)
}
25
+
26
/// Maps a button name to a `CGMouseButton`.
/// "right" → .right, "middle" → .center; anything else (including nil) → .left.
func btn(_ s: String?) -> CGMouseButton {
    if s == "right" { return .right }
    if s == "middle" { return .center }
    return .left
}
29
/// Returns the button-down event type matching `b`
/// (.right → rightMouseDown, .center → otherMouseDown, otherwise leftMouseDown).
func downT(_ b: CGMouseButton) -> CGEventType {
    if b == .right { return .rightMouseDown }
    return b == .center ? .otherMouseDown : .leftMouseDown
}
32
/// Returns the button-up event type matching `b`
/// (.right → rightMouseUp, .center → otherMouseUp, otherwise leftMouseUp).
func upT(_ b: CGMouseButton) -> CGEventType {
    if b == .right { return .rightMouseUp }
    return b == .center ? .otherMouseUp : .leftMouseUp
}
35
/// Returns the drag event type matching `b`
/// (.right → rightMouseDragged, .center → otherMouseDragged, otherwise leftMouseDragged).
func dragT(_ b: CGMouseButton) -> CGEventType {
    if b == .right { return .rightMouseDragged }
    return b == .center ? .otherMouseDragged : .leftMouseDragged
}
38
+
39
/// Posts a button-down followed by a button-up at (`p.x`, `p.y`).
///
/// Missing coordinates default to 0; the button comes from `btn(p.button)`.
/// - Returns: `{"ok":true}` after both events are posted, or
///   `{"error":"fail"}` (nothing posted) when either event cannot be created.
func doClick(_ p: Input) -> String {
    let button = btn(p.button)
    let point = CGPoint(x: p.x ?? 0, y: p.y ?? 0)
    if let press = CGEvent(mouseEventSource: nil, mouseType: downT(button), mouseCursorPosition: point, mouseButton: button),
       let release = CGEvent(mouseEventSource: nil, mouseType: upT(button), mouseCursorPosition: point, mouseButton: button) {
        post(press)
        post(release)
        return "{\"ok\":true}"
    }
    return "{\"error\":\"fail\"}"
}
46
+
47
/// Posts two down/up pairs at (`p.x`, `p.y`), tagging the first pair with
/// click state 1 and the second with click state 2.
///
/// Missing coordinates default to 0. All four events are created before any
/// is posted, so a creation failure posts nothing.
/// - Returns: `{"ok":true}` on success, `{"error":"fail"}` otherwise.
func doDoubleClick(_ p: Input) -> String {
    let button = btn(p.button)
    let point = CGPoint(x: p.x ?? 0, y: p.y ?? 0)
    let clickStates: [Int64] = [1, 2]
    var sequence: [CGEvent] = []
    for state in clickStates {
        // Within each click: button down first, then button up.
        for kind in [downT(button), upT(button)] {
            guard let event = CGEvent(mouseEventSource: nil, mouseType: kind, mouseCursorPosition: point, mouseButton: button)
            else { return "{\"error\":\"fail\"}" }
            event.setIntegerValueField(.mouseEventClickState, value: state)
            sequence.append(event)
        }
    }
    for event in sequence { post(event) }
    return "{\"ok\":true}"
}
58
+
59
/// Posts a single mouse-moved event at (`p.x`, `p.y`); missing coordinates default to 0.
/// - Returns: `{"ok":true}` once posted, `{"error":"fail"}` when the event cannot be created.
func doMove(_ p: Input) -> String {
    let target = CGPoint(x: p.x ?? 0, y: p.y ?? 0)
    if let moved = CGEvent(mouseEventSource: nil, mouseType: .mouseMoved, mouseCursorPosition: target, mouseButton: .left) {
        post(moved)
        return "{\"ok\":true}"
    }
    return "{\"error\":\"fail\"}"
}
65
+
66
/// Drags from (`fromX`, `fromY`) to (`toX`, `toY`): button down at the start,
/// a series of linearly interpolated drag events, then button up at the end.
///
/// Missing coordinates default to 0 and a missing `durationMs` to 300.
/// The gesture uses between 2 and 60 steps (about one per 16 ms) and sleeps
/// `durationMs * 1000 / steps` microseconds between steps.
///
/// Both the press and the release event are created before anything is
/// posted, so the button can never be pressed without a release being available.
/// - Returns: `{"ok":true}` after the release is posted, or `{"error":"fail"}`
///   (nothing posted) when the press or release event cannot be created.
func doDrag(_ p: Input) -> String {
    let from = CGPoint(x: p.fromX ?? 0, y: p.fromY ?? 0)
    let to = CGPoint(x: p.toX ?? 0, y: p.toY ?? 0)
    let b = btn(p.button)
    // Negative durations behave like 0 (two steps, no delay), exactly as before.
    let ms = max(0, p.durationMs ?? 300)
    let steps = max(2, min(60, Int(ceil(Double(ms) / 16.0))))
    // Guard the multiplication so an absurd duration cannot trap on overflow.
    let totalMicros = ms > Int.max / 1000 ? Int.max : ms * 1000
    // Clamp to what usleep can represent instead of trapping in the conversion.
    let delay = useconds_t(clamping: totalMicros / steps)
    guard let dn = CGEvent(mouseEventSource: nil, mouseType: downT(b), mouseCursorPosition: from, mouseButton: b),
          let up = CGEvent(mouseEventSource: nil, mouseType: upT(b), mouseCursorPosition: to, mouseButton: b)
    else { return "{\"error\":\"fail\"}" }
    post(dn)
    for n in 1...steps {
        let t = Double(n) / Double(steps)
        let pt = CGPoint(x: from.x + (to.x - from.x) * t, y: from.y + (to.y - from.y) * t)
        // Best effort: an intermediate point that cannot be created is skipped.
        if let ev = CGEvent(mouseEventSource: nil, mouseType: dragT(b), mouseCursorPosition: pt, mouseButton: b) { post(ev) }
        // No sleep after the final step; the release follows immediately.
        if delay > 0 && n < steps { usleep(delay) }
    }
    post(up)
    return "{\"ok\":true}"
}
84
+
85
/// Posts one pixel-unit scroll-wheel event with two axes.
///
/// Wheel 1 receives the negated `deltaY`, wheel 2 receives `deltaX`; missing
/// deltas default to 0. Deltas are clamped to the symmetric Int32 range so
/// oversized values saturate instead of trapping the process.
/// `p.x` / `p.y` are not read by this function.
/// - Returns: `{"ok":true}` once posted, `{"error":"fail"}` when the event cannot be created.
func doScroll(_ p: Input) -> String {
    // Symmetric limit so that negating the clamped value can never overflow.
    let limit = Int(Int32.max)
    let clampedY = min(max(p.deltaY ?? 0, -limit), limit)
    let clampedX = min(max(p.deltaX ?? 0, -limit), limit)
    let dy = Int32(-clampedY)
    let dx = Int32(clampedX)
    guard let ev = CGEvent(scrollWheelEvent2Source: nil, units: .pixel, wheelCount: 2, wheel1: dy, wheel2: dx, wheel3: 0)
    else { return "{\"error\":\"fail\"}" }
    post(ev)
    return "{\"ok\":true}"
}
91
+
92
/// Posts a key-down and key-up for `p.keyCode`, with `p.flags` applied to both events.
///
/// `keyCode` must be present and fit in 16 bits; `flags` (default 0) must be
/// non-negative. Invalid values produce an error response instead of trapping
/// in the integer conversion, and a missing `keyCode` no longer falls back to
/// virtual key 0.
/// - Returns: `{"ok":true}` once both events are posted, otherwise a JSON
///   object with an `error` field (nothing is posted in that case).
func doPressKey(_ p: Input) -> String {
    guard let rawCode = p.keyCode, let code = CGKeyCode(exactly: rawCode)
    else { return "{\"error\":\"missing or invalid keyCode\"}" }
    guard let rawFlags = UInt64(exactly: p.flags ?? 0)
    else { return "{\"error\":\"invalid flags\"}" }
    let flags = CGEventFlags(rawValue: rawFlags)
    guard let dn = CGEvent(keyboardEventSource: nil, virtualKey: code, keyDown: true),
          let up = CGEvent(keyboardEventSource: nil, virtualKey: code, keyDown: false)
    else { return "{\"error\":\"fail\"}" }
    dn.flags = flags
    up.flags = flags
    post(dn)
    post(up)
    return "{\"ok\":true}"
}
99
+
100
/// Posts a key-down/key-up pair for every entry in `p.keys`, in order.
///
/// An entry with `shift == true` gets the shift flag (0x00020000) on both
/// events; otherwise the flags are empty. All key codes are validated before
/// anything is posted, so an out-of-range code rejects the whole batch
/// instead of crashing the helper part-way through.
/// - Returns: `{"ok":true}` after the batch, `{"error":"missing keys"}` when
///   `keys` is absent, or `{"error":"invalid key code"}` when a code does not
///   fit in 16 bits.
func doTypeBatch(_ p: Input) -> String {
    guard let keys = p.keys else { return "{\"error\":\"missing keys\"}" }
    let shiftFlag = CGEventFlags(rawValue: 0x00020000)
    var strokes: [(code: CGKeyCode, flags: CGEventFlags)] = []
    strokes.reserveCapacity(keys.count)
    for entry in keys {
        guard let code = CGKeyCode(exactly: entry.code)
        else { return "{\"error\":\"invalid key code\"}" }
        strokes.append((code: code, flags: (entry.shift ?? false) ? shiftFlag : []))
    }
    for stroke in strokes {
        // Best effort: a keystroke whose events cannot be created is skipped.
        guard let dn = CGEvent(keyboardEventSource: nil, virtualKey: stroke.code, keyDown: true),
              let up = CGEvent(keyboardEventSource: nil, virtualKey: stroke.code, keyDown: false)
        else { continue }
        dn.flags = stroke.flags
        up.flags = stroke.flags
        post(dn)
        post(up)
    }
    return "{\"ok\":true}"
}
111
+
112
// Entry point: read one line of JSON from stdin, decode it as `Input`, run
// the matching command, and print that command's JSON response.

/// Command name → handler. "ping" just reports ok without posting any event.
let handlers: [String: (Input) -> String] = [
    "click": doClick,
    "doubleClick": doDoubleClick,
    "move": doMove,
    "drag": doDrag,
    "scroll": doScroll,
    "pressKey": doPressKey,
    "typeBatch": doTypeBatch,
    "ping": { _ in "{\"ok\":true}" },
]

// nil when stdin is empty or the line is not a decodable `Input`.
let request: Input? = readLine().flatMap { text in
    try? JSONDecoder().decode(Input.self, from: Data(text.utf8))
}
guard let input = request else {
    out("{\"error\":\"invalid JSON\"}")
    exit(1)
}

// Commands without a handler are reported as "unknown" (exit status stays 0).
out(handlers[input.command]?(input) ?? "{\"error\":\"unknown\"}")
@@ -0,0 +1,89 @@
1
+ import Foundation
2
+ import Vision
3
+ import AppKit
4
+
5
/// Request read from stdin: identifies the image to run text recognition on.
struct OCRInput: Decodable {
    /// Filesystem path of the image file.
    let imagePath: String
}
8
+
9
/// One recognized piece of text together with its bounding box and confidence.
struct OCRElement: Encodable {
    /// Recognized string.
    let text: String
    /// Bounding box as integers; the script computes these from the
    /// observation's bounding box multiplied by the image's pixel dimensions.
    let x, y, width, height: Int
    /// Confidence reported for the chosen candidate.
    let confidence: Double
}
17
+
18
/// Response written to stdout as JSON, for both success and failure.
struct OCROutput: Encodable {
    /// Recognized elements; the script passes an empty array on failure.
    let elements: [OCRElement]
    /// Recognized strings joined with newlines; empty on failure.
    let fullText: String
    /// Failure description; nil on success.
    let error: String?
}
23
+
24
/// Writes an error-only `OCROutput` as JSON to stdout and exits with status 1.
func fail(_ message: String) -> Never {
    let payload = OCROutput(elements: [], fullText: "", error: message)
    FileHandle.standardOutput.write(try! JSONEncoder().encode(payload))
    exit(1)
}

// Read all of stdin and decode it as the request JSON.
let stdinData = FileHandle.standardInput.readDataToEndOfFile()
guard let input = try? JSONDecoder().decode(OCRInput.self, from: stdinData) else {
    fail("Failed to decode input JSON")
}

// Load the image from disk and obtain a CGImage for Vision.
guard let image = NSImage(contentsOf: URL(fileURLWithPath: input.imagePath)), image.isValid else {
    fail("Failed to load image: \(input.imagePath)")
}

var proposedRect = NSRect.zero
guard let cgImage = image.cgImage(forProposedRect: &proposedRect, context: nil, hints: nil) else {
    fail("Failed to get CGImage")
}

// Configure text recognition: accurate level, language correction enabled.
let request = VNRecognizeTextRequest()
request.recognitionLevel = .accurate
request.usesLanguageCorrection = true

do {
    try VNImageRequestHandler(cgImage: cgImage, options: [:]).perform([request])
} catch {
    fail("OCR failed: \(error.localizedDescription)")
}

guard let observations = request.results else {
    fail("No OCR results")
}

// Bounding boxes are scaled by the CGImage's pixel dimensions.
let pixelWidth = CGFloat(cgImage.width)
let pixelHeight = CGFloat(cgImage.height)

// Keep the top candidate of each observation; observations without one are dropped.
let elements: [OCRElement] = observations.compactMap { observation in
    guard let best = observation.topCandidates(1).first else { return nil }
    let box = observation.boundingBox
    return OCRElement(
        text: best.string,
        x: Int(box.origin.x * pixelWidth),
        // Flip vertically: y is measured from the opposite edge of the image
        // (presumably lower-left-origin normalized boxes → top-left pixels; verify against Vision docs).
        y: Int((1 - box.origin.y - box.height) * pixelHeight),
        width: Int(box.width * pixelWidth),
        height: Int(box.height * pixelHeight),
        confidence: Double(best.confidence)
    )
}

// Emit the success response: all elements plus their text joined by newlines.
let fullText = elements.map { $0.text }.joined(separator: "\n")
let output = OCROutput(elements: elements, fullText: fullText, error: nil)
FileHandle.standardOutput.write(try! JSONEncoder().encode(output))
Binary file
package/package.json CHANGED
@@ -1,12 +1,13 @@
1
1
  {
2
2
  "name": "ucu-mcp",
3
- "version": "0.1.3",
3
+ "version": "0.2.0",
4
4
  "description": "MCP server for Universal Computer Use — desktop automation for AI agents via Model Context Protocol",
5
5
  "type": "module",
6
6
  "bin": {
7
7
  "ucu-mcp": "dist/bin/ucu-mcp.js"
8
8
  },
9
9
  "files": [
10
+ "native/",
10
11
  "dist/bin/",
11
12
  "dist/src/",
12
13
  "dist/index.js",
@@ -17,13 +18,15 @@
17
18
  "main": "./dist/index.js",
18
19
  "types": "./dist/index.d.ts",
19
20
  "scripts": {
20
- "build": "tsc",
21
+ "build": "tsc && npm run build:native",
21
22
  "start": "node dist/bin/ucu-mcp.js",
22
23
  "dev": "tsx bin/ucu-mcp.ts",
23
24
  "test": "vitest run",
24
25
  "test:watch": "vitest",
25
26
  "test:integration": "vitest run tests/integration/",
26
- "test:macos-gui": "UCU_MACOS_GUI_SMOKE=1 vitest run tests/integration/macos-gui-smoke.test.ts"
27
+ "test:macos-gui": "UCU_MACOS_GUI_SMOKE=1 vitest run tests/integration/macos-gui-smoke.test.ts",
28
+ "test:client-cli": "UCU_CLIENT_CLI_SMOKE=1 vitest run tests/integration/client-cli-smoke.test.ts",
29
+ "build:native": "cd native/cgevent && swiftc -O -o cgevent-helper main.swift -framework CoreGraphics -framework Foundation && cd ../ocr && swiftc -O -o ocr-helper main.swift -framework Vision -framework AppKit"
27
30
  },
28
31
  "keywords": [
29
32
  "mcp",