desktop-pilot-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,318 @@
1
+ import CoreGraphics
2
+ import ApplicationServices
3
+ import Foundation
4
+
5
+ // MARK: - Virtual Key Codes
6
+
7
/// Namespace of macOS virtual key codes used when building CGEvent keyboard events.
///
/// Values are written in hexadecimal, the same notation HIToolbox's `Events.h`
/// uses for its `kVK_*` constants. The enum has no cases and is never instantiated.
enum VirtualKeyCode {

    // MARK: Editing and whitespace

    static let returnKey: CGKeyCode = 0x24
    static let tab: CGKeyCode = 0x30
    static let space: CGKeyCode = 0x31
    static let delete: CGKeyCode = 0x33
    static let escape: CGKeyCode = 0x35
    static let forwardDelete: CGKeyCode = 0x75

    // MARK: Arrows

    static let leftArrow: CGKeyCode = 0x7B
    static let rightArrow: CGKeyCode = 0x7C
    static let downArrow: CGKeyCode = 0x7D
    static let upArrow: CGKeyCode = 0x7E

    // MARK: Modifiers

    // Listed for reference only; modifier state is normally expressed through CGEventFlags.
    static let command: CGKeyCode = 0x37
    static let shift: CGKeyCode = 0x38
    static let option: CGKeyCode = 0x3A
    static let control: CGKeyCode = 0x3B
    static let capsLock: CGKeyCode = 0x39

    // MARK: Function row

    static let f1: CGKeyCode = 0x7A
    static let f2: CGKeyCode = 0x78
    static let f3: CGKeyCode = 0x63
    static let f4: CGKeyCode = 0x76
    static let f5: CGKeyCode = 0x60
    static let f6: CGKeyCode = 0x61
    static let f7: CGKeyCode = 0x62
    static let f8: CGKeyCode = 0x64
    static let f9: CGKeyCode = 0x65
    static let f10: CGKeyCode = 0x6D
    static let f11: CGKeyCode = 0x67
    static let f12: CGKeyCode = 0x6F

    // MARK: Navigation

    static let home: CGKeyCode = 0x73
    static let end: CGKeyCode = 0x77
    static let pageUp: CGKeyCode = 0x74
    static let pageDown: CGKeyCode = 0x79

    // MARK: Letters

    // Only the letters needed for the Cmd+key shortcuts are listed.
    static let a: CGKeyCode = 0x00
    static let c: CGKeyCode = 0x08
    static let v: CGKeyCode = 0x09
    static let x: CGKeyCode = 0x07
    static let z: CGKeyCode = 0x06
    static let s: CGKeyCode = 0x01
    static let w: CGKeyCode = 0x0D
    static let q: CGKeyCode = 0x0C
    static let n: CGKeyCode = 0x2D
    static let t: CGKeyCode = 0x11
}
62
+
63
+ // MARK: - CGEventLayer
64
+
65
/// Input injection via CGEvent.
/// Used for keyboard input and coordinate-based clicking when other methods fail.
///
/// Every method here builds one or more `CGEvent` values and posts them to
/// `.cghidEventTap`. Nothing in this layer reads UI state; it only injects
/// input events. Event creation can fail (the `CGEvent` initializers are
/// failable); in that case the affected event is silently skipped.
final class CGEventLayer: @unchecked Sendable {

    // Held for parity with the other layers' initializers; not read by any method in this class.
    private let bridge: AXBridge
    private let store: ElementStore

    init(bridge: AXBridge, store: ElementStore) {
        self.bridge = bridge
        self.store = store
    }

    // MARK: - Mouse Events

    /// Left-click (mouse down followed by mouse up) at the given coordinates.
    /// - Parameters:
    ///   - x: Horizontal position passed to `CGEvent` as the cursor position.
    ///   - y: Vertical position passed to `CGEvent` as the cursor position.
    func clickAt(x: Double, y: Double) {
        let point = CGPoint(x: x, y: y)
        postMouseEvent(.leftMouseDown, at: point, button: .left)
        postMouseEvent(.leftMouseUp, at: point, button: .left)
    }

    /// Left-click at the center of an element's bounding rectangle.
    /// - Parameter bounds: Rectangle whose midpoint is clicked.
    func clickElement(bounds: ElementBounds) {
        let centerX = bounds.x + bounds.width / 2.0
        let centerY = bounds.y + bounds.height / 2.0
        clickAt(x: centerX, y: centerY)
    }

    /// Double-click at the given coordinates.
    ///
    /// Two down/up pairs are posted. The first pair carries click state 1 and
    /// the second click state 2, so the first click is seen as a single click
    /// and only the second as the double-click. Stamping both pairs with
    /// click state 2 makes the first click already claim to be a double-click.
    func doubleClickAt(x: Double, y: Double) {
        let point = CGPoint(x: x, y: y)
        for clickState in 1...2 {
            postMouseEvent(.leftMouseDown, at: point, button: .left, clickState: Int64(clickState))
            postMouseEvent(.leftMouseUp, at: point, button: .left, clickState: Int64(clickState))
        }
    }

    /// Right-click (right mouse down followed by right mouse up) at the given coordinates.
    func rightClickAt(x: Double, y: Double) {
        let point = CGPoint(x: x, y: y)
        postMouseEvent(.rightMouseDown, at: point, button: .right)
        postMouseEvent(.rightMouseUp, at: point, button: .right)
    }

    /// Post a single mouse-moved event targeting the given coordinates.
    func moveTo(x: Double, y: Double) {
        postMouseEvent(.mouseMoved, at: CGPoint(x: x, y: y), button: .left)
    }

    /// Drag with the left mouse button from `start` to `end`.
    ///
    /// Posts exactly three events back to back: mouse down at `start`, one
    /// dragged event at `end`, and mouse up at `end`. No intermediate points
    /// and no delays are generated.
    func drag(from start: CGPoint, to end: CGPoint) {
        postMouseEvent(.leftMouseDown, at: start, button: .left)
        postMouseEvent(.leftMouseDragged, at: end, button: .left)
        postMouseEvent(.leftMouseUp, at: end, button: .left)
    }

    /// Build one mouse event and post it to the HID event tap.
    /// - Parameters:
    ///   - type: Mouse event type (down, up, moved, dragged).
    ///   - point: Cursor position for the event.
    ///   - button: Button the event refers to.
    ///   - clickState: When non-nil, written to the event's `.mouseEventClickState` field.
    private func postMouseEvent(
        _ type: CGEventType,
        at point: CGPoint,
        button: CGMouseButton,
        clickState: Int64? = nil
    ) {
        guard let event = CGEvent(
            mouseEventSource: nil,
            mouseType: type,
            mouseCursorPosition: point,
            mouseButton: button
        ) else { return }

        if let clickState {
            event.setIntegerValueField(.mouseEventClickState, value: clickState)
        }
        event.post(tap: .cghidEventTap)
    }

    // MARK: - Keyboard Events

    /// Type a string by posting one key-down/key-up pair per Unicode scalar.
    ///
    /// Each event uses virtual key 0 and carries the scalar's text as its
    /// Unicode string, so the text does not depend on a key-code mapping.
    /// Scalars above U+FFFF are sent as their UTF-16 surrogate pair in a
    /// single event; converting such a scalar directly to a single `UniChar`
    /// would trap at runtime.
    /// - Parameter text: Text to type. An empty string posts nothing.
    func typeString(_ text: String) {
        let source = CGEventSource(stateID: .hidSystemState)

        for scalar in text.unicodeScalars {
            // One or two UTF-16 code units, depending on whether the scalar is in the BMP.
            let codeUnits = Array(String(scalar).utf16)

            let keyDown = CGEvent(keyboardEventSource: source, virtualKey: 0, keyDown: true)
            let keyUp = CGEvent(keyboardEventSource: source, virtualKey: 0, keyDown: false)

            keyDown?.keyboardSetUnicodeString(stringLength: codeUnits.count, unicodeString: codeUnits)
            keyUp?.keyboardSetUnicodeString(stringLength: codeUnits.count, unicodeString: codeUnits)

            keyDown?.post(tap: .cghidEventTap)
            keyUp?.post(tap: .cghidEventTap)
        }
    }

    /// Press and release a virtual key, optionally with modifier flags.
    /// - Parameters:
    ///   - virtualKey: Key code to press (see `VirtualKeyCode`).
    ///   - flags: Modifier flags applied to both the key-down and key-up events.
    func pressKey(virtualKey: CGKeyCode, flags: CGEventFlags = []) {
        let source = CGEventSource(stateID: .hidSystemState)
        let keyDown = CGEvent(keyboardEventSource: source, virtualKey: virtualKey, keyDown: true)
        let keyUp = CGEvent(keyboardEventSource: source, virtualKey: virtualKey, keyDown: false)

        keyDown?.flags = flags
        keyUp?.flags = flags

        keyDown?.post(tap: .cghidEventTap)
        keyUp?.post(tap: .cghidEventTap)
    }

    /// Press Return/Enter.
    func pressReturn() { pressKey(virtualKey: VirtualKeyCode.returnKey) }

    /// Press Tab.
    func pressTab() { pressKey(virtualKey: VirtualKeyCode.tab) }

    /// Press Escape.
    func pressEscape() { pressKey(virtualKey: VirtualKeyCode.escape) }

    /// Press Delete/Backspace.
    func pressDelete() { pressKey(virtualKey: VirtualKeyCode.delete) }

    /// Cmd+A (Select All).
    func selectAll() { pressKey(virtualKey: VirtualKeyCode.a, flags: .maskCommand) }

    /// Cmd+C (Copy).
    func copy() { pressKey(virtualKey: VirtualKeyCode.c, flags: .maskCommand) }

    /// Cmd+V (Paste).
    func paste() { pressKey(virtualKey: VirtualKeyCode.v, flags: .maskCommand) }

    /// Cmd+X (Cut).
    func cut() { pressKey(virtualKey: VirtualKeyCode.x, flags: .maskCommand) }

    /// Cmd+Z (Undo).
    func undo() { pressKey(virtualKey: VirtualKeyCode.z, flags: .maskCommand) }

    /// Cmd+Shift+Z (Redo).
    func redo() { pressKey(virtualKey: VirtualKeyCode.z, flags: [.maskCommand, .maskShift]) }

    // MARK: - Scroll

    /// Post a pixel-unit scroll-wheel event.
    ///
    /// No position is attached to the event by this method.
    /// - Parameters:
    ///   - deltaY: Value for wheel 1 (vertical axis).
    ///   - deltaX: Value for wheel 2 (horizontal axis). Defaults to 0.
    func scroll(deltaY: Int32, deltaX: Int32 = 0) {
        let event = CGEvent(
            scrollWheelEvent2Source: nil,
            units: .pixel,
            wheelCount: 2,
            wheel1: deltaY,
            wheel2: deltaX,
            wheel3: 0
        )
        event?.post(tap: .cghidEventTap)
    }
}
279
+
280
+ // MARK: - InteractionLayer Conformance
281
+
282
extension CGEventLayer: InteractionLayer {

    /// Layer name; also embedded in the `LayerError.notSupported` errors thrown below.
    var name: String {
        return "CGEvent"
    }

    /// Priority value reported for this layer.
    var priority: Int {
        return 40
    }

    /// Always `true`: input injection does not depend on which app is targeted.
    func canHandle(bundleID: String?, appName: String) -> Bool {
        true
    }

    /// Always throws `LayerError.notSupported`; this layer cannot produce a UI tree.
    func snapshot(pid: Int32, maxDepth: Int) throws -> [PilotElement] {
        let reason = "CGEvent cannot read UI state. Use Accessibility layer for snapshots."
        throw LayerError.notSupported(layer: name, reason: reason)
    }

    /// Always throws `LayerError.notSupported`.
    ///
    /// A CGEvent click needs coordinates rather than an element ref, so callers
    /// are pointed at `clickAt(x:y:)` / `clickElement(bounds:)` instead.
    func click(ref: String) throws {
        let reason = "Use clickAt(x:y:) directly with coordinates from element bounds."
        throw LayerError.notSupported(layer: name, reason: reason)
    }

    /// Types `text` via `typeString(_:)`.
    ///
    /// `ref` is ignored: the key events are not addressed to a specific
    /// element, so the text goes to whatever currently has keyboard focus.
    func typeText(ref: String, text: String) throws {
        typeString(text)
    }

    /// Always throws `LayerError.notSupported`; this layer cannot read element values.
    func readValue(ref: String) throws -> String? {
        let reason = "CGEvent cannot read UI state. Use Accessibility layer for reading."
        throw LayerError.notSupported(layer: name, reason: reason)
    }
}
@@ -0,0 +1,40 @@
1
+ import Foundation
2
+
3
+ // MARK: - Layer Errors
4
+
5
/// Failures that an interaction layer can report.
enum LayerError: Error, Sendable {

    // MARK: Capability

    /// The named layer does not implement the requested operation; `reason` explains why.
    case notSupported(layer: String, reason: String)

    // MARK: Per-element failures

    /// The element identified by `ref` cannot be interacted with.
    case elementNotInteractable(ref: String)

    /// Reading the element identified by `ref` failed.
    case readFailed(ref: String, reason: String)

    /// Typing into the element identified by `ref` failed.
    case typingFailed(ref: String, reason: String)

    // MARK: Per-process failures

    /// Capturing a snapshot for the process `pid` failed.
    case snapshotFailed(pid: Int32, reason: String)
}
13
+
14
+ // MARK: - Interaction Layer Protocol
15
+
16
/// One way of interacting with an app (accessibility, applescript, cgevent, screenshot).
///
/// The Router selects the highest-priority layer whose `canHandle` accepts the target app.
protocol InteractionLayer: Sendable {

    // MARK: Identity

    /// Human-readable layer name, e.g. "Accessibility" or "AppleScript".
    var name: String { get }

    /// Ordering key for the Router: a lower number means the layer is preferred.
    var priority: Int { get }

    // MARK: Capability check

    /// Reports whether this layer can handle the given app.
    /// - Parameters:
    ///   - bundleID: Bundle identifier of the app, if known.
    ///   - appName: Name of the app.
    func canHandle(bundleID: String?, appName: String) -> Bool

    // MARK: Operations

    /// Captures a UI snapshot tree for the app running as process `pid`.
    /// - Parameters:
    ///   - pid: Process identifier of the target app.
    ///   - maxDepth: Depth limit for the captured tree.
    func snapshot(pid: Int32, maxDepth: Int) throws -> [PilotElement]

    /// Clicks the element identified by `ref`.
    func click(ref: String) throws

    /// Types `text` into the element identified by `ref`.
    func typeText(ref: String, text: String) throws

    /// Reads the current value of the element identified by `ref`.
    func readValue(ref: String) throws -> String?
}
@@ -0,0 +1,122 @@
1
+ import CoreGraphics
2
+ import AppKit
3
+ import Foundation
4
+
5
/// Screenshot-based fallback for custom-rendered content.
/// Intended for UI that the Accessibility API cannot see (game viewports, canvas, etc.).
///
/// Every capture method returns PNG-encoded `Data`, or `nil` when the image
/// could not be captured or encoded. The `...Base64` variants return the same
/// bytes as a base64 string.
final class ScreenshotLayer: @unchecked Sendable {

    // Stored from the initializer; not read by any method in this class.
    private let bridge: AXBridge
    private let store: ElementStore

    init(bridge: AXBridge, store: ElementStore) {
        self.bridge = bridge
        self.store = store
    }

    // MARK: - Capture (PNG data)

    /// PNG of the whole main display.
    func captureFullScreen() -> Data? {
        CGDisplayCreateImage(CGMainDisplayID()).flatMap { pngData(from: $0) }
    }

    /// PNG of a rectangular region of the main display.
    /// - Parameters:
    ///   - x: Left edge of the region.
    ///   - y: Top edge of the region.
    ///   - width: Region width.
    ///   - height: Region height.
    func captureRegion(x: Double, y: Double, width: Double, height: Double) -> Data? {
        let region = CGRect(
            origin: CGPoint(x: x, y: y),
            size: CGSize(width: width, height: height)
        )
        return CGDisplayCreateImage(CGMainDisplayID(), rect: region).flatMap { pngData(from: $0) }
    }

    /// PNG of the area covered by an element's bounds.
    /// - Parameter bounds: Rectangle to capture; forwarded to `captureRegion`.
    func captureElement(bounds: ElementBounds) -> Data? {
        captureRegion(x: bounds.x, y: bounds.y, width: bounds.width, height: bounds.height)
    }

    /// PNG of a single window.
    /// - Parameter windowID: Identifier of the window to capture.
    func captureWindow(windowID: CGWindowID) -> Data? {
        // `.null` bounds let the window list API size the image to the window itself.
        guard let windowImage = CGWindowListCreateImage(
            .null,
            .optionIncludingWindow,
            windowID,
            [.boundsIgnoreFraming, .bestResolution]
        ) else {
            return nil
        }
        return pngData(from: windowImage)
    }

    // MARK: - Capture (base64, for MCP image responses)

    /// Base64 string of `captureFullScreen()`.
    func captureFullScreenBase64() -> String? {
        captureFullScreen()?.base64EncodedString()
    }

    /// Base64 string of `captureRegion(x:y:width:height:)`.
    func captureRegionBase64(x: Double, y: Double, width: Double, height: Double) -> String? {
        captureRegion(x: x, y: y, width: width, height: height)?.base64EncodedString()
    }

    /// Base64 string of `captureElement(bounds:)`.
    func captureElementBase64(bounds: ElementBounds) -> String? {
        captureElement(bounds: bounds)?.base64EncodedString()
    }

    // MARK: - PNG Encoding

    /// Encode a `CGImage` as PNG via `NSBitmapImageRep`.
    private func pngData(from cgImage: CGImage) -> Data? {
        NSBitmapImageRep(cgImage: cgImage).representation(using: .png, properties: [:])
    }
}
83
+
84
+ // MARK: - InteractionLayer Conformance
85
+
86
extension ScreenshotLayer: InteractionLayer {

    /// Layer name; also embedded in the `LayerError.notSupported` errors thrown below.
    var name: String {
        return "Screenshot"
    }

    /// Lowest priority -- last resort.
    var priority: Int {
        return 50
    }

    /// Always `true`: a screenshot can be taken regardless of the target app.
    func canHandle(bundleID: String?, appName: String) -> Bool {
        true
    }

    /// Always throws `LayerError.notSupported`; a screenshot carries no structured UI tree.
    func snapshot(pid: Int32, maxDepth: Int) throws -> [PilotElement] {
        let reason = "Screenshot layer cannot provide structured snapshots. Use pilot_screenshot tool instead."
        throw LayerError.notSupported(layer: name, reason: reason)
    }

    /// Always throws `LayerError.notSupported`.
    func click(ref: String) throws {
        let reason = "Screenshot layer cannot click elements. Use coordinate-based clicking via CGEvent."
        throw LayerError.notSupported(layer: name, reason: reason)
    }

    /// Always throws `LayerError.notSupported`.
    func typeText(ref: String, text: String) throws {
        let reason = "Screenshot layer cannot type. Use CGEvent or Accessibility layer."
        throw LayerError.notSupported(layer: name, reason: reason)
    }

    /// Always throws `LayerError.notSupported`.
    func readValue(ref: String) throws -> String? {
        let reason = "Screenshot layer cannot read values. Use Accessibility layer."
        throw LayerError.notSupported(layer: name, reason: reason)
    }
}