desktop-pilot-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,242 @@
1
+ import AppKit
2
+ import Foundation
3
+
4
+ // MARK: - Interaction Method
5
+
6
/// An interaction layer through which a macOS application can be driven.
///
/// The cases carry no payload; they only name the mechanism a caller
/// should use.
enum InteractionMethod: Equatable, Sendable {
    case accessibility, applescript, cgevent, screenshot
}
13
+
14
+ // MARK: - App Category
15
+
16
/// How an app is classified, based on its technology stack and whether it
/// can be scripted.
enum AppCategory: Equatable, Sendable {
    /// The app has an AppleScript dictionary (scriptable via `sdef`).
    case scriptable

    /// A Chromium-based (Electron) app: limited Accessibility support and
    /// no AppleScript.
    case electron

    /// A standard native macOS app with good Accessibility support.
    case nativeStandard

    /// The technology stack could not be determined.
    case unknown
}
27
+
28
+ // MARK: - Router
29
+
30
/// Routes operations to the best available interaction layer for a given
/// app and action.
///
/// The method is selected from:
/// - The action being performed (snapshot, click, type, menu, script, find)
/// - The target app's category (scriptable, Electron, native, unknown)
/// - Known bundle-ID lists for scriptable and Electron apps
final class Router: Sendable {

    // MARK: - Known Bundle IDs

    /// macOS apps that expose an AppleScript dictionary.
    ///
    /// Entries are stored lowercased because bundle identifiers are
    /// case-insensitive; lookups lowercase the candidate ID before testing
    /// membership.
    private static let scriptableBundleIDs: Set<String> = Set([
        "com.apple.finder",
        "com.apple.Safari",
        "com.apple.mail",
        "com.apple.Notes",
        "com.apple.iWork.Keynote",
        "com.apple.iWork.Numbers",
        "com.apple.iWork.Pages",
        "com.apple.dt.Xcode",
        "com.apple.iMovie",
        "com.apple.garageband",
        "com.apple.MobileSMS",
        "com.apple.ical",
        "com.apple.reminders",
        "com.apple.Music",
    ].map { $0.lowercased() })

    /// Electron / Chromium-based apps with limited AX and no AppleScript.
    ///
    /// Stored lowercased for the same reason as `scriptableBundleIDs`.
    private static let electronBundleIDs: Set<String> = Set([
        "com.hnc.Discord",
        "com.microsoft.VSCode",
        "com.tinyspeck.slackmacgap",
        "org.whispersystems.signal-desktop",
        "com.spotify.client",
    ].map { $0.lowercased() })

    // MARK: - App Categorization

    /// Classify an app by its bundle ID, with an `sdef` detection fallback.
    ///
    /// The lookup order is:
    /// 1. Check the static Electron set (these must never use AppleScript).
    /// 2. Check the static scriptable set.
    /// 3. Attempt dynamic `sdef` detection for bundle IDs in neither set.
    /// 4. Fall back to `.nativeStandard` for IDs starting with `com.apple.`,
    ///    `.unknown` otherwise.
    ///
    /// Steps 1, 2 and 4 compare case-insensitively. Step 3 launches a
    /// subprocess and blocks until it exits, so this call can be slow for
    /// bundle IDs that are not in the static sets.
    ///
    /// - Parameters:
    ///   - bundleID: The bundle identifier, if known. `nil` yields `.unknown`.
    ///   - appName: The display name. Currently not consulted by the
    ///     classification; kept so callers do not need to change.
    /// - Returns: The inferred `AppCategory`.
    func categorize(bundleID: String?, appName: String) -> AppCategory {
        guard let id = bundleID else {
            return .unknown
        }

        // Bundle identifiers are case-insensitive, so compare in lowercase.
        let key = id.lowercased()

        if Self.electronBundleIDs.contains(key) {
            return .electron
        }

        if Self.scriptableBundleIDs.contains(key) {
            return .scriptable
        }

        // Pass the caller's original spelling through to the workspace lookup.
        if hasSdefDictionary(bundleID: id) {
            return .scriptable
        }

        return key.hasPrefix("com.apple.") ? .nativeStandard : .unknown
    }

    // MARK: - App-level Routing

    /// Determine the best general interaction method for an app.
    ///
    /// Scriptable apps are routed to AppleScript; every other category is
    /// routed to Accessibility.
    ///
    /// - Parameters:
    ///   - appName: The display name of the target application.
    ///   - bundleID: The bundle identifier, if known.
    /// - Returns: The recommended interaction method.
    func bestMethod(appName: String, bundleID: String?) -> InteractionMethod {
        switch categorize(bundleID: bundleID, appName: appName) {
        case .scriptable:
            return .applescript
        case .electron, .nativeStandard, .unknown:
            return .accessibility
        }
    }

    // MARK: - Action-level Routing

    /// Determine the best method for a specific operation on an app.
    ///
    /// Routing rules by action (matched case-insensitively):
    /// - `snapshot` / `read` / `find` -- Accessibility (only method that reads UI state)
    /// - `click` -- Accessibility (AXPress is more precise than coordinate clicking)
    /// - `type` -- CGEvent for most apps (more reliable), Accessibility for Electron
    /// - `menu` -- Accessibility (menu bar traversal)
    /// - `script` -- AppleScript for scriptable apps, Accessibility otherwise
    /// - anything else -- same result as `bestMethod(appName:bundleID:)`
    ///
    /// The app is categorized only for actions whose routing depends on the
    /// category, so category-independent actions never trigger the `sdef`
    /// subprocess used by `categorize(bundleID:appName:)`.
    ///
    /// - Parameters:
    ///   - action: The action being performed (e.g. "click", "type", "snapshot").
    ///   - appName: The display name of the target application.
    ///   - bundleID: The bundle identifier, if known.
    /// - Returns: The recommended interaction method.
    func bestMethodForAction(
        action: String,
        appName: String,
        bundleID: String?
    ) -> InteractionMethod {
        switch action.lowercased() {
        case "snapshot", "read", "find", "click", "menu":
            return .accessibility

        case "type":
            return routeTyping(
                category: categorize(bundleID: bundleID, appName: appName)
            )

        case "script":
            return routeScripting(
                category: categorize(bundleID: bundleID, appName: appName)
            )

        default:
            // bestMethod performs its own (single) categorization.
            return bestMethod(appName: appName, bundleID: bundleID)
        }
    }

    // MARK: - Capability Check

    /// Check whether a given method is available on this system.
    ///
    /// Currently every method is reported as available; the argument is
    /// not inspected.
    ///
    /// - Parameter method: The interaction method to check.
    /// - Returns: `true` if the method can be used.
    func isAvailable(_ method: InteractionMethod) -> Bool {
        return true
    }

    // MARK: - Private Helpers

    /// Pick the best method for typing text into the given app category.
    ///
    /// CGEvent is generally more reliable for keystroke injection, but
    /// Electron apps sometimes swallow raw key events, so Accessibility
    /// (AXSetValue) is safer there.
    private func routeTyping(category: AppCategory) -> InteractionMethod {
        switch category {
        case .electron:
            return .accessibility
        case .scriptable, .nativeStandard, .unknown:
            return .cgevent
        }
    }

    /// Pick the best method for executing a script against the given app category.
    ///
    /// Only apps with an AppleScript dictionary benefit from `.applescript`;
    /// everything else falls back to Accessibility.
    private func routeScripting(category: AppCategory) -> InteractionMethod {
        switch category {
        case .scriptable:
            return .applescript
        case .electron, .nativeStandard, .unknown:
            return .accessibility
        }
    }

    /// Detect whether an app has an AppleScript dictionary via `sdef`.
    ///
    /// Resolves the app's location through `NSWorkspace`, runs
    /// `/usr/bin/sdef` on that path with output discarded, and blocks until
    /// the tool exits. A zero exit status is treated as "dictionary present".
    ///
    /// - Parameter bundleID: The bundle identifier to resolve.
    /// - Returns: `true` if `sdef` exited with status 0; `false` if the app
    ///   cannot be located, the tool cannot be launched, or it exits non-zero.
    private func hasSdefDictionary(bundleID: String) -> Bool {
        guard let url = NSWorkspace.shared.urlForApplication(
            withBundleIdentifier: bundleID
        ) else {
            return false
        }

        let process = Process()
        process.executableURL = URL(fileURLWithPath: "/usr/bin/sdef")
        process.arguments = [url.path]
        // Only the exit status matters; discard the generated definition.
        process.standardOutput = FileHandle.nullDevice
        process.standardError = FileHandle.nullDevice

        do {
            try process.run()
            process.waitUntilExit()
            return process.terminationStatus == 0
        } catch {
            return false
        }
    }
}
@@ -0,0 +1,192 @@
1
+ import ApplicationServices
2
+ import Foundation
3
+
4
+ // MARK: - Snapshot Builder
5
+
6
/// Builds a `PilotElement` tree from a running app's accessibility tree.
///
/// Walks the AX hierarchy starting from the app element, reads attributes
/// via `AXBridge`, and registers every kept element in the `ElementStore`
/// so it can be referenced later by the ref the store returns.
struct SnapshotBuilder: Sendable {
    let bridge: AXBridge

    /// Attributes fetched in a single batch call per element.
    ///
    /// `readBatchAttributes` reads the result positionally, so the order
    /// here must stay in sync with the indices used there.
    private static let batchAttributes: [String] = [
        kAXRoleAttribute,
        kAXTitleAttribute,
        kAXValueAttribute,
        kAXDescriptionAttribute,
        kAXEnabledAttribute,
        kAXFocusedAttribute,
    ]

    // MARK: - Public API

    /// Build a complete snapshot of an app's UI tree.
    ///
    /// The store is reset first. The app's windows are used as roots; if
    /// none of them yields an element, the app element's direct children
    /// are tried instead.
    ///
    /// - Parameters:
    ///   - appElement: The root AXUIElement for the app (from `AXBridge.appElement`).
    ///   - appName: Display name of the application.
    ///   - bundleID: Bundle identifier (e.g. "com.apple.Safari"), if available.
    ///   - pid: Process identifier.
    ///   - store: The `ElementStore` that will hold ref-to-element mappings.
    ///   - maxDepth: Maximum recursion depth to prevent runaway traversal (default 10).
    /// - Returns: An `AppSnapshot` with the built elements, the store's
    ///   element count, and an ISO 8601 timestamp of the current time.
    func buildSnapshot(
        appElement: AXUIElement,
        appName: String,
        bundleID: String?,
        pid: Int32,
        store: ElementStore,
        maxDepth: Int = 10
    ) async -> AppSnapshot {
        await store.reset()

        var topLevelElements = await buildElements(
            from: bridge.getWindows(appElement),
            store: store,
            depth: 0,
            maxDepth: maxDepth
        )

        // If the windows produced nothing, try direct children of the app element.
        if topLevelElements.isEmpty {
            topLevelElements = await buildElements(
                from: bridge.getChildren(appElement),
                store: store,
                depth: 0,
                maxDepth: maxDepth
            )
        }

        let count = await store.count()
        let formatter = ISO8601DateFormatter()

        return AppSnapshot(
            app: appName,
            bundleID: bundleID,
            pid: pid,
            timestamp: formatter.string(from: Date()),
            elementCount: count,
            elements: topLevelElements
        )
    }

    // MARK: - Recursive Tree Building

    /// Build elements for every item in `roots`, in order, dropping the
    /// ones `buildElement` rejects.
    private func buildElements<Roots: Sequence>(
        from roots: Roots,
        store: ElementStore,
        depth: Int,
        maxDepth: Int
    ) async -> [PilotElement] where Roots.Element == AXUIElement {
        var built: [PilotElement] = []
        for root in roots {
            if let element = await buildElement(
                from: root,
                store: store,
                depth: depth,
                maxDepth: maxDepth
            ) {
                built.append(element)
            }
        }
        return built
    }

    /// Build a single `PilotElement` from an `AXUIElement`, recursing into children.
    ///
    /// Returns `nil` when the element has no role, title, value, or
    /// description, or when its role is `"AXUnknown"`. Rejected elements
    /// are not registered in the store and their children are not visited.
    private func buildElement(
        from axElement: AXUIElement,
        store: ElementStore,
        depth: Int,
        maxDepth: Int
    ) async -> PilotElement? {
        let attrs = readBatchAttributes(axElement)

        let role = attrs.role

        // Skip elements that carry no information at all.
        if role == nil && attrs.title == nil && attrs.value == nil && attrs.description == nil {
            return nil
        }

        // Skip explicitly unknown roles.
        if role == "AXUnknown" {
            return nil
        }

        // Register the parent before its children so it receives its ref first.
        let wrapper = AXElementWrapper(axElement)
        let ref = await store.register(wrapper)
        let bounds = bridge.getBounds(axElement)

        // Recurse into children only while within the depth limit.
        var childElements: [PilotElement]?
        if depth < maxDepth {
            let built = await buildElements(
                from: bridge.getChildren(axElement),
                store: store,
                depth: depth + 1,
                maxDepth: maxDepth
            )
            childElements = built.isEmpty ? nil : built
        }

        return PilotElement(
            ref: ref,
            role: role ?? "AXUnknown",
            title: attrs.title,
            value: attrs.value,
            description: attrs.description,
            enabled: attrs.enabled,
            focused: attrs.focused,
            bounds: bounds,
            children: childElements
        )
    }

    // MARK: - Batch Attribute Reading

    /// Holds the parsed result of a batch attribute read.
    private struct BatchResult {
        let role: String?
        let title: String?
        let value: String?
        let description: String?
        let enabled: Bool
        let focused: Bool
    }

    /// Read all standard attributes in one batch call.
    ///
    /// The bridge result is read positionally, in `batchAttributes` order.
    /// If the bridge returns fewer values than were requested, an empty
    /// result is returned (no strings, `enabled == true`,
    /// `focused == false`) instead of indexing out of bounds.
    private func readBatchAttributes(_ element: AXUIElement) -> BatchResult {
        let values = bridge.getAttributes(element, Self.batchAttributes)

        // Guard the positional reads below against a short or empty result.
        guard values.count >= Self.batchAttributes.count else {
            return BatchResult(
                role: nil,
                title: nil,
                value: nil,
                description: nil,
                enabled: true,
                focused: false
            )
        }

        let role = values[0] as? String
        let title = values[1] as? String

        // Value needs special handling: could be String, NSNumber, etc.
        let value: String? = {
            guard let raw = values[2] else { return nil }
            if let str = raw as? String { return str }
            if let num = raw as? NSNumber { return num.stringValue }
            return String(describing: raw)
        }()

        let description = values[3] as? String
        // Missing flags default to "enabled" and "not focused".
        let enabled = (values[4] as? Bool) ?? true
        let focused = (values[5] as? Bool) ?? false

        return BatchResult(
            role: role,
            title: title,
            value: value,
            description: description,
            enabled: enabled,
            focused: focused
        )
    }
}
@@ -0,0 +1,190 @@
1
+ import ApplicationServices
2
+ import AppKit
3
+ import Foundation
4
+
5
+ // MARK: - Layer-local Element Cache
6
+
7
/// Lock-based element cache used by AccessibilityLayer.
///
/// Every access to the stored elements and the ref counter happens while
/// an `NSLock` is held, which lets the cache be used from synchronous
/// `InteractionLayer` methods instead of going through an actor.
private final class LayerElementCache: @unchecked Sendable {

    private let lock = NSLock()
    private var elements: [String: AXElementWrapper] = [:]
    private var counter: Int = 0

    /// Run `body` while holding the lock, releasing it on every exit path.
    private func withLockHeld<Result>(_ body: () -> Result) -> Result {
        lock.lock()
        defer { lock.unlock() }
        return body()
    }

    /// Store `element` under the next sequential ref (`e1`, `e2`, ...) and
    /// return that ref.
    func register(_ element: AXUIElement) -> String {
        return withLockHeld { () -> String in
            counter += 1
            let ref = "e\(counter)"
            elements[ref] = AXElementWrapper(element)
            return ref
        }
    }

    /// Return the wrapper stored under `ref`, or `nil` if there is none.
    func lookup(_ ref: String) -> AXElementWrapper? {
        return withLockHeld { () -> AXElementWrapper? in
            elements[ref]
        }
    }

    /// Drop every stored element and restart ref numbering (call before a
    /// fresh snapshot).
    func clear() {
        withLockHeld { () -> Void in
            elements = [:]
            counter = 0
        }
    }

    /// Number of elements currently stored.
    func count() -> Int {
        return withLockHeld { () -> Int in
            elements.count
        }
    }
}
49
+
50
+ // MARK: - Accessibility Layer
51
+
52
/// Primary interaction layer using the macOS Accessibility API.
///
/// Walks AXUIElement trees through `AXBridge`, produces `PilotElement`
/// snapshots, and performs actions (click, type, read) on elements that
/// were registered during the most recent snapshot.
final class AccessibilityLayer: @unchecked Sendable, InteractionLayer {

    let name: String = "Accessibility"
    let priority: Int = 0

    private let bridge: AXBridge
    private let cache: LayerElementCache

    init(bridge: AXBridge = AXBridge()) {
        self.bridge = bridge
        self.cache = LayerElementCache()
    }

    // MARK: - InteractionLayer Conformance

    /// Always reports `true`.
    ///
    /// Any app that exposes an AX tree can be handled; apps that do not
    /// cooperate surface as failures of the individual operations instead.
    func canHandle(bundleID: String?, appName: String) -> Bool {
        return true
    }

    /// Build a fresh element tree for the process `pid`.
    ///
    /// Clears the ref cache, then uses the app's windows as roots, or the
    /// app element's direct children when there are no windows.
    ///
    /// - Throws: `PlatformError.permissionDenied` when the bridge reports
    ///   accessibility as disabled; `LayerError.snapshotFailed` when the
    ///   app has neither windows nor children.
    func snapshot(pid: Int32, maxDepth: Int) throws -> [PilotElement] {
        if !bridge.isAccessibilityEnabled() {
            throw PlatformError.permissionDenied
        }

        cache.clear()

        let application = bridge.appElement(pid: pid)
        var roots = bridge.getWindows(application)
        if roots.isEmpty {
            roots = bridge.getChildren(application)
        }

        guard !roots.isEmpty else {
            throw LayerError.snapshotFailed(
                pid: pid,
                reason: "App has no windows or accessible children"
            )
        }

        var tree: [PilotElement] = []
        for root in roots {
            if let node = buildElement(root, depth: 0, maxDepth: maxDepth) {
                tree.append(node)
            }
        }
        return tree
    }

    /// Perform the AX press action on the element registered under `ref`.
    ///
    /// - Throws: `PlatformError.elementNotFound` for an unknown ref;
    ///   `PlatformError.actionFailed` when the bridge reports failure.
    func click(ref: String) throws {
        let target = try resolveRef(ref)
        guard bridge.performAction(target.element, kAXPressAction) else {
            throw PlatformError.actionFailed(
                action: "press",
                reason: "AXPress action failed for ref '\(ref)'"
            )
        }
    }

    /// Focus the element registered under `ref` and set its AX value to `text`.
    ///
    /// The result of the focus attempt is ignored; only a failure to set
    /// the value is reported.
    ///
    /// - Throws: `PlatformError.elementNotFound` for an unknown ref;
    ///   `LayerError.typingFailed` when the value cannot be set.
    func typeText(ref: String, text: String) throws {
        let target = try resolveRef(ref).element

        // Best-effort focus before writing the value.
        _ = bridge.setAttribute(target, kAXFocusedAttribute, kCFBooleanTrue)

        guard bridge.setAttribute(target, kAXValueAttribute, text as CFTypeRef) else {
            throw LayerError.typingFailed(
                ref: ref,
                reason: "Could not set AXValue on element -- it may not be an editable text field"
            )
        }
    }

    /// Read the current value of the element registered under `ref`.
    ///
    /// - Throws: `PlatformError.elementNotFound` for an unknown ref.
    func readValue(ref: String) throws -> String? {
        return bridge.getValue(try resolveRef(ref).element)
    }

    // MARK: - Public Helpers

    /// Return the wrapper registered under `ref`, or `nil` if there is none.
    func findElement(ref: String) -> AXElementWrapper? {
        return cache.lookup(ref)
    }

    // MARK: - Private Helpers

    /// Resolve a ref string to its wrapper, throwing
    /// `PlatformError.elementNotFound` when the cache has no entry for it.
    private func resolveRef(_ ref: String) throws -> AXElementWrapper {
        if let found = cache.lookup(ref) {
            return found
        }
        throw PlatformError.elementNotFound(ref: ref)
    }

    /// Recursively build a PilotElement tree from an AXUIElement.
    ///
    /// Elements without a role, or with the role `"AXUnknown"`, are dropped
    /// together with their subtree and are not registered in the cache.
    private func buildElement(
        _ element: AXUIElement,
        depth: Int,
        maxDepth: Int
    ) -> PilotElement? {
        guard let role = bridge.getRole(element), role != "AXUnknown" else {
            return nil
        }

        // Register the parent first so its ref precedes its children's refs.
        let ref = cache.register(element)

        // Arguments are evaluated left to right, so the attribute reads
        // happen before the recursion into children.
        return PilotElement(
            ref: ref,
            role: role,
            title: bridge.getTitle(element),
            value: bridge.getValue(element),
            description: bridge.getDescription(element),
            enabled: bridge.isEnabled(element),
            focused: bridge.isFocused(element),
            bounds: bridge.getBounds(element),
            children: buildChildren(of: element, depth: depth, maxDepth: maxDepth)
        )
    }

    /// Build the child elements of `element`, or return `nil` when the depth
    /// limit has been reached or no child produced an element.
    private func buildChildren(
        of element: AXUIElement,
        depth: Int,
        maxDepth: Int
    ) -> [PilotElement]? {
        guard depth < maxDepth else { return nil }

        var nodes: [PilotElement] = []
        for child in bridge.getChildren(element) {
            if let node = buildElement(child, depth: depth + 1, maxDepth: maxDepth) {
                nodes.append(node)
            }
        }
        return nodes.isEmpty ? nil : nodes
    }
}