desktop-pilot-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,462 @@
1
+ import Foundation
2
+
3
+ // MARK: - AppleScript Layer
4
+
5
+ /// Interaction layer using AppleScript + System Events.
6
+ ///
7
+ /// This is the **second priority** layer (priority 20) -- preferred over raw
8
+ /// accessibility (priority 0 = most available but least smart) for scriptable
9
+ /// apps. The Router tries layers in ascending priority order, so lower
10
+ /// priority = tried first. Since AccessibilityLayer has priority 0 it is
11
+ /// the fallback; AppleScriptLayer at 20 is offered as an alternative when
12
+ /// the app is natively scriptable.
13
+ ///
14
+ /// Design decisions:
15
+ /// - **Snapshot**: Uses System Events `entire contents` to enumerate UI
16
+ /// elements. Produces `PilotElement` nodes with synthetic refs prefixed
17
+ /// "as" (e.g. "as1", "as2") so they don't collide with accessibility refs.
18
+ /// - **Click**: Uses System Events `click` on the element description stored
19
+ /// during the last snapshot.
20
+ /// - **Type**: Uses System Events `keystroke`, which simulates real key
21
+ /// presses and is more reliable than AXSetValue for Electron apps, web
22
+ /// views, and other non-native text fields.
23
+ /// - **Read**: Uses System Events to query element properties.
24
+ ///
25
+ /// This layer does NOT use the shared `ElementStore` actor -- it maintains
26
+ /// its own lock-based cache of element descriptions for synchronous access.
27
+ final class AppleScriptLayer: @unchecked Sendable, InteractionLayer {
28
+
29
+ let name: String = "AppleScript"
30
+ let priority: Int = 20
31
+
32
+ private let sysEvents: SystemEventsHelper
33
+ private let elementCache: ASElementCache
34
+
35
/// Creates the layer with an empty element cache.
///
/// - Parameter systemEvents: Helper stored as this layer's System Events
///   gateway; defaults to a newly constructed `SystemEventsHelper`.
init(systemEvents: SystemEventsHelper = SystemEventsHelper()) {
    elementCache = ASElementCache()
    sysEvents = systemEvents
}
39
+
40
+ // MARK: - InteractionLayer Conformance
41
+
42
/// Reports whether this layer should be offered for the given app.
///
/// Only `appName` is consulted; `bundleID` is not used by this layer.
///
/// - Returns: The result of the helper's scriptability check for `appName`.
func canHandle(bundleID: String?, appName: String) -> Bool {
    // System Events can talk to any running app, but this layer only
    // volunteers for apps with a real AppleScript dictionary; apps without
    // an sdef are better served by the Accessibility layer.
    sysEvents.isScriptable(appName: appName)
}
48
+
49
/// Enumerates the UI elements of the process identified by `pid`.
///
/// - Parameters:
///   - pid: Unix process id of the target app.
///   - maxDepth: Not consulted by this implementation.
/// - Returns: The elements produced by parsing the System Events output.
/// - Throws: `LayerError.snapshotFailed` when the PID cannot be resolved
///   to a process name or when the System Events enumeration fails.
func snapshot(pid: Int32, maxDepth: Int) throws -> [PilotElement] {
    let appName = try resolveAppName(pid: pid)

    switch sysEvents.getUIElements(appName: appName) {
    case .success(let output):
        // Discard the previous snapshot's refs only once replacement data
        // is in hand. Clearing before the enumeration meant a failed call
        // wiped every ref from the last good snapshot, whereas a failed
        // PID lookup (above) left them intact.
        elementCache.clear()
        return parseUIElementOutput(output, appName: appName)
    case .failure(let error):
        throw LayerError.snapshotFailed(
            pid: pid,
            reason: "System Events snapshot failed: \(error.localizedDescription)"
        )
    }
}
65
+
66
/// Clicks the element registered under `ref` via System Events.
///
/// - Parameter ref: A ref handed out by the element cache.
/// - Throws: `PlatformError.elementNotFound` when `ref` is not in the
///   cache; `PlatformError.actionFailed` when the click itself fails.
func click(ref: String) throws {
    guard let target = elementCache.lookup(ref) else {
        throw PlatformError.elementNotFound(ref: ref)
    }

    let outcome = sysEvents.clickElement(
        appName: target.appName,
        elementDescription: target.elementPath
    )

    // Success needs no further work; only a failure is surfaced.
    if case .failure(let error) = outcome {
        throw PlatformError.actionFailed(
            action: "click",
            reason: "System Events click failed for ref '\(ref)': \(error.localizedDescription)"
        )
    }
}
86
+
87
/// Types `text` into the app owning the element registered under `ref`.
///
/// - Parameters:
///   - ref: A ref handed out by the element cache.
///   - text: The text passed to the helper's keystroke call.
/// - Throws: `PlatformError.elementNotFound` when `ref` is not in the
///   cache; `LayerError.typingFailed` when the keystroke call fails.
func typeText(ref: String, text: String) throws {
    guard let target = elementCache.lookup(ref) else {
        throw PlatformError.elementNotFound(ref: ref)
    }

    // Focus attempt: click the element first. The click's outcome is
    // intentionally discarded so typing is still attempted if it fails.
    _ = sysEvents.clickElement(
        appName: target.appName,
        elementDescription: target.elementPath
    )

    // Keystrokes are addressed to the app, not to the element path.
    let outcome = sysEvents.typeText(appName: target.appName, text: text)

    guard case .failure(let error) = outcome else { return }
    throw LayerError.typingFailed(
        ref: ref,
        reason: "System Events keystroke failed: \(error.localizedDescription)"
    )
}
114
+
115
/// Reads a textual value for the element registered under `ref`.
///
/// The generated script asks System Events for the element's `value`,
/// falls back to its `name` if that errors, and returns an empty string
/// if both fail.
///
/// - Parameter ref: A ref handed out by the element cache.
/// - Returns: The script output, or `nil` when the output is empty.
/// - Throws: `PlatformError.elementNotFound` when `ref` is not in the
///   cache; `LayerError.readFailed` when the script could not be run.
func readValue(ref: String) throws -> String? {
    guard let entry = elementCache.lookup(ref) else {
        throw PlatformError.elementNotFound(ref: ref)
    }

    // `entry.elementPath` is interpolated as-is: it is an AppleScript
    // element specifier, not a string literal. Only the process name sits
    // inside quotes and therefore needs escaping.
    let script = """
    tell application "System Events"
    tell process "\(escapeForAppleScript(entry.appName))"
    try
    set elemVal to value of \(entry.elementPath)
    return elemVal as text
    on error
    try
    set elemTitle to name of \(entry.elementPath)
    return elemTitle as text
    on error
    return ""
    end try
    end try
    end tell
    end tell
    """

    switch sysEvents.runAppleScript(script) {
    case .success(let output):
        return output.isEmpty ? nil : output
    case .failure(let error):
        // Include the underlying error, as every other method of this
        // layer does, instead of discarding the cause of the failure.
        throw LayerError.readFailed(
            ref: ref,
            reason: "Could not read value for element via System Events: \(error.localizedDescription)"
        )
    }
}
150
+
151
+ // MARK: - Private Helpers
152
+
153
/// Looks up the System Events process name for a Unix process id.
///
/// - Parameter pid: The process id to resolve.
/// - Returns: The non-empty process name reported by System Events.
/// - Throws: `LayerError.snapshotFailed` when the script fails or when it
///   yields an empty name.
private func resolveAppName(pid: Int32) throws -> String {
    let lookupScript = """
    tell application "System Events"
    set targetProcess to first process whose unix id is \(pid)
    return name of targetProcess
    end tell
    """

    let processName: String
    switch sysEvents.runAppleScript(lookupScript) {
    case .success(let reported):
        processName = reported
    case .failure(let error):
        throw LayerError.snapshotFailed(
            pid: pid,
            reason: "Could not resolve PID \(pid) via System Events: \(error.localizedDescription)"
        )
    }

    // A successful run with no output is still treated as a failure.
    guard !processName.isEmpty else {
        throw LayerError.snapshotFailed(
            pid: pid,
            reason: "System Events returned empty process name for PID \(pid)"
        )
    }
    return processName
}
179
+
180
/// Parses the text output of `getUIElements` into window `PilotElement`s.
///
/// Expected input, one item per line:
/// ```
/// Window: My Window
///   button: OK (press)
///   text field:  (input)
/// ```
///
/// Each `Window:` header starts a new window; the element lines that
/// follow become that window's children. Every window and element is
/// registered in the element cache as a side effect.
///
/// - Parameters:
///   - output: Raw text returned by the System Events enumeration.
///   - appName: Process name stored with every registered cache entry.
/// - Returns: One `AXWindow` element per header, in input order.
private func parseUIElementOutput(
    _ output: String,
    appName: String
) -> [PilotElement] {
    let titledHeaderPrefix = "Window: "
    let untitledHeader = "Window:"

    var windows: [PilotElement] = []
    var pendingChildren: [PilotElement] = []
    var pendingTitle: String?   // nil until the first header is seen
    var windowIndex = 0         // 1-based index of the pending window

    // Returns the window title if `line` is a header, otherwise nil.
    // Lines are whitespace-trimmed before this check, so a header with an
    // empty title arrives as a bare "Window:" and no longer matches the
    // "Window: " prefix; it must be recognised explicitly.
    func headerTitle(in line: String) -> String? {
        if line == untitledHeader { return "" }
        guard line.hasPrefix(titledHeaderPrefix) else { return nil }
        return String(line.dropFirst(titledHeaderPrefix.count))
    }

    // Registers the window collected so far (if any) and appends it.
    // Must run before `windowIndex` advances to the next window.
    func flushPendingWindow() {
        guard let title = pendingTitle else { return }
        // An empty name cannot address a window, so untitled windows are
        // targeted by index instead.
        let windowPath = title.isEmpty
            ? "window \(windowIndex)"
            : "window \"\(escapeForAppleScript(title))\""
        let visibleTitle: String? = title.isEmpty ? nil : title
        let windowRef = elementCache.register(
            appName: appName,
            elementPath: windowPath,
            role: "AXWindow",
            displayName: visibleTitle
        )
        windows.append(PilotElement(
            ref: windowRef,
            role: "AXWindow",
            title: visibleTitle,
            value: nil,
            description: nil,
            enabled: true,
            focused: false,
            bounds: nil,
            children: pendingChildren.isEmpty ? nil : pendingChildren
        ))
    }

    for line in output.components(separatedBy: .newlines) {
        let trimmed = line.trimmingCharacters(in: .whitespaces)
        if trimmed.isEmpty { continue }

        if let title = headerTitle(in: trimmed) {
            flushPendingWindow()
            pendingTitle = title
            pendingChildren = []
            windowIndex += 1
            continue
        }

        // Element lines ahead of the first header belong to no window.
        // They were never part of the returned tree; skipping them also
        // avoids caching entries that point at the invalid `window 0`.
        guard pendingTitle != nil else { continue }

        if let element = parseElementLine(
            trimmed,
            appName: appName,
            windowIndex: windowIndex
        ) {
            pendingChildren.append(element)
        }
    }

    // The last window has no following header to trigger its flush.
    flushPendingWindow()
    return windows
}
263
+
264
/// Turns one element line of the System Events output into a `PilotElement`.
///
/// The line is expected to look like `className: name (description)`.
/// The element is registered in the element cache as a side effect.
///
/// - Parameters:
///   - line: A single, already trimmed, element line.
///   - appName: Process name stored with the cache entry.
///   - windowIndex: Index of the owning window, used in the element path.
/// - Returns: The parsed element, or `nil` when the line has no `:`.
private func parseElementLine(
    _ line: String,
    appName: String,
    windowIndex: Int
) -> PilotElement? {
    guard let separator = line.firstIndex(of: ":") else { return nil }

    let rawClass = line[..<separator]
        .trimmingCharacters(in: .whitespaces)
    let remainder = line[line.index(after: separator)...]
        .trimmingCharacters(in: .whitespaces)

    // Split "name (description)" on the last "(" ... ")" pair; without a
    // well-ordered pair the whole remainder is the name.
    var nameText = remainder
    var descText = ""
    if let open = remainder.lastIndex(of: "("),
       let close = remainder.lastIndex(of: ")"),
       open < close {
        nameText = remainder[..<open]
            .trimmingCharacters(in: .whitespaces)
        descText = remainder[remainder.index(after: open)..<close]
            .trimmingCharacters(in: .whitespaces)
    }

    // Empty strings carry no information, so they are reported as nil.
    let elementName: String? = nameText.isEmpty ? nil : nameText
    let elementDesc: String? = descText.isEmpty ? nil : descText

    let role = mapSystemEventsClass(rawClass)

    // Address the element by class + quoted name when a name exists;
    // otherwise fall back to the positional form "<class> 1", which is
    // less reliable but still works for unnamed elements.
    let selector = elementName.map { "\"\(escapeForAppleScript($0))\"" } ?? "1"
    let elementPath = "\(rawClass) \(selector) of window \(windowIndex)"

    let ref = elementCache.register(
        appName: appName,
        elementPath: elementPath,
        role: role,
        displayName: elementName
    )

    return PilotElement(
        ref: ref,
        role: role,
        title: elementName,
        value: nil,
        description: elementDesc,
        enabled: true,
        focused: false,
        bounds: nil,
        children: nil
    )
}
328
+
329
/// Translates a System Events class name into an AX-style role string.
///
/// Known class names are matched case-insensitively against a fixed table.
/// Any other name is converted by capitalizing each space-separated word,
/// joining the words, and prefixing the result with "AX".
///
/// - Parameter className: Class name as reported by System Events.
/// - Returns: The corresponding role string.
private func mapSystemEventsClass(_ className: String) -> String {
    let knownRoles: [String: String] = [
        "button": "AXButton",
        "text field": "AXTextField",
        "text area": "AXTextArea",
        "static text": "AXStaticText",
        "checkbox": "AXCheckBox",
        "radio button": "AXRadioButton",
        "pop up button": "AXPopUpButton",
        "menu button": "AXMenuButton",
        "slider": "AXSlider",
        "scroll area": "AXScrollArea",
        "scroll bar": "AXScrollBar",
        "table": "AXTable",
        "row": "AXRow",
        "column": "AXColumn",
        "cell": "AXCell",
        "group": "AXGroup",
        "toolbar": "AXToolbar",
        "tab group": "AXTabGroup",
        "tab": "AXTab",
        "image": "AXImage",
        "combo box": "AXComboBox",
        "list": "AXList",
        "outline": "AXOutline",
        "menu": "AXMenu",
        "menu item": "AXMenuItem",
        "window": "AXWindow",
        "sheet": "AXSheet",
        "splitter": "AXSplitter",
        "progress indicator": "AXProgressIndicator",
        "busy indicator": "AXBusyIndicator",
        "disclosure triangle": "AXDisclosureTriangle",
    ]

    if let role = knownRoles[className.lowercased()] {
        return role
    }

    // Unknown class: synthesize a role from the original (not lowercased)
    // name, e.g. "color well" -> "AXColorWell".
    let words = className.split(separator: " ")
    let camelCased = words.map { $0.capitalized }.joined()
    return "AX\(camelCased)"
}
399
+
400
/// Makes `input` safe to place inside a double-quoted AppleScript string.
///
/// Backslashes are doubled first, then double quotes are backslash-escaped.
/// The order matters: escaping quotes first would have their new
/// backslashes doubled by the second pass.
///
/// - Parameter input: Text to embed.
/// - Returns: The escaped text.
private func escapeForAppleScript(_ input: String) -> String {
    let substitutions: [(target: String, replacement: String)] = [
        ("\\", "\\\\"),
        ("\"", "\\\""),
    ]
    return substitutions.reduce(input) { partial, rule in
        partial.replacingOccurrences(of: rule.target, with: rule.replacement)
    }
}
406
+ }
407
+
408
+ // MARK: - AppleScript Element Cache
409
+
410
/// Maps "as"-prefixed refs to the System Events element paths they stand for.
///
/// State is guarded by an `NSLock` instead of an actor so the synchronous
/// `InteractionLayer` methods can use it without crossing isolation
/// boundaries.
private final class ASElementCache: @unchecked Sendable {

    private var entries: [String: ASElementEntry] = [:]
    private var counter: Int = 0
    private let lock = NSLock()

    /// Runs `body` while holding the lock and returns its result.
    private func withLockHeld<T>(_ body: () -> T) -> T {
        lock.lock()
        defer { lock.unlock() }
        return body()
    }

    /// Stores a new entry and returns the ref assigned to it.
    ///
    /// Refs are "as" followed by a counter that starts at 1 and restarts
    /// after `clear()`.
    func register(
        appName: String,
        elementPath: String,
        role: String,
        displayName: String?
    ) -> String {
        let entry = ASElementEntry(
            appName: appName,
            elementPath: elementPath,
            role: role,
            displayName: displayName
        )
        return withLockHeld { () -> String in
            counter += 1
            let ref = "as\(counter)"
            entries[ref] = entry
            return ref
        }
    }

    /// Returns the entry stored under `ref`, or `nil` if there is none.
    func lookup(_ ref: String) -> ASElementEntry? {
        withLockHeld { () -> ASElementEntry? in
            entries[ref]
        }
    }

    /// Drops every entry and resets the ref counter to zero.
    func clear() {
        withLockHeld { () -> Void in
            entries.removeAll()
            counter = 0
        }
    }
}
455
+
456
/// One record of the AppleScript element cache: everything stored for a ref.
private struct ASElementEntry: Sendable {
    /// Process name used to address the owning app in System Events.
    let appName: String

    /// AppleScript element specifier, e.g. `button "OK" of window 1`.
    let elementPath: String

    /// AX-style role string recorded at registration time.
    let role: String

    /// Human-readable name of the element, if it has one.
    let displayName: String?
}