desktop-pilot-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/Package.swift +38 -0
- package/README.md +462 -0
- package/Sources/DesktopPilot/Core/AppRegistry.swift +102 -0
- package/Sources/DesktopPilot/Core/ElementStore.swift +59 -0
- package/Sources/DesktopPilot/Core/Router.swift +242 -0
- package/Sources/DesktopPilot/Core/Snapshot.swift +192 -0
- package/Sources/DesktopPilot/Layers/AccessibilityLayer.swift +190 -0
- package/Sources/DesktopPilot/Layers/AppleScriptLayer.swift +462 -0
- package/Sources/DesktopPilot/Layers/CGEventLayer.swift +318 -0
- package/Sources/DesktopPilot/Layers/LayerProtocol.swift +40 -0
- package/Sources/DesktopPilot/Layers/ScreenshotLayer.swift +122 -0
- package/Sources/DesktopPilot/MCP/Server.swift +536 -0
- package/Sources/DesktopPilot/MCP/Tools.swift +772 -0
- package/Sources/DesktopPilot/MCP/Types.swift +107 -0
- package/Sources/DesktopPilot/Platform/PlatformProtocol.swift +49 -0
- package/Sources/DesktopPilot/Platform/macOS/AXBridge.swift +232 -0
- package/Sources/DesktopPilot/Platform/macOS/Permissions.swift +34 -0
- package/Sources/DesktopPilot/Platform/macOS/SystemEvents.swift +323 -0
- package/Sources/DesktopPilot/main.swift +19 -0
- package/Sources/DesktopPilotCLI/main.swift +19 -0
- package/Tests/DesktopPilotTests/DesktopPilotTests.swift +290 -0
- package/bin/cli.js +61 -0
- package/package.json +52 -0
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
|
|
3
|
+
// MARK: - AppleScript Layer
|
|
4
|
+
|
|
5
|
+
/// Interaction layer using AppleScript + System Events.
|
|
6
|
+
///
|
|
7
|
+
/// This is the **second priority** layer (priority 20) -- preferred over raw
|
|
8
|
+
/// accessibility (priority 0 = most available but least smart) for scriptable
|
|
9
|
+
/// apps. The Router tries layers in ascending priority order, so lower
|
|
10
|
+
/// priority = tried first. Since AccessibilityLayer has priority 0 it is
|
|
11
|
+
/// the fallback; AppleScriptLayer at 20 is offered as an alternative when
|
|
12
|
+
/// the app is natively scriptable.
|
|
13
|
+
///
|
|
14
|
+
/// Design decisions:
|
|
15
|
+
/// - **Snapshot**: Uses System Events `entire contents` to enumerate UI
|
|
16
|
+
/// elements. Produces `PilotElement` nodes with synthetic refs prefixed
|
|
17
|
+
/// "as" (e.g. "as1", "as2") so they don't collide with accessibility refs.
|
|
18
|
+
/// - **Click**: Uses System Events `click` on the element description stored
|
|
19
|
+
/// during the last snapshot.
|
|
20
|
+
/// - **Type**: Uses System Events `keystroke`, which simulates real key
|
|
21
|
+
/// presses and is more reliable than AXSetValue for Electron apps, web
|
|
22
|
+
/// views, and other non-native text fields.
|
|
23
|
+
/// - **Read**: Uses System Events to query element properties.
|
|
24
|
+
///
|
|
25
|
+
/// This layer does NOT use the shared `ElementStore` actor -- it maintains
|
|
26
|
+
/// its own lock-based cache of element descriptions for synchronous access.
|
|
27
|
+
final class AppleScriptLayer: @unchecked Sendable, InteractionLayer {
|
|
28
|
+
|
|
29
|
+
let name: String = "AppleScript"
|
|
30
|
+
let priority: Int = 20
|
|
31
|
+
|
|
32
|
+
private let sysEvents: SystemEventsHelper
|
|
33
|
+
private let elementCache: ASElementCache
|
|
34
|
+
|
|
35
|
+
/// Creates an AppleScript interaction layer.
///
/// - Parameter systemEvents: Helper through which every System Events call
///   made by this layer is routed. Defaults to a fresh `SystemEventsHelper`.
init(systemEvents: SystemEventsHelper = SystemEventsHelper()) {
    // Every layer instance owns a private, initially empty ref cache.
    elementCache = ASElementCache()
    sysEvents = systemEvents
}
|
|
39
|
+
|
|
40
|
+
// MARK: - InteractionLayer Conformance
|
|
41
|
+
|
|
42
|
+
/// Reports whether this layer volunteers to drive the given app.
///
/// The answer is whatever `SystemEventsHelper.isScriptable(appName:)`
/// returns; `bundleID` is not consulted.
///
/// - Parameters:
///   - bundleID: Bundle identifier of the app (unused here).
///   - appName: Process name passed to the scriptability check.
/// - Returns: `true` when the helper reports the app as scriptable.
func canHandle(bundleID: String?, appName: String) -> Bool {
    // System Events can talk to any running app, but this layer only claims
    // apps that expose a real AppleScript dictionary (sdef). Apps without
    // one are better served by the Accessibility layer.
    let hasScriptingDictionary = sysEvents.isScriptable(appName: appName)
    return hasScriptingDictionary
}
|
|
48
|
+
|
|
49
|
+
/// Takes a fresh UI snapshot of the process with the given PID.
///
/// The PID is resolved to a process name, the ref cache is emptied, and the
/// text returned by `getUIElements` is parsed into `PilotElement` nodes
/// (which repopulates the cache with new "as" refs).
///
/// - Parameters:
///   - pid: Unix process id of the target app.
///   - maxDepth: Not used by this implementation.
/// - Returns: One `PilotElement` per parsed window.
/// - Throws: `LayerError.snapshotFailed` when the PID cannot be resolved or
///   the System Events query fails.
func snapshot(pid: Int32, maxDepth: Int) throws -> [PilotElement] {
    let processName = try resolveAppName(pid: pid)

    // Refs handed out by any earlier snapshot are invalidated before the
    // new query runs, so they are gone even if the query below fails.
    elementCache.clear()

    switch sysEvents.getUIElements(appName: processName) {
    case .failure(let error):
        throw LayerError.snapshotFailed(
            pid: pid,
            reason: "System Events snapshot failed: \(error.localizedDescription)"
        )
    case .success(let output):
        return parseUIElementOutput(output, appName: processName)
    }
}
|
|
65
|
+
|
|
66
|
+
/// Clicks the element identified by `ref` via System Events.
///
/// - Parameter ref: An "as" ref handed out by the most recent snapshot.
/// - Throws: `PlatformError.elementNotFound` when the ref is not in the
///   cache; `PlatformError.actionFailed` when System Events reports failure.
func click(ref: String) throws {
    guard let target = elementCache.lookup(ref) else {
        throw PlatformError.elementNotFound(ref: ref)
    }

    let outcome = sysEvents.clickElement(
        appName: target.appName,
        elementDescription: target.elementPath
    )

    // Success needs no further work; only a failure is surfaced.
    if case .failure(let error) = outcome {
        throw PlatformError.actionFailed(
            action: "click",
            reason: "System Events click failed for ref '\(ref)': \(error.localizedDescription)"
        )
    }
}
|
|
86
|
+
|
|
87
|
+
/// Types `text` into the element identified by `ref`.
///
/// The element is clicked first in an attempt to focus it, then the text is
/// sent through the helper's `typeText(appName:text:)`.
///
/// - Parameters:
///   - ref: An "as" ref handed out by the most recent snapshot.
///   - text: The text to type.
/// - Throws: `PlatformError.elementNotFound` when the ref is not in the
///   cache; `LayerError.typingFailed` when the keystroke call fails.
func typeText(ref: String, text: String) throws {
    guard let target = elementCache.lookup(ref) else {
        throw PlatformError.elementNotFound(ref: ref)
    }

    // Best-effort focus: the click result is deliberately ignored so that a
    // failed click does not prevent the keystrokes from being attempted.
    _ = sysEvents.clickElement(
        appName: target.appName,
        elementDescription: target.elementPath
    )

    let outcome = sysEvents.typeText(appName: target.appName, text: text)

    guard case .failure(let error) = outcome else {
        return
    }
    throw LayerError.typingFailed(
        ref: ref,
        reason: "System Events keystroke failed: \(error.localizedDescription)"
    )
}
|
|
114
|
+
|
|
115
|
+
/// Reads the current value of the element identified by `ref`.
///
/// The generated script first asks System Events for the element's `value`;
/// if that errors it falls back to the element's `name`; if both error it
/// returns an empty string, which is reported to the caller as `nil`.
///
/// - Parameter ref: An "as" ref handed out by the most recent snapshot.
/// - Returns: The element's value (or name) as text, or `nil` when the
///   script produced an empty result.
/// - Throws: `PlatformError.elementNotFound` when the ref is not in the
///   cache; `LayerError.readFailed` when the script itself could not be run.
func readValue(ref: String) throws -> String? {
    guard let entry = elementCache.lookup(ref) else {
        throw PlatformError.elementNotFound(ref: ref)
    }

    // `entry.elementPath` was assembled (and its name part escaped) when the
    // snapshot was parsed, so it is interpolated as an AppleScript reference
    // rather than as a quoted string.
    let script = """
    tell application "System Events"
        tell process "\(escapeForAppleScript(entry.appName))"
            try
                set elemVal to value of \(entry.elementPath)
                return elemVal as text
            on error
                try
                    set elemTitle to name of \(entry.elementPath)
                    return elemTitle as text
                on error
                    return ""
                end try
            end try
        end tell
    end tell
    """

    switch sysEvents.runAppleScript(script) {
    case .success(let output):
        return output.isEmpty ? nil : output
    case .failure(let error):
        // Include the underlying error, as the other operations of this
        // layer do, so that failures can be diagnosed from the message.
        throw LayerError.readFailed(
            ref: ref,
            reason: "Could not read value for element via System Events: \(error.localizedDescription)"
        )
    }
}
|
|
150
|
+
|
|
151
|
+
// MARK: - Private Helpers
|
|
152
|
+
|
|
153
|
+
/// Looks up the System Events process name that belongs to a PID.
///
/// - Parameter pid: Unix process id of the target app.
/// - Returns: The non-empty process name reported by System Events.
/// - Throws: `LayerError.snapshotFailed` when the script fails or returns an
///   empty name.
private func resolveAppName(pid: Int32) throws -> String {
    let lookupScript = """
    tell application "System Events"
        set targetProcess to first process whose unix id is \(pid)
        return name of targetProcess
    end tell
    """

    switch sysEvents.runAppleScript(lookupScript) {
    case .failure(let error):
        throw LayerError.snapshotFailed(
            pid: pid,
            reason: "Could not resolve PID \(pid) via System Events: \(error.localizedDescription)"
        )
    case .success(let name):
        // A successful run with no output is still unusable as a process name.
        guard !name.isEmpty else {
            throw LayerError.snapshotFailed(
                pid: pid,
                reason: "System Events returned empty process name for PID \(pid)"
            )
        }
        return name
    }
}
|
|
179
|
+
|
|
180
|
+
/// Parse the text output from `getUIElements` into `PilotElement` nodes.
///
/// The output format is:
/// ```
/// Window: My Window
///   button: OK (press)
///   text field:  (input)
/// ```
///
/// Every `Window:` header opens a new window; the element lines that follow
/// become its children. Each window and each child is registered in the
/// element cache (children first, then their window), so the refs in the
/// returned tree can be used with `click`, `typeText`, and `readValue`.
///
/// Edge cases:
/// - A header with an empty title reaches this parser as `Window:` (the
///   trailing space is lost to whitespace trimming). It is still treated as
///   a window header so that the 1-based window index used in child paths
///   stays in step with the headers; such a window is addressed by index
///   and gets a `nil` title.
/// - Element lines that appear before the first header belong to no window
///   and are skipped without being registered.
///
/// - Parameters:
///   - output: Raw text returned by `getUIElements`.
///   - appName: Process name stored with every registered element.
/// - Returns: One `PilotElement` per window header, in input order.
private func parseUIElementOutput(
    _ output: String,
    appName: String
) -> [PilotElement] {
    let namedHeaderPrefix = "Window: "
    // What a header with an empty title looks like after trimming.
    let untitledHeader = "Window:"

    var windows: [PilotElement] = []
    var pendingChildren: [PilotElement] = []
    var openWindowTitle: String?
    var windowIndex = 0

    // Registers the window that is currently being collected (if any) and
    // appends it, together with its children, to `windows`.
    func closeOpenWindow() {
        guard let title = openWindowTitle else { return }

        // An untitled window cannot be addressed by name, so fall back to
        // its position; `windowIndex` still refers to this window here.
        let windowPath = title.isEmpty
            ? "window \(windowIndex)"
            : "window \"\(escapeForAppleScript(title))\""
        let displayTitle: String? = title.isEmpty ? nil : title

        let windowRef = elementCache.register(
            appName: appName,
            elementPath: windowPath,
            role: "AXWindow",
            displayName: displayTitle
        )
        windows.append(PilotElement(
            ref: windowRef,
            role: "AXWindow",
            title: displayTitle,
            value: nil,
            description: nil,
            enabled: true,
            focused: false,
            bounds: nil,
            children: pendingChildren.isEmpty ? nil : pendingChildren
        ))
    }

    for rawLine in output.components(separatedBy: .newlines) {
        let line = rawLine.trimmingCharacters(in: .whitespaces)
        if line.isEmpty { continue }

        // Non-nil when the line is a window header (empty for untitled).
        let headerTitle: String?
        if line.hasPrefix(namedHeaderPrefix) {
            headerTitle = String(line.dropFirst(namedHeaderPrefix.count))
        } else if line == untitledHeader {
            headerTitle = ""
        } else {
            headerTitle = nil
        }

        if let title = headerTitle {
            closeOpenWindow()
            openWindowTitle = title
            pendingChildren = []
            windowIndex += 1
            continue
        }

        // An element outside any window would get the invalid path
        // "... of window 0" and could never be returned to the caller.
        guard openWindowTitle != nil else { continue }

        // Element line: "className: name (description)"
        if let element = parseElementLine(
            line,
            appName: appName,
            windowIndex: windowIndex
        ) {
            pendingChildren.append(element)
        }
    }

    // The last window has no following header to trigger its emission.
    closeOpenWindow()

    return windows
}
|
|
263
|
+
|
|
264
|
+
/// Turns one element line of the System Events output into a `PilotElement`
/// and registers it in the element cache.
///
/// Expected line format: `className: name (description)`. The text before
/// the first colon is the class; in the rest, the span between the last `(`
/// and the last `)` is the description and whatever precedes it is the name.
/// Empty name or description become `nil`.
///
/// - Parameters:
///   - line: A whitespace-trimmed, non-header output line.
///   - appName: Process name stored with the registered element.
///   - windowIndex: Index of the enclosing window, embedded in the element's
///     System Events path.
/// - Returns: The parsed element, or `nil` when the line contains no colon.
private func parseElementLine(
    _ line: String,
    appName: String,
    windowIndex: Int
) -> PilotElement? {
    guard let colonIdx = line.firstIndex(of: ":") else { return nil }

    let rawClass = line[..<colonIdx].trimmingCharacters(in: .whitespaces)
    let remainder = line[line.index(after: colonIdx)...]
        .trimmingCharacters(in: .whitespaces)

    // Split "name (description)" into its two optional parts.
    let elementName: String?
    let elementDesc: String?
    if let openParen = remainder.lastIndex(of: "("),
       let closeParen = remainder.lastIndex(of: ")"),
       openParen < closeParen {
        let before = remainder[..<openParen]
            .trimmingCharacters(in: .whitespaces)
        let inside = remainder[remainder.index(after: openParen)..<closeParen]
            .trimmingCharacters(in: .whitespaces)
        elementName = before.isEmpty ? nil : before
        elementDesc = inside.isEmpty ? nil : inside
    } else {
        elementName = remainder.isEmpty ? nil : remainder
        elementDesc = nil
    }

    let role = mapSystemEventsClass(rawClass)

    // System Events path used for later interaction. Addressing by class +
    // name is preferred; unnamed elements fall back to a positional form.
    // NOTE(review): the positional form always uses index 1, so several
    // unnamed elements of the same class in one window share a path --
    // confirm whether that is acceptable.
    let elementPath: String
    if let name = elementName {
        elementPath = "\(rawClass) \"\(escapeForAppleScript(name))\" of window \(windowIndex)"
    } else {
        elementPath = "\(rawClass) 1 of window \(windowIndex)"
    }

    let ref = elementCache.register(
        appName: appName,
        elementPath: elementPath,
        role: role,
        displayName: elementName
    )

    return PilotElement(
        ref: ref,
        role: role,
        title: elementName,
        value: nil,
        description: elementDesc,
        enabled: true,
        focused: false,
        bounds: nil,
        children: nil
    )
}
|
|
328
|
+
|
|
329
|
+
/// Translates a System Events class name into the closest AX role string.
///
/// Known class names are matched case-insensitively against a fixed table.
/// Any other name is converted by capitalizing each space-separated word,
/// joining the words, and prefixing "AX" (e.g. "color well" -> "AXColorWell").
///
/// - Parameter className: Class name as printed by System Events.
/// - Returns: The corresponding AX role string.
private func mapSystemEventsClass(_ className: String) -> String {
    let knownRoles: [String: String] = [
        "button": "AXButton",
        "text field": "AXTextField",
        "text area": "AXTextArea",
        "static text": "AXStaticText",
        "checkbox": "AXCheckBox",
        "radio button": "AXRadioButton",
        "pop up button": "AXPopUpButton",
        "menu button": "AXMenuButton",
        "slider": "AXSlider",
        "scroll area": "AXScrollArea",
        "scroll bar": "AXScrollBar",
        "table": "AXTable",
        "row": "AXRow",
        "column": "AXColumn",
        "cell": "AXCell",
        "group": "AXGroup",
        "toolbar": "AXToolbar",
        "tab group": "AXTabGroup",
        "tab": "AXTab",
        "image": "AXImage",
        "combo box": "AXComboBox",
        "list": "AXList",
        "outline": "AXOutline",
        "menu": "AXMenu",
        "menu item": "AXMenuItem",
        "window": "AXWindow",
        "sheet": "AXSheet",
        "splitter": "AXSplitter",
        "progress indicator": "AXProgressIndicator",
        "busy indicator": "AXBusyIndicator",
        "disclosure triangle": "AXDisclosureTriangle",
    ]

    if let role = knownRoles[className.lowercased()] {
        return role
    }

    // Unknown class: synthesize a role from its words.
    let words = className.split(separator: " ")
    let camelCased = words.map { $0.capitalized }.joined()
    return "AX\(camelCased)"
}
|
|
399
|
+
|
|
400
|
+
/// Escapes a string so it can be placed between double quotes in AppleScript
/// source.
///
/// Backslashes are doubled and double quotes are prefixed with a backslash.
///
/// - Parameter input: Raw text.
/// - Returns: The escaped text (without surrounding quotes).
private func escapeForAppleScript(_ input: String) -> String {
    // Order matters: backslashes must be doubled before quotes are escaped,
    // otherwise the backslashes added for the quotes would be doubled too.
    let substitutions: [(target: String, replacement: String)] = [
        ("\\", "\\\\"),
        ("\"", "\\\""),
    ]
    return substitutions.reduce(input) { partial, rule in
        partial.replacingOccurrences(of: rule.target, with: rule.replacement)
    }
}
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
// MARK: - AppleScript Element Cache
|
|
409
|
+
|
|
410
|
+
/// Maps "as" refs to the System Events element paths they stand for.
///
/// All state is guarded by an `NSLock` rather than an actor, so the cache can
/// be used from the synchronous `InteractionLayer` protocol methods without
/// crossing isolation boundaries.
private final class ASElementCache: @unchecked Sendable {

    private let lock = NSLock()
    private var entries: [String: ASElementEntry] = [:]
    private var counter: Int = 0

    /// Runs `body` while holding the lock and returns its result.
    private func synchronized<T>(_ body: () -> T) -> T {
        lock.lock()
        defer { lock.unlock() }
        return body()
    }

    /// Stores an element and returns the ref assigned to it.
    ///
    /// Refs are sequential: "as1", "as2", ... counted from the last `clear()`.
    func register(
        appName: String,
        elementPath: String,
        role: String,
        displayName: String?
    ) -> String {
        let entry = ASElementEntry(
            appName: appName,
            elementPath: elementPath,
            role: role,
            displayName: displayName
        )
        return synchronized {
            counter += 1
            let ref = "as\(counter)"
            entries[ref] = entry
            return ref
        }
    }

    /// Returns the entry stored under `ref`, or `nil` if there is none.
    func lookup(_ ref: String) -> ASElementEntry? {
        synchronized { entries[ref] }
    }

    /// Drops every stored element and restarts ref numbering (call before a
    /// fresh snapshot).
    func clear() {
        synchronized {
            entries.removeAll()
            counter = 0
        }
    }
}
|
|
455
|
+
|
|
456
|
+
/// One cached element: everything needed to address it through System
/// Events again after a snapshot.
private struct ASElementEntry: Sendable {
    /// `appName`: process name used in `tell process "..."`.
    /// `elementPath`: System Events reference to the element inside that process.
    /// `role`: AX-style role string assigned when the element was parsed.
    let appName, elementPath, role: String
    /// Name shown for the element, if it has one.
    let displayName: String?
}
|