mcp-server-macos-use 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +37 -0
- package/Package.resolved +86 -0
- package/Package.swift +26 -0
- package/README.md +96 -0
- package/Sources/main.swift +1205 -0
- package/bin/mcp-server-macos-use +13 -0
- package/package.json +37 -0
|
@@ -0,0 +1,1205 @@
|
|
|
1
|
+
import MCP
|
|
2
|
+
import Foundation
|
|
3
|
+
import CoreGraphics // Still needed for CGPoint, CGEventFlags
|
|
4
|
+
import ApplicationServices // For AXUIElement APIs (scroll-into-view, window bounds)
|
|
5
|
+
import AppKit // For NSEvent.mouseLocation (cursor position save/restore)
|
|
6
|
+
import MacosUseSDK // <-- Import the SDK
|
|
7
|
+
|
|
8
|
+
// --- Helper to serialize Swift structs to JSON String ---
|
|
9
|
+
/// Encodes `value` as a single-line JSON string.
///
/// Keys are emitted in sorted order and forward slashes are left unescaped.
///
/// - Parameter value: Any `Encodable` value.
/// - Returns: The JSON text, or `nil` if encoding fails (the failure is
///   reported on stderr).
func serializeToJsonString<T: Encodable>(_ value: T) -> String? {
    let jsonEncoder = JSONEncoder()
    // Sorted keys keep the output deterministic; no pretty printing is applied.
    jsonEncoder.outputFormatting = [.sortedKeys, .withoutEscapingSlashes]

    let encoded: Data
    do {
        encoded = try jsonEncoder.encode(value)
    } catch {
        fputs("error: serializeToJsonString: failed to encode value to JSON: \(error)\n", stderr)
        return nil
    }
    return String(data: encoded, encoding: .utf8)
}
|
|
21
|
+
|
|
22
|
+
// --- Function to get arguments from MCP Value ---
|
|
23
|
+
// Helper to extract typed values safely
|
|
24
|
+
/// Reads a mandatory string argument from an MCP argument dictionary.
///
/// - Parameters:
///   - args: The tool-call arguments, if any.
///   - key: Name of the argument to read.
/// - Returns: The string stored under `key`.
/// - Throws: `MCPError.invalidParams` when `key` is absent or its value has no
///   `stringValue`.
func getRequiredString(from args: [String: Value]?, key: String) throws -> String {
    if let text = args?[key]?.stringValue {
        return text
    }
    throw MCPError.invalidParams("Missing or invalid required string argument: '\(key)'")
}
|
|
30
|
+
|
|
31
|
+
/// Reads a mandatory numeric argument as a `Double`.
///
/// `.int` values are widened to `Double` (and logged on stderr); `.double`
/// values are returned unchanged.
///
/// - Throws: `MCPError.invalidParams` when `key` is absent or holds any other
///   kind of value.
func getRequiredDouble(from args: [String: Value]?, key: String) throws -> Double {
    guard let raw = args?[key] else {
        throw MCPError.invalidParams("Missing required number argument: '\(key)'")
    }
    if case .int(let whole) = raw {
        fputs("log: getRequiredDouble: converting int \(whole) to double for key '\(key)'\n", stderr)
        return Double(whole)
    }
    if case .double(let number) = raw {
        return number
    }
    throw MCPError.invalidParams("Invalid type for required number argument: '\(key)', expected Int or Double, got \(raw)")
}
|
|
45
|
+
|
|
46
|
+
/// Reads a mandatory integer argument.
///
/// A value exposing a `doubleValue` is accepted only when it converts to `Int`
/// exactly (e.g. `3.0`); otherwise the value must expose an `intValue`.
///
/// - Throws: `MCPError.invalidParams` when `key` is absent, holds a
///   non-integral double, or holds a non-numeric value.
func getRequiredInt(from args: [String: Value]?, key: String) throws -> Int {
    guard let raw = args?[key] else {
        throw MCPError.invalidParams("Missing required integer argument: '\(key)'")
    }

    // Not a double: the value has to be an integer outright.
    guard let fractional = raw.doubleValue else {
        if let whole = raw.intValue {
            return whole
        }
        throw MCPError.invalidParams("Invalid type for required integer argument: '\(key)', expected Int or exact Double, got \(raw)")
    }

    // A double is tolerated only if it carries no fractional part and fits in Int.
    guard let whole = Int(exactly: fractional) else {
        fputs("warning: getRequiredInt: received non-exact double \(fractional) for key '\(key)', expecting integer.\n", stderr)
        throw MCPError.invalidParams("Invalid type for required integer argument: '\(key)', received non-exact Double \(fractional)")
    }
    fputs("log: getRequiredInt: converting exact double \(fractional) to int for key '\(key)'\n", stderr)
    return whole
}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
// --- Get Optional arguments ---
|
|
69
|
+
// Helper for optional values
|
|
70
|
+
/// Reads an optional numeric argument as a `Double`.
///
/// - Returns: `nil` when `key` is absent or explicitly null; otherwise the
///   number, with `.int` values widened to `Double` (and logged on stderr).
/// - Throws: `MCPError.invalidParams` when the value is present but is neither
///   an int nor a double.
func getOptionalDouble(from args: [String: Value]?, key: String) throws -> Double? {
    // A missing key and an explicit null both mean "not supplied".
    guard let raw = args?[key], !raw.isNull else { return nil }

    if case .int(let whole) = raw {
        fputs("log: getOptionalDouble: converting int \(whole) to double for key '\(key)'\n", stderr)
        return Double(whole)
    }
    if case .double(let number) = raw {
        return number
    }
    throw MCPError.invalidParams("Invalid type for optional number argument: '\(key)', expected Int or Double, got \(raw)")
}
|
|
83
|
+
|
|
84
|
+
/// Reads an optional integer argument.
///
/// - Returns: `nil` when `key` is absent or explicitly null; otherwise the
///   integer. A value exposing a `doubleValue` is accepted only when it
///   converts to `Int` exactly.
/// - Throws: `MCPError.invalidParams` for a non-integral double or a
///   non-numeric value.
func getOptionalInt(from args: [String: Value]?, key: String) throws -> Int? {
    // A missing key and an explicit null both mean "not supplied".
    guard let raw = args?[key], !raw.isNull else { return nil }

    guard let fractional = raw.doubleValue else {
        if let whole = raw.intValue {
            return whole
        }
        throw MCPError.invalidParams("Invalid type for optional integer argument: '\(key)', expected Int or exact Double, got \(raw)")
    }

    guard let whole = Int(exactly: fractional) else {
        fputs("warning: getOptionalInt: received non-exact double \(fractional) for key '\(key)', expecting integer.\n", stderr)
        throw MCPError.invalidParams("Invalid type for optional integer argument: '\(key)', received non-exact Double \(fractional)")
    }
    fputs("log: getOptionalInt: converting exact double \(fractional) to int for key '\(key)'\n", stderr)
    return whole
}
|
|
102
|
+
|
|
103
|
+
/// Reads an optional boolean argument.
///
/// - Returns: `nil` when `key` is absent or explicitly null; otherwise the
///   boolean stored under `key`.
/// - Throws: `MCPError.invalidParams` when the value is present but has no
///   `boolValue`.
func getOptionalBool(from args: [String: Value]?, key: String) throws -> Bool? {
    guard let raw = args?[key], !raw.isNull else { return nil }
    if let flag = raw.boolValue {
        return flag
    }
    throw MCPError.invalidParams("Invalid type for optional boolean argument: '\(key)', expected Bool, got \(raw)")
}
|
|
111
|
+
|
|
112
|
+
// --- NEW Helper to parse modifier flags ---
|
|
113
|
+
/// Converts an MCP array of modifier names into `CGEventFlags`.
///
/// Names are matched case-insensitively. Unrecognized names are reported on
/// stderr and skipped rather than rejected.
///
/// - Parameter value: The `modifierFlags` argument, if supplied.
/// - Returns: The union of all recognized flags; empty when `value` is `nil`
///   or is not an array.
/// - Throws: `MCPError.invalidParams` when the array contains a non-string
///   element.
func parseFlags(from value: Value?) throws -> CGEventFlags {
    guard let entries = value?.arrayValue else {
        return []
    }

    // Accepted spellings (lowercase) for each modifier.
    let knownFlags: [String: CGEventFlags] = [
        "capslock": .maskAlphaShift, "caps": .maskAlphaShift,
        "shift": .maskShift,
        "control": .maskControl, "ctrl": .maskControl,
        "option": .maskAlternate, "opt": .maskAlternate, "alt": .maskAlternate,
        "command": .maskCommand, "cmd": .maskCommand,
        "help": .maskHelp,
        "function": .maskSecondaryFn, "fn": .maskSecondaryFn,
        "numericpad": .maskNumericPad, "numpad": .maskNumericPad,
    ]

    var combined: CGEventFlags = []
    for entry in entries {
        guard let name = entry.stringValue else {
            throw MCPError.invalidParams("Invalid modifierFlags array: contains non-string element \(entry)")
        }
        if let flag = knownFlags[name.lowercased()] {
            combined.insert(flag)
        } else {
            fputs("warning: parseFlags: unknown modifier flag string '\(name)', ignoring.\n", stderr)
        }
    }
    return combined
}
|
|
145
|
+
|
|
146
|
+
// --- Enriched Data Structures (adds in_viewport metadata) ---
|
|
147
|
+
|
|
148
|
+
/// One traversed UI element together with a viewport flag.
///
/// Property names double as the JSON keys of the synthesized `Codable`
/// conformance, and the declaration order defines the memberwise initializer.
struct EnrichedElementData: Codable {
    /// Accessibility role string (this file compares it with values such as "AXWindow").
    var role: String
    /// Text carried by the element, if any.
    var text: String?
    /// Origin and size of the element's frame; any component may be missing.
    var x: Double?
    var y: Double?
    var width: Double?
    var height: Double?
    /// Whether the point `(x, y)` lies inside the window bounds; `nil` when
    /// that could not be determined.
    var in_viewport: Bool?
}
|
|
157
|
+
|
|
158
|
+
/// A full traversal result whose elements carry `in_viewport` metadata.
///
/// The fields are copied from the SDK's `ResponseData`; property names are
/// the JSON keys of the synthesized `Codable` conformance.
struct EnrichedResponseData: Codable {
    /// Application name reported by the traversal.
    let app_name: String
    /// All traversed elements, enriched with viewport flags.
    var elements: [EnrichedElementData]
    /// Traversal statistics, passed through unchanged.
    var stats: Statistics
    /// Traversal duration, passed through as the string supplied by the SDK.
    let processing_time_seconds: String
}
|
|
164
|
+
|
|
165
|
+
/// An element as reported inside a traversal diff: role, text, viewport
/// status, and (where provided) frame coordinates for spatial targeting.
///
/// Property names are the JSON keys; the declaration order defines the
/// memberwise initializer, whose optional members may be omitted.
struct DiffElementData: Codable {
    /// Accessibility role string.
    var role: String
    /// Text resolved for the element, if any.
    var text: String?
    /// Whether the element's origin lies inside the window bounds; `nil` when unknown.
    var in_viewport: Bool?
    /// Origin and size of the element's frame; any component may be missing.
    var x: Double?
    var y: Double?
    var width: Double?
    var height: Double?
}
|
|
175
|
+
|
|
176
|
+
/// A single attribute-level change on a modified element.
///
/// All fields are copied verbatim from the SDK's diff entries.
struct DiffAttributeChange: Codable {
    /// Name of the attribute that changed (e.g. "text", "AXValue", "x").
    let attributeName: String
    /// Text fragment reported as added, if any.
    let addedText: String?
    /// Text fragment reported as removed, if any.
    let removedText: String?
    /// Attribute value before the action, if reported.
    let oldValue: String?
    /// Attribute value after the action, if reported.
    let newValue: String?
}
|
|
183
|
+
|
|
184
|
+
/// An element that exists both before and after an action but differs.
struct DiffModifiedElement: Codable {
    /// The element as it appeared before the action.
    let before: DiffElementData
    /// The element as it appears after the action.
    let after: DiffElementData
    /// The attribute changes that distinguish `after` from `before`.
    let changes: [DiffAttributeChange]
}
|
|
189
|
+
|
|
190
|
+
/// The filtered difference between the traversals taken before and after an action.
struct EnrichedTraversalDiff: Codable {
    /// Elements present only after the action.
    let added: [DiffElementData]
    /// Elements present only before the action.
    let removed: [DiffElementData]
    /// Elements present in both traversals whose attributes changed.
    let modified: [DiffModifiedElement]
}
|
|
195
|
+
|
|
196
|
+
/// Payload returned by every tool: either a full traversal (open/refresh) or
/// a diff (click/type/press), plus any errors that occurred.
struct ToolResponse: Codable {
    /// Result of launching or activating the application, when the tool opened one.
    var openResult: AppOpenerResult?
    /// PID of the process that was traversed.
    var traversalPid: pid_t?
    /// Full current state; populated for open/refresh.
    var traversal: EnrichedResponseData?
    /// What changed; populated for click/type/press.
    var diff: EnrichedTraversalDiff?
    /// Error raised by the primary action, if any.
    var primaryActionError: String?
    /// Error raised while traversing, if any.
    var traversalError: String?
}
|
|
205
|
+
|
|
206
|
+
// --- Viewport Detection Helpers ---
|
|
207
|
+
|
|
208
|
+
/// Derives the window rectangle from traversal data.
///
/// - Parameter responseData: A traversal result, if one is available.
/// - Returns: The frame of the first "AXWindow" element that has a complete
///   position and size, or `nil` when there is no such element.
func getWindowBoundsFromTraversal(_ responseData: ResponseData?) -> CGRect? {
    guard let elements = responseData?.elements else { return nil }

    for candidate in elements where candidate.role == "AXWindow" {
        // Skip windows with an incomplete frame and keep looking.
        guard let originX = candidate.x, let originY = candidate.y,
              let width = candidate.width, let height = candidate.height else {
            continue
        }
        return CGRect(x: originX, y: originY, width: width, height: height)
    }
    return nil
}
|
|
220
|
+
|
|
221
|
+
/// Finds the window of an application whose frame contains `point`.
///
/// Every entry of the application's "AXWindows" attribute is checked in
/// order; if none contains the point, the "AXMainWindow" is returned instead.
///
/// - Parameters:
///   - appElement: The application's accessibility element.
///   - point: The point to locate.
/// - Returns: The matching (or main) window and its frame, or `nil` when no
///   usable window or frame could be obtained.
func getWindowContainingPoint(appElement: AXUIElement, point: CGPoint) -> (element: AXUIElement, bounds: CGRect)? {
    var windowsRef: CFTypeRef?
    if AXUIElementCopyAttributeValue(appElement, "AXWindows" as CFString, &windowsRef) == .success,
       let windows = windowsRef as? [AXUIElement] {
        for window in windows {
            guard let frame = getAXElementFrame(window), frame.contains(point) else { continue }
            fputs("log: getWindowContainingPoint: matched window \(frame) for point \(point)\n", stderr)
            return (window, frame)
        }
    }

    // Fallback to main window. The returned value is verified to be an
    // AXUIElement before casting so that an unexpected nil or foreign CF type
    // yields `nil` instead of a crash.
    var mainWindowRef: CFTypeRef?
    guard AXUIElementCopyAttributeValue(appElement, "AXMainWindow" as CFString, &mainWindowRef) == .success,
          let mainWindowValue = mainWindowRef,
          CFGetTypeID(mainWindowValue) == AXUIElementGetTypeID() else {
        return nil
    }
    let mainWindow = mainWindowValue as! AXUIElement // type ID checked above

    guard let frame = getAXElementFrame(mainWindow) else { return nil }
    fputs("log: getWindowContainingPoint: no window contains \(point), falling back to main window \(frame)\n", stderr)
    return (mainWindow, frame)
}
|
|
243
|
+
|
|
244
|
+
/// Reads the main window's frame directly from the accessibility API.
///
/// - Parameter pid: Process identifier of the target application.
/// - Returns: The main window's frame, or `nil` (with a warning on stderr)
///   when the window, its position, or its size cannot be read.
func getWindowBoundsFromAPI(pid: pid_t) -> CGRect? {
    let appElement = AXUIElementCreateApplication(pid)

    // Each attribute is type-checked before it is cast, and the unpacking
    // result is honoured, so a malformed reply produces `nil` rather than a
    // crash or a bogus zero rectangle.
    var windowRef: CFTypeRef?
    guard AXUIElementCopyAttributeValue(appElement, "AXMainWindow" as CFString, &windowRef) == .success,
          let windowValue = windowRef,
          CFGetTypeID(windowValue) == AXUIElementGetTypeID() else {
        fputs("warning: getWindowBoundsFromAPI: could not get main window for pid \(pid)\n", stderr)
        return nil
    }
    let windowElement = windowValue as! AXUIElement // type ID checked above

    var positionRef: CFTypeRef?
    var position = CGPoint.zero
    guard AXUIElementCopyAttributeValue(windowElement, "AXPosition" as CFString, &positionRef) == .success,
          let positionValue = positionRef,
          CFGetTypeID(positionValue) == AXValueGetTypeID(),
          AXValueGetValue(positionValue as! AXValue, .cgPoint, &position) else {
        fputs("warning: getWindowBoundsFromAPI: could not get window position\n", stderr)
        return nil
    }

    var sizeRef: CFTypeRef?
    var size = CGSize.zero
    guard AXUIElementCopyAttributeValue(windowElement, "AXSize" as CFString, &sizeRef) == .success,
          let sizeValue = sizeRef,
          CFGetTypeID(sizeValue) == AXValueGetTypeID(),
          AXValueGetValue(sizeValue as! AXValue, .cgSize, &size) else {
        fputs("warning: getWindowBoundsFromAPI: could not get window size\n", stderr)
        return nil
    }

    return CGRect(origin: position, size: size)
}
|
|
273
|
+
|
|
274
|
+
/// Copies a traversal result, tagging every element with `in_viewport`.
///
/// An element counts as in-viewport when its origin `(x, y)` lies inside
/// `windowBounds`; the flag is `nil` when either the origin or the bounds are
/// unavailable.
///
/// - Parameters:
///   - response: The SDK traversal result.
///   - windowBounds: The window rectangle, if known.
/// - Returns: The same data with viewport flags attached.
func enrichResponseData(_ response: ResponseData, windowBounds: CGRect?) -> EnrichedResponseData {
    var enriched: [EnrichedElementData] = []
    enriched.reserveCapacity(response.elements.count)

    for source in response.elements {
        var visible: Bool? = nil
        if let originX = source.x, let originY = source.y, let bounds = windowBounds {
            visible = bounds.contains(CGPoint(x: originX, y: originY))
        }
        enriched.append(EnrichedElementData(
            role: source.role,
            text: source.text,
            x: source.x,
            y: source.y,
            width: source.width,
            height: source.height,
            in_viewport: visible
        ))
    }

    return EnrichedResponseData(
        app_name: response.app_name,
        elements: enriched,
        stats: response.stats,
        processing_time_seconds: response.processing_time_seconds
    )
}
|
|
297
|
+
|
|
298
|
+
/// Resolves a label for an element that may not carry text itself
/// (for example an AXRow or AXCell container).
///
/// The element's own non-empty text wins. Otherwise two strategies are tried:
/// 1. Coordinate containment: the first text-bearing element in the traversal
///    whose origin lies inside this element's frame.
/// 2. List proximity: locate the element in the flat traversal list (same
///    role, origin within 2 units on both axes) and inspect up to five
///    following entries, since children follow their parent in the list.
///
/// - Parameters:
///   - element: The element to label.
///   - traversal: The traversal to search.
/// - Returns: The resolved text, or `nil` when nothing suitable is found.
func findTextForElement(_ element: ElementData, in traversal: ResponseData?) -> String? {
    if let ownText = element.text, !ownText.isEmpty { return ownText }
    guard let all = traversal?.elements else { return nil }

    // Both strategies need the element's origin.
    guard let originX = element.x, let originY = element.y else { return nil }

    // Strategy 1: coordinate containment (needs a complete frame).
    if let width = element.width, let height = element.height {
        let frame = CGRect(x: originX, y: originY, width: width, height: height)
        let contained = all.first { candidate in
            guard let label = candidate.text, !label.isEmpty,
                  let candidateX = candidate.x, let candidateY = candidate.y else {
                return false
            }
            return frame.contains(CGPoint(x: candidateX, y: candidateY))
        }
        if let label = contained?.text { return label }
    }

    // Strategy 2: list proximity. Only the first entry matching role and
    // approximate origin is used as the anchor.
    let anchor = all.firstIndex { candidate in
        guard candidate.role == element.role,
              let candidateX = candidate.x, let candidateY = candidate.y else {
            return false
        }
        return abs(candidateX - originX) < 2 && abs(candidateY - originY) < 2
    }
    guard let start = anchor else { return nil }

    let limit = min(start + 6, all.count)
    var cursor = start + 1
    while cursor < limit {
        let follower = all[cursor]
        if let label = follower.text, !label.isEmpty {
            return label
        }
        // Reaching another row past the immediate successor means the subtree ended.
        if cursor > start + 1 && follower.role.contains("AXRow") { break }
        cursor += 1
    }
    return nil
}
|
|
342
|
+
|
|
343
|
+
/// Reports whether `role` names a scroll-bar component, which is treated as
/// noise in diffs.
///
/// - Parameter role: An element role string; matching is case-insensitive.
/// - Returns: `true` when the role contains any scroll-bar related marker.
func isScrollBarNoise(_ role: String) -> Bool {
    let markers = ["scrollbar", "scroll bar", "value indicator", "page button", "arrow button"]
    let normalized = role.lowercased()
    return markers.contains { normalized.contains($0) }
}
|
|
350
|
+
|
|
351
|
+
/// Reports whether an element is a text-less structural container
/// (row, cell, column, or menu), which is treated as noise in diffs.
///
/// - Parameters:
///   - role: An element role string; matching is case-insensitive.
///   - text: The element's resolved text, if any.
/// - Returns: `false` whenever `text` is non-empty; otherwise `true` when the
///   role contains a structural marker.
func isStructuralNoise(_ role: String, text: String?) -> Bool {
    guard (text ?? "").isEmpty else { return false }

    // "cell", "column" and "menu" also cover "axcell", "axcolumn" and "axmenu".
    let markers = ["axrow", "outline row", "cell", "column", "menu"]
    let normalized = role.lowercased()
    return markers.contains { normalized.contains($0) }
}
|
|
360
|
+
|
|
361
|
+
/// Converts an SDK `ActionResult` into the `ToolResponse` sent to the client.
///
/// - When `hasDiff` is true and the result carries a diff, only the filtered
///   diff is returned.
/// - Otherwise, if an "after" traversal exists, the full traversal is
///   returned, enriched with `in_viewport`.
///
/// - Parameters:
///   - result: The outcome of the SDK action.
///   - hasDiff: Whether the calling tool reports a diff rather than a full traversal.
/// - Returns: The populated response.
func buildToolResponse(_ result: ActionResult, hasDiff: Bool) -> ToolResponse {
    // Window bounds: prefer the traversal data, then ask the accessibility API.
    let bounds: CGRect?
    if let fromTraversal = getWindowBoundsFromTraversal(result.traversalAfter)
        ?? getWindowBoundsFromTraversal(result.traversalBefore) {
        bounds = fromTraversal
    } else if let pid = result.traversalPid ?? result.openResult?.pid {
        bounds = getWindowBoundsFromAPI(pid: pid)
    } else {
        bounds = nil
    }

    // Viewport flag for an origin; nil when the origin or the bounds are unknown.
    func viewportFlag(x: Double?, y: Double?) -> Bool? {
        guard let x = x, let y = y, let bounds = bounds else { return nil }
        return bounds.contains(CGPoint(x: x, y: y))
    }

    var response = ToolResponse()
    response.openResult = result.openResult
    response.traversalPid = result.traversalPid
    response.primaryActionError = result.primaryActionError
    response.traversalError = result.traversalAfterError ?? result.traversalBeforeError

    guard hasDiff, let rawDiff = result.traversalDiff else {
        // Full traversal for open/refresh
        if let after = result.traversalAfter {
            response.traversal = enrichResponseData(after, windowBounds: bounds)
        }
        return response
    }

    // Added: drop scroll-bar noise, resolve text against the "after" traversal,
    // flag viewport membership, then drop text-less structural containers.
    var added: [DiffElementData] = []
    for element in rawDiff.added where !isScrollBarNoise(element.role) {
        let resolvedText = findTextForElement(element, in: result.traversalAfter)
        if isStructuralNoise(element.role, text: resolvedText) { continue }
        added.append(DiffElementData(
            role: element.role, text: resolvedText,
            in_viewport: viewportFlag(x: element.x, y: element.y),
            x: element.x, y: element.y, width: element.width, height: element.height
        ))
    }

    // Removed: same filtering, text resolved against the "before" traversal;
    // no viewport flag is computed for elements that no longer exist.
    var removed: [DiffElementData] = []
    for element in rawDiff.removed where !isScrollBarNoise(element.role) {
        let resolvedText = findTextForElement(element, in: result.traversalBefore)
        if isStructuralNoise(element.role, text: resolvedText) { continue }
        removed.append(DiffElementData(
            role: element.role, text: resolvedText, in_viewport: nil,
            x: element.x, y: element.y, width: element.width, height: element.height
        ))
    }

    // Modified: skip scroll-bar noise and entries whose only changes are
    // coordinates. The before/after snapshots are emitted without coordinates.
    let coordinateAttrs: Set<String> = ["x", "y", "width", "height"]
    let modified = rawDiff.modified.compactMap { mod -> DiffModifiedElement? in
        if isScrollBarNoise(mod.before.role) || isScrollBarNoise(mod.after.role) {
            return nil
        }

        let changes = mod.changes
            .filter { !coordinateAttrs.contains($0.attributeName) }
            .map { change in
                DiffAttributeChange(
                    attributeName: change.attributeName,
                    addedText: change.addedText,
                    removedText: change.removedText,
                    oldValue: change.oldValue,
                    newValue: change.newValue
                )
            }
        if changes.isEmpty { return nil }

        let beforeVisible = viewportFlag(x: mod.before.x, y: mod.before.y)
        let afterVisible = viewportFlag(x: mod.after.x, y: mod.after.y)
        let beforeText = findTextForElement(mod.before, in: result.traversalBefore)
        let afterText = findTextForElement(mod.after, in: result.traversalAfter)

        return DiffModifiedElement(
            before: DiffElementData(role: mod.before.role, text: beforeText, in_viewport: beforeVisible),
            after: DiffElementData(role: mod.after.role, text: afterText, in_viewport: afterVisible),
            changes: changes
        )
    }

    response.diff = EnrichedTraversalDiff(added: added, removed: removed, modified: modified)
    return response
}
|
|
457
|
+
|
|
458
|
+
// --- Compact Summary Builder (file-based MCP responses) ---
|
|
459
|
+
|
|
460
|
+
/// Builds the short text summary returned to the MCP client in place of the
/// full JSON, which is written to `filepath`.
///
/// The summary contains, one per line: status, pid and app name (when known),
/// the file path, any errors, a tool-specific summary sentence, and up to
/// three notable text changes from the diff.
///
/// - Parameters:
///   - toolName: Name of the tool that was invoked.
///   - params: The original call parameters; echoed into the summary sentence.
///   - toolResponse: The response that was serialized to `filepath`.
///   - filepath: Where the full JSON was written.
/// - Returns: The newline-joined summary.
func buildCompactSummary(toolName: String, params: CallTool.Parameters, toolResponse: ToolResponse, filepath: String) -> String {
    let arguments = params.arguments

    // Numeric argument as Double, accepting either representation; 0 when absent.
    func numericArgument(_ key: String) -> Double {
        arguments?[key]?.doubleValue ?? arguments?[key]?.intValue.map(Double.init) ?? 0
    }

    // Truncates toward zero like `Int(_:)`, but falls back to the raw number
    // instead of trapping on NaN, infinity, or values outside Int's range.
    func wholeNumberText(_ value: Double) -> String {
        if let whole = Int(exactly: value.rounded(.towardZero)) {
            return "\(whole)"
        }
        return "\(value)"
    }

    // Element totals for tools that return a full traversal.
    func elementCounts(_ traversal: EnrichedResponseData) -> (total: Int, visible: Int) {
        (traversal.elements.count, traversal.elements.filter { $0.in_viewport == true }.count)
    }

    var lines: [String] = []

    // Status line
    let failed = toolResponse.primaryActionError != nil || toolResponse.traversalError != nil
    lines.append("status: \(failed ? "error" : "success")")

    // PID and app
    if let pid = toolResponse.traversalPid {
        lines.append("pid: \(pid)")
    }
    if let appName = toolResponse.traversal?.app_name ?? toolResponse.openResult?.appName {
        lines.append("app: \(appName)")
    }

    // File path
    lines.append("file: \(filepath)")

    // Errors if any
    if let err = toolResponse.primaryActionError {
        lines.append("error: \(err)")
    }
    if let err = toolResponse.traversalError {
        lines.append("traversal_error: \(err)")
    }

    // Tool-specific summary line
    let summaryLine: String
    switch toolName {
    case "macos-use_open_application_and_traverse":
        let identifier = arguments?["identifier"]?.stringValue ?? "unknown"
        if let traversal = toolResponse.traversal {
            let counts = elementCounts(traversal)
            summaryLine = "Opened \(identifier) (PID:\(toolResponse.traversalPid ?? 0)). \(counts.total) elements, \(counts.visible) visible."
        } else {
            summaryLine = "Opened \(identifier) (PID:\(toolResponse.traversalPid ?? 0))."
        }

    case "macos-use_click_and_traverse":
        let x = wholeNumberText(numericArgument("x"))
        let y = wholeNumberText(numericArgument("y"))
        summaryLine = "Clicked at (\(x),\(y)). \(buildDiffSummary(toolResponse.diff))"

    case "macos-use_type_and_traverse":
        let typed = truncate(arguments?["text"]?.stringValue ?? "", maxLen: 40)
        summaryLine = "Typed '\(typed)'. \(buildDiffSummary(toolResponse.diff))"

    case "macos-use_press_key_and_traverse":
        let keyName = arguments?["keyName"]?.stringValue ?? "unknown"
        let modifiers = arguments?["modifierFlags"]?.arrayValue?
            .compactMap { $0.stringValue }
            .joined(separator: "+") ?? ""
        let keyDesc = modifiers.isEmpty ? keyName : "\(modifiers)+\(keyName)"
        summaryLine = "Pressed \(keyDesc). \(buildDiffSummary(toolResponse.diff))"

    case "macos-use_scroll_and_traverse":
        let x = wholeNumberText(numericArgument("x"))
        let y = wholeNumberText(numericArgument("y"))
        // Accept an exact double (e.g. 3.0) as well, mirroring getRequiredInt,
        // so the summary does not report 0 for a valid scroll amount.
        let deltaY = arguments?["deltaY"]?.intValue
            ?? arguments?["deltaY"]?.doubleValue.flatMap({ Int(exactly: $0) })
            ?? 0
        summaryLine = "Scrolled deltaY=\(deltaY) at (\(x),\(y)). \(buildDiffSummary(toolResponse.diff))"

    case "macos-use_refresh_traversal":
        if let traversal = toolResponse.traversal {
            let counts = elementCounts(traversal)
            summaryLine = "Refreshed PID \(toolResponse.traversalPid ?? 0) (\(traversal.app_name)). \(counts.total) elements, \(counts.visible) visible."
        } else {
            summaryLine = "Refreshed PID \(toolResponse.traversalPid ?? 0)."
        }

    default:
        summaryLine = "Tool \(toolName) completed."
    }

    lines.append("summary: \(summaryLine)")

    // Append notable text changes from diff (up to 3, truncated). Only the
    // first five modified elements are inspected.
    if let diff = toolResponse.diff {
        var textChanges: [String] = []

        for mod in diff.modified.prefix(5) {
            for change in mod.changes where change.attributeName == "text" || change.attributeName == "AXValue" {
                let oldVal = truncate(change.oldValue ?? change.removedText ?? "", maxLen: 60)
                let newVal = truncate(change.newValue ?? change.addedText ?? "", maxLen: 60)
                if !oldVal.isEmpty || !newVal.isEmpty {
                    textChanges.append("  '\(oldVal)' -> '\(newVal)'")
                }
            }
            if textChanges.count >= 3 { break }
        }

        if !textChanges.isEmpty {
            lines.append("text_changes:")
            lines.append(contentsOf: textChanges.prefix(3))
        }
    }

    return lines.joined(separator: "\n")
}
|
|
567
|
+
|
|
568
|
+
/// Produces a one-line count of diff entries, e.g. "3 added, 2 removed, 1 modified."
///
/// - Parameter diff: The diff to summarize.
/// - Returns: "No diff." when `diff` is `nil`, "No changes." when every
///   category is empty, otherwise the non-empty categories joined by commas.
func buildDiffSummary(_ diff: EnrichedTraversalDiff?) -> String {
    guard let diff = diff else { return "No diff." }

    let tallies = [
        (count: diff.added.count, label: "added"),
        (count: diff.removed.count, label: "removed"),
        (count: diff.modified.count, label: "modified"),
    ]
    let parts = tallies
        .filter { $0.count > 0 }
        .map { "\($0.count) \($0.label)" }

    guard !parts.isEmpty else { return "No changes." }
    return parts.joined(separator: ", ") + "."
}
|
|
577
|
+
|
|
578
|
+
/// Shortens `s` to at most `maxLen` characters, appending "..." when
/// anything was cut off.
///
/// - Parameters:
///   - s: The text to shorten.
///   - maxLen: Maximum number of characters kept from `s`.
/// - Returns: `s` unchanged when it already fits; otherwise its first
///   `maxLen` characters followed by "...".
func truncate(_ s: String, maxLen: Int) -> String {
    guard s.count > maxLen else { return s }
    return String(s.prefix(maxLen)) + "..."
}
|
|
582
|
+
|
|
583
|
+
// --- Direct AX Element Interaction ---
|
|
584
|
+
|
|
585
|
+
// --- Auto-Scroll via Scroll Wheel Events ---
|
|
586
|
+
|
|
587
|
+
/// Walks the AX tree depth-first looking for an element whose frame contains `point`.
///
/// Children are always searched, even when `root`'s own frame does not
/// contain the point, because scroll-area content may extend beyond the
/// parent's visible frame. The first child subtree that yields a match wins;
/// `root` itself is returned only when no descendant matches.
///
/// - Parameters:
///   - root: Element at which to start.
///   - point: The point to locate.
///   - maxDepth: Remaining recursion budget; the search stops at 0.
/// - Returns: The matching element, or `nil` when nothing contains the point.
func findAXElementAtPoint(root: AXUIElement, point: CGPoint, maxDepth: Int = 25) -> AXUIElement? {
    guard maxDepth > 0 else { return nil }

    // Frame decoding is delegated to the shared helper rather than repeating
    // the position/size unpacking here.
    let containsPoint = getAXElementFrame(root)?.contains(point) ?? false

    var childrenRef: CFTypeRef?
    if AXUIElementCopyAttributeValue(root, "AXChildren" as CFString, &childrenRef) == .success,
       let children = childrenRef as? [AXUIElement] {
        for child in children {
            if let match = findAXElementAtPoint(root: child, point: point, maxDepth: maxDepth - 1) {
                return match
            }
        }
    }
    return containsPoint ? root : nil
}
|
|
617
|
+
|
|
618
|
+
/// Returns the text of an AX element.
///
/// The element's "AXValue" is preferred, then its "AXTitle"; only non-empty
/// strings count. When neither is available and `searchChildren` is true, the
/// children are searched recursively, in order (e.g. AXRow -> AXCell ->
/// AXStaticText).
///
/// - Parameters:
///   - element: The element to inspect.
///   - searchChildren: Whether to fall back to descendants.
/// - Returns: The first text found, or `nil`.
func getAXElementText(_ element: AXUIElement, searchChildren: Bool = true) -> String? {
    for attribute in ["AXValue", "AXTitle"] {
        var attributeRef: CFTypeRef?
        if AXUIElementCopyAttributeValue(element, attribute as CFString, &attributeRef) == .success,
           let text = attributeRef as? String, !text.isEmpty {
            return text
        }
    }

    // For container elements (AXRow, AXCell), check children for text
    guard searchChildren else { return nil }

    var childrenRef: CFTypeRef?
    guard AXUIElementCopyAttributeValue(element, "AXChildren" as CFString, &childrenRef) == .success,
          let children = childrenRef as? [AXUIElement] else {
        return nil
    }
    for child in children {
        if let nestedText = getAXElementText(child, searchChildren: true) {
            return nestedText
        }
    }
    return nil
}
|
|
644
|
+
|
|
645
|
+
/// Reads an AX element's frame ("AXPosition" + "AXSize") as a `CGRect`.
///
/// - Parameter element: The element to measure.
/// - Returns: The frame, or `nil` when either attribute is unavailable, is
///   not an `AXValue`, or cannot be unpacked into a point/size.
func getAXElementFrame(_ element: AXUIElement) -> CGRect? {
    var positionRef: CFTypeRef?
    var sizeRef: CFTypeRef?
    guard AXUIElementCopyAttributeValue(element, "AXPosition" as CFString, &positionRef) == .success,
          AXUIElementCopyAttributeValue(element, "AXSize" as CFString, &sizeRef) == .success,
          let positionValue = positionRef,
          let sizeValue = sizeRef,
          // Verify the CF types before casting so an unexpected payload
          // returns nil instead of crashing.
          CFGetTypeID(positionValue) == AXValueGetTypeID(),
          CFGetTypeID(sizeValue) == AXValueGetTypeID() else {
        return nil
    }

    var origin = CGPoint.zero
    var size = CGSize.zero
    // AXValueGetValue reports failure when the stored type does not match;
    // in that case there is no meaningful frame to return.
    guard AXValueGetValue(positionValue as! AXValue, .cgPoint, &origin),
          AXValueGetValue(sizeValue as! AXValue, .cgSize, &size) else {
        return nil
    }
    return CGRect(origin: origin, size: size)
}
|
|
656
|
+
|
|
657
|
+
/// Depth-first search of the accessibility tree for an element whose text equals `text`,
/// returning the center of the first match that lies inside the usable viewport.
///
/// The usable viewport is `viewport` shrunk by 15 points at the top and at the bottom
/// (no horizontal inset), so a match whose center sits right at a vertical edge is
/// not accepted. A node is tested before its children; children are visited in the
/// order reported by `AXChildren`.
///
/// - Parameters:
///   - root: Element at which the search starts.
///   - text: Exact text to match against `getAXElementText`.
///   - viewport: Visible area the match's center must fall into (before the inset).
///   - maxDepth: Remaining levels to descend; the search stops when it reaches 0.
/// - Returns: Center of the first matching element inside the inset viewport, or `nil`.
func findElementByText(root: AXUIElement, text: String, viewport: CGRect, maxDepth: Int = 25) -> CGPoint? {
    if maxDepth <= 0 { return nil }

    // Test the current node first; its frame is only queried when the text matches.
    if getAXElementText(root) == text, let matchFrame = getAXElementFrame(root) {
        let midpoint = CGPoint(x: matchFrame.midX, y: matchFrame.midY)
        let usableArea = viewport.insetBy(dx: 0, dy: 15)
        if usableArea.contains(midpoint) {
            return midpoint
        }
    }

    // No acceptable match here: descend into the children, first hit wins.
    var rawChildren: CFTypeRef?
    guard AXUIElementCopyAttributeValue(root, "AXChildren" as CFString, &rawChildren) == .success,
          let descendants = rawChildren as? [AXUIElement] else {
        return nil
    }

    let remainingDepth = maxDepth - 1
    for node in descendants {
        if let hit = findElementByText(root: node, text: text, viewport: viewport, maxDepth: remainingDepth) {
            return hit
        }
    }
    return nil
}
|
|
683
|
+
|
|
684
|
+
/// Scrolls a click target into view when it lies outside its window, and returns the
/// point that should actually be clicked.
///
/// Steps:
/// 1. Ask `getWindowContainingPoint` for the window element and bounds; if that fails,
///    return `point` unchanged.
/// 2. If `point` is already inside the window bounds, return it unchanged.
/// 3. Otherwise look up the element at `point` and its text, then post scroll-wheel
///    events (at the point's x coordinate, vertically centered in the window) for at
///    most 30 steps:
///    - Text known: after every step search the tree for that text and return the
///      element's center as soon as it is inside the viewport.
///    - Text unknown: after every step probe near the viewport edge (logging only) and
///      re-check `point`; once an element with text shows up there, track it by text,
///      nudging up to 8 more steps, and finally fall back to its current frame center.
/// 4. If nothing could be brought into view, or a scroll event cannot be created,
///    return the original `point`.
///
/// - Parameters:
///   - pid: Process identifier of the target application.
///   - point: Desired click location in the coordinate space used by the AX frames.
/// - Returns: The (possibly adjusted) point to click.
func scrollIntoViewIfNeeded(pid: pid_t, point: CGPoint) async -> CGPoint {
    let appElement = AXUIElementCreateApplication(pid)

    guard let (windowElement, windowBounds) = getWindowContainingPoint(appElement: appElement, point: point) else {
        fputs("log: scrollIntoViewIfNeeded: could not get window bounds, using original point\n", stderr)
        return point
    }

    if windowBounds.contains(point) {
        // Already in viewport — the caller already centered the point from (x + w/2, y + h/2).
        // Do NOT try to refine via findAXElementAtPoint: the AX tree has overlapping full-width
        // group elements (e.g. message rows spanning the entire window) that would shadow sidebar
        // items and send clicks to the wrong location.
        fputs("log: scrollIntoViewIfNeeded: in viewport, using caller-centered point \(point)\n", stderr)
        return point
    }

    // Try to find the element and its text.
    let targetElement = findAXElementAtPoint(root: windowElement, point: point)
    let targetText = targetElement.flatMap { getAXElementText($0) }

    // True when the target sits below the window, i.e. content below must be revealed.
    // NOTE(review): a point that is outside the window only horizontally takes the
    // `false` branch with a negative `distance`; confirm whether callers can hit that.
    let targetIsBelowViewport = point.y > windowBounds.maxY
    let distance: CGFloat = targetIsBelowViewport
        ? point.y - windowBounds.maxY
        : windowBounds.minY - point.y
    // Scale lines per step to distance: 1 line for tiny offsets, up to 3 for large ones.
    // Each scroll line ≈ 20-40px, so 1 line is enough when distance < 80px.
    let linesPerStep: Int32 = distance < 80 ? 1 : (distance < 250 ? 2 : 3)
    let scrollDirection: Int32 = targetIsBelowViewport ? -linesPerStep : linesPerStep
    let maxSteps = 30
    let scrollLocation = CGPoint(x: point.x, y: windowBounds.midY)

    // Posts one scroll-wheel step in `scrollDirection`; false if the event cannot be created.
    func postScrollStep() -> Bool {
        guard let scrollEvent = CGEvent(scrollWheelEvent2Source: nil, units: .line, wheelCount: 1,
                                        wheel1: scrollDirection, wheel2: 0, wheel3: 0) else { return false }
        scrollEvent.location = scrollLocation
        scrollEvent.post(tap: .cghidEventTap)
        return true
    }

    if let targetText = targetText {
        // CASE 1: We have the element's text - scroll and search by text.
        fputs("log: scrollIntoViewIfNeeded: target text=\"\(targetText)\", distance=\(distance)px, lines/step=\(linesPerStep), dir=\(scrollDirection)\n", stderr)

        for step in 1...maxSteps {
            guard postScrollStep() else { return point }
            try? await Task.sleep(nanoseconds: 100_000_000)

            // Debug: log where the originally resolved element currently is
            // (first three steps, then every fifth).
            if step % 5 == 1 || step <= 3 {
                if let el = targetElement, let frame = getAXElementFrame(el) {
                    fputs("log: scrollIntoViewIfNeeded: step \(step): element frame=\(frame)\n", stderr)
                }
            }

            if let foundCenter = findElementByText(root: windowElement, text: targetText, viewport: windowBounds) {
                fputs("log: scrollIntoViewIfNeeded: found \"\(targetText)\" at \(foundCenter) after \(step) steps\n", stderr)
                // Short pause before the caller clicks.
                try? await Task.sleep(nanoseconds: 100_000_000)
                return foundCenter
            }
        }

        fputs("warning: scrollIntoViewIfNeeded: could not scroll \"\(targetText)\" into view after \(maxSteps) steps\n", stderr)
        // Debug: final position.
        if let el = targetElement, let frame = getAXElementFrame(el) {
            fputs("log: scrollIntoViewIfNeeded: final element frame=\(frame)\n", stderr)
        }
        return point
    } else {
        // CASE 2: No text at target coordinates (common for far off-screen sidebar items).
        // Strategy: scroll, then probe near the viewport edge where new content appears,
        // and keep re-checking the target point until an element with text shows up there.
        fputs("log: scrollIntoViewIfNeeded: no text at \(point), distance=\(distance)px, scrolling and probing edge\n", stderr)

        // Probe position: 60 points inside the edge where new content scrolls in.
        let probeY = targetIsBelowViewport
            ? windowBounds.maxY - 60   // probe near bottom edge
            : windowBounds.minY + 60   // probe near top edge
        let probePoint = CGPoint(x: point.x, y: probeY)

        var lastText: String? = nil

        for step in 1...maxSteps {
            guard postScrollStep() else { return point }
            try? await Task.sleep(nanoseconds: 150_000_000)

            // Probe near the viewport edge for newly revealed elements (diagnostic logging only).
            if let element = findAXElementAtPoint(root: windowElement, point: probePoint),
               let text = getAXElementText(element) {
                if text != lastText {
                    fputs("log: scrollIntoViewIfNeeded: edge element after \(step) steps: \"\(text)\"\n", stderr)
                    lastText = text
                }
            }

            // Also check: has the target point come into range? (for shorter distances)
            if let element = findAXElementAtPoint(root: windowElement, point: point),
               let text = getAXElementText(element) {
                fputs("log: scrollIntoViewIfNeeded: target appeared after \(step) steps: \"\(text)\"\n", stderr)
                // Switch to text-based tracking.
                if let foundCenter = findElementByText(root: windowElement, text: text, viewport: windowBounds) {
                    fputs("log: scrollIntoViewIfNeeded: \"\(text)\" in viewport at \(foundCenter)\n", stderr)
                    try? await Task.sleep(nanoseconds: 100_000_000)
                    return foundCenter
                }
                // Nudge a few more steps in the same direction.
                for step2 in 1...8 {
                    guard postScrollStep() else { return point }
                    try? await Task.sleep(nanoseconds: 150_000_000)
                    if let fc = findElementByText(root: windowElement, text: text, viewport: windowBounds) {
                        fputs("log: scrollIntoViewIfNeeded: \"\(text)\" at \(fc) after \(step+step2) steps\n", stderr)
                        try? await Task.sleep(nanoseconds: 100_000_000)
                        return fc
                    }
                }
                // Use the element's current position as fallback.
                if let frame = getAXElementFrame(element) {
                    return CGPoint(x: frame.midX, y: frame.midY)
                }
                return point
            }
        }

        fputs("warning: scrollIntoViewIfNeeded: no element appeared after \(maxSteps) scroll steps\n", stderr)
        return point
    }
}
|
|
818
|
+
|
|
819
|
+
/// Builds the MCP server, registers all request handlers, and starts it on stdio.
///
/// Registers six tools (open application, click, type, press key, scroll, refresh
/// traversal), placeholder handlers for `ReadResource`, `ListResources` and
/// `ListPrompts`, a `ListTools` handler, and a `CallTool` handler that:
/// 1. turns the MCP arguments into a `PrimaryAction` plus `ActionOptions`,
/// 2. runs `performAction` on the main actor,
/// 3. writes the full JSON response to `/tmp/macos-use/<timestamp>_<tool>.json`, and
/// 4. returns a compact text summary to the client.
///
/// - Returns: The started `Server` instance.
/// - Throws: Any error thrown by `server.start(transport:)`.
func setupAndStartServer() async throws -> Server {
    fputs("log: setupAndStartServer: entering function.\n", stderr)

    // --- Define Schemas and Tools for Simplified Actions ---
    let openAppSchema: Value = .object([
        "type": .string("object"),
        "properties": .object([
            "identifier": .object(["type": .string("string"), "description": .string("REQUIRED. App name, path, or bundle ID.")])
        ]),
        "required": .array([.string("identifier")])
    ])
    let openAppTool = Tool(
        name: "macos-use_open_application_and_traverse",
        description: "Opens/activates an application and then traverses its accessibility tree. Returns a summary with file path. Use Grep/Read on the file to find specific elements.",
        inputSchema: openAppSchema
    )

    let clickSchema: Value = .object([
        "type": .string("object"),
        "properties": .object([
            "pid": .object(["type": .string("number"), "description": .string("REQUIRED. PID of the target application window.")]),
            "x": .object(["type": .string("number"), "description": .string("REQUIRED. X coordinate for the click (top-left of element).")]),
            "y": .object(["type": .string("number"), "description": .string("REQUIRED. Y coordinate for the click (top-left of element).")]),
            "width": .object(["type": .string("number"), "description": .string("Optional. Element width from traversal. When provided with height, click lands at center (x+width/2, y+height/2).")]),
            "height": .object(["type": .string("number"), "description": .string("Optional. Element height from traversal. When provided with width, click lands at center (x+width/2, y+height/2).")])
        ]),
        "required": .array([.string("pid"), .string("x"), .string("y")])
    ])
    let clickTool = Tool(
        name: "macos-use_click_and_traverse",
        description: "Simulates a click at the given coordinates within the app specified by PID, then traverses its accessibility tree. Returns a summary with file path. Use Grep/Read on the file to find specific elements.",
        inputSchema: clickSchema
    )

    let typeSchema: Value = .object([
        "type": .string("object"),
        "properties": .object([
            "pid": .object(["type": .string("number"), "description": .string("REQUIRED. PID of the target application window.")]),
            "text": .object(["type": .string("string"), "description": .string("REQUIRED. Text to type.")])
            // Add optional options here if needed later
        ]),
        "required": .array([.string("pid"), .string("text")])
    ])
    let typeTool = Tool(
        name: "macos-use_type_and_traverse",
        description: "Simulates typing text into the app specified by PID, then traverses its accessibility tree. Returns a summary with file path. Use Grep/Read on the file to find specific elements.",
        inputSchema: typeSchema
    )

    let refreshSchema: Value = .object([
        "type": .string("object"),
        "properties": .object([
            "pid": .object(["type": .string("number"), "description": .string("REQUIRED. PID of the application to traverse.")])
            // Add optional options here if needed later
        ]),
        "required": .array([.string("pid")])
    ])
    let refreshTool = Tool(
        name: "macos-use_refresh_traversal",
        description: "Traverses the accessibility tree of the application specified by PID. Returns a summary with file path. Use Grep/Read on the file to find specific elements.",
        inputSchema: refreshSchema
    )

    // Schema and tool for pressing a key.
    let pressKeySchema: Value = .object([
        "type": .string("object"),
        "properties": .object([
            "pid": .object(["type": .string("number"), "description": .string("REQUIRED. PID of the target application window.")]),
            "keyName": .object(["type": .string("string"), "description": .string("REQUIRED. Name of the key to press (e.g., 'Return', 'Enter', 'Escape', 'Tab', 'up', 'down', 'left', 'right', 'PageUp', 'PageDown', 'Home', 'End', 'Delete', 'a', 'B'). Case-insensitive for special keys.")]),
            "modifierFlags": .object([ // Optional array of strings
                "type": .string("array"),
                "description": .string("OPTIONAL. Modifier keys to hold (e.g., ['Command', 'Shift']). Valid: CapsLock, Shift, Control, Option, Command, Function, NumericPad, Help."),
                "items": .object(["type": .string("string")]) // Items in the array must be strings
            ])
            // Add optional ActionOptions overrides here if needed later
        ]),
        "required": .array([.string("pid"), .string("keyName")])
    ])
    let pressKeyTool = Tool(
        name: "macos-use_press_key_and_traverse",
        description: "Simulates pressing a specific key (like Return, Enter, Escape, Tab, Arrow Keys, regular characters) with optional modifiers, then traverses the accessibility tree. Returns a summary with file path. Use Grep/Read on the file to find specific elements.",
        inputSchema: pressKeySchema
    )

    // Schema and tool for the scroll wheel.
    let scrollSchema: Value = .object([
        "type": .string("object"),
        "properties": .object([
            "pid": .object(["type": .string("number"), "description": .string("REQUIRED. PID of the target application window.")]),
            "x": .object(["type": .string("number"), "description": .string("REQUIRED. X coordinate for the scroll location.")]),
            "y": .object(["type": .string("number"), "description": .string("REQUIRED. Y coordinate for the scroll location.")]),
            "deltaY": .object(["type": .string("integer"), "description": .string("REQUIRED. Vertical scroll amount in lines. Negative = scroll up (reveal content above), positive = scroll down (reveal content below).")]),
            "deltaX": .object(["type": .string("integer"), "description": .string("Optional. Horizontal scroll amount in lines. Negative = scroll left, positive = scroll right. Defaults to 0.")])
        ]),
        "required": .array([.string("pid"), .string("x"), .string("y"), .string("deltaY")])
    ])
    let scrollTool = Tool(
        name: "macos-use_scroll_and_traverse",
        description: "Simulates a scroll wheel event at the given coordinates within the app specified by PID, then traverses its accessibility tree. Returns a summary with file path. Use Grep/Read on the file to find specific elements.",
        inputSchema: scrollSchema
    )

    // --- Aggregate list of tools ---
    let allTools = [openAppTool, clickTool, typeTool, pressKeyTool, scrollTool, refreshTool]
    fputs("log: setupAndStartServer: defined \(allTools.count) tools: \(allTools.map { $0.name })\n", stderr)

    let server = Server(
        name: "SwiftMacOSServerDirect",
        version: "1.3.0",
        capabilities: .init(
            tools: .init(listChanged: true)
        )
    )
    fputs("log: setupAndStartServer: server instance created (\(server.name)) version \(server.version).\n", stderr)

    // --- Dummy Handlers (ReadResource, ListResources, ListPrompts) ---
    // Placeholders only: they answer the request with dummy/empty data.
    await server.withMethodHandler(ReadResource.self) { params in
        let uri = params.uri
        fputs("log: handler(ReadResource): received request for uri: \(uri) (dummy handler)\n", stderr)
        // In a real scenario, you might fetch resource content here
        return .init(contents: [.text("dummy content for \(uri)", uri: uri)])
    }
    fputs("log: setupAndStartServer: registered ReadResource handler (dummy).\n", stderr)

    await server.withMethodHandler(ListResources.self) { _ in
        fputs("log: handler(ListResources): received request (dummy handler).\n", stderr)
        // In a real scenario, list available resources
        return ListResources.Result(resources: [])
    }
    fputs("log: setupAndStartServer: registered ListResources handler (dummy).\n", stderr)

    await server.withMethodHandler(ListPrompts.self) { _ in
        fputs("log: handler(ListPrompts): received request (dummy handler).\n", stderr)
        // In a real scenario, list available prompts
        return ListPrompts.Result(prompts: [])
    }
    fputs("log: setupAndStartServer: registered ListPrompts handler (dummy).\n", stderr)

    // --- ListTools Handler ---
    await server.withMethodHandler(ListTools.self) { _ in
        fputs("log: handler(ListTools): received request.\n", stderr)
        let result = ListTools.Result(tools: allTools)
        fputs("log: handler(ListTools): responding with \(result.tools.count) tools: \(result.tools.map { $0.name })\n", stderr)
        return result
    }
    fputs("log: setupAndStartServer: registered ListTools handler.\n", stderr)

    // --- CallTool Handler (direct SDK call) ---
    await server.withMethodHandler(CallTool.self) { params in
        fputs("log: handler(CallTool): received request for tool: \(params.name).\n", stderr)
        fputs("log: handler(CallTool): arguments received (raw MCP): \(params.arguments?.debugDescription ?? "nil")\n", stderr)

        do {
            // --- Determine Action and Options from MCP Params ---
            let primaryAction: PrimaryAction
            // MCP tools should return the tree by default, no visual highlighting.
            var options = ActionOptions(traverseAfter: true, showAnimation: false)

            // PID is required for click, type, press, scroll and refresh;
            // it is optional only for open (where the SDK finds it).
            let pidOptionalInt = try getOptionalInt(from: params.arguments, key: "pid")

            // Convert Int? to pid_t?, rejecting values that do not fit.
            let pidForOptions: pid_t?
            if let unwrappedPid = pidOptionalInt {
                guard let convertedPid = pid_t(exactly: unwrappedPid) else {
                    fputs("error: handler(CallTool): PID value \(unwrappedPid) is out of range for pid_t (Int32).\n", stderr)
                    throw MCPError.invalidParams("PID value \(unwrappedPid) is out of range.")
                }
                pidForOptions = convertedPid
            } else {
                pidForOptions = nil
            }
            options.pidForTraversal = pidForOptions

            // Allow callers to override the default options through optional arguments.
            options.traverseBefore = try getOptionalBool(from: params.arguments, key: "traverseBefore") ?? options.traverseBefore
            options.traverseAfter = try getOptionalBool(from: params.arguments, key: "traverseAfter") ?? options.traverseAfter
            options.showDiff = try getOptionalBool(from: params.arguments, key: "showDiff") ?? options.showDiff
            options.onlyVisibleElements = try getOptionalBool(from: params.arguments, key: "onlyVisibleElements") ?? options.onlyVisibleElements
            options.showAnimation = try getOptionalBool(from: params.arguments, key: "showAnimation") ?? options.showAnimation
            options.animationDuration = try getOptionalDouble(from: params.arguments, key: "animationDuration") ?? options.animationDuration
            options.delayAfterAction = try getOptionalDouble(from: params.arguments, key: "delayAfterAction") ?? options.delayAfterAction

            options = options.validated()
            fputs("log: handler(CallTool): constructed ActionOptions: \(options)\n", stderr)

            // Track whether this tool returns a diff (click/type/press/scroll)
            // or a full traversal (open/refresh).
            var hasDiff = false

            switch params.name {
            case openAppTool.name:
                let identifier = try getRequiredString(from: params.arguments, key: "identifier")
                primaryAction = .open(identifier: identifier)

            case clickTool.name:
                guard let reqPid = pidForOptions else { throw MCPError.invalidParams("Missing required 'pid' for click tool") }
                let x = try getRequiredDouble(from: params.arguments, key: "x")
                let y = try getRequiredDouble(from: params.arguments, key: "y")
                let w = try getOptionalDouble(from: params.arguments, key: "width")
                let h = try getOptionalDouble(from: params.arguments, key: "height")
                // If width and height are both provided, click the element's center;
                // otherwise the raw (x, y) point is used as given.
                let rawPoint: CGPoint
                if let w = w, let h = h {
                    rawPoint = CGPoint(x: x + w / 2, y: y + h / 2)
                    fputs("log: click_and_traverse: centering (\(x),\(y)) + size(\(w)×\(h)) → \(rawPoint)\n", stderr)
                } else {
                    rawPoint = CGPoint(x: x, y: y)
                }
                // Activate the target app before clicking so the click registers correctly.
                if let runningApp = NSRunningApplication(processIdentifier: reqPid) {
                    runningApp.activate(options: [])
                    fputs("log: click_and_traverse: activated app pid=\(reqPid)\n", stderr)
                    try? await Task.sleep(nanoseconds: 200_000_000) // 200ms for activation
                }
                // Auto-scroll the element into view if it's outside the visible window area.
                let adjustedPoint = await scrollIntoViewIfNeeded(pid: reqPid, point: rawPoint)
                primaryAction = .input(action: .click(point: adjustedPoint))
                options.pidForTraversal = reqPid
                options.showDiff = true // per the original note: enables traverseBefore automatically (SDK behavior, not visible here)
                hasDiff = true

            case typeTool.name:
                guard let reqPid = pidForOptions else { throw MCPError.invalidParams("Missing required 'pid' for type tool") }
                let text = try getRequiredString(from: params.arguments, key: "text")
                primaryAction = .input(action: .type(text: text))
                options.pidForTraversal = reqPid
                options.showDiff = true
                hasDiff = true

            case pressKeyTool.name:
                guard let reqPid = pidForOptions else { throw MCPError.invalidParams("Missing required 'pid' for press key tool") }
                let keyName = try getRequiredString(from: params.arguments, key: "keyName")
                let flags = try parseFlags(from: params.arguments?["modifierFlags"])
                fputs("log: handler(CallTool): parsed modifierFlags: \(flags)\n", stderr)
                primaryAction = .input(action: .press(keyName: keyName, flags: flags))
                options.pidForTraversal = reqPid
                options.showDiff = true
                hasDiff = true

            case scrollTool.name:
                guard let reqPid = pidForOptions else { throw MCPError.invalidParams("Missing required 'pid' for scroll tool") }
                let x = try getRequiredDouble(from: params.arguments, key: "x")
                let y = try getRequiredDouble(from: params.arguments, key: "y")
                let deltaY = try getRequiredInt(from: params.arguments, key: "deltaY")
                let deltaX = try getOptionalInt(from: params.arguments, key: "deltaX") ?? 0
                // The deltas come straight from the client: a plain Int32(...) conversion
                // would trap (and take the server down) on out-of-range values, so reject
                // them as invalid parameters instead.
                guard let scrollDeltaY = Int32(exactly: deltaY) else {
                    throw MCPError.invalidParams("deltaY value \(deltaY) is out of range.")
                }
                guard let scrollDeltaX = Int32(exactly: deltaX) else {
                    throw MCPError.invalidParams("deltaX value \(deltaX) is out of range.")
                }
                let scrollPoint = CGPoint(x: x, y: y)
                // Activate the target app before scrolling so the event registers correctly.
                if let runningApp = NSRunningApplication(processIdentifier: reqPid) {
                    runningApp.activate(options: [])
                    fputs("log: scroll_and_traverse: activated app pid=\(reqPid)\n", stderr)
                    try? await Task.sleep(nanoseconds: 200_000_000) // 200ms for activation
                }
                primaryAction = .input(action: .scroll(point: scrollPoint, deltaY: scrollDeltaY, deltaX: scrollDeltaX))
                options.pidForTraversal = reqPid
                options.showDiff = true
                hasDiff = true

            case refreshTool.name:
                guard let reqPid = pidForOptions else { throw MCPError.invalidParams("Missing required 'pid' for refresh tool") }
                primaryAction = .traverseOnly
                options.pidForTraversal = reqPid

            default:
                fputs("error: handler(CallTool): received request for unknown or unsupported tool: \(params.name)\n", stderr)
                throw MCPError.methodNotFound(params.name)
            }

            fputs("log: handler(CallTool): constructed PrimaryAction: \(primaryAction)\n", stderr)

            // --- Save cursor position before click actions so we can restore it after ---
            var savedCursorPos: CGPoint? = nil
            if case .input(let inputAction) = primaryAction, case .click = inputAction {
                let nsPos = NSEvent.mouseLocation
                if let primaryScreen = NSScreen.screens.first {
                    // NSEvent uses bottom-left origin; CGEvent uses top-left — flip Y.
                    let flippedPos = CGPoint(x: nsPos.x, y: primaryScreen.frame.height - nsPos.y)
                    savedCursorPos = flippedPos
                    fputs("log: handler(CallTool): saved cursor position \(flippedPos)\n", stderr)
                }
            }

            // --- Execute the Action using MacosUseSDK ---
            // Hand the task an immutable snapshot rather than capturing the mutable `options` var.
            let finalOptions = options
            let actionResult: ActionResult = await Task { @MainActor in
                fputs("log: handler(CallTool): executing performAction on MainActor via Task...\n", stderr)
                return await performAction(action: primaryAction, optionsInput: finalOptions)
            }.value
            fputs("log: handler(CallTool): performAction task completed.\n", stderr)

            // --- Restore cursor position after click ---
            if let pos = savedCursorPos,
               let moveEvent = CGEvent(mouseEventSource: nil, mouseType: .mouseMoved,
                                       mouseCursorPosition: pos, mouseButton: .left) {
                moveEvent.post(tap: .cghidEventTap)
                fputs("log: handler(CallTool): restored cursor to \(pos)\n", stderr)
            }

            // --- Build simplified response and serialize to JSON ---
            let toolResponse = buildToolResponse(actionResult, hasDiff: hasDiff)
            guard let resultJsonString = serializeToJsonString(toolResponse) else {
                fputs("error: handler(CallTool): failed to serialize ToolResponse to JSON for tool \(params.name).\n", stderr)
                throw MCPError.internalError("failed to serialize ToolResponse to JSON")
            }

            // --- Determine if it was an error overall ---
            // Traversal errors only count when that traversal was actually requested.
            let isError = actionResult.primaryActionError != nil ||
                (options.traverseBefore && actionResult.traversalBeforeError != nil) ||
                (options.traverseAfter && actionResult.traversalAfterError != nil)

            if isError {
                fputs("warning: handler(CallTool): Action resulted in an error state (primary: \(actionResult.primaryActionError ?? "nil"), before: \(actionResult.traversalBeforeError ?? "nil"), after: \(actionResult.traversalAfterError ?? "nil")).\n", stderr)
            }

            // --- Write full JSON to file, return compact summary ---
            let outputDir = "/tmp/macos-use"
            let timestamp = Int(Date().timeIntervalSince1970 * 1000) // ms precision to avoid collisions
            let safeName = params.name.replacingOccurrences(of: "macos-use_", with: "")
            let filename = "\(timestamp)_\(safeName).json"
            let filepath = "\(outputDir)/\(filename)"
            // Writing the file stays best-effort (the summary is returned either way),
            // but a failure is now reported instead of being logged as a success.
            do {
                try FileManager.default.createDirectory(atPath: outputDir, withIntermediateDirectories: true)
                try resultJsonString.write(toFile: filepath, atomically: true, encoding: .utf8)
                fputs("log: handler(CallTool): wrote full response to \(filepath) (\(resultJsonString.utf8.count) bytes)\n", stderr)
            } catch {
                fputs("error: handler(CallTool): failed to write full response to \(filepath): \(error)\n", stderr)
            }

            let summary = buildCompactSummary(toolName: params.name, params: params, toolResponse: toolResponse, filepath: filepath)
            fputs("log: handler(CallTool): returning compact summary (\(summary.count) chars)\n", stderr)

            return .init(content: [.text(summary)], isError: isError)

        } catch let error as MCPError {
            fputs("error: handler(CallTool): MCPError occurred processing MCP params for tool '\(params.name)': \(error)\n", stderr)
            return .init(content: [.text("Error processing parameters for tool '\(params.name)': \(error.localizedDescription)")], isError: true)
        } catch {
            fputs("error: handler(CallTool): Unexpected error occurred setting up call for tool '\(params.name)': \(error)\n", stderr)
            return .init(content: [.text("Unexpected setup error executing tool '\(params.name)': \(error.localizedDescription)")], isError: true)
        }
    }
    fputs("log: setupAndStartServer: registered CallTool handler.\n", stderr)

    // --- Transport and Start ---
    let transport = StdioTransport()
    fputs("log: setupAndStartServer: created StdioTransport.\n", stderr)

    fputs("log: setupAndStartServer: calling server.start()...\n", stderr)
    try await server.start(transport: transport)
    fputs("log: setupAndStartServer: server.start() completed (background task launched).\n", stderr)

    fputs("log: setupAndStartServer: returning server instance.\n", stderr)
    return server
}
|
|
1172
|
+
|
|
1173
|
+
// --- @main Entry Point ---
|
|
1174
|
+
@main
struct MCPServer {
    /// Writes `text` to standard error exactly as given (no newline is appended).
    private static func emit(_ text: String) {
        fputs(text, stderr)
    }

    /// Asynchronous process entry point.
    ///
    /// Obtains a started server from `setupAndStartServer()`, waits for it to
    /// complete, and then terminates the process: exit status 0 after the wait
    /// returns, exit status 1 if setup throws. Progress and failures are
    /// reported on stderr.
    static func main() async {
        emit("log: main: starting server (async).\n")

        // Optional hook: a custom logging backend could be installed here, e.g.
        // LoggingSystem.bootstrap { label in MultiplexLogHandler([...]) }

        do {
            emit("log: main: calling setupAndStartServer()...\n")
            let runningServer = try await setupAndStartServer()
            emit("log: main: setupAndStartServer() successful, server instance obtained.\n")

            emit("log: main: server started, calling server.waitUntilCompleted()...\n")
            // Suspends here until the server loop finishes or errors out.
            await runningServer.waitUntilCompleted()
            emit("log: main: server.waitUntilCompleted() returned. Server has stopped.\n")
        } catch {
            emit("error: main: server setup or run failed: \(error)\n")
            // MCP-specific failures get an extra line with their localized description.
            if let details = (error as? MCPError)?.localizedDescription {
                emit("error: main: MCPError details: \(details)\n")
            }
            // A single generic failure status is used; more specific codes could be added later.
            exit(1)
        }

        emit("log: main: Server processing finished gracefully. Exiting.\n")
        exit(0)
    }
}
|