mcp-server-macos-use 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1205 @@
1
+ import MCP
2
+ import Foundation
3
+ import CoreGraphics // Still needed for CGPoint, CGEventFlags
4
+ import ApplicationServices // For AXUIElement APIs (scroll-into-view, window bounds)
5
+ import AppKit // For NSEvent.mouseLocation (cursor position save/restore)
6
+ import MacosUseSDK // <-- Import the SDK
7
+
8
+ /// Encodes `value` as compact JSON text (sorted keys, unescaped slashes); logs to stderr and returns nil when encoding fails.
9
+ func serializeToJsonString<T: Encodable>(_ value: T) -> String? {
10
+ let encoder = JSONEncoder()
11
+ // Not pretty-printed: keys are sorted for deterministic output and "/" is left unescaped so paths/URLs stay readable.
12
+ encoder.outputFormatting = [.sortedKeys, .withoutEscapingSlashes]
13
+ do {
14
+ let jsonData = try encoder.encode(value)
15
+ return String(data: jsonData, encoding: .utf8)
16
+ } catch {
17
+ fputs("error: serializeToJsonString: failed to encode value to JSON: \(error)\n", stderr)
18
+ return nil
19
+ }
20
+ }
21
+
22
+ // --- Function to get arguments from MCP Value ---
23
+ /// Returns the string stored under `key`; throws `MCPError.invalidParams` when the key is absent or its `stringValue` is nil.
24
+ func getRequiredString(from args: [String: Value]?, key: String) throws -> String {
25
+ if let text = args?[key]?.stringValue {
26
+ return text
27
+ }
28
+ throw MCPError.invalidParams("Missing or invalid required string argument: '\(key)'")
29
+ }
30
+
31
+ func getRequiredDouble(from args: [String: Value]?, key: String) throws -> Double {
32
+ // Required numeric argument: a missing key throws invalidParams.
33
+ guard let raw = args?[key] else {
34
+ throw MCPError.invalidParams("Missing required number argument: '\(key)'")
35
+ }
36
+ if case .double(let number) = raw { return number }
37
+ // Anything that is neither `.double` nor `.int` is rejected.
38
+ guard case .int(let whole) = raw else {
39
+ throw MCPError.invalidParams("Invalid type for required number argument: '\(key)', expected Int or Double, got \(raw)")
40
+ }
41
+ // Integers are widened to Double; the conversion is logged to stderr.
42
+ fputs("log: getRequiredDouble: converting int \(whole) to double for key '\(key)'\n", stderr)
43
+ return Double(whole)
44
+ }
45
+
46
+ func getRequiredInt(from args: [String: Value]?, key: String) throws -> Int {
47
+ // Required integer argument: a missing key throws invalidParams.
48
+ guard let raw = args?[key] else {
49
+ throw MCPError.invalidParams("Missing required integer argument: '\(key)'")
50
+ }
51
+ // A Double is accepted only when it holds an exactly representable integer.
52
+ if let fractional = raw.doubleValue {
53
+ guard let converted = Int(exactly: fractional) else {
54
+ fputs("warning: getRequiredInt: received non-exact double \(fractional) for key '\(key)', expecting integer.\n", stderr)
55
+ throw MCPError.invalidParams("Invalid type for required integer argument: '\(key)', received non-exact Double \(fractional)")
56
+ }
57
+ fputs("log: getRequiredInt: converting exact double \(fractional) to int for key '\(key)'\n", stderr)
58
+ return converted
59
+ }
60
+ // Not a Double: the value must carry an Int directly.
61
+ if let whole = raw.intValue {
62
+ return whole
63
+ }
64
+ throw MCPError.invalidParams("Invalid type for required integer argument: '\(key)', expected Int or exact Double, got \(raw)")
65
+ }
66
+
67
+
68
+ // --- Get Optional arguments ---
69
+ /// Optional numeric argument: nil for an absent key or explicit null, otherwise the same Int/Double handling as the required variant.
70
+ func getOptionalDouble(from args: [String: Value]?, key: String) throws -> Double? {
71
+ guard let raw = args?[key], !raw.isNull else { return nil } // absent key and explicit null both mean "not provided"
72
+ if case .double(let number) = raw {
73
+ return number
74
+ }
75
+ // Anything that is neither `.double` nor `.int` is rejected.
76
+ guard case .int(let whole) = raw else {
77
+ throw MCPError.invalidParams("Invalid type for optional number argument: '\(key)', expected Int or Double, got \(raw)")
78
+ }
79
+ // Integers are widened to Double; the conversion is logged to stderr.
80
+ fputs("log: getOptionalDouble: converting int \(whole) to double for key '\(key)'\n", stderr)
81
+ return Double(whole)
82
+ }
83
+
84
+ func getOptionalInt(from args: [String: Value]?, key: String) throws -> Int? {
85
+ // Optional integer argument: an absent key and an explicit null both yield nil.
86
+ guard let raw = args?[key], !raw.isNull else { return nil }
87
+
88
+ if let fractional = raw.doubleValue {
89
+ guard let converted = Int(exactly: fractional) else {
90
+ fputs("warning: getOptionalInt: received non-exact double \(fractional) for key '\(key)', expecting integer.\n", stderr)
91
+ throw MCPError.invalidParams("Invalid type for optional integer argument: '\(key)', received non-exact Double \(fractional)")
92
+ }
93
+ fputs("log: getOptionalInt: converting exact double \(fractional) to int for key '\(key)'\n", stderr)
94
+ return converted
95
+ }
96
+ // Not a Double: the value must carry an Int directly.
97
+ if let whole = raw.intValue {
98
+ return whole
99
+ }
100
+ throw MCPError.invalidParams("Invalid type for optional integer argument: '\(key)', expected Int or exact Double, got \(raw)")
101
+ }
102
+
103
+ func getOptionalBool(from args: [String: Value]?, key: String) throws -> Bool? {
104
+ // Optional boolean argument: an absent key and an explicit null both yield nil; a non-Bool value throws.
105
+ guard let raw = args?[key], !raw.isNull else { return nil }
106
+ if let flag = raw.boolValue {
107
+ return flag
108
+ }
109
+ throw MCPError.invalidParams("Invalid type for optional boolean argument: '\(key)', expected Bool, got \(raw)")
110
+ }
111
+
112
+ /// Converts an array of modifier names into `CGEventFlags`; throws `MCPError.invalidParams` on a non-string element.
113
+ func parseFlags(from value: Value?) throws -> CGEventFlags {
114
+ // A missing value, or one that is not an array, means "no modifiers".
115
+ guard let entries = value?.arrayValue else { return [] }
116
+
117
+ // Accepted spellings (matched case-insensitively) and the flag each one sets.
118
+ let flagsByName: [String: CGEventFlags] = [
119
+ "capslock": .maskAlphaShift, "caps": .maskAlphaShift,
120
+ "shift": .maskShift,
121
+ "control": .maskControl, "ctrl": .maskControl,
122
+ "option": .maskAlternate, "opt": .maskAlternate, "alt": .maskAlternate,
123
+ "command": .maskCommand, "cmd": .maskCommand,
124
+ "help": .maskHelp,
125
+ "function": .maskSecondaryFn, "fn": .maskSecondaryFn,
126
+ "numericpad": .maskNumericPad, "numpad": .maskNumericPad,
127
+ ]
128
+ // Non-keyed state such as `.maskNonCoalesced` is deliberately not accepted here
129
+ // (less common for press simulation).
130
+
131
+ var combined: CGEventFlags = []
132
+ for entry in entries {
133
+ guard let name = entry.stringValue else {
134
+ throw MCPError.invalidParams("Invalid modifierFlags array: contains non-string element \(entry)")
135
+ }
136
+ if let flag = flagsByName[name.lowercased()] {
137
+ combined.insert(flag)
138
+ } else {
139
+ // Unknown names are tolerated: warn and continue rather than failing the call.
140
+ fputs("warning: parseFlags: unknown modifier flag string '\(name)', ignoring.\n", stderr)
141
+ }
142
+ }
143
+ return combined
144
+ }
145
+
146
+ // --- Enriched Data Structures (adds in_viewport metadata) ---
147
+
148
+ struct EnrichedElementData: Codable { // One traversal element (role, text, frame fields) plus an in_viewport flag.
149
+ var role: String
150
+ var text: String?
151
+ var x: Double?
152
+ var y: Double?
153
+ var width: Double?
154
+ var height: Double?
155
+ var in_viewport: Bool? // nil when it could not be computed (element has no x/y, or window bounds are unknown)
156
+ }
157
+
158
+ struct EnrichedResponseData: Codable { // A ResponseData's fields with its elements replaced by EnrichedElementData (see enrichResponseData).
159
+ let app_name: String
160
+ var elements: [EnrichedElementData]
161
+ var stats: Statistics
162
+ let processing_time_seconds: String
163
+ }
164
+
165
+ /// Diff element: role, text, viewport status, and coordinates for spatial targeting. buildToolResponse fills the coordinates only for added/removed entries; before/after entries of modified elements leave them nil.
166
+ struct DiffElementData: Codable {
167
+ var role: String
168
+ var text: String?
169
+ var in_viewport: Bool? // always nil for removed elements; nil elsewhere when x/y or window bounds are missing
170
+ var x: Double?
171
+ var y: Double?
172
+ var width: Double?
173
+ var height: Double?
174
+ }
175
+
176
+ struct DiffAttributeChange: Codable { // One attribute change of a modified element; buildToolResponse copies it field-for-field from the raw diff.
177
+ let attributeName: String
178
+ let addedText: String?
179
+ let removedText: String?
180
+ let oldValue: String?
181
+ let newValue: String?
182
+ }
183
+
184
+ struct DiffModifiedElement: Codable { // Before/after snapshots of one modified element plus its attribute changes.
185
+ let before: DiffElementData
186
+ let after: DiffElementData
187
+ let changes: [DiffAttributeChange] // buildToolResponse stores only non-coordinate changes here (x/y/width/height are filtered out)
188
+ }
189
+
190
+ struct EnrichedTraversalDiff: Codable { // Noise-filtered diff produced by buildToolResponse.
191
+ let added: [DiffElementData]
192
+ let removed: [DiffElementData]
193
+ let modified: [DiffModifiedElement]
194
+ }
195
+
196
+ /// Simplified response: returns either a full traversal (open/refresh) or a diff (click/type/press). buildToolResponse sets at most one of `traversal` / `diff`.
197
+ struct ToolResponse: Codable {
198
+ var openResult: AppOpenerResult?
199
+ var traversalPid: pid_t?
200
+ var traversal: EnrichedResponseData? // for open/refresh: full current state
201
+ var diff: EnrichedTraversalDiff? // for click/type/press: what changed
202
+ var primaryActionError: String?
203
+ var traversalError: String? // the after-traversal error if present, otherwise the before-traversal error
204
+ }
205
+
206
+ // --- Viewport Detection Helpers ---
207
+
208
+ /// Returns the frame of the first "AXWindow" element in the traversal that has a complete x/y/width/height, or nil if there is none.
209
+ func getWindowBoundsFromTraversal(_ responseData: ResponseData?) -> CGRect? {
210
+ // A nil response simply yields no candidates.
211
+ for candidate in responseData?.elements ?? [] where candidate.role == "AXWindow" {
212
+ guard let originX = candidate.x, let originY = candidate.y,
213
+ let width = candidate.width, let height = candidate.height else {
214
+ continue
215
+ }
216
+ return CGRect(x: originX, y: originY, width: width, height: height)
217
+ }
218
+ return nil
219
+ }
220
+
221
+ /// Find the window (element + frame) whose frame contains the given point.
222
+ /// Searches the app's AXWindows in the order returned; falls back to AXMainWindow if none matches. Returns nil when the main window or its frame cannot be read.
223
+ func getWindowContainingPoint(appElement: AXUIElement, point: CGPoint) -> (element: AXUIElement, bounds: CGRect)? {
224
+ var windowsRef: CFTypeRef?
225
+ if AXUIElementCopyAttributeValue(appElement, "AXWindows" as CFString, &windowsRef) == .success,
226
+ let windows = windowsRef as? [AXUIElement] {
227
+ for window in windows {
228
+ guard let frame = getAXElementFrame(window) else { continue } // skip windows whose frame cannot be read
229
+ if frame.contains(point) {
230
+ fputs("log: getWindowContainingPoint: matched window \(frame) for point \(point)\n", stderr)
231
+ return (window, frame)
232
+ }
233
+ }
234
+ }
235
+ // Fallback to main window (also reached when the AXWindows attribute could not be read)
236
+ var winRef: CFTypeRef?
237
+ guard AXUIElementCopyAttributeValue(appElement, "AXMainWindow" as CFString, &winRef) == .success else { return nil }
238
+ let win = winRef as! AXUIElement // force cast: relies on the .success result above having populated winRef
239
+ guard let frame = getAXElementFrame(win) else { return nil }
240
+ fputs("log: getWindowContainingPoint: no window contains \(point), falling back to main window \(frame)\n", stderr)
241
+ return (win, frame)
242
+ }
243
+
244
+ /// Get the main window's bounds (AXPosition + AXSize) directly from the accessibility API; logs a warning and returns nil if any attribute read fails.
245
+ func getWindowBoundsFromAPI(pid: pid_t) -> CGRect? {
246
+ let appElement = AXUIElementCreateApplication(pid)
247
+
248
+ var windowValue: CFTypeRef?
249
+ guard AXUIElementCopyAttributeValue(appElement, "AXMainWindow" as CFString, &windowValue) == .success else {
250
+ fputs("warning: getWindowBoundsFromAPI: could not get main window for pid \(pid)\n", stderr)
251
+ return nil
252
+ }
253
+ let windowElement = windowValue as! AXUIElement // force cast: relies on .success having populated windowValue
254
+
255
+ var positionValue: CFTypeRef?
256
+ guard AXUIElementCopyAttributeValue(windowElement, "AXPosition" as CFString, &positionValue) == .success else {
257
+ fputs("warning: getWindowBoundsFromAPI: could not get window position\n", stderr)
258
+ return nil
259
+ }
260
+ var position = CGPoint.zero
261
+ AXValueGetValue(positionValue as! AXValue, .cgPoint, &position) // result ignored: position stays .zero if extraction fails
262
+
263
+ var sizeValue: CFTypeRef?
264
+ guard AXUIElementCopyAttributeValue(windowElement, "AXSize" as CFString, &sizeValue) == .success else {
265
+ fputs("warning: getWindowBoundsFromAPI: could not get window size\n", stderr)
266
+ return nil
267
+ }
268
+ var size = CGSize.zero
269
+ AXValueGetValue(sizeValue as! AXValue, .cgSize, &size) // result ignored: size stays .zero if extraction fails
270
+
271
+ return CGRect(origin: position, size: size)
272
+ }
273
+
274
+ /// Copies `response` into an EnrichedResponseData, tagging every element with whether its (x, y) origin lies inside `windowBounds`.
275
+ func enrichResponseData(_ response: ResponseData, windowBounds: CGRect?) -> EnrichedResponseData {
276
+ // Build the enriched element list one entry at a time, preserving order.
277
+ var enriched: [EnrichedElementData] = []
278
+ enriched.reserveCapacity(response.elements.count)
279
+ for item in response.elements {
280
+ // in_viewport stays nil unless the element has an origin and the window bounds are known.
281
+ var visible: Bool? = nil
282
+ if let bounds = windowBounds, let x = item.x, let y = item.y {
283
+ visible = bounds.contains(CGPoint(x: x, y: y))
284
+ }
285
+ enriched.append(EnrichedElementData(
286
+ role: item.role, text: item.text, x: item.x, y: item.y,
287
+ width: item.width, height: item.height, in_viewport: visible
288
+ ))
289
+ }
290
+
291
+ // All remaining fields are carried over unchanged.
292
+ return EnrichedResponseData(
293
+ app_name: response.app_name, elements: enriched,
294
+ stats: response.stats, processing_time_seconds: response.processing_time_seconds
295
+ )
296
+ }
297
+
298
+ /// Look up text for a container element (AXRow, AXCell). The element's own non-empty text wins; otherwise:
299
+ /// 1. Coordinate containment — find the first text-bearing element whose (x, y) origin lies within the element's bounds
300
+ /// 2. List proximity — the traversal is depth-first, so children follow the parent;
301
+ /// find the element in the flat list and check the next few (up to 5) entries for text.
302
+ /// This handles off-screen elements whose children have no coordinates. Returns nil when neither strategy finds text.
303
+ func findTextForElement(_ element: ElementData, in traversal: ResponseData?) -> String? {
304
+ if let text = element.text, !text.isEmpty { return text }
305
+ guard let elements = traversal?.elements else { return nil }
306
+
307
+ // Strategy 1: coordinate containment (works for visible elements)
308
+ if let x = element.x, let y = element.y,
309
+ let w = element.width, let h = element.height {
310
+ let bounds = CGRect(x: x, y: y, width: w, height: h)
311
+ for el in elements {
312
+ if let elText = el.text, !elText.isEmpty,
313
+ let elX = el.x, let elY = el.y,
314
+ bounds.contains(CGPoint(x: elX, y: elY)) {
315
+ return elText
316
+ }
317
+ }
318
+ }
319
+
320
+ // Strategy 2: list proximity (works for off-screen elements)
321
+ // Locate the element by role plus approximate origin (each axis within 2 units) to handle floating-point differences
322
+ if let x = element.x, let y = element.y {
323
+ for (i, el) in elements.enumerated() {
324
+ if el.role == element.role,
325
+ let elX = el.x, let elY = el.y,
326
+ abs(elX - x) < 2, abs(elY - y) < 2 {
327
+ // Found the element — look at next few entries for text
328
+ for j in (i + 1)..<min(i + 6, elements.count) {
329
+ if let text = elements[j].text, !text.isEmpty {
330
+ return text
331
+ }
332
+ // Stop if we hit another row (left the subtree); the entry right after the match is exempt from this check
333
+ if elements[j].role.contains("AXRow") && j > i + 1 { break }
334
+ }
335
+ break // only the first matching entry is examined
336
+ }
337
+ }
338
+ }
339
+
340
+ return nil
341
+ }
342
+
343
+ /// Reports whether `role` names a scroll-bar component (scroll bar, value indicator, page/arrow button), which is treated as diff noise.
344
+ func isScrollBarNoise(_ role: String) -> Bool {
345
+ let markers = ["scrollbar", "scroll bar", "value indicator", "page button", "arrow button"]
346
+ let normalized = role.lowercased()
347
+ // Case-insensitive substring match against any marker.
348
+ return markers.contains { normalized.contains($0) }
349
+ }
350
+
351
+ /// Reports whether an element is a text-less structural container (row, cell, column, menu), which is treated as diff noise.
352
+ func isStructuralNoise(_ role: String, text: String?) -> Bool {
353
+ // Anything that carries non-empty text is never noise.
354
+ guard text?.isEmpty ?? true else { return false }
355
+ let normalized = role.lowercased()
356
+ let markers = ["axrow", "outline row", "axcell", "cell", "axcolumn", "column", "axmenu", "menu"]
357
+ // Case-insensitive substring match against any marker.
358
+ return markers.contains { normalized.contains($0) }
359
+ }
360
+
361
+ /// Build a ToolResponse from an ActionResult.
362
+ /// For actions with diff: returns only the noise-filtered diff (in_viewport is computed for added and modified entries; removed entries carry nil).
363
+ /// For open/refresh (or when no diff is available): returns the full after-traversal (enriched with in_viewport).
364
+ func buildToolResponse(_ result: ActionResult, hasDiff: Bool) -> ToolResponse {
365
+ var windowBounds = getWindowBoundsFromTraversal(result.traversalAfter)
366
+ ?? getWindowBoundsFromTraversal(result.traversalBefore)
367
+ if windowBounds == nil { // no AXWindow in either traversal: ask the accessibility API instead
368
+ if let pid = result.traversalPid ?? result.openResult?.pid {
369
+ windowBounds = getWindowBoundsFromAPI(pid: pid)
370
+ }
371
+ }
372
+
373
+ var response = ToolResponse()
374
+ response.openResult = result.openResult
375
+ response.traversalPid = result.traversalPid
376
+ response.primaryActionError = result.primaryActionError
377
+ response.traversalError = result.traversalAfterError ?? result.traversalBeforeError
378
+
379
+ if hasDiff, let rawDiff = result.traversalDiff {
380
+ let coordinateAttrs: Set<String> = ["x", "y", "width", "height"]
381
+
382
+ // Filter scroll-bar noise, add in_viewport, resolve text (coordinates are kept), then drop text-less structural containers
383
+ let filteredAdded = rawDiff.added
384
+ .filter { !isScrollBarNoise($0.role) }
385
+ .map { element -> DiffElementData in
386
+ let inViewport: Bool?
387
+ if let x = element.x, let y = element.y, let bounds = windowBounds {
388
+ inViewport = bounds.contains(CGPoint(x: x, y: y))
389
+ } else {
390
+ inViewport = nil
391
+ }
392
+ let text = findTextForElement(element, in: result.traversalAfter)
393
+ return DiffElementData(role: element.role, text: text, in_viewport: inViewport,
394
+ x: element.x, y: element.y, width: element.width, height: element.height)
395
+ }
396
+ .filter { !isStructuralNoise($0.role, text: $0.text) }
397
+
398
+ let filteredRemoved = rawDiff.removed
399
+ .filter { !isScrollBarNoise($0.role) }
400
+ .map { element -> DiffElementData in
401
+ let text = findTextForElement(element, in: result.traversalBefore) // removed elements only exist in the "before" traversal
402
+ return DiffElementData(role: element.role, text: text, in_viewport: nil,
403
+ x: element.x, y: element.y, width: element.width, height: element.height)
404
+ }
405
+ .filter { !isStructuralNoise($0.role, text: $0.text) }
406
+
407
+ // Filter modified: skip scroll-bar noise, drop coordinate-only changes
408
+ var filteredModified: [DiffModifiedElement] = []
409
+ for mod in rawDiff.modified {
410
+ if isScrollBarNoise(mod.before.role) || isScrollBarNoise(mod.after.role) { continue }
411
+
412
+ let meaningfulChanges = mod.changes.filter { !coordinateAttrs.contains($0.attributeName) }
413
+ if meaningfulChanges.isEmpty { continue }
414
+
415
+ let diffChanges = meaningfulChanges.map {
416
+ DiffAttributeChange(
417
+ attributeName: $0.attributeName,
418
+ addedText: $0.addedText,
419
+ removedText: $0.removedText,
420
+ oldValue: $0.oldValue,
421
+ newValue: $0.newValue
422
+ )
423
+ }
424
+
425
+ let beforeVP: Bool?
426
+ if let x = mod.before.x, let y = mod.before.y, let bounds = windowBounds {
427
+ beforeVP = bounds.contains(CGPoint(x: x, y: y))
428
+ } else { beforeVP = nil }
429
+
430
+ let afterVP: Bool?
431
+ if let x = mod.after.x, let y = mod.after.y, let bounds = windowBounds {
432
+ afterVP = bounds.contains(CGPoint(x: x, y: y))
433
+ } else { afterVP = nil }
434
+
435
+ let beforeText = findTextForElement(mod.before, in: result.traversalBefore)
436
+ let afterText = findTextForElement(mod.after, in: result.traversalAfter)
437
+
438
+ filteredModified.append(DiffModifiedElement( // coordinates are not passed for modified entries, so x/y/width/height stay nil
439
+ before: DiffElementData(role: mod.before.role, text: beforeText, in_viewport: beforeVP),
440
+ after: DiffElementData(role: mod.after.role, text: afterText, in_viewport: afterVP),
441
+ changes: diffChanges
442
+ ))
443
+ }
444
+
445
+ response.diff = EnrichedTraversalDiff(
446
+ added: filteredAdded,
447
+ removed: filteredRemoved,
448
+ modified: filteredModified
449
+ )
450
+ } else if let after = result.traversalAfter {
451
+ // Full traversal for open/refresh (also used when hasDiff is true but no diff was produced)
452
+ response.traversal = enrichResponseData(after, windowBounds: windowBounds)
453
+ }
454
+
455
+ return response
456
+ }
457
+
458
+ // --- Compact Summary Builder (file-based MCP responses) ---
459
+
460
+ /// Build a concise, newline-joined text summary for the MCP response instead of returning the full JSON.
461
+ /// Lines: status, pid/app (when known), file path, errors, a tool-specific `summary:` line, and up to 3 text changes from the diff.
462
+ func buildCompactSummary(toolName: String, params: CallTool.Parameters, toolResponse: ToolResponse, filepath: String) -> String {
463
+ var lines: [String] = []
464
+
465
+ // Status line: "error" if either the primary action or the traversal reported an error
466
+ let status = (toolResponse.primaryActionError != nil || toolResponse.traversalError != nil) ? "error" : "success"
467
+ lines.append("status: \(status)")
468
+
469
+ // PID and app
470
+ if let pid = toolResponse.traversalPid {
471
+ lines.append("pid: \(pid)")
472
+ }
473
+ if let appName = toolResponse.traversal?.app_name ?? toolResponse.openResult?.appName {
474
+ lines.append("app: \(appName)")
475
+ }
476
+
477
+ // File path
478
+ lines.append("file: \(filepath)")
479
+
480
+ // Errors if any
481
+ if let err = toolResponse.primaryActionError {
482
+ lines.append("error: \(err)")
483
+ }
484
+ if let err = toolResponse.traversalError {
485
+ lines.append("traversal_error: \(err)")
486
+ }
487
+
488
+ // Tool-specific summary line
489
+ let summaryLine: String
490
+ switch toolName {
491
+ case "macos-use_open_application_and_traverse":
492
+ let identifier = params.arguments?["identifier"]?.stringValue ?? "unknown"
493
+ if let traversal = toolResponse.traversal {
494
+ let total = traversal.elements.count
495
+ let visible = traversal.elements.filter { $0.in_viewport == true }.count
496
+ summaryLine = "Opened \(identifier) (PID:\(toolResponse.traversalPid ?? 0)). \(total) elements, \(visible) visible."
497
+ } else {
498
+ summaryLine = "Opened \(identifier) (PID:\(toolResponse.traversalPid ?? 0))."
499
+ }
500
+
501
+ case "macos-use_click_and_traverse":
502
+ let x = params.arguments?["x"]?.doubleValue ?? params.arguments?["x"]?.intValue.map(Double.init) ?? 0
503
+ let y = params.arguments?["y"]?.doubleValue ?? params.arguments?["y"]?.intValue.map(Double.init) ?? 0
504
+ let diffSummary = buildDiffSummary(toolResponse.diff)
505
+ summaryLine = "Clicked at (\(Int(x)),\(Int(y))). \(diffSummary)"
506
+
507
+ case "macos-use_type_and_traverse":
508
+ let text = params.arguments?["text"]?.stringValue ?? ""
509
+ let truncatedText = truncate(text, maxLen: 40) // same rule as before: first 40 characters + "..." when longer
510
+ let diffSummary = buildDiffSummary(toolResponse.diff)
511
+ summaryLine = "Typed '\(truncatedText)'. \(diffSummary)"
512
+
513
+ case "macos-use_press_key_and_traverse":
514
+ let keyName = params.arguments?["keyName"]?.stringValue ?? "unknown"
515
+ let mods = params.arguments?["modifierFlags"]?.arrayValue?.compactMap { $0.stringValue }.joined(separator: "+") ?? ""
516
+ let keyDesc = mods.isEmpty ? keyName : "\(mods)+\(keyName)" // missing or empty modifier list: key name only
517
+ let diffSummary = buildDiffSummary(toolResponse.diff)
518
+ summaryLine = "Pressed \(keyDesc). \(diffSummary)"
519
+
520
+ case "macos-use_scroll_and_traverse":
521
+ let x = params.arguments?["x"]?.doubleValue ?? params.arguments?["x"]?.intValue.map(Double.init) ?? 0
522
+ let y = params.arguments?["y"]?.doubleValue ?? params.arguments?["y"]?.intValue.map(Double.init) ?? 0
523
+ let deltaY = params.arguments?["deltaY"]?.intValue ?? params.arguments?["deltaY"]?.doubleValue.flatMap { Int(exactly: $0) } ?? 0 // also accept an exact-integer double, like x/y accept both kinds
524
+ let diffSummary = buildDiffSummary(toolResponse.diff)
525
+ summaryLine = "Scrolled deltaY=\(deltaY) at (\(Int(x)),\(Int(y))). \(diffSummary)"
526
+
527
+ case "macos-use_refresh_traversal":
528
+ if let traversal = toolResponse.traversal {
529
+ let total = traversal.elements.count
530
+ let visible = traversal.elements.filter { $0.in_viewport == true }.count
531
+ summaryLine = "Refreshed PID \(toolResponse.traversalPid ?? 0) (\(traversal.app_name)). \(total) elements, \(visible) visible."
532
+ } else {
533
+ summaryLine = "Refreshed PID \(toolResponse.traversalPid ?? 0)."
534
+ }
535
+
536
+ default:
537
+ summaryLine = "Tool \(toolName) completed."
538
+ }
539
+
540
+ lines.append("summary: \(summaryLine)")
541
+
542
+ // Append notable text changes from diff (up to 3, truncated); only the first 5 modified elements are scanned
543
+ if let diff = toolResponse.diff {
544
+ var textChanges: [String] = []
545
+
546
+ for mod in diff.modified.prefix(5) {
547
+ for change in mod.changes {
548
+ if change.attributeName == "text" || change.attributeName == "AXValue" {
549
+ let oldVal = truncate(change.oldValue ?? change.removedText ?? "", maxLen: 60)
550
+ let newVal = truncate(change.newValue ?? change.addedText ?? "", maxLen: 60)
551
+ if !oldVal.isEmpty || !newVal.isEmpty {
552
+ textChanges.append(" '\(oldVal)' -> '\(newVal)'")
553
+ }
554
+ }
555
+ }
556
+ if textChanges.count >= 3 { break }
557
+ }
558
+
559
+ if !textChanges.isEmpty {
560
+ lines.append("text_changes:")
561
+ lines.append(contentsOf: textChanges.prefix(3))
562
+ }
563
+ }
564
+
565
+ return lines.joined(separator: "\n")
566
+ }
567
+
568
+ /// Summarizes a diff as e.g. "3 added, 2 removed, 1 modified."; "No diff." when `diff` is nil and "No changes." when every list is empty.
569
+ func buildDiffSummary(_ diff: EnrichedTraversalDiff?) -> String {
570
+ guard let diff = diff else { return "No diff." }
571
+ // Fixed order: added, removed, modified; empty categories are left out.
572
+ let buckets = [(diff.added.count, "added"), (diff.removed.count, "removed"), (diff.modified.count, "modified")]
573
+ let parts = buckets.filter { $0.0 > 0 }.map { "\($0.0) \($0.1)" }
574
+ if parts.isEmpty { return "No changes." }
575
+ return parts.joined(separator: ", ") + "."
576
+ }
577
+
578
+ /// Returns `s` unchanged when it has at most `maxLen` characters; otherwise its first `maxLen` characters followed by "...".
579
+ func truncate(_ s: String, maxLen: Int) -> String {
580
+ if s.count > maxLen { return "\(s.prefix(maxLen))..." } else { return s }
581
+ }
582
+
583
+ // --- Direct AX Element Interaction ---
584
+
585
+ // --- Auto-Scroll via Scroll Wheel Events ---
586
+
587
+ /// Walk the AX tree (depth-first, at most `maxDepth` levels) to find an element whose frame contains the given point.
588
+ /// Returns the first match found in AXChildren order; `root` itself is returned only when no descendant matches. Always recurses into children since
589
+ /// scroll area content may extend beyond the parent's visible frame.
590
+ func findAXElementAtPoint(root: AXUIElement, point: CGPoint, maxDepth: Int = 25) -> AXUIElement? {
591
+ guard maxDepth > 0 else { return nil }
592
+
593
+ var posRef: CFTypeRef?
594
+ var sizeRef: CFTypeRef?
595
+ let hasFrame =
596
+ AXUIElementCopyAttributeValue(root, "AXPosition" as CFString, &posRef) == .success &&
597
+ AXUIElementCopyAttributeValue(root, "AXSize" as CFString, &sizeRef) == .success
598
+ var containsPoint = false
599
+ if hasFrame {
600
+ var pos = CGPoint.zero; var sz = CGSize.zero
601
+ AXValueGetValue(posRef as! AXValue, .cgPoint, &pos) // force cast: relies on .success having populated posRef
602
+ AXValueGetValue(sizeRef as! AXValue, .cgSize, &sz)
603
+ containsPoint = CGRect(origin: pos, size: sz).contains(point)
604
+ }
605
+
606
+ var childrenRef: CFTypeRef?
607
+ if AXUIElementCopyAttributeValue(root, "AXChildren" as CFString, &childrenRef) == .success,
608
+ let children = childrenRef as? [AXUIElement] {
609
+ for child in children {
610
+ if let found = findAXElementAtPoint(root: child, point: point, maxDepth: maxDepth - 1) {
611
+ return found // first matching subtree wins; later siblings are not examined
612
+ }
613
+ }
614
+ }
615
+ return containsPoint ? root : nil
616
+ }
617
+
618
+ /// Returns the element's non-empty text, preferring AXValue over AXTitle.
619
+ /// When the element has none and `searchChildren` is true, returns the first text found depth-first among its AXChildren (e.g. AXRow -> AXCell -> AXStaticText).
620
+ func getAXElementText(_ element: AXUIElement, searchChildren: Bool = true) -> String? {
621
+ // Own text first; empty strings and non-string values count as "no text".
622
+ for attribute in ["AXValue", "AXTitle"] {
623
+ var attributeRef: CFTypeRef?
624
+ if AXUIElementCopyAttributeValue(element, attribute as CFString, &attributeRef) == .success,
625
+ let ownText = attributeRef as? String, !ownText.isEmpty {
626
+ return ownText
627
+ }
628
+ }
629
+
630
+ // Container elements (AXRow, AXCell) carry no text themselves: look through the children.
631
+ guard searchChildren else { return nil }
632
+ var childrenRef: CFTypeRef?
633
+ guard AXUIElementCopyAttributeValue(element, "AXChildren" as CFString, &childrenRef) == .success,
634
+ let children = childrenRef as? [AXUIElement] else {
635
+ return nil
636
+ }
637
+ for child in children {
638
+ if let nested = getAXElementText(child, searchChildren: true) {
639
+ return nested
640
+ }
641
+ }
642
+ return nil
643
+ }
644
+
645
+ /// Get the frame (AXPosition + AXSize) of an AX element as a CGRect; returns nil if either attribute is missing, is not an AXValue, or cannot be decoded.
646
+ func getAXElementFrame(_ element: AXUIElement) -> CGRect? {
647
+ var posRef: CFTypeRef?, sizeRef: CFTypeRef?
648
+ guard AXUIElementCopyAttributeValue(element, "AXPosition" as CFString, &posRef) == .success,
649
+ AXUIElementCopyAttributeValue(element, "AXSize" as CFString, &sizeRef) == .success,
650
+ let posValue = posRef, CFGetTypeID(posValue) == AXValueGetTypeID(), // verify the type before the force casts below
651
+ let sizeValue = sizeRef, CFGetTypeID(sizeValue) == AXValueGetTypeID() else { return nil }
652
+ var pos = CGPoint.zero; var sz = CGSize.zero
653
+ guard AXValueGetValue(posValue as! AXValue, .cgPoint, &pos), AXValueGetValue(sizeValue as! AXValue, .cgSize, &sz) else { return nil }
654
+ return CGRect(origin: pos, size: sz)
655
+ }
656
+
657
+ /// Search the AX tree (depth-first, at most `maxDepth` levels) for an element whose text equals `text`, returning its frame center if within viewport.
658
+ /// The viewport is inset by 15 at the top and bottom (no horizontal inset), so a match whose center hugs the top/bottom edge is rejected and the search continues.
659
+ func findElementByText(root: AXUIElement, text: String, viewport: CGRect, maxDepth: Int = 25) -> CGPoint? {
660
+ guard maxDepth > 0 else { return nil }
661
+ let safeViewport = viewport.insetBy(dx: 0, dy: 15)
662
+
663
+ if let elementText = getAXElementText(root), elementText == text { // getAXElementText also searches children, so a container can match via a descendant's text
664
+ if let frame = getAXElementFrame(root) {
665
+ let center = CGPoint(x: frame.midX, y: frame.midY)
666
+ if safeViewport.contains(center) {
667
+ return center
668
+ }
669
+ }
670
+ }
671
+
672
+ var childrenRef: CFTypeRef?
673
+ if AXUIElementCopyAttributeValue(root, "AXChildren" as CFString, &childrenRef) == .success,
674
+ let children = childrenRef as? [AXUIElement] {
675
+ for child in children {
676
+ if let found = findElementByText(root: child, text: text, viewport: viewport, maxDepth: maxDepth - 1) { // passes the un-inset viewport, so the inset is not compounded per level
677
+ return found
678
+ }
679
+ }
680
+ }
681
+ return nil
682
+ }
683
+
684
+ /// If the target point is outside the window viewport:
685
+ /// 1. Find the element at that point in the AX tree to get its text
686
+ /// 2. Scroll incrementally, targeting the element's x coordinate
687
+ /// 3. After each scroll step, search the AX tree for the element by text
688
+ /// 4. When found within viewport, return its actual position
689
+ ///
690
+ /// If the element at the target has no text (common for far off-screen items),
691
+ /// scroll toward the target and keep probing until an element with text appears.
692
func scrollIntoViewIfNeeded(pid: pid_t, point: CGPoint) async -> CGPoint {
    // Contract:
    //   pid    - process whose window should contain `point`.
    //   point  - desired click location (already centered by the caller).
    //   return - the location to click after any scrolling; falls back to `point`
    //            whenever the window cannot be resolved, scrolling cannot help, or
    //            the target never shows up inside the window bounds.
    let appElement = AXUIElementCreateApplication(pid)

    guard let (windowElement, windowBounds) = getWindowContainingPoint(appElement: appElement, point: point) else {
        fputs("log: scrollIntoViewIfNeeded: could not get window bounds, using original point\n", stderr)
        return point
    }

    if windowBounds.contains(point) {
        // Already in viewport — the caller already centered the point from (x + w/2, y + h/2).
        // Do NOT try to refine via findAXElementAtPoint: the AX tree has overlapping full-width
        // group elements (e.g. message rows spanning the entire window) that would shadow sidebar
        // items and send clicks to the wrong location.
        fputs("log: scrollIntoViewIfNeeded: in viewport, using caller-centered point \(point)\n", stderr)
        return point
    }

    // Classify the vertical relationship explicitly. `CGRect.contains` excludes the max edge,
    // so a point sitting exactly on `maxY` counts as "below" here.
    let targetIsBelow = point.y >= windowBounds.maxY
    let targetIsAbove = point.y < windowBounds.minY
    guard targetIsBelow || targetIsAbove else {
        // The point is outside the window horizontally only; this function only scrolls
        // vertically, so scrolling would just move content away for no benefit.
        fputs("log: scrollIntoViewIfNeeded: point \(point) is only horizontally outside the window, vertical scrolling cannot help, using original point\n", stderr)
        return point
    }

    // Try to find the element and its text
    let targetElement = findAXElementAtPoint(root: windowElement, point: point)
    let targetText = targetElement.flatMap { getAXElementText($0) }

    let distance: CGFloat = targetIsBelow
        ? point.y - windowBounds.maxY
        : windowBounds.minY - point.y
    // Scale lines per step to distance: 1 line for tiny offsets, up to 3 for large ones.
    // Each scroll line ≈ 20-40px, so 1 line is enough when distance < 80px.
    let linesPerStep: Int32 = distance < 80 ? 1 : (distance < 250 ? 2 : 3)
    // Negative wheel delta is used when the target lies below the window, positive when above.
    let scrollDirection: Int32 = targetIsBelow ? -linesPerStep : linesPerStep
    let maxSteps = 30
    // Scroll events are delivered at the target's x, vertically centered in the window.
    let scrollLocation = CGPoint(x: point.x, y: windowBounds.midY)

    // Posts one scroll-wheel step; returns false when the event could not be created.
    func postScrollStep() -> Bool {
        guard let scrollEvent = CGEvent(scrollWheelEvent2Source: nil, units: .line, wheelCount: 1, wheel1: scrollDirection, wheel2: 0, wheel3: 0) else {
            return false
        }
        scrollEvent.location = scrollLocation
        scrollEvent.post(tap: .cghidEventTap)
        return true
    }

    if let targetText = targetText {
        // CASE 1: We have the element's text - scroll and search by text
        fputs("log: scrollIntoViewIfNeeded: target text=\"\(targetText)\", distance=\(distance)px, lines/step=\(linesPerStep), dir=\(scrollDirection)\n", stderr)

        for step in 1...maxSteps {
            guard postScrollStep() else { return point }
            try? await Task.sleep(nanoseconds: 100_000_000)

            // Debug: check where the target element currently is
            if step % 5 == 1 || step <= 3 {
                if let el = targetElement, let frame = getAXElementFrame(el) {
                    fputs("log: scrollIntoViewIfNeeded: step \(step): element frame=\(frame)\n", stderr)
                }
            }

            if let foundCenter = findElementByText(root: windowElement, text: targetText, viewport: windowBounds) {
                fputs("log: scrollIntoViewIfNeeded: found \"\(targetText)\" at \(foundCenter) after \(step) steps\n", stderr)
                // Short pause before handing the point back to the caller.
                try? await Task.sleep(nanoseconds: 100_000_000)
                return foundCenter
            }
        }
        fputs("warning: scrollIntoViewIfNeeded: could not scroll \"\(targetText)\" into view after \(maxSteps) steps\n", stderr)
        // Debug: final position
        if let el = targetElement, let frame = getAXElementFrame(el) {
            fputs("log: scrollIntoViewIfNeeded: final element frame=\(frame)\n", stderr)
        }
        return point
    }

    // CASE 2: No text at target coordinates (common for far off-screen sidebar items).
    // Strategy: scroll proportionally to the distance, then probe near the viewport
    // edge where new content appears. As we scroll, check the bottom/top edge for
    // newly revealed elements.
    fputs("log: scrollIntoViewIfNeeded: no text at \(point), distance=\(distance)px, scrolling and probing edge\n", stderr)

    // Probe position: near the edge where new content scrolls in
    let probeY = targetIsBelow
        ? windowBounds.maxY - 60  // probe near bottom edge
        : windowBounds.minY + 60  // probe near top edge
    let probePoint = CGPoint(x: point.x, y: probeY)

    var lastText: String? = nil

    for step in 1...maxSteps {
        guard postScrollStep() else { return point }
        try? await Task.sleep(nanoseconds: 150_000_000)

        // Probe near the viewport edge for newly revealed elements (logging only)
        if let element = findAXElementAtPoint(root: windowElement, point: probePoint),
           let text = getAXElementText(element),
           text != lastText {
            fputs("log: scrollIntoViewIfNeeded: edge element after \(step) steps: \"\(text)\"\n", stderr)
            lastText = text
        }

        // Also check: has the target point come into range? (for shorter distances)
        guard let element = findAXElementAtPoint(root: windowElement, point: point),
              let text = getAXElementText(element) else {
            continue
        }
        fputs("log: scrollIntoViewIfNeeded: target appeared after \(step) steps: \"\(text)\"\n", stderr)

        // Switch to text-based tracking
        if let foundCenter = findElementByText(root: windowElement, text: text, viewport: windowBounds) {
            fputs("log: scrollIntoViewIfNeeded: \"\(text)\" in viewport at \(foundCenter)\n", stderr)
            try? await Task.sleep(nanoseconds: 100_000_000)
            return foundCenter
        }

        // Nudge a few more steps in the same direction
        for step2 in 1...8 {
            guard postScrollStep() else { return point }
            try? await Task.sleep(nanoseconds: 150_000_000)
            if let fc = findElementByText(root: windowElement, text: text, viewport: windowBounds) {
                fputs("log: scrollIntoViewIfNeeded: \"\(text)\" at \(fc) after \(step+step2) steps\n", stderr)
                try? await Task.sleep(nanoseconds: 100_000_000)
                return fc
            }
        }

        // Use current position as fallback
        if let frame = getAXElementFrame(element) {
            return CGPoint(x: frame.midX, y: frame.midY)
        }
        return point
    }

    fputs("warning: scrollIntoViewIfNeeded: no element appeared after \(maxSteps) scroll steps\n", stderr)
    return point
}
818
+
819
/// Builds the MCP server, registers its tools and method handlers, and starts it on stdio.
///
/// Six tools are exposed (open, click, type, press key, scroll, refresh). Each `CallTool`
/// request is translated into a `PrimaryAction` plus `ActionOptions`, executed through
/// `performAction` on the main actor, written as full JSON under `/tmp/macos-use`, and
/// answered with a compact text summary that points at that file.
///
/// - Returns: The started `Server` instance.
/// - Throws: Any error thrown by `server.start(transport:)`.
func setupAndStartServer() async throws -> Server {
    fputs("log: setupAndStartServer: entering function.\n", stderr)

    // --- Define Schemas and Tools for Simplified Actions ---
    // (Schemas remain the same as they define the MCP interface)
    let openAppSchema: Value = .object([
        "type": .string("object"),
        "properties": .object([
            "identifier": .object(["type": .string("string"), "description": .string("REQUIRED. App name, path, or bundle ID.")])
        ]),
        "required": .array([.string("identifier")])
    ])
    let openAppTool = Tool(
        name: "macos-use_open_application_and_traverse",
        description: "Opens/activates an application and then traverses its accessibility tree. Returns a summary with file path. Use Grep/Read on the file to find specific elements.",
        inputSchema: openAppSchema
    )

    let clickSchema: Value = .object([
        "type": .string("object"),
        "properties": .object([
            "pid": .object(["type": .string("number"), "description": .string("REQUIRED. PID of the target application window.")]),
            "x": .object(["type": .string("number"), "description": .string("REQUIRED. X coordinate for the click (top-left of element).")]),
            "y": .object(["type": .string("number"), "description": .string("REQUIRED. Y coordinate for the click (top-left of element).")]),
            "width": .object(["type": .string("number"), "description": .string("Optional. Element width from traversal. When provided with height, click lands at center (x+width/2, y+height/2).")]),
            "height": .object(["type": .string("number"), "description": .string("Optional. Element height from traversal. When provided with width, click lands at center (x+width/2, y+height/2).")])
        ]),
        "required": .array([.string("pid"), .string("x"), .string("y")])
    ])
    let clickTool = Tool(
        name: "macos-use_click_and_traverse",
        description: "Simulates a click at the given coordinates within the app specified by PID, then traverses its accessibility tree. Returns a summary with file path. Use Grep/Read on the file to find specific elements.",
        inputSchema: clickSchema
    )

    let typeSchema: Value = .object([
        "type": .string("object"),
        "properties": .object([
            "pid": .object(["type": .string("number"), "description": .string("REQUIRED. PID of the target application window.")]),
            "text": .object(["type": .string("string"), "description": .string("REQUIRED. Text to type.")])
            // Add optional options here if needed later
        ]),
        "required": .array([.string("pid"), .string("text")])
    ])
    let typeTool = Tool(
        name: "macos-use_type_and_traverse",
        description: "Simulates typing text into the app specified by PID, then traverses its accessibility tree. Returns a summary with file path. Use Grep/Read on the file to find specific elements.",
        inputSchema: typeSchema
    )

    let refreshSchema: Value = .object([
        "type": .string("object"),
        "properties": .object([
            "pid": .object(["type": .string("number"), "description": .string("REQUIRED. PID of the application to traverse.")])
            // Add optional options here if needed later
        ]),
        "required": .array([.string("pid")])
    ])
    let refreshTool = Tool(
        name: "macos-use_refresh_traversal",
        description: "Traverses the accessibility tree of the application specified by PID. Returns a summary with file path. Use Grep/Read on the file to find specific elements.",
        inputSchema: refreshSchema
    )

    // Schema and tool for pressing a key
    let pressKeySchema: Value = .object([
        "type": .string("object"),
        "properties": .object([
            "pid": .object(["type": .string("number"), "description": .string("REQUIRED. PID of the target application window.")]),
            "keyName": .object(["type": .string("string"), "description": .string("REQUIRED. Name of the key to press (e.g., 'Return', 'Enter', 'Escape', 'Tab', 'up', 'down', 'left', 'right', 'PageUp', 'PageDown', 'Home', 'End', 'Delete', 'a', 'B'). Case-insensitive for special keys.")]),
            "modifierFlags": .object([ // Optional array of strings
                "type": .string("array"),
                "description": .string("OPTIONAL. Modifier keys to hold (e.g., ['Command', 'Shift']). Valid: CapsLock, Shift, Control, Option, Command, Function, NumericPad, Help."),
                "items": .object(["type": .string("string")]) // Items in the array must be strings
            ])
            // Add optional ActionOptions overrides here if needed later
        ]),
        "required": .array([.string("pid"), .string("keyName")])
    ])
    let pressKeyTool = Tool(
        name: "macos-use_press_key_and_traverse",
        description: "Simulates pressing a specific key (like Return, Enter, Escape, Tab, Arrow Keys, regular characters) with optional modifiers, then traverses the accessibility tree. Returns a summary with file path. Use Grep/Read on the file to find specific elements.",
        inputSchema: pressKeySchema
    )

    // Schema and tool for the scroll wheel
    let scrollSchema: Value = .object([
        "type": .string("object"),
        "properties": .object([
            "pid": .object(["type": .string("number"), "description": .string("REQUIRED. PID of the target application window.")]),
            "x": .object(["type": .string("number"), "description": .string("REQUIRED. X coordinate for the scroll location.")]),
            "y": .object(["type": .string("number"), "description": .string("REQUIRED. Y coordinate for the scroll location.")]),
            "deltaY": .object(["type": .string("integer"), "description": .string("REQUIRED. Vertical scroll amount in lines. Negative = scroll up (reveal content above), positive = scroll down (reveal content below).")]),
            "deltaX": .object(["type": .string("integer"), "description": .string("Optional. Horizontal scroll amount in lines. Negative = scroll left, positive = scroll right. Defaults to 0.")])
        ]),
        "required": .array([.string("pid"), .string("x"), .string("y"), .string("deltaY")])
    ])
    let scrollTool = Tool(
        name: "macos-use_scroll_and_traverse",
        description: "Simulates a scroll wheel event at the given coordinates within the app specified by PID, then traverses its accessibility tree. Returns a summary with file path. Use Grep/Read on the file to find specific elements.",
        inputSchema: scrollSchema
    )

    // --- Aggregate list of tools ---
    let allTools = [openAppTool, clickTool, typeTool, pressKeyTool, scrollTool, refreshTool]
    fputs("log: setupAndStartServer: defined \(allTools.count) tools: \(allTools.map { $0.name })\n", stderr)

    let server = Server(
        name: "SwiftMacOSServerDirect", // Renamed slightly
        version: "1.3.0", // Incremented version for major change
        capabilities: .init(
            tools: .init(listChanged: true)
        )
    )
    fputs("log: setupAndStartServer: server instance created (\(server.name)) version \(server.version).\n", stderr)

    // --- Dummy Handlers (ReadResource, ListResources, ListPrompts) ---
    // (Keep these as they are part of the MCP spec, even if unused for now)
    await server.withMethodHandler(ReadResource.self) { params in
        let uri = params.uri
        fputs("log: handler(ReadResource): received request for uri: \(uri) (dummy handler)\n", stderr)
        // In a real scenario, you might fetch resource content here
        return .init(contents: [.text("dummy content for \(uri)", uri: uri)])
    }
    fputs("log: setupAndStartServer: registered ReadResource handler (dummy).\n", stderr)

    await server.withMethodHandler(ListResources.self) { _ in
        fputs("log: handler(ListResources): received request (dummy handler).\n", stderr)
        // In a real scenario, list available resources
        return ListResources.Result(resources: [])
    }
    fputs("log: setupAndStartServer: registered ListResources handler (dummy).\n", stderr)

    await server.withMethodHandler(ListPrompts.self) { _ in
        fputs("log: handler(ListPrompts): received request (dummy handler).\n", stderr)
        // In a real scenario, list available prompts
        return ListPrompts.Result(prompts: [])
    }
    fputs("log: setupAndStartServer: registered ListPrompts handler (dummy).\n", stderr)

    // --- ListTools Handler ---
    await server.withMethodHandler(ListTools.self) { _ in
        fputs("log: handler(ListTools): received request.\n", stderr)
        let result = ListTools.Result(tools: allTools)
        fputs("log: handler(ListTools): responding with \(result.tools.count) tools: \(result.tools.map { $0.name })\n", stderr)
        return result
    }
    fputs("log: setupAndStartServer: registered ListTools handler.\n", stderr)

    // --- CallTool Handler (Direct SDK Call) ---
    await server.withMethodHandler(CallTool.self) { params in
        fputs("log: handler(CallTool): received request for tool: \(params.name).\n", stderr)
        fputs("log: handler(CallTool): arguments received (raw MCP): \(params.arguments?.debugDescription ?? "nil")\n", stderr)

        do {
            // --- Determine Action and Options from MCP Params ---
            let primaryAction: PrimaryAction
            var options = ActionOptions(traverseAfter: true, showAnimation: false) // MCP tools should return the tree by default, no visual highlighting

            // PID is required for click, type, press, scroll, refresh
            // Optional only for open (where SDK finds it)
            let pidOptionalInt = try getOptionalInt(from: params.arguments, key: "pid")

            // Convert Int? to pid_t?, rejecting values that do not fit
            let pidForOptions: pid_t?
            if let unwrappedPid = pidOptionalInt {
                guard let convertedPid = pid_t(exactly: unwrappedPid) else {
                    fputs("error: handler(CallTool): PID value \(unwrappedPid) is out of range for pid_t (Int32).\n", stderr)
                    throw MCPError.invalidParams("PID value \(unwrappedPid) is out of range.")
                }
                pidForOptions = convertedPid
            } else {
                pidForOptions = nil
            }
            options.pidForTraversal = pidForOptions

            // Potentially allow overriding default options from params
            options.traverseBefore = try getOptionalBool(from: params.arguments, key: "traverseBefore") ?? options.traverseBefore
            options.traverseAfter = try getOptionalBool(from: params.arguments, key: "traverseAfter") ?? options.traverseAfter
            options.showDiff = try getOptionalBool(from: params.arguments, key: "showDiff") ?? options.showDiff
            options.onlyVisibleElements = try getOptionalBool(from: params.arguments, key: "onlyVisibleElements") ?? options.onlyVisibleElements
            options.showAnimation = try getOptionalBool(from: params.arguments, key: "showAnimation") ?? options.showAnimation
            options.animationDuration = try getOptionalDouble(from: params.arguments, key: "animationDuration") ?? options.animationDuration
            options.delayAfterAction = try getOptionalDouble(from: params.arguments, key: "delayAfterAction") ?? options.delayAfterAction

            options = options.validated()
            fputs("log: handler(CallTool): constructed ActionOptions: \(options)\n", stderr)

            // Track whether this tool returns a diff (click/type/press/scroll) or full traversal (open/refresh)
            var hasDiff = false

            switch params.name {
            case openAppTool.name:
                let identifier = try getRequiredString(from: params.arguments, key: "identifier")
                primaryAction = .open(identifier: identifier)

            case clickTool.name:
                guard let reqPid = pidForOptions else { throw MCPError.invalidParams("Missing required 'pid' for click tool") }
                let x = try getRequiredDouble(from: params.arguments, key: "x")
                let y = try getRequiredDouble(from: params.arguments, key: "y")
                let w = try getOptionalDouble(from: params.arguments, key: "width")
                let h = try getOptionalDouble(from: params.arguments, key: "height")
                // If width+height provided, compute exact center; otherwise use raw point (AX will center via lookup)
                let rawPoint: CGPoint
                if let w = w, let h = h {
                    rawPoint = CGPoint(x: x + w / 2, y: y + h / 2)
                    fputs("log: click_and_traverse: centering (\(x),\(y)) + size(\(w)×\(h)) → \(rawPoint)\n", stderr)
                } else {
                    rawPoint = CGPoint(x: x, y: y)
                }
                // Activate the target app before clicking so the click registers correctly
                if let runningApp = NSRunningApplication(processIdentifier: reqPid) {
                    runningApp.activate(options: [])
                    fputs("log: click_and_traverse: activated app pid=\(reqPid)\n", stderr)
                    try? await Task.sleep(nanoseconds: 200_000_000) // 200ms for activation
                }
                // Auto-scroll element into view if it's outside the visible window area
                let adjustedPoint = await scrollIntoViewIfNeeded(pid: reqPid, point: rawPoint)
                primaryAction = .input(action: .click(point: adjustedPoint))
                options.pidForTraversal = reqPid
                options.showDiff = true // enables traverseBefore automatically
                hasDiff = true

            case typeTool.name:
                guard let reqPid = pidForOptions else { throw MCPError.invalidParams("Missing required 'pid' for type tool") }
                let text = try getRequiredString(from: params.arguments, key: "text")
                primaryAction = .input(action: .type(text: text))
                options.pidForTraversal = reqPid
                options.showDiff = true
                hasDiff = true

            case pressKeyTool.name:
                guard let reqPid = pidForOptions else { throw MCPError.invalidParams("Missing required 'pid' for press key tool") }
                let keyName = try getRequiredString(from: params.arguments, key: "keyName")
                let flags = try parseFlags(from: params.arguments?["modifierFlags"])
                fputs("log: handler(CallTool): parsed modifierFlags: \(flags)\n", stderr)
                primaryAction = .input(action: .press(keyName: keyName, flags: flags))
                options.pidForTraversal = reqPid
                options.showDiff = true
                hasDiff = true

            case scrollTool.name:
                guard let reqPid = pidForOptions else { throw MCPError.invalidParams("Missing required 'pid' for scroll tool") }
                let x = try getRequiredDouble(from: params.arguments, key: "x")
                let y = try getRequiredDouble(from: params.arguments, key: "y")
                let deltaY = try getRequiredInt(from: params.arguments, key: "deltaY")
                let deltaX = try getOptionalInt(from: params.arguments, key: "deltaX") ?? 0
                // The SDK takes Int32 deltas. A plain `Int32(_:)` conversion traps on overflow,
                // so validate client-supplied values (and do it before touching the app).
                guard let lineDeltaY = Int32(exactly: deltaY), let lineDeltaX = Int32(exactly: deltaX) else {
                    fputs("error: handler(CallTool): scroll delta (deltaX=\(deltaX), deltaY=\(deltaY)) is out of range for Int32.\n", stderr)
                    throw MCPError.invalidParams("Scroll delta values (deltaX=\(deltaX), deltaY=\(deltaY)) are out of range.")
                }
                let scrollPoint = CGPoint(x: x, y: y)
                // Activate the target app before scrolling so the event registers correctly
                if let runningApp = NSRunningApplication(processIdentifier: reqPid) {
                    runningApp.activate(options: [])
                    fputs("log: scroll_and_traverse: activated app pid=\(reqPid)\n", stderr)
                    try? await Task.sleep(nanoseconds: 200_000_000) // 200ms for activation
                }
                primaryAction = .input(action: .scroll(point: scrollPoint, deltaY: lineDeltaY, deltaX: lineDeltaX))
                options.pidForTraversal = reqPid
                options.showDiff = true
                hasDiff = true

            case refreshTool.name:
                guard let reqPid = pidForOptions else { throw MCPError.invalidParams("Missing required 'pid' for refresh tool") }
                primaryAction = .traverseOnly
                options.pidForTraversal = reqPid

            default:
                fputs("error: handler(CallTool): received request for unknown or unsupported tool: \(params.name)\n", stderr)
                throw MCPError.methodNotFound(params.name)
            }

            fputs("log: handler(CallTool): constructed PrimaryAction: \(primaryAction)\n", stderr)

            // --- Save cursor position before click actions so we can restore it after ---
            var savedCursorPos: CGPoint? = nil
            if case .input(let inputAction) = primaryAction, case .click = inputAction,
               let primaryScreen = NSScreen.screens.first {
                let nsPos = NSEvent.mouseLocation
                // NSEvent uses bottom-left origin; CGEvent uses top-left — flip Y
                let flippedPos = CGPoint(x: nsPos.x, y: primaryScreen.frame.height - nsPos.y)
                savedCursorPos = flippedPos
                fputs("log: handler(CallTool): saved cursor position \(flippedPos)\n", stderr)
            }

            // --- Execute the Action using MacosUseSDK ---
            let actionResult: ActionResult = await Task { @MainActor in
                fputs("log: handler(CallTool): executing performAction on MainActor via Task...\n", stderr)
                return await performAction(action: primaryAction, optionsInput: options)
            }.value
            fputs("log: handler(CallTool): performAction task completed.\n", stderr)

            // --- Restore cursor position after click ---
            if let pos = savedCursorPos,
               let moveEvent = CGEvent(mouseEventSource: nil, mouseType: .mouseMoved,
                                       mouseCursorPosition: pos, mouseButton: .left) {
                moveEvent.post(tap: .cghidEventTap)
                fputs("log: handler(CallTool): restored cursor to \(pos)\n", stderr)
            }

            // --- Build simplified response and serialize to JSON ---
            let toolResponse = buildToolResponse(actionResult, hasDiff: hasDiff)
            guard let resultJsonString = serializeToJsonString(toolResponse) else {
                fputs("error: handler(CallTool): failed to serialize ToolResponse to JSON for tool \(params.name).\n", stderr)
                throw MCPError.internalError("failed to serialize ToolResponse to JSON")
            }

            // --- Determine if it was an error overall ---
            let isError = actionResult.primaryActionError != nil ||
                          (options.traverseBefore && actionResult.traversalBeforeError != nil) ||
                          (options.traverseAfter && actionResult.traversalAfterError != nil)

            if isError {
                fputs("warning: handler(CallTool): Action resulted in an error state (primary: \(actionResult.primaryActionError ?? "nil"), before: \(actionResult.traversalBeforeError ?? "nil"), after: \(actionResult.traversalAfterError ?? "nil")).\n", stderr)
            }

            // --- Write full JSON to file, return compact summary ---
            let outputDir = "/tmp/macos-use"
            let timestamp = Int(Date().timeIntervalSince1970 * 1000) // ms precision to avoid collisions
            let safeName = params.name.replacingOccurrences(of: "macos-use_", with: "")
            let filename = "\(timestamp)_\(safeName).json"
            let filepath = "\(outputDir)/\(filename)"

            // The action itself already ran, so a failed write must not fail the call;
            // it is logged and surfaced in the summary instead of being silently dropped.
            var fileWriteFailure: String? = nil
            do {
                try FileManager.default.createDirectory(atPath: outputDir, withIntermediateDirectories: true)
                try resultJsonString.write(toFile: filepath, atomically: true, encoding: .utf8)
                fputs("log: handler(CallTool): wrote full response to \(filepath) (\(resultJsonString.utf8.count) bytes)\n", stderr)
            } catch {
                fileWriteFailure = "\(error)"
                fputs("error: handler(CallTool): failed to write full response to \(filepath): \(error)\n", stderr)
            }

            var summary = buildCompactSummary(toolName: params.name, params: params, toolResponse: toolResponse, filepath: filepath)
            if let failure = fileWriteFailure {
                // Tell the client the referenced file is missing so it does not try to read it.
                summary += "\nwarning: full response could not be written to \(filepath): \(failure)"
            }
            fputs("log: handler(CallTool): returning compact summary (\(summary.count) chars)\n", stderr)

            return .init(content: [.text(summary)], isError: isError)

        } catch let error as MCPError {
            fputs("error: handler(CallTool): MCPError occurred processing MCP params for tool '\(params.name)': \(error)\n", stderr)
            return .init(content: [.text("Error processing parameters for tool '\(params.name)': \(error.localizedDescription)")], isError: true)
        } catch {
            fputs("error: handler(CallTool): Unexpected error occurred setting up call for tool '\(params.name)': \(error)\n", stderr)
            return .init(content: [.text("Unexpected setup error executing tool '\(params.name)': \(error.localizedDescription)")], isError: true)
        }
    }
    fputs("log: setupAndStartServer: registered CallTool handler.\n", stderr)

    // --- Transport and Start ---
    let transport = StdioTransport()
    fputs("log: setupAndStartServer: created StdioTransport.\n", stderr)

    fputs("log: setupAndStartServer: calling server.start()...\n", stderr)
    try await server.start(transport: transport)
    fputs("log: setupAndStartServer: server.start() completed (background task launched).\n", stderr)

    fputs("log: setupAndStartServer: returning server instance.\n", stderr)
    return server
}
1172
+
1173
+ // --- @main Entry Point ---
1174
/// Executable entry point: starts the MCP server, waits for it to finish, and
/// terminates the process with status 0 on a clean stop or 1 on failure.
@main
struct MCPServer {
    /// Async entry point invoked by the Swift runtime.
    static func main() async {
        fputs("log: main: starting server (async).\n", stderr)

        // Configure logging if needed (optional)
        // LoggingSystem.bootstrap { label in MultiplexLogHandler([...]) }

        guard await runServerUntilStopped() else {
            // Consider more specific exit codes if useful
            exit(1) // Exit with error code
        }

        fputs("log: main: Server processing finished gracefully. Exiting.\n", stderr)
        exit(0) // Exit cleanly
    }

    /// Sets up the server and blocks until it stops.
    /// Returns `true` when the server ran to completion, `false` when setup threw
    /// (the failure is reported on stderr before returning).
    private static func runServerUntilStopped() async -> Bool {
        do {
            fputs("log: main: calling setupAndStartServer()...\n", stderr)
            let activeServer: Server = try await setupAndStartServer()
            fputs("log: main: setupAndStartServer() successful, server instance obtained.\n", stderr)

            fputs("log: main: server started, calling server.waitUntilCompleted()...\n", stderr)
            await activeServer.waitUntilCompleted() // Waits until the server loop finishes/errors
            fputs("log: main: server.waitUntilCompleted() returned. Server has stopped.\n", stderr)
            return true
        } catch {
            fputs("error: main: server setup or run failed: \(error)\n", stderr)
            if let mcpError = error as? MCPError {
                fputs("error: main: MCPError details: \(mcpError.localizedDescription)\n", stderr)
            }
            return false
        }
    }
}