mcp-server-macos-use 0.1.7 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Package.resolved CHANGED
@@ -15,7 +15,7 @@
15
15
  "location" : "https://github.com/mediar-ai/MacosUseSDK.git",
16
16
  "state" : {
17
17
  "branch" : "main",
18
- "revision" : "df6989da30b580c203828fa24e9e25a318c2d113"
18
+ "revision" : "aef5bfc4c0b5ab13e4981909a314d772d8a02ca2"
19
19
  }
20
20
  },
21
21
  {
@@ -201,6 +201,55 @@ struct ToolResponse: Codable {
201
201
  var diff: EnrichedTraversalDiff? // for click/type/press: what changed
202
202
  var primaryActionError: String?
203
203
  var traversalError: String?
204
+
205
+ // Cross-app handoff: populated when a different app became frontmost after the action
206
+ var appSwitchPid: pid_t?
207
+ var appSwitchAppName: String?
208
+ var appSwitchTraversal: EnrichedResponseData?
209
+
210
+ // Sheet/dialog detection
211
+ var sheetDetected: Bool?
212
+ }
213
+
214
+ // --- Sheet Detection ---
215
+
216
/// Looks for an `AXSheet` (file dialog, save sheet, etc.) attached to one of the app's windows
/// and returns its bounds for viewport scoping.
///
/// Only the direct children of each window are inspected, and at most the first
/// 50 children of a window are fetched.
///
/// - Parameter pid: Process identifier of the application to inspect.
/// - Returns: The frame reported by `getAXElementFrame` for the first sheet that has one,
///   or `nil` when the window list cannot be read or no sheet with a frame is found.
func findSheetBounds(pid: pid_t) -> CGRect? {
    // Same messaging timeout (seconds) is applied to every element queried below.
    let messagingTimeout: Float = 5.0
    let maxChildrenPerWindow: CFIndex = 50

    let appElement = AXUIElementCreateApplication(pid)
    AXUIElementSetMessagingTimeout(appElement, messagingTimeout)

    // Check all windows for AXSheet children
    var windowsRef: CFTypeRef?
    guard AXUIElementCopyAttributeValue(appElement, "AXWindows" as CFString, &windowsRef) == .success,
          let windows = windowsRef as? [AXUIElement] else { return nil }

    let axElementTypeID = AXUIElementGetTypeID()

    for window in windows {
        AXUIElementSetMessagingTimeout(window, messagingTimeout)

        var childCount: CFIndex = 0
        guard AXUIElementGetAttributeValueCount(window, kAXChildrenAttribute as CFString, &childCount) == .success,
              childCount > 0 else { continue }

        let fetchCount = min(maxChildrenPerWindow, childCount)
        var childrenRef: CFArray?
        guard AXUIElementCopyAttributeValues(window, kAXChildrenAttribute as CFString, 0, fetchCount, &childrenRef) == .success,
              let cfArray = childrenRef else { continue }

        for child in cfArray as [AnyObject] {
            // The bridge cast to a CF type is unchecked, so verify the CF type ID first
            // instead of messaging an object that is not an AXUIElement.
            guard CFGetTypeID(child) == axElementTypeID else { continue }
            let childElement = child as! AXUIElement
            AXUIElementSetMessagingTimeout(childElement, messagingTimeout)

            var roleRef: CFTypeRef?
            guard AXUIElementCopyAttributeValue(childElement, kAXRoleAttribute as CFString, &roleRef) == .success,
                  let role = roleRef as? String,
                  role == "AXSheet" else { continue }

            // A sheet without a readable frame is skipped; keep scanning the remaining children.
            if let frame = getAXElementFrame(childElement) {
                fputs("log: findSheetBounds: found AXSheet at \(frame)\n", stderr)
                return frame
            }
        }
    }
    return nil
}
205
254
 
206
255
  // --- Viewport Detection Helpers ---
@@ -225,6 +274,7 @@ func getWindowContainingPoint(appElement: AXUIElement, point: CGPoint) -> (eleme
225
274
  if AXUIElementCopyAttributeValue(appElement, "AXWindows" as CFString, &windowsRef) == .success,
226
275
  let windows = windowsRef as? [AXUIElement] {
227
276
  for window in windows {
277
+ AXUIElementSetMessagingTimeout(window, 5.0)
228
278
  guard let frame = getAXElementFrame(window) else { continue }
229
279
  if frame.contains(point) {
230
280
  fputs("log: getWindowContainingPoint: matched window \(frame) for point \(point)\n", stderr)
@@ -236,6 +286,7 @@ func getWindowContainingPoint(appElement: AXUIElement, point: CGPoint) -> (eleme
236
286
  var winRef: CFTypeRef?
237
287
  guard AXUIElementCopyAttributeValue(appElement, "AXMainWindow" as CFString, &winRef) == .success else { return nil }
238
288
  let win = winRef as! AXUIElement
289
+ AXUIElementSetMessagingTimeout(win, 5.0)
239
290
  guard let frame = getAXElementFrame(win) else { return nil }
240
291
  fputs("log: getWindowContainingPoint: no window contains \(point), falling back to main window \(frame)\n", stderr)
241
292
  return (win, frame)
@@ -244,6 +295,7 @@ func getWindowContainingPoint(appElement: AXUIElement, point: CGPoint) -> (eleme
244
295
  /// Get window bounds directly from the accessibility API
245
296
  func getWindowBoundsFromAPI(pid: pid_t) -> CGRect? {
246
297
  let appElement = AXUIElementCreateApplication(pid)
298
+ AXUIElementSetMessagingTimeout(appElement, 5.0)
247
299
 
248
300
  var windowValue: CFTypeRef?
249
301
  guard AXUIElementCopyAttributeValue(appElement, "AXMainWindow" as CFString, &windowValue) == .success else {
@@ -271,6 +323,133 @@ func getWindowBoundsFromAPI(pid: pid_t) -> CGRect? {
271
323
  return CGRect(origin: position, size: size)
272
324
  }
273
325
 
326
/// Capture a screenshot of a window belonging to a given PID and save it as PNG.
///
/// The first on-screen, layer-0 window owned by `pid` in the window list is captured.
/// If `clickPoint` is provided (screen coordinates) and the window's bounds can be read,
/// a red crosshair with a surrounding circle is drawn at that location.
///
/// - Parameters:
///   - pid: Process identifier whose window should be captured.
///   - outputPath: File system path the PNG is written to.
///   - clickPoint: Optional screen-space point to annotate.
/// - Returns: `outputPath` on success; `nil` when no window is found, the capture
///   fails or times out (3 s), PNG encoding fails, or the file cannot be written.
func captureWindowScreenshot(pid: pid_t, outputPath: String, clickPoint: CGPoint? = nil) -> String? {
    // Get the list of on-screen windows
    guard let windowList = CGWindowListCopyWindowInfo([.optionOnScreenOnly, .excludeDesktopElements], kCGNullWindowID) as? [[String: Any]] else {
        fputs("warning: captureWindowScreenshot: could not get window list\n", stderr)
        return nil
    }

    // Find the first normal-layer window owned by this PID
    var targetWindowID: CGWindowID? = nil
    var windowBoundsDict: CFDictionary? = nil
    for window in windowList {
        guard let ownerPID = window[kCGWindowOwnerPID as String] as? pid_t,
              ownerPID == pid,
              let layer = window[kCGWindowLayer as String] as? Int,
              layer == 0, // normal window layer
              let windowID = window[kCGWindowNumber as String] as? CGWindowID else {
            continue
        }
        targetWindowID = windowID
        // Conditional cast: a missing or malformed bounds entry only disables the
        // crosshair annotation instead of crashing the process.
        windowBoundsDict = (window[kCGWindowBounds as String] as? NSDictionary).map { $0 as CFDictionary }
        fputs("log: captureWindowScreenshot: found window \(windowID), boundsDict=\(windowBoundsDict != nil ? "yes" : "nil")\n", stderr)
        break
    }

    guard let windowID = targetWindowID else {
        fputs("warning: captureWindowScreenshot: no on-screen window found for PID \(pid)\n", stderr)
        return nil
    }

    // Run the capture on a background queue and give up after 3 seconds
    // (CGWindowListCreateImage can block indefinitely without screen recording permission).
    fputs("log: captureWindowScreenshot: starting image capture for window \(windowID)...\n", stderr)
    var capturedImage: CGImage? = nil
    let captureGroup = DispatchGroup()
    captureGroup.enter()
    DispatchQueue.global(qos: .userInitiated).async {
        capturedImage = CGWindowListCreateImage(.null, .optionIncludingWindow, windowID, [.boundsIgnoreFraming, .bestResolution])
        captureGroup.leave()
    }
    if captureGroup.wait(timeout: .now() + 3.0) == .timedOut {
        fputs("warning: captureWindowScreenshot: CGWindowListCreateImage timed out (3s) — likely missing screen recording permission\n", stderr)
        return nil
    }
    guard let image = capturedImage else {
        fputs("warning: captureWindowScreenshot: CGWindowListCreateImage failed for window \(windowID)\n", stderr)
        return nil
    }

    // Draw click point crosshair if provided
    var finalImage = image
    fputs("log: captureWindowScreenshot: clickPoint=\(clickPoint?.debugDescription ?? "nil"), boundsDict=\(windowBoundsDict != nil ? "present" : "nil")\n", stderr)
    if let clickPoint = clickPoint, let boundsDict = windowBoundsDict {
        // A failed conversion or an empty rect would turn the scale factors into inf/NaN,
        // so the annotation is skipped in that case and the plain screenshot is saved.
        if let windowRect = CGRect(dictionaryRepresentation: boundsDict),
           windowRect.width > 0, windowRect.height > 0 {
            if let annotatedImage = drawClickCrosshair(on: image, windowRect: windowRect, clickPoint: clickPoint) {
                finalImage = annotatedImage
            }
        } else {
            fputs("warning: captureWindowScreenshot: window bounds unusable, skipping crosshair\n", stderr)
        }
    }

    // Convert to PNG data and write to file
    let bitmapRep = NSBitmapImageRep(cgImage: finalImage)
    guard let pngData = bitmapRep.representation(using: .png, properties: [:]) else {
        fputs("warning: captureWindowScreenshot: failed to create PNG data\n", stderr)
        return nil
    }

    do {
        try pngData.write(to: URL(fileURLWithPath: outputPath))
        fputs("log: captureWindowScreenshot: saved screenshot to \(outputPath) (\(pngData.count) bytes)\n", stderr)
        return outputPath
    } catch {
        fputs("warning: captureWindowScreenshot: failed to write screenshot: \(error)\n", stderr)
        return nil
    }
}

/// Returns a copy of `image` with a red crosshair and circle drawn at `clickPoint`.
///
/// - Parameters:
///   - image: Captured window image.
///   - windowRect: The window's bounds in screen coordinates; must have non-zero size.
///   - clickPoint: Point to mark, in the same screen coordinate space as `windowRect`.
/// - Returns: The annotated image, or `nil` if the bitmap context or image could not be created.
private func drawClickCrosshair(on image: CGImage, windowRect: CGRect, clickPoint: CGPoint) -> CGImage? {
    let imageWidth = CGFloat(image.width)
    let imageHeight = CGFloat(image.height)

    // Scale factor: image pixels may differ from window points (Retina)
    let scaleX = imageWidth / windowRect.width
    let scaleY = imageHeight / windowRect.height
    let strokeScale = max(scaleX, scaleY)

    // Convert screen coordinates to image pixel coordinates relative to the window origin
    let localX = (clickPoint.x - windowRect.origin.x) * scaleX
    let localY = (clickPoint.y - windowRect.origin.y) * scaleY

    fputs("log: captureWindowScreenshot: drawing crosshair at screen(\(clickPoint.x),\(clickPoint.y)) → image(\(localX),\(localY)) windowOrigin(\(windowRect.origin.x),\(windowRect.origin.y)) scale(\(scaleX),\(scaleY))\n", stderr)

    let colorSpace = CGColorSpaceCreateDeviceRGB()
    guard let ctx = CGContext(data: nil, width: image.width, height: image.height,
                              bitsPerComponent: 8, bytesPerRow: 0, space: colorSpace,
                              bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue) else {
        return nil
    }

    // Draw original image
    ctx.draw(image, in: CGRect(x: 0, y: 0, width: imageWidth, height: imageHeight))

    // Flip Y for CoreGraphics drawing (origin is bottom-left)
    let drawX = localX
    let drawY = imageHeight - localY

    // Red crosshair
    ctx.setStrokeColor(CGColor(red: 1, green: 0, blue: 0, alpha: 1))
    ctx.setLineWidth(2.0 * strokeScale)

    let armLength: CGFloat = 15 * strokeScale
    // Horizontal line
    ctx.move(to: CGPoint(x: drawX - armLength, y: drawY))
    ctx.addLine(to: CGPoint(x: drawX + armLength, y: drawY))
    // Vertical line
    ctx.move(to: CGPoint(x: drawX, y: drawY - armLength))
    ctx.addLine(to: CGPoint(x: drawX, y: drawY + armLength))
    ctx.strokePath()

    // Circle around crosshair
    ctx.setLineWidth(1.5 * strokeScale)
    let radius: CGFloat = 10 * strokeScale
    ctx.addEllipse(in: CGRect(x: drawX - radius, y: drawY - radius, width: radius * 2, height: radius * 2))
    ctx.strokePath()

    return ctx.makeImage()
}
452
+
274
453
  /// Enrich a ResponseData with in_viewport metadata for each element
275
454
  func enrichResponseData(_ response: ResponseData, windowBounds: CGRect?) -> EnrichedResponseData {
276
455
  let enrichedElements = response.elements.map { element -> EnrichedElementData in
@@ -370,11 +549,21 @@ func buildToolResponse(_ result: ActionResult, hasDiff: Bool) -> ToolResponse {
370
549
  }
371
550
  }
372
551
 
552
+ // Check for AXSheet (file dialogs, save sheets) — use sheet bounds for viewport
553
+ var sheetDetected = false
554
+ if let pid = result.traversalPid ?? result.openResult?.pid {
555
+ if let sheetBounds = findSheetBounds(pid: pid) {
556
+ windowBounds = sheetBounds
557
+ sheetDetected = true
558
+ }
559
+ }
560
+
373
561
  var response = ToolResponse()
374
562
  response.openResult = result.openResult
375
563
  response.traversalPid = result.traversalPid
376
564
  response.primaryActionError = result.primaryActionError
377
565
  response.traversalError = result.traversalAfterError ?? result.traversalBeforeError
566
+ response.sheetDetected = sheetDetected ? true : nil
378
567
 
379
568
  if hasDiff, let rawDiff = result.traversalDiff {
380
569
  let coordinateAttrs: Set<String> = ["x", "y", "width", "height"]
@@ -459,7 +648,7 @@ func buildToolResponse(_ result: ActionResult, hasDiff: Bool) -> ToolResponse {
459
648
 
460
649
  /// Build a concise text summary for the MCP response instead of returning the full JSON.
461
650
  /// The full JSON is written to a file; this summary contains just the key info + file path.
462
- func buildCompactSummary(toolName: String, params: CallTool.Parameters, toolResponse: ToolResponse, filepath: String) -> String {
651
+ func buildCompactSummary(toolName: String, params: CallTool.Parameters, toolResponse: ToolResponse, filepath: String, fileSize: Int, screenshotPath: String? = nil) -> String {
463
652
  var lines: [String] = []
464
653
 
465
654
  // Status line
@@ -473,9 +662,26 @@ func buildCompactSummary(toolName: String, params: CallTool.Parameters, toolResp
473
662
  if let appName = toolResponse.traversal?.app_name ?? toolResponse.openResult?.appName {
474
663
  lines.append("app: \(appName)")
475
664
  }
665
+ if toolResponse.sheetDetected == true {
666
+ lines.append("dialog: AXSheet detected (viewport scoped to sheet bounds)")
667
+ }
476
668
 
477
- // File path
669
+ // File path + metadata
478
670
  lines.append("file: \(filepath)")
671
+ let elementCount: Int
672
+ if let traversal = toolResponse.traversal {
673
+ elementCount = traversal.elements.count
674
+ } else {
675
+ let added = toolResponse.diff?.added.count ?? 0
676
+ let removed = toolResponse.diff?.removed.count ?? 0
677
+ let modified = toolResponse.diff?.modified.count ?? 0
678
+ elementCount = added + removed + modified
679
+ }
680
+ lines.append("file_size: \(fileSize) bytes (\(elementCount) elements)")
681
+ lines.append("hint: grep -n 'AXButton' \(filepath) # search by role or text")
682
+ if let screenshotPath = screenshotPath {
683
+ lines.append("screenshot: \(screenshotPath)")
684
+ }
479
685
 
480
686
  // Errors if any
481
687
  if let err = toolResponse.primaryActionError {
@@ -499,10 +705,19 @@ func buildCompactSummary(toolName: String, params: CallTool.Parameters, toolResp
499
705
  }
500
706
 
501
707
  case "macos-use_click_and_traverse":
502
- let x = params.arguments?["x"]?.doubleValue ?? params.arguments?["x"]?.intValue.map(Double.init) ?? 0
503
- let y = params.arguments?["y"]?.doubleValue ?? params.arguments?["y"]?.intValue.map(Double.init) ?? 0
708
+ let isDoubleClick = params.arguments?["doubleClick"]?.boolValue ?? false
709
+ let isRightClick = params.arguments?["rightClick"]?.boolValue ?? false
710
+ let clickType = isDoubleClick ? "Double-clicked" : isRightClick ? "Right-clicked" : "Clicked"
504
711
  let diffSummary = buildDiffSummary(toolResponse.diff)
505
- summaryLine = "Clicked at (\(Int(x)),\(Int(y))). \(diffSummary)"
712
+ if let elemSearch = params.arguments?["element"]?.stringValue {
713
+ let roleFilter = params.arguments?["role"]?.stringValue
714
+ let roleDesc = roleFilter != nil ? " [\(roleFilter!)]" : ""
715
+ summaryLine = "\(clickType) element '\(elemSearch)'\(roleDesc). \(diffSummary)"
716
+ } else {
717
+ let x = params.arguments?["x"]?.doubleValue ?? params.arguments?["x"]?.intValue.map(Double.init) ?? 0
718
+ let y = params.arguments?["y"]?.doubleValue ?? params.arguments?["y"]?.intValue.map(Double.init) ?? 0
719
+ summaryLine = "\(clickType) at (\(Int(x)),\(Int(y))). \(diffSummary)"
720
+ }
506
721
 
507
722
  case "macos-use_type_and_traverse":
508
723
  let text = params.arguments?["text"]?.stringValue ?? ""
@@ -562,6 +777,30 @@ func buildCompactSummary(toolName: String, params: CallTool.Parameters, toolResp
562
777
  }
563
778
  }
564
779
 
780
+ // Inline visible interactive elements
781
+ if let traversal = toolResponse.traversal {
782
+ // Full traversal (open/refresh): show all visible interactive elements
783
+ let visLines = buildVisibleElementsSection(elements: traversal.elements, label: "visible_elements")
784
+ lines.append(contentsOf: visLines)
785
+ } else if let diff = toolResponse.diff, !diff.added.isEmpty {
786
+ // Diff (click/type/press/scroll): show newly added visible elements
787
+ let visLines = buildVisibleElementsSection(elements: diff.added, label: "visible_elements", interactiveCap: 20, textCap: 10)
788
+ lines.append(contentsOf: visLines)
789
+ }
790
+
791
+ // Cross-app handoff: a different app became frontmost after the action
792
+ if let switchPid = toolResponse.appSwitchPid {
793
+ let switchName = toolResponse.appSwitchAppName ?? "Unknown"
794
+ lines.append("app_switch: \(switchName) (PID: \(switchPid)) is now frontmost")
795
+ if let switchTraversal = toolResponse.appSwitchTraversal {
796
+ let total = switchTraversal.elements.count
797
+ let visible = switchTraversal.elements.filter { $0.in_viewport == true }.count
798
+ lines.append("app_switch_elements: \(total) total, \(visible) visible")
799
+ let visLines = buildVisibleElementsSection(elements: switchTraversal.elements, label: "app_switch_visible_elements")
800
+ lines.append(contentsOf: visLines)
801
+ }
802
+ }
803
+
565
804
  return lines.joined(separator: "\n")
566
805
  }
567
806
 
@@ -580,6 +819,154 @@ func truncate(_ s: String, maxLen: Int) -> String {
580
819
  s.count > maxLen ? String(s.prefix(maxLen)) + "..." : s
581
820
  }
582
821
 
822
/// Read-only view of the element fields used when rendering elements as text
/// (the inline visible-elements section and the flat text lines).
protocol VisibleElement {
    // Identity
    var role: String { get }
    var text: String? { get }

    // Geometry; each component is optional
    var x: Double? { get }
    var y: Double? { get }
    var width: Double? { get }
    var height: Double? { get }

    // Viewport flag; renderers in this file compare it against `true`
    var in_viewport: Bool? { get }
}

// Empty conformances: both element types are expected to declare these properties already.
extension EnrichedElementData: VisibleElement {}
extension DiffElementData: VisibleElement {}
834
+
835
/// Role prefixes considered interactive and therefore worth listing inline
/// in the compact summary.
private let interactiveRolePrefixes: [String] = [
    "AXButton",
    "AXLink",
    "AXTextField",
    "AXTextArea",
    "AXCheckBox",
    "AXRadioButton",
    "AXPopUpButton",
    "AXComboBox",
    "AXSlider",
    "AXMenuItem",
    "AXMenuButton",
    "AXTab"
]

/// Returns `true` when `role` starts with any entry of `interactiveRolePrefixes`.
///
/// Matching is by prefix, so a role such as "AXTabGroup" matches the "AXTab" entry.
private func isInteractiveRole(_ role: String) -> Bool {
    for candidate in interactiveRolePrefixes where role.hasPrefix(candidate) {
        return true
    }
    return false
}
846
+
847
/// Returns `true` when `role` starts with "AXStaticText".
private func isStaticTextRole(_ role: String) -> Bool {
    let staticTextPrefix = "AXStaticText"
    return role.hasPrefix(staticTextPrefix)
}
851
+
852
/// Renders the in-viewport, non-empty-text elements as a labelled list of summary lines.
///
/// Interactive roles are listed first (up to `interactiveCap`), followed by static text
/// (up to `textCap`). Elements with any other role are left out.
///
/// - Parameters:
///   - elements: Candidate elements, in the order they should be listed.
///   - label: Heading for the section; emitted as "<label>:".
///   - interactiveCap: Maximum number of interactive lines.
///   - textCap: Maximum number of static-text lines.
/// - Returns: The heading followed by the collected lines, or an empty array
///   when nothing qualified.
func buildVisibleElementsSection<T: VisibleElement>(elements: [T], label: String, interactiveCap: Int = 30, textCap: Int = 10) -> [String] {
    var interactiveLines: [String] = []
    var textLines: [String] = []

    for element in elements where element.in_viewport == true {
        guard let rawText = element.text, !rawText.isEmpty else { continue }

        let shortened = truncate(rawText, maxLen: 50)

        // Geometry suffix is included only when all four components are present.
        var geometry = ""
        if let x = element.x, let y = element.y, let w = element.width, let h = element.height {
            geometry = " (\(Int(x)),\(Int(y)) \(Int(w))×\(Int(h)))"
        }
        let rendered = " [\(element.role)] \"\(shortened)\"\(geometry)"

        // An interactive element beyond its cap is dropped; it never falls through
        // to the static-text bucket.
        if isInteractiveRole(element.role) {
            if interactiveLines.count < interactiveCap {
                interactiveLines.append(rendered)
            }
        } else if isStaticTextRole(element.role), textLines.count < textCap {
            textLines.append(rendered)
        }
    }

    guard !(interactiveLines.isEmpty && textLines.isEmpty) else { return [] }

    return ["\(label):"] + interactiveLines + textLines
}
888
+
889
+ // --- Flat Text Response File Builder ---
890
+
891
/// Formats one element as a space-separated, grep-friendly text line.
///
/// Layout: `<prefix>[role] "text" x:.. y:.. w:.. h:.. visible`, where every part
/// after the role is included only when the corresponding data is present.
/// Text longer than 80 characters is cut to 80 and suffixed with "...".
///
/// NOTE(review): the text is inserted as-is; embedded quotes or newlines are not escaped.
///
/// - Parameters:
///   - el: Element to render.
///   - prefix: String placed in front of the line (e.g. a diff marker).
/// - Returns: The formatted line.
func formatElementLine(_ el: VisibleElement, prefix: String = " ") -> String {
    let roleTag: String? = "[\(el.role)]"

    let quotedText: String? = el.text.flatMap { text -> String? in
        guard !text.isEmpty else { return nil }
        let shown: String
        if text.count > 80 {
            shown = String(text.prefix(80)) + "..."
        } else {
            shown = text
        }
        return "\"\(shown)\""
    }

    var position: String? = nil
    if let x = el.x, let y = el.y {
        position = "x:\(Int(x)) y:\(Int(y))"
    }

    var size: String? = nil
    if let w = el.width, let h = el.height {
        size = "w:\(Int(w)) h:\(Int(h))"
    }

    let visibility: String? = el.in_viewport == true ? "visible" : nil

    let segments = [roleTag, quotedText, position, size, visibility].compactMap { $0 }
    return prefix + segments.joined(separator: " ")
}
910
+
911
/// Renders a `ToolResponse` as flat, line-oriented text suitable for writing to a .txt file.
///
/// Sections are emitted in this order, each only when its data is present:
/// full traversal, diff (added `+`, removed `-`, modified `~`), cross-app handoff
/// traversal, then error lines. Header and error lines start with `#`.
///
/// - Parameter toolResponse: The response to render.
/// - Returns: All lines joined with "\n".
func buildFlatTextResponse(_ toolResponse: ToolResponse) -> String {
    var output: [String] = []

    // The same note is repeated under the traversal header and the diff header.
    let sheetNote: String? = toolResponse.sheetDetected == true ? "# dialog: AXSheet detected" : nil

    // Full traversal
    if let traversal = toolResponse.traversal {
        output.append("# \(traversal.app_name) — \(traversal.elements.count) elements (\(traversal.processing_time_seconds)s)")
        if let sheetNote = sheetNote {
            output.append(sheetNote)
        }
        output.append("")
        output.append(contentsOf: traversal.elements.map { formatElementLine($0) })
    }

    // Diff
    if let diff = toolResponse.diff {
        output.append("# diff: +\(diff.added.count) added, -\(diff.removed.count) removed, ~\(diff.modified.count) modified")
        if let sheetNote = sheetNote {
            output.append(sheetNote)
        }
        output.append("")
        output.append(contentsOf: diff.added.map { formatElementLine($0, prefix: "+ ") })
        output.append(contentsOf: diff.removed.map { formatElementLine($0, prefix: "- ") })

        for modification in diff.modified {
            // Each change prefers the full old/new value and falls back to the removed/added text.
            let described = modification.changes.map { change -> String in
                let before = change.oldValue ?? change.removedText ?? ""
                let after = change.newValue ?? change.addedText ?? ""
                return "\(change.attributeName): '\(before)' -> '\(after)'"
            }
            output.append("~ [\(modification.after.role)] \"\(modification.after.text ?? "")\" | \(described.joined(separator: ", "))")
        }
    }

    // Cross-app handoff
    if let switchTraversal = toolResponse.appSwitchTraversal {
        output.append("")
        output.append("# app_switch: \(toolResponse.appSwitchAppName ?? "Unknown") (PID: \(toolResponse.appSwitchPid ?? 0))")
        output.append(contentsOf: switchTraversal.elements.map { formatElementLine($0) })
    }

    // Errors
    if let actionError = toolResponse.primaryActionError {
        output.append("# error: \(actionError)")
    }
    if let traversalError = toolResponse.traversalError {
        output.append("# traversal_error: \(traversalError)")
    }

    return output.joined(separator: "\n")
}
969
+
583
970
  // --- Direct AX Element Interaction ---
584
971
 
585
972
  // --- Auto-Scroll via Scroll Wheel Events ---
@@ -691,6 +1078,7 @@ func findElementByText(root: AXUIElement, text: String, viewport: CGRect, maxDep
691
1078
  /// scroll toward the target and keep probing until an element with text appears.
692
1079
  func scrollIntoViewIfNeeded(pid: pid_t, point: CGPoint) async -> CGPoint {
693
1080
  let appElement = AXUIElementCreateApplication(pid)
1081
+ AXUIElementSetMessagingTimeout(appElement, 5.0)
694
1082
 
695
1083
  guard let (windowElement, windowBounds) = getWindowContainingPoint(appElement: appElement, point: point) else {
696
1084
  fputs("log: scrollIntoViewIfNeeded: could not get window bounds, using original point\n", stderr)
@@ -839,12 +1227,16 @@ func setupAndStartServer() async throws -> Server {
839
1227
  "type": .string("object"),
840
1228
  "properties": .object([
841
1229
  "pid": .object(["type": .string("number"), "description": .string("REQUIRED. PID of the target application window.")]),
842
- "x": .object(["type": .string("number"), "description": .string("REQUIRED. X coordinate for the click (top-left of element).")]),
843
- "y": .object(["type": .string("number"), "description": .string("REQUIRED. Y coordinate for the click (top-left of element).")]),
1230
+ "x": .object(["type": .string("number"), "description": .string("X coordinate for the click (top-left of element). Required unless 'element' is provided.")]),
1231
+ "y": .object(["type": .string("number"), "description": .string("Y coordinate for the click (top-left of element). Required unless 'element' is provided.")]),
844
1232
  "width": .object(["type": .string("number"), "description": .string("Optional. Element width from traversal. When provided with height, click lands at center (x+width/2, y+height/2).")]),
845
- "height": .object(["type": .string("number"), "description": .string("Optional. Element height from traversal. When provided with width, click lands at center (x+width/2, y+height/2).")])
1233
+ "height": .object(["type": .string("number"), "description": .string("Optional. Element height from traversal. When provided with width, click lands at center (x+width/2, y+height/2).")]),
1234
+ "element": .object(["type": .string("string"), "description": .string("Optional. Case-insensitive partial text match to find and click an element (e.g. \"Open\", \"Submit\"). Searches visible elements in the accessibility tree. First match is clicked. Alternative to x/y coordinates.")]),
1235
+ "role": .object(["type": .string("string"), "description": .string("Optional. Filter element search by accessibility role. Common roles: AXButton, AXLink, AXTextField, AXTextArea, AXCheckBox, AXRadioButton, AXPopUpButton, AXComboBox, AXSlider, AXMenuItem, AXMenuButton, AXTab, AXStaticText, AXImage, AXGroup, AXCell, AXRow.")]),
1236
+ "doubleClick": .object(["type": .string("boolean"), "description": .string("Optional. If true, performs a double-click instead of a single click.")]),
1237
+ "rightClick": .object(["type": .string("boolean"), "description": .string("Optional. If true, performs a right-click (context menu) instead of a left click.")])
846
1238
  ]),
847
- "required": .array([.string("pid"), .string("x"), .string("y")])
1239
+ "required": .array([.string("pid")])
848
1240
  ])
849
1241
  let clickTool = Tool(
850
1242
  name: "macos-use_click_and_traverse",
@@ -926,7 +1318,7 @@ func setupAndStartServer() async throws -> Server {
926
1318
 
927
1319
  let server = Server(
928
1320
  name: "SwiftMacOSServerDirect", // Renamed slightly
929
- version: "1.3.0", // Incremented version for major change
1321
+ version: "1.6.0", // Screenshot capture, AX timeout protection
930
1322
  capabilities: .init(
931
1323
  tools: .init(listChanged: true)
932
1324
  )
@@ -974,6 +1366,7 @@ func setupAndStartServer() async throws -> Server {
974
1366
  do {
975
1367
  // --- Determine Action and Options from MCP Params ---
976
1368
  let primaryAction: PrimaryAction
1369
+ var lastClickPoint: CGPoint? = nil // Track click location for screenshot annotation
977
1370
  var options = ActionOptions(traverseAfter: true, showAnimation: false) // MCP tools should return the tree by default, no visual highlighting
978
1371
 
979
1372
  // PID is required for click, type, press, refresh
@@ -1016,18 +1409,55 @@ func setupAndStartServer() async throws -> Server {
1016
1409
 
1017
1410
  case clickTool.name:
1018
1411
  guard let reqPid = pidForOptions else { throw MCPError.invalidParams("Missing required 'pid' for click tool") }
1019
- let x = try getRequiredDouble(from: params.arguments, key: "x")
1020
- let y = try getRequiredDouble(from: params.arguments, key: "y")
1021
- let w = try getOptionalDouble(from: params.arguments, key: "width")
1022
- let h = try getOptionalDouble(from: params.arguments, key: "height")
1023
- // If width+height provided, compute exact center; otherwise use raw point (AX will center via lookup)
1412
+
1413
+ let elementSearch = params.arguments?["element"]?.stringValue
1414
+ let roleFilter = params.arguments?["role"]?.stringValue
1415
+
1024
1416
  let rawPoint: CGPoint
1025
- if let w = w, let h = h {
1026
- rawPoint = CGPoint(x: x + w / 2, y: y + h / 2)
1027
- fputs("log: click_and_traverse: centering (\(x),\(y)) + size(\(w)×\(h)) \(rawPoint)\n", stderr)
1417
+ if let elementSearch = elementSearch {
1418
+ // --- Search mode: find element by text match ---
1419
+ fputs("log: click_and_traverse: searching for element '\(elementSearch)' (role: \(roleFilter ?? "any"))\n", stderr)
1420
+ let traversal: ResponseData = try await Task { @MainActor in
1421
+ return try traverseAccessibilityTree(pid: reqPid)
1422
+ }.value
1423
+ let windowBounds = getWindowBoundsFromTraversal(traversal) ?? getWindowBoundsFromAPI(pid: reqPid)
1424
+ let enriched = enrichResponseData(traversal, windowBounds: windowBounds)
1425
+
1426
+ let searchLower = elementSearch.lowercased()
1427
+ let matches = enriched.elements.filter { elem in
1428
+ guard let text = elem.text, !text.isEmpty,
1429
+ let _ = elem.x, let _ = elem.y,
1430
+ let w = elem.width, let h = elem.height,
1431
+ w > 0, h > 0 else { return false }
1432
+ let textMatch = text.lowercased().contains(searchLower)
1433
+ let roleMatch = roleFilter == nil || elem.role.lowercased().hasPrefix(roleFilter!.lowercased())
1434
+ return textMatch && roleMatch
1435
+ }
1436
+
1437
+ guard let match = matches.first else {
1438
+ let roleHint = roleFilter != nil ? " with role '\(roleFilter!)'" : ""
1439
+ throw MCPError.invalidParams("No visible element matching '\(elementSearch)'\(roleHint) found in app (PID \(reqPid)). Use the traversal file to find available elements.")
1440
+ }
1441
+ let matchX = match.x!, matchY = match.y!, matchW = match.width!, matchH = match.height!
1442
+ rawPoint = CGPoint(x: matchX + matchW / 2, y: matchY + matchH / 2)
1443
+ let matchCount = matches.count
1444
+ fputs("log: click_and_traverse: found \(matchCount) match(es) for '\(elementSearch)'. Clicking '\(match.text ?? "")' [\(match.role)] at center (\(rawPoint.x),\(rawPoint.y))\n", stderr)
1028
1445
  } else {
1029
- rawPoint = CGPoint(x: x, y: y)
1446
+ // --- Coordinate mode ---
1447
+ guard let x = try getOptionalDouble(from: params.arguments, key: "x"),
1448
+ let y = try getOptionalDouble(from: params.arguments, key: "y") else {
1449
+ throw MCPError.invalidParams("Either 'element' or both 'x' and 'y' must be provided for click tool")
1450
+ }
1451
+ let w = try getOptionalDouble(from: params.arguments, key: "width")
1452
+ let h = try getOptionalDouble(from: params.arguments, key: "height")
1453
+ if let w = w, let h = h {
1454
+ rawPoint = CGPoint(x: x + w / 2, y: y + h / 2)
1455
+ fputs("log: click_and_traverse: centering (\(x),\(y)) + size(\(w)×\(h)) → \(rawPoint)\n", stderr)
1456
+ } else {
1457
+ rawPoint = CGPoint(x: x, y: y)
1458
+ }
1030
1459
  }
1460
+
1031
1461
  // Activate the target app before clicking so the click registers correctly
1032
1462
  if let runningApp = NSRunningApplication(processIdentifier: reqPid) {
1033
1463
  runningApp.activate(options: [])
@@ -1036,7 +1466,16 @@ func setupAndStartServer() async throws -> Server {
1036
1466
  }
1037
1467
  // Auto-scroll element into view if it's outside the visible window area
1038
1468
  let adjustedPoint = await scrollIntoViewIfNeeded(pid: reqPid, point: rawPoint)
1039
- primaryAction = .input(action: .click(point: adjustedPoint))
1469
+ lastClickPoint = adjustedPoint
1470
+ let isDoubleClick = params.arguments?["doubleClick"]?.boolValue ?? false
1471
+ let isRightClick = params.arguments?["rightClick"]?.boolValue ?? false
1472
+ if isDoubleClick {
1473
+ primaryAction = .input(action: .doubleClick(point: adjustedPoint))
1474
+ } else if isRightClick {
1475
+ primaryAction = .input(action: .rightClick(point: adjustedPoint))
1476
+ } else {
1477
+ primaryAction = .input(action: .click(point: adjustedPoint))
1478
+ }
1040
1479
  options.pidForTraversal = reqPid
1041
1480
  options.showDiff = true // enables traverseBefore automatically
1042
1481
  hasDiff = true
@@ -1116,11 +1555,33 @@ func setupAndStartServer() async throws -> Server {
1116
1555
  }
1117
1556
 
1118
1557
  // --- Build simplified response and serialize to JSON ---
1119
- let toolResponse = buildToolResponse(actionResult, hasDiff: hasDiff)
1120
- guard let resultJsonString = serializeToJsonString(toolResponse) else {
1121
- fputs("error: handler(CallTool): failed to serialize ToolResponse to JSON for tool \(params.name).\n", stderr)
1122
- throw MCPError.internalError("failed to serialize ToolResponse to JSON")
1558
+ var toolResponse = buildToolResponse(actionResult, hasDiff: hasDiff)
1559
+
1560
+ // --- Detect cross-app handoff ---
1561
+ // After diff-based actions, check if a different app became frontmost
1562
+ if hasDiff, let originalPid = options.pidForTraversal {
1563
+ let frontmostPid = NSWorkspace.shared.frontmostApplication?.processIdentifier
1564
+ if let newPid = frontmostPid, newPid != originalPid {
1565
+ let frontmostName = NSWorkspace.shared.frontmostApplication?.localizedName ?? "Unknown"
1566
+ fputs("log: handler(CallTool): app switch detected! Original PID \(originalPid) -> new frontmost PID \(newPid) (\(frontmostName))\n", stderr)
1567
+ toolResponse.appSwitchPid = newPid
1568
+ toolResponse.appSwitchAppName = frontmostName
1569
+
1570
+ // Traverse the new frontmost app
1571
+ do {
1572
+ let newTraversal: ResponseData = try await Task { @MainActor in
1573
+ return try traverseAccessibilityTree(pid: newPid)
1574
+ }.value
1575
+ let newWindowBounds = getWindowBoundsFromTraversal(newTraversal)
1576
+ ?? getWindowBoundsFromAPI(pid: newPid)
1577
+ toolResponse.appSwitchTraversal = enrichResponseData(newTraversal, windowBounds: newWindowBounds)
1578
+ fputs("log: handler(CallTool): traversed new frontmost app \(frontmostName) (PID \(newPid)): \(newTraversal.elements.count) elements\n", stderr)
1579
+ } catch {
1580
+ fputs("warning: handler(CallTool): failed to traverse new frontmost app \(frontmostName) (PID \(newPid)): \(error)\n", stderr)
1581
+ }
1582
+ }
1123
1583
  }
1584
+ let resultTextString = buildFlatTextResponse(toolResponse)
1124
1585
 
1125
1586
  // --- Determine if it was an error overall ---
1126
1587
  let isError = actionResult.primaryActionError != nil ||
@@ -1131,18 +1592,28 @@ func setupAndStartServer() async throws -> Server {
1131
1592
  fputs("warning: handler(CallTool): Action resulted in an error state (primary: \(actionResult.primaryActionError ?? "nil"), before: \(actionResult.traversalBeforeError ?? "nil"), after: \(actionResult.traversalAfterError ?? "nil")).\n", stderr)
1132
1593
  }
1133
1594
 
1134
- // --- Write full JSON to file, return compact summary ---
1595
+ // --- Write flat text to file, return compact summary ---
1135
1596
  let outputDir = "/tmp/macos-use"
1136
1597
  try? FileManager.default.createDirectory(atPath: outputDir, withIntermediateDirectories: true)
1137
1598
 
1138
1599
  let timestamp = Int(Date().timeIntervalSince1970 * 1000) // ms precision to avoid collisions
1139
1600
  let safeName = params.name.replacingOccurrences(of: "macos-use_", with: "")
1140
- let filename = "\(timestamp)_\(safeName).json"
1601
+ let filename = "\(timestamp)_\(safeName).txt"
1141
1602
  let filepath = "\(outputDir)/\(filename)"
1142
- try? resultJsonString.write(toFile: filepath, atomically: true, encoding: .utf8)
1143
- fputs("log: handler(CallTool): wrote full response to \(filepath) (\(resultJsonString.count) bytes)\n", stderr)
1603
+ try? resultTextString.write(toFile: filepath, atomically: true, encoding: .utf8)
1604
+ fputs("log: handler(CallTool): wrote full response to \(filepath) (\(resultTextString.count) bytes)\n", stderr)
1605
+
1606
+ // --- Capture window screenshot ---
1607
+ var screenshotPath: String? = nil
1608
+ let screenshotFilename = "\(timestamp)_\(safeName).png"
1609
+ let screenshotFilepath = "\(outputDir)/\(screenshotFilename)"
1610
+ // Use the effective PID (could be app-switched)
1611
+ let screenshotPid = toolResponse.appSwitchPid ?? toolResponse.traversalPid ?? options.pidForTraversal
1612
+ if let pid = screenshotPid {
1613
+ screenshotPath = captureWindowScreenshot(pid: pid, outputPath: screenshotFilepath, clickPoint: lastClickPoint)
1614
+ }
1144
1615
 
1145
- let summary = buildCompactSummary(toolName: params.name, params: params, toolResponse: toolResponse, filepath: filepath)
1616
+ let summary = buildCompactSummary(toolName: params.name, params: params, toolResponse: toolResponse, filepath: filepath, fileSize: resultTextString.count, screenshotPath: screenshotPath)
1146
1617
  fputs("log: handler(CallTool): returning compact summary (\(summary.count) chars)\n", stderr)
1147
1618
 
1148
1619
  return .init(content: [.text(summary)], isError: isError)
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mcp-server-macos-use",
3
- "version": "0.1.7",
3
+ "version": "0.1.10",
4
4
  "description": "MCP server that lets AI agents control any macOS application through accessibility APIs",
5
5
  "bin": {
6
6
  "mcp-server-macos-use": "bin/mcp-server-macos-use"