mcp-server-macos-use 0.1.7 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Package.resolved +1 -1
- package/Sources/main.swift +500 -29
- package/extensions-check.png +0 -0
- package/package.json +1 -1
package/Package.resolved
CHANGED
package/Sources/main.swift
CHANGED
|
@@ -201,6 +201,55 @@ struct ToolResponse: Codable {
|
|
|
201
201
|
var diff: EnrichedTraversalDiff? // for click/type/press: what changed
|
|
202
202
|
var primaryActionError: String?
|
|
203
203
|
var traversalError: String?
|
|
204
|
+
|
|
205
|
+
// Cross-app handoff: populated when a different app became frontmost after the action
|
|
206
|
+
var appSwitchPid: pid_t?
|
|
207
|
+
var appSwitchAppName: String?
|
|
208
|
+
var appSwitchTraversal: EnrichedResponseData?
|
|
209
|
+
|
|
210
|
+
// Sheet/dialog detection
|
|
211
|
+
var sheetDetected: Bool?
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
// --- Sheet Detection ---
|
|
215
|
+
|
|
216
|
+
/// Locate the first `AXSheet` (file dialog, save sheet, ...) attached to any window of the
/// application identified by `pid` and report its frame so callers can scope the viewport to it.
///
/// Only the first 50 direct children of each window are inspected. A sheet whose frame cannot be
/// read is skipped and the search continues with the remaining children and windows.
///
/// - Parameter pid: Process identifier of the application to inspect.
/// - Returns: The sheet frame as reported by `getAXElementFrame`, or `nil` when the window list
///   cannot be read or no sheet with a readable frame is found.
func findSheetBounds(pid: pid_t) -> CGRect? {
    let application = AXUIElementCreateApplication(pid)
    AXUIElementSetMessagingTimeout(application, 5.0)

    // Enumerate every window the application exposes.
    var rawWindows: CFTypeRef?
    let windowsStatus = AXUIElementCopyAttributeValue(application, "AXWindows" as CFString, &rawWindows)
    guard windowsStatus == .success, let appWindows = rawWindows as? [AXUIElement] else {
        return nil
    }

    for appWindow in appWindows {
        AXUIElementSetMessagingTimeout(appWindow, 5.0)

        // Ask for the child count first so the fetch below can be capped.
        var total: CFIndex = 0
        let countStatus = AXUIElementGetAttributeValueCount(appWindow, kAXChildrenAttribute as CFString, &total)
        if countStatus != .success || total <= 0 { continue }

        var rawChildren: CFArray?
        let childrenStatus = AXUIElementCopyAttributeValues(
            appWindow, kAXChildrenAttribute as CFString, 0, min(CFIndex(50), total), &rawChildren)
        guard childrenStatus == .success, let fetched = rawChildren else { continue }

        for candidate in fetched as [AnyObject] {
            // Children come back as AXUIElement references (a CoreFoundation type).
            let element = candidate as! AXUIElement
            AXUIElementSetMessagingTimeout(element, 5.0)

            var rawRole: CFTypeRef?
            let roleStatus = AXUIElementCopyAttributeValue(element, kAXRoleAttribute as CFString, &rawRole)
            guard roleStatus == .success, let role = rawRole as? String, role == "AXSheet" else { continue }
            guard let sheetFrame = getAXElementFrame(element) else { continue }

            fputs("log: findSheetBounds: found AXSheet at \(sheetFrame)\n", stderr)
            return sheetFrame
        }
    }
    return nil
}
|
|
205
254
|
|
|
206
255
|
// --- Viewport Detection Helpers ---
|
|
@@ -225,6 +274,7 @@ func getWindowContainingPoint(appElement: AXUIElement, point: CGPoint) -> (eleme
|
|
|
225
274
|
if AXUIElementCopyAttributeValue(appElement, "AXWindows" as CFString, &windowsRef) == .success,
|
|
226
275
|
let windows = windowsRef as? [AXUIElement] {
|
|
227
276
|
for window in windows {
|
|
277
|
+
AXUIElementSetMessagingTimeout(window, 5.0)
|
|
228
278
|
guard let frame = getAXElementFrame(window) else { continue }
|
|
229
279
|
if frame.contains(point) {
|
|
230
280
|
fputs("log: getWindowContainingPoint: matched window \(frame) for point \(point)\n", stderr)
|
|
@@ -236,6 +286,7 @@ func getWindowContainingPoint(appElement: AXUIElement, point: CGPoint) -> (eleme
|
|
|
236
286
|
var winRef: CFTypeRef?
|
|
237
287
|
guard AXUIElementCopyAttributeValue(appElement, "AXMainWindow" as CFString, &winRef) == .success else { return nil }
|
|
238
288
|
let win = winRef as! AXUIElement
|
|
289
|
+
AXUIElementSetMessagingTimeout(win, 5.0)
|
|
239
290
|
guard let frame = getAXElementFrame(win) else { return nil }
|
|
240
291
|
fputs("log: getWindowContainingPoint: no window contains \(point), falling back to main window \(frame)\n", stderr)
|
|
241
292
|
return (win, frame)
|
|
@@ -244,6 +295,7 @@ func getWindowContainingPoint(appElement: AXUIElement, point: CGPoint) -> (eleme
|
|
|
244
295
|
/// Get window bounds directly from the accessibility API
|
|
245
296
|
func getWindowBoundsFromAPI(pid: pid_t) -> CGRect? {
|
|
246
297
|
let appElement = AXUIElementCreateApplication(pid)
|
|
298
|
+
AXUIElementSetMessagingTimeout(appElement, 5.0)
|
|
247
299
|
|
|
248
300
|
var windowValue: CFTypeRef?
|
|
249
301
|
guard AXUIElementCopyAttributeValue(appElement, "AXMainWindow" as CFString, &windowValue) == .success else {
|
|
@@ -271,6 +323,133 @@ func getWindowBoundsFromAPI(pid: pid_t) -> CGRect? {
|
|
|
271
323
|
return CGRect(origin: position, size: size)
|
|
272
324
|
}
|
|
273
325
|
|
|
326
|
+
/// Capture a screenshot of a window belonging to a given PID and save it as a PNG.
///
/// The window captured is the first on-screen, layer-0 entry owned by `pid` in the list returned
/// by `CGWindowListCopyWindowInfo`. The capture itself runs on a background queue and is
/// abandoned after 3 seconds, because `CGWindowListCreateImage` can block indefinitely without
/// screen recording permission.
///
/// - Parameters:
///   - pid: Process identifier whose window should be captured.
///   - outputPath: File system path the PNG is written to.
///   - clickPoint: Optional point in screen coordinates. When provided and the window bounds are
///     known and non-empty, a red crosshair with a surrounding circle is drawn at that location.
/// - Returns: `outputPath` on success, `nil` on any failure (a warning is written to stderr).
func captureWindowScreenshot(pid: pid_t, outputPath: String, clickPoint: CGPoint? = nil) -> String? {
    // Get the list of on-screen windows
    guard let windowList = CGWindowListCopyWindowInfo([.optionOnScreenOnly, .excludeDesktopElements], kCGNullWindowID) as? [[String: Any]] else {
        fputs("warning: captureWindowScreenshot: could not get window list\n", stderr)
        return nil
    }

    // Find the first normal-layer window owned by this PID
    var targetWindowID: CGWindowID? = nil
    var windowBoundsDict: CFDictionary? = nil
    for window in windowList {
        guard let ownerPID = window[kCGWindowOwnerPID as String] as? pid_t,
              ownerPID == pid,
              let layer = window[kCGWindowLayer as String] as? Int,
              layer == 0, // normal window layer
              let windowID = window[kCGWindowNumber as String] as? CGWindowID else {
            continue
        }
        targetWindowID = windowID
        // CoreFoundation cast: a conditional `as?` is not available for CF types.
        windowBoundsDict = window[kCGWindowBounds as String] as! CFDictionary?
        fputs("log: captureWindowScreenshot: found window \(windowID), boundsDict=\(windowBoundsDict != nil ? "yes" : "nil")\n", stderr)
        break
    }

    guard let windowID = targetWindowID else {
        fputs("warning: captureWindowScreenshot: no on-screen window found for PID \(pid)\n", stderr)
        return nil
    }

    // Capture just this window on a background queue and bail out if it takes too long
    // (CGWindowListCreateImage can block indefinitely without screen recording permission).
    fputs("log: captureWindowScreenshot: starting image capture for window \(windowID)...\n", stderr)
    var capturedImage: CGImage? = nil
    let captureGroup = DispatchGroup()
    captureGroup.enter()
    DispatchQueue.global(qos: .userInitiated).async {
        capturedImage = CGWindowListCreateImage(.null, .optionIncludingWindow, windowID, [.boundsIgnoreFraming, .bestResolution])
        captureGroup.leave()
    }
    let captureResult = captureGroup.wait(timeout: .now() + 3.0)
    if captureResult == .timedOut {
        fputs("warning: captureWindowScreenshot: CGWindowListCreateImage timed out (3s) — likely missing screen recording permission\n", stderr)
        return nil
    }
    guard let image = capturedImage else {
        fputs("warning: captureWindowScreenshot: CGWindowListCreateImage failed for window \(windowID)\n", stderr)
        return nil
    }

    // Draw click point crosshair if provided
    var finalImage = image
    fputs("log: captureWindowScreenshot: clickPoint=\(clickPoint?.debugDescription ?? "nil"), boundsDict=\(windowBoundsDict != nil ? "present" : "nil")\n", stderr)
    if let clickPoint = clickPoint, let boundsDict = windowBoundsDict {
        var windowRect = CGRect.zero
        let boundsDecoded = CGRectMakeWithDictionaryRepresentation(boundsDict, &windowRect)

        // Without usable, non-empty bounds the scale below would divide by zero and produce
        // inf/NaN coordinates, so the annotation is skipped and the plain capture is saved.
        if boundsDecoded, windowRect.width > 0, windowRect.height > 0 {
            // Convert screen coordinates to image coordinates.
            // With .boundsIgnoreFraming the capture ignores framing effects such as the shadow,
            // so the image is treated as covering exactly the window bounds; the ratio between
            // image pixels and window points (Retina) is the scale factor.
            let imageWidth = CGFloat(image.width)
            let imageHeight = CGFloat(image.height)
            let scaleX = imageWidth / windowRect.width
            let scaleY = imageHeight / windowRect.height
            let localX = (clickPoint.x - windowRect.origin.x) * scaleX
            let localY = (clickPoint.y - windowRect.origin.y) * scaleY

            fputs("log: captureWindowScreenshot: drawing crosshair at screen(\(clickPoint.x),\(clickPoint.y)) → image(\(localX),\(localY)) windowOrigin(\(windowRect.origin.x),\(windowRect.origin.y)) scale(\(scaleX),\(scaleY))\n", stderr)

            // Draw crosshair on a copy of the image
            let colorSpace = CGColorSpaceCreateDeviceRGB()
            if let ctx = CGContext(data: nil, width: image.width, height: image.height,
                                   bitsPerComponent: 8, bytesPerRow: 0, space: colorSpace,
                                   bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue) {
                // Draw original image
                ctx.draw(image, in: CGRect(x: 0, y: 0, width: imageWidth, height: imageHeight))

                // Flip Y for CoreGraphics drawing (origin is bottom-left)
                let drawX = localX
                let drawY = imageHeight - localY

                // All stroke dimensions are expressed in points and scaled to pixels once.
                let markerScale = max(scaleX, scaleY)

                // Red crosshair
                ctx.setStrokeColor(CGColor(red: 1, green: 0, blue: 0, alpha: 1))
                ctx.setLineWidth(2.0 * markerScale)

                let armLength: CGFloat = 15 * markerScale
                // Horizontal line
                ctx.move(to: CGPoint(x: drawX - armLength, y: drawY))
                ctx.addLine(to: CGPoint(x: drawX + armLength, y: drawY))
                // Vertical line
                ctx.move(to: CGPoint(x: drawX, y: drawY - armLength))
                ctx.addLine(to: CGPoint(x: drawX, y: drawY + armLength))
                ctx.strokePath()

                // Circle around crosshair
                ctx.setLineWidth(1.5 * markerScale)
                let radius: CGFloat = 10 * markerScale
                ctx.addEllipse(in: CGRect(x: drawX - radius, y: drawY - radius, width: radius * 2, height: radius * 2))
                ctx.strokePath()

                if let annotatedImage = ctx.makeImage() {
                    finalImage = annotatedImage
                }
            }
        } else {
            fputs("warning: captureWindowScreenshot: window bounds unavailable or empty, skipping crosshair\n", stderr)
        }
    }

    // Convert to PNG data and write to file
    let bitmapRep = NSBitmapImageRep(cgImage: finalImage)
    guard let pngData = bitmapRep.representation(using: .png, properties: [:]) else {
        fputs("warning: captureWindowScreenshot: failed to create PNG data\n", stderr)
        return nil
    }

    do {
        try pngData.write(to: URL(fileURLWithPath: outputPath))
        fputs("log: captureWindowScreenshot: saved screenshot to \(outputPath) (\(pngData.count) bytes)\n", stderr)
        return outputPath
    } catch {
        fputs("warning: captureWindowScreenshot: failed to write screenshot: \(error)\n", stderr)
        return nil
    }
}
|
|
452
|
+
|
|
274
453
|
/// Enrich a ResponseData with in_viewport metadata for each element
|
|
275
454
|
func enrichResponseData(_ response: ResponseData, windowBounds: CGRect?) -> EnrichedResponseData {
|
|
276
455
|
let enrichedElements = response.elements.map { element -> EnrichedElementData in
|
|
@@ -370,11 +549,21 @@ func buildToolResponse(_ result: ActionResult, hasDiff: Bool) -> ToolResponse {
|
|
|
370
549
|
}
|
|
371
550
|
}
|
|
372
551
|
|
|
552
|
+
// Check for AXSheet (file dialogs, save sheets) — use sheet bounds for viewport
|
|
553
|
+
var sheetDetected = false
|
|
554
|
+
if let pid = result.traversalPid ?? result.openResult?.pid {
|
|
555
|
+
if let sheetBounds = findSheetBounds(pid: pid) {
|
|
556
|
+
windowBounds = sheetBounds
|
|
557
|
+
sheetDetected = true
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
|
|
373
561
|
var response = ToolResponse()
|
|
374
562
|
response.openResult = result.openResult
|
|
375
563
|
response.traversalPid = result.traversalPid
|
|
376
564
|
response.primaryActionError = result.primaryActionError
|
|
377
565
|
response.traversalError = result.traversalAfterError ?? result.traversalBeforeError
|
|
566
|
+
response.sheetDetected = sheetDetected ? true : nil
|
|
378
567
|
|
|
379
568
|
if hasDiff, let rawDiff = result.traversalDiff {
|
|
380
569
|
let coordinateAttrs: Set<String> = ["x", "y", "width", "height"]
|
|
@@ -459,7 +648,7 @@ func buildToolResponse(_ result: ActionResult, hasDiff: Bool) -> ToolResponse {
|
|
|
459
648
|
|
|
460
649
|
/// Build a concise text summary for the MCP response instead of returning the full JSON.
|
|
461
650
|
/// The full JSON is written to a file; this summary contains just the key info + file path.
|
|
462
|
-
func buildCompactSummary(toolName: String, params: CallTool.Parameters, toolResponse: ToolResponse, filepath: String) -> String {
|
|
651
|
+
func buildCompactSummary(toolName: String, params: CallTool.Parameters, toolResponse: ToolResponse, filepath: String, fileSize: Int, screenshotPath: String? = nil) -> String {
|
|
463
652
|
var lines: [String] = []
|
|
464
653
|
|
|
465
654
|
// Status line
|
|
@@ -473,9 +662,26 @@ func buildCompactSummary(toolName: String, params: CallTool.Parameters, toolResp
|
|
|
473
662
|
if let appName = toolResponse.traversal?.app_name ?? toolResponse.openResult?.appName {
|
|
474
663
|
lines.append("app: \(appName)")
|
|
475
664
|
}
|
|
665
|
+
if toolResponse.sheetDetected == true {
|
|
666
|
+
lines.append("dialog: AXSheet detected (viewport scoped to sheet bounds)")
|
|
667
|
+
}
|
|
476
668
|
|
|
477
|
-
// File path
|
|
669
|
+
// File path + metadata
|
|
478
670
|
lines.append("file: \(filepath)")
|
|
671
|
+
let elementCount: Int
|
|
672
|
+
if let traversal = toolResponse.traversal {
|
|
673
|
+
elementCount = traversal.elements.count
|
|
674
|
+
} else {
|
|
675
|
+
let added = toolResponse.diff?.added.count ?? 0
|
|
676
|
+
let removed = toolResponse.diff?.removed.count ?? 0
|
|
677
|
+
let modified = toolResponse.diff?.modified.count ?? 0
|
|
678
|
+
elementCount = added + removed + modified
|
|
679
|
+
}
|
|
680
|
+
lines.append("file_size: \(fileSize) bytes (\(elementCount) elements)")
|
|
681
|
+
lines.append("hint: grep -n 'AXButton' \(filepath) # search by role or text")
|
|
682
|
+
if let screenshotPath = screenshotPath {
|
|
683
|
+
lines.append("screenshot: \(screenshotPath)")
|
|
684
|
+
}
|
|
479
685
|
|
|
480
686
|
// Errors if any
|
|
481
687
|
if let err = toolResponse.primaryActionError {
|
|
@@ -499,10 +705,19 @@ func buildCompactSummary(toolName: String, params: CallTool.Parameters, toolResp
|
|
|
499
705
|
}
|
|
500
706
|
|
|
501
707
|
case "macos-use_click_and_traverse":
|
|
502
|
-
let
|
|
503
|
-
let
|
|
708
|
+
let isDoubleClick = params.arguments?["doubleClick"]?.boolValue ?? false
|
|
709
|
+
let isRightClick = params.arguments?["rightClick"]?.boolValue ?? false
|
|
710
|
+
let clickType = isDoubleClick ? "Double-clicked" : isRightClick ? "Right-clicked" : "Clicked"
|
|
504
711
|
let diffSummary = buildDiffSummary(toolResponse.diff)
|
|
505
|
-
|
|
712
|
+
if let elemSearch = params.arguments?["element"]?.stringValue {
|
|
713
|
+
let roleFilter = params.arguments?["role"]?.stringValue
|
|
714
|
+
let roleDesc = roleFilter != nil ? " [\(roleFilter!)]" : ""
|
|
715
|
+
summaryLine = "\(clickType) element '\(elemSearch)'\(roleDesc). \(diffSummary)"
|
|
716
|
+
} else {
|
|
717
|
+
let x = params.arguments?["x"]?.doubleValue ?? params.arguments?["x"]?.intValue.map(Double.init) ?? 0
|
|
718
|
+
let y = params.arguments?["y"]?.doubleValue ?? params.arguments?["y"]?.intValue.map(Double.init) ?? 0
|
|
719
|
+
summaryLine = "\(clickType) at (\(Int(x)),\(Int(y))). \(diffSummary)"
|
|
720
|
+
}
|
|
506
721
|
|
|
507
722
|
case "macos-use_type_and_traverse":
|
|
508
723
|
let text = params.arguments?["text"]?.stringValue ?? ""
|
|
@@ -562,6 +777,30 @@ func buildCompactSummary(toolName: String, params: CallTool.Parameters, toolResp
|
|
|
562
777
|
}
|
|
563
778
|
}
|
|
564
779
|
|
|
780
|
+
// Inline visible interactive elements
|
|
781
|
+
if let traversal = toolResponse.traversal {
|
|
782
|
+
// Full traversal (open/refresh): show all visible interactive elements
|
|
783
|
+
let visLines = buildVisibleElementsSection(elements: traversal.elements, label: "visible_elements")
|
|
784
|
+
lines.append(contentsOf: visLines)
|
|
785
|
+
} else if let diff = toolResponse.diff, !diff.added.isEmpty {
|
|
786
|
+
// Diff (click/type/press/scroll): show newly added visible elements
|
|
787
|
+
let visLines = buildVisibleElementsSection(elements: diff.added, label: "visible_elements", interactiveCap: 20, textCap: 10)
|
|
788
|
+
lines.append(contentsOf: visLines)
|
|
789
|
+
}
|
|
790
|
+
|
|
791
|
+
// Cross-app handoff: a different app became frontmost after the action
|
|
792
|
+
if let switchPid = toolResponse.appSwitchPid {
|
|
793
|
+
let switchName = toolResponse.appSwitchAppName ?? "Unknown"
|
|
794
|
+
lines.append("app_switch: \(switchName) (PID: \(switchPid)) is now frontmost")
|
|
795
|
+
if let switchTraversal = toolResponse.appSwitchTraversal {
|
|
796
|
+
let total = switchTraversal.elements.count
|
|
797
|
+
let visible = switchTraversal.elements.filter { $0.in_viewport == true }.count
|
|
798
|
+
lines.append("app_switch_elements: \(total) total, \(visible) visible")
|
|
799
|
+
let visLines = buildVisibleElementsSection(elements: switchTraversal.elements, label: "app_switch_visible_elements")
|
|
800
|
+
lines.append(contentsOf: visLines)
|
|
801
|
+
}
|
|
802
|
+
}
|
|
803
|
+
|
|
565
804
|
return lines.joined(separator: "\n")
|
|
566
805
|
}
|
|
567
806
|
|
|
@@ -580,6 +819,154 @@ func truncate(_ s: String, maxLen: Int) -> String {
|
|
|
580
819
|
s.count > maxLen ? String(s.prefix(maxLen)) + "..." : s
|
|
581
820
|
}
|
|
582
821
|
|
|
822
|
+
/// Read-only view of an element that can be rendered in a visible-elements section or as a
/// flat text line. Adopted by both full-traversal elements and diff elements so the same
/// formatting code serves both.
protocol VisibleElement {
    // Identity

    /// Accessibility role string; formatting code matches it by prefix (e.g. `AXButton`).
    var role: String { get }
    /// Text associated with the element, if any.
    var text: String? { get }

    // Geometry — presumably screen coordinates in points; TODO confirm against the traversal code.

    var x: Double? { get }
    var y: Double? { get }
    var width: Double? { get }
    var height: Double? { get }

    // Visibility

    /// `true` when the element is flagged as inside the viewport; `nil` when not evaluated.
    var in_viewport: Bool? { get }
}

extension EnrichedElementData: VisibleElement {}
extension DiffElementData: VisibleElement {}
|
|
834
|
+
|
|
835
|
+
/// Role prefixes treated as interactive when building the inline compact summary.
/// NOTE(review): matching is by prefix, so "AXTab" also accepts "AXTabGroup" and "AXTable" —
/// confirm that is intended.
private let interactiveRolePrefixes: [String] = [
    "AXButton",
    "AXLink",
    "AXTextField",
    "AXTextArea",
    "AXCheckBox",
    "AXRadioButton",
    "AXPopUpButton",
    "AXComboBox",
    "AXSlider",
    "AXMenuItem",
    "AXMenuButton",
    "AXTab",
]

/// Returns `true` when `role` begins with one of `interactiveRolePrefixes` (case-sensitive).
private func isInteractiveRole(_ role: String) -> Bool {
    for candidate in interactiveRolePrefixes where role.hasPrefix(candidate) {
        return true
    }
    return false
}

/// Returns `true` when `role` begins with "AXStaticText" (case-sensitive).
private func isStaticTextRole(_ role: String) -> Bool {
    return role.hasPrefix("AXStaticText")
}
|
|
851
|
+
|
|
852
|
+
/// Render the in-viewport, non-empty-text elements of `elements` as summary lines under a
/// `label:` heading.
///
/// Interactive elements (see `isInteractiveRole`) are listed first, followed by static text
/// elements; each group keeps its input order and is capped independently. Elements with any
/// other role are ignored.
///
/// - Parameters:
///   - elements: Candidate elements, in display order.
///   - label: Heading text; emitted as `"<label>:"`.
///   - interactiveCap: Maximum number of interactive lines to emit.
///   - textCap: Maximum number of static-text lines to emit.
/// - Returns: The heading followed by the element lines, or an empty array when nothing qualifies.
func buildVisibleElementsSection<T: VisibleElement>(elements: [T], label: String, interactiveCap: Int = 30, textCap: Int = 10) -> [String] {
    var interactiveLines: [String] = []
    var textLines: [String] = []

    for element in elements where element.in_viewport == true {
        guard let rawText = element.text, !rawText.isEmpty else { continue }
        let shortText = truncate(rawText, maxLen: 50)

        // Geometry is appended only when all four values are known.
        var geometry = ""
        if let x = element.x, let y = element.y, let w = element.width, let h = element.height {
            geometry = " (\(Int(x)),\(Int(y)) \(Int(w))×\(Int(h)))"
        }
        let entry = " [\(element.role)] \"\(shortText)\"\(geometry)"

        if isInteractiveRole(element.role) {
            if interactiveLines.count < interactiveCap {
                interactiveLines.append(entry)
            }
        } else if isStaticTextRole(element.role), textLines.count < textCap {
            textLines.append(entry)
        }
    }

    guard !(interactiveLines.isEmpty && textLines.isEmpty) else { return [] }
    return ["\(label):"] + interactiveLines + textLines
}
|
|
888
|
+
|
|
889
|
+
// --- Flat Text Response File Builder ---
|
|
890
|
+
|
|
891
|
+
/// Format a single element as a grep-friendly text line.
///
/// The line consists of space-separated parts, in order: `[role]`, the quoted text (truncated to
/// 80 characters via `truncate`), `x:… y:…`, `w:… h:…`, and the word `visible` when the element
/// is flagged as in the viewport. Parts whose data is missing are omitted.
///
/// NOTE(review): text is emitted verbatim, so embedded newlines or quotes would break the
/// one-line-per-element layout — confirm whether upstream text can contain them.
///
/// - Parameters:
///   - el: Element to format.
///   - prefix: String prepended to the line (e.g. a diff marker).
/// - Returns: The formatted line, without a trailing newline.
func formatElementLine(_ el: VisibleElement, prefix: String = " ") -> String {
    var parts: [String] = ["[\(el.role)]"]
    if let text = el.text, !text.isEmpty {
        // Use the file's shared truncation helper instead of re-implementing it inline.
        parts.append("\"\(truncate(text, maxLen: 80))\"")
    }
    if let x = el.x, let y = el.y {
        parts.append("x:\(Int(x)) y:\(Int(y))")
    }
    if let w = el.width, let h = el.height {
        parts.append("w:\(Int(w)) h:\(Int(h))")
    }
    if el.in_viewport == true {
        parts.append("visible")
    }
    return prefix + parts.joined(separator: " ")
}
|
|
910
|
+
|
|
911
|
+
/// Render a `ToolResponse` as flat, line-oriented text suitable for writing to a `.txt` file.
///
/// Sections are emitted in this order, each only when the corresponding data is present:
/// full traversal, diff (`+ ` added, `- ` removed, `~` modified), cross-app handoff traversal,
/// and finally `# error:` / `# traversal_error:` lines.
///
/// - Parameter toolResponse: The response to render.
/// - Returns: All lines joined with `"\n"`.
func buildFlatTextResponse(_ toolResponse: ToolResponse) -> String {
    let sheetNote = "# dialog: AXSheet detected"
    let hasSheet = toolResponse.sheetDetected == true
    var output: [String] = []

    // Full traversal: header, optional sheet note, blank line, one line per element.
    if let traversal = toolResponse.traversal {
        output.append("# \(traversal.app_name) — \(traversal.elements.count) elements (\(traversal.processing_time_seconds)s)")
        if hasSheet {
            output.append(sheetNote)
        }
        output.append("")
        output += traversal.elements.map { formatElementLine($0) }
    }

    // Diff: header, optional sheet note, blank line, then added / removed / modified entries.
    if let diff = toolResponse.diff {
        output.append("# diff: +\(diff.added.count) added, -\(diff.removed.count) removed, ~\(diff.modified.count) modified")
        if hasSheet {
            output.append(sheetNote)
        }
        output.append("")
        output += diff.added.map { formatElementLine($0, prefix: "+ ") }
        output += diff.removed.map { formatElementLine($0, prefix: "- ") }
        for mod in diff.modified {
            let changeParts = mod.changes.map { change -> String in
                let old = change.oldValue ?? change.removedText ?? ""
                let new = change.newValue ?? change.addedText ?? ""
                return "\(change.attributeName): '\(old)' -> '\(new)'"
            }
            output.append("~ [\(mod.after.role)] \"\(mod.after.text ?? "")\" | \(changeParts.joined(separator: ", "))")
        }
    }

    // Cross-app handoff
    if let switchTraversal = toolResponse.appSwitchTraversal {
        output.append("")
        output.append("# app_switch: \(toolResponse.appSwitchAppName ?? "Unknown") (PID: \(toolResponse.appSwitchPid ?? 0))")
        output += switchTraversal.elements.map { formatElementLine($0) }
    }

    // Errors
    if let err = toolResponse.primaryActionError {
        output.append("# error: \(err)")
    }
    if let err = toolResponse.traversalError {
        output.append("# traversal_error: \(err)")
    }

    return output.joined(separator: "\n")
}
|
|
969
|
+
|
|
583
970
|
// --- Direct AX Element Interaction ---
|
|
584
971
|
|
|
585
972
|
// --- Auto-Scroll via Scroll Wheel Events ---
|
|
@@ -691,6 +1078,7 @@ func findElementByText(root: AXUIElement, text: String, viewport: CGRect, maxDep
|
|
|
691
1078
|
/// scroll toward the target and keep probing until an element with text appears.
|
|
692
1079
|
func scrollIntoViewIfNeeded(pid: pid_t, point: CGPoint) async -> CGPoint {
|
|
693
1080
|
let appElement = AXUIElementCreateApplication(pid)
|
|
1081
|
+
AXUIElementSetMessagingTimeout(appElement, 5.0)
|
|
694
1082
|
|
|
695
1083
|
guard let (windowElement, windowBounds) = getWindowContainingPoint(appElement: appElement, point: point) else {
|
|
696
1084
|
fputs("log: scrollIntoViewIfNeeded: could not get window bounds, using original point\n", stderr)
|
|
@@ -839,12 +1227,16 @@ func setupAndStartServer() async throws -> Server {
|
|
|
839
1227
|
"type": .string("object"),
|
|
840
1228
|
"properties": .object([
|
|
841
1229
|
"pid": .object(["type": .string("number"), "description": .string("REQUIRED. PID of the target application window.")]),
|
|
842
|
-
"x": .object(["type": .string("number"), "description": .string("
|
|
843
|
-
"y": .object(["type": .string("number"), "description": .string("
|
|
1230
|
+
"x": .object(["type": .string("number"), "description": .string("X coordinate for the click (top-left of element). Required unless 'element' is provided.")]),
|
|
1231
|
+
"y": .object(["type": .string("number"), "description": .string("Y coordinate for the click (top-left of element). Required unless 'element' is provided.")]),
|
|
844
1232
|
"width": .object(["type": .string("number"), "description": .string("Optional. Element width from traversal. When provided with height, click lands at center (x+width/2, y+height/2).")]),
|
|
845
|
-
"height": .object(["type": .string("number"), "description": .string("Optional. Element height from traversal. When provided with width, click lands at center (x+width/2, y+height/2).")])
|
|
1233
|
+
"height": .object(["type": .string("number"), "description": .string("Optional. Element height from traversal. When provided with width, click lands at center (x+width/2, y+height/2).")]),
|
|
1234
|
+
"element": .object(["type": .string("string"), "description": .string("Optional. Case-insensitive partial text match to find and click an element (e.g. \"Open\", \"Submit\"). Searches visible elements in the accessibility tree. First match is clicked. Alternative to x/y coordinates.")]),
|
|
1235
|
+
"role": .object(["type": .string("string"), "description": .string("Optional. Filter element search by accessibility role. Common roles: AXButton, AXLink, AXTextField, AXTextArea, AXCheckBox, AXRadioButton, AXPopUpButton, AXComboBox, AXSlider, AXMenuItem, AXMenuButton, AXTab, AXStaticText, AXImage, AXGroup, AXCell, AXRow.")]),
|
|
1236
|
+
"doubleClick": .object(["type": .string("boolean"), "description": .string("Optional. If true, performs a double-click instead of a single click.")]),
|
|
1237
|
+
"rightClick": .object(["type": .string("boolean"), "description": .string("Optional. If true, performs a right-click (context menu) instead of a left click.")])
|
|
846
1238
|
]),
|
|
847
|
-
"required": .array([.string("pid")
|
|
1239
|
+
"required": .array([.string("pid")])
|
|
848
1240
|
])
|
|
849
1241
|
let clickTool = Tool(
|
|
850
1242
|
name: "macos-use_click_and_traverse",
|
|
@@ -926,7 +1318,7 @@ func setupAndStartServer() async throws -> Server {
|
|
|
926
1318
|
|
|
927
1319
|
let server = Server(
|
|
928
1320
|
name: "SwiftMacOSServerDirect", // Renamed slightly
|
|
929
|
-
version: "1.
|
|
1321
|
+
version: "1.6.0", // Screenshot capture, AX timeout protection
|
|
930
1322
|
capabilities: .init(
|
|
931
1323
|
tools: .init(listChanged: true)
|
|
932
1324
|
)
|
|
@@ -974,6 +1366,7 @@ func setupAndStartServer() async throws -> Server {
|
|
|
974
1366
|
do {
|
|
975
1367
|
// --- Determine Action and Options from MCP Params ---
|
|
976
1368
|
let primaryAction: PrimaryAction
|
|
1369
|
+
var lastClickPoint: CGPoint? = nil // Track click location for screenshot annotation
|
|
977
1370
|
var options = ActionOptions(traverseAfter: true, showAnimation: false) // MCP tools should return the tree by default, no visual highlighting
|
|
978
1371
|
|
|
979
1372
|
// PID is required for click, type, press, refresh
|
|
@@ -1016,18 +1409,55 @@ func setupAndStartServer() async throws -> Server {
|
|
|
1016
1409
|
|
|
1017
1410
|
case clickTool.name:
|
|
1018
1411
|
guard let reqPid = pidForOptions else { throw MCPError.invalidParams("Missing required 'pid' for click tool") }
|
|
1019
|
-
|
|
1020
|
-
let
|
|
1021
|
-
let
|
|
1022
|
-
|
|
1023
|
-
// If width+height provided, compute exact center; otherwise use raw point (AX will center via lookup)
|
|
1412
|
+
|
|
1413
|
+
let elementSearch = params.arguments?["element"]?.stringValue
|
|
1414
|
+
let roleFilter = params.arguments?["role"]?.stringValue
|
|
1415
|
+
|
|
1024
1416
|
let rawPoint: CGPoint
|
|
1025
|
-
if let
|
|
1026
|
-
|
|
1027
|
-
fputs("log: click_and_traverse:
|
|
1417
|
+
if let elementSearch = elementSearch {
|
|
1418
|
+
// --- Search mode: find element by text match ---
|
|
1419
|
+
fputs("log: click_and_traverse: searching for element '\(elementSearch)' (role: \(roleFilter ?? "any"))\n", stderr)
|
|
1420
|
+
let traversal: ResponseData = try await Task { @MainActor in
|
|
1421
|
+
return try traverseAccessibilityTree(pid: reqPid)
|
|
1422
|
+
}.value
|
|
1423
|
+
let windowBounds = getWindowBoundsFromTraversal(traversal) ?? getWindowBoundsFromAPI(pid: reqPid)
|
|
1424
|
+
let enriched = enrichResponseData(traversal, windowBounds: windowBounds)
|
|
1425
|
+
|
|
1426
|
+
let searchLower = elementSearch.lowercased()
|
|
1427
|
+
let matches = enriched.elements.filter { elem in
|
|
1428
|
+
guard let text = elem.text, !text.isEmpty,
|
|
1429
|
+
let _ = elem.x, let _ = elem.y,
|
|
1430
|
+
let w = elem.width, let h = elem.height,
|
|
1431
|
+
w > 0, h > 0 else { return false }
|
|
1432
|
+
let textMatch = text.lowercased().contains(searchLower)
|
|
1433
|
+
let roleMatch = roleFilter == nil || elem.role.lowercased().hasPrefix(roleFilter!.lowercased())
|
|
1434
|
+
return textMatch && roleMatch
|
|
1435
|
+
}
|
|
1436
|
+
|
|
1437
|
+
guard let match = matches.first else {
|
|
1438
|
+
let roleHint = roleFilter != nil ? " with role '\(roleFilter!)'" : ""
|
|
1439
|
+
throw MCPError.invalidParams("No visible element matching '\(elementSearch)'\(roleHint) found in app (PID \(reqPid)). Use the traversal file to find available elements.")
|
|
1440
|
+
}
|
|
1441
|
+
let matchX = match.x!, matchY = match.y!, matchW = match.width!, matchH = match.height!
|
|
1442
|
+
rawPoint = CGPoint(x: matchX + matchW / 2, y: matchY + matchH / 2)
|
|
1443
|
+
let matchCount = matches.count
|
|
1444
|
+
fputs("log: click_and_traverse: found \(matchCount) match(es) for '\(elementSearch)'. Clicking '\(match.text ?? "")' [\(match.role)] at center (\(rawPoint.x),\(rawPoint.y))\n", stderr)
|
|
1028
1445
|
} else {
|
|
1029
|
-
|
|
1446
|
+
// --- Coordinate mode ---
|
|
1447
|
+
guard let x = try getOptionalDouble(from: params.arguments, key: "x"),
|
|
1448
|
+
let y = try getOptionalDouble(from: params.arguments, key: "y") else {
|
|
1449
|
+
throw MCPError.invalidParams("Either 'element' or both 'x' and 'y' must be provided for click tool")
|
|
1450
|
+
}
|
|
1451
|
+
let w = try getOptionalDouble(from: params.arguments, key: "width")
|
|
1452
|
+
let h = try getOptionalDouble(from: params.arguments, key: "height")
|
|
1453
|
+
if let w = w, let h = h {
|
|
1454
|
+
rawPoint = CGPoint(x: x + w / 2, y: y + h / 2)
|
|
1455
|
+
fputs("log: click_and_traverse: centering (\(x),\(y)) + size(\(w)×\(h)) → \(rawPoint)\n", stderr)
|
|
1456
|
+
} else {
|
|
1457
|
+
rawPoint = CGPoint(x: x, y: y)
|
|
1458
|
+
}
|
|
1030
1459
|
}
|
|
1460
|
+
|
|
1031
1461
|
// Activate the target app before clicking so the click registers correctly
|
|
1032
1462
|
if let runningApp = NSRunningApplication(processIdentifier: reqPid) {
|
|
1033
1463
|
runningApp.activate(options: [])
|
|
@@ -1036,7 +1466,16 @@ func setupAndStartServer() async throws -> Server {
|
|
|
1036
1466
|
}
|
|
1037
1467
|
// Auto-scroll element into view if it's outside the visible window area
|
|
1038
1468
|
let adjustedPoint = await scrollIntoViewIfNeeded(pid: reqPid, point: rawPoint)
|
|
1039
|
-
|
|
1469
|
+
lastClickPoint = adjustedPoint
|
|
1470
|
+
let isDoubleClick = params.arguments?["doubleClick"]?.boolValue ?? false
|
|
1471
|
+
let isRightClick = params.arguments?["rightClick"]?.boolValue ?? false
|
|
1472
|
+
if isDoubleClick {
|
|
1473
|
+
primaryAction = .input(action: .doubleClick(point: adjustedPoint))
|
|
1474
|
+
} else if isRightClick {
|
|
1475
|
+
primaryAction = .input(action: .rightClick(point: adjustedPoint))
|
|
1476
|
+
} else {
|
|
1477
|
+
primaryAction = .input(action: .click(point: adjustedPoint))
|
|
1478
|
+
}
|
|
1040
1479
|
options.pidForTraversal = reqPid
|
|
1041
1480
|
options.showDiff = true // enables traverseBefore automatically
|
|
1042
1481
|
hasDiff = true
|
|
@@ -1116,11 +1555,33 @@ func setupAndStartServer() async throws -> Server {
|
|
|
1116
1555
|
}
|
|
1117
1556
|
|
|
1118
1557
|
// --- Build simplified response and serialize to JSON ---
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1558
|
+
var toolResponse = buildToolResponse(actionResult, hasDiff: hasDiff)
|
|
1559
|
+
|
|
1560
|
+
// --- Detect cross-app handoff ---
|
|
1561
|
+
// After diff-based actions, check if a different app became frontmost
|
|
1562
|
+
if hasDiff, let originalPid = options.pidForTraversal {
|
|
1563
|
+
let frontmostPid = NSWorkspace.shared.frontmostApplication?.processIdentifier
|
|
1564
|
+
if let newPid = frontmostPid, newPid != originalPid {
|
|
1565
|
+
let frontmostName = NSWorkspace.shared.frontmostApplication?.localizedName ?? "Unknown"
|
|
1566
|
+
fputs("log: handler(CallTool): app switch detected! Original PID \(originalPid) -> new frontmost PID \(newPid) (\(frontmostName))\n", stderr)
|
|
1567
|
+
toolResponse.appSwitchPid = newPid
|
|
1568
|
+
toolResponse.appSwitchAppName = frontmostName
|
|
1569
|
+
|
|
1570
|
+
// Traverse the new frontmost app
|
|
1571
|
+
do {
|
|
1572
|
+
let newTraversal: ResponseData = try await Task { @MainActor in
|
|
1573
|
+
return try traverseAccessibilityTree(pid: newPid)
|
|
1574
|
+
}.value
|
|
1575
|
+
let newWindowBounds = getWindowBoundsFromTraversal(newTraversal)
|
|
1576
|
+
?? getWindowBoundsFromAPI(pid: newPid)
|
|
1577
|
+
toolResponse.appSwitchTraversal = enrichResponseData(newTraversal, windowBounds: newWindowBounds)
|
|
1578
|
+
fputs("log: handler(CallTool): traversed new frontmost app \(frontmostName) (PID \(newPid)): \(newTraversal.elements.count) elements\n", stderr)
|
|
1579
|
+
} catch {
|
|
1580
|
+
fputs("warning: handler(CallTool): failed to traverse new frontmost app \(frontmostName) (PID \(newPid)): \(error)\n", stderr)
|
|
1581
|
+
}
|
|
1582
|
+
}
|
|
1123
1583
|
}
|
|
1584
|
+
let resultTextString = buildFlatTextResponse(toolResponse)
|
|
1124
1585
|
|
|
1125
1586
|
// --- Determine if it was an error overall ---
|
|
1126
1587
|
let isError = actionResult.primaryActionError != nil ||
|
|
@@ -1131,18 +1592,28 @@ func setupAndStartServer() async throws -> Server {
|
|
|
1131
1592
|
fputs("warning: handler(CallTool): Action resulted in an error state (primary: \(actionResult.primaryActionError ?? "nil"), before: \(actionResult.traversalBeforeError ?? "nil"), after: \(actionResult.traversalAfterError ?? "nil")).\n", stderr)
|
|
1132
1593
|
}
|
|
1133
1594
|
|
|
1134
|
-
// --- Write
|
|
1595
|
+
// --- Write flat text to file, return compact summary ---
|
|
1135
1596
|
let outputDir = "/tmp/macos-use"
|
|
1136
1597
|
try? FileManager.default.createDirectory(atPath: outputDir, withIntermediateDirectories: true)
|
|
1137
1598
|
|
|
1138
1599
|
let timestamp = Int(Date().timeIntervalSince1970 * 1000) // ms precision to avoid collisions
|
|
1139
1600
|
let safeName = params.name.replacingOccurrences(of: "macos-use_", with: "")
|
|
1140
|
-
let filename = "\(timestamp)_\(safeName).
|
|
1601
|
+
let filename = "\(timestamp)_\(safeName).txt"
|
|
1141
1602
|
let filepath = "\(outputDir)/\(filename)"
|
|
1142
|
-
try?
|
|
1143
|
-
fputs("log: handler(CallTool): wrote full response to \(filepath) (\(
|
|
1603
|
+
try? resultTextString.write(toFile: filepath, atomically: true, encoding: .utf8)
|
|
1604
|
+
fputs("log: handler(CallTool): wrote full response to \(filepath) (\(resultTextString.count) bytes)\n", stderr)
|
|
1605
|
+
|
|
1606
|
+
// --- Capture window screenshot ---
|
|
1607
|
+
var screenshotPath: String? = nil
|
|
1608
|
+
let screenshotFilename = "\(timestamp)_\(safeName).png"
|
|
1609
|
+
let screenshotFilepath = "\(outputDir)/\(screenshotFilename)"
|
|
1610
|
+
// Use the effective PID (could be app-switched)
|
|
1611
|
+
let screenshotPid = toolResponse.appSwitchPid ?? toolResponse.traversalPid ?? options.pidForTraversal
|
|
1612
|
+
if let pid = screenshotPid {
|
|
1613
|
+
screenshotPath = captureWindowScreenshot(pid: pid, outputPath: screenshotFilepath, clickPoint: lastClickPoint)
|
|
1614
|
+
}
|
|
1144
1615
|
|
|
1145
|
-
let summary = buildCompactSummary(toolName: params.name, params: params, toolResponse: toolResponse, filepath: filepath)
|
|
1616
|
+
let summary = buildCompactSummary(toolName: params.name, params: params, toolResponse: toolResponse, filepath: filepath, fileSize: resultTextString.count, screenshotPath: screenshotPath)
|
|
1146
1617
|
fputs("log: handler(CallTool): returning compact summary (\(summary.count) chars)\n", stderr)
|
|
1147
1618
|
|
|
1148
1619
|
return .init(content: [.text(summary)], isError: isError)
|
|
Binary file
|
package/package.json
CHANGED