agent-device 0.11.0 → 0.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,15 @@
1
1
  import AppKit
2
2
  import ApplicationServices
3
+ import CoreGraphics
3
4
  import Foundation
4
5
 
5
6
  private enum SnapshotTraversalLimits {
6
7
  static let maxDesktopApps = 24
7
8
  static let maxNodes = 1500
8
9
  static let maxDepth = 12
10
+ static let maxMenuBarBandY = 64.0
11
+ static let maxMenuBarBandHeight = 64.0
12
+ static let maxMenuBarExtraWidth = 256.0
9
13
  }
10
14
 
11
15
  struct RectResponse: Encodable {
@@ -62,7 +66,17 @@ private struct SnapshotTraversalState {
62
66
  var truncated = false
63
67
  }
64
68
 
65
- func captureSnapshotResponse(surface: String) throws -> SnapshotResponse {
69
+ private struct MenuBarWindowFallbackCandidate {
70
+ let windowNumber: Int
71
+ let rect: RectResponse
72
+ let layer: Int
73
+
74
+ var area: Double {
75
+ rect.width * rect.height
76
+ }
77
+ }
78
+
79
+ func captureSnapshotResponse(surface: String, bundleId: String? = nil) throws -> SnapshotResponse {
66
80
  let result: SnapshotBuildResult
67
81
  switch surface {
68
82
  case "frontmost-app":
@@ -70,7 +84,7 @@ func captureSnapshotResponse(surface: String) throws -> SnapshotResponse {
70
84
  case "desktop":
71
85
  result = snapshotDesktop()
72
86
  case "menubar":
73
- result = snapshotMenuBar()
87
+ result = try snapshotMenuBar(bundleId: bundleId)
74
88
  default:
75
89
  throw HelperError.invalidArgs("snapshot requires --surface <frontmost-app|desktop|menubar>")
76
90
  }
@@ -200,8 +214,9 @@ private func appendApplicationSnapshot(
200
214
  return true
201
215
  }
202
216
 
203
- private func snapshotMenuBar() -> SnapshotBuildResult {
217
+ private func snapshotMenuBar(bundleId: String?) throws -> SnapshotBuildResult {
204
218
  var state = SnapshotTraversalState()
219
+ let screenRect = mainScreenRectResponse()
205
220
  guard
206
221
  let rootIndex = appendSyntheticSnapshotNode(
207
222
  into: &state,
@@ -209,29 +224,54 @@ private func snapshotMenuBar() -> SnapshotBuildResult {
209
224
  label: "Menu Bar",
210
225
  depth: 0,
211
226
  parentIndex: nil,
212
- surface: "menubar"
227
+ surface: "menubar",
228
+ rect: screenRect
213
229
  )
214
230
  else {
215
231
  return SnapshotBuildResult(nodes: state.nodes, truncated: true)
216
232
  }
217
233
 
218
- if let frontmost = NSWorkspace.shared.frontmostApplication {
219
- let frontmostElement = AXUIElementCreateApplication(frontmost.processIdentifier)
220
- if let menuBar = elementAttribute(frontmostElement, attribute: kAXMenuBarAttribute as String) {
221
- _ = appendElementSnapshot(
222
- menuBar,
234
+ if let bundleId {
235
+ let targetApp = try resolveTargetApplication(bundleId: bundleId, surface: nil)
236
+ let appendedExtras = appendMenuBarSnapshot(
237
+ targetApp,
238
+ attribute: kAXExtrasMenuBarAttribute as String,
239
+ depth: 1,
240
+ parentIndex: rootIndex,
241
+ surface: "menubar",
242
+ state: &state
243
+ )
244
+ if !appendedExtras {
245
+ let appendedMenuBar = appendMenuBarSnapshot(
246
+ targetApp,
247
+ attribute: kAXMenuBarAttribute as String,
223
248
  depth: 1,
224
249
  parentIndex: rootIndex,
225
- context: SnapshotContext(
226
- surface: "menubar",
227
- pid: Int32(frontmost.processIdentifier),
228
- bundleId: frontmost.bundleIdentifier,
229
- appName: frontmost.localizedName,
230
- windowTitle: frontmost.localizedName
231
- ),
250
+ surface: "menubar",
232
251
  state: &state
233
252
  )
253
+ if !appendedMenuBar {
254
+ _ = appendMenuBarWindowFallback(
255
+ targetApp,
256
+ depth: 1,
257
+ parentIndex: rootIndex,
258
+ surface: "menubar",
259
+ state: &state
260
+ )
261
+ }
234
262
  }
263
+ return SnapshotBuildResult(nodes: state.nodes, truncated: state.truncated)
264
+ }
265
+
266
+ if let frontmost = NSWorkspace.shared.frontmostApplication {
267
+ _ = appendMenuBarSnapshot(
268
+ frontmost,
269
+ attribute: kAXMenuBarAttribute as String,
270
+ depth: 1,
271
+ parentIndex: rootIndex,
272
+ surface: "menubar",
273
+ state: &state
274
+ )
235
275
  }
236
276
 
237
277
  if !state.truncated,
@@ -239,20 +279,24 @@ private func snapshotMenuBar() -> SnapshotBuildResult {
239
279
  withBundleIdentifier: "com.apple.systemuiserver"
240
280
  ).first
241
281
  {
242
- let systemUiElement = AXUIElementCreateApplication(systemUiServer.processIdentifier)
243
- if let menuExtras = elementAttribute(systemUiElement, attribute: kAXMenuBarAttribute as String) {
244
- _ = appendElementSnapshot(
245
- menuExtras,
282
+ let appendedExtras = appendMenuBarSnapshot(
283
+ systemUiServer,
284
+ attribute: kAXExtrasMenuBarAttribute as String,
285
+ depth: 1,
286
+ parentIndex: rootIndex,
287
+ surface: "menubar",
288
+ state: &state,
289
+ windowTitle: "System Menu Extras"
290
+ )
291
+ if !appendedExtras {
292
+ _ = appendMenuBarSnapshot(
293
+ systemUiServer,
294
+ attribute: kAXMenuBarAttribute as String,
246
295
  depth: 1,
247
296
  parentIndex: rootIndex,
248
- context: SnapshotContext(
249
- surface: "menubar",
250
- pid: Int32(systemUiServer.processIdentifier),
251
- bundleId: systemUiServer.bundleIdentifier,
252
- appName: systemUiServer.localizedName,
253
- windowTitle: "System Menu Extras"
254
- ),
255
- state: &state
297
+ surface: "menubar",
298
+ state: &state,
299
+ windowTitle: "System Menu Extras"
256
300
  )
257
301
  }
258
302
  }
@@ -260,6 +304,67 @@ private func snapshotMenuBar() -> SnapshotBuildResult {
260
304
  return SnapshotBuildResult(nodes: state.nodes, truncated: state.truncated)
261
305
  }
262
306
 
307
+ @discardableResult
308
+ private func appendMenuBarSnapshot(
309
+ _ app: NSRunningApplication,
310
+ attribute: String,
311
+ depth: Int,
312
+ parentIndex: Int,
313
+ surface: String,
314
+ state: inout SnapshotTraversalState,
315
+ windowTitle: String? = nil
316
+ ) -> Bool {
317
+ let appElement = AXUIElementCreateApplication(app.processIdentifier)
318
+ guard let menuBar = elementAttribute(appElement, attribute: attribute) else {
319
+ return false
320
+ }
321
+
322
+ let nodeCountBefore = state.nodes.count
323
+ _ = appendElementSnapshot(
324
+ menuBar,
325
+ depth: depth,
326
+ parentIndex: parentIndex,
327
+ context: SnapshotContext(
328
+ surface: surface,
329
+ pid: Int32(app.processIdentifier),
330
+ bundleId: app.bundleIdentifier,
331
+ appName: app.localizedName,
332
+ windowTitle: windowTitle ?? app.localizedName
333
+ ),
334
+ state: &state
335
+ )
336
+ return state.nodes.count > nodeCountBefore
337
+ }
338
+
339
+ @discardableResult
340
+ private func appendMenuBarWindowFallback(
341
+ _ app: NSRunningApplication,
342
+ depth: Int,
343
+ parentIndex: Int,
344
+ surface: String,
345
+ state: inout SnapshotTraversalState
346
+ ) -> Bool {
347
+ guard let candidate = menuBarWindowFallbackCandidate(for: app) else {
348
+ return false
349
+ }
350
+
351
+ return appendSyntheticSnapshotNode(
352
+ into: &state,
353
+ type: "MenuBarItem",
354
+ label: app.localizedName ?? app.bundleIdentifier ?? "Menu Bar Item",
355
+ depth: depth,
356
+ parentIndex: parentIndex,
357
+ surface: surface,
358
+ identifier: "cgwindow:\(candidate.windowNumber)",
359
+ pid: Int32(app.processIdentifier),
360
+ bundleId: app.bundleIdentifier,
361
+ appName: app.localizedName,
362
+ windowTitle: app.localizedName,
363
+ rect: candidate.rect,
364
+ hittable: true
365
+ ) != nil
366
+ }
367
+
263
368
  @discardableResult
264
369
  private func appendSyntheticSnapshotNode(
265
370
  into state: inout SnapshotTraversalState,
@@ -272,7 +377,9 @@ private func appendSyntheticSnapshotNode(
272
377
  pid: Int32? = nil,
273
378
  bundleId: String? = nil,
274
379
  appName: String? = nil,
275
- windowTitle: String? = nil
380
+ windowTitle: String? = nil,
381
+ rect: RectResponse? = nil,
382
+ hittable: Bool = false
276
383
  ) -> Int? {
277
384
  guard reserveSnapshotNodeCapacity(&state) else {
278
385
  return nil
@@ -288,10 +395,10 @@ private func appendSyntheticSnapshotNode(
288
395
  label: label,
289
396
  value: nil,
290
397
  identifier: identifier ?? "surface:\(surface):\(type.lowercased())",
291
- rect: nil,
398
+ rect: rect,
292
399
  enabled: true,
293
400
  selected: nil,
294
- hittable: false,
401
+ hittable: hittable && rect != nil,
295
402
  depth: depth,
296
403
  parentIndex: parentIndex,
297
404
  pid: pid,
@@ -304,6 +411,83 @@ private func appendSyntheticSnapshotNode(
304
411
  return index
305
412
  }
306
413
 
414
+ private func mainScreenRectResponse() -> RectResponse? {
415
+ guard let screenFrame = NSScreen.main?.frame, screenFrame.width > 0, screenFrame.height > 0 else {
416
+ return nil
417
+ }
418
+ return RectResponse(
419
+ x: Double(screenFrame.origin.x),
420
+ y: Double(screenFrame.origin.y),
421
+ width: Double(screenFrame.width),
422
+ height: Double(screenFrame.height)
423
+ )
424
+ }
425
+
426
+ private func menuBarWindowFallbackCandidate(
427
+ for app: NSRunningApplication
428
+ ) -> MenuBarWindowFallbackCandidate? {
429
+ guard
430
+ let windowInfoList = CGWindowListCopyWindowInfo([.optionAll], kCGNullWindowID)
431
+ as? [[String: Any]]
432
+ else {
433
+ return nil
434
+ }
435
+
436
+ let pid = Int(app.processIdentifier)
437
+ let allCandidates = windowInfoList.compactMap { info -> MenuBarWindowFallbackCandidate? in
438
+ let ownerPid = (info[kCGWindowOwnerPID as String] as? NSNumber)?.intValue
439
+ guard ownerPid == pid else {
440
+ return nil
441
+ }
442
+ guard let boundsDictionary = info[kCGWindowBounds as String] as? NSDictionary,
443
+ let bounds = CGRect(dictionaryRepresentation: boundsDictionary)
444
+ else {
445
+ return nil
446
+ }
447
+ guard bounds.width > 0, bounds.height > 0 else {
448
+ return nil
449
+ }
450
+ let alpha = (info[kCGWindowAlpha as String] as? NSNumber)?.doubleValue ?? 1
451
+ guard alpha > 0 else {
452
+ return nil
453
+ }
454
+ let rect = RectResponse(
455
+ x: Double(bounds.origin.x),
456
+ y: Double(bounds.origin.y),
457
+ width: Double(bounds.width),
458
+ height: Double(bounds.height)
459
+ )
460
+ let windowNumber = (info[kCGWindowNumber as String] as? NSNumber)?.intValue ?? 0
461
+ let layer = (info[kCGWindowLayer as String] as? NSNumber)?.intValue ?? 0
462
+ return MenuBarWindowFallbackCandidate(
463
+ windowNumber: windowNumber,
464
+ rect: rect,
465
+ layer: layer
466
+ )
467
+ }
468
+
469
+ // CGWindowList can surface multiple app-owned utility windows. Prefer the small
470
+ // top-band window that matches typical menu bar extra geometry before ranking by area.
471
+ let menuBarBandCandidates = allCandidates.filter { candidate in
472
+ candidate.rect.y <= SnapshotTraversalLimits.maxMenuBarBandY
473
+ && candidate.rect.height <= SnapshotTraversalLimits.maxMenuBarBandHeight
474
+ }
475
+ let narrowCandidates = menuBarBandCandidates.filter { candidate in
476
+ candidate.rect.width <= SnapshotTraversalLimits.maxMenuBarExtraWidth
477
+ }
478
+ let rankedCandidates = (narrowCandidates.isEmpty ? menuBarBandCandidates : narrowCandidates)
479
+ .sorted { left, right in
480
+ if left.area != right.area {
481
+ return left.area < right.area
482
+ }
483
+ if left.layer != right.layer {
484
+ return left.layer < right.layer
485
+ }
486
+ return left.windowNumber < right.windowNumber
487
+ }
488
+ return rankedCandidates.first
489
+ }
490
+
307
491
  @discardableResult
308
492
  private func appendElementSnapshot(
309
493
  _ element: AXUIElement,
@@ -361,7 +545,7 @@ private func appendElementSnapshot(
361
545
  return index
362
546
  }
363
547
 
364
- for child in children(of: element) {
548
+ for child in snapshotChildren(of: element, role: role) {
365
549
  if state.truncated {
366
550
  break
367
551
  }
@@ -522,9 +706,9 @@ private func accessibilityAxValue(_ value: CFTypeRef?) -> AXValue? {
522
706
  return (value as! AXValue)
523
707
  }
524
708
 
525
- func children(of element: AXUIElement) -> [AXUIElement] {
709
+ private func elementArrayAttribute(_ element: AXUIElement, attribute: String) -> [AXUIElement] {
526
710
  var value: CFTypeRef?
527
- guard AXUIElementCopyAttributeValue(element, kAXChildrenAttribute as CFString, &value) == .success,
711
+ guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success,
528
712
  let children = value as? [AXUIElement]
529
713
  else {
530
714
  return []
@@ -532,6 +716,23 @@ func children(of element: AXUIElement) -> [AXUIElement] {
532
716
  return children
533
717
  }
534
718
 
719
+ func children(of element: AXUIElement) -> [AXUIElement] {
720
+ return elementArrayAttribute(element, attribute: kAXChildrenAttribute as String)
721
+ }
722
+
723
+ private func snapshotChildren(of element: AXUIElement, role: String?) -> [AXUIElement] {
724
+ let directChildren = children(of: element)
725
+ if !directChildren.isEmpty {
726
+ return directChildren
727
+ }
728
+ switch role {
729
+ case "AXMenuBar", "AXMenuBarItem", "AXMenu":
730
+ return elementArrayAttribute(element, attribute: kAXVisibleChildrenAttribute as String)
731
+ default:
732
+ return []
733
+ }
734
+ }
735
+
535
736
  func windows(of appElement: AXUIElement) -> [AXUIElement] {
536
737
  var value: CFTypeRef?
537
738
  guard AXUIElementCopyAttributeValue(appElement, "AXWindows" as CFString, &value) == .success,
@@ -2,6 +2,9 @@ import AppKit
2
2
  import ApplicationServices
3
3
  import CoreGraphics
4
4
  import Foundation
5
+ import ImageIO
6
+ import ScreenCaptureKit
7
+ import UniformTypeIdentifiers
5
8
 
6
9
  enum HelperError: Error {
7
10
  case invalidArgs(String)
@@ -57,6 +60,19 @@ struct ReadResponse: Encodable {
57
60
  let text: String
58
61
  }
59
62
 
63
+ struct PressResponse: Encodable {
64
+ let x: Double
65
+ let y: Double
66
+ let bundleId: String?
67
+ let surface: String?
68
+ }
69
+
70
+ struct ScreenshotResponse: Encodable {
71
+ let path: String
72
+ let surface: String?
73
+ let fullscreen: Bool
74
+ }
75
+
60
76
  struct AgentDeviceMacOSHelper {
61
77
  static func main() {
62
78
  do {
@@ -100,6 +116,10 @@ struct AgentDeviceMacOSHelper {
100
116
  return try handleSnapshot(arguments: Array(arguments.dropFirst()))
101
117
  case "read":
102
118
  return try handleRead(arguments: Array(arguments.dropFirst()))
119
+ case "press":
120
+ return try handlePress(arguments: Array(arguments.dropFirst()))
121
+ case "screenshot":
122
+ return try handleScreenshot(arguments: Array(arguments.dropFirst()))
103
123
  default:
104
124
  throw HelperError.invalidArgs("unknown command: \(command)")
105
125
  }
@@ -315,11 +335,13 @@ struct AgentDeviceMacOSHelper {
315
335
  throw HelperError.invalidArgs("snapshot requires --surface <frontmost-app|desktop|menubar>")
316
336
  }
317
337
 
338
+ let bundleId = try optionValue(arguments: arguments, name: "--bundle-id").map(validatedBundleId)
339
+
318
340
  switch surface {
319
341
  case "frontmost-app":
320
- return SuccessEnvelope(data: try captureSnapshotResponse(surface: surface))
342
+ return SuccessEnvelope(data: try captureSnapshotResponse(surface: surface, bundleId: bundleId))
321
343
  case "desktop", "menubar":
322
- return SuccessEnvelope(data: try captureSnapshotResponse(surface: surface))
344
+ return SuccessEnvelope(data: try captureSnapshotResponse(surface: surface, bundleId: bundleId))
323
345
  default:
324
346
  throw HelperError.invalidArgs("snapshot requires --surface <frontmost-app|desktop|menubar>")
325
347
  }
@@ -339,6 +361,35 @@ struct AgentDeviceMacOSHelper {
339
361
  let text = try readTextAtPosition(bundleId: bundleId, surface: surface, x: x, y: y)
340
362
  return SuccessEnvelope(data: ReadResponse(text: text))
341
363
  }
364
+
365
+ static func handlePress(arguments: [String]) throws -> any Encodable {
366
+ guard let rawX = optionValue(arguments: arguments, name: "--x"),
367
+ let rawY = optionValue(arguments: arguments, name: "--y"),
368
+ let x = Double(rawX),
369
+ let y = Double(rawY)
370
+ else {
371
+ throw HelperError.invalidArgs("press requires --x <number> --y <number>")
372
+ }
373
+
374
+ let bundleId = try optionValue(arguments: arguments, name: "--bundle-id").map(validatedBundleId)
375
+ let surface = optionValue(arguments: arguments, name: "--surface")
376
+ try pressAtPosition(bundleId: bundleId, surface: surface, x: x, y: y)
377
+ return SuccessEnvelope(data: PressResponse(x: x, y: y, bundleId: bundleId, surface: surface))
378
+ }
379
+
380
+ static func handleScreenshot(arguments: [String]) throws -> any Encodable {
381
+ guard let outPath = optionValue(arguments: arguments, name: "--out")?
382
+ .trimmingCharacters(in: .whitespacesAndNewlines),
383
+ !outPath.isEmpty
384
+ else {
385
+ throw HelperError.invalidArgs("screenshot requires --out <path>")
386
+ }
387
+
388
+ let surface = optionValue(arguments: arguments, name: "--surface")
389
+ let fullscreen = arguments.contains("--fullscreen")
390
+ try captureSurfaceScreenshot(surface: surface, outPath: outPath, fullscreen: fullscreen)
391
+ return SuccessEnvelope(data: ScreenshotResponse(path: outPath, surface: surface, fullscreen: fullscreen))
392
+ }
342
393
  }
343
394
 
344
395
  private func optionValue(arguments: [String], name: String) -> String? {
@@ -395,6 +446,74 @@ private func readTextAtPosition(bundleId: String?, surface: String?, x: Double,
395
446
  throw HelperError.commandFailed("read did not resolve text")
396
447
  }
397
448
 
449
+ private func pressAtPosition(bundleId: String?, surface: String?, x: Double, y: Double) throws {
450
+ _ = bundleId
451
+ _ = surface
452
+ let point = CGPoint(x: x, y: y)
453
+ guard let move = CGEvent(mouseEventSource: nil, mouseType: .mouseMoved, mouseCursorPosition: point, mouseButton: .left),
454
+ let down = CGEvent(mouseEventSource: nil, mouseType: .leftMouseDown, mouseCursorPosition: point, mouseButton: .left),
455
+ let up = CGEvent(mouseEventSource: nil, mouseType: .leftMouseUp, mouseCursorPosition: point, mouseButton: .left)
456
+ else {
457
+ throw HelperError.commandFailed("press action failed", details: ["reason": "event_creation_failed"])
458
+ }
459
+ move.post(tap: .cghidEventTap)
460
+ down.post(tap: .cghidEventTap)
461
+ up.post(tap: .cghidEventTap)
462
+ }
463
+
464
+ private func captureSurfaceScreenshot(surface: String?, outPath: String, fullscreen: Bool) throws {
465
+ _ = fullscreen
466
+ guard #available(macOS 15.2, *) else {
467
+ throw HelperError.commandFailed(
468
+ "screenshot on macOS desktop and menubar surfaces requires macOS 15.2 or newer"
469
+ )
470
+ }
471
+ guard let screenFrame = NSScreen.main?.frame, screenFrame.width > 0, screenFrame.height > 0 else {
472
+ throw HelperError.commandFailed("screenshot could not resolve main screen bounds")
473
+ }
474
+
475
+ let rect = CGRect(origin: screenFrame.origin, size: screenFrame.size)
476
+ let semaphore = DispatchSemaphore(value: 0)
477
+ var capturedImage: CGImage?
478
+ var capturedError: Error?
479
+ SCScreenshotManager.captureImage(in: rect) { image, error in
480
+ capturedImage = image
481
+ capturedError = error
482
+ semaphore.signal()
483
+ }
484
+ semaphore.wait()
485
+
486
+ if let error = capturedError as NSError? {
487
+ if error.domain == "com.apple.ScreenCaptureKit.SCStreamErrorDomain", error.code == -3801 {
488
+ throw HelperError.commandFailed(
489
+ "screenshot requires Screen Recording permission on macOS desktop and menubar surfaces",
490
+ details: ["surface": surface ?? "", "permission": "screen-recording"]
491
+ )
492
+ }
493
+ throw HelperError.commandFailed("screenshot failed", details: ["error": error.localizedDescription])
494
+ }
495
+ guard let capturedImage else {
496
+ throw HelperError.commandFailed("screenshot failed")
497
+ }
498
+
499
+ let outputURL = URL(fileURLWithPath: outPath)
500
+ if let parent = outputURL.deletingLastPathComponent().path.removingPercentEncoding, !parent.isEmpty {
501
+ try FileManager.default.createDirectory(atPath: parent, withIntermediateDirectories: true)
502
+ }
503
+ guard let destination = CGImageDestinationCreateWithURL(
504
+ outputURL as CFURL,
505
+ UTType.png.identifier as CFString,
506
+ 1,
507
+ nil
508
+ ) else {
509
+ throw HelperError.commandFailed("screenshot could not create PNG destination")
510
+ }
511
+ CGImageDestinationAddImage(destination, capturedImage, nil)
512
+ guard CGImageDestinationFinalize(destination) else {
513
+ throw HelperError.commandFailed("screenshot could not write PNG file")
514
+ }
515
+ }
516
+
398
517
  private func readableText(for element: AXUIElement) -> String? {
399
518
  return
400
519
  stringAttribute(element, attribute: kAXValueAttribute as String)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-device",
3
- "version": "0.11.0",
3
+ "version": "0.11.2",
4
4
  "description": "Unified control plane for physical and virtual devices via an agent-driven CLI.",
5
5
  "license": "MIT",
6
6
  "author": "Callstack",
@@ -24,13 +24,18 @@
24
24
  "clean:daemon": "rm -f ~/.agent-device/daemon.json && rm -f ~/.agent-device/daemon.lock",
25
25
  "build:node": "pnpm build && pnpm clean:daemon",
26
26
  "build:xcuitest": "pnpm build:xcuitest:ios && pnpm build:xcuitest:macos",
27
- "build:xcuitest:ios": "rm -rf ~/.agent-device/ios-runner/derived/device && xcodebuild build-for-testing -project ios-runner/AgentDeviceRunner/AgentDeviceRunner.xcodeproj -scheme AgentDeviceRunner -destination \"generic/platform=iOS Simulator\" -derivedDataPath ~/.agent-device/ios-runner/derived",
28
- "build:xcuitest:macos": "rm -rf ~/.agent-device/ios-runner/derived/macos && xcodebuild build-for-testing -project ios-runner/AgentDeviceRunner/AgentDeviceRunner.xcodeproj -scheme AgentDeviceRunner -destination \"platform=macOS,arch=$(uname -m)\" -derivedDataPath ~/.agent-device/ios-runner/derived/macos CODE_SIGNING_ALLOWED=NO CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY=\"\" DEVELOPMENT_TEAM=\"\" COMPILER_INDEX_STORE_ENABLE=NO ENABLE_CODE_COVERAGE=NO",
29
- "build:xcuitest:tvos": "rm -rf ~/.agent-device/ios-runner/derived/tvos && xcodebuild build-for-testing -project ios-runner/AgentDeviceRunner/AgentDeviceRunner.xcodeproj -scheme AgentDeviceRunner -destination \"generic/platform=tvOS Simulator\" -derivedDataPath ~/.agent-device/ios-runner/derived/tvos",
27
+ "build:xcuitest:ios": "AGENT_DEVICE_XCUITEST_PLATFORM=ios AGENT_DEVICE_IOS_CLEAN_DERIVED=1 sh ./scripts/build-xcuitest-apple.sh",
28
+ "build:xcuitest:macos": "AGENT_DEVICE_XCUITEST_PLATFORM=macos sh ./scripts/build-xcuitest-apple.sh",
29
+ "build:xcuitest:tvos": "AGENT_DEVICE_XCUITEST_PLATFORM=tvos AGENT_DEVICE_IOS_CLEAN_DERIVED=1 sh ./scripts/build-xcuitest-apple.sh",
30
30
  "build:macos-helper": "swift build -c release --package-path macos-helper",
31
31
  "build:all": "pnpm build:node && pnpm build:xcuitest",
32
32
  "ad": "node bin/agent-device.mjs",
33
- "format": "prettier --write src test skills",
33
+ "lint": "oxlint . --deny-warnings",
34
+ "format": "oxfmt --write src test skills package.json tsconfig.json .oxlintrc.json .oxfmtrc.json",
35
+ "check:quick": "pnpm lint && pnpm typecheck",
36
+ "check:tooling": "pnpm lint && pnpm typecheck && pnpm build",
37
+ "check:unit": "pnpm test:unit && pnpm test:smoke",
38
+ "check": "pnpm check:tooling && pnpm check:unit",
34
39
  "prepack": "pnpm build:all",
35
40
  "typecheck": "tsc -p tsconfig.json",
36
41
  "test": "vitest run",
@@ -66,22 +71,17 @@
66
71
  "ios",
67
72
  "android"
68
73
  ],
69
- "prettier": {
70
- "singleQuote": true,
71
- "semi": true,
72
- "trailingComma": "all",
73
- "printWidth": 100
74
- },
75
74
  "dependencies": {
76
75
  "pngjs": "^7.0.0"
77
76
  },
78
77
  "devDependencies": {
79
78
  "@microsoft/api-extractor": "^7.52.10",
80
- "@rslib/core": "0.19.4",
79
+ "@rslib/core": "0.20.1",
81
80
  "@types/node": "^22.0.0",
82
81
  "@types/pngjs": "^6.0.5",
83
- "prettier": "^3.3.3",
84
- "typescript": "^5.9.3",
82
+ "oxfmt": "^0.42.0",
83
+ "oxlint": "^1.57.0",
84
+ "typescript": "^6.0.2",
85
85
  "vitest": "^4.1.2"
86
86
  }
87
87
  }
@@ -112,6 +112,19 @@ agent-device press 'id="camera_row" || label="Camera" role=button'
112
112
  agent-device is visible 'id="camera_settings_anchor"'
113
113
  ```
114
114
 
115
+ ## Interaction fallbacks
116
+
117
+ When `press @ref` fails:
118
+
119
+ 1. Re-snapshot if the UI may have changed.
120
+ 2. Retry `press @ref` or a selector-based `press`.
121
+ 3. If `screenshot --overlay-refs --json` returned a reliable `overlayRefs[].center`, use `agent-device press <x> <y>`.
122
+ 4. Use an external vision-based tap tool only after semantic and coordinate targeting fail.
123
+
124
+ - Prefer `@ref` over coordinates.
125
+ - Do not guess coordinates from the image when structured `center` is available.
126
+ - `agent-device` does not provide a built-in vision-tap flag.
127
+
115
128
  ## Text entry rules
116
129
 
117
130
  - Use `fill` to replace text in an editable field.
@@ -30,7 +30,8 @@ agent-device close
30
30
  - `app`: default surface and the normal choice for `click`, `fill`, `press`, `scroll`, `screenshot`, and `record`.
31
31
  - `frontmost-app`: inspect the currently focused app without naming it first.
32
32
  - `desktop`: inspect visible desktop windows across apps.
33
- - `menubar`: inspect the active app menu bar and system menu extras.
33
+ - `menubar`: inspect the active app menu bar and system menu extras. Use `open <app> --platform macos --surface menubar` when you need one menu bar app's extras, such as a status-item app.
34
+ - Menu bar apps can expose a sparse or empty default `app` tree. Prefer the `menubar` surface first when the app lives entirely in the top bar.
34
35
 
35
36
  Use inspect-first surfaces to understand desktop-global UI, then switch back to `app` when you need to act in one app.
36
37
 
@@ -81,6 +82,6 @@ Troubleshooting:
81
82
 
82
83
  - If visible content is missing from `snapshot -i`, re-snapshot after the UI settles.
83
84
  - If `desktop` is too broad, retry with `frontmost-app`.
84
- - If `menubar` is missing the expected menu, make the app frontmost first and retry.
85
+ - If `menubar` is missing the expected menu, retry with `open <app> --platform macos --surface menubar` for menu bar apps, or make the app frontmost first and retry the generic menubar surface.
85
86
  - If the wrong menu opened, retry secondary-clicking the row or cell wrapper rather than the nested text node.
86
87
  - If the app has multiple windows, make the correct window frontmost before relying on refs.
@@ -22,7 +22,7 @@ Do not use verification tools as the first exploration step. First get the app i
22
22
  agent-device open Settings --platform ios
23
23
  # after using exploration to reach the state you want to verify
24
24
  agent-device snapshot
25
- agent-device screenshot /tmp/settings-proof.png
25
+ agent-device screenshot /tmp/settings-proof.png --overlay-refs
26
26
  agent-device close
27
27
  ```
28
28
 
@@ -45,6 +45,8 @@ agent-device diff snapshot -i
45
45
 
46
46
  Use `screenshot` when the proof needs a rendered image instead of a structural tree.
47
47
 
48
+ - Add `--overlay-refs` when you want the saved PNG to show fresh `@eN` refs burned into the screenshot.
49
+
48
50
  ## Session recording
49
51
 
50
52
  Use `record` for debugging, documentation, or shareable verification artifacts.