screenhand 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. package/README.md +193 -109
  2. package/bin/darwin-arm64/macos-bridge +0 -0
  3. package/dist/mcp-desktop.js +5876 -0
  4. package/dist/scripts/codex-monitor-daemon.js +335 -0
  5. package/dist/scripts/export-help-center.js +112 -0
  6. package/dist/scripts/marketing-loop.js +117 -0
  7. package/dist/scripts/observer-daemon.js +288 -0
  8. package/dist/scripts/orchestrator-daemon.js +399 -0
  9. package/dist/scripts/supervisor-daemon.js +272 -0
  10. package/dist/scripts/threads-campaign.js +208 -0
  11. package/dist/scripts/worker-daemon.js +228 -0
  12. package/dist/src/agent/cli.js +82 -0
  13. package/dist/src/agent/loop.js +274 -0
  14. package/dist/src/community/fetcher.js +109 -0
  15. package/dist/src/community/index.js +6 -0
  16. package/dist/src/community/publisher.js +191 -0
  17. package/dist/src/community/remote-api.js +121 -0
  18. package/dist/src/community/types.js +3 -0
  19. package/dist/src/community/validator.js +95 -0
  20. package/{src/config.ts → dist/src/config.js} +5 -10
  21. package/dist/src/context-tracker.js +489 -0
  22. package/{src/index.ts → dist/src/index.js} +32 -52
  23. package/dist/src/ingestion/coverage-auditor.js +233 -0
  24. package/dist/src/ingestion/doc-parser.js +164 -0
  25. package/dist/src/ingestion/index.js +8 -0
  26. package/dist/src/ingestion/menu-scanner.js +152 -0
  27. package/dist/src/ingestion/reference-merger.js +186 -0
  28. package/dist/src/ingestion/shortcut-extractor.js +180 -0
  29. package/dist/src/ingestion/tutorial-extractor.js +170 -0
  30. package/dist/src/ingestion/types.js +3 -0
  31. package/dist/src/jobs/manager.js +305 -0
  32. package/dist/src/jobs/runner.js +806 -0
  33. package/dist/src/jobs/store.js +102 -0
  34. package/dist/src/jobs/types.js +30 -0
  35. package/dist/src/jobs/worker.js +97 -0
  36. package/dist/src/learning/engine.js +356 -0
  37. package/dist/src/learning/index.js +9 -0
  38. package/dist/src/learning/locator-policy.js +120 -0
  39. package/dist/src/learning/pattern-policy.js +89 -0
  40. package/dist/src/learning/recovery-policy.js +116 -0
  41. package/dist/src/learning/sensor-policy.js +115 -0
  42. package/dist/src/learning/timing-model.js +204 -0
  43. package/dist/src/learning/topology-policy.js +90 -0
  44. package/dist/src/learning/types.js +9 -0
  45. package/dist/src/logging/timeline-logger.js +48 -0
  46. package/dist/src/mcp/mcp-stdio-server.js +464 -0
  47. package/dist/src/mcp/server.js +363 -0
  48. package/dist/src/mcp-entry.js +60 -0
  49. package/dist/src/memory/playbook-seeds.js +200 -0
  50. package/dist/src/memory/recall.js +222 -0
  51. package/dist/src/memory/research.js +104 -0
  52. package/dist/src/memory/seeds.js +101 -0
  53. package/dist/src/memory/service.js +446 -0
  54. package/dist/src/memory/session.js +169 -0
  55. package/dist/src/memory/store.js +451 -0
  56. package/{src/runtime/locator-cache.ts → dist/src/memory/types.js} +1 -17
  57. package/dist/src/monitor/codex-monitor.js +382 -0
  58. package/dist/src/monitor/task-queue.js +97 -0
  59. package/dist/src/monitor/types.js +62 -0
  60. package/dist/src/native/bridge-client.js +412 -0
  61. package/{src/native/macos-bridge-client.ts → dist/src/native/macos-bridge-client.js} +0 -1
  62. package/dist/src/observer/state.js +199 -0
  63. package/dist/src/observer/types.js +43 -0
  64. package/dist/src/orchestrator/state.js +68 -0
  65. package/dist/src/orchestrator/types.js +22 -0
  66. package/dist/src/perception/ax-source.js +162 -0
  67. package/dist/src/perception/cdp-source.js +162 -0
  68. package/dist/src/perception/coordinator.js +771 -0
  69. package/dist/src/perception/frame-differ.js +287 -0
  70. package/dist/src/perception/index.js +22 -0
  71. package/dist/src/perception/manager.js +199 -0
  72. package/dist/src/perception/types.js +47 -0
  73. package/dist/src/perception/vision-source.js +399 -0
  74. package/dist/src/planner/deterministic.js +298 -0
  75. package/dist/src/planner/executor.js +870 -0
  76. package/dist/src/planner/goal-store.js +92 -0
  77. package/dist/src/planner/index.js +21 -0
  78. package/dist/src/planner/planner.js +520 -0
  79. package/dist/src/planner/tool-registry.js +71 -0
  80. package/dist/src/planner/types.js +22 -0
  81. package/dist/src/platform/explorer.js +213 -0
  82. package/dist/src/platform/help-center-markdown.js +527 -0
  83. package/dist/src/platform/learner.js +257 -0
  84. package/dist/src/playbook/engine.js +486 -0
  85. package/dist/src/playbook/index.js +20 -0
  86. package/dist/src/playbook/mcp-recorder.js +204 -0
  87. package/dist/src/playbook/recorder.js +536 -0
  88. package/dist/src/playbook/runner.js +408 -0
  89. package/dist/src/playbook/store.js +312 -0
  90. package/dist/src/playbook/types.js +17 -0
  91. package/dist/src/recovery/detectors.js +156 -0
  92. package/dist/src/recovery/engine.js +327 -0
  93. package/dist/src/recovery/index.js +20 -0
  94. package/dist/src/recovery/strategies.js +274 -0
  95. package/dist/src/recovery/types.js +20 -0
  96. package/dist/src/runtime/accessibility-adapter.js +430 -0
  97. package/dist/src/runtime/app-adapter.js +64 -0
  98. package/dist/src/runtime/applescript-adapter.js +305 -0
  99. package/dist/src/runtime/ax-role-map.js +96 -0
  100. package/dist/src/runtime/browser-adapter.js +52 -0
  101. package/dist/src/runtime/cdp-chrome-adapter.js +521 -0
  102. package/dist/src/runtime/composite-adapter.js +221 -0
  103. package/dist/src/runtime/execution-contract.js +159 -0
  104. package/dist/src/runtime/executor.js +286 -0
  105. package/dist/src/runtime/locator-cache.js +50 -0
  106. package/dist/src/runtime/planning-loop.js +63 -0
  107. package/dist/src/runtime/service.js +432 -0
  108. package/dist/src/runtime/session-manager.js +63 -0
  109. package/dist/src/runtime/state-observer.js +121 -0
  110. package/dist/src/runtime/vision-adapter.js +225 -0
  111. package/dist/src/state/app-map-types.js +72 -0
  112. package/dist/src/state/app-map.js +1974 -0
  113. package/dist/src/state/entity-tracker.js +108 -0
  114. package/dist/src/state/fusion.js +96 -0
  115. package/dist/src/state/index.js +21 -0
  116. package/dist/src/state/ladder-generator.js +236 -0
  117. package/dist/src/state/persistence.js +156 -0
  118. package/dist/src/state/types.js +17 -0
  119. package/dist/src/state/world-model.js +1456 -0
  120. package/dist/src/supervisor/locks.js +186 -0
  121. package/dist/src/supervisor/supervisor.js +403 -0
  122. package/dist/src/supervisor/types.js +30 -0
  123. package/dist/src/test-mcp-protocol.js +154 -0
  124. package/dist/src/types.js +17 -0
  125. package/dist/src/util/atomic-write.js +133 -0
  126. package/dist/src/util/sanitize.js +146 -0
  127. package/dist-app-maps/com.figma.Desktop.json +959 -0
  128. package/dist-app-maps/com.hnc.Discord.json +1146 -0
  129. package/dist-app-maps/notion.id.json +2831 -0
  130. package/dist-playbooks/canva-screenhand-carousel.json +445 -0
  131. package/dist-playbooks/codex-desktop.json +76 -0
  132. package/dist-playbooks/competitor-research-stack.json +122 -0
  133. package/dist-playbooks/davinci-color-grade.json +153 -0
  134. package/dist-playbooks/davinci-edit-timeline.json +162 -0
  135. package/dist-playbooks/davinci-render.json +114 -0
  136. package/dist-playbooks/devto.json +52 -0
  137. package/dist-playbooks/discord.json +41 -0
  138. package/dist-playbooks/google-flow-create-project.json +59 -0
  139. package/dist-playbooks/google-flow-edit-image.json +90 -0
  140. package/dist-playbooks/google-flow-edit-video.json +90 -0
  141. package/dist-playbooks/google-flow-generate-image.json +68 -0
  142. package/dist-playbooks/google-flow-generate-video.json +191 -0
  143. package/dist-playbooks/google-flow-open-project.json +48 -0
  144. package/dist-playbooks/google-flow-open-scenebuilder.json +64 -0
  145. package/dist-playbooks/google-flow-search-assets.json +64 -0
  146. package/dist-playbooks/instagram.json +57 -0
  147. package/dist-playbooks/linkedin.json +52 -0
  148. package/dist-playbooks/n8n.json +43 -0
  149. package/dist-playbooks/reddit.json +52 -0
  150. package/dist-playbooks/threads.json +59 -0
  151. package/dist-playbooks/x-twitter.json +59 -0
  152. package/dist-playbooks/youtube.json +59 -0
  153. package/dist-references/canva.json +646 -0
  154. package/dist-references/codex-desktop.json +305 -0
  155. package/dist-references/davinci-resolve-keyboard.json +594 -0
  156. package/dist-references/davinci-resolve-menu-map.json +1139 -0
  157. package/dist-references/davinci-resolve-menus-batch1.json +116 -0
  158. package/dist-references/davinci-resolve-menus-batch2.json +372 -0
  159. package/dist-references/davinci-resolve-menus-batch3.json +330 -0
  160. package/dist-references/davinci-resolve-menus-batch4.json +297 -0
  161. package/dist-references/davinci-resolve-shortcuts.json +333 -0
  162. package/dist-references/devto.json +317 -0
  163. package/dist-references/discord.json +549 -0
  164. package/dist-references/figma.json +1186 -0
  165. package/dist-references/finder.json +146 -0
  166. package/dist-references/google-ads-transparency.json +95 -0
  167. package/dist-references/google-flow.json +649 -0
  168. package/dist-references/instagram.json +341 -0
  169. package/dist-references/linkedin.json +324 -0
  170. package/dist-references/meta-ad-library.json +86 -0
  171. package/dist-references/n8n.json +387 -0
  172. package/dist-references/notes.json +27 -0
  173. package/dist-references/notion.json +163 -0
  174. package/dist-references/reddit.json +341 -0
  175. package/dist-references/threads.json +337 -0
  176. package/dist-references/x-twitter.json +403 -0
  177. package/dist-references/youtube.json +373 -0
  178. package/native/macos-bridge/Package.swift +1 -0
  179. package/native/macos-bridge/Sources/AccessibilityBridge.swift +257 -36
  180. package/native/macos-bridge/Sources/AppManagement.swift +212 -2
  181. package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +348 -53
  182. package/native/macos-bridge/Sources/StreamCapture.swift +136 -0
  183. package/native/macos-bridge/Sources/VisionBridge.swift +165 -7
  184. package/native/macos-bridge/Sources/main.swift +169 -16
  185. package/native/windows-bridge/Program.cs +5 -0
  186. package/native/windows-bridge/ScreenCapture.cs +124 -0
  187. package/package.json +29 -4
  188. package/scripts/postinstall.cjs +127 -0
  189. package/.claude/commands/automate.md +0 -28
  190. package/.claude/commands/debug-ui.md +0 -19
  191. package/.claude/commands/screenshot.md +0 -15
  192. package/.github/FUNDING.yml +0 -1
  193. package/.github/ISSUE_TEMPLATE/bug_report.md +0 -27
  194. package/.github/ISSUE_TEMPLATE/feature_request.md +0 -20
  195. package/.mcp.json +0 -8
  196. package/DESKTOP_MCP_GUIDE.md +0 -92
  197. package/SECURITY.md +0 -44
  198. package/docs/architecture.md +0 -47
  199. package/install-skills.sh +0 -19
  200. package/mcp-bridge.ts +0 -271
  201. package/mcp-desktop.ts +0 -1221
  202. package/playbooks/instagram.json +0 -41
  203. package/playbooks/instagram_v2.json +0 -201
  204. package/playbooks/x_v1.json +0 -211
  205. package/scripts/devpost-live-loop.mjs +0 -421
  206. package/src/logging/timeline-logger.ts +0 -55
  207. package/src/mcp/server.ts +0 -449
  208. package/src/memory/recall.ts +0 -191
  209. package/src/memory/research.ts +0 -146
  210. package/src/memory/seeds.ts +0 -123
  211. package/src/memory/session.ts +0 -201
  212. package/src/memory/store.ts +0 -434
  213. package/src/memory/types.ts +0 -69
  214. package/src/native/bridge-client.ts +0 -239
  215. package/src/runtime/accessibility-adapter.ts +0 -487
  216. package/src/runtime/app-adapter.ts +0 -169
  217. package/src/runtime/applescript-adapter.ts +0 -376
  218. package/src/runtime/ax-role-map.ts +0 -102
  219. package/src/runtime/browser-adapter.ts +0 -129
  220. package/src/runtime/cdp-chrome-adapter.ts +0 -676
  221. package/src/runtime/composite-adapter.ts +0 -274
  222. package/src/runtime/executor.ts +0 -396
  223. package/src/runtime/planning-loop.ts +0 -81
  224. package/src/runtime/service.ts +0 -448
  225. package/src/runtime/session-manager.ts +0 -50
  226. package/src/runtime/state-observer.ts +0 -136
  227. package/src/runtime/vision-adapter.ts +0 -297
  228. package/src/types.ts +0 -297
  229. package/tests/bridge-client.test.ts +0 -176
  230. package/tests/browser-stealth.test.ts +0 -210
  231. package/tests/composite-adapter.test.ts +0 -64
  232. package/tests/mcp-server.test.ts +0 -151
  233. package/tests/memory-recall.test.ts +0 -339
  234. package/tests/memory-research.test.ts +0 -159
  235. package/tests/memory-seeds.test.ts +0 -120
  236. package/tests/memory-store.test.ts +0 -392
  237. package/tests/types.test.ts +0 -92
  238. package/tsconfig.check.json +0 -17
  239. package/tsconfig.json +0 -19
  240. package/vitest.config.ts +0 -8
  241. package/{playbooks → dist-references}/devpost.json +0 -0
@@ -1,13 +1,102 @@
1
1
  import Foundation
2
2
  import Vision
3
3
  import AppKit
4
+ import CoreML
4
5
 
5
6
  class VisionBridge {
6
7
 
8
+ // MARK: - YOLO Element Detection
9
+
10
+ private var yoloModel: VNCoreMLModel?
11
+ private let yoloClassNames = ["button", "field", "heading", "iframe", "image", "label", "link", "text"]
12
+
13
+ /// Load the YOLO CoreML model for UI element detection.
14
+ /// Called lazily on first detectElements call.
15
+ private func ensureYoloModel() throws {
16
+ if yoloModel != nil { return }
17
+
18
+ // Look for model relative to the binary or via known paths
19
+ let execPath = URL(fileURLWithPath: CommandLine.arguments[0]).resolvingSymlinksInPath()
20
+ let execDir = execPath.deletingLastPathComponent()
21
+ let possiblePaths = [
22
+ execDir.appendingPathComponent("../Resources/ui-elements.mlpackage"),
23
+ execDir.appendingPathComponent("../../Resources/ui-elements.mlpackage"),
24
+ execDir.appendingPathComponent("Resources/ui-elements.mlpackage"),
25
+ URL(fileURLWithPath: "/tmp/vins-yolo/web-ui-8cls.mlpackage"), // Development fallback
26
+ ]
27
+
28
+ for modelURL in possiblePaths {
29
+ if FileManager.default.fileExists(atPath: modelURL.path) {
30
+ let config = MLModelConfiguration()
31
+ config.computeUnits = .all // Use ANE when available
32
+ let coreMLModel = try MLModel(contentsOf: modelURL, configuration: config)
33
+ yoloModel = try VNCoreMLModel(for: coreMLModel)
34
+ return
35
+ }
36
+ }
37
+
38
+ throw BridgeError.general("YOLO model not found")
39
+ }
40
+
41
+ /// Detect UI elements in an image using the YOLO CoreML model.
42
+ /// Returns bounding boxes with class labels and confidence scores.
43
+ func detectElements(imagePath: String, confidence: Double = 0.25) throws -> [[String: Any]] {
44
+ try ensureYoloModel()
45
+ guard let model = yoloModel else {
46
+ throw BridgeError.general("YOLO model not loaded")
47
+ }
48
+
49
+ let url = URL(fileURLWithPath: imagePath)
50
+ guard let image = NSImage(contentsOf: url),
51
+ let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
52
+ throw BridgeError.general("Failed to load image at \(imagePath)")
53
+ }
54
+
55
+ let imageWidth = CGFloat(cgImage.width)
56
+ let imageHeight = CGFloat(cgImage.height)
57
+
58
+ let request = VNCoreMLRequest(model: model)
59
+ request.imageCropAndScaleOption = .scaleFit
60
+
61
+ let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
62
+ try handler.perform([request])
63
+
64
+ guard let observations = request.results as? [VNRecognizedObjectObservation] else {
65
+ return []
66
+ }
67
+
68
+ var results: [[String: Any]] = []
69
+ for obs in observations {
70
+ guard obs.confidence >= Float(confidence) else { continue }
71
+
72
+ let bbox = obs.boundingBox
73
+ // Convert from Vision normalized coords (origin bottom-left) to screen coords
74
+ let x = bbox.origin.x * imageWidth
75
+ let y = (1 - bbox.origin.y - bbox.height) * imageHeight
76
+ let width = bbox.width * imageWidth
77
+ let height = bbox.height * imageHeight
78
+
79
+ let className = obs.labels.first?.identifier ?? "unknown"
80
+
81
+ results.append([
82
+ "class": className,
83
+ "confidence": Double(obs.confidence),
84
+ "bounds": [
85
+ "x": Double(x),
86
+ "y": Double(y),
87
+ "width": Double(width),
88
+ "height": Double(height),
89
+ ] as [String: Any],
90
+ ] as [String: Any])
91
+ }
92
+
93
+ return results
94
+ }
95
+
7
96
  /// Perform OCR on an image, optionally searching for specific text.
8
97
  /// Returns all recognized text with bounding boxes.
9
- func findText(imagePath: String, searchText: String?) throws -> [[String: Any]] {
10
- let results = try performOCR(imagePath: imagePath)
98
+ func findText(imagePath: String, searchText: String?, mode: String = "accurate") throws -> [[String: Any]] {
99
+ let results = try performOCR(imagePath: imagePath, mode: mode)
11
100
 
12
101
  guard let search = searchText?.lowercased() else {
13
102
  return results
@@ -20,8 +109,9 @@ class VisionBridge {
20
109
  }
21
110
 
22
111
  /// Full OCR of an image — returns all recognized text.
23
- func ocr(imagePath: String) throws -> [String: Any] {
24
- let results = try performOCR(imagePath: imagePath)
112
+ /// mode: "fast" (perception loop, 10x faster) or "accurate" (tool actions, default)
113
+ func ocr(imagePath: String, mode: String = "accurate") throws -> [String: Any] {
114
+ let results = try performOCR(imagePath: imagePath, mode: mode)
25
115
  let fullText = results.compactMap { $0["text"] as? String }.joined(separator: "\n")
26
116
  return [
27
117
  "text": fullText,
@@ -29,7 +119,52 @@ class VisionBridge {
29
119
  ]
30
120
  }
31
121
 
32
- private func performOCR(imagePath: String) throws -> [[String: Any]] {
122
+ /// OCR a specific region of a window — captures the window, crops to the ROI,
123
+ /// and runs text recognition on just that region. Returns bounds in window coordinates.
124
+ func ocrRegion(windowId: Int, region: [String: Double], mode: String = "accurate") throws -> [String: Any] {
125
+ let roiX = region["x"] ?? 0
126
+ let roiY = region["y"] ?? 0
127
+ let roiW = region["width"] ?? 0
128
+ let roiH = region["height"] ?? 0
129
+
130
+ // Capture the full window
131
+ guard let fullImage = CGWindowListCreateImage(
132
+ .null, .optionIncludingWindow, CGWindowID(windowId), [.bestResolution, .boundsIgnoreFraming]
133
+ ) else {
134
+ throw BridgeError.general("CGWindowListCreateImage returned nil for window \(windowId)")
135
+ }
136
+
137
+ // Crop to the ROI (CGImage coordinates have origin top-left, same as our ROI)
138
+ let cropRect = CGRect(x: roiX, y: roiY, width: roiW, height: roiH)
139
+ .intersection(CGRect(x: 0, y: 0, width: fullImage.width, height: fullImage.height))
140
+
141
+ guard !cropRect.isEmpty,
142
+ let cropped = fullImage.cropping(to: cropRect) else {
143
+ return ["text": "", "regions": [] as [Any]]
144
+ }
145
+
146
+ // Run OCR on cropped image
147
+ let results = try performOCROnImage(cropped, mode: mode)
148
+
149
+ // Translate bounds from cropped-image coordinates back to window coordinates
150
+ let adjustedResults: [[String: Any]] = results.map { entry in
151
+ var adjusted = entry
152
+ if var bounds = entry["bounds"] as? [String: Double] {
153
+ bounds["x"] = (bounds["x"] ?? 0) + roiX
154
+ bounds["y"] = (bounds["y"] ?? 0) + roiY
155
+ adjusted["bounds"] = bounds
156
+ }
157
+ return adjusted
158
+ }
159
+
160
+ let fullText = adjustedResults.compactMap { $0["text"] as? String }.joined(separator: "\n")
161
+ return [
162
+ "text": fullText,
163
+ "regions": adjustedResults,
164
+ ]
165
+ }
166
+
167
+ private func performOCR(imagePath: String, mode: String = "accurate") throws -> [[String: Any]] {
33
168
  let url = URL(fileURLWithPath: imagePath)
34
169
 
35
170
  guard let image = NSImage(contentsOf: url),
@@ -37,12 +172,35 @@ class VisionBridge {
37
172
  throw BridgeError.general("Failed to load image at \(imagePath)")
38
173
  }
39
174
 
175
+ return try performOCROnImage(cgImage, mode: mode)
176
+ }
177
+
178
+ private func performOCROnImage(_ cgImage: CGImage, mode: String = "accurate") throws -> [[String: Any]] {
40
179
  let imageWidth = CGFloat(cgImage.width)
41
180
  let imageHeight = CGFloat(cgImage.height)
42
181
 
43
182
  let request = VNRecognizeTextRequest()
44
- request.recognitionLevel = .accurate
45
- request.usesLanguageCorrection = true
183
+
184
+ if mode == "fast" {
185
+ // FAST mode: 10x faster (~60ms vs ~631ms), used for perception loop
186
+ // Trades ~20% text coverage for speed — acceptable for frame diffing
187
+ request.recognitionLevel = .fast
188
+ request.usesLanguageCorrection = false
189
+ request.recognitionLanguages = ["en-US"]
190
+ } else {
191
+ // ACCURATE mode: full precision, used for tool actions (ocr, click_text)
192
+ request.recognitionLevel = .accurate
193
+ request.usesLanguageCorrection = true
194
+ if #available(macOS 13.0, *) {
195
+ request.automaticallyDetectsLanguage = true
196
+ }
197
+ let supportedLangs = try? request.supportedRecognitionLanguages()
198
+ if let supported = supportedLangs {
199
+ request.recognitionLanguages = supported
200
+ } else {
201
+ request.recognitionLanguages = ["en-US", "zh-Hans", "zh-Hant", "ja", "ko", "hi", "ar", "de", "fr", "es", "pt", "it", "ru"]
202
+ }
203
+ }
46
204
 
47
205
  let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
48
206
  try handler.perform([request])
@@ -4,6 +4,40 @@ import Foundation
4
4
  /// Reads JSON requests from stdin (one per line), dispatches to the appropriate bridge,
5
5
  /// and writes JSON responses to stdout (one per line).
6
6
 
7
+ // MARK: - Signal Handlers
8
+ // Catch fatal signals (SIGSEGV, SIGBUS, SIGABRT) that CGWindowListCreateImage
9
+ // can trigger on GPU-heavy windows. Write an error to stderr so the Node.js
10
+ // bridge client can detect the crash, then exit cleanly.
11
+ func installSignalHandlers() {
12
+ // Fatal signals — crash reporting
13
+ let fatalSignals: [Int32] = [SIGSEGV, SIGBUS, SIGABRT]
14
+ for sig in fatalSignals {
15
+ signal(sig) { signum in
16
+ let msg = "Bridge fatal signal \(signum) — restarting\n"
17
+ msg.withCString { ptr in
18
+ _ = Darwin.write(STDERR_FILENO, ptr, Int(strlen(ptr)))
19
+ }
20
+ _exit(128 + signum)
21
+ }
22
+ }
23
+
24
+ // Graceful shutdown signals — notify Node.js BridgeClient before exit
25
+ let gracefulSignals: [Int32] = [SIGTERM, SIGINT]
26
+ for sig in gracefulSignals {
27
+ signal(sig) { signum in
28
+ let reason = signum == SIGTERM ? "SIGTERM" : "SIGINT"
29
+ let notification = "{\"jsonrpc\":\"2.0\",\"method\":\"bridge.shutdown\",\"params\":{\"reason\":\"\(reason)\"}}\n"
30
+ notification.withCString { ptr in
31
+ _ = Darwin.write(STDOUT_FILENO, ptr, Int(strlen(ptr)))
32
+ }
33
+ // Flush stdout
34
+ fflush(stdout)
35
+ _exit(0)
36
+ }
37
+ }
38
+ }
39
+ installSignalHandlers()
40
+
7
41
  struct JsonRpcRequest: Codable {
8
42
  let id: Int
9
43
  let method: String
@@ -117,7 +151,8 @@ let accessibilityBridge = AccessibilityBridge()
117
151
  let observerBridge = ObserverBridge()
118
152
  let coreGraphicsBridge = CoreGraphicsBridge()
119
153
  let visionBridge = VisionBridge()
120
- let appManagement = AppManagement()
154
+ let appManagement = AppManagement(ax: accessibilityBridge)
155
+ let streamCapture = StreamCapture()
121
156
 
122
157
  // MARK: - Method Dispatch
123
158
 
@@ -149,6 +184,15 @@ func dispatch(method: String, params: [String: AnyCodable]?) throws -> Any {
149
184
  case "app.frontmost":
150
185
  return appManagement.frontmostApp()
151
186
 
187
+ // Window management (AX-enriched)
188
+ case "window.list":
189
+ return appManagement.listWindowsWithAX()
190
+
191
+ case "window.focus":
192
+ let windowId: Int = try requiredParam(params, "windowId")
193
+ try appManagement.focusWindow(windowId: windowId)
194
+ return ["ok": true]
195
+
152
196
  // Accessibility
153
197
  case "ax.findElement":
154
198
  let pid: Int = try requiredParam(params, "pid")
@@ -157,21 +201,29 @@ func dispatch(method: String, params: [String: AnyCodable]?) throws -> Any {
157
201
  let value: String? = param(params, "value")
158
202
  let identifier: String? = param(params, "identifier")
159
203
  let exact: Bool = param(params, "exact") ?? true
204
+ let maxDepth: Int = param(params, "maxDepth") ?? 30
160
205
  return try accessibilityBridge.findElement(
161
206
  pid: pid_t(pid), role: role, title: title, value: value,
162
- identifier: identifier, exact: exact
207
+ identifier: identifier, exact: exact, maxDepth: maxDepth
163
208
  )
164
209
 
165
210
  case "ax.getElementTree":
166
211
  let pid: Int = try requiredParam(params, "pid")
167
212
  let maxDepth: Int = param(params, "maxDepth") ?? 5
168
- return try accessibilityBridge.getElementTree(pid: pid_t(pid), maxDepth: maxDepth)
213
+ let windowId: Int? = param(params, "windowId")
214
+ return try accessibilityBridge.getElementTree(pid: pid_t(pid), maxDepth: maxDepth, windowId: windowId)
215
+
216
+ case "ax.getMenuBar":
217
+ let pid: Int = try requiredParam(params, "pid")
218
+ let maxDepth: Int = param(params, "maxDepth") ?? 10
219
+ return try accessibilityBridge.getMenuBarTree(pid: pid_t(pid), maxDepth: maxDepth)
169
220
 
170
221
  case "ax.performAction":
171
222
  let pid: Int = try requiredParam(params, "pid")
172
223
  let elementPath: [Int] = try requiredParam(params, "elementPath")
173
224
  let action: String = param(params, "action") ?? "AXPress"
174
- try accessibilityBridge.performAction(pid: pid_t(pid), elementPath: elementPath, action: action)
225
+ let expectedTitle: String? = param(params, "expectedTitle")
226
+ try accessibilityBridge.performAction(pid: pid_t(pid), elementPath: elementPath, action: action, expectedTitle: expectedTitle)
175
227
  return ["ok": true]
176
228
 
177
229
  case "ax.setElementValue":
@@ -210,13 +262,16 @@ func dispatch(method: String, params: [String: AnyCodable]?) throws -> Any {
210
262
  let y: Double = try requiredParam(params, "y")
211
263
  let button: String = param(params, "button") ?? "left"
212
264
  let clickCount: Int = param(params, "clickCount") ?? 1
213
- coreGraphicsBridge.mouseClick(x: x, y: y, button: button, clickCount: clickCount)
265
+ let modifiers: [String] = param(params, "modifiers") ?? []
266
+ let mcTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
267
+ coreGraphicsBridge.mouseClick(x: x, y: y, button: button, clickCount: clickCount, modifiers: modifiers, targetPid: mcTargetPid)
214
268
  return ["ok": true]
215
269
 
216
270
  case "cg.mouseMove":
217
271
  let x: Double = try requiredParam(params, "x")
218
272
  let y: Double = try requiredParam(params, "y")
219
- coreGraphicsBridge.mouseMove(x: x, y: y)
273
+ let mmTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
274
+ coreGraphicsBridge.mouseMove(x: x, y: y, targetPid: mmTargetPid)
220
275
  return ["ok": true]
221
276
 
222
277
  case "cg.mouseDrag":
@@ -224,7 +279,24 @@ func dispatch(method: String, params: [String: AnyCodable]?) throws -> Any {
224
279
  let fromY: Double = try requiredParam(params, "fromY")
225
280
  let toX: Double = try requiredParam(params, "toX")
226
281
  let toY: Double = try requiredParam(params, "toY")
227
- coreGraphicsBridge.mouseDrag(fromX: fromX, fromY: fromY, toX: toX, toY: toY)
282
+ let dragModifiers: [String] = param(params, "modifiers") ?? []
283
+ let mdTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
284
+ coreGraphicsBridge.mouseDrag(fromX: fromX, fromY: fromY, toX: toX, toY: toY, modifiers: dragModifiers, targetPid: mdTargetPid)
285
+ return ["ok": true]
286
+
287
+ case "cg.mousePressAndHold":
288
+ let phX: Double = try requiredParam(params, "x")
289
+ let phY: Double = try requiredParam(params, "y")
290
+ let phDuration: Int = param(params, "durationMs") ?? 500
291
+ let phTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
292
+ coreGraphicsBridge.mousePressAndHold(x: phX, y: phY, durationMs: phDuration, targetPid: phTargetPid)
293
+ return ["ok": true]
294
+
295
+ case "cg.keyPressAndHold":
296
+ let kphKey: String = try requiredParam(params, "key")
297
+ let kphDuration: Int = param(params, "durationMs") ?? 500
298
+ let kphTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
299
+ coreGraphicsBridge.keyPressAndHold(key: kphKey, durationMs: kphDuration, targetPid: kphTargetPid)
228
300
  return ["ok": true]
229
301
 
230
302
  case "cg.mouseFlick":
@@ -232,17 +304,20 @@ func dispatch(method: String, params: [String: AnyCodable]?) throws -> Any {
232
304
  let fyF: Double = try requiredParam(params, "fromY")
233
305
  let txF: Double = try requiredParam(params, "toX")
234
306
  let tyF: Double = try requiredParam(params, "toY")
235
- coreGraphicsBridge.mouseFlick(fromX: fxF, fromY: fyF, toX: txF, toY: tyF)
307
+ let mfTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
308
+ coreGraphicsBridge.mouseFlick(fromX: fxF, fromY: fyF, toX: txF, toY: tyF, targetPid: mfTargetPid)
236
309
  return ["ok": true]
237
310
 
238
311
  case "cg.keyCombo":
239
312
  let keys: [String] = try requiredParam(params, "keys")
240
- coreGraphicsBridge.keyCombo(keys: keys)
313
+ let kcTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
314
+ coreGraphicsBridge.keyCombo(keys: keys, targetPid: kcTargetPid)
241
315
  return ["ok": true]
242
316
 
243
317
  case "cg.typeText":
244
318
  let text: String = try requiredParam(params, "text")
245
- coreGraphicsBridge.typeText(text: text)
319
+ let ttTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
320
+ coreGraphicsBridge.typeText(text: text, targetPid: ttTargetPid)
246
321
  return ["ok": true]
247
322
 
248
323
  case "cg.captureScreen":
@@ -251,25 +326,89 @@ func dispatch(method: String, params: [String: AnyCodable]?) throws -> Any {
251
326
 
252
327
  case "cg.captureWindow":
253
328
  let windowId: Int = try requiredParam(params, "windowId")
254
- return try coreGraphicsBridge.captureWindow(windowId: windowId)
329
+ let safeCLI: Bool = param(params, "safeCLI") ?? false
330
+ return try coreGraphicsBridge.captureWindow(windowId: windowId, safeCLI: safeCLI)
331
+
332
+ case "cg.captureWindowBuffer":
333
+ let windowId: Int = try requiredParam(params, "windowId")
334
+ let safeCLI: Bool = param(params, "safeCLI") ?? false
335
+ return try coreGraphicsBridge.captureWindowBuffer(windowId: windowId, safeCLI: safeCLI)
255
336
 
256
337
  case "cg.scroll":
257
338
  let x: Double = try requiredParam(params, "x")
258
339
  let y: Double = try requiredParam(params, "y")
259
340
  let deltaX: Int = param(params, "deltaX") ?? 0
260
341
  let deltaY: Int = param(params, "deltaY") ?? 0
261
- coreGraphicsBridge.scroll(x: x, y: y, deltaX: deltaX, deltaY: deltaY)
342
+ let scTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
343
+ coreGraphicsBridge.scroll(x: x, y: y, deltaX: deltaX, deltaY: deltaY, targetPid: scTargetPid)
262
344
  return ["ok": true]
263
345
 
264
346
  // Vision
265
347
  case "vision.findText":
266
348
  let imagePath: String = try requiredParam(params, "imagePath")
267
349
  let searchText: String? = param(params, "searchText")
268
- return try visionBridge.findText(imagePath: imagePath, searchText: searchText)
350
+ let ftMode: String = param(params, "mode") ?? "accurate"
351
+ return try visionBridge.findText(imagePath: imagePath, searchText: searchText, mode: ftMode)
269
352
 
270
353
  case "vision.ocr":
271
354
  let imagePath: String = try requiredParam(params, "imagePath")
272
- return try visionBridge.ocr(imagePath: imagePath)
355
+ let ocrMode: String = param(params, "mode") ?? "accurate"
356
+ return try visionBridge.ocr(imagePath: imagePath, mode: ocrMode)
357
+
358
+ case "vision.ocrRegion":
359
+ let windowId: Int = try requiredParam(params, "windowId")
360
+ let region: [String: Double] = try requiredParam(params, "region")
361
+ let ocrRegionMode: String = param(params, "mode") ?? "accurate"
362
+ return try visionBridge.ocrRegion(windowId: windowId, region: region, mode: ocrRegionMode)
363
+
364
+ case "vision.detectElements":
365
+ let imagePath: String = try requiredParam(params, "imagePath")
366
+ let confidence: Double = param(params, "confidence") ?? 0.25
367
+ let elements = try visionBridge.detectElements(imagePath: imagePath, confidence: confidence)
368
+ return ["elements": elements, "count": elements.count]
369
+
370
+ // Stream capture — continuous SCStream for fast perception
371
+ case "vision.startStream":
372
+ let windowId: Int = try requiredParam(params, "windowId")
373
+ let fps: Int = param(params, "fps") ?? 30
374
+ let sem = DispatchSemaphore(value: 0)
375
+ var streamError: Error?
376
+ Task {
377
+ do {
378
+ try await streamCapture.start(windowId: windowId, fps: fps)
379
+ } catch {
380
+ streamError = error
381
+ }
382
+ sem.signal()
383
+ }
384
+ sem.wait()
385
+ if let err = streamError { throw err }
386
+ return ["ok": true, "fps": fps]
387
+
388
+ case "vision.stopStream":
389
+ let sem = DispatchSemaphore(value: 0)
390
+ Task {
391
+ await streamCapture.stop()
392
+ sem.signal()
393
+ }
394
+ sem.wait()
395
+ return ["ok": true]
396
+
397
+ case "vision.streamStatus":
398
+ let running = streamCapture.isRunning
399
+ if running, let info = streamCapture.getLatestInfo() {
400
+ return ["running": true, "path": info["path"]!, "width": info["width"]!, "height": info["height"]!, "ageMs": info["ageMs"]!, "frameCount": info["frameCount"]!]
401
+ }
402
+ return ["running": running]
403
+
404
+ case "vision.latestFrame":
405
+ guard streamCapture.isRunning else {
406
+ throw BridgeError.general("Stream not running")
407
+ }
408
+ guard let info = streamCapture.getLatestInfo() else {
409
+ throw BridgeError.general("No frame captured yet")
410
+ }
411
+ return info
273
412
 
274
413
  default:
275
414
  throw BridgeError.general("Unknown method: \(method)")
@@ -334,9 +473,23 @@ while let line = readLine() {
334
473
  writeResponse(response)
335
474
  }
336
475
  } catch {
337
- // Malformed JSON — write error with id=0
476
+ // Malformed JSON — try to extract id from raw string
477
+ var extractedId = 0
478
+ if let idRange = line.range(of: "\"id\"\\s*:\\s*(\\d+)", options: .regularExpression) {
479
+ let match = line[idRange]
480
+ if let digitRange = match.range(of: "\\d+$", options: .regularExpression) {
481
+ extractedId = Int(match[digitRange]) ?? 0
482
+ }
483
+ }
484
+ if extractedId == 0 {
485
+ // No id could be extracted — log to stderr so Node.js BridgeClient can detect it
486
+ let stderrMsg = "Bridge parse error (no id): \(error.localizedDescription)\n"
487
+ stderrMsg.withCString { ptr in
488
+ _ = Darwin.write(STDERR_FILENO, ptr, Int(strlen(ptr)))
489
+ }
490
+ }
338
491
  let response = JsonRpcResponse(
339
- id: 0,
492
+ id: extractedId,
340
493
  result: nil,
341
494
  error: JsonRpcError(code: -32700, message: "Parse error: \(error.localizedDescription)")
342
495
  )
@@ -156,6 +156,8 @@ class Program
156
156
  Param<Dictionary<string, double>>(p, "region")),
157
157
  "cg.captureWindow" => _screenCapture.CaptureWindow(
158
158
  RequiredParam<int>(p, "windowId")),
159
+ "cg.captureWindowBuffer" => _screenCapture.CaptureWindowBuffer(
160
+ RequiredParam<int>(p, "windowId")),
159
161
 
160
162
  // Vision (OCR)
161
163
  "vision.findText" => _screenCapture.FindText(
@@ -163,6 +165,9 @@ class Program
163
165
  Param<string>(p, "searchText")),
164
166
  "vision.ocr" => _screenCapture.Ocr(
165
167
  RequiredParam<string>(p, "imagePath")),
168
+ "vision.ocrRegion" => _screenCapture.OcrRegion(
169
+ RequiredParam<int>(p, "windowId"),
170
+ RequiredParam<Dictionary<string, double>>(p, "region")),
166
171
 
167
172
  _ => throw new BridgeException($"Unknown method: {method}"),
168
173
  };
@@ -157,6 +157,130 @@ class ScreenCapture
157
157
  };
158
158
  }
159
159
 
160
/// <summary>
/// Capture a specific window in memory and return it as a base64-encoded PNG (no disk I/O).
/// Equivalent to macOS captureWindowBuffer.
/// </summary>
/// <param name="windowId">Window handle value; wrapped in an <see cref="IntPtr"/> for the native calls.</param>
/// <returns>
/// A dictionary with <c>base64</c> (the PNG bytes, base64-encoded) plus <c>width</c> and
/// <c>height</c> computed from the rectangle reported by <c>GetWindowRect</c>.
/// </returns>
/// <exception cref="BridgeException">The reported window rectangle has a non-positive width or height.</exception>
public Dictionary<string, object> CaptureWindowBuffer(int windowId)
{
    var hWnd = new IntPtr(windowId);
    GetWindowRect(hWnd, out RECT rect);

    int width = rect.Right - rect.Left;
    int height = rect.Bottom - rect.Top;

    if (width <= 0 || height <= 0)
    {
        throw new BridgeException($"Window {windowId} has invalid dimensions");
    }

    using var bitmap = new Bitmap(width, height, PixelFormat.Format32bppArgb);
    using var graphics = Graphics.FromImage(bitmap);

    // GetHdc/ReleaseHdc must always be paired; the finally block guarantees the
    // release even if the native PrintWindow call throws.
    bool success;
    var hdc = graphics.GetHdc();
    try
    {
        success = PrintWindow(hWnd, hdc, PW_RENDERFULLCONTENT);
    }
    finally
    {
        graphics.ReleaseHdc(hdc);
    }

    if (!success)
    {
        // PrintWindow reported failure: fall back to copying the window's
        // rectangle straight from the screen.
        graphics.CopyFromScreen(rect.Left, rect.Top, 0, 0,
            new Size(width, height), CopyPixelOperation.SourceCopy);
    }

    using var ms = new MemoryStream();
    bitmap.Save(ms, ImageFormat.Png);
    var base64 = Convert.ToBase64String(ms.ToArray());

    return new Dictionary<string, object>
    {
        ["base64"] = base64,
        ["width"] = width,
        ["height"] = height,
    };
}
199
+
200
/// <summary>
/// OCR a specific region of a window. Captures the window, crops it to the ROI, runs OCR on
/// the crop, then translates the reported bounds back to window coordinates.
/// Equivalent to macOS vision.ocrRegion.
/// </summary>
/// <param name="windowId">Window handle value; wrapped in an <see cref="IntPtr"/> for the native calls.</param>
/// <param name="region">
/// ROI relative to the window, read from the keys <c>x</c>, <c>y</c>, <c>width</c> and
/// <c>height</c>. Missing keys default to 0, 0, the window width and the window height.
/// </param>
/// <returns>
/// The dictionary returned by <c>Ocr</c> for the cropped image, with each region's
/// <c>bounds.x</c>/<c>bounds.y</c> offset by the ROI origin and the clamped ROI added as
/// <c>roiX</c>, <c>roiY</c>, <c>roiWidth</c> and <c>roiHeight</c>.
/// </returns>
/// <exception cref="BridgeException">
/// The window has non-positive dimensions, an ROI value is NaN, or the ROI is empty after clamping.
/// </exception>
public Dictionary<string, object> OcrRegion(int windowId, Dictionary<string, double> region)
{
    var hWnd = new IntPtr(windowId);
    GetWindowRect(hWnd, out RECT rect);

    int winWidth = rect.Right - rect.Left;
    int winHeight = rect.Bottom - rect.Top;

    if (winWidth <= 0 || winHeight <= 0)
    {
        throw new BridgeException($"Window {windowId} has invalid dimensions");
    }

    // Reads one ROI component as whole pixels. The value is saturated to the int range
    // before the cast because converting NaN or an out-of-range double to int is unspecified.
    int ReadPixels(string key, double fallback)
    {
        double value = region.GetValueOrDefault(key, fallback);
        if (double.IsNaN(value))
        {
            throw new BridgeException($"ROI value '{key}' is not a number");
        }

        return (int)Math.Max(int.MinValue, Math.Min(int.MaxValue, value));
    }

    int roiX = ReadPixels("x", 0);
    int roiY = ReadPixels("y", 0);
    int roiW = ReadPixels("width", winWidth);
    int roiH = ReadPixels("height", winHeight);

    // Clamp ROI to window bounds
    roiX = Math.Max(0, Math.Min(roiX, winWidth));
    roiY = Math.Max(0, Math.Min(roiY, winHeight));
    roiW = Math.Min(roiW, winWidth - roiX);
    roiH = Math.Min(roiH, winHeight - roiY);

    if (roiW <= 0 || roiH <= 0)
    {
        throw new BridgeException("ROI has zero or negative area after clamping");
    }

    // Capture full window
    using var fullBitmap = new Bitmap(winWidth, winHeight, PixelFormat.Format32bppArgb);
    using (var graphics = Graphics.FromImage(fullBitmap))
    {
        // GetHdc/ReleaseHdc must always be paired, even if PrintWindow throws.
        bool success;
        var hdc = graphics.GetHdc();
        try
        {
            success = PrintWindow(hWnd, hdc, PW_RENDERFULLCONTENT);
        }
        finally
        {
            graphics.ReleaseHdc(hdc);
        }

        if (!success)
        {
            // PrintWindow reported failure: copy the window's rectangle from the screen instead.
            graphics.CopyFromScreen(rect.Left, rect.Top, 0, 0,
                new Size(winWidth, winHeight), CopyPixelOperation.SourceCopy);
        }
    }

    // Crop to ROI
    using var cropped = fullBitmap.Clone(
        new Rectangle(roiX, roiY, roiW, roiH), fullBitmap.PixelFormat);

    var tempPath = Path.Combine(_tempDir, $"ocr_region_{DateTimeOffset.UtcNow.ToUnixTimeMilliseconds()}.png");

    // Adds the ROI origin to one coordinate of a bounds dictionary. Accepts any of the
    // common boxed numeric types; a missing or non-numeric entry is left untouched.
    void Shift(Dictionary<string, object> bounds, string axis, int offset)
    {
        if (!bounds.TryGetValue(axis, out var raw))
        {
            return;
        }

        switch (raw)
        {
            case double d:
                bounds[axis] = d + offset;
                break;
            case float f:
                bounds[axis] = (double)f + offset;
                break;
            case int i:
                bounds[axis] = (double)i + offset;
                break;
            case long l:
                bounds[axis] = (double)l + offset;
                break;
        }
    }

    try
    {
        // Save inside the try so a partially written file is still cleaned up if Save fails.
        cropped.Save(tempPath, ImageFormat.Png);

        var ocrResult = Ocr(tempPath);

        // Translate bounds back to window coordinates. Any enumerable of region
        // dictionaries is accepted; a result without "regions" is returned as-is.
        if (ocrResult.TryGetValue("regions", out var regionsObj) &&
            regionsObj is System.Collections.IEnumerable regions)
        {
            foreach (var regionObj in regions)
            {
                if (regionObj is Dictionary<string, object> entry &&
                    entry.TryGetValue("bounds", out var boundsObj) &&
                    boundsObj is Dictionary<string, object> bounds)
                {
                    Shift(bounds, "x", roiX);
                    Shift(bounds, "y", roiY);
                }
            }
        }

        ocrResult["roiX"] = roiX;
        ocrResult["roiY"] = roiY;
        ocrResult["roiWidth"] = roiW;
        ocrResult["roiHeight"] = roiH;

        return ocrResult;
    }
    finally
    {
        try { File.Delete(tempPath); } catch { /* best-effort cleanup */ }
    }
}
283
+
160
284
  /// <summary>
161
285
  /// OCR an image file. Uses Windows.Media.Ocr when available, falls back to basic implementation.
162
286
  /// </summary>