screenhand 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212) hide show
  1. package/README.md +165 -446
  2. package/bin/darwin-arm64/macos-bridge +0 -0
  3. package/dist/mcp-desktop.js +3615 -400
  4. package/dist/scripts/export-help-center.js +112 -0
  5. package/dist/scripts/marketing-loop.js +117 -0
  6. package/dist/scripts/observer-daemon.js +288 -0
  7. package/dist/scripts/orchestrator-daemon.js +399 -0
  8. package/dist/scripts/threads-campaign.js +208 -0
  9. package/dist/src/community/fetcher.js +109 -0
  10. package/dist/src/community/index.js +6 -0
  11. package/dist/src/community/publisher.js +191 -0
  12. package/dist/src/community/remote-api.js +121 -0
  13. package/dist/src/community/types.js +3 -0
  14. package/dist/src/community/validator.js +95 -0
  15. package/dist/src/context-tracker.js +489 -0
  16. package/dist/src/ingestion/coverage-auditor.js +233 -0
  17. package/dist/src/ingestion/doc-parser.js +164 -0
  18. package/dist/src/ingestion/index.js +8 -0
  19. package/dist/src/ingestion/menu-scanner.js +152 -0
  20. package/dist/src/ingestion/reference-merger.js +186 -0
  21. package/dist/src/ingestion/shortcut-extractor.js +180 -0
  22. package/dist/src/ingestion/tutorial-extractor.js +170 -0
  23. package/dist/src/ingestion/types.js +3 -0
  24. package/dist/src/jobs/manager.js +82 -14
  25. package/dist/src/jobs/runner.js +138 -15
  26. package/dist/src/learning/engine.js +356 -0
  27. package/dist/src/learning/index.js +9 -0
  28. package/dist/src/learning/locator-policy.js +120 -0
  29. package/dist/src/learning/pattern-policy.js +89 -0
  30. package/dist/src/learning/recovery-policy.js +116 -0
  31. package/dist/src/learning/sensor-policy.js +115 -0
  32. package/dist/src/learning/timing-model.js +204 -0
  33. package/dist/src/learning/topology-policy.js +90 -0
  34. package/dist/src/learning/types.js +9 -0
  35. package/dist/src/logging/timeline-logger.js +4 -1
  36. package/dist/src/memory/playbook-seeds.js +200 -0
  37. package/dist/src/memory/recall.js +60 -8
  38. package/dist/src/memory/service.js +30 -5
  39. package/dist/src/memory/store.js +34 -5
  40. package/dist/src/native/bridge-client.js +253 -31
  41. package/dist/src/observer/state.js +199 -0
  42. package/dist/src/observer/types.js +43 -0
  43. package/dist/src/orchestrator/state.js +68 -0
  44. package/dist/src/orchestrator/types.js +22 -0
  45. package/dist/src/perception/ax-source.js +162 -0
  46. package/dist/src/perception/cdp-source.js +162 -0
  47. package/dist/src/perception/coordinator.js +771 -0
  48. package/dist/src/perception/frame-differ.js +287 -0
  49. package/dist/src/perception/index.js +22 -0
  50. package/dist/src/perception/manager.js +199 -0
  51. package/dist/src/perception/types.js +47 -0
  52. package/dist/src/perception/vision-source.js +399 -0
  53. package/dist/src/planner/deterministic.js +298 -0
  54. package/dist/src/planner/executor.js +870 -0
  55. package/dist/src/planner/goal-store.js +92 -0
  56. package/dist/src/planner/index.js +21 -0
  57. package/dist/src/planner/planner.js +520 -0
  58. package/dist/src/planner/tool-registry.js +71 -0
  59. package/dist/src/planner/types.js +22 -0
  60. package/dist/src/platform/explorer.js +213 -0
  61. package/dist/src/platform/help-center-markdown.js +527 -0
  62. package/dist/src/platform/learner.js +257 -0
  63. package/dist/src/playbook/engine.js +296 -11
  64. package/dist/src/playbook/mcp-recorder.js +204 -0
  65. package/dist/src/playbook/recorder.js +3 -2
  66. package/dist/src/playbook/runner.js +1 -1
  67. package/dist/src/playbook/store.js +139 -10
  68. package/dist/src/recovery/detectors.js +156 -0
  69. package/dist/src/recovery/engine.js +327 -0
  70. package/dist/src/recovery/index.js +20 -0
  71. package/dist/src/recovery/strategies.js +274 -0
  72. package/dist/src/recovery/types.js +20 -0
  73. package/dist/src/runtime/accessibility-adapter.js +55 -18
  74. package/dist/src/runtime/applescript-adapter.js +8 -2
  75. package/dist/src/runtime/cdp-chrome-adapter.js +1 -1
  76. package/dist/src/runtime/executor.js +23 -3
  77. package/dist/src/runtime/locator-cache.js +24 -2
  78. package/dist/src/runtime/service.js +59 -15
  79. package/dist/src/runtime/session-manager.js +4 -1
  80. package/dist/src/runtime/vision-adapter.js +2 -1
  81. package/dist/src/state/app-map-types.js +72 -0
  82. package/dist/src/state/app-map.js +1974 -0
  83. package/dist/src/state/entity-tracker.js +108 -0
  84. package/dist/src/state/fusion.js +96 -0
  85. package/dist/src/state/index.js +21 -0
  86. package/dist/src/state/ladder-generator.js +236 -0
  87. package/dist/src/state/persistence.js +156 -0
  88. package/dist/src/state/types.js +17 -0
  89. package/dist/src/state/world-model.js +1456 -0
  90. package/dist/src/util/atomic-write.js +19 -4
  91. package/dist/src/util/sanitize.js +146 -0
  92. package/dist-app-maps/com.figma.Desktop.json +959 -0
  93. package/dist-app-maps/com.hnc.Discord.json +1146 -0
  94. package/dist-app-maps/notion.id.json +2831 -0
  95. package/dist-playbooks/canva-screenhand-carousel.json +445 -0
  96. package/dist-playbooks/codex-desktop.json +76 -0
  97. package/dist-playbooks/competitor-research-stack.json +122 -0
  98. package/dist-playbooks/davinci-color-grade.json +153 -0
  99. package/dist-playbooks/davinci-edit-timeline.json +162 -0
  100. package/dist-playbooks/davinci-render.json +114 -0
  101. package/dist-playbooks/devto.json +52 -0
  102. package/dist-playbooks/discord.json +41 -0
  103. package/dist-playbooks/google-flow-create-project.json +59 -0
  104. package/dist-playbooks/google-flow-edit-image.json +90 -0
  105. package/dist-playbooks/google-flow-edit-video.json +90 -0
  106. package/dist-playbooks/google-flow-generate-image.json +68 -0
  107. package/dist-playbooks/google-flow-generate-video.json +191 -0
  108. package/dist-playbooks/google-flow-open-project.json +48 -0
  109. package/dist-playbooks/google-flow-open-scenebuilder.json +64 -0
  110. package/dist-playbooks/google-flow-search-assets.json +64 -0
  111. package/dist-playbooks/instagram.json +57 -0
  112. package/dist-playbooks/linkedin.json +52 -0
  113. package/dist-playbooks/n8n.json +43 -0
  114. package/dist-playbooks/reddit.json +52 -0
  115. package/dist-playbooks/threads.json +59 -0
  116. package/dist-playbooks/x-twitter.json +59 -0
  117. package/dist-playbooks/youtube.json +59 -0
  118. package/dist-references/canva.json +646 -0
  119. package/dist-references/codex-desktop.json +305 -0
  120. package/dist-references/davinci-resolve-keyboard.json +594 -0
  121. package/dist-references/davinci-resolve-menu-map.json +1139 -0
  122. package/dist-references/davinci-resolve-menus-batch1.json +116 -0
  123. package/dist-references/davinci-resolve-menus-batch2.json +372 -0
  124. package/dist-references/davinci-resolve-menus-batch3.json +330 -0
  125. package/dist-references/davinci-resolve-menus-batch4.json +297 -0
  126. package/dist-references/davinci-resolve-shortcuts.json +333 -0
  127. package/dist-references/devpost.json +186 -0
  128. package/dist-references/devto.json +317 -0
  129. package/dist-references/discord.json +549 -0
  130. package/dist-references/figma.json +1186 -0
  131. package/dist-references/finder.json +146 -0
  132. package/dist-references/google-ads-transparency.json +95 -0
  133. package/dist-references/google-flow.json +649 -0
  134. package/dist-references/instagram.json +341 -0
  135. package/dist-references/linkedin.json +324 -0
  136. package/dist-references/meta-ad-library.json +86 -0
  137. package/dist-references/n8n.json +387 -0
  138. package/dist-references/notes.json +27 -0
  139. package/dist-references/notion.json +163 -0
  140. package/dist-references/reddit.json +341 -0
  141. package/dist-references/threads.json +337 -0
  142. package/dist-references/x-twitter.json +403 -0
  143. package/dist-references/youtube.json +373 -0
  144. package/native/macos-bridge/Package.swift +22 -0
  145. package/native/macos-bridge/Sources/AccessibilityBridge.swift +482 -0
  146. package/native/macos-bridge/Sources/AppManagement.swift +339 -0
  147. package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +537 -0
  148. package/native/macos-bridge/Sources/ObserverBridge.swift +120 -0
  149. package/native/macos-bridge/Sources/StreamCapture.swift +136 -0
  150. package/native/macos-bridge/Sources/VisionBridge.swift +238 -0
  151. package/native/macos-bridge/Sources/main.swift +498 -0
  152. package/native/windows-bridge/AppManagement.cs +234 -0
  153. package/native/windows-bridge/InputBridge.cs +436 -0
  154. package/native/windows-bridge/Program.cs +270 -0
  155. package/native/windows-bridge/ScreenCapture.cs +453 -0
  156. package/native/windows-bridge/UIAutomationBridge.cs +571 -0
  157. package/native/windows-bridge/WindowsBridge.csproj +17 -0
  158. package/package.json +12 -1
  159. package/scripts/postinstall.cjs +127 -0
  160. package/dist/.audit-log.jsonl +0 -55
  161. package/dist/.screenhand/memory/.lock +0 -1
  162. package/dist/.screenhand/memory/actions.jsonl +0 -85
  163. package/dist/.screenhand/memory/errors.jsonl +0 -5
  164. package/dist/.screenhand/memory/errors.jsonl.bak +0 -4
  165. package/dist/.screenhand/memory/state.json +0 -35
  166. package/dist/.screenhand/memory/state.json.bak +0 -35
  167. package/dist/.screenhand/memory/strategies.jsonl +0 -12
  168. package/dist/agent/cli.js +0 -73
  169. package/dist/agent/loop.js +0 -258
  170. package/dist/config.js +0 -9
  171. package/dist/index.js +0 -56
  172. package/dist/logging/timeline-logger.js +0 -29
  173. package/dist/mcp/mcp-stdio-server.js +0 -448
  174. package/dist/mcp/server.js +0 -347
  175. package/dist/mcp-entry.js +0 -59
  176. package/dist/memory/recall.js +0 -160
  177. package/dist/memory/research.js +0 -98
  178. package/dist/memory/seeds.js +0 -89
  179. package/dist/memory/session.js +0 -161
  180. package/dist/memory/store.js +0 -391
  181. package/dist/memory/types.js +0 -4
  182. package/dist/monitor/codex-monitor.js +0 -377
  183. package/dist/monitor/task-queue.js +0 -84
  184. package/dist/monitor/types.js +0 -49
  185. package/dist/native/bridge-client.js +0 -174
  186. package/dist/native/macos-bridge-client.js +0 -5
  187. package/dist/npm-publish-helper.js +0 -117
  188. package/dist/npm-token-cdp.js +0 -113
  189. package/dist/npm-token-create.js +0 -135
  190. package/dist/npm-token-finish.js +0 -126
  191. package/dist/playbook/engine.js +0 -193
  192. package/dist/playbook/index.js +0 -4
  193. package/dist/playbook/recorder.js +0 -519
  194. package/dist/playbook/runner.js +0 -392
  195. package/dist/playbook/store.js +0 -166
  196. package/dist/playbook/types.js +0 -4
  197. package/dist/runtime/accessibility-adapter.js +0 -377
  198. package/dist/runtime/app-adapter.js +0 -48
  199. package/dist/runtime/applescript-adapter.js +0 -283
  200. package/dist/runtime/ax-role-map.js +0 -80
  201. package/dist/runtime/browser-adapter.js +0 -36
  202. package/dist/runtime/cdp-chrome-adapter.js +0 -505
  203. package/dist/runtime/composite-adapter.js +0 -205
  204. package/dist/runtime/executor.js +0 -250
  205. package/dist/runtime/locator-cache.js +0 -12
  206. package/dist/runtime/planning-loop.js +0 -47
  207. package/dist/runtime/service.js +0 -372
  208. package/dist/runtime/session-manager.js +0 -28
  209. package/dist/runtime/state-observer.js +0 -105
  210. package/dist/runtime/vision-adapter.js +0 -208
  211. package/dist/test-mcp-protocol.js +0 -138
  212. package/dist/types.js +0 -1
@@ -0,0 +1,136 @@
1
+ import Foundation
2
+ import ScreenCaptureKit
3
+ import CoreMedia
4
+ import AppKit
5
+
6
/// Continuous screen capture of a single window using `SCStream`.
///
/// Every saved frame is encoded as PNG and atomically swapped into one fixed
/// file in the temporary directory, so a reader can pick up the most recent
/// frame from disk instead of paying for a one-shot `CGWindowListCreateImage`
/// capture. Use `getLatestInfo()` to obtain the file path and frame metadata.
///
/// All mutable state is guarded by the private serial `queue`.
class StreamCapture: NSObject, SCStreamOutput {
    private var stream: SCStream?
    private var _running = false
    /// Serial queue guarding every mutable property of this class.
    private let queue = DispatchQueue(label: "streamcapture.state")

    /// Rendering context reused for every frame; building a `CIContext` is
    /// expensive and the object is safe to share between threads.
    private let ciContext = CIContext()

    /// Path to the latest captured frame (PNG file)
    private var _latestFramePath: String?
    private var _latestWidth: Int = 0
    private var _latestHeight: Int = 0
    private var _latestFrameTime: Date?
    /// Number of `.screen` sample buffers received since the stream started.
    private var _frameCount: UInt64 = 0
    /// Only every N-th received frame is encoded and written to disk.
    private var saveEveryN: Int = 1

    /// Start continuous capture for a specific window.
    ///
    /// Does nothing when a capture is already running.
    ///
    /// - Parameters:
    ///   - windowId: The `CGWindowID` of the window to capture.
    ///   - fps: Requested frame rate. Values below 1 are treated as 1 so the
    ///     frame interval handed to ScreenCaptureKit is always a valid time.
    /// - Throws: `BridgeError.general` when the window is not among the
    ///   on-screen shareable windows, or any error raised by ScreenCaptureKit
    ///   while configuring or starting the stream.
    func start(windowId: Int, fps: Int = 30) async throws {
        let alreadyRunning = queue.sync { () -> Bool in self._running }
        if alreadyRunning { return }

        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: true)
        guard let window = content.windows.first(where: { $0.windowID == CGWindowID(windowId) }) else {
            throw BridgeError.general("Window \(windowId) not found for stream capture")
        }

        // A zero or negative rate would produce an invalid CMTime timescale.
        let effectiveFps = max(1, fps)
        // Above 30 fps only every N-th frame is written, capping disk writes at ~30/s.
        let saveInterval = max(1, effectiveFps / 30)

        let filter = SCContentFilter(desktopIndependentWindow: window)
        let config = SCStreamConfiguration()
        // Capture at twice the window's point size; fall back to 2880x1800
        // when the window reports an empty frame.
        config.width = window.frame.width > 0 ? Int(window.frame.width) * 2 : 2880
        config.height = window.frame.height > 0 ? Int(window.frame.height) * 2 : 1800
        config.showsCursor = false
        config.capturesAudio = false
        config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(clamping: effectiveFps))
        config.queueDepth = 3

        let newStream = SCStream(filter: filter, configuration: config, delegate: nil)
        try newStream.addStreamOutput(self, type: .screen, sampleHandlerQueue: .global(qos: .userInitiated))
        try await newStream.startCapture()

        // Frames delivered before this point are ignored by the sample handler
        // because `_running` is still false, so it is safe to commit the
        // save interval together with the rest of the state here.
        queue.sync {
            self.stream = newStream
            self.saveEveryN = saveInterval
            self._frameCount = 0
            self._running = true
        }
    }

    /// Stop the stream and delete the latest frame file, if any.
    ///
    /// Errors from stopping the stream or removing the file are ignored.
    func stop() async {
        var s: SCStream?
        var pathToClean: String?

        queue.sync {
            s = self.stream
            self._running = false
            self.stream = nil
            pathToClean = self._latestFramePath
            self._latestFramePath = nil
        }

        if let s = s {
            try? await s.stopCapture()
        }

        if let path = pathToClean {
            try? FileManager.default.removeItem(atPath: path)
        }
    }

    /// Whether a capture is currently running.
    var isRunning: Bool {
        queue.sync { _running }
    }

    /// Get info about the latest frame.
    ///
    /// - Returns: A dictionary with `path`, `width`, `height`, `ageMs`
    ///   (milliseconds since the frame was saved) and `frameCount`, or `nil`
    ///   when no frame has been saved yet.
    func getLatestInfo() -> [String: Any]? {
        queue.sync { () -> [String: Any]? in
            guard let path = _latestFramePath, let time = _latestFrameTime else { return nil }
            return [
                "path": path,
                "width": _latestWidth,
                "height": _latestHeight,
                "ageMs": Int(Date().timeIntervalSince(time) * 1000),
                "frameCount": _frameCount,
            ]
        }
    }

    // MARK: - SCStreamOutput

    /// Receives sample buffers from the stream, encodes every `saveEveryN`-th
    /// screen frame as PNG and publishes it as the latest frame.
    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer, of type: SCStreamOutputType) {
        guard type == .screen else { return }

        let shouldSave = queue.sync { () -> Bool in
            guard self._running else { return false }
            self._frameCount += 1
            return self._frameCount % UInt64(self.saveEveryN) == 0
        }
        guard shouldSave else { return }

        guard let imageBuffer = sampleBuffer.imageBuffer else { return }
        let width = CVPixelBufferGetWidth(imageBuffer)
        let height = CVPixelBufferGetHeight(imageBuffer)
        let ciImage = CIImage(cvImageBuffer: imageBuffer)

        guard let cgImage = ciContext.createCGImage(ciImage, from: CGRect(x: 0, y: 0, width: width, height: height)) else { return }

        let bitmapRep = NSBitmapImageRep(cgImage: cgImage)
        guard let pngData = bitmapRep.representation(using: .png, properties: [:]) else { return }

        let tempDir = FileManager.default.temporaryDirectory
        let fileURL = tempDir.appendingPathComponent("stream_frame_latest.png")
        let tmpURL = tempDir.appendingPathComponent("stream_frame_tmp_\(ProcessInfo.processInfo.processIdentifier).png")

        do {
            // Write to a scratch file first, then swap it into place so a
            // reader never sees a partially written PNG.
            try pngData.write(to: tmpURL)
            _ = try FileManager.default.replaceItemAt(fileURL, withItemAt: tmpURL)
        } catch {
            // Skip frame on write failure, but do not leave the scratch file behind.
            try? FileManager.default.removeItem(at: tmpURL)
            return
        }

        queue.sync {
            guard self._running else {
                // stop() ran while this frame was being encoded: do not
                // republish a frame after stop, and remove the file that
                // stop() could not have cleaned up.
                try? FileManager.default.removeItem(at: fileURL)
                return
            }
            self._latestFramePath = fileURL.path
            self._latestWidth = width
            self._latestHeight = height
            self._latestFrameTime = Date()
        }
    }
}
@@ -0,0 +1,238 @@
1
+ import Foundation
2
+ import Vision
3
+ import AppKit
4
+ import CoreML
5
+
6
/// Image analysis built on Apple's Vision framework: UI element detection
/// with a YOLO Core ML model, and text recognition (OCR) on image files or
/// on a region of a live window.
///
/// Every bounding box returned by this class is expressed in pixels of the
/// analysed image with the origin at the top-left corner.
class VisionBridge {

    // MARK: - YOLO Element Detection

    private var yoloModel: VNCoreMLModel?
    // NOTE(review): not referenced anywhere in this class; detection labels
    // come from the model itself. Presumably documents the model's 8 classes.
    private let yoloClassNames = ["button", "field", "heading", "iframe", "image", "label", "link", "text"]

    /// Load the YOLO CoreML model for UI element detection.
    /// Called lazily on first detectElements call.
    ///
    /// The first candidate path that exists on disk is used. Core ML can only
    /// load compiled models (`.mlmodelc`), so an `.mlpackage` is compiled with
    /// `MLModel.compileModel(at:)` before loading.
    ///
    /// - Throws: `BridgeError.general` when no candidate exists, or any error
    ///   raised by Core ML / Vision while compiling or loading the model.
    private func ensureYoloModel() throws {
        if yoloModel != nil { return }

        // Look for model relative to the binary or via known paths
        let execPath = URL(fileURLWithPath: CommandLine.arguments[0]).resolvingSymlinksInPath()
        let execDir = execPath.deletingLastPathComponent()
        let possiblePaths = [
            execDir.appendingPathComponent("../Resources/ui-elements.mlpackage"),
            execDir.appendingPathComponent("../../Resources/ui-elements.mlpackage"),
            execDir.appendingPathComponent("Resources/ui-elements.mlpackage"),
            // NOTE(review): /tmp is world-writable; any local user can plant a
            // model here that this process will load. Consider removing this
            // fallback from release builds.
            URL(fileURLWithPath: "/tmp/vins-yolo/web-ui-8cls.mlpackage"), // Development fallback
        ]

        guard let modelURL = possiblePaths.first(where: { FileManager.default.fileExists(atPath: $0.path) }) else {
            throw BridgeError.general("YOLO model not found")
        }

        // `MLModel(contentsOf:)` requires a compiled model; compile source
        // packages first. The compiled copy is written to a temporary
        // location chosen by Core ML.
        let compiledURL: URL
        if modelURL.pathExtension == "mlmodelc" {
            compiledURL = modelURL
        } else {
            compiledURL = try MLModel.compileModel(at: modelURL)
        }

        let config = MLModelConfiguration()
        config.computeUnits = .all // Use ANE when available
        let coreMLModel = try MLModel(contentsOf: compiledURL, configuration: config)
        yoloModel = try VNCoreMLModel(for: coreMLModel)
    }

    /// Detect UI elements in an image using the YOLO CoreML model.
    ///
    /// - Parameters:
    ///   - imagePath: Path of the image file to analyse.
    ///   - confidence: Minimum confidence an observation needs to be returned.
    /// - Returns: One dictionary per detection with `class` (the top label's
    ///   identifier, or `"unknown"`), `confidence` and `bounds`
    ///   (`x`, `y`, `width`, `height` in image pixels, top-left origin).
    /// - Throws: `BridgeError.general` when the model or the image cannot be
    ///   loaded, or any error raised by Vision while running the request.
    func detectElements(imagePath: String, confidence: Double = 0.25) throws -> [[String: Any]] {
        try ensureYoloModel()
        guard let model = yoloModel else {
            throw BridgeError.general("YOLO model not loaded")
        }

        let cgImage = try loadCGImage(atPath: imagePath)
        let imageWidth = CGFloat(cgImage.width)
        let imageHeight = CGFloat(cgImage.height)

        let request = VNCoreMLRequest(model: model)
        request.imageCropAndScaleOption = .scaleFit

        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        try handler.perform([request])

        guard let observations = request.results as? [VNRecognizedObjectObservation] else {
            return []
        }

        var results: [[String: Any]] = []
        for obs in observations where obs.confidence >= Float(confidence) {
            let className = obs.labels.first?.identifier ?? "unknown"
            results.append([
                "class": className,
                "confidence": Double(obs.confidence),
                "bounds": pixelBounds(of: obs.boundingBox, imageWidth: imageWidth, imageHeight: imageHeight),
            ] as [String: Any])
        }

        return results
    }

    // MARK: - OCR

    /// Perform OCR on an image, optionally searching for specific text.
    ///
    /// - Parameters:
    ///   - imagePath: Path of the image file to analyse.
    ///   - searchText: Case-insensitive substring to look for. `nil` or an
    ///     empty string returns every recognized region.
    ///   - mode: `"fast"` or `"accurate"` (any other value behaves as accurate).
    /// - Returns: Recognized regions (`text`, `confidence`, `bounds`).
    func findText(imagePath: String, searchText: String?, mode: String = "accurate") throws -> [[String: Any]] {
        let results = try performOCR(imagePath: imagePath, mode: mode)

        // An empty needle means "no filter"; do not rely on how
        // `contains("")` happens to behave.
        guard let search = searchText?.lowercased(), !search.isEmpty else {
            return results
        }

        return results.filter { result in
            guard let text = result["text"] as? String else { return false }
            return text.lowercased().contains(search)
        }
    }

    /// Full OCR of an image — returns all recognized text.
    /// mode: "fast" (perception loop, 10x faster) or "accurate" (tool actions, default)
    ///
    /// - Returns: `text` (all regions joined by newlines) and `regions`.
    func ocr(imagePath: String, mode: String = "accurate") throws -> [String: Any] {
        let results = try performOCR(imagePath: imagePath, mode: mode)
        let fullText = results.compactMap { $0["text"] as? String }.joined(separator: "\n")
        return [
            "text": fullText,
            "regions": results,
        ]
    }

    /// OCR a specific region of a window — captures the window, crops to the ROI,
    /// and runs text recognition on just that region.
    ///
    /// - Parameters:
    ///   - windowId: The `CGWindowID` of the window to capture.
    ///   - region: ROI with keys `x`, `y`, `width`, `height` (missing keys
    ///     default to 0), in pixels of the captured window image.
    ///     NOTE(review): the window is captured with `.bestResolution`, so on
    ///     a Retina display these are backing pixels, not points — confirm
    ///     callers pass pixel values.
    ///   - mode: `"fast"` or `"accurate"`.
    /// - Returns: `text` and `regions`, with bounds translated back into the
    ///   coordinate space of the full window image. Empty when the ROI does
    ///   not overlap the image.
    /// - Throws: `BridgeError.general` when the window cannot be captured.
    func ocrRegion(windowId: Int, region: [String: Double], mode: String = "accurate") throws -> [String: Any] {
        let roiX = region["x"] ?? 0
        let roiY = region["y"] ?? 0
        let roiW = region["width"] ?? 0
        let roiH = region["height"] ?? 0

        // Capture the full window
        guard let fullImage = CGWindowListCreateImage(
            .null, .optionIncludingWindow, CGWindowID(windowId), [.bestResolution, .boundsIgnoreFraming]
        ) else {
            throw BridgeError.general("CGWindowListCreateImage returned nil for window \(windowId)")
        }

        // Clip the ROI to the image (CGImage coordinates have origin top-left, same as our ROI)
        let clipped = CGRect(x: roiX, y: roiY, width: roiW, height: roiH)
            .intersection(CGRect(x: 0, y: 0, width: fullImage.width, height: fullImage.height))

        guard !clipped.isEmpty else {
            return ["text": "", "regions": [] as [Any]]
        }

        // `cropping(to:)` snaps the rectangle to integral bounds; do it here
        // so the origin used for translation below is the one actually cropped.
        let cropRect = clipped.integral
        guard let cropped = fullImage.cropping(to: cropRect) else {
            return ["text": "", "regions": [] as [Any]]
        }

        // Run OCR on cropped image
        let results = try performOCROnImage(cropped, mode: mode)

        // Translate bounds from cropped-image coordinates back to window
        // coordinates. Use the real crop origin, not the requested one: they
        // differ when the ROI started outside the image or was fractional.
        let offsetX = Double(cropRect.origin.x)
        let offsetY = Double(cropRect.origin.y)
        let adjustedResults: [[String: Any]] = results.map { entry in
            var adjusted = entry
            if var bounds = entry["bounds"] as? [String: Double] {
                bounds["x"] = (bounds["x"] ?? 0) + offsetX
                bounds["y"] = (bounds["y"] ?? 0) + offsetY
                adjusted["bounds"] = bounds
            }
            return adjusted
        }

        let fullText = adjustedResults.compactMap { $0["text"] as? String }.joined(separator: "\n")
        return [
            "text": fullText,
            "regions": adjustedResults,
        ]
    }

    /// Load the image at `imagePath` and run text recognition on it.
    private func performOCR(imagePath: String, mode: String = "accurate") throws -> [[String: Any]] {
        let cgImage = try loadCGImage(atPath: imagePath)
        return try performOCROnImage(cgImage, mode: mode)
    }

    /// Run a `VNRecognizeTextRequest` on `cgImage`.
    ///
    /// - Returns: One dictionary per observation that has at least one
    ///   candidate: `text`, `confidence` and `bounds` (image pixels,
    ///   top-left origin).
    private func performOCROnImage(_ cgImage: CGImage, mode: String = "accurate") throws -> [[String: Any]] {
        let imageWidth = CGFloat(cgImage.width)
        let imageHeight = CGFloat(cgImage.height)

        let request = VNRecognizeTextRequest()

        if mode == "fast" {
            // FAST mode: 10x faster (~60ms vs ~631ms), used for perception loop
            // Trades ~20% text coverage for speed — acceptable for frame diffing
            request.recognitionLevel = .fast
            request.usesLanguageCorrection = false
            request.recognitionLanguages = ["en-US"]
        } else {
            // ACCURATE mode: full precision, used for tool actions (ocr, click_text)
            request.recognitionLevel = .accurate
            request.usesLanguageCorrection = true
            if #available(macOS 13.0, *) {
                request.automaticallyDetectsLanguage = true
            }
            // Prefer whatever this OS version supports; fall back to a fixed
            // list when the query fails.
            if let supported = try? request.supportedRecognitionLanguages() {
                request.recognitionLanguages = supported
            } else {
                request.recognitionLanguages = ["en-US", "zh-Hans", "zh-Hant", "ja", "ko", "hi", "ar", "de", "fr", "es", "pt", "it", "ru"]
            }
        }

        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        try handler.perform([request])

        guard let observations = request.results else {
            return []
        }

        var results: [[String: Any]] = []

        for observation in observations {
            guard let candidate = observation.topCandidates(1).first else { continue }

            results.append([
                "text": candidate.string,
                "confidence": Double(candidate.confidence),
                "bounds": pixelBounds(of: observation.boundingBox, imageWidth: imageWidth, imageHeight: imageHeight),
            ] as [String: Any])
        }

        return results
    }

    // MARK: - Helpers

    /// Decode the image file at `imagePath` into a `CGImage`.
    ///
    /// - Throws: `BridgeError.general` when the file cannot be read or decoded.
    private func loadCGImage(atPath imagePath: String) throws -> CGImage {
        let url = URL(fileURLWithPath: imagePath)
        guard let image = NSImage(contentsOf: url),
              let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            throw BridgeError.general("Failed to load image at \(imagePath)")
        }
        return cgImage
    }

    /// Convert a Vision bounding box (normalized 0...1, origin bottom-left)
    /// into pixel bounds of the analysed image with the origin at the top-left.
    private func pixelBounds(of box: CGRect, imageWidth: CGFloat, imageHeight: CGFloat) -> [String: Any] {
        let x = box.origin.x * imageWidth
        // Flip the vertical axis: Vision measures y upward from the bottom edge.
        let y = (1 - box.origin.y - box.height) * imageHeight
        let width = box.width * imageWidth
        let height = box.height * imageHeight
        return [
            "x": Double(x),
            "y": Double(y),
            "width": Double(width),
            "height": Double(height),
        ] as [String: Any]
    }
}