screenhand 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -446
- package/bin/darwin-arm64/macos-bridge +0 -0
- package/dist/mcp-desktop.js +3615 -400
- package/dist/scripts/export-help-center.js +112 -0
- package/dist/scripts/marketing-loop.js +117 -0
- package/dist/scripts/observer-daemon.js +288 -0
- package/dist/scripts/orchestrator-daemon.js +399 -0
- package/dist/scripts/threads-campaign.js +208 -0
- package/dist/src/community/fetcher.js +109 -0
- package/dist/src/community/index.js +6 -0
- package/dist/src/community/publisher.js +191 -0
- package/dist/src/community/remote-api.js +121 -0
- package/dist/src/community/types.js +3 -0
- package/dist/src/community/validator.js +95 -0
- package/dist/src/context-tracker.js +489 -0
- package/dist/src/ingestion/coverage-auditor.js +233 -0
- package/dist/src/ingestion/doc-parser.js +164 -0
- package/dist/src/ingestion/index.js +8 -0
- package/dist/src/ingestion/menu-scanner.js +152 -0
- package/dist/src/ingestion/reference-merger.js +186 -0
- package/dist/src/ingestion/shortcut-extractor.js +180 -0
- package/dist/src/ingestion/tutorial-extractor.js +170 -0
- package/dist/src/ingestion/types.js +3 -0
- package/dist/src/jobs/manager.js +82 -14
- package/dist/src/jobs/runner.js +138 -15
- package/dist/src/learning/engine.js +356 -0
- package/dist/src/learning/index.js +9 -0
- package/dist/src/learning/locator-policy.js +120 -0
- package/dist/src/learning/pattern-policy.js +89 -0
- package/dist/src/learning/recovery-policy.js +116 -0
- package/dist/src/learning/sensor-policy.js +115 -0
- package/dist/src/learning/timing-model.js +204 -0
- package/dist/src/learning/topology-policy.js +90 -0
- package/dist/src/learning/types.js +9 -0
- package/dist/src/logging/timeline-logger.js +4 -1
- package/dist/src/memory/playbook-seeds.js +200 -0
- package/dist/src/memory/recall.js +60 -8
- package/dist/src/memory/service.js +30 -5
- package/dist/src/memory/store.js +34 -5
- package/dist/src/native/bridge-client.js +253 -31
- package/dist/src/observer/state.js +199 -0
- package/dist/src/observer/types.js +43 -0
- package/dist/src/orchestrator/state.js +68 -0
- package/dist/src/orchestrator/types.js +22 -0
- package/dist/src/perception/ax-source.js +162 -0
- package/dist/src/perception/cdp-source.js +162 -0
- package/dist/src/perception/coordinator.js +771 -0
- package/dist/src/perception/frame-differ.js +287 -0
- package/dist/src/perception/index.js +22 -0
- package/dist/src/perception/manager.js +199 -0
- package/dist/src/perception/types.js +47 -0
- package/dist/src/perception/vision-source.js +399 -0
- package/dist/src/planner/deterministic.js +298 -0
- package/dist/src/planner/executor.js +870 -0
- package/dist/src/planner/goal-store.js +92 -0
- package/dist/src/planner/index.js +21 -0
- package/dist/src/planner/planner.js +520 -0
- package/dist/src/planner/tool-registry.js +71 -0
- package/dist/src/planner/types.js +22 -0
- package/dist/src/platform/explorer.js +213 -0
- package/dist/src/platform/help-center-markdown.js +527 -0
- package/dist/src/platform/learner.js +257 -0
- package/dist/src/playbook/engine.js +296 -11
- package/dist/src/playbook/mcp-recorder.js +204 -0
- package/dist/src/playbook/recorder.js +3 -2
- package/dist/src/playbook/runner.js +1 -1
- package/dist/src/playbook/store.js +139 -10
- package/dist/src/recovery/detectors.js +156 -0
- package/dist/src/recovery/engine.js +327 -0
- package/dist/src/recovery/index.js +20 -0
- package/dist/src/recovery/strategies.js +274 -0
- package/dist/src/recovery/types.js +20 -0
- package/dist/src/runtime/accessibility-adapter.js +55 -18
- package/dist/src/runtime/applescript-adapter.js +8 -2
- package/dist/src/runtime/cdp-chrome-adapter.js +1 -1
- package/dist/src/runtime/executor.js +23 -3
- package/dist/src/runtime/locator-cache.js +24 -2
- package/dist/src/runtime/service.js +59 -15
- package/dist/src/runtime/session-manager.js +4 -1
- package/dist/src/runtime/vision-adapter.js +2 -1
- package/dist/src/state/app-map-types.js +72 -0
- package/dist/src/state/app-map.js +1974 -0
- package/dist/src/state/entity-tracker.js +108 -0
- package/dist/src/state/fusion.js +96 -0
- package/dist/src/state/index.js +21 -0
- package/dist/src/state/ladder-generator.js +236 -0
- package/dist/src/state/persistence.js +156 -0
- package/dist/src/state/types.js +17 -0
- package/dist/src/state/world-model.js +1456 -0
- package/dist/src/util/atomic-write.js +19 -4
- package/dist/src/util/sanitize.js +146 -0
- package/dist-app-maps/com.figma.Desktop.json +959 -0
- package/dist-app-maps/com.hnc.Discord.json +1146 -0
- package/dist-app-maps/notion.id.json +2831 -0
- package/dist-playbooks/canva-screenhand-carousel.json +445 -0
- package/dist-playbooks/codex-desktop.json +76 -0
- package/dist-playbooks/competitor-research-stack.json +122 -0
- package/dist-playbooks/davinci-color-grade.json +153 -0
- package/dist-playbooks/davinci-edit-timeline.json +162 -0
- package/dist-playbooks/davinci-render.json +114 -0
- package/dist-playbooks/devto.json +52 -0
- package/dist-playbooks/discord.json +41 -0
- package/dist-playbooks/google-flow-create-project.json +59 -0
- package/dist-playbooks/google-flow-edit-image.json +90 -0
- package/dist-playbooks/google-flow-edit-video.json +90 -0
- package/dist-playbooks/google-flow-generate-image.json +68 -0
- package/dist-playbooks/google-flow-generate-video.json +191 -0
- package/dist-playbooks/google-flow-open-project.json +48 -0
- package/dist-playbooks/google-flow-open-scenebuilder.json +64 -0
- package/dist-playbooks/google-flow-search-assets.json +64 -0
- package/dist-playbooks/instagram.json +57 -0
- package/dist-playbooks/linkedin.json +52 -0
- package/dist-playbooks/n8n.json +43 -0
- package/dist-playbooks/reddit.json +52 -0
- package/dist-playbooks/threads.json +59 -0
- package/dist-playbooks/x-twitter.json +59 -0
- package/dist-playbooks/youtube.json +59 -0
- package/dist-references/canva.json +646 -0
- package/dist-references/codex-desktop.json +305 -0
- package/dist-references/davinci-resolve-keyboard.json +594 -0
- package/dist-references/davinci-resolve-menu-map.json +1139 -0
- package/dist-references/davinci-resolve-menus-batch1.json +116 -0
- package/dist-references/davinci-resolve-menus-batch2.json +372 -0
- package/dist-references/davinci-resolve-menus-batch3.json +330 -0
- package/dist-references/davinci-resolve-menus-batch4.json +297 -0
- package/dist-references/davinci-resolve-shortcuts.json +333 -0
- package/dist-references/devpost.json +186 -0
- package/dist-references/devto.json +317 -0
- package/dist-references/discord.json +549 -0
- package/dist-references/figma.json +1186 -0
- package/dist-references/finder.json +146 -0
- package/dist-references/google-ads-transparency.json +95 -0
- package/dist-references/google-flow.json +649 -0
- package/dist-references/instagram.json +341 -0
- package/dist-references/linkedin.json +324 -0
- package/dist-references/meta-ad-library.json +86 -0
- package/dist-references/n8n.json +387 -0
- package/dist-references/notes.json +27 -0
- package/dist-references/notion.json +163 -0
- package/dist-references/reddit.json +341 -0
- package/dist-references/threads.json +337 -0
- package/dist-references/x-twitter.json +403 -0
- package/dist-references/youtube.json +373 -0
- package/native/macos-bridge/Package.swift +22 -0
- package/native/macos-bridge/Sources/AccessibilityBridge.swift +482 -0
- package/native/macos-bridge/Sources/AppManagement.swift +339 -0
- package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +537 -0
- package/native/macos-bridge/Sources/ObserverBridge.swift +120 -0
- package/native/macos-bridge/Sources/StreamCapture.swift +136 -0
- package/native/macos-bridge/Sources/VisionBridge.swift +238 -0
- package/native/macos-bridge/Sources/main.swift +498 -0
- package/native/windows-bridge/AppManagement.cs +234 -0
- package/native/windows-bridge/InputBridge.cs +436 -0
- package/native/windows-bridge/Program.cs +270 -0
- package/native/windows-bridge/ScreenCapture.cs +453 -0
- package/native/windows-bridge/UIAutomationBridge.cs +571 -0
- package/native/windows-bridge/WindowsBridge.csproj +17 -0
- package/package.json +12 -1
- package/scripts/postinstall.cjs +127 -0
- package/dist/.audit-log.jsonl +0 -55
- package/dist/.screenhand/memory/.lock +0 -1
- package/dist/.screenhand/memory/actions.jsonl +0 -85
- package/dist/.screenhand/memory/errors.jsonl +0 -5
- package/dist/.screenhand/memory/errors.jsonl.bak +0 -4
- package/dist/.screenhand/memory/state.json +0 -35
- package/dist/.screenhand/memory/state.json.bak +0 -35
- package/dist/.screenhand/memory/strategies.jsonl +0 -12
- package/dist/agent/cli.js +0 -73
- package/dist/agent/loop.js +0 -258
- package/dist/config.js +0 -9
- package/dist/index.js +0 -56
- package/dist/logging/timeline-logger.js +0 -29
- package/dist/mcp/mcp-stdio-server.js +0 -448
- package/dist/mcp/server.js +0 -347
- package/dist/mcp-entry.js +0 -59
- package/dist/memory/recall.js +0 -160
- package/dist/memory/research.js +0 -98
- package/dist/memory/seeds.js +0 -89
- package/dist/memory/session.js +0 -161
- package/dist/memory/store.js +0 -391
- package/dist/memory/types.js +0 -4
- package/dist/monitor/codex-monitor.js +0 -377
- package/dist/monitor/task-queue.js +0 -84
- package/dist/monitor/types.js +0 -49
- package/dist/native/bridge-client.js +0 -174
- package/dist/native/macos-bridge-client.js +0 -5
- package/dist/npm-publish-helper.js +0 -117
- package/dist/npm-token-cdp.js +0 -113
- package/dist/npm-token-create.js +0 -135
- package/dist/npm-token-finish.js +0 -126
- package/dist/playbook/engine.js +0 -193
- package/dist/playbook/index.js +0 -4
- package/dist/playbook/recorder.js +0 -519
- package/dist/playbook/runner.js +0 -392
- package/dist/playbook/store.js +0 -166
- package/dist/playbook/types.js +0 -4
- package/dist/runtime/accessibility-adapter.js +0 -377
- package/dist/runtime/app-adapter.js +0 -48
- package/dist/runtime/applescript-adapter.js +0 -283
- package/dist/runtime/ax-role-map.js +0 -80
- package/dist/runtime/browser-adapter.js +0 -36
- package/dist/runtime/cdp-chrome-adapter.js +0 -505
- package/dist/runtime/composite-adapter.js +0 -205
- package/dist/runtime/executor.js +0 -250
- package/dist/runtime/locator-cache.js +0 -12
- package/dist/runtime/planning-loop.js +0 -47
- package/dist/runtime/service.js +0 -372
- package/dist/runtime/session-manager.js +0 -28
- package/dist/runtime/state-observer.js +0 -105
- package/dist/runtime/vision-adapter.js +0 -208
- package/dist/test-mcp-protocol.js +0 -138
- package/dist/types.js +0 -1
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import ScreenCaptureKit
|
|
3
|
+
import CoreMedia
|
|
4
|
+
import AppKit
|
|
5
|
+
|
|
6
|
+
/// Continuous screen capture using SCStream.
/// Keeps the latest frame as a temp PNG file on disk.
/// Replaces one-shot CGWindowListCreateImage (~200ms) with pre-captured frames (~0ms read).
///
/// Mutable state (`stream`, `_running`, the `_latest*` fields, `_frameCount`,
/// `saveEveryN`) is only touched inside `queue.sync`. Frames are encoded on the
/// serial `frameQueue`, so at most one PNG is being written at a time.
class StreamCapture: NSObject, SCStreamOutput {
    private var stream: SCStream?
    private var _running = false
    private let queue = DispatchQueue(label: "streamcapture.state")

    /// Serial queue for sample-buffer callbacks. Every callback writes to the same
    /// per-process temp file, so callbacks must not overlap.
    private let frameQueue = DispatchQueue(label: "streamcapture.frames", qos: .userInitiated)

    /// Shared rendering context; creating a CIContext per frame is needlessly expensive.
    private let ciContext = CIContext()

    /// Path to the latest captured frame (PNG file)
    private var _latestFramePath: String?
    private var _latestWidth: Int = 0
    private var _latestHeight: Int = 0
    private var _latestFrameTime: Date?
    private var _frameCount: UInt64 = 0
    private var saveEveryN: Int = 1

    /// Start continuous capture for a specific window.
    ///
    /// - Parameters:
    ///   - windowId: CGWindowID of the window to capture.
    ///   - fps: Requested capture rate. Values below 1 are treated as 1. Above 30,
    ///     only every `fps / 30`-th frame is written to disk.
    /// - Throws: `BridgeError.general` when the window is not among the on-screen
    ///   shareable windows, or any error raised by ScreenCaptureKit.
    ///
    /// Returns immediately without doing anything if a capture is already running.
    func start(windowId: Int, fps: Int = 30) async throws {
        let alreadyRunning: Bool = queue.sync { _running }
        if alreadyRunning { return }

        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: true)
        guard let window = content.windows.first(where: { $0.windowID == CGWindowID(windowId) }) else {
            throw BridgeError.general("Window \(windowId) not found for stream capture")
        }

        // A zero or negative fps would yield an invalid CMTime (timescale <= 0).
        let effectiveFps = max(1, fps)
        let everyN = max(1, effectiveFps / 30)
        queue.sync { self.saveEveryN = everyN }

        let filter = SCContentFilter(desktopIndependentWindow: window)
        let config = SCStreamConfiguration()
        // Capture at 2x the window's frame size; fall back to 2880x1800 when the
        // reported frame is degenerate.
        config.width = window.frame.width > 0 ? Int(window.frame.width) * 2 : 2880
        config.height = window.frame.height > 0 ? Int(window.frame.height) * 2 : 1800
        config.showsCursor = false
        config.capturesAudio = false
        // `clamping:` avoids a trap if fps does not fit in CMTimeScale (Int32).
        config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(clamping: effectiveFps))
        config.queueDepth = 3

        let newStream = SCStream(filter: filter, configuration: config, delegate: nil)
        try newStream.addStreamOutput(self, type: .screen, sampleHandlerQueue: frameQueue)
        try await newStream.startCapture()

        // Frames delivered before this point are dropped because `_running` is still false.
        queue.sync {
            self.stream = newStream
            self._running = true
            self._frameCount = 0
        }
    }

    /// Stop the stream and clean up.
    ///
    /// Marks the capture as stopped, stops the SCStream (errors are ignored) and
    /// deletes the last published frame file, if any.
    func stop() async {
        var s: SCStream?
        var pathToClean: String?

        queue.sync {
            s = self.stream
            self._running = false
            self.stream = nil
            pathToClean = self._latestFramePath
            self._latestFramePath = nil
        }

        if let s = s {
            try? await s.stopCapture()
        }

        if let path = pathToClean {
            try? FileManager.default.removeItem(atPath: path)
        }
    }

    /// Whether a capture is currently active.
    var isRunning: Bool {
        queue.sync { _running }
    }

    /// Get info about the latest frame.
    ///
    /// - Returns: `nil` when no frame has been published; otherwise a dictionary
    ///   with `path`, `width`, `height`, `ageMs` (milliseconds since the frame was
    ///   published) and `frameCount`.
    func getLatestInfo() -> [String: Any]? {
        return queue.sync { () -> [String: Any]? in
            guard let path = _latestFramePath, let time = _latestFrameTime else { return nil }
            return [
                "path": path,
                "width": _latestWidth,
                "height": _latestHeight,
                "ageMs": Int(Date().timeIntervalSince(time) * 1000),
                "frameCount": _frameCount,
            ]
        }
    }

    // MARK: - SCStreamOutput

    /// Receives a frame from the stream, encodes it as PNG and atomically swaps it
    /// into `stream_frame_latest.png` in the temporary directory.
    /// Frames are skipped when the capture is not running, when subsampling says so,
    /// or when any encoding/writing step fails.
    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer, of type: SCStreamOutputType) {
        guard type == .screen else { return }

        let shouldSave: Bool = queue.sync {
            guard _running else { return false }
            _frameCount += 1
            return _frameCount % UInt64(saveEveryN) == 0
        }
        guard shouldSave else { return }

        guard let imageBuffer = sampleBuffer.imageBuffer else { return }
        let width = CVPixelBufferGetWidth(imageBuffer)
        let height = CVPixelBufferGetHeight(imageBuffer)
        let ciImage = CIImage(cvImageBuffer: imageBuffer)

        guard let cgImage = ciContext.createCGImage(ciImage, from: CGRect(x: 0, y: 0, width: width, height: height)) else { return }

        let bitmapRep = NSBitmapImageRep(cgImage: cgImage)
        guard let pngData = bitmapRep.representation(using: .png, properties: [:]) else { return }

        let fileManager = FileManager.default
        let tempDir = fileManager.temporaryDirectory
        let fileURL = tempDir.appendingPathComponent("stream_frame_latest.png")
        let tmpURL = tempDir.appendingPathComponent("stream_frame_tmp_\(ProcessInfo.processInfo.processIdentifier).png")

        do {
            // Write to a scratch file first so readers never observe a partial PNG.
            try pngData.write(to: tmpURL)
            _ = try fileManager.replaceItemAt(fileURL, withItemAt: tmpURL)
        } catch {
            // Skip frame on write failure; do not leave the scratch file behind.
            try? fileManager.removeItem(at: tmpURL)
            return
        }

        // stop() may have run while this frame was being encoded. Publishing now
        // would resurrect a path that stop() already cleared and leak the file.
        let published: Bool = queue.sync {
            guard _running else { return false }
            _latestFramePath = fileURL.path
            _latestWidth = width
            _latestHeight = height
            _latestFrameTime = Date()
            return true
        }
        if !published {
            try? fileManager.removeItem(at: fileURL)
        }
    }
}
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import Vision
|
|
3
|
+
import AppKit
|
|
4
|
+
import CoreML
|
|
5
|
+
|
|
6
|
+
/// Vision/CoreML helpers: UI element detection with a YOLO CoreML model and
/// text recognition (OCR) on image files or window regions.
class VisionBridge {

    // MARK: - YOLO Element Detection

    private var yoloModel: VNCoreMLModel?
    // Not referenced elsewhere in this class; labels come from the model's own
    // observation identifiers. NOTE(review): presumably the model's class list — confirm.
    private let yoloClassNames = ["button", "field", "heading", "iframe", "image", "label", "link", "text"]

    /// Load the YOLO CoreML model for UI element detection.
    /// Called lazily on first detectElements call.
    ///
    /// - Throws: `BridgeError.general("YOLO model not found")` when none of the
    ///   candidate paths exists, or any CoreML/Vision error raised while compiling
    ///   or loading the first candidate that does exist.
    private func ensureYoloModel() throws {
        if yoloModel != nil { return }

        // Look for model relative to the binary or via known paths.
        // argv[0] may be relative or a bare name, so prefer the bundle's executable URL.
        let execURL = Bundle.main.executableURL ?? URL(fileURLWithPath: CommandLine.arguments[0])
        let execDir = execURL.resolvingSymlinksInPath().deletingLastPathComponent()
        let possiblePaths = [
            execDir.appendingPathComponent("../Resources/ui-elements.mlpackage"),
            execDir.appendingPathComponent("../../Resources/ui-elements.mlpackage"),
            execDir.appendingPathComponent("Resources/ui-elements.mlpackage"),
            // Development fallback.
            // SECURITY(review): /tmp is world-writable, so any local user can plant a
            // model here. Consider removing or gating this path in release builds.
            URL(fileURLWithPath: "/tmp/vins-yolo/web-ui-8cls.mlpackage"),
        ]

        for modelURL in possiblePaths {
            guard FileManager.default.fileExists(atPath: modelURL.path) else { continue }

            // MLModel(contentsOf:) expects a compiled model (.mlmodelc). A source
            // .mlpackage/.mlmodel has to be compiled first; the compiled copy is
            // written to a temporary location chosen by CoreML.
            let loadableURL: URL
            if modelURL.pathExtension == "mlmodelc" {
                loadableURL = modelURL
            } else {
                loadableURL = try MLModel.compileModel(at: modelURL)
            }

            let config = MLModelConfiguration()
            config.computeUnits = .all // Use ANE when available
            let coreMLModel = try MLModel(contentsOf: loadableURL, configuration: config)
            yoloModel = try VNCoreMLModel(for: coreMLModel)
            return
        }

        throw BridgeError.general("YOLO model not found")
    }

    /// Detect UI elements in an image using the YOLO CoreML model.
    /// Returns bounding boxes with class labels and confidence scores.
    ///
    /// - Parameters:
    ///   - imagePath: Path of the image file to analyse.
    ///   - confidence: Minimum observation confidence to keep (default 0.25).
    /// - Returns: One dictionary per detection with `class`, `confidence` and
    ///   `bounds` (`x`, `y`, `width`, `height` in image pixels, origin top-left).
    /// - Throws: `BridgeError.general` when the model or the image cannot be loaded,
    ///   or any error raised by Vision.
    func detectElements(imagePath: String, confidence: Double = 0.25) throws -> [[String: Any]] {
        try ensureYoloModel()
        guard let model = yoloModel else {
            throw BridgeError.general("YOLO model not loaded")
        }

        let cgImage = try loadCGImage(atPath: imagePath)
        let imageWidth = CGFloat(cgImage.width)
        let imageHeight = CGFloat(cgImage.height)

        let request = VNCoreMLRequest(model: model)
        request.imageCropAndScaleOption = .scaleFit

        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        try handler.perform([request])

        guard let observations = request.results as? [VNRecognizedObjectObservation] else {
            return []
        }

        let threshold = Float(confidence)
        var results: [[String: Any]] = []
        for obs in observations where obs.confidence >= threshold {
            let className = obs.labels.first?.identifier ?? "unknown"
            results.append([
                "class": className,
                "confidence": Double(obs.confidence),
                "bounds": pixelBounds(for: obs.boundingBox, imageWidth: imageWidth, imageHeight: imageHeight),
            ] as [String: Any])
        }

        return results
    }

    // MARK: - OCR

    /// Perform OCR on an image, optionally searching for specific text.
    /// Returns all recognized text with bounding boxes.
    ///
    /// - Parameters:
    ///   - imagePath: Path of the image file to analyse.
    ///   - searchText: Case-insensitive substring filter. `nil` or an empty string
    ///     returns every recognized region.
    ///   - mode: `"fast"` or `"accurate"` (default).
    func findText(imagePath: String, searchText: String?, mode: String = "accurate") throws -> [[String: Any]] {
        let results = try performOCR(imagePath: imagePath, mode: mode)

        // An empty needle carries no filter; substring search for "" is not
        // consistently defined, so treat it like "no search text".
        guard let search = searchText?.lowercased(), !search.isEmpty else {
            return results
        }

        return results.filter { result in
            guard let text = result["text"] as? String else { return false }
            return text.lowercased().contains(search)
        }
    }

    /// Full OCR of an image — returns all recognized text.
    /// mode: "fast" (perception loop, 10x faster) or "accurate" (tool actions, default)
    ///
    /// - Returns: `text` (all region strings joined by newlines) and `regions`.
    func ocr(imagePath: String, mode: String = "accurate") throws -> [String: Any] {
        let results = try performOCR(imagePath: imagePath, mode: mode)
        let fullText = results.compactMap { $0["text"] as? String }.joined(separator: "\n")
        return [
            "text": fullText,
            "regions": results,
        ]
    }

    /// OCR a specific region of a window — captures the window, crops to the ROI,
    /// and runs text recognition on just that region. Returns bounds in window coordinates.
    ///
    /// - Parameters:
    ///   - windowId: CGWindowID of the window to capture.
    ///   - region: ROI with `x`, `y`, `width`, `height`; missing keys default to 0.
    ///   - mode: `"fast"` or `"accurate"` (default).
    /// - Returns: `text` and `regions`; both empty when the ROI does not overlap
    ///   the captured image.
    /// - Throws: `BridgeError.general` when the window cannot be captured, or any
    ///   error raised by Vision.
    ///
    /// NOTE(review): the capture uses `.bestResolution`, so the image is presumably in
    /// backing pixels while callers may pass points — confirm the expected ROI units.
    func ocrRegion(windowId: Int, region: [String: Double], mode: String = "accurate") throws -> [String: Any] {
        let roiX = region["x"] ?? 0
        let roiY = region["y"] ?? 0
        let roiW = region["width"] ?? 0
        let roiH = region["height"] ?? 0

        // Capture the full window
        guard let fullImage = CGWindowListCreateImage(
            .null, .optionIncludingWindow, CGWindowID(windowId), [.bestResolution, .boundsIgnoreFraming]
        ) else {
            throw BridgeError.general("CGWindowListCreateImage returned nil for window \(windowId)")
        }

        // Crop to the ROI (CGImage coordinates have origin top-left, same as our ROI)
        let imageBounds = CGRect(x: 0, y: 0, width: fullImage.width, height: fullImage.height)
        let clippedROI = CGRect(x: roiX, y: roiY, width: roiW, height: roiH).intersection(imageBounds)
        guard !clippedROI.isEmpty else {
            return ["text": "", "regions": [] as [Any]]
        }

        // CGImage.cropping(to:) snaps the rect to integral bounds itself; do it here
        // so the offset applied below matches the pixels that were actually cropped.
        let cropRect = clippedROI.integral
        guard let cropped = fullImage.cropping(to: cropRect) else {
            return ["text": "", "regions": [] as [Any]]
        }

        // Run OCR on cropped image
        let results = try performOCROnImage(cropped, mode: mode)

        // Translate bounds from cropped-image coordinates back to window coordinates.
        // Use the real crop origin: it differs from the requested ROI origin when the
        // ROI was clipped to the image or had fractional coordinates.
        let offsetX = Double(cropRect.origin.x)
        let offsetY = Double(cropRect.origin.y)
        let adjustedResults: [[String: Any]] = results.map { entry in
            var adjusted = entry
            if var bounds = entry["bounds"] as? [String: Double] {
                bounds["x"] = (bounds["x"] ?? 0) + offsetX
                bounds["y"] = (bounds["y"] ?? 0) + offsetY
                adjusted["bounds"] = bounds
            }
            return adjusted
        }

        let fullText = adjustedResults.compactMap { $0["text"] as? String }.joined(separator: "\n")
        return [
            "text": fullText,
            "regions": adjustedResults,
        ]
    }

    /// Loads the image at `imagePath` and runs text recognition on it.
    private func performOCR(imagePath: String, mode: String = "accurate") throws -> [[String: Any]] {
        let cgImage = try loadCGImage(atPath: imagePath)
        return try performOCROnImage(cgImage, mode: mode)
    }

    /// Runs `VNRecognizeTextRequest` on `cgImage`.
    ///
    /// - Returns: One dictionary per observation with `text`, `confidence` and
    ///   `bounds` (`x`, `y`, `width`, `height` in image pixels, origin top-left).
    private func performOCROnImage(_ cgImage: CGImage, mode: String = "accurate") throws -> [[String: Any]] {
        let imageWidth = CGFloat(cgImage.width)
        let imageHeight = CGFloat(cgImage.height)

        let request = VNRecognizeTextRequest()

        if mode == "fast" {
            // FAST mode: 10x faster (~60ms vs ~631ms), used for perception loop
            // Trades ~20% text coverage for speed — acceptable for frame diffing
            request.recognitionLevel = .fast
            request.usesLanguageCorrection = false
            request.recognitionLanguages = ["en-US"]
        } else {
            // ACCURATE mode: full precision, used for tool actions (ocr, click_text)
            request.recognitionLevel = .accurate
            request.usesLanguageCorrection = true
            if #available(macOS 13.0, *) {
                request.automaticallyDetectsLanguage = true
            }
            // Prefer whatever this OS version supports; fall back to a fixed list
            // when the query fails.
            if let supported = try? request.supportedRecognitionLanguages() {
                request.recognitionLanguages = supported
            } else {
                request.recognitionLanguages = ["en-US", "zh-Hans", "zh-Hant", "ja", "ko", "hi", "ar", "de", "fr", "es", "pt", "it", "ru"]
            }
        }

        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        try handler.perform([request])

        guard let observations = request.results else {
            return []
        }

        var results: [[String: Any]] = []
        for observation in observations {
            guard let candidate = observation.topCandidates(1).first else { continue }

            results.append([
                "text": candidate.string,
                "confidence": Double(candidate.confidence),
                "bounds": pixelBounds(for: observation.boundingBox, imageWidth: imageWidth, imageHeight: imageHeight),
            ] as [String: Any])
        }

        return results
    }

    // MARK: - Helpers

    /// Loads an image file and returns its CGImage.
    /// - Throws: `BridgeError.general` when the file cannot be read or decoded.
    private func loadCGImage(atPath imagePath: String) throws -> CGImage {
        let url = URL(fileURLWithPath: imagePath)
        guard let image = NSImage(contentsOf: url),
              let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            throw BridgeError.general("Failed to load image at \(imagePath)")
        }
        return cgImage
    }

    /// Converts a Vision normalized bounding box (origin bottom-left) into pixel
    /// coordinates with a top-left origin.
    private func pixelBounds(for boundingBox: CGRect, imageWidth: CGFloat, imageHeight: CGFloat) -> [String: Any] {
        let x = boundingBox.origin.x * imageWidth
        let y = (1 - boundingBox.origin.y - boundingBox.height) * imageHeight
        let width = boundingBox.width * imageWidth
        let height = boundingBox.height * imageHeight
        return [
            "x": Double(x),
            "y": Double(y),
            "width": Double(width),
            "height": Double(height),
        ] as [String: Any]
    }
}
|