screenhand 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +193 -109
- package/bin/darwin-arm64/macos-bridge +0 -0
- package/dist/mcp-desktop.js +5876 -0
- package/dist/scripts/codex-monitor-daemon.js +335 -0
- package/dist/scripts/export-help-center.js +112 -0
- package/dist/scripts/marketing-loop.js +117 -0
- package/dist/scripts/observer-daemon.js +288 -0
- package/dist/scripts/orchestrator-daemon.js +399 -0
- package/dist/scripts/supervisor-daemon.js +272 -0
- package/dist/scripts/threads-campaign.js +208 -0
- package/dist/scripts/worker-daemon.js +228 -0
- package/dist/src/agent/cli.js +82 -0
- package/dist/src/agent/loop.js +274 -0
- package/dist/src/community/fetcher.js +109 -0
- package/dist/src/community/index.js +6 -0
- package/dist/src/community/publisher.js +191 -0
- package/dist/src/community/remote-api.js +121 -0
- package/dist/src/community/types.js +3 -0
- package/dist/src/community/validator.js +95 -0
- package/{src/config.ts → dist/src/config.js} +5 -10
- package/dist/src/context-tracker.js +489 -0
- package/{src/index.ts → dist/src/index.js} +32 -52
- package/dist/src/ingestion/coverage-auditor.js +233 -0
- package/dist/src/ingestion/doc-parser.js +164 -0
- package/dist/src/ingestion/index.js +8 -0
- package/dist/src/ingestion/menu-scanner.js +152 -0
- package/dist/src/ingestion/reference-merger.js +186 -0
- package/dist/src/ingestion/shortcut-extractor.js +180 -0
- package/dist/src/ingestion/tutorial-extractor.js +170 -0
- package/dist/src/ingestion/types.js +3 -0
- package/dist/src/jobs/manager.js +305 -0
- package/dist/src/jobs/runner.js +806 -0
- package/dist/src/jobs/store.js +102 -0
- package/dist/src/jobs/types.js +30 -0
- package/dist/src/jobs/worker.js +97 -0
- package/dist/src/learning/engine.js +356 -0
- package/dist/src/learning/index.js +9 -0
- package/dist/src/learning/locator-policy.js +120 -0
- package/dist/src/learning/pattern-policy.js +89 -0
- package/dist/src/learning/recovery-policy.js +116 -0
- package/dist/src/learning/sensor-policy.js +115 -0
- package/dist/src/learning/timing-model.js +204 -0
- package/dist/src/learning/topology-policy.js +90 -0
- package/dist/src/learning/types.js +9 -0
- package/dist/src/logging/timeline-logger.js +48 -0
- package/dist/src/mcp/mcp-stdio-server.js +464 -0
- package/dist/src/mcp/server.js +363 -0
- package/dist/src/mcp-entry.js +60 -0
- package/dist/src/memory/playbook-seeds.js +200 -0
- package/dist/src/memory/recall.js +222 -0
- package/dist/src/memory/research.js +104 -0
- package/dist/src/memory/seeds.js +101 -0
- package/dist/src/memory/service.js +446 -0
- package/dist/src/memory/session.js +169 -0
- package/dist/src/memory/store.js +451 -0
- package/{src/runtime/locator-cache.ts → dist/src/memory/types.js} +1 -17
- package/dist/src/monitor/codex-monitor.js +382 -0
- package/dist/src/monitor/task-queue.js +97 -0
- package/dist/src/monitor/types.js +62 -0
- package/dist/src/native/bridge-client.js +412 -0
- package/{src/native/macos-bridge-client.ts → dist/src/native/macos-bridge-client.js} +0 -1
- package/dist/src/observer/state.js +199 -0
- package/dist/src/observer/types.js +43 -0
- package/dist/src/orchestrator/state.js +68 -0
- package/dist/src/orchestrator/types.js +22 -0
- package/dist/src/perception/ax-source.js +162 -0
- package/dist/src/perception/cdp-source.js +162 -0
- package/dist/src/perception/coordinator.js +771 -0
- package/dist/src/perception/frame-differ.js +287 -0
- package/dist/src/perception/index.js +22 -0
- package/dist/src/perception/manager.js +199 -0
- package/dist/src/perception/types.js +47 -0
- package/dist/src/perception/vision-source.js +399 -0
- package/dist/src/planner/deterministic.js +298 -0
- package/dist/src/planner/executor.js +870 -0
- package/dist/src/planner/goal-store.js +92 -0
- package/dist/src/planner/index.js +21 -0
- package/dist/src/planner/planner.js +520 -0
- package/dist/src/planner/tool-registry.js +71 -0
- package/dist/src/planner/types.js +22 -0
- package/dist/src/platform/explorer.js +213 -0
- package/dist/src/platform/help-center-markdown.js +527 -0
- package/dist/src/platform/learner.js +257 -0
- package/dist/src/playbook/engine.js +486 -0
- package/dist/src/playbook/index.js +20 -0
- package/dist/src/playbook/mcp-recorder.js +204 -0
- package/dist/src/playbook/recorder.js +536 -0
- package/dist/src/playbook/runner.js +408 -0
- package/dist/src/playbook/store.js +312 -0
- package/dist/src/playbook/types.js +17 -0
- package/dist/src/recovery/detectors.js +156 -0
- package/dist/src/recovery/engine.js +327 -0
- package/dist/src/recovery/index.js +20 -0
- package/dist/src/recovery/strategies.js +274 -0
- package/dist/src/recovery/types.js +20 -0
- package/dist/src/runtime/accessibility-adapter.js +430 -0
- package/dist/src/runtime/app-adapter.js +64 -0
- package/dist/src/runtime/applescript-adapter.js +305 -0
- package/dist/src/runtime/ax-role-map.js +96 -0
- package/dist/src/runtime/browser-adapter.js +52 -0
- package/dist/src/runtime/cdp-chrome-adapter.js +521 -0
- package/dist/src/runtime/composite-adapter.js +221 -0
- package/dist/src/runtime/execution-contract.js +159 -0
- package/dist/src/runtime/executor.js +286 -0
- package/dist/src/runtime/locator-cache.js +50 -0
- package/dist/src/runtime/planning-loop.js +63 -0
- package/dist/src/runtime/service.js +432 -0
- package/dist/src/runtime/session-manager.js +63 -0
- package/dist/src/runtime/state-observer.js +121 -0
- package/dist/src/runtime/vision-adapter.js +225 -0
- package/dist/src/state/app-map-types.js +72 -0
- package/dist/src/state/app-map.js +1974 -0
- package/dist/src/state/entity-tracker.js +108 -0
- package/dist/src/state/fusion.js +96 -0
- package/dist/src/state/index.js +21 -0
- package/dist/src/state/ladder-generator.js +236 -0
- package/dist/src/state/persistence.js +156 -0
- package/dist/src/state/types.js +17 -0
- package/dist/src/state/world-model.js +1456 -0
- package/dist/src/supervisor/locks.js +186 -0
- package/dist/src/supervisor/supervisor.js +403 -0
- package/dist/src/supervisor/types.js +30 -0
- package/dist/src/test-mcp-protocol.js +154 -0
- package/dist/src/types.js +17 -0
- package/dist/src/util/atomic-write.js +133 -0
- package/dist/src/util/sanitize.js +146 -0
- package/dist-app-maps/com.figma.Desktop.json +959 -0
- package/dist-app-maps/com.hnc.Discord.json +1146 -0
- package/dist-app-maps/notion.id.json +2831 -0
- package/dist-playbooks/canva-screenhand-carousel.json +445 -0
- package/dist-playbooks/codex-desktop.json +76 -0
- package/dist-playbooks/competitor-research-stack.json +122 -0
- package/dist-playbooks/davinci-color-grade.json +153 -0
- package/dist-playbooks/davinci-edit-timeline.json +162 -0
- package/dist-playbooks/davinci-render.json +114 -0
- package/dist-playbooks/devto.json +52 -0
- package/dist-playbooks/discord.json +41 -0
- package/dist-playbooks/google-flow-create-project.json +59 -0
- package/dist-playbooks/google-flow-edit-image.json +90 -0
- package/dist-playbooks/google-flow-edit-video.json +90 -0
- package/dist-playbooks/google-flow-generate-image.json +68 -0
- package/dist-playbooks/google-flow-generate-video.json +191 -0
- package/dist-playbooks/google-flow-open-project.json +48 -0
- package/dist-playbooks/google-flow-open-scenebuilder.json +64 -0
- package/dist-playbooks/google-flow-search-assets.json +64 -0
- package/dist-playbooks/instagram.json +57 -0
- package/dist-playbooks/linkedin.json +52 -0
- package/dist-playbooks/n8n.json +43 -0
- package/dist-playbooks/reddit.json +52 -0
- package/dist-playbooks/threads.json +59 -0
- package/dist-playbooks/x-twitter.json +59 -0
- package/dist-playbooks/youtube.json +59 -0
- package/dist-references/canva.json +646 -0
- package/dist-references/codex-desktop.json +305 -0
- package/dist-references/davinci-resolve-keyboard.json +594 -0
- package/dist-references/davinci-resolve-menu-map.json +1139 -0
- package/dist-references/davinci-resolve-menus-batch1.json +116 -0
- package/dist-references/davinci-resolve-menus-batch2.json +372 -0
- package/dist-references/davinci-resolve-menus-batch3.json +330 -0
- package/dist-references/davinci-resolve-menus-batch4.json +297 -0
- package/dist-references/davinci-resolve-shortcuts.json +333 -0
- package/dist-references/devto.json +317 -0
- package/dist-references/discord.json +549 -0
- package/dist-references/figma.json +1186 -0
- package/dist-references/finder.json +146 -0
- package/dist-references/google-ads-transparency.json +95 -0
- package/dist-references/google-flow.json +649 -0
- package/dist-references/instagram.json +341 -0
- package/dist-references/linkedin.json +324 -0
- package/dist-references/meta-ad-library.json +86 -0
- package/dist-references/n8n.json +387 -0
- package/dist-references/notes.json +27 -0
- package/dist-references/notion.json +163 -0
- package/dist-references/reddit.json +341 -0
- package/dist-references/threads.json +337 -0
- package/dist-references/x-twitter.json +403 -0
- package/dist-references/youtube.json +373 -0
- package/native/macos-bridge/Package.swift +1 -0
- package/native/macos-bridge/Sources/AccessibilityBridge.swift +257 -36
- package/native/macos-bridge/Sources/AppManagement.swift +212 -2
- package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +348 -53
- package/native/macos-bridge/Sources/StreamCapture.swift +136 -0
- package/native/macos-bridge/Sources/VisionBridge.swift +165 -7
- package/native/macos-bridge/Sources/main.swift +169 -16
- package/native/windows-bridge/Program.cs +5 -0
- package/native/windows-bridge/ScreenCapture.cs +124 -0
- package/package.json +29 -4
- package/scripts/postinstall.cjs +127 -0
- package/.claude/commands/automate.md +0 -28
- package/.claude/commands/debug-ui.md +0 -19
- package/.claude/commands/screenshot.md +0 -15
- package/.github/FUNDING.yml +0 -1
- package/.github/ISSUE_TEMPLATE/bug_report.md +0 -27
- package/.github/ISSUE_TEMPLATE/feature_request.md +0 -20
- package/.mcp.json +0 -8
- package/DESKTOP_MCP_GUIDE.md +0 -92
- package/SECURITY.md +0 -44
- package/docs/architecture.md +0 -47
- package/install-skills.sh +0 -19
- package/mcp-bridge.ts +0 -271
- package/mcp-desktop.ts +0 -1221
- package/playbooks/instagram.json +0 -41
- package/playbooks/instagram_v2.json +0 -201
- package/playbooks/x_v1.json +0 -211
- package/scripts/devpost-live-loop.mjs +0 -421
- package/src/logging/timeline-logger.ts +0 -55
- package/src/mcp/server.ts +0 -449
- package/src/memory/recall.ts +0 -191
- package/src/memory/research.ts +0 -146
- package/src/memory/seeds.ts +0 -123
- package/src/memory/session.ts +0 -201
- package/src/memory/store.ts +0 -434
- package/src/memory/types.ts +0 -69
- package/src/native/bridge-client.ts +0 -239
- package/src/runtime/accessibility-adapter.ts +0 -487
- package/src/runtime/app-adapter.ts +0 -169
- package/src/runtime/applescript-adapter.ts +0 -376
- package/src/runtime/ax-role-map.ts +0 -102
- package/src/runtime/browser-adapter.ts +0 -129
- package/src/runtime/cdp-chrome-adapter.ts +0 -676
- package/src/runtime/composite-adapter.ts +0 -274
- package/src/runtime/executor.ts +0 -396
- package/src/runtime/planning-loop.ts +0 -81
- package/src/runtime/service.ts +0 -448
- package/src/runtime/session-manager.ts +0 -50
- package/src/runtime/state-observer.ts +0 -136
- package/src/runtime/vision-adapter.ts +0 -297
- package/src/types.ts +0 -297
- package/tests/bridge-client.test.ts +0 -176
- package/tests/browser-stealth.test.ts +0 -210
- package/tests/composite-adapter.test.ts +0 -64
- package/tests/mcp-server.test.ts +0 -151
- package/tests/memory-recall.test.ts +0 -339
- package/tests/memory-research.test.ts +0 -159
- package/tests/memory-seeds.test.ts +0 -120
- package/tests/memory-store.test.ts +0 -392
- package/tests/types.test.ts +0 -92
- package/tsconfig.check.json +0 -17
- package/tsconfig.json +0 -19
- package/vitest.config.ts +0 -8
- /package/{playbooks → dist-references}/devpost.json +0 -0
|
@@ -1,13 +1,102 @@
|
|
|
1
1
|
import Foundation
|
|
2
2
|
import Vision
|
|
3
3
|
import AppKit
|
|
4
|
+
import CoreML
|
|
4
5
|
|
|
5
6
|
class VisionBridge {
|
|
6
7
|
|
|
8
|
+
// MARK: - YOLO Element Detection
|
|
9
|
+
|
|
10
|
+
private var yoloModel: VNCoreMLModel?
|
|
11
|
+
private let yoloClassNames = ["button", "field", "heading", "iframe", "image", "label", "link", "text"]
|
|
12
|
+
|
|
13
|
+
/// Load the YOLO CoreML model for UI element detection.
|
|
14
|
+
/// Called lazily on first detectElements call.
|
|
15
|
+
private func ensureYoloModel() throws {
    // Already loaded by an earlier call — nothing to do.
    guard yoloModel == nil else { return }

    // Candidate model locations, resolved relative to the real (symlink-free)
    // location of the running executable.
    let execDir = URL(fileURLWithPath: CommandLine.arguments[0])
        .resolvingSymlinksInPath()
        .deletingLastPathComponent()
    let candidates = [
        execDir.appendingPathComponent("../Resources/ui-elements.mlpackage"),
        execDir.appendingPathComponent("../../Resources/ui-elements.mlpackage"),
        execDir.appendingPathComponent("Resources/ui-elements.mlpackage"),
        // Development fallback.
        // NOTE(review): /tmp is world-writable, so any local user can plant a model
        // here; consider removing this entry from release builds.
        URL(fileURLWithPath: "/tmp/vins-yolo/web-ui-8cls.mlpackage"),
    ]

    // Remember the most recent load failure so that one broken candidate does not
    // hide a usable model further down the list.
    var lastLoadError: Error?

    for candidate in candidates where FileManager.default.fileExists(atPath: candidate.path) {
        do {
            let config = MLModelConfiguration()
            config.computeUnits = .all // Use ANE when available

            // MLModel(contentsOf:) is documented to take a *compiled* model
            // (.mlmodelc). An .mlpackage is the uncompiled source format, so it is
            // compiled first; the compiled copy is written to a temporary location
            // chosen by Core ML.
            let compiledURL = try MLModel.compileModel(at: candidate)
            let coreMLModel = try MLModel(contentsOf: compiledURL, configuration: config)
            yoloModel = try VNCoreMLModel(for: coreMLModel)
            return
        } catch {
            lastLoadError = error
        }
    }

    // A model file existed but could not be loaded: surface the underlying error.
    if let loadError = lastLoadError {
        throw loadError
    }
    throw BridgeError.general("YOLO model not found")
}
|
|
40
|
+
|
|
41
|
+
/// Detect UI elements in an image using the YOLO CoreML model.
|
|
42
|
+
/// Returns bounding boxes with class labels and confidence scores.
|
|
43
|
+
func detectElements(imagePath: String, confidence: Double = 0.25) throws -> [[String: Any]] {
    // Load the detector on first use; throws when no model can be loaded.
    try ensureYoloModel()
    guard let detector = yoloModel else {
        throw BridgeError.general("YOLO model not loaded")
    }

    // Decode the file into a CGImage that Vision can consume.
    guard let source = NSImage(contentsOf: URL(fileURLWithPath: imagePath)),
          let bitmap = source.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
        throw BridgeError.general("Failed to load image at \(imagePath)")
    }
    let pixelWidth = CGFloat(bitmap.width)
    let pixelHeight = CGFloat(bitmap.height)

    // Run the Core ML request synchronously against the decoded image.
    let detection = VNCoreMLRequest(model: detector)
    detection.imageCropAndScaleOption = .scaleFit
    try VNImageRequestHandler(cgImage: bitmap, options: [:]).perform([detection])

    guard let observations = detection.results as? [VNRecognizedObjectObservation] else {
        return []
    }

    // Keep detections at or above the threshold, in the order Vision returned them.
    let threshold = Float(confidence)
    return observations
        .filter { $0.confidence >= threshold }
        .map { observation -> [String: Any] in
            // Vision boxes are normalized with a bottom-left origin; scale them by
            // the image size and flip the y axis to get top-left-origin pixels.
            let box = observation.boundingBox
            let bounds: [String: Any] = [
                "x": Double(box.origin.x * pixelWidth),
                "y": Double((1 - box.origin.y - box.height) * pixelHeight),
                "width": Double(box.width * pixelWidth),
                "height": Double(box.height * pixelHeight),
            ]
            return [
                "class": observation.labels.first?.identifier ?? "unknown",
                "confidence": Double(observation.confidence),
                "bounds": bounds,
            ]
        }
}
|
|
95
|
+
|
|
7
96
|
/// Perform OCR on an image, optionally searching for specific text.
|
|
8
97
|
/// Returns all recognized text with bounding boxes.
|
|
9
|
-
func findText(imagePath: String, searchText: String
|
|
10
|
-
let results = try performOCR(imagePath: imagePath)
|
|
98
|
+
func findText(imagePath: String, searchText: String?, mode: String = "accurate") throws -> [[String: Any]] {
|
|
99
|
+
let results = try performOCR(imagePath: imagePath, mode: mode)
|
|
11
100
|
|
|
12
101
|
guard let search = searchText?.lowercased() else {
|
|
13
102
|
return results
|
|
@@ -20,8 +109,9 @@ class VisionBridge {
|
|
|
20
109
|
}
|
|
21
110
|
|
|
22
111
|
/// Full OCR of an image — returns all recognized text.
|
|
23
|
-
|
|
24
|
-
|
|
112
|
+
/// mode: "fast" (perception loop, 10x faster) or "accurate" (tool actions, default)
|
|
113
|
+
func ocr(imagePath: String, mode: String = "accurate") throws -> [String: Any] {
|
|
114
|
+
let results = try performOCR(imagePath: imagePath, mode: mode)
|
|
25
115
|
let fullText = results.compactMap { $0["text"] as? String }.joined(separator: "\n")
|
|
26
116
|
return [
|
|
27
117
|
"text": fullText,
|
|
@@ -29,7 +119,52 @@ class VisionBridge {
|
|
|
29
119
|
]
|
|
30
120
|
}
|
|
31
121
|
|
|
32
|
-
|
|
122
|
+
/// OCR a specific region of a window — captures the window, crops to the ROI,
|
|
123
|
+
/// and runs text recognition on just that region. Returns bounds in window coordinates.
|
|
124
|
+
func ocrRegion(windowId: Int, region: [String: Double], mode: String = "accurate") throws -> [String: Any] {
    // Requested region of interest; any missing key defaults to 0.
    let roiX = region["x"] ?? 0
    let roiY = region["y"] ?? 0
    let roiW = region["width"] ?? 0
    let roiH = region["height"] ?? 0

    // Capture the full window.
    // NOTE(review): .bestResolution may return a backing-scale (e.g. 2x) image, so the
    // region is effectively interpreted in captured-image pixels — confirm callers pass
    // pixel coordinates rather than window points.
    guard let fullImage = CGWindowListCreateImage(
        .null, .optionIncludingWindow, CGWindowID(windowId), [.bestResolution, .boundsIgnoreFraming]
    ) else {
        throw BridgeError.general("CGWindowListCreateImage returned nil for window \(windowId)")
    }

    // Clamp the request to the captured image (top-left origin, same as the ROI).
    let imageExtent = CGRect(x: 0, y: 0, width: fullImage.width, height: fullImage.height)
    let cropRect = CGRect(x: roiX, y: roiY, width: roiW, height: roiH).intersection(imageExtent)

    // Nothing of the region lies inside the window image: report an empty result.
    guard !cropRect.isEmpty,
          let cropped = fullImage.cropping(to: cropRect) else {
        return ["text": "", "regions": [] as [Any]]
    }

    // Run OCR on just the cropped pixels.
    let results = try performOCROnImage(cropped, mode: mode)

    // Translate bounds from cropped-image coordinates back to the full image.
    // The offset must be the origin of the rect that was actually cropped: when the
    // requested region starts left of / above the image, the intersection moves the
    // origin to 0, and adding the unclamped request origin would shift every box.
    let offsetX = Double(cropRect.origin.x)
    let offsetY = Double(cropRect.origin.y)
    let adjustedResults: [[String: Any]] = results.map { entry in
        var adjusted = entry
        if var bounds = entry["bounds"] as? [String: Double] {
            bounds["x"] = (bounds["x"] ?? 0) + offsetX
            bounds["y"] = (bounds["y"] ?? 0) + offsetY
            adjusted["bounds"] = bounds
        }
        return adjusted
    }

    let fullText = adjustedResults.compactMap { $0["text"] as? String }.joined(separator: "\n")
    return [
        "text": fullText,
        "regions": adjustedResults,
    ]
}
|
|
166
|
+
|
|
167
|
+
private func performOCR(imagePath: String, mode: String = "accurate") throws -> [[String: Any]] {
|
|
33
168
|
let url = URL(fileURLWithPath: imagePath)
|
|
34
169
|
|
|
35
170
|
guard let image = NSImage(contentsOf: url),
|
|
@@ -37,12 +172,35 @@ class VisionBridge {
|
|
|
37
172
|
throw BridgeError.general("Failed to load image at \(imagePath)")
|
|
38
173
|
}
|
|
39
174
|
|
|
175
|
+
return try performOCROnImage(cgImage, mode: mode)
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
private func performOCROnImage(_ cgImage: CGImage, mode: String = "accurate") throws -> [[String: Any]] {
|
|
40
179
|
let imageWidth = CGFloat(cgImage.width)
|
|
41
180
|
let imageHeight = CGFloat(cgImage.height)
|
|
42
181
|
|
|
43
182
|
let request = VNRecognizeTextRequest()
|
|
44
|
-
|
|
45
|
-
|
|
183
|
+
|
|
184
|
+
if mode == "fast" {
|
|
185
|
+
// FAST mode: 10x faster (~60ms vs ~631ms), used for perception loop
|
|
186
|
+
// Trades ~20% text coverage for speed — acceptable for frame diffing
|
|
187
|
+
request.recognitionLevel = .fast
|
|
188
|
+
request.usesLanguageCorrection = false
|
|
189
|
+
request.recognitionLanguages = ["en-US"]
|
|
190
|
+
} else {
|
|
191
|
+
// ACCURATE mode: full precision, used for tool actions (ocr, click_text)
|
|
192
|
+
request.recognitionLevel = .accurate
|
|
193
|
+
request.usesLanguageCorrection = true
|
|
194
|
+
if #available(macOS 13.0, *) {
|
|
195
|
+
request.automaticallyDetectsLanguage = true
|
|
196
|
+
}
|
|
197
|
+
let supportedLangs = try? request.supportedRecognitionLanguages()
|
|
198
|
+
if let supported = supportedLangs {
|
|
199
|
+
request.recognitionLanguages = supported
|
|
200
|
+
} else {
|
|
201
|
+
request.recognitionLanguages = ["en-US", "zh-Hans", "zh-Hant", "ja", "ko", "hi", "ar", "de", "fr", "es", "pt", "it", "ru"]
|
|
202
|
+
}
|
|
203
|
+
}
|
|
46
204
|
|
|
47
205
|
let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
|
|
48
206
|
try handler.perform([request])
|
|
@@ -4,6 +4,40 @@ import Foundation
|
|
|
4
4
|
/// Reads JSON requests from stdin (one per line), dispatches to the appropriate bridge,
|
|
5
5
|
/// and writes JSON responses to stdout (one per line).
|
|
6
6
|
|
|
7
|
+
// MARK: - Signal Handlers
|
|
8
|
+
// Catch fatal signals (SIGSEGV, SIGBUS, SIGABRT) that CGWindowListCreateImage
|
|
9
|
+
// can trigger on GPU-heavy windows. Write an error to stderr so the Node.js
|
|
10
|
+
// bridge client can detect the crash, then exit cleanly.
|
|
11
|
+
func installSignalHandlers() {
    // Fatal signals — crash reporting.
    // The handler reports on stderr and exits with status 128 + signal number.
    // Messages are StaticString constants so the handler does not build a String
    // (and therefore does not heap-allocate) while the process may be corrupted.
    let fatalSignals: [Int32] = [SIGSEGV, SIGBUS, SIGABRT]
    for sig in fatalSignals {
        signal(sig) { signum in
            // Numbers spelled out below are the Darwin values of these signals,
            // matching what interpolating `signum` would print.
            let report: StaticString
            switch signum {
            case SIGSEGV: report = "Bridge fatal signal 11 — restarting\n"
            case SIGBUS: report = "Bridge fatal signal 10 — restarting\n"
            case SIGABRT: report = "Bridge fatal signal 6 — restarting\n"
            default: report = "Bridge fatal signal — restarting\n" // not installed for other signals
            }
            report.withUTF8Buffer { bytes in
                _ = Darwin.write(STDERR_FILENO, bytes.baseAddress, bytes.count)
            }
            _exit(128 + signum)
        }
    }

    // Graceful shutdown signals — write a bridge.shutdown JSON-RPC notification
    // to stdout before exiting with status 0.
    let gracefulSignals: [Int32] = [SIGTERM, SIGINT]
    for sig in gracefulSignals {
        signal(sig) { signum in
            let notification: StaticString = signum == SIGTERM
                ? "{\"jsonrpc\":\"2.0\",\"method\":\"bridge.shutdown\",\"params\":{\"reason\":\"SIGTERM\"}}\n"
                : "{\"jsonrpc\":\"2.0\",\"method\":\"bridge.shutdown\",\"params\":{\"reason\":\"SIGINT\"}}\n"
            notification.withUTF8Buffer { bytes in
                _ = Darwin.write(STDOUT_FILENO, bytes.baseAddress, bytes.count)
            }
            // Flush anything still buffered in stdio, since _exit does not.
            // NOTE(review): fflush is not on the POSIX async-signal-safe list; kept to
            // preserve existing behaviour. Consider handling SIGTERM/SIGINT through a
            // DispatchSourceSignal so shutdown runs outside signal context.
            fflush(stdout)
            _exit(0)
        }
    }
}
|
|
39
|
+
installSignalHandlers()
|
|
40
|
+
|
|
7
41
|
struct JsonRpcRequest: Codable {
|
|
8
42
|
let id: Int
|
|
9
43
|
let method: String
|
|
@@ -117,7 +151,8 @@ let accessibilityBridge = AccessibilityBridge()
|
|
|
117
151
|
let observerBridge = ObserverBridge()
|
|
118
152
|
let coreGraphicsBridge = CoreGraphicsBridge()
|
|
119
153
|
let visionBridge = VisionBridge()
|
|
120
|
-
let appManagement = AppManagement()
|
|
154
|
+
let appManagement = AppManagement(ax: accessibilityBridge)
|
|
155
|
+
let streamCapture = StreamCapture()
|
|
121
156
|
|
|
122
157
|
// MARK: - Method Dispatch
|
|
123
158
|
|
|
@@ -149,6 +184,15 @@ func dispatch(method: String, params: [String: AnyCodable]?) throws -> Any {
|
|
|
149
184
|
case "app.frontmost":
|
|
150
185
|
return appManagement.frontmostApp()
|
|
151
186
|
|
|
187
|
+
// Window management (AX-enriched)
|
|
188
|
+
case "window.list":
|
|
189
|
+
return appManagement.listWindowsWithAX()
|
|
190
|
+
|
|
191
|
+
case "window.focus":
|
|
192
|
+
let windowId: Int = try requiredParam(params, "windowId")
|
|
193
|
+
try appManagement.focusWindow(windowId: windowId)
|
|
194
|
+
return ["ok": true]
|
|
195
|
+
|
|
152
196
|
// Accessibility
|
|
153
197
|
case "ax.findElement":
|
|
154
198
|
let pid: Int = try requiredParam(params, "pid")
|
|
@@ -157,21 +201,29 @@ func dispatch(method: String, params: [String: AnyCodable]?) throws -> Any {
|
|
|
157
201
|
let value: String? = param(params, "value")
|
|
158
202
|
let identifier: String? = param(params, "identifier")
|
|
159
203
|
let exact: Bool = param(params, "exact") ?? true
|
|
204
|
+
let maxDepth: Int = param(params, "maxDepth") ?? 30
|
|
160
205
|
return try accessibilityBridge.findElement(
|
|
161
206
|
pid: pid_t(pid), role: role, title: title, value: value,
|
|
162
|
-
identifier: identifier, exact: exact
|
|
207
|
+
identifier: identifier, exact: exact, maxDepth: maxDepth
|
|
163
208
|
)
|
|
164
209
|
|
|
165
210
|
case "ax.getElementTree":
|
|
166
211
|
let pid: Int = try requiredParam(params, "pid")
|
|
167
212
|
let maxDepth: Int = param(params, "maxDepth") ?? 5
|
|
168
|
-
|
|
213
|
+
let windowId: Int? = param(params, "windowId")
|
|
214
|
+
return try accessibilityBridge.getElementTree(pid: pid_t(pid), maxDepth: maxDepth, windowId: windowId)
|
|
215
|
+
|
|
216
|
+
case "ax.getMenuBar":
|
|
217
|
+
let pid: Int = try requiredParam(params, "pid")
|
|
218
|
+
let maxDepth: Int = param(params, "maxDepth") ?? 10
|
|
219
|
+
return try accessibilityBridge.getMenuBarTree(pid: pid_t(pid), maxDepth: maxDepth)
|
|
169
220
|
|
|
170
221
|
case "ax.performAction":
|
|
171
222
|
let pid: Int = try requiredParam(params, "pid")
|
|
172
223
|
let elementPath: [Int] = try requiredParam(params, "elementPath")
|
|
173
224
|
let action: String = param(params, "action") ?? "AXPress"
|
|
174
|
-
|
|
225
|
+
let expectedTitle: String? = param(params, "expectedTitle")
|
|
226
|
+
try accessibilityBridge.performAction(pid: pid_t(pid), elementPath: elementPath, action: action, expectedTitle: expectedTitle)
|
|
175
227
|
return ["ok": true]
|
|
176
228
|
|
|
177
229
|
case "ax.setElementValue":
|
|
@@ -210,13 +262,16 @@ func dispatch(method: String, params: [String: AnyCodable]?) throws -> Any {
|
|
|
210
262
|
let y: Double = try requiredParam(params, "y")
|
|
211
263
|
let button: String = param(params, "button") ?? "left"
|
|
212
264
|
let clickCount: Int = param(params, "clickCount") ?? 1
|
|
213
|
-
|
|
265
|
+
let modifiers: [String] = param(params, "modifiers") ?? []
|
|
266
|
+
let mcTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
|
|
267
|
+
coreGraphicsBridge.mouseClick(x: x, y: y, button: button, clickCount: clickCount, modifiers: modifiers, targetPid: mcTargetPid)
|
|
214
268
|
return ["ok": true]
|
|
215
269
|
|
|
216
270
|
case "cg.mouseMove":
|
|
217
271
|
let x: Double = try requiredParam(params, "x")
|
|
218
272
|
let y: Double = try requiredParam(params, "y")
|
|
219
|
-
|
|
273
|
+
let mmTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
|
|
274
|
+
coreGraphicsBridge.mouseMove(x: x, y: y, targetPid: mmTargetPid)
|
|
220
275
|
return ["ok": true]
|
|
221
276
|
|
|
222
277
|
case "cg.mouseDrag":
|
|
@@ -224,7 +279,24 @@ func dispatch(method: String, params: [String: AnyCodable]?) throws -> Any {
|
|
|
224
279
|
let fromY: Double = try requiredParam(params, "fromY")
|
|
225
280
|
let toX: Double = try requiredParam(params, "toX")
|
|
226
281
|
let toY: Double = try requiredParam(params, "toY")
|
|
227
|
-
|
|
282
|
+
let dragModifiers: [String] = param(params, "modifiers") ?? []
|
|
283
|
+
let mdTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
|
|
284
|
+
coreGraphicsBridge.mouseDrag(fromX: fromX, fromY: fromY, toX: toX, toY: toY, modifiers: dragModifiers, targetPid: mdTargetPid)
|
|
285
|
+
return ["ok": true]
|
|
286
|
+
|
|
287
|
+
case "cg.mousePressAndHold":
|
|
288
|
+
let phX: Double = try requiredParam(params, "x")
|
|
289
|
+
let phY: Double = try requiredParam(params, "y")
|
|
290
|
+
let phDuration: Int = param(params, "durationMs") ?? 500
|
|
291
|
+
let phTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
|
|
292
|
+
coreGraphicsBridge.mousePressAndHold(x: phX, y: phY, durationMs: phDuration, targetPid: phTargetPid)
|
|
293
|
+
return ["ok": true]
|
|
294
|
+
|
|
295
|
+
case "cg.keyPressAndHold":
|
|
296
|
+
let kphKey: String = try requiredParam(params, "key")
|
|
297
|
+
let kphDuration: Int = param(params, "durationMs") ?? 500
|
|
298
|
+
let kphTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
|
|
299
|
+
coreGraphicsBridge.keyPressAndHold(key: kphKey, durationMs: kphDuration, targetPid: kphTargetPid)
|
|
228
300
|
return ["ok": true]
|
|
229
301
|
|
|
230
302
|
case "cg.mouseFlick":
|
|
@@ -232,17 +304,20 @@ func dispatch(method: String, params: [String: AnyCodable]?) throws -> Any {
|
|
|
232
304
|
let fyF: Double = try requiredParam(params, "fromY")
|
|
233
305
|
let txF: Double = try requiredParam(params, "toX")
|
|
234
306
|
let tyF: Double = try requiredParam(params, "toY")
|
|
235
|
-
|
|
307
|
+
let mfTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
|
|
308
|
+
coreGraphicsBridge.mouseFlick(fromX: fxF, fromY: fyF, toX: txF, toY: tyF, targetPid: mfTargetPid)
|
|
236
309
|
return ["ok": true]
|
|
237
310
|
|
|
238
311
|
case "cg.keyCombo":
|
|
239
312
|
let keys: [String] = try requiredParam(params, "keys")
|
|
240
|
-
|
|
313
|
+
let kcTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
|
|
314
|
+
coreGraphicsBridge.keyCombo(keys: keys, targetPid: kcTargetPid)
|
|
241
315
|
return ["ok": true]
|
|
242
316
|
|
|
243
317
|
case "cg.typeText":
|
|
244
318
|
let text: String = try requiredParam(params, "text")
|
|
245
|
-
|
|
319
|
+
let ttTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
|
|
320
|
+
coreGraphicsBridge.typeText(text: text, targetPid: ttTargetPid)
|
|
246
321
|
return ["ok": true]
|
|
247
322
|
|
|
248
323
|
case "cg.captureScreen":
|
|
@@ -251,25 +326,89 @@ func dispatch(method: String, params: [String: AnyCodable]?) throws -> Any {
|
|
|
251
326
|
|
|
252
327
|
case "cg.captureWindow":
|
|
253
328
|
let windowId: Int = try requiredParam(params, "windowId")
|
|
254
|
-
|
|
329
|
+
let safeCLI: Bool = param(params, "safeCLI") ?? false
|
|
330
|
+
return try coreGraphicsBridge.captureWindow(windowId: windowId, safeCLI: safeCLI)
|
|
331
|
+
|
|
332
|
+
case "cg.captureWindowBuffer":
|
|
333
|
+
let windowId: Int = try requiredParam(params, "windowId")
|
|
334
|
+
let safeCLI: Bool = param(params, "safeCLI") ?? false
|
|
335
|
+
return try coreGraphicsBridge.captureWindowBuffer(windowId: windowId, safeCLI: safeCLI)
|
|
255
336
|
|
|
256
337
|
case "cg.scroll":
|
|
257
338
|
let x: Double = try requiredParam(params, "x")
|
|
258
339
|
let y: Double = try requiredParam(params, "y")
|
|
259
340
|
let deltaX: Int = param(params, "deltaX") ?? 0
|
|
260
341
|
let deltaY: Int = param(params, "deltaY") ?? 0
|
|
261
|
-
|
|
342
|
+
let scTargetPid: pid_t? = (param(params, "targetPid") as Int?).map { pid_t($0) }
|
|
343
|
+
coreGraphicsBridge.scroll(x: x, y: y, deltaX: deltaX, deltaY: deltaY, targetPid: scTargetPid)
|
|
262
344
|
return ["ok": true]
|
|
263
345
|
|
|
264
346
|
// Vision
|
|
265
347
|
case "vision.findText":
|
|
266
348
|
let imagePath: String = try requiredParam(params, "imagePath")
|
|
267
349
|
let searchText: String? = param(params, "searchText")
|
|
268
|
-
|
|
350
|
+
let ftMode: String = param(params, "mode") ?? "accurate"
|
|
351
|
+
return try visionBridge.findText(imagePath: imagePath, searchText: searchText, mode: ftMode)
|
|
269
352
|
|
|
270
353
|
case "vision.ocr":
|
|
271
354
|
let imagePath: String = try requiredParam(params, "imagePath")
|
|
272
|
-
|
|
355
|
+
let ocrMode: String = param(params, "mode") ?? "accurate"
|
|
356
|
+
return try visionBridge.ocr(imagePath: imagePath, mode: ocrMode)
|
|
357
|
+
|
|
358
|
+
case "vision.ocrRegion":
|
|
359
|
+
let windowId: Int = try requiredParam(params, "windowId")
|
|
360
|
+
let region: [String: Double] = try requiredParam(params, "region")
|
|
361
|
+
let ocrRegionMode: String = param(params, "mode") ?? "accurate"
|
|
362
|
+
return try visionBridge.ocrRegion(windowId: windowId, region: region, mode: ocrRegionMode)
|
|
363
|
+
|
|
364
|
+
case "vision.detectElements":
|
|
365
|
+
let imagePath: String = try requiredParam(params, "imagePath")
|
|
366
|
+
let confidence: Double = param(params, "confidence") ?? 0.25
|
|
367
|
+
let elements = try visionBridge.detectElements(imagePath: imagePath, confidence: confidence)
|
|
368
|
+
return ["elements": elements, "count": elements.count]
|
|
369
|
+
|
|
370
|
+
// Stream capture — continuous SCStream for fast perception
|
|
371
|
+
case "vision.startStream":
|
|
372
|
+
let windowId: Int = try requiredParam(params, "windowId")
|
|
373
|
+
let fps: Int = param(params, "fps") ?? 30
|
|
374
|
+
let sem = DispatchSemaphore(value: 0)
|
|
375
|
+
var streamError: Error?
|
|
376
|
+
Task {
|
|
377
|
+
do {
|
|
378
|
+
try await streamCapture.start(windowId: windowId, fps: fps)
|
|
379
|
+
} catch {
|
|
380
|
+
streamError = error
|
|
381
|
+
}
|
|
382
|
+
sem.signal()
|
|
383
|
+
}
|
|
384
|
+
sem.wait()
|
|
385
|
+
if let err = streamError { throw err }
|
|
386
|
+
return ["ok": true, "fps": fps]
|
|
387
|
+
|
|
388
|
+
case "vision.stopStream":
|
|
389
|
+
let sem = DispatchSemaphore(value: 0)
|
|
390
|
+
Task {
|
|
391
|
+
await streamCapture.stop()
|
|
392
|
+
sem.signal()
|
|
393
|
+
}
|
|
394
|
+
sem.wait()
|
|
395
|
+
return ["ok": true]
|
|
396
|
+
|
|
397
|
+
case "vision.streamStatus":
|
|
398
|
+
let running = streamCapture.isRunning
|
|
399
|
+
if running, let info = streamCapture.getLatestInfo() {
|
|
400
|
+
return ["running": true, "path": info["path"]!, "width": info["width"]!, "height": info["height"]!, "ageMs": info["ageMs"]!, "frameCount": info["frameCount"]!]
|
|
401
|
+
}
|
|
402
|
+
return ["running": running]
|
|
403
|
+
|
|
404
|
+
case "vision.latestFrame":
|
|
405
|
+
guard streamCapture.isRunning else {
|
|
406
|
+
throw BridgeError.general("Stream not running")
|
|
407
|
+
}
|
|
408
|
+
guard let info = streamCapture.getLatestInfo() else {
|
|
409
|
+
throw BridgeError.general("No frame captured yet")
|
|
410
|
+
}
|
|
411
|
+
return info
|
|
273
412
|
|
|
274
413
|
default:
|
|
275
414
|
throw BridgeError.general("Unknown method: \(method)")
|
|
@@ -334,9 +473,23 @@ while let line = readLine() {
|
|
|
334
473
|
writeResponse(response)
|
|
335
474
|
}
|
|
336
475
|
} catch {
|
|
337
|
-
// Malformed JSON —
|
|
476
|
+
// Malformed JSON — try to extract id from raw string
|
|
477
|
+
var extractedId = 0
|
|
478
|
+
if let idRange = line.range(of: "\"id\"\\s*:\\s*(\\d+)", options: .regularExpression) {
|
|
479
|
+
let match = line[idRange]
|
|
480
|
+
if let digitRange = match.range(of: "\\d+$", options: .regularExpression) {
|
|
481
|
+
extractedId = Int(match[digitRange]) ?? 0
|
|
482
|
+
}
|
|
483
|
+
}
|
|
484
|
+
if extractedId == 0 {
|
|
485
|
+
// No id could be extracted — log to stderr so Node.js BridgeClient can detect it
|
|
486
|
+
let stderrMsg = "Bridge parse error (no id): \(error.localizedDescription)\n"
|
|
487
|
+
stderrMsg.withCString { ptr in
|
|
488
|
+
_ = Darwin.write(STDERR_FILENO, ptr, Int(strlen(ptr)))
|
|
489
|
+
}
|
|
490
|
+
}
|
|
338
491
|
let response = JsonRpcResponse(
|
|
339
|
-
id:
|
|
492
|
+
id: extractedId,
|
|
340
493
|
result: nil,
|
|
341
494
|
error: JsonRpcError(code: -32700, message: "Parse error: \(error.localizedDescription)")
|
|
342
495
|
)
|
|
@@ -156,6 +156,8 @@ class Program
|
|
|
156
156
|
Param<Dictionary<string, double>>(p, "region")),
|
|
157
157
|
"cg.captureWindow" => _screenCapture.CaptureWindow(
|
|
158
158
|
RequiredParam<int>(p, "windowId")),
|
|
159
|
+
"cg.captureWindowBuffer" => _screenCapture.CaptureWindowBuffer(
|
|
160
|
+
RequiredParam<int>(p, "windowId")),
|
|
159
161
|
|
|
160
162
|
// Vision (OCR)
|
|
161
163
|
"vision.findText" => _screenCapture.FindText(
|
|
@@ -163,6 +165,9 @@ class Program
|
|
|
163
165
|
Param<string>(p, "searchText")),
|
|
164
166
|
"vision.ocr" => _screenCapture.Ocr(
|
|
165
167
|
RequiredParam<string>(p, "imagePath")),
|
|
168
|
+
"vision.ocrRegion" => _screenCapture.OcrRegion(
|
|
169
|
+
RequiredParam<int>(p, "windowId"),
|
|
170
|
+
RequiredParam<Dictionary<string, double>>(p, "region")),
|
|
166
171
|
|
|
167
172
|
_ => throw new BridgeException($"Unknown method: {method}"),
|
|
168
173
|
};
|
|
@@ -157,6 +157,130 @@ class ScreenCapture
|
|
|
157
157
|
};
|
|
158
158
|
}
|
|
159
159
|
|
|
160
|
+
/// <summary>
/// Capture a specific window in-memory and return it as a base64-encoded PNG (no disk I/O).
/// Equivalent to macOS captureWindowBuffer.
/// </summary>
/// <param name="windowId">Window handle value; it is wrapped in an <see cref="IntPtr"/> and passed to the native calls.</param>
/// <returns>
/// A dictionary with <c>base64</c> (the PNG bytes, base64-encoded) plus <c>width</c> and
/// <c>height</c> (the size of the rectangle reported by GetWindowRect).
/// </returns>
/// <exception cref="BridgeException">The window rectangle has a non-positive width or height.</exception>
public Dictionary<string, object> CaptureWindowBuffer(int windowId)
{
    var hWnd = new IntPtr(windowId);
    GetWindowRect(hWnd, out RECT rect);

    int width = rect.Right - rect.Left;
    int height = rect.Bottom - rect.Top;

    if (width <= 0 || height <= 0)
    {
        throw new BridgeException($"Window {windowId} has invalid dimensions");
    }

    using var bitmap = new Bitmap(width, height, PixelFormat.Format32bppArgb);
    using var graphics = Graphics.FromImage(bitmap);

    // GetHdc must always be paired with ReleaseHdc, so release it in a finally
    // block even if the native PrintWindow call throws.
    bool rendered;
    var hdc = graphics.GetHdc();
    try
    {
        rendered = PrintWindow(hWnd, hdc, PW_RENDERFULLCONTENT);
    }
    finally
    {
        graphics.ReleaseHdc(hdc);
    }

    if (!rendered)
    {
        // PrintWindow reported failure: fall back to copying the window's
        // rectangle straight from the screen.
        graphics.CopyFromScreen(rect.Left, rect.Top, 0, 0,
            new Size(width, height), CopyPixelOperation.SourceCopy);
    }

    // Encode to PNG entirely in memory.
    using var ms = new MemoryStream();
    bitmap.Save(ms, ImageFormat.Png);
    var base64 = Convert.ToBase64String(ms.ToArray());

    return new Dictionary<string, object>
    {
        ["base64"] = base64,
        ["width"] = width,
        ["height"] = height,
    };
}
|
|
199
|
+
|
|
200
|
+
/// <summary>
/// OCR a specific region of a window. Captures the window, crops to the ROI, runs OCR,
/// then translates the resulting bounds back to window coordinates.
/// Equivalent to macOS vision.ocrRegion.
/// </summary>
/// <param name="windowId">Window handle value; it is wrapped in an <see cref="IntPtr"/> and passed to the native calls.</param>
/// <param name="region">
/// ROI relative to the window. Recognised keys are <c>x</c>, <c>y</c>, <c>width</c> and <c>height</c>;
/// missing keys default to 0, 0, the window width and the window height respectively.
/// </param>
/// <returns>
/// The dictionary returned by <c>Ocr</c>, with each region's <c>bounds.x</c>/<c>bounds.y</c> shifted by
/// the ROI origin and with <c>roiX</c>, <c>roiY</c>, <c>roiWidth</c>, <c>roiHeight</c> added.
/// </returns>
/// <exception cref="BridgeException">
/// The window rectangle has a non-positive size, or the ROI is empty after clamping to the window.
/// </exception>
public Dictionary<string, object> OcrRegion(int windowId, Dictionary<string, double> region)
{
    // Saturating double -> int conversion. A plain (int) cast of NaN or an
    // out-of-range double yields an unspecified value, which could turn an
    // oversized ROI request into a bogus "negative area" error.
    static int ToInt(double value)
    {
        if (double.IsNaN(value))
        {
            return 0;
        }
        if (value >= int.MaxValue)
        {
            return int.MaxValue;
        }
        if (value <= int.MinValue)
        {
            return int.MinValue;
        }
        return (int)value;
    }

    // Adds an offset to a numeric entry of an OCR bounds dictionary, if present.
    // Convert.ToDouble tolerates any boxed numeric type, unlike a (double) unboxing cast.
    static void Shift(Dictionary<string, object> bounds, string key, int offset)
    {
        if (bounds.TryGetValue(key, out var raw) && raw != null)
        {
            bounds[key] = Convert.ToDouble(raw, System.Globalization.CultureInfo.InvariantCulture) + offset;
        }
    }

    var hWnd = new IntPtr(windowId);
    GetWindowRect(hWnd, out RECT rect);

    int winWidth = rect.Right - rect.Left;
    int winHeight = rect.Bottom - rect.Top;

    if (winWidth <= 0 || winHeight <= 0)
    {
        throw new BridgeException($"Window {windowId} has invalid dimensions");
    }

    int roiX = ToInt(region.GetValueOrDefault("x", 0));
    int roiY = ToInt(region.GetValueOrDefault("y", 0));
    int roiW = ToInt(region.GetValueOrDefault("width", winWidth));
    int roiH = ToInt(region.GetValueOrDefault("height", winHeight));

    // Clamp ROI to window bounds
    roiX = Math.Max(0, Math.Min(roiX, winWidth));
    roiY = Math.Max(0, Math.Min(roiY, winHeight));
    roiW = Math.Min(roiW, winWidth - roiX);
    roiH = Math.Min(roiH, winHeight - roiY);

    if (roiW <= 0 || roiH <= 0)
    {
        throw new BridgeException("ROI has zero or negative area after clamping");
    }

    // Capture full window
    using var fullBitmap = new Bitmap(winWidth, winHeight, PixelFormat.Format32bppArgb);
    using (var graphics = Graphics.FromImage(fullBitmap))
    {
        // GetHdc must always be paired with ReleaseHdc, even if PrintWindow throws.
        bool rendered;
        var hdc = graphics.GetHdc();
        try
        {
            rendered = PrintWindow(hWnd, hdc, PW_RENDERFULLCONTENT);
        }
        finally
        {
            graphics.ReleaseHdc(hdc);
        }

        if (!rendered)
        {
            // PrintWindow reported failure: fall back to copying from the screen.
            graphics.CopyFromScreen(rect.Left, rect.Top, 0, 0,
                new Size(winWidth, winHeight), CopyPixelOperation.SourceCopy);
        }
    }

    // Crop to ROI
    using var cropped = fullBitmap.Clone(
        new Rectangle(roiX, roiY, roiW, roiH), fullBitmap.PixelFormat);

    // The OCR entry point takes a file path, so the crop goes through a temp file.
    // The GUID suffix keeps two calls in the same millisecond from sharing
    // (and deleting) each other's file.
    var tempPath = Path.Combine(
        _tempDir,
        $"ocr_region_{DateTimeOffset.UtcNow.ToUnixTimeMilliseconds()}_{Guid.NewGuid():N}.png");

    try
    {
        // Saving inside the try ensures a partially written file is cleaned up too.
        cropped.Save(tempPath, ImageFormat.Png);

        var ocrResult = Ocr(tempPath);

        // Translate bounds back to window coordinates. Missing keys are skipped
        // rather than raising KeyNotFoundException.
        if (ocrResult.TryGetValue("regions", out var regionsObj) &&
            regionsObj is IEnumerable<object> regions)
        {
            foreach (var regionObj in regions)
            {
                if (regionObj is Dictionary<string, object> entry &&
                    entry.TryGetValue("bounds", out var boundsObj) &&
                    boundsObj is Dictionary<string, object> bounds)
                {
                    Shift(bounds, "x", roiX);
                    Shift(bounds, "y", roiY);
                }
            }
        }

        ocrResult["roiX"] = roiX;
        ocrResult["roiY"] = roiY;
        ocrResult["roiWidth"] = roiW;
        ocrResult["roiHeight"] = roiH;

        return ocrResult;
    }
    finally
    {
        try { File.Delete(tempPath); } catch { /* best-effort cleanup */ }
    }
}
|
|
283
|
+
|
|
160
284
|
/// <summary>
|
|
161
285
|
/// OCR an image file. Uses Windows.Media.Ocr when available, falls back to basic implementation.
|
|
162
286
|
/// </summary>
|