screenhand 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212) hide show
  1. package/README.md +165 -446
  2. package/bin/darwin-arm64/macos-bridge +0 -0
  3. package/dist/mcp-desktop.js +3615 -400
  4. package/dist/scripts/export-help-center.js +112 -0
  5. package/dist/scripts/marketing-loop.js +117 -0
  6. package/dist/scripts/observer-daemon.js +288 -0
  7. package/dist/scripts/orchestrator-daemon.js +399 -0
  8. package/dist/scripts/threads-campaign.js +208 -0
  9. package/dist/src/community/fetcher.js +109 -0
  10. package/dist/src/community/index.js +6 -0
  11. package/dist/src/community/publisher.js +191 -0
  12. package/dist/src/community/remote-api.js +121 -0
  13. package/dist/src/community/types.js +3 -0
  14. package/dist/src/community/validator.js +95 -0
  15. package/dist/src/context-tracker.js +489 -0
  16. package/dist/src/ingestion/coverage-auditor.js +233 -0
  17. package/dist/src/ingestion/doc-parser.js +164 -0
  18. package/dist/src/ingestion/index.js +8 -0
  19. package/dist/src/ingestion/menu-scanner.js +152 -0
  20. package/dist/src/ingestion/reference-merger.js +186 -0
  21. package/dist/src/ingestion/shortcut-extractor.js +180 -0
  22. package/dist/src/ingestion/tutorial-extractor.js +170 -0
  23. package/dist/src/ingestion/types.js +3 -0
  24. package/dist/src/jobs/manager.js +82 -14
  25. package/dist/src/jobs/runner.js +138 -15
  26. package/dist/src/learning/engine.js +356 -0
  27. package/dist/src/learning/index.js +9 -0
  28. package/dist/src/learning/locator-policy.js +120 -0
  29. package/dist/src/learning/pattern-policy.js +89 -0
  30. package/dist/src/learning/recovery-policy.js +116 -0
  31. package/dist/src/learning/sensor-policy.js +115 -0
  32. package/dist/src/learning/timing-model.js +204 -0
  33. package/dist/src/learning/topology-policy.js +90 -0
  34. package/dist/src/learning/types.js +9 -0
  35. package/dist/src/logging/timeline-logger.js +4 -1
  36. package/dist/src/memory/playbook-seeds.js +200 -0
  37. package/dist/src/memory/recall.js +60 -8
  38. package/dist/src/memory/service.js +30 -5
  39. package/dist/src/memory/store.js +34 -5
  40. package/dist/src/native/bridge-client.js +253 -31
  41. package/dist/src/observer/state.js +199 -0
  42. package/dist/src/observer/types.js +43 -0
  43. package/dist/src/orchestrator/state.js +68 -0
  44. package/dist/src/orchestrator/types.js +22 -0
  45. package/dist/src/perception/ax-source.js +162 -0
  46. package/dist/src/perception/cdp-source.js +162 -0
  47. package/dist/src/perception/coordinator.js +771 -0
  48. package/dist/src/perception/frame-differ.js +287 -0
  49. package/dist/src/perception/index.js +22 -0
  50. package/dist/src/perception/manager.js +199 -0
  51. package/dist/src/perception/types.js +47 -0
  52. package/dist/src/perception/vision-source.js +399 -0
  53. package/dist/src/planner/deterministic.js +298 -0
  54. package/dist/src/planner/executor.js +870 -0
  55. package/dist/src/planner/goal-store.js +92 -0
  56. package/dist/src/planner/index.js +21 -0
  57. package/dist/src/planner/planner.js +520 -0
  58. package/dist/src/planner/tool-registry.js +71 -0
  59. package/dist/src/planner/types.js +22 -0
  60. package/dist/src/platform/explorer.js +213 -0
  61. package/dist/src/platform/help-center-markdown.js +527 -0
  62. package/dist/src/platform/learner.js +257 -0
  63. package/dist/src/playbook/engine.js +296 -11
  64. package/dist/src/playbook/mcp-recorder.js +204 -0
  65. package/dist/src/playbook/recorder.js +3 -2
  66. package/dist/src/playbook/runner.js +1 -1
  67. package/dist/src/playbook/store.js +139 -10
  68. package/dist/src/recovery/detectors.js +156 -0
  69. package/dist/src/recovery/engine.js +327 -0
  70. package/dist/src/recovery/index.js +20 -0
  71. package/dist/src/recovery/strategies.js +274 -0
  72. package/dist/src/recovery/types.js +20 -0
  73. package/dist/src/runtime/accessibility-adapter.js +55 -18
  74. package/dist/src/runtime/applescript-adapter.js +8 -2
  75. package/dist/src/runtime/cdp-chrome-adapter.js +1 -1
  76. package/dist/src/runtime/executor.js +23 -3
  77. package/dist/src/runtime/locator-cache.js +24 -2
  78. package/dist/src/runtime/service.js +59 -15
  79. package/dist/src/runtime/session-manager.js +4 -1
  80. package/dist/src/runtime/vision-adapter.js +2 -1
  81. package/dist/src/state/app-map-types.js +72 -0
  82. package/dist/src/state/app-map.js +1974 -0
  83. package/dist/src/state/entity-tracker.js +108 -0
  84. package/dist/src/state/fusion.js +96 -0
  85. package/dist/src/state/index.js +21 -0
  86. package/dist/src/state/ladder-generator.js +236 -0
  87. package/dist/src/state/persistence.js +156 -0
  88. package/dist/src/state/types.js +17 -0
  89. package/dist/src/state/world-model.js +1456 -0
  90. package/dist/src/util/atomic-write.js +19 -4
  91. package/dist/src/util/sanitize.js +146 -0
  92. package/dist-app-maps/com.figma.Desktop.json +959 -0
  93. package/dist-app-maps/com.hnc.Discord.json +1146 -0
  94. package/dist-app-maps/notion.id.json +2831 -0
  95. package/dist-playbooks/canva-screenhand-carousel.json +445 -0
  96. package/dist-playbooks/codex-desktop.json +76 -0
  97. package/dist-playbooks/competitor-research-stack.json +122 -0
  98. package/dist-playbooks/davinci-color-grade.json +153 -0
  99. package/dist-playbooks/davinci-edit-timeline.json +162 -0
  100. package/dist-playbooks/davinci-render.json +114 -0
  101. package/dist-playbooks/devto.json +52 -0
  102. package/dist-playbooks/discord.json +41 -0
  103. package/dist-playbooks/google-flow-create-project.json +59 -0
  104. package/dist-playbooks/google-flow-edit-image.json +90 -0
  105. package/dist-playbooks/google-flow-edit-video.json +90 -0
  106. package/dist-playbooks/google-flow-generate-image.json +68 -0
  107. package/dist-playbooks/google-flow-generate-video.json +191 -0
  108. package/dist-playbooks/google-flow-open-project.json +48 -0
  109. package/dist-playbooks/google-flow-open-scenebuilder.json +64 -0
  110. package/dist-playbooks/google-flow-search-assets.json +64 -0
  111. package/dist-playbooks/instagram.json +57 -0
  112. package/dist-playbooks/linkedin.json +52 -0
  113. package/dist-playbooks/n8n.json +43 -0
  114. package/dist-playbooks/reddit.json +52 -0
  115. package/dist-playbooks/threads.json +59 -0
  116. package/dist-playbooks/x-twitter.json +59 -0
  117. package/dist-playbooks/youtube.json +59 -0
  118. package/dist-references/canva.json +646 -0
  119. package/dist-references/codex-desktop.json +305 -0
  120. package/dist-references/davinci-resolve-keyboard.json +594 -0
  121. package/dist-references/davinci-resolve-menu-map.json +1139 -0
  122. package/dist-references/davinci-resolve-menus-batch1.json +116 -0
  123. package/dist-references/davinci-resolve-menus-batch2.json +372 -0
  124. package/dist-references/davinci-resolve-menus-batch3.json +330 -0
  125. package/dist-references/davinci-resolve-menus-batch4.json +297 -0
  126. package/dist-references/davinci-resolve-shortcuts.json +333 -0
  127. package/dist-references/devpost.json +186 -0
  128. package/dist-references/devto.json +317 -0
  129. package/dist-references/discord.json +549 -0
  130. package/dist-references/figma.json +1186 -0
  131. package/dist-references/finder.json +146 -0
  132. package/dist-references/google-ads-transparency.json +95 -0
  133. package/dist-references/google-flow.json +649 -0
  134. package/dist-references/instagram.json +341 -0
  135. package/dist-references/linkedin.json +324 -0
  136. package/dist-references/meta-ad-library.json +86 -0
  137. package/dist-references/n8n.json +387 -0
  138. package/dist-references/notes.json +27 -0
  139. package/dist-references/notion.json +163 -0
  140. package/dist-references/reddit.json +341 -0
  141. package/dist-references/threads.json +337 -0
  142. package/dist-references/x-twitter.json +403 -0
  143. package/dist-references/youtube.json +373 -0
  144. package/native/macos-bridge/Package.swift +22 -0
  145. package/native/macos-bridge/Sources/AccessibilityBridge.swift +482 -0
  146. package/native/macos-bridge/Sources/AppManagement.swift +339 -0
  147. package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +537 -0
  148. package/native/macos-bridge/Sources/ObserverBridge.swift +120 -0
  149. package/native/macos-bridge/Sources/StreamCapture.swift +136 -0
  150. package/native/macos-bridge/Sources/VisionBridge.swift +238 -0
  151. package/native/macos-bridge/Sources/main.swift +498 -0
  152. package/native/windows-bridge/AppManagement.cs +234 -0
  153. package/native/windows-bridge/InputBridge.cs +436 -0
  154. package/native/windows-bridge/Program.cs +270 -0
  155. package/native/windows-bridge/ScreenCapture.cs +453 -0
  156. package/native/windows-bridge/UIAutomationBridge.cs +571 -0
  157. package/native/windows-bridge/WindowsBridge.csproj +17 -0
  158. package/package.json +12 -1
  159. package/scripts/postinstall.cjs +127 -0
  160. package/dist/.audit-log.jsonl +0 -55
  161. package/dist/.screenhand/memory/.lock +0 -1
  162. package/dist/.screenhand/memory/actions.jsonl +0 -85
  163. package/dist/.screenhand/memory/errors.jsonl +0 -5
  164. package/dist/.screenhand/memory/errors.jsonl.bak +0 -4
  165. package/dist/.screenhand/memory/state.json +0 -35
  166. package/dist/.screenhand/memory/state.json.bak +0 -35
  167. package/dist/.screenhand/memory/strategies.jsonl +0 -12
  168. package/dist/agent/cli.js +0 -73
  169. package/dist/agent/loop.js +0 -258
  170. package/dist/config.js +0 -9
  171. package/dist/index.js +0 -56
  172. package/dist/logging/timeline-logger.js +0 -29
  173. package/dist/mcp/mcp-stdio-server.js +0 -448
  174. package/dist/mcp/server.js +0 -347
  175. package/dist/mcp-entry.js +0 -59
  176. package/dist/memory/recall.js +0 -160
  177. package/dist/memory/research.js +0 -98
  178. package/dist/memory/seeds.js +0 -89
  179. package/dist/memory/session.js +0 -161
  180. package/dist/memory/store.js +0 -391
  181. package/dist/memory/types.js +0 -4
  182. package/dist/monitor/codex-monitor.js +0 -377
  183. package/dist/monitor/task-queue.js +0 -84
  184. package/dist/monitor/types.js +0 -49
  185. package/dist/native/bridge-client.js +0 -174
  186. package/dist/native/macos-bridge-client.js +0 -5
  187. package/dist/npm-publish-helper.js +0 -117
  188. package/dist/npm-token-cdp.js +0 -113
  189. package/dist/npm-token-create.js +0 -135
  190. package/dist/npm-token-finish.js +0 -126
  191. package/dist/playbook/engine.js +0 -193
  192. package/dist/playbook/index.js +0 -4
  193. package/dist/playbook/recorder.js +0 -519
  194. package/dist/playbook/runner.js +0 -392
  195. package/dist/playbook/store.js +0 -166
  196. package/dist/playbook/types.js +0 -4
  197. package/dist/runtime/accessibility-adapter.js +0 -377
  198. package/dist/runtime/app-adapter.js +0 -48
  199. package/dist/runtime/applescript-adapter.js +0 -283
  200. package/dist/runtime/ax-role-map.js +0 -80
  201. package/dist/runtime/browser-adapter.js +0 -36
  202. package/dist/runtime/cdp-chrome-adapter.js +0 -505
  203. package/dist/runtime/composite-adapter.js +0 -205
  204. package/dist/runtime/executor.js +0 -250
  205. package/dist/runtime/locator-cache.js +0 -12
  206. package/dist/runtime/planning-loop.js +0 -47
  207. package/dist/runtime/service.js +0 -372
  208. package/dist/runtime/session-manager.js +0 -28
  209. package/dist/runtime/state-observer.js +0 -105
  210. package/dist/runtime/vision-adapter.js +0 -208
  211. package/dist/test-mcp-protocol.js +0 -138
  212. package/dist/types.js +0 -1
@@ -0,0 +1,399 @@
1
+ // Copyright (C) 2025 Clazro Technology Private Limited
2
+ // SPDX-License-Identifier: AGPL-3.0-only
3
+ //
4
+ // This file is part of ScreenHand.
5
+ //
6
+ // ScreenHand is free software: you can redistribute it and/or modify
7
+ // it under the terms of the GNU Affero General Public License as
8
+ // published by the Free Software Foundation, version 3.
9
+ //
10
+ // ScreenHand is distributed in the hope that it will be useful,
11
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ // GNU Affero General Public License for more details.
14
+ //
15
+ // You should have received a copy of the GNU Affero General Public License
16
+ // along with ScreenHand. If not, see <https://www.gnu.org/licenses/>.
17
+ import fs from "node:fs";
18
+ import { FrameDiffer } from "./frame-differ.js";
19
/**
 * Best-effort removal of a temporary capture file.
 *
 * Failures (file already gone, permissions, …) are deliberately ignored: a
 * leftover temp file must never turn an otherwise successful capture into an error.
 *
 * @param {string} filePath - Path of the temp file to delete.
 */
function discardTempFile(filePath) {
    try {
        fs.unlinkSync(filePath);
    }
    catch {
        /* ignore */
    }
}
/**
 * Vision perception source — screenshot diff, OCR and element detection.
 *
 * Every capture / OCR / detection request is delegated to the native bridge
 * through `bridge.call(method, params)`. Change detection is delegated to a
 * FrameDiffer instance. All public async methods are best-effort: bridge
 * failures are reported as `null` (or an empty list) instead of being thrown.
 */
export class VisionSource {
    /** Native bridge client; must expose `call(method, params)` returning a promise. */
    bridge;
    /** FrameDiffer used for change detection between captures. */
    differ;
    /** When true, always use CLI fallback for captures (avoids CG API SIGSEGV on browser windows) */
    safeCLI = false;
    /** True after a successful startStream() until stopStream() is called. */
    streamRunning = false;
    /**
     * @param bridge - Native bridge client.
     * @param {number} [cellSize=128] - Cell size handed to the FrameDiffer.
     */
    constructor(bridge, cellSize = 128) {
        this.bridge = bridge;
        this.differ = new FrameDiffer(cellSize);
    }
    /** Enable safe CLI mode for browser windows that crash CGWindowListCreateImage */
    setSafeCLI(enabled) {
        this.safeCLI = enabled;
    }
    /**
     * SLOW rate: capture a window and diff it against the previous frame.
     *
     * Tries the in-memory `cg.captureWindowBuffer` call first and falls back to
     * the file-based `cg.captureWindow` call when that fails.
     *
     * @param windowId - Window to capture.
     * @returns A `vision_diff` event, or null when capture or diff failed.
     */
    async captureAndDiff(windowId) {
        const start = Date.now();
        try {
            let buffer;
            let width;
            let height;
            try {
                const result = await this.bridge.call("cg.captureWindowBuffer", { windowId, safeCLI: this.safeCLI });
                buffer = Buffer.from(result.base64, "base64");
                width = result.width;
                height = result.height;
            }
            catch {
                // Fallback: file-based capture
                const fileResult = await this.bridge.call("cg.captureWindow", { windowId, safeCLI: this.safeCLI });
                try {
                    buffer = fs.readFileSync(fileResult.path);
                }
                finally {
                    // Remove the temp file even when reading it failed.
                    discardTempFile(fileResult.path);
                }
                width = fileResult.width;
                height = fileResult.height;
            }
            const diffResult = this.differ.diff(buffer, width, height);
            const captureMs = Date.now() - start;
            return {
                source: "vision_diff",
                rate: "slow",
                timestamp: new Date().toISOString(),
                data: {
                    type: "vision_diff",
                    changed: diffResult.changed,
                    hash: diffResult.hash,
                    changedRegions: diffResult.changedRegions,
                    captureMs,
                },
            };
        }
        catch {
            return null;
        }
    }
    /**
     * OCR a specific region of interest.
     *
     * Uses the bridge's `vision.ocrRegion` when it succeeds; otherwise falls
     * back to a full-window capture followed by `vision.ocr`.
     *
     * @param windowId - Window to read.
     * @param roi - Region of interest (`x`, `y`, `width`, `height`); echoed back in the event.
     * @param {string} [mode="fast"] - OCR mode forwarded to `vision.ocrRegion`.
     * @returns A `vision_ocr` event (`regions` is always an array), or null on failure.
     */
    async ocrRegion(windowId, roi, mode = "fast") {
        const start = Date.now();
        try {
            let text;
            let regions;
            try {
                // Try ROI-specific OCR
                const result = await this.bridge.call("vision.ocrRegion", {
                    windowId,
                    region: { x: roi.x, y: roi.y, width: roi.width, height: roi.height },
                    mode,
                });
                text = result.text;
                regions = result.regions ?? [];
            }
            catch {
                // Fallback: full capture + OCR (less efficient)
                const shot = await this.bridge.call("cg.captureWindow", { windowId, safeCLI: this.safeCLI });
                try {
                    // NOTE(review): the fallback always requests "fast" regardless of `mode` —
                    // kept as-is because it may be a deliberate latency cap; confirm.
                    const ocrResult = await this.bridge.call("vision.ocr", { imagePath: shot.path, mode: "fast" });
                    text = ocrResult.text;
                    regions = ocrResult.regions ?? [];
                }
                finally {
                    // Remove the temp capture even when OCR itself failed.
                    discardTempFile(shot.path);
                }
            }
            const latencyMs = Date.now() - start;
            return {
                source: "vision_ocr",
                rate: "slow",
                timestamp: new Date().toISOString(),
                data: {
                    type: "vision_ocr",
                    roi,
                    text,
                    regions,
                    latencyMs,
                },
            };
        }
        catch {
            return null;
        }
    }
    /**
     * Capture window to a temp file (no base64 round-trip).
     * Returns whatever `cg.captureWindow` returns (callers read `path`, `width`
     * and `height` from it), or null on failure. The caller owns the file.
     */
    async captureToFile(windowId) {
        try {
            return await this.bridge.call("cg.captureWindow", { windowId, safeCLI: this.safeCLI });
        }
        catch {
            return null;
        }
    }
    /**
     * OCR an existing image file (no new capture needed).
     *
     * @param {string} imagePath - Image to read; it is not deleted by this method.
     * @param [roi] - Region echoed back in the event; a zero-sized placeholder is used when omitted.
     * @param {string} [mode="accurate"] - OCR mode forwarded to `vision.ocr`.
     * @returns A `vision_ocr` event, or null on failure.
     */
    async ocrFile(imagePath, roi, mode = "accurate") {
        const start = Date.now();
        try {
            const result = await this.bridge.call("vision.ocr", { imagePath, mode });
            return {
                source: "vision_ocr", rate: "slow",
                timestamp: new Date().toISOString(),
                data: {
                    type: "vision_ocr",
                    roi: roi ?? { x: 0, y: 0, width: 0, height: 0, reason: "changed_pixels" },
                    text: result.text,
                    regions: result.regions ?? [],
                    latencyMs: Date.now() - start,
                },
            };
        }
        catch {
            return null;
        }
    }
    /**
     * Single-capture pipeline: obtain one frame (stream frame or one-shot file),
     * diff it, and only when it changed run OCR and element detection on it.
     *
     * @param windowId - Window to capture.
     * @param {number} [maxROIs=3] - Maximum number of merged regions OCR'd individually;
     *   otherwise the whole captured file is OCR'd in "fast" mode.
     * @returns `{ diffEvent, ocrEvent }`, plus `yoloElements` when the frame changed.
     *   Both events are null when capture or diff failed.
     */
    async captureAndDiffOptimized(windowId, maxROIs = 3) {
        const start = Date.now();
        // 1. Capture: stream frame or one-shot file
        const capture = await this.captureToFileOrStream(windowId);
        if (!capture)
            return { diffEvent: null, ocrEvent: null };
        try {
            // 2. Diff against the previous frame
            const { changed, hash, changedRegions } = this.differ.diffFile(capture.path, capture.width, capture.height);
            const captureMs = Date.now() - start;
            const diffEvent = {
                source: "vision_diff", rate: "slow",
                timestamp: new Date().toISOString(),
                data: { type: "vision_diff", changed, hash, changedRegions, captureMs },
            };
            // 3. Unchanged → nothing to read
            if (!changed)
                return { diffEvent, ocrEvent: null };
            // 4. Run OCR and YOLO in parallel on the same captured frame
            const mergedRegions = FrameDiffer.mergeRegions(changedRegions, maxROIs, 64, capture.width, capture.height);
            const useRegionOcr = mergedRegions.length > 0 && mergedRegions.length <= maxROIs;
            const ocrPromise = useRegionOcr
                ? (async () => {
                    const regionResults = [];
                    // Regions are read one after another, in merge order.
                    for (const roi of mergedRegions) {
                        const regionEvent = await this.ocrRegion(windowId, roi);
                        if (regionEvent?.data.type === "vision_ocr" && regionEvent.data.regions) {
                            regionResults.push(...regionEvent.data.regions);
                        }
                    }
                    return {
                        source: "vision_ocr", rate: "slow",
                        timestamp: new Date().toISOString(),
                        data: {
                            type: "vision_ocr",
                            roi: mergedRegions[0],
                            text: regionResults.map((r) => r.text).join("\n"),
                            regions: regionResults,
                            latencyMs: Date.now() - start - captureMs,
                        },
                    };
                })()
                : this.ocrFile(capture.path, undefined, "fast");
            const yoloPromise = this.detectElements(capture.path, 0.25);
            const [ocrEvent, yoloElements] = await Promise.all([ocrPromise, yoloPromise]);
            return { diffEvent, ocrEvent, yoloElements };
        }
        catch {
            return { diffEvent: null, ocrEvent: null };
        }
        finally {
            // 5. Cleanup on every exit path (don't delete stream frames — they're owned by the native bridge)
            if (!capture.fromStream)
                discardTempFile(capture.path);
        }
    }
    // ── YOLO element detection ──
    /**
     * Detect UI elements in an image via the bridge's `vision.detectElements` call.
     *
     * @param {string} imagePath - Image to analyse.
     * @param {number} [confidence=0.25] - Confidence value forwarded to the bridge.
     * @returns The detected elements; an empty array when the call fails or
     *   the response carries no `elements`.
     */
    async detectElements(imagePath, confidence = 0.25) {
        try {
            const result = await this.bridge.call("vision.detectElements", { imagePath, confidence });
            return result.elements ?? [];
        }
        catch {
            return []; // Model not available — degrade gracefully
        }
    }
    /**
     * Fuse OCR text regions with YOLO element detections.
     *
     * Each YOLO element is greedily paired (in input order) with the closest
     * not-yet-used OCR region whose centre lies within `maxDistance` of the
     * element's centre. E.g. YOLO "button at (450,200)" + OCR "Submit at
     * (460,205)" → "Submit button".
     *
     * @param ocrRegions - Items with `text` and `bounds` (`x`, `y`, `width`, `height`).
     * @param yoloElements - Items with `class`, `confidence` and `bounds`.
     * @param {number} [maxDistance=50] - Maximum centre-to-centre distance (inclusive) for a match.
     * @returns One entry per YOLO element (`source` "fused" or "yolo"), followed
     *   by one entry per unmatched OCR region (`source` "ocr").
     */
    static fuseOcrAndYolo(ocrRegions, yoloElements, maxDistance = 50) {
        const results = [];
        const usedOcr = new Set();
        for (const elem of yoloElements) {
            const elemCenterX = elem.bounds.x + elem.bounds.width / 2;
            const elemCenterY = elem.bounds.y + elem.bounds.height / 2;
            let bestIdx = -1;
            let bestDist = Infinity;
            for (let i = 0; i < ocrRegions.length; i++) {
                if (usedOcr.has(i))
                    continue;
                const ocr = ocrRegions[i];
                const ocrCenterX = ocr.bounds.x + ocr.bounds.width / 2;
                const ocrCenterY = ocr.bounds.y + ocr.bounds.height / 2;
                const dist = Math.sqrt((elemCenterX - ocrCenterX) ** 2 + (elemCenterY - ocrCenterY) ** 2);
                // Enforce the documented limit exactly: anything beyond maxDistance is not a match.
                if (dist <= maxDistance && dist < bestDist) {
                    bestDist = dist;
                    bestIdx = i;
                }
            }
            if (bestIdx >= 0) {
                // Fused: YOLO class + OCR text
                usedOcr.add(bestIdx);
                results.push({
                    text: ocrRegions[bestIdx].text,
                    class: elem.class,
                    confidence: elem.confidence,
                    bounds: elem.bounds,
                    source: "fused",
                });
            }
            else {
                // YOLO only (no nearby text — unlabeled icon/button)
                results.push({
                    text: "",
                    class: elem.class,
                    confidence: elem.confidence,
                    bounds: elem.bounds,
                    source: "yolo",
                });
            }
        }
        // Add remaining OCR-only elements (text not matched to any YOLO detection)
        for (let i = 0; i < ocrRegions.length; i++) {
            if (usedOcr.has(i))
                continue;
            const ocr = ocrRegions[i];
            results.push({
                text: ocr.text,
                class: "text",
                confidence: 0.7, // Default OCR confidence
                bounds: ocr.bounds,
                source: "ocr",
            });
        }
        return results;
    }
    // ── Stream capture mode ──
    /**
     * Ask the bridge to start continuous capture for a window (`vision.startStream`).
     *
     * @param windowId - Window to stream.
     * @param {number} [fps=30] - Frame rate forwarded to the bridge.
     * @returns {Promise<boolean>} true when the bridge accepted the request.
     */
    async startStream(windowId, fps = 30) {
        try {
            await this.bridge.call("vision.startStream", { windowId, fps });
            this.streamRunning = true;
            return true;
        }
        catch {
            this.streamRunning = false;
            return false;
        }
    }
    /**
     * Stop the continuous capture stream. Bridge errors are ignored; the
     * source is marked as not streaming either way.
     */
    async stopStream() {
        try {
            await this.bridge.call("vision.stopStream", {});
        }
        catch { /* ignore */ }
        this.streamRunning = false;
    }
    /** Whether a stream is currently considered running. */
    get isStreaming() {
        return this.streamRunning;
    }
    /**
     * Get the latest frame from the running stream (`vision.latestFrame`).
     * Returns null if the stream is not running or the bridge call fails.
     */
    async getStreamFrame() {
        if (!this.streamRunning)
            return null;
        try {
            return await this.bridge.call("vision.latestFrame", {});
        }
        catch {
            return null;
        }
    }
    /**
     * Capture helper: use the stream frame when one is available and fresh,
     * otherwise fall back to a one-shot file capture.
     *
     * @returns `{ path, width, height, fromStream }` (one-shot results carry all
     *   fields returned by `cg.captureWindow`), or null when no capture could be made.
     */
    async captureToFileOrStream(windowId) {
        // Try stream frame first
        if (this.streamRunning) {
            const frame = await this.getStreamFrame();
            if (frame && frame.ageMs < 200) { // Only use if fresh (<200ms old)
                return { path: frame.path, width: frame.width, height: frame.height, fromStream: true };
            }
        }
        // Fall back to one-shot capture
        const result = await this.captureToFile(windowId);
        return result ? { ...result, fromStream: false } : null;
    }
    /**
     * Reset differ state (e.g., on context switch to new window).
     */
    reset() {
        this.differ.reset();
    }
}
@@ -0,0 +1,298 @@
1
+ // Copyright (C) 2025 Clazro Technology Private Limited
2
+ // SPDX-License-Identifier: AGPL-3.0-only
3
+ //
4
+ // This file is part of ScreenHand.
5
+ //
6
+ // ScreenHand is free software: you can redistribute it and/or modify
7
+ // it under the terms of the GNU Affero General Public License as
8
+ // published by the Free Software Foundation, version 3.
9
+ //
10
+ // ScreenHand is distributed in the hope that it will be useful,
11
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ // GNU Affero General Public License for more details.
14
+ //
15
+ // You should have received a copy of the GNU Affero General Public License
16
+ // along with ScreenHand. If not, see <https://www.gnu.org/licenses/>.
17
+ import { DEFAULT_PLANNER_CONFIG } from "./types.js";
18
/** Minimum score threshold for a learned locator to override a playbook/strategy locator */
const LEARNED_LOCATOR_MIN_SCORE = 0.7;
/**
 * Safe MCP tool names for flow step parsing — excludes dangerous tools
 * that allow arbitrary code execution (browser_js, applescript).
 * These tools should only come from trusted playbooks, not parsed flow descriptions.
 */
const KNOWN_TOOLS = new Set([
    "browser_navigate", "browser_click", "browser_type", "browser_wait",
    "browser_dom", "browser_open", "browser_tabs",
    "browser_page_info", "browser_fill_form", "browser_human_click", "browser_stealth",
    "screenshot", "screenshot_file", "ocr", "ui_tree", "ui_find", "ui_press",
    "ui_set_value", "click_text", "click", "click_with_fallback", "type_text",
    "type_with_fallback", "key", "drag", "scroll", "scroll_with_fallback",
    "launch", "focus", "menu_click", "wait_for_state",
    "select_with_fallback", "read_with_fallback", "locate_with_fallback",
    // Excluded: "browser_js", "applescript" — arbitrary code execution risk
]);
/**
 * Maps PlaybookStep action types to MCP tool names.
 *
 * Declared with a null prototype so that a lookup such as
 * `ACTION_TO_TOOL[step.action]` only ever sees the keys listed here: action
 * names like "constructor" or "toString" yield `undefined` instead of an
 * inherited Object.prototype member.
 */
const ACTION_TO_TOOL = {
    __proto__: null,
    navigate: "browser_navigate",
    press: "click_with_fallback",
    type_into: "type_with_fallback",
    key: "key",
    scroll: "scroll_with_fallback",
    wait: "wait_for_state",
    screenshot: "screenshot",
    extract: "browser_js",
    menu_click: "menu_click",
    browser_js: "browser_js",
    browser_click: "browser_click",
    browser_type: "browser_type",
    cdp_key_event: "browser_js",
    key_combo: "key",
};
55
/**
 * Builds a deterministic ActionPlan from a Playbook — every step is taken
 * straight from the playbook, so no LLM call is involved.
 *
 * The plan's confidence is the playbook's observed success ratio, or 0.5 when
 * the success + fail total is not greater than zero.
 */
export function playbookToPlan(playbook, config = DEFAULT_PLANNER_CONFIG, learningEngine, bundleId) {
    const convertStep = (step, i) => playbookStepToPlanStep(step, i, config, learningEngine, bundleId);
    const steps = playbook.steps.map(convertStep);
    const totalRuns = playbook.successCount + playbook.failCount;
    let confidence = 0.5;
    if (totalRuns > 0) {
        confidence = playbook.successCount / totalRuns;
    }
    return {
        steps,
        currentStepIndex: 0,
        confidence,
        source: "playbook",
        sourceId: playbook.id,
    };
}
72
/**
 * Builds an ActionPlan from a Strategy (as returned by memory recall).
 *
 * The plan's confidence is the strategy's observed success ratio, or 0.5 when
 * the success + fail total is not greater than zero.
 */
export function strategyToPlan(strategy, config = DEFAULT_PLANNER_CONFIG, learningEngine, bundleId) {
    const convertStep = (step, i) => strategyStepToPlanStep(step, i, config, learningEngine, bundleId);
    const steps = strategy.steps.map(convertStep);
    const totalRuns = strategy.successCount + strategy.failCount;
    let confidence = 0.5;
    if (totalRuns > 0) {
        confidence = strategy.successCount / totalRuns;
    }
    return {
        steps,
        currentStepIndex: 0,
        confidence,
        source: "strategy",
        sourceId: strategy.id,
    };
}
88
/**
 * Try to parse a flow step description into a concrete tool + params.
 *
 * Two syntaxes are recognised:
 *   1. Function-call form — `tool(key: 'value', other: "value")`. The tool name
 *      must be in KNOWN_TOOLS (compared case-insensitively).
 *   2. Tool-name prefix — e.g. `browser_navigate to canva.com`, with per-tool
 *      extraction of the most relevant parameter from the rest of the text.
 *
 * @param {string} stepDesc - Free-text step description from a reference flow.
 * @param ctx - Runtime context; `windowId` and `pid` are copied into params for
 *   `click_text` / `ui_press` when present.
 * @returns {{tool: string, params: object} | null} The tool name (always
 *   lower-case) and its params, or null if the step is too vague to parse.
 */
function parseFlowStep(stepDesc, ctx) {
    const desc = stepDesc.trim();
    // Pattern 1: function call syntax — tool(key: 'value')
    // Only accept known MCP tool names to prevent arbitrary tool injection
    const funcMatch = desc.match(/^(\w+)\((.+)\)$/);
    if (funcMatch) {
        // Return the same normalised name that was validated (matches Pattern 2).
        const tool = funcMatch[1].toLowerCase();
        if (!KNOWN_TOOLS.has(tool))
            return null;
        const params = {};
        // Values may be single- or double-quoted, like every other pattern below.
        const argPattern = /(\w+)\s*:\s*(?:'([^']+)'|"([^"]+)")/g;
        for (const arg of funcMatch[2].matchAll(argPattern)) {
            params[arg[1]] = arg[2] ?? arg[3];
        }
        return { tool, params };
    }
    // Pattern 2: tool_name at start of description
    const toolPrefixMatch = desc.match(/^(browser_navigate|browser_click|browser_type|browser_wait|browser_dom|browser_open|browser_tabs|browser_page_info|browser_fill_form|screenshot|screenshot_file|ocr|ui_tree|ui_find|ui_press|ui_set_value|click_text|click|type_text|key|drag|scroll|launch|focus|menu_click|wait_for_state)\b/i);
    if (!toolPrefixMatch)
        return null;
    const tool = toolPrefixMatch[1].toLowerCase();
    const rest = desc.slice(toolPrefixMatch[0].length).trim();
    const params = {};
    /** First quoted string in the remaining text, or undefined. */
    const firstQuoted = () => {
        const quoteMatch = rest.match(/'([^']+)'|"([^"]+)"/);
        return quoteMatch ? (quoteMatch[1] ?? quoteMatch[2]) : undefined;
    };
    /** First quoted string, else the first whitespace-delimited token, or undefined. */
    const firstQuotedOrWord = () => {
        const tokenMatch = rest.match(/'([^']+)'|"([^"]+)"|(\S+)/);
        return tokenMatch ? (tokenMatch[1] ?? tokenMatch[2] ?? tokenMatch[3]) : undefined;
    };
    switch (tool) {
        case "browser_navigate": {
            const urlMatch = rest.match(/(?:to\s+)?(\S+\.(?:com|org|net|io|dev|app|co)\S*)/i);
            if (urlMatch) {
                // Check for a real scheme: a bare "http" prefix also matches hosts such as httpbin.org.
                const hasScheme = /^https?:\/\//i.test(urlMatch[1]);
                params.url = hasScheme ? urlMatch[1] : `https://${urlMatch[1]}`;
            }
            break;
        }
        case "browser_click": {
            const selector = firstQuoted();
            if (selector !== undefined)
                params.selector = selector;
            break;
        }
        case "browser_type": {
            const text = firstQuoted();
            if (text !== undefined)
                params.text = text;
            break;
        }
        case "click_text": {
            const text = firstQuoted();
            if (text !== undefined) {
                params.text = text;
                if (ctx?.windowId)
                    params.windowId = ctx.windowId;
            }
            break;
        }
        case "ui_press": {
            const title = firstQuoted();
            if (title !== undefined) {
                params.title = title;
                if (ctx?.pid)
                    params.pid = ctx.pid;
            }
            break;
        }
        case "launch": {
            const bundleId = firstQuoted();
            if (bundleId !== undefined)
                params.bundleId = bundleId;
            break;
        }
        case "focus": {
            const app = firstQuotedOrWord();
            if (app !== undefined)
                params.bundleId = app.replace(/['"]/g, "");
            break;
        }
        case "key": {
            const combo = firstQuotedOrWord();
            if (combo !== undefined)
                params.combo = combo;
            break;
        }
        default:
            // screenshot, ocr, ui_tree, … need no extra params
            break;
    }
    return { tool, params };
}
168
/**
 * Builds an ActionPlan from a reference flow (an entry of references/*.json).
 *
 * Every step description is passed through parseFlowStep. A description that
 * parses becomes an executable step carrying the parsed tool and params; one
 * that does not parse becomes a placeholder step (empty tool, empty params)
 * flagged requiresLLM=true so the client can resolve it.
 *
 * @param flowName        Identifier of the flow; returned as `sourceId`.
 * @param flow            Flow object whose `steps` array holds the step descriptions.
 * @param config          Planner config; `defaultStepTimeout` is used as each step's timeout.
 * @param runtimeContext  Optional context forwarded to parseFlowStep (defaults to `{}`).
 * @returns Plan object with `steps`, `currentStepIndex` (0), `confidence`,
 *          `source` ("reference_flow") and `sourceId`.
 */
export function flowToPlan(flowName, flow, config = DEFAULT_PLANNER_CONFIG, runtimeContext) {
    const context = runtimeContext ?? {};
    // Single mapper for both outcomes: only tool, params and requiresLLM differ.
    const toPlanStep = (description) => {
        const match = parseFlowStep(description, context);
        const resolved = Boolean(match);
        return {
            tool: resolved ? match.tool : "",
            params: resolved ? match.params : {},
            expectedPostcondition: null,
            timeout: config.defaultStepTimeout,
            fallbackTool: null,
            requiresLLM: !resolved,
            status: "pending",
            description,
        };
    };
    const steps = flow.steps.map(toPlanStep);
    // Number of steps that can run without client/LLM resolution.
    const runnable = steps.reduce((count, planStep) => (planStep.requiresLLM ? count : count + 1), 0);
    // Confidence scales from 0.3 (nothing parsed) to 0.7 (everything parsed);
    // a flow with no steps gets 0.
    let confidence = 0;
    if (steps.length > 0) {
        confidence = 0.3 + 0.4 * (runnable / steps.length);
    }
    return {
        steps,
        currentStepIndex: 0,
        confidence,
        source: "reference_flow",
        sourceId: flowName,
    };
}
210
/**
 * Converts one playbook step into a plan step.
 *
 * The tool name is looked up in ACTION_TO_TOOL, falling back to the raw
 * action name. Recognised step fields are copied into `params`, two array
 * shapes are normalised to strings, and applyLearnedLocator is then given the
 * chance to adjust the params.
 *
 * @param step            Playbook step (action, target, text, url, keys, ...).
 * @param _index          Unused.
 * @param config          Planner config; `defaultStepTimeout` is the timeout fallback.
 * @param learningEngine  Forwarded to applyLearnedLocator.
 * @param bundleId        Forwarded to applyLearnedLocator.
 * @returns Plan step with status "pending" and requiresLLM=false.
 */
function playbookStepToPlanStep(step, _index, config, learningEngine, bundleId) {
    const tool = ACTION_TO_TOOL[step.action] ?? step.action;
    const params = {};
    // Copy a field only when its value is truthy.
    const copyIfTruthy = (field) => {
        if (step[field]) {
            params[field] = step[field];
        }
    };
    // Copy a field whenever it is not undefined (so 0 and "" are kept).
    const copyIfDefined = (field) => {
        if (step[field] !== undefined) {
            params[field] = step[field];
        }
    };

    // The copy order below fixes the key order of `params`.
    for (const field of ["target", "text", "url", "keys", "code", "format"]) {
        copyIfTruthy(field);
    }
    copyIfDefined("amount");
    if (step.locateByOcr) {
        params.locateByOcr = step.locateByOcr;
        // Offsets are only carried along together with locateByOcr.
        copyIfDefined("offsetX");
        copyIfDefined("offsetY");
    }
    copyIfTruthy("keyEvent");
    copyIfTruthy("menuPath");
    copyIfDefined("ms");

    // The key tool takes a "+"-joined combo string instead of a keys array.
    if (tool === "key" && Array.isArray(params.keys)) {
        const combo = params.keys.join("+");
        params.combo = combo;
        delete params.keys;
    }
    // menu_click takes a "/"-joined path string instead of an array.
    if (tool === "menu_click" && Array.isArray(params.menuPath)) {
        const joinedPath = params.menuPath.join("/");
        params.menuPath = joinedPath;
    }

    // Let a learned locator replace the target/selector where applicable.
    applyLearnedLocator(params, tool, learningEngine, bundleId);

    let expectedPostcondition = null;
    if (step.verify) {
        expectedPostcondition = { type: "control_exists", target: step.verify };
    }
    const timeout = step.verifyTimeoutMs ?? config.defaultStepTimeout;
    // Without an explicit description, fall back to "<action> <target>".
    const description = step.description ?? `${step.action} ${step.target ?? ""}`.trim();

    return {
        tool,
        params,
        expectedPostcondition,
        timeout,
        fallbackTool: null,
        requiresLLM: false,
        status: "pending",
        description,
    };
}
264
/**
 * Converts one strategy step into a plan step.
 *
 * The step's params are shallow-copied so that applyLearnedLocator can change
 * the copy without touching the strategy's own params object.
 *
 * @param step            Strategy step with `tool` and `params`.
 * @param _index          Unused.
 * @param config          Planner config; `defaultStepTimeout` becomes the step timeout.
 * @param learningEngine  Forwarded to applyLearnedLocator.
 * @param bundleId        Forwarded to applyLearnedLocator.
 * @returns Plan step with status "pending" and requiresLLM=false.
 */
function strategyStepToPlanStep(step, _index, config, learningEngine, bundleId) {
    const { tool } = step;
    const params = { ...step.params };
    // Let a learned locator replace the target/selector where applicable.
    applyLearnedLocator(params, tool, learningEngine, bundleId);
    const planStep = {
        tool,
        params,
        expectedPostcondition: null,
        timeout: config.defaultStepTimeout,
        fallbackTool: null,
        requiresLLM: false,
        status: "pending",
        description: `${tool} (from strategy)`,
    };
    return planStep;
}
279
/**
 * Overlays a learned locator onto a step's params, mutating `params` in place.
 *
 * Nothing happens unless all of the following hold:
 *  - a learning engine and a bundleId are supplied,
 *  - the engine returns a recommendation for this bundleId/tool pair,
 *  - the recommendation's score is not below LEARNED_LOCATOR_MIN_SCORE,
 *  - the params already carry a `target` or a `selector`.
 *
 * When applied, the previous target (or selector) is saved as
 * `_originalTarget`, `target` is set to the learned locator and
 * `_learnedLocator` is set to true. For recommendations whose method is
 * "cdp", `selector` is set to the learned locator as well.
 *
 * @param params          Step params object to update.
 * @param tool            Tool name used for the recommendation lookup.
 * @param learningEngine  Object providing recommendLocator(bundleId, tool); may be absent.
 * @param bundleId        App identifier used for the lookup; may be absent.
 */
function applyLearnedLocator(params, tool, learningEngine, bundleId) {
    if (!(learningEngine && bundleId)) {
        return;
    }
    const recommendation = learningEngine.recommendLocator(bundleId, tool);
    if (!recommendation) {
        return;
    }
    if (recommendation.score < LEARNED_LOCATOR_MIN_SCORE) {
        return;
    }
    // Only target-based params are overridden; url, keys, code, etc. are left alone.
    if (params.target === undefined && params.selector === undefined) {
        return;
    }
    const learned = recommendation.locator;
    params._originalTarget = params.target ?? params.selector;
    params.target = learned;
    params._learnedLocator = true;
    if (recommendation.method === "cdp") {
        params.selector = learned;
    }
}