screenhand 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. package/README.md +193 -109
  2. package/bin/darwin-arm64/macos-bridge +0 -0
  3. package/dist/mcp-desktop.js +5876 -0
  4. package/dist/scripts/codex-monitor-daemon.js +335 -0
  5. package/dist/scripts/export-help-center.js +112 -0
  6. package/dist/scripts/marketing-loop.js +117 -0
  7. package/dist/scripts/observer-daemon.js +288 -0
  8. package/dist/scripts/orchestrator-daemon.js +399 -0
  9. package/dist/scripts/supervisor-daemon.js +272 -0
  10. package/dist/scripts/threads-campaign.js +208 -0
  11. package/dist/scripts/worker-daemon.js +228 -0
  12. package/dist/src/agent/cli.js +82 -0
  13. package/dist/src/agent/loop.js +274 -0
  14. package/dist/src/community/fetcher.js +109 -0
  15. package/dist/src/community/index.js +6 -0
  16. package/dist/src/community/publisher.js +191 -0
  17. package/dist/src/community/remote-api.js +121 -0
  18. package/dist/src/community/types.js +3 -0
  19. package/dist/src/community/validator.js +95 -0
  20. package/{src/config.ts → dist/src/config.js} +5 -10
  21. package/dist/src/context-tracker.js +489 -0
  22. package/{src/index.ts → dist/src/index.js} +32 -52
  23. package/dist/src/ingestion/coverage-auditor.js +233 -0
  24. package/dist/src/ingestion/doc-parser.js +164 -0
  25. package/dist/src/ingestion/index.js +8 -0
  26. package/dist/src/ingestion/menu-scanner.js +152 -0
  27. package/dist/src/ingestion/reference-merger.js +186 -0
  28. package/dist/src/ingestion/shortcut-extractor.js +180 -0
  29. package/dist/src/ingestion/tutorial-extractor.js +170 -0
  30. package/dist/src/ingestion/types.js +3 -0
  31. package/dist/src/jobs/manager.js +305 -0
  32. package/dist/src/jobs/runner.js +806 -0
  33. package/dist/src/jobs/store.js +102 -0
  34. package/dist/src/jobs/types.js +30 -0
  35. package/dist/src/jobs/worker.js +97 -0
  36. package/dist/src/learning/engine.js +356 -0
  37. package/dist/src/learning/index.js +9 -0
  38. package/dist/src/learning/locator-policy.js +120 -0
  39. package/dist/src/learning/pattern-policy.js +89 -0
  40. package/dist/src/learning/recovery-policy.js +116 -0
  41. package/dist/src/learning/sensor-policy.js +115 -0
  42. package/dist/src/learning/timing-model.js +204 -0
  43. package/dist/src/learning/topology-policy.js +90 -0
  44. package/dist/src/learning/types.js +9 -0
  45. package/dist/src/logging/timeline-logger.js +48 -0
  46. package/dist/src/mcp/mcp-stdio-server.js +464 -0
  47. package/dist/src/mcp/server.js +363 -0
  48. package/dist/src/mcp-entry.js +60 -0
  49. package/dist/src/memory/playbook-seeds.js +200 -0
  50. package/dist/src/memory/recall.js +222 -0
  51. package/dist/src/memory/research.js +104 -0
  52. package/dist/src/memory/seeds.js +101 -0
  53. package/dist/src/memory/service.js +446 -0
  54. package/dist/src/memory/session.js +169 -0
  55. package/dist/src/memory/store.js +451 -0
  56. package/{src/runtime/locator-cache.ts → dist/src/memory/types.js} +1 -17
  57. package/dist/src/monitor/codex-monitor.js +382 -0
  58. package/dist/src/monitor/task-queue.js +97 -0
  59. package/dist/src/monitor/types.js +62 -0
  60. package/dist/src/native/bridge-client.js +412 -0
  61. package/{src/native/macos-bridge-client.ts → dist/src/native/macos-bridge-client.js} +0 -1
  62. package/dist/src/observer/state.js +199 -0
  63. package/dist/src/observer/types.js +43 -0
  64. package/dist/src/orchestrator/state.js +68 -0
  65. package/dist/src/orchestrator/types.js +22 -0
  66. package/dist/src/perception/ax-source.js +162 -0
  67. package/dist/src/perception/cdp-source.js +162 -0
  68. package/dist/src/perception/coordinator.js +771 -0
  69. package/dist/src/perception/frame-differ.js +287 -0
  70. package/dist/src/perception/index.js +22 -0
  71. package/dist/src/perception/manager.js +199 -0
  72. package/dist/src/perception/types.js +47 -0
  73. package/dist/src/perception/vision-source.js +399 -0
  74. package/dist/src/planner/deterministic.js +298 -0
  75. package/dist/src/planner/executor.js +870 -0
  76. package/dist/src/planner/goal-store.js +92 -0
  77. package/dist/src/planner/index.js +21 -0
  78. package/dist/src/planner/planner.js +520 -0
  79. package/dist/src/planner/tool-registry.js +71 -0
  80. package/dist/src/planner/types.js +22 -0
  81. package/dist/src/platform/explorer.js +213 -0
  82. package/dist/src/platform/help-center-markdown.js +527 -0
  83. package/dist/src/platform/learner.js +257 -0
  84. package/dist/src/playbook/engine.js +486 -0
  85. package/dist/src/playbook/index.js +20 -0
  86. package/dist/src/playbook/mcp-recorder.js +204 -0
  87. package/dist/src/playbook/recorder.js +536 -0
  88. package/dist/src/playbook/runner.js +408 -0
  89. package/dist/src/playbook/store.js +312 -0
  90. package/dist/src/playbook/types.js +17 -0
  91. package/dist/src/recovery/detectors.js +156 -0
  92. package/dist/src/recovery/engine.js +327 -0
  93. package/dist/src/recovery/index.js +20 -0
  94. package/dist/src/recovery/strategies.js +274 -0
  95. package/dist/src/recovery/types.js +20 -0
  96. package/dist/src/runtime/accessibility-adapter.js +430 -0
  97. package/dist/src/runtime/app-adapter.js +64 -0
  98. package/dist/src/runtime/applescript-adapter.js +305 -0
  99. package/dist/src/runtime/ax-role-map.js +96 -0
  100. package/dist/src/runtime/browser-adapter.js +52 -0
  101. package/dist/src/runtime/cdp-chrome-adapter.js +521 -0
  102. package/dist/src/runtime/composite-adapter.js +221 -0
  103. package/dist/src/runtime/execution-contract.js +159 -0
  104. package/dist/src/runtime/executor.js +286 -0
  105. package/dist/src/runtime/locator-cache.js +50 -0
  106. package/dist/src/runtime/planning-loop.js +63 -0
  107. package/dist/src/runtime/service.js +432 -0
  108. package/dist/src/runtime/session-manager.js +63 -0
  109. package/dist/src/runtime/state-observer.js +121 -0
  110. package/dist/src/runtime/vision-adapter.js +225 -0
  111. package/dist/src/state/app-map-types.js +72 -0
  112. package/dist/src/state/app-map.js +1974 -0
  113. package/dist/src/state/entity-tracker.js +108 -0
  114. package/dist/src/state/fusion.js +96 -0
  115. package/dist/src/state/index.js +21 -0
  116. package/dist/src/state/ladder-generator.js +236 -0
  117. package/dist/src/state/persistence.js +156 -0
  118. package/dist/src/state/types.js +17 -0
  119. package/dist/src/state/world-model.js +1456 -0
  120. package/dist/src/supervisor/locks.js +186 -0
  121. package/dist/src/supervisor/supervisor.js +403 -0
  122. package/dist/src/supervisor/types.js +30 -0
  123. package/dist/src/test-mcp-protocol.js +154 -0
  124. package/dist/src/types.js +17 -0
  125. package/dist/src/util/atomic-write.js +133 -0
  126. package/dist/src/util/sanitize.js +146 -0
  127. package/dist-app-maps/com.figma.Desktop.json +959 -0
  128. package/dist-app-maps/com.hnc.Discord.json +1146 -0
  129. package/dist-app-maps/notion.id.json +2831 -0
  130. package/dist-playbooks/canva-screenhand-carousel.json +445 -0
  131. package/dist-playbooks/codex-desktop.json +76 -0
  132. package/dist-playbooks/competitor-research-stack.json +122 -0
  133. package/dist-playbooks/davinci-color-grade.json +153 -0
  134. package/dist-playbooks/davinci-edit-timeline.json +162 -0
  135. package/dist-playbooks/davinci-render.json +114 -0
  136. package/dist-playbooks/devto.json +52 -0
  137. package/dist-playbooks/discord.json +41 -0
  138. package/dist-playbooks/google-flow-create-project.json +59 -0
  139. package/dist-playbooks/google-flow-edit-image.json +90 -0
  140. package/dist-playbooks/google-flow-edit-video.json +90 -0
  141. package/dist-playbooks/google-flow-generate-image.json +68 -0
  142. package/dist-playbooks/google-flow-generate-video.json +191 -0
  143. package/dist-playbooks/google-flow-open-project.json +48 -0
  144. package/dist-playbooks/google-flow-open-scenebuilder.json +64 -0
  145. package/dist-playbooks/google-flow-search-assets.json +64 -0
  146. package/dist-playbooks/instagram.json +57 -0
  147. package/dist-playbooks/linkedin.json +52 -0
  148. package/dist-playbooks/n8n.json +43 -0
  149. package/dist-playbooks/reddit.json +52 -0
  150. package/dist-playbooks/threads.json +59 -0
  151. package/dist-playbooks/x-twitter.json +59 -0
  152. package/dist-playbooks/youtube.json +59 -0
  153. package/dist-references/canva.json +646 -0
  154. package/dist-references/codex-desktop.json +305 -0
  155. package/dist-references/davinci-resolve-keyboard.json +594 -0
  156. package/dist-references/davinci-resolve-menu-map.json +1139 -0
  157. package/dist-references/davinci-resolve-menus-batch1.json +116 -0
  158. package/dist-references/davinci-resolve-menus-batch2.json +372 -0
  159. package/dist-references/davinci-resolve-menus-batch3.json +330 -0
  160. package/dist-references/davinci-resolve-menus-batch4.json +297 -0
  161. package/dist-references/davinci-resolve-shortcuts.json +333 -0
  162. package/dist-references/devto.json +317 -0
  163. package/dist-references/discord.json +549 -0
  164. package/dist-references/figma.json +1186 -0
  165. package/dist-references/finder.json +146 -0
  166. package/dist-references/google-ads-transparency.json +95 -0
  167. package/dist-references/google-flow.json +649 -0
  168. package/dist-references/instagram.json +341 -0
  169. package/dist-references/linkedin.json +324 -0
  170. package/dist-references/meta-ad-library.json +86 -0
  171. package/dist-references/n8n.json +387 -0
  172. package/dist-references/notes.json +27 -0
  173. package/dist-references/notion.json +163 -0
  174. package/dist-references/reddit.json +341 -0
  175. package/dist-references/threads.json +337 -0
  176. package/dist-references/x-twitter.json +403 -0
  177. package/dist-references/youtube.json +373 -0
  178. package/native/macos-bridge/Package.swift +1 -0
  179. package/native/macos-bridge/Sources/AccessibilityBridge.swift +257 -36
  180. package/native/macos-bridge/Sources/AppManagement.swift +212 -2
  181. package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +348 -53
  182. package/native/macos-bridge/Sources/StreamCapture.swift +136 -0
  183. package/native/macos-bridge/Sources/VisionBridge.swift +165 -7
  184. package/native/macos-bridge/Sources/main.swift +169 -16
  185. package/native/windows-bridge/Program.cs +5 -0
  186. package/native/windows-bridge/ScreenCapture.cs +124 -0
  187. package/package.json +29 -4
  188. package/scripts/postinstall.cjs +127 -0
  189. package/.claude/commands/automate.md +0 -28
  190. package/.claude/commands/debug-ui.md +0 -19
  191. package/.claude/commands/screenshot.md +0 -15
  192. package/.github/FUNDING.yml +0 -1
  193. package/.github/ISSUE_TEMPLATE/bug_report.md +0 -27
  194. package/.github/ISSUE_TEMPLATE/feature_request.md +0 -20
  195. package/.mcp.json +0 -8
  196. package/DESKTOP_MCP_GUIDE.md +0 -92
  197. package/SECURITY.md +0 -44
  198. package/docs/architecture.md +0 -47
  199. package/install-skills.sh +0 -19
  200. package/mcp-bridge.ts +0 -271
  201. package/mcp-desktop.ts +0 -1221
  202. package/playbooks/instagram.json +0 -41
  203. package/playbooks/instagram_v2.json +0 -201
  204. package/playbooks/x_v1.json +0 -211
  205. package/scripts/devpost-live-loop.mjs +0 -421
  206. package/src/logging/timeline-logger.ts +0 -55
  207. package/src/mcp/server.ts +0 -449
  208. package/src/memory/recall.ts +0 -191
  209. package/src/memory/research.ts +0 -146
  210. package/src/memory/seeds.ts +0 -123
  211. package/src/memory/session.ts +0 -201
  212. package/src/memory/store.ts +0 -434
  213. package/src/memory/types.ts +0 -69
  214. package/src/native/bridge-client.ts +0 -239
  215. package/src/runtime/accessibility-adapter.ts +0 -487
  216. package/src/runtime/app-adapter.ts +0 -169
  217. package/src/runtime/applescript-adapter.ts +0 -376
  218. package/src/runtime/ax-role-map.ts +0 -102
  219. package/src/runtime/browser-adapter.ts +0 -129
  220. package/src/runtime/cdp-chrome-adapter.ts +0 -676
  221. package/src/runtime/composite-adapter.ts +0 -274
  222. package/src/runtime/executor.ts +0 -396
  223. package/src/runtime/planning-loop.ts +0 -81
  224. package/src/runtime/service.ts +0 -448
  225. package/src/runtime/session-manager.ts +0 -50
  226. package/src/runtime/state-observer.ts +0 -136
  227. package/src/runtime/vision-adapter.ts +0 -297
  228. package/src/types.ts +0 -297
  229. package/tests/bridge-client.test.ts +0 -176
  230. package/tests/browser-stealth.test.ts +0 -210
  231. package/tests/composite-adapter.test.ts +0 -64
  232. package/tests/mcp-server.test.ts +0 -151
  233. package/tests/memory-recall.test.ts +0 -339
  234. package/tests/memory-research.test.ts +0 -159
  235. package/tests/memory-seeds.test.ts +0 -120
  236. package/tests/memory-store.test.ts +0 -392
  237. package/tests/types.test.ts +0 -92
  238. package/tsconfig.check.json +0 -17
  239. package/tsconfig.json +0 -19
  240. package/vitest.config.ts +0 -8
  241. package/{playbooks → dist-references}/devpost.json +0 -0
@@ -1,136 +0,0 @@
1
- // Copyright (C) 2025 Clazro Technology Private Limited
2
- // SPDX-License-Identifier: AGPL-3.0-only
3
- //
4
- // This file is part of ScreenHand.
5
- //
6
- // ScreenHand is free software: you can redistribute it and/or modify
7
- // it under the terms of the GNU Affero General Public License as
8
- // published by the Free Software Foundation, version 3.
9
- //
10
- // ScreenHand is distributed in the hope that it will be useful,
11
- // but WITHOUT ANY WARRANTY; without even the implied warranty of
12
- // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
- // GNU Affero General Public License for more details.
14
- //
15
- // You should have received a copy of the GNU Affero General Public License
16
- // along with ScreenHand. If not, see <https://www.gnu.org/licenses/>.
17
-
18
- import { EventEmitter } from "node:events";
19
- import type { MacOSBridgeClient } from "../native/macos-bridge-client.js";
20
- import type { UIEvent, UIEventType } from "../types.js";
21
-
22
- /**
23
- * Wraps the native bridge's AX observer events into typed UIEvent objects.
24
- * Buffers events for consumption by the planning loop.
25
- */
26
- export class StateObserver extends EventEmitter {
27
- private observedPids = new Set<number>();
28
- private eventBuffer: UIEvent[] = [];
29
- private readonly maxBufferSize: number;
30
-
31
- constructor(
32
- private readonly bridge: MacOSBridgeClient,
33
- maxBufferSize = 200,
34
- ) {
35
- super();
36
- this.maxBufferSize = maxBufferSize;
37
-
38
- // Listen for AX events from the bridge
39
- this.bridge.on("ax-event", (raw: Record<string, unknown>) => {
40
- const event = this.parseEvent(raw);
41
- if (event) {
42
- this.eventBuffer.push(event);
43
- if (this.eventBuffer.length > this.maxBufferSize) {
44
- this.eventBuffer.shift();
45
- }
46
- this.emit("event", event);
47
- }
48
- });
49
- }
50
-
51
- async startObserving(pid: number, eventTypes?: UIEventType[]): Promise<void> {
52
- if (this.observedPids.has(pid)) return;
53
-
54
- const notifications = eventTypes
55
- ? this.mapEventTypesToNotifications(eventTypes)
56
- : undefined;
57
-
58
- await this.bridge.call("observer.start", {
59
- pid,
60
- notifications,
61
- });
62
-
63
- this.observedPids.add(pid);
64
- }
65
-
66
- async stopObserving(pid: number): Promise<void> {
67
- if (!this.observedPids.has(pid)) return;
68
-
69
- await this.bridge.call("observer.stop", { pid });
70
- this.observedPids.delete(pid);
71
- }
72
-
73
- /** Get and clear the event buffer. */
74
- drainEvents(): UIEvent[] {
75
- const events = [...this.eventBuffer];
76
- this.eventBuffer = [];
77
- return events;
78
- }
79
-
80
- /** Get recent events without clearing. */
81
- peekEvents(limit = 50): UIEvent[] {
82
- return this.eventBuffer.slice(-limit);
83
- }
84
-
85
- /** Clear the event buffer. */
86
- clearEvents(): void {
87
- this.eventBuffer = [];
88
- }
89
-
90
- get isObserving(): boolean {
91
- return this.observedPids.size > 0;
92
- }
93
-
94
- get observedProcesses(): number[] {
95
- return [...this.observedPids];
96
- }
97
-
98
- private parseEvent(raw: Record<string, unknown>): UIEvent | null {
99
- const type = raw.type as UIEventType | undefined;
100
- if (!type) return null;
101
-
102
- const event: UIEvent = {
103
- type,
104
- timestamp: (raw.timestamp as string) ?? new Date().toISOString(),
105
- pid: (raw.pid as number) ?? 0,
106
- };
107
-
108
- if (typeof raw.bundleId === "string") event.bundleId = raw.bundleId;
109
- if (typeof raw.elementRole === "string") event.elementRole = raw.elementRole;
110
- if (typeof raw.elementLabel === "string") event.elementLabel = raw.elementLabel;
111
- if (typeof raw.oldValue === "string") event.oldValue = raw.oldValue;
112
- if (typeof raw.newValue === "string") event.newValue = raw.newValue;
113
- if (typeof raw.windowTitle === "string") event.windowTitle = raw.windowTitle;
114
-
115
- return event;
116
- }
117
-
118
- private mapEventTypesToNotifications(types: UIEventType[]): string[] {
119
- const map: Record<string, string> = {
120
- value_changed: "AXValueChanged",
121
- focus_changed: "AXFocusedUIElementChanged",
122
- window_created: "AXWindowCreated",
123
- window_closed: "AXUIElementDestroyed",
124
- title_changed: "AXTitleChanged",
125
- menu_opened: "AXMenuOpened",
126
- layout_changed: "AXLayoutChanged",
127
- dialog_appeared: "AXSheetCreated",
128
- app_activated: "AXApplicationActivated",
129
- app_deactivated: "AXApplicationDeactivated",
130
- };
131
-
132
- return types
133
- .map((t) => map[t])
134
- .filter((n): n is string => n !== undefined);
135
- }
136
- }
@@ -1,297 +0,0 @@
1
- // Copyright (C) 2025 Clazro Technology Private Limited
2
- // SPDX-License-Identifier: AGPL-3.0-only
3
- //
4
- // This file is part of ScreenHand.
5
- //
6
- // ScreenHand is free software: you can redistribute it and/or modify
7
- // it under the terms of the GNU Affero General Public License as
8
- // published by the Free Software Foundation, version 3.
9
- //
10
- // ScreenHand is distributed in the hope that it will be useful,
11
- // but WITHOUT ANY WARRANTY; without even the implied warranty of
12
- // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
- // GNU Affero General Public License for more details.
14
- //
15
- // You should have received a copy of the GNU Affero General Public License
16
- // along with ScreenHand. If not, see <https://www.gnu.org/licenses/>.
17
-
18
- import type {
19
- AXNode,
20
- AppContext,
21
- ExtractFormat,
22
- LocatedElement,
23
- PageMeta,
24
- SessionInfo,
25
- Target,
26
- WaitCondition,
27
- } from "../types.js";
28
- import type { AppAdapter } from "./app-adapter.js";
29
- import type { MacOSBridgeClient } from "../native/macos-bridge-client.js";
30
-
31
- const POLL_INTERVAL_MS = 200;
32
-
33
- interface VisionSessionState {
34
- info: SessionInfo;
35
- pid: number;
36
- bundleId: string;
37
- appName: string;
38
- lastScreenshotPath?: string;
39
- }
40
-
41
- interface OCRResult {
42
- text: string;
43
- confidence: number;
44
- bounds: { x: number; y: number; width: number; height: number };
45
- }
46
-
47
- /**
48
- * Vision-based adapter for apps with poor/no accessibility support.
49
- * Uses screenshots + OCR to locate elements and CG events to interact.
50
- */
51
- export class VisionAdapter implements AppAdapter {
52
- private readonly sessions = new Map<string, VisionSessionState>();
53
- private readonly sessionsByProfile = new Map<string, VisionSessionState>();
54
-
55
- constructor(private readonly bridge: MacOSBridgeClient) {}
56
-
57
- async attach(profile: string): Promise<SessionInfo> {
58
- const existing = this.sessionsByProfile.get(profile);
59
- if (existing) return existing.info;
60
-
61
- await this.bridge.start();
62
-
63
- const frontmost = await this.bridge.call<{ bundleId: string; name: string; pid: number }>(
64
- "app.frontmost",
65
- );
66
-
67
- const info: SessionInfo = {
68
- sessionId: `vision_session_${profile}_${Date.now()}`,
69
- profile,
70
- createdAt: new Date().toISOString(),
71
- adapterType: "vision",
72
- };
73
-
74
- const state: VisionSessionState = {
75
- info,
76
- pid: frontmost.pid,
77
- bundleId: frontmost.bundleId,
78
- appName: frontmost.name,
79
- };
80
-
81
- this.sessions.set(info.sessionId, state);
82
- this.sessionsByProfile.set(profile, state);
83
- return info;
84
- }
85
-
86
- async getAppContext(sessionId: string): Promise<AppContext> {
87
- const state = this.requireSession(sessionId);
88
- return {
89
- bundleId: state.bundleId,
90
- appName: state.appName,
91
- pid: state.pid,
92
- windowTitle: state.appName,
93
- };
94
- }
95
-
96
- async getPageMeta(sessionId: string): Promise<PageMeta> {
97
- const ctx = await this.getAppContext(sessionId);
98
- return {
99
- url: `app://${ctx.bundleId}`,
100
- title: ctx.appName,
101
- };
102
- }
103
-
104
- async navigate(sessionId: string, url: string, _timeoutMs: number): Promise<PageMeta> {
105
- if (url.startsWith("app://")) {
106
- const bundleId = url.slice(6);
107
- const result = await this.bridge.call<{ bundleId: string; appName: string; pid: number }>(
108
- "app.launch",
109
- { bundleId },
110
- );
111
- const state = this.requireSession(sessionId);
112
- state.pid = result.pid;
113
- state.bundleId = result.bundleId;
114
- state.appName = result.appName;
115
- }
116
- return this.getPageMeta(sessionId);
117
- }
118
-
119
- async locate(sessionId: string, target: Target, timeoutMs: number): Promise<LocatedElement | null> {
120
- const state = this.requireSession(sessionId);
121
- const deadline = Date.now() + timeoutMs;
122
-
123
- while (Date.now() < deadline) {
124
- // Take a screenshot
125
- const screenshotResult = await this.bridge.call<{ path: string }>(
126
- "cg.captureScreen",
127
- {},
128
- );
129
- state.lastScreenshotPath = screenshotResult.path;
130
-
131
- const searchText = this.getSearchText(target);
132
- if (!searchText) {
133
- // For coordinate targets, just return coordinates directly
134
- if (target.type === "coordinates") {
135
- return {
136
- handleId: `vision_coords_${target.x}_${target.y}`,
137
- locatorUsed: "vision:coordinates",
138
- coordinates: { x: target.x, y: target.y, width: 1, height: 1 },
139
- };
140
- }
141
- return null;
142
- }
143
-
144
- // OCR the screenshot
145
- const matches = await this.bridge.call<OCRResult[]>("vision.findText", {
146
- imagePath: screenshotResult.path,
147
- searchText,
148
- });
149
-
150
- if (matches.length > 0) {
151
- const best = matches.reduce((a, b) => (a.confidence > b.confidence ? a : b));
152
- return {
153
- handleId: `vision_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
154
- locatorUsed: `vision:text:${searchText}`,
155
- label: best.text,
156
- coordinates: best.bounds,
157
- };
158
- }
159
-
160
- await sleep(POLL_INTERVAL_MS);
161
- }
162
-
163
- return null;
164
- }
165
-
166
- async click(_sessionId: string, element: LocatedElement): Promise<void> {
167
- if (!element.coordinates) {
168
- throw new Error("Vision adapter requires coordinates to click");
169
- }
170
- const cx = element.coordinates.x + element.coordinates.width / 2;
171
- const cy = element.coordinates.y + element.coordinates.height / 2;
172
- await this.bridge.call("cg.mouseClick", { x: cx, y: cy });
173
- }
174
-
175
- async setValue(_sessionId: string, element: LocatedElement, text: string, clear: boolean): Promise<void> {
176
- // Click to focus
177
- await this.click(_sessionId, element);
178
- await sleep(100);
179
-
180
- if (clear) {
181
- await this.bridge.call("cg.keyCombo", { keys: ["cmd", "a"] });
182
- await sleep(50);
183
- }
184
-
185
- await this.bridge.call("cg.typeText", { text });
186
- }
187
-
188
- async getValue(_sessionId: string, element: LocatedElement): Promise<string> {
189
- // Vision can't reliably read values; return label if available
190
- return element.label ?? "";
191
- }
192
-
193
- async waitFor(sessionId: string, condition: WaitCondition, timeoutMs: number): Promise<boolean> {
194
- const deadline = Date.now() + timeoutMs;
195
-
196
- while (Date.now() < deadline) {
197
- if (condition.type === "text_appears") {
198
- const found = await this.locate(
199
- sessionId,
200
- { type: "text", value: condition.text },
201
- 200,
202
- );
203
- if (found) return true;
204
- } else if (condition.type === "element_exists") {
205
- const found = await this.locate(sessionId, condition.target, 200);
206
- if (found) return true;
207
- } else if (condition.type === "element_gone") {
208
- const found = await this.locate(sessionId, condition.target, 200);
209
- if (!found) return true;
210
- } else {
211
- // Unsupported condition types
212
- return false;
213
- }
214
- await sleep(POLL_INTERVAL_MS);
215
- }
216
- return false;
217
- }
218
-
219
- async extract(sessionId: string, _target: Target, format: ExtractFormat): Promise<unknown> {
220
- const state = this.requireSession(sessionId);
221
-
222
- // Take a fresh screenshot and OCR it
223
- const screenshotResult = await this.bridge.call<{ path: string }>("cg.captureScreen", {});
224
- state.lastScreenshotPath = screenshotResult.path;
225
-
226
- const ocrResult = await this.bridge.call<{ text: string; regions: OCRResult[] }>(
227
- "vision.ocr",
228
- { imagePath: screenshotResult.path },
229
- );
230
-
231
- if (format === "text") {
232
- return ocrResult.text;
233
- }
234
-
235
- if (format === "json") {
236
- return ocrResult;
237
- }
238
-
239
- // table format
240
- return {
241
- headers: ["text", "confidence", "x", "y", "width", "height"],
242
- rows: ocrResult.regions.map((r) => [
243
- r.text,
244
- r.confidence,
245
- r.bounds.x,
246
- r.bounds.y,
247
- r.bounds.width,
248
- r.bounds.height,
249
- ]),
250
- };
251
- }
252
-
253
- async screenshot(_sessionId: string, region?: { x: number; y: number; width: number; height: number }): Promise<string> {
254
- const result = await this.bridge.call<{ path: string }>(
255
- "cg.captureScreen",
256
- region ? { region } : {},
257
- );
258
- return result.path;
259
- }
260
-
261
- async keyCombo(_sessionId: string, keys: string[]): Promise<void> {
262
- await this.bridge.call("cg.keyCombo", { keys });
263
- }
264
-
265
- async elementTree(_sessionId: string, _maxDepth?: number, _root?: Target): Promise<AXNode> {
266
- throw new Error("Vision adapter does not support elementTree — use accessibility adapter");
267
- }
268
-
269
- // ── Private ──
270
-
271
- private requireSession(sessionId: string): VisionSessionState {
272
- const state = this.sessions.get(sessionId);
273
- if (!state) throw new Error(`Session not found: ${sessionId}`);
274
- return state;
275
- }
276
-
277
- private getSearchText(target: Target): string | null {
278
- switch (target.type) {
279
- case "text":
280
- return target.value;
281
- case "role":
282
- return target.name;
283
- case "selector":
284
- return target.value;
285
- case "ax_attribute":
286
- return target.value;
287
- case "image":
288
- case "coordinates":
289
- case "ax_path":
290
- return null;
291
- }
292
- }
293
- }
294
-
295
- function sleep(ms: number): Promise<void> {
296
- return new Promise((resolve) => setTimeout(resolve, ms));
297
- }