screenhand 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241) hide show
  1. package/README.md +193 -109
  2. package/bin/darwin-arm64/macos-bridge +0 -0
  3. package/dist/mcp-desktop.js +5876 -0
  4. package/dist/scripts/codex-monitor-daemon.js +335 -0
  5. package/dist/scripts/export-help-center.js +112 -0
  6. package/dist/scripts/marketing-loop.js +117 -0
  7. package/dist/scripts/observer-daemon.js +288 -0
  8. package/dist/scripts/orchestrator-daemon.js +399 -0
  9. package/dist/scripts/supervisor-daemon.js +272 -0
  10. package/dist/scripts/threads-campaign.js +208 -0
  11. package/dist/scripts/worker-daemon.js +228 -0
  12. package/dist/src/agent/cli.js +82 -0
  13. package/dist/src/agent/loop.js +274 -0
  14. package/dist/src/community/fetcher.js +109 -0
  15. package/dist/src/community/index.js +6 -0
  16. package/dist/src/community/publisher.js +191 -0
  17. package/dist/src/community/remote-api.js +121 -0
  18. package/dist/src/community/types.js +3 -0
  19. package/dist/src/community/validator.js +95 -0
  20. package/{src/config.ts → dist/src/config.js} +5 -10
  21. package/dist/src/context-tracker.js +489 -0
  22. package/{src/index.ts → dist/src/index.js} +32 -52
  23. package/dist/src/ingestion/coverage-auditor.js +233 -0
  24. package/dist/src/ingestion/doc-parser.js +164 -0
  25. package/dist/src/ingestion/index.js +8 -0
  26. package/dist/src/ingestion/menu-scanner.js +152 -0
  27. package/dist/src/ingestion/reference-merger.js +186 -0
  28. package/dist/src/ingestion/shortcut-extractor.js +180 -0
  29. package/dist/src/ingestion/tutorial-extractor.js +170 -0
  30. package/dist/src/ingestion/types.js +3 -0
  31. package/dist/src/jobs/manager.js +305 -0
  32. package/dist/src/jobs/runner.js +806 -0
  33. package/dist/src/jobs/store.js +102 -0
  34. package/dist/src/jobs/types.js +30 -0
  35. package/dist/src/jobs/worker.js +97 -0
  36. package/dist/src/learning/engine.js +356 -0
  37. package/dist/src/learning/index.js +9 -0
  38. package/dist/src/learning/locator-policy.js +120 -0
  39. package/dist/src/learning/pattern-policy.js +89 -0
  40. package/dist/src/learning/recovery-policy.js +116 -0
  41. package/dist/src/learning/sensor-policy.js +115 -0
  42. package/dist/src/learning/timing-model.js +204 -0
  43. package/dist/src/learning/topology-policy.js +90 -0
  44. package/dist/src/learning/types.js +9 -0
  45. package/dist/src/logging/timeline-logger.js +48 -0
  46. package/dist/src/mcp/mcp-stdio-server.js +464 -0
  47. package/dist/src/mcp/server.js +363 -0
  48. package/dist/src/mcp-entry.js +60 -0
  49. package/dist/src/memory/playbook-seeds.js +200 -0
  50. package/dist/src/memory/recall.js +222 -0
  51. package/dist/src/memory/research.js +104 -0
  52. package/dist/src/memory/seeds.js +101 -0
  53. package/dist/src/memory/service.js +446 -0
  54. package/dist/src/memory/session.js +169 -0
  55. package/dist/src/memory/store.js +451 -0
  56. package/{src/runtime/locator-cache.ts → dist/src/memory/types.js} +1 -17
  57. package/dist/src/monitor/codex-monitor.js +382 -0
  58. package/dist/src/monitor/task-queue.js +97 -0
  59. package/dist/src/monitor/types.js +62 -0
  60. package/dist/src/native/bridge-client.js +412 -0
  61. package/{src/native/macos-bridge-client.ts → dist/src/native/macos-bridge-client.js} +0 -1
  62. package/dist/src/observer/state.js +199 -0
  63. package/dist/src/observer/types.js +43 -0
  64. package/dist/src/orchestrator/state.js +68 -0
  65. package/dist/src/orchestrator/types.js +22 -0
  66. package/dist/src/perception/ax-source.js +162 -0
  67. package/dist/src/perception/cdp-source.js +162 -0
  68. package/dist/src/perception/coordinator.js +771 -0
  69. package/dist/src/perception/frame-differ.js +287 -0
  70. package/dist/src/perception/index.js +22 -0
  71. package/dist/src/perception/manager.js +199 -0
  72. package/dist/src/perception/types.js +47 -0
  73. package/dist/src/perception/vision-source.js +399 -0
  74. package/dist/src/planner/deterministic.js +298 -0
  75. package/dist/src/planner/executor.js +870 -0
  76. package/dist/src/planner/goal-store.js +92 -0
  77. package/dist/src/planner/index.js +21 -0
  78. package/dist/src/planner/planner.js +520 -0
  79. package/dist/src/planner/tool-registry.js +71 -0
  80. package/dist/src/planner/types.js +22 -0
  81. package/dist/src/platform/explorer.js +213 -0
  82. package/dist/src/platform/help-center-markdown.js +527 -0
  83. package/dist/src/platform/learner.js +257 -0
  84. package/dist/src/playbook/engine.js +486 -0
  85. package/dist/src/playbook/index.js +20 -0
  86. package/dist/src/playbook/mcp-recorder.js +204 -0
  87. package/dist/src/playbook/recorder.js +536 -0
  88. package/dist/src/playbook/runner.js +408 -0
  89. package/dist/src/playbook/store.js +312 -0
  90. package/dist/src/playbook/types.js +17 -0
  91. package/dist/src/recovery/detectors.js +156 -0
  92. package/dist/src/recovery/engine.js +327 -0
  93. package/dist/src/recovery/index.js +20 -0
  94. package/dist/src/recovery/strategies.js +274 -0
  95. package/dist/src/recovery/types.js +20 -0
  96. package/dist/src/runtime/accessibility-adapter.js +430 -0
  97. package/dist/src/runtime/app-adapter.js +64 -0
  98. package/dist/src/runtime/applescript-adapter.js +305 -0
  99. package/dist/src/runtime/ax-role-map.js +96 -0
  100. package/dist/src/runtime/browser-adapter.js +52 -0
  101. package/dist/src/runtime/cdp-chrome-adapter.js +521 -0
  102. package/dist/src/runtime/composite-adapter.js +221 -0
  103. package/dist/src/runtime/execution-contract.js +159 -0
  104. package/dist/src/runtime/executor.js +286 -0
  105. package/dist/src/runtime/locator-cache.js +50 -0
  106. package/dist/src/runtime/planning-loop.js +63 -0
  107. package/dist/src/runtime/service.js +432 -0
  108. package/dist/src/runtime/session-manager.js +63 -0
  109. package/dist/src/runtime/state-observer.js +121 -0
  110. package/dist/src/runtime/vision-adapter.js +225 -0
  111. package/dist/src/state/app-map-types.js +72 -0
  112. package/dist/src/state/app-map.js +1974 -0
  113. package/dist/src/state/entity-tracker.js +108 -0
  114. package/dist/src/state/fusion.js +96 -0
  115. package/dist/src/state/index.js +21 -0
  116. package/dist/src/state/ladder-generator.js +236 -0
  117. package/dist/src/state/persistence.js +156 -0
  118. package/dist/src/state/types.js +17 -0
  119. package/dist/src/state/world-model.js +1456 -0
  120. package/dist/src/supervisor/locks.js +186 -0
  121. package/dist/src/supervisor/supervisor.js +403 -0
  122. package/dist/src/supervisor/types.js +30 -0
  123. package/dist/src/test-mcp-protocol.js +154 -0
  124. package/dist/src/types.js +17 -0
  125. package/dist/src/util/atomic-write.js +133 -0
  126. package/dist/src/util/sanitize.js +146 -0
  127. package/dist-app-maps/com.figma.Desktop.json +959 -0
  128. package/dist-app-maps/com.hnc.Discord.json +1146 -0
  129. package/dist-app-maps/notion.id.json +2831 -0
  130. package/dist-playbooks/canva-screenhand-carousel.json +445 -0
  131. package/dist-playbooks/codex-desktop.json +76 -0
  132. package/dist-playbooks/competitor-research-stack.json +122 -0
  133. package/dist-playbooks/davinci-color-grade.json +153 -0
  134. package/dist-playbooks/davinci-edit-timeline.json +162 -0
  135. package/dist-playbooks/davinci-render.json +114 -0
  136. package/dist-playbooks/devto.json +52 -0
  137. package/dist-playbooks/discord.json +41 -0
  138. package/dist-playbooks/google-flow-create-project.json +59 -0
  139. package/dist-playbooks/google-flow-edit-image.json +90 -0
  140. package/dist-playbooks/google-flow-edit-video.json +90 -0
  141. package/dist-playbooks/google-flow-generate-image.json +68 -0
  142. package/dist-playbooks/google-flow-generate-video.json +191 -0
  143. package/dist-playbooks/google-flow-open-project.json +48 -0
  144. package/dist-playbooks/google-flow-open-scenebuilder.json +64 -0
  145. package/dist-playbooks/google-flow-search-assets.json +64 -0
  146. package/dist-playbooks/instagram.json +57 -0
  147. package/dist-playbooks/linkedin.json +52 -0
  148. package/dist-playbooks/n8n.json +43 -0
  149. package/dist-playbooks/reddit.json +52 -0
  150. package/dist-playbooks/threads.json +59 -0
  151. package/dist-playbooks/x-twitter.json +59 -0
  152. package/dist-playbooks/youtube.json +59 -0
  153. package/dist-references/canva.json +646 -0
  154. package/dist-references/codex-desktop.json +305 -0
  155. package/dist-references/davinci-resolve-keyboard.json +594 -0
  156. package/dist-references/davinci-resolve-menu-map.json +1139 -0
  157. package/dist-references/davinci-resolve-menus-batch1.json +116 -0
  158. package/dist-references/davinci-resolve-menus-batch2.json +372 -0
  159. package/dist-references/davinci-resolve-menus-batch3.json +330 -0
  160. package/dist-references/davinci-resolve-menus-batch4.json +297 -0
  161. package/dist-references/davinci-resolve-shortcuts.json +333 -0
  162. package/dist-references/devto.json +317 -0
  163. package/dist-references/discord.json +549 -0
  164. package/dist-references/figma.json +1186 -0
  165. package/dist-references/finder.json +146 -0
  166. package/dist-references/google-ads-transparency.json +95 -0
  167. package/dist-references/google-flow.json +649 -0
  168. package/dist-references/instagram.json +341 -0
  169. package/dist-references/linkedin.json +324 -0
  170. package/dist-references/meta-ad-library.json +86 -0
  171. package/dist-references/n8n.json +387 -0
  172. package/dist-references/notes.json +27 -0
  173. package/dist-references/notion.json +163 -0
  174. package/dist-references/reddit.json +341 -0
  175. package/dist-references/threads.json +337 -0
  176. package/dist-references/x-twitter.json +403 -0
  177. package/dist-references/youtube.json +373 -0
  178. package/native/macos-bridge/Package.swift +1 -0
  179. package/native/macos-bridge/Sources/AccessibilityBridge.swift +257 -36
  180. package/native/macos-bridge/Sources/AppManagement.swift +212 -2
  181. package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +348 -53
  182. package/native/macos-bridge/Sources/StreamCapture.swift +136 -0
  183. package/native/macos-bridge/Sources/VisionBridge.swift +165 -7
  184. package/native/macos-bridge/Sources/main.swift +169 -16
  185. package/native/windows-bridge/Program.cs +5 -0
  186. package/native/windows-bridge/ScreenCapture.cs +124 -0
  187. package/package.json +29 -4
  188. package/scripts/postinstall.cjs +127 -0
  189. package/.claude/commands/automate.md +0 -28
  190. package/.claude/commands/debug-ui.md +0 -19
  191. package/.claude/commands/screenshot.md +0 -15
  192. package/.github/FUNDING.yml +0 -1
  193. package/.github/ISSUE_TEMPLATE/bug_report.md +0 -27
  194. package/.github/ISSUE_TEMPLATE/feature_request.md +0 -20
  195. package/.mcp.json +0 -8
  196. package/DESKTOP_MCP_GUIDE.md +0 -92
  197. package/SECURITY.md +0 -44
  198. package/docs/architecture.md +0 -47
  199. package/install-skills.sh +0 -19
  200. package/mcp-bridge.ts +0 -271
  201. package/mcp-desktop.ts +0 -1221
  202. package/playbooks/instagram.json +0 -41
  203. package/playbooks/instagram_v2.json +0 -201
  204. package/playbooks/x_v1.json +0 -211
  205. package/scripts/devpost-live-loop.mjs +0 -421
  206. package/src/logging/timeline-logger.ts +0 -55
  207. package/src/mcp/server.ts +0 -449
  208. package/src/memory/recall.ts +0 -191
  209. package/src/memory/research.ts +0 -146
  210. package/src/memory/seeds.ts +0 -123
  211. package/src/memory/session.ts +0 -201
  212. package/src/memory/store.ts +0 -434
  213. package/src/memory/types.ts +0 -69
  214. package/src/native/bridge-client.ts +0 -239
  215. package/src/runtime/accessibility-adapter.ts +0 -487
  216. package/src/runtime/app-adapter.ts +0 -169
  217. package/src/runtime/applescript-adapter.ts +0 -376
  218. package/src/runtime/ax-role-map.ts +0 -102
  219. package/src/runtime/browser-adapter.ts +0 -129
  220. package/src/runtime/cdp-chrome-adapter.ts +0 -676
  221. package/src/runtime/composite-adapter.ts +0 -274
  222. package/src/runtime/executor.ts +0 -396
  223. package/src/runtime/planning-loop.ts +0 -81
  224. package/src/runtime/service.ts +0 -448
  225. package/src/runtime/session-manager.ts +0 -50
  226. package/src/runtime/state-observer.ts +0 -136
  227. package/src/runtime/vision-adapter.ts +0 -297
  228. package/src/types.ts +0 -297
  229. package/tests/bridge-client.test.ts +0 -176
  230. package/tests/browser-stealth.test.ts +0 -210
  231. package/tests/composite-adapter.test.ts +0 -64
  232. package/tests/mcp-server.test.ts +0 -151
  233. package/tests/memory-recall.test.ts +0 -339
  234. package/tests/memory-research.test.ts +0 -159
  235. package/tests/memory-seeds.test.ts +0 -120
  236. package/tests/memory-store.test.ts +0 -392
  237. package/tests/types.test.ts +0 -92
  238. package/tsconfig.check.json +0 -17
  239. package/tsconfig.json +0 -19
  240. package/vitest.config.ts +0 -8
  241. /package/{playbooks → dist-references}/devpost.json +0 -0
@@ -0,0 +1,536 @@
1
+ // Copyright (C) 2025 Clazro Technology Private Limited
2
+ // SPDX-License-Identifier: AGPL-3.0-only
3
+ //
4
+ // This file is part of ScreenHand.
5
+ //
6
+ // ScreenHand is free software: you can redistribute it and/or modify
7
+ // it under the terms of the GNU Affero General Public License as
8
+ // published by the Free Software Foundation, version 3.
9
+ //
10
+ // ScreenHand is distributed in the hope that it will be useful,
11
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ // GNU Affero General Public License for more details.
14
+ //
15
+ // You should have received a copy of the GNU Affero General Public License
16
+ // along with ScreenHand. If not, see <https://www.gnu.org/licenses/>.
17
+ /**
18
+ * Playbook Recorder v2 — event-driven + screenshot-based
19
+ *
20
+ * Two capture modes running in parallel:
21
+ *
22
+ * 1. AX State Polling (every 500ms)
23
+ * - Polls the active app and the accessibility tree through the runtime and diffs successive snapshots
24
+ * - Captures: focus changes, value changes, window creates, app switches
25
+ * - This is how we know WHAT the user clicked/typed
26
+ *
27
+ * 2. Periodic Screenshots (every 2.5s)
28
+ * - Captures visual state of the screen
29
+ * - At stop time, AI analyzes the screenshot sequence + AX events
30
+ * - This is how we handle things AX events miss (Chrome DOM, visual changes)
31
+ *
32
+ * On stop:
33
+ * - All AX events + screenshots sent to AI
34
+ * - AI produces clean PlaybookStep[] from the combined data
35
+ * - Saved to disk as a replayable playbook
36
+ */
37
+ import Anthropic from "@anthropic-ai/sdk";
38
+ import fs from "node:fs";
39
+ import { PlaybookStore } from "./store.js";
40
+ const SCREENSHOT_INTERVAL_MS = 2500;
41
+ const AX_POLL_INTERVAL_MS = 500;
42
/**
 * Records user activity as a replayable playbook.
 *
 * While recording, two timers run in parallel: one polls the active app and
 * the accessibility tree and turns differences between successive snapshots
 * into events; the other takes periodic screenshots. On stop, the collected
 * events (plus the first and last screenshot) are sent to the AI model to be
 * converted into playbook steps, with a deterministic fallback conversion
 * when the AI call fails or returns nothing usable.
 */
export class PlaybookRecorder {
    runtime;
    options;
    recording = false;
    events = [];
    screenshots = [];
    screenshotTimer = null;
    axPollTimer = null;
    sessionId = "";
    // Track previous AX state for diff detection
    prevFocused = "";
    prevActiveApp = "";
    prevWindowTitle = "";
    prevUrl = "";
    prevTextFields = new Map();
    store;
    ai;
    model;
    log;
    onEvent;
    captureScreenshots;
    // In-flight guards so a slow poll/screenshot is not re-entered by the next tick
    axPollBusy = false;
    screenshotBusy = false;
    /**
     * @param runtime Object providing `appList`, `elementTree` and `screenshot`.
     * @param playbookDir Directory handed to the PlaybookStore.
     * @param options Optional settings: `model`, `onLog`, `onEvent`, `screenshots`.
     */
    constructor(runtime, playbookDir, options = {}) {
        this.runtime = runtime;
        this.options = options;
        this.store = new PlaybookStore(playbookDir);
        this.store.load();
        this.ai = new Anthropic();
        this.model = options.model ?? "claude-sonnet-4-20250514";
        this.log = options.onLog ?? ((msg) => console.error(`[Recorder] ${msg}`));
        this.onEvent = options.onEvent;
        // Screenshots are on unless explicitly disabled with `screenshots: false`
        this.captureScreenshots = options.screenshots !== false;
    }
    /**
     * Start recording user actions.
     *
     * Resets all recorded state, captures an initial snapshot, then starts the
     * polling timers. Does nothing (besides logging) when already recording.
     *
     * @param sessionId Session identifier passed to every runtime call.
     */
    async start(sessionId) {
        if (this.recording) {
            this.log("Already recording");
            return;
        }
        this.recording = true;
        this.sessionId = sessionId;
        this.events = [];
        this.screenshots = [];
        this.prevFocused = "";
        this.prevActiveApp = "";
        this.prevWindowTitle = "";
        this.prevUrl = "";
        this.prevTextFields.clear();
        // Take initial state snapshot
        await this.captureState("initial");
        // cancel()/stop() may have run while the snapshot was awaited; starting
        // timers now would leave them running with nobody to clear them.
        if (!this.recording) {
            return;
        }
        // Never overwrite (and thereby leak) timer handles that are still live.
        this.clearTimers();
        this.log("Recording started — watching for AX events + taking screenshots");
        // Start AX state polling (fast — every 500ms)
        this.axPollTimer = setInterval(async () => {
            if (!this.recording || this.axPollBusy)
                return;
            this.axPollBusy = true;
            try {
                await this.pollAXState();
            }
            catch { /* non-fatal */ }
            finally {
                this.axPollBusy = false;
            }
        }, AX_POLL_INTERVAL_MS);
        // Start screenshot capture (slower — every 2.5s)
        if (this.captureScreenshots) {
            this.screenshotTimer = setInterval(async () => {
                if (!this.recording || this.screenshotBusy)
                    return;
                this.screenshotBusy = true;
                try {
                    await this.takeScreenshot();
                }
                catch { /* non-fatal */ }
                finally {
                    this.screenshotBusy = false;
                }
            }, SCREENSHOT_INTERVAL_MS);
        }
    }
    /**
     * Stop recording and generate a playbook.
     *
     * @param name Playbook name; its words of 3+ characters become tags.
     * @param description Playbook description.
     * @param platform Platform label, used in the id and as the first tag.
     * @returns The playbook object that was saved to the store.
     */
    async stop(name, description, platform) {
        this.recording = false;
        this.clearTimers();
        // Take final screenshot
        if (this.captureScreenshots) {
            try {
                await this.takeScreenshot();
            }
            catch { /* ignore */ }
        }
        this.log(`Recording stopped. ${this.events.length} events, ${this.screenshots.length} screenshots captured.`);
        // Convert raw events + screenshots to playbook steps via AI
        const steps = await this.eventsToSteps(this.events, this.screenshots, name, platform);
        // Save as playbook
        const id = `rec_${platform}_${Date.now()}`;
        const playbook = {
            id,
            name,
            description,
            platform,
            steps,
            version: "1.0.0",
            tags: [
                platform,
                ...name.toLowerCase().split(/\W+/).filter((w) => w.length >= 3),
            ],
            successCount: 0,
            failCount: 0,
            lastRun: new Date().toISOString(),
        };
        this.store.save(playbook);
        this.log(`Playbook saved: ${id} (${steps.length} steps)`);
        return playbook;
    }
    /**
     * Cancel recording without saving.
     */
    cancel() {
        this.recording = false;
        this.clearTimers();
        this.events = [];
        this.screenshots = [];
        this.log("Recording cancelled");
    }
    /** Whether a recording is currently in progress. */
    get isRecording() {
        return this.recording;
    }
    /** Number of events recorded so far. */
    get eventCount() {
        return this.events.length;
    }
    /** Returns a shallow copy of the recorded events. */
    getEvents() {
        return [...this.events];
    }
    // ── AX State Polling (fast, event-driven feel) ──
    /**
     * Poll the runtime once and emit an event for every difference from the
     * previously observed state (active app, focused element, text field
     * values, window title). Runtime failures are ignored so a single bad poll
     * never interrupts the recording.
     */
    async pollAXState() {
        // 1. Check which app is active
        try {
            const apps = await this.runtime.appList(this.sessionId);
            // The recording may have ended while the call was in flight
            if (!this.recording)
                return;
            if (apps.ok) {
                const active = apps.data.find((a) => a.isActive);
                if (active && active.bundleId !== this.prevActiveApp) {
                    if (this.prevActiveApp) {
                        this.addEvent({
                            type: "app_activated",
                            details: {
                                from: this.prevActiveApp,
                                to: active.bundleId,
                                appName: active.name,
                            },
                        });
                    }
                    this.prevActiveApp = active.bundleId;
                }
            }
        }
        catch { /* ignore */ }
        // 2. Get accessibility tree — find focused element and text field values
        try {
            const tree = await this.runtime.elementTree({ sessionId: this.sessionId, maxDepth: 4 });
            if (!this.recording)
                return;
            if (!tree.ok)
                return;
            // Detect focus change
            const focused = findFocused(tree.data);
            if (focused && focused !== this.prevFocused) {
                this.addEvent({
                    type: "focus_changed",
                    details: {
                        from: this.prevFocused,
                        to: focused,
                        element: describeFocused(tree.data),
                    },
                });
                this.prevFocused = focused;
            }
            // Detect text field value changes (typing detection)
            const currentFields = collectTextFields(tree.data);
            for (const [fieldId, value] of currentFields) {
                const prev = this.prevTextFields.get(fieldId);
                if (prev !== undefined && prev !== value) {
                    // Field values are not guaranteed to be strings; coerce before
                    // slicing so one odd value cannot abort the rest of this poll.
                    const before = String(prev ?? "");
                    const after = String(value ?? "");
                    this.addEvent({
                        type: "value_changed",
                        details: {
                            field: fieldId,
                            from: before.slice(-50),
                            to: after.slice(-50),
                            typed: after.slice(before.length),
                        },
                    });
                }
            }
            this.prevTextFields = currentFields;
            // Detect window title change (navigation in browser)
            const title = tree.data.title ?? "";
            if (title && title !== this.prevWindowTitle) {
                if (this.prevWindowTitle) {
                    this.addEvent({
                        type: "title_changed",
                        details: { from: this.prevWindowTitle, to: title },
                    });
                }
                this.prevWindowTitle = title;
            }
        }
        catch { /* ignore */ }
    }
    // ── Screenshot Capture ──
    /**
     * Take one screenshot through the runtime and remember its path,
     * timestamp and sequence index. Failures are ignored.
     */
    async takeScreenshot() {
        try {
            const result = await this.runtime.screenshot({ sessionId: this.sessionId });
            if (result.ok) {
                const record = {
                    path: result.data.path,
                    timestamp: new Date().toISOString(),
                    index: this.screenshots.length,
                };
                this.screenshots.push(record);
            }
        }
        catch { /* non-fatal */ }
    }
    // ── State Capture ──
    /**
     * Capture the baseline state that later polls are diffed against, and
     * record the currently active app as an `app_activated` event.
     *
     * @param label Tag stored in the event details (e.g. "initial").
     */
    async captureState(label) {
        // Capture initial app state
        try {
            const apps = await this.runtime.appList(this.sessionId);
            if (apps.ok) {
                const active = apps.data.find((a) => a.isActive);
                if (active) {
                    this.prevActiveApp = active.bundleId;
                    this.addEvent({
                        type: "app_activated",
                        details: { to: active.bundleId, appName: active.name, label },
                    });
                }
            }
        }
        catch { /* ignore */ }
        // Capture initial tree state
        try {
            const tree = await this.runtime.elementTree({ sessionId: this.sessionId, maxDepth: 4 });
            if (tree.ok) {
                this.prevFocused = findFocused(tree.data);
                this.prevWindowTitle = tree.data.title ?? "";
                this.prevTextFields = collectTextFields(tree.data);
            }
        }
        catch { /* ignore */ }
        // Take initial screenshot
        if (this.captureScreenshots) {
            await this.takeScreenshot();
        }
    }
    // ── Event Management ──
    /**
     * Timestamp an event, append it to the recording, log it, and forward it
     * to the optional `onEvent` callback.
     *
     * @param partial Event without a timestamp (`type` and `details`).
     */
    addEvent(partial) {
        const event = {
            ...partial,
            timestamp: new Date().toISOString(),
        };
        this.events.push(event);
        this.log(`Event: ${event.type} — ${JSON.stringify(event.details).slice(0, 120)}`);
        if (this.onEvent)
            this.onEvent(event);
    }
    /** Stop both polling timers, if running. */
    clearTimers() {
        if (this.axPollTimer) {
            clearInterval(this.axPollTimer);
            this.axPollTimer = null;
        }
        if (this.screenshotTimer) {
            clearInterval(this.screenshotTimer);
            this.screenshotTimer = null;
        }
    }
    // ── AI Conversion ──
    /**
     * Convert raw events + screenshots into clean playbook steps.
     * Sends first + last screenshot as images so AI can see what happened visually.
     *
     * @param events Recorded events in chronological order.
     * @param screenshots Screenshot records (`path`, `timestamp`, `index`).
     * @param taskName Task name inserted into the prompt.
     * @param platform Platform name inserted into the prompt.
     * @returns Playbook steps; an empty array when there are no events, and
     *          the fallback conversion when the AI call fails or its reply
     *          contains no JSON array.
     */
    async eventsToSteps(events, screenshots, taskName, platform) {
        if (events.length === 0)
            return [];
        // Build the content array — text + optional images
        const content = [];
        // Add text prompt
        content.push({
            type: "text",
            text: `Convert these recorded user events into a clean, replayable automation playbook.

Task: ${taskName}
Platform: ${platform}

Raw events recorded (in chronological order):
${events.map((e, i) => `${i + 1}. [${e.timestamp}] ${e.type}: ${JSON.stringify(e.details)}`).join("\n")}

${screenshots.length > 0 ? `\n${screenshots.length} screenshots were taken during recording. The first and last are attached below for visual context.\n` : ""}
Convert these into a JSON array of playbook steps. Each step:
{
"action": "navigate" | "press" | "type_into" | "key" | "key_combo" | "menu_click" | "scroll" | "wait" | "screenshot",
"target": "CSS selector, text label, or {\"selector\": \"...\"}",
"url": "for navigate",
"text": "for type_into",
"keys": ["for", "key or key_combo"],
"menuPath": ["for", "menu_click"],
"ms": 1000,
"description": "human-readable description of what this step does",
"verify": "optional CSS selector or text to verify success",
"optional": false
}

Rules:
- Infer the user's INTENT from events, not just mirror them mechanically
- focus_changed events usually mean a click — convert to "press" with the element label
- value_changed events mean typing — convert to "type_into" with the field and text
- title_changed often means navigation — add appropriate navigate or wait steps
- app_activated means switching apps — use app_focus or app_launch
- Use stable selectors: data-testid, aria-label, role+name over fragile CSS
- Merge rapid consecutive events into single meaningful steps
- Add wait steps (500-2000ms) after navigation/page loads
- Add verify conditions for critical steps (modal opened, page loaded, etc.)
- Skip noise (duplicate events, layout thrash, irrelevant focus changes)

Respond with ONLY a valid JSON array, no markdown fences, no explanation.`,
        });
        // Attach first and last screenshots as images (if available)
        if (screenshots.length > 0) {
            const toAttach = [screenshots[0]];
            if (screenshots.length > 1) {
                toAttach.push(screenshots[screenshots.length - 1]);
            }
            for (const shot of toAttach) {
                try {
                    const imageData = fs.readFileSync(shot.path);
                    const base64 = imageData.toString("base64");
                    content.push({
                        type: "image",
                        source: {
                            type: "base64",
                            media_type: "image/png",
                            data: base64,
                        },
                    });
                    content.push({
                        type: "text",
                        text: `Screenshot ${shot.index + 1} taken at ${shot.timestamp}`,
                    });
                }
                catch {
                    // Skip unreadable screenshots
                }
            }
        }
        try {
            const resp = await this.ai.messages.create({
                model: this.model,
                max_tokens: 4096,
                messages: [{ role: "user", content }],
            });
            // Use the first text block rather than assuming it is the first block
            const textBlock = resp.content.find((block) => block?.type === "text");
            const text = textBlock ? textBlock.text : "";
            const jsonMatch = text.match(/\[[\s\S]*\]/);
            if (jsonMatch) {
                const steps = JSON.parse(jsonMatch[0]);
                this.log(`AI generated ${steps.length} playbook steps`);
                return steps;
            }
            this.log("AI response contained no JSON array — using fallback conversion");
        }
        catch (err) {
            this.log(`AI conversion failed: ${err instanceof Error ? err.message : String(err)}`);
        }
        // Fallback without AI
        return this.eventsToStepsFallback(events);
    }
    /**
     * Deterministic event → step conversion used when the AI is unavailable.
     *
     * Mapping: `app_activated` → wait (the "initial" snapshot is skipped),
     * `focus_changed` → press, `value_changed` → type_into (only when text was
     * typed), `title_changed` → wait, `url_changed` → navigate,
     * `menu_opened` / `dialog_appeared` → wait. Other event types are ignored.
     *
     * @param events Recorded events in chronological order.
     * @returns Array of playbook steps.
     */
    eventsToStepsFallback(events) {
        const steps = [];
        for (const event of events) {
            switch (event.type) {
                case "app_activated":
                    if (event.details.label === "initial")
                        break;
                    steps.push({
                        action: "wait",
                        ms: 500,
                        description: `Switched to ${event.details.appName ?? event.details.to}`,
                    });
                    break;
                case "focus_changed": {
                    const target = String(event.details.to ?? "");
                    if (!target)
                        break;
                    // Collapse back-to-back presses on the same element. This must
                    // compare against the previously emitted step, not the
                    // recorder's live focus state: at stop time that state equals
                    // the last focus event and would drop the final click.
                    const previous = steps[steps.length - 1];
                    if (previous && previous.action === "press" && previous.target === target)
                        break;
                    steps.push({
                        action: "press",
                        target,
                        description: `Click on ${event.details.element ?? target}`,
                    });
                    break;
                }
                case "value_changed": {
                    const typed = String(event.details.typed ?? "");
                    const field = String(event.details.field ?? "");
                    if (typed) {
                        steps.push({
                            action: "type_into",
                            target: field,
                            text: typed,
                            description: `Type "${typed.slice(0, 30)}" into ${field}`,
                        });
                    }
                    break;
                }
                case "title_changed":
                    steps.push({
                        action: "wait",
                        ms: 1500,
                        description: `Page changed to: ${event.details.to}`,
                    });
                    break;
                case "url_changed":
                    steps.push({
                        action: "navigate",
                        url: event.details.to,
                        description: `Navigate to ${event.details.to}`,
                    });
                    break;
                case "menu_opened":
                case "dialog_appeared":
                    steps.push({
                        action: "wait",
                        ms: 1000,
                        description: `${event.type}: ${JSON.stringify(event.details).slice(0, 50)}`,
                    });
                    break;
            }
        }
        return steps;
    }
}
473
+ // ── AX Tree Helpers ──
474
/**
 * Find the focused element and return a stable identifier.
 *
 * Walks the tree depth-first and returns `"<role>:<label>"` for the first node
 * whose `focused` flag is `true`. The role has its "AX" marker removed; the
 * label is the first non-empty value among title, description and identifier.
 *
 * @param node Accessibility tree node (`role`, `title`, `description`,
 *             `identifier`, `focused`, `children`).
 * @param depth Current recursion depth; nodes deeper than 6 are not searched.
 * @returns The identifier, or "" when no focused node is found.
 */
function findFocused(node, depth = 0) {
    if (!node || depth > 6)
        return "";
    if (node.focused === true) {
        const role = node.role?.replace("AX", "") ?? "";
        // `||` rather than `??`: an empty title must not mask a usable
        // description/identifier (describeFocused also treats "" as absent).
        const label = node.title || node.description || node.identifier || "";
        return `${role}:${label}`;
    }
    if (node.children) {
        for (const child of node.children) {
            const found = findFocused(child, depth + 1);
            if (found)
                return found;
        }
    }
    return "";
}
492
/**
 * Get a human-readable description of the focused element + context.
 *
 * Walks the tree depth-first and, for the first node whose `focused` flag is
 * `true`, joins with spaces: the role (with its "AX" marker removed), the
 * quoted title, `desc="…"`, `val="…"` (first 30 characters) and `@x,y`
 * (rounded position). Parts whose source value is falsy are omitted.
 *
 * @param node Accessibility tree node.
 * @param depth Current recursion depth; nodes deeper than 6 are not searched.
 * @returns The description, or "" when no focused node is found.
 */
function describeFocused(node, depth = 0) {
    if (!node || depth > 6)
        return "";
    if (node.focused === true) {
        const parts = [node.role?.replace("AX", "")];
        if (node.title)
            parts.push(`"${node.title}"`);
        if (node.description)
            parts.push(`desc="${node.description}"`);
        // Values are not always strings (e.g. a numeric value); coerce before
        // slicing so describing the element never throws.
        if (node.value)
            parts.push(`val="${String(node.value).slice(0, 30)}"`);
        if (node.position)
            parts.push(`@${Math.round(node.position.x)},${Math.round(node.position.y)}`);
        return parts.filter(Boolean).join(" ");
    }
    if (node.children) {
        for (const child of node.children) {
            const found = describeFocused(child, depth + 1);
            if (found)
                return found;
        }
    }
    return "";
}
517
/**
 * Collect all text field values from the tree for typing detection.
 *
 * A node counts as a text field when its role (with the "AX" marker removed,
 * lower-cased) is textfield, textarea, combobox or searchfield. Fields are
 * keyed by the first non-empty value among identifier, title and description;
 * unnamed fields get a `field_<path>` key built from their position in the
 * tree so that unnamed siblings do not overwrite each other. Values are
 * stored as strings; fields whose value is null or undefined are skipped.
 *
 * @param node Accessibility tree node.
 * @param depth Current recursion depth; nodes deeper than 5 are not visited.
 * @returns Map of field id → current string value.
 */
function collectTextFields(node, depth = 0) {
    const fields = new Map();
    gatherTextFields(node, depth, String(depth), fields);
    return fields;
}
/** Recursive worker for collectTextFields: fills `fields` in place. */
function gatherTextFields(node, depth, path, fields) {
    if (!node || depth > 5)
        return;
    const role = node.role?.replace("AX", "").toLowerCase() ?? "";
    const isTextField = role === "textfield" || role === "textarea" || role === "combobox" || role === "searchfield";
    // `!= null` skips both undefined and null, which carry no text to diff
    if (isTextField && node.value != null) {
        const id = node.identifier || node.title || node.description || `field_${path}`;
        // Callers slice these values, so always hand back strings
        fields.set(id, String(node.value));
    }
    if (node.children) {
        let index = 0;
        for (const child of node.children) {
            gatherTextFields(child, depth + 1, `${path}.${index}`, fields);
            index += 1;
        }
    }
}