screenhand 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212) hide show
  1. package/README.md +165 -446
  2. package/bin/darwin-arm64/macos-bridge +0 -0
  3. package/dist/mcp-desktop.js +3615 -400
  4. package/dist/scripts/export-help-center.js +112 -0
  5. package/dist/scripts/marketing-loop.js +117 -0
  6. package/dist/scripts/observer-daemon.js +288 -0
  7. package/dist/scripts/orchestrator-daemon.js +399 -0
  8. package/dist/scripts/threads-campaign.js +208 -0
  9. package/dist/src/community/fetcher.js +109 -0
  10. package/dist/src/community/index.js +6 -0
  11. package/dist/src/community/publisher.js +191 -0
  12. package/dist/src/community/remote-api.js +121 -0
  13. package/dist/src/community/types.js +3 -0
  14. package/dist/src/community/validator.js +95 -0
  15. package/dist/src/context-tracker.js +489 -0
  16. package/dist/src/ingestion/coverage-auditor.js +233 -0
  17. package/dist/src/ingestion/doc-parser.js +164 -0
  18. package/dist/src/ingestion/index.js +8 -0
  19. package/dist/src/ingestion/menu-scanner.js +152 -0
  20. package/dist/src/ingestion/reference-merger.js +186 -0
  21. package/dist/src/ingestion/shortcut-extractor.js +180 -0
  22. package/dist/src/ingestion/tutorial-extractor.js +170 -0
  23. package/dist/src/ingestion/types.js +3 -0
  24. package/dist/src/jobs/manager.js +82 -14
  25. package/dist/src/jobs/runner.js +138 -15
  26. package/dist/src/learning/engine.js +356 -0
  27. package/dist/src/learning/index.js +9 -0
  28. package/dist/src/learning/locator-policy.js +120 -0
  29. package/dist/src/learning/pattern-policy.js +89 -0
  30. package/dist/src/learning/recovery-policy.js +116 -0
  31. package/dist/src/learning/sensor-policy.js +115 -0
  32. package/dist/src/learning/timing-model.js +204 -0
  33. package/dist/src/learning/topology-policy.js +90 -0
  34. package/dist/src/learning/types.js +9 -0
  35. package/dist/src/logging/timeline-logger.js +4 -1
  36. package/dist/src/memory/playbook-seeds.js +200 -0
  37. package/dist/src/memory/recall.js +60 -8
  38. package/dist/src/memory/service.js +30 -5
  39. package/dist/src/memory/store.js +34 -5
  40. package/dist/src/native/bridge-client.js +253 -31
  41. package/dist/src/observer/state.js +199 -0
  42. package/dist/src/observer/types.js +43 -0
  43. package/dist/src/orchestrator/state.js +68 -0
  44. package/dist/src/orchestrator/types.js +22 -0
  45. package/dist/src/perception/ax-source.js +162 -0
  46. package/dist/src/perception/cdp-source.js +162 -0
  47. package/dist/src/perception/coordinator.js +771 -0
  48. package/dist/src/perception/frame-differ.js +287 -0
  49. package/dist/src/perception/index.js +22 -0
  50. package/dist/src/perception/manager.js +199 -0
  51. package/dist/src/perception/types.js +47 -0
  52. package/dist/src/perception/vision-source.js +399 -0
  53. package/dist/src/planner/deterministic.js +298 -0
  54. package/dist/src/planner/executor.js +870 -0
  55. package/dist/src/planner/goal-store.js +92 -0
  56. package/dist/src/planner/index.js +21 -0
  57. package/dist/src/planner/planner.js +520 -0
  58. package/dist/src/planner/tool-registry.js +71 -0
  59. package/dist/src/planner/types.js +22 -0
  60. package/dist/src/platform/explorer.js +213 -0
  61. package/dist/src/platform/help-center-markdown.js +527 -0
  62. package/dist/src/platform/learner.js +257 -0
  63. package/dist/src/playbook/engine.js +296 -11
  64. package/dist/src/playbook/mcp-recorder.js +204 -0
  65. package/dist/src/playbook/recorder.js +3 -2
  66. package/dist/src/playbook/runner.js +1 -1
  67. package/dist/src/playbook/store.js +139 -10
  68. package/dist/src/recovery/detectors.js +156 -0
  69. package/dist/src/recovery/engine.js +327 -0
  70. package/dist/src/recovery/index.js +20 -0
  71. package/dist/src/recovery/strategies.js +274 -0
  72. package/dist/src/recovery/types.js +20 -0
  73. package/dist/src/runtime/accessibility-adapter.js +55 -18
  74. package/dist/src/runtime/applescript-adapter.js +8 -2
  75. package/dist/src/runtime/cdp-chrome-adapter.js +1 -1
  76. package/dist/src/runtime/executor.js +23 -3
  77. package/dist/src/runtime/locator-cache.js +24 -2
  78. package/dist/src/runtime/service.js +59 -15
  79. package/dist/src/runtime/session-manager.js +4 -1
  80. package/dist/src/runtime/vision-adapter.js +2 -1
  81. package/dist/src/state/app-map-types.js +72 -0
  82. package/dist/src/state/app-map.js +1974 -0
  83. package/dist/src/state/entity-tracker.js +108 -0
  84. package/dist/src/state/fusion.js +96 -0
  85. package/dist/src/state/index.js +21 -0
  86. package/dist/src/state/ladder-generator.js +236 -0
  87. package/dist/src/state/persistence.js +156 -0
  88. package/dist/src/state/types.js +17 -0
  89. package/dist/src/state/world-model.js +1456 -0
  90. package/dist/src/util/atomic-write.js +19 -4
  91. package/dist/src/util/sanitize.js +146 -0
  92. package/dist-app-maps/com.figma.Desktop.json +959 -0
  93. package/dist-app-maps/com.hnc.Discord.json +1146 -0
  94. package/dist-app-maps/notion.id.json +2831 -0
  95. package/dist-playbooks/canva-screenhand-carousel.json +445 -0
  96. package/dist-playbooks/codex-desktop.json +76 -0
  97. package/dist-playbooks/competitor-research-stack.json +122 -0
  98. package/dist-playbooks/davinci-color-grade.json +153 -0
  99. package/dist-playbooks/davinci-edit-timeline.json +162 -0
  100. package/dist-playbooks/davinci-render.json +114 -0
  101. package/dist-playbooks/devto.json +52 -0
  102. package/dist-playbooks/discord.json +41 -0
  103. package/dist-playbooks/google-flow-create-project.json +59 -0
  104. package/dist-playbooks/google-flow-edit-image.json +90 -0
  105. package/dist-playbooks/google-flow-edit-video.json +90 -0
  106. package/dist-playbooks/google-flow-generate-image.json +68 -0
  107. package/dist-playbooks/google-flow-generate-video.json +191 -0
  108. package/dist-playbooks/google-flow-open-project.json +48 -0
  109. package/dist-playbooks/google-flow-open-scenebuilder.json +64 -0
  110. package/dist-playbooks/google-flow-search-assets.json +64 -0
  111. package/dist-playbooks/instagram.json +57 -0
  112. package/dist-playbooks/linkedin.json +52 -0
  113. package/dist-playbooks/n8n.json +43 -0
  114. package/dist-playbooks/reddit.json +52 -0
  115. package/dist-playbooks/threads.json +59 -0
  116. package/dist-playbooks/x-twitter.json +59 -0
  117. package/dist-playbooks/youtube.json +59 -0
  118. package/dist-references/canva.json +646 -0
  119. package/dist-references/codex-desktop.json +305 -0
  120. package/dist-references/davinci-resolve-keyboard.json +594 -0
  121. package/dist-references/davinci-resolve-menu-map.json +1139 -0
  122. package/dist-references/davinci-resolve-menus-batch1.json +116 -0
  123. package/dist-references/davinci-resolve-menus-batch2.json +372 -0
  124. package/dist-references/davinci-resolve-menus-batch3.json +330 -0
  125. package/dist-references/davinci-resolve-menus-batch4.json +297 -0
  126. package/dist-references/davinci-resolve-shortcuts.json +333 -0
  127. package/dist-references/devpost.json +186 -0
  128. package/dist-references/devto.json +317 -0
  129. package/dist-references/discord.json +549 -0
  130. package/dist-references/figma.json +1186 -0
  131. package/dist-references/finder.json +146 -0
  132. package/dist-references/google-ads-transparency.json +95 -0
  133. package/dist-references/google-flow.json +649 -0
  134. package/dist-references/instagram.json +341 -0
  135. package/dist-references/linkedin.json +324 -0
  136. package/dist-references/meta-ad-library.json +86 -0
  137. package/dist-references/n8n.json +387 -0
  138. package/dist-references/notes.json +27 -0
  139. package/dist-references/notion.json +163 -0
  140. package/dist-references/reddit.json +341 -0
  141. package/dist-references/threads.json +337 -0
  142. package/dist-references/x-twitter.json +403 -0
  143. package/dist-references/youtube.json +373 -0
  144. package/native/macos-bridge/Package.swift +22 -0
  145. package/native/macos-bridge/Sources/AccessibilityBridge.swift +482 -0
  146. package/native/macos-bridge/Sources/AppManagement.swift +339 -0
  147. package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +537 -0
  148. package/native/macos-bridge/Sources/ObserverBridge.swift +120 -0
  149. package/native/macos-bridge/Sources/StreamCapture.swift +136 -0
  150. package/native/macos-bridge/Sources/VisionBridge.swift +238 -0
  151. package/native/macos-bridge/Sources/main.swift +498 -0
  152. package/native/windows-bridge/AppManagement.cs +234 -0
  153. package/native/windows-bridge/InputBridge.cs +436 -0
  154. package/native/windows-bridge/Program.cs +270 -0
  155. package/native/windows-bridge/ScreenCapture.cs +453 -0
  156. package/native/windows-bridge/UIAutomationBridge.cs +571 -0
  157. package/native/windows-bridge/WindowsBridge.csproj +17 -0
  158. package/package.json +12 -1
  159. package/scripts/postinstall.cjs +127 -0
  160. package/dist/.audit-log.jsonl +0 -55
  161. package/dist/.screenhand/memory/.lock +0 -1
  162. package/dist/.screenhand/memory/actions.jsonl +0 -85
  163. package/dist/.screenhand/memory/errors.jsonl +0 -5
  164. package/dist/.screenhand/memory/errors.jsonl.bak +0 -4
  165. package/dist/.screenhand/memory/state.json +0 -35
  166. package/dist/.screenhand/memory/state.json.bak +0 -35
  167. package/dist/.screenhand/memory/strategies.jsonl +0 -12
  168. package/dist/agent/cli.js +0 -73
  169. package/dist/agent/loop.js +0 -258
  170. package/dist/config.js +0 -9
  171. package/dist/index.js +0 -56
  172. package/dist/logging/timeline-logger.js +0 -29
  173. package/dist/mcp/mcp-stdio-server.js +0 -448
  174. package/dist/mcp/server.js +0 -347
  175. package/dist/mcp-entry.js +0 -59
  176. package/dist/memory/recall.js +0 -160
  177. package/dist/memory/research.js +0 -98
  178. package/dist/memory/seeds.js +0 -89
  179. package/dist/memory/session.js +0 -161
  180. package/dist/memory/store.js +0 -391
  181. package/dist/memory/types.js +0 -4
  182. package/dist/monitor/codex-monitor.js +0 -377
  183. package/dist/monitor/task-queue.js +0 -84
  184. package/dist/monitor/types.js +0 -49
  185. package/dist/native/bridge-client.js +0 -174
  186. package/dist/native/macos-bridge-client.js +0 -5
  187. package/dist/npm-publish-helper.js +0 -117
  188. package/dist/npm-token-cdp.js +0 -113
  189. package/dist/npm-token-create.js +0 -135
  190. package/dist/npm-token-finish.js +0 -126
  191. package/dist/playbook/engine.js +0 -193
  192. package/dist/playbook/index.js +0 -4
  193. package/dist/playbook/recorder.js +0 -519
  194. package/dist/playbook/runner.js +0 -392
  195. package/dist/playbook/store.js +0 -166
  196. package/dist/playbook/types.js +0 -4
  197. package/dist/runtime/accessibility-adapter.js +0 -377
  198. package/dist/runtime/app-adapter.js +0 -48
  199. package/dist/runtime/applescript-adapter.js +0 -283
  200. package/dist/runtime/ax-role-map.js +0 -80
  201. package/dist/runtime/browser-adapter.js +0 -36
  202. package/dist/runtime/cdp-chrome-adapter.js +0 -505
  203. package/dist/runtime/composite-adapter.js +0 -205
  204. package/dist/runtime/executor.js +0 -250
  205. package/dist/runtime/locator-cache.js +0 -12
  206. package/dist/runtime/planning-loop.js +0 -47
  207. package/dist/runtime/service.js +0 -372
  208. package/dist/runtime/session-manager.js +0 -28
  209. package/dist/runtime/state-observer.js +0 -105
  210. package/dist/runtime/vision-adapter.js +0 -208
  211. package/dist/test-mcp-protocol.js +0 -138
  212. package/dist/types.js +0 -1
@@ -1,519 +0,0 @@
1
- /**
2
- * Playbook Recorder v2 — event-driven + screenshot-based
3
- *
4
- * Two capture modes running in parallel:
5
- *
6
- * 1. AX Event Stream (real-time, ~0ms latency)
7
- * - Listens to macOS accessibility notifications via the native bridge
8
- * - Captures: focus changes, value changes, window creates, app switches
9
- * - This is how we know WHAT the user clicked/typed
10
- *
11
- * 2. Periodic Screenshots (every 2s)
12
- * - Captures visual state of the screen
13
- * - At stop time, AI analyzes the screenshot sequence + AX events
14
- * - This is how we handle things AX events miss (Chrome DOM, visual changes)
15
- *
16
- * On stop:
17
- * - All AX events + screenshots sent to AI
18
- * - AI produces clean PlaybookStep[] from the combined data
19
- * - Saved to disk as a replayable playbook
20
- */
21
- import Anthropic from "@anthropic-ai/sdk";
22
- import fs from "node:fs";
23
- import { PlaybookStore } from "./store.js";
24
- const SCREENSHOT_INTERVAL_MS = 2500;
25
- const AX_POLL_INTERVAL_MS = 500;
26
- export class PlaybookRecorder {
27
- runtime;
28
- options;
29
- recording = false;
30
- events = [];
31
- screenshots = [];
32
- screenshotTimer = null;
33
- axPollTimer = null;
34
- sessionId = "";
35
- // Track previous AX state for diff detection
36
- prevFocused = "";
37
- prevActiveApp = "";
38
- prevWindowTitle = "";
39
- prevUrl = "";
40
- prevTextFields = new Map();
41
- store;
42
- ai;
43
- model;
44
- log;
45
- onEvent;
46
- captureScreenshots;
47
- constructor(runtime, playbookDir, options = {}) {
48
- this.runtime = runtime;
49
- this.options = options;
50
- this.store = new PlaybookStore(playbookDir);
51
- this.store.load();
52
- this.ai = new Anthropic();
53
- this.model = options.model ?? "claude-sonnet-4-20250514";
54
- this.log = options.onLog ?? ((msg) => console.error(`[Recorder] ${msg}`));
55
- this.onEvent = options.onEvent;
56
- this.captureScreenshots = options.screenshots !== false;
57
- }
58
- /**
59
- * Start recording user actions.
60
- */
61
- async start(sessionId) {
62
- if (this.recording) {
63
- this.log("Already recording");
64
- return;
65
- }
66
- this.recording = true;
67
- this.sessionId = sessionId;
68
- this.events = [];
69
- this.screenshots = [];
70
- this.prevFocused = "";
71
- this.prevActiveApp = "";
72
- this.prevWindowTitle = "";
73
- this.prevUrl = "";
74
- this.prevTextFields.clear();
75
- // Take initial state snapshot
76
- await this.captureState("initial");
77
- this.log("Recording started — watching for AX events + taking screenshots");
78
- // Start AX event polling (fast — every 500ms)
79
- this.axPollTimer = setInterval(async () => {
80
- if (!this.recording)
81
- return;
82
- try {
83
- await this.pollAXState();
84
- }
85
- catch { /* non-fatal */ }
86
- }, AX_POLL_INTERVAL_MS);
87
- // Start screenshot capture (slower — every 2.5s)
88
- if (this.captureScreenshots) {
89
- this.screenshotTimer = setInterval(async () => {
90
- if (!this.recording)
91
- return;
92
- try {
93
- await this.takeScreenshot();
94
- }
95
- catch { /* non-fatal */ }
96
- }, SCREENSHOT_INTERVAL_MS);
97
- }
98
- }
99
- /**
100
- * Stop recording and generate a playbook.
101
- */
102
- async stop(name, description, platform) {
103
- this.recording = false;
104
- this.clearTimers();
105
- // Take final screenshot
106
- if (this.captureScreenshots) {
107
- try {
108
- await this.takeScreenshot();
109
- }
110
- catch { /* ignore */ }
111
- }
112
- this.log(`Recording stopped. ${this.events.length} events, ${this.screenshots.length} screenshots captured.`);
113
- // Convert raw events + screenshots to playbook steps via AI
114
- const steps = await this.eventsToSteps(this.events, this.screenshots, name, platform);
115
- // Save as playbook
116
- const id = `rec_${platform}_${Date.now()}`;
117
- const playbook = {
118
- id,
119
- name,
120
- description,
121
- platform,
122
- steps,
123
- version: "1.0.0",
124
- tags: [
125
- platform,
126
- ...name.toLowerCase().split(/\W+/).filter((w) => w.length >= 3),
127
- ],
128
- successCount: 0,
129
- failCount: 0,
130
- lastRun: new Date().toISOString(),
131
- };
132
- this.store.save(playbook);
133
- this.log(`Playbook saved: ${id} (${steps.length} steps)`);
134
- return playbook;
135
- }
136
- /**
137
- * Cancel recording without saving.
138
- */
139
- cancel() {
140
- this.recording = false;
141
- this.clearTimers();
142
- this.events = [];
143
- this.screenshots = [];
144
- this.log("Recording cancelled");
145
- }
146
- get isRecording() {
147
- return this.recording;
148
- }
149
- get eventCount() {
150
- return this.events.length;
151
- }
152
- getEvents() {
153
- return [...this.events];
154
- }
155
- // ── AX State Polling (fast, event-driven feel) ──
156
- async pollAXState() {
157
- // 1. Check which app is active
158
- try {
159
- const apps = await this.runtime.appList(this.sessionId);
160
- if (apps.ok) {
161
- const active = apps.data.find((a) => a.isActive);
162
- if (active && active.bundleId !== this.prevActiveApp) {
163
- if (this.prevActiveApp) {
164
- this.addEvent({
165
- type: "app_activated",
166
- details: {
167
- from: this.prevActiveApp,
168
- to: active.bundleId,
169
- appName: active.name,
170
- },
171
- });
172
- }
173
- this.prevActiveApp = active.bundleId;
174
- }
175
- }
176
- }
177
- catch { /* ignore */ }
178
- // 2. Get accessibility tree — find focused element and text field values
179
- try {
180
- const tree = await this.runtime.elementTree({ sessionId: this.sessionId, maxDepth: 4 });
181
- if (!tree.ok)
182
- return;
183
- // Detect focus change
184
- const focused = findFocused(tree.data);
185
- if (focused && focused !== this.prevFocused) {
186
- this.addEvent({
187
- type: "focus_changed",
188
- details: {
189
- from: this.prevFocused,
190
- to: focused,
191
- element: describeFocused(tree.data),
192
- },
193
- });
194
- this.prevFocused = focused;
195
- }
196
- // Detect text field value changes (typing detection)
197
- const currentFields = collectTextFields(tree.data);
198
- for (const [fieldId, value] of currentFields) {
199
- const prev = this.prevTextFields.get(fieldId);
200
- if (prev !== undefined && prev !== value) {
201
- this.addEvent({
202
- type: "value_changed",
203
- details: {
204
- field: fieldId,
205
- from: prev.slice(-50),
206
- to: value.slice(-50),
207
- typed: value.slice(prev.length),
208
- },
209
- });
210
- }
211
- }
212
- this.prevTextFields = currentFields;
213
- // Detect window title change (navigation in browser)
214
- const title = tree.data.title ?? "";
215
- if (title && title !== this.prevWindowTitle) {
216
- if (this.prevWindowTitle) {
217
- this.addEvent({
218
- type: "title_changed",
219
- details: { from: this.prevWindowTitle, to: title },
220
- });
221
- }
222
- this.prevWindowTitle = title;
223
- }
224
- }
225
- catch { /* ignore */ }
226
- }
227
- // ── Screenshot Capture ──
228
- async takeScreenshot() {
229
- try {
230
- const result = await this.runtime.screenshot({ sessionId: this.sessionId });
231
- if (result.ok) {
232
- const record = {
233
- path: result.data.path,
234
- timestamp: new Date().toISOString(),
235
- index: this.screenshots.length,
236
- };
237
- this.screenshots.push(record);
238
- }
239
- }
240
- catch { /* non-fatal */ }
241
- }
242
- // ── State Capture ──
243
- async captureState(label) {
244
- // Capture initial app state
245
- try {
246
- const apps = await this.runtime.appList(this.sessionId);
247
- if (apps.ok) {
248
- const active = apps.data.find((a) => a.isActive);
249
- if (active) {
250
- this.prevActiveApp = active.bundleId;
251
- this.addEvent({
252
- type: "app_activated",
253
- details: { to: active.bundleId, appName: active.name, label },
254
- });
255
- }
256
- }
257
- }
258
- catch { /* ignore */ }
259
- // Capture initial tree state
260
- try {
261
- const tree = await this.runtime.elementTree({ sessionId: this.sessionId, maxDepth: 4 });
262
- if (tree.ok) {
263
- this.prevFocused = findFocused(tree.data);
264
- this.prevWindowTitle = tree.data.title ?? "";
265
- this.prevTextFields = collectTextFields(tree.data);
266
- }
267
- }
268
- catch { /* ignore */ }
269
- // Take initial screenshot
270
- if (this.captureScreenshots) {
271
- await this.takeScreenshot();
272
- }
273
- }
274
- // ── Event Management ──
275
- addEvent(partial) {
276
- const event = {
277
- ...partial,
278
- timestamp: new Date().toISOString(),
279
- };
280
- this.events.push(event);
281
- this.log(`Event: ${event.type} — ${JSON.stringify(event.details).slice(0, 120)}`);
282
- if (this.onEvent)
283
- this.onEvent(event);
284
- }
285
- clearTimers() {
286
- if (this.axPollTimer) {
287
- clearInterval(this.axPollTimer);
288
- this.axPollTimer = null;
289
- }
290
- if (this.screenshotTimer) {
291
- clearInterval(this.screenshotTimer);
292
- this.screenshotTimer = null;
293
- }
294
- }
295
- // ── AI Conversion ──
296
- /**
297
- * Convert raw events + screenshots into clean playbook steps.
298
- * Sends first + last screenshot as images so AI can see what happened visually.
299
- */
300
- async eventsToSteps(events, screenshots, taskName, platform) {
301
- if (events.length === 0)
302
- return [];
303
- // Build the content array — text + optional images
304
- const content = [];
305
- // Add text prompt
306
- content.push({
307
- type: "text",
308
- text: `Convert these recorded user events into a clean, replayable automation playbook.
309
-
310
- Task: ${taskName}
311
- Platform: ${platform}
312
-
313
- Raw events recorded (in chronological order):
314
- ${events.map((e, i) => `${i + 1}. [${e.timestamp}] ${e.type}: ${JSON.stringify(e.details)}`).join("\n")}
315
-
316
- ${screenshots.length > 0 ? `\n${screenshots.length} screenshots were taken during recording. The first and last are attached below for visual context.\n` : ""}
317
- Convert these into a JSON array of playbook steps. Each step:
318
- {
319
- "action": "navigate" | "press" | "type_into" | "key_combo" | "scroll" | "wait" | "screenshot",
320
- "target": "CSS selector, text label, or {\"selector\": \"...\"}",
321
- "url": "for navigate",
322
- "text": "for type_into",
323
- "keys": ["for", "key_combo"],
324
- "ms": 1000,
325
- "description": "human-readable description of what this step does",
326
- "verify": "optional CSS selector or text to verify success",
327
- "optional": false
328
- }
329
-
330
- Rules:
331
- - Infer the user's INTENT from events, not just mirror them mechanically
332
- - focus_changed events usually mean a click — convert to "press" with the element label
333
- - value_changed events mean typing — convert to "type_into" with the field and text
334
- - title_changed often means navigation — add appropriate navigate or wait steps
335
- - app_activated means switching apps — use app_focus or app_launch
336
- - Use stable selectors: data-testid, aria-label, role+name over fragile CSS
337
- - Merge rapid consecutive events into single meaningful steps
338
- - Add wait steps (500-2000ms) after navigation/page loads
339
- - Add verify conditions for critical steps (modal opened, page loaded, etc.)
340
- - Skip noise (duplicate events, layout thrash, irrelevant focus changes)
341
-
342
- Respond with ONLY a valid JSON array, no markdown fences, no explanation.`,
343
- });
344
- // Attach first and last screenshots as images (if available)
345
- if (screenshots.length > 0) {
346
- const toAttach = [screenshots[0]];
347
- if (screenshots.length > 1) {
348
- toAttach.push(screenshots[screenshots.length - 1]);
349
- }
350
- for (const shot of toAttach) {
351
- try {
352
- const imageData = fs.readFileSync(shot.path);
353
- const base64 = imageData.toString("base64");
354
- content.push({
355
- type: "image",
356
- source: {
357
- type: "base64",
358
- media_type: "image/png",
359
- data: base64,
360
- },
361
- });
362
- content.push({
363
- type: "text",
364
- text: `Screenshot ${shot.index + 1} taken at ${shot.timestamp}`,
365
- });
366
- }
367
- catch {
368
- // Skip unreadable screenshots
369
- }
370
- }
371
- }
372
- try {
373
- const resp = await this.ai.messages.create({
374
- model: this.model,
375
- max_tokens: 4096,
376
- messages: [{ role: "user", content }],
377
- });
378
- const text = resp.content[0]?.type === "text" ? resp.content[0].text : "";
379
- const jsonMatch = text.match(/\[[\s\S]*\]/);
380
- if (jsonMatch) {
381
- const steps = JSON.parse(jsonMatch[0]);
382
- this.log(`AI generated ${steps.length} playbook steps`);
383
- return steps;
384
- }
385
- }
386
- catch (err) {
387
- this.log(`AI conversion failed: ${err instanceof Error ? err.message : String(err)}`);
388
- }
389
- // Fallback without AI
390
- return this.eventsToStepsFallback(events);
391
- }
392
- eventsToStepsFallback(events) {
393
- const steps = [];
394
- for (const event of events) {
395
- switch (event.type) {
396
- case "app_activated":
397
- if (event.details.label === "initial")
398
- break;
399
- steps.push({
400
- action: "wait",
401
- ms: 500,
402
- description: `Switched to ${event.details.appName ?? event.details.to}`,
403
- });
404
- break;
405
- case "focus_changed": {
406
- const target = String(event.details.to ?? "");
407
- if (!target || target === this.prevFocused)
408
- break;
409
- steps.push({
410
- action: "press",
411
- target,
412
- description: `Click on ${event.details.element ?? target}`,
413
- });
414
- break;
415
- }
416
- case "value_changed": {
417
- const typed = String(event.details.typed ?? "");
418
- const field = String(event.details.field ?? "");
419
- if (typed) {
420
- steps.push({
421
- action: "type_into",
422
- target: field,
423
- text: typed,
424
- description: `Type "${typed.slice(0, 30)}" into ${field}`,
425
- });
426
- }
427
- break;
428
- }
429
- case "title_changed":
430
- steps.push({
431
- action: "wait",
432
- ms: 1500,
433
- description: `Page changed to: ${event.details.to}`,
434
- });
435
- break;
436
- case "url_changed":
437
- steps.push({
438
- action: "navigate",
439
- url: event.details.to,
440
- description: `Navigate to ${event.details.to}`,
441
- });
442
- break;
443
- case "menu_opened":
444
- case "dialog_appeared":
445
- steps.push({
446
- action: "wait",
447
- ms: 1000,
448
- description: `${event.type}: ${JSON.stringify(event.details).slice(0, 50)}`,
449
- });
450
- break;
451
- }
452
- }
453
- return steps;
454
- }
455
- }
456
- // ── AX Tree Helpers ──
457
/**
 * Find the focused element and return a stable identifier.
 *
 * Walks the tree depth-first and stops at the first node whose `focused`
 * flag is exactly `true`.
 *
 * @param {object} node - Accessibility tree node (`role`, `title`,
 *   `description`, `identifier`, `focused`, `children`).
 * @param {number} [depth=0] - Current recursion depth; nodes deeper than 6 are ignored.
 * @returns {string} `"<role without AX prefix>:<label>"`, or `""` when no
 *   focused node is found within the depth limit.
 */
function findFocused(node, depth = 0) {
    if (!node || depth > 6) {
        return "";
    }
    if (node.focused === true) {
        const role = node.role?.replace(/^AX/, "") ?? "";
        // `||` rather than `??`: an empty title must fall through to the
        // description/identifier, otherwise distinct elements collapse to "Role:".
        const label = node.title || node.description || node.identifier || "";
        return `${role}:${label}`;
    }
    for (const child of node.children ?? []) {
        const found = findFocused(child, depth + 1);
        if (found) {
            return found;
        }
    }
    return "";
}
475
/**
 * Get a human-readable description of the focused element + context.
 *
 * @param {object} node - Accessibility tree node (`role`, `title`,
 *   `description`, `value`, `position`, `focused`, `children`).
 * @param {number} [depth=0] - Current recursion depth; nodes deeper than 6 are ignored.
 * @returns {string} Space-separated summary such as
 *   `TextField "Name" val="Ada" @10,20`, or `""` when nothing is focused.
 */
function describeFocused(node, depth = 0) {
    if (!node || depth > 6) {
        return "";
    }
    if (node.focused === true) {
        const parts = [node.role?.replace(/^AX/, "")];
        if (node.title) {
            parts.push(`"${node.title}"`);
        }
        if (node.description) {
            parts.push(`desc="${node.description}"`);
        }
        // Values are not always strings (a slider or checkbox may report a
        // number); coerce before slicing, and keep a legitimate 0.
        if (node.value != null && node.value !== "") {
            parts.push(`val="${String(node.value).slice(0, 30)}"`);
        }
        if (node.position) {
            parts.push(`@${Math.round(node.position.x)},${Math.round(node.position.y)}`);
        }
        return parts.filter(Boolean).join(" ");
    }
    for (const child of node.children ?? []) {
        const found = describeFocused(child, depth + 1);
        if (found) {
            return found;
        }
    }
    return "";
}
500
/** Roles (AX prefix removed, lower-cased) that are treated as text inputs. */
const TEXT_FIELD_ROLES = new Set(["textfield", "textarea", "combobox", "searchfield"]);

/**
 * Collect all text field values from the tree for typing detection.
 *
 * Each field is keyed by its `identifier`, else `title`, else `description`.
 * Fields with none of these get a positional key: `field_<depth>` for the
 * first one at that depth, then `field_<depth>_2`, `field_<depth>_3`, … so
 * that several unlabeled fields at the same depth no longer overwrite each
 * other. Values are stored as strings.
 *
 * @param {object} node - Accessibility tree node (`role`, `value`,
 *   `identifier`, `title`, `description`, `children`).
 * @param {number} [depth=0] - Depth of `node`; nodes deeper than 5 are ignored.
 * @returns {Map<string, string>} Field id → current value, in traversal order.
 */
function collectTextFields(node, depth = 0) {
    const fields = new Map();
    // Single shared accumulator instead of building and merging one Map per level.
    const visit = (current, level) => {
        if (!current || level > 5) {
            return;
        }
        const role = current.role?.replace(/^AX/, "").toLowerCase() ?? "";
        if (TEXT_FIELD_ROLES.has(role) && current.value != null) {
            let id = current.identifier || current.title || current.description;
            if (!id) {
                id = `field_${level}`;
                for (let n = 2; fields.has(id); n += 1) {
                    id = `field_${level}_${n}`;
                }
            }
            fields.set(id, String(current.value));
        }
        for (const child of current.children ?? []) {
            visit(child, level + 1);
        }
    };
    visit(node, depth);
    return fields;
}