screenhand 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212)
  1. package/README.md +165 -446
  2. package/bin/darwin-arm64/macos-bridge +0 -0
  3. package/dist/mcp-desktop.js +3549 -404
  4. package/dist/scripts/export-help-center.js +112 -0
  5. package/dist/scripts/marketing-loop.js +117 -0
  6. package/dist/scripts/observer-daemon.js +288 -0
  7. package/dist/scripts/orchestrator-daemon.js +399 -0
  8. package/dist/scripts/threads-campaign.js +208 -0
  9. package/dist/src/community/fetcher.js +109 -0
  10. package/dist/src/community/index.js +6 -0
  11. package/dist/src/community/publisher.js +191 -0
  12. package/dist/src/community/remote-api.js +121 -0
  13. package/dist/src/community/types.js +3 -0
  14. package/dist/src/community/validator.js +95 -0
  15. package/dist/src/context-tracker.js +489 -0
  16. package/dist/src/ingestion/coverage-auditor.js +233 -0
  17. package/dist/src/ingestion/doc-parser.js +164 -0
  18. package/dist/src/ingestion/index.js +8 -0
  19. package/dist/src/ingestion/menu-scanner.js +152 -0
  20. package/dist/src/ingestion/reference-merger.js +186 -0
  21. package/dist/src/ingestion/shortcut-extractor.js +180 -0
  22. package/dist/src/ingestion/tutorial-extractor.js +170 -0
  23. package/dist/src/ingestion/types.js +3 -0
  24. package/dist/src/jobs/manager.js +82 -14
  25. package/dist/src/jobs/runner.js +138 -15
  26. package/dist/src/learning/engine.js +356 -0
  27. package/dist/src/learning/index.js +9 -0
  28. package/dist/src/learning/locator-policy.js +120 -0
  29. package/dist/src/learning/pattern-policy.js +89 -0
  30. package/dist/src/learning/recovery-policy.js +116 -0
  31. package/dist/src/learning/sensor-policy.js +115 -0
  32. package/dist/src/learning/timing-model.js +204 -0
  33. package/dist/src/learning/topology-policy.js +90 -0
  34. package/dist/src/learning/types.js +9 -0
  35. package/dist/src/logging/timeline-logger.js +4 -1
  36. package/dist/src/memory/playbook-seeds.js +200 -0
  37. package/dist/src/memory/recall.js +60 -8
  38. package/dist/src/memory/service.js +30 -5
  39. package/dist/src/memory/store.js +34 -5
  40. package/dist/src/native/bridge-client.js +253 -31
  41. package/dist/src/observer/state.js +199 -0
  42. package/dist/src/observer/types.js +43 -0
  43. package/dist/src/orchestrator/state.js +68 -0
  44. package/dist/src/orchestrator/types.js +22 -0
  45. package/dist/src/perception/ax-source.js +162 -0
  46. package/dist/src/perception/cdp-source.js +162 -0
  47. package/dist/src/perception/coordinator.js +771 -0
  48. package/dist/src/perception/frame-differ.js +287 -0
  49. package/dist/src/perception/index.js +22 -0
  50. package/dist/src/perception/manager.js +199 -0
  51. package/dist/src/perception/types.js +47 -0
  52. package/dist/src/perception/vision-source.js +399 -0
  53. package/dist/src/planner/deterministic.js +298 -0
  54. package/dist/src/planner/executor.js +870 -0
  55. package/dist/src/planner/goal-store.js +92 -0
  56. package/dist/src/planner/index.js +21 -0
  57. package/dist/src/planner/planner.js +520 -0
  58. package/dist/src/planner/tool-registry.js +71 -0
  59. package/dist/src/planner/types.js +22 -0
  60. package/dist/src/platform/explorer.js +213 -0
  61. package/dist/src/platform/help-center-markdown.js +527 -0
  62. package/dist/src/platform/learner.js +257 -0
  63. package/dist/src/playbook/engine.js +296 -11
  64. package/dist/src/playbook/mcp-recorder.js +204 -0
  65. package/dist/src/playbook/recorder.js +3 -2
  66. package/dist/src/playbook/runner.js +1 -1
  67. package/dist/src/playbook/store.js +139 -10
  68. package/dist/src/recovery/detectors.js +156 -0
  69. package/dist/src/recovery/engine.js +327 -0
  70. package/dist/src/recovery/index.js +20 -0
  71. package/dist/src/recovery/strategies.js +274 -0
  72. package/dist/src/recovery/types.js +20 -0
  73. package/dist/src/runtime/accessibility-adapter.js +55 -18
  74. package/dist/src/runtime/applescript-adapter.js +8 -2
  75. package/dist/src/runtime/cdp-chrome-adapter.js +1 -1
  76. package/dist/src/runtime/executor.js +23 -3
  77. package/dist/src/runtime/locator-cache.js +24 -2
  78. package/dist/src/runtime/service.js +59 -15
  79. package/dist/src/runtime/session-manager.js +4 -1
  80. package/dist/src/runtime/vision-adapter.js +2 -1
  81. package/dist/src/state/app-map-types.js +72 -0
  82. package/dist/src/state/app-map.js +1974 -0
  83. package/dist/src/state/entity-tracker.js +108 -0
  84. package/dist/src/state/fusion.js +96 -0
  85. package/dist/src/state/index.js +21 -0
  86. package/dist/src/state/ladder-generator.js +236 -0
  87. package/dist/src/state/persistence.js +156 -0
  88. package/dist/src/state/types.js +17 -0
  89. package/dist/src/state/world-model.js +1456 -0
  90. package/dist/src/util/atomic-write.js +19 -4
  91. package/dist/src/util/sanitize.js +146 -0
  92. package/dist-app-maps/com.figma.Desktop.json +959 -0
  93. package/dist-app-maps/com.hnc.Discord.json +1146 -0
  94. package/dist-app-maps/notion.id.json +2831 -0
  95. package/dist-playbooks/canva-screenhand-carousel.json +445 -0
  96. package/dist-playbooks/codex-desktop.json +76 -0
  97. package/dist-playbooks/competitor-research-stack.json +122 -0
  98. package/dist-playbooks/davinci-color-grade.json +153 -0
  99. package/dist-playbooks/davinci-edit-timeline.json +162 -0
  100. package/dist-playbooks/davinci-render.json +114 -0
  101. package/dist-playbooks/devto.json +52 -0
  102. package/dist-playbooks/discord.json +41 -0
  103. package/dist-playbooks/google-flow-create-project.json +59 -0
  104. package/dist-playbooks/google-flow-edit-image.json +90 -0
  105. package/dist-playbooks/google-flow-edit-video.json +90 -0
  106. package/dist-playbooks/google-flow-generate-image.json +68 -0
  107. package/dist-playbooks/google-flow-generate-video.json +191 -0
  108. package/dist-playbooks/google-flow-open-project.json +48 -0
  109. package/dist-playbooks/google-flow-open-scenebuilder.json +64 -0
  110. package/dist-playbooks/google-flow-search-assets.json +64 -0
  111. package/dist-playbooks/instagram.json +57 -0
  112. package/dist-playbooks/linkedin.json +52 -0
  113. package/dist-playbooks/n8n.json +43 -0
  114. package/dist-playbooks/reddit.json +52 -0
  115. package/dist-playbooks/threads.json +59 -0
  116. package/dist-playbooks/x-twitter.json +59 -0
  117. package/dist-playbooks/youtube.json +59 -0
  118. package/dist-references/canva.json +646 -0
  119. package/dist-references/codex-desktop.json +305 -0
  120. package/dist-references/davinci-resolve-keyboard.json +594 -0
  121. package/dist-references/davinci-resolve-menu-map.json +1139 -0
  122. package/dist-references/davinci-resolve-menus-batch1.json +116 -0
  123. package/dist-references/davinci-resolve-menus-batch2.json +372 -0
  124. package/dist-references/davinci-resolve-menus-batch3.json +330 -0
  125. package/dist-references/davinci-resolve-menus-batch4.json +297 -0
  126. package/dist-references/davinci-resolve-shortcuts.json +333 -0
  127. package/dist-references/devpost.json +186 -0
  128. package/dist-references/devto.json +317 -0
  129. package/dist-references/discord.json +549 -0
  130. package/dist-references/figma.json +1186 -0
  131. package/dist-references/finder.json +146 -0
  132. package/dist-references/google-ads-transparency.json +95 -0
  133. package/dist-references/google-flow.json +649 -0
  134. package/dist-references/instagram.json +341 -0
  135. package/dist-references/linkedin.json +324 -0
  136. package/dist-references/meta-ad-library.json +86 -0
  137. package/dist-references/n8n.json +387 -0
  138. package/dist-references/notes.json +27 -0
  139. package/dist-references/notion.json +163 -0
  140. package/dist-references/reddit.json +341 -0
  141. package/dist-references/threads.json +337 -0
  142. package/dist-references/x-twitter.json +403 -0
  143. package/dist-references/youtube.json +373 -0
  144. package/native/macos-bridge/Package.swift +22 -0
  145. package/native/macos-bridge/Sources/AccessibilityBridge.swift +482 -0
  146. package/native/macos-bridge/Sources/AppManagement.swift +339 -0
  147. package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +537 -0
  148. package/native/macos-bridge/Sources/ObserverBridge.swift +120 -0
  149. package/native/macos-bridge/Sources/StreamCapture.swift +136 -0
  150. package/native/macos-bridge/Sources/VisionBridge.swift +238 -0
  151. package/native/macos-bridge/Sources/main.swift +498 -0
  152. package/native/windows-bridge/AppManagement.cs +234 -0
  153. package/native/windows-bridge/InputBridge.cs +436 -0
  154. package/native/windows-bridge/Program.cs +270 -0
  155. package/native/windows-bridge/ScreenCapture.cs +453 -0
  156. package/native/windows-bridge/UIAutomationBridge.cs +571 -0
  157. package/native/windows-bridge/WindowsBridge.csproj +17 -0
  158. package/package.json +12 -1
  159. package/scripts/postinstall.cjs +127 -0
  160. package/dist/.audit-log.jsonl +0 -55
  161. package/dist/.screenhand/memory/.lock +0 -1
  162. package/dist/.screenhand/memory/actions.jsonl +0 -85
  163. package/dist/.screenhand/memory/errors.jsonl +0 -5
  164. package/dist/.screenhand/memory/errors.jsonl.bak +0 -4
  165. package/dist/.screenhand/memory/state.json +0 -35
  166. package/dist/.screenhand/memory/state.json.bak +0 -35
  167. package/dist/.screenhand/memory/strategies.jsonl +0 -12
  168. package/dist/agent/cli.js +0 -73
  169. package/dist/agent/loop.js +0 -258
  170. package/dist/config.js +0 -9
  171. package/dist/index.js +0 -56
  172. package/dist/logging/timeline-logger.js +0 -29
  173. package/dist/mcp/mcp-stdio-server.js +0 -448
  174. package/dist/mcp/server.js +0 -347
  175. package/dist/mcp-entry.js +0 -59
  176. package/dist/memory/recall.js +0 -160
  177. package/dist/memory/research.js +0 -98
  178. package/dist/memory/seeds.js +0 -89
  179. package/dist/memory/session.js +0 -161
  180. package/dist/memory/store.js +0 -391
  181. package/dist/memory/types.js +0 -4
  182. package/dist/monitor/codex-monitor.js +0 -377
  183. package/dist/monitor/task-queue.js +0 -84
  184. package/dist/monitor/types.js +0 -49
  185. package/dist/native/bridge-client.js +0 -174
  186. package/dist/native/macos-bridge-client.js +0 -5
  187. package/dist/npm-publish-helper.js +0 -117
  188. package/dist/npm-token-cdp.js +0 -113
  189. package/dist/npm-token-create.js +0 -135
  190. package/dist/npm-token-finish.js +0 -126
  191. package/dist/playbook/engine.js +0 -193
  192. package/dist/playbook/index.js +0 -4
  193. package/dist/playbook/recorder.js +0 -519
  194. package/dist/playbook/runner.js +0 -392
  195. package/dist/playbook/store.js +0 -166
  196. package/dist/playbook/types.js +0 -4
  197. package/dist/runtime/accessibility-adapter.js +0 -377
  198. package/dist/runtime/app-adapter.js +0 -48
  199. package/dist/runtime/applescript-adapter.js +0 -283
  200. package/dist/runtime/ax-role-map.js +0 -80
  201. package/dist/runtime/browser-adapter.js +0 -36
  202. package/dist/runtime/cdp-chrome-adapter.js +0 -505
  203. package/dist/runtime/composite-adapter.js +0 -205
  204. package/dist/runtime/executor.js +0 -250
  205. package/dist/runtime/locator-cache.js +0 -12
  206. package/dist/runtime/planning-loop.js +0 -47
  207. package/dist/runtime/service.js +0 -372
  208. package/dist/runtime/session-manager.js +0 -28
  209. package/dist/runtime/state-observer.js +0 -105
  210. package/dist/runtime/vision-adapter.js +0 -208
  211. package/dist/test-mcp-protocol.js +0 -138
  212. package/dist/types.js +0 -1
@@ -34,10 +34,13 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"
34
34
  import { z } from "zod";
35
35
  import path from "node:path";
36
36
  import { fileURLToPath } from "node:url";
37
- import { execSync } from "node:child_process";
37
+ import { execSync, exec } from "node:child_process";
38
+ import { promisify } from "node:util";
39
+ const execAsync = promisify(exec);
38
40
  import fs from "node:fs";
39
41
  import { BridgeClient } from "./src/native/bridge-client.js";
40
42
  import { writeFileAtomicSync, readJsonWithRecovery } from "./src/util/atomic-write.js";
43
+ import { sanitizeUrl, redactSensitiveLabel, redactUsername, redactPII } from "./src/util/sanitize.js";
41
44
  import { MemoryService } from "./src/memory/service.js";
42
45
  import { backgroundResearch } from "./src/memory/research.js";
43
46
  import { SessionSupervisor, LeaseManager } from "./src/supervisor/supervisor.js";
@@ -46,11 +49,30 @@ import { JobRunner } from "./src/jobs/runner.js";
46
49
  import { getWorkerLiveStatus, getWorkerDaemonPid, WORKER_LOG_FILE } from "./src/jobs/worker.js";
47
50
  import { PlaybookEngine } from "./src/playbook/engine.js";
48
51
  import { PlaybookStore } from "./src/playbook/store.js";
52
+ import { ContextTracker } from "./src/context-tracker.js";
53
+ import { McpPlaybookRecorder } from "./src/playbook/mcp-recorder.js";
54
+ import { WorldModel } from "./src/state/index.js";
55
+ import { PerceptionManager } from "./src/perception/index.js";
56
+ import { Planner, PlanExecutor, GoalStore, ToolRegistry } from "./src/planner/index.js";
57
+ import { RecoveryEngine } from "./src/recovery/index.js";
58
+ import { LearningEngine } from "./src/learning/index.js";
59
+ import { discoverWebElements, testWebElement, compileReference, saveExploreResult, discoverNativeElements } from "./src/platform/explorer.js";
60
+ import { buildDocUrls, crawlPage, compileLearnResult, saveLearnResult } from "./src/platform/learner.js";
49
61
  import { AccessibilityAdapter } from "./src/runtime/accessibility-adapter.js";
50
62
  import { AutomationRuntimeService } from "./src/runtime/service.js";
63
+ import { LocatorCache } from "./src/runtime/locator-cache.js";
51
64
  import { TimelineLogger } from "./src/logging/timeline-logger.js";
65
+ import { readObserverState, getObserverDaemonPid, submitObserverCommand, getObserverCommand } from "./src/observer/state.js";
66
+ import { OBSERVER_LOG_FILE } from "./src/observer/types.js";
52
67
  import { spawn } from "node:child_process";
53
68
  import os from "node:os";
69
+ import { MenuScanner } from "./src/ingestion/menu-scanner.js";
70
+ import { DocParser } from "./src/ingestion/doc-parser.js";
71
+ import { TutorialExtractor } from "./src/ingestion/tutorial-extractor.js";
72
+ import { CoverageAuditor } from "./src/ingestion/coverage-auditor.js";
73
+ import { ReferenceMerger } from "./src/ingestion/reference-merger.js";
74
+ import { PlaybookPublisher } from "./src/community/publisher.js";
75
+ import { PlaybookFetcher } from "./src/community/fetcher.js";
54
76
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
55
77
  // ── Audit logging for dangerous tools ──
56
78
  const AUDIT_LOG_PATH = path.resolve(__dirname, ".audit-log.jsonl");
@@ -73,18 +95,155 @@ const bridgePath = process.platform === "win32"
73
95
  : path.resolve(__dirname, "native/macos-bridge/.build/release/macos-bridge");
74
96
  const bridge = new BridgeClient(bridgePath);
75
97
  let bridgeReady = false;
98
+ // Focus mutex — only one focus() call runs at a time since only one app can be frontmost.
99
+ // Prevents N concurrent focus calls from generating N*5 bridge calls that overwhelm the bridge.
100
+ let focusLock = Promise.resolve();
76
101
  async function ensureBridge() {
77
102
  if (!bridgeReady) {
78
103
  await bridge.start();
79
104
  bridgeReady = true;
105
+ perceptionManager.createSources(bridge);
80
106
  }
81
107
  }
108
+ /** Window titles that indicate auxiliary/utility windows — deprioritize these */
109
+ const AUXILIARY_WINDOW_TITLES = new Set([
110
+ "Privacy Report", "Downloads", "Extensions", "Bookmarks",
111
+ "History", "Preferences", "Settings", "Web Inspector",
112
+ ]);
113
/**
 * L3-04 fix: report whether the given PID shows up in any of the bridge's views.
 *
 * Three bridge queries are tried in order — "app.list", "app.frontmost", then
 * "app.windows" — and the first one that mentions the PID wins. The fallbacks exist
 * because some Electron apps (Slack, Discord) don't appear in
 * NSWorkspace.runningApplications but are visible via CGWindowList and frontmost checks.
 * A query that throws counts as "no match" and the next query is tried.
 *
 * @param {number} pid - Process ID to look for.
 * @returns {Promise<boolean>} true as soon as one query reports the PID, otherwise false.
 */
async function isPidRunning(pid) {
    const probes = [
        // Primary: running application list
        async () => {
            const runningApps = await bridge.call("app.list", {});
            return runningApps?.some((app) => app.pid === pid);
        },
        // Fallback 1: the frontmost application
        async () => {
            const frontmost = await bridge.call("app.frontmost", {});
            return frontmost.pid === pid;
        },
        // Fallback 2: window list (entries may carry either `pid` or `ownerPid`)
        async () => {
            const windows = await bridge.call("app.windows");
            return windows?.some((win) => (win.pid || win.ownerPid) === pid);
        },
    ];
    for (const probe of probes) {
        try {
            if (await probe()) {
                return true;
            }
        }
        catch { /* ignore — fall through to the next probe */ }
    }
    return false;
}
141
/**
 * Resolve the native windowId for a given PID via the bridge.
 *
 * First asks "window.list" (AX-enriched: entries carry `focused` / `isMain`), drops
 * auxiliary windows (titles in AUXILIARY_WINDOW_TITLES, or subrole "AXFloatingWindow")
 * unless that would leave nothing, and picks focused > isMain > first.
 * If that yields no id — or the call throws — it falls back to the CG-based
 * "app.windows" list, again preferring a non-auxiliary title.
 *
 * @param {number} pid - Owning process ID.
 * @returns {Promise<number|undefined>} The windowId, or undefined when none was found.
 */
async function resolveWindowId(pid) {
    // Preferred path: AX-enriched window.list
    try {
        const axWindows = await bridge.call("window.list", {});
        const owned = axWindows?.filter((w) => w.pid === pid);
        if (owned && owned.length > 0) {
            // Filter out auxiliary windows (Privacy Report, Downloads, etc.)
            const primary = owned.filter((w) => !AUXILIARY_WINDOW_TITLES.has(w.title) && w.subrole !== "AXFloatingWindow");
            const pool = primary.length > 0 ? primary : owned;
            // Ranked preference: focused, then main, then simply the first one
            const ranked = [
                pool.find((w) => w.focused),
                pool.find((w) => w.isMain),
                pool[0],
            ];
            for (const candidate of ranked) {
                if (candidate?.windowId != null) {
                    return candidate.windowId;
                }
            }
        }
    }
    catch { /* fall through */ }
    // Fallback: CG-based app.windows (no focused/isMain, may crash on GPU-heavy windows)
    try {
        const cgWindows = await bridge.call("app.windows");
        const owned = cgWindows?.filter((w) => w.pid === pid);
        if (owned && owned.length > 0) {
            // Still filter auxiliary windows even in fallback path
            const primary = owned.filter((w) => !AUXILIARY_WINDOW_TITLES.has(w.title));
            const chosen = primary.length > 0 ? primary[0] : owned[0];
            if (chosen?.windowId != null) {
                return chosen.windowId;
            }
        }
    }
    catch { /* ignore */ }
    return undefined;
}
179
/**
 * Check if the focused app is a browser — used to enable safeCLI capture mode.
 *
 * Reads the focused app's bundleId from the world model (empty string when there is
 * none) and matches it against the Safari / Chrome / Edge / Firefox bundle ids.
 *
 * @returns {boolean} true when the focused bundleId is one of the known browsers.
 */
function isBrowserApp() {
    const { focusedApp } = worldModel.getState();
    const focusedBundleId = focusedApp?.bundleId ?? "";
    const isBrowser = /^com\.(apple\.Safari|google\.Chrome|microsoft\.edgemac)$|^org\.mozilla\.firefox$/.test(focusedBundleId);
    return isBrowser;
}
184
/**
 * Parse the text produced by the Safari enricher AppleScript.
 *
 * Expected layout (one record per line):
 *   <title>|<url>
 *   ---
 *   <index>|<title>|<url>|<true|false>
 *
 * Page titles frequently contain "|" themselves, so fields are anchored on the
 * OUTER delimiters: the URL is the last field of the first line, and in tab lines the
 * index is the first field while url / active flag are the last two. Everything in
 * between is the title.
 * NOTE(review): assumes URLs do not contain a raw "|" — confirm against Safari output.
 *
 * @param {string} raw - Trimmed stdout of the osascript call.
 * @returns {{ title: string, url: string | undefined, tabs: Array<{index: number, title: string, url: string, isActive: boolean}> }}
 */
function parseSafariEnricherOutput(raw) {
    // Second line is the "---" separator and is skipped.
    const [currentLine = "", , ...tabLines] = raw.split("\n");
    const currentFields = currentLine.split("|");
    const hasUrl = currentFields.length >= 2;
    const title = hasUrl ? currentFields.slice(0, -1).join("|") : currentLine;
    const url = hasUrl ? currentFields[currentFields.length - 1] : undefined;
    const tabs = tabLines
        .filter((line) => line.includes("|"))
        .map((line) => {
            const fields = line.split("|");
            const count = fields.length;
            if (count < 4) {
                // Short/malformed line: keep the plain positional reading.
                const [idx, tTitle, tUrl, active] = fields;
                return {
                    index: Number.parseInt(idx ?? "0", 10),
                    title: tTitle ?? "",
                    url: tUrl ?? "",
                    isActive: active === "true",
                };
            }
            return {
                index: Number.parseInt(fields[0], 10),
                // Re-join any title fragments that were split on an embedded "|".
                title: fields.slice(1, count - 2).join("|"),
                url: fields[count - 2],
                isActive: fields[count - 1] === "true",
            };
        });
    return { title, url, tabs };
}
/**
 * Install async Safari browser enricher on the perception coordinator.
 * Non-blocking — uses async exec instead of execSync.
 * Only installs if bundleId is Safari; clears enricher otherwise.
 * Does nothing when the perception manager has no coordinator yet.
 *
 * The installed enricher runs an AppleScript via `osascript` (5 s timeout), parses the
 * current tab plus the list of all tabs, and — when a URL was obtained — feeds them to
 * `worldModel.ingestSafariBrowserState`. Errors from the osascript call propagate to
 * whoever invokes the enricher.
 *
 * @param {string} bundleId - Bundle id of the currently focused app.
 * @returns {void}
 */
function installSafariEnricher(bundleId) {
    const coord = perceptionManager.getCoordinator();
    if (!coord)
        return;
    if (bundleId !== "com.apple.Safari") {
        coord.setBrowserEnricher(null);
        return;
    }
    coord.setBrowserEnricher(async () => {
        const script = `tell application "Safari"
set t to current tab of front window
set tabInfo to name of t & "|" & URL of t
set tabList to ""
set tabIdx to 1
repeat with w in windows
repeat with tb in tabs of w
set isActive to (tb = current tab of w) as string
set tabList to tabList & tabIdx & "|" & name of tb & "|" & URL of tb & "|" & isActive & "\\n"
set tabIdx to tabIdx + 1
end repeat
end repeat
return tabInfo & "\\n---\\n" & tabList
end tell`;
        // The script is a constant; single quotes are escaped for the surrounding shell quoting.
        const { stdout } = await execAsync(`osascript -e '${script.replace(/'/g, "'\\''")}'`, {
            encoding: "utf-8",
            timeout: 5000,
        });
        const result = (stdout ?? "").trim();
        if (!result)
            return;
        const { title, url, tabs } = parseSafariEnricherOutput(result);
        if (url)
            worldModel.ingestSafariBrowserState(url, title ?? "", tabs.length > 0 ? tabs : undefined);
    });
}
82
231
  // CDP connection cache
83
232
  let cdpPort = null;
84
233
  let CDP = null;
85
- async function ensureCDP() {
234
+ async function ensureCDP(overridePort) {
86
235
  if (!CDP)
87
236
  CDP = (await import("chrome-remote-interface")).default;
237
+ // If caller specified a port, use it directly (e.g. 9333 for Electron apps)
238
+ if (overridePort) {
239
+ try {
240
+ await CDP.Version({ port: overridePort });
241
+ return { CDP, port: overridePort };
242
+ }
243
+ catch {
244
+ throw new Error(`CDP not available on port ${overridePort}. Ensure the app is running with --remote-debugging-port=${overridePort}`);
245
+ }
246
+ }
88
247
  if (cdpPort) {
89
248
  try {
90
249
  await CDP.Version({ port: cdpPort });
@@ -92,8 +251,8 @@ async function ensureCDP() {
92
251
  }
93
252
  catch { }
94
253
  }
95
- // Try common ports
96
- for (const p of [9222, 9223, 9224]) {
254
+ // Try common ports (9222-9224 = Chrome, 9333 = Codex desktop)
255
+ for (const p of [9222, 9223, 9224, 9333]) {
97
256
  try {
98
257
  await CDP.Version({ port: p });
99
258
  cdpPort = p;
@@ -103,7 +262,7 @@ async function ensureCDP() {
103
262
  }
104
263
  throw new Error("Chrome not running with --remote-debugging-port. Launch with: /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-debug");
105
264
  }
106
- const server = new McpServer({ name: "screenhand", version: "2.0.0" });
265
+ const server = new McpServer({ name: "screenhand", version: "3.0.0" });
107
266
  // ═══════════════════════════════════════════════
108
267
  // LEARNING MEMORY — cached, auto-recall, non-blocking
109
268
  // ═══════════════════════════════════════════════
@@ -118,7 +277,82 @@ jobManager.init();
118
277
  // Direct lease manager that shares the filesystem lock dir with the daemon
119
278
  const LOCK_DIR = path.join(os.homedir(), ".screenhand", "locks");
120
279
  const leaseManager = new LeaseManager(LOCK_DIR);
121
- // Skip logging for memory tools themselves
280
+ // ── Context tracker connects tool execution to playbook knowledge ──
281
+ // References dir holds curated platform knowledge (selectors, flows, errors)
282
+ // Playbooks dir holds only executable step sequences for job_create
283
+ // Resolution order: local dev paths → npm dist paths → ~/.screenhand/ user paths
284
/**
 * Resolve the directory that holds a named data set (e.g. "references", "playbooks").
 *
 * Resolution order:
 *   1. `<__dirname>/<name>`       — local dev path, used only if it contains a .json file
 *   2. `<__dirname>/dist-<name>`  — npm dist path, used only if it contains a .json file
 *   3. `~/.screenhand/<name>`     — user path, created on demand
 *
 * A candidate that is missing, or that exists but is not a directory, is treated as
 * "no data here" instead of throwing, so a stray file with the same name cannot
 * abort start-up. Other filesystem errors (e.g. permissions) still propagate.
 *
 * @param {string} name - Data set name, used as the directory name.
 * @returns {string} Absolute path of the directory to use.
 */
function resolveDataDir(name) {
    // True when `dir` is a readable directory holding at least one *.json entry.
    const hasJsonFiles = (dir) => {
        try {
            return fs.readdirSync(dir).some((f) => f.endsWith(".json"));
        }
        catch (err) {
            if (err?.code === "ENOENT" || err?.code === "ENOTDIR") {
                return false;
            }
            throw err;
        }
    };
    // 1. Local dev path, then 2. npm dist path
    const candidates = [
        path.resolve(__dirname, name),
        path.resolve(__dirname, `dist-${name}`),
    ];
    for (const candidate of candidates) {
        if (hasJsonFiles(candidate)) {
            return candidate;
        }
    }
    // 3. User home path (always available for user-generated content)
    const userDir = path.join(os.homedir(), ".screenhand", name);
    if (!fs.existsSync(userDir)) {
        fs.mkdirSync(userDir, { recursive: true });
    }
    return userDir;
}
302
+ const referencesDir = resolveDataDir("references");
303
+ const _playbookStoreForContext = new PlaybookStore(referencesDir);
304
+ _playbookStoreForContext.load();
305
+ const playbooksDir = resolveDataDir("playbooks");
306
+ const contextTracker = new ContextTracker(_playbookStoreForContext, playbooksDir);
307
+ const worldModel = new WorldModel();
308
+ const perceptionManager = new PerceptionManager(worldModel);
309
+ const learningEngine = new LearningEngine();
310
+ learningEngine.init();
311
+ import { AppMap } from "./src/state/app-map.js";
312
+ // Seed app maps: check npm dist path first, then local dev path
313
+ const seedAppMapsDir = (() => {
314
+ const dist = path.resolve(__dirname, "dist-app-maps");
315
+ if (fs.existsSync(dist))
316
+ return dist;
317
+ const local = path.resolve(__dirname, "seed-app-maps");
318
+ if (fs.existsSync(local))
319
+ return local;
320
+ return undefined;
321
+ })();
322
+ const appMap = new AppMap(seedAppMapsDir ? { seedDir: seedAppMapsDir } : undefined);
323
+ appMap.init();
324
+ // Cross-feature workflow tracking: per-app buffer of distinct features hit by action tools
325
+ const crossFeatureBuffer = new Map();
326
+ // Visibility tracking throttle: run conditional UI check every 10th tool call
327
+ let visibilityCheckCounter = 0;
328
+ // Previous tool name for ready-signal recording (what action preceded a wait)
329
+ let lastSuccessfulToolName = "unknown";
330
+ // Last known bundleId — survives focusedApp being nulled by app_deactivated events
331
+ let lastKnownBundleId = null;
332
+ contextTracker.setAppMap(appMap);
333
+ perceptionManager.setAppMap(appMap);
334
+ const _executablePlaybookStore = new PlaybookStore(playbooksDir);
335
+ try {
336
+ _executablePlaybookStore.load();
337
+ }
338
+ catch { /* dir may not exist */ }
339
+ const planner = new Planner(_executablePlaybookStore, memory, contextTracker, worldModel, learningEngine);
340
+ const goalStore = new GoalStore(path.join(os.homedir(), ".screenhand", "planner"));
341
+ goalStore.init();
342
+ const toolRegistry = new ToolRegistry();
343
+ const recoveryEngine = new RecoveryEngine(worldModel, toolRegistry.toExecutor(), memory);
344
+ recoveryEngine.setLearningEngine(learningEngine);
345
+ planner.setToolRegistry(toolRegistry);
346
+ perceptionManager.setLearningEngine(learningEngine);
347
+ const mcpRecorder = new McpPlaybookRecorder(playbooksDir);
348
+ const referenceMerger = new ReferenceMerger(referencesDir);
349
+ const communityPublisher = new PlaybookPublisher();
350
+ const communityFetcher = new PlaybookFetcher();
351
+ // Tools excluded from the intelligence wrapper (memory/context hints).
352
+ // Memory, supervisor, job, and daemon lifecycle tools skip the wrapper to avoid recursion
353
+ // and because they don't benefit from playbook hints.
354
+ // NOTE: platform knowledge tools (platform_guide, playbook_preflight, export_playbook)
355
+ // are NOT excluded — they benefit from context-aware hints.
122
356
  const MEMORY_TOOLS = new Set([
123
357
  "memory_snapshot", "memory_recall", "memory_save", "memory_record_error",
124
358
  "memory_record_learning", "memory_query_patterns", "memory_errors",
@@ -131,19 +365,49 @@ const MEMORY_TOOLS = new Set([
131
365
  "job_step_done", "job_step_fail", "job_resume", "job_dequeue", "job_remove",
132
366
  "job_run", "job_run_all",
133
367
  "worker_start", "worker_stop", "worker_status",
368
+ "job_create_chain",
369
+ "observer_start", "observer_stop", "observer_status", "observer_ocr_roi",
370
+ "orchestrator_start", "orchestrator_stop", "orchestrator_submit", "orchestrator_status",
371
+ "world_state", "world_state_diff", "perception_status", "perception_start", "perception_stop",
372
+ "learning_status", "learning_reset",
373
+ "plan_goal", "plan_execute", "plan_step", "plan_step_resolve", "plan_status", "plan_list", "plan_cancel",
374
+ "recovery_status", "recovery_configure",
375
+ "community_publish", "community_fetch",
134
376
  ]);
135
377
  // Track the strategy we're currently following (for feedback loop)
136
378
  let activeStrategyFingerprint = null;
379
+ let currentAdaptiveBudget = null;
137
380
  // Intercept all tool registrations to auto-log + auto-recall
138
- const originalTool = server.tool.bind(server);
381
// Unwrapped SDK registration function, bound to the server instance.
const _rawOriginalTool = server.tool.bind(server);
/**
 * Register a tool with the MCP server and mirror it into the tool registry.
 *
 * The first function-valued argument is taken as the handler. It is replaced by a
 * wrapper that re-initialises the world model whenever the memory session id differs
 * from the world model's session id, and the tool is also registered in `toolRegistry`
 * under the name given as the first argument. Calls without a handler are forwarded
 * untouched.
 *
 * NOTE(review): the registry entry invokes the raw `handler` (with an empty `extra`),
 * not the session-rebinding wrapper — confirm that is intended.
 */
const originalTool = ((...args) => {
    const handlerIdx = args.findIndex((a) => typeof a === "function");
    if (handlerIdx === -1) {
        return _rawOriginalTool(...args);
    }
    const [name] = args;
    const handler = args[handlerIdx];
    // Keep the world model bound to the current memory session before running the tool.
    const rebindThenRun = async (params, extra) => {
        const activeSession = memory.getSessionId();
        const needsRebind = Boolean(activeSession) && worldModel.getState().sessionId !== activeSession;
        if (needsRebind) {
            worldModel.init(activeSession);
        }
        return handler(params, extra);
    };
    args[handlerIdx] = rebindThenRun;
    toolRegistry.register(name, (params) => handler(params, {}));
    return _rawOriginalTool(...args);
});
139
401
  function extractText(result) {
140
402
  if (!result?.content)
141
403
  return "";
142
- return result.content
404
+ const full = result.content
143
405
  .filter((c) => c.type === "text")
144
406
  .map((c) => c.text)
145
- .join("\n")
146
- .slice(0, 500);
407
+ .join("\n");
408
+ if (full.length > 500)
409
+ return full.slice(0, 500) + " [TRUNCATED]";
410
+ return full;
147
411
  }
148
412
  server.tool = (...args) => {
149
413
  const handlerIdx = args.findIndex((a) => typeof a === "function");
@@ -151,6 +415,8 @@ server.tool = (...args) => {
151
415
  return originalTool(...args);
152
416
  const originalHandler = args[handlerIdx];
153
417
  const toolName = args[0];
418
+ // Register the original (unwrapped) handler for internal tool dispatch
419
+ toolRegistry.register(toolName, (params) => originalHandler(params, {}));
154
420
  const wrappedHandler = async (params, extra) => {
155
421
  // Skip intercepting memory tools to avoid recursion
156
422
  if (MEMORY_TOOLS.has(toolName)) {
@@ -159,8 +425,59 @@ server.tool = (...args) => {
159
425
  const sessionId = memory.getSessionId();
160
426
  const safeParams = typeof params === "object" && params !== null ? params : {};
161
427
  const start = Date.now();
428
+ // ── PRE-CALL: lazy-init world model on first session ──
429
+ if (sessionId && worldModel.getState().sessionId !== sessionId) {
430
+ worldModel.init(sessionId);
431
+ }
432
+ // ── PRE-CALL: notify perception to stay active (idle gating) ──
433
+ perceptionManager.notifyToolCall();
162
434
  // ── PRE-CALL: check for known error warnings (~0ms, in-memory) ──
163
435
  const knownError = memory.quickErrorCheck(toolName);
436
+ // ── PRE-CALL: auto-start perception if not running ──
437
+ if (!perceptionManager.isRunning && bridgeReady) {
438
+ const focusApp = worldModel.getState().focusedApp;
439
+ if (focusApp?.bundleId && focusApp?.pid) {
440
+ perceptionManager.tryAutoStart(focusApp, bridge).catch(() => { });
441
+ installSafariEnricher(focusApp.bundleId);
442
+ }
443
+ }
444
+ // ── PRE-CALL: update context tracker (fires playbook lookup only on domain change) ──
445
+ contextTracker.updateContext(toolName, safeParams);
446
+ const playbookHints = contextTracker.getHints(toolName, safeParams);
447
+ // ── PRE-CALL: compute adaptive budget from learning engine ──
448
+ const budgetBundleId = worldModel.getState().focusedApp?.bundleId;
449
+ if (budgetBundleId) {
450
+ const budget = learningEngine.getAdaptiveBudget(budgetBundleId);
451
+ if (budget.locateMs !== 800 || budget.actMs !== 200 || budget.verifyMs !== 2000) {
452
+ currentAdaptiveBudget = budget;
453
+ }
454
+ else {
455
+ currentAdaptiveBudget = null;
456
+ }
457
+ }
458
+ else {
459
+ currentAdaptiveBudget = null;
460
+ }
461
+ // Capture pre-call focused app for focus drift detection
462
+ const preBundleId = worldModel.getState().focusedApp?.bundleId ?? null;
463
+ // Update last known bundleId from world model, tool params, or context tracker
464
+ const paramBundleId = safeParams.bundleId ?? safeParams.pid;
465
+ if (preBundleId) {
466
+ lastKnownBundleId = preBundleId;
467
+ }
468
+ else if (typeof paramBundleId === "string" && paramBundleId) {
469
+ lastKnownBundleId = paramBundleId;
470
+ }
471
+ // Capture pre-call window title for navigation edge tracking
472
+ const preWindowTitle = worldModel.getFocusedWindow()?.title.value ?? null;
473
+ // Action tools = actually doing something. Navigation = just clicking around.
474
+ const ACTION_TOOLS = new Set([
475
+ "type_text", "key", "drag", "scroll", "menu_click", "applescript",
476
+ "ui_set_value", "ui_press",
477
+ "browser_type", "browser_click", "browser_fill_form", "browser_human_click",
478
+ "browser_js", "browser_navigate",
479
+ "type_with_fallback", "select_with_fallback", "scroll_with_fallback",
480
+ ]);
164
481
  try {
165
482
  const result = await originalHandler(params, extra);
166
483
  const durationMs = Date.now() - start;
@@ -177,15 +494,647 @@ server.tool = (...args) => {
177
494
  error: null,
178
495
  };
179
496
  memory.recordEvent(entry); // non-blocking write + session tracking
497
+ // ── POST-CALL: record success for playbook learning (in-memory only) ──
498
+ contextTracker.recordOutcome(toolName, safeParams, true, null);
499
+ // ── POST-CALL: Safari context gap + page context update ──
500
+ const postFocusApp = worldModel.getState().focusedApp;
501
+ const postBundleIdForCtx = postFocusApp?.bundleId ?? lastKnownBundleId;
502
+ if (postBundleIdForCtx) {
503
+ lastKnownBundleId = postBundleIdForCtx;
504
+ // Try focused window first, then search all windows for matching bundleId
505
+ let winTitle = null;
506
+ const focWin = worldModel.getFocusedWindow();
507
+ if (focWin?.title.value) {
508
+ winTitle = focWin.title.value;
509
+ }
510
+ else if (postFocusApp?.pid) {
511
+ // Focused window lost — search state for any window from this app
512
+ for (const [, win] of worldModel.getState().windows) {
513
+ if (win.pid === postFocusApp.pid && win.title.value) {
514
+ winTitle = win.title.value;
515
+ break;
516
+ }
517
+ }
518
+ }
519
+ if (winTitle) {
520
+ contextTracker.updateContextFromWindowTitle(postBundleIdForCtx, winTitle);
521
+ contextTracker.updatePageContext(winTitle);
522
+ }
523
+ else {
524
+ // Don't null out page context if we just can't find the window —
525
+ // keep the last known page context to avoid losing it on transient events
526
+ }
527
+ }
528
+ // ── POST-CALL: record page transitions for navigation graph ──
529
+ const pageTransition = contextTracker.consumePageTransition();
530
+ if (pageTransition && postBundleIdForCtx) {
531
+ try {
532
+ appMap.recordPageTransition(postBundleIdForCtx, pageTransition.from, pageTransition.to, toolName);
533
+ }
534
+ catch { /* non-critical — don't break tool execution for nav tracking */ }
535
+ }
536
+ // ── POST-CALL: detect focus drift ──
537
+ const postBundleId = worldModel.getState().focusedApp?.bundleId ?? null;
538
+ if (preBundleId && postBundleId && preBundleId !== postBundleId) {
539
+ const driftWarning = `⚠ Focus changed: ${preBundleId} → ${postBundleId}. Use \`focus\` to return.`;
540
+ if (result?.content && Array.isArray(result.content)) {
541
+ result.content.unshift({ type: "text", text: driftWarning });
542
+ }
543
+ }
544
+ // ── POST-CALL: feed learning engine (timing + locator outcomes) ──
545
+ const learnBundleId = worldModel.getState().focusedApp?.bundleId ?? lastKnownBundleId ?? "unknown";
546
+ learningEngine.recordToolTiming({ tool: toolName, bundleId: learnBundleId, durationMs, success: true });
547
+ // Record locator outcome if the tool used a target/selector
548
+ const locatorTarget = safeParams.target ?? safeParams.selector ?? safeParams.locator
549
+ ?? (toolName === "click_text" ? safeParams.text : undefined);
550
+ if (typeof locatorTarget === "string" && locatorTarget) {
551
+ const method = toolName.startsWith("browser_") ? "cdp"
552
+ : toolName.includes("ocr") ? "ocr"
553
+ : "ax";
554
+ learningEngine.recordLocatorOutcome({
555
+ bundleId: learnBundleId,
556
+ actionKey: toolName,
557
+ locator: locatorTarget,
558
+ method,
559
+ success: true,
560
+ });
561
+ // Auto-record verified pattern to patterns.jsonl via learning engine
562
+ learningEngine.recordPattern({
563
+ bundleId: learnBundleId,
564
+ tool: toolName,
565
+ locator: locatorTarget,
566
+ method,
567
+ success: true,
568
+ });
569
+ }
570
+ // ── POST-CALL: update app mastery map from successful action ──
571
+ // Check if the result signals an error (e.g. click_text "not found" returns isError: true)
572
+ const resultIsError = !!result?.isError;
573
+ const isActionTool = ACTION_TOOLS.has(toolName);
574
+ if (resultIsError && learnBundleId !== "unknown") {
575
+ // Redirect to failure mastery recording + count as edge case handled
576
+ try {
577
+ const failedLocatorSoft = safeParams.target ?? safeParams.selector ?? safeParams.locator
578
+ ?? (toolName === "click_text" ? safeParams.text : undefined);
579
+ if (typeof failedLocatorSoft === "string" && failedLocatorSoft) {
580
+ appMap.recordElementOutcome(learnBundleId, "auto", failedLocatorSoft, false, contextTracker.currentPageContext ?? undefined);
581
+ }
582
+ if (isActionTool) {
583
+ appMap.recordActionOutcome(learnBundleId, false);
584
+ }
585
+ // Track as edge case: encountering an error is an unexpected state
586
+ const edgeMapData = appMap.getLoaded(learnBundleId);
587
+ if (edgeMapData) {
588
+ edgeMapData.edgeCasesHandled = (edgeMapData.edgeCasesHandled ?? 0) + 1;
589
+ appMap.save(edgeMapData, true);
590
+ }
591
+ const failMapDataSoft = appMap.getLoaded(learnBundleId);
592
+ if (failMapDataSoft?.featureLadder) {
593
+ const failSignalSoft = [toolName, typeof failedLocatorSoft === "string" ? failedLocatorSoft : ""].join(" ").toLowerCase();
594
+ const failGenSignalsSoft = appMap.getGeneratedSignals(learnBundleId) ?? {};
595
+ for (const feature of failMapDataSoft.featureLadder) {
596
+ const fm = failMapDataSoft.featureMastery?.[feature.id];
597
+ if (!fm || fm.depth === 0)
598
+ continue;
599
+ const featureInSignal = failSignalSoft.includes(feature.id.replace(/_/g, " "));
600
+ const keywords = failGenSignalsSoft[feature.id];
601
+ const keywordMatch = keywords?.some((kw) => failSignalSoft.includes(kw));
602
+ if (featureInSignal || keywordMatch) {
603
+ appMap.recordFeatureSignal(learnBundleId, feature.id, fm.depth, false);
604
+ }
605
+ }
606
+ }
607
+ }
608
+ catch { /* non-fatal */ }
609
+ }
610
+ if (!resultIsError && learnBundleId !== "unknown") {
611
+ try {
612
+ if (!appMap.load(learnBundleId)) {
613
+ const focApp = worldModel.getState().focusedApp;
614
+ appMap.createEmpty(learnBundleId, focApp?.appName ?? learnBundleId);
615
+ }
616
+ // Record element outcome for tools with a locator target
617
+ if (typeof locatorTarget === "string" && locatorTarget) {
618
+ appMap.recordElementOutcome(learnBundleId, "auto", locatorTarget, true, contextTracker.currentPageContext ?? undefined);
619
+ // Write relative position from click coordinates
620
+ const resultText = extractText(result);
621
+ const screenMatch = resultText.match(/at screen \((\d+),\s*(\d+)\)/);
622
+ const windowMatch = resultText.match(/\[window: \((\d+),\s*(\d+)\) (\d+)[x×](\d+)\]/);
623
+ if (screenMatch && windowMatch) {
624
+ const sx = parseInt(screenMatch[1], 10);
625
+ const sy = parseInt(screenMatch[2], 10);
626
+ const wx = parseInt(windowMatch[1], 10);
627
+ const wy = parseInt(windowMatch[2], 10);
628
+ const ww = parseInt(windowMatch[3], 10);
629
+ const wh = parseInt(windowMatch[4], 10);
630
+ if (ww > 0 && wh > 0) {
631
+ const relX = Math.max(0, Math.min(1, (sx - wx) / ww));
632
+ const relY = Math.max(0, Math.min(1, (sy - wy) / wh));
633
+ appMap.updateElementPosition(learnBundleId, "auto_discovered", locatorTarget, relX, relY);
634
+ }
635
+ }
636
+ }
637
+ // Record action outcome (only for tools that DO something, not navigation)
638
+ if (isActionTool) {
639
+ appMap.recordActionOutcome(learnBundleId, true);
640
+ }
641
+ // ── Record input/output contract for element interaction tools ──
642
+ {
643
+ const CONTRACT_TOOLS = new Set(["click", "click_text", "type_text", "key", "menu_click"]);
644
+ if (CONTRACT_TOOLS.has(toolName) && typeof locatorTarget === "string" && locatorTarget) {
645
+ // Use "auto" to search all zones — page-specific zones may not exist yet
646
+ appMap.recordContract(learnBundleId, "auto", locatorTarget, toolName, ["action succeeded"]);
647
+ }
648
+ }
649
+ // ── Track shortcut usage (keyboard combos with modifier keys) ──
650
+ if (toolName === "key" && typeof safeParams.combo === "string") {
651
+ const combo = safeParams.combo.toLowerCase();
652
+ if (combo.includes("cmd+") || combo.includes("ctrl+") || combo.includes("alt+") || combo.includes("shift+")) {
653
+ const mapDataShortcut = appMap.getLoaded(learnBundleId);
654
+ if (mapDataShortcut) {
655
+ mapDataShortcut.shortcutsUsed = (mapDataShortcut.shortcutsUsed ?? 0) + 1;
656
+ appMap.save(mapDataShortcut, true);
657
+ }
658
+ }
659
+ }
660
+ // ── Track edge case handling (escape = dialog/popup dismissal) ──
661
+ if (toolName === "key" && safeParams.combo === "escape") {
662
+ const mapDataEdge = appMap.getLoaded(learnBundleId);
663
+ if (mapDataEdge) {
664
+ mapDataEdge.edgeCasesHandled = (mapDataEdge.edgeCasesHandled ?? 0) + 1;
665
+ appMap.save(mapDataEdge, true);
666
+ }
667
+ }
668
+ // ── Auto-detect feature depth from tool usage signals ──
669
+ // Depth: 1=navigated (screenshot/focus), 2=basic action (click/type),
670
+ // 3=multi-step workflow (action tools in sequence), 4=verified outcome
671
+ {
672
+ const mapData = appMap.getLoaded(learnBundleId);
673
+ if (mapData?.featureLadder) {
674
+ const signalText = [
675
+ toolName,
676
+ typeof locatorTarget === "string" ? locatorTarget : "",
677
+ typeof safeParams.text === "string" ? safeParams.text : "",
678
+ preWindowTitle ?? "",
679
+ worldModel.getFocusedWindow()?.title.value ?? "",
680
+ ].join(" ").toLowerCase();
681
+ // Determine depth from tool type and history:
682
+ // depth 1 = navigated (screenshot/focus/ocr)
683
+ // depth 2 = basic action (click/type/key on the feature)
684
+ // depth 3 = multi-step workflow (already at depth >= 2 with repeatCount >= 3, hit again with a non-navigation tool)
685
+ // depth 4 = verified outcome (at depth 3, then verified via screenshot/ocr)
686
+ const NAV_TOOLS = new Set(["screenshot", "screenshot_file", "focus", "ocr", "ui_tree", "ui_find", "windows", "apps", "browser_tabs", "browser_page_info", "browser_dom"]);
687
+ const VERIFY_TOOLS = new Set(["screenshot", "screenshot_file", "ocr", "ui_tree", "ui_find", "browser_dom", "browser_page_info"]);
688
+ const isNavTool = NAV_TOOLS.has(toolName);
689
+ const isVerifyTool = VERIFY_TOOLS.has(toolName);
690
+ // Keyword map: featureId → keywords that signal the feature was used
691
+ // Hardcoded signals for apps with BUILTIN_LADDERS
692
+ const BUILTIN_FEATURE_SIGNALS = {
693
+ // Discord
694
+ browse_channels: ["channel", "server", "sidebar", "lounge", "information"],
695
+ send_message: ["message", "type_text", "browser_type", "chatter", "chat"],
696
+ direct_messages: ["direct message", "dm", "group chat", "friends"],
697
+ voice_video: ["voice", "stage", "listen", "audio", "video", "call", "screen share", "activity"],
698
+ threads_forums: ["thread", "forum", "post", "topic", "discussion"],
699
+ roles_permissions: ["role", "permission", "override", "hidden channel"],
700
+ notification_control: ["notification", "mention", "mute", "suppress"],
701
+ events_stage: ["event", "stage", "trivia", "interested", "schedule"],
702
+ onboarding_funnel: ["onboarding", "welcome", "get started", "rules screening", "starter", "channels & roles", "customize", "browse channels", "choose your channels"],
703
+ moderation_system: ["moderation", "automod", "ban", "modmail", "audit", "report", "rules", "safety", "raid"],
704
+ bot_ecosystem: ["bot", "automod", "integration", "app directory", "slash command", "verification", "add app", "add to server", "mee6", "webhook"],
705
+ server_architecture: ["category", "channel taxonomy", "channels & roles", "server guide", "server settings"],
706
+ community_growth: ["announcement", "event", "reward", "retention", "engagement"],
707
+ analytics_health: ["analytics", "insights", "server insights", "activity", "member count"],
708
+ monetization_membership: ["premium", "boost", "subscription", "tier", "monetiz"],
709
+ crisis_handling: ["raid", "spam", "harassment", "lockdown", "ban wave"],
710
+ cross_platform: ["github", "notion", "twitch", "stripe", "zapier", "webhook"],
711
+ staff_system: ["moderator", "staff", "escalation", "internal", "mod channel"],
712
+ brand_culture: ["community", "identity", "ritual", "culture", "recognition"],
713
+ governance_policy: ["rules", "policy", "enforcement", "appeal", "governance"],
714
+ // Safari
715
+ browse_navigate: ["navigate", "browser_navigate", "browser_open", "url"],
716
+ tabs_windows: ["tab", "browser_tabs", "window"],
717
+ bookmarks: ["bookmark", "reading list"],
718
+ history_search: ["history", "search"],
719
+ tab_groups: ["tab group", "profile"],
720
+ extensions: ["extension"],
721
+ dev_tools: ["inspector", "developer", "console", "browser_js"],
722
+ privacy_settings: ["privacy", "cookie", "blocker"],
723
+ web_apps: ["add to dock", "web app"],
724
+ // Finder
725
+ browse_files: ["finder", "file", "folder", "browse"],
726
+ copy_move: ["copy", "move", "rename", "delete", "trash"],
727
+ search: ["search", "spotlight"],
728
+ views_sort: ["view", "sort", "column", "icon", "list"],
729
+ tags_favorites: ["tag", "favorite", "sidebar"],
730
+ quick_actions: ["quick look", "quick action", "service"],
731
+ automator_scripts: ["automator", "terminal", "script", "applescript"],
732
+ // Generic (fallback for apps with generic ladders)
733
+ basic_navigation: ["navigate", "open", "browse", "launch"],
734
+ core_action: ["type_text", "click", "press", "key"],
735
+ settings: ["settings", "preferences", "config"],
736
+ advanced_features: ["advanced", "power", "shortcut", "automation"],
737
+ };
738
+ // Auto-generate ladder from reference if no builtin exists
739
+ if (!appMap.hasGeneratedLadder(learnBundleId)) {
740
+ const ref = _playbookStoreForContext.matchByBundleId(learnBundleId);
741
+ if (ref?.selectors && Object.keys(ref.selectors).length >= 2) {
742
+ const generated = appMap.generateLadderFromRef(learnBundleId, ref);
743
+ if (generated) {
744
+ // Reload mapData with new ladder
745
+ const refreshed = appMap.getLoaded(learnBundleId);
746
+ if (refreshed) {
747
+ Object.assign(mapData, refreshed);
748
+ }
749
+ }
750
+ }
751
+ }
752
+ // Merge auto-generated signals with builtins (generated takes priority)
753
+ const generatedSignals = appMap.getGeneratedSignals(learnBundleId);
754
+ const mergedSignals = { ...BUILTIN_FEATURE_SIGNALS };
755
+ if (generatedSignals) {
756
+ for (const [fid, kws] of Object.entries(generatedSignals)) {
757
+ mergedSignals[fid] = kws;
758
+ }
759
+ }
760
+ const hitFeatures = [];
761
+ for (const feature of mapData.featureLadder) {
762
+ const keywords = mergedSignals[feature.id];
763
+ if (!keywords)
764
+ continue;
765
+ if (keywords.some((kw) => signalText.includes(kw))) {
766
+ // Compute depth based on current state + tool type
767
+ const existing = mapData.featureMastery?.[feature.id];
768
+ const currentDepth = existing?.depth ?? 0;
769
+ let signalDepth;
770
+ if (isVerifyTool && currentDepth >= 3) {
771
+ // Verifying after a workflow = verified outcome (depth 4)
772
+ signalDepth = 4;
773
+ }
774
+ else if (!isNavTool && currentDepth >= 2 && (existing?.repeatCount ?? 0) >= 3) {
775
+ // Repeated action tool on a feature we've already actioned = workflow (depth 3)
776
+ signalDepth = 3;
777
+ }
778
+ else if (isNavTool) {
779
+ signalDepth = 1;
780
+ }
781
+ else {
782
+ signalDepth = 2;
783
+ }
784
+ appMap.recordFeatureSignal(learnBundleId, feature.id, signalDepth, true);
785
+ // Healing detection: success after prior failure = recovery
786
+ if (existing && existing.failCount > (existing.healingCount ?? 0)) {
787
+ appMap.recordHealing(learnBundleId, feature.id);
788
+ }
789
+ if (!isNavTool)
790
+ hitFeatures.push(feature.id);
791
+ }
792
+ }
793
+ // Cross-feature workflow detection: track distinct features hit by action tools.
794
+ // When 3+ distinct features are hit in a rolling window, record a cross-feature workflow.
795
+ if (!crossFeatureBuffer.has(learnBundleId)) {
796
+ crossFeatureBuffer.set(learnBundleId, { features: [], lastRecordedAt: 0 });
797
+ }
798
+ const cfBuf = crossFeatureBuffer.get(learnBundleId);
799
+ for (const fid of hitFeatures) {
800
+ if (!cfBuf.features.includes(fid))
801
+ cfBuf.features.push(fid);
802
+ }
803
+ // Trim to last 10 features
804
+ if (cfBuf.features.length > 10)
805
+ cfBuf.features = cfBuf.features.slice(-10);
806
+ // Record a cross-feature workflow every 3 distinct features (throttled)
807
+ if (cfBuf.features.length >= 3 && Date.now() - cfBuf.lastRecordedAt > 30_000) {
808
+ appMap.recordCrossFeatureWorkflow(learnBundleId);
809
+ cfBuf.lastRecordedAt = Date.now();
810
+ cfBuf.features = []; // Reset for next workflow
811
+ }
812
+ }
813
+ }
814
+ // Record navigation edge when window title changes (screen transition)
815
+ const postWindowTitle = worldModel.getFocusedWindow()?.title.value ?? null;
816
+ if (preWindowTitle && postWindowTitle && preWindowTitle !== postWindowTitle) {
817
+ const appName = worldModel.getState().focusedApp?.appName ?? "";
818
+ const titleSuffix = appName ? new RegExp(` - ${appName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}$`) : null;
819
+ const fromNode = titleSuffix ? preWindowTitle.replace(titleSuffix, "") : preWindowTitle;
820
+ const toNode = titleSuffix ? postWindowTitle.replace(titleSuffix, "") : postWindowTitle;
821
+ if (fromNode !== toNode) {
822
+ appMap.addNavNode(learnBundleId, fromNode, { type: "window", description: fromNode });
823
+ appMap.addNavNode(learnBundleId, toNode, { type: "window", description: toNode });
824
+ appMap.recordEdgeOutcome(learnBundleId, fromNode, locatorTarget ?? toolName, toNode, true);
825
+ learningEngine.recordTopologyOutcome({
826
+ bundleId: learnBundleId,
827
+ fromNode,
828
+ action: locatorTarget ?? toolName,
829
+ toNode,
830
+ success: true,
831
+ });
832
+ }
833
+ }
834
+ // ── State machine: detect state changes from tool results ──
835
+ // Two detection paths:
836
+ // 1. Keyword matching on result text (original regex patterns)
837
+ // 2. Structural detection: key combos that open/close UI elements
838
+ {
839
+ const stateResultText = extractText(result).toLowerCase();
840
+ const stateTrigger = locatorTarget ?? toolName;
841
+ // --- Structural state detection from tool + combo patterns ---
842
+ // Keyboard shortcuts that toggle UI state (works even when result text has no keywords)
843
+ if (toolName === "key" && typeof safeParams.combo === "string") {
844
+ const combo = safeParams.combo.toLowerCase();
845
+ // Cmd+K / Ctrl+K / Cmd+P / Ctrl+P = search/command palette (dialog open)
846
+ if (combo === "cmd+k" || combo === "ctrl+k" || combo === "cmd+p" || combo === "ctrl+p") {
847
+ const prevState = appMap.getCurrentState(learnBundleId);
848
+ const from = prevState["modal_state"] ?? "closed";
849
+ appMap.recordStateChange(learnBundleId, "modal_state", from, "open", combo);
850
+ }
851
+ // Escape = dismiss dialog/modal
852
+ if (combo === "escape") {
853
+ const prevState = appMap.getCurrentState(learnBundleId);
854
+ if (prevState["modal_state"] === "open") {
855
+ appMap.recordStateChange(learnBundleId, "modal_state", "open", "closed", combo);
856
+ }
857
+ }
858
+ // Cmd+\ / Ctrl+\ / Cmd+Shift+S = sidebar toggle (common pattern)
859
+ if (combo === "cmd+\\" || combo === "ctrl+\\" || combo === "cmd+shift+s") {
860
+ const prevState = appMap.getCurrentState(learnBundleId);
861
+ const currentSidebar = prevState["sidebar_state"] ?? "expanded";
862
+ const newSidebar = currentSidebar === "expanded" ? "collapsed" : "expanded";
863
+ appMap.recordStateChange(learnBundleId, "sidebar_state", currentSidebar, newSidebar, combo);
864
+ }
865
+ }
866
+ // --- Keyword matching on result text (original patterns) ---
867
+ // Modal/dialog state
868
+ // V4: Require noun+verb proximity to prevent false injection from element labels.
869
+ if (/\b(modal|dialog|popup|alert|sheet|search|command palette)\s+\w*\s*\b(opened|appeared|shown|displayed|presented)\b/.test(stateResultText) ||
870
+ /\b(opened|appeared|shown|displayed|presented)\s+\w*\s*\b(modal|dialog|popup|alert|sheet)\b/.test(stateResultText) ||
871
+ /\b(modal|dialog|popup|alert|sheet)\s+(is|was|has been)\s+(opened|shown|displayed|presented)\b/.test(stateResultText)) {
872
+ const prevState = appMap.getCurrentState(learnBundleId);
873
+ const from = prevState["modal_state"] ?? "closed";
874
+ appMap.recordStateChange(learnBundleId, "modal_state", from, "open", stateTrigger);
875
+ }
876
+ else if (/\b(modal|dialog|popup|alert|sheet)\s+\w*\s*\b(closed|dismissed|hidden|disappeared)\b/.test(stateResultText) ||
877
+ /\b(closed|dismissed|hidden|disappeared)\s+\w*\s*\b(modal|dialog|popup|alert|sheet)\b/.test(stateResultText) ||
878
+ /\b(modal|dialog|popup|alert|sheet)\s+(is|was|has been)\s+(closed|dismissed|hidden)\b/.test(stateResultText)) {
879
+ const prevState = appMap.getCurrentState(learnBundleId);
880
+ const from = prevState["modal_state"] ?? "open";
881
+ appMap.recordStateChange(learnBundleId, "modal_state", from, "closed", stateTrigger);
882
+ }
883
+ // Sidebar/panel state
884
+ if (/\b(sidebar|panel)\s+\w*\s*\b(collapsed|hidden|closed|minimized)\b/.test(stateResultText) ||
885
+ /\b(collapsed|hidden|closed|minimized)\s+\w*\s*\b(sidebar|panel)\b/.test(stateResultText) ||
886
+ /\b(sidebar|panel)\s+(is|was|has been)\s+(collapsed|hidden|closed|minimized)\b/.test(stateResultText)) {
887
+ const prevState = appMap.getCurrentState(learnBundleId);
888
+ const from = prevState["sidebar_state"] ?? "expanded";
889
+ appMap.recordStateChange(learnBundleId, "sidebar_state", from, "collapsed", stateTrigger);
890
+ }
891
+ else if (/\b(sidebar|panel)\s+\w*\s*\b(expanded|shown|opened|visible|maximized)\b/.test(stateResultText) ||
892
+ /\b(expanded|shown|opened|visible|maximized)\s+\w*\s*\b(sidebar|panel)\b/.test(stateResultText) ||
893
+ /\b(sidebar|panel)\s+(is|was|has been)\s+(expanded|shown|opened|visible|maximized)\b/.test(stateResultText)) {
894
+ const prevState = appMap.getCurrentState(learnBundleId);
895
+ const from = prevState["sidebar_state"] ?? "collapsed";
896
+ appMap.recordStateChange(learnBundleId, "sidebar_state", from, "expanded", stateTrigger);
897
+ }
898
+ // View mode state (e.g., board/list/table/grid/timeline)
899
+ const viewModeMatch = stateResultText.match(/\b(board|list|table|grid|timeline|calendar|gallery|kanban)\s*view\b/);
900
+ if (!viewModeMatch) {
901
+ const altViewMatch = stateResultText.match(/(?:switched\s+to|view:\s*)\s*(board|list|table|grid|timeline|calendar|gallery|kanban)\b/);
902
+ if (altViewMatch) {
903
+ const newView = altViewMatch[1];
904
+ const prevState = appMap.getCurrentState(learnBundleId);
905
+ const from = prevState["view_mode"] ?? "unknown";
906
+ if (from !== newView) {
907
+ appMap.recordStateChange(learnBundleId, "view_mode", from, newView, stateTrigger);
908
+ }
909
+ }
910
+ }
911
+ else {
912
+ const newView = viewModeMatch[1];
913
+ const prevState = appMap.getCurrentState(learnBundleId);
914
+ const from = prevState["view_mode"] ?? "unknown";
915
+ if (from !== newView) {
916
+ appMap.recordStateChange(learnBundleId, "view_mode", from, newView, stateTrigger);
917
+ }
918
+ }
919
+ }
920
+ // ── Hierarchy extraction from UI inspection tools ──
921
+ // Extract parent/child containment from any tool that reveals structure
922
+ {
923
+ const HIERARCHY_TOOLS = new Set(["ui_tree", "ui_find", "screenshot", "ocr"]);
924
+ if (HIERARCHY_TOOLS.has(toolName)) {
925
+ try {
926
+ const treeText = extractText(result);
927
+ if (treeText) {
928
+ const lines = treeText.split("\n");
929
+ const hierarchyZone = contextTracker.currentPageContext
930
+ ? `page::${contextTracker.currentPageContext}` : "auto_discovered";
931
+ if (toolName === "ui_tree" || toolName === "ui_find") {
932
+ // Parse indented AX tree: depth 0 = root, depth 1 = top containers, depth 2 = children
933
+ // Format: " ".repeat(depth) + role "title" ...
934
+ const containers = [];
935
+ for (const line of lines) {
936
+ const stripped = line.replace(/\s+$/, "");
937
+ const indent = stripped.length - stripped.trimStart().length;
938
+ const depth = Math.floor(indent / 2);
939
+ const titleMatch = stripped.match(/"([^"]+)"/);
940
+ if (!titleMatch)
941
+ continue;
942
+ const label = titleMatch[1];
943
+ if (!label || label.length > 200)
944
+ continue;
945
+ if (depth <= 1) {
946
+ containers.push({ label, depth, children: [] });
947
+ }
948
+ else if (depth === 2 && containers.length > 0) {
949
+ const parent = containers[containers.length - 1];
950
+ if (parent && parent.children.length < 50) {
951
+ parent.children.push(label);
952
+ }
953
+ }
954
+ }
955
+ for (const container of containers) {
956
+ if (container.children.length > 0) {
957
+ appMap.recordHierarchy(learnBundleId, hierarchyZone, container.label, container.children, "ax_tree");
958
+ }
959
+ }
960
+ }
961
+ else {
962
+ // screenshot/ocr: extract spatial grouping from OCR lines
963
+ // OCR text is top-to-bottom — consecutive lines within the same
964
+ // vertical region (heading followed by items) form parent/child
965
+ const ocrLabels = [];
966
+ for (const line of lines) {
967
+ const trimmed = line.trim();
968
+ if (trimmed && trimmed.length >= 2 && trimmed.length <= 100) {
969
+ ocrLabels.push(trimmed);
970
+ }
971
+ }
972
+ // Heuristic: detect section headings from OCR text.
973
+ // A heading is a short label (1-2 words, <=20 chars) followed by 2+ lines,
974
+ // or a title-case label followed by bullet-prefixed items.
975
+ // Catches "Recents", "Private", "Tasks Tracker" in Notion, etc.
976
+ let currentParent = null;
977
+ let currentChildren = [];
978
+ const flushGroup = () => {
979
+ if (currentParent && currentChildren.length > 0) {
980
+ appMap.recordHierarchy(learnBundleId, hierarchyZone, currentParent, currentChildren.slice(0, 50), "ocr_spatial");
981
+ }
982
+ currentParent = null;
983
+ currentChildren = [];
984
+ };
985
+ for (let i = 0; i < ocrLabels.length; i++) {
986
+ const label = ocrLabels[i];
987
+ const isAllCaps = /^[A-Z][A-Z\s]{2,}$/.test(label);
988
+ const hasColon = label.endsWith(":");
989
+ // Short single/double-word section name (e.g. "Recents", "Private", "New database")
990
+ const isShortSection = /^[A-Z][a-z]+(\s+[a-z]+)?$/.test(label) && label.length <= 20;
991
+ // Title-case heading: 1-4 words
992
+ const isTitleCase = /^[A-Z][a-zA-Z]+(\s+[A-Za-z]+){0,3}$/.test(label) && label.length <= 30;
993
+ const hasFollowingContent = i + 2 < ocrLabels.length;
994
+ // Bullet/icon items (strong signal)
995
+ const nextHasBullet = (idx) => {
996
+ const next = ocrLabels[idx];
997
+ return next != null && /^[•\*\+\-\u2022\u25CF※®=¿]/.test(next);
998
+ };
999
+ const followedByBullets = hasFollowingContent && nextHasBullet(i + 1);
1000
+ const isHeading = isAllCaps || hasColon || (isShortSection && hasFollowingContent) || (isTitleCase && followedByBullets);
1001
+ if (isHeading) {
1002
+ flushGroup();
1003
+ currentParent = label.replace(/:$/, "");
1004
+ }
1005
+ else if (currentParent) {
1006
+ currentChildren.push(label);
1007
+ }
1008
+ }
1009
+ flushGroup();
1010
+ }
1011
+ }
1012
+ }
1013
+ catch { /* hierarchy extraction non-fatal */ }
1014
+ }
1015
+ }
1016
+ // ── Conditional UI visibility tracking (throttled) ──
1017
+ // Every 3rd inspection-like tool call, compare discovered elements against
1018
+ // known map elements to detect which appear/disappear by page context.
1019
+ {
1020
+ const VISIBILITY_TOOLS = new Set([
1021
+ "ui_tree", "ocr", "ui_find", "screenshot", "click_text",
1022
+ "windows", "browser_dom", "browser_page_info",
1023
+ ]);
1024
+ if (VISIBILITY_TOOLS.has(toolName)) {
1025
+ visibilityCheckCounter++;
1026
+ }
1027
+ if (visibilityCheckCounter % 3 === 0 && VISIBILITY_TOOLS.has(toolName)) {
1028
+ try {
1029
+ const visMapData = appMap.getLoaded(learnBundleId);
1030
+ const visPageCtx = contextTracker.currentPageContext ?? "";
1031
+ if (visMapData && visPageCtx) {
1032
+ // Collect element labels from the result text
1033
+ const visResultText = extractText(result);
1034
+ const discoveredLabels = new Set();
1035
+ // Extract quoted labels (from ui_tree/ui_find format)
1036
+ const labelMatches = visResultText.matchAll(/"([^"]{1,100})"/g);
1037
+ for (const m of labelMatches) {
1038
+ if (m[1])
1039
+ discoveredLabels.add(m[1]);
1040
+ }
1041
+ // Also extract unquoted OCR/screenshot text lines as potential labels
1042
+ for (const line of visResultText.split("\n")) {
1043
+ const trimmed = line.trim();
1044
+ if (trimmed && trimmed.length >= 2 && trimmed.length <= 80 && !/^[\[\(]/.test(trimmed)) {
1045
+ discoveredLabels.add(trimmed);
1046
+ }
1047
+ }
1048
+ // For known elements in the map, record whether they were seen or absent
1049
+ const knownElements = new Set();
1050
+ for (const zone of Object.values(visMapData.zones)) {
1051
+ for (const el of zone.elements) {
1052
+ knownElements.add(el.label);
1053
+ }
1054
+ }
1055
+ for (const label of knownElements) {
1056
+ const seen = discoveredLabels.has(label);
1057
+ appMap.recordElementVisibility(learnBundleId, label, visPageCtx, seen);
1058
+ }
1059
+ }
1060
+ }
1061
+ catch { /* visibility tracking non-fatal */ }
1062
+ }
1063
+ }
1064
+ // ── Timing recording: track tool response times per element ──
1065
+ {
1066
+ const TIMING_TOOLS = new Set([
1067
+ "click", "click_text", "type_text", "key", "menu_click",
1068
+ "browser_click", "browser_type",
1069
+ ]);
1070
+ if (TIMING_TOOLS.has(toolName)) {
1071
+ const timingLabel = locatorTarget ?? toolName;
1072
+ appMap.recordTiming(learnBundleId, toolName + "::" + timingLabel, "element_response", durationMs);
1073
+ }
1074
+ // Ready-signal recording
1075
+ // 1. Explicit wait tools
1076
+ if (toolName === "browser_wait" || toolName === "wait_for_state") {
1077
+ appMap.recordReadySignal(learnBundleId, lastSuccessfulToolName, "wait_completed", durationMs);
1078
+ }
1079
+ // 2. Any interaction tool that took notably long (>1.5s) = implicit wait
1080
+ // This captures slow page loads, animation waits, network-bound actions
1081
+ if (durationMs > 1500 && TIMING_TOOLS.has(toolName)) {
1082
+ appMap.recordReadySignal(learnBundleId, toolName, "slow_response", durationMs);
1083
+ }
1084
+ // 3. Screenshot/OCR after a navigation click = page-ready signal
1085
+ if ((toolName === "screenshot" || toolName === "ocr") && lastSuccessfulToolName === "click_text") {
1086
+ appMap.recordReadySignal(learnBundleId, "click_text", "page_ready", durationMs);
1087
+ }
1088
+ }
1089
+ // Refresh mastery level after updates
1090
+ appMap.refreshMastery(learnBundleId);
1091
+ }
1092
+ catch { /* app map update non-fatal */ }
1093
+ }
1094
+ // Track last successful tool name for ready-signal context
1095
+ lastSuccessfulToolName = toolName;
1096
+ // ── POST-CALL: capture for playbook recording if active ──
1097
+ if (mcpRecorder.isRecording) {
1098
+ const fullResultText = Array.isArray(result?.content) ? result.content.map((c) => c.text ?? "").join(" ") : "";
1099
+ const resultText = fullResultText.length > 500 ? fullResultText.substring(0, 500) + " [TRUNCATED]" : fullResultText;
1100
+ mcpRecorder.captureToolCall(toolName, safeParams, true, resultText, durationMs);
1101
+ }
180
1102
  // ── POST-CALL: auto-recall hints (~0ms, in-memory) ──
181
1103
  const hints = [];
182
- // Warn about known errors for this tool
1104
+ // Playbook-aware hints (errors, selectors, job suggestions)
1105
+ for (const h of playbookHints) {
1106
+ hints.push(h);
1107
+ }
1108
+ // World model summary (window/control state)
1109
+ const wmSummary = worldModel.toSummary();
1110
+ if (wmSummary && worldModel.getState().windows.size > 0) {
1111
+ hints.push(`World: ${wmSummary.split("\n")[0]}`);
1112
+ }
1113
+ // Perception freshness
1114
+ if (perceptionManager.isRunning) {
1115
+ hints.push(perceptionManager.getFreshnessSummary());
1116
+ }
1117
+ // Learning engine recommendations
1118
+ const patternRec = learningEngine.recommendPattern(learnBundleId, toolName);
1119
+ if (patternRec) {
1120
+ const rate = ((patternRec.successCount / Math.max(1, patternRec.successCount + patternRec.failCount)) * 100).toFixed(0);
1121
+ hints.push(`Pattern: "${patternRec.locator}" (${patternRec.method}, ${rate}% over ${patternRec.successCount + patternRec.failCount} uses)`);
1122
+ }
1123
+ const learnLocator = learningEngine.recommendLocator(learnBundleId, toolName);
1124
+ if (learnLocator) {
1125
+ hints.push(`Learning: best locator for ${toolName} → "${learnLocator.locator}" (${learnLocator.method}, ${learnLocator.score.toFixed(2)} score, ${learnLocator.successCount}/${learnLocator.successCount + learnLocator.failCount} success)`);
1126
+ }
1127
+ const adaptiveBudget = learningEngine.getAdaptiveBudget(learnBundleId);
1128
+ if (adaptiveBudget.locateMs !== 800 || adaptiveBudget.actMs !== 200 || adaptiveBudget.verifyMs !== 2000) {
1129
+ hints.push(`Learning: adaptive budgets → locate=${adaptiveBudget.locateMs}ms, act=${adaptiveBudget.actMs}ms, verify=${adaptiveBudget.verifyMs}ms`);
1130
+ }
1131
+ // Warn about known errors for this tool (from memory)
183
1132
  if (knownError) {
184
1133
  hints.push(`⚡ Memory: "${toolName}" has failed before: "${knownError.error}" (${knownError.occurrences}x). Fix: ${knownError.resolution}`);
185
1134
  }
186
1135
  // Suggest next step if we're mid-strategy
187
1136
  const recentTools = memory.getRecentToolNames();
188
- const strategyHint = memory.quickStrategyHint(recentTools);
1137
+ const strategyHint = memory.quickStrategyHint(recentTools, worldModel.getState().focusedApp?.bundleId);
189
1138
  if (strategyHint) {
190
1139
  activeStrategyFingerprint = strategyHint.fingerprint;
191
1140
  const nextParams = Object.keys(strategyHint.nextStep.params).length > 0
@@ -203,10 +1152,16 @@ server.tool = (...args) => {
203
1152
  memory.recordStrategyOutcome(activeStrategyFingerprint, true);
204
1153
  activeStrategyFingerprint = null;
205
1154
  }
206
- // Attach hints as _meta (doesn't pollute tool output for MCP clients)
1155
+ // Attach hints in BOTH content (visible) and _meta (for programmatic access)
207
1156
  if (hints.length > 0) {
1157
+ const hintText = hints.join("\n");
1158
+ const resultContent = Array.isArray(result?.content) ? result.content : [];
208
1159
  return {
209
1160
  ...result,
1161
+ content: [
1162
+ ...resultContent,
1163
+ { type: "text", text: `\n---\n${hintText}` },
1164
+ ],
210
1165
  _meta: { ...(result?._meta ?? {}), memoryHints: hints },
211
1166
  };
212
1167
  }
@@ -228,6 +1183,69 @@ server.tool = (...args) => {
228
1183
  error: errorMsg,
229
1184
  };
230
1185
  memory.recordEvent(entry); // non-blocking write + session tracking
1186
+ // ── Record failure for playbook learning (in-memory only) ──
1187
+ contextTracker.recordOutcome(toolName, safeParams, false, errorMsg);
1188
+ // ── Feed learning engine (failure timing + locator) ──
1189
+ const learnBundleIdErr = worldModel.getState().focusedApp?.bundleId ?? lastKnownBundleId ?? "unknown";
1190
+ learningEngine.recordToolTiming({ tool: toolName, bundleId: learnBundleIdErr, durationMs, success: false });
1191
+ const failedLocator = safeParams.target ?? safeParams.selector ?? safeParams.locator
1192
+ ?? (toolName === "click_text" ? safeParams.text : undefined);
1193
+ if (typeof failedLocator === "string" && failedLocator) {
1194
+ const method = toolName.startsWith("browser_") ? "cdp"
1195
+ : toolName.includes("ocr") ? "ocr"
1196
+ : "ax";
1197
+ learningEngine.recordLocatorOutcome({
1198
+ bundleId: learnBundleIdErr,
1199
+ actionKey: toolName,
1200
+ locator: failedLocator,
1201
+ method,
1202
+ success: false,
1203
+ });
1204
+ // Record failed pattern to patterns.jsonl
1205
+ learningEngine.recordPattern({
1206
+ bundleId: learnBundleIdErr,
1207
+ tool: toolName,
1208
+ locator: failedLocator,
1209
+ method,
1210
+ success: false,
1211
+ });
1212
+ }
1213
+ // ── POST-CALL: record failure in app mastery map ──
1214
+ if (learnBundleIdErr !== "unknown") {
1215
+ try {
1216
+ if (typeof failedLocator === "string" && failedLocator) {
1217
+ appMap.recordElementOutcome(learnBundleIdErr, "auto", failedLocator, false, contextTracker.currentPageContext ?? undefined);
1218
+ }
1219
+ // Record action failure
1220
+ const isFailedAction = ACTION_TOOLS.has(toolName);
1221
+ if (isFailedAction) {
1222
+ appMap.recordActionOutcome(learnBundleIdErr, false);
1223
+ }
1224
+ // Record feature signal failure (affects confidence and reliability)
1225
+ const failMapData = appMap.getLoaded(learnBundleIdErr);
1226
+ if (failMapData?.featureLadder) {
1227
+ const failSignal = [toolName, typeof failedLocator === "string" ? failedLocator : ""].join(" ").toLowerCase();
1228
+ const failGeneratedSignals = appMap.getGeneratedSignals(learnBundleIdErr) ?? {};
1229
+ for (const feature of failMapData.featureLadder) {
1230
+ const fm = failMapData.featureMastery?.[feature.id];
1231
+ if (!fm || fm.depth === 0)
1232
+ continue; // Only track failures on features we've seen
1233
+ // Check feature ID match OR keyword match (same as success path)
1234
+ const featureInSignal = failSignal.includes(feature.id.replace(/_/g, " "));
1235
+ const keywords = failGeneratedSignals[feature.id];
1236
+ const keywordMatch = keywords?.some((kw) => failSignal.includes(kw));
1237
+ if (featureInSignal || keywordMatch) {
1238
+ appMap.recordFeatureSignal(learnBundleIdErr, feature.id, fm.depth, false);
1239
+ }
1240
+ }
1241
+ }
1242
+ }
1243
+ catch { /* app map update non-fatal */ }
1244
+ }
1245
+ // ── Capture failure for playbook recording ──
1246
+ if (mcpRecorder.isRecording) {
1247
+ mcpRecorder.captureToolCall(toolName, safeParams, false, errorMsg, durationMs);
1248
+ }
231
1249
  // Record strategy failure if we were following one
232
1250
  if (activeStrategyFingerprint) {
233
1251
  memory.recordStrategyOutcome(activeStrategyFingerprint, false);
@@ -252,6 +1270,9 @@ server.tool = (...args) => {
252
1270
  }
253
1271
  throw err;
254
1272
  }
1273
+ finally {
1274
+ currentAdaptiveBudget = null;
1275
+ }
255
1276
  };
256
1277
  const newArgs = [...args];
257
1278
  newArgs[handlerIdx] = wrappedHandler;
@@ -263,31 +1284,236 @@ server.tool = (...args) => {
263
1284
  server.tool("apps", "List all running applications with bundle IDs and PIDs", {}, async () => {
264
1285
  await ensureBridge();
265
1286
  const apps = await bridge.call("app.list");
1287
+ // L3-04 fix: Some Electron apps (Slack, Discord) don't appear in NSWorkspace.runningApplications
1288
+ // despite being visible with windows. Augment with frontmost app if missing from list.
1289
+ try {
1290
+ const front = await bridge.call("app.frontmost", {});
1291
+ if (front.pid && !apps.some((a) => a.pid === front.pid)) {
1292
+ apps.push({ ...front, isActive: true });
1293
+ }
1294
+ }
1295
+ catch { /* ignore */ }
1296
+ // Also augment from window list — any app with visible windows should appear.
1297
+ // Filter out XPC services and system helpers that own tiny overlay windows.
1298
+ try {
1299
+ const wins = await bridge.call("app.windows");
1300
+ const appPids = new Set(apps.map((a) => a.pid));
1301
+ const seenWinPids = new Set();
1302
+ for (const w of wins) {
1303
+ const wPid = w.pid || w.ownerPid;
1304
+ const bid = w.bundleId || "";
1305
+ // Skip XPC services, system helpers, and loginwindow — not real user apps
1306
+ if (!wPid || appPids.has(wPid) || seenWinPids.has(wPid))
1307
+ continue;
1308
+ if (bid.includes(".xpc.") || bid === "com.apple.loginwindow" || bid === "unknown" || bid === "")
1309
+ continue;
1310
+ // Only include if the window has meaningful size (at least 50x50)
1311
+ const b = w.bounds || {};
1312
+ if ((b.width || 0) < 50 || (b.height || 0) < 50)
1313
+ continue;
1314
+ seenWinPids.add(wPid);
1315
+ apps.push({
1316
+ bundleId: bid,
1317
+ name: w.appName || "Unknown",
1318
+ pid: wPid,
1319
+ isActive: false,
1320
+ });
1321
+ }
1322
+ }
1323
+ catch { /* ignore */ }
266
1324
  const lines = apps.map((a) => `${a.name} (${a.bundleId}) pid=${a.pid}${a.isActive ? " ← active" : ""}`);
267
1325
  return { content: [{ type: "text", text: lines.join("\n") }] };
268
1326
  });
269
1327
  server.tool("windows", "List all visible windows with IDs, positions, and sizes", {}, async () => {
270
1328
  await ensureBridge();
271
1329
  const wins = await bridge.call("app.windows");
272
- const lines = wins.map((w) => {
1330
+ // Filter to meaningful windows: must have a title or reasonable size (>50x50)
1331
+ const meaningful = wins.filter((w) => {
1332
+ const b = w.bounds || {};
1333
+ const hasTitle = w.title && w.title.length > 0;
1334
+ const hasSize = (b.width || 0) > 50 && (b.height || 0) > 50;
1335
+ return hasTitle || hasSize;
1336
+ });
1337
+ const lines = meaningful.map((w) => {
273
1338
  const b = w.bounds || {};
274
- return `[${w.windowId}] ${w.appName} "${w.title}" (${Math.round(b.x || 0)},${Math.round(b.y || 0)}) ${Math.round(b.width || 0)}x${Math.round(b.height || 0)}`;
1339
+ const onScreen = w.isOnScreen === false ? " [minimized]" : "";
1340
+ return `[${w.windowId}] ${w.appName} "${w.title}" (${Math.round(b.x || 0)},${Math.round(b.y || 0)}) ${Math.round(b.width || 0)}x${Math.round(b.height || 0)}${onScreen}`;
275
1341
  });
276
1342
  return { content: [{ type: "text", text: lines.join("\n") }] };
277
1343
  });
278
- server.tool("focus", "Focus/activate an application", {
1344
+ server.tool("focus", "Focus/activate an application (or a specific window by windowId)", {
279
1345
  bundleId: z.string().describe("App bundle ID, e.g. com.apple.Safari"),
280
- }, async ({ bundleId }) => {
1346
+ windowId: z.number().optional().describe("Specific window ID from windows() raises that exact window. Use when multiple instances of the same app exist."),
1347
+ }, async ({ bundleId, windowId }) => {
281
1348
  await ensureBridge();
282
- await bridge.call("app.focus", { bundleId });
283
- return { content: [{ type: "text", text: "Focused " + bundleId }] };
1349
+ // Serialize focus calls: only one can run at a time, since only one app can be frontmost.
1350
+ // Without this, N concurrent focus() calls generate N*5 bridge calls that crash the bridge.
1351
+ let resolve;
1352
+ const prev = focusLock;
1353
+ focusLock = new Promise(r => { resolve = r; });
1354
+ await prev;
1355
+ try {
1356
+ // Step 0: Verify the app is actually running — fail fast with error content
1357
+ const runningApps = await bridge.call("app.list", {});
1358
+ let targetApp = runningApps?.find((a) => a.bundleId === bundleId);
1359
+ if (!targetApp) {
1360
+ // L3-04 fix: Some Electron apps (Slack, Discord) don't appear in app.list.
1361
+ // Check if they have visible windows before rejecting.
1362
+ try {
1363
+ const wins = await bridge.call("app.windows");
1364
+ const appWin = wins?.find((w) => w.bundleId === bundleId);
1365
+ if (appWin) {
1366
+ targetApp = { bundleId, name: appWin.appName, pid: appWin.pid || appWin.ownerPid };
1367
+ }
1368
+ }
1369
+ catch { /* ignore */ }
1370
+ if (!targetApp) {
1371
+ return { content: [{ type: "text", text: `Error: ${bundleId} is not running. Use launch("${bundleId}") first.` }], isError: true };
1372
+ }
1373
+ }
1374
+ // Step 1: Focus — use window.focus(windowId) when provided (L3-01 fix: precise window targeting)
1375
+ // This solves multi-instance Electron apps where bundleId-based focus raises the wrong window.
1376
+ let bridgeFocusError;
1377
+ try {
1378
+ if (windowId != null) {
1379
+ await bridge.call("window.focus", { windowId });
1380
+ }
1381
+ else {
1382
+ await bridge.call("app.focus", { bundleId });
1383
+ }
1384
+ }
1385
+ catch (e) {
1386
+ bridgeFocusError = e?.message ?? String(e);
1387
+ }
1388
+ // Step 2: Verify right after a 150ms settle delay — the macOS window server completes the focus transition asynchronously.
1389
+ // 50ms was too short on cold start; 150ms handles even first-launch activation delays.
1390
+ await new Promise(r => setTimeout(r, 150));
1391
+ let focusMsg = "Focused " + bundleId;
1392
+ try {
1393
+ const front = await bridge.call("app.frontmost", {});
1394
+ if (front.bundleId !== bundleId) {
1395
+ // MCP-level retry: AppleScript activation as final fallback
1396
+ try {
1397
+ await bridge.call("as.run", { script: `tell application id "${bundleId}" to activate` });
1398
+ await new Promise(r => setTimeout(r, 200));
1399
+ const front2 = await bridge.call("app.frontmost", {});
1400
+ if (front2.bundleId === bundleId) {
1401
+ focusMsg = "Focused " + bundleId;
1402
+ }
1403
+ else {
1404
+ focusMsg = `Warning: focus requested for ${bundleId} but ${front2.bundleId} (${front2.name}) is frontmost. Try again or use launch() first.`;
1405
+ }
1406
+ }
1407
+ catch {
1408
+ focusMsg = `Warning: focus requested for ${bundleId} but ${front.bundleId} (${front.name}) is frontmost. Try again or use launch() first.`;
1409
+ }
1410
+ }
1411
+ }
1412
+ catch {
1413
+ if (bridgeFocusError) {
1414
+ focusMsg = `Warning: ${bridgeFocusError}. Call apps() to check if ${bundleId} is running.`;
1415
+ }
1416
+ }
1417
+ // Step 3: World model + perception (best-effort, after verification)
1418
+ try {
1419
+ const apps = await bridge.call("app.list", {});
1420
+ const app = apps?.find((a) => a.bundleId === bundleId);
1421
+ if (app) {
1422
+ let windowId;
1423
+ try {
1424
+ windowId = await resolveWindowId(app.pid);
1425
+ }
1426
+ catch { /* best-effort */ }
1427
+ if (windowId != null) {
1428
+ try {
1429
+ await bridge.call("window.focus", { windowId });
1430
+ }
1431
+ catch { /* best-effort */ }
1432
+ }
1433
+ const ctx = { bundleId, appName: app.name ?? bundleId, pid: app.pid, windowTitle: "", ...(windowId != null ? { windowId } : {}) };
1434
+ worldModel.updateFocusedApp(ctx);
1435
+ lastKnownBundleId = bundleId;
1436
+ try {
1437
+ await perceptionManager.ensureStarted(ctx);
1438
+ installSafariEnricher(bundleId);
1439
+ }
1440
+ catch { /* best-effort */ }
1441
+ }
1442
+ }
1443
+ catch { /* app.list failed — world model update is best-effort */ }
1444
+ return { content: [{ type: "text", text: focusMsg }] };
1445
+ }
1446
+ finally {
1447
+ resolve();
1448
+ }
284
1449
  });
285
- server.tool("launch", "Launch an application", {
1450
+ server.tool("launch", "Launch an application. Chrome/Chromium browsers are launched with CDP enabled (port 9222) for browser_* tools.", {
286
1451
  bundleId: z.string().describe("App bundle ID"),
287
- }, async ({ bundleId }) => {
1452
+ cdpPort: z.number().optional().describe("CDP port for Chrome/Chromium (default: 9222). Ignored for non-browser apps."),
1453
+ }, async ({ bundleId, cdpPort }) => {
288
1454
  await ensureBridge();
289
- const r = await bridge.call("app.launch", { bundleId });
290
- return { content: [{ type: "text", text: `Launched ${r.appName} pid=${r.pid}` }] };
1455
+ const riskyBundleIds = {
1456
+ "com.apple.Terminal": "Terminal",
1457
+ "com.apple.ScriptEditor2": "Script Editor",
1458
+ "com.googlecode.iterm2": "iTerm",
1459
+ "com.apple.ActivityMonitor": "Activity Monitor",
1460
+ };
1461
+ // Chrome/Chromium: launch with CDP enabled so browser_* tools work immediately
1462
+ const chromeBundleIds = {
1463
+ "com.google.Chrome": "Google Chrome",
1464
+ "com.google.Chrome.canary": "Google Chrome Canary",
1465
+ "com.brave.Browser": "Brave Browser",
1466
+ "com.microsoft.edgemac": "Microsoft Edge",
1467
+ "org.chromium.Chromium": "Chromium",
1468
+ };
1469
+ const chromeAppName = chromeBundleIds[bundleId];
1470
+ let r;
1471
+ if (chromeAppName) {
1472
+ const port = cdpPort ?? 9222;
1473
+ try {
1474
+ // Spawn Chrome binary directly with --remote-debugging-port.
1475
+ // Must use a dedicated user-data-dir because Chrome ignores the CDP flag
1476
+ // when the default profile is already locked by a previous instance.
1477
+ const { spawn } = await import("child_process");
1478
+ const os = await import("os");
1479
+ const chromeBinary = `/Applications/${chromeAppName}.app/Contents/MacOS/${chromeAppName}`;
1480
+ const cdpProfile = `${os.tmpdir()}/screenhand-cdp-${port}`;
1481
+ const proc = spawn(chromeBinary, [
1482
+ `--remote-debugging-port=${port}`,
1483
+ `--user-data-dir=${cdpProfile}`,
1484
+ ], { detached: true, stdio: "ignore" });
1485
+ proc.unref();
1486
+ // Wait for Chrome to start, then get its PID
1487
+ await new Promise(res => setTimeout(res, 1500));
1488
+ const apps = await bridge.call("app.list", {});
1489
+ const chromeApp = apps?.find((a) => a.bundleId === bundleId);
1490
+ r = { pid: chromeApp?.pid ?? 0, appName: chromeApp?.name ?? bundleId };
1491
+ }
1492
+ catch {
1493
+ // Fallback to normal launch if CDP launch fails
1494
+ r = await bridge.call("app.launch", { bundleId });
1495
+ }
1496
+ }
1497
+ else {
1498
+ r = await bridge.call("app.launch", { bundleId });
1499
+ }
1500
+ const riskyName = riskyBundleIds[bundleId];
1501
+ // Auto-start perception for the launched app
1502
+ try {
1503
+ const windowId = await resolveWindowId(r.pid);
1504
+ await perceptionManager.ensureStarted({ bundleId, appName: r.appName ?? bundleId, pid: r.pid, windowTitle: "", ...(windowId != null ? { windowId } : {}) });
1505
+ installSafariEnricher(bundleId);
1506
+ }
1507
+ catch { /* perception start is best-effort */ }
1508
+ let msg = `Launched ${r.appName} pid=${r.pid}`;
1509
+ if (chromeAppName) {
1510
+ const port = cdpPort ?? 9222;
1511
+ msg += `\nCDP enabled on port ${port} — browser_* tools ready`;
1512
+ }
1513
+ if (riskyName) {
1514
+ msg += `\nWarning: launching ${riskyName} \u2014 this app can execute arbitrary commands`;
1515
+ }
1516
+ return { content: [{ type: "text", text: msg }] };
291
1517
  });
292
1518
  // ═══════════════════════════════════════════════
293
1519
  // INSPECT — see what's on screen (debugging/design)
@@ -298,12 +1524,27 @@ server.tool("screenshot", "Take a screenshot and OCR it. Returns all visible tex
298
1524
  await ensureBridge();
299
1525
  let shot;
300
1526
  if (windowId) {
301
- shot = await bridge.call("cg.captureWindow", { windowId });
1527
+ shot = await bridge.call("cg.captureWindow", { windowId, safeCLI: isBrowserApp() });
302
1528
  }
303
1529
  else {
304
1530
  shot = await bridge.call("cg.captureScreen");
305
1531
  }
306
1532
  const ocr = await bridge.call("vision.ocr", { imagePath: shot.path });
1533
+ // Feed OCR regions into world model
1534
+ try {
1535
+ if (windowId && Array.isArray(ocr.regions) && ocr.regions.length > 0) {
1536
+ worldModel.ingestOCRRegions(windowId, ocr.regions.map((r) => ({
1537
+ text: r.text,
1538
+ bounds: {
1539
+ x: r.bounds.x,
1540
+ y: r.bounds.y,
1541
+ width: r.bounds.width,
1542
+ height: r.bounds.height,
1543
+ },
1544
+ })));
1545
+ }
1546
+ }
1547
+ catch { /* world model update is best-effort */ }
307
1548
  return { content: [{ type: "text", text: `Screenshot: ${shot.width}x${shot.height} (${shot.path})\n\n${ocr.text}` }] };
308
1549
  });
309
1550
  server.tool("screenshot_file", "Take a screenshot and return the file path (for viewing the actual image)", {
@@ -312,7 +1553,7 @@ server.tool("screenshot_file", "Take a screenshot and return the file path (for
312
1553
  await ensureBridge();
313
1554
  let shot;
314
1555
  if (windowId) {
315
- shot = await bridge.call("cg.captureWindow", { windowId });
1556
+ shot = await bridge.call("cg.captureWindow", { windowId, safeCLI: isBrowserApp() });
316
1557
  }
317
1558
  else {
318
1559
  shot = await bridge.call("cg.captureScreen");
@@ -325,7 +1566,7 @@ server.tool("ocr", "OCR a window with element positions. SLOW — prefer ui_tree
325
1566
  await ensureBridge();
326
1567
  let shot;
327
1568
  if (windowId) {
328
- shot = await bridge.call("cg.captureWindow", { windowId });
1569
+ shot = await bridge.call("cg.captureWindow", { windowId, safeCLI: isBrowserApp() });
329
1570
  }
330
1571
  else {
331
1572
  shot = await bridge.call("cg.captureScreen");
@@ -337,7 +1578,28 @@ server.tool("ocr", "OCR a window with element positions. SLOW — prefer ui_tree
337
1578
  const win = wins.find((w) => w.windowId === windowId);
338
1579
  winBounds = win?.bounds;
339
1580
  }
340
- const regions = ocr.regions.map((r) => `"${r.text}" (${Math.round(r.bounds.x)},${Math.round(r.bounds.y)}) ${Math.round(r.bounds.width)}x${Math.round(r.bounds.height)}`);
1581
+ const regions = ocr.regions.map((r) => {
1582
+ let text = redactSensitiveLabel(r.text);
1583
+ text = redactUsername(text);
1584
+ // Redact URLs in OCR text
1585
+ text = text.replace(/https?:\/\/[^\s"'`]+/g, (url) => sanitizeUrl(url));
1586
+ return `"${text}" (${Math.round(r.bounds.x)},${Math.round(r.bounds.y)}) ${Math.round(r.bounds.width)}x${Math.round(r.bounds.height)}`;
1587
+ });
1588
+ // Feed OCR regions into world model
1589
+ try {
1590
+ if (windowId && Array.isArray(ocr.regions) && ocr.regions.length > 0) {
1591
+ worldModel.ingestOCRRegions(windowId, ocr.regions.map((r) => ({
1592
+ text: r.text,
1593
+ bounds: {
1594
+ x: r.bounds.x,
1595
+ y: r.bounds.y,
1596
+ width: r.bounds.width,
1597
+ height: r.bounds.height,
1598
+ },
1599
+ })));
1600
+ }
1601
+ }
1602
+ catch { /* world model update is best-effort */ }
341
1603
  return {
342
1604
  content: [{
343
1605
  type: "text",
@@ -358,13 +1620,32 @@ server.tool("ui_tree", "PREFERRED: Get the full UI element tree of an app via Ac
358
1620
  maxDepth: z.number().optional().describe("Max depth (default 4). Use 2 for overview, 6+ for deep inspection."),
359
1621
  }, async ({ pid, maxDepth }) => {
360
1622
  await ensureBridge();
1623
+ // Check if PID is running before querying AX tree (L3-04: uses fallback checks)
1624
+ if (!(await isPidRunning(pid))) {
1625
+ return { content: [{ type: "text", text: `PID ${pid} is not running. Call apps() to get current PIDs.` }] };
1626
+ }
361
1627
  const tree = await bridge.call("ax.getElementTree", { pid, maxDepth: maxDepth || 4 });
1628
+ // Feed AX tree into world model for state tracking
1629
+ try {
1630
+ const wins = await bridge.call("window.list", {});
1631
+ const win = wins?.find((w) => w.pid === pid);
1632
+ if (win) {
1633
+ worldModel.ingestAXTree(win.windowId, tree, {
1634
+ bundleId: win.bundleId ?? "",
1635
+ appName: win.bundleId ?? "",
1636
+ pid,
1637
+ windowTitle: win.title ?? "",
1638
+ windowId: win.windowId,
1639
+ });
1640
+ }
1641
+ }
1642
+ catch { /* ignore — world model update is best-effort */ }
362
1643
  function format(node, depth) {
363
1644
  let line = " ".repeat(depth) + (node.role || "?");
364
1645
  if (node.title)
365
1646
  line += ` "${node.title}"`;
366
1647
  if (node.value)
367
- line += ` =${String(node.value).slice(0, 60)}`;
1648
+ line += ` =${String(node.value).slice(0, 200)}`;
368
1649
  if (node.bounds)
369
1650
  line += ` (${Math.round(node.bounds.x)},${Math.round(node.bounds.y)} ${Math.round(node.bounds.width)}x${Math.round(node.bounds.height)})`;
370
1651
  let result = line;
@@ -374,34 +1655,107 @@ server.tool("ui_tree", "PREFERRED: Get the full UI element tree of an app via Ac
374
1655
  }
375
1656
  return result;
376
1657
  }
377
- return { content: [{ type: "text", text: format(tree, 0) }] };
1658
+ return { content: [{ type: "text", text: redactUsername(format(tree, 0)) }] };
378
1659
  });
379
- server.tool("ui_find", "Find a specific UI element by text/title. Returns its role, bounds, and path.", {
1660
+ server.tool("ui_find", "Find a specific UI element by text, title, or value. Falls back to value search if title match fails (e.g. finds Safari URL bar by URL).", {
380
1661
  pid: z.number().describe("Process ID"),
381
- title: z.string().describe("Text to search for (partial match)"),
382
- }, async ({ pid, title }) => {
1662
+ title: z.string().describe("Text to search for — matches title first, then value (partial match)"),
1663
+ role: z.string().optional().describe("AX role filter, e.g. AXButton, AXMenuItem, AXTextField"),
1664
+ exact: z.boolean().optional().default(false).describe("Exact title match (default: partial)"),
1665
+ }, async ({ pid, title, role, exact }) => {
383
1666
  await ensureBridge();
384
- const r = await bridge.call("ax.findElement", { pid, title, exact: false });
1667
+ if (!(await isPidRunning(pid))) {
1668
+ return { content: [{ type: "text", text: `PID ${pid} is not running. Call apps() to get current PIDs.` }] };
1669
+ }
1670
+ let r;
1671
+ try {
1672
+ r = await bridge.call("ax.findElement", { pid, title, exact, ...(role ? { role } : {}) });
1673
+ }
1674
+ catch {
1675
+ // Title search failed — retry searching by value (e.g. AXTextField with URL as value)
1676
+ r = await bridge.call("ax.findElement", { pid, value: title, exact, ...(role ? { role } : {}) });
1677
+ }
1678
+ // Feed found element into world model as a minimal AX subtree
1679
+ try {
1680
+ if (r && r.role) {
1681
+ const wins = await bridge.call("window.list", {});
1682
+ const win = wins?.find((w) => w.pid === pid);
1683
+ if (win) {
1684
+ const subtree = {
1685
+ role: r.role,
1686
+ title: r.title ?? null,
1687
+ value: r.value ?? null,
1688
+ enabled: r.enabled ?? true,
1689
+ focused: r.focused ?? false,
1690
+ children: r.children ?? [],
1691
+ };
1692
+ if (r.bounds) {
1693
+ subtree.position = { x: r.bounds.x, y: r.bounds.y };
1694
+ subtree.size = { width: r.bounds.width, height: r.bounds.height };
1695
+ }
1696
+ worldModel.ingestAXTree(win.windowId, subtree, {
1697
+ bundleId: win.bundleId ?? "",
1698
+ appName: win.bundleId ?? "",
1699
+ pid,
1700
+ windowTitle: win.title ?? "",
1701
+ windowId: win.windowId,
1702
+ });
1703
+ }
1704
+ }
1705
+ }
1706
+ catch { /* world model update is best-effort */ }
385
1707
  return { content: [{ type: "text", text: JSON.stringify(r, null, 2) }] };
386
1708
  });
387
1709
  server.tool("ui_press", "PREFERRED: Find and press/click a UI element by its title via Accessibility. Faster and more reliable than click_text — no screenshot needed.", {
388
1710
  pid: z.number().describe("Process ID"),
389
1711
  title: z.string().describe("Element title to find and press"),
390
- }, async ({ pid, title }) => {
1712
+ role: z.string().optional().describe("AX role filter, e.g. AXButton, AXMenuItem, AXTextField"),
1713
+ exact: z.boolean().optional().default(false).describe("Exact title match (default: partial)"),
1714
+ }, async ({ pid, title, role, exact }) => {
391
1715
  await ensureBridge();
392
- const el = await bridge.call("ax.findElement", { pid, title, exact: false });
1716
+ if (!(await isPidRunning(pid))) {
1717
+ return { content: [{ type: "text", text: `PID ${pid} is not running. Call apps() to get current PIDs.` }] };
1718
+ }
1719
+ let el;
1720
+ try {
1721
+ el = await bridge.call("ax.findElement", { pid, title, exact, ...(role ? { role } : {}) });
1722
+ }
1723
+ catch {
1724
+ try {
1725
+ // Fallback: search by value (buttons/controls may have value instead of title)
1726
+ el = await bridge.call("ax.findElement", { pid, value: title, exact, ...(role ? { role } : {}) });
1727
+ }
1728
+ catch {
1729
+ // Check if a system dialog is blocking — different process owns the frontmost window
1730
+ try {
1731
+ const front = await bridge.call("app.frontmost", {});
1732
+ if (front.pid !== pid) {
1733
+ return { content: [{ type: "text", text: `Element "${title}" not found in PID ${pid}. A system dialog from "${front.name}" (${front.bundleId}, PID ${front.pid}) may be blocking. Dismiss it first, or use click(x, y) to interact with the dialog directly.` }], isError: true };
1734
+ }
1735
+ }
1736
+ catch { /* ignore frontmost check failure */ }
1737
+ throw new Error(`Element "${title}" not found (searched title, value, and description)`);
1738
+ }
1739
+ }
393
1740
  await bridge.call("ax.performAction", { pid, elementPath: el.elementPath, action: "AXPress" });
394
- return { content: [{ type: "text", text: `Pressed "${el.title}" (${el.role})` }] };
1741
+ return { content: [{ type: "text", text: `Pressed "${el.title || el.description || el.value}" (${el.role})` }] };
395
1742
  });
396
- server.tool("ui_set_value", "Set the value of a UI element (text field, slider, etc.)", {
1743
+ server.tool("ui_set_value", "Set the value of a UI element (text field, slider, etc.). Searches by title first, falls back to value match.", {
397
1744
  pid: z.number().describe("Process ID"),
398
1745
  title: z.string().describe("Element title to find"),
399
1746
  value: z.string().describe("Value to set"),
400
1747
  }, async ({ pid, title, value }) => {
401
1748
  await ensureBridge();
402
- const el = await bridge.call("ax.findElement", { pid, title, exact: false });
1749
+ let el;
1750
+ try {
1751
+ el = await bridge.call("ax.findElement", { pid, title, exact: false });
1752
+ }
1753
+ catch {
1754
+ // Fallback: search by value (combo boxes, text fields often have no title)
1755
+ el = await bridge.call("ax.findElement", { pid, value: title, exact: false });
1756
+ }
403
1757
  await bridge.call("ax.setElementValue", { pid, elementPath: el.elementPath, value });
404
- return { content: [{ type: "text", text: `Set "${el.title}" = "${value}"` }] };
1758
+ return { content: [{ type: "text", text: `Set "${el.title || el.value}" = "${value}"` }] };
405
1759
  });
406
1760
  server.tool("menu_click", "Click a menu item in an app's menu bar", {
407
1761
  pid: z.number().describe("Process ID"),
@@ -417,73 +1771,309 @@ server.tool("menu_click", "Click a menu item in an app's menu bar", {
417
1771
  server.tool("click", "Click at screen coordinates", {
418
1772
  x: z.number().describe("Screen X"),
419
1773
  y: z.number().describe("Screen Y"),
420
- }, async ({ x, y }) => {
1774
+ button: z.enum(["left", "right", "middle"]).optional().default("left").describe("Mouse button (default: left)"),
1775
+ clickCount: z.number().optional().default(1).describe("Click count: 1=single, 2=double (word select), 3=triple (line select)"),
1776
+ modifiers: z.array(z.enum(["cmd", "shift", "alt", "ctrl"])).optional().describe("Hold modifier keys during click (e.g. ['cmd'] for cmd+click, ['shift'] for shift+click)"),
1777
+ pid: z.number().optional().describe("Target process ID for PID-targeted event delivery"),
1778
+ }, async ({ x, y, button, clickCount, modifiers, pid }) => {
421
1779
  await ensureBridge();
422
- await bridge.call("cg.mouseMove", { x, y });
1780
+ await bridge.call("cg.mouseMove", { x, y, targetPid: pid });
423
1781
  await new Promise(r => setTimeout(r, 50));
424
- await bridge.call("cg.mouseClick", { x, y });
425
- return { content: [{ type: "text", text: `Clicked (${x}, ${y})` }] };
1782
+ await bridge.call("cg.mouseClick", { x, y, button: button || "left", clickCount: clickCount || 1, modifiers: modifiers || [], targetPid: pid });
1783
+ const extras = [];
1784
+ if (modifiers?.length)
1785
+ extras.push(modifiers.join("+"));
1786
+ if (button && button !== "left")
1787
+ extras.push(button);
1788
+ if (clickCount && clickCount > 1)
1789
+ extras.push(clickCount === 2 ? "double" : `${clickCount}x`);
1790
+ return { content: [{ type: "text", text: `Clicked (${x}, ${y})${extras.length ? ` [${extras.join(", ")}]` : ""}` }] };
426
1791
  });
427
1792
  server.tool("click_text", "SLOW fallback: Find text on screen via OCR and click it. Use ui_press instead when possible — it's 10x faster. Only use this for canvas/image content where Accessibility doesn't work.", {
428
1793
  windowId: z.number().describe("Window ID"),
429
1794
  text: z.string().describe("Text to find and click"),
430
1795
  offset_y: z.number().optional().describe("Y offset from text center (e.g. -25 for icon above label)"),
431
- }, async ({ windowId, text, offset_y }) => {
1796
+ prefer: z.enum(["first", "largest", "topmost", "leftmost"]).optional().default("first").describe("Match preference when multiple OCR hits: largest (headers), topmost, leftmost (sidebar), first (OCR order)"),
1797
+ }, async ({ windowId, text, offset_y, prefer }) => {
432
1798
  await ensureBridge();
433
1799
  const wins = await bridge.call("app.windows");
434
1800
  const win = wins.find((w) => w.windowId === windowId);
435
1801
  if (!win)
436
1802
  return { content: [{ type: "text", text: "Window not found" }] };
437
1803
  const wb = win.bounds;
438
- const shot = await bridge.call("cg.captureWindow", { windowId });
1804
+ const shot = await bridge.call("cg.captureWindow", { windowId, safeCLI: isBrowserApp() });
439
1805
  const ocr = await bridge.call("vision.ocr", { imagePath: shot.path });
440
- const match = ocr.regions.find((r) => r.text.toLowerCase().includes(text.toLowerCase()));
441
- if (!match) {
442
- return { content: [{ type: "text", text: `"${text}" not found. Available: ${ocr.regions.map((r) => r.text).slice(0, 20).join(", ")}` }] };
443
- }
444
- const shadowL = (shot.width - wb.width * 2) / 2;
445
- const shadowT = (shot.height - wb.height * 2) / 3;
446
- const sx = wb.x + (match.bounds.x + match.bounds.width / 2 - shadowL) / 2;
447
- const sy = wb.y + (match.bounds.y + match.bounds.height / 2 - shadowT) / 2 + (offset_y || 0);
1806
+ const allMatches = ocr.regions.filter((r) => r.text.toLowerCase().includes(text.toLowerCase()));
1807
+ if (allMatches.length === 0) {
1808
+ return { content: [{ type: "text", text: `"${text}" not found. Available: ${ocr.regions.map((r) => r.text).slice(0, 20).join(", ")}` }], isError: true };
1809
+ }
1810
+ // Sort by preference strategy
1811
+ if (prefer === "largest") {
1812
+ allMatches.sort((a, b) => (b.bounds.width * b.bounds.height) - (a.bounds.width * a.bounds.height));
1813
+ }
1814
+ else if (prefer === "topmost") {
1815
+ allMatches.sort((a, b) => a.bounds.y - b.bounds.y);
1816
+ }
1817
+ else if (prefer === "leftmost") {
1818
+ allMatches.sort((a, b) => a.bounds.x - b.bounds.x);
1819
+ }
1820
+ const match = allMatches[0];
1821
+ // Convert OCR pixel coordinates to screen coordinates.
1822
+ // shot.width/height are in pixels; wb.width/height are in screen points.
1823
+ // The scale factor handles both Retina (2x) and non-Retina (1x) displays.
1824
+ //
1825
+ // L3-05 fix: Window captures now use boundsIgnoreFraming to exclude shadow,
1826
+ // so image dimensions match window bounds × backing scale (2x on Retina).
1827
+ // Simple ratio mapping: OCR pixels → screen points.
1828
+ const scaleX = shot.width > 0 ? wb.width / shot.width : 1;
1829
+ const scaleY = shot.height > 0 ? wb.height / shot.height : 1;
1830
+ const centerPixelX = match.bounds.x + match.bounds.width / 2;
1831
+ const centerPixelY = match.bounds.y + match.bounds.height / 2;
1832
+ let sx = Math.round(wb.x + centerPixelX * scaleX);
1833
+ let sy = Math.round(wb.y + centerPixelY * scaleY + (offset_y || 0));
1834
+ // Clamp to window bounds — OCR boxes can extend slightly beyond the window
1835
+ sx = Math.max(wb.x + 2, Math.min(sx, wb.x + wb.width - 2));
1836
+ sy = Math.max(wb.y + 2, Math.min(sy, wb.y + wb.height - 2));
448
1837
  await bridge.call("cg.mouseMove", { x: sx, y: sy });
449
- await new Promise(r => setTimeout(r, 50));
1838
+ await new Promise(r => setTimeout(r, 80)); // 80ms dwell — longer than 50ms helps dense UIs register hover
450
1839
  await bridge.call("cg.mouseClick", { x: sx, y: sy });
451
- return { content: [{ type: "text", text: `Clicked "${match.text}" at (${Math.round(sx)}, ${Math.round(sy)})` }] };
1840
+ let response = `Clicked "${match.text}" at screen (${Math.round(sx)}, ${Math.round(sy)}) ` +
1841
+ `[OCR pixel: (${Math.round(match.bounds.x)}, ${Math.round(match.bounds.y)}) ${match.bounds.width}×${match.bounds.height}] ` +
1842
+ `[window: (${wb.x}, ${wb.y}) ${wb.width}×${wb.height}] ` +
1843
+ `[scale: ${scaleX.toFixed(3)}×${scaleY.toFixed(3)}]`;
1844
+ if (allMatches.length > 1) {
1845
+ response += ` [${allMatches.length} matches, used prefer="${prefer}"]`;
1846
+ response += `\n⚠ ${allMatches.length} matches found. Use prefer param or offset_y to disambiguate.`;
1847
+ }
1848
+ return { content: [{ type: "text", text: response }] };
452
1849
  });
453
- server.tool("type_text", "Type text using the keyboard", {
1850
+ server.tool("type_text", "Type text using the keyboard. Auto-detects Electron apps and routes through CDP for reliable editor input.", {
454
1851
  text: z.string().describe("Text to type"),
455
- }, async ({ text }) => {
456
- await ensureBridge();
457
- await bridge.call("cg.typeText", { text });
458
- return { content: [{ type: "text", text: "Typed: " + text }] };
459
- });
460
- server.tool("key", "Press a key combination", {
461
- combo: z.string().describe("Key combo: 'cmd+c', 'enter', 'cmd+shift+n', 'space'. Use + to separate."),
462
- }, async ({ combo }) => {
1852
+ pid: z.number().optional().describe("Target process ID for PID-targeted event delivery"),
1853
+ cdpPort: z.number().optional().describe("CDP port for Electron apps (e.g. 9229). When set, types via CDP instead of AX — fixes Copilot/panel focus theft."),
1854
+ }, async ({ text, pid, cdpPort: portOverride }) => {
463
1855
  await ensureBridge();
464
- await bridge.call("cg.keyCombo", { keys: combo.split("+") });
465
- return { content: [{ type: "text", text: "Key: " + combo }] };
466
- });
467
- server.tool("drag", "Drag from one point to another", {
468
- fromX: z.number(), fromY: z.number(),
469
- toX: z.number(), toY: z.number(),
470
- }, async ({ fromX, fromY, toX, toY }) => {
471
- await ensureBridge();
472
- await bridge.call("cg.mouseDrag", { fromX, fromY, toX, toY });
473
- return { content: [{ type: "text", text: `Dragged (${fromX},${fromY}) → (${toX},${toY})` }] };
474
- });
475
- server.tool("scroll", "Scroll at a position", {
476
- x: z.number(), y: z.number(),
1856
+ // Auto-resolve frontmost PID when none provided — global HID posting
1857
+ // fails silently in NSTextView apps (TextEdit, etc.), but PID-targeted
1858
+ // delivery works reliably in all apps.
1859
+ let targetPid = pid;
1860
+ if (!targetPid) {
1861
+ try {
1862
+ const front = await bridge.call("app.frontmost", {});
1863
+ targetPid = front.pid;
1864
+ }
1865
+ catch {
1866
+ // Fallback to global posting if frontmost detection fails
1867
+ }
1868
+ }
1869
+ // Verify the target process exists and has windows
1870
+ if (targetPid) {
1871
+ try {
1872
+ const apps = await bridge.call("app.list", {});
1873
+ let app = apps?.find((a) => a.pid === targetPid);
1874
+ if (!app) {
1875
+ // L3-04 fix: Some Electron apps (Slack, Discord) don't appear in NSWorkspace.runningApplications
1876
+ // despite being frontmost. Check app.frontmost as fallback before rejecting.
1877
+ try {
1878
+ const front = await bridge.call("app.frontmost", {});
1879
+ if (front.pid === targetPid) {
1880
+ app = front;
1881
+ }
1882
+ }
1883
+ catch { /* ignore */ }
1884
+ if (!app) {
1885
+ return { content: [{ type: "text", text: `PID ${targetPid} is not running. Call apps() to get current PIDs.` }] };
1886
+ }
1887
+ }
1888
+ const wins = await bridge.call("window.list", { pid: targetPid });
1889
+ if (!wins || wins.length === 0) {
1890
+ return { content: [{ type: "text", text: `Warning: PID ${targetPid} (${app.name}) has no windows. Keystrokes may be lost. Open a document first.` }] };
1891
+ }
1892
+ }
1893
+ catch {
1894
+ // Best-effort check — proceed with typing if validation fails
1895
+ }
1896
+ }
1897
+ // L3-02 fix: Raise the specific window before typing to ensure keystrokes land correctly.
1898
+ // Without this, Electron apps with multiple instances can lose keystrokes to the wrong window,
1899
+ // or text can go to a non-editor area (e.g. Walkthrough tab instead of editor).
1900
+ if (targetPid) {
1901
+ try {
1902
+ const winId = await resolveWindowId(targetPid);
1903
+ if (winId != null) {
1904
+ await bridge.call("window.focus", { windowId: winId });
1905
+ }
1906
+ }
1907
+ catch { /* best-effort — proceed with typing */ }
1908
+ }
1909
+ // L3-02 fix: Electron CDP typing — routes through CDP Input.dispatchKeyEvent
1910
+ // when cdpPort is specified or auto-detected. Solves Copilot chat / panel focus
1911
+ // theft where AX keystrokes go to chat input instead of Monaco editor.
1912
+ let electronCdpPort = portOverride;
1913
+ if (!electronCdpPort && targetPid) {
1914
+ // Auto-detect: probe Electron-common CDP ports, but ONLY use if the CDP target
1915
+ // belongs to the same app we're targeting. Without this check, typing to Slack
1916
+ // could get routed through VS Code's CDP port 9229.
1917
+ try {
1918
+ // Look up target app name for matching
1919
+ let targetAppName = "";
1920
+ try {
1921
+ const apps = await bridge.call("app.list", {});
1922
+ const app = apps?.find((a) => a.pid === targetPid);
1923
+ targetAppName = (app?.name || "").toLowerCase();
1924
+ if (!targetAppName) {
1925
+ const front = await bridge.call("app.frontmost", {});
1926
+ if (front.pid === targetPid)
1927
+ targetAppName = (front.name || "").toLowerCase();
1928
+ }
1929
+ }
1930
+ catch { /* ignore */ }
1931
+ for (const p of [9229, 9333]) {
1932
+ try {
1933
+ if (!CDP)
1934
+ CDP = (await import("chrome-remote-interface")).default;
1935
+ const version = await CDP.Version({ port: p });
1936
+ // Verify the CDP target matches the target app — check if the browser name
1937
+ // or any page title contains the app name (e.g. "Code" in VS Code page titles)
1938
+ const browserName = (version?.Browser || "").toLowerCase();
1939
+ if (targetAppName && !browserName.includes(targetAppName)) {
1940
+ // Double-check against page titles
1941
+ try {
1942
+ const targets = await CDP.List({ port: p });
1943
+ const titleMatch = targets?.some((t) => (t.title || "").toLowerCase().includes(targetAppName));
1944
+ if (!titleMatch)
1945
+ continue; // CDP doesn't belong to target app — skip
1946
+ }
1947
+ catch {
1948
+ continue;
1949
+ }
1950
+ }
1951
+ electronCdpPort = p;
1952
+ break;
1953
+ }
1954
+ catch { /* not available on this port */ }
1955
+ }
1956
+ }
1957
+ catch { /* auto-detect is best-effort */ }
1958
+ }
1959
+ if (electronCdpPort) {
1960
+ // CDP path: click editor to ensure focus, then type via key events
1961
+ try {
1962
+ const { client } = await getCDPClient(undefined, electronCdpPort);
1963
+ // Click the editor area to grab focus from Copilot/panels
1964
+ await client.Runtime.evaluate({
1965
+ expression: `(() => {
1966
+ const editor = document.querySelector('.monaco-editor .view-lines');
1967
+ if (editor) { editor.click(); return true; }
1968
+ // Generic fallback: focus the first contenteditable or active editor context
1969
+ const editable = document.querySelector('[contenteditable="true"]') || document.querySelector('.native-edit-context');
1970
+ if (editable) { editable.focus(); return true; }
1971
+ return false;
1972
+ })()`,
1973
+ returnByValue: true,
1974
+ });
1975
+ await randomDelay(30, 60);
1976
+ // Type character by character via CDP Input.dispatchKeyEvent
1977
+ for (const char of text) {
1978
+ await client.Input.dispatchKeyEvent({ type: "keyDown", text: char, key: char, unmodifiedText: char });
1979
+ await client.Input.dispatchKeyEvent({ type: "keyUp", text: char, key: char, unmodifiedText: char });
1980
+ await randomDelay(10, 30);
1981
+ }
1982
+ await client.close();
1983
+ const msg = `Typed via CDP (port ${electronCdpPort}): "${text}"`;
1984
+ return { content: [{ type: "text", text: msg }] };
1985
+ }
1986
+ catch (cdpErr) {
1987
+ // CDP failed — fall through to AX typing
1988
+ }
1989
+ }
1990
+ // AX path: standard cg.typeText via native bridge
1991
+ // L2-66 fix: Auto-chunk long text to prevent bridge timeout.
1992
+ // cg.typeText simulates individual keystrokes, so >500 chars can be slow.
1993
+ const CHUNK_SIZE = 500;
1994
+ if (text.length > CHUNK_SIZE) {
1995
+ for (let i = 0; i < text.length; i += CHUNK_SIZE) {
1996
+ const chunk = text.slice(i, i + CHUNK_SIZE);
1997
+ await bridge.call("cg.typeText", { text: chunk, targetPid });
1998
+ }
1999
+ }
2000
+ else {
2001
+ await bridge.call("cg.typeText", { text, targetPid });
2002
+ }
2003
+ const msg = targetPid ? `Typed to PID ${targetPid}: "${text}"` : "Typed: " + text;
2004
+ return { content: [{ type: "text", text: msg }] };
2005
+ });
2006
+ server.tool("key", "Press a key combination", {
2007
+ combo: z.string().describe("Key combo: 'cmd+c', 'enter', 'cmd+shift+n', 'space'. Use + to separate."),
2008
+ holdMs: z.number().optional().describe("Hold the key for this many ms (for accent picker, long-press menus). Default: tap."),
2009
+ pid: z.number().optional().describe("Target process ID for PID-targeted event delivery"),
2010
+ }, async ({ combo, holdMs, pid }) => {
2011
+ await ensureBridge();
2012
+ // Auto-resolve frontmost PID when none provided — ensures keystrokes
2013
+ // reach the correct app (same pattern as type_text auto-PID).
2014
+ let targetPid = pid;
2015
+ if (!targetPid) {
2016
+ try {
2017
+ const front = await bridge.call("app.frontmost", {});
2018
+ targetPid = front.pid;
2019
+ }
2020
+ catch { /* fallback to global posting */ }
2021
+ }
2022
+ const keys = combo.split("+");
2023
+ const hasModifier = keys.some(k => ["cmd", "ctrl", "alt", "shift"].includes(k.toLowerCase()));
2024
+ // macOS only processes modifier shortcuts (cmd+c, cmd+n, etc.) for the frontmost app.
2025
+ // When pid is targeted with modifiers, raise the specific window first.
2026
+ // L3-01 fix: use window.focus(windowId) instead of app.focus(bundleId) to avoid
2027
+ // targeting the wrong instance when multiple Electron apps share the same bundleId.
2028
+ if (targetPid && hasModifier) {
2029
+ try {
2030
+ const winId = await resolveWindowId(targetPid);
2031
+ if (winId != null) {
2032
+ await bridge.call("window.focus", { windowId: winId });
2033
+ }
2034
+ else {
2035
+ // Fallback to bundleId-based focus if no window found
2036
+ const apps = await bridge.call("app.list", {});
2037
+ const target = apps.find(a => a.pid === targetPid);
2038
+ if (target) {
2039
+ await bridge.call("app.focus", { bundleId: target.bundleId });
2040
+ }
2041
+ }
2042
+ }
2043
+ catch { /* focus is best-effort */ }
2044
+ }
2045
+ // Press-and-hold mode for accent picker / long-press menus
2046
+ if (holdMs && !hasModifier && keys.length === 1) {
2047
+ await bridge.call("cg.keyPressAndHold", { key: keys[0], durationMs: holdMs, targetPid });
2048
+ return { content: [{ type: "text", text: `Key held: ${combo} (${holdMs}ms)` + (targetPid ? ` (PID ${targetPid})` : "") }] };
2049
+ }
2050
+ await bridge.call("cg.keyCombo", { keys, targetPid });
2051
+ return { content: [{ type: "text", text: `Key: ${combo}` + (targetPid ? ` (PID ${targetPid})` : "") }] };
2052
+ });
2053
+ server.tool("drag", "Drag from one point to another", {
2054
+ fromX: z.number(), fromY: z.number(),
2055
+ toX: z.number(), toY: z.number(),
2056
+ modifiers: z.array(z.enum(["cmd", "shift", "alt", "ctrl"])).optional().describe("Hold modifier keys during drag (e.g. ['alt'] for option+drag copy in Finder)"),
2057
+ pid: z.number().optional().describe("Target process ID for PID-targeted event delivery"),
2058
+ }, async ({ fromX, fromY, toX, toY, modifiers, pid }) => {
2059
+ await ensureBridge();
2060
+ await bridge.call("cg.mouseDrag", { fromX, fromY, toX, toY, modifiers: modifiers || [], targetPid: pid });
2061
+ const modStr = modifiers?.length ? ` [${modifiers.join("+")}]` : "";
2062
+ return { content: [{ type: "text", text: `Dragged (${fromX},${fromY}) → (${toX},${toY})${modStr}` }] };
2063
+ });
2064
+ server.tool("scroll", "Scroll at a position", {
2065
+ x: z.number(), y: z.number(),
477
2066
  deltaX: z.number().optional().describe("Horizontal scroll (default 0)"),
478
2067
  deltaY: z.number().describe("Vertical scroll (negative = down)"),
479
- }, async ({ x, y, deltaX, deltaY }) => {
2068
+ pid: z.number().optional().describe("Target process ID for PID-targeted event delivery"),
2069
+ }, async ({ x, y, deltaX, deltaY, pid }) => {
480
2070
  await ensureBridge();
481
- await bridge.call("cg.scroll", { x, y, deltaX: deltaX || 0, deltaY });
2071
+ await bridge.call("cg.scroll", { x, y, deltaX: deltaX || 0, deltaY, targetPid: pid });
482
2072
  return { content: [{ type: "text", text: "Scrolled" }] };
483
2073
  });
484
2074
  // ── CDP helper: get client for a tab ──
485
- async function getCDPClient(tabId) {
486
- const { CDP: cdp, port } = await ensureCDP();
2075
+ async function getCDPClient(tabId, overridePort) {
2076
+ const { CDP: cdp, port } = await ensureCDP(overridePort);
487
2077
  let targetId = tabId;
488
2078
  if (!targetId) {
489
2079
  const targets = await cdp.List({ port });
@@ -493,6 +2083,11 @@ async function getCDPClient(tabId) {
493
2083
  targetId = page.id;
494
2084
  }
495
2085
  const client = await cdp({ port, target: targetId });
2086
+ // Activate CDP source in perception when a browser connection is established
2087
+ try {
2088
+ perceptionManager.activateCDP(client);
2089
+ }
2090
+ catch { /* best-effort */ }
496
2091
  return { client, targetId: targetId, CDP: cdp, port };
497
2092
  }
498
2093
  // ── Random delay helper ──
@@ -502,25 +2097,54 @@ function randomDelay(min, max) {
502
2097
  // ═══════════════════════════════════════════════
503
2098
  // BROWSER — control Chrome pages via CDP (10ms, not OCR)
504
2099
  // ═══════════════════════════════════════════════
505
- server.tool("browser_tabs", "List all open Chrome tabs", {}, async () => {
506
- const { CDP: cdp, port } = await ensureCDP();
2100
+ server.tool("browser_tabs", "List all open Chrome/Electron tabs. Use cdpPort to connect to a specific app (e.g. 9333 for Codex Desktop).", {
2101
+ cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps). Omit to auto-detect."),
2102
+ }, async ({ cdpPort: portOverride }) => {
2103
+ const { CDP: cdp, port } = await ensureCDP(portOverride);
507
2104
  const targets = await cdp.List({ port });
508
2105
  const pages = targets.filter((t) => t.type === "page");
509
2106
  const lines = pages.map((t) => `[${t.id}] ${t.title} — ${t.url}`);
510
2107
  return { content: [{ type: "text", text: lines.join("\n") || "No tabs open" }] };
511
2108
  });
512
- server.tool("browser_open", "Open a URL in Chrome (creates new tab)", {
2109
+ server.tool("browser_open", "Open a URL in Chrome/Electron (creates new tab)", {
513
2110
  url: z.string().describe("URL to open"),
514
- }, async ({ url }) => {
515
- const { CDP: cdp, port } = await ensureCDP();
2111
+ cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
2112
+ }, async ({ url, cdpPort: portOverride }) => {
2113
+ // L2-71 fix: Block dangerous URL protocols
2114
+ const BLOCKED_PROTOCOLS = ["javascript:", "data:", "blob:", "vbscript:"];
2115
+ const urlLower = url.trim().toLowerCase();
2116
+ for (const proto of BLOCKED_PROTOCOLS) {
2117
+ if (urlLower.startsWith(proto)) {
2118
+ throw new Error(`Blocked: "${proto}" URLs are not allowed in browser_open for security reasons.`);
2119
+ }
2120
+ }
2121
+ // Capture bundleId BEFORE CDP call to prevent focus-change race
2122
+ const browserBundleId = worldModel.getState().focusedApp?.bundleId ?? "com.google.Chrome";
2123
+ const { CDP: cdp, port } = await ensureCDP(portOverride);
516
2124
  const target = await cdp.New({ port, url });
2125
+ // Feed new tab into world model
2126
+ try {
2127
+ worldModel.ingestCDPSnapshot(browserBundleId, url, target.title ?? url);
2128
+ }
2129
+ catch { /* world model update is best-effort */ }
517
2130
  return { content: [{ type: "text", text: `Opened: ${target.id} — ${url}` }] };
518
2131
  });
519
- server.tool("browser_navigate", "Navigate the active Chrome tab to a URL", {
2132
+ server.tool("browser_navigate", "Navigate the active Chrome/Electron tab to a URL", {
520
2133
  url: z.string().describe("URL to navigate to"),
521
2134
  tabId: z.string().optional().describe("Tab ID (from browser_tabs). Omit for most recent tab."),
522
- }, async ({ url, tabId }) => {
523
- const { CDP: cdp, port } = await ensureCDP();
2135
+ cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
2136
+ }, async ({ url, tabId, cdpPort: portOverride }) => {
2137
+ // L2-71 fix: Block dangerous URL protocols that could execute arbitrary code
2138
+ const BLOCKED_PROTOCOLS = ["javascript:", "data:", "blob:", "vbscript:"];
2139
+ const urlLower = url.trim().toLowerCase();
2140
+ for (const proto of BLOCKED_PROTOCOLS) {
2141
+ if (urlLower.startsWith(proto)) {
2142
+ throw new Error(`Blocked: "${proto}" URLs are not allowed in browser_navigate for security reasons. Use browser_js for JavaScript execution.`);
2143
+ }
2144
+ }
2145
+ // Capture bundleId BEFORE CDP call to prevent focus-change race
2146
+ const browserBundleId = worldModel.getState().focusedApp?.bundleId ?? "com.google.Chrome";
2147
+ const { CDP: cdp, port } = await ensureCDP(portOverride);
524
2148
  let targetId = tabId;
525
2149
  if (!targetId) {
526
2150
  const targets = await cdp.List({ port });
@@ -540,16 +2164,23 @@ server.tool("browser_navigate", "Navigate the active Chrome tab to a URL", {
540
2164
  break;
541
2165
  await new Promise(r => setTimeout(r, 200));
542
2166
  }
543
- const title = await client.Runtime.evaluate({ expression: "document.title", returnByValue: true });
2167
+ const titleResult = await client.Runtime.evaluate({ expression: "document.title", returnByValue: true });
2168
+ const pageTitle = titleResult.result.value ?? "";
544
2169
  await client.close();
545
- return { content: [{ type: "text", text: `Navigated to: ${title.result.value}` }] };
2170
+ // Feed navigation result into world model
2171
+ try {
2172
+ worldModel.ingestCDPSnapshot(browserBundleId, url, pageTitle);
2173
+ }
2174
+ catch { /* world model update is best-effort */ }
2175
+ return { content: [{ type: "text", text: `Navigated to: ${pageTitle}` }] };
546
2176
  });
547
- server.tool("browser_js", "Execute JavaScript in a Chrome tab. Returns the result. WARNING: This runs arbitrary JS in the browser context — avoid on sensitive pages (banking, email). All executions are audit-logged.", {
2177
+ server.tool("browser_js", "Execute JavaScript in a Chrome/Electron tab. Returns the result. WARNING: This runs arbitrary JS in the browser context — avoid on sensitive pages (banking, email). All executions are audit-logged.", {
548
2178
  code: z.string().describe("JavaScript to execute. Must be an expression that returns a value. Use (() => { ... })() for multi-line."),
549
2179
  tabId: z.string().optional().describe("Tab ID. Omit for most recent tab."),
550
- }, async ({ code, tabId }) => {
551
- auditLog("browser_js", { code: code.slice(0, 500), tabId });
552
- const { CDP: cdp, port } = await ensureCDP();
2180
+ cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
2181
+ }, async ({ code, tabId, cdpPort: portOverride }) => {
2182
+ auditLog("browser_js", { code, tabId });
2183
+ const { CDP: cdp, port } = await ensureCDP(portOverride);
553
2184
  let targetId = tabId;
554
2185
  if (!targetId) {
555
2186
  const targets = await cdp.List({ port });
@@ -570,15 +2201,21 @@ server.tool("browser_js", "Execute JavaScript in a Chrome tab. Returns the resul
570
2201
  return { content: [{ type: "text", text: `JS Error: ${result.exceptionDetails.text}\n${result.exceptionDetails.exception?.description || ""}` }] };
571
2202
  }
572
2203
  const val = result.result.value;
573
- const text = typeof val === "object" ? JSON.stringify(val, null, 2) : String(val ?? "undefined");
2204
+ let text = typeof val === "object" ? JSON.stringify(val, null, 2) : String(val ?? "undefined");
2205
+ // Redact sensitive URLs and tokens in JS output
2206
+ text = text.replace(/https?:\/\/[^\s"'`]+/g, (url) => sanitizeUrl(url));
2207
+ text = redactSensitiveLabel(text);
574
2208
  return { content: [{ type: "text", text }] };
575
2209
  });
576
- server.tool("browser_dom", "Query the DOM of a Chrome page. Returns matching elements' text, attributes, and structure.", {
2210
+ server.tool("browser_dom", "Query the DOM of a Chrome/Electron page. Returns matching elements' text, attributes, and structure.", {
577
2211
  selector: z.string().describe("CSS selector, e.g. 'button', '.nav a', '#main h2'"),
578
2212
  tabId: z.string().optional().describe("Tab ID. Omit for most recent tab."),
579
2213
  limit: z.number().optional().describe("Max results (default 20)"),
580
- }, async ({ selector, tabId, limit }) => {
581
- const { CDP: cdp, port } = await ensureCDP();
2214
+ cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
2215
+ }, async ({ selector, tabId, limit, cdpPort: portOverride }) => {
2216
+ // Capture bundleId before any async CDP calls to avoid race condition
2217
+ const browserBundleId = worldModel.getState().focusedApp?.bundleId ?? "com.google.Chrome";
2218
+ const { CDP: cdp, port } = await ensureCDP(portOverride);
582
2219
  let targetId = tabId;
583
2220
  if (!targetId) {
584
2221
  const targets = await cdp.List({ port });
@@ -607,14 +2244,27 @@ server.tool("browser_dom", "Query the DOM of a Chrome page. Returns matching ele
607
2244
  })()`,
608
2245
  returnByValue: true,
609
2246
  });
2247
+ // Feed page info into world model while client is still open
2248
+ try {
2249
+ const pageInfo = await client.Runtime.evaluate({
2250
+ expression: `({ url: location.href, title: document.title })`,
2251
+ returnByValue: true,
2252
+ });
2253
+ const info = pageInfo.result.value;
2254
+ if (info?.url) {
2255
+ worldModel.ingestCDPSnapshot(browserBundleId, info.url, info.title ?? "");
2256
+ }
2257
+ }
2258
+ catch { /* world model update is best-effort */ }
610
2259
  await client.close();
611
2260
  return { content: [{ type: "text", text: JSON.stringify(result.result.value, null, 2) }] };
612
2261
  });
613
- server.tool("browser_click", "Click an element in Chrome by CSS selector. Uses CDP Input.dispatchMouseEvent for realistic mouse events.", {
2262
+ server.tool("browser_click", "Click an element in Chrome/Electron by CSS selector. Uses CDP Input.dispatchMouseEvent for realistic mouse events.", {
614
2263
  selector: z.string().describe("CSS selector of element to click"),
615
2264
  tabId: z.string().optional().describe("Tab ID. Omit for most recent tab."),
616
- }, async ({ selector, tabId }) => {
617
- const { client } = await getCDPClient(tabId);
2265
+ cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
2266
+ }, async ({ selector, tabId, cdpPort: portOverride }) => {
2267
+ const { client } = await getCDPClient(tabId, portOverride);
618
2268
  await client.Runtime.enable();
619
2269
  const result = await client.Runtime.evaluate({
620
2270
  expression: `(() => {
@@ -640,13 +2290,14 @@ server.tool("browser_click", "Click an element in Chrome by CSS selector. Uses C
640
2290
  await client.close();
641
2291
  return { content: [{ type: "text", text: `Clicked: "${val.text}" at (${Math.round(x)}, ${Math.round(y)})` }] };
642
2292
  });
643
- server.tool("browser_type", "Type into an input field in Chrome. Uses CDP Input.dispatchKeyEvent for real keyboard events (works with React/Angular).", {
2293
+ server.tool("browser_type", "Type into an input field in Chrome/Electron. Uses CDP Input.dispatchKeyEvent for real keyboard events (works with React/Angular).", {
644
2294
  selector: z.string().describe("CSS selector of the input"),
645
2295
  text: z.string().describe("Text to type"),
646
2296
  clear: z.boolean().optional().describe("Clear field first (default true)"),
647
2297
  tabId: z.string().optional().describe("Tab ID"),
648
- }, async ({ selector, text, clear, tabId }) => {
649
- const { client } = await getCDPClient(tabId);
2298
+ cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
2299
+ }, async ({ selector, text, clear, tabId, cdpPort: portOverride }) => {
2300
+ const { client } = await getCDPClient(tabId, portOverride);
650
2301
  await client.Runtime.enable();
651
2302
  // Focus the element
652
2303
  const focusResult = await client.Runtime.evaluate({
@@ -681,12 +2332,13 @@ server.tool("browser_type", "Type into an input field in Chrome. Uses CDP Input.
681
2332
  await client.close();
682
2333
  return { content: [{ type: "text", text: `Typed "${text}"` }] };
683
2334
  });
684
- server.tool("browser_wait", "Wait for a condition on a Chrome page", {
2335
+ server.tool("browser_wait", "Wait for a condition on a Chrome/Electron page", {
685
2336
  condition: z.string().describe("JS expression that returns truthy when ready. e.g. 'document.querySelector(\".loaded\")'"),
686
2337
  timeoutMs: z.number().optional().describe("Timeout in ms (default 10000)"),
687
2338
  tabId: z.string().optional().describe("Tab ID"),
688
- }, async ({ condition, timeoutMs, tabId }) => {
689
- const { CDP: cdp, port } = await ensureCDP();
2339
+ cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
2340
+ }, async ({ condition, timeoutMs, tabId, cdpPort: portOverride }) => {
2341
+ const { CDP: cdp, port } = await ensureCDP(portOverride);
690
2342
  let targetId = tabId;
691
2343
  if (!targetId) {
692
2344
  const targets = await cdp.List({ port });
@@ -712,8 +2364,11 @@ server.tool("browser_wait", "Wait for a condition on a Chrome page", {
712
2364
  });
713
2365
  server.tool("browser_page_info", "Get current page title, URL, and text content summary", {
714
2366
  tabId: z.string().optional().describe("Tab ID"),
715
- }, async ({ tabId }) => {
716
- const { CDP: cdp, port } = await ensureCDP();
2367
+ cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
2368
+ }, async ({ tabId, cdpPort: portOverride }) => {
2369
+ // Capture bundleId BEFORE CDP call to prevent focus-change race
2370
+ const browserBundleId = worldModel.getState().focusedApp?.bundleId ?? "com.google.Chrome";
2371
+ const { CDP: cdp, port } = await ensureCDP(portOverride);
717
2372
  let targetId = tabId;
718
2373
  if (!targetId) {
719
2374
  const targets = await cdp.List({ port });
@@ -733,6 +2388,14 @@ server.tool("browser_page_info", "Get current page title, URL, and text content
733
2388
  returnByValue: true,
734
2389
  });
735
2390
  await client.close();
2391
+ // Feed page info into world model
2392
+ try {
2393
+ const info = result.result.value;
2394
+ if (info?.url) {
2395
+ worldModel.ingestCDPSnapshot(browserBundleId, info.url, info.title ?? "");
2396
+ }
2397
+ }
2398
+ catch { /* world model update is best-effort */ }
736
2399
  return { content: [{ type: "text", text: JSON.stringify(result.result.value, null, 2) }] };
737
2400
  });
738
2401
  // ═══════════════════════════════════════════════
@@ -774,10 +2437,11 @@ if (origQuery) {
774
2437
  };
775
2438
  }
776
2439
  `;
777
- server.tool("browser_stealth", "Inject anti-detection patches into Chrome page. Call once after navigating to a protected site. Hides webdriver flag, patches plugins/languages/permissions.", {
2440
+ server.tool("browser_stealth", "Inject anti-detection patches into Chrome/Electron page. Call once after navigating to a protected site. Hides webdriver flag, patches plugins/languages/permissions.", {
778
2441
  tabId: z.string().optional().describe("Tab ID. Omit for most recent tab."),
779
- }, async ({ tabId }) => {
780
- const { client } = await getCDPClient(tabId);
2442
+ cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
2443
+ }, async ({ tabId, cdpPort: portOverride }) => {
2444
+ const { client } = await getCDPClient(tabId, portOverride);
781
2445
  await client.Page.enable();
782
2446
  await client.Page.addScriptToEvaluateOnNewDocument({ source: STEALTH_SCRIPT });
783
2447
  // Also evaluate immediately on current page
@@ -795,8 +2459,9 @@ server.tool("browser_fill_form", "Fill a form field with human-like typing (anti
795
2459
  clear: z.boolean().optional().describe("Clear field first (default true)"),
796
2460
  delayMs: z.number().optional().describe("Avg delay between keystrokes in ms (default 50)"),
797
2461
  tabId: z.string().optional().describe("Tab ID"),
798
- }, async ({ selector, text, clear, delayMs, tabId }) => {
799
- const { client } = await getCDPClient(tabId);
2462
+ cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
2463
+ }, async ({ selector, text, clear, delayMs, tabId, cdpPort: portOverride }) => {
2464
+ const { client } = await getCDPClient(tabId, portOverride);
800
2465
  await client.Runtime.enable();
801
2466
  // Focus the element
802
2467
  const focusResult = await client.Runtime.evaluate({
@@ -834,14 +2499,15 @@ server.tool("browser_fill_form", "Fill a form field with human-like typing (anti
834
2499
  await client.close();
835
2500
  return { content: [{ type: "text", text: `Typed "${text}" (${text.length} chars, human-like)` }] };
836
2501
  });
837
- server.tool("browser_human_click", "Click an element with realistic mouse events (anti-detection). Dispatches mouseMoved → mousePressed → mouseReleased at element coordinates.", {
2502
+ // browser_human_click alias for browser_click (both already use realistic mouse events)
2503
+ server.tool("browser_human_click", "Alias for browser_click — both use realistic mouseMoved → mousePressed → mouseReleased events. Prefer browser_click directly.", {
838
2504
  selector: z.string().describe("CSS selector of element to click"),
839
2505
  tabId: z.string().optional().describe("Tab ID. Omit for most recent tab."),
840
- }, async ({ selector, tabId }) => {
841
- const { client } = await getCDPClient(tabId);
2506
+ cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
2507
+ }, async ({ selector, tabId, cdpPort: portOverride }) => {
2508
+ const { client } = await getCDPClient(tabId, portOverride);
842
2509
  await client.Runtime.enable();
843
- // Get element center coordinates
844
- const rectResult = await client.Runtime.evaluate({
2510
+ const result = await client.Runtime.evaluate({
845
2511
  expression: `(() => {
846
2512
  const el = document.querySelector(${JSON.stringify(selector)});
847
2513
  if (!el) return { ok: false, reason: "Element not found: ${selector.replace(/"/g, '\\"')}" };
@@ -851,13 +2517,12 @@ server.tool("browser_human_click", "Click an element with realistic mouse events
851
2517
  })()`,
852
2518
  returnByValue: true,
853
2519
  });
854
- const val = rectResult.result.value;
2520
+ const val = result.result.value;
855
2521
  if (!val?.ok) {
856
2522
  await client.close();
857
2523
  return { content: [{ type: "text", text: val?.reason || "Element not found" }] };
858
2524
  }
859
2525
  const { x, y } = val;
860
- // Simulate realistic mouse event sequence
861
2526
  await client.Input.dispatchMouseEvent({ type: "mouseMoved", x, y });
862
2527
  await randomDelay(30, 60);
863
2528
  await client.Input.dispatchMouseEvent({ type: "mousePressed", x, y, button: "left", clickCount: 1 });
@@ -869,19 +2534,30 @@ server.tool("browser_human_click", "Click an element with realistic mouse events
869
2534
  // ═══════════════════════════════════════════════
870
2535
  // PLATFORM PLAYBOOKS — lazy-loaded site knowledge
871
2536
  // ═══════════════════════════════════════════════
872
- const playbooksDir = path.resolve(__dirname, "playbooks");
873
- server.tool("platform_guide", "Get automation guide for a platform (selectors, URLs, flows, error solutions). Available: devpost. Zero cost — only loads when called.", {
874
- platform: z.string().describe("Platform name, e.g. 'devpost'"),
2537
+ const coverageAuditor = new CoverageAuditor(referencesDir, playbooksDir, learningEngine, goalStore);
2538
+ server.tool("platform_guide", "Get automation guide for a platform (selectors, URLs, flows, error solutions). Reads from references/ (curated knowledge). Zero cost — only loads when called.", {
2539
+ platform: z.string().describe("Platform name, e.g. 'figma', 'x-twitter', 'devpost'"),
875
2540
  section: z.enum(["all", "urls", "flows", "selectors", "errors", "detection"]).optional().describe("Section to return (default: all). Use 'errors' for just error+solution pairs."),
876
2541
  }, async ({ platform, section }) => {
877
- const filePath = path.resolve(playbooksDir, `${platform.toLowerCase()}.json`);
2542
+ const safePlatName = platform.toLowerCase().replace(/[^a-z0-9_\-]/g, "_").slice(0, 100);
2543
+ const filePath = path.resolve(referencesDir, `${safePlatName}.json`);
2544
+ if (!filePath.startsWith(path.resolve(referencesDir))) {
2545
+ return { content: [{ type: "text", text: `Error: invalid platform name "${platform}"` }] };
2546
+ }
878
2547
  if (!fs.existsSync(filePath)) {
879
- const available = fs.existsSync(playbooksDir)
880
- ? fs.readdirSync(playbooksDir).filter(f => f.endsWith(".json")).map(f => f.replace(".json", ""))
2548
+ const available = fs.existsSync(referencesDir)
2549
+ ? fs.readdirSync(referencesDir).filter(f => f.endsWith(".json")).map(f => f.replace(".json", ""))
881
2550
  : [];
882
2551
  return { content: [{ type: "text", text: `No playbook for "${platform}". Available: ${available.join(", ") || "none"}` }] };
883
2552
  }
884
- const data = JSON.parse(fs.readFileSync(filePath, "utf-8"));
2553
+ // L2-73 fix: Gracefully handle malformed reference JSON
2554
+ let data;
2555
+ try {
2556
+ data = JSON.parse(fs.readFileSync(filePath, "utf-8"));
2557
+ }
2558
+ catch (parseErr) {
2559
+ return { content: [{ type: "text", text: `Warning: reference file for "${platform}" is malformed and was skipped. Error: ${parseErr instanceof Error ? parseErr.message : String(parseErr)}` }] };
2560
+ }
885
2561
  const s = section || "all";
886
2562
  if (s === "errors") {
887
2563
  const errors = data.errors || [];
@@ -915,6 +2591,152 @@ server.tool("platform_guide", "Get automation guide for a platform (selectors, U
915
2591
  // "all" — return full playbook
916
2592
  return { content: [{ type: "text", text: JSON.stringify(data, null, 2) }] };
917
2593
  });
2594
+ server.tool("playbook_preflight", "Quick feasibility check before automating a platform. Scans the page for known blockers (captchas, WebGL, iframes), checks against playbook errors, tests selector availability. Returns go/yellow/red.", {
2595
+ url: z.string().describe("URL to check, e.g. 'https://x.com'"),
2596
+ task: z.string().optional().describe("What you want to automate, e.g. 'post a tweet'"),
2597
+ tabId: z.string().optional().describe("Tab ID if page is already open"),
2598
+ }, async ({ url, task, tabId }) => {
2599
+ const issues = [];
2600
+ const warnings = [];
2601
+ const good = [];
2602
+ // 1. Extract domain and find matching playbook
2603
+ let domain;
2604
+ try {
2605
+ domain = new URL(url).hostname.replace(/^www\./, "");
2606
+ }
2607
+ catch {
2608
+ return { content: [{ type: "text", text: `❌ Invalid URL: ${url}` }] };
2609
+ }
2610
+ // Check references/ for curated knowledge
2611
+ const reference = _playbookStoreForContext.matchByDomain(domain);
2612
+ if (reference) {
2613
+ good.push(`Found reference: "${reference.id}" (${reference.successCount} successes, ${reference.failCount} failures)`);
2614
+ // Check known errors
2615
+ if (reference.errors && reference.errors.length > 0) {
2616
+ for (const err of reference.errors) {
2617
+ if (err.severity === "high") {
2618
+ issues.push(`🔴 ${err.error} → ${err.solution}`);
2619
+ }
2620
+ else {
2621
+ warnings.push(`🟡 ${err.error} → ${err.solution}`);
2622
+ }
2623
+ }
2624
+ }
2625
+ // Check selector availability
2626
+ if (reference.selectors) {
2627
+ const selectorCount = Object.values(reference.selectors).reduce((sum, group) => sum + Object.keys(group).length, 0);
2628
+ good.push(`${selectorCount} selectors documented in reference`);
2629
+ }
2630
+ if (reference.flows && Object.keys(reference.flows).length > 0) {
2631
+ good.push(`${Object.keys(reference.flows).length} flows documented`);
2632
+ }
2633
+ }
2634
+ else {
2635
+ warnings.push(`🟡 No playbook exists for ${domain} — first-time automation, expect trial and error`);
2636
+ }
2637
+ // Check playbooks/ for executable steps
2638
+ const execPlaybookPath = path.resolve(playbooksDir, `${reference?.id ?? domain.split(".")[0]}.json`);
2639
+ if (fs.existsSync(execPlaybookPath)) {
2640
+ try {
2641
+ const execPb = JSON.parse(fs.readFileSync(execPlaybookPath, "utf-8"));
2642
+ if (Array.isArray(execPb.steps) && execPb.steps.length > 0) {
2643
+ good.push(`Executable playbook found: ${execPb.steps.length} steps — use job_create(playbookId="${execPb.id}") for auto-run`);
2644
+ }
2645
+ }
2646
+ catch { /* skip */ }
2647
+ }
2648
+ else if (reference) {
2649
+ warnings.push(`🟡 Reference exists but no executable playbook — manual execution needed`);
2650
+ }
2651
+ // 2. Scan the page if we have CDP access
2652
+ try {
2653
+ const { CDP: cdp, port } = await ensureCDP();
2654
+ let targetId = tabId;
2655
+ if (!targetId) {
2656
+ const targets = await cdp.List({ port });
2657
+ const page = targets.find((t) => t.type === "page" && t.url?.includes(domain));
2658
+ targetId = page?.id;
2659
+ }
2660
+ if (targetId) {
2661
+ const client = await cdp({ port, target: targetId });
2662
+ // Check for common blockers
2663
+ const checks = await client.Runtime.evaluate({
2664
+ expression: `(() => {
2665
+ const results = {};
2666
+ // Captcha detection
2667
+ results.hasCaptcha = !!(
2668
+ document.querySelector('[class*="captcha"]') ||
2669
+ document.querySelector('[class*="recaptcha"]') ||
2670
+ document.querySelector('[data-sitekey]') ||
2671
+ document.querySelector('iframe[src*="captcha"]') ||
2672
+ document.querySelector('iframe[src*="recaptcha"]')
2673
+ );
2674
+ // WebGL canvas (can't click via DOM)
2675
+ results.hasWebGL = !!(document.querySelector('canvas[data-engine]') || document.querySelector('canvas.webgl'));
2676
+ // Shadow DOM
2677
+ const allEls = document.querySelectorAll('*');
2678
+ let shadowCount = 0;
2679
+ for (const el of allEls) { if (el.shadowRoot) shadowCount++; }
2680
+ results.shadowDomCount = shadowCount;
2681
+ // Iframes
2682
+ results.iframeCount = document.querySelectorAll('iframe').length;
2683
+ // React/SPA detection
2684
+ results.isReact = !!(window.__REACT_DEVTOOLS_GLOBAL_HOOK__ || document.querySelector('[data-reactroot]'));
2685
+ results.isNextJs = !!document.querySelector('#__next');
2686
+ results.pageTitle = document.title;
2687
+ results.url = location.href;
2688
+ return results;
2689
+ })()`,
2690
+ returnByValue: true,
2691
+ });
2692
+ await client.close();
2693
+ const r = checks.result.value;
2694
+ if (r) {
2695
+ good.push(`Page loaded: "${r.pageTitle}"`);
2696
+ if (r.hasCaptcha)
2697
+ issues.push(`🔴 CAPTCHA detected — cannot be automated, needs manual solve`);
2698
+ if (r.hasWebGL)
2699
+ warnings.push(`🟡 WebGL canvas detected — DOM clicks won't work, use Input.dispatchMouseEvent or coordinates`);
2700
+ if (r.shadowDomCount > 0)
2701
+ warnings.push(`🟡 ${r.shadowDomCount} Shadow DOM elements — standard selectors may not reach them`);
2702
+ if (r.iframeCount > 0)
2703
+ warnings.push(`🟡 ${r.iframeCount} iframes — may need to switch context`);
2704
+ if (r.isReact)
2705
+ warnings.push(`🟡 React app — el.value assignment may not work, use browser_fill_form instead`);
2706
+ }
2707
+ }
2708
+ else {
2709
+ warnings.push(`🟡 Page not open in Chrome — open ${url} first for deeper scan`);
2710
+ }
2711
+ }
2712
+ catch {
2713
+ warnings.push(`🟡 Chrome CDP not available — can't scan page. Launch Chrome with --remote-debugging-port=9222`);
2714
+ }
2715
+ // 3. Check memory for past errors on this domain
2716
+ const memErrors = memory.readErrors();
2717
+ const domainErrors = memErrors.filter(e => {
2718
+ const paramStr = JSON.stringify(e.params ?? {});
2719
+ return paramStr.includes(domain);
2720
+ });
2721
+ if (domainErrors.length > 0) {
2722
+ warnings.push(`🟡 ${domainErrors.length} past error(s) recorded for ${domain} in memory`);
2723
+ }
2724
+ // 4. Build verdict
2725
+ const rating = issues.length > 0 ? "🔴 RED" : warnings.length > 2 ? "🟡 YELLOW" : "🟢 GREEN";
2726
+ const lines = [
2727
+ `# Preflight: ${domain}`,
2728
+ `Rating: ${rating}`,
2729
+ "",
2730
+ ...good.map(g => `✅ ${g}`),
2731
+ ...(issues.length > 0 ? ["", "## Blockers", ...issues] : []),
2732
+ ...(warnings.length > 0 ? ["", "## Warnings", ...warnings] : []),
2733
+ "",
2734
+ issues.length > 0
2735
+ ? "⛔ Some tasks may not be fully automatable. Review blockers above."
2736
+ : "✅ Looks feasible. Proceed with automation.",
2737
+ ];
2738
+ return { content: [{ type: "text", text: lines.join("\n") }] };
2739
+ });
918
2740
  server.tool("export_playbook", "Generate a playbook JSON from your session. Extracts URLs, selectors, errors+solutions from memory. Share the output with ScreenHand to help others automate this platform.", {
919
2741
  platform: z.string().describe("Platform name, e.g. 'linkedin', 'twitter'"),
920
2742
  domain: z.string().describe("Domain to filter actions by, e.g. 'linkedin.com'"),
@@ -968,7 +2790,7 @@ server.tool("export_playbook", "Generate a playbook JSON from your session. Extr
968
2790
  s.tags.some(t => t.toLowerCase().includes(platform.toLowerCase())));
969
2791
  // 2. Scan current page for selectors if tab is available
970
2792
  let pageSelectors = {};
971
- if (tabId || true) {
2793
+ if (tabId) {
972
2794
  try {
973
2795
  const { client } = await getCDPClient(tabId);
974
2796
  await client.Runtime.enable();
@@ -1008,13 +2830,24 @@ server.tool("export_playbook", "Generate a playbook JSON from your session. Extr
1008
2830
  description: description || `Automation playbook for ${platform}`,
1009
2831
  urls: Object.fromEntries(Array.from(urlSet).sort().map((u, i) => {
1010
2832
  const urlObj = new URL(u);
2833
+ // L2-69 fix: Redact sensitive query params before exporting
2834
+ const sensitiveParams = new Set(["code", "token", "access_token", "refresh_token", "id_token",
2835
+ "secret", "password", "key", "api_key", "apikey", "auth",
2836
+ "session", "session_id", "sessionid", "state", "nonce"]);
2837
+ for (const paramName of urlObj.searchParams.keys()) {
2838
+ if (sensitiveParams.has(paramName.toLowerCase())) {
2839
+ urlObj.searchParams.set(paramName, "[REDACTED]");
2840
+ }
2841
+ }
2842
+ const safeUrl = urlObj.toString();
1011
2843
  const pathKey = urlObj.pathname.replace(/^\//, "").replace(/\//g, "_").replace(/[^a-zA-Z0-9_]/g, "") || "home";
1012
- return [pathKey, u];
2844
+ return [pathKey, safeUrl];
1013
2845
  })),
1014
2846
  flows: {
1015
2847
  discovered: {
2848
+ // S75 Option C: Redact PII from exported strategy steps
1016
2849
  steps: domainStrategies.length > 0
1017
- ? domainStrategies[0].steps.map((s) => `${s.tool}(${JSON.stringify(s.params)})`)
2850
+ ? domainStrategies[0].steps.map((s) => redactPII(`${s.tool}(${JSON.stringify(s.params)})`))
1018
2851
  : ["No strategies recorded yet. Use the platform, then call export_playbook again."],
1019
2852
  selectors: pageSelectors,
1020
2853
  },
@@ -1037,16 +2870,30 @@ server.tool("export_playbook", "Generate a playbook JSON from your session. Extr
1037
2870
  strategies_count: domainStrategies.length,
1038
2871
  },
1039
2872
  };
1040
- // 4. Save to playbooks dir
1041
- const outPath = path.resolve(playbooksDir, `${platform.toLowerCase()}.json`);
2873
+ // 4. Save to references dir (curated knowledge, not executable steps)
2874
+ const safePlatformName = platform.toLowerCase().replace(/[^a-z0-9_\-]/g, "_").slice(0, 100);
2875
+ const outPath = path.resolve(referencesDir, `${safePlatformName}.json`);
2876
+ // Guard: refuse to write outside references dir
2877
+ if (!outPath.startsWith(path.resolve(referencesDir))) {
2878
+ return { content: [{ type: "text", text: `Error: invalid platform name "${platform}" — path traversal detected` }] };
2879
+ }
1042
2880
  const exists = fs.existsSync(outPath);
1043
- if (!fs.existsSync(playbooksDir))
1044
- fs.mkdirSync(playbooksDir, { recursive: true });
2881
+ if (!fs.existsSync(referencesDir))
2882
+ fs.mkdirSync(referencesDir, { recursive: true });
1045
2883
  fs.writeFileSync(outPath, JSON.stringify(playbook, null, 2));
2884
+ // Track playbook export for teaching ability rating factor
2885
+ const expBundleId = worldModel.getState().focusedApp?.bundleId;
2886
+ if (expBundleId) {
2887
+ const expMapData = appMap.getLoaded(expBundleId);
2888
+ if (expMapData) {
2889
+ expMapData.playbooksExported = (expMapData.playbooksExported ?? 0) + 1;
2890
+ appMap.save(expMapData, true);
2891
+ }
2892
+ }
1046
2893
  return {
1047
2894
  content: [{
1048
2895
  type: "text",
1049
- text: `${exists ? "Updated" : "Created"} playbook: playbooks/${platform.toLowerCase()}.json\n\n` +
2896
+ text: `${exists ? "Updated" : "Created"} reference: references/${platform.toLowerCase()}.json\n\n` +
1050
2897
  `URLs found: ${urlSet.size}\n` +
1051
2898
  `Selectors found: ${Object.keys(pageSelectors).length}\n` +
1052
2899
  `Errors documented: ${domainErrors.length}\n` +
@@ -1057,12 +2904,181 @@ server.tool("export_playbook", "Generate a playbook JSON from your session. Extr
1057
2904
  };
1058
2905
  });
1059
2906
  // ═══════════════════════════════════════════════
2907
+ // PLAYBOOK RECORD — macro recorder for MCP tool calls
2908
+ // ═══════════════════════════════════════════════
2909
+ server.tool("playbook_record", "Macro recorder: start recording, do the flow, stop to save as executable playbook. Captures every click/type/navigate tool call as a PlaybookStep.", {
2910
+ action: z.enum(["start", "stop", "cancel", "status"]).describe("start/stop/cancel/status"),
2911
+ platform: z.string().optional().describe("Platform name (required for start)"),
2912
+ name: z.string().optional().describe("Playbook name (required for stop)"),
2913
+ description: z.string().optional().describe("Playbook description (for stop)"),
2914
+ cdpPort: z.number().optional().describe("CDP port if needed for browser_js steps (e.g. 9333 for Codex)"),
2915
+ }, async ({ action, platform, name, description, cdpPort }) => {
2916
+ switch (action) {
2917
+ case "start": {
2918
+ if (!platform)
2919
+ return { content: [{ type: "text", text: "Error: platform is required for start" }] };
2920
+ if (mcpRecorder.isRecording)
2921
+ return { content: [{ type: "text", text: "Already recording. Call stop or cancel first." }] };
2922
+ mcpRecorder.start(platform, cdpPort ?? undefined);
2923
+ return { content: [{ type: "text", text: `Recording started for "${platform}". All subsequent tool calls will be captured.\nCall playbook_record(action="stop", name="...") when done.` }] };
2924
+ }
2925
+ case "stop": {
2926
+ if (!mcpRecorder.isRecording)
2927
+ return { content: [{ type: "text", text: "No active recording." }] };
2928
+ if (!name)
2929
+ return { content: [{ type: "text", text: "Error: name is required for stop" }] };
2930
+ const playbook = mcpRecorder.stop(name, description ?? name);
2931
+ // Track playbook export for teaching ability rating factor
2932
+ const pbBundleId = worldModel.getState().focusedApp?.bundleId;
2933
+ if (pbBundleId) {
2934
+ const pbMapData = appMap.getLoaded(pbBundleId);
2935
+ if (pbMapData) {
2936
+ pbMapData.playbooksExported = (pbMapData.playbooksExported ?? 0) + 1;
2937
+ appMap.save(pbMapData, true);
2938
+ }
2939
+ }
2940
+ const stepList = playbook.steps.map((s, i) => ` ${i + 1}. [${s.action}] ${s.description ?? ""}`).join("\n");
2941
+ return { content: [{ type: "text", text: `Playbook saved: playbooks/${playbook.id}.json (${playbook.steps.length} steps)\n\n${stepList}` }] };
2942
+ }
2943
+ case "cancel": {
2944
+ mcpRecorder.cancel();
2945
+ return { content: [{ type: "text", text: "Recording cancelled." }] };
2946
+ }
2947
+ case "status": {
2948
+ if (!mcpRecorder.isRecording)
2949
+ return { content: [{ type: "text", text: "Not recording." }] };
2950
+ const steps = mcpRecorder.getSteps().map((s, i) => ` ${i + 1}. [${s.action}] ${s.description ?? ""}`).join("\n");
2951
+ return { content: [{ type: "text", text: `Recording active: ${mcpRecorder.stepCount} steps captured\n${steps}` }] };
2952
+ }
2953
+ }
2954
+ });
2955
+ // ═══════════════════════════════════════════════
2956
+ // PLATFORM EXPLORE — autonomous app exploration
2957
+ // ═══════════════════════════════════════════════
2958
+ server.tool("platform_explore", "Autonomously explore an app or website. Maps all interactive elements, tries each one, records working selectors and broken paths. Outputs a reference JSON.", {
2959
+ platform: z.string().describe("Platform name for the output file, e.g. 'figma', 'canva'"),
2960
+ url: z.string().optional().describe("URL for web app. Requires Chrome with --remote-debugging-port."),
2961
+ bundleId: z.string().optional().describe("macOS bundle ID for native app, e.g. 'com.figma.Desktop'"),
2962
+ maxElements: z.number().optional().describe("Max elements to test (default: 30)"),
2963
+ tabId: z.string().optional().describe("Existing Chrome tab ID if page is already open"),
2964
+ }, async ({ platform, url, bundleId, maxElements, tabId }) => {
2965
+ const max = maxElements ?? 30;
2966
+ if (url || tabId) {
2967
+ // Web exploration via CDP
2968
+ const { CDP: cdp, port } = await ensureCDP();
2969
+ let targetId = tabId;
2970
+ if (!targetId) {
2971
+ if (url) {
2972
+ // Navigate to URL in a new tab
2973
+ const targets = await cdp.List({ port });
2974
+ const page = targets.find((t) => t.type === "page");
2975
+ if (!page)
2976
+ throw new Error("No Chrome tabs open");
2977
+ targetId = page.id;
2978
+ const client = await cdp({ port, target: targetId });
2979
+ await client.Page.enable();
2980
+ await client.Page.navigate({ url });
2981
+ await new Promise(r => setTimeout(r, 3000));
2982
+ await client.close();
2983
+ }
2984
+ }
2985
+ if (!targetId)
2986
+ throw new Error("No tab available");
2987
+ const client = await cdp({ port, target: targetId });
2988
+ await client.Runtime.enable();
2989
+ const evaluate = async (expr) => {
2990
+ return client.Runtime.evaluate({ expression: expr, returnByValue: true, awaitPromise: true });
2991
+ };
2992
+ // Discover elements
2993
+ const elements = await discoverWebElements(evaluate, max);
2994
+ // Test each element
2995
+ const tested = [];
2996
+ for (const el of elements) {
2997
+ const result = await testWebElement(evaluate, el);
2998
+ tested.push(result);
2999
+ await new Promise(r => setTimeout(r, 300 + Math.random() * 500));
3000
+ }
3001
+ await client.close();
3002
+ // Compile and save
3003
+ const result = compileReference(platform, "web", tested, url);
3004
+ const filePath = saveExploreResult(referencesDir, result);
3005
+ return { content: [{ type: "text", text: `Exploration complete: ${filePath}\n\nElements found: ${elements.length}\nTested: ${result.testedElements}\nWorking selectors: ${result.workingSelectors}\nErrors: ${result.errors.length}\n\nKey discoveries:\n${result.keyDiscoveries.map(d => ` - ${d}`).join("\n")}` }] };
3006
+ }
3007
+ else if (bundleId) {
3008
+ // Native app exploration via bridge
3009
+ await ensureBridge();
3010
+ const apps = await bridge.call("app.list");
3011
+ const app = apps.find(a => a.bundleId === bundleId);
3012
+ if (!app) {
3013
+ await bridge.call("app.launch", { bundleId });
3014
+ await new Promise(r => setTimeout(r, 3000));
3015
+ }
3016
+ const appList = await bridge.call("app.list");
3017
+ const target = appList.find(a => a.bundleId === bundleId);
3018
+ if (!target)
3019
+ throw new Error(`App ${bundleId} not running`);
3020
+ const elements = await discoverNativeElements(bridge, target.pid, max);
3021
+ // For native apps, we record discovery but don't auto-click (too risky)
3022
+ const result = compileReference(platform, "native", elements.map(el => ({
3023
+ ...el, clickWorked: true, result: "discovered_not_tested",
3024
+ })), undefined, bundleId);
3025
+ const filePath = saveExploreResult(referencesDir, result);
3026
+ return { content: [{ type: "text", text: `Native app exploration complete: ${filePath}\n\nElements discovered: ${elements.length}\n(Native elements discovered but not auto-clicked for safety. Use playbook_record to test interactively.)` }] };
3027
+ }
3028
+ else {
3029
+ return { content: [{ type: "text", text: "Error: Provide either url (for web apps) or bundleId (for native apps)." }] };
3030
+ }
3031
+ });
3032
+ // ═══════════════════════════════════════════════
3033
+ // PLATFORM LEARN — scrape docs/help/shortcuts
3034
+ // ═══════════════════════════════════════════════
3035
+ server.tool("platform_learn", "Scrape official docs, help center, keyboard shortcuts for a platform. Crawls pages via Chrome and extracts structured data into a reference JSON.", {
3036
+ platform: z.string().describe("Platform name, e.g. 'figma', 'notion', 'slack'"),
3037
+ url: z.string().optional().describe("Root URL to start from. If omitted, guesses from platform name."),
3038
+ maxPages: z.number().optional().describe("Max pages to crawl (default: 5)"),
3039
+ }, async ({ platform, url, maxPages }) => {
3040
+ const max = maxPages ?? 5;
3041
+ const urls = buildDocUrls(platform, url);
3042
+ const { CDP: cdp, port } = await ensureCDP();
3043
+ const targets = await cdp.List({ port });
3044
+ const page = targets.find((t) => t.type === "page");
3045
+ if (!page)
3046
+ throw new Error("No Chrome tabs open. Open Chrome first.");
3047
+ const client = await cdp({ port, target: page.id });
3048
+ await client.Runtime.enable();
3049
+ await client.Page.enable();
3050
+ const crawled = [];
3051
+ let successCount = 0;
3052
+ for (const docUrl of urls) {
3053
+ if (successCount >= max)
3054
+ break;
3055
+ try {
3056
+ const result = await crawlPage(client, docUrl, 8000);
3057
+ if (result.success && result.content && result.content.text.length > 100) {
3058
+ crawled.push({ url: docUrl, content: result.content, ...(result.shortcuts ? { shortcuts: result.shortcuts } : {}), ...(result.selectors ? { selectors: result.selectors } : {}) });
3059
+ successCount++;
3060
+ }
3061
+ }
3062
+ catch {
3063
+ // Skip failed URLs silently
3064
+ }
3065
+ await new Promise(r => setTimeout(r, 1000 + Math.random() * 1000));
3066
+ }
3067
+ await client.close();
3068
+ if (crawled.length === 0) {
3069
+ return { content: [{ type: "text", text: `No documentation pages found for "${platform}". Try providing a specific URL.` }] };
3070
+ }
3071
+ const result = compileLearnResult(platform, crawled);
3072
+ const filePath = saveLearnResult(referencesDir, result);
3073
+ return { content: [{ type: "text", text: `Learning complete: ${filePath}\n\nPages crawled: ${crawled.length}\nShortcuts found: ${Object.keys(result.shortcuts).length}\nFeatures found: ${result.features.length}\nSelectors found: ${Object.values(result.selectors).reduce((n, g) => n + Object.keys(g).length, 0)}\nAPI endpoints: ${result.apiEndpoints.length}\nKnown limitations: ${result.knownLimitations.length}` }] };
3074
+ });
3075
+ // ═══════════════════════════════════════════════
1060
3076
  // APPLESCRIPT — control scriptable apps directly
1061
3077
  // ═══════════════════════════════════════════════
1062
3078
  server.tool("applescript", "Run an AppleScript command. For controlling Finder, Safari, Mail, Notes, etc. (macOS only). WARNING: Executes arbitrary AppleScript — can perform destructive actions (delete files, send emails). All executions are audit-logged.", {
1063
3079
  script: z.string().describe("AppleScript code to execute"),
1064
3080
  }, async ({ script }) => {
1065
- auditLog("applescript", { script: script.slice(0, 500) });
3081
+ auditLog("applescript", { script });
1066
3082
  if (process.platform === "win32") {
1067
3083
  return { content: [{ type: "text", text: "AppleScript is not supported on Windows. Use ui_tree, ui_press, and other accessibility tools instead." }] };
1068
3084
  }
@@ -1190,6 +3206,15 @@ originalTool("session_claim", "Claim exclusive control of an app window. Prevent
1190
3206
  app: z.string().describe("Bundle ID of the app (e.g., 'com.google.Chrome')"),
1191
3207
  windowId: z.number().describe("Window ID to claim (get from 'windows' tool)"),
1192
3208
  }, async ({ clientId, clientType, app, windowId }) => {
3209
+ // Validate window ID exists
3210
+ try {
3211
+ await ensureBridge();
3212
+ const wins = await bridge.call("window.list", {});
3213
+ if (wins && !wins.some((w) => w.windowId === windowId)) {
3214
+ return { content: [{ type: "text", text: `Window ${windowId} does not exist. Use the windows() tool to get valid window IDs.` }] };
3215
+ }
3216
+ }
3217
+ catch { /* best-effort validation — proceed if bridge unavailable */ }
1193
3218
  // Use filesystem-backed lease manager directly (shared with daemon)
1194
3219
  const lease = leaseManager.claim({ id: clientId, type: clientType, startedAt: new Date().toISOString() }, app, windowId);
1195
3220
  if (!lease) {
@@ -1211,6 +3236,8 @@ originalTool("session_heartbeat", "Keep your session lease alive. Call every 60
1211
3236
  originalTool("session_release", "Release your session lease so other clients can use the window.", {
1212
3237
  sessionId: z.string().describe("Session ID to release"),
1213
3238
  }, async ({ sessionId }) => {
3239
+ // Flush playbook learnings before releasing session
3240
+ contextTracker.flush();
1214
3241
  // Use filesystem-backed lease manager directly (shared with daemon)
1215
3242
  const released = leaseManager.release(sessionId);
1216
3243
  return { content: [{ type: "text", text: released ? `Session ${sessionId} released.` : `Session ${sessionId} not found.` }] };
@@ -1397,6 +3424,18 @@ originalTool("recovery_queue_add", "Add a manual recovery instruction for a stal
1397
3424
  type: z.enum(["nudge", "restart", "escalate", "custom"]).describe("Recovery type"),
1398
3425
  instruction: z.string().describe("What to do (e.g., 'Click the login button', 'Restart Chrome')"),
1399
3426
  }, async ({ sessionId, type, instruction }) => {
3427
+ // Validate that the session ID looks reasonable (basic format check)
3428
+ // Accept both lease-style (lease_*) and generic session IDs
3429
+ if (!sessionId || sessionId.length < 3 || sessionId.length > 200) {
3430
+ return { content: [{ type: "text", text: `Error: Invalid session ID "${sessionId}". Must be 3-200 characters.` }] };
3431
+ }
3432
+ // Validate session is active — reject orphaned recovery instructions
3433
+ const activeSessions = leaseManager.getActive();
3434
+ const isActive = activeSessions.some(s => s.sessionId === sessionId);
3435
+ if (!isActive) {
3436
+ return { content: [{ type: "text", text: `Session "${sessionId}" is not active. Use supervisor_status to find active sessions.` }] };
3437
+ }
3438
+ const warning = "";
1400
3439
  const recovery = {
1401
3440
  id: "recv_" + Date.now().toString(36) + "_" + Math.random().toString(36).slice(2, 8),
1402
3441
  sessionId,
@@ -1409,9 +3448,19 @@ originalTool("recovery_queue_add", "Add a manual recovery instruction for a stal
1409
3448
  };
1410
3449
  // Write to daemon's filesystem state so the daemon picks it up
1411
3450
  const recoveries = readDaemonRecoveries();
1412
- recoveries.push(recovery);
1413
- writeDaemonRecoveries(recoveries);
1414
- return { content: [{ type: "text", text: `Recovery queued: ${recovery.id} (type=${type})` }] };
3451
+ // Prune old completed/failed entries (keep last 50, drop entries older than 24h)
3452
+ const MAX_QUEUE_SIZE = 50;
3453
+ const MAX_AGE_MS = 24 * 60 * 60 * 1000;
3454
+ const cutoff = Date.now() - MAX_AGE_MS;
3455
+ const pruned = recoveries.filter((r) => {
3456
+ if (r.status === "pending")
3457
+ return true; // always keep pending
3458
+ const age = new Date(r.createdAt).getTime();
3459
+ return age > cutoff;
3460
+ }).slice(-MAX_QUEUE_SIZE);
3461
+ pruned.push(recovery);
3462
+ writeDaemonRecoveries(pruned);
3463
+ return { content: [{ type: "text", text: `Recovery queued: ${recovery.id} (type=${type})${warning}` }] };
1415
3464
  });
1416
3465
  originalTool("recovery_queue_list", "List recovery actions, optionally filtered by status.", {
1417
3466
  status: z.enum(["pending", "attempted", "succeeded", "failed"]).optional().describe("Filter by status"),
@@ -1590,7 +3639,7 @@ originalTool("supervisor_uninstall", "Uninstall the supervisor system service. S
1590
3639
  // EXECUTION CONTRACT — canonical fallback chain
1591
3640
  // ═══════════════════════════════════════════════
1592
3641
  import { METHOD_CAPABILITIES, DEFAULT_RETRY_POLICY, planExecution, executeWithFallback, } from "./src/runtime/execution-contract.js";
1593
- originalTool("execution_plan", "Show the execution plan for an action type. Returns the ordered fallback chain based on available infrastructure.", {
3642
+ server.tool("execution_plan", "Show the execution plan for an action type. Returns the ordered fallback chain based on available infrastructure.", {
1594
3643
  action: z.enum(["click", "type", "read", "locate", "select", "scroll"]).describe("Action type"),
1595
3644
  }, async ({ action }) => {
1596
3645
  const plan = planExecution(action, { hasBridge: true, hasCDP: cdpPort !== null });
@@ -1598,7 +3647,18 @@ originalTool("execution_plan", "Show the execution plan for an action type. Retu
1598
3647
  const cap = METHOD_CAPABILITIES[method];
1599
3648
  return `${i + 1}. ${method} (~${cap.avgLatencyMs}ms)${i === 0 ? " ← primary" : ""}`;
1600
3649
  });
1601
- lines.push("", `Retry policy: ${DEFAULT_RETRY_POLICY.maxRetriesPerMethod}/method, ${DEFAULT_RETRY_POLICY.maxTotalRetries} total, escalate after ${DEFAULT_RETRY_POLICY.escalateAfter}`);
3650
+ const policy = getAdaptedRetryPolicy();
3651
+ lines.push("", `Retry policy: ${policy.maxRetriesPerMethod}/method, ${policy.maxTotalRetries} total, escalate after ${policy.escalateAfter}, delay ${policy.delayBetweenRetriesMs}ms`);
3652
+ const appBundleId = worldModel.getState().focusedApp?.bundleId;
3653
+ if (appBundleId) {
3654
+ const budget = learningEngine.getAdaptiveBudget(appBundleId);
3655
+ lines.push(`Adaptive budgets: locate=${budget.locateMs}ms, act=${budget.actMs}ms, verify=${budget.verifyMs}ms`);
3656
+ }
3657
+ // Include app-specific hints from reference files and context tracker
3658
+ const hints = contextTracker.getHints(action, {});
3659
+ if (hints.length > 0) {
3660
+ lines.push("", "App-specific context:", ...hints.slice(0, 5));
3661
+ }
1602
3662
  return { content: [{ type: "text", text: `Execution plan for "${action}":\n${lines.join("\n")}` }] };
1603
3663
  });
1604
3664
  // ── Shared helpers for resilient action tools ──
@@ -1623,6 +3683,21 @@ async function resolvePid(bundleId) {
1623
3683
  function infra() {
1624
3684
  return { hasBridge: true, hasCDP: cdpPort !== null };
1625
3685
  }
3686
+ /**
3687
+ * Get a retry policy adapted by the learning engine's adaptive budgets.
3688
+ * If the learning engine shows the current app responds quickly, reduce retry delays.
3689
+ */
3690
+ function getAdaptedRetryPolicy() {
3691
+ if (!currentAdaptiveBudget)
3692
+ return DEFAULT_RETRY_POLICY;
3693
+ // Use the max of locate+act as a guide for retry delay — faster apps need shorter delays
3694
+ const typicalMs = Math.max(currentAdaptiveBudget.locateMs, currentAdaptiveBudget.actMs);
3695
+ // Retry delay = max(100ms, typical * 1.5), capped at the default
3696
+ const adaptedDelay = Math.min(DEFAULT_RETRY_POLICY.delayBetweenRetriesMs, Math.max(100, Math.ceil(typicalMs * 1.5)));
3697
+ if (adaptedDelay === DEFAULT_RETRY_POLICY.delayBetweenRetriesMs)
3698
+ return DEFAULT_RETRY_POLICY;
3699
+ return { ...DEFAULT_RETRY_POLICY, delayBetweenRetriesMs: adaptedDelay };
3700
+ }
1626
3701
  function formatResult(action, target, result) {
1627
3702
  if (result.ok) {
1628
3703
  const fallbackNote = result.fallbackFrom ? ` (fell back from ${result.fallbackFrom})` : "";
@@ -1631,7 +3706,7 @@ function formatResult(action, target, result) {
1631
3706
  return { content: [{ type: "text", text: `Failed to ${action} "${target}" — all methods exhausted. Last error: ${result.error}` }] };
1632
3707
  }
1633
3708
  // ── click_with_fallback ──
1634
- originalTool("click_with_fallback", "Click a target by text using the canonical fallback chain: AX → CDP → OCR. Automatically retries and falls through methods.", {
3709
+ server.tool("click_with_fallback", "Click a target by text using the canonical fallback chain: AX → CDP → OCR. Automatically retries and falls through methods.", {
1635
3710
  target: z.string().describe("Text, title, or identifier of the element to click"),
1636
3711
  bundleId: z.string().optional().describe("App bundle ID (for AX path)"),
1637
3712
  }, async ({ target, bundleId }) => {
@@ -1639,17 +3714,27 @@ originalTool("click_with_fallback", "Click a target by text using the canonical
1639
3714
  const plan = planExecution("click", infra())
1640
3715
  .filter((m) => m !== "coordinates");
1641
3716
  const targetPid = await resolvePid(bundleId);
1642
- const result = await executeWithFallback("click", plan, DEFAULT_RETRY_POLICY, async (method, attempt) => {
3717
+ const result = await executeWithFallback("click", plan, getAdaptedRetryPolicy(), async (method, attempt) => {
1643
3718
  const start = Date.now();
1644
3719
  try {
1645
3720
  switch (method) {
1646
3721
  case "ax": {
1647
- // Find element by title, then perform AXPress action
1648
- const found = await bridge.call("ax.findElement", {
1649
- pid: targetPid,
1650
- title: target,
1651
- exact: false,
1652
- });
3722
+ // L2-65 fix: Try exact match first to avoid wrong-window match on minimized windows
3723
+ let found;
3724
+ try {
3725
+ found = await bridge.call("ax.findElement", {
3726
+ pid: targetPid,
3727
+ title: target,
3728
+ exact: true,
3729
+ });
3730
+ }
3731
+ catch {
3732
+ found = await bridge.call("ax.findElement", {
3733
+ pid: targetPid,
3734
+ title: target,
3735
+ exact: false,
3736
+ });
3737
+ }
1653
3738
  await bridge.call("ax.performAction", {
1654
3739
  pid: targetPid,
1655
3740
  elementPath: found.elementPath,
@@ -1710,7 +3795,7 @@ originalTool("click_with_fallback", "Click a target by text using the canonical
1710
3795
  return formatResult("Clicked", target, result);
1711
3796
  });
1712
3797
  // ── type_with_fallback ──
1713
- originalTool("type_with_fallback", "Type text into a target field using the canonical fallback chain: AX → CDP → coordinates. Finds the field by label/placeholder, focuses it, then types.", {
3798
+ server.tool("type_with_fallback", "Type text into a target field using the canonical fallback chain: AX → CDP → coordinates. Finds the field by label/placeholder, focuses it, then types.", {
1714
3799
  target: z.string().describe("Label, placeholder, or title of the field to type into"),
1715
3800
  text: z.string().describe("Text to type"),
1716
3801
  bundleId: z.string().optional().describe("App bundle ID"),
@@ -1719,16 +3804,81 @@ originalTool("type_with_fallback", "Type text into a target field using the cano
1719
3804
  await ensureBridge();
1720
3805
  const plan = planExecution("type", infra());
1721
3806
  const targetPid = await resolvePid(bundleId);
1722
- const result = await executeWithFallback("type", plan, DEFAULT_RETRY_POLICY, async (method, attempt) => {
3807
+ const result = await executeWithFallback("type", plan, getAdaptedRetryPolicy(), async (method, attempt) => {
1723
3808
  const start = Date.now();
1724
3809
  try {
1725
3810
  switch (method) {
1726
3811
  case "ax": {
1727
- const found = await bridge.call("ax.findElement", {
1728
- pid: targetPid,
1729
- title: target,
1730
- exact: false,
1731
- });
3812
+ // L2-65 fix: Try exact match first to avoid wrong-window match on minimized windows
3813
+ let found;
3814
+ try {
3815
+ found = await bridge.call("ax.findElement", {
3816
+ pid: targetPid,
3817
+ title: target,
3818
+ exact: true,
3819
+ });
3820
+ }
3821
+ catch {
3822
+ found = await bridge.call("ax.findElement", {
3823
+ pid: targetPid,
3824
+ title: target,
3825
+ exact: false,
3826
+ });
3827
+ }
3828
+ // L2-62+L2-68 fix: If matched element is a window (short elementPath), find
3829
+ // the child AXTextArea/AXTextField SCOPED to the target window.
3830
+ const isLikelyWindow = found.elementPath.length <= 1;
3831
+ if (isLikelyWindow) {
3832
+ // Try window-scoped search first via getElementTree
3833
+ let scopedFound = false;
3834
+ try {
3835
+ const wins = await bridge.call("app.windows");
3836
+ const matchWin = wins.find((w) => w.title === target) ?? wins.find((w) => w.title?.includes(target));
3837
+ if (matchWin?.windowId) {
3838
+ const windowTree = await bridge.call("ax.getElementTree", {
3839
+ pid: targetPid,
3840
+ windowId: matchWin.windowId,
3841
+ maxDepth: 8,
3842
+ });
3843
+ const findInTree = (node, path) => {
3844
+ if (node?.role && (node.role === "AXTextArea" || node.role === "AXTextField")) {
3845
+ return path;
3846
+ }
3847
+ if (node?.children && Array.isArray(node.children)) {
3848
+ for (let i = 0; i < node.children.length; i++) {
3849
+ const r = findInTree(node.children[i], [...path, i]);
3850
+ if (r)
3851
+ return r;
3852
+ }
3853
+ }
3854
+ return null;
3855
+ };
3856
+ const textPath = findInTree(windowTree, found.elementPath);
3857
+ if (textPath) {
3858
+ found = found.bounds
3859
+ ? { elementPath: textPath, bounds: found.bounds }
3860
+ : { elementPath: textPath };
3861
+ scopedFound = true;
3862
+ }
3863
+ }
3864
+ }
3865
+ catch { /* fall through to unscoped search */ }
3866
+ // Fallback: unscoped search (original L2-62 behavior)
3867
+ if (!scopedFound) {
3868
+ for (const role of ["AXTextArea", "AXTextField"]) {
3869
+ try {
3870
+ const textEl = await bridge.call("ax.findElement", {
3871
+ pid: targetPid,
3872
+ role,
3873
+ maxDepth: 10,
3874
+ });
3875
+ found = textEl;
3876
+ break;
3877
+ }
3878
+ catch { /* try next role */ }
3879
+ }
3880
+ }
3881
+ }
1732
3882
  if (clearFirst) {
1733
3883
  await bridge.call("ax.setElementValue", { pid: targetPid, elementPath: found.elementPath, value: "" });
1734
3884
  }
@@ -1758,8 +3908,9 @@ originalTool("type_with_fallback", "Type text into a target field using the cano
1758
3908
  if (!evalResult.result?.value)
1759
3909
  throw new Error("Field not found via CDP");
1760
3910
  if (clearFirst) {
1761
- await Input.dispatchKeyEvent({ type: "keyDown", key: "a", code: "KeyA", modifiers: 2 });
1762
- await Input.dispatchKeyEvent({ type: "keyUp", key: "a", code: "KeyA", modifiers: 2 });
3911
+ const selectAllMod = process.platform === "darwin" ? 4 : 2; // Cmd on macOS, Ctrl on Windows/Linux
3912
+ await Input.dispatchKeyEvent({ type: "keyDown", key: "a", code: "KeyA", modifiers: selectAllMod });
3913
+ await Input.dispatchKeyEvent({ type: "keyUp", key: "a", code: "KeyA", modifiers: selectAllMod });
1763
3914
  }
1764
3915
  for (const char of text) {
1765
3916
  await Input.dispatchKeyEvent({ type: "keyDown", key: char, text: char });
@@ -1781,28 +3932,103 @@ originalTool("type_with_fallback", "Type text into a target field using the cano
1781
3932
  return formatResult("Typed into", target, result);
1782
3933
  });
1783
3934
  // ── read_with_fallback ──
1784
- originalTool("read_with_fallback", "Read text content from the screen or a specific element using the canonical fallback chain: AX → CDP → OCR. Returns the text found.", {
3935
+ server.tool("read_with_fallback", "Read text content from the screen or a specific element using the canonical fallback chain: AX → CDP → OCR. Returns the text found.", {
1785
3936
  target: z.string().optional().describe("Element label/title to read from (omit for full-screen OCR)"),
1786
3937
  bundleId: z.string().optional().describe("App bundle ID"),
1787
3938
  }, async ({ target, bundleId }) => {
1788
3939
  await ensureBridge();
1789
3940
  const plan = planExecution("read", infra());
1790
3941
  const targetPid = await resolvePid(bundleId);
1791
- const result = await executeWithFallback("read", plan, DEFAULT_RETRY_POLICY, async (method, attempt) => {
3942
+ const result = await executeWithFallback("read", plan, getAdaptedRetryPolicy(), async (method, attempt) => {
1792
3943
  const start = Date.now();
1793
3944
  try {
1794
3945
  switch (method) {
1795
3946
  case "ax": {
1796
3947
  if (target) {
1797
- const found = await bridge.call("ax.findElement", {
1798
- pid: targetPid,
1799
- title: target,
1800
- exact: false,
1801
- });
3948
+ // L2-65 fix: Try exact match first to avoid reading from the wrong
3949
+ // window when multiple windows share a title prefix (e.g. "Untitled 39" vs "Untitled 40").
3950
+ // Minimized windows may be skipped by the bridge search, so an inexact match
3951
+ // can silently return a sibling window's content with no warning.
3952
+ let found;
3953
+ try {
3954
+ found = await bridge.call("ax.findElement", {
3955
+ pid: targetPid,
3956
+ title: target,
3957
+ exact: true,
3958
+ });
3959
+ }
3960
+ catch {
3961
+ // Exact match failed — fall back to fuzzy match
3962
+ found = await bridge.call("ax.findElement", {
3963
+ pid: targetPid,
3964
+ title: target,
3965
+ exact: false,
3966
+ });
3967
+ }
1802
3968
  const val = await bridge.call("ax.getElementValue", {
1803
3969
  pid: targetPid,
1804
3970
  elementPath: found.elementPath,
1805
3971
  });
3972
+ // L2-59+L2-61+L2-68 fix: If matched element has no value (e.g. AXWindow), find a
3973
+ // text-bearing child element SCOPED to the target window.
3974
+ // L2-68: Previously used unscoped ax.findElement(role) which returned AXTextArea from
3975
+ // ANY window. Now uses ax.getElementTree(windowId) to scope the search.
3976
+ if (!val.value) {
3977
+ // Try to find the matching CG windowId by title
3978
+ let windowTree = null;
3979
+ try {
3980
+ const wins = await bridge.call("app.windows");
3981
+ const matchWin = wins.find((w) => w.title === target) ?? wins.find((w) => w.title?.includes(target));
3982
+ if (matchWin?.windowId) {
3983
+ windowTree = await bridge.call("ax.getElementTree", {
3984
+ pid: targetPid,
3985
+ windowId: matchWin.windowId,
3986
+ maxDepth: 8,
3987
+ });
3988
+ }
3989
+ }
3990
+ catch { /* fall through to unscoped search */ }
3991
+ // Walk the window tree to find first text-bearing element
3992
+ const textRoles = new Set(["AXTextArea", "AXTextField", "AXWebArea"]);
3993
+ const findTextInTree = (node, path) => {
3994
+ if (node?.role && textRoles.has(node.role) && node.value) {
3995
+ return { value: node.value, path };
3996
+ }
3997
+ if (node?.children && Array.isArray(node.children)) {
3998
+ for (let i = 0; i < node.children.length; i++) {
3999
+ const result = findTextInTree(node.children[i], [...path, i]);
4000
+ if (result)
4001
+ return result;
4002
+ }
4003
+ }
4004
+ return null;
4005
+ };
4006
+ if (windowTree) {
4007
+ const textNode = findTextInTree(windowTree, found.elementPath);
4008
+ if (textNode?.value) {
4009
+ return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: textNode.value };
4010
+ }
4011
+ }
4012
+ // Fallback: unscoped search (original L2-59 behavior) if window-scoped search fails
4013
+ const fallbackRoles = ["AXTextArea", "AXTextField", "AXWebArea"];
4014
+ for (const role of fallbackRoles) {
4015
+ try {
4016
+ const textEl = await bridge.call("ax.findElement", {
4017
+ pid: targetPid,
4018
+ role,
4019
+ maxDepth: 10,
4020
+ });
4021
+ const textVal = await bridge.call("ax.getElementValue", {
4022
+ pid: targetPid,
4023
+ elementPath: textEl.elementPath,
4024
+ });
4025
+ if (textVal.value) {
4026
+ return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: textVal.value };
4027
+ }
4028
+ }
4029
+ catch { /* try next role */ }
4030
+ }
4031
+ }
1806
4032
  return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: val.value ?? "" };
1807
4033
  }
1808
4034
  // No specific target — get the full element tree text
@@ -1874,23 +4100,34 @@ originalTool("read_with_fallback", "Read text content from the screen or a speci
1874
4100
  return { content: [{ type: "text", text: `Failed to read${target ? ` "${target}"` : ""} — all methods exhausted. Last error: ${result.error}` }] };
1875
4101
  });
1876
4102
  // ── locate_with_fallback ──
1877
- originalTool("locate_with_fallback", "Find an element's position on screen using the canonical fallback chain: AX → CDP → OCR. Returns bounds (x, y, width, height).", {
4103
+ server.tool("locate_with_fallback", "Find an element's position on screen using the canonical fallback chain: AX → CDP → OCR. Returns bounds (x, y, width, height).", {
1878
4104
  target: z.string().describe("Text, title, or identifier of the element to locate"),
1879
4105
  bundleId: z.string().optional().describe("App bundle ID"),
1880
4106
  }, async ({ target, bundleId }) => {
1881
4107
  await ensureBridge();
1882
4108
  const plan = planExecution("locate", infra());
1883
4109
  const targetPid = await resolvePid(bundleId);
1884
- const result = await executeWithFallback("locate", plan, DEFAULT_RETRY_POLICY, async (method, attempt) => {
4110
+ const result = await executeWithFallback("locate", plan, getAdaptedRetryPolicy(), async (method, attempt) => {
1885
4111
  const start = Date.now();
1886
4112
  try {
1887
4113
  switch (method) {
1888
4114
  case "ax": {
1889
- const found = await bridge.call("ax.findElement", {
1890
- pid: targetPid,
1891
- title: target,
1892
- exact: false,
1893
- });
4115
+ // L2-65 fix: Try exact match first
4116
+ let found;
4117
+ try {
4118
+ found = await bridge.call("ax.findElement", {
4119
+ pid: targetPid,
4120
+ title: target,
4121
+ exact: true,
4122
+ });
4123
+ }
4124
+ catch {
4125
+ found = await bridge.call("ax.findElement", {
4126
+ pid: targetPid,
4127
+ title: target,
4128
+ exact: false,
4129
+ });
4130
+ }
1894
4131
  if (!found.bounds)
1895
4132
  throw new Error("Element found but has no bounds");
1896
4133
  const b = found.bounds;
@@ -1946,7 +4183,7 @@ originalTool("locate_with_fallback", "Find an element's position on screen using
1946
4183
  return formatResult("Located", target, result);
1947
4184
  });
1948
4185
  // ── select_with_fallback ──
1949
- originalTool("select_with_fallback", "Select an option from a dropdown/menu using the canonical fallback chain: AX → CDP. Finds the control, opens it, and picks the specified option.", {
4186
+ server.tool("select_with_fallback", "Select an option from a dropdown/menu using the canonical fallback chain: AX → CDP. Finds the control, opens it, and picks the specified option.", {
1950
4187
  target: z.string().describe("Label or title of the dropdown/menu control"),
1951
4188
  option: z.string().describe("Text of the option to select"),
1952
4189
  bundleId: z.string().optional().describe("App bundle ID"),
@@ -1954,7 +4191,7 @@ originalTool("select_with_fallback", "Select an option from a dropdown/menu usin
1954
4191
  await ensureBridge();
1955
4192
  const plan = planExecution("select", infra());
1956
4193
  const targetPid = await resolvePid(bundleId);
1957
- const result = await executeWithFallback("select", plan, DEFAULT_RETRY_POLICY, async (method, attempt) => {
4194
+ const result = await executeWithFallback("select", plan, getAdaptedRetryPolicy(), async (method, attempt) => {
1958
4195
  const start = Date.now();
1959
4196
  try {
1960
4197
  switch (method) {
@@ -2021,7 +4258,7 @@ originalTool("select_with_fallback", "Select an option from a dropdown/menu usin
2021
4258
  return formatResult("Selected", `${target} → ${option}`, result);
2022
4259
  });
2023
4260
  // ── scroll_with_fallback ──
2024
- originalTool("scroll_with_fallback", "Scroll within an element or the active window using the canonical fallback chain: AX → CDP → coordinates. Scrolls until target text is visible, or by a fixed amount.", {
4261
+ server.tool("scroll_with_fallback", "Scroll within an element or the active window using the canonical fallback chain: AX → CDP → coordinates. Scrolls until target text is visible, or by a fixed amount.", {
2025
4262
  direction: z.enum(["up", "down", "left", "right"]).describe("Scroll direction"),
2026
4263
  amount: z.number().optional().describe("Scroll amount in pixels (default: 300)"),
2027
4264
  target: z.string().optional().describe("Scroll until this text is visible (overrides amount)"),
@@ -2031,6 +4268,17 @@ originalTool("scroll_with_fallback", "Scroll within an element or the active win
2031
4268
  const plan = planExecution("scroll", infra());
2032
4269
  const targetPid = await resolvePid(bundleId);
2033
4270
  const scrollAmount = amount ?? 300;
4271
+ // Resolve scroll coordinates — center of the frontmost window
4272
+ let scrollX = 400, scrollY = 400;
4273
+ try {
4274
+ const wins = await bridge.call("cg.windows", {});
4275
+ if (wins && wins.length > 0) {
4276
+ const w = wins[0];
4277
+ scrollX = Math.round(w.x + w.width / 2);
4278
+ scrollY = Math.round(w.y + w.height / 2);
4279
+ }
4280
+ }
4281
+ catch { /* fallback to default coords */ }
2034
4282
  // If target is specified, scroll in a loop until text is visible (max 10 scrolls)
2035
4283
  if (target) {
2036
4284
  for (let i = 0; i < 10; i++) {
@@ -2049,26 +4297,21 @@ originalTool("scroll_with_fallback", "Scroll within an element or the active win
2049
4297
  // Scroll once
2050
4298
  const deltaX = direction === "left" ? -scrollAmount : direction === "right" ? scrollAmount : 0;
2051
4299
  const deltaY = direction === "up" ? -scrollAmount : direction === "down" ? scrollAmount : 0;
2052
- await bridge.call("cg.scroll", { deltaX, deltaY });
4300
+ await bridge.call("cg.scroll", { x: scrollX, y: scrollY, deltaX, deltaY });
2053
4301
  await new Promise((r) => setTimeout(r, 400));
2054
4302
  }
2055
4303
  return { content: [{ type: "text", text: `Scrolled ${direction} 10 times but "${target}" not found.` }] };
2056
4304
  }
2057
4305
  // Fixed-amount scroll via fallback chain
2058
- const result = await executeWithFallback("scroll", plan, DEFAULT_RETRY_POLICY, async (method, attempt) => {
4306
+ const result = await executeWithFallback("scroll", plan, getAdaptedRetryPolicy(), async (method, attempt) => {
2059
4307
  const start = Date.now();
2060
4308
  try {
2061
4309
  const deltaX = direction === "left" ? -scrollAmount : direction === "right" ? scrollAmount : 0;
2062
4310
  const deltaY = direction === "up" ? -scrollAmount : direction === "down" ? scrollAmount : 0;
2063
4311
  switch (method) {
2064
4312
  case "ax": {
2065
- // Use AX scroll action on the focused element
2066
- const tree = await bridge.call("ax.getElementTree", {
2067
- pid: targetPid,
2068
- maxDepth: 1,
2069
- });
2070
- // Fall through to cg.scroll since AX scroll is less reliable
2071
- await bridge.call("cg.scroll", { deltaX, deltaY });
4313
+ // AX scroll is unreliable — use CG scroll directly (works on the focused app)
4314
+ await bridge.call("cg.scroll", { x: scrollX, y: scrollY, deltaX, deltaY });
2072
4315
  return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: `${direction} ${scrollAmount}px` };
2073
4316
  }
2074
4317
  case "cdp": {
@@ -2088,7 +4331,7 @@ originalTool("scroll_with_fallback", "Scroll within an element or the active win
2088
4331
  }
2089
4332
  }
2090
4333
  case "coordinates": {
2091
- await bridge.call("cg.scroll", { deltaX, deltaY });
4334
+ await bridge.call("cg.scroll", { x: scrollX, y: scrollY, deltaX, deltaY });
2092
4335
  return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: `${direction} ${scrollAmount}px` };
2093
4336
  }
2094
4337
  }
@@ -2101,7 +4344,7 @@ originalTool("scroll_with_fallback", "Scroll within an element or the active win
2101
4344
  return formatResult("Scrolled", `${direction} ${scrollAmount}px`, result);
2102
4345
  });
2103
4346
  // ── wait_for_state ──
2104
- originalTool("wait_for_state", "Wait until a condition is met on screen: text appears, text disappears, or element becomes available. Polls at intervals using the fallback chain.", {
4347
+ server.tool("wait_for_state", "Wait until a condition is met on screen: text appears, text disappears, or element becomes available. Polls at intervals using the fallback chain.", {
2105
4348
  condition: z.enum(["text_appears", "text_disappears", "element_exists"]).describe("What to wait for"),
2106
4349
  target: z.string().describe("Text or element to watch for"),
2107
4350
  timeoutMs: z.number().optional().describe("Maximum wait time in ms (default: 10000)"),
@@ -2123,13 +4366,29 @@ originalTool("wait_for_state", "Wait until a condition is met on screen: text ap
2123
4366
  found = true;
2124
4367
  }
2125
4368
  else {
2126
- // Text-based: try OCR
2127
- const shot = await bridge.call("cg.captureScreen", {});
2128
- const matches = await bridge.call("vision.findText", {
2129
- imagePath: shot.path,
2130
- searchText: target,
2131
- });
2132
- found = Array.isArray(matches) && matches.length > 0;
4369
+ // L2-67 fix: Try AX text search first (works for non-frontmost apps),
4370
+ // then fall back to OCR if AX doesn't find it.
4371
+ try {
4372
+ const axEl = await bridge.call("ax.findElement", { pid: targetPid, title: target, exact: false });
4373
+ found = true;
4374
+ }
4375
+ catch {
4376
+ // AX title search failed — also try reading text content via AX tree
4377
+ try {
4378
+ const tree = await bridge.call("ax.getElementTree", { pid: targetPid, maxDepth: 4 });
4379
+ const desc = tree.description ?? JSON.stringify(tree);
4380
+ found = desc.includes(target);
4381
+ }
4382
+ catch {
4383
+ // AX unavailable — fall back to OCR
4384
+ const shot = await bridge.call("cg.captureScreen", {});
4385
+ const matches = await bridge.call("vision.findText", {
4386
+ imagePath: shot.path,
4387
+ searchText: target,
4388
+ });
4389
+ found = Array.isArray(matches) && matches.length > 0;
4390
+ }
4391
+ }
2133
4392
  }
2134
4393
  }
2135
4394
  catch {
@@ -2172,13 +4431,13 @@ originalTool("wait_for_state", "Wait until a condition is met on screen: text ap
2172
4431
  // ═══════════════════════════════════════════════
2173
4432
  // JOBS — persistent multi-step automation with resume
2174
4433
  // ═══════════════════════════════════════════════
2175
- originalTool("job_create", "Create a new automation job. Jobs persist across restarts and can be resumed from the last successful step.", {
4434
+ originalTool("job_create", "Create a new automation job. Jobs persist across restarts and can be resumed from the last successful step. Supports chaining: set dependsOn to wait for another job, and vars for template substitution (e.g. {PROMPT_TEXT}).", {
2176
4435
  task: z.string().describe("Human-readable description of what this job should do"),
2177
4436
  playbookId: z.string().optional().describe("Playbook ID to drive this job (optional — AI-only if omitted)"),
2178
4437
  bundleId: z.string().optional().describe("Target application bundle ID (e.g., 'com.apple.Safari'). Omit for app-agnostic jobs."),
2179
4438
  windowId: z.number().optional().describe("Target window ID within the application. Omit for app-agnostic jobs."),
2180
4439
  steps: z.array(z.object({
2181
- action: z.string().describe("Action name (e.g., navigate, click, type_text, screenshot, key)"),
4440
+ action: z.string().describe("Action name (e.g., navigate, click, type_text, screenshot, key, browser_js, cdp_key_event)"),
2182
4441
  target: z.string().optional().describe("Target element or URL"),
2183
4442
  description: z.string().optional().describe("Human-readable description"),
2184
4443
  text: z.string().optional().describe("Text payload for type_text/type_into actions"),
@@ -2189,7 +4448,10 @@ originalTool("job_create", "Create a new automation job. Jobs persist across res
2189
4448
  priority: z.number().optional().describe("Priority (lower = higher priority, default: 10)"),
2190
4449
  maxRetries: z.number().optional().describe("Max retry attempts on failure (default: 3)"),
2191
4450
  sessionId: z.string().optional().describe("Bind to an existing supervisor session"),
2192
- }, async ({ task, playbookId, bundleId, windowId, steps, tags, priority, maxRetries, sessionId }) => {
4451
+ chainId: z.string().optional().describe("Chain ID to group linked jobs into a flow"),
4452
+ dependsOn: z.string().optional().describe("Job ID this job depends on — won't run until dependency is done"),
4453
+ vars: z.record(z.string(), z.string()).optional().describe("Variables for template substitution in playbook steps (e.g. {PROMPT_TEXT} → 'hello world'). Use {prev.outputKey} to reference outputs from dependsOn job."),
4454
+ }, async ({ task, playbookId, bundleId, windowId, steps, tags, priority, maxRetries, sessionId, chainId, dependsOn, vars }) => {
2193
4455
  const createOpts = { task };
2194
4456
  if (playbookId !== undefined)
2195
4457
  createOpts.playbookId = playbookId;
@@ -2207,8 +4469,50 @@ originalTool("job_create", "Create a new automation job. Jobs persist across res
2207
4469
  createOpts.maxRetries = maxRetries;
2208
4470
  if (sessionId !== undefined)
2209
4471
  createOpts.sessionId = sessionId;
4472
+ if (chainId !== undefined)
4473
+ createOpts.chainId = chainId;
4474
+ if (dependsOn !== undefined)
4475
+ createOpts.dependsOn = dependsOn;
4476
+ if (vars !== undefined)
4477
+ createOpts.vars = vars;
2210
4478
  const job = jobManager.create(createOpts);
2211
- return { content: [{ type: "text", text: `Job created: ${job.id}\nTask: ${job.task}\nState: ${job.state}\nSteps: ${job.steps.length}\nPriority: ${job.priority}\nTarget: ${job.bundleId ?? "(any app)"}${job.windowId != null ? ` window ${job.windowId}` : ""}` }] };
4479
+ const extra = [];
4480
+ if (job.chainId)
4481
+ extra.push(`Chain: ${job.chainId}`);
4482
+ if (job.dependsOn)
4483
+ extra.push(`Depends on: ${job.dependsOn}`);
4484
+ if (job.vars && Object.keys(job.vars).length > 0)
4485
+ extra.push(`Vars: ${Object.keys(job.vars).join(", ")}`);
4486
+ return { content: [{ type: "text", text: `Job created: ${job.id}\nTask: ${job.task}\nState: ${job.state}\nSteps: ${job.steps.length}\nPriority: ${job.priority}\nTarget: ${job.bundleId ?? "(any app)"}${job.windowId != null ? ` window ${job.windowId}` : ""}${extra.length > 0 ? "\n" + extra.join("\n") : ""}` }] };
4487
+ });
4488
+ originalTool("job_create_chain", "Create a chain of linked jobs that run sequentially. Each job waits for the previous one to finish. Use vars with {prev.outputKey} to pass data between jobs.", {
4489
+ jobs: z.array(z.object({
4490
+ task: z.string().describe("What this job does"),
4491
+ playbookId: z.string().optional().describe("Playbook ID"),
4492
+ bundleId: z.string().optional().describe("Target app bundle ID"),
4493
+ vars: z.record(z.string(), z.string()).optional().describe("Variables — use {prev.Read_Codex_response} to get output from prior job step"),
4494
+ tags: z.array(z.string()).optional(),
4495
+ })).describe("Ordered list of jobs to chain"),
4496
+ }, async ({ jobs }) => {
4497
+ const cleanJobs = jobs.map(j => {
4498
+ const clean = { task: j.task };
4499
+ if (j.playbookId)
4500
+ clean.playbookId = j.playbookId;
4501
+ if (j.bundleId)
4502
+ clean.bundleId = j.bundleId;
4503
+ if (j.vars)
4504
+ clean.vars = j.vars;
4505
+ if (j.tags)
4506
+ clean.tags = j.tags;
4507
+ return clean;
4508
+ });
4509
+ const chain = jobManager.createChain({ jobs: cleanJobs });
4510
+ const lines = [`Chain created: ${chain[0]?.chainId ?? "unknown"} (${chain.length} jobs)`];
4511
+ for (const job of chain) {
4512
+ lines.push(` ${job.id}: ${job.task}${job.dependsOn ? ` (after ${job.dependsOn})` : " (first)"}`);
4513
+ }
4514
+ lines.push("", "Run with: job_run_all() to execute the full chain sequentially.");
4515
+ return { content: [{ type: "text", text: lines.join("\n") }] };
2212
4516
  });
2213
4517
  originalTool("job_status", "Get detailed status of a job including step progress and resume point.", {
2214
4518
  jobId: z.string().describe("Job ID"),
@@ -2232,6 +4536,12 @@ originalTool("job_status", "Get detailed status of a job including step progress
2232
4536
  `Resume point: ${resume ? `step ${resume.stepIndex} — ${resume.step.description ?? resume.step.action}` : "(none — all done or no pending steps)"}`,
2233
4537
  `Retries: ${job.retries}/${job.maxRetries}`,
2234
4538
  ];
4539
+ if (job.chainId)
4540
+ lines.push(`Chain: ${job.chainId}`);
4541
+ if (job.dependsOn)
4542
+ lines.push(`Depends on: ${job.dependsOn}`);
4543
+ if (job.vars && Object.keys(job.vars).length > 0)
4544
+ lines.push(`Vars: ${JSON.stringify(job.vars)}`);
2235
4545
  if (job.blockReason)
2236
4546
  lines.push(`Block reason: ${job.blockReason}`);
2237
4547
  if (job.lastError)
@@ -2245,6 +4555,8 @@ originalTool("job_status", "Get detailed status of a job including step progress
2245
4555
  for (const s of job.steps) {
2246
4556
  const icon = s.status === "done" ? "✓" : s.status === "failed" ? "✗" : s.status === "skipped" ? "–" : "○";
2247
4557
  lines.push(` ${icon} [${s.index}] ${s.description ?? s.action}${s.error ? ` (${s.error})` : ""}${s.durationMs != null ? ` ${s.durationMs}ms` : ""}`);
4558
+ if (s.output)
4559
+ lines.push(` → ${s.output.substring(0, 200)}${s.output.length > 200 ? "..." : ""}`);
2248
4560
  }
2249
4561
  }
2250
4562
  return { content: [{ type: "text", text: lines.join("\n") }] };
@@ -2342,22 +4654,42 @@ originalTool("job_remove", "Remove a job entirely (any state).", {
2342
4654
  return { content: [{ type: "text", text: ok ? `Job ${jobId} removed.` : `Job ${jobId} not found.` }] };
2343
4655
  });
2344
4656
  // ── Job Runner + Worker ─────────────────────────
2345
- const PLAYBOOKS_DIR = path.join(os.homedir(), ".screenhand", "playbooks");
4657
+ const PLAYBOOKS_DIR = playbooksDir; // Use same dir as recorder (project-local ./playbooks/)
2346
4658
  let activeJobRunner = null;
4659
+ let activePlaybookStore = null;
4660
+ let activePlaybookEngine = null;
2347
4661
  function getJobRunner() {
4662
+ // Always reload playbooks from disk (new files may have been added)
4663
+ if (!activePlaybookStore) {
4664
+ activePlaybookStore = new PlaybookStore(PLAYBOOKS_DIR);
4665
+ }
4666
+ activePlaybookStore.load();
2348
4667
  if (!activeJobRunner) {
2349
4668
  // Build playbook engine stack: adapter → runtime → engine
2350
4669
  const adapter = new AccessibilityAdapter(bridge);
2351
4670
  const logger = new TimelineLogger();
2352
- const runtimeService = new AutomationRuntimeService(adapter, logger);
4671
+ const locCache = new LocatorCache();
4672
+ locCache.setLearningEngine(learningEngine);
4673
+ const runtimeService = new AutomationRuntimeService(adapter, logger, locCache);
2353
4674
  const playbookEngine = new PlaybookEngine(runtimeService);
2354
- const playbookStore = new PlaybookStore(PLAYBOOKS_DIR);
2355
- playbookStore.load();
4675
+ activePlaybookEngine = playbookEngine;
4676
+ // Wire CDP into playbook engine for browser_js / cdp_key_event steps
4677
+ playbookEngine.setCDPConnect(async (overridePort) => {
4678
+ if (overridePort) {
4679
+ if (!CDP)
4680
+ CDP = (await import("chrome-remote-interface")).default;
4681
+ const client = await CDP({ port: overridePort });
4682
+ return { Runtime: client.Runtime, Input: client.Input, close: () => client.close() };
4683
+ }
4684
+ const { CDP: CDPClient, port } = await ensureCDP();
4685
+ const client = await CDPClient({ port });
4686
+ return { Runtime: client.Runtime, Input: client.Input, close: () => client.close() };
4687
+ });
2356
4688
  activeJobRunner = new JobRunner(bridge, jobManager, leaseManager, supervisor, (() => {
2357
4689
  const cfg = {
2358
4690
  hasCDP: cdpPort !== null,
2359
4691
  playbookEngine,
2360
- playbookStore,
4692
+ playbookStore: activePlaybookStore,
2361
4693
  runtimeService,
2362
4694
  };
2363
4695
  if (cdpPort) {
@@ -2485,243 +4817,1056 @@ originalTool("worker_status", "Get the current status of the worker daemon (read
2485
4817
  return { content: [{ type: "text", text: lines.join("\n") }] };
2486
4818
  });
2487
4819
  // ═══════════════════════════════════════════════
2488
- // CODEX MONITOR — watch VS Code terminals, auto-assign tasks
4820
+ // PLANNER — goal-oriented planning
2489
4821
  // ═══════════════════════════════════════════════
2490
- // Daemon state directory
2491
- const MONITOR_DIR = path.join(os.homedir(), ".screenhand", "monitor");
2492
- const MONITOR_STATE = path.join(MONITOR_DIR, "state.json");
2493
- const MONITOR_TASKS = path.join(MONITOR_DIR, "tasks.json");
2494
- const MONITOR_PID = path.join(MONITOR_DIR, "daemon.pid");
2495
- const MONITOR_LOG = path.join(MONITOR_DIR, "daemon.log");
2496
- const DAEMON_SCRIPT = path.resolve(__dirname, "scripts", "codex-monitor-daemon.ts");
2497
- function isDaemonRunning() {
2498
- try {
2499
- if (!fs.existsSync(MONITOR_PID))
2500
- return { running: false, pid: null };
2501
- const pid = Number(fs.readFileSync(MONITOR_PID, "utf-8").trim());
2502
- // Check if process is alive
2503
- process.kill(pid, 0);
2504
- return { running: true, pid };
4822
+ originalTool("plan_goal", "Create a goal and generate an execution plan. Returns the plan source (playbook/strategy/llm), steps, and confidence. Does NOT execute — use the returned plan for review or pass to job system.", {
4823
+ goal: z.string().describe("What you want to achieve (e.g. 'Export Premiere Pro timeline as H.264')"),
4824
+ }, async ({ goal: goalDescription }) => {
4825
+ const goal = planner.createGoal(goalDescription);
4826
+ await planner.planGoal(goal);
4827
+ goalStore.add(goal);
4828
+ const sg = goal.subgoals[0];
4829
+ const plan = sg.plan;
4830
+ if (!plan) {
4831
+ return { content: [{ type: "text", text: "No plan could be generated." }] };
2505
4832
  }
2506
- catch {
2507
- return { running: false, pid: null };
4833
+ const lines = [
4834
+ `Goal: ${goalDescription}`,
4835
+ `Plan source: ${plan.source}${plan.sourceId ? ` (${plan.sourceId})` : ""}`,
4836
+ `Confidence: ${(plan.confidence * 100).toFixed(0)}%`,
4837
+ `Steps: ${plan.steps.length}`,
4838
+ "",
4839
+ ];
4840
+ for (let i = 0; i < plan.steps.length; i++) {
4841
+ const step = plan.steps[i];
4842
+ const params = Object.keys(step.params).length > 0
4843
+ ? ` ${JSON.stringify(step.params)}`
4844
+ : "";
4845
+ const llmTag = step.requiresLLM ? " [LLM]" : "";
4846
+ const postcond = step.expectedPostcondition
4847
+ ? ` → verify: ${step.expectedPostcondition.type}(${step.expectedPostcondition.target})`
4848
+ : "";
4849
+ lines.push(` ${i + 1}. ${step.tool || step.description}${params}${llmTag}${postcond}`);
2508
4850
  }
2509
- }
2510
- function readDaemonState() {
2511
- try {
2512
- if (!fs.existsSync(MONITOR_STATE))
2513
- return null;
2514
- return JSON.parse(fs.readFileSync(MONITOR_STATE, "utf-8"));
4851
+ lines.push("", `Goal ID: ${goal.id}`);
4852
+ return {
4853
+ content: [{ type: "text", text: lines.join("\n") }],
4854
+ _meta: { goalId: goal.id, plan },
4855
+ };
4856
+ });
4857
+ originalTool("plan_execute", "Execute a goal's plan automatically. Runs deterministic steps internally. Pauses at LLM steps and returns the step description for you to resolve with plan_step_resolve. On completion, saves the strategy to memory for future reuse.", {
4858
+ goalId: z.string().describe("Goal ID from plan_goal"),
4859
+ }, async ({ goalId }) => {
4860
+ const goal = goalStore.get(goalId);
4861
+ if (!goal) {
4862
+ return { content: [{ type: "text", text: `Goal not found: ${goalId}` }] };
2515
4863
  }
2516
- catch {
2517
- return null;
4864
+ const adaptiveBudget = learningEngine.getAdaptiveBudget(worldModel.getState().focusedApp?.bundleId ?? "unknown");
4865
+ const executor = new PlanExecutor(worldModel, planner, toolRegistry.toExecutor(), { postconditionWaitMs: adaptiveBudget.verifyMs, defaultStepTimeout: Math.max(30_000, adaptiveBudget.actMs * 2) }, recoveryEngine, learningEngine);
4866
+ const result = await executor.executeGoal(goal);
4867
+ goalStore.update(goalId, goal);
4868
+ // Check if paused at an LLM step
4869
+ if ("paused" in result) {
4870
+ const pause = result;
4871
+ return {
4872
+ content: [{ type: "text", text: [
4873
+ `PAUSED at step ${pause.stepIndex + 1}/${pause.totalSteps} — requires your interpretation.`,
4874
+ `Step: ${pause.stepDescription}`,
4875
+ "",
4876
+ "Use plan_step_resolve to provide the tool + params for this step,",
4877
+ "then call plan_execute again to continue.",
4878
+ ].join("\n") }],
4879
+ _meta: { goalId, paused: true, stepIndex: pause.stepIndex },
4880
+ };
2518
4881
  }
2519
- }
2520
- function readDaemonTasks() {
2521
- try {
2522
- if (!fs.existsSync(MONITOR_TASKS))
2523
- return [];
2524
- return JSON.parse(fs.readFileSync(MONITOR_TASKS, "utf-8"));
4882
+ // Completed — save strategy to memory if successful
4883
+ if (result.success) {
4884
+ try {
4885
+ const sg = goal.subgoals.find((s) => s.status === "completed");
4886
+ if (sg?.plan) {
4887
+ const steps = sg.plan.steps
4888
+ .filter((s) => s.status === "completed" && s.tool)
4889
+ .map((s) => ({ tool: s.tool, params: s.params }));
4890
+ if (steps.length > 0) {
4891
+ memory.appendStrategy({
4892
+ id: "str_plan_" + Date.now().toString(36),
4893
+ task: goal.description,
4894
+ steps,
4895
+ totalDurationMs: result.durationMs,
4896
+ successCount: 1,
4897
+ failCount: 0,
4898
+ lastUsed: new Date().toISOString(),
4899
+ tags: ["auto-plan", sg.plan.source],
4900
+ fingerprint: "",
4901
+ });
4902
+ }
4903
+ }
4904
+ }
4905
+ catch { /* strategy recording is best-effort */ }
2525
4906
  }
2526
- catch {
2527
- return [];
4907
+ const lines = [
4908
+ result.success ? "Goal completed successfully." : `Goal failed: ${result.error}`,
4909
+ `Steps: ${result.stepsExecuted} executed, ${result.replans} replans`,
4910
+ `Duration: ${result.durationMs}ms`,
4911
+ `Subgoals: ${result.subgoalsCompleted}/${result.totalSubgoals} completed`,
4912
+ "",
4913
+ "── EXECUTION LOG ──",
4914
+ ...("executionLog" in result ? result.executionLog : []),
4915
+ ];
4916
+ return { content: [{ type: "text", text: lines.join("\n") }] };
4917
+ });
4918
+ originalTool("plan_step", "Execute the next single step of a goal. For incremental client-driven execution. Returns the step result, or pauses at LLM steps for you to interpret.", {
4919
+ goalId: z.string().describe("Goal ID from plan_goal"),
4920
+ }, async ({ goalId }) => {
4921
+ const goal = goalStore.get(goalId);
4922
+ if (!goal) {
4923
+ return { content: [{ type: "text", text: `Goal not found: ${goalId}` }] };
2528
4924
  }
2529
- }
2530
- function writeDaemonTasks(tasks) {
2531
- fs.mkdirSync(MONITOR_DIR, { recursive: true });
2532
- fs.writeFileSync(MONITOR_TASKS, JSON.stringify(tasks, null, 2));
2533
- }
2534
- server.tool("codex_monitor_start", "Start a background daemon that monitors VS Code terminals for Codex/AI agent activity. Runs independently — survives Claude Code restarts. Watches terminal output via OCR, detects running/idle/done.", {
2535
- vscodePid: z.number().describe("Process ID of VS Code (get from 'apps' tool)"),
2536
- windowId: z.number().optional().describe("Window ID of the VS Code window (get from 'windows' tool). Auto-detected if omitted."),
2537
- label: z.string().optional().describe("Label for this terminal (default: 'Terminal')"),
2538
- pollIntervalMs: z.number().optional().describe("How often to poll in ms (default: 3000)"),
2539
- autoAssign: z.boolean().optional().describe("Auto-assign queued tasks when terminal goes idle (default: true)"),
2540
- }, async ({ vscodePid, windowId, label, pollIntervalMs, autoAssign }) => {
2541
- const { running, pid } = isDaemonRunning();
2542
- if (running) {
2543
- return { content: [{ type: "text", text: `Daemon already running (pid=${pid}). Use codex_monitor_stop first to restart.` }] };
2544
- }
2545
- // Build daemon args
2546
- const daemonArgs = ["tsx", DAEMON_SCRIPT, "--pid", String(vscodePid)];
2547
- if (windowId)
2548
- daemonArgs.push("--window", String(windowId));
2549
- if (pollIntervalMs)
2550
- daemonArgs.push("--poll", String(pollIntervalMs));
2551
- if (label)
2552
- daemonArgs.push("--label", label);
2553
- if (autoAssign === false)
2554
- daemonArgs.push("--no-auto-assign");
2555
- // Spawn detached daemon
2556
- const child = spawn("npx", daemonArgs, {
2557
- detached: true,
2558
- stdio: "ignore",
2559
- cwd: __dirname,
2560
- });
2561
- child.unref();
2562
- const daemonPid = child.pid;
2563
- // Wait a moment for daemon to start and write state
2564
- await new Promise((r) => setTimeout(r, 3000));
2565
- const state = readDaemonState();
2566
- const terminalId = state?.terminals?.[0]?.id ?? "pending";
4925
+ const adaptiveBudget = learningEngine.getAdaptiveBudget(worldModel.getState().focusedApp?.bundleId ?? "unknown");
4926
+ const executor = new PlanExecutor(worldModel, planner, toolRegistry.toExecutor(), { postconditionWaitMs: adaptiveBudget.verifyMs, defaultStepTimeout: Math.max(30_000, adaptiveBudget.actMs * 2) }, recoveryEngine, learningEngine);
4927
+ const result = await executor.executeNextStep(goal);
4928
+ goalStore.update(goalId, goal);
4929
+ if ("paused" in result) {
4930
+ const pause = result;
4931
+ return {
4932
+ content: [{ type: "text", text: [
4933
+ `Step ${pause.stepIndex + 1}/${pause.totalSteps} requires LLM interpretation:`,
4934
+ ` ${pause.stepDescription}`,
4935
+ "",
4936
+ "Use plan_step_resolve to provide tool + params, or execute the step yourself and call plan_step again.",
4937
+ ].join("\n") }],
4938
+ };
4939
+ }
4940
+ if ("goalId" in result) {
4941
+ // PlanResult — goal completed
4942
+ return {
4943
+ content: [{ type: "text", text: result.success
4944
+ ? `Goal completed: ${result.subgoalsCompleted}/${result.totalSubgoals} subgoals done.`
4945
+ : `Goal failed: ${result.error}` }],
4946
+ };
4947
+ }
4948
+ // StepResult
4949
+ const sr = result;
2567
4950
  return {
2568
- content: [{
2569
- type: "text",
2570
- text: `Background daemon started!\n` +
2571
- `Daemon PID: ${daemonPid}\n` +
2572
- `Terminal ID: ${terminalId}\n` +
2573
- `VS Code PID: ${vscodePid}\n` +
2574
- `Window ID: ${windowId ?? "auto-detecting"}\n` +
2575
- `Poll interval: ${pollIntervalMs ?? 3000}ms\n` +
2576
- `Auto-assign: ${autoAssign !== false}\n` +
2577
- `Log: ${MONITOR_LOG}\n` +
2578
- `State: ${MONITOR_STATE}\n\n` +
2579
- `The daemon runs independently — survives Claude Code restarts.\n` +
2580
- `Use codex_monitor_status to check on it anytime.`,
2581
- }],
4951
+ content: [{ type: "text", text: [
4952
+ sr.success ? `Step completed: ${sr.step.tool}` : `Step failed: ${sr.error}`,
4953
+ `Duration: ${sr.durationMs}ms`,
4954
+ sr.usedFallback ? "(used fallback tool)" : "",
4955
+ sr.postconditionMet ? "" : "Warning: postcondition not met",
4956
+ ].filter(Boolean).join("\n") }],
2582
4957
  };
2583
4958
  });
2584
- server.tool("codex_monitor_status", "Get status of the background monitor daemon. Shows terminal status, agent activity, task queue, and daemon health.", {
2585
- tail_log: z.number().optional().describe("Show last N lines of daemon log (default: 0, max: 50)"),
2586
- }, async ({ tail_log }) => {
2587
- const { running, pid } = isDaemonRunning();
2588
- const state = readDaemonState();
2589
- const tasks = readDaemonTasks();
4959
+ originalTool("plan_step_resolve", "Resolve a paused LLM step by providing the tool and params to use. The server executes the tool, verifies postconditions, and advances the plan.", {
4960
+ goalId: z.string().describe("Goal ID"),
4961
+ tool: z.string().describe("MCP tool name to execute for this step"),
4962
+ params: z.record(z.string(), z.unknown()).optional().describe("Tool parameters"),
4963
+ }, async ({ goalId, tool, params }) => {
4964
+ const goal = goalStore.get(goalId);
4965
+ if (!goal) {
4966
+ return { content: [{ type: "text", text: `Goal not found: ${goalId}` }] };
4967
+ }
4968
+ const adaptiveBudget = learningEngine.getAdaptiveBudget(worldModel.getState().focusedApp?.bundleId ?? "unknown");
4969
+ const executor = new PlanExecutor(worldModel, planner, toolRegistry.toExecutor(), { postconditionWaitMs: adaptiveBudget.verifyMs, defaultStepTimeout: Math.max(30_000, adaptiveBudget.actMs * 2) }, recoveryEngine, learningEngine);
4970
+ const result = await executor.resolveStep(goal, tool, params ?? {});
4971
+ goalStore.update(goalId, goal);
4972
+ return {
4973
+ content: [{ type: "text", text: result.success
4974
+ ? `Step resolved and completed: ${tool}`
4975
+ : `Step failed: ${result.error}` }],
4976
+ };
4977
+ });
4978
+ originalTool("plan_status", "Check the current status of a goal: subgoal progress, current step, completion state.", {
4979
+ goalId: z.string().describe("Goal ID"),
4980
+ }, async ({ goalId }) => {
4981
+ const goal = goalStore.get(goalId);
4982
+ if (!goal) {
4983
+ return { content: [{ type: "text", text: `Goal not found: ${goalId}` }] };
4984
+ }
4985
+ const lines = [
4986
+ `Goal: ${goal.description}`,
4987
+ `Status: ${goal.status}`,
4988
+ `Created: ${goal.createdAt}`,
4989
+ goal.completedAt ? `Completed: ${goal.completedAt}` : "",
4990
+ "",
4991
+ ].filter(Boolean);
4992
+ for (let i = 0; i < goal.subgoals.length; i++) {
4993
+ const sg = goal.subgoals[i];
4994
+ const plan = sg.plan;
4995
+ const progress = plan
4996
+ ? `${plan.currentStepIndex}/${plan.steps.length} steps`
4997
+ : "no plan";
4998
+ lines.push(` Subgoal ${i + 1}: ${sg.status} (${progress}, ${sg.attempts} attempts)`);
4999
+ if (sg.lastError)
5000
+ lines.push(` Error: ${sg.lastError}`);
5001
+ }
5002
+ if (goal.pausedAt) {
5003
+ lines.push("", `Paused at: subgoal ${goal.pausedAt.subgoalIndex + 1}, step ${goal.pausedAt.stepIndex + 1}`);
5004
+ }
5005
+ return { content: [{ type: "text", text: lines.join("\n") }] };
5006
+ });
5007
+ originalTool("plan_list", "List all goals (active, completed, failed). Optionally filter by status.", {
5008
+ status: z.string().optional().describe("Filter by status: pending, active, completed, failed"),
5009
+ }, async ({ status }) => {
5010
+ const goals = status
5011
+ ? goalStore.list(status)
5012
+ : goalStore.list();
5013
+ if (goals.length === 0) {
5014
+ return { content: [{ type: "text", text: "No goals found." }] };
5015
+ }
5016
+ const lines = goals.map((g) => {
5017
+ const sgDone = g.subgoals.filter((s) => s.status === "completed").length;
5018
+ return ` ${g.id}: ${g.status} — "${g.description}" (${sgDone}/${g.subgoals.length} subgoals, ${g.createdAt})`;
5019
+ });
5020
+ return { content: [{ type: "text", text: [`${goals.length} goal(s):`, ...lines].join("\n") }] };
5021
+ });
5022
+ // ═══════════════════════════════════════════════
5023
+ // PERCEPTION + WORLD MODEL — continuous state tracking
5024
+ // ═══════════════════════════════════════════════
5025
+ originalTool("perception_status", "Get continuous perception status: multi-rate loop stats, freshness of AX/CDP/vision sources, and event counts.", {}, async () => {
5026
+ const stats = perceptionManager.getStats();
5027
+ const freshness = perceptionManager.getFreshnessSummary();
5028
+ const lines = [
5029
+ freshness,
5030
+ `Running: ${perceptionManager.isRunning}`,
5031
+ ];
5032
+ if (stats.started) {
5033
+ lines.push(`Started: ${stats.startedAt}`);
5034
+ lines.push("");
5035
+ const pcConfig = perceptionManager.getConfig();
5036
+ lines.push("Loop cycles:");
5037
+ lines.push(` Fast (${pcConfig?.fastIntervalMs ?? 100}ms): ${stats.fastCycles} cycles`);
5038
+ lines.push(` Medium (${pcConfig?.mediumIntervalMs ?? 500}ms): ${stats.mediumCycles} cycles`);
5039
+ lines.push(` Slow (${pcConfig?.slowIntervalMs ?? 2000}ms): ${stats.slowCycles} cycles`);
5040
+ lines.push("");
5041
+ lines.push("Events processed:");
5042
+ lines.push(` AX events: ${stats.axEventsProcessed}`);
5043
+ lines.push(` AX tree polls: ${stats.axTreePolls}`);
5044
+ lines.push(` CDP mutations: ${stats.cdpMutationsProcessed}`);
5045
+ lines.push(` CDP snapshots: ${stats.cdpSnapshots}`);
5046
+ lines.push(` Vision diffs: ${stats.visionDiffs}`);
5047
+ lines.push(` Vision OCRs: ${stats.visionOCRs}`);
5048
+ }
5049
+ return { content: [{ type: "text", text: lines.join("\n") }] };
5050
+ });
5051
+ originalTool("world_state", "Get the current world model state: focused app, window/control counts, active dialogs, and last scan age. Use verbose=true to dump all controls.", {
5052
+ verbose: z.boolean().optional().default(false).describe("Dump all controls with roles, labels, positions, and confidence"),
5053
+ }, async ({ verbose }) => {
5054
+ const state = worldModel.getState();
5055
+ const summary = worldModel.toSummary();
5056
+ const focused = worldModel.getFocusedWindow();
5057
+ const dialogs = worldModel.getActiveDialogs();
2590
5058
  const lines = [];
2591
- lines.push(`Daemon: ${running ? "RUNNING" : "STOPPED"} (pid=${pid ?? "none"})`);
2592
- if (state?.terminals) {
2593
- for (const t of state.terminals) {
2594
- const lastOutput = (t.lastOutput || "").split("\n").slice(-5).join("\n").trim();
2595
- lines.push("");
2596
- lines.push(`--- ${t.id} ---`);
2597
- lines.push(` Status: ${(t.status || "unknown").toUpperCase()}`);
2598
- lines.push(` VS Code PID: ${t.vscodePid}`);
2599
- lines.push(` Window ID: ${t.windowId ?? "unknown"}`);
2600
- lines.push(` Current task: ${t.lastTask ?? "none"}`);
2601
- lines.push(` Tasks completed: ${t.tasksCompleted}`);
2602
- lines.push(` Last poll: ${t.lastPollAt}`);
2603
- lines.push(` Last output (tail):`);
2604
- lines.push(` ${lastOutput.split("\n").join("\n ")}`);
2605
- }
2606
- }
2607
- else if (!running) {
2608
- lines.push("\nNo monitor running. Use codex_monitor_start first.");
2609
- }
2610
- const queued = tasks.filter((t) => t.status === "queued").length;
2611
- const runningTasks = tasks.filter((t) => t.status === "running").length;
2612
- const completed = tasks.filter((t) => t.status === "completed").length;
5059
+ // Warn when world model is empty
5060
+ if (state.windows.size === 0 && !state.focusedApp) {
5061
+ if (!perceptionManager.isRunning) {
5062
+ lines.push("Warning: World model is empty. Run perception_start or use focus()/ui_tree to populate state.");
5063
+ }
5064
+ else {
5065
+ lines.push("World model is empty — perception is running but no data received yet.");
5066
+ }
5067
+ lines.push("");
5068
+ }
5069
+ lines.push(summary);
5070
+ if (focused) {
5071
+ lines.push(`\nFocused window: "${focused.title.value}" (id=${focused.windowId}, ${focused.controls.size} controls, confidence=${focused.title.confidence.toFixed(2)})`);
5072
+ }
5073
+ if (dialogs.length > 0) {
5074
+ lines.push("\nActive dialogs:");
5075
+ for (const d of dialogs) {
5076
+ lines.push(` - ${d.type}: "${d.title}" (${d.controls.size} controls, detected ${d.detectedAt})`);
5077
+ }
5078
+ }
5079
+ lines.push(`\nSession: ${state.sessionId || "(not initialized)"}`);
5080
+ // Show browser domain state (URL, title, tabs) if available
5081
+ for (const [bid, domain] of state.appDomains) {
5082
+ if (domain.family === "browser") {
5083
+ const bs = domain;
5084
+ if (bs.url?.value || bs.title?.value) {
5085
+ lines.push(`\nBrowser (${bid}):`);
5086
+ if (bs.url?.value)
5087
+ lines.push(` URL: ${bs.url.value}`);
5088
+ if (bs.title?.value)
5089
+ lines.push(` Title: ${bs.title.value}`);
5090
+ if (bs.tabs && bs.tabs.length > 0) {
5091
+ lines.push(` Tabs (${bs.tabs.length}):`);
5092
+ for (const tab of bs.tabs) {
5093
+ lines.push(` ${tab.index}. ${tab.isActive ? "▸ " : " "}${tab.title} | ${tab.url}`);
5094
+ }
5095
+ }
5096
+ }
5097
+ }
5098
+ }
5099
+ // Show tracked entities
5100
+ const entities = worldModel.getTrackedEntities();
5101
+ if (entities.size > 0) {
5102
+ lines.push(`\nTracked entities (${entities.size}):`);
5103
+ for (const entity of entities.values()) {
5104
+ const lastPos = entity.positions[entity.positions.length - 1];
5105
+ const posStr = lastPos ? `(${lastPos.x},${lastPos.y})` : "";
5106
+ lines.push(` - ${entity.type}: "${entity.label}" ${posStr} (seen ${entity.positions.length}x, since ${entity.firstSeen})`);
5107
+ }
5108
+ }
5109
+ if (verbose) {
5110
+ lines.push("\n── ALL CONTROLS ──");
5111
+ for (const [winId, win] of state.windows) {
5112
+ lines.push(`\nWindow ${winId}: "${win.title.value}" (${win.bundleId ?? "?"})`);
5113
+ if (win.focusedElement) {
5114
+ lines.push(` Focused: ${win.focusedElement.role} "${win.focusedElement.label.value}" @ (${win.focusedElement.position.x}, ${win.focusedElement.position.y})`);
5115
+ }
5116
+ // Group by role for readability
5117
+ const byRole = new Map();
5118
+ for (const ctrl of win.controls.values()) {
5119
+ const role = ctrl.role;
5120
+ if (!byRole.has(role))
5121
+ byRole.set(role, []);
5122
+ byRole.get(role).push({
5123
+ label: ctrl.label.value || "(no label)",
5124
+ pos: `${Math.round(ctrl.position.x)},${Math.round(ctrl.position.y)}`,
5125
+ size: `${ctrl.size.width}x${ctrl.size.height}`,
5126
+ conf: ctrl.label.confidence.toFixed(2),
5127
+ focused: ctrl.focused,
5128
+ });
5129
+ }
5130
+ for (const [role, controls] of [...byRole.entries()].sort((a, b) => b[1].length - a[1].length)) {
5131
+ lines.push(` [${role}] (${controls.length})`);
5132
+ for (const c of controls.slice(0, 50)) {
5133
+ const focus = c.focused ? " *FOCUSED*" : "";
5134
+ lines.push(` "${c.label}" @ (${c.pos}) ${c.size} conf=${c.conf}${focus}`);
5135
+ }
5136
+ if (controls.length > 50)
5137
+ lines.push(` ... +${controls.length - 50} more`);
5138
+ }
5139
+ }
5140
+ }
5141
+ return { content: [{ type: "text", text: lines.join("\n") }] };
5142
+ });
5143
+ originalTool("world_state_diff", "Get stale UI controls that haven't been refreshed within a threshold. Useful for finding controls whose state may be outdated.", {
5144
+ thresholdMs: z.number().optional().describe("Stale threshold in ms (default: 5 minutes)"),
5145
+ }, async ({ thresholdMs }) => {
5146
+ const stale = worldModel.getStaleControls(thresholdMs);
5147
+ if (stale.length === 0) {
5148
+ // Distinguish "no data" from "all fresh"
5149
+ const totalControls = Array.from(worldModel.getState().windows.values()).reduce((sum, w) => sum + w.controls.size, 0);
5150
+ if (totalControls === 0) {
5151
+ const hint = perceptionManager.isRunning
5152
+ ? "Perception is running but no controls tracked yet."
5153
+ : "Run perception_start or ui_tree to populate state.";
5154
+ return { content: [{ type: "text", text: `World model has no tracked controls. ${hint}` }] };
5155
+ }
5156
+ return { content: [{ type: "text", text: "No stale controls — all state is fresh." }] };
5157
+ }
5158
+ const lines = [`${stale.length} stale control(s):`];
5159
+ for (const c of stale.slice(0, 20)) {
5160
+ const age = Math.round((Date.now() - new Date(c.value.updatedAt).getTime()) / 1000);
5161
+ lines.push(` ${c.stableId} ${c.role} "${c.label.value}" — ${age}s old`);
5162
+ }
5163
+ if (stale.length > 20)
5164
+ lines.push(` ... and ${stale.length - 20} more`);
5165
+ return { content: [{ type: "text", text: lines.join("\n") }] };
5166
+ });
5167
+ originalTool("learning_status", "Get learning engine stats: locator preferences, recovery strategy rankings, adaptive budgets, and sensor preferences for a given app.", {
5168
+ bundleId: z.string().optional().describe("App bundle ID to query (default: currently focused app)"),
5169
+ }, async ({ bundleId }) => {
5170
+ const bid = bundleId ?? worldModel.getState().focusedApp?.bundleId ?? "unknown";
5171
+ const summary = learningEngine.getAppSummary(bid);
5172
+ const lines = [
5173
+ `Learning stats for ${bid}:`,
5174
+ ` Locator entries: ${summary.locatorEntries}`,
5175
+ ` Recovery entries: ${summary.recoveryEntries}`,
5176
+ ` Timing samples: ${summary.timingSamples}`,
5177
+ ` Sensor entries: ${summary.sensorEntries}`,
5178
+ ];
5179
+ if (summary.topLocatorMethod) {
5180
+ lines.push(` Best locator method: ${summary.topLocatorMethod}`);
5181
+ }
5182
+ if (summary.topSensor) {
5183
+ lines.push(` Best sensor: ${summary.topSensor}`);
5184
+ }
2613
5185
  lines.push("");
2614
- lines.push(`Tasks: ${queued} queued, ${runningTasks} running, ${completed} completed`);
2615
- // Optionally show daemon log tail
2616
- if (tail_log && tail_log > 0) {
5186
+ lines.push("Adaptive budgets:");
5187
+ lines.push(` Locate: ${summary.adaptiveBudget.locateMs}ms`);
5188
+ lines.push(` Act: ${summary.adaptiveBudget.actMs}ms`);
5189
+ lines.push(` Verify: ${summary.adaptiveBudget.verifyMs}ms`);
5190
+ const sensors = learningEngine.rankSensors(bid);
5191
+ if (sensors.length > 0) {
5192
+ lines.push("");
5193
+ lines.push("Sensor ranking:");
5194
+ for (const s of sensors) {
5195
+ lines.push(` ${s.sourceType}: score=${s.score.toFixed(3)}, avg=${Math.round(s.avgLatencyMs)}ms`);
5196
+ }
5197
+ }
5198
+ return { content: [{ type: "text", text: lines.join("\n") }] };
5199
+ });
5200
+ // ── Perception lifecycle ──
5201
// perception_start — resolve a target app and start the perception loop for it.
//
// Resolution order:
//   1. the world model's focused app (used as-is when no bundleId is passed or
//      when it already matches the requested bundleId);
//   2. for an explicit bundleId: the bridge's "app.list", then an AppleScript
//      PID lookup via System Events;
//   3. with no bundleId and no focused app: AppleScript's frontmost process.
// An explicit bundleId that cannot be resolved is reported as an error rather
// than silently falling back to the frontmost app.
//
// After perceptionManager.ensureStarted(), a CDP client is attached to the
// coordinator when isBrowserApp() reports a browser, and the Safari enricher
// is installed (or cleared) for the resolved bundle ID.
originalTool("perception_start", "Start continuous perception for the currently focused app (or specify bundleId). Begins multi-rate AX/CDP/vision polling loop: FAST (100ms AX events), MEDIUM (300ms AX/CDP poll), SLOW (1000ms vision/OCR).", {
    bundleId: z.string().optional().describe("Optional: specify app bundle ID directly instead of using focused app"),
}, async ({ bundleId: overrideBundleId }) => {
    const reply = (text) => ({ content: [{ type: "text", text }] });
    // Quote an entire AppleScript source as one single-quoted shell word.
    const shellQuote = (script) => `'${script.replace(/'/g, "'\\''")}'`;
    const runAppleScript = (script) => execAsync(`osascript -e ${shellQuote(script)}`, { encoding: "utf-8", timeout: 5000 });
    // Record a resolved app in the world model and hand it back to the caller.
    const adoptApp = (resolved) => {
        worldModel.updateFocusedApp({ ...resolved, windowTitle: "" });
        return resolved;
    };
    const isRequestedApp = (candidate) => Boolean(candidate) && candidate.bundleId === overrideBundleId;
    // Already running check
    if (perceptionManager.isRunning && !overrideBundleId) {
        const stats = perceptionManager.getStats();
        return reply(`Perception already running (started ${stats.startedAt}). Use perception_stop first to restart, or pass bundleId to switch target.`);
    }
    let app = worldModel.getState().focusedApp;
    // If bundleId override provided, try to resolve app info via bridge or AppleScript
    if (overrideBundleId && !isRequestedApp(app)) {
        try {
            await ensureBridge();
            const apps = await bridge.call("app.list", {});
            const found = apps?.find((a) => a.bundleId === overrideBundleId);
            if (found) {
                app = adoptApp({ bundleId: overrideBundleId, appName: found.name ?? overrideBundleId, pid: found.pid });
            }
        }
        catch { /* Bridge unavailable — fall through to AppleScript */ }
        // AppleScript fallback: bridge may not list windowless apps (e.g. freshly launched/killed TextEdit)
        if (!isRequestedApp(app)) {
            try {
                // The bundle ID lands inside an AppleScript string literal, so
                // backslashes and double quotes must be escaped for AppleScript
                // itself; shell quoting alone lets a `"` terminate the literal
                // and inject arbitrary script.
                const asBundleId = overrideBundleId.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
                const { stdout } = await runAppleScript(`tell application "System Events" to get unix id of (first process whose bundle identifier is "${asBundleId}")`);
                const pid = Number.parseInt((stdout ?? "").trim(), 10);
                if (!Number.isNaN(pid)) {
                    app = adoptApp({ bundleId: overrideBundleId, appName: overrideBundleId, pid });
                }
            }
            catch { /* AppleScript also failed — app truly not running */ }
        }
    }
    // If bundleId was explicitly provided but we couldn't find the app, error out
    // instead of silently falling back to the frontmost app
    if (overrideBundleId && !isRequestedApp(app)) {
        return reply(`Error: App with bundleId "${overrideBundleId}" is not running. Launch it first with launch(bundleId: "${overrideBundleId}").`);
    }
    // If still no app, try AppleScript to detect frontmost app
    if (!app) {
        try {
            const frontmostScript = [
                'tell application "System Events"',
                "set fp to first process whose frontmost is true",
                'return (bundle identifier of fp) & "|" & (name of fp) & "|" & (unix id of fp)',
                "end tell",
            ].join("\n");
            const { stdout: asOut } = await runAppleScript(frontmostScript);
            // Expected shape: bundleId|name|pid. The name is the only free-form
            // field, so take the first and last fields positionally and treat
            // everything in between as the name (it may itself contain "|").
            const fields = (asOut ?? "").trim().split("|");
            const bid = fields[0];
            const pid = fields.length >= 3 ? Number.parseInt(fields[fields.length - 1], 10) : Number.NaN;
            if (bid && !Number.isNaN(pid)) {
                const name = fields.slice(1, -1).join("|");
                app = adoptApp({ bundleId: bid, appName: name || bid, pid });
            }
        }
        catch { /* AppleScript fallback failed */ }
    }
    if (!app) {
        return reply("Error: No focused app detected. Focus an app with focus() first, or pass bundleId directly.");
    }
    let bridgeAvailable = false;
    try {
        await ensureBridge();
        bridgeAvailable = true;
    }
    catch { /* bridge unavailable — proceed without AX/vision */ }
    let windowId;
    if (bridgeAvailable) {
        try {
            windowId = await resolveWindowId(app.pid);
        }
        catch { /* best-effort */ }
    }
    // windowId is only included in the context when one was resolved.
    const ctx = { bundleId: app.bundleId, appName: app.appName, pid: app.pid, windowTitle: "", ...(windowId != null ? { windowId } : {}) };
    await perceptionManager.ensureStarted(ctx);
    // Auto-connect CDP for browser apps — pass a connect factory so the
    // perception coordinator can reconnect when the WebSocket drops
    let cdpStatus = "skipped (not browser)";
    const isBrowser = isBrowserApp();
    console.error(`[perception_start] app=${app.bundleId} pid=${app.pid} windowId=${windowId} isBrowser=${isBrowser}`);
    if (isBrowser) {
        try {
            console.error("[perception_start] calling ensureCDP...");
            const { CDP: cdp, port } = await ensureCDP();
            console.error(`[perception_start] ensureCDP ok, port=${port}`);
            const connectFn = async () => {
                const targets = await cdp.List({ port });
                const page = targets.find((t) => t.type === "page");
                if (!page) {
                    throw new Error("No CDP page target");
                }
                return cdp({ port, target: page.id });
            };
            const client = await connectFn();
            console.error(`[perception_start] CDP client created, client keys: ${Object.keys(client).slice(0, 5).join(",")}`);
            const coordinator = perceptionManager.getCoordinator();
            console.error(`[perception_start] coordinator exists: ${!!coordinator}, isRunning: ${coordinator?.isRunning}`);
            if (coordinator) {
                coordinator.activateCDP(client, connectFn);
                cdpStatus = `connected (port ${port})`;
            }
            else {
                cdpStatus = "no coordinator";
            }
        }
        catch (e) {
            // A CDP failure is reported in the result text; it does not abort the tool.
            cdpStatus = `failed: ${e?.message ?? e}`;
            console.error(`[perception_start] CDP error: ${cdpStatus}`);
        }
    }
    console.error(`[perception_start] CDP status: ${cdpStatus}`);
    // Set up Safari browser enricher (or clear it for non-Safari)
    installSafariEnricher(app.bundleId);
    return reply(`Perception started for ${app.bundleId} (${app.appName}). CDP: ${cdpStatus}`);
});
5315
// perception_stop — stop the perception loop and report the counters that were
// captured immediately before stopping.
originalTool("perception_stop", "Stop continuous perception loop.", {}, async () => {
    const asText = (text) => ({ content: [{ type: "text", text }] });
    if (!perceptionManager.isRunning) {
        return asText("Perception was not running.");
    }
    // Read the stats first; they are taken before stop() is awaited.
    const finalStats = perceptionManager.getStats();
    await perceptionManager.stop();
    const report = ["Perception stopped."];
    if (finalStats.started) {
        report.push(
            `Processed: ${finalStats.axEventsProcessed} AX events, ${finalStats.cdpSnapshots} CDP snapshots, ${finalStats.visionDiffs} vision diffs, ${finalStats.visionOCRs} OCRs.`,
            `Cycles: ${finalStats.fastCycles} fast, ${finalStats.mediumCycles} medium, ${finalStats.slowCycles} slow.`,
        );
    }
    return asText(report.join("\n"));
});
2630
- server.tool("codex_monitor_add_task", "Add a task to the daemon's queue. When a monitored terminal goes idle, the next task is automatically typed in and executed.", {
2631
- prompt: z.string().describe("The prompt/command to send to Codex when a terminal is available"),
2632
- priority: z.number().optional().describe("Priority (lower = higher priority, default: 10)"),
2633
- terminalId: z.string().optional().describe("Assign to a specific terminal (omit for any available)"),
2634
- }, async ({ prompt, priority, terminalId }) => {
2635
- const tasks = readDaemonTasks();
2636
- const task = {
2637
- id: "task_" + Date.now().toString(36) + Math.random().toString(36).slice(2, 6),
2638
- prompt,
2639
- priority: priority ?? 10,
2640
- terminalId: terminalId ?? null,
2641
- status: "queued",
2642
- createdAt: new Date().toISOString(),
2643
- assignedAt: null,
2644
- completedAt: null,
2645
- result: null,
2646
- };
2647
- tasks.push(task);
2648
- tasks.sort((a, b) => a.priority - b.priority);
2649
- writeDaemonTasks(tasks);
2650
- const queued = tasks.filter((t) => t.status === "queued").length;
2651
- return {
2652
- content: [{
2653
- type: "text",
2654
- text: `Task queued!\n` +
2655
- `ID: ${task.id}\n` +
2656
- `Prompt: "${prompt.slice(0, 100)}${prompt.length > 100 ? "..." : ""}"\n` +
2657
- `Priority: ${task.priority}\n` +
2658
- `Target terminal: ${task.terminalId ?? "any available"}\n` +
2659
- `Queue size: ${queued}`,
2660
- }],
2661
- };
5328
+ // ── Plan lifecycle ──
5329
// plan_cancel — mark a stored goal as failed and stamp its completion time.
// NOTE(review): the goal's current status is not inspected, so a goal that has
// already finished would be overwritten as well — confirm whether that is intended.
originalTool("plan_cancel", "Cancel an active goal, marking it as failed.", {
    goalId: z.string().describe("Goal ID to cancel"),
}, async ({ goalId }) => {
    const target = goalStore.get(goalId);
    let message;
    if (target) {
        target.status = "failed";
        target.completedAt = new Date().toISOString();
        goalStore.update(goalId, target);
        message = `Goal cancelled: ${goalId}`;
    }
    else {
        message = `Goal not found: ${goalId}`;
    }
    return { content: [{ type: "text", text: message }] };
});
2663
- server.tool("codex_monitor_tasks", "List all tasks in the daemon's queue with their status.", {
2664
- status: z.enum(["all", "queued", "running", "completed", "failed"]).optional().describe("Filter by status (default: all)"),
2665
- }, async ({ status }) => {
2666
- let tasks = readDaemonTasks();
2667
- if (status && status !== "all") {
2668
- tasks = tasks.filter((t) => t.status === status);
2669
- }
2670
- if (tasks.length === 0) {
2671
- return { content: [{ type: "text", text: `No ${status ?? ""} tasks.` }] };
2672
- }
2673
- const lines = tasks.map((t, i) => {
2674
- const parts = [
2675
- `${i + 1}. [${t.status.toUpperCase()}] "${(t.prompt || "").slice(0, 80)}"`,
2676
- ` ID: ${t.id} | Priority: ${t.priority}`,
2677
- ` Terminal: ${t.terminalId ?? "any"}`,
2678
- ` Created: ${t.createdAt}`,
2679
- ];
2680
- if (t.assignedAt)
2681
- parts.push(` Assigned: ${t.assignedAt}`);
2682
- if (t.completedAt)
2683
- parts.push(` Completed: ${t.completedAt}`);
2684
- if (t.result)
2685
- parts.push(` Result: ${(t.result || "").slice(0, 100)}`);
2686
- return parts.join("\n");
2687
- });
2688
- return { content: [{ type: "text", text: lines.join("\n\n") }] };
5341
+ // ── Recovery status + configure ──
5342
// recovery_status — three-line summary of recoveryEngine.getStatus().
originalTool("recovery_status", "Get recovery engine status: cooldowns, reference cache, learning engine connection.", {}, async () => {
    const { cooldownCount, referenceCacheSize, learningEngineConnected } = recoveryEngine.getStatus();
    const text = [
        "Recovery Engine Status:",
        ` Active cooldowns: ${cooldownCount}`,
        ` Reference cache entries: ${referenceCacheSize}`,
        ` Learning engine connected: ${learningEngineConnected}`,
    ].join("\n");
    return { content: [{ type: "text", text }] };
});
2690
- server.tool("codex_monitor_assign_now", "Immediately type a prompt into the VS Code terminal (bypasses queue). Focuses VS Code, types, presses Enter.", {
2691
- prompt: z.string().describe("The prompt/command to type into the terminal"),
2692
- }, async ({ prompt }) => {
2693
- await ensureBridge();
5352
// recovery_configure — forward only the budget fields the caller supplied to
// recoveryEngine.configure() and echo them back as JSON.
originalTool("recovery_configure", "Update recovery engine default budget configuration.", {
    maxRecoveryTimeMs: z.number().optional().describe("Max time for recovery attempts in ms"),
    maxStrategies: z.number().optional().describe("Max number of strategies to try"),
}, async ({ maxRecoveryTimeMs, maxStrategies }) => {
    // Omitted fields are left out entirely rather than sent as undefined.
    const patch = {
        ...(maxRecoveryTimeMs !== undefined ? { maxRecoveryTimeMs } : {}),
        ...(maxStrategies !== undefined ? { maxStrategies } : {}),
    };
    recoveryEngine.configure(patch);
    return { content: [{ type: "text", text: `Recovery config updated: ${JSON.stringify(patch)}` }] };
});
5364
+ // ── Learning lifecycle ──
5365
// learning_reset — call learningEngine.reset(), but only when the caller
// passes confirm=true; anything else aborts without touching the engine.
originalTool("learning_reset", "Clear ALL learning data (locators, recovery, timing, sensors). Requires confirm=true.", {
    confirm: z.boolean().describe("Must be true to proceed"),
}, async ({ confirm }) => {
    let outcome = "Aborted: set confirm=true to clear all learning data.";
    if (confirm) {
        learningEngine.reset();
        outcome = "All learning data cleared and flushed to disk.";
    }
    return { content: [{ type: "text", text: outcome }] };
});
5374
+ // ═══════════════════════════════════════════════
5375
+ // ORCHESTRATOR — multi-agent task routing
5376
+ // ═══════════════════════════════════════════════
5377
// TypeScript entry point, used (through `npx tsx`) when no compiled daemon is found.
const ORCHESTRATOR_DAEMON_SCRIPT = path.resolve(__dirname, "scripts", "orchestrator-daemon.ts");
// orchestrator_start — spawn the orchestrator daemon as a detached process.
//
// Prefers a compiled orchestrator-daemon.js (looked up under ./scripts, then
// ./dist/scripts) and otherwise runs the .ts source through `npx tsx`. After
// spawning it waits 3 s and uses getOrchestratorPid() to confirm the daemon
// registered itself.
//
// Parameters (all optional): webSlots, nativeSlots, pollMs. A flag is
// forwarded whenever the caller supplied a value — including 0 — so the
// daemon's configuration matches what the success message reports.
server.tool("orchestrator_start", "Start the multi-agent orchestrator daemon. Manages parallel worker slots: web tasks (CDP) run in parallel, native tasks (AX/keyboard) are serialized per-app. Survives restarts.", {
    webSlots: z.number().optional().describe("Number of parallel web worker slots (default: 4)"),
    nativeSlots: z.number().optional().describe("Number of native worker slots (default: 1)"),
    pollMs: z.number().optional().describe("Poll interval in ms (default: 1000)"),
}, async ({ webSlots, nativeSlots, pollMs }) => {
    const existingPid = getOrchestratorPid();
    if (existingPid !== null) {
        return { content: [{ type: "text", text: `Orchestrator already running (pid=${existingPid}). Use orchestrator_stop first.` }] };
    }
    const localBuild = path.resolve(__dirname, "scripts", "orchestrator-daemon.js");
    const compiledPath = fs.existsSync(localBuild)
        ? localBuild
        : path.resolve(__dirname, "dist", "scripts", "orchestrator-daemon.js");
    const usedCompiled = fs.existsSync(compiledPath);
    // Build the CLI flags once; both launch modes take the same flags.
    const flagArgs = [];
    if (webSlots !== undefined) {
        flagArgs.push("--web-slots", String(webSlots));
    }
    if (nativeSlots !== undefined) {
        flagArgs.push("--native-slots", String(nativeSlots));
    }
    if (pollMs !== undefined) {
        flagArgs.push("--poll", String(pollMs));
    }
    const command = usedCompiled ? "node" : "npx";
    const commandArgs = usedCompiled
        ? [compiledPath, ...flagArgs]
        : ["tsx", ORCHESTRATOR_DAEMON_SCRIPT, ...flagArgs];
    const child = spawn(command, commandArgs, { detached: true, stdio: "ignore", cwd: __dirname });
    // Without a listener, a failed spawn (e.g. `npx` not on PATH) emits an
    // unhandled 'error' event and takes the whole server process down.
    let spawnError;
    child.on("error", (err) => {
        spawnError = err;
    });
    child.unref();
    await new Promise((r) => setTimeout(r, 3000));
    const mode = usedCompiled ? "compiled" : "tsx";
    if (spawnError) {
        return { content: [{ type: "text", text: `Orchestrator failed to start (mode=${mode}): ${spawnError.message}\nCheck log: ${ORCH_LOG_FILE}` }] };
    }
    const verifyPid = getOrchestratorPid();
    if (!verifyPid) {
        return { content: [{ type: "text", text: `Orchestrator failed to start (mode=${mode}).\nCheck log: ${ORCH_LOG_FILE}` }] };
    }
    return { content: [{ type: "text", text: `Orchestrator started (pid=${verifyPid}).\nWeb slots: ${webSlots ?? 4} (parallel CDP) | Native slots: ${nativeSlots ?? 1} (serialized per-app)\nPoll: ${pollMs ?? 1000}ms\nLog: ${ORCH_LOG_FILE}\n\nSubmit tasks with orchestrator_submit. Web tasks run in parallel, native tasks queue per-app.` }] };
});
5422
// orchestrator_stop — send SIGTERM to the orchestrator daemon, wait 2 s, and
// report. A failure from process.kill is returned as text, not thrown.
server.tool("orchestrator_stop", "Stop the orchestrator daemon. Running tasks finish before exit.", {}, async () => {
    const asText = (text) => ({ content: [{ type: "text", text }] });
    const daemonPid = getOrchestratorPid();
    if (!daemonPid) {
        return asText("No orchestrator daemon running.");
    }
    try {
        process.kill(daemonPid, "SIGTERM");
        // Fixed grace period; the daemon's exit is not re-checked afterwards.
        await new Promise((resolve) => setTimeout(resolve, 2000));
    }
    catch (err) {
        return asText(`Failed to stop: ${err.message}`);
    }
    return asText(`Orchestrator stopped (pid=${daemonPid}).`);
});
2706
- server.tool("codex_monitor_stop", "Stop the background monitor daemon.", {}, async () => {
2707
- const { running, pid } = isDaemonRunning();
2708
- if (!running) {
2709
- return { content: [{ type: "text", text: "No daemon running." }] };
5436
// orchestrator_submit — append a new task to the orchestrator state and write
// the state back. The tool itself does not execute the task.
server.tool("orchestrator_submit", "Submit a task to the orchestrator. Web tasks (CDP) run in parallel, native tasks queue per-app. Returns immediately — task is processed asynchronously.", {
    task: z.string().describe("What to do"),
    mode: z.enum(["web", "native", "mixed"]).optional().describe("Execution mode: web (parallel CDP), native (serialized AX/keyboard), mixed (default: auto-detect)"),
    playbookId: z.string().optional().describe("Playbook to execute"),
    bundleId: z.string().optional().describe("Target app bundle ID (required for native tasks)"),
    windowId: z.number().optional().describe("Target window ID"),
    vars: z.record(z.string(), z.string()).optional().describe("Variables for playbook substitution"),
    priority: z.number().optional().describe("Priority: lower = higher (default: 10)"),
}, async ({ task, mode, playbookId, bundleId, windowId, vars, priority }) => {
    const orchState = readOrchState();
    if (!orchState?.running) {
        return { content: [{ type: "text", text: "Orchestrator not running. Use orchestrator_start first." }] };
    }
    // Only options the caller actually provided are passed along; the mode
    // falls back to detectMode() when not given explicitly.
    const options = { mode: mode ?? detectMode(playbookId, bundleId) };
    if (playbookId !== undefined) {
        options.playbookId = playbookId;
    }
    if (bundleId !== undefined) {
        options.bundleId = bundleId;
    }
    if (windowId !== undefined) {
        options.windowId = windowId;
    }
    if (vars) {
        options.vars = vars;
    }
    if (priority !== undefined) {
        options.priority = priority;
    }
    const submitted = createOrchestratorTask(task, options);
    orchState.tasks.push(submitted);
    orchState.totalSubmitted++;
    writeOrchState(orchState);
    let slotInfo;
    if (submitted.mode === "web") {
        slotInfo = `→ will run on next free web slot (${orchState.webSlots} available)`;
    }
    else {
        slotInfo = `→ will run on native slot (serialized for ${bundleId ?? "unknown app"})`;
    }
    return { content: [{ type: "text", text: `Task submitted: ${submitted.id}\nMode: ${submitted.mode} ${slotInfo}\nPriority: ${submitted.priority}\n\nThe orchestrator will pick it up on the next poll cycle.` }] };
});
5465
// orchestrator_status — render the orchestrator state as text: header,
// per-worker lines, task totals, then capped lists of running / queued /
// recently completed / recently failed tasks, native app locks, and the log path.
server.tool("orchestrator_status", "Get orchestrator status — worker slots, task queue, active/completed tasks.", {}, async () => {
    const orchState = readOrchState();
    if (!orchState) {
        return { content: [{ type: "text", text: "Orchestrator not running. Use orchestrator_start first." }] };
    }
    const withStatus = (...wanted) => orchState.tasks.filter((t) => wanted.includes(t.status));
    // Task descriptions are truncated to 60 characters in every list.
    const label = (t) => `${t.id}: "${t.task.slice(0, 60)}"`;
    const report = [
        `Running: ${orchState.running}${orchState.pid ? ` (pid=${orchState.pid})` : ""}`,
        `Started: ${orchState.startedAt}`,
        `Slots: ${orchState.webSlots} web (parallel) + ${orchState.nativeSlots} native (per-app serial)`,
        "",
        "Workers:",
    ];
    orchState.workers.forEach((worker) => {
        const activity = worker.busy ? `BUSY → ${worker.currentTaskId}` : "idle";
        report.push(` [${worker.id}] ${worker.type} — ${activity} (done: ${worker.tasksCompleted}, failed: ${worker.tasksFailed})`);
    });
    const queuedTasks = withStatus("queued");
    // "assigned" tasks are reported together with "running" ones.
    const activeTasks = withStatus("running", "assigned");
    const doneTasks = withStatus("done");
    const failedTasks = withStatus("failed");
    const blockedTasks = withStatus("blocked");
    report.push("", `Tasks: ${orchState.totalSubmitted} submitted, ${orchState.totalCompleted} done, ${orchState.totalFailed} failed`);
    report.push(`Queue: ${queuedTasks.length} queued, ${activeTasks.length} running, ${blockedTasks.length} blocked`);
    if (activeTasks.length > 0) {
        report.push("", "Running:");
        activeTasks.forEach((t) => {
            report.push(` ${label(t)} [${t.mode}] → slot ${t.assignedWorker}`);
        });
    }
    if (queuedTasks.length > 0) {
        report.push("", `Queued (next ${Math.min(queuedTasks.length, 5)}):`);
        queuedTasks.slice(0, 5).forEach((t) => {
            report.push(` ${label(t)} [${t.mode}] priority=${t.priority}`);
        });
    }
    if (doneTasks.length > 0) {
        report.push("", `Recent completed (last ${Math.min(doneTasks.length, 5)}):`);
        doneTasks.slice(-5).forEach((t) => {
            report.push(` ${label(t)} → ${t.result?.slice(0, 80) ?? "ok"}`);
        });
    }
    if (failedTasks.length > 0) {
        report.push("", `Recent failed (last ${Math.min(failedTasks.length, 3)}):`);
        failedTasks.slice(-3).forEach((t) => {
            report.push(` ${label(t)} → ${t.error?.slice(0, 80) ?? "unknown"}`);
        });
    }
    const lockEntries = Object.entries(orchState.nativeLocks);
    if (lockEntries.length > 0) {
        report.push("", "Native app locks:");
        lockEntries.forEach(([app, slot]) => {
            report.push(` ${app} → slot ${slot}`);
        });
    }
    report.push("", `Log: ${ORCH_LOG_FILE}`);
    return { content: [{ type: "text", text: report.join("\n") }] };
});
5521
+ // Helper aliases to keep tool code concise
5522
+ import { readOrchestratorState as readOrchState, writeOrchestratorState as writeOrchState, getOrchestratorDaemonPid as getOrchestratorPid, createTask as createOrchestratorTask, detectTaskMode as detectMode } from "./src/orchestrator/state.js";
5523
+ import { ORCHESTRATOR_LOG_FILE as ORCH_LOG_FILE } from "./src/orchestrator/types.js";
5524
+ // ═══════════════════════════════════════════════
5525
+ // OBSERVER — background app-level visual monitor
5526
+ // ═══════════════════════════════════════════════
5527
// TypeScript entry point, used (through `npx tsx`) when no compiled daemon is found.
const OBSERVER_DAEMON_SCRIPT = path.resolve(__dirname, "scripts", "observer-daemon.ts");
// observer_start — spawn the observer daemon for one app window as a detached
// process, then enable popup checks on the playbook engine.
//
// Prefers a compiled observer-daemon.js (looked up under ./scripts, then
// ./dist/scripts) and otherwise runs the .ts source through `npx tsx`. After
// spawning it waits 2 s and uses getObserverDaemonPid() to confirm the daemon
// registered itself.
//
// Parameters: bundleId and windowId are required; intervalMs is optional and
// is forwarded whenever the caller supplied it, so the daemon runs with the
// interval that the success message reports.
server.tool("observer_start", "Start the observer daemon to continuously watch an app window. Captures frames via CGWindowListCreateImage, runs OCR only when pixels change, detects popups. Zero overhead on engine — reads a JSON file.", {
    bundleId: z.string().describe("Bundle ID of the app to watch (e.g. com.blackmagic-design.DaVinciResolve)"),
    windowId: z.number().describe("Window ID to capture (get from the 'windows' tool)"),
    intervalMs: z.number().optional().describe("Capture interval in ms (default: 2000). Lower = more responsive but more CPU"),
}, async ({ bundleId, windowId, intervalMs }) => {
    const existingPid = getObserverDaemonPid();
    if (existingPid !== null) {
        return { content: [{ type: "text", text: `Observer daemon already running (pid=${existingPid}). Use observer_stop first.` }] };
    }
    const localBuild = path.resolve(__dirname, "scripts", "observer-daemon.js");
    const compiledPath = fs.existsSync(localBuild)
        ? localBuild
        : path.resolve(__dirname, "dist", "scripts", "observer-daemon.js");
    const usedCompiled = fs.existsSync(compiledPath);
    // Build the CLI flags once; both launch modes take the same flags.
    const flagArgs = ["--bundleId", bundleId, "--windowId", String(windowId)];
    if (intervalMs !== undefined) {
        flagArgs.push("--interval", String(intervalMs));
    }
    const command = usedCompiled ? "node" : "npx";
    const commandArgs = usedCompiled
        ? [compiledPath, ...flagArgs]
        : ["tsx", OBSERVER_DAEMON_SCRIPT, ...flagArgs];
    const child = spawn(command, commandArgs, { detached: true, stdio: "ignore", cwd: __dirname });
    // Without a listener, a failed spawn (e.g. `npx` not on PATH) emits an
    // unhandled 'error' event and takes the whole server process down.
    let spawnError;
    child.on("error", (err) => {
        spawnError = err;
    });
    child.unref();
    await new Promise((r) => setTimeout(r, 2000));
    const mode = usedCompiled ? "compiled" : "tsx";
    if (spawnError) {
        return { content: [{ type: "text", text: `Observer daemon failed to start (mode=${mode}): ${spawnError.message}\nCheck log: ${OBSERVER_LOG_FILE}` }] };
    }
    const verifyPid = getObserverDaemonPid();
    if (!verifyPid) {
        return { content: [{ type: "text", text: `Observer daemon failed to start (mode=${mode}).\nCheck log: ${OBSERVER_LOG_FILE}` }] };
    }
    // Enable popup checks in the playbook engine (lazy-init if needed)
    if (!activePlaybookEngine) {
        getJobRunner(); // initializes activePlaybookEngine as a side effect
    }
    if (activePlaybookEngine) {
        activePlaybookEngine.setPopupCheck(true);
    }
    return { content: [{ type: "text", text: `Observer daemon started (pid=${verifyPid}).\nWatching: ${bundleId} (window ${windowId})\nInterval: ${intervalMs ?? 2000}ms\nLog: ${OBSERVER_LOG_FILE}\n\nPopup auto-dismiss enabled in playbook engine.\nUse observer_status to check frames/popups.` }] };
});
5570
// observer_stop — send SIGTERM to the observer daemon, wait 1 s, then switch
// popup checks off on the playbook engine (if one exists).
server.tool("observer_stop", "Stop the observer daemon.", {}, async () => {
    const asText = (text) => ({ content: [{ type: "text", text }] });
    const daemonPid = getObserverDaemonPid();
    if (!daemonPid) {
        return asText("No observer daemon running.");
    }
    try {
        process.kill(daemonPid, "SIGTERM");
        await new Promise((resolve) => setTimeout(resolve, 1000));
        // Popup checks are only disabled once the signal was delivered successfully.
        activePlaybookEngine?.setPopupCheck(false);
        return asText(`Observer daemon stopped (pid=${daemonPid}).`);
    }
    catch (err) {
        return asText(`Failed to stop: ${err.message}`);
    }
});
5586
// observer_status — render the observer state as text: target window, frame
// counters, an OCR preview of the last frame, any detected popup, the last
// error, and the log path.
server.tool("observer_status", "Get observer daemon status — frames captured, OCR text, popup detection.", {}, async () => {
    const observed = readObserverState();
    if (!observed) {
        return { content: [{ type: "text", text: "Observer not running. Use observer_start to begin watching an app." }] };
    }
    const pidSuffix = observed.pid ? ` (pid=${observed.pid})` : "";
    const report = [
        `Running: ${observed.running}${pidSuffix}`,
        `Watching: ${observed.bundleId} (window ${observed.windowId})`,
        `Interval: ${observed.intervalMs}ms`,
        `Frames: ${observed.framesCaptured} captured, ${observed.framesChanged} changed, ${observed.ocrRuns} OCR runs`,
    ];
    const { lastFrame, popup, lastError } = observed;
    if (lastFrame) {
        report.push(`Last frame: ${lastFrame.capturedAt} (changed: ${lastFrame.changed})`);
        // Only the first 500 characters of the OCR text are shown.
        report.push(`OCR text (first 500 chars):\n${lastFrame.ocrText.substring(0, 500)}`);
    }
    if (popup) {
        report.push(
            `\nPOPUP DETECTED: "${popup.pattern}"`,
            ` Action: ${popup.dismissAction}`,
            ` Detected: ${popup.detectedAt}`,
        );
    }
    if (lastError) {
        report.push(`\nLast error: ${lastError}`);
    }
    report.push(`\nLog: ${OBSERVER_LOG_FILE}`);
    return { content: [{ type: "text", text: report.join("\n") }] };
});
5613
// observer_ocr_roi — two modes selected by `commandId`:
//   * poll   (commandId given): look the command up and report its status or result;
//   * submit (no commandId): queue a new "ocr_roi" command for the observer daemon.
server.tool("observer_ocr_roi", "Submit a targeted ROI OCR command to the running observer daemon. The daemon captures the window region, runs OCR, and stores the result. Non-blocking — returns a command ID you can poll with a second call.", {
    x: z.number().describe("X offset of the region (window-relative)"),
    y: z.number().describe("Y offset of the region (window-relative)"),
    width: z.number().describe("Width of the region"),
    height: z.number().describe("Height of the region"),
    windowId: z.number().optional().describe("Window ID (defaults to daemon's watched window)"),
    commandId: z.string().optional().describe("If provided, poll an existing command instead of submitting a new one"),
}, async ({ x, y, width, height, windowId, commandId }) => {
    const asText = (text) => ({ content: [{ type: "text", text }] });
    // Poll mode — check result of a previously submitted command
    if (commandId) {
        const existing = getObserverCommand(commandId);
        if (!existing) {
            return asText(`Command ${commandId} not found.`);
        }
        switch (existing.status) {
            case "pending":
            case "running":
                return asText(`Command ${commandId}: ${existing.status} — call again to poll.`);
            case "error":
                return asText(`Command ${commandId} failed: ${existing.error}`);
            default:
                // Any other status is treated as finished.
                break;
        }
        const outcome = existing.result;
        const report = [
            `Command ${commandId}: done at ${outcome.completedAt}`,
            `Text: ${outcome.text.substring(0, 1000)}`,
            `Regions: ${outcome.regions.length}`,
        ];
        // At most 20 regions are listed individually.
        outcome.regions.slice(0, 20).forEach((region) => {
            const { x: rx, y: ry, width: rw, height: rh } = region.bounds;
            report.push(` "${region.text}" @ (${rx}, ${ry}, ${rw}×${rh})`);
        });
        return asText(report.join("\n"));
    }
    // Submit mode — create a new command
    if (!getObserverDaemonPid()) {
        return asText("Observer daemon not running. Use observer_start first.");
    }
    // windowId is only attached when the caller supplied one.
    const request = {
        type: "ocr_roi",
        roi: { x, y, width, height },
        ...(windowId !== undefined ? { windowId } : {}),
    };
    const id = submitObserverCommand(request);
    return asText(`ROI OCR command submitted: ${id}\nRegion: (${x}, ${y}, ${width}×${height})\nThe daemon will process this on its next cycle. Call observer_ocr_roi with commandId="${id}" to poll the result.`);
});
5659
+ // ═══════════════════════════════════════════════
5660
+ // PHASE 6: TOOL MASTERY — Ingestion + Community
5661
+ // ═══════════════════════════════════════════════
5662
// scan_menu_bar — run MenuScanner against a running app, optionally merge the
// discovered shortcuts into the reference file, and return a redacted listing.
server.tool("scan_menu_bar", "Scan an app's menu bar via AX tree. Extracts all menu paths, keyboard shortcuts, and enabled/disabled states. Automatically merges discovered shortcuts into the reference file.", {
    pid: z.number().describe("Process ID of the running app"),
    bundleId: z.string().describe("macOS bundle ID (e.g. com.adobe.Photoshop)"),
    appName: z.string().describe("Human-readable app name (e.g. Photoshop)"),
    mergeToReference: z.boolean().optional().describe("Merge discovered shortcuts into the reference file (default true)"),
}, async ({ pid, bundleId, appName, mergeToReference }) => {
    // Redact username + catch the "Log Out <name>" pattern.
    const scrub = (value) => redactUsername(value).replace(/Log Out [^\n:]+/g, "Log Out [USER]");
    await ensureBridge();
    const scan = await new MenuScanner(bridge).scan(pid, bundleId, appName);
    // Auto-merge to reference unless explicitly disabled
    let mergeInfo = "";
    if (mergeToReference !== false) {
        const merged = referenceMerger.mergeMenuScan(scan);
        mergeInfo = `\nReference updated: ${merged.filePath} (${merged.added} added, ${merged.updated} updated)`;
    }
    const shortcutEntries = Object.entries(scan.shortcuts);
    const report = [
        `Menu scan: ${scan.appName} (${scan.bundleId})`,
        `Total menus: ${scan.totalMenus}, Total items: ${scan.totalItems}`,
        `Shortcuts found: ${Object.keys(scan.shortcuts).length}`,
        mergeInfo,
        "",
        "Shortcuts:",
        // Each menu path is scrubbed on its own before being listed...
        ...shortcutEntries.map(([menuPath, keys]) => ` ${scrub(menuPath)}: ${keys}`),
    ];
    // ...and the assembled text is scrubbed once more as a whole.
    return { content: [{ type: "text", text: scrub(report.join("\n")) }] };
});
5696
// ingest_documentation: parses a documentation page with DocParser, optionally merges the
// extracted shortcuts and flows into the app's reference file, and returns a text summary.
server.tool("ingest_documentation", "Parse a documentation page (HTML, markdown, or text) and extract shortcuts, workflows, and tips. Merges extracted knowledge into the app's reference file.", {
    content: z.string().describe("The documentation content (HTML, markdown, or plain text)"),
    url: z.string().describe("Source URL of the documentation"),
    format: z.enum(["html", "markdown", "text"]).optional().describe("Content format (default html)"),
    bundleId: z.string().describe("macOS bundle ID for the app this documentation covers"),
    appName: z.string().describe("Human-readable app name"),
    mergeToReference: z.boolean().optional().describe("Merge extracted knowledge into reference file (default true)"),
}, async ({ content, url, format, bundleId, appName, mergeToReference }) => {
    // A missing format falls back to "html".
    const parsed = new DocParser().parse(content, url, format ?? "html");
    // Merging is opt-out: only an explicit `false` disables it.
    let mergeNote = "";
    if (mergeToReference !== false) {
        const shortcutMerge = referenceMerger.mergeDocShortcuts(parsed.shortcuts, bundleId, appName);
        const flowMerge = referenceMerger.mergeDocFlows(parsed, bundleId, appName);
        mergeNote = `\nReference updated: ${shortcutMerge.filePath}\n Shortcuts: ${shortcutMerge.added} added, ${shortcutMerge.updated} updated\n Flows: ${flowMerge.added} added`;
    }
    const report = [
        `Documentation parsed: ${parsed.title}`,
        `Source: ${parsed.url}`,
        `Shortcuts: ${parsed.shortcuts.length}, Flows: ${parsed.flows.length}, Tips: ${parsed.tips.length}`,
        mergeNote,
    ];
    // Appends a blank line, a heading and at most `cap` formatted rows; empty lists add nothing.
    const appendSection = (heading, items, cap, formatRow) => {
        if (items.length > 0) {
            report.push("", heading, ...items.slice(0, cap).map((item) => formatRow(item)));
        }
    };
    appendSection("Shortcuts:", parsed.shortcuts, 30, (shortcut) => {
        const categorySuffix = shortcut.category ? ` (${shortcut.category})` : "";
        return ` ${shortcut.name}: ${shortcut.keys}${categorySuffix}`;
    });
    appendSection("Workflows:", parsed.flows, 10, (flow) => ` ${flow.name} (${flow.steps.length} steps)`);
    appendSection("Tips:", parsed.tips, 10, (tip) => ` - ${tip}`);
    return { content: [{ type: "text", text: report.join("\n") }] };
});
5738
// ingest_tutorial: runs TutorialExtractor over transcript segments and returns a numbered
// list of the extracted steps together with a count of playbook-ready steps.
server.tool("ingest_tutorial", "Extract structured playbook steps from a video transcript (e.g. YouTube captions). Converts tutorial narration into actionable automation steps with tool mappings.", {
    segments: z.array(z.object({
        text: z.string(),
        startTime: z.number(),
        duration: z.number(),
    })).describe("Transcript segments (text + timing from YouTube captions or similar)"),
    title: z.string().describe("Video title"),
    platform: z.string().describe("Target platform name (e.g. davinci-resolve, figma)"),
}, async ({ segments, title, platform }) => {
    const extractor = new TutorialExtractor();
    const tutorial = extractor.extract(segments, title, platform);
    const playbookSteps = extractor.toPlaybookSteps(tutorial);
    const report = [
        `Tutorial extracted: ${tutorial.title}`,
        `Platform: ${tutorial.platform}`,
        `Raw segments: ${tutorial.rawSegments}, Action steps: ${tutorial.actionSegments}`,
        `Playbook-ready steps: ${playbookSteps.length}`,
        "",
        "Steps:",
    ];
    // Steps are numbered from 1; a step without a mapped tool is shown as "?".
    for (const [index, step] of tutorial.steps.entries()) {
        const toolLabel = step.tool ?? "?";
        report.push(` ${index + 1}. [${toolLabel}] ${step.description}`);
    }
    const text = report.join("\n");
    return { content: [{ type: "text", text }] };
});
5769
// coverage_report: audits the stored knowledge for an app via coverageAuditor and renders
// the result as text. When a live menu scan is requested and a pid is supplied, the
// scan result is passed to the auditor for comparison.
//
// Parameters (validated by the zod schema below):
//   bundleId            - macOS bundle ID of the app
//   appName             - human-readable app name
//   includeLiveMenuScan - optional; request a live menu-bar scan
//   pid                 - optional; process ID, needed for the live scan
// Returns a single text content item containing the report.
server.tool("coverage_report", "Generate a coverage report for an app — shows what knowledge we have (shortcuts, selectors, flows, playbooks, errors) and identifies gaps with recommendations.", {
    bundleId: z.string().describe("macOS bundle ID (e.g. com.blackmagic-design.DaVinciResolveLite)"),
    appName: z.string().describe("Human-readable app name"),
    includeLiveMenuScan: z.boolean().optional().describe("Also scan the live menu bar for comparison (requires app to be running, needs pid)"),
    pid: z.number().optional().describe("Process ID (required if includeLiveMenuScan is true)"),
}, async ({ bundleId, appName, includeLiveMenuScan, pid }) => {
    let menuScan;
    // Previously a live-scan request without a usable pid was dropped silently;
    // remember it so the report can say that the comparison did not happen.
    const liveScanSkipped = Boolean(includeLiveMenuScan) && !pid;
    if (includeLiveMenuScan && pid) {
        await ensureBridge();
        const scanner = new MenuScanner(bridge);
        menuScan = await scanner.scan(pid, bundleId, appName);
    }
    // menuScan stays undefined when no live scan was performed.
    const report = coverageAuditor.audit(bundleId, appName, menuScan);
    const lines = [
        `Coverage Report: ${report.app} (${report.bundleId})`,
        "",
        "Knowledge inventory:",
        ` Shortcuts: ${report.shortcutsKnown}`,
        ` Selectors: ${report.selectorsKnown}`,
        ` Flows: ${report.flowsKnown}`,
        ` Playbooks: ${report.playbooksAvailable}`,
        ` Error patterns: ${report.errorsDocumented}`,
    ];
    // The stability line is only shown for a positive score.
    if (report.selectorStabilityScore > 0) {
        lines.push(` Selector stability: ${(report.selectorStabilityScore * 100).toFixed(0)}%`);
    }
    if (report.highValueGaps.length > 0) {
        lines.push("", "High-value gaps:");
        for (const gap of report.highValueGaps) {
            lines.push(` - ${gap}`);
        }
    }
    if (report.shortcutsNotInReference.length > 0) {
        // The heading carries the full count; only the first 20 entries are listed.
        lines.push("", `Undocumented shortcuts (${report.shortcutsNotInReference.length}):`);
        for (const s of report.shortcutsNotInReference.slice(0, 20)) {
            lines.push(` ${s}`);
        }
    }
    if (report.workflowsWithNoPlaybook.length > 0) {
        lines.push("", `Missing playbooks for common workflows: ${report.workflowsWithNoPlaybook.join(", ")}`);
    }
    // Appended last so all pre-existing report content keeps its position.
    if (liveScanSkipped) {
        lines.push("", "Note: live menu scan skipped — pid is required when includeLiveMenuScan is true.");
    }
    return { content: [{ type: "text", text: lines.join("\n") }] };
});
5812
// community_publish: looks up a local playbook by ID and hands it to communityPublisher.
// On success the community fetcher cache is invalidated and the published entry is summarised.
originalTool("community_publish", "Publish a validated local playbook to the community repository. Requires the playbook to have been executed successfully multiple times. Strips sensitive data (passwords, file paths).", {
    playbookId: z.string().describe("ID of the local playbook to publish"),
    successRate: z.number().min(0).max(1).describe("Success rate from testing (0.0-1.0)"),
    executionCount: z.number().describe("Number of times the playbook has been executed"),
    minRuns: z.number().optional().describe("Minimum successful runs required (default 3)"),
}, async ({ playbookId, successRate, executionCount }) => {
    // Wraps a message in the single-text-item response shape used by every exit path.
    const reply = (text) => ({ content: [{ type: "text", text }] });
    const playbook = _executablePlaybookStore.get(playbookId);
    if (!playbook) {
        return reply(`Playbook "${playbookId}" not found. Use export_playbook to list available playbooks.`);
    }
    // NOTE(review): successRate and executionCount are forwarded to publish(), while the
    // rejection message below reports the playbook's own successCount/failCount. Presumably
    // publish() gates on the tracked counters and ignores the client values — confirm.
    // `minRuns` is accepted by the schema but is not read by this handler.
    const published = communityPublisher.publish(playbook, successRate, executionCount);
    if (published) {
        communityFetcher.invalidateCache();
        const publishedPercent = (published.metadata.successRate * 100).toFixed(0);
        return reply(`Published to community: ${published.id}\nName: ${published.name}\nSteps: ${published.steps.length}\nSuccess rate: ${publishedPercent}%`);
    }
    // A falsy result means the playbook was not published; report the tracked numbers.
    const trackedRuns = playbook.successCount + playbook.failCount;
    const trackedPercent = trackedRuns > 0 ? ((playbook.successCount / trackedRuns) * 100).toFixed(0) : 0;
    return reply(`Playbook not published. Requirements: at least 3 tracked executions and >50% success rate. Actual: ${trackedRuns} tracked runs, ${trackedPercent}% success.`);
});
5832
// community_fetch: queries communityFetcher.fetchWithRemote with whichever filters were
// supplied and renders the returned playbooks as an indented text list.
originalTool("community_fetch", "Search community playbooks for a platform or workflow. Returns ranked results by success rate.", {
    platform: z.string().optional().describe("Filter by platform name"),
    bundleId: z.string().optional().describe("Filter by macOS bundle ID"),
    workflow: z.string().optional().describe("Search by workflow name/description"),
    limit: z.number().optional().describe("Max results (default 20)"),
}, async ({ platform, bundleId, workflow, limit }) => {
    // Only filters that were actually provided become keys on the query object.
    const suppliedFilters = Object.entries({ platform, bundleId, workflow, limit })
        .filter(([, value]) => value !== undefined);
    const query = Object.fromEntries(suppliedFilters);
    const playbooks = await communityFetcher.fetchWithRemote(query);
    if (playbooks.length === 0) {
        return { content: [{ type: "text", text: "No community playbooks found matching the query." }] };
    }
    // Five detail lines per playbook, followed by a blank separator line.
    const entryLines = playbooks.flatMap((pb) => [
        ` ${pb.id}`,
        ` Name: ${pb.name}`,
        ` Platform: ${pb.platform} | Steps: ${pb.steps.length}`,
        ` Success: ${(pb.metadata.successRate * 100).toFixed(0)}% (${pb.metadata.executionCount} runs)`,
        ` Score: ${pb.ratings.score} | By: ${pb.metadata.author}`,
        "",
    ]);
    const text = [`Community playbooks (${playbooks.length} results):`, "", ...entryLines].join("\n");
    return { content: [{ type: "text", text }] };
});
2721
5862
  // ═══════════════════════════════════════════════
2722
5863
  // START
2723
5864
  // ═══════════════════════════════════════════════
2724
5865
  async function main() {
5866
+ // Flush playbook learnings on graceful shutdown
5867
+ process.on("SIGINT", () => { void perceptionManager.stop(); contextTracker.flush(); learningEngine.flush(); appMap.flush(); process.exit(0); });
5868
+ process.on("SIGTERM", () => { void perceptionManager.stop(); contextTracker.flush(); learningEngine.flush(); appMap.flush(); process.exit(0); });
5869
+ process.on("beforeExit", () => { void perceptionManager.stop(); contextTracker.flush(); learningEngine.flush(); appMap.flush(); });
2725
5870
  const transport = new StdioServerTransport();
2726
5871
  await server.connect(transport);
2727
5872
  }