@mseep/clawdcursor 1.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (354) hide show
  1. package/CHANGELOG.md +2264 -0
  2. package/LICENSE +21 -0
  3. package/README.md +385 -0
  4. package/SECURITY.md +44 -0
  5. package/SKILL.md +503 -0
  6. package/dist/core/agent-loop/agent.d.ts +42 -0
  7. package/dist/core/agent-loop/agent.js +1023 -0
  8. package/dist/core/agent-loop/agent.js.map +1 -0
  9. package/dist/core/agent-loop/batch-tool.d.ts +25 -0
  10. package/dist/core/agent-loop/batch-tool.js +218 -0
  11. package/dist/core/agent-loop/batch-tool.js.map +1 -0
  12. package/dist/core/agent-loop/coord-scale.d.ts +72 -0
  13. package/dist/core/agent-loop/coord-scale.js +89 -0
  14. package/dist/core/agent-loop/coord-scale.js.map +1 -0
  15. package/dist/core/agent-loop/focus-guard.d.ts +24 -0
  16. package/dist/core/agent-loop/focus-guard.js +29 -0
  17. package/dist/core/agent-loop/focus-guard.js.map +1 -0
  18. package/dist/core/agent-loop/project-mcp.d.ts +97 -0
  19. package/dist/core/agent-loop/project-mcp.js +253 -0
  20. package/dist/core/agent-loop/project-mcp.js.map +1 -0
  21. package/dist/core/agent-loop/prompt.d.ts +45 -0
  22. package/dist/core/agent-loop/prompt.js +426 -0
  23. package/dist/core/agent-loop/prompt.js.map +1 -0
  24. package/dist/core/agent-loop/tool-meta.d.ts +93 -0
  25. package/dist/core/agent-loop/tool-meta.js +651 -0
  26. package/dist/core/agent-loop/tool-meta.js.map +1 -0
  27. package/dist/core/agent-loop/tools.d.ts +38 -0
  28. package/dist/core/agent-loop/tools.js +2134 -0
  29. package/dist/core/agent-loop/tools.js.map +1 -0
  30. package/dist/core/agent-loop/types.d.ts +170 -0
  31. package/dist/core/agent-loop/types.js +12 -0
  32. package/dist/core/agent-loop/types.js.map +1 -0
  33. package/dist/core/agent.d.ts +51 -0
  34. package/dist/core/agent.js +245 -0
  35. package/dist/core/agent.js.map +1 -0
  36. package/dist/core/app-categories.d.ts +67 -0
  37. package/dist/core/app-categories.js +108 -0
  38. package/dist/core/app-categories.js.map +1 -0
  39. package/dist/core/banner.d.ts +70 -0
  40. package/dist/core/banner.js +245 -0
  41. package/dist/core/banner.js.map +1 -0
  42. package/dist/core/classify/capability.d.ts +45 -0
  43. package/dist/core/classify/capability.js +78 -0
  44. package/dist/core/classify/capability.js.map +1 -0
  45. package/dist/core/decompose/llm-decomposer.d.ts +35 -0
  46. package/dist/core/decompose/llm-decomposer.js +156 -0
  47. package/dist/core/decompose/llm-decomposer.js.map +1 -0
  48. package/dist/core/decompose/parser.d.ts +27 -0
  49. package/dist/core/decompose/parser.js +101 -0
  50. package/dist/core/decompose/parser.js.map +1 -0
  51. package/dist/core/observability/correlation.d.ts +19 -0
  52. package/dist/core/observability/correlation.js +36 -0
  53. package/dist/core/observability/correlation.js.map +1 -0
  54. package/dist/core/observability/cost-meter.d.ts +51 -0
  55. package/dist/core/observability/cost-meter.js +134 -0
  56. package/dist/core/observability/cost-meter.js.map +1 -0
  57. package/dist/core/observability/logger.d.ts +61 -0
  58. package/dist/core/observability/logger.js +550 -0
  59. package/dist/core/observability/logger.js.map +1 -0
  60. package/dist/core/router/aliases.d.ts +50 -0
  61. package/dist/core/router/aliases.js +104 -0
  62. package/dist/core/router/aliases.js.map +1 -0
  63. package/dist/core/router/normalize.d.ts +41 -0
  64. package/dist/core/router/normalize.js +80 -0
  65. package/dist/core/router/normalize.js.map +1 -0
  66. package/dist/core/safety.d.ts +126 -0
  67. package/dist/core/safety.js +568 -0
  68. package/dist/core/safety.js.map +1 -0
  69. package/dist/core/sense/a11y-resolver.d.ts +73 -0
  70. package/dist/core/sense/a11y-resolver.js +76 -0
  71. package/dist/core/sense/a11y-resolver.js.map +1 -0
  72. package/dist/core/sense/fingerprint.d.ts +41 -0
  73. package/dist/core/sense/fingerprint.js +123 -0
  74. package/dist/core/sense/fingerprint.js.map +1 -0
  75. package/dist/core/sense/rank.d.ts +70 -0
  76. package/dist/core/sense/rank.js +192 -0
  77. package/dist/core/sense/rank.js.map +1 -0
  78. package/dist/core/sense/reactive-check.d.ts +40 -0
  79. package/dist/core/sense/reactive-check.js +48 -0
  80. package/dist/core/sense/reactive-check.js.map +1 -0
  81. package/dist/core/sense/snapshot.d.ts +19 -0
  82. package/dist/core/sense/snapshot.js +100 -0
  83. package/dist/core/sense/snapshot.js.map +1 -0
  84. package/dist/core/sense/types.d.ts +66 -0
  85. package/dist/core/sense/types.js +9 -0
  86. package/dist/core/sense/types.js.map +1 -0
  87. package/dist/core/sense/ui-map-anchors.d.ts +7 -0
  88. package/dist/core/sense/ui-map-anchors.js +24 -0
  89. package/dist/core/sense/ui-map-anchors.js.map +1 -0
  90. package/dist/core/sense/ui-map-elements.d.ts +5 -0
  91. package/dist/core/sense/ui-map-elements.js +33 -0
  92. package/dist/core/sense/ui-map-elements.js.map +1 -0
  93. package/dist/core/sense/ui-map-find.d.ts +56 -0
  94. package/dist/core/sense/ui-map-find.js +153 -0
  95. package/dist/core/sense/ui-map-find.js.map +1 -0
  96. package/dist/core/sense/ui-map-fuse.d.ts +4 -0
  97. package/dist/core/sense/ui-map-fuse.js +44 -0
  98. package/dist/core/sense/ui-map-fuse.js.map +1 -0
  99. package/dist/core/sense/ui-map-geom.d.ts +3 -0
  100. package/dist/core/sense/ui-map-geom.js +16 -0
  101. package/dist/core/sense/ui-map-geom.js.map +1 -0
  102. package/dist/core/sense/ui-map-holder.d.ts +58 -0
  103. package/dist/core/sense/ui-map-holder.js +87 -0
  104. package/dist/core/sense/ui-map-holder.js.map +1 -0
  105. package/dist/core/sense/ui-map-normalize.d.ts +19 -0
  106. package/dist/core/sense/ui-map-normalize.js +65 -0
  107. package/dist/core/sense/ui-map-normalize.js.map +1 -0
  108. package/dist/core/sense/ui-map-render.d.ts +4 -0
  109. package/dist/core/sense/ui-map-render.js +34 -0
  110. package/dist/core/sense/ui-map-render.js.map +1 -0
  111. package/dist/core/sense/ui-map-resolve.d.ts +41 -0
  112. package/dist/core/sense/ui-map-resolve.js +59 -0
  113. package/dist/core/sense/ui-map-resolve.js.map +1 -0
  114. package/dist/core/sense/ui-map-types.d.ts +66 -0
  115. package/dist/core/sense/ui-map-types.js +11 -0
  116. package/dist/core/sense/ui-map-types.js.map +1 -0
  117. package/dist/core/sense/ui-map.d.ts +29 -0
  118. package/dist/core/sense/ui-map.js +113 -0
  119. package/dist/core/sense/ui-map.js.map +1 -0
  120. package/dist/core/verify/assertions.d.ts +132 -0
  121. package/dist/core/verify/assertions.js +284 -0
  122. package/dist/core/verify/assertions.js.map +1 -0
  123. package/dist/index.d.ts +21 -0
  124. package/dist/index.js +24 -0
  125. package/dist/index.js.map +1 -0
  126. package/dist/llm/browser-config.d.ts +36 -0
  127. package/dist/llm/browser-config.js +83 -0
  128. package/dist/llm/browser-config.js.map +1 -0
  129. package/dist/llm/client.d.ts +268 -0
  130. package/dist/llm/client.js +1094 -0
  131. package/dist/llm/client.js.map +1 -0
  132. package/dist/llm/config.d.ts +79 -0
  133. package/dist/llm/config.js +375 -0
  134. package/dist/llm/config.js.map +1 -0
  135. package/dist/llm/credentials.d.ts +35 -0
  136. package/dist/llm/credentials.js +491 -0
  137. package/dist/llm/credentials.js.map +1 -0
  138. package/dist/llm/external-creds.d.ts +42 -0
  139. package/dist/llm/external-creds.js +169 -0
  140. package/dist/llm/external-creds.js.map +1 -0
  141. package/dist/llm/providers.d.ts +123 -0
  142. package/dist/llm/providers.js +717 -0
  143. package/dist/llm/providers.js.map +1 -0
  144. package/dist/paths.d.ts +31 -0
  145. package/dist/paths.js +147 -0
  146. package/dist/paths.js.map +1 -0
  147. package/dist/platform/accessibility.d.ts +139 -0
  148. package/dist/platform/accessibility.js +670 -0
  149. package/dist/platform/accessibility.js.map +1 -0
  150. package/dist/platform/cdp-driver.d.ts +318 -0
  151. package/dist/platform/cdp-driver.js +1179 -0
  152. package/dist/platform/cdp-driver.js.map +1 -0
  153. package/dist/platform/index.d.ts +11 -0
  154. package/dist/platform/index.js +69 -0
  155. package/dist/platform/index.js.map +1 -0
  156. package/dist/platform/keys.d.ts +17 -0
  157. package/dist/platform/keys.js +129 -0
  158. package/dist/platform/keys.js.map +1 -0
  159. package/dist/platform/launch-poll.d.ts +101 -0
  160. package/dist/platform/launch-poll.js +177 -0
  161. package/dist/platform/launch-poll.js.map +1 -0
  162. package/dist/platform/linux.d.ts +173 -0
  163. package/dist/platform/linux.js +1253 -0
  164. package/dist/platform/linux.js.map +1 -0
  165. package/dist/platform/macos.d.ts +136 -0
  166. package/dist/platform/macos.js +976 -0
  167. package/dist/platform/macos.js.map +1 -0
  168. package/dist/platform/native-desktop.d.ts +145 -0
  169. package/dist/platform/native-desktop.js +936 -0
  170. package/dist/platform/native-desktop.js.map +1 -0
  171. package/dist/platform/native-helper.d.ts +130 -0
  172. package/dist/platform/native-helper.js +592 -0
  173. package/dist/platform/native-helper.js.map +1 -0
  174. package/dist/platform/ocr-engine.d.ts +78 -0
  175. package/dist/platform/ocr-engine.js +363 -0
  176. package/dist/platform/ocr-engine.js.map +1 -0
  177. package/dist/platform/ps-runner.d.ts +28 -0
  178. package/dist/platform/ps-runner.js +228 -0
  179. package/dist/platform/ps-runner.js.map +1 -0
  180. package/dist/platform/types.d.ts +397 -0
  181. package/dist/platform/types.js +15 -0
  182. package/dist/platform/types.js.map +1 -0
  183. package/dist/platform/uri-handler.d.ts +75 -0
  184. package/dist/platform/uri-handler.js +273 -0
  185. package/dist/platform/uri-handler.js.map +1 -0
  186. package/dist/platform/wayland-backend.d.ts +53 -0
  187. package/dist/platform/wayland-backend.js +348 -0
  188. package/dist/platform/wayland-backend.js.map +1 -0
  189. package/dist/platform/windows.d.ts +232 -0
  190. package/dist/platform/windows.js +1210 -0
  191. package/dist/platform/windows.js.map +1 -0
  192. package/dist/postbuild.d.ts +10 -0
  193. package/dist/postbuild.js +98 -0
  194. package/dist/postbuild.js.map +1 -0
  195. package/dist/schema/snapshot.d.ts +33 -0
  196. package/dist/schema/snapshot.js +90 -0
  197. package/dist/schema/snapshot.js.map +1 -0
  198. package/dist/shortcuts.d.ts +30 -0
  199. package/dist/shortcuts.js +261 -0
  200. package/dist/shortcuts.js.map +1 -0
  201. package/dist/surface/cli.d.ts +7 -0
  202. package/dist/surface/cli.js +1556 -0
  203. package/dist/surface/cli.js.map +1 -0
  204. package/dist/surface/dashboard.d.ts +8 -0
  205. package/dist/surface/dashboard.js +1193 -0
  206. package/dist/surface/dashboard.js.map +1 -0
  207. package/dist/surface/doctor.d.ts +29 -0
  208. package/dist/surface/doctor.js +1514 -0
  209. package/dist/surface/doctor.js.map +1 -0
  210. package/dist/surface/format.d.ts +10 -0
  211. package/dist/surface/format.js +37 -0
  212. package/dist/surface/format.js.map +1 -0
  213. package/dist/surface/http-utility.d.ts +65 -0
  214. package/dist/surface/http-utility.js +336 -0
  215. package/dist/surface/http-utility.js.map +1 -0
  216. package/dist/surface/mcp-server.d.ts +91 -0
  217. package/dist/surface/mcp-server.js +280 -0
  218. package/dist/surface/mcp-server.js.map +1 -0
  219. package/dist/surface/onboarding.d.ts +15 -0
  220. package/dist/surface/onboarding.js +184 -0
  221. package/dist/surface/onboarding.js.map +1 -0
  222. package/dist/surface/pidfile.d.ts +79 -0
  223. package/dist/surface/pidfile.js +263 -0
  224. package/dist/surface/pidfile.js.map +1 -0
  225. package/dist/surface/readiness.d.ts +45 -0
  226. package/dist/surface/readiness.js +230 -0
  227. package/dist/surface/readiness.js.map +1 -0
  228. package/dist/surface/report.d.ts +68 -0
  229. package/dist/surface/report.js +341 -0
  230. package/dist/surface/report.js.map +1 -0
  231. package/dist/surface/skill-register.d.ts +14 -0
  232. package/dist/surface/skill-register.js +150 -0
  233. package/dist/surface/skill-register.js.map +1 -0
  234. package/dist/surface/version.d.ts +6 -0
  235. package/dist/surface/version.js +27 -0
  236. package/dist/surface/version.js.map +1 -0
  237. package/dist/tools/a11y.d.ts +8 -0
  238. package/dist/tools/a11y.js +545 -0
  239. package/dist/tools/a11y.js.map +1 -0
  240. package/dist/tools/a11y_depth.d.ts +19 -0
  241. package/dist/tools/a11y_depth.js +455 -0
  242. package/dist/tools/a11y_depth.js.map +1 -0
  243. package/dist/tools/agent.d.ts +15 -0
  244. package/dist/tools/agent.js +248 -0
  245. package/dist/tools/agent.js.map +1 -0
  246. package/dist/tools/batch.d.ts +46 -0
  247. package/dist/tools/batch.js +230 -0
  248. package/dist/tools/batch.js.map +1 -0
  249. package/dist/tools/cdp.d.ts +8 -0
  250. package/dist/tools/cdp.js +233 -0
  251. package/dist/tools/cdp.js.map +1 -0
  252. package/dist/tools/compact.d.ts +63 -0
  253. package/dist/tools/compact.js +418 -0
  254. package/dist/tools/compact.js.map +1 -0
  255. package/dist/tools/cost-class.d.ts +38 -0
  256. package/dist/tools/cost-class.js +117 -0
  257. package/dist/tools/cost-class.js.map +1 -0
  258. package/dist/tools/desktop.d.ts +9 -0
  259. package/dist/tools/desktop.js +346 -0
  260. package/dist/tools/desktop.js.map +1 -0
  261. package/dist/tools/electron_bridge.d.ts +41 -0
  262. package/dist/tools/electron_bridge.js +261 -0
  263. package/dist/tools/electron_bridge.js.map +1 -0
  264. package/dist/tools/extras.d.ts +22 -0
  265. package/dist/tools/extras.js +942 -0
  266. package/dist/tools/extras.js.map +1 -0
  267. package/dist/tools/favorites.d.ts +13 -0
  268. package/dist/tools/favorites.js +137 -0
  269. package/dist/tools/favorites.js.map +1 -0
  270. package/dist/tools/introspection.d.ts +13 -0
  271. package/dist/tools/introspection.js +55 -0
  272. package/dist/tools/introspection.js.map +1 -0
  273. package/dist/tools/ocr.d.ts +8 -0
  274. package/dist/tools/ocr.js +66 -0
  275. package/dist/tools/ocr.js.map +1 -0
  276. package/dist/tools/orchestration.d.ts +7 -0
  277. package/dist/tools/orchestration.js +377 -0
  278. package/dist/tools/orchestration.js.map +1 -0
  279. package/dist/tools/playbooks/extract-compose.d.ts +22 -0
  280. package/dist/tools/playbooks/extract-compose.js +85 -0
  281. package/dist/tools/playbooks/extract-compose.js.map +1 -0
  282. package/dist/tools/playbooks/find-replace.d.ts +11 -0
  283. package/dist/tools/playbooks/find-replace.js +56 -0
  284. package/dist/tools/playbooks/find-replace.js.map +1 -0
  285. package/dist/tools/playbooks/index.d.ts +63 -0
  286. package/dist/tools/playbooks/index.js +70 -0
  287. package/dist/tools/playbooks/index.js.map +1 -0
  288. package/dist/tools/playbooks/keys-blocklist.d.ts +24 -0
  289. package/dist/tools/playbooks/keys-blocklist.js +89 -0
  290. package/dist/tools/playbooks/keys-blocklist.js.map +1 -0
  291. package/dist/tools/registry.d.ts +40 -0
  292. package/dist/tools/registry.js +560 -0
  293. package/dist/tools/registry.js.map +1 -0
  294. package/dist/tools/safety-gate.d.ts +16 -0
  295. package/dist/tools/safety-gate.js +70 -0
  296. package/dist/tools/safety-gate.js.map +1 -0
  297. package/dist/tools/scheduler.d.ts +76 -0
  298. package/dist/tools/scheduler.js +413 -0
  299. package/dist/tools/scheduler.js.map +1 -0
  300. package/dist/tools/shortcuts.d.ts +13 -0
  301. package/dist/tools/shortcuts.js +205 -0
  302. package/dist/tools/shortcuts.js.map +1 -0
  303. package/dist/tools/smart.d.ts +15 -0
  304. package/dist/tools/smart.js +785 -0
  305. package/dist/tools/smart.js.map +1 -0
  306. package/dist/tools/types.d.ts +174 -0
  307. package/dist/tools/types.js +67 -0
  308. package/dist/tools/types.js.map +1 -0
  309. package/dist/tools/window-text.d.ts +15 -0
  310. package/dist/tools/window-text.js +39 -0
  311. package/dist/tools/window-text.js.map +1 -0
  312. package/dist/types.d.ts +122 -0
  313. package/dist/types.js +41 -0
  314. package/dist/types.js.map +1 -0
  315. package/native/Package.swift +38 -0
  316. package/native/README.md +113 -0
  317. package/native/Sources/ClawdCursorHelper/main.swift +602 -0
  318. package/native/Sources/ClawdCursorHost/main.swift +182 -0
  319. package/native/Sources/PermissionCheck/main.swift +53 -0
  320. package/native/Sources/ScreenshotHelper/main.swift +219 -0
  321. package/native/build.sh +139 -0
  322. package/native/entitlements.plist +12 -0
  323. package/package.json +115 -0
  324. package/scripts/banner.ps1 +112 -0
  325. package/scripts/coord-accuracy.ps1 +140 -0
  326. package/scripts/coord-uwp.ps1 +80 -0
  327. package/scripts/edge-glow.ps1 +180 -0
  328. package/scripts/find-element.ps1 +198 -0
  329. package/scripts/get-foreground-window.ps1 +71 -0
  330. package/scripts/get-screen-context.ps1 +183 -0
  331. package/scripts/get-windows.ps1 +66 -0
  332. package/scripts/install-panic-hotkey.ps1 +46 -0
  333. package/scripts/interact-element.ps1 +431 -0
  334. package/scripts/invoke-element.ps1 +314 -0
  335. package/scripts/linux/atspi-bridge.py +356 -0
  336. package/scripts/linux/ocr-recognize.py +154 -0
  337. package/scripts/mac/_window-picker.jxa +163 -0
  338. package/scripts/mac/find-element.jxa +0 -0
  339. package/scripts/mac/find-element.sh +161 -0
  340. package/scripts/mac/focus-window.jxa +284 -0
  341. package/scripts/mac/get-focused-element.jxa +102 -0
  342. package/scripts/mac/get-foreground-window.jxa +173 -0
  343. package/scripts/mac/get-screen-context.jxa +197 -0
  344. package/scripts/mac/get-ui-tree.sh +141 -0
  345. package/scripts/mac/get-windows.jxa +117 -0
  346. package/scripts/mac/interact-element.sh +235 -0
  347. package/scripts/mac/invoke-element.jxa +408 -0
  348. package/scripts/mac/ocr-recognize.swift +124 -0
  349. package/scripts/ocr-recognize.ps1 +102 -0
  350. package/scripts/postinstall-native.js +48 -0
  351. package/scripts/ps-bridge.ps1 +830 -0
  352. package/scripts/smoke-mcp.ps1 +119 -0
  353. package/scripts/sync-version.ts +178 -0
  354. package/scripts/verify-install.js +81 -0
@@ -0,0 +1,2134 @@
1
+ "use strict";
2
+ /**
3
+ * Unified-agent tool catalog.
4
+ *
5
+ * ONE tool vocabulary across blind / hybrid / vision modes. The only
6
+ * difference between modes: in `blind`, the `screenshot` tool is removed
7
+ * from the catalog before the LLM sees it.
8
+ *
9
+ * Design rules:
10
+ * - Every mutation goes through PlatformAdapter (OS-agnostic).
11
+ * - NO ctx.platform call happens outside a tool's `execute()` — the agent
12
+ * loop never touches the adapter directly.
13
+ * - Terminal actions (`done` / `give_up` / `cannot_read`) just return
14
+ * `stop: true` with a terminalExit tag; the agent loop decides the
15
+ * AgentResult.
16
+ * - a11y-first wording. `invoke_element` and `set_field_value` are the
17
+ * preferred targeting tools; coord clicks are the fallback.
18
+ *
19
+ * Zero app-specific rules. A new LOB app works because a11y roles + the
20
+ * rank-before-truncate sense layer surface its buttons.
21
+ */
22
/**
 * Module interop helper (appears to be the TypeScript compiler's standard
 * `__createBinding` emit): expose property `key` of `source` on `target`
 * under the name `alias` (defaults to `key`).
 *
 * When `Object.create` exists, the binding is installed with
 * `Object.defineProperty`. The source's own descriptor is reused unless it is
 * missing, is an accessor on a non-`__esModule` source, or is a writable or
 * configurable data property; in those cases a live getter that reads
 * `source[key]` on every access is installed instead. Without `Object.create`
 * the value is simply copied once.
 *
 * A pre-existing `this.__createBinding` takes precedence over this definition.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy fallback: plain one-time value copy.
        return function (target, source, key, alias) {
            if (alias === undefined) alias = key;
            target[alias] = source[key];
        };
    }
    return function (target, source, key, alias) {
        if (alias === undefined) alias = key;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsLiveGetter;
        if (!descriptor) {
            needsLiveGetter = true;
        }
        else if ("get" in descriptor) {
            needsLiveGetter = !source.__esModule;
        }
        else {
            needsLiveGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsLiveGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, alias, descriptor);
    };
})();
33
/**
 * Module interop helper (appears to be the TypeScript compiler's standard
 * `__setModuleDefault` emit): attach `value` to `target` as its "default"
 * property.
 *
 * With `Object.create` available the property is defined as enumerable (and,
 * by `defineProperty` defaults, non-writable and non-configurable); otherwise
 * it is a plain assignment. A pre-existing `this.__setModuleDefault` takes
 * precedence over this definition.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (target, value) {
            Object.defineProperty(target, "default", { enumerable: true, value: value });
        };
    }
    return function (target, value) {
        target["default"] = value;
    };
})();
38
/**
 * Module interop helper (appears to be the TypeScript compiler's standard
 * `__importStar` emit): turn a required module into a namespace-style object.
 *
 * - A truthy module flagged `__esModule` is returned untouched.
 * - Otherwise a fresh object is built: every own property name except
 *   "default" is bound via `__createBinding`, and the module itself is
 *   installed as the "default" export via `__setModuleDefault`.
 *
 * A pre-existing `this.__importStar` takes precedence over this definition.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Self-replacing function: the first call picks the key-listing strategy
    // (native getOwnPropertyNames, or a for-in/hasOwnProperty fallback) and
    // every later call goes straight to it.
    var listOwnKeys = function (subject) {
        listOwnKeys = Object.getOwnPropertyNames || function (source) {
            var names = [];
            for (var key in source) {
                if (Object.prototype.hasOwnProperty.call(source, key)) {
                    names[names.length] = key;
                }
            }
            return names;
        };
        return listOwnKeys(subject);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = listOwnKeys(mod);
            for (var idx = 0; idx < keys.length; idx++) {
                if (keys[idx] === "default") {
                    continue;
                }
                __createBinding(namespace, mod, keys[idx]);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
55
+ Object.defineProperty(exports, "__esModule", { value: true });
56
+ exports.buildUnifiedTools = buildUnifiedTools;
57
+ exports.coerceCoord = coerceCoord;
58
+ const batch_tool_1 = require("./batch-tool");
59
+ const coord_scale_1 = require("./coord-scale");
60
+ const focus_guard_1 = require("./focus-guard");
61
+ const aliases_1 = require("../router/aliases");
62
+ const uri_handler_1 = require("../../platform/uri-handler");
63
+ const ocr_engine_1 = require("../../platform/ocr-engine");
64
+ const browser_config_1 = require("../../llm/browser-config");
65
+ const assertions_1 = require("../verify/assertions");
66
+ const ui_map_1 = require("../sense/ui-map");
67
+ const ui_map_render_1 = require("../sense/ui-map-render");
68
+ const prompt_1 = require("./prompt");
69
+ const ui_map_resolve_1 = require("../sense/ui-map-resolve");
70
+ const ui_map_find_1 = require("../sense/ui-map-find");
71
/**
 * Shared OCR engine for the agent-loop perception tools (read_text,
 * smart_click), created on first request and reused afterwards.
 * Mirrors the pattern in src/tools/smart.ts. Construction never throws; the
 * real availability check happens in isAvailable().
 */
let _agentOcr = null;
function getAgentOcr() {
    if (_agentOcr) {
        return _agentOcr;
    }
    _agentOcr = new ocr_engine_1.OcrEngine();
    return _agentOcr;
}
80
/**
 * Hedging-language phrases that indicate the agent is GUESSING about
 * the task outcome instead of observing the actual screen state. Used
 * by the `done` tool to reject speculative evidence claims like
 * "the email should have been sent" — a real symptom from a Kimi run
 * where the agent typed in a stale window and never noticed.
 *
 * Patterns are word-boundary anchored where possible so we don't
 * false-positive on substrings (e.g., "shoulder" must not match
 * "should"). Multi-word phrases match contiguous whitespace.
 *
 * The list is short on purpose — only the unambiguous "I'm guessing"
 * phrases. Words like "looks", "shown", "displayed" are LEGITIMATE
 * concrete-observation language and stay allowed.
 *
 * The alternatives are OR-ed together and compiled case-insensitively.
 * Alternation is first-match-wins, so the order of the "should" entries
 * decides which text a match reports: "should have|be|now" is tried
 * before the bare "should <word>" lookahead form. A separate
 * "should have been" alternative is deliberately absent — it could never
 * win, because "should have" already matches at the same position.
 */
const HEDGING_PATTERN = new RegExp([
    // Modal verbs of uncertainty
    '\\bshould\\s+(?:have|be|now)\\b',
    // Any other "should <word>"; the lookahead keeps the match to "should" itself.
    '\\bshould\\b(?=\\s+\\w)',
    '\\bmight\\s+(?:have|be)\\b',
    '\\bmay\\s+have\\b',
    '\\bcould\\s+have\\b',
    '\\bprobably\\b',
    '\\blikely\\s+(?:has|have|is|was)\\b',
    // Speaker-uncertainty phrasings
    '\\bI\\s+think\\b',
    '\\bI\\s+believe\\b',
    '\\bI\\s+assume\\b',
    '\\bassuming\\b',
    '\\bif\\s+(?:successful|it\\s+worked|the\\s+\\w+\\s+worked)\\b',
    // Approximate observation
    '\\bappears?\\s+to\\b',
    '\\bseems?\\s+to\\b',
    '\\bpresumably\\b',
].join('|'), 'i');
116
+ /**
117
+ * Build the unified tool catalog per mode + capability.
118
+ *
119
+ * Modes:
120
+ * - 'blind' → text-LLM; no `screenshot` tool in catalog
121
+ * - 'hybrid' → text-LLM; `screenshot` tool available on demand
122
+ * - 'vision' → vision-LLM; COMPOUND TOOL FORM (mouse/keyboard/window
123
+ * as action-discriminated schemas à la Anthropic
124
+ * computer_20250124) + perception + a11y + terminals
125
+ *
126
+ * Capability (text modes only):
127
+ * - When supplied and non-'general', filter to the scoped palette
128
+ * defined in `palettes.ts`. Typical palette ≈ 6–10 tools.
129
+ * - 'general' / undefined → full text-agent catalog (back-compat).
130
+ *
131
+ * Terminal actions (`done`, `give_up`, `cannot_read`) are always
132
+ * present regardless of mode/capability — the agent must always have
133
+ * an exit door.
134
+ */
135
/**
 * Obtain a UIMap for the current tool call: reuse the holder's current map
 * when it is compatible with the requested cost tier, otherwise compile a
 * fresh one and store it in the holder.
 *
 * Date.now() is sampled once, at the tool-invocation boundary, and that same
 * timestamp is used for the reuse lookup, the compile deps, and the put().
 *
 * @param ctx         Tool context; reads `ctx.uiMaps` (the holder) and `ctx.platform`.
 * @param rawMaxCost  Requested cost tier. Anything other than 'cheap',
 *                    'ocr_ok' or 'vision_ok' is treated as 'ocr_ok'.
 * @returns The reused or freshly compiled map, or null when the context has
 *          no holder (non-UIMap-aware call sites).
 */
async function finderMap(ctx, rawMaxCost) {
    const { uiMaps: holder } = ctx;
    if (!holder) {
        return null;
    }
    const knownCosts = ['cheap', 'ocr_ok', 'vision_ok'];
    const requested = knownCosts.includes(rawMaxCost) ? rawMaxCost : 'ocr_ok';
    const now = Date.now();
    const cached = holder.currentIfCost(requested, now);
    if (cached) {
        return cached;
    }
    // Nothing reusable: allocate an id, compile, and remember the result.
    const id = holder.nextId();
    const compileDeps = (0, ui_map_1.defaultCompileDeps)(ctx.platform, now, id);
    const fresh = await (0, ui_map_1.compileUIMap)(compileDeps, { max_cost: requested });
    holder.put(fresh, now, requested);
    return fresh;
}
152
+ function buildUnifiedTools() {
153
+ const tools = [
154
+ // ─── PERCEPTION ─────────────────────────────────────────────
155
+ {
156
+ name: 'read_screen',
157
+ description: 'START HERE — cheapest perception. Read the accessibility tree of the focused window: buttons, inputs, text elements with coordinates. The snapshot is auto-attached each turn; call this again only when you expect the screen changed since the last turn. If the tree is empty, escalate to read_text (OCR) next, then screenshot only as a last resort.',
158
+ inputSchema: {
159
+ type: 'object',
160
+ properties: {
161
+ processId: { type: 'number', description: 'Optional: limit to a specific process' },
162
+ },
163
+ additionalProperties: false,
164
+ },
165
+ changesScreen: false,
166
+ async execute(args, ctx) {
167
+ const pid = typeof args.processId === 'number' ? args.processId : undefined;
168
+ const tree = await ctx.platform.getUiTree(pid);
169
+ if (tree.length === 0) {
170
+ return { success: true, text: '(empty a11y tree — app may be custom-canvas)' };
171
+ }
172
+ const lines = tree.slice(0, 60).map(el => `[${el.controlType || 'Element'}] "${el.name || ''}" @${el.bounds.x},${el.bounds.y} ${el.bounds.width}×${el.bounds.height}${el.value ? ` value="${el.value.slice(0, 40)}"` : ''}${el.focused ? ' [FOCUSED]' : ''}`);
173
+ const more = tree.length > 60 ? `\n… +${tree.length - 60} more` : '';
174
+ return { success: true, text: `Fresh a11y (${tree.length} els):\n${(0, prompt_1.wrapUntrustedScreenContent)(lines.join('\n') + more)}` };
175
+ },
176
+ },
177
+ {
178
+ name: 'list_windows',
179
+ description: 'List visible top-level windows with title, process, and bounds. Useful when the active window is wrong or missing.',
180
+ inputSchema: { type: 'object', properties: {}, additionalProperties: false },
181
+ changesScreen: false,
182
+ async execute(_args, ctx) {
183
+ const windows = await ctx.platform.listWindows();
184
+ const active = await ctx.platform.getActiveWindow();
185
+ const lines = windows.slice(0, 20).map(w => {
186
+ const isActive = active && w.processId === active.processId && w.title === active.title;
187
+ return `${isActive ? '→' : ' '} [${w.processName}] "${w.title}" pid=${w.processId} ${w.bounds.width}×${w.bounds.height}`;
188
+ });
189
+ const more = windows.length > 20 ? `\n… +${windows.length - 20} more windows` : '';
190
+ return { success: true, text: `Windows (${windows.length}):\n${lines.join('\n')}${more}` };
191
+ },
192
+ },
193
+ // ─── A11Y ACTIONS (preferred) ───────────────────────────────
194
+ {
195
+ name: 'invoke_element',
196
+ description: 'Click/activate a UI element by its accessibility name. MORE RELIABLE than coord clicks — use this when the snapshot shows a named target.',
197
+ inputSchema: {
198
+ type: 'object',
199
+ properties: {
200
+ name: { type: 'string', description: 'Accessibility name of the element' },
201
+ automationId: { type: 'string', description: 'Element automation ID (more precise than name)' },
202
+ controlType: { type: 'string', description: 'Optional role filter (Button, MenuItem, Tab, etc.)' },
203
+ processId: { type: 'number', description: 'Optional: limit to a specific process' },
204
+ action: {
205
+ type: 'string',
206
+ enum: ['click', 'set-value', 'get-value', 'focus', 'expand', 'collapse'],
207
+ description: 'Action to perform (default: "click")',
208
+ },
209
+ value: { type: 'string', description: 'Value for set-value action' },
210
+ element_id: { type: 'string', description: 'Target a compiled element from compile_ui (requires snapshot_id)' },
211
+ snapshot_id: { type: 'string', description: 'The compile_ui snapshot the element_id came from (requires element_id)' },
212
+ expect: EXPECT_SCHEMA,
213
+ },
214
+ // `name` OR `automationId` must be supplied; neither is required at
215
+ // the JSON-schema level — the execute() body guards the total absence.
216
+ additionalProperties: false,
217
+ },
218
+ changesScreen: true,
219
+ async execute(args, ctx) {
220
+ const refIds = { element_id: typeof args.element_id === 'string' ? args.element_id : undefined,
221
+ snapshot_id: typeof args.snapshot_id === 'string' ? args.snapshot_id : undefined };
222
+ if (refIds.element_id || refIds.snapshot_id) {
223
+ const aw = await ctx.platform.getActiveWindow().catch(() => null);
224
+ const plan = (0, ui_map_resolve_1.resolveRef)(refIds, ctx.uiMaps, Date.now(), 'click', aw);
225
+ if (!plan.ok)
226
+ return { success: false, text: `invoke_element ref rejected: ${plan.error}`, isError: true };
227
+ if (plan.via === 'name') {
228
+ // Mirror the by-name activation CASCADE: click → select → toggle.
229
+ // A ref to a ListItem / combo-item / checkbox may not support
230
+ // InvokePattern, so we try the three activation verbs in order and
231
+ // stop at the first success — identical logic to the by-name path above.
232
+ const refLadder = ['click', 'select', 'toggle'];
233
+ let refRes = await ctx.platform.invokeElement({ name: plan.name, action: refLadder[0] });
234
+ let refUsed = refLadder[0];
235
+ for (let i = 1; i < refLadder.length && !refRes.success; i++) {
236
+ refUsed = refLadder[i];
237
+ refRes = await ctx.platform.invokeElement({ name: plan.name, action: refUsed });
238
+ }
239
+ await sleep(150);
240
+ return { success: refRes.success, text: refRes.success ? `Invoked "${plan.name}" via a11y${refUsed !== 'click' ? ` (${refUsed})` : ''} (via ${plan.element.id}).` : `a11y invoke of ${plan.element.id} missed.`, targetLabel: plan.name };
241
+ }
242
+ const [bx, by, bw, bh] = plan.bounds;
243
+ await ctx.platform.mouseClick(Math.round(bx + bw / 2), Math.round(by + bh / 2));
244
+ await sleep(150);
245
+ return { success: true, text: `Clicked ${plan.element.id} at its bounds center.`, targetLabel: plan.element.id };
246
+ }
247
+ // `automationId` is accepted for MCP backward-compat but the PlatformAdapter
248
+ // invokeElement interface does not expose automationId filtering — it is used
249
+ // only as a name alias when name is absent.
250
+ const rawName = typeof args.name === 'string' ? args.name : '';
251
+ const automationId = typeof args.automationId === 'string' ? args.automationId : undefined;
252
+ const name = rawName || automationId || '';
253
+ if (!name) {
254
+ return { success: false, text: 'invoke_element: "name" or "automationId" is required (the accessibility name of the element to invoke).' };
255
+ }
256
+ const controlType = typeof args.controlType === 'string' ? args.controlType : undefined;
257
+ const processId = typeof args.processId === 'number' ? args.processId : undefined;
258
+ const VALID_ACTIONS = ['click', 'set-value', 'get-value', 'focus', 'expand', 'collapse'];
259
+ const rawAction = typeof args.action === 'string' ? args.action : 'click';
260
+ const action = VALID_ACTIONS.includes(rawAction)
261
+ ? rawAction
262
+ : 'click';
263
+ const value = typeof args.value === 'string' ? args.value : undefined;
264
+ // OS-AGNOSTIC ACTIVATION CASCADE. "click" is the generic "activate this
265
+ // element" intent — but a named target can be a Button (InvokePattern),
266
+ // a checkbox (TogglePattern), or a ListItem / combo-item
267
+ // (SelectionItemPattern), and the agent operating BLIND can't see which.
268
+ // Live regression 2026-06-07: invoke "Cool blue" (a ListItem) failed
269
+ // because only SelectionItemPattern fit, forcing a coord-click fallback
270
+ // that needs a screenshot — the exact token cost clawdcursor avoids. So
271
+ // for the activate intent we try the activation verbs in order until one
272
+ // takes. EXPLICIT verbs (expand/collapse/get-value/set-value/focus) stay
273
+ // strict — the agent that asked to expand never silently gets a select.
274
+ // Pure adapter-string retries → works on every OS with zero per-OS code,
275
+ // and only the failing path pays the extra round-trips.
276
+ const ladder = action === 'click' ? ['click', 'select', 'toggle'] : [action];
277
+ let res = await ctx.platform.invokeElement({ name, controlType, processId, action: ladder[0], value });
278
+ let used = ladder[0];
279
+ for (let i = 1; i < ladder.length && !res.success; i++) {
280
+ used = ladder[i];
281
+ res = await ctx.platform.invokeElement({ name, controlType, processId, action: used, value });
282
+ }
283
+ await sleep(150);
284
+ return {
285
+ success: res.success,
286
+ text: res.success
287
+ ? (res.data && 'value' in res.data
288
+ ? `Invoked "${name}" (${used}) → value: "${res.data.value}"`
289
+ : `Invoked "${name}" via a11y${used !== 'click' ? ` (${used})` : ''}.`)
290
+ : `a11y invoke "${name}" missed — element not found or not actionable.`,
291
+ targetLabel: name,
292
+ };
293
+ },
294
+ },
295
+ {
296
+ name: 'set_field_value',
297
+ description: 'Set an editable field\'s value directly via accessibility (more reliable than click+type for forms).',
298
+ inputSchema: {
299
+ type: 'object',
300
+ properties: {
301
+ name: { type: 'string', description: 'Accessibility name of the field' },
302
+ value: { type: 'string' },
303
+ controlType: { type: 'string', description: 'Optional role filter (e.g. "Edit")' },
304
+ processId: { type: 'number' },
305
+ element_id: { type: 'string', description: 'Target a compiled element from compile_ui (requires snapshot_id)' },
306
+ snapshot_id: { type: 'string', description: 'The compile_ui snapshot the element_id came from (requires element_id)' },
307
+ expect: EXPECT_SCHEMA,
308
+ },
309
+ required: ['value'],
310
+ additionalProperties: false,
311
+ },
312
+ changesScreen: true,
313
+ async execute(args, ctx) {
314
+ const refIds = { element_id: typeof args.element_id === 'string' ? args.element_id : undefined,
315
+ snapshot_id: typeof args.snapshot_id === 'string' ? args.snapshot_id : undefined };
316
+ if (refIds.element_id || refIds.snapshot_id) {
317
+ const fillValue = String(args.value ?? '');
318
+ const aw = await ctx.platform.getActiveWindow().catch(() => null);
319
+ const plan = (0, ui_map_resolve_1.resolveRef)(refIds, ctx.uiMaps, Date.now(), 'fill', aw);
320
+ if (!plan.ok)
321
+ return { success: false, text: `set_field_value ref rejected: ${plan.error}`, isError: true };
322
+ if (plan.via === 'name') {
323
+ const res = await ctx.platform.invokeElement({ name: plan.name, action: 'set-value', value: fillValue });
324
+ await sleep(150);
325
+ return { success: res.success, text: res.success ? `Set "${plan.name}" = ${fillValue.length} chars (via ${plan.element.id}).` : `Set of ${plan.element.id} failed.`, targetLabel: plan.name };
326
+ }
327
+ const [bx, by, bw, bh] = plan.bounds;
328
+ await ctx.platform.mouseClick(Math.round(bx + bw / 2), Math.round(by + bh / 2));
329
+ await ctx.platform.typeText(fillValue);
330
+ await sleep(150);
331
+ return { success: true, text: `Filled ${plan.element.id} via bounds + type (${fillValue.length} chars).`, targetLabel: plan.element.id };
332
+ }
333
+ const name = String(args.name ?? '');
334
+ const value = String(args.value ?? '');
335
+ const controlType = typeof args.controlType === 'string' ? args.controlType : undefined;
336
+ const processId = typeof args.processId === 'number' ? args.processId : undefined;
337
+ const res = await ctx.platform.invokeElement({ name, controlType, processId, action: 'set-value', value });
338
+ await sleep(150);
339
+ return {
340
+ success: res.success,
341
+ text: res.success ? `Set "${name}" = ${value.length} chars` : `Set "${name}" failed.`,
342
+ targetLabel: name,
343
+ };
344
+ },
345
+ },
346
+ // ─── A11Y DEPTH (Tranche 2) ────────────────────────────────
347
+ {
348
+ name: 'a11y_expand',
349
+ description: 'Expand a tree node / combo / disclosure by a11y name (UIA ExpandCollapsePattern, AX AXExpanded).',
350
+ inputSchema: {
351
+ type: 'object',
352
+ properties: {
353
+ name: { type: 'string' },
354
+ controlType: { type: 'string' },
355
+ processId: { type: 'number' },
356
+ },
357
+ required: ['name'],
358
+ additionalProperties: false,
359
+ },
360
+ changesScreen: true,
361
+ async execute(args, ctx) {
362
+ const name = String(args.name ?? '');
363
+ const res = await ctx.platform.invokeElement({
364
+ name,
365
+ controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
366
+ processId: await resolveAgentPid(args, ctx),
367
+ action: 'expand',
368
+ });
369
+ return {
370
+ success: res.success,
371
+ text: res.success ? `Expanded "${name}".` : `Could not expand "${name}".`,
372
+ targetLabel: name,
373
+ };
374
+ },
375
+ },
376
+ {
377
+ name: 'a11y_collapse',
378
+ description: 'Collapse a tree node / combo / disclosure by a11y name.',
379
+ inputSchema: {
380
+ type: 'object',
381
+ properties: {
382
+ name: { type: 'string' },
383
+ controlType: { type: 'string' },
384
+ processId: { type: 'number' },
385
+ },
386
+ required: ['name'],
387
+ additionalProperties: false,
388
+ },
389
+ changesScreen: true,
390
+ async execute(args, ctx) {
391
+ const name = String(args.name ?? '');
392
+ const res = await ctx.platform.invokeElement({
393
+ name,
394
+ controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
395
+ processId: await resolveAgentPid(args, ctx),
396
+ action: 'collapse',
397
+ });
398
+ return {
399
+ success: res.success,
400
+ text: res.success ? `Collapsed "${name}".` : `Could not collapse "${name}".`,
401
+ targetLabel: name,
402
+ };
403
+ },
404
+ },
405
+ {
406
+ name: 'a11y_toggle',
407
+ description: 'Toggle a checkbox / switch / toggle-button by a11y name. Returns new state (On/Off/Indeterminate).',
408
+ inputSchema: {
409
+ type: 'object',
410
+ properties: {
411
+ name: { type: 'string' },
412
+ controlType: { type: 'string' },
413
+ processId: { type: 'number' },
414
+ },
415
+ required: ['name'],
416
+ additionalProperties: false,
417
+ },
418
+ changesScreen: true,
419
+ async execute(args, ctx) {
420
+ const name = String(args.name ?? '');
421
+ const res = await ctx.platform.invokeElement({
422
+ name,
423
+ controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
424
+ processId: await resolveAgentPid(args, ctx),
425
+ action: 'toggle',
426
+ });
427
+ if (!res.success)
428
+ return { success: false, text: `Could not toggle "${name}".`, targetLabel: name };
429
+ const state = res.data?.toggleState ?? 'unknown';
430
+ return { success: true, text: `Toggled "${name}" → ${state}.`, targetLabel: name };
431
+ },
432
+ },
433
+ {
434
+ name: 'a11y_select',
435
+ description: 'Select a list item / tab / radio by a11y name (UIA SelectionItemPattern, AX AXSelected).',
436
+ inputSchema: {
437
+ type: 'object',
438
+ properties: {
439
+ name: { type: 'string' },
440
+ controlType: { type: 'string' },
441
+ processId: { type: 'number' },
442
+ },
443
+ required: ['name'],
444
+ additionalProperties: false,
445
+ },
446
+ changesScreen: true,
447
+ async execute(args, ctx) {
448
+ const name = String(args.name ?? '');
449
+ const res = await ctx.platform.invokeElement({
450
+ name,
451
+ controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
452
+ processId: await resolveAgentPid(args, ctx),
453
+ action: 'select',
454
+ });
455
+ return {
456
+ success: res.success,
457
+ text: res.success ? `Selected "${name}".` : `Could not select "${name}".`,
458
+ targetLabel: name,
459
+ };
460
+ },
461
+ },
462
+ {
463
+ name: 'a11y_get_value',
464
+ description: 'Read the current value of a named field (UIA ValuePattern / AX AXValue). Useful to verify before typing.',
465
+ inputSchema: {
466
+ type: 'object',
467
+ properties: {
468
+ name: { type: 'string' },
469
+ controlType: { type: 'string' },
470
+ processId: { type: 'number' },
471
+ },
472
+ required: ['name'],
473
+ additionalProperties: false,
474
+ },
475
+ changesScreen: false,
476
+ async execute(args, ctx) {
477
+ const name = String(args.name ?? '');
478
+ const res = await ctx.platform.invokeElement({
479
+ name,
480
+ controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
481
+ processId: await resolveAgentPid(args, ctx),
482
+ action: 'get-value',
483
+ });
484
+ if (!res.success)
485
+ return { success: false, text: `"${name}" has no readable value.` };
486
+ const value = res.data?.value ?? '';
487
+ return { success: true, text: (0, prompt_1.wrapUntrustedScreenContent)(`"${name}" = "${truncate(String(value), 120)}"`) };
488
+ },
489
+ },
490
+ {
491
+ name: 'verify',
492
+ description: 'Deterministically check CURRENT state against machine-checkable assertions — the harness executes them, no guessing. Types: window_title_contains{value}, app_running{name}, element_exists{name}, element_value_contains{name,value}, clipboard_contains{value}, file_exists{path}, file_contains{path,value}, ocr_contains{value}, file_changed_since_start{path} (proves a file was written during THIS task). Cheaper and more reliable than a screenshot — use after a critical step or before done().',
493
+ inputSchema: {
494
+ type: 'object',
495
+ properties: {
496
+ assertions: {
497
+ type: 'array',
498
+ description: 'Up to 8 assertions, each {type, ...fields} per the types listed in the tool description.',
499
+ items: { type: 'object' },
500
+ },
501
+ },
502
+ required: ['assertions'],
503
+ additionalProperties: false,
504
+ },
505
+ changesScreen: false,
506
+ async execute(args, ctx) {
507
+ const parsed = (0, assertions_1.parseAssertions)(args.assertions);
508
+ if ('error' in parsed)
509
+ return { success: false, text: `verify rejected: ${parsed.error}` };
510
+ const report = await (0, assertions_1.checkAssertions)(parsed.assertions, {
511
+ adapter: ctx.platform,
512
+ ocrText: async () => (await getAgentOcr().recognizeScreen()).fullText ?? '',
513
+ });
514
+ return {
515
+ success: report.ok,
516
+ text: `${report.ok ? 'VERIFIED' : `FAILED ${report.failed}/${report.outcomes.length}`}:\n${(0, assertions_1.renderReport)(report)}`,
517
+ };
518
+ },
519
+ },
520
+ {
521
+ name: 'get_element_state',
522
+ description: 'Get state flags of a named element (focused/enabled/disabled/selected/busy/offscreen/expandable/expanded).',
523
+ inputSchema: {
524
+ type: 'object',
525
+ properties: {
526
+ name: { type: 'string' },
527
+ controlType: { type: 'string' },
528
+ processId: { type: 'number' },
529
+ },
530
+ required: ['name'],
531
+ additionalProperties: false,
532
+ },
533
+ changesScreen: false,
534
+ async execute(args, ctx) {
535
+ const name = String(args.name ?? '');
536
+ const hits = await ctx.platform.findElements({
537
+ name,
538
+ controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
539
+ processId: await resolveAgentPid(args, ctx),
540
+ });
541
+ if (hits.length === 0)
542
+ return { success: false, text: `No element named "${name}".` };
543
+ const el = hits[0];
544
+ return {
545
+ success: true,
546
+ text: JSON.stringify({
547
+ name: el.name,
548
+ controlType: el.controlType,
549
+ focused: el.focused ?? false,
550
+ enabled: el.enabled ?? true,
551
+ disabled: el.disabled ?? false,
552
+ selected: el.selected ?? false,
553
+ busy: el.busy ?? false,
554
+ offscreen: el.offscreen ?? false,
555
+ expandable: el.expandable ?? false,
556
+ expanded: el.expanded ?? false,
557
+ }),
558
+ };
559
+ },
560
+ },
561
+ // ─── INPUT (mouse) ──────────────────────────────────────────
562
+ {
563
+ name: 'click',
564
+ description: 'Click at (x,y). The default coordinate space follows context (image-space while a screenshot is in your context, else screen-space) — pass `space` explicitly when mixing sources: space:"screen" for a11y/@x,y map coords, space:"image" for coords read off the screenshot. Prefer invoke_element when the target has an a11y name.',
565
+ inputSchema: {
566
+ type: 'object',
567
+ properties: {
568
+ x: { type: 'number' },
569
+ y: { type: 'number' },
570
+ button: { type: 'string', enum: ['left', 'right'] },
571
+ count: { type: 'number', description: '1=single, 2=double' },
572
+ space: COORD_SPACE_SCHEMA,
573
+ expect: EXPECT_SCHEMA,
574
+ },
575
+ required: ['x', 'y'],
576
+ additionalProperties: false,
577
+ },
578
+ changesScreen: true,
579
+ async execute(args, ctx) {
580
+ const { x: ix, y: iy, warning } = coerceCoord(args.x, args.y);
581
+ if (!Number.isFinite(ix) || !Number.isFinite(iy)) {
582
+ return { success: false, isError: true, text: `click: x/y must be finite numbers, got x=${JSON.stringify(args.x)} y=${JSON.stringify(args.y)}` };
583
+ }
584
+ const button = args.button === 'right' ? 'right' : 'left';
585
+ const count = args.count === 2 ? 2 : 1;
586
+ // SCALE: 'image' coords (read off the 1280-wide screenshot) → physical;
587
+ // 'screen'/default (a11y coords, already physical) → pass through.
588
+ // Explicit space wins; else use ctx.coordSpaceDefault (set to 'image' on
589
+ // vision turns by the agent loop); fall back to 'screen'.
590
+ const space = args.space === 'image' ? 'image' : args.space === 'screen' ? 'screen' : (ctx.coordSpaceDefault ?? 'screen');
591
+ const scale = space === 'image' ? (0, coord_scale_1.imageScale)(ctx) : 1;
592
+ const x = (0, coord_scale_1.scaleCoord)(ix, scale);
593
+ const y = (0, coord_scale_1.scaleCoord)(iy, scale);
594
+ const fg0 = await ctx.platform.getActiveWindow().catch(() => null);
595
+ const raised = await (0, focus_guard_1.ensureTargetForeground)(ctx, fg0);
596
+ const before = raised ? await ctx.platform.getActiveWindow().catch(() => null) : fg0;
597
+ const activation = await ctx.platform.mouseClick(x, y, { button, count });
598
+ await sleep(150);
599
+ const after = await ctx.platform.getActiveWindow().catch(() => null);
600
+ const note = warning ? ` (${warning})` : '';
601
+ const focusWarn = focusTheftWarning(activation, before, after);
602
+ return { success: true, text: `Clicked ${button} x${count} at ${coordBreadcrumb(ix, iy, x, y, space, scale, ctx)}${raised}${focusBreadcrumb(before, after)}${note}${focusWarn}` };
603
+ },
604
+ },
605
+ {
606
+ name: 'drag',
607
+ description: 'Drag the mouse from (startX,startY) to (endX,endY) — select text, draw, resize. To TRACE A CURVE/PATH (gesture, curved track, drawing), pass `path` = an array of 12–20 {x,y} points instead: press at the first point, move through each, release at the last. The default coordinate space follows context; if you read coords off the SCREENSHOT, pass space:"image" so the tool scales them.',
608
+ inputSchema: {
609
+ type: 'object',
610
+ properties: {
611
+ startX: { type: 'number' },
612
+ startY: { type: 'number' },
613
+ endX: { type: 'number' },
614
+ endY: { type: 'number' },
615
+ path: {
616
+ type: 'array',
617
+ description: 'Stepped drag path: array of {x,y} points (min 2). When given, startX/startY/endX/endY are ignored. Press at first point, release at last.',
618
+ items: { type: 'object', properties: { x: { type: 'number' }, y: { type: 'number' } }, required: ['x', 'y'] },
619
+ },
620
+ space: COORD_SPACE_SCHEMA,
621
+ expect: EXPECT_SCHEMA,
622
+ },
623
+ additionalProperties: false,
624
+ },
625
+ changesScreen: true,
626
+ async execute(args, ctx) {
627
+ const space = args.space === 'image' ? 'image' : args.space === 'screen' ? 'screen' : (ctx.coordSpaceDefault ?? 'screen');
628
+ const scale = space === 'image' ? (0, coord_scale_1.imageScale)(ctx) : 1;
629
+ // Stepped path variant: press at the first point, walk the rest,
630
+ // release at the last (canvas tracing — same gesture the MCP-side
631
+ // mouse_drag_stepped performs).
632
+ if (args.path !== undefined) {
633
+ let pts;
634
+ try {
635
+ pts = typeof args.path === 'string' ? JSON.parse(args.path) : args.path;
636
+ }
637
+ catch {
638
+ return { success: false, isError: true, text: 'drag: `path` must be an array of {x,y} points' };
639
+ }
640
+ if (!Array.isArray(pts) || pts.length < 2 || !pts.every(p => p && Number.isFinite(Number(p.x)) && Number.isFinite(Number(p.y)))) {
641
+ return { success: false, isError: true, text: 'drag: `path` needs at least 2 {x,y} points with finite coords' };
642
+ }
643
+ const scaled = pts.map(p => ({ x: (0, coord_scale_1.scaleCoord)(Number(p.x), scale), y: (0, coord_scale_1.scaleCoord)(Number(p.y), scale) }));
644
+ const fg0p = await ctx.platform.getActiveWindow().catch(() => null);
645
+ const raisedP = await (0, focus_guard_1.ensureTargetForeground)(ctx, fg0p);
646
+ const beforeP = raisedP ? await ctx.platform.getActiveWindow().catch(() => null) : fg0p;
647
+ await ctx.platform.mouseMove(scaled[0].x, scaled[0].y);
648
+ await ctx.platform.mouseDown('left');
649
+ try {
650
+ for (let i = 1; i < scaled.length; i++) {
651
+ await ctx.platform.mouseMove(scaled[i].x, scaled[i].y);
652
+ await sleep(16); // let the app register the motion between segments
653
+ }
654
+ }
655
+ finally {
656
+ await ctx.platform.mouseUp('left');
657
+ }
658
+ await sleep(200);
659
+ const afterP = await ctx.platform.getActiveWindow().catch(() => null);
660
+ return { success: true, text: `Stepped-drag through ${pts.length} ${space} points → screen (${scaled[0].x},${scaled[0].y})…(${scaled[scaled.length - 1].x},${scaled[scaled.length - 1].y}) [×${scale}]${raisedP}${focusBreadcrumb(beforeP, afterP)}` };
661
+ }
662
+ const start = coerceCoord(args.startX, args.startY);
663
+ const end = coerceCoord(args.endX, args.endY);
664
+ if (![start.x, start.y, end.x, end.y].every(Number.isFinite)) {
665
+ return { success: false, isError: true, text: `drag: startX/startY/endX/endY must be finite numbers (or pass \`path\`), got ${JSON.stringify(args)}` };
666
+ }
667
+ const sx = (0, coord_scale_1.scaleCoord)(start.x, scale), sy = (0, coord_scale_1.scaleCoord)(start.y, scale);
668
+ const ex = (0, coord_scale_1.scaleCoord)(end.x, scale), ey = (0, coord_scale_1.scaleCoord)(end.y, scale);
669
+ const fg0 = await ctx.platform.getActiveWindow().catch(() => null);
670
+ const raised = await (0, focus_guard_1.ensureTargetForeground)(ctx, fg0);
671
+ const before = raised ? await ctx.platform.getActiveWindow().catch(() => null) : fg0;
672
+ await ctx.platform.mouseDrag(sx, sy, ex, ey);
673
+ await sleep(200);
674
+ const after = await ctx.platform.getActiveWindow().catch(() => null);
675
+ return { success: true, text: `Dragged ${space} (${start.x},${start.y})→(${end.x},${end.y}) → screen (${sx},${sy})→(${ex},${ey}) [×${scale}]${raised}${focusBreadcrumb(before, after)}` };
676
+ },
677
+ },
678
+ {
679
+ name: 'move',
680
+ description: 'Move the cursor to (x,y) WITHOUT clicking — hover/dwell over a target (pair with wait(ms) for a required dwell time). The default coordinate space follows context; pass space:"image" for coords read off the screenshot.',
681
+ inputSchema: {
682
+ type: 'object',
683
+ properties: {
684
+ x: { type: 'number' },
685
+ y: { type: 'number' },
686
+ space: COORD_SPACE_SCHEMA,
687
+ },
688
+ required: ['x', 'y'],
689
+ additionalProperties: false,
690
+ },
691
+ changesScreen: false,
692
+ async execute(args, ctx) {
693
+ const c = coerceCoord(args.x, args.y);
694
+ if (!Number.isFinite(c.x) || !Number.isFinite(c.y)) {
695
+ return { success: false, isError: true, text: `move: x/y must be finite numbers, got x=${JSON.stringify(args.x)} y=${JSON.stringify(args.y)}` };
696
+ }
697
+ const space = args.space === 'image' ? 'image' : args.space === 'screen' ? 'screen' : (ctx.coordSpaceDefault ?? 'screen');
698
+ const scale = space === 'image' ? (0, coord_scale_1.imageScale)(ctx) : 1;
699
+ const x = (0, coord_scale_1.scaleCoord)(c.x, scale), y = (0, coord_scale_1.scaleCoord)(c.y, scale);
700
+ await ctx.platform.mouseMove(x, y);
701
+ return { success: true, text: `Cursor moved (hover) to ${space} (${c.x},${c.y}) → screen (${x},${y}) [×${scale}]` };
702
+ },
703
+ },
704
+ {
705
+ name: 'scroll',
706
+ description: 'Scroll at (x,y) in a direction. Omit x,y to scroll at the screen center. If you read x,y off the SCREENSHOT, pass space:"image".',
707
+ inputSchema: {
708
+ type: 'object',
709
+ properties: {
710
+ x: { type: 'number' },
711
+ y: { type: 'number' },
712
+ direction: { type: 'string', enum: ['up', 'down'] },
713
+ amount: { type: 'number', description: 'Wheel ticks (default 3)' },
714
+ space: COORD_SPACE_SCHEMA,
715
+ },
716
+ required: ['direction'],
717
+ additionalProperties: false,
718
+ },
719
+ changesScreen: true,
720
+ async execute(args, ctx) {
721
+ const dir = args.direction === 'up' ? 'up' : 'down';
722
+ const amount = typeof args.amount === 'number' ? args.amount : 3;
723
+ // Default to screen-center when x/y missing; coerce strings via the helper.
724
+ const hasXY = args.x !== undefined || args.y !== undefined;
725
+ const space = args.space === 'image' ? 'image' : args.space === 'screen' ? 'screen' : (ctx.coordSpaceDefault ?? 'screen');
726
+ const scale = space === 'image' ? (0, coord_scale_1.imageScale)(ctx) : 1;
727
+ // No-coordinate default: center of the screen IN THE DRIVER'S SPACE
728
+ // (logical points on macOS, physical px elsewhere) — physicalWidth/2
729
+ // mislanded 2× off on Retina (audit 2026-06-11, M3).
730
+ const center = (0, coord_scale_1.screenCenter)(ctx);
731
+ let x = center.x;
732
+ let y = center.y;
733
+ if (hasXY) {
734
+ const c = coerceCoord(args.x, args.y);
735
+ if (Number.isFinite(c.x) && Number.isFinite(c.y)) {
736
+ x = (0, coord_scale_1.scaleCoord)(c.x, scale);
737
+ y = (0, coord_scale_1.scaleCoord)(c.y, scale);
738
+ }
739
+ }
740
+ await ctx.platform.mouseScroll(x, y, dir, amount);
741
+ await sleep(150);
742
+ return { success: true, text: `Scrolled ${dir} ${amount} at (${x},${y})` };
743
+ },
744
+ },
745
+ // ─── INPUT (keyboard) ───────────────────────────────────────
746
+ {
747
+ name: 'type',
748
+ description: 'Type text into the currently focused input. Prefer set_field_value when a field has an a11y name.',
749
+ inputSchema: {
750
+ type: 'object',
751
+ properties: {
752
+ text: { type: 'string' },
753
+ expect: EXPECT_SCHEMA,
754
+ },
755
+ required: ['text'],
756
+ additionalProperties: false,
757
+ },
758
+ changesScreen: true,
759
+ async execute(args, ctx) {
760
+ const text = String(args.text ?? '');
761
+ if (!text)
762
+ return { success: true, text: 'Typed 0 chars' };
763
+ // FAST PATH: paste via the clipboard (one Ctrl/Cmd+V — instant) instead
764
+ // of per-keystroke typing, which is visibly slow on anything longer than
765
+ // a few chars (~20ms/char). This is the legacy smart_type mechanism.
766
+ // Save + restore the prior clipboard so a pending copy isn't clobbered
767
+ // (e.g. a copy→paste→type flow). mod+v is portable across OSes.
768
+ // Char-by-char is kept as a fallback for fields that reject paste.
769
+ try {
770
+ const prior = await ctx.platform.readClipboard().catch(() => '');
771
+ await ctx.platform.writeClipboard(text);
772
+ await sleep(40);
773
+ await ctx.platform.keyPress('mod+v');
774
+ await sleep(150);
775
+ await ctx.platform.writeClipboard(prior).catch(() => { });
776
+ return { success: true, text: `Typed ${text.length} chars (paste): "${truncate(text, 60)}"` };
777
+ }
778
+ catch {
779
+ await ctx.platform.typeText(text);
780
+ await sleep(200);
781
+ return { success: true, text: `Typed ${text.length} chars: "${truncate(text, 60)}"` };
782
+ }
783
+ },
784
+ },
785
+ {
786
+ name: 'key',
787
+ description: 'Press a key or key combo. Use "mod" for Ctrl/Cmd. Use "+" for a chord (e.g. "mod+s", "shift+Tab"). Space-separate for a sequence ("Down Down End"). Examples: "Return", "Tab", "Escape", "F5", "ctrl+a".',
788
+ inputSchema: {
789
+ type: 'object',
790
+ properties: {
791
+ // `combo` is the canonical System B name. `key` is accepted as a
792
+ // backward-compatible alias (matches the MCP surface param name
793
+ // `key_press.key` and the compound surface alias).
794
+ combo: { type: 'string', description: 'Key/combo to press (e.g. "Return", "mod+s"). Space-separate for a sequence.' },
795
+ key: { type: 'string', description: 'Alias for combo — accepted for MCP/compound backward-compatibility.' },
796
+ expect: EXPECT_SCHEMA,
797
+ },
798
+ // Neither is required at the JSON-Schema level so the validator passes
799
+ // when only one is provided; the execute() guard catches a total absence.
800
+ additionalProperties: false,
801
+ },
802
+ changesScreen: true,
803
+ async execute(args, ctx) {
804
+ // (b) Accept `key` as a backward-compatible alias for `combo`.
805
+ const raw = (args.combo ?? args.key);
806
+ // (a) Guard: missing or empty argument → actionable error instead of crash.
807
+ if (raw === undefined || raw === null || String(raw).trim() === '') {
808
+ return {
809
+ success: false,
810
+ isError: true,
811
+ text: 'key: "combo" is required — the key or combo to press, e.g. "Return" or "mod+s". (The MCP surface alias is "key".)',
812
+ };
813
+ }
814
+ const input = String(raw).trim();
815
+ // Dangerous key combos that are blocked (mirrors System A BLOCKED_KEYS).
816
+ const BLOCKED = ['alt+f4', 'ctrl+alt+delete', 'ctrl+alt+del'];
817
+ // (b) "+" joins a chord; whitespace separates combos pressed in sequence.
818
+ const combos = input.split(/\s+/);
819
+ // (c) BLOCKED_KEYS guard — check every combo in the sequence.
820
+ for (const c of combos) {
821
+ const norm = c.toLowerCase().replace(/\s+/g, '');
822
+ if (BLOCKED.some(b => norm === b)) {
823
+ return { success: false, isError: true, text: `BLOCKED: "${c}" is a dangerous key combo.` };
824
+ }
825
+ }
826
+ for (const c of combos) {
827
+ await ctx.platform.keyPress(c);
828
+ if (combos.length > 1)
829
+ await sleep(50); // brief gap between sequence steps
830
+ }
831
+ await sleep(150);
832
+ return { success: true, text: `Pressed ${input}` };
833
+ },
834
+ },
835
+ // ─── APPS & WINDOWS ─────────────────────────────────────────
836
+ {
837
+ name: 'open_app',
838
+ description: 'Open an application by name (e.g. "Notepad", "TextEdit", "Safari").',
839
+ inputSchema: {
840
+ type: 'object',
841
+ properties: { name: { type: 'string' } },
842
+ required: ['name'],
843
+ additionalProperties: false,
844
+ },
845
+ changesScreen: true,
846
+ async execute(args, ctx) {
847
+ const name = String(args.name ?? '');
848
+ // Alias resolution lives at the agent-tool layer (PR1 of v0.9):
849
+ // the platform adapter is alias-data-agnostic, so we look up the
850
+ // canonical row here and forward the launch hints through
851
+ // `launchApp` opts. Cross-OS name mapping (Windows "Notepad" → mac
852
+ // "TextEdit") and UWP / executable / searchTerm details all flow
853
+ // through this single resolution point.
854
+ const alias = (0, aliases_1.resolveAlias)(name);
855
+ const platform = ctx.platform.platform;
856
+ // Pick the right name to hand to the platform launcher per OS.
857
+ // Falls back to the raw `name` when no alias matches.
858
+ let launchName = name;
859
+ if (alias) {
860
+ if (platform === 'darwin') {
861
+ launchName = alias.macOSAppName ?? name;
862
+ }
863
+ else if (platform === 'win32') {
864
+ launchName = alias.executable ?? name;
865
+ }
866
+ else {
867
+ // Linux: use the alias's executable but strip any `.exe`
868
+ // suffix that's there for the Windows path.
869
+ launchName = alias.executable?.replace(/\.exe$/i, '') ?? name;
870
+ }
871
+ }
872
+ const res = await ctx.platform.launchApp(launchName, {
873
+ alwaysNewInstance: alias?.alwaysNewInstance,
874
+ uwpAppId: alias?.uwpAppId,
875
+ // Pick the searchTerm that gives the OS native launcher (Start
876
+ // Menu / Spotlight) the best chance of resolving to the right
877
+ // app — alias.searchTerm wins when present, mac falls back to
878
+ // the bundle name.
879
+ searchTerm: alias?.searchTerm
880
+ ?? (platform === 'darwin' ? alias?.macOSAppName : undefined),
881
+ });
882
+ await sleep(800);
883
+ return {
884
+ success: true,
885
+ text: res.title ? `Opened "${name}" (pid=${res.pid}, window="${res.title}")` : `Launched "${name}" (no window surfaced yet)`,
886
+ };
887
+ },
888
+ },
889
+ {
890
+ name: 'focus_window',
891
+ description: 'Bring a window to the foreground. Match by processName, pid, or title substring.',
892
+ inputSchema: {
893
+ type: 'object',
894
+ properties: {
895
+ processName: { type: 'string' },
896
+ processId: { type: 'number' },
897
+ title: { type: 'string' },
898
+ },
899
+ additionalProperties: false,
900
+ },
901
+ changesScreen: true,
902
+ async execute(args, ctx) {
903
+ const q = {};
904
+ if (typeof args.processName === 'string')
905
+ q.processName = args.processName;
906
+ if (typeof args.processId === 'number')
907
+ q.processId = args.processId;
908
+ if (typeof args.title === 'string')
909
+ q.title = args.title;
910
+ const ok = await ctx.platform.focusWindow(q);
911
+ await sleep(250);
912
+ return { success: ok, text: ok ? 'Focused matching window.' : 'No matching window found.' };
913
+ },
914
+ },
915
+ // ─── WINDOW STATE + BOUNDS (Tranche 1B primitives) ──────────
916
+ {
917
+ name: 'maximize_window',
918
+ description: 'Maximize the foreground window (or a matched window). Polite request; WM may interpret.',
919
+ inputSchema: {
920
+ type: 'object',
921
+ properties: {
922
+ processName: { type: 'string' },
923
+ processId: { type: 'number' },
924
+ title: { type: 'string' },
925
+ },
926
+ additionalProperties: false,
927
+ },
928
+ changesScreen: true,
929
+ async execute(args, ctx) {
930
+ const q = buildWinQuery(args);
931
+ const ok = await ctx.platform.setWindowState('maximize', q);
932
+ return { success: ok, text: ok ? 'Maximized window.' : 'Maximize request ignored.' };
933
+ },
934
+ },
935
+ {
936
+ name: 'minimize_window',
937
+ description: 'Minimize the foreground or matched window to the taskbar / Dock.',
938
+ inputSchema: {
939
+ type: 'object',
940
+ properties: {
941
+ processName: { type: 'string' },
942
+ processId: { type: 'number' },
943
+ title: { type: 'string' },
944
+ },
945
+ additionalProperties: false,
946
+ },
947
+ changesScreen: true,
948
+ async execute(args, ctx) {
949
+ const q = buildWinQuery(args);
950
+ const ok = await ctx.platform.setWindowState('minimize', q);
951
+ return { success: ok, text: ok ? 'Minimized window.' : 'Minimize request failed.' };
952
+ },
953
+ },
954
+ {
955
+ name: 'restore_window',
956
+ description: 'Restore a minimized or maximized window to its previous bounds.',
957
+ inputSchema: {
958
+ type: 'object',
959
+ properties: {
960
+ processName: { type: 'string' },
961
+ processId: { type: 'number' },
962
+ title: { type: 'string' },
963
+ },
964
+ additionalProperties: false,
965
+ },
966
+ changesScreen: true,
967
+ async execute(args, ctx) {
968
+ const q = buildWinQuery(args);
969
+ const ok = await ctx.platform.setWindowState('normal', q);
970
+ return { success: ok, text: ok ? 'Restored window.' : 'Restore request failed.' };
971
+ },
972
+ },
973
+ {
974
+ name: 'close_window',
975
+ description: 'Polite close request (WM_CLOSE / AXCloseAction / _NET_CLOSE_WINDOW). App may prompt.',
976
+ inputSchema: {
977
+ type: 'object',
978
+ properties: {
979
+ processName: { type: 'string' },
980
+ processId: { type: 'number' },
981
+ title: { type: 'string' },
982
+ },
983
+ additionalProperties: false,
984
+ },
985
+ changesScreen: true,
986
+ async execute(args, ctx) {
987
+ const q = buildWinQuery(args);
988
+ const ok = await ctx.platform.setWindowState('close', q);
989
+ return { success: ok, text: ok ? 'Close request posted.' : 'Close request failed.', targetLabel: 'close_window' };
990
+ },
991
+ },
992
+ {
993
+ name: 'resize_window',
994
+ description: 'Set the foreground (or matched) window bounds in logical pixels. Omitted fields preserved.',
995
+ inputSchema: {
996
+ type: 'object',
997
+ properties: {
998
+ x: { type: 'number' }, y: { type: 'number' },
999
+ width: { type: 'number' }, height: { type: 'number' },
1000
+ processName: { type: 'string' },
1001
+ processId: { type: 'number' },
1002
+ title: { type: 'string' },
1003
+ },
1004
+ additionalProperties: false,
1005
+ },
1006
+ changesScreen: true,
1007
+ async execute(args, ctx) {
1008
+ const q = buildWinQuery(args);
1009
+ const x = typeof args.x === 'number' ? args.x : undefined;
1010
+ const y = typeof args.y === 'number' ? args.y : undefined;
1011
+ const width = typeof args.width === 'number' ? args.width : undefined;
1012
+ const height = typeof args.height === 'number' ? args.height : undefined;
1013
+ const ok = await ctx.platform.setWindowBounds({ x, y, width, height }, q);
1014
+ return { success: ok, text: ok ? `Resized window (x=${x ?? '-'}, y=${y ?? '-'}, w=${width ?? '-'}, h=${height ?? '-'}).` : 'Resize failed.' };
1015
+ },
1016
+ },
1017
+ {
1018
+ name: 'list_displays',
1019
+ description: 'Enumerate connected displays with logical bounds + DPI ratio. Use before display-specific screenshots.',
1020
+ inputSchema: { type: 'object', properties: {}, additionalProperties: false },
1021
+ changesScreen: false,
1022
+ async execute(_args, ctx) {
1023
+ const displays = await ctx.platform.listDisplays();
1024
+ return { success: true, text: JSON.stringify(displays) };
1025
+ },
1026
+ },
1027
+ {
1028
+ name: 'switch_tab_os',
1029
+ description: 'Cycle next/previous browser tab (mod+Tab / mod+Shift+Tab) or jump to tab N (mod+1..9).',
1030
+ inputSchema: {
1031
+ type: 'object',
1032
+ properties: {
1033
+ index: { type: 'number', description: '1-9 for direct tab jump' },
1034
+ direction: { type: 'string', enum: ['next', 'previous'] },
1035
+ },
1036
+ additionalProperties: false,
1037
+ },
1038
+ changesScreen: true,
1039
+ async execute(args, ctx) {
1040
+ if (typeof args.index === 'number') {
1041
+ const n = Math.max(1, Math.min(9, Math.floor(args.index)));
1042
+ await ctx.platform.keyPress(`mod+${n}`);
1043
+ return { success: true, text: `Switched to tab ${n}` };
1044
+ }
1045
+ const dir = args.direction === 'previous' ? 'previous' : 'next';
1046
+ await ctx.platform.keyPress(dir === 'next' ? 'mod+Tab' : 'mod+shift+Tab');
1047
+ return { success: true, text: `Cycled to ${dir} tab` };
1048
+ },
1049
+ },
1050
+ // ─── ACCESSIBILITY DEPTH (Tranche 1B) ───────────────────────
1051
+ {
1052
+ name: 'focus_element',
1053
+ description: 'Keyboard-focus an element by a11y name. Does NOT raise window — use focus_window first if needed.',
1054
+ inputSchema: {
1055
+ type: 'object',
1056
+ properties: {
1057
+ name: { type: 'string' },
1058
+ controlType: { type: 'string' },
1059
+ processId: { type: 'number' },
1060
+ },
1061
+ required: ['name'],
1062
+ additionalProperties: false,
1063
+ },
1064
+ changesScreen: true,
1065
+ async execute(args, ctx) {
1066
+ const name = String(args.name ?? '');
1067
+ const result = await ctx.platform.invokeElement({
1068
+ name,
1069
+ controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
1070
+ processId: typeof args.processId === 'number' ? args.processId : undefined,
1071
+ action: 'focus',
1072
+ });
1073
+ return {
1074
+ success: result.success,
1075
+ text: result.success ? `Focused "${name}" via a11y.` : `Could not focus "${name}".`,
1076
+ targetLabel: name,
1077
+ };
1078
+ },
1079
+ },
1080
+ {
1081
+ name: 'wait_for_element',
1082
+ description: 'Poll the a11y tree until an element matching name/controlType appears. Useful after an action spawns a dialog.',
1083
+ inputSchema: {
1084
+ type: 'object',
1085
+ properties: {
1086
+ name: { type: 'string' },
1087
+ controlType: { type: 'string' },
1088
+ processId: { type: 'number' },
1089
+ timeoutMs: { type: 'number', description: 'Default 5000', maximum: 30000 },
1090
+ intervalMs: { type: 'number', description: 'Default 250' },
1091
+ },
1092
+ additionalProperties: false,
1093
+ },
1094
+ changesScreen: false,
1095
+ async execute(args, ctx) {
1096
+ const timeout = typeof args.timeoutMs === 'number' ? Math.min(30000, args.timeoutMs) : 5000;
1097
+ const element = await ctx.platform.waitForElement({
1098
+ name: typeof args.name === 'string' ? args.name : undefined,
1099
+ controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
1100
+ processId: typeof args.processId === 'number' ? args.processId : undefined,
1101
+ intervalMs: typeof args.intervalMs === 'number' ? args.intervalMs : 250,
1102
+ }, timeout);
1103
+ if (!element)
1104
+ return { success: false, text: `wait_for_element: timed out after ${timeout}ms` };
1105
+ return { success: true, text: `Found element: ${element.name} [${element.controlType}] @${element.bounds.x},${element.bounds.y}` };
1106
+ },
1107
+ },
1108
+ // ─── SYSTEM OPEN HELPERS (Tranche 1B) ───────────────────────
1109
+ {
1110
+ name: 'open_file',
1111
+ description: 'Open a file or folder in the OS default app (explorer / open / xdg-open).',
1112
+ inputSchema: {
1113
+ type: 'object',
1114
+ properties: { path: { type: 'string' } },
1115
+ required: ['path'],
1116
+ additionalProperties: false,
1117
+ },
1118
+ changesScreen: true,
1119
+ async execute(args, ctx) {
1120
+ const p = String(args.path ?? '');
1121
+ try {
1122
+ if (ctx.platform.platform === 'darwin')
1123
+ await ctx.platform.launchApp('open', { url: p });
1124
+ else if (ctx.platform.platform === 'linux')
1125
+ await ctx.platform.launchApp('xdg-open', { url: p });
1126
+ else
1127
+ await ctx.platform.launchApp('explorer.exe', { url: p });
1128
+ await sleep(500);
1129
+ return { success: true, text: `Opened: ${p}` };
1130
+ }
1131
+ catch (err) {
1132
+ const msg = err instanceof Error ? err.message : String(err);
1133
+ return { success: false, text: `open_file failed: ${msg}` };
1134
+ }
1135
+ },
1136
+ },
1137
+ {
1138
+ name: 'open_url',
1139
+ description: 'Open a URL in the default browser. Use instead of navigate_browser when you don\'t care which browser.',
1140
+ inputSchema: {
1141
+ type: 'object',
1142
+ properties: { url: { type: 'string' } },
1143
+ required: ['url'],
1144
+ additionalProperties: false,
1145
+ },
1146
+ changesScreen: true,
1147
+ async execute(args, ctx) {
1148
+ const u = String(args.url ?? '');
1149
+ if (!/^https?:\/\//i.test(u))
1150
+ return { success: false, text: 'open_url: URL must start with http(s)://' };
1151
+ try {
1152
+ if (ctx.platform.platform === 'darwin') {
1153
+ await ctx.platform.launchApp('open', { url: u });
1154
+ }
1155
+ else if (ctx.platform.platform === 'linux') {
1156
+ await ctx.platform.launchApp('xdg-open', { url: u });
1157
+ }
1158
+ else {
1159
+ // Windows: launch the REGISTERED https handler directly (e.g.
1160
+ // msedge.exe), not `explorer.exe <url>`. explorer drops the URL in a
1161
+ // background tab and opens no explorer window, so launchApp's
1162
+ // window-find misses and falls back to a Start-menu search that
1163
+ // presses Win and types — spurious "searching" that derails the run.
1164
+ // The resolved browser exe HAS a findable window, so launchApp
1165
+ // foregrounds it cleanly with no fallback.
1166
+ const { resolveSchemeHandlerExecutable } = await Promise.resolve().then(() => __importStar(require('../../platform/uri-handler')));
1167
+ const exe = await resolveSchemeHandlerExecutable('https').catch(() => null);
1168
+ await ctx.platform.launchApp(exe ?? 'explorer.exe', { url: u });
1169
+ }
1170
+ await sleep(800);
1171
+ return { success: true, text: `Opened URL: ${u}` };
1172
+ }
1173
+ catch (err) {
1174
+ const msg = err instanceof Error ? err.message : String(err);
1175
+ return { success: false, text: `open_url failed: ${msg}` };
1176
+ }
1177
+ },
1178
+ },
1179
+ {
1180
+ // open_uri — the general OS protocol-handler escape route.
1181
+ //
1182
+ // Every OS ships a protocol-handler registry. Windows uses
1183
+ // HKCR\\<scheme>\\shell\\open\\command. macOS uses LaunchServices.
1184
+ // Linux uses xdg-mime + .desktop files. The user's installed apps
1185
+ // register themselves as handlers and the OS routes for us:
1186
+ // mailto: → default mail client (Outlook, Mail.app, Thunderbird, Spark...)
1187
+ // tel: → default phone app (Skype, FaceTime, dialer...)
1188
+ // sms: → default messaging app
1189
+ // webcal: → default calendar
1190
+ // slack: → Slack
1191
+ // vscode: → VS Code
1192
+ // obsidian: → Obsidian
1193
+ // spotify: → Spotify
1194
+ // zoommtg: → Zoom
1195
+ // discord: → Discord
1196
+ // file: → OS file-association dispatcher
1197
+ // http(s): → default browser
1198
+ //
1199
+ // This is THE app-agnostic escape route. ONE tool, every app that
1200
+ // registers a protocol handler. Zero vision, zero a11y, zero
1201
+ // app-specific code. The agent picks the scheme; we just dispatch.
1202
+ name: 'open_uri',
1203
+ description: 'Open ANY registered URI scheme via the OS protocol-handler registry. ONE tool replaces dozens of app-specific shortcuts. Examples: mailto:bob@example.com?subject=hi&body=hello (mail), tel:+15551234 (phone), slack://channel?team=T123&id=C456 (Slack), vscode://file/path (VS Code), webcal://server/cal.ics (calendar), spotify:track:ID (Spotify), https://example.com (browser). Must be properly URL-encoded — pair with build_uri when you have semantic fields.',
1204
+ inputSchema: {
1205
+ type: 'object',
1206
+ properties: {
1207
+ uri: { type: 'string', description: 'A full URI with scheme (e.g. "mailto:bob@example.com?subject=hi&body=hello").' },
1208
+ },
1209
+ required: ['uri'],
1210
+ additionalProperties: false,
1211
+ },
1212
+ changesScreen: true,
1213
+ async execute(args, ctx) {
1214
+ const u = String(args.uri ?? '').trim();
1215
+ if (!u)
1216
+ return { success: false, isError: true, text: 'open_uri: uri is required' };
1217
+ const schemeMatch = u.match(/^([a-z][a-z0-9+.-]*):/i);
1218
+ if (!schemeMatch) {
1219
+ return { success: false, isError: true, text: 'open_uri: argument must be a URI with a scheme (e.g. mailto:, tel:, https:, slack:)' };
1220
+ }
1221
+ const scheme = schemeMatch[1].toLowerCase();
1222
+ try {
1223
+ if (ctx.platform.platform === 'darwin') {
1224
+ await ctx.platform.launchApp('open', { url: u });
1225
+ await sleep(1500);
1226
+ return {
1227
+ success: true,
1228
+ text: `Dispatched ${scheme}: URI to the OS default handler. The configured app for ${scheme}: should now be focused. Verify with read_screen / list_windows. To complete (e.g. send a composed mail), use one more keystroke (cmd+enter on macOS).`,
1229
+ };
1230
+ }
1231
+ if (ctx.platform.platform === 'linux') {
1232
+ await ctx.platform.launchApp('xdg-open', { url: u });
1233
+ await sleep(1500);
1234
+ return {
1235
+ success: true,
1236
+ text: `Dispatched ${scheme}: URI to the OS default handler. The configured app for ${scheme}: should now be focused. Verify with read_screen / list_windows. To complete (e.g. send a composed mail), use one more keystroke (ctrl+enter on Linux).`,
1237
+ };
1238
+ }
1239
+ // Windows: shell-routed dispatch (explorer.exe mailto:, rundll32
1240
+ // url.dll, cmd /c start) silently fails for New Outlook and other
1241
+ // UWP-packaged handlers — the handler returns without opening a
1242
+ // new window. The reliable path is to resolve the registered
1243
+ // handler executable and invoke IT directly with the URI, then
1244
+ // VERIFY a new visible window appeared. Without verification
1245
+ // open_uri returned "success" while nothing actually happened on
1246
+ // screen, sending the agent into stagnation loops.
1247
+ const exe = await (0, uri_handler_1.resolveSchemeHandlerExecutable)(scheme);
1248
+ if (!exe) {
1249
+ return {
1250
+ success: false,
1251
+ isError: true,
1252
+ text: `open_uri: no registered Windows handler found for "${scheme}:". Try a different scheme or drive the app's UI directly.`,
1253
+ };
1254
+ }
1255
+ const launchResult = await (0, uri_handler_1.launchHandlerAndVerify)(exe, u, { waitMs: 5000 });
1256
+ if (!launchResult.success) {
1257
+ return {
1258
+ success: false,
1259
+ isError: true,
1260
+ text: `open_uri: failed to launch handler "${exe}" for ${scheme}: — ${launchResult.error ?? 'unknown error'}`,
1261
+ };
1262
+ }
1263
+ if (!launchResult.windowOpened) {
1264
+ return {
1265
+ success: false,
1266
+ isError: true,
1267
+ text: `open_uri: handler "${exe}" was launched with ${scheme}: but no new window appeared within 5s. The handler probably routed the URI into an existing instance silently. Drive the app's UI directly (focus_window + click + type_text) instead of relying on the protocol dispatch.`,
1268
+ };
1269
+ }
1270
+ return {
1271
+ success: true,
1272
+ text: `Opened ${scheme}: in the registered handler. New window appeared: "${launchResult.hwndLabel ?? '(handle unknown)'}". To complete (e.g. send a composed mail), use one more keystroke (ctrl+enter).`,
1273
+ };
1274
+ }
1275
+ catch (err) {
1276
+ const msg = err instanceof Error ? err.message : String(err);
1277
+ return { success: false, isError: true, text: `open_uri failed: ${msg}` };
1278
+ }
1279
+ },
1280
+ },
1281
+ {
1282
+ // build_uri — pure helper that converts semantic fields to an
1283
+ // encoded URI. No I/O. Pair with open_uri to dispatch.
1284
+ name: 'build_uri',
1285
+ description: 'Build a properly-encoded URI from a scheme + path + query JSON. Returns the URI text; pair with open_uri to dispatch. Examples: scheme="mailto" path="bob@example.com" query={"subject":"hi","body":"hello"} → "mailto:bob@example.com?subject=hi&body=hello".',
1286
+ inputSchema: {
1287
+ type: 'object',
1288
+ properties: {
1289
+ scheme: { type: 'string', description: 'URI scheme without the colon (mailto, tel, sms, slack, ...).' },
1290
+ path: { type: 'string', description: 'Scheme-specific path. Encoded for you; @ and , are preserved for mailto, + for tel.' },
1291
+ query: { type: 'string', description: 'JSON object of query params, e.g. {"subject":"hi"}. Each value URL-encoded.' },
1292
+ },
1293
+ required: ['scheme'],
1294
+ additionalProperties: false,
1295
+ },
1296
+ changesScreen: false,
1297
+ async execute(args) {
1298
+ const s = String(args.scheme ?? '').trim().toLowerCase();
1299
+ if (!s || !/^[a-z][a-z0-9+.-]*$/.test(s)) {
1300
+ return { success: false, isError: true, text: 'build_uri: scheme must match /^[a-z][a-z0-9+.-]*$/' };
1301
+ }
1302
+ const safe = (v) => encodeURIComponent(v).replace(/'/g, '%27').replace(/"/g, '%22');
1303
+ const encodedPath = args.path
1304
+ ? safe(String(args.path))
1305
+ .replace(/%40/g, '@')
1306
+ .replace(/%2C/g, ',')
1307
+ .replace(/%2B/g, '+')
1308
+ .replace(/%2F/g, '/')
1309
+ : '';
1310
+ let queryStr = '';
1311
+ if (args.query) {
1312
+ let obj;
1313
+ try {
1314
+ obj = typeof args.query === 'string' ? JSON.parse(String(args.query)) : args.query;
1315
+ }
1316
+ catch {
1317
+ return { success: false, isError: true, text: 'build_uri: query must be valid JSON' };
1318
+ }
1319
+ const parts = [];
1320
+ for (const [k, v] of Object.entries(obj)) {
1321
+ if (v === undefined || v === null)
1322
+ continue;
1323
+ parts.push(`${safe(k)}=${safe(String(v))}`);
1324
+ }
1325
+ if (parts.length)
1326
+ queryStr = '?' + parts.join('&');
1327
+ }
1328
+ return { success: true, text: `${s}:${encodedPath}${queryStr}` };
1329
+ },
1330
+ },
1331
+ {
1332
+ name: 'get_system_time',
1333
+ description: 'Return current system time (ISO, epoch, timezone). Zero I/O.',
1334
+ inputSchema: { type: 'object', properties: {}, additionalProperties: false },
1335
+ changesScreen: false,
1336
+ async execute() {
1337
+ const now = new Date();
1338
+ return {
1339
+ success: true,
1340
+ text: JSON.stringify({
1341
+ iso: now.toISOString(),
1342
+ epochMs: now.getTime(),
1343
+ timezone: Intl.DateTimeFormat().resolvedOptions().timeZone,
1344
+ }),
1345
+ };
1346
+ },
1347
+ },
1348
+ // ─── MOUSE + KEYBOARD EXTENDED (Tranche 1B) ────────────────
1349
+ {
1350
+ name: 'mouse_move_relative',
1351
+ description: 'Move cursor by a relative offset (dx, dy). Wayland-safe via cursor cache.',
1352
+ inputSchema: {
1353
+ type: 'object',
1354
+ properties: { dx: { type: 'number' }, dy: { type: 'number' } },
1355
+ required: ['dx', 'dy'],
1356
+ additionalProperties: false,
1357
+ },
1358
+ changesScreen: false,
1359
+ async execute(args, ctx) {
1360
+ await ctx.platform.mouseMoveRelative(Number(args.dx ?? 0), Number(args.dy ?? 0));
1361
+ return { success: true, text: `Cursor moved by (${args.dx}, ${args.dy})` };
1362
+ },
1363
+ },
1364
+ {
1365
+ name: 'mouse_down',
1366
+ description: 'Press a mouse button without releasing. Pair with mouse_up. Enables hold-and-drag + modifier clicks.',
1367
+ inputSchema: {
1368
+ type: 'object',
1369
+ properties: { button: { type: 'string', enum: ['left', 'right', 'middle'] } },
1370
+ additionalProperties: false,
1371
+ },
1372
+ changesScreen: true,
1373
+ async execute(args, ctx) {
1374
+ const b = args.button ?? 'left';
1375
+ await ctx.platform.mouseDown(b);
1376
+ return { success: true, text: `Mouse ${b} down.` };
1377
+ },
1378
+ },
1379
+ {
1380
+ name: 'mouse_up',
1381
+ description: 'Release a mouse button previously pressed with mouse_down.',
1382
+ inputSchema: {
1383
+ type: 'object',
1384
+ properties: { button: { type: 'string', enum: ['left', 'right', 'middle'] } },
1385
+ additionalProperties: false,
1386
+ },
1387
+ changesScreen: true,
1388
+ async execute(args, ctx) {
1389
+ const b = args.button ?? 'left';
1390
+ await ctx.platform.mouseUp(b);
1391
+ return { success: true, text: `Mouse ${b} up.` };
1392
+ },
1393
+ },
1394
+ {
1395
+ name: 'key_down',
1396
+ description: 'Press a key without releasing. Pair with key_up. Use to hold modifiers (shift, ctrl) during clicks.',
1397
+ inputSchema: {
1398
+ type: 'object',
1399
+ properties: { key: { type: 'string' } },
1400
+ required: ['key'],
1401
+ additionalProperties: false,
1402
+ },
1403
+ changesScreen: false,
1404
+ async execute(args, ctx) {
1405
+ await ctx.platform.keyDown(String(args.key ?? ''));
1406
+ return { success: true, text: `Key down: ${args.key}` };
1407
+ },
1408
+ },
1409
+ {
1410
+ name: 'key_up',
1411
+ description: 'Release a key previously pressed with key_down.',
1412
+ inputSchema: {
1413
+ type: 'object',
1414
+ properties: { key: { type: 'string' } },
1415
+ required: ['key'],
1416
+ additionalProperties: false,
1417
+ },
1418
+ changesScreen: false,
1419
+ async execute(args, ctx) {
1420
+ await ctx.platform.keyUp(String(args.key ?? ''));
1421
+ return { success: true, text: `Key up: ${args.key}` };
1422
+ },
1423
+ },
1424
+ {
1425
+ name: 'undo_last',
1426
+ description: 'Send the OS Undo keystroke (mod+Z).',
1427
+ inputSchema: { type: 'object', properties: {}, additionalProperties: false },
1428
+ changesScreen: true,
1429
+ async execute(_args, ctx) {
1430
+ await ctx.platform.keyPress('mod+z');
1431
+ return { success: true, text: 'Sent undo.' };
1432
+ },
1433
+ },
1434
+ // ─── CLIPBOARD ─────────────────────────────────────────────
1435
+ {
1436
+ name: 'read_clipboard',
1437
+ description: 'Read the OS clipboard.',
1438
+ inputSchema: { type: 'object', properties: {}, additionalProperties: false },
1439
+ changesScreen: false,
1440
+ async execute(_args, ctx) {
1441
+ const text = await ctx.platform.readClipboard();
1442
+ return { success: true, text: `Clipboard (${text.length} chars):\n${(0, prompt_1.wrapUntrustedScreenContent)(truncate(text, 500))}` };
1443
+ },
1444
+ },
1445
+ {
1446
+ name: 'write_clipboard',
1447
+ description: 'Write text to the OS clipboard.',
1448
+ inputSchema: {
1449
+ type: 'object',
1450
+ properties: { text: { type: 'string' } },
1451
+ required: ['text'],
1452
+ additionalProperties: false,
1453
+ },
1454
+ changesScreen: false,
1455
+ async execute(args, ctx) {
1456
+ const text = String(args.text ?? '');
1457
+ await ctx.platform.writeClipboard(text);
1458
+ return { success: true, text: `Wrote ${text.length} chars to clipboard.` };
1459
+ },
1460
+ },
1461
+ // ─── FLOW CONTROL ───────────────────────────────────────────
1462
+ {
1463
+ name: 'wait',
1464
+ description: 'Pause for N milliseconds (max 5000). Use after actions that trigger animations or page loads.',
1465
+ inputSchema: {
1466
+ type: 'object',
1467
+ properties: { ms: { type: 'number', maximum: 5000 } },
1468
+ required: ['ms'],
1469
+ additionalProperties: false,
1470
+ },
1471
+ changesScreen: false,
1472
+ async execute(args) {
1473
+ const ms = Math.min(5000, Math.max(0, Number(args.ms ?? 0)));
1474
+ await sleep(ms);
1475
+ return { success: true, text: `Waited ${ms}ms.` };
1476
+ },
1477
+ },
1478
+ // ─── VISION (hybrid + vision modes only) ────────────────────
1479
+ {
1480
+ name: 'screenshot',
1481
+ description: 'LAST RESORT — expensive: sends image bytes into LLM context. Escalation order: read_screen (a11y tree, free) → read_text (OCR, cheap) → screenshot (this, expensive). Only call this when both a11y and OCR failed to provide what you need (canvas-only app, icon-only UI, pixel-level verification).',
1482
+ inputSchema: { type: 'object', properties: {}, additionalProperties: false },
1483
+ changesScreen: false,
1484
+ async execute(_args, ctx) {
1485
+ const shot = await ctx.platform.screenshot({ maxWidth: 1280 });
1486
+ ctx.screenshotsCaptured.n += 1;
1487
+ return {
1488
+ success: true,
1489
+ text: `Captured ${shot.width}×${shot.height}.`,
1490
+ screenshot: shot,
1491
+ };
1492
+ },
1493
+ },
1494
+ // ─── OCR PERCEPTION (webview / canvas, cheap — no vision model) ──────
1495
+ // When the a11y tree is empty (browser page, Electron, canvas, game), OCR
1496
+ // reads the visible TEXT so the TEXT model can keep driving — no screenshot
1497
+ // bytes, no escalation to the vision model. This is the cheap path: it keeps
1498
+ // haiku as the brain instead of handing the whole subtask to sonnet.
1499
+ {
1500
+ name: 'read_text',
1501
+ description: 'OCR the screen and return visible text + positions. Use when the a11y snapshot is empty/sparse (webview, canvas, PDF, game) to READ on-screen content. Cheaper than a screenshot (no image bytes). May take 1–3s.',
1502
+ inputSchema: {
1503
+ type: 'object',
1504
+ properties: {
1505
+ filter: { type: 'string', description: 'Optional: keep only lines containing this text (case-insensitive).' },
1506
+ },
1507
+ additionalProperties: false,
1508
+ },
1509
+ changesScreen: false,
1510
+ async execute(args, _ctx) {
1511
+ const ocr = getAgentOcr();
1512
+ if (!ocr.isAvailable())
1513
+ return { success: false, text: 'read_text: OCR not available on this platform — fall back to screenshot/vision.' };
1514
+ const result = await ocr.recognizeScreen();
1515
+ if (result.elements.length === 0)
1516
+ return { success: true, text: '(read_text: OCR found no text — screen may be blank, or OCR unavailable.)' };
1517
+ const lineMap = new Map();
1518
+ for (const el of result.elements) {
1519
+ const arr = lineMap.get(el.line) ?? [];
1520
+ arr.push(el);
1521
+ lineMap.set(el.line, arr);
1522
+ }
1523
+ const filter = typeof args.filter === 'string' ? args.filter.toLowerCase() : null;
1524
+ const lines = [];
1525
+ for (const [, toks] of [...lineMap.entries()].sort((a, b) => a[0] - b[0])) {
1526
+ const sorted = [...toks].sort((a, b) => a.x - b.x);
1527
+ const lineText = sorted.map(t => t.text).join(' ');
1528
+ if (filter && !lineText.toLowerCase().includes(filter))
1529
+ continue;
1530
+ const minX = Math.min(...sorted.map(t => t.x));
1531
+ const minY = Math.min(...sorted.map(t => t.y));
1532
+ lines.push(`@${minX},${minY} "${lineText}"`);
1533
+ }
1534
+ if (lines.length === 0)
1535
+ return { success: true, text: `(read_text: no lines match "${filter}")` };
1536
+ return { success: true, text: `OCR (${result.elements.length} words, ${result.durationMs}ms):\n${(0, prompt_1.wrapUntrustedScreenContent)(lines.join('\n'))}` };
1537
+ },
1538
+ },
1539
+ {
1540
+ name: 'compile_ui',
1541
+ description: 'Compile the current screen into one fused UI map (a11y + OCR + lazy vision) of elements with stable ids, roles, confidence and sources. Returns a ranked element list with a snapshot id; act on a specific element via invoke_element/set_field_value with {element_id, snapshot_id}. a11y-first; pulls OCR only when a11y is sparse or target_text is missing; pass max_cost:\'cheap\' to forbid OCR, or \'vision_ok\' to allow screenshots.',
1542
+ inputSchema: {
1543
+ type: 'object',
1544
+ properties: {
1545
+ purpose: { type: 'string', enum: ['general', 'find_text', 'act'], description: 'What the compile is for' },
1546
+ target_text: { type: 'string', description: 'If set and absent from a11y, pull OCR to find it' },
1547
+ max_cost: { type: 'string', enum: ['cheap', 'ocr_ok', 'vision_ok'], description: 'Hard ceiling on perception cost (default ocr_ok)' },
1548
+ },
1549
+ additionalProperties: false,
1550
+ },
1551
+ changesScreen: false,
1552
+ async execute(args, ctx) {
1553
+ const holder = ctx.uiMaps;
1554
+ if (!holder)
1555
+ return { success: false, text: 'compile_ui: no UIMap holder on this context.' };
1556
+ const now = Date.now();
1557
+ const id = holder.nextId();
1558
+ const hints = {
1559
+ purpose: typeof args.purpose === 'string' ? args.purpose : undefined,
1560
+ target_text: typeof args.target_text === 'string' ? args.target_text : undefined,
1561
+ max_cost: typeof args.max_cost === 'string' ? args.max_cost : undefined,
1562
+ };
1563
+ const map = await (0, ui_map_1.compileUIMap)((0, ui_map_1.defaultCompileDeps)(ctx.platform, now, id), hints);
1564
+ holder.put(map, now, hints.max_cost ?? 'ocr_ok');
1565
+ return { success: true, text: (0, prompt_1.wrapUntrustedScreenContent)((0, ui_map_render_1.renderUIMap)(map)) };
1566
+ },
1567
+ },
1568
+ {
1569
+ name: 'find_action_button',
1570
+ description: 'Semantically locate the best clickable element for an intent (e.g. "submit", "cancel", "search") over the compiled UI. Returns JSON {status:"ok"|"ambiguous"|"none", snapshot_id, best?, candidates}. On "ok", act with invoke_element({element_id: best.element_id, snapshot_id}). Deterministic synonym + text + confidence match.',
1571
+ inputSchema: { type: 'object', properties: {
1572
+ intent: { type: 'string', description: 'What you want to do (submit/cancel/search/login/...)' },
1573
+ max_cost: { type: 'string', enum: ['cheap', 'ocr_ok', 'vision_ok'], description: 'Perception cost ceiling (default ocr_ok)' },
1574
+ }, required: ['intent'], additionalProperties: false },
1575
+ changesScreen: false,
1576
+ async execute(args, ctx) {
1577
+ const map = await finderMap(ctx, args.max_cost);
1578
+ if (!map)
1579
+ return { success: false, text: 'find_action_button: no UIMap holder on this context.' };
1580
+ const r = (0, ui_map_find_1.findActionButton)(map.elements, map.snapshot_id, String(args.intent ?? ''));
1581
+ return { success: r.status === 'ok', text: JSON.stringify(r) };
1582
+ },
1583
+ },
1584
+ {
1585
+ name: 'find_input_field',
1586
+ description: 'Semantically locate the best editable field for a purpose (e.g. "recipient", "subject", "body", "search") over the compiled UI, including label-less fields via their adjacent label. Returns JSON {status, snapshot_id, best?, candidates}. On "ok", fill with set_field_value({element_id: best.element_id, snapshot_id, value}). Deterministic.',
1587
+ inputSchema: { type: 'object', properties: {
1588
+ purpose: { type: 'string', description: 'What the field is for (recipient/subject/body/search/...)' },
1589
+ max_cost: { type: 'string', enum: ['cheap', 'ocr_ok', 'vision_ok'], description: 'Perception cost ceiling (default ocr_ok)' },
1590
+ }, required: ['purpose'], additionalProperties: false },
1591
+ changesScreen: false,
1592
+ async execute(args, ctx) {
1593
+ const map = await finderMap(ctx, args.max_cost);
1594
+ if (!map)
1595
+ return { success: false, text: 'find_input_field: no UIMap holder on this context.' };
1596
+ const r = (0, ui_map_find_1.findInputField)(map.elements, map.snapshot_id, String(args.purpose ?? ''));
1597
+ return { success: r.status === 'ok', text: JSON.stringify(r) };
1598
+ },
1599
+ },
1600
+ {
1601
+ name: 'smart_click',
1602
+ description: 'OCR-locate visible text on screen and click its center. Use when the a11y tree is empty and invoke_element fails (webview/canvas). Pass the exact visible text (e.g. "Search", a video title, "Sign in").',
1603
+ inputSchema: {
1604
+ type: 'object',
1605
+ properties: {
1606
+ target: { type: 'string', description: 'The visible text to click.' },
1607
+ button: { type: 'string', enum: ['left', 'right'] },
1608
+ },
1609
+ required: ['target'],
1610
+ additionalProperties: false,
1611
+ },
1612
+ changesScreen: true,
1613
+ async execute(args, ctx) {
1614
+ const target = String(args.target ?? '').trim();
1615
+ if (!target)
1616
+ return { success: false, isError: true, text: 'smart_click: target required.' };
1617
+ const button = args.button === 'right' ? 'right' : 'left';
1618
+ const ocr = getAgentOcr();
1619
+ if (!ocr.isAvailable())
1620
+ return { success: false, text: 'smart_click: OCR not available — escalate to vision.' };
1621
+ const result = await ocr.recognizeScreen();
1622
+ if (result.elements.length === 0)
1623
+ return { success: false, text: 'smart_click: OCR found no text — escalate to vision.' };
1624
+ const hit = locateByOcr(target, result.elements);
1625
+ if (!hit)
1626
+ return { success: false, text: `smart_click: no match for "${target}". Call read_text to see visible text, then retry with exact text.` };
1627
+ // OCR coords are screen-space — pass straight to mouseClick, same as the
1628
+ // `click` tool does with a11y coords (no imageScale; that's image-space only).
1629
+ const fg0 = await ctx.platform.getActiveWindow().catch(() => null);
1630
+ const raised = await (0, focus_guard_1.ensureTargetForeground)(ctx, fg0);
1631
+ const before = await ctx.platform.getActiveWindow().catch(() => null);
1632
+ const activation = await ctx.platform.mouseClick(hit.x, hit.y, { button, count: 1 });
1633
+ await sleep(150);
1634
+ getAgentOcr().invalidateCache();
1635
+ const after = await ctx.platform.getActiveWindow().catch(() => null);
1636
+ const focusWarn = focusTheftWarning(activation, before, after);
1637
+ return { success: true, text: `smart_click: clicked "${hit.label}" (score ${hit.score.toFixed(2)}) at (${hit.x},${hit.y})${raised}${focusWarn}`, targetLabel: hit.label };
1638
+ },
1639
+ },
1640
+ // ─── BROWSER (CDP / DOM — reliable web automation, no pixels) ────────
1641
+ // For web pages, driving the DOM by selector/text is far more reliable
1642
+ // than OCR + coordinate clicks: no occlusion, no focus-stealing, no
1643
+ // image scaling. These tools operate a DEDICATED, agent-owned browser
1644
+ // instance (separate profile + debug port) so they never disturb the
1645
+ // user's own windows. They DEGRADE GRACEFULLY: if CDP isn't wired or a
1646
+ // browser can't be launched, they say so and the agent falls back to
1647
+ // read_text / smart_click. Haiku stays the brain — it reads DOM text and
1648
+ // decides; no vision model needed.
1649
+ {
1650
+ name: 'browser_connect',
1651
+ description: 'Open/attach a dedicated browser the agent controls via the DOM (reliable for web pages — no pixels). Call this FIRST for any website task, then use browser_navigate/read/click/type. If it fails, fall back to read_text/smart_click.',
1652
+ inputSchema: { type: 'object', properties: {}, additionalProperties: false },
1653
+ changesScreen: true,
1654
+ async execute(_args, ctx) {
1655
+ if (!ctx.cdp)
1656
+ return { success: false, text: 'browser_connect: CDP not available in this build — use read_text/smart_click for the page instead.' };
1657
+ // CLAWD_AGENT_CDP_OFF=1 → attach-only (never launch a new instance).
1658
+ const allowLaunch = !/^(1|true)$/i.test(process.env.CLAWD_AGENT_CDP_OFF ?? '');
1659
+ const ok = await ctx.cdp.ensureConnected({ launch: allowLaunch, exePaths: [...(0, browser_config_1.getEdgePaths)(), ...(0, browser_config_1.getChromePaths)()] }).catch(() => false);
1660
+ if (!ok)
1661
+ return { success: false, text: `browser_connect: could not ${allowLaunch ? 'launch or attach to' : 'attach to'} a CDP browser — fall back to read_text/smart_click.` };
1662
+ const url = await ctx.cdp.getUrl().catch(() => null);
1663
+ const title = await ctx.cdp.getTitle().catch(() => null);
1664
+ // Disclose provenance honestly: 'attached' means we connected to a
1665
+ // browser already on the user debug port — likely the USER'S own
1666
+ // session. Navigation is mechanically redirected into the agent's own
1667
+ // tab by the driver (root-cause fix 2026-06-11), so their tabs are
1668
+ // never navigated away; reads still see their current page.
1669
+ const mode = ctx.cdp.getConnectionMode?.() ?? 'unknown';
1670
+ const provenance = mode === 'attached'
1671
+ ? ' ⚠ ATTACHED to an EXISTING browser (likely the user\'s own session). browser_navigate automatically works in the agent\'s OWN tab — the user\'s tabs are never navigated away; reads before navigating still see their current page. Do not close their tabs/windows.'
1672
+ : mode === 'dedicated'
1673
+ ? ' (dedicated agent-owned instance — safe to drive freely). NOTE: this browser has its OWN profile — login state may DIFFER from the window you were driving. If a site demands login here but the on-screen window looked logged in, drive the on-screen window instead (keyboard/OCR) or use relaunch_with_cdp.'
1674
+ : '';
1675
+ return { success: true, text: `browser_connect: connected to "${title ?? '(blank)'}" at ${url ?? 'about:blank'}.${provenance} Use browser_navigate to open a URL, browser_read to see the page, browser_click/browser_type to interact.` };
1676
+ },
1677
+ },
1678
+ {
1679
+ name: 'browser_navigate',
1680
+ description: 'Navigate the agent-owned browser to a URL (waits for load). Requires browser_connect first.',
1681
+ inputSchema: {
1682
+ type: 'object',
1683
+ properties: { url: { type: 'string', description: 'The URL to open (e.g. https://www.youtube.com).' } },
1684
+ required: ['url'],
1685
+ additionalProperties: false,
1686
+ },
1687
+ changesScreen: true,
1688
+ async execute(args, ctx) {
1689
+ if (!ctx.cdp || !(await ctx.cdp.isConnected()))
1690
+ return { success: false, text: 'browser_navigate: not connected — call browser_connect first.' };
1691
+ const url = String(args.url ?? '').trim();
1692
+ if (!url)
1693
+ return { success: false, isError: true, text: 'browser_navigate: url required.' };
1694
+ const r = await ctx.cdp.navigate(url);
1695
+ return r.success ? { success: true, text: `browser_navigate: loaded ${r.value ?? url}` } : { success: false, text: `browser_navigate failed: ${r.error}` };
1696
+ },
1697
+ },
1698
+ {
1699
+ name: 'browser_read',
1700
+ description: 'Read the current page as structured DOM: interactive elements (links/buttons/inputs with selectors), or text for a CSS selector. Use instead of read_text on web pages. Requires browser_connect first.',
1701
+ inputSchema: {
1702
+ type: 'object',
1703
+ properties: {
1704
+ selector: { type: 'string', description: 'Optional CSS selector to read text from (default: structured interactive-element list for the whole page).' },
1705
+ },
1706
+ additionalProperties: false,
1707
+ },
1708
+ changesScreen: false,
1709
+ async execute(args, ctx) {
1710
+ if (!ctx.cdp || !(await ctx.cdp.isConnected()))
1711
+ return { success: false, text: 'browser_read: not connected — call browser_connect first.' };
1712
+ const selector = typeof args.selector === 'string' ? args.selector.trim() : '';
1713
+ const text = selector ? await ctx.cdp.readText(selector, 3000) : await ctx.cdp.getPageContext();
1714
+ // Page content is the highest-risk injection surface — always delimited.
1715
+ return { success: true, text: (0, prompt_1.wrapUntrustedScreenContent)(text) };
1716
+ },
1717
+ },
1718
+ {
1719
+ name: 'browser_click',
1720
+ description: 'Click a page element by visible text or CSS selector (DOM click — no coordinates). Requires browser_connect first.',
1721
+ inputSchema: {
1722
+ type: 'object',
1723
+ properties: {
1724
+ text: { type: 'string', description: 'Visible text of the element to click (preferred).' },
1725
+ selector: { type: 'string', description: 'CSS selector (alternative to text).' },
1726
+ },
1727
+ additionalProperties: false,
1728
+ },
1729
+ changesScreen: true,
1730
+ async execute(args, ctx) {
1731
+ if (!ctx.cdp || !(await ctx.cdp.isConnected()))
1732
+ return { success: false, text: 'browser_click: not connected — call browser_connect first.' };
1733
+ const text = typeof args.text === 'string' ? args.text.trim() : '';
1734
+ const selector = typeof args.selector === 'string' ? args.selector.trim() : '';
1735
+ if (!text && !selector)
1736
+ return { success: false, isError: true, text: 'browser_click: provide text or selector.' };
1737
+ const r = text ? await ctx.cdp.clickByText(text) : await ctx.cdp.click(selector);
1738
+ return r.success ? { success: true, text: `browser_click: clicked ${text ? `"${text}"` : selector} (${r.method})` } : { success: false, text: `browser_click failed: ${r.error}. Call browser_read to see the actual elements, then retry.` };
1739
+ },
1740
+ },
1741
+ {
1742
+ name: 'browser_type',
1743
+ description: 'Type text into a page input by CSS selector or associated label (DOM input — no coordinates). Requires browser_connect first.',
1744
+ inputSchema: {
1745
+ type: 'object',
1746
+ properties: {
1747
+ text: { type: 'string', description: 'Text to type.' },
1748
+ selector: { type: 'string', description: 'CSS selector for the input.' },
1749
+ label: { type: 'string', description: 'Label text associated with the input (alternative to selector).' },
1750
+ },
1751
+ required: ['text'],
1752
+ additionalProperties: false,
1753
+ },
1754
+ changesScreen: true,
1755
+ async execute(args, ctx) {
1756
+ if (!ctx.cdp || !(await ctx.cdp.isConnected()))
1757
+ return { success: false, text: 'browser_type: not connected — call browser_connect first.' };
1758
+ const text = String(args.text ?? '');
1759
+ const selector = typeof args.selector === 'string' ? args.selector.trim() : '';
1760
+ const label = typeof args.label === 'string' ? args.label.trim() : '';
1761
+ if (!selector && !label)
1762
+ return { success: false, isError: true, text: 'browser_type: provide selector or label.' };
1763
+ const r = label ? await ctx.cdp.typeByLabel(label, text) : await ctx.cdp.typeInField(selector, text);
1764
+ return r.success ? { success: true, text: `browser_type: typed into ${selector || `label "${label}"`}` } : { success: false, text: `browser_type failed: ${r.error}` };
1765
+ },
1766
+ },
1767
+ // ─── BATCHED PLANNING ───────────────────────────────────────
1768
+ // Run several known next actions in one turn (saves LLM round-trips).
1769
+ (0, batch_tool_1.buildBatchTool)(),
1770
+ // ─── TERMINAL ACTIONS ──────────────────────────────────────
1771
+ {
1772
+ name: 'done',
1773
+ description: 'Declare the task complete. Provide SPECIFIC screen evidence — a window title, a value visible in the document, a status bar message. Do NOT use hedging words ("should", "might", "probably", "I think", "I believe") — that means you are guessing. If the task CHANGED anything you MUST pass `assertions` (same types as the verify tool, plus `file_changed_since_start` for a file you wrote) that prove the RESULT — and the proof must reflect your change, not state that was already there (an ambient clock, an already-open window). The harness re-checks them against the live screen and rejects done if any fail or none is discriminating. If you can\'t see concrete evidence, take a screenshot or read_screen first.',
1774
+ inputSchema: {
1775
+ type: 'object',
1776
+ properties: {
1777
+ evidence: { type: 'string' },
1778
+ assertions: {
1779
+ type: 'array',
1780
+ description: 'Optional machine-checkable proofs (verify-tool types). The harness executes them; done is rejected if any fail.',
1781
+ items: { type: 'object' },
1782
+ },
1783
+ },
1784
+ required: ['evidence'],
1785
+ additionalProperties: false,
1786
+ },
1787
+ changesScreen: false,
1788
+ terminal: true,
1789
+ async execute(args, ctx) {
1790
+ const evidence = String(args.evidence ?? '').trim();
1791
+ // Guard 1: evidence must be present and non-trivial. An empty string
1792
+ // or "ok" / "done" gives the verifier nothing to work with.
1793
+ if (evidence.length < 8) {
1794
+ return {
1795
+ success: false,
1796
+ text: 'done rejected: evidence is empty or too short. Look at the screen and report a SPECIFIC concrete observation (window title, on-screen text, focused element) before declaring done.',
1797
+ isError: true,
1798
+ };
1799
+ }
1800
+ // Guard 2: hedging-language detection. Phrases like "should have
1801
+ // been sent", "might be open", "I think it worked" are speculative
1802
+ // — they signal the agent guessed instead of verifying. Force a
1803
+ // re-check by rejecting the call. The agent's next turn will see
1804
+ // this rejection and either take a screenshot/read_screen or
1805
+ // rephrase with concrete observations.
1806
+ //
1807
+ // Pattern is intentionally narrow: words must appear as standalone
1808
+ // tokens (or first-letter-of-token), not as part of larger words
1809
+ // like "shoulder" or "mighty". Word-boundary anchored.
1810
+ if (HEDGING_PATTERN.test(evidence)) {
1811
+ return {
1812
+ success: false,
1813
+ text: `done rejected: evidence contains hedging language ("should", "might", "probably", "I think", "I believe", "appears to", "seems to", "if successful"…). That means you are GUESSING, not observing. Take a screenshot or call read_screen, then describe what you actually see — concrete strings, not predictions.`,
1814
+ isError: true,
1815
+ };
1816
+ }
1817
+ // Guard 3 (the strong one): harness-executed assertions. The model's
1818
+ // prose is a CLAIM; these checks are PROOF — run against live ground
1819
+ // truth (UIA values, window list, clipboard, fs, OCR). A model that
1820
+ // hallucinates a result (live Outlook run 2026-06-06: "verified" a
1821
+ // recipient that was never committed) gets caught HERE, at done-time,
1822
+ // instead of the task silently failing after the run ends.
1823
+ const mutated = ctx.mutatedScreen === true;
1824
+ // NB (P1): hard-requiring `assertions` for EVERY mutating task (the
1825
+ // strictest anti-false-success gate) is intentionally NOT enforced here.
1826
+ // It would force every screen-changing task to carry a discriminating
1827
+ // proof — but real apps are frequently already open (the only cheap
1828
+ // proofs, window_title/app_running, are then non-discriminating), so it
1829
+ // both over-constrains agents and can't be satisfied against a static
1830
+ // app. Left as STRONG guidance in the `done` description; flagged for
1831
+ // Fable review as the stricter option (needs the run-agent suite to
1832
+ // model post-action state). The discriminating gate below + the
1833
+ // file_changed_since_start proof are the deployable 80%.
1834
+ if (args.assertions !== undefined) {
1835
+ const parsed = (0, assertions_1.parseAssertions)(args.assertions);
1836
+ if ('error' in parsed) {
1837
+ return { success: false, text: `done rejected: ${parsed.error}`, isError: true };
1838
+ }
1839
+ const report = await (0, assertions_1.checkAssertions)(parsed.assertions, {
1840
+ adapter: ctx.platform,
1841
+ ocrText: async () => (await getAgentOcr().recognizeScreen()).fullText ?? '',
1842
+ taskStartedAt: ctx.taskStartedAt,
1843
+ });
1844
+ if (!report.ok) {
1845
+ return {
1846
+ success: false,
1847
+ isError: true,
1848
+ text: `done rejected: ${report.failed} of ${report.outcomes.length} assertion(s) FAILED — the live screen does not back your claim:\n${(0, assertions_1.renderReport)(report)}\nFix the failing condition (the detail shows the actual state), or give_up with the reason.`,
1849
+ };
1850
+ }
1851
+ // Guard 3b (P1): for a mutating task, at least one PASSING proof must
1852
+ // be discriminating — not already true before the task acted.
1853
+ // Otherwise the "proof" demonstrates nothing changed because of you
1854
+ // (asserting an ambient clock / a window that was already open).
1855
+ if (mutated && ctx.taskBaseline && !(0, assertions_1.hasDiscriminatingEvidence)(parsed.assertions, report, ctx.taskBaseline)) {
1856
+ return {
1857
+ success: false,
1858
+ isError: true,
1859
+ text: `done rejected: every proof you gave was ALREADY true before you acted — none of them shows your change:\n${(0, assertions_1.renderReport)(report)}\nAssert the NEW state your action produced (file_changed_since_start for a file you wrote, element_value_contains for text you typed, a window title that wasn't open before), or give_up.`,
1860
+ };
1861
+ }
1862
+ return {
1863
+ success: true,
1864
+ text: `done: ${evidence}\nVERIFIED:\n${(0, assertions_1.renderReport)(report)}`,
1865
+ stop: true,
1866
+ terminalExit: 'done',
1867
+ };
1868
+ }
1869
+ return { success: true, text: `done: ${evidence}`, stop: true, terminalExit: 'done' };
1870
+ },
1871
+ },
1872
+ {
1873
+ name: 'give_up',
1874
+ description: 'Abandon the task when it\'s impossible from here (credentials missing, captcha, destructive action needs user confirm, stuck after retries).',
1875
+ inputSchema: {
1876
+ type: 'object',
1877
+ properties: { reason: { type: 'string' } },
1878
+ required: ['reason'],
1879
+ additionalProperties: false,
1880
+ },
1881
+ changesScreen: false,
1882
+ terminal: true,
1883
+ async execute(args) {
1884
+ const reason = String(args.reason ?? 'unknown');
1885
+ return { success: false, text: `give_up: ${reason}`, stop: true, terminalExit: 'give_up' };
1886
+ },
1887
+ },
1888
+ {
1889
+ name: 'cannot_read',
1890
+ description: 'Escalate from blind mode to vision — the a11y snapshot doesn\'t contain what you need. Only available in blind mode.',
1891
+ inputSchema: {
1892
+ type: 'object',
1893
+ properties: { reason: { type: 'string' } },
1894
+ required: ['reason'],
1895
+ additionalProperties: false,
1896
+ },
1897
+ changesScreen: false,
1898
+ terminal: true,
1899
+ async execute(args) {
1900
+ const reason = String(args.reason ?? 'a11y snapshot insufficient');
1901
+ return { success: false, text: `cannot_read: ${reason}`, stop: true, terminalExit: 'cannot_read' };
1902
+ },
1903
+ },
1904
+ ];
1905
+ // A/B toggle: CLAWD_AGENT_NO_BATCH=1 removes the batch tool so the SAME task
1906
+ // can be run per-call (one tool per turn) vs batched, for measurement.
1907
+ if (/^(1|true)$/i.test(process.env.CLAWD_AGENT_NO_BATCH ?? '')) {
1908
+ const bi = tools.findIndex(t => t.name === 'batch');
1909
+ if (bi >= 0)
1910
+ tools.splice(bi, 1);
1911
+ }
1912
+ // Full flat catalog. `screenshot` is available so the agent can call it
1913
+ // when a11y is insufficient. `cannot_read` is excluded — the model runs
1914
+ // in hybrid mode with direct screenshot access; there is no blind→vision
1915
+ // escalation path to trigger.
1916
+ return tools.filter(t => t.name !== 'cannot_read');
1917
+ }
1918
/**
 * Pick the process id an accessibility search should be scoped to.
 *
 * An explicit numeric `args.processId` always wins. Otherwise the pid of the
 * currently active window is used. Per the original note, un-scoped UIA / AX
 * searches walk the whole system tree (10-20 seconds, or a hang), and the
 * focused app is almost always what the agent wants.
 *
 * Used by the agent-internal tools that call `findElements` or
 * `invokeElement` with an optional `processId` arg.
 *
 * @param args Tool arguments; only `processId` is read.
 * @param ctx  Tool context; `ctx.platform.getActiveWindow()` is the fallback source.
 * @returns The pid, or `undefined` when there is no active window or the
 *          lookup throws.
 */
async function resolveAgentPid(args, ctx) {
    const { processId: explicitPid } = args;
    if (typeof explicitPid === 'number') {
        return explicitPid;
    }
    try {
        // `?? {}` makes "no active window" destructure to an undefined pid.
        const { processId: activePid } = (await ctx.platform.getActiveWindow()) ?? {};
        return activePid;
    }
    catch {
        // Best-effort: a failed lookup just means "no scoping available".
        return undefined;
    }
}
1938
/**
 * Build a window query from tool arguments.
 *
 * Copies `processName` (string), `processId` (number) and `title` (string)
 * when they are present with the expected primitive type; anything else is
 * ignored.
 *
 * @param args Tool arguments.
 * @returns The query object, or `undefined` when no usable field was given.
 */
function buildWinQuery(args) {
    // [field, required typeof] in the order the keys are added to the query.
    const accepted = [
        ['processName', 'string'],
        ['processId', 'number'],
        ['title', 'string'],
    ];
    const query = {};
    for (const [field, kind] of accepted) {
        if (typeof args[field] === kind) {
            query[field] = args[field];
        }
    }
    if (Object.keys(query).length === 0) {
        return undefined;
    }
    return query;
}
1948
/**
 * JSON-schema fragment for the shared `expect` argument of consequential tools.
 *
 * Per the original note, the agent loop and the batch executor verify these
 * post-conditions after the action and report a failure as a DEVIATION
 * (Layer C). It is exposed on the send/save/submit-class tools, including the
 * OCR/coordinate fallbacks (click/smart_click/open_uri/browser_click) where
 * verification matters most (audit 2026-06-10, finding C2/M3).
 */
const EXPECT_SCHEMA = {
    type: 'array',
    description: 'Optional post-conditions to verify after this action (same assertion types as the verify tool: window_title_contains, app_running, element_exists, element_value_contains, clipboard_contains, file_exists, file_contains, ocr_contains, file_changed_since_start). If any FAIL the action returns a DEVIATION and you must adapt. State an OUTCOME you can observe (a window title, a rendered element/chip, a status) — NOT the raw text you typed.',
    items: {
        type: 'object',
        properties: {
            type: {
                type: 'string',
                // Same list the description above spells out for the model.
                enum: [
                    'window_title_contains',
                    'app_running',
                    'element_exists',
                    'element_value_contains',
                    'clipboard_contains',
                    'file_exists',
                    'file_contains',
                    'ocr_contains',
                    'file_changed_since_start',
                ],
            },
        },
        required: ['type'],
    },
};
1965
/**
 * JSON-schema fragment for the shared `space` argument of the granular
 * pointer tools (click/drag/scroll): which coordinate space the supplied
 * x/y are expressed in.
 */
const COORD_SPACE_SCHEMA = {
    type: 'string',
    enum: [
        'screen',
        'image',
    ],
    description: 'Coordinate space of the x/y you pass. "screen" = accessibility/COMPILED-UI coords (@x,y), already correct for the real screen. "image" = coords you read off the SCREENSHOT (downscaled to 1280px wide); the tool scales them up to the real screen. When omitted, the DEFAULT FOLLOWS CONTEXT: "image" while a screenshot is in your context, "screen" otherwise. So pass space:"screen" explicitly when clicking an @x,y map coord on a screenshot turn, and space:"image" when you read coords off the picture.',
};
1971
/**
 * One-line coordinate breadcrumb for tool-result text. It shows the input
 * space, the scaled screen coordinates (only when a scale was applied) and the
 * scale factor, so a wrong-window click can be diagnosed from logs alone.
 *
 * @param ix    Input x as supplied by the caller.
 * @param iy    Input y as supplied by the caller.
 * @param sx    Resulting screen x.
 * @param sy    Resulting screen y.
 * @param space Label of the input coordinate space.
 * @param scale Scale factor that was applied; `1` suppresses the "→ screen" part.
 * @param ctx   Tool context; reads `ctx.screen.physicalWidth/physicalHeight`.
 * @returns The formatted breadcrumb string.
 */
function coordBreadcrumb(ix, iy, sx, sy, space, scale, ctx) {
    const { physicalWidth, physicalHeight } = ctx.screen;
    const pieces = [`${space} (${ix},${iy})`];
    if (scale !== 1) {
        pieces.push(` → screen (${sx},${sy})`);
    }
    pieces.push(` [×${scale}, screen ${physicalWidth}×${physicalHeight}]`);
    return pieces.join('');
}
1978
/**
 * Describe a foreground-window change as a result-text suffix, so focus theft
 * (a click landing on the wrong window) is visible in the tool output.
 *
 * @param before Active window before the action (may be null/undefined).
 * @param after  Active window after the action (may be null/undefined).
 * @returns `''` when the two titles are equal (a missing title counts as "?"),
 *          otherwise ` · focus "<before>"→"<after>"` with both titles shortened
 *          by `truncateTitle`.
 */
function focusBreadcrumb(before, after) {
    const fromTitle = before?.title ?? '?';
    const toTitle = after?.title ?? '?';
    return fromTitle === toTitle
        ? ''
        : ` · focus "${truncateTitle(fromTitle)}"→"${truncateTitle(toTitle)}"`;
}
1987
/**
 * Shorten a window title to at most 32 UTF-16 code units for result text.
 *
 * Titles of 32 units or fewer are returned unchanged; longer ones are cut to
 * 31 units plus an ellipsis. If the cut would fall between the two halves of a
 * surrogate pair (e.g. an emoji), the dangling high surrogate is dropped as
 * well, so the result never contains a lone surrogate.
 *
 * @param s The title to shorten.
 * @returns The (possibly shortened) title.
 */
function truncateTitle(s) {
    if (s.length <= 32) {
        return s;
    }
    let end = 31;
    const lastKept = s.charCodeAt(end - 1);
    // 0xD800-0xDBFF is the high-surrogate range; its low half sits at index `end`.
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
        end -= 1;
    }
    return `${s.slice(0, end)}…`;
}
1990
/**
 * Build a warning suffix for a coordinate click that could not be confirmed to
 * have landed on the intended window. Per the original note, this was the
 * cause of a keystroke leak where an OTP typed after a missed click went into
 * the wrong window (session 2026-06-11).
 *
 * Two signals trigger the warning:
 *   (a) the platform reported `activated === false` for the click, or
 *   (b) both windows have a title and the title differs across the click.
 *
 * @param activation Result of the click call (may be null/undefined).
 * @param before     Active window before the click.
 * @param after      Active window after the click.
 * @returns `''` when the click looks clean, otherwise a loud, actionable
 *          message telling the agent to verify focus before typing.
 */
function focusTheftWarning(activation, before, after) {
    const activationFailed = activation?.activated === false;
    const titleBefore = before?.title;
    const titleAfter = after?.title;
    const foregroundChanged = Boolean(titleBefore) && Boolean(titleAfter) && titleBefore !== titleAfter;
    if (!(activationFailed || foregroundChanged)) {
        return '';
    }
    let landed = 'an unknown window';
    if (titleAfter) {
        landed = `"${truncateTitle(titleAfter)}"`;
    }
    return [
        ` ⚠ FOCUS NOT CONFIRMED — the click may have landed on ${landed} instead of your target`,
        ` (Windows foreground-lock or coords over a different window). DO NOT type next:`,
        ` re-focus the intended window first (focus_window / window.focus by processId),`,
        ` or act on an a11y/el_NN target instead of coordinates.`,
    ].join('');
}
2013
/**
 * Locate a target string among OCR elements and return the click point (centre
 * of the best-matching contiguous span) in SCREEN pixels.
 *
 * Scoring (per the original note, ported from src/tools/smart.ts):
 * exact > substring-ratio > token-overlap, with a penalty for a single token
 * matching a multi-word target (stops "begin" in body text beating the
 * "Begin Exam" button).
 *
 * Text is compared after a Unicode-aware normalisation: Latin-style diacritics
 * are folded, case is ignored, and anything that is not a letter, mark, digit,
 * underscore or whitespace becomes a space. The earlier ASCII-only `\w` filter
 * erased every non-ASCII letter, so a target such as "设置" normalised to ''
 * and then "exactly" matched the first token that also normalised to ''.
 *
 * A target made only of symbols (e.g. "+") has no normalised form; it is
 * matched against the raw, trimmed, lower-cased element text instead. A blank
 * target never matches.
 *
 * @param target   Text to find.
 * @param elements OCR tokens; each is read for `text`, `line`, `x`, `y`,
 *                 `width` and `height`.
 * @returns `{ x, y, label, score }` for the best span, or `null` when nothing
 *          scores at least 0.4.
 */
function locateByOcr(target, elements) {
    const norm = (s) => s
        .normalize('NFD')
        .replace(/[\u0300-\u036f]/g, '') // fold combining diacritics (é → e)
        .toLowerCase()
        .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
        .replace(/\s+/g, ' ')
        .trim();
    // Centre of the bounding box around a span of tokens, plus its label.
    const hitFor = (span, score) => {
        const minX = Math.min(...span.map(t => t.x));
        const minY = Math.min(...span.map(t => t.y));
        const maxX = Math.max(...span.map(t => t.x + t.width));
        const maxY = Math.max(...span.map(t => t.y + t.height));
        return {
            x: Math.round((minX + maxX) / 2),
            y: Math.round((minY + maxY) / 2),
            label: span.map(t => t.text).join(' '),
            score,
        };
    };
    const targetNorm = norm(target);
    if (!targetNorm) {
        // Symbol-only (or blank) target: fuzzy scoring has nothing to compare,
        // so only an exact raw-text token is accepted.
        const rawTarget = target.trim().toLowerCase();
        if (!rawTarget)
            return null;
        const exact = elements.find(el => el.text && el.text.trim().toLowerCase() === rawTarget);
        return exact ? hitFor([exact], 1.0) : null;
    }
    const targetWords = targetNorm.split(' ').filter(Boolean);
    const targetWordSet = new Set(targetWords);
    // Group tokens by OCR line; spans never cross lines.
    const lineMap = new Map();
    for (const el of elements) {
        if (!el.text)
            continue;
        const a = lineMap.get(el.line) ?? [];
        a.push(el);
        lineMap.set(el.line, a);
    }
    let best = null;
    let bestScore = 0;
    // Longest span worth trying: a little longer than the target, capped at 8 tokens.
    const MAX_N = Math.min(8, targetWords.length + 2);
    for (const toks of lineMap.values()) {
        const sorted = [...toks].sort((a, b) => a.x - b.x);
        for (let i = 0; i < sorted.length; i++) {
            for (let n = 1; n <= MAX_N && i + n <= sorted.length; n++) {
                const span = sorted.slice(i, i + n);
                // A span is contiguous when no horizontal gap exceeds
                // max(1.5 × the previous token's height, 30px).
                let contiguous = true;
                for (let k = 1; k < span.length; k++) {
                    const gap = span[k].x - (span[k - 1].x + span[k - 1].width);
                    if (gap > Math.max(span[k - 1].height * 1.5, 30)) {
                        contiguous = false;
                        break;
                    }
                }
                if (!contiguous)
                    continue;
                const phrase = norm(span.map(t => t.text).join(' '));
                // Symbol-only spans cannot match a textual target (they would score 0).
                if (!phrase)
                    continue;
                let score = 0;
                if (phrase === targetNorm)
                    score = 1.0;
                else if (phrase.includes(targetNorm) || targetNorm.includes(phrase)) {
                    score = Math.min(phrase.length, targetNorm.length) / Math.max(phrase.length, targetNorm.length) * 0.9;
                }
                else {
                    const pw = phrase.split(' ').filter(Boolean);
                    const overlap = pw.filter(w => targetWordSet.has(w)).length;
                    const cov = overlap / Math.max(targetWords.length, 1);
                    if (cov >= 1)
                        score = 0.85;
                    else if (cov >= 0.5)
                        score = 0.5 * cov;
                }
                // Single token vs multi-word target: heavily discounted unless near-exact.
                if (targetWords.length > 1 && n === 1 && score < 0.95)
                    score *= 0.55;
                if (score > bestScore) {
                    bestScore = score;
                    best = hitFor(span, score);
                }
            }
        }
    }
    return best && bestScore >= 0.4 ? best : null;
}
2087
/**
 * Pause for a fixed time.
 *
 * @param ms Delay in milliseconds, handed to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
2090
/**
 * Shorten `s` to at most `max` UTF-16 code units, ending in an ellipsis when
 * anything was cut.
 *
 * Two edge cases are handled explicitly:
 *  - a non-positive `max` yields `''` (previously `slice(0, max - 1)` used a
 *    negative end index and returned almost the whole string plus an
 *    ellipsis, i.e. longer than the limit);
 *  - if the cut would separate a surrogate pair, the dangling high surrogate
 *    is dropped so the result never contains a lone surrogate.
 *
 * @param s   The string to shorten.
 * @param max Maximum length of the result, ellipsis included.
 * @returns `s` unchanged when it already fits, otherwise the shortened text.
 */
function truncate(s, max) {
    if (s.length <= max) {
        return s;
    }
    if (max <= 0) {
        return '';
    }
    let end = max - 1;
    if (end > 0) {
        const lastKept = s.charCodeAt(end - 1);
        // 0xD800-0xDBFF is the high-surrogate range; its low half would be cut off.
        if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
            end -= 1;
        }
    }
    return s.slice(0, end) + '…';
}
2093
/**
 * Coerce an LLM-supplied coordinate argument into a clean `{ x, y }` pair.
 *
 * Models occasionally smush both axes into one field (e.g. `x="390, 79"`,
 * `x="(390, 79)"`, or `x="390 79"`). A plain `Number(...)` on that yields NaN,
 * which then becomes a click at (NaN, y): a crash disguised as a no-op. This
 * helper splits the smushed form when present and otherwise parses each axis
 * on its own.
 *
 * A string that is blank once brackets and whitespace are removed (`""`,
 * `"  "`, `"()"`) parses to NaN. `Number('')` is 0, so without that check a
 * missing coordinate silently turned into a click at 0.
 *
 * App-agnostic, OS-agnostic, model-agnostic. Per the original note it is used
 * by every coordinate-taking tool (click, drag, scroll, hover, move).
 *
 * @param rawX Raw x argument (number, numeric string, or a smushed "x, y" string).
 * @param rawY Raw y argument (number or numeric string); ignored when `rawX`
 *             already carries both axes.
 * @returns `{ x, y }`, plus a `warning` string when a smushed pair was split.
 *          Unparseable axes come back as NaN for the caller to reject.
 */
function coerceCoord(rawX, rawY) {
    const parseOne = (v) => {
        if (typeof v === 'number')
            return v;
        if (typeof v === 'string') {
            // Strip parens, brackets and all whitespace.
            const cleaned = v.replace(/[()[\]\s]/g, '');
            // Number('') is 0, so an empty remainder must be rejected explicitly.
            if (cleaned === '')
                return NaN;
            const n = Number(cleaned);
            return Number.isFinite(n) ? n : NaN;
        }
        return NaN;
    };
    // Case A: x is a pair-like string: "390, 79" / "390 79" / "(390,79)".
    if (typeof rawX === 'string' && /[\s,]/.test(rawX)) {
        const parts = rawX.replace(/[()[\]]/g, '').split(/[,\s]+/).filter(Boolean);
        if (parts.length >= 2) {
            const x = Number(parts[0]);
            const y = Number(parts[1]);
            if (Number.isFinite(x) && Number.isFinite(y)) {
                return {
                    x, y,
                    warning: `coord parser: x came in as "${rawX}" — split into x=${x},y=${y}. Pass x and y as SEPARATE numeric args next time.`,
                };
            }
        }
    }
    // Case B: each axis arrives in its own argument.
    const x = parseOne(rawX);
    const y = parseOne(rawY);
    return { x, y };
}
2134
+ //# sourceMappingURL=tools.js.map