@mseep/clawdcursor 1.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (354) hide show
  1. package/CHANGELOG.md +2264 -0
  2. package/LICENSE +21 -0
  3. package/README.md +385 -0
  4. package/SECURITY.md +44 -0
  5. package/SKILL.md +503 -0
  6. package/dist/core/agent-loop/agent.d.ts +42 -0
  7. package/dist/core/agent-loop/agent.js +1023 -0
  8. package/dist/core/agent-loop/agent.js.map +1 -0
  9. package/dist/core/agent-loop/batch-tool.d.ts +25 -0
  10. package/dist/core/agent-loop/batch-tool.js +218 -0
  11. package/dist/core/agent-loop/batch-tool.js.map +1 -0
  12. package/dist/core/agent-loop/coord-scale.d.ts +72 -0
  13. package/dist/core/agent-loop/coord-scale.js +89 -0
  14. package/dist/core/agent-loop/coord-scale.js.map +1 -0
  15. package/dist/core/agent-loop/focus-guard.d.ts +24 -0
  16. package/dist/core/agent-loop/focus-guard.js +29 -0
  17. package/dist/core/agent-loop/focus-guard.js.map +1 -0
  18. package/dist/core/agent-loop/project-mcp.d.ts +97 -0
  19. package/dist/core/agent-loop/project-mcp.js +253 -0
  20. package/dist/core/agent-loop/project-mcp.js.map +1 -0
  21. package/dist/core/agent-loop/prompt.d.ts +45 -0
  22. package/dist/core/agent-loop/prompt.js +426 -0
  23. package/dist/core/agent-loop/prompt.js.map +1 -0
  24. package/dist/core/agent-loop/tool-meta.d.ts +93 -0
  25. package/dist/core/agent-loop/tool-meta.js +651 -0
  26. package/dist/core/agent-loop/tool-meta.js.map +1 -0
  27. package/dist/core/agent-loop/tools.d.ts +38 -0
  28. package/dist/core/agent-loop/tools.js +2134 -0
  29. package/dist/core/agent-loop/tools.js.map +1 -0
  30. package/dist/core/agent-loop/types.d.ts +170 -0
  31. package/dist/core/agent-loop/types.js +12 -0
  32. package/dist/core/agent-loop/types.js.map +1 -0
  33. package/dist/core/agent.d.ts +51 -0
  34. package/dist/core/agent.js +245 -0
  35. package/dist/core/agent.js.map +1 -0
  36. package/dist/core/app-categories.d.ts +67 -0
  37. package/dist/core/app-categories.js +108 -0
  38. package/dist/core/app-categories.js.map +1 -0
  39. package/dist/core/banner.d.ts +70 -0
  40. package/dist/core/banner.js +245 -0
  41. package/dist/core/banner.js.map +1 -0
  42. package/dist/core/classify/capability.d.ts +45 -0
  43. package/dist/core/classify/capability.js +78 -0
  44. package/dist/core/classify/capability.js.map +1 -0
  45. package/dist/core/decompose/llm-decomposer.d.ts +35 -0
  46. package/dist/core/decompose/llm-decomposer.js +156 -0
  47. package/dist/core/decompose/llm-decomposer.js.map +1 -0
  48. package/dist/core/decompose/parser.d.ts +27 -0
  49. package/dist/core/decompose/parser.js +101 -0
  50. package/dist/core/decompose/parser.js.map +1 -0
  51. package/dist/core/observability/correlation.d.ts +19 -0
  52. package/dist/core/observability/correlation.js +36 -0
  53. package/dist/core/observability/correlation.js.map +1 -0
  54. package/dist/core/observability/cost-meter.d.ts +51 -0
  55. package/dist/core/observability/cost-meter.js +134 -0
  56. package/dist/core/observability/cost-meter.js.map +1 -0
  57. package/dist/core/observability/logger.d.ts +61 -0
  58. package/dist/core/observability/logger.js +550 -0
  59. package/dist/core/observability/logger.js.map +1 -0
  60. package/dist/core/router/aliases.d.ts +50 -0
  61. package/dist/core/router/aliases.js +104 -0
  62. package/dist/core/router/aliases.js.map +1 -0
  63. package/dist/core/router/normalize.d.ts +41 -0
  64. package/dist/core/router/normalize.js +80 -0
  65. package/dist/core/router/normalize.js.map +1 -0
  66. package/dist/core/safety.d.ts +126 -0
  67. package/dist/core/safety.js +568 -0
  68. package/dist/core/safety.js.map +1 -0
  69. package/dist/core/sense/a11y-resolver.d.ts +73 -0
  70. package/dist/core/sense/a11y-resolver.js +76 -0
  71. package/dist/core/sense/a11y-resolver.js.map +1 -0
  72. package/dist/core/sense/fingerprint.d.ts +41 -0
  73. package/dist/core/sense/fingerprint.js +123 -0
  74. package/dist/core/sense/fingerprint.js.map +1 -0
  75. package/dist/core/sense/rank.d.ts +70 -0
  76. package/dist/core/sense/rank.js +192 -0
  77. package/dist/core/sense/rank.js.map +1 -0
  78. package/dist/core/sense/reactive-check.d.ts +40 -0
  79. package/dist/core/sense/reactive-check.js +48 -0
  80. package/dist/core/sense/reactive-check.js.map +1 -0
  81. package/dist/core/sense/snapshot.d.ts +19 -0
  82. package/dist/core/sense/snapshot.js +100 -0
  83. package/dist/core/sense/snapshot.js.map +1 -0
  84. package/dist/core/sense/types.d.ts +66 -0
  85. package/dist/core/sense/types.js +9 -0
  86. package/dist/core/sense/types.js.map +1 -0
  87. package/dist/core/sense/ui-map-anchors.d.ts +7 -0
  88. package/dist/core/sense/ui-map-anchors.js +24 -0
  89. package/dist/core/sense/ui-map-anchors.js.map +1 -0
  90. package/dist/core/sense/ui-map-elements.d.ts +5 -0
  91. package/dist/core/sense/ui-map-elements.js +33 -0
  92. package/dist/core/sense/ui-map-elements.js.map +1 -0
  93. package/dist/core/sense/ui-map-find.d.ts +56 -0
  94. package/dist/core/sense/ui-map-find.js +153 -0
  95. package/dist/core/sense/ui-map-find.js.map +1 -0
  96. package/dist/core/sense/ui-map-fuse.d.ts +4 -0
  97. package/dist/core/sense/ui-map-fuse.js +44 -0
  98. package/dist/core/sense/ui-map-fuse.js.map +1 -0
  99. package/dist/core/sense/ui-map-geom.d.ts +3 -0
  100. package/dist/core/sense/ui-map-geom.js +16 -0
  101. package/dist/core/sense/ui-map-geom.js.map +1 -0
  102. package/dist/core/sense/ui-map-holder.d.ts +58 -0
  103. package/dist/core/sense/ui-map-holder.js +87 -0
  104. package/dist/core/sense/ui-map-holder.js.map +1 -0
  105. package/dist/core/sense/ui-map-normalize.d.ts +19 -0
  106. package/dist/core/sense/ui-map-normalize.js +65 -0
  107. package/dist/core/sense/ui-map-normalize.js.map +1 -0
  108. package/dist/core/sense/ui-map-render.d.ts +4 -0
  109. package/dist/core/sense/ui-map-render.js +34 -0
  110. package/dist/core/sense/ui-map-render.js.map +1 -0
  111. package/dist/core/sense/ui-map-resolve.d.ts +41 -0
  112. package/dist/core/sense/ui-map-resolve.js +59 -0
  113. package/dist/core/sense/ui-map-resolve.js.map +1 -0
  114. package/dist/core/sense/ui-map-types.d.ts +66 -0
  115. package/dist/core/sense/ui-map-types.js +11 -0
  116. package/dist/core/sense/ui-map-types.js.map +1 -0
  117. package/dist/core/sense/ui-map.d.ts +29 -0
  118. package/dist/core/sense/ui-map.js +113 -0
  119. package/dist/core/sense/ui-map.js.map +1 -0
  120. package/dist/core/verify/assertions.d.ts +132 -0
  121. package/dist/core/verify/assertions.js +284 -0
  122. package/dist/core/verify/assertions.js.map +1 -0
  123. package/dist/index.d.ts +21 -0
  124. package/dist/index.js +24 -0
  125. package/dist/index.js.map +1 -0
  126. package/dist/llm/browser-config.d.ts +36 -0
  127. package/dist/llm/browser-config.js +83 -0
  128. package/dist/llm/browser-config.js.map +1 -0
  129. package/dist/llm/client.d.ts +268 -0
  130. package/dist/llm/client.js +1094 -0
  131. package/dist/llm/client.js.map +1 -0
  132. package/dist/llm/config.d.ts +79 -0
  133. package/dist/llm/config.js +375 -0
  134. package/dist/llm/config.js.map +1 -0
  135. package/dist/llm/credentials.d.ts +35 -0
  136. package/dist/llm/credentials.js +491 -0
  137. package/dist/llm/credentials.js.map +1 -0
  138. package/dist/llm/external-creds.d.ts +42 -0
  139. package/dist/llm/external-creds.js +169 -0
  140. package/dist/llm/external-creds.js.map +1 -0
  141. package/dist/llm/providers.d.ts +123 -0
  142. package/dist/llm/providers.js +717 -0
  143. package/dist/llm/providers.js.map +1 -0
  144. package/dist/paths.d.ts +31 -0
  145. package/dist/paths.js +147 -0
  146. package/dist/paths.js.map +1 -0
  147. package/dist/platform/accessibility.d.ts +139 -0
  148. package/dist/platform/accessibility.js +670 -0
  149. package/dist/platform/accessibility.js.map +1 -0
  150. package/dist/platform/cdp-driver.d.ts +318 -0
  151. package/dist/platform/cdp-driver.js +1179 -0
  152. package/dist/platform/cdp-driver.js.map +1 -0
  153. package/dist/platform/index.d.ts +11 -0
  154. package/dist/platform/index.js +69 -0
  155. package/dist/platform/index.js.map +1 -0
  156. package/dist/platform/keys.d.ts +17 -0
  157. package/dist/platform/keys.js +129 -0
  158. package/dist/platform/keys.js.map +1 -0
  159. package/dist/platform/launch-poll.d.ts +101 -0
  160. package/dist/platform/launch-poll.js +177 -0
  161. package/dist/platform/launch-poll.js.map +1 -0
  162. package/dist/platform/linux.d.ts +173 -0
  163. package/dist/platform/linux.js +1253 -0
  164. package/dist/platform/linux.js.map +1 -0
  165. package/dist/platform/macos.d.ts +136 -0
  166. package/dist/platform/macos.js +976 -0
  167. package/dist/platform/macos.js.map +1 -0
  168. package/dist/platform/native-desktop.d.ts +145 -0
  169. package/dist/platform/native-desktop.js +936 -0
  170. package/dist/platform/native-desktop.js.map +1 -0
  171. package/dist/platform/native-helper.d.ts +130 -0
  172. package/dist/platform/native-helper.js +592 -0
  173. package/dist/platform/native-helper.js.map +1 -0
  174. package/dist/platform/ocr-engine.d.ts +78 -0
  175. package/dist/platform/ocr-engine.js +363 -0
  176. package/dist/platform/ocr-engine.js.map +1 -0
  177. package/dist/platform/ps-runner.d.ts +28 -0
  178. package/dist/platform/ps-runner.js +228 -0
  179. package/dist/platform/ps-runner.js.map +1 -0
  180. package/dist/platform/types.d.ts +397 -0
  181. package/dist/platform/types.js +15 -0
  182. package/dist/platform/types.js.map +1 -0
  183. package/dist/platform/uri-handler.d.ts +75 -0
  184. package/dist/platform/uri-handler.js +273 -0
  185. package/dist/platform/uri-handler.js.map +1 -0
  186. package/dist/platform/wayland-backend.d.ts +53 -0
  187. package/dist/platform/wayland-backend.js +348 -0
  188. package/dist/platform/wayland-backend.js.map +1 -0
  189. package/dist/platform/windows.d.ts +232 -0
  190. package/dist/platform/windows.js +1210 -0
  191. package/dist/platform/windows.js.map +1 -0
  192. package/dist/postbuild.d.ts +10 -0
  193. package/dist/postbuild.js +98 -0
  194. package/dist/postbuild.js.map +1 -0
  195. package/dist/schema/snapshot.d.ts +33 -0
  196. package/dist/schema/snapshot.js +90 -0
  197. package/dist/schema/snapshot.js.map +1 -0
  198. package/dist/shortcuts.d.ts +30 -0
  199. package/dist/shortcuts.js +261 -0
  200. package/dist/shortcuts.js.map +1 -0
  201. package/dist/surface/cli.d.ts +7 -0
  202. package/dist/surface/cli.js +1556 -0
  203. package/dist/surface/cli.js.map +1 -0
  204. package/dist/surface/dashboard.d.ts +8 -0
  205. package/dist/surface/dashboard.js +1193 -0
  206. package/dist/surface/dashboard.js.map +1 -0
  207. package/dist/surface/doctor.d.ts +29 -0
  208. package/dist/surface/doctor.js +1514 -0
  209. package/dist/surface/doctor.js.map +1 -0
  210. package/dist/surface/format.d.ts +10 -0
  211. package/dist/surface/format.js +37 -0
  212. package/dist/surface/format.js.map +1 -0
  213. package/dist/surface/http-utility.d.ts +65 -0
  214. package/dist/surface/http-utility.js +336 -0
  215. package/dist/surface/http-utility.js.map +1 -0
  216. package/dist/surface/mcp-server.d.ts +91 -0
  217. package/dist/surface/mcp-server.js +280 -0
  218. package/dist/surface/mcp-server.js.map +1 -0
  219. package/dist/surface/onboarding.d.ts +15 -0
  220. package/dist/surface/onboarding.js +184 -0
  221. package/dist/surface/onboarding.js.map +1 -0
  222. package/dist/surface/pidfile.d.ts +79 -0
  223. package/dist/surface/pidfile.js +263 -0
  224. package/dist/surface/pidfile.js.map +1 -0
  225. package/dist/surface/readiness.d.ts +45 -0
  226. package/dist/surface/readiness.js +230 -0
  227. package/dist/surface/readiness.js.map +1 -0
  228. package/dist/surface/report.d.ts +68 -0
  229. package/dist/surface/report.js +341 -0
  230. package/dist/surface/report.js.map +1 -0
  231. package/dist/surface/skill-register.d.ts +14 -0
  232. package/dist/surface/skill-register.js +150 -0
  233. package/dist/surface/skill-register.js.map +1 -0
  234. package/dist/surface/version.d.ts +6 -0
  235. package/dist/surface/version.js +27 -0
  236. package/dist/surface/version.js.map +1 -0
  237. package/dist/tools/a11y.d.ts +8 -0
  238. package/dist/tools/a11y.js +545 -0
  239. package/dist/tools/a11y.js.map +1 -0
  240. package/dist/tools/a11y_depth.d.ts +19 -0
  241. package/dist/tools/a11y_depth.js +455 -0
  242. package/dist/tools/a11y_depth.js.map +1 -0
  243. package/dist/tools/agent.d.ts +15 -0
  244. package/dist/tools/agent.js +248 -0
  245. package/dist/tools/agent.js.map +1 -0
  246. package/dist/tools/batch.d.ts +46 -0
  247. package/dist/tools/batch.js +230 -0
  248. package/dist/tools/batch.js.map +1 -0
  249. package/dist/tools/cdp.d.ts +8 -0
  250. package/dist/tools/cdp.js +233 -0
  251. package/dist/tools/cdp.js.map +1 -0
  252. package/dist/tools/compact.d.ts +63 -0
  253. package/dist/tools/compact.js +418 -0
  254. package/dist/tools/compact.js.map +1 -0
  255. package/dist/tools/cost-class.d.ts +38 -0
  256. package/dist/tools/cost-class.js +117 -0
  257. package/dist/tools/cost-class.js.map +1 -0
  258. package/dist/tools/desktop.d.ts +9 -0
  259. package/dist/tools/desktop.js +346 -0
  260. package/dist/tools/desktop.js.map +1 -0
  261. package/dist/tools/electron_bridge.d.ts +41 -0
  262. package/dist/tools/electron_bridge.js +261 -0
  263. package/dist/tools/electron_bridge.js.map +1 -0
  264. package/dist/tools/extras.d.ts +22 -0
  265. package/dist/tools/extras.js +942 -0
  266. package/dist/tools/extras.js.map +1 -0
  267. package/dist/tools/favorites.d.ts +13 -0
  268. package/dist/tools/favorites.js +137 -0
  269. package/dist/tools/favorites.js.map +1 -0
  270. package/dist/tools/introspection.d.ts +13 -0
  271. package/dist/tools/introspection.js +55 -0
  272. package/dist/tools/introspection.js.map +1 -0
  273. package/dist/tools/ocr.d.ts +8 -0
  274. package/dist/tools/ocr.js +66 -0
  275. package/dist/tools/ocr.js.map +1 -0
  276. package/dist/tools/orchestration.d.ts +7 -0
  277. package/dist/tools/orchestration.js +377 -0
  278. package/dist/tools/orchestration.js.map +1 -0
  279. package/dist/tools/playbooks/extract-compose.d.ts +22 -0
  280. package/dist/tools/playbooks/extract-compose.js +85 -0
  281. package/dist/tools/playbooks/extract-compose.js.map +1 -0
  282. package/dist/tools/playbooks/find-replace.d.ts +11 -0
  283. package/dist/tools/playbooks/find-replace.js +56 -0
  284. package/dist/tools/playbooks/find-replace.js.map +1 -0
  285. package/dist/tools/playbooks/index.d.ts +63 -0
  286. package/dist/tools/playbooks/index.js +70 -0
  287. package/dist/tools/playbooks/index.js.map +1 -0
  288. package/dist/tools/playbooks/keys-blocklist.d.ts +24 -0
  289. package/dist/tools/playbooks/keys-blocklist.js +89 -0
  290. package/dist/tools/playbooks/keys-blocklist.js.map +1 -0
  291. package/dist/tools/registry.d.ts +40 -0
  292. package/dist/tools/registry.js +560 -0
  293. package/dist/tools/registry.js.map +1 -0
  294. package/dist/tools/safety-gate.d.ts +16 -0
  295. package/dist/tools/safety-gate.js +70 -0
  296. package/dist/tools/safety-gate.js.map +1 -0
  297. package/dist/tools/scheduler.d.ts +76 -0
  298. package/dist/tools/scheduler.js +413 -0
  299. package/dist/tools/scheduler.js.map +1 -0
  300. package/dist/tools/shortcuts.d.ts +13 -0
  301. package/dist/tools/shortcuts.js +205 -0
  302. package/dist/tools/shortcuts.js.map +1 -0
  303. package/dist/tools/smart.d.ts +15 -0
  304. package/dist/tools/smart.js +785 -0
  305. package/dist/tools/smart.js.map +1 -0
  306. package/dist/tools/types.d.ts +174 -0
  307. package/dist/tools/types.js +67 -0
  308. package/dist/tools/types.js.map +1 -0
  309. package/dist/tools/window-text.d.ts +15 -0
  310. package/dist/tools/window-text.js +39 -0
  311. package/dist/tools/window-text.js.map +1 -0
  312. package/dist/types.d.ts +122 -0
  313. package/dist/types.js +41 -0
  314. package/dist/types.js.map +1 -0
  315. package/native/Package.swift +38 -0
  316. package/native/README.md +113 -0
  317. package/native/Sources/ClawdCursorHelper/main.swift +602 -0
  318. package/native/Sources/ClawdCursorHost/main.swift +182 -0
  319. package/native/Sources/PermissionCheck/main.swift +53 -0
  320. package/native/Sources/ScreenshotHelper/main.swift +219 -0
  321. package/native/build.sh +139 -0
  322. package/native/entitlements.plist +12 -0
  323. package/package.json +115 -0
  324. package/scripts/banner.ps1 +112 -0
  325. package/scripts/coord-accuracy.ps1 +140 -0
  326. package/scripts/coord-uwp.ps1 +80 -0
  327. package/scripts/edge-glow.ps1 +180 -0
  328. package/scripts/find-element.ps1 +198 -0
  329. package/scripts/get-foreground-window.ps1 +71 -0
  330. package/scripts/get-screen-context.ps1 +183 -0
  331. package/scripts/get-windows.ps1 +66 -0
  332. package/scripts/install-panic-hotkey.ps1 +46 -0
  333. package/scripts/interact-element.ps1 +431 -0
  334. package/scripts/invoke-element.ps1 +314 -0
  335. package/scripts/linux/atspi-bridge.py +356 -0
  336. package/scripts/linux/ocr-recognize.py +154 -0
  337. package/scripts/mac/_window-picker.jxa +163 -0
  338. package/scripts/mac/find-element.jxa +0 -0
  339. package/scripts/mac/find-element.sh +161 -0
  340. package/scripts/mac/focus-window.jxa +284 -0
  341. package/scripts/mac/get-focused-element.jxa +102 -0
  342. package/scripts/mac/get-foreground-window.jxa +173 -0
  343. package/scripts/mac/get-screen-context.jxa +197 -0
  344. package/scripts/mac/get-ui-tree.sh +141 -0
  345. package/scripts/mac/get-windows.jxa +117 -0
  346. package/scripts/mac/interact-element.sh +235 -0
  347. package/scripts/mac/invoke-element.jxa +408 -0
  348. package/scripts/mac/ocr-recognize.swift +124 -0
  349. package/scripts/ocr-recognize.ps1 +102 -0
  350. package/scripts/postinstall-native.js +48 -0
  351. package/scripts/ps-bridge.ps1 +830 -0
  352. package/scripts/smoke-mcp.ps1 +119 -0
  353. package/scripts/sync-version.ts +178 -0
  354. package/scripts/verify-install.js +81 -0
@@ -0,0 +1,1023 @@
1
+ "use strict";
2
+ /**
3
+ * Unified agent loop — replaces text-agent + vision-agent with ONE harness.
4
+ *
5
+ * Design summary:
6
+ * • ONE tool vocabulary built by `tools.ts` (mode-parameterized).
7
+ * • Perception: accessibility snapshot EVERY turn (cheap, structured).
8
+ * Screenshots only when:
9
+ * (a) mode === 'vision' (turn 1 seed), or
10
+ * (b) the model explicitly calls the `screenshot` tool, or
11
+ * (c) a screen-changing tool ran AND mode is hybrid/vision
12
+ * (so the model sees the result before its next turn).
13
+ * • LLM: native tool_use via callLLMWithTools (Anthropic tool_use / OpenAI
14
+ * tool_calls). JSON-from-prose fallback for providers without native support.
15
+ * • Safety: every tool call runs through `safety.evaluate()` BEFORE its
16
+ * execute() fires. Single chokepoint.
17
+ * • Stagnation: FingerprintHistory tracks screen state; 3 identical
18
+ * fingerprints = force the agent to try something different or give_up.
19
+ * • Streaming logs: tree-shaped turn logs via the observability logger
20
+ * so the user can watch what the agent is thinking/doing in real time.
21
+ *
22
+ * Model-agnostic + OS-agnostic by construction: provider config comes from
23
+ * AgentLlmDeps, I/O goes through PlatformAdapter, zero `process.platform`
24
+ * branching here.
25
+ */
26
+ Object.defineProperty(exports, "__esModule", { value: true });
27
+ exports.runAgent = runAgent;
28
+ const node_crypto_1 = require("node:crypto");
29
+ const fingerprint_1 = require("../sense/fingerprint");
30
+ const snapshot_1 = require("../sense/snapshot");
31
+ const assertions_1 = require("../verify/assertions");
32
+ const ui_map_holder_1 = require("../sense/ui-map-holder");
33
+ const reactive_check_1 = require("../sense/reactive-check");
34
+ const ocr_engine_1 = require("../../platform/ocr-engine");
35
+ const ui_map_1 = require("../sense/ui-map");
36
+ const ui_map_render_1 = require("../sense/ui-map-render");
37
+ const logger_1 = require("../observability/logger");
38
+ const correlation_1 = require("../observability/correlation");
39
+ const safety_1 = require("../safety");
40
+ const client_1 = require("../../llm/client");
41
+ const prompt_1 = require("./prompt");
42
+ const coord_scale_1 = require("./coord-scale");
43
+ const tools_1 = require("./tools");
44
+ const tool_meta_1 = require("./tool-meta");
45
+ // Backstop turn cap. With the runaway guard (repeated identical actions)
46
+ // catching genuine stuck-loops early (stagnation only nudges), max_turns is a
47
+ // safety net, not the primary detector — so it can be generous enough to
48
+ // support long sequential tasks (e.g. a multi-challenge benchmark) that
49
+ // legitimately need 30+ actions. Was 20, which truncated such runs mid-task.
50
+ const DEFAULT_MAX_TURNS = 70;
51
+ /**
52
+ * Number of consecutive identical fingerprints that triggers a stagnation
53
+ * WARNING in the next turn's prompt. Below this we trust the agent to
54
+ * recover on its own (a single side-effect-free tool call like
55
+ * `read_screen` legitimately leaves the fingerprint unchanged).
56
+ */
57
+ const STAGNATION_WINDOW = 3;
58
+ /**
59
+ * Number of consecutive stagnant turns after which the stagnation NUDGE
60
+ * escalates from soft to firm (a stronger, method-switching reminder).
61
+ *
62
+ * This is NOT a task-kill. v1.0.0 removed the pipeline ladder, so there is
63
+ * no rung to "escalate" to — and the stagnation signal is the a11y/OCR
64
+ * fingerprint, which is structurally blind to sparse-a11y form apps (new
65
+ * Outlook / `olk`, web & canvas UIs) where the agent may still be making
66
+ * real progress. Aborting on it killed winnable runs. True stuck-loops are
67
+ * caught by the runaway guard (same tool+args repeated); genuine flailing is
68
+ * capped by max_turns. After a firm nudge the counter re-arms so the
69
+ * reminder recurs in waves rather than every turn.
70
+ */
71
+ const STAGNATION_HARD_LIMIT = 5;
72
+ const MAX_HISTORY_SCREENSHOTS = 2;
73
+ /**
74
+ * After this many consecutive turns of `agent.no_tool_call` (model
75
+ * produced text but no parseable tool call), the run aborts instead of
75
+ * continuing (the pipeline ladder was removed in v1.0.0, so there is no
76
+ * next rung to hand off to). Three is conservative — a single malformed
77
+ * turn from a degenerate model state can usually self-correct with the
78
+ * "retry with a tool call" reprompt, but three in a row means the model
79
+ * is stuck in a loop and further turns are unlikely to help.
81
+ */
82
+ const NO_TOOL_CALL_LIMIT = 3;
83
+ // Lazy OCR singleton for reactiveCheck — mirrors tools.ts getAgentOcr pattern.
84
+ let _reactiveOcr = null;
85
+ function reactiveOcr() { return (_reactiveOcr ??= new ocr_engine_1.OcrEngine()); }
86
+ /**
87
+ * Run the unified agent against a task.
88
+ *
89
+ * The function is a pure orchestrator — no side effects outside the
90
+ * tool calls themselves. Returns an AgentResult even on failure.
91
+ */
92
+ async function runAgent(input, deps) {
93
+ const startedAt = Date.now();
94
+ const maxTurns = input.maxTurns ?? DEFAULT_MAX_TURNS;
95
+ const isAborted = input.isAborted ?? (() => false);
96
+ const correlationId = (0, correlation_1.getCorrelationId)();
97
+ const log = correlationId ? logger_1.logger.with({ correlationId }) : logger_1.logger;
98
+ // Prefer text model; fall back to vision model if text is absent
99
+ // (vision models handle tool_use without images too).
100
+ const llmConfig = deps.llm.text || deps.llm.vision;
101
+ if (!llmConfig) {
102
+ return earlyExit('give_up', 'No model configured. Run `clawdcursor doctor` to set AI_TEXT_MODEL.', startedAt);
103
+ }
104
+ // Session-scoped UIMap holder (Part 2). Created per-call if not provided.
105
+ const holder = deps.uiMaps ?? new ui_map_holder_1.UIMapHolder();
106
+ // Set up perception state.
107
+ const fph = new fingerprint_1.FingerprintHistory(8);
108
+ const screenshotsCaptured = { n: 0 };
109
+ // Pixel-level change evidence. The a11y fingerprint is structurally blind
110
+ // to sparse-a11y apps (new Outlook / `olk`, web & canvas UIs) — it can sit
111
+ // flat for 30+ turns while the screen demonstrably advances. Screenshot
112
+ // bytes are ground truth: when the model captures one and it differs from
113
+ // the previous capture, the screen moved, whatever the fingerprint says.
114
+ let lastShotDigest = null;
115
+ let lastPixelMoveTurn = 0;
116
+ // Cache screen size once — used for scroll center coordinates.
117
+ let screen;
118
+ try {
119
+ screen = await deps.adapter.getScreenSize();
120
+ }
121
+ catch {
122
+ screen = { logicalWidth: 1920, logicalHeight: 1080, physicalWidth: 1920, physicalHeight: 1080, dpiRatio: 1 };
123
+ }
124
+ // Build the flat tool catalog. Mode and capability params are no longer
125
+ // accepted — the full catalog is served regardless of mode.
126
+ const tools = (0, tools_1.buildUnifiedTools)();
127
+ const toolMap = new Map(tools.map(t => [t.name, t]));
128
+ const llmTools = toUnifiedLLMTools(tools);
129
+ const systemPrompt = (0, prompt_1.buildSystemPrompt)();
130
+ // Seed the conversation.
131
+ const history = [];
132
+ const steps = [];
133
+ let llmCalls = 0;
134
+ let activeApp;
135
+ /**
136
+ * Counts consecutive turns where stagnation fired (a11y fingerprint flat
137
+ * after a screen-changing action). Reset to 0 when the fingerprint moves —
138
+ * or after a firm nudge at `STAGNATION_HARD_LIMIT` (the nudge re-arms in
139
+ * waves; it does NOT abort the task). The runaway guard + max_turns are the
140
+ * terminators.
141
+ */
142
+ // P1 verification integrity: task-level flag — set once any screen-changing
143
+ // tool actually moved the screen. The `done` gate requires machine-checkable
144
+ // evidence (and rejects non-discriminating evidence) when this is true.
145
+ let taskMutated = false;
146
+ let consecutiveStagnantTurns = 0;
147
+ /**
148
+ * Counts consecutive turns where the model produced no tool call.
149
+ * Reset to 0 whenever the model successfully emits a tool call. When
150
+ * this hits `NO_TOOL_CALL_LIMIT` the rung aborts with `'give_up'` so
151
+ * the pipeline ladder can climb to the next strategy. Without this,
152
+ * a Kimi/Moonshot model that fell into degenerate generation (loop
153
+ * of repeated tokens, hits max_tokens with no parseable tool call)
154
+ * just kept producing more garbage every turn for 5 minutes until the
155
+ * task-level timeout fired — 12 wasted turns, ~$0.03 wasted, 0
156
+ * actions taken. Real trace: Outlook subtask 3 ("type recipient")
157
+ * after focus_element failed legitimately on turn 1, the model
158
+ * emitted `functions.read_screen:1ORTYMQAQBAA…(1024 tokens of
159
+ * garbage)` for 11 turns straight.
160
+ */
161
+ let consecutiveNoToolCallTurns = 0;
162
+ // Cross-turn anchor continuity for compileUIMap. Hoisted above the turn-1
163
+ // block so storeUIMap can be called there and update prevAnchors.
164
+ let prevAnchors = undefined;
165
+ // P1: cheap baseline snapshot at task start (window list + clipboard, NO
166
+ // extra OCR — the strong discriminating signals are window titles, clipboard,
167
+ // file mtime; an ambient clock changes regardless so baseline OCR buys little
168
+ // and would cost a screen capture every task). The `done` gate uses this to
169
+ // reject completion evidence that was ALREADY true before the task acted.
170
+ // Best-effort — never blocks the run.
171
+ const taskBaseline = await (0, assertions_1.captureTaskBaseline)(deps.adapter).catch(() => undefined);
172
+ // Turn-1 perception — compiled UIMap (el_NN) so the agent acts on the same
173
+ // vocabulary from its very first decision and el_NN refs resolve immediately.
174
+ // Falls back to a plain a11y snapshot text if compilation fails.
175
+ try {
176
+ const firstSnapshot = await (0, snapshot_1.captureSnapshot)(deps.adapter);
177
+ activeApp = firstSnapshot.activeWindow?.processName;
178
+ fph.push(firstSnapshot.fingerprint);
179
+ // Turn-1 perception = the compiled UI map (el_NN), so the agent acts on the
180
+ // same vocabulary from its very first decision. storeUIMap stores it in the
181
+ // holder, so turn-1 el_NN refs resolve.
182
+ let firstUiRender;
183
+ try {
184
+ const ui0 = await storeUIMap(holder, firstSnapshot, deps.adapter, prevAnchors);
185
+ prevAnchors = ui0.anchors;
186
+ firstUiRender = ui0.render;
187
+ }
188
+ catch {
189
+ firstUiRender = (0, prompt_1.renderSnapshot)(firstSnapshot, { screenWidth: screen.physicalWidth, screenHeight: screen.physicalHeight, focusProcessId: firstSnapshot.activeWindow?.processId });
190
+ }
191
+ // DPI/scale header — tells the model how screenshot pixels map to tool coords.
192
+ const imgScaleNum = screen.physicalWidth > coord_scale_1.LLM_TARGET_WIDTH
193
+ ? screen.physicalWidth / coord_scale_1.LLM_TARGET_WIDTH
194
+ : 1;
195
+ const ssScale = imgScaleNum.toFixed(2);
196
+ const dpiNote = `\nDISPLAY: ${screen.physicalWidth}×${screen.physicalHeight} physical, screenshot ${coord_scale_1.LLM_TARGET_WIDTH}px wide (×${ssScale} to screen).`;
197
+ log.info('agent.coordinate_space', {
198
+ physical: `${screen.physicalWidth}×${screen.physicalHeight}`,
199
+ screenshotScale: ssScale,
200
+ snapshotSpace: 'screen',
201
+ });
202
+ // Anchor the agent to its working window (when the caller resolved one)
203
+ // so it refocuses there instead of thrashing to unrelated apps/tools.
204
+ const windowAnchor = input.targetWindow
205
+ ? `WORKING WINDOW: the "${input.targetWindow.processName}" window ("${input.targetWindow.title}") — perform this task there. If focus drifts to another window, refocus it with focus_window(processName:"${input.targetWindow.processName}") rather than opening new apps, tabs, or tools.\n\n`
206
+ : '';
207
+ const initialBlocks = [
208
+ {
209
+ type: 'text',
210
+ text: `${windowAnchor}TASK: ${input.task}${dpiNote}\n\nCOMPILED UI (act on an element via invoke_element/set_field_value with {element_id, snapshot_id}):\n${(0, prompt_1.wrapUntrustedScreenContent)(firstUiRender)}\n\nPICK ONE TOOL CALL.`,
211
+ },
212
+ ];
213
+ if (input.targetWindow)
214
+ log.info('agent.window_anchor', { window: input.targetWindow.title, process: input.targetWindow.processName });
215
+ history.push({ role: 'user', content: initialBlocks });
216
+ }
217
+ catch (err) {
218
+ const msg = err instanceof Error ? err.message : String(err);
219
+ log.warn('agent.perception.initial.failed', { error: msg });
220
+ return earlyExit('cannot_read', `initial perception failed: ${msg}`, startedAt);
221
+ }
222
+ // ─── Main turn loop ─────────────────────────────────────────
223
+ const outerSpan = (0, logger_1.beginSpan)();
224
+ try {
225
+ for (let turn = 1; turn <= maxTurns; turn++) {
226
+ if (isAborted())
227
+ return finish('aborted', 'aborted by user', steps, llmCalls, screenshotsCaptured.n, startedAt);
228
+ log.info(logger_1.EVENTS.AGENT_TURN_START, { turn, historyTurns: history.length });
229
+ const turnStart = Date.now();
230
+ // Route THIS turn to the vision model when a screenshot is in context —
231
+ // the text model (a11y-first) reads images poorly, and the configured
232
+ // vision model exists for exactly these turns. Text-model cost is kept
233
+ // for a11y turns. General: any vision-needing task.
234
+ //
235
+ // imageInContext is ALSO the coordinate-space signal: raw click/drag
236
+ // coords default to image-space only while a screenshot is actually in
237
+ // the model's context. Keying that default on "the vision model is
238
+ // active" conflated model choice with coordinate provenance — in a
239
+ // vision-only config it scaled a11y/@x,y screen coords from turn 1
240
+ // with no screenshot anywhere (audit 2026-06-10, finding C1). Old
241
+ // screenshots age out of history (see trimOldScreenshots), so neither
242
+ // the image default nor vision routing latches for the rest of the run.
243
+ const imageInContext = historyHasImage(history);
244
+ const activeLlm = (deps.llm.vision && imageInContext) ? deps.llm.vision : llmConfig;
245
+ log.info('agent.turn_model', { turn, model: activeLlm.model, vision: activeLlm === deps.llm.vision });
246
+ // 1. Call the LLM with tools. Retry TRANSIENT failures (overload, rate
247
+ // limit, timeout, 5xx, dropped socket) with exponential backoff — a
248
+ // single API blip must not throw away a long multi-step run (a live
249
+ // 14-challenge run died at turn 45 to one transient error after
250
+ // completing 10 steps). Non-transient errors (bad request, auth) fail
251
+ // fast — retrying them is pointless.
252
+ let llmResult;
253
+ {
254
+ const LLM_MAX_ATTEMPTS = 4;
255
+ let attempt = 0;
256
+ for (;;) {
257
+ attempt += 1;
258
+ try {
259
+ llmResult = await (0, client_1.callLLMWithTools)({
260
+ baseUrl: activeLlm.baseUrl,
261
+ model: activeLlm.model,
262
+ apiKey: activeLlm.apiKey,
263
+ isAnthropic: activeLlm.isAnthropic,
264
+ system: systemPrompt,
265
+ tools: llmTools,
266
+ messages: history,
267
+ maxTokens: activeLlm.maxTokens ?? 1024,
268
+ timeoutMs: 45_000,
269
+ toolChoice: 'auto',
270
+ signal: input.abortSignal,
271
+ });
272
+ llmCalls += 1;
273
+ break;
274
+ }
275
+ catch (err) {
276
+ // User abort (stop command) cancels the in-flight fetch via
277
+ // input.abortSignal — exit cleanly as 'aborted', never as
278
+ // llm_error, and never retry. The timeout signal throws
279
+ // 'TimeoutError', a user abort throws 'AbortError', so the two
280
+ // are distinguishable.
281
+ if (isAborted() || input.abortSignal?.aborted || (err instanceof Error && err.name === 'AbortError')) {
282
+ log.info('agent.aborted', { turn, during: 'llm_call' });
283
+ return finish('aborted', 'aborted by user', steps, llmCalls, screenshotsCaptured.n, startedAt);
284
+ }
285
+ const msg = err instanceof Error ? err.message : String(err);
286
+ const transient = /\b(timeout|timed out|429|rate.?limit|overload|529|50[0-4]|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|socket hang up|network|fetch failed)\b/i.test(msg);
287
+ if (attempt < LLM_MAX_ATTEMPTS && transient) {
288
+ const backoffMs = 800 * 2 ** (attempt - 1); // 0.8s, 1.6s, 3.2s
289
+ log.warn('agent.llm.retry', { turn, attempt, error: truncate(msg, 120), backoffMs });
290
+ await new Promise(r => setTimeout(r, backoffMs));
291
+ continue;
292
+ }
293
+ log.error('agent.llm.failed', { turn, attempt, error: msg });
294
+ return finish('llm_error', `LLM call failed after ${attempt} attempt(s): ${msg}`, steps, llmCalls, screenshotsCaptured.n, startedAt);
295
+ }
296
+ }
297
+ }
298
+ // 2. Log the agent's thinking, if any.
299
+ if (llmResult.text && llmResult.text.trim()) {
300
+ log.info(logger_1.EVENTS.AGENT_THINK, { turn, text: truncate(llmResult.text.trim(), 160) });
301
+ }
302
+ // 3. Record the assistant turn in history so the next turn sees it.
303
+ // SAFETY: when the model hit max_tokens with no parseable tool
304
+ // call, the content is almost certainly degenerate (token-loop
305
+ // garbage). Feeding it back as assistant context just feeds the
306
+ // loop. Replace with a short placeholder in that case.
307
+ const looksDegenerate = llmResult.toolCalls.length === 0
308
+ && llmResult.stopReason === 'length'
309
+ && llmResult.text.length > 200;
310
+ if (looksDegenerate) {
311
+ history.push({
312
+ role: 'assistant',
313
+ content: [{ type: 'text', text: '(previous response exceeded token limit and produced no tool call)' }],
314
+ });
315
+ }
316
+ else {
317
+ history.push({ role: 'assistant', content: llmResult.raw });
318
+ }
319
+ // 4. No tool call → treat as parse failure (re-prompt once).
320
+ if (llmResult.toolCalls.length === 0) {
321
+ consecutiveNoToolCallTurns += 1;
322
+ log.warn('agent.no_tool_call', {
323
+ turn,
324
+ stopReason: llmResult.stopReason,
325
+ text: truncate(llmResult.text, 200),
326
+ consecutive: consecutiveNoToolCallTurns,
327
+ degenerate: looksDegenerate,
328
+ });
329
+ // Hard abort if the model has produced no tool call N turns in
330
+ // a row — it's stuck in a degenerate state and won't recover.
331
+ // Exit with 'give_up' so the pipeline ladder climbs to the
332
+ // next rung (blind → hybrid → vision), which uses a different
333
+ // model / prompt shape and is likely to escape the loop.
334
+ if (consecutiveNoToolCallTurns >= NO_TOOL_CALL_LIMIT) {
335
+ log.error('agent.no_tool_call.runaway_abort', {
336
+ turn,
337
+ consecutive: consecutiveNoToolCallTurns,
338
+ hardLimit: NO_TOOL_CALL_LIMIT,
339
+ });
340
+ return finish('give_up', `Model produced no parseable tool call for ${consecutiveNoToolCallTurns} consecutive turns (last stopReason="${llmResult.stopReason}"). Likely degenerate generation — aborting rung so the pipeline ladder can escalate.`, steps, llmCalls, screenshotsCaptured.n, startedAt);
341
+ }
342
+ history.push({
343
+ role: 'user',
344
+ content: [{ type: 'text', text: 'You must call exactly one tool per turn. Try again with a tool call.' }],
345
+ });
346
+ steps.push({
347
+ turn,
348
+ toolName: '(no-tool)',
349
+ toolArgs: {},
350
+ result: { success: false, text: llmResult.text.slice(0, 200) || '(empty response)' },
351
+ durationMs: Date.now() - turnStart,
352
+ fingerprintChanged: false,
353
+ thought: llmResult.text,
354
+ });
355
+ continue;
356
+ }
357
+ // Successful tool-call emission resets the runaway counter.
358
+ consecutiveNoToolCallTurns = 0;
359
+ // 5. Process every tool call the model emitted this turn. Most
360
+ // models return exactly one; if more, we process them in order
361
+ // and all results flow back on the next turn.
362
+ const toolResults = [];
363
+ let terminal = null;
364
+ // Tracks whether ANY tool in this turn was supposed to change the
365
+ // screen. Pure-compute tools (build_uri, wait, list_windows,
366
+ // read_screen, etc.) don't move the fingerprint by design, so they
367
+ // must NOT count as stagnant turns. Without this, the agent's last
368
+ // turn before dispatching a mailto URI (build_uri -> open_uri) was
369
+ // killed by the stagnation hard-abort because build_uri is
370
+ // changesScreen=false. The agent had the right plan and got cut off
371
+ // one step before execution.
372
+ let anyScreenChangingTool = false;
373
+ for (const call of llmResult.toolCalls) {
374
+ if (isAborted())
375
+ return finish('aborted', 'aborted by user', steps, llmCalls, screenshotsCaptured.n, startedAt);
376
+ const tool = toolMap.get(call.name);
377
+ if (!tool) {
378
+ log.warn('agent.unknown_tool', { turn, tool: call.name });
379
+ toolResults.push({
380
+ id: call.id,
381
+ text: `Unknown tool "${call.name}". Available: ${tools.map(t => t.name).join(', ')}`,
382
+ isError: true,
383
+ });
384
+ steps.push({
385
+ turn,
386
+ toolName: call.name,
387
+ toolArgs: call.args,
388
+ result: { success: false, text: 'unknown tool' },
389
+ durationMs: Date.now() - turnStart,
390
+ fingerprintChanged: false,
391
+ thought: llmResult.text,
392
+ });
393
+ continue;
394
+ }
395
+ let targetLabel = typeof call.args.name === 'string' ? call.args.name
396
+ : typeof call.args.target === 'string' ? call.args.target
397
+ : undefined;
398
+ // el_NN ref clicks carry no name/target — look the element's name up from
399
+ // the holder's CURRENT map so the safety gate sees a real label (correct
400
+ // label-pattern rule + intent-match bypass) instead of the blunt "no target
401
+ // label" confirm. No safety weakening: same gate, more info; a stale/unknown
402
+ // snapshot → no label → blunt confirm still fires (safe default). The action
403
+ // path's own resolveRef still applies the full window guard at execute time.
404
+ if (!targetLabel && call.name === 'invoke_element'
405
+ && typeof call.args.element_id === 'string' && typeof call.args.snapshot_id === 'string') {
406
+ const res = holder.resolve(call.args.snapshot_id, Date.now());
407
+ if (res.ok) {
408
+ const el = res.map.elements.find(e => e.id === call.args.element_id);
409
+ if (el)
410
+ targetLabel = el.text ?? el.normalized_text ?? undefined;
411
+ }
412
+ }
413
+ // 5a. Safety gate — single chokepoint. Pass through the user's task
414
+ // text so the layer can detect intent-matched bypasses (when the user
415
+ // explicitly asked for a destructive action, the confirm tier is
416
+ // skipped — the agent isn't hallucinating a Send click out of nowhere,
417
+ // the user typed "hit send").
418
+ const decision = (0, safety_1.evaluate)({
419
+ tool: call.name,
420
+ args: call.args,
421
+ targetLabel,
422
+ activeApp,
423
+ userTaskText: input.task,
424
+ });
425
+ if (!(0, safety_1.isAllowed)(decision)) {
426
+ const reason = decision.decision === 'block'
427
+ ? decision.reason
428
+ : decision.decision === 'confirm'
429
+ ? `${decision.reason} — headless run: no human to confirm. DO NOT retry the same click. Name the target instead: find_action_button(intent:"...") then invoke_element({element_id, snapshot_id}), or invoke_element(name:"<label>"). If the user's task explicitly asked for this action, restate that intent.`
430
+ : `requires ${decision.decision}: ${decision.tier}`;
431
+ log.info('agent.tool.blocked', { turn, tool: call.name, decision: decision.decision, reason });
432
+ toolResults.push({
433
+ id: call.id,
434
+ text: `[${decision.decision}] ${reason}`,
435
+ isError: true,
436
+ });
437
+ steps.push({
438
+ turn,
439
+ toolName: call.name,
440
+ toolArgs: call.args,
441
+ result: { success: false, text: `safety_${decision.decision}: ${'reason' in decision ? decision.reason : decision.tier}` },
442
+ durationMs: Date.now() - turnStart,
443
+ fingerprintChanged: false,
444
+ thought: llmResult.text,
445
+ });
446
+ continue;
447
+ }
448
+ // 5a' v0.8.3 RUNAWAY GUARD. If the agent has issued the SAME
449
+ // tool+args combination more than REPEAT_THRESHOLD times in the
450
+ // last REPEAT_WINDOW turns, force-exit with `give_up`. This is the
451
+ // fix for the "Outlook keeps opening" class of bug — when the
452
+ // agent can't see the result of its own action (sparse WebView2
453
+ // a11y, for example) it sometimes re-issues the same action every
454
+ // turn. Platform-level idempotency on open_app already prevents
455
+ // duplicate Outlook windows; this guard protects against the same
456
+ // anti-pattern generalized to every tool.
457
+ const REPEAT_THRESHOLD = 3;
458
+ const REPEAT_WINDOW = 6;
459
+ const argKey = JSON.stringify(call.args ?? {});
460
+ const recentRepeats = steps
461
+ .slice(-REPEAT_WINDOW)
462
+ .filter(s => s.toolName === call.name && JSON.stringify(s.toolArgs ?? {}) === argKey)
463
+ .length;
464
+ // Only ACTION tools (changesScreen) can "run away" — re-issuing the
465
+ // same action because the agent can't see its result. Perception tools
466
+ // (screenshot, read_screen, list_windows, wait — all changesScreen:false)
467
+ // are HOW a vision agent sees a canvas that changes every challenge;
468
+ // repeating them is mandatory, not a loop. Counting them aborted a
469
+ // legitimately-progressing vision run mid-exam (live test 2026-05-28).
470
+ //
471
+ // Scroll is also exempt: traversing a long list/panel legitimately
472
+ // repeats the SAME scroll (same x,y,direction,amount) many times — that
473
+ // is forward progress, not a stuck loop. max_turns still caps a truly
474
+ // endless scroll. Observed: scrolling a 60-row list to row 48 tripped
475
+ // the guard after 3 identical scrolls and aborted the run mid-exam.
476
+ const isScroll = call.name === 'scroll'
477
+ || (call.name === 'mouse' && call.args?.action === 'scroll');
478
+ if (tool.changesScreen && !isScroll && recentRepeats >= REPEAT_THRESHOLD) {
479
+ log.warn('agent.runaway_guard', {
480
+ turn, tool: call.name, repeats: recentRepeats, window: REPEAT_WINDOW,
481
+ });
482
+ steps.push({
483
+ turn,
484
+ toolName: call.name,
485
+ toolArgs: call.args,
486
+ result: {
487
+ success: false,
488
+ text: `runaway-guard: ${call.name} called ${recentRepeats} times in last ${REPEAT_WINDOW} turns with same args — aborting to prevent infinite loop`,
489
+ },
490
+ durationMs: Date.now() - turnStart,
491
+ fingerprintChanged: false,
492
+ thought: llmResult.text,
493
+ });
494
+ return finish('give_up', `runaway-guard: repeated ${call.name} with identical args (${recentRepeats}× in last ${REPEAT_WINDOW} turns). The agent is likely unable to see whether the action succeeded — try a different approach or use detect_webview_apps + CDP bridge if the target is an Electron/WebView2 app.`, steps, llmCalls, screenshotsCaptured.n, startedAt);
495
+ }
496
+ // 5a''. cannot_read soft-guard. cannot_read is meant for genuinely
497
+ // unreadable screens (CAPTCHA, blank canvas, OCR garbage). Some models
498
+ // — especially safety-trained text models on irreversible actions like
499
+ // "Send" — try to use it as a "can I have a moment to think" pause AFTER
500
+ // they already located an interactive target. That stalls the pipeline
501
+ // for no good reason. If a perception/locator tool succeeded with REAL
502
+ // CONTENT in the last few turns, refuse cannot_read and tell the model
503
+ // to act on what it already found. Pattern-based; doesn't care which
504
+ // model is asking.
505
+ //
506
+ // v0.9.0: tightened to check for actual content, not just "success".
507
+ // A read_screen that returned "(empty a11y tree — app may be
508
+ // custom-canvas)" is technically successful but has no content for the
509
+ // model to act on — don't block cannot_read in that case.
510
+ if (call.name === 'cannot_read') {
511
+ const LOOKBACK = 4;
512
+ // Resolvers split into two tiers:
513
+ // STRONG: action-y tools whose success means the agent actually
514
+ // resolved a specific target (invoke_element, set_field_value,
515
+ // focus_window). Pure success = real resolution.
516
+ // WEAK: perception tools (read_screen, screenshot, a11y_snapshot,
517
+ // list_windows) where success can be returned with empty content.
518
+ // For those we ALSO require the result text to look non-empty.
519
+ const STRONG_RESOLVERS = new Set([
520
+ 'wait_for_element', 'find_element', 'invoke_element', 'set_field_value',
521
+ 'focus_window',
522
+ ]);
523
+ const WEAK_RESOLVERS = new Set([
524
+ 'read_screen', 'a11y_snapshot', 'screenshot', 'list_windows',
525
+ ]);
526
+ const EMPTY_TREE_HINTS = /empty a11y tree|app may be custom-canvas|\(empty\)|\(no elements found\)|no elements/i;
527
+ const recentReal = steps.slice(-LOOKBACK).some(s => {
528
+ if (!s.result.success)
529
+ return false;
530
+ if (STRONG_RESOLVERS.has(s.toolName))
531
+ return true;
532
+ if (WEAK_RESOLVERS.has(s.toolName)) {
533
+ const txt = s.result.text ?? '';
534
+ if (!txt || txt.length < 60)
535
+ return false;
536
+ if (EMPTY_TREE_HINTS.test(txt))
537
+ return false;
538
+ return true;
539
+ }
540
+ return false;
541
+ });
542
+ if (recentReal) {
543
+ log.info('agent.cannot_read.suppressed', {
544
+ turn, reason: 'recent perception or locator returned real content',
545
+ lookback: LOOKBACK,
546
+ });
547
+ toolResults.push({
548
+ id: call.id,
549
+ text: 'cannot_read refused: a recent perception/locator tool succeeded with real content in this run, so the screen IS readable. Act on what you already located (invoke_element / mouse_click / key) instead. cannot_read is for blank/garbled screens only.',
550
+ isError: true,
551
+ });
552
+ steps.push({
553
+ turn,
554
+ toolName: call.name,
555
+ toolArgs: call.args,
556
+ result: { success: false, text: 'cannot_read suppressed (perception just succeeded)' },
557
+ durationMs: Date.now() - turnStart,
558
+ fingerprintChanged: false,
559
+ thought: llmResult.text,
560
+ });
561
+ continue;
562
+ }
563
+ }
564
+ // 5a'''. BLIND-MODE RAW-COORDINATE-CLICK GUARD.
565
+ //
566
+ // Failure mode (BUG-D): in blind mode the LLM sometimes can't locate
567
+ // a target in the a11y snapshot and, instead of emitting `cannot_read`,
568
+ // starts random-clicking at guessed coordinates like click(1280,800).
569
+ // In a live run, this advanced an exam-test UI from the landing screen
570
+ // through several screens — real user-visible state damage — before
571
+ // the verifier even ran. The verifier alone (confidence threshold) is
572
+ // not a sufficient safety net because a more confident model could
573
+ // produce false-positive success.
574
+ //
575
+ // The guard: in blind mode, refuse `click(x, y)` unless an a11y-aware
576
+ // selector tool (invoke_element / set_field_value / focus_element /
577
+ // a11y_select / a11y_toggle / a11y_expand / a11y_collapse /
578
+ // wait_for_element) SUCCEEDED in the last A11Y_RECENCY turns. That
579
+ // tight window covers the legitimate "I just located the element by
580
+ // a11y; coord-click as fallback" pattern while rejecting guesses.
581
+ //
582
+ // 5b. Log and execute.
583
+ log.info(logger_1.EVENTS.AGENT_TOOL_CALL, { turn, tool: call.name, args: compactArgs(call.args), costClass: tool_meta_1.TOOL_META[call.name]?.costClass });
584
+ const toolStart = Date.now();
585
+ const ctx = {
586
+ platform: deps.adapter,
587
+ task: input.task,
588
+ screen,
589
+ screenshotsCaptured,
590
+ activeApp,
591
+ targetWindow: input.targetWindow,
592
+ cdp: deps.cdp ?? null,
593
+ uiMaps: holder,
594
+ coordSpaceDefault: imageInContext ? 'image' : 'screen',
595
+ // P1 verification integrity — the `done` gate reads these.
596
+ taskStartedAt: startedAt,
597
+ mutatedScreen: taskMutated,
598
+ taskBaseline,
599
+ };
600
+ let result;
601
+ try {
602
+ result = await tool.execute(call.args, ctx);
603
+ }
604
+ catch (err) {
605
+ const msg = err instanceof Error ? err.message : String(err);
606
+ result = { success: false, text: `tool threw: ${msg}` };
607
+ }
608
+ const toolMs = Date.now() - toolStart;
609
+ log.info(logger_1.EVENTS.AGENT_TOOL_RESULT, {
610
+ turn,
611
+ tool: call.name,
612
+ success: result.success,
613
+ ms: toolMs,
614
+ // 200 (was 120) so the click/drag coordinate-space + focus breadcrumb
615
+ // survives — that line is what makes wrong-window clicks diagnosable.
616
+ text: truncate(result.text, 200),
617
+ });
618
+ // 5c. Re-capture perception if the tool changed the screen. We do
619
+ // this AFTER the tool, BEFORE stagnation detection.
620
+ let postSnapshot = null;
621
+ if (tool.changesScreen) {
622
+ try {
623
+ postSnapshot = await (0, snapshot_1.captureSnapshot)(deps.adapter);
624
+ activeApp = postSnapshot.activeWindow?.processName ?? activeApp;
625
+ }
626
+ catch {
627
+ postSnapshot = null;
628
+ }
629
+ }
630
+ const fingerprintChanged = postSnapshot ? fph.getHistory().slice(-1)[0] !== postSnapshot.fingerprint : false;
631
+ if (postSnapshot)
632
+ fph.push(postSnapshot.fingerprint);
633
+ // Invalidate the UIMap holder only when the action actually DID
634
+ // something: the tool reported success, or the fingerprint moved
635
+ // anyway (a failed action that still touched the screen). A rejected
636
+ // action that provably changed nothing must NOT stale the current
637
+ // map — keying on the static changesScreen flag alone meant every
638
+ // ref-rejection re-minted the map and inflated the stagnation
639
+ // counter (audit 2026-06-10, findings A1/M3).
640
+ if (tool.changesScreen && (result.success || fingerprintChanged)) {
641
+ anyScreenChangingTool = true;
642
+ taskMutated = true; // P1: task-level — the `done` gate requires proof
643
+ holder.invalidate();
644
+ }
645
+ // Layer C: reactive step discipline — verify the agent-stated `expect`
646
+ // (HARD → DEVIATION) or apply the tolerant soft net when omitted. Reuses
647
+ // the verify engine + the fingerprintChanged signal already computed.
648
+ const reactive = await (0, reactive_check_1.reactiveCheck)({
649
+ expect: call.args.expect,
650
+ toolText: result.text,
651
+ toolSuccess: result.success,
652
+ changesScreen: tool.changesScreen,
653
+ observedChange: fingerprintChanged,
654
+ adapter: deps.adapter,
655
+ ocrText: async () => (await reactiveOcr().recognizeScreen()).fullText ?? '',
656
+ }).catch(() => null);
657
+ if (reactive) {
658
+ result = { ...result, success: reactive.success, text: reactive.text };
659
+ }
660
+ steps.push({
661
+ turn,
662
+ toolName: call.name,
663
+ toolArgs: call.args,
664
+ result: { success: result.success, text: result.text },
665
+ durationMs: toolMs,
666
+ fingerprintChanged,
667
+ thought: llmResult.text,
668
+ });
669
+ toolResults.push({
670
+ id: call.id,
671
+ text: result.text,
672
+ isError: !result.success,
673
+ screenshot: result.screenshot,
674
+ stop: result.stop,
675
+ terminalExit: result.terminalExit,
676
+ });
677
+ // Terminal action → wrap up after this turn.
678
+ if (result.stop && result.terminalExit) {
679
+ terminal = { exit: result.terminalExit, text: result.text };
680
+ break;
681
+ }
682
+ }
683
+ // 6. Build next-turn user payload: tool_result blocks + fresh
684
+ // perception + (for hybrid/vision) optional screenshot of the
685
+ // post-action state.
686
+ const nextBlocks = [];
687
+ // 6a. tool_result blocks preserve the Anthropic contract and feed
688
+ // OpenAI's `tool` messages when we normalize in llm-client.
689
+ for (const tr of toolResults) {
690
+ const content = [
691
+ { type: 'text', text: tr.text },
692
+ ];
693
+ if (tr.screenshot) {
694
+ content.push(shotToInnerBlock(tr.screenshot));
695
+ }
696
+ nextBlocks.push({ type: 'tool_result', tool_use_id: tr.id, content, is_error: tr.isError });
697
+ }
698
+ // 6b. If any tool changed the screen, append a fresh COMPILED UI map
699
+ // (el_NN) for the next turn — the single per-turn perception (the
700
+ // legacy a11y-snapshot render was unified away into this map).
701
+ const anyChanged = toolResults.some(r => !!r.screenshot) || steps[steps.length - 1]?.fingerprintChanged;
702
+ if (anyChanged || toolResults.length > 0) {
703
+ try {
704
+ const snap = await (0, snapshot_1.captureSnapshot)(deps.adapter);
705
+ activeApp = snap.activeWindow?.processName ?? activeApp;
706
+ nextBlocks.push({
707
+ type: 'text',
708
+ text: `\nRECENT ACTIONS:\n${(0, prompt_1.renderHistory)(steps, 6)}`,
709
+ });
710
+ // §6b UIMap (Part 2): compile + store a UIMap from the already-captured
711
+ // snapshot (no second a11y read). Skip on terminal turns — the loop
712
+ // exits right after, so a re-put would un-invalidate the holder and mask
713
+ // the changesScreen invalidation from the prior action turn.
714
+ if (terminal === null) {
715
+ try {
716
+ // Only mint a FRESH perception map when the screen actually changed
717
+ // (or there is no fresh current map). Otherwise reuse the current
718
+ // map — e.g. a finder/compile_ui established one this turn — so its
719
+ // snapshot_id stays current and el_NN refs resolve on the NEXT turn
720
+ // (the realistic find-this-turn / act-next-turn flow). Reusing also
721
+ // avoids a redundant recompile when nothing changed.
722
+ const curId = holder.currentId();
723
+ const currentFresh = curId !== undefined && holder.resolve(curId, Date.now()).ok === true;
724
+ let uiId;
725
+ let uiRender;
726
+ if (anyScreenChangingTool || !currentFresh) {
727
+ const ui = await storeUIMap(holder, snap, deps.adapter, prevAnchors);
728
+ prevAnchors = ui.anchors;
729
+ uiId = ui.id;
730
+ uiRender = ui.render;
731
+ }
732
+ else {
733
+ const cur = holder.current(); // currentFresh implies it exists
734
+ uiId = cur.snapshot_id;
735
+ uiRender = (0, ui_map_render_1.renderUIMap)(cur);
736
+ // Re-advertising this map to the model — restart its TTL clock
737
+ // so the ref survives the upcoming LLM round-trip (the clock
738
+ // otherwise still runs from the original mid-turn compile).
739
+ holder.touch(uiId, Date.now());
740
+ }
741
+ nextBlocks.push({
742
+ type: 'text',
743
+ text: `\nCOMPILED UI (act on an element via invoke_element/set_field_value with {element_id, snapshot_id="${uiId}"}):\n${(0, prompt_1.wrapUntrustedScreenContent)(uiRender)}`,
744
+ });
745
+ }
746
+ catch {
747
+ // UIMap compilation failure is non-fatal — the agent still has the a11y snapshot.
748
+ }
749
+ // NOTE: deliberately NO invalidate here. The map stored above was
750
+ // compiled from the POST-action snapshot — it is the freshest
751
+ // truth available, and its snapshot_id is exactly what the text
752
+ // block above invites the model to act on next turn. Invalidating
753
+ // it made every advertised el_NN ref dead on arrival (audit
754
+ // 2026-06-10, finding A1). The pre-action staleness hazard is
755
+ // already covered by the 5c invalidation that ran before this map
756
+ // was compiled.
757
+ }
758
+ }
759
+ catch {
760
+ nextBlocks.push({
761
+ type: 'text',
762
+ text: '\n(perception refresh failed — rely on tool results above)',
763
+ });
764
+ }
765
+ }
766
+ // 6c. Stagnation check — two-stage:
767
+ //
768
+ // Stage 1 (warn): the last STAGNATION_WINDOW (3) fingerprints are
769
+ // identical. Tell the agent to change approach — most of the time
770
+ // a single nudge is enough and we trust it to recover.
771
+ //
772
+ // Stage 2 (abort): stagnation has fired for STAGNATION_HARD_LIMIT
773
+ // consecutive turns. The agent is stuck — abort the rung with
774
+ // `exit: 'stagnation'` so the pipeline ladder climbs to hybrid
775
+ // or vision. Without this, the agent kept tying actions to a
776
+ // stale screen until max_turns and then fabricated `done()`
777
+ // evidence ("the email should have been sent...").
778
+ //
779
+ // The counter is reset to 0 every turn the fingerprint moves, so
780
+ // legitimate stagnant patches (slow window opening, transient a11y
781
+ // hiccup) don't trip the hard limit.
782
+ // Stagnation is only meaningful for turns where the agent *tried* to
783
+ // change the screen. Pure-compute tools (build_uri, wait, list_windows,
784
+ // read_screen, screenshot, ...) legitimately leave the fingerprint
785
+ // unchanged and must not be counted as stale. The previous behavior
786
+ // killed the Outlook send-email run mid-plan: the agent had called
787
+ // build_uri to construct a mailto URI and was one turn away from
788
+ // dispatching it via open_uri when the stagnation hard-abort fired.
789
+ // Pixel evidence overrides the a11y fingerprint. Live run 2026-06-06:
790
+ // an Outlook compose in `olk` (sparse, near-static a11y tree) warned
791
+ // "stagnation" on EVERY turn 7–37 while the screen demonstrably
792
+ // advanced — and the firm nudge ("switch to a FUNDAMENTALLY different
793
+ // method") drove the model to abandon the desktop app for a browser.
794
+ // Any screenshot whose bytes differ from the previous capture proves
795
+ // the screen moved; treat that as fresh progress for a full window.
796
+ for (const tr of toolResults) {
797
+ if (!tr.screenshot?.buffer?.length)
798
+ continue;
799
+ const digest = (0, node_crypto_1.createHash)('sha1').update(tr.screenshot.buffer).digest('hex');
800
+ if (lastShotDigest !== null && digest !== lastShotDigest)
801
+ lastPixelMoveTurn = turn;
802
+ lastShotDigest = digest;
803
+ }
804
+ const recentPixelMove = lastPixelMoveTurn > 0 && turn - lastPixelMoveTurn < STAGNATION_WINDOW;
805
+ const stagnant = fph.isStagnant(STAGNATION_WINDOW) && !recentPixelMove;
806
+ // In the hybrid loop the agent perceives via both a11y and screenshots.
807
+ // The a11y fingerprint can stay constant while the screen advances (canvas,
808
+ // browser WebView). Only count stagnation when the agent tried a screen-
809
+ // changing action but the fingerprint stayed the same. The runaway guard
810
+ // and max_turns are the primary backstops for non-stagnation scenarios.
811
+ if (stagnant && anyScreenChangingTool) {
812
+ consecutiveStagnantTurns += 1;
813
+ }
814
+ else if (!stagnant) {
815
+ consecutiveStagnantTurns = 0;
816
+ }
817
+ // else: neutral turn (compute-only tool) — leave the counter alone.
818
+ // Stagnation in the thin loop is a NUDGE, never a task-kill. The old
819
+ // code here returned exit:'stagnation' to force the pipeline ladder to
820
+ // climb to a hybrid/vision rung — but v1.0.0 removed the ladder, so the
821
+ // abort just killed the task. Worse, the fingerprint is a11y/OCR
822
+ // STRUCTURE only (see fingerprint.ts) — it cannot see a sparse-a11y form
823
+ // app advancing (new Outlook / `olk`, web & canvas UIs). That false
824
+ // signal aborted the Outlook send-email run at turn 33 while it was
825
+ // genuinely progressing (focusing To, typing the recipient). Real stuck-
826
+ // loops (same tool+args repeated) are already caught by the runaway guard
827
+ // above; genuine flailing is capped by max_turns. So here we only
828
+ // ESCALATE the nudge — and steer toward the methods that work when the
829
+ // a11y tree is blind: keyboard-only navigation and focus verification.
830
+ // Warn only on turns where the agent actually TRIED to change the
831
+ // screen. Pure observation/compute turns (screenshot, read_text,
832
+ // list_windows) legitimately leave the fingerprint flat — re-injecting
833
+ // the warning there just spams the prompt with a persistent "you're
834
+ // stuck" signal (it rode along on every screenshot()-only turn in the
835
+ // live Outlook run).
836
+ if (stagnant && anyScreenChangingTool) {
837
+ const firm = consecutiveStagnantTurns >= STAGNATION_HARD_LIMIT;
838
+ log.warn(logger_1.EVENTS.AGENT_STAGNATION, {
839
+ turn,
840
+ window: STAGNATION_WINDOW,
841
+ consecutiveStagnantTurns,
842
+ fingerprint: fph.getHistory().slice(-1)[0],
843
+ ...(firm ? { firm: true } : {}),
844
+ });
845
+ nextBlocks.push({
846
+ type: 'text',
847
+ text: firm
848
+ ? `\n⚠ STAGNATION (${consecutiveStagnantTurns} turns, no accessibility change). The screen may still be advancing — this app likely has a sparse a11y tree (new Outlook, web/canvas UIs). STOP repeating the last action. Switch APPROACH WITHIN this app: prefer a keyboard-only flow (open a fresh compose, the recipient field is focused — type, Return to commit the chip, Tab to the next field), or find_input_field/find_action_button to get an el_NN target, or call focus_window to confirm the right window is active, or give_up with a concrete reason. Do NOT open the web version of this app or switch to another app.`
849
+ : `\n⚠ STAGNATION (${consecutiveStagnantTurns}/${STAGNATION_HARD_LIMIT}): the last ${STAGNATION_WINDOW} actions did not change the accessibility tree. Try a DIFFERENT approach (keyboard shortcut, Tab between fields, different target, focus_window to check the active window) — or, if the screen really is changing, verify with a screenshot. give_up if you're truly stuck.`,
850
+ });
851
+ // Re-arm after a firm nudge so it recurs in waves (not every turn) and a
852
+ // later genuine change cleanly resets the cadence. max_turns + the
853
+ // runaway guard remain the actual terminators.
854
+ if (firm)
855
+ consecutiveStagnantTurns = 0;
856
+ }
857
+ history.push({ role: 'user', content: nextBlocks });
858
+ // 7. Trim old screenshots to stay under the token budget.
859
+ trimOldScreenshots(history, MAX_HISTORY_SCREENSHOTS);
860
+ const turnMs = Date.now() - turnStart;
861
+ log.info(logger_1.EVENTS.AGENT_TURN_END, {
862
+ turn,
863
+ ms: turnMs,
864
+ tools: toolResults.length,
865
+ changed: !!anyChanged,
866
+ });
867
+ if (terminal) {
868
+ return finish(terminal.exit, terminal.text, steps, llmCalls, screenshotsCaptured.n, startedAt);
869
+ }
870
+ }
871
+ }
872
+ finally {
873
+ outerSpan.end();
874
+ }
875
+ return finish('max_turns', `hit max turns (${maxTurns}) without a terminal action`, steps, llmCalls, screenshotsCaptured.n, startedAt);
876
+ }
877
+ // ─── Helpers ────────────────────────────────────────────────────────
878
/**
 * Report whether any message in the model's context carries an image block
 * (a screenshot). Such turns must go to the vision model, not the text model.
 *
 * Two placements are recognised:
 *   - a block sitting directly in a message's `content` array, and
 *   - a block nested inside a `tool_result` block's own `content` array
 *     (the form the screenshot tool produces).
 *
 * Both the `image` and `image_url` block types count in EITHER placement.
 * Earlier the nested check accepted only `image`, so an `image_url` block
 * wrapped in a tool_result was silently missed while the same block at the
 * top level was detected.
 *
 * @param {Array<{content: unknown}>} history Conversation messages. A message
 *   whose `content` is not an array (e.g. a plain string) is skipped.
 * @returns {boolean} true as soon as one image block is found, else false.
 */
function historyHasImage(history) {
    // Single definition of "is an image block", shared by both placements so
    // the two checks cannot drift apart again.
    const isImageBlock = (block) => Boolean(block)
        && typeof block === 'object'
        && (block.type === 'image' || block.type === 'image_url');
    for (const message of history) {
        const { content } = message;
        if (!Array.isArray(content))
            continue;
        for (const block of content) {
            // Direct image in the message body.
            if (isImageBlock(block))
                return true;
            // Image carried inside a tool_result payload.
            const isToolResult = Boolean(block)
                && typeof block === 'object'
                && block.type === 'tool_result'
                && Array.isArray(block.content);
            if (isToolResult && block.content.some(isImageBlock))
                return true;
        }
    }
    return false;
}
906
/**
 * Compiles a UIMap from a snapshot the caller has already captured and
 * records it in `holder`.
 *
 * The snapshot is handed back to the compiler as-is, so no second
 * accessibility read happens here. OCR is stubbed with an empty result and
 * the vision hook throws if it is ever invoked. Called in §6b so the agent
 * sees el_NN ids on the next turn.
 *
 * @param holder - Store exposing `nextId()` and `put(map, timestamp, cost)`.
 * @param snap - The already-captured snapshot to compile from.
 * @param adapter - Supplies `getScreenSize()` and `getFocusedElement()`.
 * @param prevAnchors - Anchors from the previous map, passed through to the compiler.
 * @returns {Promise<{render: *, anchors: *, id: *}>} The rendered map, the
 *   new map's anchors, and the snapshot id that was allocated for it.
 */
async function storeUIMap(holder, snap, adapter, prevAnchors) {
    const now = Date.now();
    const id = holder.nextId();
    const perception = {
        // Reuse the caller's snapshot instead of reading the a11y tree again.
        captureSnapshot: async () => snap,
        // Loop perception is a11y-only: OCR yields nothing.
        ocr: async () => ({ elements: [], fullText: '', durationMs: 0 }),
        vision: async () => { throw new Error('no vision in loop perception'); },
        getScreenSize: () => adapter.getScreenSize(),
        getFocusedElement: () => adapter.getFocusedElement(),
        prevAnchors,
        now,
        snapshotId: id,
    };
    // 'cheap' restricts compilation to window + a11y sources (per the original note).
    const budget = { max_cost: 'cheap' };
    const map = await (0, ui_map_1.compileUIMap)(perception, budget);
    holder.put(map, now, 'cheap');
    const render = (0, ui_map_render_1.renderUIMap)(map);
    return { render, anchors: map.anchors, id };
}
926
/**
 * Projects tool definitions down to the three fields kept here:
 * `name`, `description`, and `inputSchema`. Any other properties on the
 * input objects are dropped; the input array is not modified.
 *
 * @param {Array<{name: *, description: *, inputSchema: *}>} tools - Tool definitions.
 * @returns {Array<{name: *, description: *, inputSchema: *}>} New array of trimmed objects.
 */
function toUnifiedLLMTools(tools) {
    const pickUnifiedFields = ({ name, description, inputSchema }) => ({
        name,
        description,
        inputSchema,
    });
    return tools.map((tool) => pickUnifiedFields(tool));
}
933
/**
 * Builds a log-friendly SHALLOW copy of an argument object.
 *
 * Top-level string values longer than 60 characters are cut to their first
 * 57 characters plus an ellipsis. Every other value, including nested
 * objects and the strings inside them, is copied by reference unchanged.
 *
 * @param {Object|null|undefined} args - Arguments to compact.
 * @returns {Object} A new object; empty when `args` is null or undefined.
 */
function compactArgs(args) {
    // Object.entries throws on null/undefined; a helper that only feeds
    // logs should not crash its caller over missing arguments.
    if (args == null) {
        return {};
    }
    const out = {};
    for (const [key, value] of Object.entries(args)) {
        const isLongString = typeof value === 'string' && value.length > 60;
        out[key] = isLongString ? `${value.slice(0, 57)}…` : value;
    }
    return out;
}
944
/**
 * Shortens `s` to at most `max` UTF-16 code units, marking the cut with a
 * trailing ellipsis that counts toward the limit.
 *
 * @param {string} s - Text to shorten.
 * @param {number} max - Maximum length of the result.
 * @returns {string} `s` unchanged when it already fits; otherwise the first
 *   `max - 1` code units followed by '…'; the empty string when `max` is
 *   below 1, since not even the ellipsis fits.
 */
function truncate(s, max) {
    if (!(s.length > max)) {
        return s;
    }
    // With max < 1 the slice end would be negative and count from the END
    // of the string, yielding a result longer than max.
    if (max < 1) {
        return '';
    }
    return `${s.slice(0, max - 1)}…`;
}
947
/**
 * Wraps a captured screenshot as an `image` content block whose source is
 * the base64 encoding of `shot.buffer`, labelled as `image/png`.
 *
 * @param {{buffer: Buffer}} shot - Screenshot holding the raw image bytes.
 * @returns {{type: string, source: {type: string, media_type: string, data: string}}}
 */
function shotToInnerBlock(shot) {
    const data = shot.buffer.toString('base64');
    const source = { type: 'base64', media_type: 'image/png', data };
    return { type: 'image', source };
}
953
/**
 * How long a screenshot stays in context, measured in HISTORY MESSAGES
 * (each loop turn appends ~2: assistant + user). 6 ≈ 3 turns. After that
 * the image is replaced with a placeholder, so (a) the model stops
 * reasoning over stale pixels, (b) vision-model routing and the
 * image-space coordinate default decay back to text/screen instead of
 * latching for the rest of the run (audit 2026-06-10, finding C1), and
 * (c) the run stops paying vision pricing on image-free turns.
 */
const MAX_SCREENSHOT_AGE_MESSAGES = 6;
/**
 * Replaces screenshot blocks in `history` with text placeholders, in place.
 *
 * A message keeps its images only when BOTH hold:
 *   - it lies within the last MAX_SCREENSHOT_AGE_MESSAGES messages, and
 *   - it is among the `keepLast` most recent image-bearing messages in
 *     that window.
 * Every other image-bearing message has its top-level `image` /
 * `image_url` blocks, and `image` blocks nested in `tool_result` content,
 * swapped for placeholder text blocks.
 *
 * @param {Array<{content: unknown}>} history - Conversation messages; mutated.
 * @param {number} keepLast - Maximum number of image-bearing messages to
 *   keep. 0 (or a negative number) keeps none. A value that is not a number
 *   imposes no count limit, so only the age limit applies.
 * @returns {void}
 */
function trimOldScreenshots(history, keepLast) {
    // Guard against null / non-object entries, as historyHasImage does,
    // so a stray entry cannot throw on `.type`.
    const isObject = (value) => value !== null && typeof value === 'object';
    const isTopLevelImage = (block) => isObject(block) && (block.type === 'image' || block.type === 'image_url');
    const isToolResultList = (block) => isObject(block) && block.type === 'tool_result' && Array.isArray(block.content);
    const isNestedImage = (inner) => isObject(inner) && inner.type === 'image';
    const carriesImage = (block) => isTopLevelImage(block)
        || (isToolResultList(block) && block.content.some((inner) => isNestedImage(inner)));

    const imageTurnIndices = [];
    history.forEach((turn, index) => {
        if (Array.isArray(turn.content) && turn.content.some((block) => carriesImage(block))) {
            imageTurnIndices.push(index);
        }
    });
    if (imageTurnIndices.length === 0) {
        return;
    }

    // `slice(-0)` is `slice(0)` and would keep EVERYTHING, so a zero limit
    // must be handled explicitly rather than passed through as a negative index.
    const requested = Number(keepLast);
    const limit = Number.isNaN(requested) ? Infinity : Math.max(0, Math.trunc(requested));
    const cutoff = history.length - MAX_SCREENSHOT_AGE_MESSAGES;
    const recent = imageTurnIndices.filter((index) => index >= cutoff);
    const keep = new Set(limit > 0 ? recent.slice(-limit) : []);

    for (const index of imageTurnIndices) {
        if (keep.has(index)) {
            continue;
        }
        const turn = history[index];
        turn.content = turn.content.map((block) => {
            if (isTopLevelImage(block)) {
                return { type: 'text', text: '[earlier screenshot removed to save tokens]' };
            }
            if (isToolResultList(block)) {
                // The tool_result block itself is kept; only its images are swapped out.
                block.content = block.content.map((inner) => (isNestedImage(inner)
                    ? { type: 'text', text: '[earlier tool screenshot removed]' }
                    : inner));
            }
            return block;
        });
    }
}
1001
/**
 * Assembles the result object for a run that reached the agent loop.
 *
 * @param {string} exit - Exit reason; only 'done' counts as success.
 * @param {string} text - Final message for the caller.
 * @param {Array} steps - Steps recorded during the run (passed through by reference).
 * @param {number} llmCalls - Number of model calls made.
 * @param {number} screenshotsCaptured - Number of screenshots taken.
 * @param {number} startedAt - Start timestamp in epoch milliseconds.
 * @returns {{success: boolean, exit: string, text: string, steps: Array,
 *   llmCalls: number, screenshotsCaptured: number, durationMs: number}}
 */
function finish(exit, text, steps, llmCalls, screenshotsCaptured, startedAt) {
    const success = exit === 'done';
    const durationMs = Date.now() - startedAt;
    return { success, exit, text, steps, llmCalls, screenshotsCaptured, durationMs };
}
1012
/**
 * Assembles the result object for a run that ended before doing any work:
 * no steps, no model calls, no screenshots.
 *
 * @param {string} exit - Exit reason; only 'done' counts as success.
 * @param {string} text - Final message for the caller.
 * @param {number} startedAt - Start timestamp in epoch milliseconds.
 * @returns {{success: boolean, exit: string, text: string, steps: Array,
 *   llmCalls: number, screenshotsCaptured: number, durationMs: number}}
 */
function earlyExit(exit, text, startedAt) {
    const success = exit === 'done';
    const durationMs = Date.now() - startedAt;
    return {
        success,
        exit,
        text,
        steps: [],
        llmCalls: 0,
        screenshotsCaptured: 0,
        durationMs,
    };
}
1023
+ //# sourceMappingURL=agent.js.map