autokap 1.0.7 → 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278)
  1. package/assets/cursors/macos.svg +4 -0
  2. package/assets/cursors/windows.svg +15 -0
  3. package/assets/skill/OPCODE-REFERENCE.md +607 -0
  4. package/assets/skill/README.md +39 -0
  5. package/assets/skill/SKILL.md +453 -468
  6. package/assets/skill/STUDIO-SKILL.md +476 -0
  7. package/assets/skill/references/examples.md +104 -0
  8. package/assets/skill/references/interactive-demo.md +225 -0
  9. package/assets/skill/references/mock-data.md +178 -0
  10. package/dist/action-verifier.d.ts +29 -0
  11. package/dist/action-verifier.js +133 -0
  12. package/dist/agent-action-recovery.d.ts +45 -0
  13. package/dist/agent-action-recovery.js +370 -0
  14. package/dist/agent-message-utils.d.ts +21 -0
  15. package/dist/agent-message-utils.js +77 -0
  16. package/dist/agent-url-utils.d.ts +30 -0
  17. package/dist/agent-url-utils.js +138 -0
  18. package/dist/agent.d.ts +92 -8
  19. package/dist/agent.js +2936 -781
  20. package/dist/ak-tree.d.ts +39 -0
  21. package/dist/ak-tree.js +368 -0
  22. package/dist/alt-text.d.ts +26 -0
  23. package/dist/alt-text.js +55 -0
  24. package/dist/auth-capture.d.ts +17 -0
  25. package/dist/auth-capture.js +164 -0
  26. package/dist/benchmark.d.ts +59 -0
  27. package/dist/benchmark.js +135 -0
  28. package/dist/browser-bar.d.ts +14 -6
  29. package/dist/browser-bar.js +145 -8
  30. package/dist/browser-pool.d.ts +7 -0
  31. package/dist/browser-pool.js +15 -5
  32. package/dist/browser-utils.d.ts +31 -0
  33. package/dist/browser-utils.js +97 -0
  34. package/dist/browser.d.ts +51 -1
  35. package/dist/browser.js +1481 -31
  36. package/dist/capture-alt-text.js +2 -1
  37. package/dist/capture-language-preflight.js +14 -0
  38. package/dist/capture-llm-page-identity.js +22 -10
  39. package/dist/capture-page-identity.d.ts +5 -7
  40. package/dist/capture-page-identity.js +211 -78
  41. package/dist/capture-preset-credentials.d.ts +50 -0
  42. package/dist/capture-preset-credentials.js +127 -0
  43. package/dist/capture-request-plan.d.ts +2 -2
  44. package/dist/capture-request-plan.js +64 -16
  45. package/dist/capture-run-optimizer.js +48 -33
  46. package/dist/capture-selector-memory.d.ts +5 -0
  47. package/dist/capture-selector-memory.js +18 -0
  48. package/dist/capture-strategy.d.ts +36 -0
  49. package/dist/capture-strategy.js +95 -0
  50. package/dist/capture-studio-sync.d.ts +1 -0
  51. package/dist/capture-studio-sync.js +9 -3
  52. package/dist/capture-surface-contract.d.ts +36 -0
  53. package/dist/capture-surface-contract.js +299 -0
  54. package/dist/capture-transition-engine.d.ts +28 -0
  55. package/dist/capture-transition-engine.js +292 -0
  56. package/dist/capture-variant-state.d.ts +2 -0
  57. package/dist/capture-variant-state.js +26 -0
  58. package/dist/capture-verification.d.ts +35 -0
  59. package/dist/capture-verification.js +95 -0
  60. package/dist/capture-viewport-lock.d.ts +48 -0
  61. package/dist/capture-viewport-lock.js +74 -0
  62. package/dist/circuit-breaker.d.ts +42 -0
  63. package/dist/circuit-breaker.js +119 -0
  64. package/dist/cli-config.d.ts +8 -1
  65. package/dist/cli-config.js +62 -6
  66. package/dist/cli-contract.d.ts +15 -0
  67. package/dist/cli-contract.js +167 -0
  68. package/dist/cli-runner-local.d.ts +12 -0
  69. package/dist/cli-runner-local.js +102 -0
  70. package/dist/cli-runner.d.ts +34 -0
  71. package/dist/cli-runner.js +433 -0
  72. package/dist/cli-utils.d.ts +0 -1
  73. package/dist/cli-utils.js +2 -5
  74. package/dist/cli.js +1005 -267
  75. package/dist/clip-orchestrator.js +9 -2
  76. package/dist/clip-postprocess.js +25 -16
  77. package/dist/cookie-dismiss.d.ts +2 -0
  78. package/dist/cookie-dismiss.js +48 -13
  79. package/dist/cost-logging.d.ts +8 -0
  80. package/dist/cost-logging.js +160 -46
  81. package/dist/cost-resolution-monitor.d.ts +16 -0
  82. package/dist/cost-resolution-monitor.js +34 -0
  83. package/dist/credential-templates.js +2 -2
  84. package/dist/cursor-overlay-script.d.ts +6 -0
  85. package/dist/cursor-overlay-script.js +169 -0
  86. package/dist/dom-css-purger.d.ts +65 -0
  87. package/dist/dom-css-purger.js +333 -0
  88. package/dist/dom-font-inliner.d.ts +45 -0
  89. package/dist/dom-font-inliner.js +148 -0
  90. package/dist/dom-patch-resolver.d.ts +52 -0
  91. package/dist/dom-patch-resolver.js +242 -0
  92. package/dist/dom-serializer.d.ts +82 -0
  93. package/dist/dom-serializer.js +378 -0
  94. package/dist/element-capture.d.ts +1 -41
  95. package/dist/element-capture.js +202 -446
  96. package/dist/env-validation.d.ts +5 -0
  97. package/dist/env-validation.js +29 -0
  98. package/dist/execution-schema.d.ts +4423 -0
  99. package/dist/execution-schema.js +507 -0
  100. package/dist/execution-types.d.ts +886 -0
  101. package/dist/execution-types.js +65 -0
  102. package/dist/fonts-loader.d.ts +14 -0
  103. package/dist/fonts-loader.js +55 -0
  104. package/dist/hybrid-navigator.js +12 -12
  105. package/dist/index.d.ts +9 -6
  106. package/dist/index.js +10 -4
  107. package/dist/legacy/agent-action-recovery.d.ts +45 -0
  108. package/dist/legacy/agent-action-recovery.js +370 -0
  109. package/dist/legacy/agent-message-utils.d.ts +21 -0
  110. package/dist/legacy/agent-message-utils.js +77 -0
  111. package/dist/legacy/agent-url-utils.d.ts +30 -0
  112. package/dist/legacy/agent-url-utils.js +138 -0
  113. package/dist/legacy/agent.d.ts +226 -0
  114. package/dist/legacy/agent.js +6666 -0
  115. package/dist/legacy/clip-orchestrator.d.ts +148 -0
  116. package/dist/legacy/clip-orchestrator.js +957 -0
  117. package/dist/legacy/credential-templates.d.ts +5 -0
  118. package/dist/legacy/credential-templates.js +60 -0
  119. package/dist/legacy/hybrid-navigator.d.ts +138 -0
  120. package/dist/legacy/hybrid-navigator.js +468 -0
  121. package/dist/legacy/llm-usage.d.ts +17 -0
  122. package/dist/legacy/llm-usage.js +45 -0
  123. package/dist/legacy/prompt-cache.d.ts +10 -0
  124. package/dist/legacy/prompt-cache.js +24 -0
  125. package/dist/legacy/prompts.d.ts +175 -0
  126. package/dist/legacy/prompts.js +1038 -0
  127. package/dist/legacy/tools.d.ts +4 -0
  128. package/dist/legacy/tools.js +216 -0
  129. package/dist/legacy/video-agent.d.ts +143 -0
  130. package/dist/legacy/video-agent.js +4788 -0
  131. package/dist/legacy/video-observation.d.ts +36 -0
  132. package/dist/legacy/video-observation.js +192 -0
  133. package/dist/legacy/video-planner.d.ts +12 -0
  134. package/dist/legacy/video-planner.js +501 -0
  135. package/dist/legacy/video-prompts.d.ts +37 -0
  136. package/dist/legacy/video-prompts.js +569 -0
  137. package/dist/legacy/video-tools.d.ts +3 -0
  138. package/dist/legacy/video-tools.js +59 -0
  139. package/dist/legacy/video-variant-state.d.ts +29 -0
  140. package/dist/legacy/video-variant-state.js +80 -0
  141. package/dist/legacy/vision-model.d.ts +17 -0
  142. package/dist/legacy/vision-model.js +74 -0
  143. package/dist/llm-healer.d.ts +63 -0
  144. package/dist/llm-healer.js +166 -0
  145. package/dist/llm-provider.d.ts +29 -0
  146. package/dist/llm-provider.js +80 -0
  147. package/dist/logger.d.ts +6 -2
  148. package/dist/logger.js +15 -1
  149. package/dist/mockup-html.js +35 -25
  150. package/dist/mockup.d.ts +95 -2
  151. package/dist/mockup.js +427 -166
  152. package/dist/mouse-animation.d.ts +2 -2
  153. package/dist/mouse-animation.js +34 -20
  154. package/dist/opcode-actions.d.ts +42 -0
  155. package/dist/opcode-actions.js +511 -0
  156. package/dist/opcode-runner.d.ts +51 -0
  157. package/dist/opcode-runner.js +770 -0
  158. package/dist/openrouter-client.d.ts +40 -0
  159. package/dist/openrouter-client.js +16 -0
  160. package/dist/overlay-engine.d.ts +24 -0
  161. package/dist/overlay-engine.js +176 -0
  162. package/dist/postcondition.d.ts +16 -0
  163. package/dist/postcondition.js +269 -0
  164. package/dist/program-patcher.d.ts +25 -0
  165. package/dist/program-patcher.js +44 -0
  166. package/dist/prompts.d.ts +13 -5
  167. package/dist/prompts.js +224 -351
  168. package/dist/provider-config.d.ts +12 -0
  169. package/dist/provider-config.js +15 -0
  170. package/dist/recovery-chain.d.ts +37 -0
  171. package/dist/recovery-chain.js +350 -0
  172. package/dist/remote-browser.d.ts +28 -4
  173. package/dist/remote-browser.js +60 -5
  174. package/dist/safari-browser-bar.d.ts +15 -0
  175. package/dist/safari-browser-bar.js +95 -0
  176. package/dist/safari-toolbar-asset.d.ts +15 -0
  177. package/dist/safari-toolbar-asset.js +12 -0
  178. package/dist/security.d.ts +2 -1
  179. package/dist/security.js +49 -10
  180. package/dist/selector-resolver.d.ts +34 -0
  181. package/dist/selector-resolver.js +181 -0
  182. package/dist/semantic-resolver.d.ts +35 -0
  183. package/dist/semantic-resolver.js +161 -0
  184. package/dist/server-capture-runtime.d.ts +5 -3
  185. package/dist/server-capture-runtime.js +42 -95
  186. package/dist/server-credit-usage.d.ts +2 -2
  187. package/dist/server-project-webhooks.d.ts +15 -1
  188. package/dist/server-project-webhooks.js +34 -8
  189. package/dist/server-screenshot-watermark.js +27 -5
  190. package/dist/session-profile.js +164 -1
  191. package/dist/sf-pro-symbols.d.ts +1 -0
  192. package/dist/sf-pro-symbols.js +55 -0
  193. package/dist/skill-packaging.d.ts +28 -0
  194. package/dist/skill-packaging.js +169 -0
  195. package/dist/smart-wait.d.ts +27 -0
  196. package/dist/smart-wait.js +81 -0
  197. package/dist/status-bar-render.d.ts +20 -0
  198. package/dist/status-bar-render.js +410 -0
  199. package/dist/status-bar.d.ts +9 -0
  200. package/dist/status-bar.js +298 -14
  201. package/dist/svg-browser-bar.d.ts +33 -0
  202. package/dist/svg-browser-bar.js +206 -0
  203. package/dist/svg-status-bar.d.ts +36 -0
  204. package/dist/svg-status-bar.js +597 -0
  205. package/dist/svg-text.d.ts +61 -0
  206. package/dist/svg-text.js +118 -0
  207. package/dist/tools.js +89 -451
  208. package/dist/types.d.ts +240 -5
  209. package/dist/types.js +23 -1
  210. package/dist/v2/action-verifier.d.ts +29 -0
  211. package/dist/v2/action-verifier.js +133 -0
  212. package/dist/v2/alt-text.d.ts +26 -0
  213. package/dist/v2/alt-text.js +55 -0
  214. package/dist/v2/benchmark.d.ts +59 -0
  215. package/dist/v2/benchmark.js +135 -0
  216. package/dist/v2/capture-strategy.d.ts +30 -0
  217. package/dist/v2/capture-strategy.js +67 -0
  218. package/dist/v2/capture-verification.d.ts +35 -0
  219. package/dist/v2/capture-verification.js +95 -0
  220. package/dist/v2/circuit-breaker.d.ts +42 -0
  221. package/dist/v2/circuit-breaker.js +119 -0
  222. package/dist/v2/cli-runner-local.d.ts +11 -0
  223. package/dist/v2/cli-runner-local.js +91 -0
  224. package/dist/v2/cli-runner.d.ts +34 -0
  225. package/dist/v2/cli-runner.js +300 -0
  226. package/dist/v2/compiler-prompts.d.ts +27 -0
  227. package/dist/v2/compiler-prompts.js +123 -0
  228. package/dist/v2/compiler.d.ts +37 -0
  229. package/dist/v2/compiler.js +147 -0
  230. package/dist/v2/explorer.d.ts +41 -0
  231. package/dist/v2/explorer.js +56 -0
  232. package/dist/v2/index.d.ts +37 -0
  233. package/dist/v2/index.js +31 -0
  234. package/dist/v2/llm-healer.d.ts +62 -0
  235. package/dist/v2/llm-healer.js +166 -0
  236. package/dist/v2/llm-provider.d.ts +29 -0
  237. package/dist/v2/llm-provider.js +80 -0
  238. package/dist/v2/opcode-runner.d.ts +47 -0
  239. package/dist/v2/opcode-runner.js +634 -0
  240. package/dist/v2/overlay-engine.d.ts +24 -0
  241. package/dist/v2/overlay-engine.js +150 -0
  242. package/dist/v2/postcondition.d.ts +16 -0
  243. package/dist/v2/postcondition.js +249 -0
  244. package/dist/v2/program-patcher.d.ts +25 -0
  245. package/dist/v2/program-patcher.js +44 -0
  246. package/dist/v2/recovery-chain.d.ts +30 -0
  247. package/dist/v2/recovery-chain.js +368 -0
  248. package/dist/v2/schema.d.ts +2580 -0
  249. package/dist/v2/schema.js +295 -0
  250. package/dist/v2/selector-resolver.d.ts +34 -0
  251. package/dist/v2/selector-resolver.js +181 -0
  252. package/dist/v2/semantic-resolver.d.ts +35 -0
  253. package/dist/v2/semantic-resolver.js +161 -0
  254. package/dist/v2/smart-wait.d.ts +27 -0
  255. package/dist/v2/smart-wait.js +81 -0
  256. package/dist/v2/types.d.ts +444 -0
  257. package/dist/v2/types.js +19 -0
  258. package/dist/v2/web-playwright-local.d.ts +69 -0
  259. package/dist/v2/web-playwright-local.js +392 -0
  260. package/dist/version.d.ts +1 -0
  261. package/dist/version.js +5 -0
  262. package/dist/video-agent.js +18 -13
  263. package/dist/video-planner.js +2 -1
  264. package/dist/video-prompts.js +3 -3
  265. package/dist/web-playwright-local.d.ts +126 -0
  266. package/dist/web-playwright-local.js +819 -0
  267. package/dist/ws-auth.js +4 -1
  268. package/dist/ws-broadcast.d.ts +34 -0
  269. package/dist/ws-broadcast.js +85 -0
  270. package/dist/ws-connection-limits.d.ts +12 -0
  271. package/dist/ws-connection-limits.js +44 -0
  272. package/dist/ws-handler-utils.d.ts +32 -0
  273. package/dist/ws-handler-utils.js +139 -0
  274. package/dist/ws-handler.js +294 -164
  275. package/dist/ws-metrics-server.d.ts +9 -0
  276. package/dist/ws-metrics-server.js +31 -0
  277. package/dist/ws-server.js +41 -1
  278. package/package.json +51 -34
@@ -5,47 +5,15 @@ import { logger } from './logger.js';
5
5
  import { getPostHog, DISTINCT_ID } from './posthog.js';
6
6
  import { isAbortError, throwIfAborted } from './abort.js';
7
7
  import { callVisionCapableModel } from './vision-model.js';
8
+ import { zdrParam } from './provider-config.js';
8
9
  const MAX_ELEMENT_ITERATIONS = 8;
9
10
  const ELEMENT_CAPTURE_TEMPERATURE = 0;
10
- function isTransientSearchSelector(selector) {
11
- return /\[data-ak-(search|container)-index=/.test(selector);
12
- }
13
11
  export function isLooseElementCaptureRejectionReason(reason) {
14
12
  if (!reason)
15
13
  return false;
16
14
  return /(too loose|too broad|tighten the frame|tighten the crop|significant unrelated surrounding content|large amount of unrelated page|minimal surrounding context|surrounding context|adjacent elements|wrong framing)/i
17
15
  .test(reason);
18
16
  }
19
- export function isTagOnlyStructuralSelector(selector) {
20
- const normalized = selector.trim();
21
- if (!normalized)
22
- return false;
23
- const usesStructuralPath = /[>+~]/.test(normalized)
24
- || /:(first|last|nth)-(child|of-type)/i.test(normalized);
25
- if (!usesStructuralPath)
26
- return false;
27
- const hasStableAnchor = /[#[]/.test(normalized)
28
- || /\.[A-Za-z_][A-Za-z0-9_-]*/.test(normalized)
29
- || /:has\(/i.test(normalized);
30
- if (hasStableAnchor)
31
- return false;
32
- const segments = normalized
33
- .split(/\s*[>+~]\s*/)
34
- .map((segment) => segment.trim())
35
- .filter(Boolean);
36
- if (segments.length === 0)
37
- return false;
38
- return segments.every((segment) => /^[a-z][a-z0-9-]*(?::(first|last|nth)-(child|of-type)(\([^)]+\))?)?$/i
39
- .test(segment));
40
- }
41
- export function shouldBlockUngroundedStructuralSelector(params) {
42
- if (!params.verifierRejectedAsTooLoose)
43
- return false;
44
- if (!isTagOnlyStructuralSelector(params.selector))
45
- return false;
46
- const grounded = new Set(params.groundedSelectors);
47
- return !grounded.has(params.selector);
48
- }
49
17
  export function outscaleAddsPadding(outscale) {
50
18
  if (!outscale)
51
19
  return false;
@@ -72,65 +40,6 @@ function buildTightOutscale(outscale) {
72
40
  export function buildVerificationOutscale(outscale) {
73
41
  return buildTightOutscale(outscale ?? {});
74
42
  }
75
- function getOrCreateSelectorEvidence(selectorEvidence, selector) {
76
- const existing = selectorEvidence.get(selector);
77
- if (existing)
78
- return existing;
79
- const created = {
80
- observedAsInteractive: false,
81
- directQueries: new Set(),
82
- containerQueries: new Set(),
83
- };
84
- selectorEvidence.set(selector, created);
85
- return created;
86
- }
87
- export function shouldAcceptDomCorroboratedSelector(params) {
88
- if (!params.verifierRejectedAsTooLoose)
89
- return false;
90
- if (params.looseFailureCount < 2)
91
- return false;
92
- if (!params.validation.boundingBox)
93
- return false;
94
- if (!params.observedAsInteractive)
95
- return false;
96
- const { width, height } = params.validation.boundingBox;
97
- if (width <= 0 || height <= 0)
98
- return false;
99
- if (params.viewport) {
100
- const bboxArea = width * height;
101
- const viewportArea = params.viewport.width * params.viewport.height;
102
- if (viewportArea > 0 && bboxArea > viewportArea * 0.25) {
103
- return false;
104
- }
105
- }
106
- return params.containerQueryCount >= 1 || params.directQueryCount >= 2;
107
- }
108
- export function computeElementCaptureDomSignature(params) {
109
- const sample = params.interactiveElements
110
- .slice(0, 40)
111
- .map((el) => [
112
- el.index,
113
- el.tag,
114
- el.role,
115
- (el.text || '').slice(0, 32).replace(/\s+/g, ' '),
116
- el.selector,
117
- el.visibilityState,
118
- ].join(':'))
119
- .join('|');
120
- return `${params.currentUrl}#${params.interactiveElements.length}#${sample}`;
121
- }
122
- export function shouldAllowSearchRefresh(params) {
123
- if (!params.cached)
124
- return true;
125
- if (params.cached.domSignature !== params.domSignature)
126
- return true;
127
- if (params.lastFailedTransientSelector
128
- && params.cached.selectors.includes(params.lastFailedTransientSelector)
129
- && params.cached.hasTransientSelectors) {
130
- return true;
131
- }
132
- return false;
133
- }
134
43
  const elementVerificationTools = [
135
44
  {
136
45
  type: 'function',
@@ -150,6 +59,24 @@ const elementVerificationTools = [
150
59
  },
151
60
  },
152
61
  },
62
+ {
63
+ type: 'function',
64
+ function: {
65
+ name: 'accept_with_note',
66
+ description: 'Approve the capture with a minor note. Use when the correct element IS shown and fully visible, but the framing includes some extra surrounding context that does not materially affect capture quality.',
67
+ parameters: {
68
+ type: 'object',
69
+ properties: {
70
+ reason: {
71
+ type: 'string',
72
+ description: 'Short note about the minor framing issue.',
73
+ },
74
+ },
75
+ required: ['reason'],
76
+ additionalProperties: false,
77
+ },
78
+ },
79
+ },
153
80
  {
154
81
  type: 'function',
155
82
  function: {
@@ -177,7 +104,7 @@ async function verifyElementCapture(client, model, element, screenshot, assessme
177
104
  const messages = [
178
105
  {
179
106
  role: 'system',
180
- content: 'You verify isolated UI element screenshots. The screenshot shows EXACTLY the pixel region captured by the element\'s CSS selector — every pixel visible is INSIDE that element\'s bounding box. Nothing outside the element is included.\n\nApprove if the screenshot shows the correct requested component, fully visible, and not clipped at the edges. Reject only for these reasons:\n1. The element is clipped — a side is cut off by the image edges\n2. The wrong element was captured — the content clearly does not match the description at all\n3. Only a tiny fragment is captured — e.g., just a heading when a full card was requested\n4. An overlay or modal is obscuring the element\n\nDo NOT reject for any of the following — these are correct captures:\n- The screenshot contains what looks like multiple cards, sections, or UI components side by side — they are INTERNAL layout of the captured element (e.g., a preview carousel, a thumbnail grid, embedded mockups)\n- The screenshot shows pricing cards, phone mockups, app screenshots, or promotional content — these are screenshot previews INSIDE the component\n- Content appears "to the left", "to the right", "above", or "below" other content — spatial arrangement is the element\'s own layout, not neighboring page elements\n- The element has a complex internal structure with multiple visual sections\n\nCRITICAL: Since the screenshot IS the element\'s exact bounding box, there is no such thing as "adjacent cards" or "neighboring elements" visible in this image. What looks like a neighboring card is an internal preview thumbnail. What looks like surrounding context is internal structure. Only reject if the image edges clip the component or the wrong component is shown entirely.',
107
+ content: 'You verify isolated UI element screenshots. The screenshot shows EXACTLY the pixel region captured by the element\'s bounding box — every pixel visible is INSIDE that element. Nothing outside the element is included.\n\nYou have THREE tools:\n- **accept_capture** — the correct component is shown, fully visible, well-framed\n- **accept_with_note** — the correct component IS shown and fully visible, but the framing includes some extra surrounding context. Use this instead of rejecting when the target element is clearly present and complete.\n- **retry_capture** — ONLY for serious issues: wrong element entirely, clipped edges, tiny fragment, or overlay obstruction\n\nReject (retry_capture) ONLY for:\n1. The element is clipped — a side is cut off by the image edges\n2. The wrong element was captured — the content clearly does not match the description at all\n3. Only a tiny fragment is captured — e.g., just a heading when a full card was requested\n4. An overlay or modal is obscuring the element\n\nWhen the correct element is visible but framing is slightly loose, use accept_with_note — do NOT reject.\n\nCRITICAL: Since the screenshot IS the element\'s exact bounding box, there is no such thing as "adjacent cards" or "neighboring elements" visible in this image. What looks like a neighboring card is an internal preview thumbnail. What looks like surrounding context is internal structure. Only reject if the image edges clip the component or the wrong component is shown entirely.',
181
108
  },
182
109
  {
183
110
  role: 'user',
@@ -197,9 +124,10 @@ This verification image is the raw element crop before any user-requested outsca
197
124
 
198
125
  The screenshot IS the captured element — every pixel is inside its bounding box.
199
126
 
200
- Approve if: the correct component is shown, fully visible, not clipped at image edges.
127
+ Use accept_capture if: the correct component is shown, fully visible, well-framed.
128
+ Use accept_with_note if: the correct component IS shown and fully visible, but framing is slightly loose with some extra context. This is still an approval.
201
129
 
202
- Reject ONLY if:
130
+ Use retry_capture ONLY if:
203
131
  - a side of the component is clipped by the image boundary
204
132
  - the content clearly does not match the description at all (completely wrong element)
205
133
  - only a tiny fragment is shown when a full component (card, panel, form) was requested
@@ -208,11 +136,14 @@ Reject ONLY if:
208
136
  Do NOT reject because:
209
137
  - the element contains screenshot previews, thumbnails, or mockups inside it
210
138
  - the internal layout has multiple visual sections that look like separate cards
211
- - content appears "above", "beside", or "around" other content within the image — this is the element's own internal layout, not neighboring page elements`,
139
+ - content appears "above", "beside", or "around" other content within the image — this is the element's own internal layout, not neighboring page elements
140
+ - the framing is slightly loose but the correct element is clearly present and complete`,
212
141
  },
213
142
  ],
214
143
  },
215
144
  ];
145
+ let response;
146
+ let usedModel = model;
216
147
  try {
217
148
  const visionResult = await callVisionCapableModel({
218
149
  primaryModel: model,
@@ -222,43 +153,48 @@ Do NOT reject because:
222
153
  messages,
223
154
  tools: elementVerificationTools,
224
155
  tool_choice: 'required',
225
- temperature: ELEMENT_CAPTURE_TEMPERATURE,
156
+ temperature: 0,
226
157
  max_tokens: 256,
227
- provider: { zdr: true },
158
+ provider: { ...zdrParam() },
228
159
  }, { signal }),
160
+ onFallbackActivated: (m, reason) => logger.info(`Element verification vision fallback: ${m} (reason: ${reason})`),
229
161
  });
230
- const response = visionResult.result;
231
- const usage = {
232
- stepNumber,
233
- stepType: 'element_capture',
234
- generationId: response.id ?? null,
235
- modelRequested: model,
236
- modelUsed: response.model ?? null,
237
- promptTokens: response.usage?.prompt_tokens ?? null,
238
- completionTokens: response.usage?.completion_tokens ?? null,
239
- totalTokens: response.usage?.total_tokens ?? null,
240
- imagesInPrompt: 1,
241
- };
242
- const toolCall = response.choices?.[0]?.message?.tool_calls?.[0];
243
- if (!toolCall || !('function' in toolCall)) {
244
- return { verified: false, reason: 'Element verification returned no actionable result', usage };
245
- }
246
- const args = JSON.parse(toolCall.function.arguments || '{}');
247
- if (toolCall.function.name === 'accept_capture') {
248
- logger.success(`Element capture verification passed: ${args.reason || 'approved'}`);
249
- return { verified: true, reason: args.reason, usage };
250
- }
251
- const reason = args.reason || 'Element capture was rejected';
252
- logger.ai(`Element capture verification failed: ${reason}`);
253
- return { verified: false, reason, usage };
162
+ response = visionResult.result;
163
+ usedModel = visionResult.model;
254
164
  }
255
165
  catch (err) {
256
- if (isAbortError(err)) {
166
+ if (isAbortError(err))
257
167
  throw err;
258
- }
259
- logger.error(`Element capture verification failed: ${err.message}`);
260
- return { verified: true, reason: 'Verification unavailable', usage: null };
168
+ logger.error(`Element verification failed: ${err.message}`);
169
+ return { verified: false, reason: 'Verification service unavailable', usage: null };
170
+ }
171
+ const usage = {
172
+ stepNumber,
173
+ stepType: 'element_capture',
174
+ generationId: response.id ?? null,
175
+ modelRequested: model,
176
+ modelUsed: response.model ?? usedModel,
177
+ promptTokens: response.usage?.prompt_tokens ?? null,
178
+ completionTokens: response.usage?.completion_tokens ?? null,
179
+ totalTokens: response.usage?.total_tokens ?? null,
180
+ imagesInPrompt: 1,
181
+ };
182
+ const toolCall = response.choices?.[0]?.message?.tool_calls?.[0];
183
+ if (!toolCall || !('function' in toolCall)) {
184
+ return { verified: false, reason: 'Verification returned no decision', usage };
185
+ }
186
+ const name = toolCall.function.name;
187
+ let args;
188
+ try {
189
+ args = JSON.parse(toolCall.function.arguments);
190
+ }
191
+ catch {
192
+ return { verified: false, reason: 'Verification returned invalid JSON', usage };
261
193
  }
194
+ if (name === 'accept_capture' || name === 'accept_with_note') {
195
+ return { verified: true, reason: args.reason || undefined, usage };
196
+ }
197
+ return { verified: false, reason: args.reason || 'Verifier rejected', usage };
262
198
  }
263
199
  export async function captureIsolatedElement(browser, element, apiKey, model, options = {}) {
264
200
  const { abortSignal, distinctId, fallbackModel, uploadImage } = options;
@@ -274,43 +210,18 @@ export async function captureIsolatedElement(browser, element, apiKey, model, op
274
210
  const actionHistory = [];
275
211
  const usageLog = [];
276
212
  let stepCounter = 0;
277
- // Map from query key → top candidate lines (shown again when a duplicate is blocked)
278
- const usedSearchQueries = new Map();
279
- const selectorEvidence = new Map();
280
- const looseFailureCountsBySelector = new Map();
281
- let lastFailedTransientSelector = null;
282
213
  let lastVerifierRejectedAsTooLoose = false;
283
- // Save original viewport to restore after element capture (agent may resize it)
284
- const originalViewport = browser.currentPage.viewportSize();
285
- const restoreViewport = async () => {
286
- const current = browser.currentPage.viewportSize();
287
- if (originalViewport && current &&
288
- (current.width !== originalViewport.width || current.height !== originalViewport.height)) {
289
- await browser.resizeViewport(originalViewport.width, originalViewport.height);
290
- }
291
- };
214
+ let looseRejectionCount = 0;
292
215
  try {
293
216
  for (let iteration = 1; iteration <= MAX_ELEMENT_ITERATIONS; iteration++) {
294
217
  throwIfAborted(abortSignal, `Element capture cancelled for "${element.name}".`);
295
- // 1. Capture page state (including a screenshot for visual disambiguation)
296
- const [accessibilityTree, interactiveElements, simplifiedDOM, screenshotBuf] = await Promise.all([
297
- browser.getAccessibilityTree(),
298
- browser.getInteractiveElements(),
299
- browser.getSimplifiedDOM(),
300
- browser.takeScreenshotForAI(),
301
- ]);
218
+ // 1. Capture page state with AKTree
219
+ const pageState = await browser.getPageState();
220
+ const screenshotBuf = pageState.cleanScreenshot;
221
+ const serializedAKTree = pageState.serializedAKTree;
302
222
  const screenshotUrl = uploadImage
303
223
  ? await uploadImage(screenshotBuf, 'image/png').catch(() => `data:image/png;base64,${screenshotBuf.toString('base64')}`)
304
224
  : `data:image/png;base64,${screenshotBuf.toString('base64')}`;
305
- const domSignature = computeElementCaptureDomSignature({
306
- currentUrl: browser.currentPage.url(),
307
- interactiveElements,
308
- });
309
- for (const interactiveElement of interactiveElements) {
310
- if (!interactiveElement.selector)
311
- continue;
312
- getOrCreateSelectorEvidence(selectorEvidence, interactiveElement.selector).observedAsInteractive = true;
313
- }
314
225
  // 2. Build messages
315
226
  const messages = [
316
227
  { role: 'system', content: buildElementSystemPrompt(element.description) },
@@ -319,20 +230,22 @@ export async function captureIsolatedElement(browser, element, apiKey, model, op
319
230
  content: buildElementIterationMessage({
320
231
  elementName: element.name,
321
232
  elementDescription: element.description,
322
- accessibilityTree,
323
- interactiveElements,
324
- simplifiedDOM,
233
+ serializedAKTree,
325
234
  currentUrl: browser.currentPage.url(),
326
235
  iteration,
327
236
  maxIterations: MAX_ELEMENT_ITERATIONS,
328
237
  actionHistory: actionHistory.length > 0 ? actionHistory : undefined,
329
238
  viewport: browser.currentPage.viewportSize() ?? undefined,
330
- forbiddenSearchQueries: usedSearchQueries.size > 0 ? [...usedSearchQueries.keys()] : undefined,
239
+ scrollInfo: {
240
+ scrollY: pageState.scrollInfo.scrollY,
241
+ scrollHeight: pageState.scrollInfo.scrollHeight,
242
+ viewportHeight: browser.currentPage.viewportSize()?.height ?? 900,
243
+ },
331
244
  screenshotUrl,
332
245
  }),
333
246
  },
334
247
  ];
335
- // 3. Call LLM (with vision fallback + coercion retry when the model ignores tool_choice: 'required')
248
+ // 3. Call LLM
336
249
  let response;
337
250
  let usedModel = model;
338
251
  let callMessages = messages;
@@ -350,7 +263,7 @@ export async function captureIsolatedElement(browser, element, apiKey, model, op
350
263
  tool_choice: 'required',
351
264
  temperature: ELEMENT_CAPTURE_TEMPERATURE,
352
265
  max_tokens: 1024,
353
- provider: { zdr: true },
266
+ provider: { ...zdrParam() },
354
267
  }, { signal: abortSignal }),
355
268
  onFallbackActivated: (m, reason) => logger.info(`Element capture vision fallback activated: ${m} (reason: ${reason})`),
356
269
  });
@@ -364,7 +277,6 @@ export async function captureIsolatedElement(browser, element, apiKey, model, op
364
277
  if (!assistantContent)
365
278
  break;
366
279
  logger.info(`Model returned text without tool call; coercing (attempt ${coercionAttempt}/${MAX_COERCION_RETRIES})...`);
367
- logger.ai(assistantContent.slice(0, 200));
368
280
  callMessages = [
369
281
  ...callMessages,
370
282
  { role: 'assistant', content: assistantContent },
@@ -387,9 +299,8 @@ export async function captureIsolatedElement(browser, element, apiKey, model, op
387
299
  });
388
300
  }
389
301
  catch (err) {
390
- if (isAbortError(err)) {
302
+ if (isAbortError(err))
391
303
  throw err;
392
- }
393
304
  logger.error(`Element capture API call failed: ${err.message}`);
394
305
  actionHistory.push(`Iteration ${iteration}: API error — ${err.message}`);
395
306
  continue;
@@ -410,336 +321,172 @@ export async function captureIsolatedElement(browser, element, apiKey, model, op
410
321
  }
411
322
  catch {
412
323
  logger.error(`Invalid JSON in element tool arguments: ${toolCall.function.arguments}`);
413
- callMessages = [
414
- ...callMessages,
415
- { role: 'assistant', content: null, tool_calls: [toolCall] },
416
- { role: 'tool', tool_call_id: toolCall.id, content: 'ERROR: Invalid JSON in tool arguments. Please retry with correctly formatted JSON.' },
417
- ];
418
324
  continue;
419
325
  }
420
326
  // 4. Handle tool calls
421
- if (name === 'capture_by_selector') {
422
- const selector = args.selector;
423
- const confidence = args.confidence;
424
- const reasoning = args.reasoning;
327
+ // ── capture(nodeId?) ──
328
+ if (name === 'capture') {
329
+ const nodeId = typeof args.nodeId === 'string' ? args.nodeId : undefined;
425
330
  const baseOutscale = element.outscale ?? { padding: element.padding ?? 0 };
426
- const groundedSelectors = new Set([
427
- ...interactiveElements.map((entry) => entry.selector).filter((entry) => !!entry),
428
- ...Array.from(usedSearchQueries.values()).flatMap((entry) => entry.selectors),
429
- ]);
430
- if (shouldBlockUngroundedStructuralSelector({
431
- selector,
432
- groundedSelectors,
433
- verifierRejectedAsTooLoose: lastVerifierRejectedAsTooLoose,
434
- })) {
435
- const latestSearchEntries = Array.from(usedSearchQueries.values());
436
- const latestSearch = latestSearchEntries[latestSearchEntries.length - 1];
437
- const candidateHint = latestSearch?.candidateLines.length
438
- ? `\nGrounded candidates from search_text (use one of their sel= values directly):\n${latestSearch.candidateLines.join('\n')}`
439
- : '';
440
- logger.info(`Element "${element.name}": blocking ungrounded structural selector "${selector}" after a loose-crop rejection`);
441
- actionHistory.push(`Iteration ${iteration}: capture_by_selector("${selector}") BLOCKED — after a verifier rejection for loose framing, do NOT invent a tag-only DOM path from <page_dom>. Re-run search_text with distinctive in-card text and use a grounded sel= directly, preferring any ↳ container selector.${candidateHint}`);
331
+ if (!nodeId) {
332
+ // Full page capture — unusual for element capture but handle it
333
+ logger.info(`Element "${element.name}": capture() without nodeId — full page`);
334
+ const buffer = await browser.takeScreenshot();
335
+ const viewport = browser.currentPage.viewportSize();
336
+ const { verified, reason: verifyReason, usage } = await verifyElementCapture(client, model, element, buffer, 'Full page capture', ++stepCounter, abortSignal, fallbackModel, uploadImage);
337
+ if (usage)
338
+ usageLog.push(usage);
339
+ if (verified) {
340
+ logger.success(`Element "${element.name}" captured (full page)`);
341
+ return { element, success: true, buffer, assessment: verifyReason || 'Full page', usage: usageLog };
342
+ }
343
+ actionHistory.push(`Iteration ${iteration}: capture() full page rejected ${verifyReason}. Specify a nodeId to crop to the target element.`);
442
344
  continue;
443
345
  }
444
- logger.info(`Element "${element.name}": capture_by_selector("${selector}") (confidence: ${confidence.toFixed(2)})`);
445
- logger.ai(reasoning);
446
- try {
447
- throwIfAborted(abortSignal, `Element capture cancelled for "${element.name}".`);
448
- const verificationOutscale = buildVerificationOutscale(baseOutscale);
449
- const { buffer, validation } = await browser.screenshotBySelector(selector, verificationOutscale);
450
- // Reject captures that are too large (>70% of viewport area) — likely a wrapper, not the target element.
451
- const viewport = browser.currentPage.viewportSize();
452
- if (viewport && validation.boundingBox) {
453
- const bb = validation.boundingBox;
454
- const captureArea = bb.width * bb.height;
455
- const viewportArea = viewport.width * viewport.height;
456
- if (captureArea > viewportArea * 0.7) {
457
- logger.info(`Element "${element.name}": selector "${selector}" covers ${Math.round(captureArea / viewportArea * 100)}% of viewport — too large, rejecting.`);
458
- actionHistory.push(`Iteration ${iteration}: capture_by_selector("${selector}") rejected element covers >70% of viewport, likely a wrapper not the target component. Use a more specific selector.`);
459
- continue;
460
- }
346
+ // Resolve nodeId to bounds
347
+ const entry = await browser.resolveAKNode(nodeId);
348
+ if (!entry) {
349
+ logger.error(`Element "${element.name}": nodeId "${nodeId}" not found in AKTree`);
350
+ actionHistory.push(`Iteration ${iteration}: capture("${nodeId}") — node not found. Re-read the AKTree and use a valid nodeId.`);
351
+ continue;
352
+ }
353
+ logger.info(`Element "${element.name}": capture("${nodeId}") — ${entry.label} ${entry.bounds.w}x${entry.bounds.h}`);
354
+ // Check area threshold
355
+ const viewport = browser.currentPage.viewportSize();
356
+ if (viewport) {
357
+ const captureArea = entry.bounds.w * entry.bounds.h;
358
+ const viewportArea = viewport.width * viewport.height;
359
+ const isLargeComponent = /\b(hero|header|footer|navigation|navbar|banner|full.?width|feature.?section|testimonial|above.the.fold)\b/i.test(element.description);
360
+ const areaThreshold = isLargeComponent ? 0.85 : 0.70;
361
+ if (captureArea > viewportArea * areaThreshold) {
362
+ logger.info(`Element "${element.name}": node "${nodeId}" covers ${Math.round(captureArea / viewportArea * 100)}% of viewport — too large`);
363
+ actionHistory.push(`Iteration ${iteration}: capture("${nodeId}") rejected — element covers >${Math.round(areaThreshold * 100)}% of viewport (${entry.bounds.w}x${entry.bounds.h}), likely a wrapper. Find a more specific child node with focus(within: "${nodeId}").`);
364
+ continue;
461
365
  }
462
- const { verified, reason: verifyReason, usage } = await verifyElementCapture(client, model, element, buffer, reasoning, ++stepCounter, abortSignal, fallbackModel, uploadImage);
366
+ }
367
+ // Capture the node
368
+ try {
369
+ const buffer = await browser.captureNode(nodeId);
370
+ // Verify
371
+ const { verified, reason: verifyReason, usage } = await verifyElementCapture(client, model, element, buffer, `Captured ${entry.label} (${nodeId})`, ++stepCounter, abortSignal, fallbackModel, uploadImage);
463
372
  if (usage)
464
373
  usageLog.push(usage);
465
374
  if (!verified) {
466
375
  lastVerifierRejectedAsTooLoose = isLooseElementCaptureRejectionReason(verifyReason);
467
376
  if (lastVerifierRejectedAsTooLoose) {
468
- const looseFailureCount = (looseFailureCountsBySelector.get(selector) ?? 0) + 1;
469
- looseFailureCountsBySelector.set(selector, looseFailureCount);
470
- const selectorEvidenceEntry = selectorEvidence.get(selector);
471
- const viewport = browser.currentPage.viewportSize();
472
- if (selectorEvidenceEntry && shouldAcceptDomCorroboratedSelector({
473
- looseFailureCount,
474
- verifierRejectedAsTooLoose: true,
475
- validation,
476
- viewport,
477
- observedAsInteractive: selectorEvidenceEntry.observedAsInteractive,
478
- directQueryCount: selectorEvidenceEntry.directQueries.size,
479
- containerQueryCount: selectorEvidenceEntry.containerQueries.size,
480
- })) {
481
- logger.info(`Element "${element.name}": accepting selector "${selector}" after repeated loose-only verifier rejections because DOM evidence consistently corroborates the same component`);
377
+ looseRejectionCount++;
378
+ // Accept after repeated loose-only rejections (likely false positive from verifier)
379
+ if (looseRejectionCount >= 3) {
380
+ logger.info(`Element "${element.name}": accepting after ${looseRejectionCount} loose-only rejections`);
482
381
  return {
483
- element,
484
- success: true,
485
- buffer,
486
- assessment: 'Accepted after repeated loose-only verifier rejections; DOM evidence consistently confirmed the same component and the verifier likely misread embedded preview content as surrounding page context.',
487
- capturedSelector: selector,
488
- validation,
489
- confidence,
382
+ element, success: true, buffer,
383
+ assessment: 'Accepted after repeated loose-only verifier rejections.',
490
384
  usage: usageLog,
491
385
  };
492
386
  }
493
387
  }
494
- lastFailedTransientSelector = isTransientSearchSelector(selector) ? selector : null;
495
- actionHistory.push(lastVerifierRejectedAsTooLoose
496
- ? `Iteration ${iteration}: capture_by_selector("${selector}") rejected by verifier — ${verifyReason}. The selector matched a real element, but the framing was too loose. Stay grounded on sel= values returned by the tools, search for a more distinctive in-card string if needed, and prefer any ↳ container selector instead of inventing a tag-only DOM path.`
497
- : `Iteration ${iteration}: capture_by_selector("${selector}") rejected by verifier — ${verifyReason}. Try a different or more specific selector.`);
388
+ actionHistory.push(`Iteration ${iteration}: capture("${nodeId}") rejected by verifier — ${verifyReason}. ${lastVerifierRejectedAsTooLoose ? 'Try a more specific child node with focus(within: "' + nodeId + '").' : 'Try a different nodeId.'}`);
498
389
  continue;
499
390
  }
500
391
  lastVerifierRejectedAsTooLoose = false;
392
+ // Apply outscale if needed
501
393
  const finalBuffer = outscaleAddsPadding(baseOutscale)
502
- ? (await browser.screenshotBySelector(selector, baseOutscale)).buffer
394
+ ? await browser.screenshotByRegion({ x: entry.bounds.x, y: entry.bounds.y, width: entry.bounds.w, height: entry.bounds.h }, baseOutscale)
503
395
  : buffer;
504
- logger.success(`Element "${element.name}" captured via selector "${selector}"`);
396
+ logger.success(`Element "${element.name}" captured via nodeId "${nodeId}"`);
505
397
  getPostHog().capture({
506
398
  distinctId: distinctId ?? DISTINCT_ID,
507
399
  event: 'element_capture_succeeded',
508
- properties: {
509
- element_name: element.name,
510
- method: 'selector',
511
- selector,
512
- confidence,
513
- iterations: iteration,
514
- },
400
+ properties: { element_name: element.name, method: 'nodeId', nodeId, iterations: iteration },
515
401
  });
516
402
  return {
517
- element,
518
- success: true,
519
- buffer: finalBuffer,
520
- assessment: verifyReason || reasoning,
521
- capturedSelector: selector,
522
- validation,
523
- confidence,
403
+ element, success: true, buffer: finalBuffer,
404
+ assessment: verifyReason || `Captured ${entry.label}`,
524
405
  usage: usageLog,
525
406
  };
526
407
  }
527
408
  catch (err) {
528
- if (isAbortError(err)) {
409
+ if (isAbortError(err))
529
410
  throw err;
530
- }
531
- // Discriminate structured selector validation errors from generic errors
532
- const selectorErr = err;
533
- if (selectorErr.error && selectorErr.errorMessage) {
534
- if (selectorErr.error === 'no_match' && isTransientSearchSelector(selector)) {
535
- lastFailedTransientSelector = selector;
536
- }
537
- else {
538
- lastFailedTransientSelector = null;
539
- }
540
- const selectorSuggestions = {
541
- no_match: 'The selector matched nothing. Try a broader selector, remove nth-child constraints, or use search_text to locate the element first.',
542
- ambiguous: 'Multiple elements matched. Add parent context, a unique ID, data-testid, or aria-label to narrow down to one element.',
543
- invisible: 'Element is hidden. Use scroll_to_element or scroll to bring it into view, or check if a parent element must be expanded first.',
544
- zero_size: 'Element has zero rendered size. It may be dynamically rendered or conditionally shown. Try scrolling the page or waiting.',
545
- };
546
- const suggestion = selectorErr.error === 'no_match' && isTransientSearchSelector(selector)
547
- ? 'This looks like a stale temporary selector from an earlier search_text result. Re-run search_text to refresh the sel= value, then use the new selector immediately.'
548
- : selectorSuggestions[selectorErr.error] ?? 'Refine the selector and retry.';
549
- logger.error(`Selector validation failed: ${selectorErr.errorMessage}`);
550
- actionHistory.push(`Iteration ${iteration}: capture_by_selector("${selector}") — ${selectorErr.errorMessage}. ${suggestion}`);
551
- }
552
- else {
553
- const errorMsg = err.message;
554
- logger.error(`Selector capture failed: ${errorMsg}`);
555
- actionHistory.push(`Iteration ${iteration}: capture_by_selector("${selector}") failed — ${errorMsg}`);
556
- }
411
+ logger.error(`Element "${element.name}": capture("${nodeId}") failed — ${err.message}`);
412
+ actionHistory.push(`Iteration ${iteration}: capture("${nodeId}") error ${err.message}`);
557
413
  continue;
558
414
  }
559
415
  }
560
- if (name === 'resize_viewport') {
561
- const w = args.width;
562
- const h = args.height;
563
- logger.info(`Element "${element.name}": resizing viewport to ${w}x${h}`);
564
- await browser.resizeViewport(w, h);
565
- await browser.wait(500);
566
- actionHistory.push(`Iteration ${iteration}: resized viewport to ${w}x${h}`);
416
+ // ── focus(query) ──
417
+ if (name === 'focus') {
418
+ const query = args;
419
+ try {
420
+ const focusResult = await browser.focusTree(query);
421
+ logger.info(`Element "${element.name}": focus → ${focusResult.matches.length} match(es)`);
422
+ actionHistory.push(`Iteration ${iteration}: focus(${JSON.stringify(args)}) ${focusResult.matches.length} match(es)\n${focusResult.serialized.slice(0, 800)}`);
423
+ }
424
+ catch (err) {
425
+ logger.error(`Element "${element.name}": focus failed — ${err.message}`);
426
+ actionHistory.push(`Iteration ${iteration}: focus error — ${err.message}`);
427
+ }
567
428
  continue;
568
429
  }
430
+ // ── scroll ──
569
431
  if (name === 'scroll') {
570
- const index = args.index;
571
- if (index !== undefined) {
572
- const align = args.align;
573
- const margin = args.margin;
574
- logger.info(`Element "${element.name}": scrolling to index ${index}${align ? ` (${align})` : ''}`);
575
- await browser.scrollElementIntoView(index, { align, margin });
576
- await browser.wait(300);
577
- actionHistory.push(`Iteration ${iteration}: scroll(index=${index}, align=${align || 'center'})`);
432
+ const centerOn = typeof args.centerOn === 'string' ? args.centerOn : undefined;
433
+ const target = typeof args.target === 'string' ? args.target : undefined;
434
+ const direction = args.direction ?? 'down';
435
+ const offset = typeof args.offset === 'number' ? args.offset : undefined;
436
+ if (centerOn) {
437
+ logger.info(`Element "${element.name}": scroll centerOn="${centerOn}"`);
438
+ try {
439
+ await browser.centerNodeInView(centerOn, { containerNodeId: target, offset });
440
+ await browser.wait(300);
441
+ actionHistory.push(`Iteration ${iteration}: scroll(centerOn="${centerOn}")`);
442
+ }
443
+ catch (err) {
444
+ actionHistory.push(`Iteration ${iteration}: scroll centerOn error — ${err.message}`);
445
+ }
578
446
  }
579
447
  else {
580
- const direction = args.direction ?? 'down';
581
- const amount = args.amount ?? 500;
582
- logger.info(`Element "${element.name}": scrolling ${direction} ${amount}px`);
448
+ const amount = offset ?? 500;
449
+ logger.info(`Element "${element.name}": scroll ${direction} ${amount}px`);
583
450
  await browser.scroll(direction, amount);
584
451
  await browser.wait(500);
585
452
  actionHistory.push(`Iteration ${iteration}: scroll(${direction}, ${amount}px)`);
586
453
  }
587
454
  continue;
588
455
  }
589
- if (name === 'dismiss_overlays') {
590
- logger.info(`Element "${element.name}": dismissing overlays`);
591
- const result = await browser.dismissOverlays();
592
- await browser.wait(300);
593
- actionHistory.push(`Iteration ${iteration}: dismiss_overlays() → ${result.dismissed ? `dismissed via ${result.method}` : 'no visible overlay dismissed'}`);
594
- continue;
595
- }
596
- if (name === 'search_text') {
597
- const query = args.query;
598
- const queryKey = query.toLowerCase().trim();
599
- // Block duplicate searches — the DOM hasn't changed, results would be identical.
600
- // Re-surface the top candidates so the model can act on them instead of searching again.
601
- const cachedSearch = usedSearchQueries.get(queryKey);
602
- if (cachedSearch
603
- && !shouldAllowSearchRefresh({
604
- cached: cachedSearch,
605
- domSignature,
606
- lastFailedTransientSelector,
607
- })) {
608
- const cached = cachedSearch;
609
- logger.info(`Element "${element.name}": duplicate search_text("${query}") blocked`);
610
- const candidateHint = cached.candidateLines.length > 0
611
- ? `\nTop candidates from that search (use their sel= in capture_by_selector):\n${cached.candidateLines.join('\n')}`
612
- : '';
613
- actionHistory.push(`Iteration ${iteration}: search_text("${query}") BLOCKED — DOM unchanged, results identical.${candidateHint}\nYou MUST either: (a) call capture_by_selector with a sel= from above, OR (b) search with a DIFFERENT query (unique text from the target: subtitle, price, feature text).`);
614
- continue;
615
- }
616
- logger.info(`Element "${element.name}": searching for "${query}"`);
617
- const results = await browser.searchText(query);
618
- for (const result of results) {
619
- getOrCreateSelectorEvidence(selectorEvidence, result.selector).directQueries.add(queryKey);
620
- if (result.container?.selector) {
621
- getOrCreateSelectorEvidence(selectorEvidence, result.container.selector).containerQueries.add(queryKey);
622
- }
623
- }
624
- const MAX_SEARCH_RESULTS_IN_HISTORY = 5;
625
- const resultLines = results.map((r, i) => {
626
- const cx = Math.round(r.boundingBox.x + r.boundingBox.width / 2);
627
- const cy = Math.round(r.boundingBox.y + r.boundingBox.height / 2);
628
- const visibility = r.visibilityState === 'full'
629
- ? 'fully-visible'
630
- : r.visibilityState === 'partial'
631
- ? 'partially-visible'
632
- : 'off-screen';
633
- const text = r.text.slice(0, 80).replace(/\s+/g, ' ');
634
- let line = ` ${i}. <${r.tag}> "${text}" @${cx},${cy} ${r.boundingBox.width}x${r.boundingBox.height} (${visibility}) sel="${r.selector}"`;
635
- if (r.container) {
636
- line += `\n ↳ container: <${r.container.tag}> ${r.container.boundingBox.width}x${r.container.boundingBox.height} (${r.container.reason}) sel="${r.container.selector}"`;
637
- }
638
- return line;
639
- });
640
- const resultText = results.length > 0
641
- ? resultLines.slice(0, MAX_SEARCH_RESULTS_IN_HISTORY).join('\n')
642
- + (results.length > MAX_SEARCH_RESULTS_IN_HISTORY ? `\n (${results.length - MAX_SEARCH_RESULTS_IN_HISTORY} more results not shown)` : '')
643
- : ' (no matches found)';
644
- // Cache the top 3 most promising candidates (prefer off-screen small elements — likely specific items)
645
- const topCandidates = resultLines
646
- .filter(l => !l.includes('<body>'))
647
- .slice(0, 3);
648
- // Pick the best bounding box for coordinate-based fallback:
649
- // prefer a result whose container has a reasonable card-like size (>= 100x100, < 70% viewport).
650
- const viewportArea = 1440 * 900; // approximate
651
- let bestRegion = null;
652
- for (const result of results) {
653
- const container = result.container;
654
- if (container) {
655
- const cb = container.boundingBox;
656
- if (cb.width >= 100 && cb.height >= 100 && cb.width * cb.height < viewportArea * 0.7) {
657
- bestRegion = cb;
658
- break;
659
- }
660
- }
456
+ // ── analyze_screenshot ──
457
+ if (name === 'analyze_screenshot') {
458
+ const question = args.question || 'Describe what you see';
459
+ logger.info(`Element "${element.name}": analyze_screenshot — "${question}"`);
460
+ try {
461
+ const analysisResponse = await client.chat.completions.create({
462
+ model,
463
+ messages: [
464
+ {
465
+ role: 'user',
466
+ content: [
467
+ { type: 'image_url', image_url: { url: screenshotUrl } },
468
+ { type: 'text', text: question },
469
+ ],
470
+ },
471
+ ],
472
+ max_tokens: 256,
473
+ provider: { ...zdrParam() },
474
+ }, { signal: abortSignal });
475
+ const answer = analysisResponse.choices?.[0]?.message?.content || '(no answer)';
476
+ actionHistory.push(`Iteration ${iteration}: analyze_screenshot("${question}") → ${answer.slice(0, 300)}`);
661
477
  }
662
- // Fallback: if no result had a suitably-sized container, use existing logic
663
- if (!bestRegion) {
664
- const bestResult = results[0] ?? null;
665
- if (bestResult) {
666
- const container = bestResult.container;
667
- if (container && container.boundingBox.width * container.boundingBox.height < viewportArea * 0.7) {
668
- bestRegion = container.boundingBox;
669
- }
670
- else {
671
- bestRegion = bestResult.boundingBox;
672
- }
673
- }
478
+ catch (err) {
479
+ if (isAbortError(err))
480
+ throw err;
481
+ actionHistory.push(`Iteration ${iteration}: analyze_screenshot error — ${err.message}`);
674
482
  }
675
- usedSearchQueries.set(queryKey, {
676
- candidateLines: topCandidates,
677
- domSignature,
678
- selectors: results.slice(0, 5).flatMap((result) => [
679
- result.selector,
680
- ...(result.container?.selector ? [result.container.selector] : []),
681
- ]),
682
- hasTransientSelectors: results.some((result) => isTransientSearchSelector(result.selector)
683
- || (result.container?.selector ? isTransientSearchSelector(result.container.selector) : false)),
684
- bestRegion,
685
- });
686
- lastFailedTransientSelector = null;
687
- logger.info(`Search results:\n${resultLines.join('\n')}`);
688
- actionHistory.push(`Iteration ${iteration}: search_text("${query}") → ${results.length} match(es)\n${resultText}`);
689
483
  continue;
690
484
  }
691
- if (name === 'give_up') {
692
- const reason = args.reason || 'Unknown reason';
693
- logger.error(`Element "${element.name}" not found: ${reason}`);
694
- getPostHog().capture({
695
- distinctId: distinctId ?? DISTINCT_ID,
696
- event: 'element_capture_failed',
697
- properties: {
698
- element_name: element.name,
699
- reason,
700
- iterations: iteration,
701
- failure_type: 'gave_up',
702
- },
703
- });
704
- return {
705
- element,
706
- success: false,
707
- buffer: Buffer.alloc(0),
708
- assessment: reason,
709
- usage: usageLog,
710
- };
711
- }
712
- }
713
- // Coordinate-based fallback: if the LLM couldn't find a valid CSS selector but
714
- // search_text found the element with a bounding box, capture by region as a last resort.
715
- const allRegions = Array.from(usedSearchQueries.values())
716
- .map(entry => entry.bestRegion)
717
- .filter((r) => r != null && r.width > 0 && r.height > 0);
718
- if (allRegions.length > 0) {
719
- // Prefer regions with reasonable card-like dimensions over tiny text bboxes
720
- const region = [...allRegions].sort((a, b) => {
721
- const aOk = a.width >= 100 && a.height >= 100 ? 1 : 0;
722
- const bOk = b.width >= 100 && b.height >= 100 ? 1 : 0;
723
- if (aOk !== bOk)
724
- return bOk - aOk;
725
- return (b.width * b.height) - (a.width * a.height);
726
- })[0];
727
- try {
728
- logger.info(`Element "${element.name}": falling back to coordinate-based capture at ${region.x},${region.y} ${region.width}x${region.height}`);
729
- const buffer = await browser.screenshotByRegion(region, element.outscale ?? element.padding ?? 0);
730
- return {
731
- element,
732
- success: true,
733
- buffer,
734
- assessment: `Captured by coordinate fallback at (${region.x},${region.y}) ${region.width}x${region.height}`,
735
- capturedRegion: region,
736
- usage: usageLog,
737
- };
738
- }
739
- catch (err) {
740
- logger.error(`Element "${element.name}": coordinate fallback failed: ${err.message}`);
741
- }
485
+ // Unknown tool
486
+ logger.warn(`Element "${element.name}": unknown tool "${name}"`);
487
+ actionHistory.push(`Iteration ${iteration}: unknown tool "${name}" use focus, scroll, analyze_screenshot, or capture.`);
742
488
  }
489
+ // Max iterations reached — no coordinate fallback needed since AKTree provides bounds directly
743
490
  logger.error(`Element "${element.name}": max iterations reached`);
744
491
  getPostHog().capture({
745
492
  distinctId: distinctId ?? DISTINCT_ID,
@@ -759,8 +506,17 @@ export async function captureIsolatedElement(browser, element, apiKey, model, op
759
506
  usage: usageLog,
760
507
  };
761
508
  }
762
- finally {
763
- await restoreViewport();
509
+ catch (err) {
510
+ if (isAbortError(err))
511
+ throw err;
512
+ logger.error(`Element capture unexpected error: ${err.message}`);
513
+ return {
514
+ element,
515
+ success: false,
516
+ buffer: Buffer.alloc(0),
517
+ assessment: `Unexpected error: ${err.message}`,
518
+ usage: usageLog,
519
+ };
764
520
  }
765
521
  }
766
522
  //# sourceMappingURL=element-capture.js.map