autokap 1.0.6 → 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (347) hide show
  1. package/assets/chrome/ios-statusbar-comparison-reference.jpg +0 -0
  2. package/assets/chrome/ios-statusbar-dark-reference.jpg +0 -0
  3. package/assets/chrome/ios-statusbar-light-reference.jpg +0 -0
  4. package/assets/cursors/macos.svg +4 -0
  5. package/assets/cursors/windows.svg +15 -0
  6. package/assets/devices/ipad-pro-11-m4.json +52 -0
  7. package/assets/devices/iphone-16-pro.json +53 -0
  8. package/assets/devices/macbook-air-13.json +45 -0
  9. package/assets/frames/MacBook Air 13.svg +242 -0
  10. package/assets/frames/Status bar - iPhone.png +0 -0
  11. Menu bar- iPad.png +0 -0
  12. package/assets/frames/iPad Pro M4 11_.png +0 -0
  13. package/assets/frames/iPhone 16 Pro.png +0 -0
  14. package/assets/icons/Cellular Connection.svg +3 -0
  15. package/assets/icons/Union.svg +6 -0
  16. package/assets/icons/Wifi.svg +3 -0
  17. package/assets/icons/battery.svg +5 -0
  18. package/assets/icons/battery_charging.svg +8 -0
  19. package/assets/skill/OPCODE-REFERENCE.md +607 -0
  20. package/assets/skill/README.md +39 -0
  21. package/assets/skill/SKILL.md +453 -468
  22. package/assets/skill/STUDIO-SKILL.md +476 -0
  23. package/assets/skill/references/examples.md +104 -0
  24. package/assets/skill/references/interactive-demo.md +225 -0
  25. package/assets/skill/references/mock-data.md +178 -0
  26. package/dist/abort.d.ts +5 -0
  27. package/dist/abort.js +44 -0
  28. package/dist/action-verifier.d.ts +29 -0
  29. package/dist/action-verifier.js +133 -0
  30. package/dist/agent-action-recovery.d.ts +45 -0
  31. package/dist/agent-action-recovery.js +370 -0
  32. package/dist/agent-message-utils.d.ts +21 -0
  33. package/dist/agent-message-utils.js +77 -0
  34. package/dist/agent-url-utils.d.ts +30 -0
  35. package/dist/agent-url-utils.js +138 -0
  36. package/dist/agent.d.ts +226 -0
  37. package/dist/agent.js +6666 -0
  38. package/dist/ak-tree.d.ts +39 -0
  39. package/dist/ak-tree.js +368 -0
  40. package/dist/alt-text.d.ts +26 -0
  41. package/dist/alt-text.js +55 -0
  42. package/dist/auth-capture.d.ts +17 -0
  43. package/dist/auth-capture.js +164 -0
  44. package/dist/benchmark.d.ts +59 -0
  45. package/dist/benchmark.js +135 -0
  46. package/dist/billing-operation-logging.d.ts +38 -0
  47. package/dist/billing-operation-logging.js +248 -0
  48. package/dist/browser-bar.d.ts +48 -0
  49. package/dist/browser-bar.js +284 -0
  50. package/dist/browser-pool.d.ts +7 -0
  51. package/dist/browser-pool.js +15 -5
  52. package/dist/browser-utils.d.ts +31 -0
  53. package/dist/browser-utils.js +97 -0
  54. package/dist/browser.d.ts +76 -1
  55. package/dist/browser.js +1657 -39
  56. package/dist/capture-alt-text.d.ts +12 -0
  57. package/dist/capture-alt-text.js +52 -0
  58. package/dist/capture-encryption.d.ts +10 -0
  59. package/dist/capture-encryption.js +41 -0
  60. package/dist/capture-language-preflight.d.ts +41 -0
  61. package/dist/capture-language-preflight.js +300 -0
  62. package/dist/capture-llm-page-identity.d.ts +15 -0
  63. package/dist/capture-llm-page-identity.js +128 -0
  64. package/dist/capture-model-resolution.d.ts +9 -0
  65. package/dist/capture-model-resolution.js +21 -0
  66. package/dist/capture-page-identity.d.ts +7 -0
  67. package/dist/capture-page-identity.js +352 -0
  68. package/dist/capture-preset-credentials.d.ts +62 -0
  69. package/dist/capture-preset-credentials.js +184 -0
  70. package/dist/capture-request-plan.d.ts +58 -0
  71. package/dist/capture-request-plan.js +264 -0
  72. package/dist/capture-run-optimizer.d.ts +139 -0
  73. package/dist/capture-run-optimizer.js +863 -0
  74. package/dist/capture-selector-memory.d.ts +31 -0
  75. package/dist/capture-selector-memory.js +345 -0
  76. package/dist/capture-session-profile-encryption.d.ts +2 -0
  77. package/dist/capture-session-profile-encryption.js +22 -0
  78. package/dist/capture-step-timeout.d.ts +10 -0
  79. package/dist/capture-step-timeout.js +30 -0
  80. package/dist/capture-strategy.d.ts +36 -0
  81. package/dist/capture-strategy.js +95 -0
  82. package/dist/capture-studio-sync.d.ts +23 -0
  83. package/dist/capture-studio-sync.js +172 -0
  84. package/dist/capture-surface-contract.d.ts +36 -0
  85. package/dist/capture-surface-contract.js +299 -0
  86. package/dist/capture-transition-engine.d.ts +28 -0
  87. package/dist/capture-transition-engine.js +292 -0
  88. package/dist/capture-variant-state.d.ts +56 -0
  89. package/dist/capture-variant-state.js +182 -0
  90. package/dist/capture-verification.d.ts +35 -0
  91. package/dist/capture-verification.js +95 -0
  92. package/dist/capture-viewport-lock.d.ts +48 -0
  93. package/dist/capture-viewport-lock.js +74 -0
  94. package/dist/circuit-breaker.d.ts +42 -0
  95. package/dist/circuit-breaker.js +119 -0
  96. package/dist/cli-config.d.ts +8 -1
  97. package/dist/cli-config.js +62 -6
  98. package/dist/cli-contract.d.ts +15 -0
  99. package/dist/cli-contract.js +167 -0
  100. package/dist/cli-runner-local.d.ts +12 -0
  101. package/dist/cli-runner-local.js +102 -0
  102. package/dist/cli-runner.d.ts +34 -0
  103. package/dist/cli-runner.js +433 -0
  104. package/dist/cli-utils.d.ts +0 -1
  105. package/dist/cli-utils.js +2 -5
  106. package/dist/cli.js +1005 -252
  107. package/dist/clip-orchestrator.d.ts +148 -0
  108. package/dist/clip-orchestrator.js +957 -0
  109. package/dist/clip-postprocess.d.ts +42 -0
  110. package/dist/clip-postprocess.js +201 -0
  111. package/dist/cookie-dismiss.d.ts +2 -0
  112. package/dist/cookie-dismiss.js +48 -13
  113. package/dist/cost-logging.d.ts +35 -0
  114. package/dist/cost-logging.js +242 -0
  115. package/dist/cost-resolution-monitor.d.ts +16 -0
  116. package/dist/cost-resolution-monitor.js +34 -0
  117. package/dist/credential-templates.d.ts +5 -0
  118. package/dist/credential-templates.js +60 -0
  119. package/dist/cursor-overlay-script.d.ts +6 -0
  120. package/dist/cursor-overlay-script.js +169 -0
  121. package/dist/dom-css-purger.d.ts +65 -0
  122. package/dist/dom-css-purger.js +333 -0
  123. package/dist/dom-font-inliner.d.ts +45 -0
  124. package/dist/dom-font-inliner.js +148 -0
  125. package/dist/dom-patch-resolver.d.ts +52 -0
  126. package/dist/dom-patch-resolver.js +242 -0
  127. package/dist/dom-serializer.d.ts +82 -0
  128. package/dist/dom-serializer.js +378 -0
  129. package/dist/element-capture.d.ts +13 -0
  130. package/dist/element-capture.js +522 -0
  131. package/dist/env-validation.d.ts +5 -0
  132. package/dist/env-validation.js +29 -0
  133. package/dist/execution-schema.d.ts +4423 -0
  134. package/dist/execution-schema.js +507 -0
  135. package/dist/execution-types.d.ts +886 -0
  136. package/dist/execution-types.js +65 -0
  137. package/dist/fonts-loader.d.ts +14 -0
  138. package/dist/fonts-loader.js +55 -0
  139. package/dist/hybrid-navigator.d.ts +138 -0
  140. package/dist/hybrid-navigator.js +468 -0
  141. package/dist/index.d.ts +18 -0
  142. package/dist/index.js +17 -0
  143. package/dist/legacy/agent-action-recovery.d.ts +45 -0
  144. package/dist/legacy/agent-action-recovery.js +370 -0
  145. package/dist/legacy/agent-message-utils.d.ts +21 -0
  146. package/dist/legacy/agent-message-utils.js +77 -0
  147. package/dist/legacy/agent-url-utils.d.ts +30 -0
  148. package/dist/legacy/agent-url-utils.js +138 -0
  149. package/dist/legacy/agent.d.ts +226 -0
  150. package/dist/legacy/agent.js +6666 -0
  151. package/dist/legacy/clip-orchestrator.d.ts +148 -0
  152. package/dist/legacy/clip-orchestrator.js +957 -0
  153. package/dist/legacy/credential-templates.d.ts +5 -0
  154. package/dist/legacy/credential-templates.js +60 -0
  155. package/dist/legacy/hybrid-navigator.d.ts +138 -0
  156. package/dist/legacy/hybrid-navigator.js +468 -0
  157. package/dist/legacy/llm-usage.d.ts +17 -0
  158. package/dist/legacy/llm-usage.js +45 -0
  159. package/dist/legacy/prompt-cache.d.ts +10 -0
  160. package/dist/legacy/prompt-cache.js +24 -0
  161. package/dist/legacy/prompts.d.ts +175 -0
  162. package/dist/legacy/prompts.js +1038 -0
  163. package/dist/legacy/tools.d.ts +4 -0
  164. package/dist/legacy/tools.js +216 -0
  165. package/dist/legacy/video-agent.d.ts +143 -0
  166. package/dist/legacy/video-agent.js +4788 -0
  167. package/dist/legacy/video-observation.d.ts +36 -0
  168. package/dist/legacy/video-observation.js +192 -0
  169. package/dist/legacy/video-planner.d.ts +12 -0
  170. package/dist/legacy/video-planner.js +501 -0
  171. package/dist/legacy/video-prompts.d.ts +37 -0
  172. package/dist/legacy/video-prompts.js +569 -0
  173. package/dist/legacy/video-tools.d.ts +3 -0
  174. package/dist/legacy/video-tools.js +59 -0
  175. package/dist/legacy/video-variant-state.d.ts +29 -0
  176. package/dist/legacy/video-variant-state.js +80 -0
  177. package/dist/legacy/vision-model.d.ts +17 -0
  178. package/dist/legacy/vision-model.js +74 -0
  179. package/dist/llm-healer.d.ts +63 -0
  180. package/dist/llm-healer.js +166 -0
  181. package/dist/llm-provider.d.ts +29 -0
  182. package/dist/llm-provider.js +80 -0
  183. package/dist/llm-usage.d.ts +17 -0
  184. package/dist/llm-usage.js +45 -0
  185. package/dist/logger.d.ts +6 -2
  186. package/dist/logger.js +15 -1
  187. package/dist/mockup-html.d.ts +119 -0
  188. package/dist/mockup-html.js +263 -0
  189. package/dist/mockup.d.ts +187 -0
  190. package/dist/mockup.js +869 -0
  191. package/dist/mouse-animation.d.ts +46 -0
  192. package/dist/mouse-animation.js +114 -0
  193. package/dist/opcode-actions.d.ts +42 -0
  194. package/dist/opcode-actions.js +511 -0
  195. package/dist/opcode-runner.d.ts +51 -0
  196. package/dist/opcode-runner.js +770 -0
  197. package/dist/openrouter-client.d.ts +40 -0
  198. package/dist/openrouter-client.js +16 -0
  199. package/dist/overlay-engine.d.ts +24 -0
  200. package/dist/overlay-engine.js +176 -0
  201. package/dist/overlay-utils.d.ts +14 -0
  202. package/dist/overlay-utils.js +13 -0
  203. package/dist/postcondition.d.ts +16 -0
  204. package/dist/postcondition.js +269 -0
  205. package/dist/posthog.d.ts +4 -0
  206. package/dist/posthog.js +26 -0
  207. package/dist/program-patcher.d.ts +25 -0
  208. package/dist/program-patcher.js +44 -0
  209. package/dist/prompt-cache.d.ts +10 -0
  210. package/dist/prompt-cache.js +24 -0
  211. package/dist/prompts.d.ts +175 -0
  212. package/dist/prompts.js +1038 -0
  213. package/dist/provider-config.d.ts +12 -0
  214. package/dist/provider-config.js +15 -0
  215. package/dist/recovery-chain.d.ts +37 -0
  216. package/dist/recovery-chain.js +350 -0
  217. package/dist/remote-browser.d.ts +215 -0
  218. package/dist/remote-browser.js +360 -0
  219. package/dist/safari-browser-bar.d.ts +15 -0
  220. package/dist/safari-browser-bar.js +95 -0
  221. package/dist/safari-toolbar-asset.d.ts +15 -0
  222. package/dist/safari-toolbar-asset.js +12 -0
  223. package/dist/security.d.ts +21 -0
  224. package/dist/security.js +608 -0
  225. package/dist/selector-resolver.d.ts +34 -0
  226. package/dist/selector-resolver.js +181 -0
  227. package/dist/semantic-resolver.d.ts +35 -0
  228. package/dist/semantic-resolver.js +161 -0
  229. package/dist/server-capture-runtime.d.ts +125 -0
  230. package/dist/server-capture-runtime.js +585 -0
  231. package/dist/server-credit-usage.d.ts +12 -0
  232. package/dist/server-credit-usage.js +41 -0
  233. package/dist/server-posthog.d.ts +2 -0
  234. package/dist/server-posthog.js +16 -0
  235. package/dist/server-project-webhooks.d.ts +59 -0
  236. package/dist/server-project-webhooks.js +123 -0
  237. package/dist/server-screenshot-watermark.d.ts +7 -0
  238. package/dist/server-screenshot-watermark.js +60 -0
  239. package/dist/session-profile.d.ts +86 -0
  240. package/dist/session-profile.js +1536 -0
  241. package/dist/sf-pro-fonts.d.ts +4 -0
  242. package/dist/sf-pro-fonts.js +7 -0
  243. package/dist/sf-pro-symbols.d.ts +1 -0
  244. package/dist/sf-pro-symbols.js +55 -0
  245. package/dist/skill-packaging.d.ts +28 -0
  246. package/dist/skill-packaging.js +169 -0
  247. package/dist/smart-wait.d.ts +27 -0
  248. package/dist/smart-wait.js +81 -0
  249. package/dist/status-bar-l10n.d.ts +14 -0
  250. package/dist/status-bar-l10n.js +177 -0
  251. package/dist/status-bar-render.d.ts +20 -0
  252. package/dist/status-bar-render.js +410 -0
  253. package/dist/status-bar.d.ts +53 -0
  254. package/dist/status-bar.js +620 -0
  255. package/dist/svg-browser-bar.d.ts +33 -0
  256. package/dist/svg-browser-bar.js +206 -0
  257. package/dist/svg-status-bar.d.ts +36 -0
  258. package/dist/svg-status-bar.js +597 -0
  259. package/dist/svg-text.d.ts +61 -0
  260. package/dist/svg-text.js +118 -0
  261. package/dist/tools.d.ts +4 -0
  262. package/dist/tools.js +216 -0
  263. package/dist/types.d.ts +240 -5
  264. package/dist/types.js +23 -1
  265. package/dist/v2/action-verifier.d.ts +29 -0
  266. package/dist/v2/action-verifier.js +133 -0
  267. package/dist/v2/alt-text.d.ts +26 -0
  268. package/dist/v2/alt-text.js +55 -0
  269. package/dist/v2/benchmark.d.ts +59 -0
  270. package/dist/v2/benchmark.js +135 -0
  271. package/dist/v2/capture-strategy.d.ts +30 -0
  272. package/dist/v2/capture-strategy.js +67 -0
  273. package/dist/v2/capture-verification.d.ts +35 -0
  274. package/dist/v2/capture-verification.js +95 -0
  275. package/dist/v2/circuit-breaker.d.ts +42 -0
  276. package/dist/v2/circuit-breaker.js +119 -0
  277. package/dist/v2/cli-runner-local.d.ts +11 -0
  278. package/dist/v2/cli-runner-local.js +91 -0
  279. package/dist/v2/cli-runner.d.ts +34 -0
  280. package/dist/v2/cli-runner.js +300 -0
  281. package/dist/v2/compiler-prompts.d.ts +27 -0
  282. package/dist/v2/compiler-prompts.js +123 -0
  283. package/dist/v2/compiler.d.ts +37 -0
  284. package/dist/v2/compiler.js +147 -0
  285. package/dist/v2/explorer.d.ts +41 -0
  286. package/dist/v2/explorer.js +56 -0
  287. package/dist/v2/index.d.ts +37 -0
  288. package/dist/v2/index.js +31 -0
  289. package/dist/v2/llm-healer.d.ts +62 -0
  290. package/dist/v2/llm-healer.js +166 -0
  291. package/dist/v2/llm-provider.d.ts +29 -0
  292. package/dist/v2/llm-provider.js +80 -0
  293. package/dist/v2/opcode-runner.d.ts +47 -0
  294. package/dist/v2/opcode-runner.js +634 -0
  295. package/dist/v2/overlay-engine.d.ts +24 -0
  296. package/dist/v2/overlay-engine.js +150 -0
  297. package/dist/v2/postcondition.d.ts +16 -0
  298. package/dist/v2/postcondition.js +249 -0
  299. package/dist/v2/program-patcher.d.ts +25 -0
  300. package/dist/v2/program-patcher.js +44 -0
  301. package/dist/v2/recovery-chain.d.ts +30 -0
  302. package/dist/v2/recovery-chain.js +368 -0
  303. package/dist/v2/schema.d.ts +2580 -0
  304. package/dist/v2/schema.js +295 -0
  305. package/dist/v2/selector-resolver.d.ts +34 -0
  306. package/dist/v2/selector-resolver.js +181 -0
  307. package/dist/v2/semantic-resolver.d.ts +35 -0
  308. package/dist/v2/semantic-resolver.js +161 -0
  309. package/dist/v2/smart-wait.d.ts +27 -0
  310. package/dist/v2/smart-wait.js +81 -0
  311. package/dist/v2/types.d.ts +444 -0
  312. package/dist/v2/types.js +19 -0
  313. package/dist/v2/web-playwright-local.d.ts +69 -0
  314. package/dist/v2/web-playwright-local.js +392 -0
  315. package/dist/version.d.ts +1 -0
  316. package/dist/version.js +5 -0
  317. package/dist/video-agent.d.ts +143 -0
  318. package/dist/video-agent.js +4788 -0
  319. package/dist/video-observation.d.ts +36 -0
  320. package/dist/video-observation.js +192 -0
  321. package/dist/video-planner.d.ts +12 -0
  322. package/dist/video-planner.js +501 -0
  323. package/dist/video-prompts.d.ts +37 -0
  324. package/dist/video-prompts.js +554 -0
  325. package/dist/video-tools.d.ts +3 -0
  326. package/dist/video-tools.js +59 -0
  327. package/dist/video-variant-state.d.ts +29 -0
  328. package/dist/video-variant-state.js +80 -0
  329. package/dist/vision-model.d.ts +17 -0
  330. package/dist/vision-model.js +74 -0
  331. package/dist/web-playwright-local.d.ts +126 -0
  332. package/dist/web-playwright-local.js +819 -0
  333. package/dist/ws-auth.d.ts +20 -0
  334. package/dist/ws-auth.js +70 -0
  335. package/dist/ws-broadcast.d.ts +34 -0
  336. package/dist/ws-broadcast.js +85 -0
  337. package/dist/ws-connection-limits.d.ts +12 -0
  338. package/dist/ws-connection-limits.js +44 -0
  339. package/dist/ws-handler-utils.d.ts +32 -0
  340. package/dist/ws-handler-utils.js +139 -0
  341. package/dist/ws-handler.d.ts +10 -0
  342. package/dist/ws-handler.js +1793 -0
  343. package/dist/ws-metrics-server.d.ts +9 -0
  344. package/dist/ws-metrics-server.js +31 -0
  345. package/dist/ws-server.d.ts +9 -0
  346. package/dist/ws-server.js +92 -0
  347. package/package.json +142 -71
@@ -0,0 +1,522 @@
1
+ import OpenAI from 'openai';
2
+ import { elementCaptureTools } from './tools.js';
3
+ import { buildElementSystemPrompt, buildElementIterationMessage } from './prompts.js';
4
+ import { logger } from './logger.js';
5
+ import { getPostHog, DISTINCT_ID } from './posthog.js';
6
+ import { isAbortError, throwIfAborted } from './abort.js';
7
+ import { callVisionCapableModel } from './vision-model.js';
8
+ import { zdrParam } from './provider-config.js';
9
// Upper bound on the capture loop in captureIsolatedElement; also reported to the
// model as `maxIterations` in each iteration message.
const MAX_ELEMENT_ITERATIONS = 8;
// Sampling temperature sent with the element-capture chat completion request.
const ELEMENT_CAPTURE_TEMPERATURE = 0;
11
/**
 * Reports whether a verifier rejection reason is a "framing too loose" complaint.
 *
 * The match is a case-insensitive substring search against a fixed list of
 * phrases; a missing or empty reason never matches.
 *
 * @param {string | null | undefined} reason - Rejection text from the verifier.
 * @returns {boolean} `true` when the reason mentions loose/broad framing.
 */
export function isLooseElementCaptureRejectionReason(reason) {
    const looseFramingPattern = /(too loose|too broad|tighten the frame|tighten the crop|significant unrelated surrounding content|large amount of unrelated page|minimal surrounding context|surrounding context|adjacent elements|wrong framing)/i;
    return reason ? looseFramingPattern.test(reason) : false;
}
17
/**
 * Reports whether an outscale configuration requests any positive padding.
 *
 * Only values that are numbers strictly greater than zero count; strings,
 * zero, negatives, and NaN do not.
 *
 * @param {object | null | undefined} outscale - Outscale settings, if any.
 * @returns {boolean} `true` when at least one padding field is a positive number.
 */
export function outscaleAddsPadding(outscale) {
    if (!outscale) {
        return false;
    }
    const { padding, paddingTop, paddingRight, paddingBottom, paddingLeft, paddingPercent } = outscale;
    const candidates = [padding, paddingTop, paddingRight, paddingBottom, paddingLeft, paddingPercent];
    for (const amount of candidates) {
        if (typeof amount === 'number' && amount > 0) {
            return true;
        }
    }
    return false;
}
29
/**
 * Returns a shallow copy of `outscale` with every padding field forced to 0.
 *
 * All other properties are carried over unchanged and the input object is
 * not modified.
 *
 * @param {object} outscale - Outscale settings to copy.
 * @returns {object} New object with the six padding fields set to 0.
 */
function buildTightOutscale(outscale) {
    const tightened = { ...outscale };
    const paddingFields = ['padding', 'paddingTop', 'paddingRight', 'paddingBottom', 'paddingLeft', 'paddingPercent'];
    for (const field of paddingFields) {
        tightened[field] = 0;
    }
    return tightened;
}
40
/**
 * Produces the outscale used for the verification crop: the caller's
 * settings with all padding zeroed.
 *
 * @param {object | null | undefined} outscale - Requested outscale; `null`/`undefined` is treated as `{}`.
 * @returns {object} Padding-free outscale settings.
 */
export function buildVerificationOutscale(outscale) {
    const requested = outscale === null || outscale === undefined ? {} : outscale;
    return buildTightOutscale(requested);
}
43
// Tool definitions offered to the verification model. Every tool takes a single
// required string argument, `reason`, so the three schemas are generated from a
// compact [name, description, reasonDescription] table.
const elementVerificationTools = [
    [
        'accept_capture',
        'Approve the isolated element screenshot when the requested element/component is fully present and well framed.',
        'Short explanation of why the crop is good.',
    ],
    [
        'accept_with_note',
        'Approve the capture with a minor note. Use when the correct element IS shown and fully visible, but the framing includes some extra surrounding context that does not materially affect capture quality.',
        'Short note about the minor framing issue.',
    ],
    [
        'retry_capture',
        'Reject the isolated element screenshot when the crop is wrong, clipped, too loose, or shows the wrong target.',
        'What is wrong and what needs to be fixed before capturing again.',
    ],
].map(([toolName, toolDescription, reasonDescription]) => ({
    type: 'function',
    function: {
        name: toolName,
        description: toolDescription,
        parameters: {
            type: 'object',
            properties: {
                reason: {
                    type: 'string',
                    description: reasonDescription,
                },
            },
            required: ['reason'],
            additionalProperties: false,
        },
    },
}));
99
/**
 * Asks a vision-capable model whether an isolated element screenshot is acceptable.
 *
 * The screenshot is sent either as an uploaded URL (when `uploadImage` is given
 * and succeeds) or as an inline base64 data URL. The model must answer with one
 * of the tools in `elementVerificationTools`; `accept_capture` and
 * `accept_with_note` count as approval, anything else as rejection.
 *
 * @param {object} client - Client exposing `chat.completions.create(request, options)`.
 * @param {string} model - Primary model to request.
 * @param {{ name: string, description: string }} element - Requested element; both fields are interpolated into the prompt.
 * @param {{ toString(encoding: string): string }} screenshot - Image bytes; must support `toString('base64')` (presumably a Node Buffer).
 * @param {string} assessment - Short description of the candidate capture, shown to the verifier.
 * @param {number} stepNumber - Step index recorded in the returned usage entry.
 * @param {AbortSignal} [signal] - Cancellation signal, checked up front and passed to the API call.
 * @param {string} [fallbackModel] - Fallback model handed to `callVisionCapableModel`.
 * @param {(image: object, mimeType: string) => Promise<string>} [uploadImage] - Optional uploader returning an image URL.
 * @returns {Promise<{ verified: boolean, reason: (string|undefined), usage: (object|null) }>}
 *   Verification verdict. `usage` is `null` only when the model call itself failed.
 * @throws Rethrows abort errors (as recognised by `isAbortError`) and whatever `throwIfAborted` throws.
 */
async function verifyElementCapture(client, model, element, screenshot, assessment, stepNumber, signal, fallbackModel, uploadImage) {
    throwIfAborted(signal, 'Element capture verification cancelled.');
    // Inline data URL, used directly or as the fallback when the upload fails.
    const toDataUrl = () => `data:image/png;base64,${screenshot.toString('base64')}`;
    const imgUrl = uploadImage
        ? await uploadImage(screenshot, 'image/png').catch(() => toDataUrl())
        : toDataUrl();
    const messages = [
        {
            role: 'system',
            content: 'You verify isolated UI element screenshots. The screenshot shows EXACTLY the pixel region captured by the element\'s bounding box — every pixel visible is INSIDE that element. Nothing outside the element is included.\n\nYou have THREE tools:\n- **accept_capture** — the correct component is shown, fully visible, well-framed\n- **accept_with_note** — the correct component IS shown and fully visible, but the framing includes some extra surrounding context. Use this instead of rejecting when the target element is clearly present and complete.\n- **retry_capture** — ONLY for serious issues: wrong element entirely, clipped edges, tiny fragment, or overlay obstruction\n\nReject (retry_capture) ONLY for:\n1. The element is clipped — a side is cut off by the image edges\n2. The wrong element was captured — the content clearly does not match the description at all\n3. Only a tiny fragment is captured — e.g., just a heading when a full card was requested\n4. An overlay or modal is obscuring the element\n\nWhen the correct element is visible but framing is slightly loose, use accept_with_note — do NOT reject.\n\nCRITICAL: Since the screenshot IS the element\'s exact bounding box, there is no such thing as "adjacent cards" or "neighboring elements" visible in this image. What looks like a neighboring card is an internal preview thumbnail. What looks like surrounding context is internal structure. Only reject if the image edges clip the component or the wrong component is shown entirely.',
        },
        {
            role: 'user',
            content: [
                {
                    type: 'image_url',
                    image_url: { url: imgUrl },
                },
                {
                    type: 'text',
                    text: `## Requested isolated capture
Name: "${element.name}"
Description: "${element.description}"
Candidate assessment: "${assessment}"

This verification image is the raw element crop before any user-requested outscale or extra margin is applied programmatically after validation.

The screenshot IS the captured element — every pixel is inside its bounding box.

Use accept_capture if: the correct component is shown, fully visible, well-framed.
Use accept_with_note if: the correct component IS shown and fully visible, but framing is slightly loose with some extra context. This is still an approval.

Use retry_capture ONLY if:
- a side of the component is clipped by the image boundary
- the content clearly does not match the description at all (completely wrong element)
- only a tiny fragment is shown when a full component (card, panel, form) was requested
- an overlay or sticky header is covering it

Do NOT reject because:
- the element contains screenshot previews, thumbnails, or mockups inside it
- the internal layout has multiple visual sections that look like separate cards
- content appears "above", "beside", or "around" other content within the image — this is the element's own internal layout, not neighboring page elements
- the framing is slightly loose but the correct element is clearly present and complete`,
                },
            ],
        },
    ];
    let response;
    let usedModel = model;
    try {
        const visionResult = await callVisionCapableModel({
            primaryModel: model,
            fallbackModel,
            callModel: (m) => client.chat.completions.create({
                model: m,
                messages,
                tools: elementVerificationTools,
                tool_choice: 'required',
                temperature: 0,
                max_tokens: 256,
                provider: { ...zdrParam() },
            }, { signal }),
            onFallbackActivated: (m, reason) => logger.info(`Element verification vision fallback: ${m} (reason: ${reason})`),
        });
        response = visionResult.result;
        usedModel = visionResult.model;
    }
    catch (err) {
        // Cancellation must propagate; any other failure becomes a soft rejection.
        if (isAbortError(err))
            throw err;
        // Thrown values are not guaranteed to be Error instances.
        logger.error(`Element verification failed: ${err?.message ?? String(err)}`);
        return { verified: false, reason: 'Verification service unavailable', usage: null };
    }
    const usage = {
        stepNumber,
        stepType: 'element_capture',
        generationId: response.id ?? null,
        modelRequested: model,
        // Prefer the model reported by the API; otherwise the one the helper says it used.
        modelUsed: response.model ?? usedModel,
        promptTokens: response.usage?.prompt_tokens ?? null,
        completionTokens: response.usage?.completion_tokens ?? null,
        totalTokens: response.usage?.total_tokens ?? null,
        imagesInPrompt: 1,
    };
    const toolCall = response.choices?.[0]?.message?.tool_calls?.[0];
    if (!toolCall || !('function' in toolCall)) {
        return { verified: false, reason: 'Verification returned no decision', usage };
    }
    const name = toolCall.function.name;
    let args;
    try {
        args = JSON.parse(toolCall.function.arguments);
    }
    catch {
        return { verified: false, reason: 'Verification returned invalid JSON', usage };
    }
    // JSON.parse accepts the text `null`, so `args` may be null here; read the
    // reason null-safely instead of letting a TypeError escape.
    const reason = args?.reason;
    if (name === 'accept_capture' || name === 'accept_with_note') {
        return { verified: true, reason: reason || undefined, usage };
    }
    // Any other tool name (including retry_capture) is treated as a rejection.
    return { verified: false, reason: reason || 'Verifier rejected', usage };
}
199
+ export async function captureIsolatedElement(browser, element, apiKey, model, options = {}) {
200
+ const { abortSignal, distinctId, fallbackModel, uploadImage } = options;
201
+ const client = new OpenAI({
202
+ baseURL: 'https://openrouter.ai/api/v1',
203
+ apiKey,
204
+ defaultHeaders: {
205
+ 'HTTP-Referer': 'https://github.com/screenshot-agent',
206
+ 'X-Title': 'Screenshot Agent',
207
+ },
208
+ });
209
+ logger.info(`Identifying element: "${element.name}" — "${element.description}"`);
210
+ const actionHistory = [];
211
+ const usageLog = [];
212
+ let stepCounter = 0;
213
+ let lastVerifierRejectedAsTooLoose = false;
214
+ let looseRejectionCount = 0;
215
+ try {
216
+ for (let iteration = 1; iteration <= MAX_ELEMENT_ITERATIONS; iteration++) {
217
+ throwIfAborted(abortSignal, `Element capture cancelled for "${element.name}".`);
218
+ // 1. Capture page state with AKTree
219
+ const pageState = await browser.getPageState();
220
+ const screenshotBuf = pageState.cleanScreenshot;
221
+ const serializedAKTree = pageState.serializedAKTree;
222
+ const screenshotUrl = uploadImage
223
+ ? await uploadImage(screenshotBuf, 'image/png').catch(() => `data:image/png;base64,${screenshotBuf.toString('base64')}`)
224
+ : `data:image/png;base64,${screenshotBuf.toString('base64')}`;
225
+ // 2. Build messages
226
+ const messages = [
227
+ { role: 'system', content: buildElementSystemPrompt(element.description) },
228
+ {
229
+ role: 'user',
230
+ content: buildElementIterationMessage({
231
+ elementName: element.name,
232
+ elementDescription: element.description,
233
+ serializedAKTree,
234
+ currentUrl: browser.currentPage.url(),
235
+ iteration,
236
+ maxIterations: MAX_ELEMENT_ITERATIONS,
237
+ actionHistory: actionHistory.length > 0 ? actionHistory : undefined,
238
+ viewport: browser.currentPage.viewportSize() ?? undefined,
239
+ scrollInfo: {
240
+ scrollY: pageState.scrollInfo.scrollY,
241
+ scrollHeight: pageState.scrollInfo.scrollHeight,
242
+ viewportHeight: browser.currentPage.viewportSize()?.height ?? 900,
243
+ },
244
+ screenshotUrl,
245
+ }),
246
+ },
247
+ ];
248
+ // 3. Call LLM
249
+ let response;
250
+ let usedModel = model;
251
+ let callMessages = messages;
252
+ const MAX_COERCION_RETRIES = 2;
253
+ try {
254
+ for (let coercionAttempt = 1; coercionAttempt <= MAX_COERCION_RETRIES + 1; coercionAttempt++) {
255
+ throwIfAborted(abortSignal, `Element capture cancelled for "${element.name}".`);
256
+ const visionResult = await callVisionCapableModel({
257
+ primaryModel: model,
258
+ fallbackModel,
259
+ callModel: (m) => client.chat.completions.create({
260
+ model: m,
261
+ messages: callMessages,
262
+ tools: elementCaptureTools,
263
+ tool_choice: 'required',
264
+ temperature: ELEMENT_CAPTURE_TEMPERATURE,
265
+ max_tokens: 1024,
266
+ provider: { ...zdrParam() },
267
+ }, { signal: abortSignal }),
268
+ onFallbackActivated: (m, reason) => logger.info(`Element capture vision fallback activated: ${m} (reason: ${reason})`),
269
+ });
270
+ response = visionResult.result;
271
+ usedModel = visionResult.model;
272
+ const msg = response.choices?.[0]?.message;
273
+ const hasTool = !!msg?.tool_calls?.[0] && 'function' in msg.tool_calls[0];
274
+ if (hasTool || coercionAttempt > MAX_COERCION_RETRIES)
275
+ break;
276
+ const assistantContent = msg?.content;
277
+ if (!assistantContent)
278
+ break;
279
+ logger.info(`Model returned text without tool call; coercing (attempt ${coercionAttempt}/${MAX_COERCION_RETRIES})...`);
280
+ callMessages = [
281
+ ...callMessages,
282
+ { role: 'assistant', content: assistantContent },
283
+ {
284
+ role: 'user',
285
+ content: 'You MUST call one of the available tools. Do not respond with text — select the most appropriate tool and call it now.',
286
+ },
287
+ ];
288
+ }
289
+ usageLog.push({
290
+ stepNumber: ++stepCounter,
291
+ stepType: 'element_capture',
292
+ generationId: response.id ?? null,
293
+ modelRequested: model,
294
+ modelUsed: response.model ?? usedModel,
295
+ promptTokens: response.usage?.prompt_tokens ?? null,
296
+ completionTokens: response.usage?.completion_tokens ?? null,
297
+ totalTokens: response.usage?.total_tokens ?? null,
298
+ imagesInPrompt: 1,
299
+ });
300
+ }
301
+ catch (err) {
302
+ if (isAbortError(err))
303
+ throw err;
304
+ logger.error(`Element capture API call failed: ${err.message}`);
305
+ actionHistory.push(`Iteration ${iteration}: API error — ${err.message}`);
306
+ continue;
307
+ }
308
+ const message = response.choices?.[0]?.message;
309
+ const toolCall = message?.tool_calls?.[0];
310
+ if (message?.content && !toolCall) {
311
+ logger.ai(message.content.slice(0, 200));
312
+ }
313
+ if (!toolCall || !('function' in toolCall)) {
314
+ logger.error(`No tool call at element iteration ${iteration}`);
315
+ continue;
316
+ }
317
+ const name = toolCall.function.name;
318
+ let args;
319
+ try {
320
+ args = JSON.parse(toolCall.function.arguments);
321
+ }
322
+ catch {
323
+ logger.error(`Invalid JSON in element tool arguments: ${toolCall.function.arguments}`);
324
+ continue;
325
+ }
326
+ // 4. Handle tool calls
327
+ // ── capture(nodeId?) ──
328
+ if (name === 'capture') {
329
+ const nodeId = typeof args.nodeId === 'string' ? args.nodeId : undefined;
330
+ const baseOutscale = element.outscale ?? { padding: element.padding ?? 0 };
331
+ if (!nodeId) {
332
+ // Full page capture — unusual for element capture but handle it
333
+ logger.info(`Element "${element.name}": capture() without nodeId — full page`);
334
+ const buffer = await browser.takeScreenshot();
335
+ const viewport = browser.currentPage.viewportSize();
336
+ const { verified, reason: verifyReason, usage } = await verifyElementCapture(client, model, element, buffer, 'Full page capture', ++stepCounter, abortSignal, fallbackModel, uploadImage);
337
+ if (usage)
338
+ usageLog.push(usage);
339
+ if (verified) {
340
+ logger.success(`Element "${element.name}" captured (full page)`);
341
+ return { element, success: true, buffer, assessment: verifyReason || 'Full page', usage: usageLog };
342
+ }
343
+ actionHistory.push(`Iteration ${iteration}: capture() full page rejected — ${verifyReason}. Specify a nodeId to crop to the target element.`);
344
+ continue;
345
+ }
346
+ // Resolve nodeId to bounds
347
+ const entry = await browser.resolveAKNode(nodeId);
348
+ if (!entry) {
349
+ logger.error(`Element "${element.name}": nodeId "${nodeId}" not found in AKTree`);
350
+ actionHistory.push(`Iteration ${iteration}: capture("${nodeId}") — node not found. Re-read the AKTree and use a valid nodeId.`);
351
+ continue;
352
+ }
353
+ logger.info(`Element "${element.name}": capture("${nodeId}") — ${entry.label} ${entry.bounds.w}x${entry.bounds.h}`);
354
+ // Check area threshold
355
+ const viewport = browser.currentPage.viewportSize();
356
+ if (viewport) {
357
+ const captureArea = entry.bounds.w * entry.bounds.h;
358
+ const viewportArea = viewport.width * viewport.height;
359
+ const isLargeComponent = /\b(hero|header|footer|navigation|navbar|banner|full.?width|feature.?section|testimonial|above.the.fold)\b/i.test(element.description);
360
+ const areaThreshold = isLargeComponent ? 0.85 : 0.70;
361
+ if (captureArea > viewportArea * areaThreshold) {
362
+ logger.info(`Element "${element.name}": node "${nodeId}" covers ${Math.round(captureArea / viewportArea * 100)}% of viewport — too large`);
363
+ actionHistory.push(`Iteration ${iteration}: capture("${nodeId}") rejected — element covers >${Math.round(areaThreshold * 100)}% of viewport (${entry.bounds.w}x${entry.bounds.h}), likely a wrapper. Find a more specific child node with focus(within: "${nodeId}").`);
364
+ continue;
365
+ }
366
+ }
367
+ // Capture the node
368
+ try {
369
+ const buffer = await browser.captureNode(nodeId);
370
+ // Verify
371
+ const { verified, reason: verifyReason, usage } = await verifyElementCapture(client, model, element, buffer, `Captured ${entry.label} (${nodeId})`, ++stepCounter, abortSignal, fallbackModel, uploadImage);
372
+ if (usage)
373
+ usageLog.push(usage);
374
+ if (!verified) {
375
+ lastVerifierRejectedAsTooLoose = isLooseElementCaptureRejectionReason(verifyReason);
376
+ if (lastVerifierRejectedAsTooLoose) {
377
+ looseRejectionCount++;
378
+ // Accept after repeated loose-only rejections (likely false positive from verifier)
379
+ if (looseRejectionCount >= 3) {
380
+ logger.info(`Element "${element.name}": accepting after ${looseRejectionCount} loose-only rejections`);
381
+ return {
382
+ element, success: true, buffer,
383
+ assessment: 'Accepted after repeated loose-only verifier rejections.',
384
+ usage: usageLog,
385
+ };
386
+ }
387
+ }
388
+ actionHistory.push(`Iteration ${iteration}: capture("${nodeId}") rejected by verifier — ${verifyReason}. ${lastVerifierRejectedAsTooLoose ? 'Try a more specific child node with focus(within: "' + nodeId + '").' : 'Try a different nodeId.'}`);
389
+ continue;
390
+ }
391
+ lastVerifierRejectedAsTooLoose = false;
392
+ // Apply outscale if needed
393
+ const finalBuffer = outscaleAddsPadding(baseOutscale)
394
+ ? await browser.screenshotByRegion({ x: entry.bounds.x, y: entry.bounds.y, width: entry.bounds.w, height: entry.bounds.h }, baseOutscale)
395
+ : buffer;
396
+ logger.success(`Element "${element.name}" captured via nodeId "${nodeId}"`);
397
+ getPostHog().capture({
398
+ distinctId: distinctId ?? DISTINCT_ID,
399
+ event: 'element_capture_succeeded',
400
+ properties: { element_name: element.name, method: 'nodeId', nodeId, iterations: iteration },
401
+ });
402
+ return {
403
+ element, success: true, buffer: finalBuffer,
404
+ assessment: verifyReason || `Captured ${entry.label}`,
405
+ usage: usageLog,
406
+ };
407
+ }
408
+ catch (err) {
409
+ if (isAbortError(err))
410
+ throw err;
411
+ logger.error(`Element "${element.name}": capture("${nodeId}") failed — ${err.message}`);
412
+ actionHistory.push(`Iteration ${iteration}: capture("${nodeId}") error — ${err.message}`);
413
+ continue;
414
+ }
415
+ }
416
+ // ── focus(query) ──
417
+ if (name === 'focus') {
418
+ const query = args;
419
+ try {
420
+ const focusResult = await browser.focusTree(query);
421
+ logger.info(`Element "${element.name}": focus → ${focusResult.matches.length} match(es)`);
422
+ actionHistory.push(`Iteration ${iteration}: focus(${JSON.stringify(args)}) → ${focusResult.matches.length} match(es)\n${focusResult.serialized.slice(0, 800)}`);
423
+ }
424
+ catch (err) {
425
+ logger.error(`Element "${element.name}": focus failed — ${err.message}`);
426
+ actionHistory.push(`Iteration ${iteration}: focus error — ${err.message}`);
427
+ }
428
+ continue;
429
+ }
430
+ // ── scroll ──
431
+ if (name === 'scroll') {
432
+ const centerOn = typeof args.centerOn === 'string' ? args.centerOn : undefined;
433
+ const target = typeof args.target === 'string' ? args.target : undefined;
434
+ const direction = args.direction ?? 'down';
435
+ const offset = typeof args.offset === 'number' ? args.offset : undefined;
436
+ if (centerOn) {
437
+ logger.info(`Element "${element.name}": scroll centerOn="${centerOn}"`);
438
+ try {
439
+ await browser.centerNodeInView(centerOn, { containerNodeId: target, offset });
440
+ await browser.wait(300);
441
+ actionHistory.push(`Iteration ${iteration}: scroll(centerOn="${centerOn}")`);
442
+ }
443
+ catch (err) {
444
+ actionHistory.push(`Iteration ${iteration}: scroll centerOn error — ${err.message}`);
445
+ }
446
+ }
447
+ else {
448
+ const amount = offset ?? 500;
449
+ logger.info(`Element "${element.name}": scroll ${direction} ${amount}px`);
450
+ await browser.scroll(direction, amount);
451
+ await browser.wait(500);
452
+ actionHistory.push(`Iteration ${iteration}: scroll(${direction}, ${amount}px)`);
453
+ }
454
+ continue;
455
+ }
456
+ // ── analyze_screenshot ──
457
+ if (name === 'analyze_screenshot') {
458
+ const question = args.question || 'Describe what you see';
459
+ logger.info(`Element "${element.name}": analyze_screenshot — "${question}"`);
460
+ try {
461
+ const analysisResponse = await client.chat.completions.create({
462
+ model,
463
+ messages: [
464
+ {
465
+ role: 'user',
466
+ content: [
467
+ { type: 'image_url', image_url: { url: screenshotUrl } },
468
+ { type: 'text', text: question },
469
+ ],
470
+ },
471
+ ],
472
+ max_tokens: 256,
473
+ provider: { ...zdrParam() },
474
+ }, { signal: abortSignal });
475
+ const answer = analysisResponse.choices?.[0]?.message?.content || '(no answer)';
476
+ actionHistory.push(`Iteration ${iteration}: analyze_screenshot("${question}") → ${answer.slice(0, 300)}`);
477
+ }
478
+ catch (err) {
479
+ if (isAbortError(err))
480
+ throw err;
481
+ actionHistory.push(`Iteration ${iteration}: analyze_screenshot error — ${err.message}`);
482
+ }
483
+ continue;
484
+ }
485
+ // Unknown tool
486
+ logger.warn(`Element "${element.name}": unknown tool "${name}"`);
487
+ actionHistory.push(`Iteration ${iteration}: unknown tool "${name}" — use focus, scroll, analyze_screenshot, or capture.`);
488
+ }
489
+ // Max iterations reached — no coordinate fallback needed since AKTree provides bounds directly
490
+ logger.error(`Element "${element.name}": max iterations reached`);
491
+ getPostHog().capture({
492
+ distinctId: distinctId ?? DISTINCT_ID,
493
+ event: 'element_capture_failed',
494
+ properties: {
495
+ element_name: element.name,
496
+ reason: 'Max iterations reached',
497
+ iterations: MAX_ELEMENT_ITERATIONS,
498
+ failure_type: 'max_iterations',
499
+ },
500
+ });
501
+ return {
502
+ element,
503
+ success: false,
504
+ buffer: Buffer.alloc(0),
505
+ assessment: 'Max iterations reached for element identification',
506
+ usage: usageLog,
507
+ };
508
+ }
509
+ catch (err) {
510
+ if (isAbortError(err))
511
+ throw err;
512
+ logger.error(`Element capture unexpected error: ${err.message}`);
513
+ return {
514
+ element,
515
+ success: false,
516
+ buffer: Buffer.alloc(0),
517
+ assessment: `Unexpected error: ${err.message}`,
518
+ usage: usageLog,
519
+ };
520
+ }
521
+ }
522
+ //# sourceMappingURL=element-capture.js.map
@@ -0,0 +1,5 @@
1
+ /**
2
+ * Startup validation for required environment variables.
3
+ * Call immediately after loading .env to fail fast with a clear message.
4
+ */
5
+ export declare function validateRequiredEnv(): void;
@@ -0,0 +1,29 @@
1
+ /**
2
+ * Startup validation for required environment variables.
3
+ * Call immediately after loading .env to fail fast with a clear message.
4
+ */
5
// Variables that must be set under exactly this name.
const REQUIRED_ENV_VARS = [
    'SUPABASE_SERVICE_ROLE_KEY',
    'OPENROUTER_API_KEY',
];
// Variables that may be supplied under either `key` or its `fallback` name.
const REQUIRED_WITH_PUBLIC_FALLBACK = [
    { key: 'SUPABASE_URL', fallback: 'NEXT_PUBLIC_SUPABASE_URL' },
];
/**
 * Verifies that every required environment variable is present.
 *
 * A variable counts as present only when its value is non-empty after
 * trimming, so a blank or whitespace-only entry is reported as missing
 * instead of slipping through to fail later with a less helpful error.
 *
 * When anything is missing, a single message listing all missing names is
 * written with `console.error` and `process.exit(1)` is called.
 *
 * @param {Record<string, string | undefined>} [env=process.env]
 *   Environment map to validate. Defaults to the current process environment,
 *   so existing zero-argument calls behave as before.
 * @returns {void}
 */
export function validateRequiredEnv(env = process.env) {
    // Nullish values become '' so they are treated the same as an empty string.
    const isSet = (name) => String(env[name] ?? '').trim() !== '';
    const missing = [
        ...REQUIRED_ENV_VARS.filter((key) => !isSet(key)),
        ...REQUIRED_WITH_PUBLIC_FALLBACK
            .filter(({ key, fallback }) => !isSet(key) && !isSet(fallback))
            .map(({ key, fallback }) => `${key} (or ${fallback})`),
    ];
    if (missing.length > 0) {
        console.error(`\n[AutoKap] Missing required environment variables:\n${missing.map((v) => ` - ${v}`).join('\n')}\n\nCopy .env.example to .env and fill in the values.\n`);
        process.exit(1);
    }
}
29
+ //# sourceMappingURL=env-validation.js.map